mirror of
https://github.com/DayuanJiang/next-ai-draw-io.git
synced 2026-01-08 01:02:29 +08:00
* feat: add URL content extraction for AI diagram generation

* Changes made as recommended by Claude:
  1. Added a request timeout to prevent server resources from being tied up (route.ts)
  2. Implemented runtime validation for the API response shape (url-utils.ts)
  3. Removed hardcoded English error messages and replaced them with localized strings (url-input-dialog.tsx)
  4. Fixed the incorrect i18n namespace (changed from pdf.* to url.*) (url-input-dialog.tsx and en/ja/zh.json)

* chore: restore package.json and package-lock.json

* fix: use i18n strings for URL dialog error messages

---------

Co-authored-by: dayuan.jiang <jdy.toh@gmail.com>
155 lines
4.6 KiB
TypeScript
155 lines
4.6 KiB
TypeScript
import { extract } from "@extractus/article-extractor"
|
|
import { NextResponse } from "next/server"
|
|
import TurndownService from "turndown"
|
|
|
|
// Upper bound, in characters, on the Markdown returned to the client
// (kept equal to the PDF limit).
const MAX_CONTENT_LENGTH = 150_000

// Milliseconds to wait for article extraction before the request is aborted.
const EXTRACT_TIMEOUT_MS = 15_000
|
|
|
|
// SSRF protection - block private/internal addresses
|
|
/**
 * Returns true when the first two octets of an IPv4 address place it in a
 * range that must not be fetched on behalf of a user.
 */
function isBlockedIpv4(a: number, b: number): boolean {
    return (
        a === 0 || // 0.0.0.0/8 ("this host")
        a === 10 || // 10.0.0.0/8
        a === 127 || // 127.0.0.0/8 (loopback)
        (a === 169 && b === 254) || // 169.254.0.0/16 (link-local, cloud metadata)
        (a === 172 && b >= 16 && b <= 31) || // 172.16.0.0/12
        (a === 192 && b === 168) // 192.168.0.0/16
    )
}

/**
 * Returns true when an IPv6 address (lowercase, without brackets) is
 * unspecified, loopback, unique-local, link-local, or an IPv4-mapped address
 * whose embedded IPv4 address is itself blocked.
 */
function isBlockedIpv6(address: string): boolean {
    if (address === "::" || address === "::1") return true

    // The URL parser serializes ::ffff:127.0.0.1 as ::ffff:7f00:1, so the
    // embedded IPv4 octets have to be recovered from the first hex group.
    const mapped = address.match(/^::ffff:([0-9a-f]{1,4}):[0-9a-f]{1,4}$/)
    if (mapped) {
        const [, highHex = "0"] = mapped
        const high = parseInt(highHex, 16)
        return isBlockedIpv4(high >> 8, high & 0xff)
    }

    // An address starting with "::" has an all-zero first group.
    const firstGroup = parseInt(address.split(":")[0] || "0", 16)
    if ((firstGroup & 0xfe00) === 0xfc00) return true // fc00::/7 (unique local)
    if ((firstGroup & 0xffc0) === 0xfe80) return true // fe80::/10 (link-local)
    return false
}

/**
 * SSRF guard: decides whether a URL points at a private or internal host.
 *
 * Only the literal hostname is inspected; no DNS lookup is performed, so a
 * public name that resolves to a private address is not detected here.
 *
 * @param urlString - The URL to inspect.
 * @returns true if the URL cannot be parsed or its host is localhost, a
 *   cloud metadata endpoint, a private/loopback/link-local IP literal
 *   (IPv4 or IPv6), or a `.local` / `.internal` / `.localhost` name;
 *   false otherwise.
 */
function isPrivateUrl(urlString: string): boolean {
    let hostname: string
    try {
        hostname = new URL(urlString).hostname.toLowerCase()
    } catch {
        return true // Invalid URL - block it
    }

    // "localhost." and "localhost" name the same host; drop the root dot so
    // the comparisons below cannot be bypassed with a fully-qualified name.
    hostname = hostname.replace(/\.$/, "")

    // URL.hostname keeps the square brackets around IPv6 literals.
    if (hostname.startsWith("[") && hostname.endsWith("]")) {
        return isBlockedIpv6(hostname.slice(1, -1))
    }

    // Block localhost and cloud metadata hostnames
    if (hostname === "localhost" || hostname === "metadata.google.internal") {
        return true
    }

    // Check for private IPv4 ranges
    const ipv4Match = hostname.match(
        /^(\d{1,3})\.(\d{1,3})\.(\d{1,3})\.(\d{1,3})$/,
    )
    if (ipv4Match) {
        const [, a = NaN, b = NaN] = ipv4Match.map(Number)
        if (isBlockedIpv4(a, b)) return true
    }

    // Block common internal hostnames
    return (
        hostname.endsWith(".local") ||
        hostname.endsWith(".internal") ||
        hostname.endsWith(".localhost")
    )
}
|
|
|
|
/**
 * POST handler: reads `{ url }` from the JSON request body, extracts the
 * article at that URL, converts it to Markdown and returns it.
 *
 * Responses:
 * - 200 `{ title, content, charCount }` on success.
 * - 400 `{ error }` when the body is not valid JSON, the URL is missing,
 *   malformed or private/internal, no content could be extracted, or the
 *   Markdown is longer than MAX_CONTENT_LENGTH.
 * - 504 `{ error }` when extraction is aborted after EXTRACT_TIMEOUT_MS.
 * - 500 `{ error }` for any other failure (the error is logged).
 *
 * NOTE(review): only the submitted URL is screened by isPrivateUrl. If
 * `extract` follows redirects, the redirect targets are not checked here.
 */
export async function POST(req: Request): Promise<Response> {
    try {
        // A body that is not valid JSON is a client error, not a server fault.
        let body: unknown
        try {
            body = await req.json()
        } catch {
            return NextResponse.json(
                { error: "Invalid JSON body" },
                { status: 400 },
            )
        }

        // Valid JSON may still be null or a primitive, so narrow before
        // reading the property instead of destructuring blindly.
        const url =
            typeof body === "object" && body !== null && "url" in body
                ? body.url
                : undefined

        if (typeof url !== "string" || url === "") {
            return NextResponse.json(
                { error: "URL is required" },
                { status: 400 },
            )
        }

        // Validate URL format
        try {
            new URL(url)
        } catch {
            return NextResponse.json(
                { error: "Invalid URL format" },
                { status: 400 },
            )
        }

        // SSRF protection
        if (isPrivateUrl(url)) {
            return NextResponse.json(
                { error: "Cannot access private/internal URLs" },
                { status: 400 },
            )
        }

        // Extract article content with timeout to avoid tying up server resources
        const controller = new AbortController()
        const timeoutId = setTimeout(() => {
            controller.abort()
        }, EXTRACT_TIMEOUT_MS)

        let article: Awaited<ReturnType<typeof extract>>
        try {
            article = await extract(url, undefined, {
                headers: {
                    "User-Agent": "Mozilla/5.0 (compatible; NextAIDrawio/1.0)",
                },
                signal: controller.signal,
            })
        } catch (err: unknown) {
            // Match on the name rather than a class: the rejection caused by
            // the abort signal is identified by `name === "AbortError"`.
            const aborted =
                typeof err === "object" &&
                err !== null &&
                "name" in err &&
                err.name === "AbortError"
            if (aborted) {
                return NextResponse.json(
                    { error: "Timed out while fetching URL content" },
                    { status: 504 },
                )
            }
            throw err
        } finally {
            // Always release the timer, whether extraction succeeded or not.
            clearTimeout(timeoutId)
        }

        if (!article || !article.content) {
            return NextResponse.json(
                { error: "Could not extract content from URL" },
                { status: 400 },
            )
        }

        // Convert HTML to Markdown
        const turndownService = new TurndownService({
            headingStyle: "atx",
            codeBlockStyle: "fenced",
        })

        // Remove unwanted elements before conversion
        turndownService.remove(["script", "style", "iframe", "noscript"])

        const markdown = turndownService.turndown(article.content)

        // Check content length
        if (markdown.length > MAX_CONTENT_LENGTH) {
            return NextResponse.json(
                {
                    error: `Content exceeds ${MAX_CONTENT_LENGTH / 1000}k character limit (${(markdown.length / 1000).toFixed(1)}k chars)`,
                },
                { status: 400 },
            )
        }

        return NextResponse.json({
            // `||` on purpose: an empty title also falls back to "Untitled".
            title: article.title || "Untitled",
            content: markdown,
            charCount: markdown.length,
        })
    } catch (error) {
        console.error("URL extraction error:", error)
        return NextResponse.json(
            { error: "Failed to fetch or parse URL content" },
            { status: 500 },
        )
    }
}
|