mirror of
https://github.com/DayuanJiang/next-ai-draw-io.git
synced 2026-01-09 17:52:30 +08:00
🔗 Add URL Content Extraction Feature (#514)
* feat: add URL content extraction for AI diagram generation * Changes made as recommended by Claude: 1. Added a request timeout to prevent server resources from being tied up (route.ts) 2. Implemented runtime validation for the API response shape (url-utils.ts) 3. Removed hardcoded English error messages and replaced them with localized strings (url-input-dialog.tsx) 4. Fixed the incorrect i18n namespace (changed from pdf.* to url.*) (url-input-dialog.tsx and en/ja/zh.json) * chore: restore package.json and package-lock.json * fix: use i18n strings for URL dialog error messages --------- Co-authored-by: dayuan.jiang <jdy.toh@gmail.com>
This commit is contained in:
154
app/api/parse-url/route.ts
Normal file
154
app/api/parse-url/route.ts
Normal file
@@ -0,0 +1,154 @@
|
||||
import { extract } from "@extractus/article-extractor"
|
||||
import { NextResponse } from "next/server"
|
||||
import TurndownService from "turndown"
|
||||
|
||||
// Upper bound, in characters, on the Markdown returned to the client
// (kept equal to the PDF limit).
const MAX_CONTENT_LENGTH = 150_000

// Milliseconds to wait for the remote page before the extraction is aborted.
const EXTRACT_TIMEOUT_MS = 15_000
|
||||
|
||||
// Hostnames that must never be fetched, regardless of what they resolve to.
const BLOCKED_HOSTNAMES = new Set(["localhost", "metadata.google.internal"])

// Suffixes conventionally used for internal-only names.
const BLOCKED_HOSTNAME_SUFFIXES = [".local", ".internal", ".localhost"]

/** Returns true when the first two octets identify a non-public IPv4 range. */
function isPrivateIpv4Prefix(a: number, b: number): boolean {
    if (a === 0) return true // 0.0.0.0/8 ("this host"; often routes to loopback)
    if (a === 10) return true // 10.0.0.0/8
    if (a === 127) return true // 127.0.0.0/8 (loopback)
    if (a === 100 && b >= 64 && b <= 127) return true // 100.64.0.0/10 (shared address space)
    if (a === 169 && b === 254) return true // 169.254.0.0/16 (link-local, cloud metadata)
    if (a === 172 && b >= 16 && b <= 31) return true // 172.16.0.0/12
    if (a === 192 && b === 168) return true // 192.168.0.0/16
    return false
}

/**
 * Returns true when an IPv6 address (already lower-cased and compressed by the
 * URL parser, without the surrounding brackets) is loopback, unspecified,
 * unique-local, link-local, or an IPv4-mapped form of a private IPv4 address.
 */
function isPrivateIpv6(address: string): boolean {
    if (address === "::1" || address === "::") return true

    // The URL parser serialises ::ffff:a.b.c.d as ::ffff:xxxx:xxxx.
    const mapped = address.match(/^::ffff:([0-9a-f]{1,4}):([0-9a-f]{1,4})$/)
    if (mapped) {
        const [, highHex = "0"] = mapped
        const high = Number.parseInt(highHex, 16)
        return isPrivateIpv4Prefix(high >> 8, high & 0xff)
    }

    const firstGroup = Number.parseInt(address.split(":")[0] ?? "", 16)
    if (Number.isNaN(firstGroup)) return false // address starts with "::"
    if ((firstGroup & 0xfe00) === 0xfc00) return true // fc00::/7 (unique local)
    if ((firstGroup & 0xffc0) === 0xfe80) return true // fe80::/10 (link-local)
    return false
}

/**
 * SSRF protection: reports whether `urlString` names a loopback, private,
 * link-local or otherwise internal host that the server must not fetch.
 *
 * Only the literal hostname in the URL is inspected; no DNS lookup is made,
 * so a public name that resolves to a private address is not detected here.
 *
 * @param urlString - The URL supplied by the client.
 * @returns `true` if the URL should be blocked (including when it cannot be
 *          parsed), `false` otherwise.
 */
function isPrivateUrl(urlString: string): boolean {
    let hostname: string
    try {
        hostname = new URL(urlString).hostname.toLowerCase()
    } catch {
        return true // Invalid URL - block it
    }

    // A fully-qualified name may end with the root dot ("localhost.").
    hostname = hostname.replace(/\.+$/, "")

    // URL.hostname keeps the brackets around IPv6 literals ("[::1]").
    if (hostname.startsWith("[") && hostname.endsWith("]")) {
        return isPrivateIpv6(hostname.slice(1, -1))
    }

    if (BLOCKED_HOSTNAMES.has(hostname)) return true

    // For http(s) URLs the parser normalises decimal/hex/octal IPv4 spellings
    // to dotted-decimal, so a single pattern is sufficient here.
    const ipv4Match = hostname.match(
        /^(\d{1,3})\.(\d{1,3})\.(\d{1,3})\.(\d{1,3})$/,
    )
    if (ipv4Match) {
        return isPrivateIpv4Prefix(Number(ipv4Match[1]), Number(ipv4Match[2]))
    }

    return BLOCKED_HOSTNAME_SUFFIXES.some((suffix) => hostname.endsWith(suffix))
}
|
||||
|
||||
/**
 * POST handler: fetches the page at the submitted URL, extracts the main
 * article and returns it as Markdown.
 *
 * Request body: JSON `{ url: string }`.
 *
 * Responses:
 * - 200 `{ title, content, charCount }` on success.
 * - 400 `{ error }` for a missing/invalid/non-HTTP(S)/private URL, when no
 *   content could be extracted, or when the Markdown exceeds
 *   `MAX_CONTENT_LENGTH` characters.
 * - 504 `{ error }` when the fetch is aborted after `EXTRACT_TIMEOUT_MS`.
 * - 500 `{ error }` for any other failure.
 *
 * NOTE(review): only the hostname of the submitted URL is screened by
 * `isPrivateUrl`; redirects followed during the fetch and DNS results are
 * not re-validated here.
 */
export async function POST(req: Request) {
    try {
        // A body that is not valid JSON is a client error, not a server fault.
        let body: unknown
        try {
            body = await req.json()
        } catch {
            return NextResponse.json(
                { error: "URL is required" },
                { status: 400 },
            )
        }

        // Guard against non-object bodies (e.g. `null`) before reading `url`.
        const url =
            typeof body === "object" && body !== null && "url" in body
                ? body.url
                : undefined

        if (!url || typeof url !== "string") {
            return NextResponse.json(
                { error: "URL is required" },
                { status: 400 },
            )
        }

        // Validate URL format
        let parsedUrl: URL
        try {
            parsedUrl = new URL(url)
        } catch {
            return NextResponse.json(
                { error: "Invalid URL format" },
                { status: 400 },
            )
        }

        // Only web pages can be fetched; reject other schemes up front so the
        // string is never handed to the extractor as anything but a URL.
        if (parsedUrl.protocol !== "http:" && parsedUrl.protocol !== "https:") {
            return NextResponse.json(
                { error: "Only HTTP and HTTPS URLs are supported" },
                { status: 400 },
            )
        }

        // SSRF protection
        if (isPrivateUrl(url)) {
            return NextResponse.json(
                { error: "Cannot access private/internal URLs" },
                { status: 400 },
            )
        }

        // Extract article content with timeout to avoid tying up server resources
        const controller = new AbortController()
        const timeoutId = setTimeout(() => {
            controller.abort()
        }, EXTRACT_TIMEOUT_MS)

        let article
        try {
            article = await extract(url, undefined, {
                headers: {
                    "User-Agent": "Mozilla/5.0 (compatible; NextAIDrawio/1.0)",
                },
                signal: controller.signal,
            })
        } catch (err: unknown) {
            const isAbort =
                typeof err === "object" &&
                err !== null &&
                "name" in err &&
                err.name === "AbortError"
            if (isAbort) {
                return NextResponse.json(
                    { error: "Timed out while fetching URL content" },
                    { status: 504 },
                )
            }
            // Anything else is reported by the outer handler as a 500.
            throw err
        } finally {
            // Always release the timer, whether the fetch succeeded or not.
            clearTimeout(timeoutId)
        }

        if (!article || !article.content) {
            return NextResponse.json(
                { error: "Could not extract content from URL" },
                { status: 400 },
            )
        }

        // Convert HTML to Markdown
        const turndownService = new TurndownService({
            headingStyle: "atx",
            codeBlockStyle: "fenced",
        })

        // Remove unwanted elements before conversion
        turndownService.remove(["script", "style", "iframe", "noscript"])

        const markdown = turndownService.turndown(article.content)

        // Check content length
        if (markdown.length > MAX_CONTENT_LENGTH) {
            return NextResponse.json(
                {
                    error: `Content exceeds ${MAX_CONTENT_LENGTH / 1000}k character limit (${(markdown.length / 1000).toFixed(1)}k chars)`,
                },
                { status: 400 },
            )
        }

        return NextResponse.json({
            // `||` is intentional: an empty title also falls back to "Untitled".
            title: article.title || "Untitled",
            content: markdown,
            charCount: markdown.length,
        })
    } catch (error) {
        console.error("URL extraction error:", error)
        return NextResponse.json(
            { error: "Failed to fetch or parse URL content" },
            { status: 500 },
        )
    }
}
|
||||
Reference in New Issue
Block a user