Files
next-ai-draw-io/app/api/parse-url/route.ts
Biki Kalita 6326f9dec6 🔗 Add URL Content Extraction Feature (#514)
* feat: add URL content extraction for AI diagram generation

* Changes made as recommended by Claude:

1. Added a request timeout to prevent server resources from being tied up (route.ts)
2. Implemented runtime validation for the API response shape (url-utils.ts)
3. Removed hardcoded English error messages and replaced them with localized strings (url-input-dialog.tsx)
4. Fixed the incorrect i18n namespace (changed from pdf.* to url.*) (url-input-dialog.tsx and en/ja/zh.json)

* chore: restore package.json and package-lock.json

* fix: use i18n strings for URL dialog error messages

---------

Co-authored-by: dayuan.jiang <jdy.toh@gmail.com>
2026-01-06 00:23:50 +09:00

155 lines
4.6 KiB
TypeScript

import { extract } from "@extractus/article-extractor"
import { NextResponse } from "next/server"
import TurndownService from "turndown"
/**
 * Upper bound, in characters, on the Markdown produced from a page.
 * Kept in line with the PDF limit.
 */
const MAX_CONTENT_LENGTH = 150_000

/** Milliseconds to wait for article extraction before the request is aborted. */
const EXTRACT_TIMEOUT_MS = 15 * 1_000
/**
 * Reports whether an IPv4 address, identified by its first two octets, falls
 * in a range that must not be fetched on behalf of a caller.
 */
function isBlockedIpv4(a: number, b: number): boolean {
    if (a === 0) return true // 0.0.0.0/8 ("this host"; commonly reaches the local machine)
    if (a === 10) return true // 10.0.0.0/8
    if (a === 100 && b >= 64 && b <= 127) return true // 100.64.0.0/10 (carrier-grade NAT)
    if (a === 127) return true // 127.0.0.0/8 (loopback)
    if (a === 169 && b === 254) return true // 169.254.0.0/16 (link-local, cloud metadata)
    if (a === 172 && b >= 16 && b <= 31) return true // 172.16.0.0/12
    if (a === 192 && b === 168) return true // 192.168.0.0/16
    return false
}

/**
 * Expands a textual IPv6 address (without brackets, hex groups only) into its
 * eight 16-bit groups. Returns null when the text is not a well-formed address.
 */
function expandIpv6(address: string): number[] | null {
    const halves = address.split("::")
    if (halves.length > 2) return null

    const toGroups = (text: string | undefined): string[] =>
        text ? text.split(":") : []
    const head = toGroups(halves[0])
    const tail = toGroups(halves[1])

    // "::" stands for one or more zero groups; without it all 8 must be present.
    const compressed = halves.length === 2
    const missing = 8 - head.length - tail.length
    if (compressed ? missing < 1 : missing !== 0) return null

    const texts = [...head, ...new Array<string>(missing).fill("0"), ...tail]
    const hexGroup = /^[0-9a-f]{1,4}$/
    if (!texts.every((text) => hexGroup.test(text))) return null
    return texts.map((text) => parseInt(text, 16))
}

/** Reports whether an expanded IPv6 address is loopback, private or link-local. */
function isBlockedIpv6(groups: number[]): boolean {
    const first = groups[0] ?? 0
    const sixth = groups[5] ?? 0
    const seventh = groups[6] ?? 0

    // ::, ::1, IPv4-compatible (::a.b.c.d) and IPv4-mapped (::ffff:a.b.c.d)
    // addresses all start with five zero groups; judge them by the embedded
    // IPv4 address so that e.g. ::ffff:127.0.0.1 cannot bypass the IPv4 rules.
    const startsWithZeros = groups.slice(0, 5).every((group) => group === 0)
    if (startsWithZeros && (sixth === 0 || sixth === 0xffff)) {
        return isBlockedIpv4(seventh >> 8, seventh & 0xff)
    }
    if ((first & 0xfe00) === 0xfc00) return true // fc00::/7 (unique local)
    if ((first & 0xffc0) === 0xfe80) return true // fe80::/10 (link-local)
    return false
}

/**
 * SSRF protection - decides whether a URL must be refused because it targets
 * a private, loopback, link-local or otherwise internal destination.
 *
 * Only the literal host in the URL is inspected; no DNS lookup is performed,
 * so a public hostname that resolves to an internal address is not detected
 * here.
 *
 * @param urlString - The URL supplied by the caller.
 * @returns true when the URL is unparsable, is not http(s), has no host, or
 *          names an internal host/address; false otherwise.
 */
function isPrivateUrl(urlString: string): boolean {
    let url: URL
    try {
        url = new URL(urlString)
    } catch {
        return true // Invalid URL - block it
    }

    // Only web URLs are meaningful for article extraction.
    if (url.protocol !== "http:" && url.protocol !== "https:") {
        return true
    }

    // A trailing dot ("localhost.") names the same host; strip it so it
    // cannot be used to sidestep the name checks below.
    const hostname = url.hostname.toLowerCase().replace(/\.$/, "")
    if (hostname === "") {
        return true
    }

    // Block localhost
    if (hostname === "localhost") {
        return true
    }

    // Block AWS/cloud metadata endpoints
    if (hostname === "metadata.google.internal") {
        return true
    }

    // URL.hostname reports IPv6 literals wrapped in brackets, e.g. "[::1]".
    if (hostname.startsWith("[") && hostname.endsWith("]")) {
        const groups = expandIpv6(hostname.slice(1, -1))
        return groups === null || isBlockedIpv6(groups)
    }

    // Check for private IPv4 ranges
    const ipv4Match = /^(\d{1,3})\.(\d{1,3})\.(\d{1,3})\.(\d{1,3})$/.exec(
        hostname,
    )
    if (ipv4Match) {
        return isBlockedIpv4(Number(ipv4Match[1]), Number(ipv4Match[2]))
    }

    // Block common internal hostnames
    return (
        hostname.endsWith(".local") ||
        hostname.endsWith(".internal") ||
        hostname.endsWith(".localhost")
    )
}
/** Structural check for an error whose `name` is "AbortError". */
function isAbortError(err: unknown): boolean {
    return (
        typeof err === "object" &&
        err !== null &&
        "name" in err &&
        err.name === "AbortError"
    )
}

/**
 * Fetches the page at a caller-supplied URL, extracts its article content and
 * returns it as Markdown.
 *
 * Request body: JSON object with a string `url` field.
 *
 * Responses:
 * - 200 `{ title, content, charCount }` on success.
 * - 400 `{ error }` when the body is not valid JSON, the URL is missing,
 *   malformed or rejected by `isPrivateUrl`, no content could be extracted,
 *   or the Markdown exceeds `MAX_CONTENT_LENGTH` characters.
 * - 504 `{ error }` when extraction does not finish within
 *   `EXTRACT_TIMEOUT_MS`.
 * - 500 `{ error }` for any other failure (logged to the console).
 *
 * NOTE(review): `isPrivateUrl` only sees the submitted URL string; whether
 * `extract` follows redirects to internal addresses is not controlled here -
 * confirm against the extractor's fetch behavior.
 */
export async function POST(req: Request) {
    try {
        // A malformed body is the client's fault: answer 400 instead of
        // letting the parse error fall through to the generic 500 handler.
        let body: unknown
        try {
            body = await req.json()
        } catch {
            return NextResponse.json(
                { error: "Invalid JSON body" },
                { status: 400 },
            )
        }

        // The body may be any JSON value (null, array, string, ...), so look
        // up `url` only once we know it is an object.
        const url =
            typeof body === "object" && body !== null && "url" in body
                ? body.url
                : undefined

        if (!url || typeof url !== "string") {
            return NextResponse.json(
                { error: "URL is required" },
                { status: 400 },
            )
        }

        // Validate URL format
        try {
            new URL(url)
        } catch {
            return NextResponse.json(
                { error: "Invalid URL format" },
                { status: 400 },
            )
        }

        // SSRF protection
        if (isPrivateUrl(url)) {
            return NextResponse.json(
                { error: "Cannot access private/internal URLs" },
                { status: 400 },
            )
        }

        // Extract article content with timeout to avoid tying up server resources
        const controller = new AbortController()
        const timeoutId = setTimeout(() => {
            controller.abort()
        }, EXTRACT_TIMEOUT_MS)

        let article
        try {
            article = await extract(url, undefined, {
                headers: {
                    "User-Agent": "Mozilla/5.0 (compatible; NextAIDrawio/1.0)",
                },
                signal: controller.signal,
            })
        } catch (err: unknown) {
            // The signal is only ever aborted by the timer above, so an
            // aborted signal means a timeout even if the failure surfaces
            // under a different error name.
            if (controller.signal.aborted || isAbortError(err)) {
                return NextResponse.json(
                    { error: "Timed out while fetching URL content" },
                    { status: 504 },
                )
            }
            throw err
        } finally {
            clearTimeout(timeoutId)
        }

        if (!article || !article.content) {
            return NextResponse.json(
                { error: "Could not extract content from URL" },
                { status: 400 },
            )
        }

        // Convert HTML to Markdown
        const turndownService = new TurndownService({
            headingStyle: "atx",
            codeBlockStyle: "fenced",
        })

        // Remove unwanted elements before conversion
        turndownService.remove(["script", "style", "iframe", "noscript"])

        const markdown = turndownService.turndown(article.content)

        // Check content length
        if (markdown.length > MAX_CONTENT_LENGTH) {
            return NextResponse.json(
                {
                    error: `Content exceeds ${MAX_CONTENT_LENGTH / 1000}k character limit (${(markdown.length / 1000).toFixed(1)}k chars)`,
                },
                { status: 400 },
            )
        }

        return NextResponse.json({
            // `||` on purpose: an empty title should also fall back.
            title: article.title || "Untitled",
            content: markdown,
            charCount: markdown.length,
        })
    } catch (error) {
        console.error("URL extraction error:", error)
        return NextResponse.json(
            { error: "Failed to fetch or parse URL content" },
            { status: 500 },
        )
    }
}