diff --git a/app/api/parse-url/route.ts b/app/api/parse-url/route.ts
new file mode 100644
index 0000000..f5278e6
--- /dev/null
+++ b/app/api/parse-url/route.ts
@@ -0,0 +1,219 @@
+import { extract } from "@extractus/article-extractor"
+import { NextResponse } from "next/server"
+import TurndownService from "turndown"
+
+const MAX_CONTENT_LENGTH = 150000 // Match PDF limit
+const EXTRACT_TIMEOUT_MS = 15000
+
+/**
+ * Reports whether an IPv4 address, identified by its first two octets, lies
+ * in a range the server must never fetch from.
+ */
+function isBlockedIpv4(a: number, b: number): boolean {
+    if (a === 0) return true // 0.0.0.0/8 ("this network")
+    if (a === 10) return true // 10.0.0.0/8
+    if (a === 100 && b >= 64 && b <= 127) return true // 100.64.0.0/10 (CGNAT)
+    if (a === 127) return true // 127.0.0.0/8 (loopback)
+    if (a === 169 && b === 254) return true // 169.254.0.0/16 (link-local)
+    if (a === 172 && b >= 16 && b <= 31) return true // 172.16.0.0/12
+    if (a === 192 && b === 168) return true // 192.168.0.0/16
+    return false
+}
+
+/**
+ * Reports whether a lower-case IPv6 literal (without brackets) is loopback,
+ * unspecified, unique-local, link-local, or an IPv4-mapped form of a blocked
+ * IPv4 address.
+ */
+function isBlockedIpv6(host: string): boolean {
+    if (host === "::" || host === "::1") return true
+    if (/^f[cd][0-9a-f]{2}:/.test(host)) return true // fc00::/7 (unique local)
+    if (/^fe[89ab][0-9a-f]:/.test(host)) return true // fe80::/10 (link-local)
+
+    // The URL parser serialises ::ffff:127.0.0.1 as ::ffff:7f00:1, so the
+    // embedded IPv4 octets are recovered from the first hex group.
+    const mapped = host.match(/^::ffff:([0-9a-f]{1,4}):[0-9a-f]{1,4}$/)
+    if (mapped) {
+        const high = Number.parseInt(mapped[1] ?? "", 16)
+        return isBlockedIpv4(high >> 8, high & 0xff)
+    }
+    return false
+}
+
+/**
+ * SSRF guard: reports whether `urlString` must not be fetched by the server.
+ *
+ * Blocks unparseable URLs, non-HTTP(S) schemes, loopback / private /
+ * link-local IP literals (IPv4 and IPv6), the cloud metadata hosts, and
+ * common internal hostname suffixes.
+ *
+ * NOTE(review): only the literal host is inspected. A public DNS name that
+ * resolves to a private address, or a public page that redirects to one, is
+ * not caught here.
+ */
+function isPrivateUrl(urlString: string): boolean {
+    let url: URL
+    try {
+        url = new URL(urlString)
+    } catch {
+        return true // Invalid URL - block it
+    }
+
+    // Only plain web URLs are allowed (rejects file:, ftp:, data:, ...)
+    if (url.protocol !== "http:" && url.protocol !== "https:") {
+        return true
+    }
+
+    // URL.hostname keeps the [] around IPv6 literals and any trailing root
+    // dot ("localhost."), so strip both before comparing.
+    const hostname = url.hostname
+        .toLowerCase()
+        .replace(/^\[|\]$/g, "")
+        .replace(/\.+$/, "")
+
+    if (hostname === "") return true
+
+    // IPv6 literal
+    if (hostname.includes(":")) {
+        return isBlockedIpv6(hostname)
+    }
+
+    // IPv4 literal; for http(s) the URL parser has already normalised
+    // shorthand such as 2130706433 or 0x7f.1 to dotted-quad form.
+    const ipv4Match = hostname.match(
+        /^(\d{1,3})\.(\d{1,3})\.(\d{1,3})\.(\d{1,3})$/,
+    )
+    if (ipv4Match) {
+        const [, a, b] = ipv4Match.map(Number)
+        return isBlockedIpv4(a, b)
+    }
+
+    // Block localhost, cloud metadata and common internal hostnames
+    return (
+        hostname === "localhost" ||
+        hostname === "metadata.google.internal" ||
+        hostname.endsWith(".local") ||
+        hostname.endsWith(".internal") ||
+        hostname.endsWith(".localhost")
+    )
+}
+
+/** True for the error raised when the AbortController cancels the fetch. */
+function isAbortError(err: unknown): boolean {
+    return (
+        typeof err === "object" &&
+        err !== null &&
+        "name" in err &&
+        err.name === "AbortError"
+    )
+}
+
+/**
+ * POST /api/parse-url
+ *
+ * Expects a JSON body `{ url: string }`, extracts the article at that URL
+ * and returns `{ title, content, charCount }` with the content as Markdown.
+ * Responds 400 for bad input or unextractable/oversized pages, 504 when the
+ * extraction exceeds EXTRACT_TIMEOUT_MS, and 500 for any other failure.
+ */
+export async function POST(req: Request) {
+    try {
+        // A malformed or non-object JSON body is a client error, not a 500
+        const body: unknown = await req.json().catch(() => null)
+        const url =
+            typeof body === "object" && body !== null && "url" in body
+                ? body.url
+                : undefined
+
+        if (!url || typeof url !== "string") {
+            return NextResponse.json(
+                { error: "URL is required" },
+                { status: 400 },
+            )
+        }
+
+        // Validate URL format
+        try {
+            new URL(url)
+        } catch {
+            return NextResponse.json(
+                { error: "Invalid URL format" },
+                { status: 400 },
+            )
+        }
+
+        // SSRF protection
+        if (isPrivateUrl(url)) {
+            return NextResponse.json(
+                { error: "Cannot access private/internal URLs" },
+                { status: 400 },
+            )
+        }
+
+        // Extract article content with timeout to avoid tying up server resources
+        const controller = new AbortController()
+        const timeoutId = setTimeout(() => {
+            controller.abort()
+        }, EXTRACT_TIMEOUT_MS)
+
+        let article
+        try {
+            article = await extract(url, undefined, {
+                headers: {
+                    "User-Agent": "Mozilla/5.0 (compatible; NextAIDrawio/1.0)",
+                },
+                signal: controller.signal,
+            })
+        } catch (err: unknown) {
+            if (isAbortError(err)) {
+                return NextResponse.json(
+                    { error: "Timed out while fetching URL content" },
+                    { status: 504 },
+                )
+            }
+            throw err
+        } finally {
+            clearTimeout(timeoutId)
+        }
+
+        if (!article || !article.content) {
+            return NextResponse.json(
+                { error: "Could not extract content from URL" },
+                { status: 400 },
+            )
+        }
+
+        // Convert HTML to Markdown
+        const turndownService = new TurndownService({
+            headingStyle: "atx",
+            codeBlockStyle: "fenced",
+        })
+
+        // Remove unwanted elements before conversion
+        turndownService.remove(["script", "style", "iframe", "noscript"])
+
+        const markdown = turndownService.turndown(article.content)
+
+        // Check content length
+        if (markdown.length > MAX_CONTENT_LENGTH) {
+            return NextResponse.json(
+                {
+                    error: `Content exceeds ${MAX_CONTENT_LENGTH / 1000}k character limit (${(markdown.length / 1000).toFixed(1)}k chars)`,
+                },
+                { status: 400 },
+            )
+        }
+
+        return NextResponse.json({
+            title: article.title || "Untitled",
+            content: markdown,
+            charCount: markdown.length,
+        })
+    } catch (error) {
+        console.error("URL extraction error:", error)
+        return NextResponse.json(
+            { error: "Failed to fetch or parse URL content" },
+            { status: 500 },
+        )
+    }
+}
diff --git a/components/chat-input.tsx b/components/chat-input.tsx index 6a718a8..7ba67a0 100644 --- a/components/chat-input.tsx +++ b/components/chat-input.tsx @@ -4,6 +4,7 @@ import { Download, 
History, Image as ImageIcon, + Link, Loader2, Send, } from "lucide-react" @@ -18,11 +19,13 @@ import { SaveDialog } from "@/components/save-dialog" import { Button } from "@/components/ui/button" import { Textarea } from "@/components/ui/textarea" +import { UrlInputDialog } from "@/components/url-input-dialog" import { useDiagram } from "@/contexts/diagram-context" import { useDictionary } from "@/hooks/use-dictionary" import { formatMessage } from "@/lib/i18n/utils" import { isPdfFile, isTextFile } from "@/lib/pdf-utils" import type { FlattenedModel } from "@/lib/types/model-config" +import { extractUrlContent, type UrlData } from "@/lib/url-utils" import { FilePreviewList } from "./file-preview-list" const MAX_IMAGE_SIZE = 2 * 1024 * 1024 // 2MB @@ -144,6 +147,8 @@ interface ChatInputProps { File, { text: string; charCount: number; isExtracting: boolean } > + urlData?: Map + onUrlChange?: (data: Map) => void sessionId?: string error?: Error | null @@ -163,6 +168,8 @@ export function ChatInput({ files = [], onFileChange = () => {}, pdfData = new Map(), + urlData, + onUrlChange, sessionId, error = null, models = [], @@ -183,6 +190,8 @@ export function ChatInput({ const fileInputRef = useRef(null) const [isDragging, setIsDragging] = useState(false) const [showHistory, setShowHistory] = useState(false) + const [showUrlDialog, setShowUrlDialog] = useState(false) + const [isExtractingUrl, setIsExtractingUrl] = useState(false) // Allow retry when there's an error (even if status is still "streaming" or "submitted") const isDisabled = (status === "streaming" || status === "submitted") && !error @@ -312,6 +321,44 @@ export function ChatInput({ } } + const handleUrlExtract = async (url: string) => { + if (!onUrlChange) return + + setIsExtractingUrl(true) + + try { + const existing = urlData + ? 
new Map(urlData) + : new Map() + existing.set(url, { + url, + title: url, + content: "", + charCount: 0, + isExtracting: true, + }) + onUrlChange(existing) + + const data = await extractUrlContent(url) + + const newUrlData = new Map(existing) + newUrlData.set(url, data) + onUrlChange(newUrlData) + + setShowUrlDialog(false) + } catch (error) { + showErrorToast( + + {error instanceof Error + ? error.message + : "Failed to extract URL content"} + , + ) + } finally { + setIsExtractingUrl(false) + } + } + return (
- {/* File previews */} - {files.length > 0 && ( + {/* File & URL previews */} + {(files.length > 0 || (urlData && urlData.size > 0)) && (
{ + const next = new Map(urlData) + next.delete(url) + onUrlChange(next) + } + : undefined + } />
)} @@ -385,6 +442,20 @@ export function ChatInput({ + {onUrlChange && ( + setShowUrlDialog(true)} + disabled={isDisabled} + tooltipContent={dict.chat.ExtractURL} + className="h-8 w-8 p-0 text-muted-foreground hover:text-foreground" + > + + + )} + + {onUrlChange && ( + + )} ) } diff --git a/components/chat-panel.tsx b/components/chat-panel.tsx index acdb81d..0dda333 100644 --- a/components/chat-panel.tsx +++ b/components/chat-panel.tsx @@ -34,6 +34,7 @@ import { findCachedResponse } from "@/lib/cached-responses" import { formatMessage } from "@/lib/i18n/utils" import { isPdfFile, isTextFile } from "@/lib/pdf-utils" import { sanitizeMessages } from "@/lib/session-storage" +import type { UrlData } from "@/lib/url-utils" import { type FileData, useFileProcessor } from "@/lib/use-file-processor" import { useQuotaManager } from "@/lib/use-quota-manager" import { cn, formatXML, isRealDiagram } from "@/lib/utils" @@ -158,6 +159,7 @@ export default function ChatPanel({ // File processing using extracted hook const { files, pdfData, handleFileChange, setFiles } = useFileProcessor() + const [urlData, setUrlData] = useState>(new Map()) const [showSettingsDialog, setShowSettingsDialog] = useState(false) const [showModelConfigDialog, setShowModelConfigDialog] = useState(false) @@ -710,6 +712,8 @@ export default function ChatPanel({ input, files, pdfData, + undefined, + urlData, ) setMessages([ @@ -735,6 +739,7 @@ export default function ChatPanel({ setInput("") sessionStorage.removeItem(SESSION_STORAGE_INPUT_KEY) setFiles([]) + setUrlData(new Map()) return } } @@ -755,6 +760,7 @@ export default function ChatPanel({ files, pdfData, parts, + urlData, ) // Add the combined text as the first part @@ -779,6 +785,7 @@ export default function ChatPanel({ setInput("") sessionStorage.removeItem(SESSION_STORAGE_INPUT_KEY) setFiles([]) + setUrlData(new Map()) } catch (error) { console.error("Error fetching chart data:", error) } @@ -854,6 +861,7 @@ export default function ChatPanel({ 
clearDiagram() setDiagramHistory([]) handleFileChange([]) // Use handleFileChange to also clear pdfData + setUrlData(new Map()) const newSessionId = `session-${Date.now()}-${Math.random() .toString(36) .slice(2, 9)}` @@ -972,6 +980,7 @@ export default function ChatPanel({ files: File[], pdfData: Map, imageParts?: any[], + urlDataParam?: Map, ): Promise => { let userText = baseText @@ -1002,6 +1011,14 @@ export default function ChatPanel({ } } + if (urlDataParam) { + for (const [url, data] of urlDataParam) { + if (data.content) { + userText += `\n\n[URL: ${url}]\nTitle: ${data.title}\n\n${data.content}` + } + } + } + return userText } @@ -1264,6 +1281,8 @@ export default function ChatPanel({ files={files} onFileChange={handleFileChange} pdfData={pdfData} + urlData={urlData} + onUrlChange={setUrlData} sessionId={sessionId} error={error} models={modelConfig.models} diff --git a/components/file-preview-list.tsx b/components/file-preview-list.tsx index 509873e..06d4a4f 100644 --- a/components/file-preview-list.tsx +++ b/components/file-preview-list.tsx @@ -1,6 +1,6 @@ "use client" -import { FileCode, FileText, Loader2, X } from "lucide-react" +import { FileCode, FileText, Link, Loader2, X } from "lucide-react" import Image from "next/image" import { useEffect, useRef, useState } from "react" import { useDictionary } from "@/hooks/use-dictionary" @@ -20,12 +20,19 @@ interface FilePreviewListProps { File, { text: string; charCount: number; isExtracting: boolean } > + urlData?: Map< + string, + { url: string; title: string; charCount: number; isExtracting: boolean } + > + onRemoveUrl?: (url: string) => void } export function FilePreviewList({ files, onRemoveFile, pdfData = new Map(), + urlData, + onRemoveUrl, }: FilePreviewListProps) { const dict = useDictionary() const [selectedImage, setSelectedImage] = useState(null) @@ -77,7 +84,7 @@ export function FilePreviewList({ } }, [imageUrls, selectedImage]) - if (files.length === 0) return null + if (files.length === 0 && 
(!urlData || urlData.size === 0)) return null return ( <> @@ -152,6 +159,59 @@ export function FilePreviewList({ ) })} + {/* URL previews */} + {urlData && urlData.size > 0 && ( +
+ {Array.from(urlData.entries()).map( + ([url, data], index) => ( +
+
+
+ {data.isExtracting ? ( + <> + + + {dict.file.reading} + + + ) : ( + <> + + + {data.title.length > 10 + ? `${data.title.slice(0, 7)}...` + : data.title} + + {data.charCount && ( + + {formatCharCount( + data.charCount, + )}{" "} + {dict.file.chars} + + )} + + )} +
+
+ {onRemoveUrl && ( + + )} +
+ ), + )} +
+ )} {/* Image Modal/Lightbox */} {selectedImage && ( diff --git a/components/url-input-dialog.tsx b/components/url-input-dialog.tsx new file mode 100644 index 0000000..8d9e339 --- /dev/null +++ b/components/url-input-dialog.tsx @@ -0,0 +1,116 @@ +"use client" + +import { Link, Loader2 } from "lucide-react" +import { useState } from "react" +import { Button } from "@/components/ui/button" +import { + Dialog, + DialogContent, + DialogDescription, + DialogFooter, + DialogHeader, + DialogTitle, +} from "@/components/ui/dialog" +import { Input } from "@/components/ui/input" +import { useDictionary } from "@/hooks/use-dictionary" + +interface UrlInputDialogProps { + open: boolean + onOpenChange: (open: boolean) => void + onSubmit: (url: string) => void + isExtracting: boolean +} + +export function UrlInputDialog({ + open, + onOpenChange, + onSubmit, + isExtracting, +}: UrlInputDialogProps) { + const dict = useDictionary() + const [url, setUrl] = useState("") + const [error, setError] = useState("") + + const handleSubmit = () => { + setError("") + + if (!url.trim()) { + setError(dict.url.enterUrl) + return + } + + try { + new URL(url) + } catch { + setError(dict.url.invalidFormat) + return + } + + onSubmit(url.trim()) + } + + const handleKeyDown = (e: React.KeyboardEvent) => { + if (e.key === "Enter" && !isExtracting) { + e.preventDefault() + handleSubmit() + } + } + + return ( + + + + {dict.url.title} + + {dict.url.description} + + + +
+
+ { + setUrl(e.target.value) + setError("") + }} + onKeyDown={handleKeyDown} + placeholder="https://example.com/article" + disabled={isExtracting} + autoFocus + /> + {error && ( +

{error}

+ )} +
+
+ + + + + +
+
+ ) +} diff --git a/lib/i18n/dictionaries/en.json b/lib/i18n/dictionaries/en.json index dba4a9c..1d845c7 100644 --- a/lib/i18n/dictionaries/en.json +++ b/lib/i18n/dictionaries/en.json @@ -51,7 +51,8 @@ "badResponse": "Bad response", "clickToEdit": "Click to edit", "editMessage": "Edit message", - "saveAndSubmit": "Save & Submit" + "saveAndSubmit": "Save & Submit", + "ExtractURL": "Extract from URL" }, "examples": { "title": "Create diagrams with AI", @@ -186,6 +187,15 @@ "chars": "chars", "removeFile": "Remove file" }, + "url": { + "title": "Extract Content from URL", + "description": "Paste a URL to extract and analyze its content", + "Extracting": "Extracting...", + "extract": "Extract", + "Cancel": "Cancel", + "enterUrl": "Please enter a URL", + "invalidFormat": "Invalid URL format" + }, "reasoning": { "thinking": "Thinking...", "thoughtFor": "Thought for {duration} seconds", diff --git a/lib/i18n/dictionaries/ja.json b/lib/i18n/dictionaries/ja.json index 0a6cd0e..f515f0c 100644 --- a/lib/i18n/dictionaries/ja.json +++ b/lib/i18n/dictionaries/ja.json @@ -51,7 +51,8 @@ "badResponse": "悪い応答", "clickToEdit": "クリックして編集", "editMessage": "メッセージを編集", - "saveAndSubmit": "保存して送信" + "saveAndSubmit": "保存して送信", + "ExtractURL": "URLから抽出" }, "examples": { "title": "AI でダイアグラムを作成", @@ -186,6 +187,15 @@ "chars": "文字", "removeFile": "ファイルを削除" }, + "url": { + "title": "URLからコンテンツを抽出", + "description": "URLを貼り付けてそのコンテンツを抽出および分析します", + "Extracting": "抽出中...", + "extract": "抽出", + "Cancel": "キャンセル", + "enterUrl": "URLを入力してください", + "invalidFormat": "無効なURL形式です" + }, "reasoning": { "thinking": "考え中...", "thoughtFor": "{duration} 秒考えました", diff --git a/lib/i18n/dictionaries/zh.json b/lib/i18n/dictionaries/zh.json index 9fca336..adcbfa1 100644 --- a/lib/i18n/dictionaries/zh.json +++ b/lib/i18n/dictionaries/zh.json @@ -51,7 +51,8 @@ "badResponse": "无帮助", "clickToEdit": "点击编辑", "editMessage": "编辑消息", - "saveAndSubmit": "保存并提交" + "saveAndSubmit": "保存并提交", + "ExtractURL": "从 URL 提取" }, 
"examples": { "title": "用 AI 创建图表", @@ -186,6 +187,15 @@ "chars": "字符", "removeFile": "移除文件" }, + "url": { + "title": "从 URL 提取内容", + "description": "粘贴 URL 以提取和分析其内容", + "Extracting": "提取中...", + "extract": "提取", + "Cancel": "取消", + "enterUrl": "请输入 URL", + "invalidFormat": "URL 格式无效" + }, "reasoning": { "thinking": "思考中...", "thoughtFor": "思考了 {duration} 秒", 
diff --git a/lib/url-utils.ts b/lib/url-utils.ts
new file mode 100644
index 0000000..85f83bc
--- /dev/null
+++ b/lib/url-utils.ts
@@ -0,0 +1,59 @@
+import { z } from "zod"
+
+/** Extracted content for a single URL. */
+export interface UrlData {
+    url: string
+    title: string
+    content: string
+    charCount: number
+    isExtracting: boolean
+}
+
+// Shape of a successful /api/parse-url response body
+const UrlResponseSchema = z.object({
+    title: z.string().default("Untitled"),
+    content: z.string(),
+    charCount: z.number().int().nonnegative(),
+})
+
+/**
+ * Asks the /api/parse-url endpoint to extract the content behind `url`.
+ *
+ * @param url - Address of the page to extract.
+ * @returns The extracted data, with `isExtracting` set to false.
+ * @throws Error with the server's `error` text when the response is not OK,
+ *   or a fixed message when the payload fails schema validation.
+ */
+export async function extractUrlContent(url: string): Promise<UrlData> {
+    const response = await fetch("/api/parse-url", {
+        method: "POST",
+        headers: { "Content-Type": "application/json" },
+        body: JSON.stringify({ url }),
+    })
+
+    // Try to parse JSON once
+    const raw: unknown = await response
+        .json()
+        .catch(() => ({ error: "Unexpected non-JSON response" }))
+
+    if (!response.ok) {
+        const message =
+            typeof raw === "object" && raw !== null && "error" in raw
+                ? String(raw.error)
+                : "Failed to extract URL content"
+        throw new Error(message)
+    }
+
+    const parsed = UrlResponseSchema.safeParse(raw)
+    if (!parsed.success) {
+        throw new Error("Malformed response from URL extraction API")
+    }
+
+    return {
+        url,
+        title: parsed.data.title,
+        content: parsed.data.content,
+        charCount: parsed.data.charCount,
+        isExtracting: false,
+    }
+}
diff --git a/package-lock.json b/package-lock.json index f5b23b8..4278de8 100644 --- a/package-lock.json +++ b/package-lock.json @@ -19,6 +19,7 @@ "@ai-sdk/react": "^3.0.1", "@aws-sdk/client-dynamodb": "^3.957.0", "@aws-sdk/credential-providers": "^3.943.0", + "@extractus/article-extractor": "^8.0.18", "@formatjs/intl-localematcher": "^0.7.2", "@langfuse/client": "^4.4.9", "@langfuse/otel": "^4.4.4", @@ -66,6 +67,7 @@ "sonner": "^2.0.7", "tailwind-merge": "^3.0.2", "tailwindcss-animate": "^1.0.7", + "turndown": "^7.2.0", "unpdf": "^1.4.0", "zod": "^4.1.12" }, @@ -83,6 +85,7 @@ "@types/pako": "^2.0.3", "@types/react": "^19", "@types/react-dom": "^19", + "@types/turndown": "^5.0.6", "@vitejs/plugin-react": "^5.1.2", "@vitest/coverage-v8": "^4.0.16", "concurrently": "^9.2.1", @@ -6428,6 +6431,22 @@ } } }, + "node_modules/@extractus/article-extractor": { + "version": "8.0.20", + "resolved": "https://registry.npmjs.org/@extractus/article-extractor/-/article-extractor-8.0.20.tgz", + "integrity": "sha512-oxHLZ3X5ctLVkQfFkOLf8afvQq6aJ2VBxwQhAaV6ZypaaMJboFz8uwpCGy7QBehmQIvzgWhCwuu8j4ayJFvPcg==", + "license": "MIT", + "dependencies": { + "@mozilla/readability": "^0.6.0", + "@ndaidong/bellajs": "^12.0.1", + "cross-fetch": "^4.1.0", + "linkedom": "^0.18.12", + "sanitize-html": "2.17.0" + }, + "engines": { + "node": ">= 20" + } + }, "node_modules/@floating-ui/core": { "version": "1.6.9", "resolved": "https://registry.npmjs.org/@floating-ui/core/-/core-1.6.9.tgz", @@ -7338,6 +7357,21 @@ "node": ">= 10.0.0" } }, + "node_modules/@mixmark-io/domino": { + "version": "2.2.0", + "resolved": 
"https://registry.npmjs.org/@mixmark-io/domino/-/domino-2.2.0.tgz", + "integrity": "sha512-Y28PR25bHXUg88kCV7nivXrP2Nj2RueZ3/l/jdx6J9f8J4nsEGcgX0Qe6lt7Pa+J79+kPiJU3LguR6O/6zrLOw==", + "license": "BSD-2-Clause" + }, + "node_modules/@mozilla/readability": { + "version": "0.6.0", + "resolved": "https://registry.npmjs.org/@mozilla/readability/-/readability-0.6.0.tgz", + "integrity": "sha512-juG5VWh4qAivzTAeMzvY9xs9HY5rAcr2E4I7tiSSCokRFi7XIZCAu92ZkSTsIj1OPceCifL3cpfteP3pDT9/QQ==", + "license": "Apache-2.0", + "engines": { + "node": ">=14.0.0" + } + }, "node_modules/@napi-rs/wasm-runtime": { "version": "0.2.12", "resolved": "https://registry.npmjs.org/@napi-rs/wasm-runtime/-/wasm-runtime-0.2.12.tgz", @@ -7351,6 +7385,12 @@ "@tybys/wasm-util": "^0.10.0" } }, + "node_modules/@ndaidong/bellajs": { + "version": "12.0.1", + "resolved": "https://registry.npmjs.org/@ndaidong/bellajs/-/bellajs-12.0.1.tgz", + "integrity": "sha512-1iY42uiHz0cxNMbde7O3zVN+ZX1viOOUOBRt6ht6lkRZbSjwOnFV34Zv4URp3hGzEe6L9Byk7BOq/41H0PzAOQ==", + "license": "MIT" + }, "node_modules/@next/env": { "version": "16.1.1", "resolved": "https://registry.npmjs.org/@next/env/-/env-16.1.1.tgz", @@ -11443,6 +11483,13 @@ "@types/node": "*" } }, + "node_modules/@types/turndown": { + "version": "5.0.6", + "resolved": "https://registry.npmjs.org/@types/turndown/-/turndown-5.0.6.tgz", + "integrity": "sha512-ru00MoyeeouE5BX4gRL+6m/BsDfbRayOskWqUvh7CLGW+UXxHQItqALa38kKnOiZPqJrtzJUgAC2+F0rL1S4Pg==", + "dev": true, + "license": "MIT" + }, "node_modules/@types/unist": { "version": "3.0.3", "resolved": "https://registry.npmjs.org/@types/unist/-/unist-3.0.3.tgz", @@ -12983,6 +13030,12 @@ "url": "https://opencollective.com/express" } }, + "node_modules/boolbase": { + "version": "1.0.0", + "resolved": "https://registry.npmjs.org/boolbase/-/boolbase-1.0.0.tgz", + "integrity": "sha512-JZOSA7Mo9sNGB8+UjSgzdLtokWAky1zbztM3WRLCbZ70/3cTANmQmOdR7y2g+J0e2WXywy1yS468tY+IruqEww==", + "license": "ISC" + }, "node_modules/boolean": { 
"version": "3.2.0", "resolved": "https://registry.npmmirror.com/boolean/-/boolean-3.2.0.tgz", @@ -14028,6 +14081,15 @@ "node": ">=20" } }, + "node_modules/cross-fetch": { + "version": "4.1.0", + "resolved": "https://registry.npmjs.org/cross-fetch/-/cross-fetch-4.1.0.tgz", + "integrity": "sha512-uKm5PU+MHTootlWEY+mZ4vvXoCn4fLQxT9dSc1sXVMSFkINTJVN8cAQROpwcKm8bJ/c7rgZVIBWzH5T78sNZZw==", + "license": "MIT", + "dependencies": { + "node-fetch": "^2.7.0" + } + }, "node_modules/cross-spawn": { "version": "7.0.6", "resolved": "https://registry.npmjs.org/cross-spawn/-/cross-spawn-7.0.6.tgz", @@ -14042,6 +14104,22 @@ "node": ">= 8" } }, + "node_modules/css-select": { + "version": "5.2.2", + "resolved": "https://registry.npmjs.org/css-select/-/css-select-5.2.2.tgz", + "integrity": "sha512-TizTzUddG/xYLA3NXodFM0fSbNizXjOKhqiQQwvhlspadZokn1KDy0NZFS0wuEubIYAV5/c1/lAr0TaaFXEXzw==", + "license": "BSD-2-Clause", + "dependencies": { + "boolbase": "^1.0.0", + "css-what": "^6.1.0", + "domhandler": "^5.0.2", + "domutils": "^3.0.1", + "nth-check": "^2.0.1" + }, + "funding": { + "url": "https://github.com/sponsors/fb55" + } + }, "node_modules/css-tree": { "version": "3.1.0", "resolved": "https://registry.npmjs.org/css-tree/-/css-tree-3.1.0.tgz", @@ -14056,6 +14134,18 @@ "node": "^10 || ^12.20.0 || ^14.13.0 || >=15.0.0" } }, + "node_modules/css-what": { + "version": "6.2.2", + "resolved": "https://registry.npmjs.org/css-what/-/css-what-6.2.2.tgz", + "integrity": "sha512-u/O3vwbptzhMs3L1fQE82ZSLHQQfto5gyZzwteVIEyeaY5Fc7R4dapF/BvRoSYFeqfBk4m0V1Vafq5Pjv25wvA==", + "license": "BSD-2-Clause", + "engines": { + "node": ">= 6" + }, + "funding": { + "url": "https://github.com/sponsors/fb55" + } + }, "node_modules/cssesc": { "version": "3.0.0", "resolved": "https://registry.npmjs.org/cssesc/-/cssesc-3.0.0.tgz", @@ -14069,6 +14159,12 @@ "node": ">=4" } }, + "node_modules/cssom": { + "version": "0.5.0", + "resolved": "https://registry.npmjs.org/cssom/-/cssom-0.5.0.tgz", + "integrity": 
"sha512-iKuQcq+NdHqlAcwUY0o/HL69XQrUaQdMjmStJ8JFmUaiiQErlhrmuigkg/CU4E2J0IyUKUrMAgl36TvN67MqTw==", + "license": "MIT" + }, "node_modules/cssstyle": { "version": "5.3.5", "resolved": "https://registry.npmjs.org/cssstyle/-/cssstyle-5.3.5.tgz", @@ -14238,6 +14334,15 @@ "dev": true, "license": "MIT" }, + "node_modules/deepmerge": { + "version": "4.3.1", + "resolved": "https://registry.npmjs.org/deepmerge/-/deepmerge-4.3.1.tgz", + "integrity": "sha512-3sUqbMEc77XqpdNO7FRyRog+eW3ph+GYCbj+rK+uYyRMuwsVy0rMiVtPn+QJlKFvWP/1PYpapqYn0Me2knFn+A==", + "license": "MIT", + "engines": { + "node": ">=0.10.0" + } + }, "node_modules/defaults": { "version": "1.0.4", "resolved": "https://registry.npmmirror.com/defaults/-/defaults-1.0.4.tgz", @@ -14474,6 +14579,73 @@ "dev": true, "license": "MIT" }, + "node_modules/dom-serializer": { + "version": "2.0.0", + "resolved": "https://registry.npmjs.org/dom-serializer/-/dom-serializer-2.0.0.tgz", + "integrity": "sha512-wIkAryiqt/nV5EQKqQpo3SToSOV9J0DnbJqwK7Wv/Trc92zIAYZ4FlMu+JPFW1DfGFt81ZTCGgDEabffXeLyJg==", + "license": "MIT", + "dependencies": { + "domelementtype": "^2.3.0", + "domhandler": "^5.0.2", + "entities": "^4.2.0" + }, + "funding": { + "url": "https://github.com/cheeriojs/dom-serializer?sponsor=1" + } + }, + "node_modules/dom-serializer/node_modules/entities": { + "version": "4.5.0", + "resolved": "https://registry.npmjs.org/entities/-/entities-4.5.0.tgz", + "integrity": "sha512-V0hjH4dGPh9Ao5p0MoRY6BVqtwCjhz6vI5LT8AJ55H+4g9/4vbHx1I54fS0XuclLhDHArPQCiMjDxjaL8fPxhw==", + "license": "BSD-2-Clause", + "engines": { + "node": ">=0.12" + }, + "funding": { + "url": "https://github.com/fb55/entities?sponsor=1" + } + }, + "node_modules/domelementtype": { + "version": "2.3.0", + "resolved": "https://registry.npmjs.org/domelementtype/-/domelementtype-2.3.0.tgz", + "integrity": "sha512-OLETBj6w0OsagBwdXnPdN0cnMfF9opN69co+7ZrbfPGrdpPVNBUj02spi6B1N7wChLQiPn4CSH/zJvXw56gmHw==", + "funding": [ + { + "type": "github", + "url": 
"https://github.com/sponsors/fb55" + } + ], + "license": "BSD-2-Clause" + }, + "node_modules/domhandler": { + "version": "5.0.3", + "resolved": "https://registry.npmjs.org/domhandler/-/domhandler-5.0.3.tgz", + "integrity": "sha512-cgwlv/1iFQiFnU96XXgROh8xTeetsnJiDsTc7TYCLFd9+/WNkIqPTxiM/8pSd8VIrhXGTf1Ny1q1hquVqDJB5w==", + "license": "BSD-2-Clause", + "dependencies": { + "domelementtype": "^2.3.0" + }, + "engines": { + "node": ">= 4" + }, + "funding": { + "url": "https://github.com/fb55/domhandler?sponsor=1" + } + }, + "node_modules/domutils": { + "version": "3.2.2", + "resolved": "https://registry.npmjs.org/domutils/-/domutils-3.2.2.tgz", + "integrity": "sha512-6kZKyUajlDuqlHKVX1w7gyslj9MPIXzIFiz/rGu35uC1wMi+kMhQwGhl4lt9unC9Vb9INnY9Z3/ZA3+FhASLaw==", + "license": "BSD-2-Clause", + "dependencies": { + "dom-serializer": "^2.0.0", + "domelementtype": "^2.3.0", + "domhandler": "^5.0.3" + }, + "funding": { + "url": "https://github.com/fb55/domutils?sponsor=1" + } + }, "node_modules/dotenv": { "version": "16.6.1", "resolved": "https://registry.npmmirror.com/dotenv/-/dotenv-16.6.1.tgz", @@ -14850,7 +15022,6 @@ "version": "6.0.1", "resolved": "https://registry.npmjs.org/entities/-/entities-6.0.1.tgz", "integrity": "sha512-aN97NXWF6AWBTahfVOIrB/NShkzi5H7F9r1s9mD3cDj4Ko5f2qhhVoYMibXF7GlLveb/D2ioWay8lxI97Ven3g==", - "dev": true, "license": "BSD-2-Clause", "engines": { "node": ">=0.12" @@ -16859,6 +17030,25 @@ "url": "https://opencollective.com/unified" } }, + "node_modules/htmlparser2": { + "version": "10.0.0", + "resolved": "https://registry.npmjs.org/htmlparser2/-/htmlparser2-10.0.0.tgz", + "integrity": "sha512-TwAZM+zE5Tq3lrEHvOlvwgj1XLWQCtaaibSN11Q+gGBAS7Y1uZSWwXXRe4iF6OXnaq1riyQAPFOBtYc77Mxq0g==", + "funding": [ + "https://github.com/fb55/htmlparser2?sponsor=1", + { + "type": "github", + "url": "https://github.com/sponsors/fb55" + } + ], + "license": "MIT", + "dependencies": { + "domelementtype": "^2.3.0", + "domhandler": "^5.0.3", + "domutils": "^3.2.1", + "entities": 
"^6.0.0" + } + }, "node_modules/http-cache-semantics": { "version": "4.2.0", "resolved": "https://registry.npmmirror.com/http-cache-semantics/-/http-cache-semantics-4.2.0.tgz", @@ -17600,6 +17790,15 @@ "url": "https://github.com/sponsors/sindresorhus" } }, + "node_modules/is-plain-object": { + "version": "5.0.0", + "resolved": "https://registry.npmjs.org/is-plain-object/-/is-plain-object-5.0.0.tgz", + "integrity": "sha512-VRSzKkbMm5jMDoKLbltAkFQ5Qr7VDiTFGXxYFXXowVj387GeGNOCsOH6Msy00SGZ3Fp84b1Naa1psqgcCIEP5Q==", + "license": "MIT", + "engines": { + "node": ">=0.10.0" + } + }, "node_modules/is-potential-custom-element-name": { "version": "1.0.1", "resolved": "https://registry.npmjs.org/is-potential-custom-element-name/-/is-potential-custom-element-name-1.0.1.tgz", @@ -18417,6 +18616,36 @@ "url": "https://opencollective.com/parcel" } }, + "node_modules/linkedom": { + "version": "0.18.12", + "resolved": "https://registry.npmjs.org/linkedom/-/linkedom-0.18.12.tgz", + "integrity": "sha512-jalJsOwIKuQJSeTvsgzPe9iJzyfVaEJiEXl+25EkKevsULHvMJzpNqwvj1jOESWdmgKDiXObyjOYwlUqG7wo1Q==", + "license": "ISC", + "dependencies": { + "css-select": "^5.1.0", + "cssom": "^0.5.0", + "html-escaper": "^3.0.3", + "htmlparser2": "^10.0.0", + "uhyphen": "^0.2.0" + }, + "engines": { + "node": ">=16" + }, + "peerDependencies": { + "canvas": ">= 2" + }, + "peerDependenciesMeta": { + "canvas": { + "optional": true + } + } + }, + "node_modules/linkedom/node_modules/html-escaper": { + "version": "3.0.3", + "resolved": "https://registry.npmjs.org/html-escaper/-/html-escaper-3.0.3.tgz", + "integrity": "sha512-RuMffC89BOWQoY0WKGpIhn5gX3iI54O6nRA0yC124NYVtzjmFWBIiFd8M0x+ZdX0P9R4lADg1mgP8C7PxGOWuQ==", + "license": "MIT" + }, "node_modules/lint-staged": { "version": "16.2.7", "resolved": "https://registry.npmjs.org/lint-staged/-/lint-staged-16.2.7.tgz", @@ -20705,6 +20934,18 @@ "node": ">=4" } }, + "node_modules/nth-check": { + "version": "2.1.1", + "resolved": 
"https://registry.npmjs.org/nth-check/-/nth-check-2.1.1.tgz", + "integrity": "sha512-lqjrjmaOoAnWfMmBPL+XNnynZh2+swxiX3WUE0s4yEHI6m+AwrK2UZOimIRl3X/4QctVqS8AiZjFqyOGrMXb/w==", + "license": "BSD-2-Clause", + "dependencies": { + "boolbase": "^1.0.0" + }, + "funding": { + "url": "https://github.com/fb55/nth-check?sponsor=1" + } + }, "node_modules/object-assign": { "version": "4.1.1", "resolved": "https://registry.npmjs.org/object-assign/-/object-assign-4.1.1.tgz", @@ -21158,6 +21399,12 @@ "integrity": "sha512-CmBKiL6NNo/OqgmMn95Fk9Whlp2mtvIv+KNpQKN2F4SjvrEesubTRWGYSg+BnWZOnlCaSTU1sMpsBOzgbYhnsA==", "license": "MIT" }, + "node_modules/parse-srcset": { + "version": "1.0.2", + "resolved": "https://registry.npmjs.org/parse-srcset/-/parse-srcset-1.0.2.tgz", + "integrity": "sha512-/2qh0lav6CmI15FzA3i/2Bzk2zCgQhGMkvhOhKNcBVQ1ldgpbfiNTVslmooUmWJcADi1f1kIeynbDRVzNlfR6Q==", + "license": "MIT" + }, "node_modules/parse5": { "version": "8.0.0", "resolved": "https://registry.npmjs.org/parse5/-/parse5-8.0.0.tgz", @@ -21393,7 +21640,6 @@ "version": "8.5.6", "resolved": "https://registry.npmjs.org/postcss/-/postcss-8.5.6.tgz", "integrity": "sha512-3Ybi1tAuwAP9s0r1UQ2J4n5Y0G05bJkpUIO0/bI9MhwmD70S5aTWbXGBwxHrelT+XM1k6dM0pk+SwNkpTRN7Pg==", - "dev": true, "funding": [ { "type": "opencollective", @@ -22433,6 +22679,63 @@ "truncate-utf8-bytes": "^1.0.0" } }, + "node_modules/sanitize-html": { + "version": "2.17.0", + "resolved": "https://registry.npmjs.org/sanitize-html/-/sanitize-html-2.17.0.tgz", + "integrity": "sha512-dLAADUSS8rBwhaevT12yCezvioCA+bmUTPH/u57xKPT8d++voeYE6HeluA/bPbQ15TwDBG2ii+QZIEmYx8VdxA==", + "license": "MIT", + "dependencies": { + "deepmerge": "^4.2.2", + "escape-string-regexp": "^4.0.0", + "htmlparser2": "^8.0.0", + "is-plain-object": "^5.0.0", + "parse-srcset": "^1.0.2", + "postcss": "^8.3.11" + } + }, + "node_modules/sanitize-html/node_modules/entities": { + "version": "4.5.0", + "resolved": "https://registry.npmjs.org/entities/-/entities-4.5.0.tgz", + "integrity": 
"sha512-V0hjH4dGPh9Ao5p0MoRY6BVqtwCjhz6vI5LT8AJ55H+4g9/4vbHx1I54fS0XuclLhDHArPQCiMjDxjaL8fPxhw==", + "license": "BSD-2-Clause", + "engines": { + "node": ">=0.12" + }, + "funding": { + "url": "https://github.com/fb55/entities?sponsor=1" + } + }, + "node_modules/sanitize-html/node_modules/escape-string-regexp": { + "version": "4.0.0", + "resolved": "https://registry.npmjs.org/escape-string-regexp/-/escape-string-regexp-4.0.0.tgz", + "integrity": "sha512-TtpcNJ3XAzx3Gq8sWRzJaVajRs0uVxA2YAkdb1jm2YkPz4G6egUFAyA3n5vtEIZefPk5Wa4UXbKuS5fKkJWdgA==", + "license": "MIT", + "engines": { + "node": ">=10" + }, + "funding": { + "url": "https://github.com/sponsors/sindresorhus" + } + }, + "node_modules/sanitize-html/node_modules/htmlparser2": { + "version": "8.0.2", + "resolved": "https://registry.npmjs.org/htmlparser2/-/htmlparser2-8.0.2.tgz", + "integrity": "sha512-GYdjWKDkbRLkZ5geuHs5NY1puJ+PXwP7+fHPRz06Eirsb9ugf6d8kkXav6ADhcODhFFPMIXyxkxSuMf3D6NCFA==", + "funding": [ + "https://github.com/fb55/htmlparser2?sponsor=1", + { + "type": "github", + "url": "https://github.com/sponsors/fb55" + } + ], + "license": "MIT", + "dependencies": { + "domelementtype": "^2.3.0", + "domhandler": "^5.0.3", + "domutils": "^3.0.1", + "entities": "^4.4.0" + } + }, "node_modules/sax": { "version": "1.4.3", "resolved": "https://registry.npmmirror.com/sax/-/sax-1.4.3.tgz", @@ -24019,6 +24322,15 @@ "integrity": "sha512-oJFu94HQb+KVduSUQL7wnpmqnfmLsOA/nAh6b6EH0wCEoK0/mPeXU6c3wKDV83MkOuHPRHtSXKKU99IBazS/2w==", "license": "0BSD" }, + "node_modules/turndown": { + "version": "7.2.2", + "resolved": "https://registry.npmjs.org/turndown/-/turndown-7.2.2.tgz", + "integrity": "sha512-1F7db8BiExOKxjSMU2b7if62D/XOyQyZbPKq/nUwopfgnHlqXHqQ0lvfUTeUIr1lZJzOPFn43dODyMSIfvWRKQ==", + "license": "MIT", + "dependencies": { + "@mixmark-io/domino": "^2.2.0" + } + }, "node_modules/type-check": { "version": "0.4.0", "resolved": "https://registry.npmjs.org/type-check/-/type-check-0.4.0.tgz", @@ -24201,6 +24513,12 @@ 
"typescript": ">=4.8.4 <6.0.0" } }, + "node_modules/uhyphen": { + "version": "0.2.0", + "resolved": "https://registry.npmjs.org/uhyphen/-/uhyphen-0.2.0.tgz", + "integrity": "sha512-qz3o9CHXmJJPGBdqzab7qAYuW8kQGKNEuoHFYrBwV6hWIMcpAmxDLXojcHfFr9US1Pe6zUswEIJIbLI610fuqA==", + "license": "ISC" + }, "node_modules/unbox-primitive": { "version": "1.1.0", "resolved": "https://registry.npmjs.org/unbox-primitive/-/unbox-primitive-1.1.0.tgz", diff --git a/package.json b/package.json index ef33436..5f210ba 100644 --- a/package.json +++ b/package.json @@ -40,6 +40,7 @@ "@ai-sdk/react": "^3.0.1", "@aws-sdk/client-dynamodb": "^3.957.0", "@aws-sdk/credential-providers": "^3.943.0", + "@extractus/article-extractor": "^8.0.18", "@formatjs/intl-localematcher": "^0.7.2", "@langfuse/client": "^4.4.9", "@langfuse/otel": "^4.4.4", @@ -87,6 +88,7 @@ "sonner": "^2.0.7", "tailwind-merge": "^3.0.2", "tailwindcss-animate": "^1.0.7", + "turndown": "^7.2.0", "unpdf": "^1.4.0", "zod": "^4.1.12" }, @@ -115,6 +117,7 @@ "@types/pako": "^2.0.3", "@types/react": "^19", "@types/react-dom": "^19", + "@types/turndown": "^5.0.6", "@vitejs/plugin-react": "^5.1.2", "@vitest/coverage-v8": "^4.0.16", "concurrently": "^9.2.1",