next-ai-draw-io/lib/pdf-utils.ts

import { extractText, getDocumentProxy } from "unpdf"

// Maximum characters allowed for extracted text (configurable via env)
const DEFAULT_MAX_EXTRACTED_CHARS = 150000 // 150k chars

/**
 * Turn the raw environment value into a usable character limit.
 *
 * @param raw - Value of NEXT_PUBLIC_MAX_EXTRACTED_CHARS, if any.
 * @returns The parsed number when it is greater than zero; otherwise
 * DEFAULT_MAX_EXTRACTED_CHARS. Missing, empty, non-numeric, zero and
 * negative values all fall back to the default.
 */
function resolveMaxExtractedChars(raw: string | undefined): number {
    const parsed = Number(raw)
    // `NaN > 0` is false, so this one comparison also rejects unparsable input.
    return parsed > 0 ? parsed : DEFAULT_MAX_EXTRACTED_CHARS
}

// The env variable is referenced by its full literal name so that bundlers
// which substitute `process.env.NEXT_PUBLIC_*` at build time can still see it.
export const MAX_EXTRACTED_CHARS = resolveMaxExtractedChars(
    process.env.NEXT_PUBLIC_MAX_EXTRACTED_CHARS,
)

// File-name suffixes (lower-case, leading dot included) that are treated as
// plain text. Entries are grouped by category for readability only.
const TEXT_EXTENSIONS = [
    // Plain text and documentation
    ".txt", ".md", ".markdown",
    // Data and markup
    ".json", ".csv", ".xml", ".html", ".css",
    // JavaScript / TypeScript
    ".js", ".ts", ".jsx", ".tsx",
    // Other programming languages
    ".py", ".java", ".c", ".cpp", ".h", ".go", ".rs",
    // Configuration
    ".yaml", ".yml", ".toml", ".ini",
    // Logs
    ".log",
    // Shell scripts
    ".sh", ".bash", ".zsh",
]

/**
 * Read a PDF file and return its text content.
 *
 * The file is loaded fully into memory, opened through unpdf's
 * `getDocumentProxy`, and passed to `extractText` with `mergePages: true`.
 * Any error raised while reading or parsing propagates to the caller.
 *
 * @param file - The PDF file to read.
 * @returns The extracted text.
 */
export async function extractPdfText(file: File): Promise<string> {
    const bytes = new Uint8Array(await file.arrayBuffer())
    const document = await getDocumentProxy(bytes)
    const extracted = await extractText(document, { mergePages: true })
    return extracted.text as string
}

/**
 * Check if a file is a PDF.
 *
 * A file qualifies when its MIME type is "application/pdf" or when its name
 * ends in ".pdf". The extension comparison ignores case, so "REPORT.PDF" is
 * recognised even when no MIME type was reported for the file.
 *
 * @param file - The file to inspect.
 * @returns true when the file looks like a PDF, false otherwise.
 */
export function isPdfFile(file: File): boolean {
    return (
        file.type === "application/pdf" ||
        file.name.toLowerCase().endsWith(".pdf")
    )
}

/**
 * Check if a file should be handled as plain text.
 *
 * A file qualifies when its MIME type starts with "text/", when its MIME type
 * is "application/json", or when its lower-cased name ends with one of the
 * suffixes in TEXT_EXTENSIONS.
 *
 * @param file - The file to inspect.
 * @returns true when any of the checks above matches, false otherwise.
 */
export function isTextFile(file: File): boolean {
    const lowered = file.name.toLowerCase()

    if (file.type.startsWith("text/")) {
        return true
    }
    if (file.type === "application/json") {
        return true
    }

    // Fall back to the file name when the MIME type did not match.
    return TEXT_EXTENSIONS.some((suffix) => lowered.endsWith(suffix))
}

/**
 * Read a text file and return its content as a string.
 *
 * @param file - The file to read.
 * @returns Whatever `file.text()` resolves to.
 */
export async function extractTextFileContent(file: File): Promise<string> {
    const content = await file.text()
    return content
}
feat: add PDF and text file upload support (#205) - Add client-side PDF text extraction using unpdf library - Support text files (.txt, .md, .json, .csv, .py, .js, .ts, etc.) - Add file preview with character count for PDF/text files - Add 150k character limit for extracted content - Highlight Paper to Diagram example with NEW badge - Fix React hydration error by adding explicit IDs to ResizablePanelGroup - Remove code duplication by centralizing file utilities in pdf-utils.ts 2025-12-10 21:32:35 +09:00			`import { extractText, getDocumentProxy } from "unpdf"`

feat: make PDF/text extraction char limit configurable via env (#214) Add NEXT_PUBLIC_MAX_EXTRACTED_CHARS environment variable to allow configuring the maximum characters extracted from PDF and text files. Defaults to 150000 (150k chars) if not set. 2025-12-11 14:14:31 +09:00			`// Maximum characters allowed for extracted text (configurable via env)`
			`const DEFAULT_MAX_EXTRACTED_CHARS = 150000 // 150k chars`
			`export const MAX_EXTRACTED_CHARS =`
			`Number(process.env.NEXT_PUBLIC_MAX_EXTRACTED_CHARS) \|\|`
			`DEFAULT_MAX_EXTRACTED_CHARS`
feat: add PDF and text file upload support (#205) - Add client-side PDF text extraction using unpdf library - Support text files (.txt, .md, .json, .csv, .py, .js, .ts, etc.) - Add file preview with character count for PDF/text files - Add 150k character limit for extracted content - Highlight Paper to Diagram example with NEW badge - Fix React hydration error by adding explicit IDs to ResizablePanelGroup - Remove code duplication by centralizing file utilities in pdf-utils.ts 2025-12-10 21:32:35 +09:00
			`// Text file extensions we support`
			`const TEXT_EXTENSIONS = [`
			`".txt",`
			`".md",`
			`".markdown",`
			`".json",`
			`".csv",`
			`".xml",`
			`".html",`
			`".css",`
			`".js",`
			`".ts",`
			`".jsx",`
			`".tsx",`
			`".py",`
			`".java",`
			`".c",`
			`".cpp",`
			`".h",`
			`".go",`
			`".rs",`
			`".yaml",`
			`".yml",`
			`".toml",`
			`".ini",`
			`".log",`
			`".sh",`
			`".bash",`
			`".zsh",`
			`]`

			`/**`
			`* Extract text content from a PDF file`
			`* Uses unpdf library for client-side extraction`
			`*/`
			`export async function extractPdfText(file: File): Promise<string> {`
			`const buffer = await file.arrayBuffer()`
			`const pdf = await getDocumentProxy(new Uint8Array(buffer))`
			`const { text } = await extractText(pdf, { mergePages: true })`
			`return text as string`
			`}`

			`/**`
			`* Check if a file is a PDF`
			`*/`
			`export function isPdfFile(file: File): boolean {`
			`return file.type === "application/pdf" \|\| file.name.endsWith(".pdf")`
			`}`

			`/**`
			`* Check if a file is a text file`
			`*/`
			`export function isTextFile(file: File): boolean {`
			`const name = file.name.toLowerCase()`
			`return (`
			`file.type.startsWith("text/") \|\|`
			`file.type === "application/json" \|\|`
			`TEXT_EXTENSIONS.some((ext) => name.endsWith(ext))`
			`)`
			`}`

			`/**`
			`* Extract text content from a text file`
			`*/`
			`export async function extractTextFileContent(file: File): Promise<string> {`
			`return await file.text()`
			`}`