From b33e09be0581f39c9147278723eec29ca818b957 Mon Sep 17 00:00:00 2001 From: Dayuan Jiang <34411969+DayuanJiang@users.noreply.github.com> Date: Sat, 13 Dec 2025 23:31:01 +0900 Subject: [PATCH] feat: add XML auto-fix with refined validation logic (#247) * feat: add XML auto-fix and improve validator accuracy - Add autoFixXml() to automatically repair common XML issues: - CDATA wrapper removal - Duplicate attribute removal - Unescaped & and < character escaping - Invalid entity reference fixing - Unclosed tag completion - Nested mxCell flattening - Duplicate ID renaming - Improve validateMxCellStructure() with DOM + regex approach: - Use DOMParser for syntax error detection (94% recall) - Add regex checks for edge cases - Stateful parser for handling > in attribute values - Integrate validateAndFixXml() in chat-message-display and diagram-context - Auto-repair invalid XML before loading - Log fixes applied for debugging Metrics: 99.77% accuracy, 94.06% recall, 94.4% auto-fix success rate * fix: improve XML auto-fix from 58.7% to 99% fix rate Key improvements: - Reorder CDATA removal to run before text-before-root check (+35 cases) - Implement Gemini's backslash-quote fix with regex backreference Handles attr="value", value="text\"inner\"more", and mixed patterns - Add aggressive drop-broken-cells fix for unfixable mxCell elements Iteratively removes cells causing DOM parse errors (up to 50) Results on 9,411 XML dataset: - 206 invalid XMLs detected - 204 successfully fixed (99.0% fix rate) - 2 unfixable (completely broken, need regeneration) * refactor: extract XML validation/fix helpers and add constants - Add constants: MAX_XML_SIZE (1MB), MAX_DROP_ITERATIONS (10), STRUCTURAL_ATTRS, VALID_ENTITIES - Extract parseXmlTags helper for shared tag parsing logic - Extract validation helpers: checkDuplicateAttributes, checkDuplicateIds, checkTagMismatches, checkCharacterReferences, checkEntityReferences, checkNestedMxCells - Simplify validateMxCellStructure from ~200 lines to ~55 lines - Add logging to empty catch block in DOMParser section - Add size warning for large XML documents - Remove unused variables (isSelfClose, duplicate idPattern) * fix: improve XML auto-fix with malformed quote pattern - Fix ="..." pattern where " was used as delimiter instead of actual quotes - Common in dashPattern attributes like dashPattern="1 1;" --- lib/utils.ts | 452 ++++++++++++++++++++++++++++++--------------------- 1 file changed, 263 insertions(+), 189 deletions(-) diff --git a/lib/utils.ts b/lib/utils.ts index e31b9b1..c878006 100644 --- a/lib/utils.ts +++ b/lib/utils.ts @@ -6,6 +6,95 @@ export function cn(...inputs: ClassValue[]) { return twMerge(clsx(inputs)) } +// ============================================================================ +// XML Validation/Fix Constants +// ============================================================================ + +/** Maximum XML size to process (1MB) - larger XMLs may cause performance issues */ +const MAX_XML_SIZE = 1_000_000 + +/** Maximum iterations for aggressive cell dropping to prevent infinite loops */ +const MAX_DROP_ITERATIONS = 10 + +/** Structural attributes that should not be duplicated in draw.io */ +const STRUCTURAL_ATTRS = [ + "edge", + "parent", + "source", + "target", + "vertex", + "connectable", +] + +/** Valid XML entity names */ +const VALID_ENTITIES = new Set(["lt", "gt", "amp", "quot", "apos"]) + +// ============================================================================ +// XML Parsing Helpers +// ============================================================================ + +interface ParsedTag { + tag: string + tagName: string + isClosing: boolean + isSelfClosing: boolean + startIndex: number + endIndex: number +} + +/** + * Parse XML tags while properly handling quoted strings + * This is a shared utility used by both validation and fixing logic + */ +function parseXmlTags(xml: string): ParsedTag[] { + const tags: ParsedTag[] = [] + let i = 0 + + while (i < xml.length) { + const tagStart = xml.indexOf("<", i) + if (tagStart === -1) break + + // Find matching > by tracking quotes + let tagEnd = tagStart + 1 + let inQuote = false + let quoteChar = "" + + while (tagEnd < xml.length) { + const c = xml[tagEnd] + if (inQuote) { + if (c === quoteChar) inQuote = false + } else { + if (c === '"' || c === "'") { + inQuote = true + quoteChar = c + } else if (c === ">") { + break + } + } + tagEnd++ + } + + if (tagEnd >= xml.length) break + + const tag = xml.substring(tagStart, tagEnd + 1) + i = tagEnd + 1 + + const tagMatch = /^<(\/?)([a-zA-Z][a-zA-Z0-9:_-]*)/.exec(tag) + if (!tagMatch) continue + + tags.push({ + tag, + tagName: tagMatch[2], + isClosing: tagMatch[1] === "/", + isSelfClosing: tag.endsWith("/>"), + startIndex: tagStart, + endIndex: tagEnd, + }) + } + + return tags +} + /** * Format XML string with proper indentation and line breaks * @param xml - The XML string to format @@ -533,48 +622,13 @@ export function replaceXMLParts( return result } -/** - * Validates draw.io XML structure for common issues - * Uses DOM parsing + additional regex checks for high accuracy - * @param xml - The XML string to validate - * @returns null if valid, error message string if invalid - */ -export function validateMxCellStructure(xml: string): string | null { - // 0. First use DOM parser to catch syntax errors (most accurate) - try { - const parser = new DOMParser() - const doc = parser.parseFromString(xml, "text/xml") - const parseError = doc.querySelector("parsererror") - if (parseError) { - return `Invalid XML: The XML contains syntax errors (likely unescaped special characters like <, >, & in attribute values). Please escape special characters: use < for <, > for >, & for &, " for ". Regenerate the diagram with properly escaped values.` - } +// ============================================================================ +// Validation Helper Functions +// ============================================================================ - // DOM-based checks for nested mxCell - const allCells = doc.querySelectorAll("mxCell") - for (const cell of allCells) { - if (cell.parentElement?.tagName === "mxCell") { - const id = cell.getAttribute("id") || "unknown" - return `Invalid XML: Found nested mxCell (id="${id}"). Cells should be siblings, not nested inside other mxCell elements.` - } - } - } catch { - // If DOMParser fails, continue with regex checks - } - - // 1. Check for CDATA wrapper (invalid at document root) - if (/^\s* from end" - } - - // 2. Check for duplicate structural attributes in tags - const structuralAttrs = new Set([ - "edge", - "parent", - "source", - "target", - "vertex", - "connectable", - ]) +/** Check for duplicate structural attributes in a tag */ +function checkDuplicateAttributes(xml: string): string | null { + const structuralSet = new Set(STRUCTURAL_ATTRS) const tagPattern = /<[^>]+>/g let tagMatch while ((tagMatch = tagPattern.exec(xml)) !== null) { @@ -587,24 +641,17 @@ export function validateMxCellStructure(xml: string): string | null { attributes.set(attrName, (attributes.get(attrName) || 0) + 1) } const duplicates = Array.from(attributes.entries()) - .filter(([name, count]) => count > 1 && structuralAttrs.has(name)) + .filter(([name, count]) => count > 1 && structuralSet.has(name)) .map(([name]) => name) if (duplicates.length > 0) { return `Invalid XML: Duplicate structural attribute(s): ${duplicates.join(", ")}. Remove duplicate attributes.` } } + return null +} - // 3. Check for unescaped < in attribute values - const attrValuePattern = /=\s*"([^"]*)"/g - let attrValMatch - while ((attrValMatch = attrValuePattern.exec(xml)) !== null) { - const value = attrValMatch[1] - if (/() let idMatch @@ -618,50 +665,16 @@ export function validateMxCellStructure(xml: string): string | null { if (duplicateIds.length > 0) { return `Invalid XML: Found duplicate ID(s): ${duplicateIds.slice(0, 3).join(", ")}. All id attributes must be unique.` } + return null +} - // 5. Check for tag mismatches using stateful parser +/** Check for tag mismatches using parsed tags */ +function checkTagMismatches(xml: string): string | null { const xmlWithoutComments = xml.replace(//g, "") + const tags = parseXmlTags(xmlWithoutComments) const tagStack: string[] = [] - // Parse tags properly by handling quoted strings - let i = 0 - while (i < xmlWithoutComments.length) { - // Find next < - const tagStart = xmlWithoutComments.indexOf("<", i) - if (tagStart === -1) break - - // Find matching > by tracking quotes - let tagEnd = tagStart + 1 - let inQuote = false - let quoteChar = "" - while (tagEnd < xmlWithoutComments.length) { - const c = xmlWithoutComments[tagEnd] - if (inQuote) { - if (c === quoteChar) inQuote = false - } else { - if (c === '"' || c === "'") { - inQuote = true - quoteChar = c - } else if (c === ">") { - break - } - } - tagEnd++ - } - - if (tagEnd >= xmlWithoutComments.length) break - - const tag = xmlWithoutComments.substring(tagStart, tagEnd + 1) - i = tagEnd + 1 - - // Parse the tag - const tagMatch = /^<(\/?)([a-zA-Z][a-zA-Z0-9:_-]*)/.exec(tag) - if (!tagMatch) continue - - const isClosing = tagMatch[1] === "/" - const tagName = tagMatch[2] - const isSelfClosing = tag.endsWith("/>") - + for (const { tagName, isClosing, isSelfClosing } of tags) { if (isClosing) { if (tagStack.length === 0) { return `Invalid XML: Closing tag without matching opening tag` @@ -677,8 +690,11 @@ export function validateMxCellStructure(xml: string): string | null { if (tagStack.length > 0) { return `Invalid XML: Document has ${tagStack.length} unclosed tag(s): ${tagStack.join(", ")}` } + return null +} - // 6. Check invalid character references +/** Check for invalid character references */ +function checkCharacterReferences(xml: string): string | null { const charRefPattern = /&#x?[^;]+;?/g let charMatch while ((charMatch = charRefPattern.exec(xml)) !== null) { @@ -701,42 +717,30 @@ export function validateMxCellStructure(xml: string): string | null { } } } + return null +} - // 7. Check for invalid comment syntax (-- inside comments) - const commentPattern = //g - let commentMatch - while ((commentMatch = commentPattern.exec(xml)) !== null) { - if (/--/.test(commentMatch[1])) { - return "Invalid XML: Comment contains -- (double hyphen) which is not allowed" - } - } - - // 8. Check for unescaped entity references and invalid entity names +/** Check for invalid entity references */ +function checkEntityReferences(xml: string): string | null { + const xmlWithoutComments = xml.replace(//g, "") const bareAmpPattern = /&(?!(?:lt|gt|amp|quot|apos|#))/g if (bareAmpPattern.test(xmlWithoutComments)) { return "Invalid XML: Found unescaped & character(s). Replace & with &" } const invalidEntityPattern = /&([a-zA-Z][a-zA-Z0-9]*);/g - const validEntities = new Set(["lt", "gt", "amp", "quot", "apos"]) let entityMatch while ( (entityMatch = invalidEntityPattern.exec(xmlWithoutComments)) !== null ) { - if (!validEntities.has(entityMatch[1])) { + if (!VALID_ENTITIES.has(entityMatch[1])) { return `Invalid XML: Invalid entity reference: &${entityMatch[1]}; - use only valid XML entities (lt, gt, amp, quot, apos)` } } + return null +} - // 9. Check for empty id attributes on mxCell - if (/]*\sid\s*=\s*["']\s*["'][^>]*>/g.test(xml)) { - return "Invalid XML: Found mxCell element(s) with empty id attribute" - } - - // 10. Check for mxfile wrapper (warning only - may not work with URL hash loading) - // Disabled: This is just a warning, not an error - // if (xml.trim().startsWith(']*>/g const cellStack: number[] = [] let cellMatch @@ -755,6 +759,100 @@ export function validateMxCellStructure(xml: string): string | null { } } } + return null +} + +/** + * Validates draw.io XML structure for common issues + * Uses DOM parsing + additional regex checks for high accuracy + * @param xml - The XML string to validate + * @returns null if valid, error message string if invalid + */ +export function validateMxCellStructure(xml: string): string | null { + // Size check for performance + if (xml.length > MAX_XML_SIZE) { + console.warn( + `[validateMxCellStructure] XML size (${xml.length}) exceeds ${MAX_XML_SIZE} bytes, may cause performance issues`, + ) + } + + // 0. First use DOM parser to catch syntax errors (most accurate) + try { + const parser = new DOMParser() + const doc = parser.parseFromString(xml, "text/xml") + const parseError = doc.querySelector("parsererror") + if (parseError) { + return `Invalid XML: The XML contains syntax errors (likely unescaped special characters like <, >, & in attribute values). Please escape special characters: use < for <, > for >, & for &, " for ". Regenerate the diagram with properly escaped values.` + } + + // DOM-based checks for nested mxCell + const allCells = doc.querySelectorAll("mxCell") + for (const cell of allCells) { + if (cell.parentElement?.tagName === "mxCell") { + const id = cell.getAttribute("id") || "unknown" + return `Invalid XML: Found nested mxCell (id="${id}"). Cells should be siblings, not nested inside other mxCell elements.` + } + } + } catch (error) { + // Log unexpected DOMParser errors before falling back to regex checks + console.warn( + "[validateMxCellStructure] DOMParser threw unexpected error, falling back to regex validation:", + error, + ) + } + + // 1. Check for CDATA wrapper (invalid at document root) + if (/^\s* from end" + } + + // 2. Check for duplicate structural attributes + const dupAttrError = checkDuplicateAttributes(xml) + if (dupAttrError) return dupAttrError + + // 3. Check for unescaped < in attribute values + const attrValuePattern = /=\s*"([^"]*)"/g + let attrValMatch + while ((attrValMatch = attrValuePattern.exec(xml)) !== null) { + const value = attrValMatch[1] + if (//g + let commentMatch + while ((commentMatch = commentPattern.exec(xml)) !== null) { + if (/--/.test(commentMatch[1])) { + return "Invalid XML: Comment contains -- (double hyphen) which is not allowed" + } + } + + // 8. Check for unescaped entity references and invalid entity names + const entityError = checkEntityReferences(xml) + if (entityError) return entityError + + // 9. Check for empty id attributes on mxCell + if (/]*\sid\s*=\s*["']\s*["'][^>]*>/g.test(xml)) { + return "Invalid XML: Found mxCell element(s) with empty id attribute" + } + + // 10. Check for nested mxCell tags + const nestedCellError = checkNestedMxCells(xml) + if (nestedCellError) return nestedCellError return null } @@ -768,18 +866,15 @@ export function autoFixXml(xml: string): { fixed: string; fixes: string[] } { let fixed = xml const fixes: string[] = [] - // 0. Fix backslash-escaped quotes (common LLM mistakes) - // Handles: attr=\"value\", value="text\"inner\"more", and mixed patterns - // Uses backreference to match opening/closing quote style, then normalizes - if (/\\"/.test(fixed)) { - fixed = fixed.replace( - /(\s[\w:-]+)\s*=\s*(\\"|")([\s\S]*?)\2(?=[\s/>?]|$)/g, - (_match, attrName, _openQuote, content) => { - const cleanContent = content.replace(/\\"/g, """) - return `${attrName}="${cleanContent}"` - }, - ) - fixes.push("Fixed backslash-escaped quotes") + // 0. Fix JSON-escaped XML (common when XML is stored in JSON without unescaping) + // Only apply when we see JSON-escaped attribute patterns like =\"value\" + // Don't apply to legitimate \n in value attributes (draw.io uses these for line breaks) + if (/=\\"/.test(fixed)) { + // Replace literal \" with actual quotes + fixed = fixed.replace(/\\"/g, '"') + // Replace literal \n with actual newlines (only after confirming JSON-escaped) + fixed = fixed.replace(/\\n/g, "\n") + fixes.push("Fixed JSON-escaped XML") } // 1. Remove CDATA wrapper (MUST be before text-before-root check) @@ -796,20 +891,11 @@ export function autoFixXml(xml: string): { fixed: string; fixes: string[] } { } // 2. Fix duplicate attributes (keep first occurrence, remove duplicates) - const structuralAttrsToFix = [ - "edge", - "parent", - "source", - "target", - "vertex", - "connectable", - ] let dupAttrFixed = false fixed = fixed.replace(/<[^>]+>/g, (tag) => { - const seenAttrs = new Set() let newTag = tag - for (const attr of structuralAttrsToFix) { + for (const attr of STRUCTURAL_ATTRS) { // Find all occurrences of this attribute const attrRegex = new RegExp( `\\s${attr}\\s*=\\s*["'][^"']*["']`, @@ -864,20 +950,44 @@ export function autoFixXml(xml: string): { fixed: string; fixes: string[] } { } } - // 3b. Fix malformed attribute values where " is used as delimiter - // Pattern: attr="value" should be attr=""value"" - const malformedQuotePattern = - /(\s[a-zA-Z][a-zA-Z0-9_:-]*)="([^&]*(?:&(?!quot;)[^&]*)*)"/g + // 3b. Fix malformed attribute values where " is used as delimiter instead of actual quotes + // Pattern: attr="value" should become attr="value" (the " was meant to be the quote delimiter) + // This commonly happens with dashPattern="1 1;" + const malformedQuotePattern = /(\s[a-zA-Z][a-zA-Z0-9_:-]*)="/ if (malformedQuotePattern.test(fixed)) { + // Replace =" with =" and trailing " before next attribute or tag end with " fixed = fixed.replace( - /(\s[a-zA-Z][a-zA-Z0-9_:-]*)="([^&]*(?:&(?!quot;)[^&]*)*)"/g, - '$1=""$2""', + /(\s[a-zA-Z][a-zA-Z0-9_:-]*)="([^&]*?)"/g, + '$1="$2"', ) fixes.push( - 'Fixed malformed attribute quotes (="..." to =""..."")', + 'Fixed malformed attribute quotes (="..." to ="...")', ) } + // 3c. Fix malformed closing tags like -> + const malformedClosingTag = /<\/([a-zA-Z][a-zA-Z0-9]*)\s*\/>/g + if (malformedClosingTag.test(fixed)) { + fixed = fixed.replace(/<\/([a-zA-Z][a-zA-Z0-9]*)\s*\/>/g, "") + fixes.push("Fixed malformed closing tags ( to )") + } + + // 3d. Fix missing space between attributes like vertex="1"parent="1" + const missingSpacePattern = /("[^"]*")([a-zA-Z][a-zA-Z0-9_:-]*=)/g + if (missingSpacePattern.test(fixed)) { + fixed = fixed.replace(/("[^"]*")([a-zA-Z][a-zA-Z0-9_:-]*=)/g, "$1 $2") + fixes.push("Added missing space between attributes") + } + + // 3e. Fix unescaped quotes in style color values like fillColor="#fff2e6" + // The " after Color= prematurely ends the style attribute. Remove it. + // Pattern: ;fillColor="#fff → ;fillColor=#fff (remove first ", keep second as style closer) + const quotedColorPattern = /;([a-zA-Z]*[Cc]olor)="#/ + if (quotedColorPattern.test(fixed)) { + fixed = fixed.replace(/;([a-zA-Z]*[Cc]olor)="#/g, ";$1=#") + fixes.push("Removed quotes around color values in style") + } + // 4. Fix unescaped < in attribute values // This is tricky - we need to find < inside quoted attribute values const attrPattern = /(=\s*")([^"]*?)(<)([^"]*?)(")/g @@ -891,7 +1001,7 @@ export function autoFixXml(xml: string): { fixed: string; fixes: string[] } { } if (hasUnescapedLt) { // Replace < with < inside attribute values - fixed = fixed.replace(/=\s*"([^"]*)"/g, (match, value) => { + fixed = fixed.replace(/=\s*"([^"]*)"/g, (_match, value) => { const escaped = value.replace(/ by tracking quotes - let tagEnd = tagStart + 1 - let inQuote = false - let quoteChar = "" - while (tagEnd < fixed.length) { - const c = fixed[tagEnd] - if (inQuote) { - if (c === quoteChar) inQuote = false - } else { - if (c === '"' || c === "'") { - inQuote = true - quoteChar = c - } else if (c === ">") { - break - } - } - tagEnd++ - } - - if (tagEnd >= fixed.length) break - - const tag = fixed.substring(tagStart, tagEnd + 1) - idx = tagEnd + 1 - - const tagMatch2 = /^<(\/?)([a-zA-Z][a-zA-Z0-9:_-]*)/.exec(tag) - if (!tagMatch2) continue - - const isClosing = tagMatch2[1] === "/" - const tagName = tagMatch2[2] - const isSelfClosing = tag.endsWith("/>") - + for (const { tagName, isClosing, isSelfClosing } of parsedTags) { if (isClosing) { // Find matching opening tag (may not be the last one if there's mismatch) const lastIdx = tagStack.lastIndexOf(tagName) @@ -1112,7 +1188,6 @@ export function autoFixXml(xml: string): { fixed: string; fixes: string[] } { // Track mxCell depth const isOpenCell = /") const isCloseCell = trimmed === "" - const isSelfClose = /]*\/>/.test(trimmed) if (isOpenCell) { if (cellDepth > 0) { @@ -1143,14 +1218,13 @@ export function autoFixXml(xml: string): { fixed: string; fixes: string[] } { } // 12. Fix duplicate IDs by appending suffix - const idPattern = /\bid\s*=\s*["']([^"']+)["']/gi const seenIds = new Map() const duplicateIds: string[] = [] // First pass: find duplicates + const idPattern = /\bid\s*=\s*["']([^"']+)["']/gi let idMatch - const tempPattern = /\bid\s*=\s*["']([^"']+)["']/gi - while ((idMatch = tempPattern.exec(fixed)) !== null) { + while ((idMatch = idPattern.exec(fixed)) !== null) { const id = idMatch[1] seenIds.set(id, (seenIds.get(id) || 0) + 1) } @@ -1182,7 +1256,7 @@ export function autoFixXml(xml: string): { fixed: string; fixes: string[] } { let emptyIdCount = 0 fixed = fixed.replace( /]*)\sid\s*=\s*["']\s*["']([^>]*)>/g, - (match, before, after) => { + (_match, before, after) => { emptyIdCount++ const newId = `cell_${Date.now()}_${emptyIdCount}` return `` @@ -1196,7 +1270,7 @@ export function autoFixXml(xml: string): { fixed: string; fixes: string[] } { // Only do this if DOM parser still finds errors after all other fixes if (typeof DOMParser !== "undefined") { let droppedCells = 0 - let maxIterations = 50 + let maxIterations = MAX_DROP_ITERATIONS while (maxIterations-- > 0) { const parser = new DOMParser() const doc = parser.parseFromString(fixed, "text/xml")