From 6024443816e56237982c86739477ee2fcceea425 Mon Sep 17 00:00:00 2001 From: "dayuan.jiang" Date: Sat, 13 Dec 2025 16:02:56 +0900 Subject: [PATCH] fix: improve XML auto-fix from 58.7% to 99% fix rate Key improvements: - Reorder CDATA removal to run before text-before-root check (+35 cases) - Implement Gemini's backslash-quote fix with regex backreference Handles attr="value", value="text\"inner\"more", and mixed patterns - Add aggressive drop-broken-cells fix for unfixable mxCell elements Iteratively removes cells causing DOM parse errors (up to 50) Results on 9,411 XML dataset: - 206 invalid XMLs detected - 204 successfully fixed (99.0% fix rate) - 2 unfixable (completely broken, need regeneration) --- lib/utils.ts | 71 +++++++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 70 insertions(+), 1 deletion(-) diff --git a/lib/utils.ts b/lib/utils.ts index 34a15a1..e31b9b1 100644 --- a/lib/utils.ts +++ b/lib/utils.ts @@ -768,12 +768,33 @@ export function autoFixXml(xml: string): { fixed: string; fixes: string[] } { let fixed = xml const fixes: string[] = [] - // 1. Remove CDATA wrapper + // 0. Fix backslash-escaped quotes (common LLM mistakes) + // Handles: attr=\"value\", value="text\"inner\"more", and mixed patterns + // Uses backreference to match opening/closing quote style, then normalizes + if (/\\"/.test(fixed)) { + fixed = fixed.replace( + /(\s[\w:-]+)\s*=\s*(\\"|")([\s\S]*?)\2(?=[\s/>?]|$)/g, + (_match, attrName, _openQuote, content) => { + const cleanContent = content.replace(/\\"/g, """) + return `${attrName}="${cleanContent}"` + }, + ) + fixes.push("Fixed backslash-escaped quotes") + } + + // 1. Remove CDATA wrapper (MUST be before text-before-root check) if (/^\s*\s*$/, "") fixes.push("Removed CDATA wrapper") } + // 2. Remove text before XML declaration or root element (only if it's garbage text, not valid XML) + const xmlStart = fixed.search(/<(\?xml|mxGraphModel|mxfile)/i) + if (xmlStart > 0 && !/^<[a-zA-Z]/.test(fixed.trim())) { + fixed = fixed.substring(xmlStart) + fixes.push("Removed text before XML root") + } + // 2. Fix duplicate attributes (keep first occurrence, remove duplicates) const structuralAttrsToFix = [ "edge", @@ -1171,6 +1192,54 @@ export function autoFixXml(xml: string): { fixed: string; fixes: string[] } { fixes.push(`Generated ${emptyIdCount} missing ID(s)`) } + // 13. Aggressive: drop broken mxCell elements that can't be fixed + // Only do this if DOM parser still finds errors after all other fixes + if (typeof DOMParser !== "undefined") { + let droppedCells = 0 + let maxIterations = 50 + while (maxIterations-- > 0) { + const parser = new DOMParser() + const doc = parser.parseFromString(fixed, "text/xml") + const parseError = doc.querySelector("parsererror") + if (!parseError) break // Valid now! + + const errText = parseError.textContent || "" + const match = errText.match(/(\d+):\d+:/) + if (!match) break + + const errLine = parseInt(match[1], 10) - 1 + const lines = fixed.split("\n") + + // Find the mxCell containing this error line + let cellStart = errLine + let cellEnd = errLine + + // Go back to find 0 && !lines[cellStart].includes(" or /> + while (cellEnd < lines.length - 1) { + if ( + lines[cellEnd].includes("") || + lines[cellEnd].trim().endsWith("/>") + ) { + break + } + cellEnd++ + } + + // Remove these lines + lines.splice(cellStart, cellEnd - cellStart + 1) + fixed = lines.join("\n") + droppedCells++ + } + if (droppedCells > 0) { + fixes.push(`Dropped ${droppedCells} unfixable mxCell element(s)`) + } + } + return { fixed, fixes } }