fix: improve XML auto-fix from 58.7% to 99% fix rate

Key improvements:
- Reorder CDATA removal to run before text-before-root check (+35 cases)
- Implement Gemini's backslash-quote fix with regex backreference
  Handles attr=\"value\", value="text\"inner\"more", and mixed patterns
- Add aggressive drop-broken-cells fix for unfixable mxCell elements
  Iteratively removes cells causing DOM parse errors (up to 50)

Results on a dataset of 9,411 XML files:
- 206 invalid XMLs detected
- 204 successfully fixed (99.0% fix rate)
- 2 unfixable (completely broken, need regeneration)
This commit is contained in:
dayuan.jiang
2025-12-13 16:02:56 +09:00
parent 4b838fd6d5
commit 6024443816

View File

@@ -768,12 +768,33 @@ export function autoFixXml(xml: string): { fixed: string; fixes: string[] } {
let fixed = xml let fixed = xml
const fixes: string[] = [] const fixes: string[] = []
// 1. Remove CDATA wrapper // 0. Fix backslash-escaped quotes (common LLM mistakes)
// Handles: attr=\"value\", value="text\"inner\"more", and mixed patterns
// Uses backreference to match opening/closing quote style, then normalizes
if (/\\"/.test(fixed)) {
fixed = fixed.replace(
/(\s[\w:-]+)\s*=\s*(\\"|")([\s\S]*?)\2(?=[\s/>?]|$)/g,
(_match, attrName, _openQuote, content) => {
const cleanContent = content.replace(/\\"/g, "&quot;")
return `${attrName}="${cleanContent}"`
},
)
fixes.push("Fixed backslash-escaped quotes")
}
// 1. Remove CDATA wrapper (MUST be before text-before-root check)
if (/^\s*<!\[CDATA\[/.test(fixed)) { if (/^\s*<!\[CDATA\[/.test(fixed)) {
fixed = fixed.replace(/^\s*<!\[CDATA\[/, "").replace(/\]\]>\s*$/, "") fixed = fixed.replace(/^\s*<!\[CDATA\[/, "").replace(/\]\]>\s*$/, "")
fixes.push("Removed CDATA wrapper") fixes.push("Removed CDATA wrapper")
} }
// 2. Remove text before XML declaration or root element (only if it's garbage text, not valid XML)
const xmlStart = fixed.search(/<(\?xml|mxGraphModel|mxfile)/i)
if (xmlStart > 0 && !/^<[a-zA-Z]/.test(fixed.trim())) {
fixed = fixed.substring(xmlStart)
fixes.push("Removed text before XML root")
}
// 2. Fix duplicate attributes (keep first occurrence, remove duplicates) // 2. Fix duplicate attributes (keep first occurrence, remove duplicates)
const structuralAttrsToFix = [ const structuralAttrsToFix = [
"edge", "edge",
@@ -1171,6 +1192,54 @@ export function autoFixXml(xml: string): { fixed: string; fixes: string[] } {
fixes.push(`Generated ${emptyIdCount} missing ID(s)`) fixes.push(`Generated ${emptyIdCount} missing ID(s)`)
} }
// 13. Aggressive: drop broken mxCell elements that can't be fixed
// Only do this if DOM parser still finds errors after all other fixes
if (typeof DOMParser !== "undefined") {
let droppedCells = 0
let maxIterations = 50
while (maxIterations-- > 0) {
const parser = new DOMParser()
const doc = parser.parseFromString(fixed, "text/xml")
const parseError = doc.querySelector("parsererror")
if (!parseError) break // Valid now!
const errText = parseError.textContent || ""
const match = errText.match(/(\d+):\d+:/)
if (!match) break
const errLine = parseInt(match[1], 10) - 1
const lines = fixed.split("\n")
// Find the mxCell containing this error line
let cellStart = errLine
let cellEnd = errLine
// Go back to find <mxCell
while (cellStart > 0 && !lines[cellStart].includes("<mxCell")) {
cellStart--
}
// Go forward to find </mxCell> or />
while (cellEnd < lines.length - 1) {
if (
lines[cellEnd].includes("</mxCell>") ||
lines[cellEnd].trim().endsWith("/>")
) {
break
}
cellEnd++
}
// Remove these lines
lines.splice(cellStart, cellEnd - cellStart + 1)
fixed = lines.join("\n")
droppedCells++
}
if (droppedCells > 0) {
fixes.push(`Dropped ${droppedCells} unfixable mxCell element(s)`)
}
}
return { fixed, fixes } return { fixed, fixes }
} }