fix: escape HTML in XML attribute values to prevent parse errors (#386)

- Add HTML escaping (<, >) in convertToLegalXml for attribute values
- Update isMxCellXmlComplete to handle any LLM provider's wrapper tags
- Add wrapper tag stripping in wrapWithMxFile for DeepSeek/Anthropic tags
- Update autoFixXml to escape both < and > in attribute values

Fixes 'Malformed XML detected in final output' error when AI generates
diagrams with HTML content in value attributes like <b>Title</b>.
This commit is contained in:
Dayuan Jiang
2025-12-24 09:31:54 +09:00
committed by GitHub
parent 3218ccc909
commit 0d2e7a7ad6
2 changed files with 147 additions and 51 deletions

View File

@@ -36,29 +36,32 @@ const VALID_ENTITIES = new Set(["lt", "gt", "amp", "quot", "apos"])
/** /**
* Check if mxCell XML output is complete (not truncated). * Check if mxCell XML output is complete (not truncated).
* Complete XML ends with a self-closing tag (/>) or closing mxCell tag. * Complete XML ends with a self-closing tag (/>) or closing mxCell tag.
* Also handles function-calling wrapper tags that may be incorrectly included. * Uses a robust approach that handles any LLM provider's wrapper tags
* by finding the last valid mxCell ending and checking if suffix is just closing tags.
* @param xml - The XML string to check (can be undefined/null) * @param xml - The XML string to check (can be undefined/null)
* @returns true if XML appears complete, false if truncated or empty * @returns true if XML appears complete, false if truncated or empty
*/ */
export function isMxCellXmlComplete(xml: string | undefined | null): boolean { export function isMxCellXmlComplete(xml: string | undefined | null): boolean {
let trimmed = xml?.trim() || "" const trimmed = xml?.trim() || ""
if (!trimmed) return false if (!trimmed) return false
// Strip Anthropic function-calling wrapper tags if present // Find position of last complete mxCell ending (either /> or </mxCell>)
// These can leak into tool input due to AI SDK parsing issues const lastSelfClose = trimmed.lastIndexOf("/>")
// Use loop because tags are nested: </mxCell></mxParameter></invoke> const lastMxCellClose = trimmed.lastIndexOf("</mxCell>")
let prev = ""
while (prev !== trimmed) {
prev = trimmed
trimmed = trimmed
.replace(/<\/mxParameter>\s*$/i, "")
.replace(/<\/invoke>\s*$/i, "")
.replace(/<\/antml:parameter>\s*$/i, "")
.replace(/<\/antml:invoke>\s*$/i, "")
.trim()
}
return trimmed.endsWith("/>") || trimmed.endsWith("</mxCell>") const lastValidEnd = Math.max(lastSelfClose, lastMxCellClose)
// No valid ending found at all
if (lastValidEnd === -1) return false
// Check what comes after the last valid ending
// For />: add 2 chars, for </mxCell>: add 9 chars
const endOffset = lastMxCellClose > lastSelfClose ? 9 : 2
const suffix = trimmed.slice(lastValidEnd + endOffset)
// If suffix is empty or only contains closing tags (any provider's wrapper) or whitespace, it's complete
// This regex matches any sequence of closing XML tags like </foo>, </bar>, </DSMLxyz>
return /^(\s*<\/[^>]+>)*\s*$/.test(suffix)
} }
/** /**
@@ -262,6 +265,21 @@ export function convertToLegalXml(xmlString: string): string {
"&amp;", "&amp;",
) )
// Fix unescaped < and > in attribute values for XML parsing
// HTML content in value attributes (e.g., <b>Title</b>) needs to be escaped
// This is critical because DOMParser will fail on unescaped < > in attributes
if (/=\s*"[^"]*<[^"]*"/.test(cellContent)) {
cellContent = cellContent.replace(
/=\s*"([^"]*)"/g,
(_match, value) => {
const escaped = value
.replace(/</g, "&lt;")
.replace(/>/g, "&gt;")
return `="${escaped}"`
},
)
}
// Indent each line of the matched block for readability. // Indent each line of the matched block for readability.
const formatted = cellContent const formatted = cellContent
.split("\n") .split("\n")
@@ -306,6 +324,20 @@ export function wrapWithMxFile(xml: string): string {
content = xml.replace(/<\/?root>/g, "").trim() content = xml.replace(/<\/?root>/g, "").trim()
} }
// Strip trailing LLM wrapper tags (from any provider: Anthropic, DeepSeek, etc.)
// Find the last valid mxCell ending and remove everything after it
const lastSelfClose = content.lastIndexOf("/>")
const lastMxCellClose = content.lastIndexOf("</mxCell>")
const lastValidEnd = Math.max(lastSelfClose, lastMxCellClose)
if (lastValidEnd !== -1) {
const endOffset = lastMxCellClose > lastSelfClose ? 9 : 2
const suffix = content.slice(lastValidEnd + endOffset)
// If suffix is only closing tags (wrapper tags), strip it
if (/^(\s*<\/[^>]+>)*\s*$/.test(suffix)) {
content = content.slice(0, lastValidEnd + endOffset)
}
}
// Remove any existing root cells from content (LLM shouldn't include them, but handle it gracefully) // Remove any existing root cells from content (LLM shouldn't include them, but handle it gracefully)
// Use flexible patterns that match both self-closing (/>) and non-self-closing (></mxCell>) formats // Use flexible patterns that match both self-closing (/>) and non-self-closing (></mxCell>) formats
content = content content = content
@@ -910,6 +942,21 @@ export function autoFixXml(xml: string): { fixed: string; fixes: string[] } {
fixes.push("Removed CDATA wrapper") fixes.push("Removed CDATA wrapper")
} }
// 1b. Strip trailing LLM wrapper tags (DeepSeek, Anthropic, etc.)
// These are closing tags after the last valid mxCell that break XML parsing
const lastSelfClose = fixed.lastIndexOf("/>")
const lastMxCellClose = fixed.lastIndexOf("</mxCell>")
const lastValidEnd = Math.max(lastSelfClose, lastMxCellClose)
if (lastValidEnd !== -1) {
const endOffset = lastMxCellClose > lastSelfClose ? 9 : 2
const suffix = fixed.slice(lastValidEnd + endOffset)
// If suffix contains only closing tags (wrapper tags) or whitespace, strip it
if (/^(\s*<\/[^>]+>)+\s*$/.test(suffix)) {
fixed = fixed.slice(0, lastValidEnd + endOffset)
fixes.push("Stripped trailing LLM wrapper tags")
}
}
// 2. Remove text before XML declaration or root element (only if it's garbage text, not valid XML) // 2. Remove text before XML declaration or root element (only if it's garbage text, not valid XML)
const xmlStart = fixed.search(/<(\?xml|mxGraphModel|mxfile)/i) const xmlStart = fixed.search(/<(\?xml|mxGraphModel|mxfile)/i)
if (xmlStart > 0 && !/^<[a-zA-Z]/.test(fixed.trim())) { if (xmlStart > 0 && !/^<[a-zA-Z]/.test(fixed.trim())) {
@@ -1015,8 +1062,8 @@ export function autoFixXml(xml: string): { fixed: string; fixes: string[] } {
fixes.push("Removed quotes around color values in style") fixes.push("Removed quotes around color values in style")
} }
// 4. Fix unescaped < in attribute values // 4. Fix unescaped < and > in attribute values
// This is tricky - we need to find < inside quoted attribute values // < is required to be escaped, > is not strictly required but we escape for consistency
const attrPattern = /(=\s*")([^"]*?)(<)([^"]*?)(")/g const attrPattern = /(=\s*")([^"]*?)(<)([^"]*?)(")/g
let attrMatch let attrMatch
let hasUnescapedLt = false let hasUnescapedLt = false
@@ -1027,12 +1074,12 @@ export function autoFixXml(xml: string): { fixed: string; fixes: string[] } {
} }
} }
if (hasUnescapedLt) { if (hasUnescapedLt) {
// Replace < with &lt; inside attribute values // Replace < and > with &lt; and &gt; inside attribute values
fixed = fixed.replace(/=\s*"([^"]*)"/g, (_match, value) => { fixed = fixed.replace(/=\s*"([^"]*)"/g, (_match, value) => {
const escaped = value.replace(/</g, "&lt;") const escaped = value.replace(/</g, "&lt;").replace(/>/g, "&gt;")
return `="${escaped}"` return `="${escaped}"`
}) })
fixes.push("Escaped < characters in attribute values") fixes.push("Escaped <> characters in attribute values")
} }
// 5. Fix invalid character references (remove malformed ones) // 5. Fix invalid character references (remove malformed ones)
@@ -1120,7 +1167,8 @@ export function autoFixXml(xml: string): { fixed: string; fixes: string[] } {
} }
// 8c. Remove non-draw.io tags (after typo fixes so lowercase variants are fixed first) // 8c. Remove non-draw.io tags (after typo fixes so lowercase variants are fixed first)
// Valid draw.io tags: mxfile, diagram, mxGraphModel, root, mxCell, mxGeometry, mxPoint, Array, Object // IMPORTANT: Only remove tags at the element level, NOT inside quoted attribute values
// Tags like <b>, <br> inside value="<b>text</b>" should be preserved (they're HTML content)
const validDrawioTags = new Set([ const validDrawioTags = new Set([
"mxfile", "mxfile",
"diagram", "diagram",
@@ -1133,25 +1181,59 @@ export function autoFixXml(xml: string): { fixed: string; fixes: string[] } {
"Object", "Object",
"mxRectangle", "mxRectangle",
]) ])
// Helper: Check if a position is inside a quoted attribute value
// by counting unescaped quotes before that position
const isInsideQuotes = (str: string, pos: number): boolean => {
let inQuote = false
let quoteChar = ""
for (let i = 0; i < pos && i < str.length; i++) {
const c = str[i]
if (inQuote) {
if (c === quoteChar) inQuote = false
} else if (c === '"' || c === "'") {
// Check if this quote is part of an attribute (preceded by =)
// Look back for = sign
let j = i - 1
while (j >= 0 && /\s/.test(str[j])) j--
if (j >= 0 && str[j] === "=") {
inQuote = true
quoteChar = c
}
}
}
return inQuote
}
const foreignTagPattern = /<\/?([a-zA-Z][a-zA-Z0-9_]*)[^>]*>/g const foreignTagPattern = /<\/?([a-zA-Z][a-zA-Z0-9_]*)[^>]*>/g
let foreignMatch let foreignMatch
const foreignTags = new Set<string>() const foreignTags = new Set<string>()
const foreignTagPositions: Array<{
tag: string
start: number
end: number
}> = []
while ((foreignMatch = foreignTagPattern.exec(fixed)) !== null) { while ((foreignMatch = foreignTagPattern.exec(fixed)) !== null) {
const tagName = foreignMatch[1] const tagName = foreignMatch[1]
if (!validDrawioTags.has(tagName)) { // Skip if this is a valid draw.io tag
foreignTags.add(tagName) if (validDrawioTags.has(tagName)) continue
} // Skip if this tag is inside a quoted attribute value
if (isInsideQuotes(fixed, foreignMatch.index)) continue
foreignTags.add(tagName)
foreignTagPositions.push({
tag: tagName,
start: foreignMatch.index,
end: foreignMatch.index + foreignMatch[0].length,
})
} }
if (foreignTags.size > 0) {
console.log( if (foreignTagPositions.length > 0) {
"[autoFixXml] Step 8c: Found foreign tags:", // Remove tags from end to start to preserve indices
Array.from(foreignTags), foreignTagPositions.sort((a, b) => b.start - a.start)
) for (const { start, end } of foreignTagPositions) {
for (const tag of foreignTags) { fixed = fixed.slice(0, start) + fixed.slice(end)
// Remove opening tags (with or without attributes)
fixed = fixed.replace(new RegExp(`<${tag}[^>]*>`, "gi"), "")
// Remove closing tags
fixed = fixed.replace(new RegExp(`</${tag}>`, "gi"), "")
} }
fixes.push( fixes.push(
`Removed foreign tags: ${Array.from(foreignTags).join(", ")}`, `Removed foreign tags: ${Array.from(foreignTags).join(", ")}`,
@@ -1202,6 +1284,7 @@ export function autoFixXml(xml: string): { fixed: string; fixes: string[] } {
// 10b. Remove extra closing tags (more closes than opens) // 10b. Remove extra closing tags (more closes than opens)
// Need to properly count self-closing tags (they don't need closing tags) // Need to properly count self-closing tags (they don't need closing tags)
// IMPORTANT: Only count tags at element level, NOT inside quoted attribute values
const tagCounts = new Map< const tagCounts = new Map<
string, string,
{ opens: number; closes: number; selfClosing: number } { opens: number; closes: number; selfClosing: number }
@@ -1210,12 +1293,18 @@ export function autoFixXml(xml: string): { fixed: string; fixes: string[] } {
const fullTagPattern = /<(\/?[a-zA-Z][a-zA-Z0-9]*)[^>]*>/g const fullTagPattern = /<(\/?[a-zA-Z][a-zA-Z0-9]*)[^>]*>/g
let tagCountMatch let tagCountMatch
while ((tagCountMatch = fullTagPattern.exec(fixed)) !== null) { while ((tagCountMatch = fullTagPattern.exec(fixed)) !== null) {
// Skip tags inside quoted attribute values (e.g., value="<b>Title</b>")
if (isInsideQuotes(fixed, tagCountMatch.index)) continue
const fullMatch = tagCountMatch[0] // e.g., "<mxCell .../>" or "</mxCell>" const fullMatch = tagCountMatch[0] // e.g., "<mxCell .../>" or "</mxCell>"
const tagPart = tagCountMatch[1] // e.g., "mxCell" or "/mxCell" const tagPart = tagCountMatch[1] // e.g., "mxCell" or "/mxCell"
const isClosing = tagPart.startsWith("/") const isClosing = tagPart.startsWith("/")
const isSelfClosing = fullMatch.endsWith("/>") const isSelfClosing = fullMatch.endsWith("/>")
const tagName = isClosing ? tagPart.slice(1) : tagPart const tagName = isClosing ? tagPart.slice(1) : tagPart
// Only count valid draw.io tags - skip partial/invalid tags like "mx" from streaming
if (!validDrawioTags.has(tagName)) continue
let counts = tagCounts.get(tagName) let counts = tagCounts.get(tagName)
if (!counts) { if (!counts) {
counts = { opens: 0, closes: 0, selfClosing: 0 } counts = { opens: 0, closes: 0, selfClosing: 0 }

View File

@@ -459,7 +459,8 @@ export function autoFixXml(xml: string): { fixed: string; fixes: string[] } {
fixes.push("Removed quotes around color values in style") fixes.push("Removed quotes around color values in style")
} }
// 10. Fix unescaped < in attribute values // 10. Fix unescaped < and > in attribute values
// < is required to be escaped, > is not strictly required but we escape for consistency
const attrPattern = /(=\s*")([^"]*?)(<)([^"]*?)(")/g const attrPattern = /(=\s*")([^"]*?)(<)([^"]*?)(")/g
let attrMatch let attrMatch
let hasUnescapedLt = false let hasUnescapedLt = false
@@ -471,10 +472,10 @@ export function autoFixXml(xml: string): { fixed: string; fixes: string[] } {
} }
if (hasUnescapedLt) { if (hasUnescapedLt) {
fixed = fixed.replace(/=\s*"([^"]*)"/g, (_match, value) => { fixed = fixed.replace(/=\s*"([^"]*)"/g, (_match, value) => {
const escaped = value.replace(/</g, "&lt;") const escaped = value.replace(/</g, "&lt;").replace(/>/g, "&gt;")
return `="${escaped}"` return `="${escaped}"`
}) })
fixes.push("Escaped < characters in attribute values") fixes.push("Escaped <> characters in attribute values")
} }
// 11. Fix invalid hex character references // 11. Fix invalid hex character references
@@ -903,24 +904,30 @@ export function validateAndFixXml(xml: string): {
/** /**
* Check if mxCell XML output is complete (not truncated). * Check if mxCell XML output is complete (not truncated).
* Uses a robust approach that handles any LLM provider's wrapper tags
* by finding the last valid mxCell ending and checking if suffix is just closing tags.
* @param xml - The XML string to check (can be undefined/null) * @param xml - The XML string to check (can be undefined/null)
* @returns true if XML appears complete, false if truncated or empty * @returns true if XML appears complete, false if truncated or empty
*/ */
export function isMxCellXmlComplete(xml: string | undefined | null): boolean { export function isMxCellXmlComplete(xml: string | undefined | null): boolean {
let trimmed = xml?.trim() || "" const trimmed = xml?.trim() || ""
if (!trimmed) return false if (!trimmed) return false
// Strip wrapper tags if present // Find position of last complete mxCell ending (either /> or </mxCell>)
let prev = "" const lastSelfClose = trimmed.lastIndexOf("/>")
while (prev !== trimmed) { const lastMxCellClose = trimmed.lastIndexOf("</mxCell>")
prev = trimmed
trimmed = trimmed
.replace(/<\/mxParameter>\s*$/i, "")
.replace(/<\/invoke>\s*$/i, "")
.replace(/<\/antml:parameter>\s*$/i, "")
.replace(/<\/antml:invoke>\s*$/i, "")
.trim()
}
return trimmed.endsWith("/>") || trimmed.endsWith("</mxCell>") const lastValidEnd = Math.max(lastSelfClose, lastMxCellClose)
// No valid ending found at all
if (lastValidEnd === -1) return false
// Check what comes after the last valid ending
// For />: add 2 chars, for </mxCell>: add 9 chars
const endOffset = lastMxCellClose > lastSelfClose ? 9 : 2
const suffix = trimmed.slice(lastValidEnd + endOffset)
// If suffix is empty or only contains closing tags (any provider's wrapper) or whitespace, it's complete
// This regex matches any sequence of closing XML tags like </foo>, </bar>, </DSMLxyz>
return /^(\s*<\/[^>]+>)*\s*$/.test(suffix)
} }