From 0d2e7a7ad67591ccf06a780c62790f51b90e224a Mon Sep 17 00:00:00 2001 From: Dayuan Jiang <34411969+DayuanJiang@users.noreply.github.com> Date: Wed, 24 Dec 2025 09:31:54 +0900 Subject: [PATCH] fix: escape HTML in XML attribute values to prevent parse errors (#386) - Add HTML escaping (<, >) in convertToLegalXml for attribute values - Update isMxCellXmlComplete to handle any LLM provider's wrapper tags - Add wrapper tag stripping in wrapWithMxFile for DeepSeek/Anthropic tags - Update autoFixXml to escape both < and > in attribute values Fixes 'Malformed XML detected in final output' error when AI generates diagrams with HTML content in value attributes like Title. --- lib/utils.ts | 159 +++++++++++++++++----- packages/mcp-server/src/xml-validation.ts | 39 +++--- 2 files changed, 147 insertions(+), 51 deletions(-) diff --git a/lib/utils.ts b/lib/utils.ts index 6ee7ac2..6d36865 100644 --- a/lib/utils.ts +++ b/lib/utils.ts @@ -36,29 +36,32 @@ const VALID_ENTITIES = new Set(["lt", "gt", "amp", "quot", "apos"]) /** * Check if mxCell XML output is complete (not truncated). * Complete XML ends with a self-closing tag (/>) or closing mxCell tag. - * Also handles function-calling wrapper tags that may be incorrectly included. + * Uses a robust approach that handles any LLM provider's wrapper tags + * by finding the last valid mxCell ending and checking if suffix is just closing tags. * @param xml - The XML string to check (can be undefined/null) * @returns true if XML appears complete, false if truncated or empty */ export function isMxCellXmlComplete(xml: string | undefined | null): boolean { - let trimmed = xml?.trim() || "" + const trimmed = xml?.trim() || "" if (!trimmed) return false - // Strip Anthropic function-calling wrapper tags if present - // These can leak into tool input due to AI SDK parsing issues - // Use loop because tags are nested: - let prev = "" - while (prev !== trimmed) { - prev = trimmed - trimmed = trimmed - .replace(/<\/mxParameter>\s*$/i, "") - .replace(/<\/invoke>\s*$/i, "") - .replace(/<\/antml:parameter>\s*$/i, "") - .replace(/<\/antml:invoke>\s*$/i, "") - .trim() - } + // Find position of last complete mxCell ending (either /> or ) + const lastSelfClose = trimmed.lastIndexOf("/>") + const lastMxCellClose = trimmed.lastIndexOf("") - return trimmed.endsWith("/>") || trimmed.endsWith("") + const lastValidEnd = Math.max(lastSelfClose, lastMxCellClose) + + // No valid ending found at all + if (lastValidEnd === -1) return false + + // Check what comes after the last valid ending + // For />: add 2 chars, for : add 9 chars + const endOffset = lastMxCellClose > lastSelfClose ? 9 : 2 + const suffix = trimmed.slice(lastValidEnd + endOffset) + + // If suffix is empty or only contains closing tags (any provider's wrapper) or whitespace, it's complete + // This regex matches any sequence of closing XML tags like , , + return /^(\s*<\/[^>]+>)*\s*$/.test(suffix) } /** @@ -262,6 +265,21 @@ export function convertToLegalXml(xmlString: string): string { "&", ) + // Fix unescaped < and > in attribute values for XML parsing + // HTML content in value attributes (e.g., Title) needs to be escaped + // This is critical because DOMParser will fail on unescaped < > in attributes + if (/=\s*"[^"]*<[^"]*"/.test(cellContent)) { + cellContent = cellContent.replace( + /=\s*"([^"]*)"/g, + (_match, value) => { + const escaped = value + .replace(//g, ">") + return `="${escaped}"` + }, + ) + } + // Indent each line of the matched block for readability. const formatted = cellContent .split("\n") @@ -306,6 +324,20 @@ export function wrapWithMxFile(xml: string): string { content = xml.replace(/<\/?root>/g, "").trim() } + // Strip trailing LLM wrapper tags (from any provider: Anthropic, DeepSeek, etc.) + // Find the last valid mxCell ending and remove everything after it + const lastSelfClose = content.lastIndexOf("/>") + const lastMxCellClose = content.lastIndexOf("") + const lastValidEnd = Math.max(lastSelfClose, lastMxCellClose) + if (lastValidEnd !== -1) { + const endOffset = lastMxCellClose > lastSelfClose ? 9 : 2 + const suffix = content.slice(lastValidEnd + endOffset) + // If suffix is only closing tags (wrapper tags), strip it + if (/^(\s*<\/[^>]+>)*\s*$/.test(suffix)) { + content = content.slice(0, lastValidEnd + endOffset) + } + } + // Remove any existing root cells from content (LLM shouldn't include them, but handle it gracefully) // Use flexible patterns that match both self-closing (/>) and non-self-closing (>) formats content = content @@ -910,6 +942,21 @@ export function autoFixXml(xml: string): { fixed: string; fixes: string[] } { fixes.push("Removed CDATA wrapper") } + // 1b. Strip trailing LLM wrapper tags (DeepSeek, Anthropic, etc.) + // These are closing tags after the last valid mxCell that break XML parsing + const lastSelfClose = fixed.lastIndexOf("/>") + const lastMxCellClose = fixed.lastIndexOf("") + const lastValidEnd = Math.max(lastSelfClose, lastMxCellClose) + if (lastValidEnd !== -1) { + const endOffset = lastMxCellClose > lastSelfClose ? 9 : 2 + const suffix = fixed.slice(lastValidEnd + endOffset) + // If suffix contains only closing tags (wrapper tags) or whitespace, strip it + if (/^(\s*<\/[^>]+>)+\s*$/.test(suffix)) { + fixed = fixed.slice(0, lastValidEnd + endOffset) + fixes.push("Stripped trailing LLM wrapper tags") + } + } + // 2. Remove text before XML declaration or root element (only if it's garbage text, not valid XML) const xmlStart = fixed.search(/<(\?xml|mxGraphModel|mxfile)/i) if (xmlStart > 0 && !/^<[a-zA-Z]/.test(fixed.trim())) { @@ -1015,8 +1062,8 @@ export function autoFixXml(xml: string): { fixed: string; fixes: string[] } { fixes.push("Removed quotes around color values in style") } - // 4. Fix unescaped < in attribute values - // This is tricky - we need to find < inside quoted attribute values + // 4. Fix unescaped < and > in attribute values + // < is required to be escaped, > is not strictly required but we escape for consistency const attrPattern = /(=\s*")([^"]*?)(<)([^"]*?)(")/g let attrMatch let hasUnescapedLt = false @@ -1027,12 +1074,12 @@ export function autoFixXml(xml: string): { fixed: string; fixes: string[] } { } } if (hasUnescapedLt) { - // Replace < with < inside attribute values + // Replace < and > with < and > inside attribute values fixed = fixed.replace(/=\s*"([^"]*)"/g, (_match, value) => { - const escaped = value.replace(//g, ">") return `="${escaped}"` }) - fixes.push("Escaped < characters in attribute values") + fixes.push("Escaped <> characters in attribute values") } // 5. Fix invalid character references (remove malformed ones) @@ -1120,7 +1167,8 @@ export function autoFixXml(xml: string): { fixed: string; fixes: string[] } { } // 8c. Remove non-draw.io tags (after typo fixes so lowercase variants are fixed first) - // Valid draw.io tags: mxfile, diagram, mxGraphModel, root, mxCell, mxGeometry, mxPoint, Array, Object + // IMPORTANT: Only remove tags at the element level, NOT inside quoted attribute values + // Tags like ,
inside value="text" should be preserved (they're HTML content) const validDrawioTags = new Set([ "mxfile", "diagram", @@ -1133,25 +1181,59 @@ export function autoFixXml(xml: string): { fixed: string; fixes: string[] } { "Object", "mxRectangle", ]) + + // Helper: Check if a position is inside a quoted attribute value + // by counting unescaped quotes before that position + const isInsideQuotes = (str: string, pos: number): boolean => { + let inQuote = false + let quoteChar = "" + for (let i = 0; i < pos && i < str.length; i++) { + const c = str[i] + if (inQuote) { + if (c === quoteChar) inQuote = false + } else if (c === '"' || c === "'") { + // Check if this quote is part of an attribute (preceded by =) + // Look back for = sign + let j = i - 1 + while (j >= 0 && /\s/.test(str[j])) j-- + if (j >= 0 && str[j] === "=") { + inQuote = true + quoteChar = c + } + } + } + return inQuote + } + const foreignTagPattern = /<\/?([a-zA-Z][a-zA-Z0-9_]*)[^>]*>/g let foreignMatch const foreignTags = new Set() + const foreignTagPositions: Array<{ + tag: string + start: number + end: number + }> = [] + while ((foreignMatch = foreignTagPattern.exec(fixed)) !== null) { const tagName = foreignMatch[1] - if (!validDrawioTags.has(tagName)) { - foreignTags.add(tagName) - } + // Skip if this is a valid draw.io tag + if (validDrawioTags.has(tagName)) continue + // Skip if this tag is inside a quoted attribute value + if (isInsideQuotes(fixed, foreignMatch.index)) continue + + foreignTags.add(tagName) + foreignTagPositions.push({ + tag: tagName, + start: foreignMatch.index, + end: foreignMatch.index + foreignMatch[0].length, + }) } - if (foreignTags.size > 0) { - console.log( - "[autoFixXml] Step 8c: Found foreign tags:", - Array.from(foreignTags), - ) - for (const tag of foreignTags) { - // Remove opening tags (with or without attributes) - fixed = fixed.replace(new RegExp(`<${tag}[^>]*>`, "gi"), "") - // Remove closing tags - fixed = fixed.replace(new RegExp(``, "gi"), "") + + if (foreignTagPositions.length > 0) { + // Remove tags from end to start to preserve indices + foreignTagPositions.sort((a, b) => b.start - a.start) + for (const { start, end } of foreignTagPositions) { + fixed = fixed.slice(0, start) + fixed.slice(end) } fixes.push( `Removed foreign tags: ${Array.from(foreignTags).join(", ")}`, @@ -1202,6 +1284,7 @@ export function autoFixXml(xml: string): { fixed: string; fixes: string[] } { // 10b. Remove extra closing tags (more closes than opens) // Need to properly count self-closing tags (they don't need closing tags) + // IMPORTANT: Only count tags at element level, NOT inside quoted attribute values const tagCounts = new Map< string, { opens: number; closes: number; selfClosing: number } @@ -1210,12 +1293,18 @@ export function autoFixXml(xml: string): { fixed: string; fixes: string[] } { const fullTagPattern = /<(\/?[a-zA-Z][a-zA-Z0-9]*)[^>]*>/g let tagCountMatch while ((tagCountMatch = fullTagPattern.exec(fixed)) !== null) { + // Skip tags inside quoted attribute values (e.g., value="Title") + if (isInsideQuotes(fixed, tagCountMatch.index)) continue + const fullMatch = tagCountMatch[0] // e.g., "" or "" const tagPart = tagCountMatch[1] // e.g., "mxCell" or "/mxCell" const isClosing = tagPart.startsWith("/") const isSelfClosing = fullMatch.endsWith("/>") const tagName = isClosing ? tagPart.slice(1) : tagPart + // Only count valid draw.io tags - skip partial/invalid tags like "mx" from streaming + if (!validDrawioTags.has(tagName)) continue + let counts = tagCounts.get(tagName) if (!counts) { counts = { opens: 0, closes: 0, selfClosing: 0 } diff --git a/packages/mcp-server/src/xml-validation.ts b/packages/mcp-server/src/xml-validation.ts index d6c2daf..278978d 100644 --- a/packages/mcp-server/src/xml-validation.ts +++ b/packages/mcp-server/src/xml-validation.ts @@ -459,7 +459,8 @@ export function autoFixXml(xml: string): { fixed: string; fixes: string[] } { fixes.push("Removed quotes around color values in style") } - // 10. Fix unescaped < in attribute values + // 10. Fix unescaped < and > in attribute values + // < is required to be escaped, > is not strictly required but we escape for consistency const attrPattern = /(=\s*")([^"]*?)(<)([^"]*?)(")/g let attrMatch let hasUnescapedLt = false @@ -471,10 +472,10 @@ export function autoFixXml(xml: string): { fixed: string; fixes: string[] } { } if (hasUnescapedLt) { fixed = fixed.replace(/=\s*"([^"]*)"/g, (_match, value) => { - const escaped = value.replace(//g, ">") return `="${escaped}"` }) - fixes.push("Escaped < characters in attribute values") + fixes.push("Escaped <> characters in attribute values") } // 11. Fix invalid hex character references @@ -903,24 +904,30 @@ export function validateAndFixXml(xml: string): { /** * Check if mxCell XML output is complete (not truncated). + * Uses a robust approach that handles any LLM provider's wrapper tags + * by finding the last valid mxCell ending and checking if suffix is just closing tags. * @param xml - The XML string to check (can be undefined/null) * @returns true if XML appears complete, false if truncated or empty */ export function isMxCellXmlComplete(xml: string | undefined | null): boolean { - let trimmed = xml?.trim() || "" + const trimmed = xml?.trim() || "" if (!trimmed) return false - // Strip wrapper tags if present - let prev = "" - while (prev !== trimmed) { - prev = trimmed - trimmed = trimmed - .replace(/<\/mxParameter>\s*$/i, "") - .replace(/<\/invoke>\s*$/i, "") - .replace(/<\/antml:parameter>\s*$/i, "") - .replace(/<\/antml:invoke>\s*$/i, "") - .trim() - } + // Find position of last complete mxCell ending (either /> or ) + const lastSelfClose = trimmed.lastIndexOf("/>") + const lastMxCellClose = trimmed.lastIndexOf("") - return trimmed.endsWith("/>") || trimmed.endsWith("") + const lastValidEnd = Math.max(lastSelfClose, lastMxCellClose) + + // No valid ending found at all + if (lastValidEnd === -1) return false + + // Check what comes after the last valid ending + // For />: add 2 chars, for : add 9 chars + const endOffset = lastMxCellClose > lastSelfClose ? 9 : 2 + const suffix = trimmed.slice(lastValidEnd + endOffset) + + // If suffix is empty or only contains closing tags (any provider's wrapper) or whitespace, it's complete + // This regex matches any sequence of closing XML tags like , , + return /^(\s*<\/[^>]+>)*\s*$/.test(suffix) }