2025-12-06 12:46:40 +09:00
import { type ClassValue , clsx } from "clsx"
import * as pako from "pako"
2025-03-19 06:04:06 +00:00
import { twMerge } from "tailwind-merge"
/**
 * Build a single className string from any mix of clsx-compatible inputs.
 * The inputs are first flattened by `clsx`, then passed through `twMerge`.
 */
export function cn(...inputs: ClassValue[]) {
    const joined = clsx(inputs)
    return twMerge(joined)
}
2025-03-22 15:45:49 +00:00
feat: add XML auto-fix with refined validation logic (#247)
* feat: add XML auto-fix and improve validator accuracy
- Add autoFixXml() to automatically repair common XML issues:
- CDATA wrapper removal
- Duplicate attribute removal
- Unescaped & and < character escaping
- Invalid entity reference fixing
- Unclosed tag completion
- Nested mxCell flattening
- Duplicate ID renaming
- Improve validateMxCellStructure() with DOM + regex approach:
- Use DOMParser for syntax error detection (94% recall)
- Add regex checks for edge cases
- Stateful parser for handling > in attribute values
- Integrate validateAndFixXml() in chat-message-display and diagram-context
- Auto-repair invalid XML before loading
- Log fixes applied for debugging
Metrics: 99.77% accuracy, 94.06% recall, 94.4% auto-fix success rate
* fix: improve XML auto-fix from 58.7% to 99% fix rate
Key improvements:
- Reorder CDATA removal to run before text-before-root check (+35 cases)
- Implement Gemini's backslash-quote fix with regex backreference
Handles attr="value", value="text\"inner\"more", and mixed patterns
- Add aggressive drop-broken-cells fix for unfixable mxCell elements
Iteratively removes cells causing DOM parse errors (up to 50)
Results on 9,411 XML dataset:
- 206 invalid XMLs detected
- 204 successfully fixed (99.0% fix rate)
- 2 unfixable (completely broken, need regeneration)
* refactor: extract XML validation/fix helpers and add constants
- Add constants: MAX_XML_SIZE (1MB), MAX_DROP_ITERATIONS (10), STRUCTURAL_ATTRS, VALID_ENTITIES
- Extract parseXmlTags helper for shared tag parsing logic
- Extract validation helpers: checkDuplicateAttributes, checkDuplicateIds, checkTagMismatches, checkCharacterReferences, checkEntityReferences, checkNestedMxCells
- Simplify validateMxCellStructure from ~200 lines to ~55 lines
- Add logging to empty catch block in DOMParser section
- Add size warning for large XML documents
- Remove unused variables (isSelfClose, duplicate idPattern)
* fix: improve XML auto-fix with malformed quote pattern
- Fix ="..." pattern where " was used as delimiter instead of actual quotes
- Common in dashPattern attributes like dashPattern="1 1;"
2025-12-13 23:31:01 +09:00
// ============================================================================
// XML Validation/Fix Constants
// ============================================================================
/** Upper bound (1,000,000) on the XML size to process; larger documents may be slow. */
const MAX_XML_SIZE = 1_000_000

/** Cap on the number of aggressive cell-dropping passes, so the loop always terminates. */
const MAX_DROP_ITERATIONS = 10

/** draw.io structural attributes that must appear at most once per tag. */
const STRUCTURAL_ATTRS = ["edge", "parent", "source", "target", "vertex", "connectable"]

/** Names of the five predefined XML entities. */
const VALID_ENTITIES = new Set(["lt", "gt", "amp", "quot", "apos"])
2025-12-14 14:04:44 +09:00
// ============================================================================
// mxCell XML Helpers
// ============================================================================
/**
 * Decide whether a piece of mxCell XML looks complete rather than truncated.
 *
 * The text is considered complete when, after trimming, it ends with a
 * self-closing tag (`/>`) or with a closing `</mxCell>` tag. Trailing
 * function-calling wrapper closing tags that may have leaked into the input
 * are stripped before the check.
 *
 * @param xml - The XML string to check (may be undefined or null)
 * @returns true if the XML appears complete, false if it is empty or truncated
 */
export function isMxCellXmlComplete(xml: string | undefined | null): boolean {
    let candidate = xml?.trim() || ""
    if (candidate === "") return false

    // Wrapper closing tags that can trail the real XML. They may be stacked
    // (e.g. </mxCell></mxParameter></invoke>), so stripping repeats until a
    // full pass leaves the text unchanged.
    const wrapperClosers = [
        /<\/mxParameter>\s*$/i,
        /<\/invoke>\s*$/i,
        /<\/antml:parameter>\s*$/i,
        /<\/antml:invoke>\s*$/i,
    ]

    let before: string
    do {
        before = candidate
        for (const closer of wrapperClosers) {
            candidate = candidate.replace(closer, "")
        }
        candidate = candidate.trim()
    } while (candidate !== before)

    return ["/>", "</mxCell>"].some((ending) => candidate.endsWith(ending))
}
feat: add XML auto-fix with refined validation logic (#247)
* feat: add XML auto-fix and improve validator accuracy
- Add autoFixXml() to automatically repair common XML issues:
- CDATA wrapper removal
- Duplicate attribute removal
- Unescaped & and < character escaping
- Invalid entity reference fixing
- Unclosed tag completion
- Nested mxCell flattening
- Duplicate ID renaming
- Improve validateMxCellStructure() with DOM + regex approach:
- Use DOMParser for syntax error detection (94% recall)
- Add regex checks for edge cases
- Stateful parser for handling > in attribute values
- Integrate validateAndFixXml() in chat-message-display and diagram-context
- Auto-repair invalid XML before loading
- Log fixes applied for debugging
Metrics: 99.77% accuracy, 94.06% recall, 94.4% auto-fix success rate
* fix: improve XML auto-fix from 58.7% to 99% fix rate
Key improvements:
- Reorder CDATA removal to run before text-before-root check (+35 cases)
- Implement Gemini's backslash-quote fix with regex backreference
Handles attr="value", value="text\"inner\"more", and mixed patterns
- Add aggressive drop-broken-cells fix for unfixable mxCell elements
Iteratively removes cells causing DOM parse errors (up to 50)
Results on 9,411 XML dataset:
- 206 invalid XMLs detected
- 204 successfully fixed (99.0% fix rate)
- 2 unfixable (completely broken, need regeneration)
* refactor: extract XML validation/fix helpers and add constants
- Add constants: MAX_XML_SIZE (1MB), MAX_DROP_ITERATIONS (10), STRUCTURAL_ATTRS, VALID_ENTITIES
- Extract parseXmlTags helper for shared tag parsing logic
- Extract validation helpers: checkDuplicateAttributes, checkDuplicateIds, checkTagMismatches, checkCharacterReferences, checkEntityReferences, checkNestedMxCells
- Simplify validateMxCellStructure from ~200 lines to ~55 lines
- Add logging to empty catch block in DOMParser section
- Add size warning for large XML documents
- Remove unused variables (isSelfClose, duplicate idPattern)
* fix: improve XML auto-fix with malformed quote pattern
- Fix ="..." pattern where " was used as delimiter instead of actual quotes
- Common in dashPattern attributes like dashPattern="1 1;"
2025-12-13 23:31:01 +09:00
// ============================================================================
// XML Parsing Helpers
// ============================================================================
/** One tag located by `parseXmlTags`, together with its position in the source string. */
interface ParsedTag {
    /** Full tag text, from `<` through the matching `>` */
    tag: string
    /** Element name (may contain `:`, `_` and `-`) */
    tagName: string
    /** True for `</name>` */
    isClosing: boolean
    /** True when the tag text ends with `/>` */
    isSelfClosing: boolean
    /** Index of the tag's `<` in the source string */
    startIndex: number
    /** Index of the tag's closing `>` in the source string */
    endIndex: number
}

/**
 * Scan an XML string and list its element tags in order of appearance.
 *
 * A `>` inside a single- or double-quoted attribute value does not end the
 * tag. Anything that starts with `<` but is not followed by an element name
 * (`<?xml ...?>`, `<!-- -->`, ...) is skipped, and scanning stops at a tag
 * that is never closed by `>`.
 *
 * @param xml - The XML text to scan
 * @returns The tags found, in document order
 */
function parseXmlTags(xml: string): ParsedTag[] {
    const namePattern = /^<(\/?)([a-zA-Z][a-zA-Z0-9:_-]*)/
    const found: ParsedTag[] = []
    let cursor = 0

    while (cursor < xml.length) {
        const open = xml.indexOf("<", cursor)
        if (open < 0) break

        // Walk forward to the `>` that ends this tag, ignoring any `>` that
        // sits inside a quoted attribute value.
        let close = open + 1
        let activeQuote: string | null = null
        for (; close < xml.length; close++) {
            const ch = xml[close]
            if (activeQuote !== null) {
                if (ch === activeQuote) activeQuote = null
                continue
            }
            if (ch === '"' || ch === "'") {
                activeQuote = ch
            } else if (ch === ">") {
                break
            }
        }
        // Ran off the end without finding `>`: the last tag is unterminated.
        if (close >= xml.length) break

        cursor = close + 1
        const raw = xml.slice(open, cursor)
        const nameMatch = namePattern.exec(raw)
        if (nameMatch === null) continue

        found.push({
            tag: raw,
            tagName: nameMatch[2],
            isClosing: nameMatch[1] === "/",
            isSelfClosing: raw.endsWith("/>"),
            startIndex: open,
            endIndex: close,
        })
    }

    return found
}
2025-08-31 20:52:04 +09:00
/**
 * Pretty-print an XML string, one tag per line, indented by nesting depth.
 *
 * Whitespace between adjacent tags is discarded first. An element that holds
 * only text is kept on a single line (`<a>text</a>`) and the text itself is
 * emitted unchanged. Declarations and comments (`<?xml ...?>`, `<!-- -->`)
 * are written on their own line and do not affect the depth.
 *
 * @param xml - The XML string to format
 * @param indent - The string repeated once per nesting level
 * @returns The formatted XML, without leading or trailing whitespace
 */
export function formatXML(xml: string, indent: string = " "): string {
    // Drop whitespace between tags, then split into tags and text runs.
    const nodes = xml
        .replace(/>\s*</g, "><")
        .trim()
        .split(/(?=<)|(?<=>)/g)
        .filter(Boolean)

    let formatted = ""
    let depth = 0
    // True while the current output line already has content and no newline
    // yet, i.e. an opening tag and/or text waiting for what follows.
    let midLine = false

    nodes.forEach((node, index) => {
        if (!node.startsWith("<")) {
            // Text content is emitted verbatim.
            formatted += node
            midLine = true
            return
        }

        if (/^<\/\w/.test(node)) {
            // Closing tag: step out one level. Directly after text it stays on
            // the same line so no whitespace is injected into the content.
            depth = Math.max(0, depth - 1)
            formatted += (midLine ? "" : indent.repeat(depth)) + node + "\n"
            midLine = false
            return
        }

        // Any other tag following text (mixed content) starts a fresh line.
        if (midLine) {
            formatted += "\n"
            midLine = false
        }
        formatted += indent.repeat(depth) + node

        const isSelfClosing = /^<\w[^>]*\/>$/.test(node)
        const isOpening = !isSelfClosing && /^<\w[^>]*>$/.test(node)
        if (isOpening) {
            // Always step in, so the matching closing tag restores the depth
            // even when the element only contains text.
            depth++
            // Look at the node that really follows this one (by position, not
            // by searching for an equal string) to keep inline text inline.
            const next = nodes[index + 1]
            if (next !== undefined && !next.startsWith("<")) {
                midLine = true
                return
            }
        }
        formatted += "\n"
    })

    return formatted.trim()
}
2025-12-06 12:46:40 +09:00
/**
 * Turn a possibly truncated stream of mxCell XML into a well-formed `<root>` document.
 *
 * Only complete cells are kept: either self-closing `<mxCell .../>` or a full
 * `<mxCell ...> ... </mxCell>` pair. A trailing cell that was cut off does not
 * match and is dropped. Each kept cell is then cleaned up:
 * - if the cell has no `<Array as="points">`, self-closing `<mxPoint>` elements
 *   without an `as` attribute are removed (draw.io rejects them with
 *   "Could not add object mxPoint");
 * - every `&` that does not start a predefined entity or a numeric character
 *   reference is escaped, so that a value such as "a & b" parses.
 *
 * @param xmlString The potentially incomplete XML string
 * @returns A `<root>`-wrapped XML string holding the complete, cleaned cells
 */
export function convertToLegalXml(xmlString: string): string {
    // Self-closing <mxCell .../> or block <mxCell ...>...</mxCell>; an
    // unfinished cell matches neither alternative and is skipped.
    const cellPattern = /<mxCell\b[^>]*(?:\/>|>([\s\S]*?)<\/mxCell>)/g
    // An ampersand that is not the start of a valid entity / character reference.
    const bareAmpersand = /&(?!(?:lt|gt|amp|quot|apos|#[0-9]+|#x[0-9a-fA-F]+);)/g

    let result = "<root>\n"
    for (const match of xmlString.matchAll(cellPattern)) {
        let cellContent = match[0]

        // With an <Array as="points"> present all mxPoints are kept; otherwise
        // only those carrying an 'as' attribute (sourcePoint, targetPoint, ...).
        const hasArrayPoints = /<Array\s+as="points">/.test(cellContent)
        if (!hasArrayPoints) {
            cellContent = cellContent.replace(/<mxPoint\b[^>]*\/>/g, (pointMatch) =>
                /\sas=/.test(pointMatch) ? pointMatch : "",
            )
        }

        // Escape bare ampersands so the cell is well-formed XML.
        cellContent = cellContent.replace(bareAmpersand, "&amp;")

        // Re-indent every line of the cell and drop lines left empty by
        // removed mxPoints.
        const formatted = cellContent
            .split("\n")
            .map((line) => line.trim())
            .filter((line) => line !== "")
            .map((line) => " " + line)
            .join("\n")
        result += formatted + "\n"
    }
    result += "</root>"

    return result
}
2025-03-25 08:56:24 +00:00
2025-12-09 15:53:59 +09:00
/**
 * Produce the full mxfile document that draw.io expects from partial XML.
 *
 * - Empty input yields an mxfile holding only the two root cells.
 * - Input that already contains `<mxfile` is returned untouched.
 * - Input that contains `<mxGraphModel` is wrapped in `<mxfile><diagram>` only.
 * - Anything else is treated as cell content: a surrounding `<root>` wrapper
 *   and any root cells (id="0" / id="1") already present are stripped, then
 *   the root cells are added back in front of the remaining cells.
 *
 * @param xml - The XML string (bare mxCells, <root>, <mxGraphModel>, or full <mxfile>)
 * @returns Full mxfile-wrapped XML string with root cells included
 */
export function wrapWithMxFile(xml: string): string {
    const ROOT_CELLS = '<mxCell id="0"/><mxCell id="1" parent="0"/>'

    const wrapModel = (model: string): string =>
        `<mxfile><diagram name="Page-1" id="page-1">${model}</diagram></mxfile>`
    const wrapCells = (cells: string): string =>
        wrapModel(`<mxGraphModel><root>${ROOT_CELLS}${cells}</root></mxGraphModel>`)

    if (!xml || !xml.trim()) return wrapCells("")
    if (xml.includes("<mxfile")) return xml
    if (xml.includes("<mxGraphModel")) return wrapModel(xml)

    // Unwrap <root> ... </root> when present.
    const inner = xml.includes("<root>")
        ? xml.replace(/<\/?root>/g, "").trim()
        : xml

    // Strip root cells supplied by the caller; both the self-closing (/>) and
    // the empty-element (></mxCell>) spellings are recognised.
    const rootCellPatterns = [
        /<mxCell[^>]*\bid=["']0["'][^>]*(?:\/>|><\/mxCell>)/g,
        /<mxCell[^>]*\bid=["']1["'][^>]*(?:\/>|><\/mxCell>)/g,
    ]
    let cells = inner
    for (const pattern of rootCellPatterns) {
        cells = cells.replace(pattern, "")
    }

    return wrapCells(cells.trim())
}
2025-03-25 08:56:24 +00:00
/**
 * Replace every node under `mxGraphModel > root` of a draw.io document.
 *
 * The existing children of the root are removed and the children of the
 * `<root>` in `nodes` are imported in their place (`nodes` is wrapped in
 * `<root>` first if it has none). If the imported nodes do not include the
 * base cells, `<mxCell id="0"/>` is inserted first and
 * `<mxCell id="1" parent="0"/>` right after it.
 *
 * @param currentXML - The original draw.io XML string
 * @param nodes - The XML string containing the nodes to put into the diagram
 * @returns The serialized document with the nodes replaced
 * @throws Error if either argument is empty, or if any step of the replacement fails
 */
export function replaceNodes(currentXML: string, nodes: string): string {
    if (!currentXML || !nodes) {
        throw new Error("Both currentXML and nodes must be provided")
    }

    try {
        const domParser = new DOMParser()
        const targetDoc = domParser.parseFromString(currentXML, "text/xml")

        // The incoming nodes need a single <root> parent to be parseable.
        const wrappedNodes = nodes.includes("<root>")
            ? nodes
            : `<root>${nodes}</root>`
        const sourceDoc = domParser.parseFromString(wrappedNodes, "text/xml")

        // Locate the destination <root>, building mxGraphModel/root if absent.
        let targetRoot = targetDoc.querySelector("mxGraphModel > root")
        if (!targetRoot) {
            const model =
                targetDoc.querySelector("mxGraphModel") ||
                targetDoc.createElement("mxGraphModel")
            if (!targetDoc.contains(model)) {
                targetDoc.appendChild(model)
            }
            targetRoot = targetDoc.createElement("root")
            model.appendChild(targetRoot)
        }
        const destination = targetRoot

        const sourceRoot = sourceDoc.querySelector("root")
        if (!sourceRoot) {
            throw new Error(
                "Invalid nodes: Could not find or create <root> element",
            )
        }

        // Empty the destination before importing.
        for (
            let child = destination.firstChild;
            child;
            child = destination.firstChild
        ) {
            destination.removeChild(child)
        }

        // Note which base cells the incoming nodes already provide.
        const incoming = Array.from(sourceRoot.childNodes)
        const providesCell = (id: string): boolean =>
            incoming.some(
                (child) =>
                    child.nodeName === "mxCell" &&
                    (child as Element).getAttribute("id") === id,
            )
        const hasCell0 = providesCell("0")
        const hasCell1 = providesCell("1")

        // Import (deep-copy) every incoming node into the destination.
        for (const child of incoming) {
            destination.appendChild(targetDoc.importNode(child, true))
        }

        // Add whichever base cells are missing.
        if (!hasCell0) {
            const baseCell = targetDoc.createElement("mxCell")
            baseCell.setAttribute("id", "0")
            destination.insertBefore(baseCell, destination.firstChild)
        }
        if (!hasCell1) {
            const layerCell = targetDoc.createElement("mxCell")
            layerCell.setAttribute("id", "1")
            layerCell.setAttribute("parent", "0")
            // Goes directly after cell 0 when something follows it, else last.
            const anchor = destination.querySelector('mxCell[id="0"]')
            if (anchor?.nextSibling) {
                destination.insertBefore(layerCell, anchor.nextSibling)
            } else {
                destination.appendChild(layerCell)
            }
        }

        return new XMLSerializer().serializeToString(targetDoc)
    } catch (error) {
        throw new Error(`Error replacing nodes: ${error}`)
    }
}
2025-12-15 14:22:56 +09:00
// ============================================================================
// ID-based Diagram Operations
// ============================================================================
2025-12-04 13:26:06 +09:00
2025-12-15 14:22:56 +09:00
/** The kinds of edit that can be applied to a diagram cell. */
type DiagramOperationKind = "update" | "add" | "delete"

/** A single edit request addressed to one cell by its id. */
export interface DiagramOperation {
    type: DiagramOperationKind
    /** Id of the cell the operation targets */
    cell_id: string
    /** XML for the cell; optional in the type, e.g. omitted for "delete" */
    new_xml?: string
}

/** Describes an operation that was not applied. */
export interface OperationError {
    type: DiagramOperationKind
    /** Id of the cell the failed operation targeted */
    cellId: string
    /** Human-readable reason for the failure */
    message: string
}

/** Outcome of applying a batch of operations. */
export interface ApplyOperationsResult {
    /** The resulting XML string */
    result: string
    /** One entry per operation that could not be applied */
    errors: OperationError[]
}
2025-08-31 20:52:04 +09:00
/**
 * Parse and validate the `new_xml` payload of an update/add operation.
 *
 * @returns The parsed mxCell element when the payload is usable, otherwise a
 *          string holding the error message to report for this operation.
 */
function parseOperationCell(
    parser: DOMParser,
    op: DiagramOperation,
): Element | string {
    if (!op.new_xml) {
        return `new_xml is required for ${op.type} operation`
    }

    // A wrapper element gives the fragment a single root so it can be parsed.
    const fragment = parser.parseFromString(
        `<wrapper>${op.new_xml}</wrapper>`,
        "text/xml",
    )
    // Malformed XML is reported through a <parsererror> element, not an
    // exception; reject it rather than importing a partially parsed cell.
    const fragmentError = fragment.querySelector("parsererror")
    if (fragmentError) {
        return `new_xml parse error: ${fragmentError.textContent}`
    }

    const cell = fragment.querySelector("mxCell")
    if (!cell) {
        return "new_xml must contain an mxCell element"
    }

    const newCellId = cell.getAttribute("id")
    if (newCellId !== op.cell_id) {
        return `ID mismatch: cell_id is "${op.cell_id}" but new_xml has id="${newCellId}"`
    }
    return cell
}

/**
 * Apply diagram operations (update/add/delete) using ID-based lookup on the
 * parsed DOM.
 *
 * Operations are applied in order. One that cannot be applied is recorded in
 * `errors` and skipped; the remaining operations still run.
 * - update: the cell must exist; it is replaced by the mxCell in `new_xml`.
 * - add: the id must not exist yet; the mxCell in `new_xml` is appended to `<root>`.
 * - delete: the cell must exist; it is removed. Edges whose source/target
 *   point at it are only reported with `console.warn`, not removed.
 * For update and add, `new_xml` must contain an mxCell whose id equals `cell_id`.
 *
 * If `xmlContent` cannot be parsed, or has no `<root>` element, the input is
 * returned unchanged together with a single error.
 *
 * @param xmlContent - The full mxfile XML content
 * @param operations - Array of operations to apply
 * @returns Object with result XML and any errors
 */
export function applyDiagramOperations(
    xmlContent: string,
    operations: DiagramOperation[],
): ApplyOperationsResult {
    const errors: OperationError[] = []

    const parser = new DOMParser()
    const doc = parser.parseFromString(xmlContent, "text/xml")

    // Parse failures surface as a <parsererror> element in the document.
    const parseError = doc.querySelector("parsererror")
    if (parseError) {
        return {
            result: xmlContent,
            errors: [
                {
                    type: "update",
                    cellId: "",
                    message: `XML parse error: ${parseError.textContent}`,
                },
            ],
        }
    }

    const root = doc.querySelector("root")
    if (!root) {
        return {
            result: xmlContent,
            errors: [
                {
                    type: "update",
                    cellId: "",
                    message: "Could not find <root> element in XML",
                },
            ],
        }
    }

    // Index the existing cells by id for direct lookup.
    const cellMap = new Map<string, Element>()
    root.querySelectorAll("mxCell").forEach((cell) => {
        const id = cell.getAttribute("id")
        if (id) cellMap.set(id, cell)
    })

    for (const op of operations) {
        if (op.type === "update" || op.type === "add") {
            const existingCell = cellMap.get(op.cell_id)

            if (op.type === "update" && !existingCell) {
                errors.push({
                    type: "update",
                    cellId: op.cell_id,
                    message: `Cell with id="${op.cell_id}" not found`,
                })
                continue
            }
            if (op.type === "add" && existingCell) {
                errors.push({
                    type: "add",
                    cellId: op.cell_id,
                    message: `Cell with id="${op.cell_id}" already exists`,
                })
                continue
            }

            const parsed = parseOperationCell(parser, op)
            if (typeof parsed === "string") {
                errors.push({
                    type: op.type,
                    cellId: op.cell_id,
                    message: parsed,
                })
                continue
            }

            // Copy the new cell into this document, then replace or append.
            const importedNode = doc.importNode(parsed, true)
            if (op.type === "update" && existingCell) {
                existingCell.parentNode?.replaceChild(importedNode, existingCell)
            } else {
                root.appendChild(importedNode)
            }
            cellMap.set(op.cell_id, importedNode)
        } else if (op.type === "delete") {
            const existingCell = cellMap.get(op.cell_id)
            if (!existingCell) {
                errors.push({
                    type: "delete",
                    cellId: op.cell_id,
                    message: `Cell with id="${op.cell_id}" not found`,
                })
                continue
            }

            // Find edges that reference this cell (warning only, still delete).
            // Attributes are compared directly instead of building a CSS
            // selector from the id: an id containing a quote or backslash
            // would make the selector invalid and querySelectorAll would throw.
            const referencingEdges = Array.from(
                root.querySelectorAll("mxCell"),
            ).filter(
                (cell) =>
                    cell.getAttribute("source") === op.cell_id ||
                    cell.getAttribute("target") === op.cell_id,
            )
            if (referencingEdges.length > 0) {
                const edgeIds = referencingEdges
                    .map((edge) => edge.getAttribute("id"))
                    .join(", ")
                console.warn(
                    `[applyDiagramOperations] Deleting cell "${op.cell_id}" which is referenced by edges: ${edgeIds}`,
                )
            }

            existingCell.parentNode?.removeChild(existingCell)
            cellMap.delete(op.cell_id)
        }
    }

    const serializer = new XMLSerializer()
    const result = serializer.serializeToString(doc)

    return { result, errors }
}
2025-03-27 06:45:38 +00:00
feat: add XML auto-fix with refined validation logic (#247)
* feat: add XML auto-fix and improve validator accuracy
- Add autoFixXml() to automatically repair common XML issues:
- CDATA wrapper removal
- Duplicate attribute removal
- Unescaped & and < character escaping
- Invalid entity reference fixing
- Unclosed tag completion
- Nested mxCell flattening
- Duplicate ID renaming
- Improve validateMxCellStructure() with DOM + regex approach:
- Use DOMParser for syntax error detection (94% recall)
- Add regex checks for edge cases
- Stateful parser for handling > in attribute values
- Integrate validateAndFixXml() in chat-message-display and diagram-context
- Auto-repair invalid XML before loading
- Log fixes applied for debugging
Metrics: 99.77% accuracy, 94.06% recall, 94.4% auto-fix success rate
* fix: improve XML auto-fix from 58.7% to 99% fix rate
Key improvements:
- Reorder CDATA removal to run before text-before-root check (+35 cases)
- Implement Gemini's backslash-quote fix with regex backreference
Handles attr="value", value="text\"inner\"more", and mixed patterns
- Add aggressive drop-broken-cells fix for unfixable mxCell elements
Iteratively removes cells causing DOM parse errors (up to 50)
Results on 9,411 XML dataset:
- 206 invalid XMLs detected
- 204 successfully fixed (99.0% fix rate)
- 2 unfixable (completely broken, need regeneration)
* refactor: extract XML validation/fix helpers and add constants
- Add constants: MAX_XML_SIZE (1MB), MAX_DROP_ITERATIONS (10), STRUCTURAL_ATTRS, VALID_ENTITIES
- Extract parseXmlTags helper for shared tag parsing logic
- Extract validation helpers: checkDuplicateAttributes, checkDuplicateIds, checkTagMismatches, checkCharacterReferences, checkEntityReferences, checkNestedMxCells
- Simplify validateMxCellStructure from ~200 lines to ~55 lines
- Add logging to empty catch block in DOMParser section
- Add size warning for large XML documents
- Remove unused variables (isSelfClose, duplicate idPattern)
* fix: improve XML auto-fix with malformed quote pattern
- Fix ="..." pattern where " was used as delimiter instead of actual quotes
- Common in dashPattern attributes like dashPattern="1 1;"
2025-12-13 23:31:01 +09:00
// ============================================================================
// Validation Helper Functions
// ============================================================================
2025-12-03 16:14:53 +09:00
feat: add XML auto-fix with refined validation logic (#247)
* feat: add XML auto-fix and improve validator accuracy
- Add autoFixXml() to automatically repair common XML issues:
- CDATA wrapper removal
- Duplicate attribute removal
- Unescaped & and < character escaping
- Invalid entity reference fixing
- Unclosed tag completion
- Nested mxCell flattening
- Duplicate ID renaming
- Improve validateMxCellStructure() with DOM + regex approach:
- Use DOMParser for syntax error detection (94% recall)
- Add regex checks for edge cases
- Stateful parser for handling > in attribute values
- Integrate validateAndFixXml() in chat-message-display and diagram-context
- Auto-repair invalid XML before loading
- Log fixes applied for debugging
Metrics: 99.77% accuracy, 94.06% recall, 94.4% auto-fix success rate
* fix: improve XML auto-fix from 58.7% to 99% fix rate
Key improvements:
- Reorder CDATA removal to run before text-before-root check (+35 cases)
- Implement Gemini's backslash-quote fix with regex backreference
Handles attr="value", value="text\"inner\"more", and mixed patterns
- Add aggressive drop-broken-cells fix for unfixable mxCell elements
Iteratively removes cells causing DOM parse errors (up to 50)
Results on 9,411 XML dataset:
- 206 invalid XMLs detected
- 204 successfully fixed (99.0% fix rate)
- 2 unfixable (completely broken, need regeneration)
* refactor: extract XML validation/fix helpers and add constants
- Add constants: MAX_XML_SIZE (1MB), MAX_DROP_ITERATIONS (10), STRUCTURAL_ATTRS, VALID_ENTITIES
- Extract parseXmlTags helper for shared tag parsing logic
- Extract validation helpers: checkDuplicateAttributes, checkDuplicateIds, checkTagMismatches, checkCharacterReferences, checkEntityReferences, checkNestedMxCells
- Simplify validateMxCellStructure from ~200 lines to ~55 lines
- Add logging to empty catch block in DOMParser section
- Add size warning for large XML documents
- Remove unused variables (isSelfClose, duplicate idPattern)
* fix: improve XML auto-fix with malformed quote pattern
- Fix ="..." pattern where " was used as delimiter instead of actual quotes
- Common in dashPattern attributes like dashPattern="1 1;"
2025-12-13 23:31:01 +09:00
/**
 * Check every tag for structural attributes that are declared more than once.
 *
 * Tags are located with a plain `<...>` scan, and an attribute name is counted
 * wherever `name=` follows whitespace inside the tag text. Only names present
 * in STRUCTURAL_ATTRS are reported; other repeated names are ignored.
 *
 * NOTE(review): the attribute scan is not quote-aware, so a `name=` sequence
 * appearing inside a quoted attribute value is counted as well.
 *
 * @param xml - Raw XML text to inspect.
 * @returns An error message for the first tag that repeats a structural
 *   attribute, or null when no tag does.
 */
function checkDuplicateAttributes(xml: string): string | null {
    const structuralSet = new Set(STRUCTURAL_ATTRS)
    const tagPattern = /<[^>]+>/g
    // Hoisted out of the tag loop; matchAll works on a copy, so no lastIndex
    // state leaks from one tag to the next.
    const attrPattern = /\s([a-zA-Z_:][a-zA-Z0-9_:.-]*)\s*=/g

    for (const tagMatch of xml.matchAll(tagPattern)) {
        const tag = tagMatch[0]
        const occurrences = new Map<string, number>()
        for (const attrMatch of tag.matchAll(attrPattern)) {
            const attrName = attrMatch[1]
            occurrences.set(attrName, (occurrences.get(attrName) ?? 0) + 1)
        }

        const duplicates = Array.from(occurrences.entries())
            .filter(([name, count]) => count > 1 && structuralSet.has(name))
            .map(([name]) => name)
        if (duplicates.length > 0) {
            return `Invalid XML: Duplicate structural attribute(s): ${duplicates.join(", ")}. Remove duplicate attributes.`
        }
    }

    return null
}
2025-12-13 15:00:28 +09:00
feat: add XML auto-fix with refined validation logic (#247)
* feat: add XML auto-fix and improve validator accuracy
- Add autoFixXml() to automatically repair common XML issues:
- CDATA wrapper removal
- Duplicate attribute removal
- Unescaped & and < character escaping
- Invalid entity reference fixing
- Unclosed tag completion
- Nested mxCell flattening
- Duplicate ID renaming
- Improve validateMxCellStructure() with DOM + regex approach:
- Use DOMParser for syntax error detection (94% recall)
- Add regex checks for edge cases
- Stateful parser for handling > in attribute values
- Integrate validateAndFixXml() in chat-message-display and diagram-context
- Auto-repair invalid XML before loading
- Log fixes applied for debugging
Metrics: 99.77% accuracy, 94.06% recall, 94.4% auto-fix success rate
* fix: improve XML auto-fix from 58.7% to 99% fix rate
Key improvements:
- Reorder CDATA removal to run before text-before-root check (+35 cases)
- Implement Gemini's backslash-quote fix with regex backreference
Handles attr="value", value="text\"inner\"more", and mixed patterns
- Add aggressive drop-broken-cells fix for unfixable mxCell elements
Iteratively removes cells causing DOM parse errors (up to 50)
Results on 9,411 XML dataset:
- 206 invalid XMLs detected
- 204 successfully fixed (99.0% fix rate)
- 2 unfixable (completely broken, need regeneration)
* refactor: extract XML validation/fix helpers and add constants
- Add constants: MAX_XML_SIZE (1MB), MAX_DROP_ITERATIONS (10), STRUCTURAL_ATTRS, VALID_ENTITIES
- Extract parseXmlTags helper for shared tag parsing logic
- Extract validation helpers: checkDuplicateAttributes, checkDuplicateIds, checkTagMismatches, checkCharacterReferences, checkEntityReferences, checkNestedMxCells
- Simplify validateMxCellStructure from ~200 lines to ~55 lines
- Add logging to empty catch block in DOMParser section
- Add size warning for large XML documents
- Remove unused variables (isSelfClose, duplicate idPattern)
* fix: improve XML auto-fix with malformed quote pattern
- Fix =&quot;...&quot; pattern where &quot; was used as delimiter instead of actual quotes
- Common in dashPattern attributes like dashPattern=&quot;1 1;&quot;
2025-12-13 23:31:01 +09:00
/**
 * Check that no `id` attribute value is used more than once in the document.
 *
 * Every `id="..."` / `id='...'` occurrence is collected (the match is
 * case-insensitive on the attribute name) and counted by value.
 *
 * @param xml - Raw XML text to inspect.
 * @returns An error message listing at most the first three duplicated ids
 *   with their occurrence counts, or null when every id is unique.
 */
function checkDuplicateIds(xml: string): string | null {
    const idPattern = /\bid\s*=\s*["']([^"']+)["']/gi
    const occurrences = new Map<string, number>()
    for (const idMatch of xml.matchAll(idPattern)) {
        const id = idMatch[1]
        occurrences.set(id, (occurrences.get(id) ?? 0) + 1)
    }

    // Map iteration follows insertion order, so duplicates are reported in
    // the order their first occurrence appears in the document.
    const duplicateIds = Array.from(occurrences.entries())
        .filter(([, count]) => count > 1)
        .map(([id, count]) => `'${id}' (${count}x)`)
    if (duplicateIds.length > 0) {
        return `Invalid XML: Found duplicate ID(s): ${duplicateIds.slice(0, 3).join(", ")}. All id attributes must be unique.`
    }

    return null
}
2025-12-13 15:00:28 +09:00
feat: add XML auto-fix with refined validation logic (#247)
* feat: add XML auto-fix and improve validator accuracy
- Add autoFixXml() to automatically repair common XML issues:
- CDATA wrapper removal
- Duplicate attribute removal
- Unescaped & and < character escaping
- Invalid entity reference fixing
- Unclosed tag completion
- Nested mxCell flattening
- Duplicate ID renaming
- Improve validateMxCellStructure() with DOM + regex approach:
- Use DOMParser for syntax error detection (94% recall)
- Add regex checks for edge cases
- Stateful parser for handling > in attribute values
- Integrate validateAndFixXml() in chat-message-display and diagram-context
- Auto-repair invalid XML before loading
- Log fixes applied for debugging
Metrics: 99.77% accuracy, 94.06% recall, 94.4% auto-fix success rate
* fix: improve XML auto-fix from 58.7% to 99% fix rate
Key improvements:
- Reorder CDATA removal to run before text-before-root check (+35 cases)
- Implement Gemini's backslash-quote fix with regex backreference
Handles attr="value", value="text\"inner\"more", and mixed patterns
- Add aggressive drop-broken-cells fix for unfixable mxCell elements
Iteratively removes cells causing DOM parse errors (up to 50)
Results on 9,411 XML dataset:
- 206 invalid XMLs detected
- 204 successfully fixed (99.0% fix rate)
- 2 unfixable (completely broken, need regeneration)
* refactor: extract XML validation/fix helpers and add constants
- Add constants: MAX_XML_SIZE (1MB), MAX_DROP_ITERATIONS (10), STRUCTURAL_ATTRS, VALID_ENTITIES
- Extract parseXmlTags helper for shared tag parsing logic
- Extract validation helpers: checkDuplicateAttributes, checkDuplicateIds, checkTagMismatches, checkCharacterReferences, checkEntityReferences, checkNestedMxCells
- Simplify validateMxCellStructure from ~200 lines to ~55 lines
- Add logging to empty catch block in DOMParser section
- Add size warning for large XML documents
- Remove unused variables (isSelfClose, duplicate idPattern)
* fix: improve XML auto-fix with malformed quote pattern
- Fix =&quot;...&quot; pattern where &quot; was used as delimiter instead of actual quotes
- Common in dashPattern attributes like dashPattern=&quot;1 1;&quot;
2025-12-13 23:31:01 +09:00
/**
 * Check that opening and closing tags are balanced and properly nested.
 *
 * XML comments are stripped first, then the tag list produced by
 * parseXmlTags is replayed against a stack: opening tags are pushed,
 * self-closing tags are skipped, and each closing tag must match the most
 * recently opened tag (names are compared case-insensitively).
 *
 * @param xml - Raw XML text to inspect.
 * @returns An error message for the first stray or mismatched closing tag,
 *   or for tags still open at the end of the document; null when balanced.
 */
function checkTagMismatches(xml: string): string | null {
    const xmlWithoutComments = xml.replace(/<!--[\s\S]*?-->/g, "")
    const tags = parseXmlTags(xmlWithoutComments)
    const openTags: string[] = []

    for (const { tagName, isClosing, isSelfClosing } of tags) {
        if (isClosing) {
            if (openTags.length === 0) {
                return `Invalid XML: Closing tag </${tagName}> without matching opening tag`
            }
            const expected = openTags.pop()
            if (expected?.toLowerCase() !== tagName.toLowerCase()) {
                return `Invalid XML: Expected closing tag </${expected}> but found </${tagName}>`
            }
        } else if (!isSelfClosing) {
            openTags.push(tagName)
        }
    }

    if (openTags.length > 0) {
        return `Invalid XML: Document has ${openTags.length} unclosed tag(s): ${openTags.join(", ")}`
    }

    return null
}
2025-12-06 12:46:40 +09:00
feat: add XML auto-fix with refined validation logic (#247)
* feat: add XML auto-fix and improve validator accuracy
- Add autoFixXml() to automatically repair common XML issues:
- CDATA wrapper removal
- Duplicate attribute removal
- Unescaped & and < character escaping
- Invalid entity reference fixing
- Unclosed tag completion
- Nested mxCell flattening
- Duplicate ID renaming
- Improve validateMxCellStructure() with DOM + regex approach:
- Use DOMParser for syntax error detection (94% recall)
- Add regex checks for edge cases
- Stateful parser for handling > in attribute values
- Integrate validateAndFixXml() in chat-message-display and diagram-context
- Auto-repair invalid XML before loading
- Log fixes applied for debugging
Metrics: 99.77% accuracy, 94.06% recall, 94.4% auto-fix success rate
* fix: improve XML auto-fix from 58.7% to 99% fix rate
Key improvements:
- Reorder CDATA removal to run before text-before-root check (+35 cases)
- Implement Gemini's backslash-quote fix with regex backreference
Handles attr="value", value="text\"inner\"more", and mixed patterns
- Add aggressive drop-broken-cells fix for unfixable mxCell elements
Iteratively removes cells causing DOM parse errors (up to 50)
Results on 9,411 XML dataset:
- 206 invalid XMLs detected
- 204 successfully fixed (99.0% fix rate)
- 2 unfixable (completely broken, need regeneration)
* refactor: extract XML validation/fix helpers and add constants
- Add constants: MAX_XML_SIZE (1MB), MAX_DROP_ITERATIONS (10), STRUCTURAL_ATTRS, VALID_ENTITIES
- Extract parseXmlTags helper for shared tag parsing logic
- Extract validation helpers: checkDuplicateAttributes, checkDuplicateIds, checkTagMismatches, checkCharacterReferences, checkEntityReferences, checkNestedMxCells
- Simplify validateMxCellStructure from ~200 lines to ~55 lines
- Add logging to empty catch block in DOMParser section
- Add size warning for large XML documents
- Remove unused variables (isSelfClose, duplicate idPattern)
* fix: improve XML auto-fix with malformed quote pattern
- Fix =&quot;...&quot; pattern where &quot; was used as delimiter instead of actual quotes
- Common in dashPattern attributes like dashPattern=&quot;1 1;&quot;
2025-12-13 23:31:01 +09:00
/**
 * Check that every numeric character reference (`&#...;` / `&#x...;`) is
 * well formed.
 *
 * A reference is taken to be `&#`, an optional lowercase `x`, and then every
 * following character up to the first `;`, whitespace, `<`, `>`, `&` or quote.
 * Bounding the body this way keeps the excerpt in the error message short and
 * lets a missing semicolon be reported as such, instead of swallowing text up
 * to some unrelated `;` later in the document. An empty body (`&#;`, `&#x;`)
 * is reported as an invalid reference.
 *
 * @param xml - Raw XML text to inspect.
 * @returns An error message for the first malformed reference, or null when
 *   all numeric references are well formed.
 */
function checkCharacterReferences(xml: string): string | null {
    // Groups: 1 = hex marker ("x" or ""), 2 = digits candidate, 3 = ";" or "".
    const charRefPattern = /&#(x?)([^;\s<>&"']*)(;?)/g

    for (const refMatch of xml.matchAll(charRefPattern)) {
        const ref = refMatch[0]
        const isHex = refMatch[1] === "x"
        const digits = refMatch[2] ?? ""
        const kind = isHex ? "hex" : "decimal"

        if (refMatch[3] !== ";") {
            return `Invalid XML: Missing semicolon after ${kind} reference: ${ref}`
        }
        // `+` makes an empty digit string fail the test as well.
        const digitPattern = isHex ? /^[0-9a-fA-F]+$/ : /^[0-9]+$/
        if (!digitPattern.test(digits)) {
            return `Invalid XML: Invalid ${kind} character reference: ${ref}`
        }
    }

    return null
}
2025-12-06 12:46:40 +09:00
feat: add XML auto-fix with refined validation logic (#247)
* feat: add XML auto-fix and improve validator accuracy
- Add autoFixXml() to automatically repair common XML issues:
- CDATA wrapper removal
- Duplicate attribute removal
- Unescaped & and < character escaping
- Invalid entity reference fixing
- Unclosed tag completion
- Nested mxCell flattening
- Duplicate ID renaming
- Improve validateMxCellStructure() with DOM + regex approach:
- Use DOMParser for syntax error detection (94% recall)
- Add regex checks for edge cases
- Stateful parser for handling > in attribute values
- Integrate validateAndFixXml() in chat-message-display and diagram-context
- Auto-repair invalid XML before loading
- Log fixes applied for debugging
Metrics: 99.77% accuracy, 94.06% recall, 94.4% auto-fix success rate
* fix: improve XML auto-fix from 58.7% to 99% fix rate
Key improvements:
- Reorder CDATA removal to run before text-before-root check (+35 cases)
- Implement Gemini's backslash-quote fix with regex backreference
Handles attr="value", value="text\"inner\"more", and mixed patterns
- Add aggressive drop-broken-cells fix for unfixable mxCell elements
Iteratively removes cells causing DOM parse errors (up to 50)
Results on 9,411 XML dataset:
- 206 invalid XMLs detected
- 204 successfully fixed (99.0% fix rate)
- 2 unfixable (completely broken, need regeneration)
* refactor: extract XML validation/fix helpers and add constants
- Add constants: MAX_XML_SIZE (1MB), MAX_DROP_ITERATIONS (10), STRUCTURAL_ATTRS, VALID_ENTITIES
- Extract parseXmlTags helper for shared tag parsing logic
- Extract validation helpers: checkDuplicateAttributes, checkDuplicateIds, checkTagMismatches, checkCharacterReferences, checkEntityReferences, checkNestedMxCells
- Simplify validateMxCellStructure from ~200 lines to ~55 lines
- Add logging to empty catch block in DOMParser section
- Add size warning for large XML documents
- Remove unused variables (isSelfClose, duplicate idPattern)
* fix: improve XML auto-fix with malformed quote pattern
- Fix =&quot;...&quot; pattern where &quot; was used as delimiter instead of actual quotes
- Common in dashPattern attributes like dashPattern=&quot;1 1;&quot;
2025-12-13 23:31:01 +09:00
/**
 * Check for unescaped ampersands and unknown named entity references.
 *
 * XML comments are stripped first. An `&` counts as unescaped when it is not
 * followed by `lt`, `gt`, `amp`, `quot`, `apos` or `#`. After that, every
 * `&name;` reference is looked up in VALID_ENTITIES.
 *
 * @param xml - Raw XML text to inspect.
 * @returns An error message for the first problem found, or null when all
 *   ampersands and named entities pass both checks.
 */
function checkEntityReferences(xml: string): string | null {
    const xmlWithoutComments = xml.replace(/<!--[\s\S]*?-->/g, "")

    const bareAmpPattern = /&(?!(?:lt|gt|amp|quot|apos|#))/
    if (bareAmpPattern.test(xmlWithoutComments)) {
        // The advice must spell out the escaped form, otherwise it reads
        // "Replace & with &" and tells the user nothing.
        return "Invalid XML: Found unescaped & character(s). Replace & with &amp;"
    }

    const namedEntityPattern = /&([a-zA-Z][a-zA-Z0-9]*);/g
    for (const entityMatch of xmlWithoutComments.matchAll(namedEntityPattern)) {
        if (!VALID_ENTITIES.has(entityMatch[1])) {
            return `Invalid XML: Invalid entity reference: &${entityMatch[1]}; - use only valid XML entities (lt, gt, amp, quot, apos)`
        }
    }

    return null
}
2025-12-13 15:00:28 +09:00
feat: add XML auto-fix with refined validation logic (#247)
* feat: add XML auto-fix and improve validator accuracy
- Add autoFixXml() to automatically repair common XML issues:
- CDATA wrapper removal
- Duplicate attribute removal
- Unescaped & and < character escaping
- Invalid entity reference fixing
- Unclosed tag completion
- Nested mxCell flattening
- Duplicate ID renaming
- Improve validateMxCellStructure() with DOM + regex approach:
- Use DOMParser for syntax error detection (94% recall)
- Add regex checks for edge cases
- Stateful parser for handling > in attribute values
- Integrate validateAndFixXml() in chat-message-display and diagram-context
- Auto-repair invalid XML before loading
- Log fixes applied for debugging
Metrics: 99.77% accuracy, 94.06% recall, 94.4% auto-fix success rate
* fix: improve XML auto-fix from 58.7% to 99% fix rate
Key improvements:
- Reorder CDATA removal to run before text-before-root check (+35 cases)
- Implement Gemini's backslash-quote fix with regex backreference
Handles attr="value", value="text\"inner\"more", and mixed patterns
- Add aggressive drop-broken-cells fix for unfixable mxCell elements
Iteratively removes cells causing DOM parse errors (up to 50)
Results on 9,411 XML dataset:
- 206 invalid XMLs detected
- 204 successfully fixed (99.0% fix rate)
- 2 unfixable (completely broken, need regeneration)
* refactor: extract XML validation/fix helpers and add constants
- Add constants: MAX_XML_SIZE (1MB), MAX_DROP_ITERATIONS (10), STRUCTURAL_ATTRS, VALID_ENTITIES
- Extract parseXmlTags helper for shared tag parsing logic
- Extract validation helpers: checkDuplicateAttributes, checkDuplicateIds, checkTagMismatches, checkCharacterReferences, checkEntityReferences, checkNestedMxCells
- Simplify validateMxCellStructure from ~200 lines to ~55 lines
- Add logging to empty catch block in DOMParser section
- Add size warning for large XML documents
- Remove unused variables (isSelfClose, duplicate idPattern)
* fix: improve XML auto-fix with malformed quote pattern
- Fix =&quot;...&quot; pattern where &quot; was used as delimiter instead of actual quotes
- Common in dashPattern attributes like dashPattern=&quot;1 1;&quot;
2025-12-13 23:31:01 +09:00
/**
 * Check for nested mxCell tags using regex.
 *
 * Scans every `<mxCell ...>` / `</mxCell>` tag in document order and keeps a
 * stack of the cells that are currently open. Self-closing tags never open a
 * scope. Cells whose opening tag carries `as="valueLabel"` or `as="geometry"`
 * are exempt: they are tracked so that their own closing tag is matched
 * correctly, but they do not count toward the nesting depth.
 *
 * @param xml - The XML string to scan
 * @returns null when no counted mxCell is opened inside another counted
 *          mxCell, otherwise an error message describing the problem
 */
function checkNestedMxCells(xml: string): string | null {
    const cellTagPattern = /<\/?mxCell[^>]*>/g
    // Hoisted out of the loop; no `g` flag, so `.test()` carries no state.
    const exemptCellPattern = /\sas\s*=\s*["'](valueLabel|geometry)["']/

    // One entry per open (non-self-closing) mxCell, innermost last:
    // true  -> the cell counts toward nesting depth
    // false -> the cell is exempt (valueLabel / geometry)
    // Exempt cells must still be recorded, otherwise their closing tag would
    // pop the enclosing cell and hide a later, genuinely nested sibling.
    const openCells: boolean[] = []
    let countedDepth = 0

    let cellMatch: RegExpExecArray | null
    while ((cellMatch = cellTagPattern.exec(xml)) !== null) {
        const tag = cellMatch[0]

        // Any match starting with "</" is a closing tag; this also accepts
        // the legal form with whitespace before ">" (e.g. "</mxCell >").
        if (tag.startsWith("</")) {
            // Stray closing tags on an empty stack are ignored (pop() yields
            // undefined); tag balance is validated elsewhere.
            if (openCells.pop() === true) countedDepth--
            continue
        }

        // Self-closing cells cannot contain children.
        if (tag.endsWith("/>")) continue

        const countsTowardNesting = !exemptCellPattern.test(tag)
        openCells.push(countsTowardNesting)
        if (countsTowardNesting) {
            countedDepth++
            if (countedDepth > 1) {
                return "Invalid XML: Found nested mxCell tags. Cells should be siblings, not nested inside other mxCell elements."
            }
        }
    }

    return null
}
/ * *
* Validates draw . io XML structure for common issues
* Uses DOM parsing + additional regex checks for high accuracy
* @param xml - The XML string to validate
* @returns null if valid , error message string if invalid
* /
export function validateMxCellStructure ( xml : string ) : string | null {
2025-12-14 21:23:14 +09:00
console . time ( "perf:validateMxCellStructure" )
console . log ( ` perf:validateMxCellStructure XML size: ${ xml . length } bytes ` )
feat: add XML auto-fix with refined validation logic (#247)
* feat: add XML auto-fix and improve validator accuracy
- Add autoFixXml() to automatically repair common XML issues:
- CDATA wrapper removal
- Duplicate attribute removal
- Unescaped & and < character escaping
- Invalid entity reference fixing
- Unclosed tag completion
- Nested mxCell flattening
- Duplicate ID renaming
- Improve validateMxCellStructure() with DOM + regex approach:
- Use DOMParser for syntax error detection (94% recall)
- Add regex checks for edge cases
- Stateful parser for handling > in attribute values
- Integrate validateAndFixXml() in chat-message-display and diagram-context
- Auto-repair invalid XML before loading
- Log fixes applied for debugging
Metrics: 99.77% accuracy, 94.06% recall, 94.4% auto-fix success rate
* fix: improve XML auto-fix from 58.7% to 99% fix rate
Key improvements:
- Reorder CDATA removal to run before text-before-root check (+35 cases)
- Implement Gemini's backslash-quote fix with regex backreference
Handles attr="value", value="text\"inner\"more", and mixed patterns
- Add aggressive drop-broken-cells fix for unfixable mxCell elements
Iteratively removes cells causing DOM parse errors (up to 50)
Results on 9,411 XML dataset:
- 206 invalid XMLs detected
- 204 successfully fixed (99.0% fix rate)
- 2 unfixable (completely broken, need regeneration)
* refactor: extract XML validation/fix helpers and add constants
- Add constants: MAX_XML_SIZE (1MB), MAX_DROP_ITERATIONS (10), STRUCTURAL_ATTRS, VALID_ENTITIES
- Extract parseXmlTags helper for shared tag parsing logic
- Extract validation helpers: checkDuplicateAttributes, checkDuplicateIds, checkTagMismatches, checkCharacterReferences, checkEntityReferences, checkNestedMxCells
- Simplify validateMxCellStructure from ~200 lines to ~55 lines
- Add logging to empty catch block in DOMParser section
- Add size warning for large XML documents
- Remove unused variables (isSelfClose, duplicate idPattern)
* fix: improve XML auto-fix with malformed quote pattern
- Fix ="..." pattern where " was used as delimiter instead of actual quotes
- Common in dashPattern attributes like dashPattern="1 1;"
2025-12-13 23:31:01 +09:00
// Size check for performance
if ( xml . length > MAX_XML_SIZE ) {
console . warn (
` [validateMxCellStructure] XML size ( ${ xml . length } ) exceeds ${ MAX_XML_SIZE } bytes, may cause performance issues ` ,
)
}
// 0. First use DOM parser to catch syntax errors (most accurate)
try {
2025-12-14 21:23:14 +09:00
console . time ( "perf:validate-DOMParser" )
feat: add XML auto-fix with refined validation logic (#247)
* feat: add XML auto-fix and improve validator accuracy
- Add autoFixXml() to automatically repair common XML issues:
- CDATA wrapper removal
- Duplicate attribute removal
- Unescaped & and < character escaping
- Invalid entity reference fixing
- Unclosed tag completion
- Nested mxCell flattening
- Duplicate ID renaming
- Improve validateMxCellStructure() with DOM + regex approach:
- Use DOMParser for syntax error detection (94% recall)
- Add regex checks for edge cases
- Stateful parser for handling > in attribute values
- Integrate validateAndFixXml() in chat-message-display and diagram-context
- Auto-repair invalid XML before loading
- Log fixes applied for debugging
Metrics: 99.77% accuracy, 94.06% recall, 94.4% auto-fix success rate
* fix: improve XML auto-fix from 58.7% to 99% fix rate
Key improvements:
- Reorder CDATA removal to run before text-before-root check (+35 cases)
- Implement Gemini's backslash-quote fix with regex backreference
Handles attr="value", value="text\"inner\"more", and mixed patterns
- Add aggressive drop-broken-cells fix for unfixable mxCell elements
Iteratively removes cells causing DOM parse errors (up to 50)
Results on 9,411 XML dataset:
- 206 invalid XMLs detected
- 204 successfully fixed (99.0% fix rate)
- 2 unfixable (completely broken, need regeneration)
* refactor: extract XML validation/fix helpers and add constants
- Add constants: MAX_XML_SIZE (1MB), MAX_DROP_ITERATIONS (10), STRUCTURAL_ATTRS, VALID_ENTITIES
- Extract parseXmlTags helper for shared tag parsing logic
- Extract validation helpers: checkDuplicateAttributes, checkDuplicateIds, checkTagMismatches, checkCharacterReferences, checkEntityReferences, checkNestedMxCells
- Simplify validateMxCellStructure from ~200 lines to ~55 lines
- Add logging to empty catch block in DOMParser section
- Add size warning for large XML documents
- Remove unused variables (isSelfClose, duplicate idPattern)
* fix: improve XML auto-fix with malformed quote pattern
- Fix ="..." pattern where " was used as delimiter instead of actual quotes
- Common in dashPattern attributes like dashPattern="1 1;"
2025-12-13 23:31:01 +09:00
const parser = new DOMParser ( )
const doc = parser . parseFromString ( xml , "text/xml" )
2025-12-14 21:23:14 +09:00
console . timeEnd ( "perf:validate-DOMParser" )
feat: add XML auto-fix with refined validation logic (#247)
* feat: add XML auto-fix and improve validator accuracy
- Add autoFixXml() to automatically repair common XML issues:
- CDATA wrapper removal
- Duplicate attribute removal
- Unescaped & and < character escaping
- Invalid entity reference fixing
- Unclosed tag completion
- Nested mxCell flattening
- Duplicate ID renaming
- Improve validateMxCellStructure() with DOM + regex approach:
- Use DOMParser for syntax error detection (94% recall)
- Add regex checks for edge cases
- Stateful parser for handling > in attribute values
- Integrate validateAndFixXml() in chat-message-display and diagram-context
- Auto-repair invalid XML before loading
- Log fixes applied for debugging
Metrics: 99.77% accuracy, 94.06% recall, 94.4% auto-fix success rate
* fix: improve XML auto-fix from 58.7% to 99% fix rate
Key improvements:
- Reorder CDATA removal to run before text-before-root check (+35 cases)
- Implement Gemini's backslash-quote fix with regex backreference
Handles attr="value", value="text\"inner\"more", and mixed patterns
- Add aggressive drop-broken-cells fix for unfixable mxCell elements
Iteratively removes cells causing DOM parse errors (up to 50)
Results on 9,411 XML dataset:
- 206 invalid XMLs detected
- 204 successfully fixed (99.0% fix rate)
- 2 unfixable (completely broken, need regeneration)
* refactor: extract XML validation/fix helpers and add constants
- Add constants: MAX_XML_SIZE (1MB), MAX_DROP_ITERATIONS (10), STRUCTURAL_ATTRS, VALID_ENTITIES
- Extract parseXmlTags helper for shared tag parsing logic
- Extract validation helpers: checkDuplicateAttributes, checkDuplicateIds, checkTagMismatches, checkCharacterReferences, checkEntityReferences, checkNestedMxCells
- Simplify validateMxCellStructure from ~200 lines to ~55 lines
- Add logging to empty catch block in DOMParser section
- Add size warning for large XML documents
- Remove unused variables (isSelfClose, duplicate idPattern)
* fix: improve XML auto-fix with malformed quote pattern
- Fix ="..." pattern where " was used as delimiter instead of actual quotes
- Common in dashPattern attributes like dashPattern="1 1;"
2025-12-13 23:31:01 +09:00
const parseError = doc . querySelector ( "parsererror" )
if ( parseError ) {
2025-12-14 19:38:40 +09:00
const actualError = parseError . textContent || "Unknown parse error"
console . log (
"[validateMxCellStructure] DOMParser error:" ,
actualError ,
)
2025-12-14 21:23:14 +09:00
console . timeEnd ( "perf:validateMxCellStructure" )
feat: add XML auto-fix with refined validation logic (#247)
* feat: add XML auto-fix and improve validator accuracy
- Add autoFixXml() to automatically repair common XML issues:
- CDATA wrapper removal
- Duplicate attribute removal
- Unescaped & and < character escaping
- Invalid entity reference fixing
- Unclosed tag completion
- Nested mxCell flattening
- Duplicate ID renaming
- Improve validateMxCellStructure() with DOM + regex approach:
- Use DOMParser for syntax error detection (94% recall)
- Add regex checks for edge cases
- Stateful parser for handling > in attribute values
- Integrate validateAndFixXml() in chat-message-display and diagram-context
- Auto-repair invalid XML before loading
- Log fixes applied for debugging
Metrics: 99.77% accuracy, 94.06% recall, 94.4% auto-fix success rate
* fix: improve XML auto-fix from 58.7% to 99% fix rate
Key improvements:
- Reorder CDATA removal to run before text-before-root check (+35 cases)
- Implement Gemini's backslash-quote fix with regex backreference
Handles attr="value", value="text\"inner\"more", and mixed patterns
- Add aggressive drop-broken-cells fix for unfixable mxCell elements
Iteratively removes cells causing DOM parse errors (up to 50)
Results on 9,411 XML dataset:
- 206 invalid XMLs detected
- 204 successfully fixed (99.0% fix rate)
- 2 unfixable (completely broken, need regeneration)
* refactor: extract XML validation/fix helpers and add constants
- Add constants: MAX_XML_SIZE (1MB), MAX_DROP_ITERATIONS (10), STRUCTURAL_ATTRS, VALID_ENTITIES
- Extract parseXmlTags helper for shared tag parsing logic
- Extract validation helpers: checkDuplicateAttributes, checkDuplicateIds, checkTagMismatches, checkCharacterReferences, checkEntityReferences, checkNestedMxCells
- Simplify validateMxCellStructure from ~200 lines to ~55 lines
- Add logging to empty catch block in DOMParser section
- Add size warning for large XML documents
- Remove unused variables (isSelfClose, duplicate idPattern)
* fix: improve XML auto-fix with malformed quote pattern
- Fix ="..." pattern where " was used as delimiter instead of actual quotes
- Common in dashPattern attributes like dashPattern="1 1;"
2025-12-13 23:31:01 +09:00
return ` Invalid XML: The XML contains syntax errors (likely unescaped special characters like <, >, & in attribute values). Please escape special characters: use < for <, > for >, & for &, " for ". Regenerate the diagram with properly escaped values. `
}
// DOM-based checks for nested mxCell
const allCells = doc . querySelectorAll ( "mxCell" )
for ( const cell of allCells ) {
if ( cell . parentElement ? . tagName === "mxCell" ) {
const id = cell . getAttribute ( "id" ) || "unknown"
2025-12-14 21:23:14 +09:00
console . timeEnd ( "perf:validateMxCellStructure" )
feat: add XML auto-fix with refined validation logic (#247)
* feat: add XML auto-fix and improve validator accuracy
- Add autoFixXml() to automatically repair common XML issues:
- CDATA wrapper removal
- Duplicate attribute removal
- Unescaped & and < character escaping
- Invalid entity reference fixing
- Unclosed tag completion
- Nested mxCell flattening
- Duplicate ID renaming
- Improve validateMxCellStructure() with DOM + regex approach:
- Use DOMParser for syntax error detection (94% recall)
- Add regex checks for edge cases
- Stateful parser for handling > in attribute values
- Integrate validateAndFixXml() in chat-message-display and diagram-context
- Auto-repair invalid XML before loading
- Log fixes applied for debugging
Metrics: 99.77% accuracy, 94.06% recall, 94.4% auto-fix success rate
* fix: improve XML auto-fix from 58.7% to 99% fix rate
Key improvements:
- Reorder CDATA removal to run before text-before-root check (+35 cases)
- Implement Gemini's backslash-quote fix with regex backreference
Handles attr="value", value="text\"inner\"more", and mixed patterns
- Add aggressive drop-broken-cells fix for unfixable mxCell elements
Iteratively removes cells causing DOM parse errors (up to 50)
Results on 9,411 XML dataset:
- 206 invalid XMLs detected
- 204 successfully fixed (99.0% fix rate)
- 2 unfixable (completely broken, need regeneration)
* refactor: extract XML validation/fix helpers and add constants
- Add constants: MAX_XML_SIZE (1MB), MAX_DROP_ITERATIONS (10), STRUCTURAL_ATTRS, VALID_ENTITIES
- Extract parseXmlTags helper for shared tag parsing logic
- Extract validation helpers: checkDuplicateAttributes, checkDuplicateIds, checkTagMismatches, checkCharacterReferences, checkEntityReferences, checkNestedMxCells
- Simplify validateMxCellStructure from ~200 lines to ~55 lines
- Add logging to empty catch block in DOMParser section
- Add size warning for large XML documents
- Remove unused variables (isSelfClose, duplicate idPattern)
* fix: improve XML auto-fix with malformed quote pattern
- Fix ="..." pattern where " was used as delimiter instead of actual quotes
- Common in dashPattern attributes like dashPattern="1 1;"
2025-12-13 23:31:01 +09:00
return ` Invalid XML: Found nested mxCell (id=" ${ id } "). Cells should be siblings, not nested inside other mxCell elements. `
}
}
} catch ( error ) {
// Log unexpected DOMParser errors before falling back to regex checks
console . warn (
"[validateMxCellStructure] DOMParser threw unexpected error, falling back to regex validation:" ,
error ,
)
}
// 1. Check for CDATA wrapper (invalid at document root)
if ( /^\s*<!\[CDATA\[/ . test ( xml ) ) {
2025-12-14 21:23:14 +09:00
console . timeEnd ( "perf:validateMxCellStructure" )
feat: add XML auto-fix with refined validation logic (#247)
* feat: add XML auto-fix and improve validator accuracy
- Add autoFixXml() to automatically repair common XML issues:
- CDATA wrapper removal
- Duplicate attribute removal
- Unescaped & and < character escaping
- Invalid entity reference fixing
- Unclosed tag completion
- Nested mxCell flattening
- Duplicate ID renaming
- Improve validateMxCellStructure() with DOM + regex approach:
- Use DOMParser for syntax error detection (94% recall)
- Add regex checks for edge cases
- Stateful parser for handling > in attribute values
- Integrate validateAndFixXml() in chat-message-display and diagram-context
- Auto-repair invalid XML before loading
- Log fixes applied for debugging
Metrics: 99.77% accuracy, 94.06% recall, 94.4% auto-fix success rate
* fix: improve XML auto-fix from 58.7% to 99% fix rate
Key improvements:
- Reorder CDATA removal to run before text-before-root check (+35 cases)
- Implement Gemini's backslash-quote fix with regex backreference
Handles attr="value", value="text\"inner\"more", and mixed patterns
- Add aggressive drop-broken-cells fix for unfixable mxCell elements
Iteratively removes cells causing DOM parse errors (up to 50)
Results on 9,411 XML dataset:
- 206 invalid XMLs detected
- 204 successfully fixed (99.0% fix rate)
- 2 unfixable (completely broken, need regeneration)
* refactor: extract XML validation/fix helpers and add constants
- Add constants: MAX_XML_SIZE (1MB), MAX_DROP_ITERATIONS (10), STRUCTURAL_ATTRS, VALID_ENTITIES
- Extract parseXmlTags helper for shared tag parsing logic
- Extract validation helpers: checkDuplicateAttributes, checkDuplicateIds, checkTagMismatches, checkCharacterReferences, checkEntityReferences, checkNestedMxCells
- Simplify validateMxCellStructure from ~200 lines to ~55 lines
- Add logging to empty catch block in DOMParser section
- Add size warning for large XML documents
- Remove unused variables (isSelfClose, duplicate idPattern)
* fix: improve XML auto-fix with malformed quote pattern
- Fix ="..." pattern where " was used as delimiter instead of actual quotes
- Common in dashPattern attributes like dashPattern="1 1;"
2025-12-13 23:31:01 +09:00
return "Invalid XML: XML is wrapped in CDATA section - remove <![CDATA[ from start and ]]> from end"
}
// 2. Check for duplicate structural attributes
2025-12-14 21:23:14 +09:00
console . time ( "perf:checkDuplicateAttributes" )
feat: add XML auto-fix with refined validation logic (#247)
* feat: add XML auto-fix and improve validator accuracy
- Add autoFixXml() to automatically repair common XML issues:
- CDATA wrapper removal
- Duplicate attribute removal
- Unescaped & and < character escaping
- Invalid entity reference fixing
- Unclosed tag completion
- Nested mxCell flattening
- Duplicate ID renaming
- Improve validateMxCellStructure() with DOM + regex approach:
- Use DOMParser for syntax error detection (94% recall)
- Add regex checks for edge cases
- Stateful parser for handling > in attribute values
- Integrate validateAndFixXml() in chat-message-display and diagram-context
- Auto-repair invalid XML before loading
- Log fixes applied for debugging
Metrics: 99.77% accuracy, 94.06% recall, 94.4% auto-fix success rate
* fix: improve XML auto-fix from 58.7% to 99% fix rate
Key improvements:
- Reorder CDATA removal to run before text-before-root check (+35 cases)
- Implement Gemini's backslash-quote fix with regex backreference
Handles attr="value", value="text\"inner\"more", and mixed patterns
- Add aggressive drop-broken-cells fix for unfixable mxCell elements
Iteratively removes cells causing DOM parse errors (up to 50)
Results on 9,411 XML dataset:
- 206 invalid XMLs detected
- 204 successfully fixed (99.0% fix rate)
- 2 unfixable (completely broken, need regeneration)
* refactor: extract XML validation/fix helpers and add constants
- Add constants: MAX_XML_SIZE (1MB), MAX_DROP_ITERATIONS (10), STRUCTURAL_ATTRS, VALID_ENTITIES
- Extract parseXmlTags helper for shared tag parsing logic
- Extract validation helpers: checkDuplicateAttributes, checkDuplicateIds, checkTagMismatches, checkCharacterReferences, checkEntityReferences, checkNestedMxCells
- Simplify validateMxCellStructure from ~200 lines to ~55 lines
- Add logging to empty catch block in DOMParser section
- Add size warning for large XML documents
- Remove unused variables (isSelfClose, duplicate idPattern)
* fix: improve XML auto-fix with malformed quote pattern
- Fix ="..." pattern where " was used as delimiter instead of actual quotes
- Common in dashPattern attributes like dashPattern="1 1;"
2025-12-13 23:31:01 +09:00
const dupAttrError = checkDuplicateAttributes ( xml )
2025-12-14 21:23:14 +09:00
console . timeEnd ( "perf:checkDuplicateAttributes" )
if ( dupAttrError ) {
console . timeEnd ( "perf:validateMxCellStructure" )
return dupAttrError
}
feat: add XML auto-fix with refined validation logic (#247)
* feat: add XML auto-fix and improve validator accuracy
- Add autoFixXml() to automatically repair common XML issues:
- CDATA wrapper removal
- Duplicate attribute removal
- Unescaped & and < character escaping
- Invalid entity reference fixing
- Unclosed tag completion
- Nested mxCell flattening
- Duplicate ID renaming
- Improve validateMxCellStructure() with DOM + regex approach:
- Use DOMParser for syntax error detection (94% recall)
- Add regex checks for edge cases
- Stateful parser for handling > in attribute values
- Integrate validateAndFixXml() in chat-message-display and diagram-context
- Auto-repair invalid XML before loading
- Log fixes applied for debugging
Metrics: 99.77% accuracy, 94.06% recall, 94.4% auto-fix success rate
* fix: improve XML auto-fix from 58.7% to 99% fix rate
Key improvements:
- Reorder CDATA removal to run before text-before-root check (+35 cases)
- Implement Gemini's backslash-quote fix with regex backreference
Handles attr="value", value="text\"inner\"more", and mixed patterns
- Add aggressive drop-broken-cells fix for unfixable mxCell elements
Iteratively removes cells causing DOM parse errors (up to 50)
Results on 9,411 XML dataset:
- 206 invalid XMLs detected
- 204 successfully fixed (99.0% fix rate)
- 2 unfixable (completely broken, need regeneration)
* refactor: extract XML validation/fix helpers and add constants
- Add constants: MAX_XML_SIZE (1MB), MAX_DROP_ITERATIONS (10), STRUCTURAL_ATTRS, VALID_ENTITIES
- Extract parseXmlTags helper for shared tag parsing logic
- Extract validation helpers: checkDuplicateAttributes, checkDuplicateIds, checkTagMismatches, checkCharacterReferences, checkEntityReferences, checkNestedMxCells
- Simplify validateMxCellStructure from ~200 lines to ~55 lines
- Add logging to empty catch block in DOMParser section
- Add size warning for large XML documents
- Remove unused variables (isSelfClose, duplicate idPattern)
* fix: improve XML auto-fix with malformed quote pattern
- Fix ="..." pattern where " was used as delimiter instead of actual quotes
- Common in dashPattern attributes like dashPattern="1 1;"
2025-12-13 23:31:01 +09:00
// 3. Check for unescaped < in attribute values
const attrValuePattern = /=\s*"([^"]*)"/g
let attrValMatch
while ( ( attrValMatch = attrValuePattern . exec ( xml ) ) !== null ) {
const value = attrValMatch [ 1 ]
if ( /</ . test ( value ) && ! /</ . test ( value ) ) {
2025-12-14 21:23:14 +09:00
console . timeEnd ( "perf:validateMxCellStructure" )
feat: add XML auto-fix with refined validation logic (#247)
* feat: add XML auto-fix and improve validator accuracy
- Add autoFixXml() to automatically repair common XML issues:
- CDATA wrapper removal
- Duplicate attribute removal
- Unescaped & and < character escaping
- Invalid entity reference fixing
- Unclosed tag completion
- Nested mxCell flattening
- Duplicate ID renaming
- Improve validateMxCellStructure() with DOM + regex approach:
- Use DOMParser for syntax error detection (94% recall)
- Add regex checks for edge cases
- Stateful parser for handling > in attribute values
- Integrate validateAndFixXml() in chat-message-display and diagram-context
- Auto-repair invalid XML before loading
- Log fixes applied for debugging
Metrics: 99.77% accuracy, 94.06% recall, 94.4% auto-fix success rate
* fix: improve XML auto-fix from 58.7% to 99% fix rate
Key improvements:
- Reorder CDATA removal to run before text-before-root check (+35 cases)
- Implement Gemini's backslash-quote fix with regex backreference
Handles attr="value", value="text\"inner\"more", and mixed patterns
- Add aggressive drop-broken-cells fix for unfixable mxCell elements
Iteratively removes cells causing DOM parse errors (up to 50)
Results on 9,411 XML dataset:
- 206 invalid XMLs detected
- 204 successfully fixed (99.0% fix rate)
- 2 unfixable (completely broken, need regeneration)
* refactor: extract XML validation/fix helpers and add constants
- Add constants: MAX_XML_SIZE (1MB), MAX_DROP_ITERATIONS (10), STRUCTURAL_ATTRS, VALID_ENTITIES
- Extract parseXmlTags helper for shared tag parsing logic
- Extract validation helpers: checkDuplicateAttributes, checkDuplicateIds, checkTagMismatches, checkCharacterReferences, checkEntityReferences, checkNestedMxCells
- Simplify validateMxCellStructure from ~200 lines to ~55 lines
- Add logging to empty catch block in DOMParser section
- Add size warning for large XML documents
- Remove unused variables (isSelfClose, duplicate idPattern)
* fix: improve XML auto-fix with malformed quote pattern
- Fix ="..." pattern where " was used as delimiter instead of actual quotes
- Common in dashPattern attributes like dashPattern="1 1;"
2025-12-13 23:31:01 +09:00
return "Invalid XML: Unescaped < character in attribute values. Replace < with <"
}
}
// 4. Check for duplicate IDs
2025-12-14 21:23:14 +09:00
console . time ( "perf:checkDuplicateIds" )
feat: add XML auto-fix with refined validation logic (#247)
* feat: add XML auto-fix and improve validator accuracy
- Add autoFixXml() to automatically repair common XML issues:
- CDATA wrapper removal
- Duplicate attribute removal
- Unescaped & and < character escaping
- Invalid entity reference fixing
- Unclosed tag completion
- Nested mxCell flattening
- Duplicate ID renaming
- Improve validateMxCellStructure() with DOM + regex approach:
- Use DOMParser for syntax error detection (94% recall)
- Add regex checks for edge cases
- Stateful parser for handling > in attribute values
- Integrate validateAndFixXml() in chat-message-display and diagram-context
- Auto-repair invalid XML before loading
- Log fixes applied for debugging
Metrics: 99.77% accuracy, 94.06% recall, 94.4% auto-fix success rate
* fix: improve XML auto-fix from 58.7% to 99% fix rate
Key improvements:
- Reorder CDATA removal to run before text-before-root check (+35 cases)
- Implement Gemini's backslash-quote fix with regex backreference
Handles attr="value", value="text\"inner\"more", and mixed patterns
- Add aggressive drop-broken-cells fix for unfixable mxCell elements
Iteratively removes cells causing DOM parse errors (up to 50)
Results on 9,411 XML dataset:
- 206 invalid XMLs detected
- 204 successfully fixed (99.0% fix rate)
- 2 unfixable (completely broken, need regeneration)
* refactor: extract XML validation/fix helpers and add constants
- Add constants: MAX_XML_SIZE (1MB), MAX_DROP_ITERATIONS (10), STRUCTURAL_ATTRS, VALID_ENTITIES
- Extract parseXmlTags helper for shared tag parsing logic
- Extract validation helpers: checkDuplicateAttributes, checkDuplicateIds, checkTagMismatches, checkCharacterReferences, checkEntityReferences, checkNestedMxCells
- Simplify validateMxCellStructure from ~200 lines to ~55 lines
- Add logging to empty catch block in DOMParser section
- Add size warning for large XML documents
- Remove unused variables (isSelfClose, duplicate idPattern)
* fix: improve XML auto-fix with malformed quote pattern
- Fix ="..." pattern where " was used as delimiter instead of actual quotes
- Common in dashPattern attributes like dashPattern="1 1;"
2025-12-13 23:31:01 +09:00
const dupIdError = checkDuplicateIds ( xml )
2025-12-14 21:23:14 +09:00
console . timeEnd ( "perf:checkDuplicateIds" )
if ( dupIdError ) {
console . timeEnd ( "perf:validateMxCellStructure" )
return dupIdError
}
feat: add XML auto-fix with refined validation logic (#247)
* feat: add XML auto-fix and improve validator accuracy
- Add autoFixXml() to automatically repair common XML issues:
- CDATA wrapper removal
- Duplicate attribute removal
- Unescaped & and < character escaping
- Invalid entity reference fixing
- Unclosed tag completion
- Nested mxCell flattening
- Duplicate ID renaming
- Improve validateMxCellStructure() with DOM + regex approach:
- Use DOMParser for syntax error detection (94% recall)
- Add regex checks for edge cases
- Stateful parser for handling > in attribute values
- Integrate validateAndFixXml() in chat-message-display and diagram-context
- Auto-repair invalid XML before loading
- Log fixes applied for debugging
Metrics: 99.77% accuracy, 94.06% recall, 94.4% auto-fix success rate
* fix: improve XML auto-fix from 58.7% to 99% fix rate
Key improvements:
- Reorder CDATA removal to run before text-before-root check (+35 cases)
- Implement Gemini's backslash-quote fix with regex backreference
Handles attr="value", value="text\"inner\"more", and mixed patterns
- Add aggressive drop-broken-cells fix for unfixable mxCell elements
Iteratively removes cells causing DOM parse errors (up to 50)
Results on 9,411 XML dataset:
- 206 invalid XMLs detected
- 204 successfully fixed (99.0% fix rate)
- 2 unfixable (completely broken, need regeneration)
* refactor: extract XML validation/fix helpers and add constants
- Add constants: MAX_XML_SIZE (1MB), MAX_DROP_ITERATIONS (10), STRUCTURAL_ATTRS, VALID_ENTITIES
- Extract parseXmlTags helper for shared tag parsing logic
- Extract validation helpers: checkDuplicateAttributes, checkDuplicateIds, checkTagMismatches, checkCharacterReferences, checkEntityReferences, checkNestedMxCells
- Simplify validateMxCellStructure from ~200 lines to ~55 lines
- Add logging to empty catch block in DOMParser section
- Add size warning for large XML documents
- Remove unused variables (isSelfClose, duplicate idPattern)
* fix: improve XML auto-fix with malformed quote pattern
- Fix ="..." pattern where " was used as delimiter instead of actual quotes
- Common in dashPattern attributes like dashPattern="1 1;"
2025-12-13 23:31:01 +09:00
// 5. Check for tag mismatches
2025-12-14 21:23:14 +09:00
console . time ( "perf:checkTagMismatches" )
feat: add XML auto-fix with refined validation logic (#247)
* feat: add XML auto-fix and improve validator accuracy
- Add autoFixXml() to automatically repair common XML issues:
- CDATA wrapper removal
- Duplicate attribute removal
- Unescaped & and < character escaping
- Invalid entity reference fixing
- Unclosed tag completion
- Nested mxCell flattening
- Duplicate ID renaming
- Improve validateMxCellStructure() with DOM + regex approach:
- Use DOMParser for syntax error detection (94% recall)
- Add regex checks for edge cases
- Stateful parser for handling > in attribute values
- Integrate validateAndFixXml() in chat-message-display and diagram-context
- Auto-repair invalid XML before loading
- Log fixes applied for debugging
Metrics: 99.77% accuracy, 94.06% recall, 94.4% auto-fix success rate
* fix: improve XML auto-fix from 58.7% to 99% fix rate
Key improvements:
- Reorder CDATA removal to run before text-before-root check (+35 cases)
- Implement Gemini's backslash-quote fix with regex backreference
Handles attr="value", value="text\"inner\"more", and mixed patterns
- Add aggressive drop-broken-cells fix for unfixable mxCell elements
Iteratively removes cells causing DOM parse errors (up to 50)
Results on 9,411 XML dataset:
- 206 invalid XMLs detected
- 204 successfully fixed (99.0% fix rate)
- 2 unfixable (completely broken, need regeneration)
* refactor: extract XML validation/fix helpers and add constants
- Add constants: MAX_XML_SIZE (1MB), MAX_DROP_ITERATIONS (10), STRUCTURAL_ATTRS, VALID_ENTITIES
- Extract parseXmlTags helper for shared tag parsing logic
- Extract validation helpers: checkDuplicateAttributes, checkDuplicateIds, checkTagMismatches, checkCharacterReferences, checkEntityReferences, checkNestedMxCells
- Simplify validateMxCellStructure from ~200 lines to ~55 lines
- Add logging to empty catch block in DOMParser section
- Add size warning for large XML documents
- Remove unused variables (isSelfClose, duplicate idPattern)
* fix: improve XML auto-fix with malformed quote pattern
- Fix ="..." pattern where " was used as delimiter instead of actual quotes
- Common in dashPattern attributes like dashPattern="1 1;"
2025-12-13 23:31:01 +09:00
const tagMismatchError = checkTagMismatches ( xml )
2025-12-14 21:23:14 +09:00
console . timeEnd ( "perf:checkTagMismatches" )
if ( tagMismatchError ) {
console . timeEnd ( "perf:validateMxCellStructure" )
return tagMismatchError
}
feat: add XML auto-fix with refined validation logic (#247)
* feat: add XML auto-fix and improve validator accuracy
- Add autoFixXml() to automatically repair common XML issues:
- CDATA wrapper removal
- Duplicate attribute removal
- Unescaped & and < character escaping
- Invalid entity reference fixing
- Unclosed tag completion
- Nested mxCell flattening
- Duplicate ID renaming
- Improve validateMxCellStructure() with DOM + regex approach:
- Use DOMParser for syntax error detection (94% recall)
- Add regex checks for edge cases
- Stateful parser for handling > in attribute values
- Integrate validateAndFixXml() in chat-message-display and diagram-context
- Auto-repair invalid XML before loading
- Log fixes applied for debugging
Metrics: 99.77% accuracy, 94.06% recall, 94.4% auto-fix success rate
* fix: improve XML auto-fix from 58.7% to 99% fix rate
Key improvements:
- Reorder CDATA removal to run before text-before-root check (+35 cases)
- Implement Gemini's backslash-quote fix with regex backreference
Handles attr="value", value="text\"inner\"more", and mixed patterns
- Add aggressive drop-broken-cells fix for unfixable mxCell elements
Iteratively removes cells causing DOM parse errors (up to 50)
Results on 9,411 XML dataset:
- 206 invalid XMLs detected
- 204 successfully fixed (99.0% fix rate)
- 2 unfixable (completely broken, need regeneration)
* refactor: extract XML validation/fix helpers and add constants
- Add constants: MAX_XML_SIZE (1MB), MAX_DROP_ITERATIONS (10), STRUCTURAL_ATTRS, VALID_ENTITIES
- Extract parseXmlTags helper for shared tag parsing logic
- Extract validation helpers: checkDuplicateAttributes, checkDuplicateIds, checkTagMismatches, checkCharacterReferences, checkEntityReferences, checkNestedMxCells
- Simplify validateMxCellStructure from ~200 lines to ~55 lines
- Add logging to empty catch block in DOMParser section
- Add size warning for large XML documents
- Remove unused variables (isSelfClose, duplicate idPattern)
* fix: improve XML auto-fix with malformed quote pattern
- Fix =&quot;...&quot; pattern where &quot; was used as delimiter instead of actual quotes
- Common in dashPattern attributes like dashPattern=&quot;1 1;&quot;
2025-12-13 23:31:01 +09:00
// 6. Check invalid character references
const charRefError = checkCharacterReferences ( xml )
2025-12-14 21:23:14 +09:00
if ( charRefError ) {
console . timeEnd ( "perf:validateMxCellStructure" )
return charRefError
}
feat: add XML auto-fix with refined validation logic (#247)
* feat: add XML auto-fix and improve validator accuracy
- Add autoFixXml() to automatically repair common XML issues:
- CDATA wrapper removal
- Duplicate attribute removal
- Unescaped & and < character escaping
- Invalid entity reference fixing
- Unclosed tag completion
- Nested mxCell flattening
- Duplicate ID renaming
- Improve validateMxCellStructure() with DOM + regex approach:
- Use DOMParser for syntax error detection (94% recall)
- Add regex checks for edge cases
- Stateful parser for handling > in attribute values
- Integrate validateAndFixXml() in chat-message-display and diagram-context
- Auto-repair invalid XML before loading
- Log fixes applied for debugging
Metrics: 99.77% accuracy, 94.06% recall, 94.4% auto-fix success rate
* fix: improve XML auto-fix from 58.7% to 99% fix rate
Key improvements:
- Reorder CDATA removal to run before text-before-root check (+35 cases)
- Implement Gemini's backslash-quote fix with regex backreference
Handles attr="value", value="text\"inner\"more", and mixed patterns
- Add aggressive drop-broken-cells fix for unfixable mxCell elements
Iteratively removes cells causing DOM parse errors (up to 50)
Results on 9,411 XML dataset:
- 206 invalid XMLs detected
- 204 successfully fixed (99.0% fix rate)
- 2 unfixable (completely broken, need regeneration)
* refactor: extract XML validation/fix helpers and add constants
- Add constants: MAX_XML_SIZE (1MB), MAX_DROP_ITERATIONS (10), STRUCTURAL_ATTRS, VALID_ENTITIES
- Extract parseXmlTags helper for shared tag parsing logic
- Extract validation helpers: checkDuplicateAttributes, checkDuplicateIds, checkTagMismatches, checkCharacterReferences, checkEntityReferences, checkNestedMxCells
- Simplify validateMxCellStructure from ~200 lines to ~55 lines
- Add logging to empty catch block in DOMParser section
- Add size warning for large XML documents
- Remove unused variables (isSelfClose, duplicate idPattern)
* fix: improve XML auto-fix with malformed quote pattern
- Fix =&quot;...&quot; pattern where &quot; was used as delimiter instead of actual quotes
- Common in dashPattern attributes like dashPattern=&quot;1 1;&quot;
2025-12-13 23:31:01 +09:00
// 7. Check for invalid comment syntax (-- inside comments)
const commentPattern = /<!--([\s\S]*?)-->/g
let commentMatch
while ( ( commentMatch = commentPattern . exec ( xml ) ) !== null ) {
if ( /--/ . test ( commentMatch [ 1 ] ) ) {
2025-12-14 21:23:14 +09:00
console . timeEnd ( "perf:validateMxCellStructure" )
feat: add XML auto-fix with refined validation logic (#247)
* feat: add XML auto-fix and improve validator accuracy
- Add autoFixXml() to automatically repair common XML issues:
- CDATA wrapper removal
- Duplicate attribute removal
- Unescaped & and < character escaping
- Invalid entity reference fixing
- Unclosed tag completion
- Nested mxCell flattening
- Duplicate ID renaming
- Improve validateMxCellStructure() with DOM + regex approach:
- Use DOMParser for syntax error detection (94% recall)
- Add regex checks for edge cases
- Stateful parser for handling > in attribute values
- Integrate validateAndFixXml() in chat-message-display and diagram-context
- Auto-repair invalid XML before loading
- Log fixes applied for debugging
Metrics: 99.77% accuracy, 94.06% recall, 94.4% auto-fix success rate
* fix: improve XML auto-fix from 58.7% to 99% fix rate
Key improvements:
- Reorder CDATA removal to run before text-before-root check (+35 cases)
- Implement Gemini's backslash-quote fix with regex backreference
Handles attr="value", value="text\"inner\"more", and mixed patterns
- Add aggressive drop-broken-cells fix for unfixable mxCell elements
Iteratively removes cells causing DOM parse errors (up to 50)
Results on 9,411 XML dataset:
- 206 invalid XMLs detected
- 204 successfully fixed (99.0% fix rate)
- 2 unfixable (completely broken, need regeneration)
* refactor: extract XML validation/fix helpers and add constants
- Add constants: MAX_XML_SIZE (1MB), MAX_DROP_ITERATIONS (10), STRUCTURAL_ATTRS, VALID_ENTITIES
- Extract parseXmlTags helper for shared tag parsing logic
- Extract validation helpers: checkDuplicateAttributes, checkDuplicateIds, checkTagMismatches, checkCharacterReferences, checkEntityReferences, checkNestedMxCells
- Simplify validateMxCellStructure from ~200 lines to ~55 lines
- Add logging to empty catch block in DOMParser section
- Add size warning for large XML documents
- Remove unused variables (isSelfClose, duplicate idPattern)
* fix: improve XML auto-fix with malformed quote pattern
- Fix =&quot;...&quot; pattern where &quot; was used as delimiter instead of actual quotes
- Common in dashPattern attributes like dashPattern=&quot;1 1;&quot;
2025-12-13 23:31:01 +09:00
return "Invalid XML: Comment contains -- (double hyphen) which is not allowed"
}
}
// 8. Check for unescaped entity references and invalid entity names
const entityError = checkEntityReferences ( xml )
2025-12-14 21:23:14 +09:00
if ( entityError ) {
console . timeEnd ( "perf:validateMxCellStructure" )
return entityError
}
feat: add XML auto-fix with refined validation logic (#247)
* feat: add XML auto-fix and improve validator accuracy
- Add autoFixXml() to automatically repair common XML issues:
- CDATA wrapper removal
- Duplicate attribute removal
- Unescaped & and < character escaping
- Invalid entity reference fixing
- Unclosed tag completion
- Nested mxCell flattening
- Duplicate ID renaming
- Improve validateMxCellStructure() with DOM + regex approach:
- Use DOMParser for syntax error detection (94% recall)
- Add regex checks for edge cases
- Stateful parser for handling > in attribute values
- Integrate validateAndFixXml() in chat-message-display and diagram-context
- Auto-repair invalid XML before loading
- Log fixes applied for debugging
Metrics: 99.77% accuracy, 94.06% recall, 94.4% auto-fix success rate
* fix: improve XML auto-fix from 58.7% to 99% fix rate
Key improvements:
- Reorder CDATA removal to run before text-before-root check (+35 cases)
- Implement Gemini's backslash-quote fix with regex backreference
Handles attr="value", value="text\"inner\"more", and mixed patterns
- Add aggressive drop-broken-cells fix for unfixable mxCell elements
Iteratively removes cells causing DOM parse errors (up to 50)
Results on 9,411 XML dataset:
- 206 invalid XMLs detected
- 204 successfully fixed (99.0% fix rate)
- 2 unfixable (completely broken, need regeneration)
* refactor: extract XML validation/fix helpers and add constants
- Add constants: MAX_XML_SIZE (1MB), MAX_DROP_ITERATIONS (10), STRUCTURAL_ATTRS, VALID_ENTITIES
- Extract parseXmlTags helper for shared tag parsing logic
- Extract validation helpers: checkDuplicateAttributes, checkDuplicateIds, checkTagMismatches, checkCharacterReferences, checkEntityReferences, checkNestedMxCells
- Simplify validateMxCellStructure from ~200 lines to ~55 lines
- Add logging to empty catch block in DOMParser section
- Add size warning for large XML documents
- Remove unused variables (isSelfClose, duplicate idPattern)
* fix: improve XML auto-fix with malformed quote pattern
- Fix =&quot;...&quot; pattern where &quot; was used as delimiter instead of actual quotes
- Common in dashPattern attributes like dashPattern=&quot;1 1;&quot;
2025-12-13 23:31:01 +09:00
// 9. Check for empty id attributes on mxCell
if ( /<mxCell[^>]*\sid\s*=\s*["']\s*["'][^>]*>/g . test ( xml ) ) {
2025-12-14 21:23:14 +09:00
console . timeEnd ( "perf:validateMxCellStructure" )
feat: add XML auto-fix with refined validation logic (#247)
* feat: add XML auto-fix and improve validator accuracy
- Add autoFixXml() to automatically repair common XML issues:
- CDATA wrapper removal
- Duplicate attribute removal
- Unescaped & and < character escaping
- Invalid entity reference fixing
- Unclosed tag completion
- Nested mxCell flattening
- Duplicate ID renaming
- Improve validateMxCellStructure() with DOM + regex approach:
- Use DOMParser for syntax error detection (94% recall)
- Add regex checks for edge cases
- Stateful parser for handling > in attribute values
- Integrate validateAndFixXml() in chat-message-display and diagram-context
- Auto-repair invalid XML before loading
- Log fixes applied for debugging
Metrics: 99.77% accuracy, 94.06% recall, 94.4% auto-fix success rate
* fix: improve XML auto-fix from 58.7% to 99% fix rate
Key improvements:
- Reorder CDATA removal to run before text-before-root check (+35 cases)
- Implement Gemini's backslash-quote fix with regex backreference
Handles attr="value", value="text\"inner\"more", and mixed patterns
- Add aggressive drop-broken-cells fix for unfixable mxCell elements
Iteratively removes cells causing DOM parse errors (up to 50)
Results on 9,411 XML dataset:
- 206 invalid XMLs detected
- 204 successfully fixed (99.0% fix rate)
- 2 unfixable (completely broken, need regeneration)
* refactor: extract XML validation/fix helpers and add constants
- Add constants: MAX_XML_SIZE (1MB), MAX_DROP_ITERATIONS (10), STRUCTURAL_ATTRS, VALID_ENTITIES
- Extract parseXmlTags helper for shared tag parsing logic
- Extract validation helpers: checkDuplicateAttributes, checkDuplicateIds, checkTagMismatches, checkCharacterReferences, checkEntityReferences, checkNestedMxCells
- Simplify validateMxCellStructure from ~200 lines to ~55 lines
- Add logging to empty catch block in DOMParser section
- Add size warning for large XML documents
- Remove unused variables (isSelfClose, duplicate idPattern)
* fix: improve XML auto-fix with malformed quote pattern
- Fix =&quot;...&quot; pattern where &quot; was used as delimiter instead of actual quotes
- Common in dashPattern attributes like dashPattern=&quot;1 1;&quot;
2025-12-13 23:31:01 +09:00
return "Invalid XML: Found mxCell element(s) with empty id attribute"
}
// 10. Check for nested mxCell tags
const nestedCellError = checkNestedMxCells ( xml )
2025-12-14 21:23:14 +09:00
if ( nestedCellError ) {
console . timeEnd ( "perf:validateMxCellStructure" )
return nestedCellError
}
2025-12-13 15:00:28 +09:00
2025-12-14 21:23:14 +09:00
console . timeEnd ( "perf:validateMxCellStructure" )
2025-12-13 15:00:28 +09:00
return null
}
/ * *
* Attempts to auto - fix common XML issues in draw . io diagrams
* @param xml - The XML string to fix
* @returns Object with fixed XML and list of fixes applied
* /
export function autoFixXml ( xml : string ) : { fixed : string ; fixes : string [ ] } {
let fixed = xml
const fixes : string [ ] = [ ]
feat: add XML auto-fix with refined validation logic (#247)
* feat: add XML auto-fix and improve validator accuracy
- Add autoFixXml() to automatically repair common XML issues:
- CDATA wrapper removal
- Duplicate attribute removal
- Unescaped & and < character escaping
- Invalid entity reference fixing
- Unclosed tag completion
- Nested mxCell flattening
- Duplicate ID renaming
- Improve validateMxCellStructure() with DOM + regex approach:
- Use DOMParser for syntax error detection (94% recall)
- Add regex checks for edge cases
- Stateful parser for handling > in attribute values
- Integrate validateAndFixXml() in chat-message-display and diagram-context
- Auto-repair invalid XML before loading
- Log fixes applied for debugging
Metrics: 99.77% accuracy, 94.06% recall, 94.4% auto-fix success rate
* fix: improve XML auto-fix from 58.7% to 99% fix rate
Key improvements:
- Reorder CDATA removal to run before text-before-root check (+35 cases)
- Implement Gemini's backslash-quote fix with regex backreference
Handles attr="value", value="text\"inner\"more", and mixed patterns
- Add aggressive drop-broken-cells fix for unfixable mxCell elements
Iteratively removes cells causing DOM parse errors (up to 50)
Results on 9,411 XML dataset:
- 206 invalid XMLs detected
- 204 successfully fixed (99.0% fix rate)
- 2 unfixable (completely broken, need regeneration)
* refactor: extract XML validation/fix helpers and add constants
- Add constants: MAX_XML_SIZE (1MB), MAX_DROP_ITERATIONS (10), STRUCTURAL_ATTRS, VALID_ENTITIES
- Extract parseXmlTags helper for shared tag parsing logic
- Extract validation helpers: checkDuplicateAttributes, checkDuplicateIds, checkTagMismatches, checkCharacterReferences, checkEntityReferences, checkNestedMxCells
- Simplify validateMxCellStructure from ~200 lines to ~55 lines
- Add logging to empty catch block in DOMParser section
- Add size warning for large XML documents
- Remove unused variables (isSelfClose, duplicate idPattern)
* fix: improve XML auto-fix with malformed quote pattern
- Fix =&quot;...&quot; pattern where &quot; was used as delimiter instead of actual quotes
- Common in dashPattern attributes like dashPattern=&quot;1 1;&quot;
2025-12-13 23:31:01 +09:00
// 0. Fix JSON-escaped XML (common when XML is stored in JSON without unescaping)
// Only apply when we see JSON-escaped attribute patterns like =\"value\"
// Don't apply to legitimate \n in value attributes (draw.io uses these for line breaks)
if ( /=\\"/ . test ( fixed ) ) {
// Replace literal \" with actual quotes
fixed = fixed . replace ( /\\"/g , '"' )
// Replace literal \n with actual newlines (only after confirming JSON-escaped)
fixed = fixed . replace ( /\\n/g , "\n" )
fixes . push ( "Fixed JSON-escaped XML" )
2025-12-13 16:02:56 +09:00
}
// 1. Remove CDATA wrapper (MUST be before text-before-root check)
2025-12-13 15:00:28 +09:00
if ( /^\s*<!\[CDATA\[/ . test ( fixed ) ) {
fixed = fixed . replace ( /^\s*<!\[CDATA\[/ , "" ) . replace ( /\]\]>\s*$/ , "" )
fixes . push ( "Removed CDATA wrapper" )
}
2025-12-06 12:46:40 +09:00
2025-12-13 16:02:56 +09:00
// 2. Remove text before XML declaration or root element (only if it's garbage text, not valid XML)
const xmlStart = fixed . search ( /<(\?xml|mxGraphModel|mxfile)/i )
if ( xmlStart > 0 && ! /^<[a-zA-Z]/ . test ( fixed . trim ( ) ) ) {
fixed = fixed . substring ( xmlStart )
fixes . push ( "Removed text before XML root" )
}
2025-12-13 15:00:28 +09:00
// 2. Fix duplicate attributes (keep first occurrence, remove duplicates)
let dupAttrFixed = false
fixed = fixed . replace ( /<[^>]+>/g , ( tag ) = > {
let newTag = tag
feat: add XML auto-fix with refined validation logic (#247)
* feat: add XML auto-fix and improve validator accuracy
- Add autoFixXml() to automatically repair common XML issues:
- CDATA wrapper removal
- Duplicate attribute removal
- Unescaped & and < character escaping
- Invalid entity reference fixing
- Unclosed tag completion
- Nested mxCell flattening
- Duplicate ID renaming
- Improve validateMxCellStructure() with DOM + regex approach:
- Use DOMParser for syntax error detection (94% recall)
- Add regex checks for edge cases
- Stateful parser for handling > in attribute values
- Integrate validateAndFixXml() in chat-message-display and diagram-context
- Auto-repair invalid XML before loading
- Log fixes applied for debugging
Metrics: 99.77% accuracy, 94.06% recall, 94.4% auto-fix success rate
* fix: improve XML auto-fix from 58.7% to 99% fix rate
Key improvements:
- Reorder CDATA removal to run before text-before-root check (+35 cases)
- Implement Gemini's backslash-quote fix with regex backreference
Handles attr="value", value="text\"inner\"more", and mixed patterns
- Add aggressive drop-broken-cells fix for unfixable mxCell elements
Iteratively removes cells causing DOM parse errors (up to 50)
Results on 9,411 XML dataset:
- 206 invalid XMLs detected
- 204 successfully fixed (99.0% fix rate)
- 2 unfixable (completely broken, need regeneration)
* refactor: extract XML validation/fix helpers and add constants
- Add constants: MAX_XML_SIZE (1MB), MAX_DROP_ITERATIONS (10), STRUCTURAL_ATTRS, VALID_ENTITIES
- Extract parseXmlTags helper for shared tag parsing logic
- Extract validation helpers: checkDuplicateAttributes, checkDuplicateIds, checkTagMismatches, checkCharacterReferences, checkEntityReferences, checkNestedMxCells
- Simplify validateMxCellStructure from ~200 lines to ~55 lines
- Add logging to empty catch block in DOMParser section
- Add size warning for large XML documents
- Remove unused variables (isSelfClose, duplicate idPattern)
* fix: improve XML auto-fix with malformed quote pattern
- Fix =&quot;...&quot; pattern where &quot; was used as delimiter instead of actual quotes
- Common in dashPattern attributes like dashPattern=&quot;1 1;&quot;
2025-12-13 23:31:01 +09:00
for ( const attr of STRUCTURAL_ATTRS ) {
2025-12-13 15:00:28 +09:00
// Find all occurrences of this attribute
const attrRegex = new RegExp (
` \\ s ${ attr } \\ s*= \\ s*["'][^"']*["'] ` ,
"gi" ,
)
const matches = tag . match ( attrRegex )
if ( matches && matches . length > 1 ) {
// Keep first, remove others
let firstKept = false
newTag = newTag . replace ( attrRegex , ( m ) = > {
if ( ! firstKept ) {
firstKept = true
return m
}
dupAttrFixed = true
return ""
} )
}
2025-12-06 12:46:40 +09:00
}
2025-12-13 15:00:28 +09:00
return newTag
2025-12-06 12:46:40 +09:00
} )
2025-12-13 15:00:28 +09:00
if ( dupAttrFixed ) {
fixes . push ( "Removed duplicate structural attributes" )
}
2025-12-06 12:46:40 +09:00
2025-12-13 15:00:28 +09:00
// 3. Fix unescaped & characters (but not valid entities)
// Match & not followed by valid entity pattern
const ampersandPattern =
/&(?!(?:lt|gt|amp|quot|apos|#[0-9]+|#x[0-9a-fA-F]+);)/g
if ( ampersandPattern . test ( fixed ) ) {
fixed = fixed . replace (
/&(?!(?:lt|gt|amp|quot|apos|#[0-9]+|#x[0-9a-fA-F]+);)/g ,
"&" ,
)
fixes . push ( "Escaped unescaped & characters" )
2025-12-03 16:14:53 +09:00
}
2025-12-13 15:00:28 +09:00
// 3. Fix invalid entity names like &quot; -> "
// Common mistake: double-escaping
const invalidEntities = [
{ pattern : /&quot;/g , replacement : """ , name : "&quot;" } ,
{ pattern : /&lt;/g , replacement : "<" , name : "&lt;" } ,
{ pattern : /&gt;/g , replacement : ">" , name : "&gt;" } ,
{ pattern : /&apos;/g , replacement : "'" , name : "&apos;" } ,
{ pattern : /&amp;/g , replacement : "&" , name : "&amp;" } ,
]
for ( const { pattern , replacement , name } of invalidEntities ) {
if ( pattern . test ( fixed ) ) {
fixed = fixed . replace ( pattern , replacement )
fixes . push ( ` Fixed double-escaped entity ${ name } ` )
}
2025-12-03 16:14:53 +09:00
}
2025-12-06 12:46:40 +09:00
feat: add XML auto-fix with refined validation logic (#247)
* feat: add XML auto-fix and improve validator accuracy
- Add autoFixXml() to automatically repair common XML issues:
- CDATA wrapper removal
- Duplicate attribute removal
- Unescaped & and < character escaping
- Invalid entity reference fixing
- Unclosed tag completion
- Nested mxCell flattening
- Duplicate ID renaming
- Improve validateMxCellStructure() with DOM + regex approach:
- Use DOMParser for syntax error detection (94% recall)
- Add regex checks for edge cases
- Stateful parser for handling > in attribute values
- Integrate validateAndFixXml() in chat-message-display and diagram-context
- Auto-repair invalid XML before loading
- Log fixes applied for debugging
Metrics: 99.77% accuracy, 94.06% recall, 94.4% auto-fix success rate
* fix: improve XML auto-fix from 58.7% to 99% fix rate
Key improvements:
- Reorder CDATA removal to run before text-before-root check (+35 cases)
- Implement Gemini's backslash-quote fix with regex backreference
Handles attr="value", value="text\"inner\"more", and mixed patterns
- Add aggressive drop-broken-cells fix for unfixable mxCell elements
Iteratively removes cells causing DOM parse errors (up to 50)
Results on 9,411 XML dataset:
- 206 invalid XMLs detected
- 204 successfully fixed (99.0% fix rate)
- 2 unfixable (completely broken, need regeneration)
* refactor: extract XML validation/fix helpers and add constants
- Add constants: MAX_XML_SIZE (1MB), MAX_DROP_ITERATIONS (10), STRUCTURAL_ATTRS, VALID_ENTITIES
- Extract parseXmlTags helper for shared tag parsing logic
- Extract validation helpers: checkDuplicateAttributes, checkDuplicateIds, checkTagMismatches, checkCharacterReferences, checkEntityReferences, checkNestedMxCells
- Simplify validateMxCellStructure from ~200 lines to ~55 lines
- Add logging to empty catch block in DOMParser section
- Add size warning for large XML documents
- Remove unused variables (isSelfClose, duplicate idPattern)
* fix: improve XML auto-fix with malformed quote pattern
- Fix =&quot;...&quot; pattern where &quot; was used as delimiter instead of actual quotes
- Common in dashPattern attributes like dashPattern=&quot;1 1;&quot;
2025-12-13 23:31:01 +09:00
// 3b. Fix malformed attribute values where " is used as delimiter instead of actual quotes
// Pattern: attr="value" should become attr="value" (the " was meant to be the quote delimiter)
// This commonly happens with dashPattern="1 1;"
const malformedQuotePattern = /(\s[a-zA-Z][a-zA-Z0-9_:-]*)="/
2025-12-13 15:00:28 +09:00
if ( malformedQuotePattern . test ( fixed ) ) {
feat: add XML auto-fix with refined validation logic (#247)
* feat: add XML auto-fix and improve validator accuracy
- Add autoFixXml() to automatically repair common XML issues:
- CDATA wrapper removal
- Duplicate attribute removal
- Unescaped & and < character escaping
- Invalid entity reference fixing
- Unclosed tag completion
- Nested mxCell flattening
- Duplicate ID renaming
- Improve validateMxCellStructure() with DOM + regex approach:
- Use DOMParser for syntax error detection (94% recall)
- Add regex checks for edge cases
- Stateful parser for handling > in attribute values
- Integrate validateAndFixXml() in chat-message-display and diagram-context
- Auto-repair invalid XML before loading
- Log fixes applied for debugging
Metrics: 99.77% accuracy, 94.06% recall, 94.4% auto-fix success rate
* fix: improve XML auto-fix from 58.7% to 99% fix rate
Key improvements:
- Reorder CDATA removal to run before text-before-root check (+35 cases)
- Implement Gemini's backslash-quote fix with regex backreference
Handles attr="value", value="text\"inner\"more", and mixed patterns
- Add aggressive drop-broken-cells fix for unfixable mxCell elements
Iteratively removes cells causing DOM parse errors (up to 50)
Results on 9,411 XML dataset:
- 206 invalid XMLs detected
- 204 successfully fixed (99.0% fix rate)
- 2 unfixable (completely broken, need regeneration)
* refactor: extract XML validation/fix helpers and add constants
- Add constants: MAX_XML_SIZE (1MB), MAX_DROP_ITERATIONS (10), STRUCTURAL_ATTRS, VALID_ENTITIES
- Extract parseXmlTags helper for shared tag parsing logic
- Extract validation helpers: checkDuplicateAttributes, checkDuplicateIds, checkTagMismatches, checkCharacterReferences, checkEntityReferences, checkNestedMxCells
- Simplify validateMxCellStructure from ~200 lines to ~55 lines
- Add logging to empty catch block in DOMParser section
- Add size warning for large XML documents
- Remove unused variables (isSelfClose, duplicate idPattern)
* fix: improve XML auto-fix with malformed quote pattern
- Fix =&quot;...&quot; pattern where &quot; was used as delimiter instead of actual quotes
- Common in dashPattern attributes like dashPattern=&quot;1 1;&quot;
2025-12-13 23:31:01 +09:00
// Replace =" with =" and trailing " before next attribute or tag end with "
2025-12-13 15:00:28 +09:00
fixed = fixed . replace (
feat: add XML auto-fix with refined validation logic (#247)
* feat: add XML auto-fix and improve validator accuracy
- Add autoFixXml() to automatically repair common XML issues:
- CDATA wrapper removal
- Duplicate attribute removal
- Unescaped & and < character escaping
- Invalid entity reference fixing
- Unclosed tag completion
- Nested mxCell flattening
- Duplicate ID renaming
- Improve validateMxCellStructure() with DOM + regex approach:
- Use DOMParser for syntax error detection (94% recall)
- Add regex checks for edge cases
- Stateful parser for handling > in attribute values
- Integrate validateAndFixXml() in chat-message-display and diagram-context
- Auto-repair invalid XML before loading
- Log fixes applied for debugging
Metrics: 99.77% accuracy, 94.06% recall, 94.4% auto-fix success rate
* fix: improve XML auto-fix from 58.7% to 99% fix rate
Key improvements:
- Reorder CDATA removal to run before text-before-root check (+35 cases)
- Implement Gemini's backslash-quote fix with regex backreference
Handles attr="value", value="text\"inner\"more", and mixed patterns
- Add aggressive drop-broken-cells fix for unfixable mxCell elements
Iteratively removes cells causing DOM parse errors (up to 50)
Results on 9,411 XML dataset:
- 206 invalid XMLs detected
- 204 successfully fixed (99.0% fix rate)
- 2 unfixable (completely broken, need regeneration)
* refactor: extract XML validation/fix helpers and add constants
- Add constants: MAX_XML_SIZE (1MB), MAX_DROP_ITERATIONS (10), STRUCTURAL_ATTRS, VALID_ENTITIES
- Extract parseXmlTags helper for shared tag parsing logic
- Extract validation helpers: checkDuplicateAttributes, checkDuplicateIds, checkTagMismatches, checkCharacterReferences, checkEntityReferences, checkNestedMxCells
- Simplify validateMxCellStructure from ~200 lines to ~55 lines
- Add logging to empty catch block in DOMParser section
- Add size warning for large XML documents
- Remove unused variables (isSelfClose, duplicate idPattern)
* fix: improve XML auto-fix with malformed quote pattern
- Fix =&quot;...&quot; pattern where &quot; was used as delimiter instead of actual quotes
- Common in dashPattern attributes like dashPattern=&quot;1 1;&quot;
2025-12-13 23:31:01 +09:00
/(\s[a-zA-Z][a-zA-Z0-9_:-]*)="([^&]*?)"/g ,
'$1="$2"' ,
2025-12-13 15:00:28 +09:00
)
fixes . push (
feat: add XML auto-fix with refined validation logic (#247)
* feat: add XML auto-fix and improve validator accuracy
- Add autoFixXml() to automatically repair common XML issues:
- CDATA wrapper removal
- Duplicate attribute removal
- Unescaped & and < character escaping
- Invalid entity reference fixing
- Unclosed tag completion
- Nested mxCell flattening
- Duplicate ID renaming
- Improve validateMxCellStructure() with DOM + regex approach:
- Use DOMParser for syntax error detection (94% recall)
- Add regex checks for edge cases
- Stateful parser for handling > in attribute values
- Integrate validateAndFixXml() in chat-message-display and diagram-context
- Auto-repair invalid XML before loading
- Log fixes applied for debugging
Metrics: 99.77% accuracy, 94.06% recall, 94.4% auto-fix success rate
* fix: improve XML auto-fix from 58.7% to 99% fix rate
Key improvements:
- Reorder CDATA removal to run before text-before-root check (+35 cases)
- Implement Gemini's backslash-quote fix with regex backreference
Handles attr="value", value="text\"inner\"more", and mixed patterns
- Add aggressive drop-broken-cells fix for unfixable mxCell elements
Iteratively removes cells causing DOM parse errors (up to 50)
Results on 9,411 XML dataset:
- 206 invalid XMLs detected
- 204 successfully fixed (99.0% fix rate)
- 2 unfixable (completely broken, need regeneration)
* refactor: extract XML validation/fix helpers and add constants
- Add constants: MAX_XML_SIZE (1MB), MAX_DROP_ITERATIONS (10), STRUCTURAL_ATTRS, VALID_ENTITIES
- Extract parseXmlTags helper for shared tag parsing logic
- Extract validation helpers: checkDuplicateAttributes, checkDuplicateIds, checkTagMismatches, checkCharacterReferences, checkEntityReferences, checkNestedMxCells
- Simplify validateMxCellStructure from ~200 lines to ~55 lines
- Add logging to empty catch block in DOMParser section
- Add size warning for large XML documents
- Remove unused variables (isSelfClose, duplicate idPattern)
* fix: improve XML auto-fix with malformed quote pattern
- Fix =&quot;...&quot; pattern where &quot; was used as delimiter instead of actual quotes
- Common in dashPattern attributes like dashPattern=&quot;1 1;&quot;
2025-12-13 23:31:01 +09:00
'Fixed malformed attribute quotes (="..." to ="...")' ,
2025-12-13 15:00:28 +09:00
)
2025-12-03 16:14:53 +09:00
}
2025-12-06 12:46:40 +09:00
feat: add XML auto-fix with refined validation logic (#247)
* feat: add XML auto-fix and improve validator accuracy
- Add autoFixXml() to automatically repair common XML issues:
- CDATA wrapper removal
- Duplicate attribute removal
- Unescaped & and < character escaping
- Invalid entity reference fixing
- Unclosed tag completion
- Nested mxCell flattening
- Duplicate ID renaming
- Improve validateMxCellStructure() with DOM + regex approach:
- Use DOMParser for syntax error detection (94% recall)
- Add regex checks for edge cases
- Stateful parser for handling > in attribute values
- Integrate validateAndFixXml() in chat-message-display and diagram-context
- Auto-repair invalid XML before loading
- Log fixes applied for debugging
Metrics: 99.77% accuracy, 94.06% recall, 94.4% auto-fix success rate
* fix: improve XML auto-fix from 58.7% to 99% fix rate
Key improvements:
- Reorder CDATA removal to run before text-before-root check (+35 cases)
- Implement Gemini's backslash-quote fix with regex backreference
Handles attr="value", value="text\"inner\"more", and mixed patterns
- Add aggressive drop-broken-cells fix for unfixable mxCell elements
Iteratively removes cells causing DOM parse errors (up to 50)
Results on 9,411 XML dataset:
- 206 invalid XMLs detected
- 204 successfully fixed (99.0% fix rate)
- 2 unfixable (completely broken, need regeneration)
* refactor: extract XML validation/fix helpers and add constants
- Add constants: MAX_XML_SIZE (1MB), MAX_DROP_ITERATIONS (10), STRUCTURAL_ATTRS, VALID_ENTITIES
- Extract parseXmlTags helper for shared tag parsing logic
- Extract validation helpers: checkDuplicateAttributes, checkDuplicateIds, checkTagMismatches, checkCharacterReferences, checkEntityReferences, checkNestedMxCells
- Simplify validateMxCellStructure from ~200 lines to ~55 lines
- Add logging to empty catch block in DOMParser section
- Add size warning for large XML documents
- Remove unused variables (isSelfClose, duplicate idPattern)
* fix: improve XML auto-fix with malformed quote pattern
- Fix =&quot;...&quot; pattern where &quot; was used as delimiter instead of actual quotes
- Common in dashPattern attributes like dashPattern=&quot;1 1;&quot;
2025-12-13 23:31:01 +09:00
// 3c. Fix malformed closing tags like </tag/> -> </tag>
const malformedClosingTag = /<\/([a-zA-Z][a-zA-Z0-9]*)\s*\/>/g
if ( malformedClosingTag . test ( fixed ) ) {
fixed = fixed . replace ( /<\/([a-zA-Z][a-zA-Z0-9]*)\s*\/>/g , "</$1>" )
fixes . push ( "Fixed malformed closing tags (</tag/> to </tag>)" )
}
// 3d. Fix missing space between attributes like vertex="1"parent="1"
const missingSpacePattern = /("[^"]*")([a-zA-Z][a-zA-Z0-9_:-]*=)/g
if ( missingSpacePattern . test ( fixed ) ) {
fixed = fixed . replace ( /("[^"]*")([a-zA-Z][a-zA-Z0-9_:-]*=)/g , "$1 $2" )
fixes . push ( "Added missing space between attributes" )
}
// 3e. Fix unescaped quotes in style color values like fillColor="#fff2e6"
// The " after Color= prematurely ends the style attribute. Remove it.
// Pattern: ;fillColor="#fff → ;fillColor=#fff (remove first ", keep second as style closer)
const quotedColorPattern = /;([a-zA-Z]*[Cc]olor)="#/
if ( quotedColorPattern . test ( fixed ) ) {
fixed = fixed . replace ( /;([a-zA-Z]*[Cc]olor)="#/g , ";$1=#" )
fixes . push ( "Removed quotes around color values in style" )
}
2025-12-13 15:00:28 +09:00
// 4. Fix unescaped < in attribute values
// This is tricky - we need to find < inside quoted attribute values
const attrPattern = /(=\s*")([^"]*?)(<)([^"]*?)(")/g
let attrMatch
let hasUnescapedLt = false
while ( ( attrMatch = attrPattern . exec ( fixed ) ) !== null ) {
if ( ! attrMatch [ 3 ] . startsWith ( "<" ) ) {
hasUnescapedLt = true
break
}
}
if ( hasUnescapedLt ) {
// Replace < with &lt; inside attribute values
feat: add XML auto-fix with refined validation logic (#247)
* feat: add XML auto-fix and improve validator accuracy
- Add autoFixXml() to automatically repair common XML issues:
- CDATA wrapper removal
- Duplicate attribute removal
- Unescaped & and < character escaping
- Invalid entity reference fixing
- Unclosed tag completion
- Nested mxCell flattening
- Duplicate ID renaming
- Improve validateMxCellStructure() with DOM + regex approach:
- Use DOMParser for syntax error detection (94% recall)
- Add regex checks for edge cases
- Stateful parser for handling > in attribute values
- Integrate validateAndFixXml() in chat-message-display and diagram-context
- Auto-repair invalid XML before loading
- Log fixes applied for debugging
Metrics: 99.77% accuracy, 94.06% recall, 94.4% auto-fix success rate
* fix: improve XML auto-fix from 58.7% to 99% fix rate
Key improvements:
- Reorder CDATA removal to run before text-before-root check (+35 cases)
- Implement Gemini's backslash-quote fix with regex backreference
Handles attr="value", value="text\"inner\"more", and mixed patterns
- Add aggressive drop-broken-cells fix for unfixable mxCell elements
Iteratively removes cells causing DOM parse errors (up to 50)
Results on 9,411 XML dataset:
- 206 invalid XMLs detected
- 204 successfully fixed (99.0% fix rate)
- 2 unfixable (completely broken, need regeneration)
* refactor: extract XML validation/fix helpers and add constants
- Add constants: MAX_XML_SIZE (1MB), MAX_DROP_ITERATIONS (10), STRUCTURAL_ATTRS, VALID_ENTITIES
- Extract parseXmlTags helper for shared tag parsing logic
- Extract validation helpers: checkDuplicateAttributes, checkDuplicateIds, checkTagMismatches, checkCharacterReferences, checkEntityReferences, checkNestedMxCells
- Simplify validateMxCellStructure from ~200 lines to ~55 lines
- Add logging to empty catch block in DOMParser section
- Add size warning for large XML documents
- Remove unused variables (isSelfClose, duplicate idPattern)
* fix: improve XML auto-fix with malformed quote pattern
- Fix =&quot;...&quot; pattern where &quot; was used as delimiter instead of actual quotes
- Common in dashPattern attributes like dashPattern=&quot;1 1;&quot;
2025-12-13 23:31:01 +09:00
fixed = fixed . replace ( /=\s*"([^"]*)"/g , ( _match , value ) = > {
2025-12-13 15:00:28 +09:00
const escaped = value . replace ( /</g , "<" )
return ` =" ${ escaped } " `
} )
fixes . push ( "Escaped < characters in attribute values" )
2025-12-03 16:14:53 +09:00
}
2025-12-13 15:00:28 +09:00
// 5. Fix invalid character references (remove malformed ones)
// Pattern: &#x followed by non-hex chars before ;
const invalidHexRefs : string [ ] = [ ]
fixed = fixed . replace ( /&#x([^;]*);/g , ( match , hex ) = > {
if ( /^[0-9a-fA-F]+$/ . test ( hex ) && hex . length > 0 ) {
return match // Valid hex ref, keep it
2025-12-06 12:46:40 +09:00
}
2025-12-13 15:00:28 +09:00
invalidHexRefs . push ( match )
return "" // Remove invalid ref
} )
if ( invalidHexRefs . length > 0 ) {
fixes . push (
` Removed ${ invalidHexRefs . length } invalid hex character reference(s) ` ,
)
}
// 6. Fix invalid decimal character references
const invalidDecRefs : string [ ] = [ ]
fixed = fixed . replace ( /&#([^x][^;]*);/g , ( match , dec ) = > {
if ( /^[0-9]+$/ . test ( dec ) && dec . length > 0 ) {
return match // Valid decimal ref, keep it
2025-12-06 12:46:40 +09:00
}
2025-12-13 15:00:28 +09:00
invalidDecRefs . push ( match )
return "" // Remove invalid ref
2025-12-06 12:46:40 +09:00
} )
2025-12-13 15:00:28 +09:00
if ( invalidDecRefs . length > 0 ) {
fixes . push (
` Removed ${ invalidDecRefs . length } invalid decimal character reference(s) ` ,
)
}
2025-12-03 16:14:53 +09:00
2025-12-13 15:00:28 +09:00
// 7. Fix invalid comment syntax (replace -- with - repeatedly until none left)
fixed = fixed . replace ( /<!--([\s\S]*?)-->/g , ( match , content ) = > {
if ( /--/ . test ( content ) ) {
// Keep replacing until no double hyphens remain
let fixedContent = content
while ( /--/ . test ( fixedContent ) ) {
fixedContent = fixedContent . replace ( /--/g , "-" )
2025-12-07 00:40:19 +09:00
}
2025-12-13 15:00:28 +09:00
fixes . push ( "Fixed invalid comment syntax (removed double hyphens)" )
return ` <!-- ${ fixedContent } --> `
2025-12-07 00:40:19 +09:00
}
2025-12-13 15:00:28 +09:00
return match
2025-12-07 00:40:19 +09:00
} )
2025-12-13 15:00:28 +09:00
// 8. Fix <Cell> tags that should be <mxCell> (common LLM mistake)
// This handles both opening and closing tags
const hasCellTags = /<\/?Cell[\s>]/i . test ( fixed )
if ( hasCellTags ) {
2025-12-14 19:38:40 +09:00
console . log ( "[autoFixXml] Step 8: Found <Cell> tags to fix" )
const beforeFix = fixed
2025-12-13 15:00:28 +09:00
fixed = fixed . replace ( /<Cell(\s)/gi , "<mxCell$1" )
fixed = fixed . replace ( /<Cell>/gi , "<mxCell>" )
fixed = fixed . replace ( /<\/Cell>/gi , "</mxCell>" )
2025-12-14 19:38:40 +09:00
if ( beforeFix !== fixed ) {
console . log ( "[autoFixXml] Step 8: Fixed <Cell> tags" )
}
2025-12-13 15:00:28 +09:00
fixes . push ( "Fixed <Cell> tags to <mxCell>" )
2025-12-07 00:40:19 +09:00
}
2025-12-14 19:38:40 +09:00
// 8b. Remove non-draw.io tags (LLM sometimes includes Claude's function calling XML)
// Valid draw.io tags: mxfile, diagram, mxGraphModel, root, mxCell, mxGeometry, mxPoint, Array, Object, mxRectangle
const validDrawioTags = new Set ( [
"mxfile" ,
"diagram" ,
"mxGraphModel" ,
"root" ,
"mxCell" ,
"mxGeometry" ,
"mxPoint" ,
"Array" ,
"Object" ,
"mxRectangle" ,
] )
const foreignTagPattern = /<\/?([a-zA-Z][a-zA-Z0-9_]*)[^>]*>/g
let foreignMatch
const foreignTags = new Set < string > ( )
while ( ( foreignMatch = foreignTagPattern . exec ( fixed ) ) !== null ) {
const tagName = foreignMatch [ 1 ]
if ( ! validDrawioTags . has ( tagName ) ) {
foreignTags . add ( tagName )
}
}
if ( foreignTags . size > 0 ) {
console . log (
"[autoFixXml] Step 8b: Found foreign tags:" ,
Array . from ( foreignTags ) ,
)
for ( const tag of foreignTags ) {
// Remove opening tags (with or without attributes)
fixed = fixed . replace ( new RegExp ( ` < ${ tag } [^>]*> ` , "gi" ) , "" )
// Remove closing tags
fixed = fixed . replace ( new RegExp ( ` </ ${ tag } > ` , "gi" ) , "" )
}
fixes . push (
` Removed foreign tags: ${ Array . from ( foreignTags ) . join ( ", " ) } ` ,
)
}
2025-12-13 15:00:28 +09:00
// 9. Fix common closing tag typos
const tagTypos = [
{ wrong : /<\/mxElement>/gi , right : "</mxCell>" , name : "</mxElement>" } ,
{ wrong : /<\/mxcell>/g , right : "</mxCell>" , name : "</mxcell>" } , // case sensitivity
{
wrong : /<\/mxgeometry>/g ,
right : "</mxGeometry>" ,
name : "</mxgeometry>" ,
} ,
{ wrong : /<\/mxpoint>/g , right : "</mxPoint>" , name : "</mxpoint>" } ,
{
wrong : /<\/mxgraphmodel>/gi ,
right : "</mxGraphModel>" ,
name : "</mxgraphmodel>" ,
} ,
]
for ( const { wrong , right , name } of tagTypos ) {
if ( wrong . test ( fixed ) ) {
fixed = fixed . replace ( wrong , right )
fixes . push ( ` Fixed typo ${ name } to ${ right } ` )
}
}
// 10. Fix unclosed tags by appending missing closing tags
feat: add XML auto-fix with refined validation logic (#247)
* feat: add XML auto-fix and improve validator accuracy
- Add autoFixXml() to automatically repair common XML issues:
- CDATA wrapper removal
- Duplicate attribute removal
- Unescaped & and < character escaping
- Invalid entity reference fixing
- Unclosed tag completion
- Nested mxCell flattening
- Duplicate ID renaming
- Improve validateMxCellStructure() with DOM + regex approach:
- Use DOMParser for syntax error detection (94% recall)
- Add regex checks for edge cases
- Stateful parser for handling > in attribute values
- Integrate validateAndFixXml() in chat-message-display and diagram-context
- Auto-repair invalid XML before loading
- Log fixes applied for debugging
Metrics: 99.77% accuracy, 94.06% recall, 94.4% auto-fix success rate
* fix: improve XML auto-fix from 58.7% to 99% fix rate
Key improvements:
- Reorder CDATA removal to run before text-before-root check (+35 cases)
- Implement Gemini's backslash-quote fix with regex backreference
Handles attr="value", value="text\"inner\"more", and mixed patterns
- Add aggressive drop-broken-cells fix for unfixable mxCell elements
Iteratively removes cells causing DOM parse errors (up to 50)
Results on 9,411 XML dataset:
- 206 invalid XMLs detected
- 204 successfully fixed (99.0% fix rate)
- 2 unfixable (completely broken, need regeneration)
* refactor: extract XML validation/fix helpers and add constants
- Add constants: MAX_XML_SIZE (1MB), MAX_DROP_ITERATIONS (10), STRUCTURAL_ATTRS, VALID_ENTITIES
- Extract parseXmlTags helper for shared tag parsing logic
- Extract validation helpers: checkDuplicateAttributes, checkDuplicateIds, checkTagMismatches, checkCharacterReferences, checkEntityReferences, checkNestedMxCells
- Simplify validateMxCellStructure from ~200 lines to ~55 lines
- Add logging to empty catch block in DOMParser section
- Add size warning for large XML documents
- Remove unused variables (isSelfClose, duplicate idPattern)
* fix: improve XML auto-fix with malformed quote pattern
- Fix =&quot;...&quot; pattern where &quot; was used as delimiter instead of actual quotes
- Common in dashPattern attributes like dashPattern=&quot;1 1;&quot;
2025-12-13 23:31:01 +09:00
// Use parseXmlTags helper to track open tags
2025-12-13 15:00:28 +09:00
const tagStack : string [ ] = [ ]
feat: add XML auto-fix with refined validation logic (#247)
* feat: add XML auto-fix and improve validator accuracy
- Add autoFixXml() to automatically repair common XML issues:
- CDATA wrapper removal
- Duplicate attribute removal
- Unescaped & and < character escaping
- Invalid entity reference fixing
- Unclosed tag completion
- Nested mxCell flattening
- Duplicate ID renaming
- Improve validateMxCellStructure() with DOM + regex approach:
- Use DOMParser for syntax error detection (94% recall)
- Add regex checks for edge cases
- Stateful parser for handling > in attribute values
- Integrate validateAndFixXml() in chat-message-display and diagram-context
- Auto-repair invalid XML before loading
- Log fixes applied for debugging
Metrics: 99.77% accuracy, 94.06% recall, 94.4% auto-fix success rate
* fix: improve XML auto-fix from 58.7% to 99% fix rate
Key improvements:
- Reorder CDATA removal to run before text-before-root check (+35 cases)
- Implement Gemini's backslash-quote fix with regex backreference
Handles attr="value", value="text\"inner\"more", and mixed patterns
- Add aggressive drop-broken-cells fix for unfixable mxCell elements
Iteratively removes cells causing DOM parse errors (up to 50)
Results on 9,411 XML dataset:
- 206 invalid XMLs detected
- 204 successfully fixed (99.0% fix rate)
- 2 unfixable (completely broken, need regeneration)
* refactor: extract XML validation/fix helpers and add constants
- Add constants: MAX_XML_SIZE (1MB), MAX_DROP_ITERATIONS (10), STRUCTURAL_ATTRS, VALID_ENTITIES
- Extract parseXmlTags helper for shared tag parsing logic
- Extract validation helpers: checkDuplicateAttributes, checkDuplicateIds, checkTagMismatches, checkCharacterReferences, checkEntityReferences, checkNestedMxCells
- Simplify validateMxCellStructure from ~200 lines to ~55 lines
- Add logging to empty catch block in DOMParser section
- Add size warning for large XML documents
- Remove unused variables (isSelfClose, duplicate idPattern)
* fix: improve XML auto-fix with malformed quote pattern
- Fix =&quot;...&quot; pattern where &quot; was used as delimiter instead of actual quotes
- Common in dashPattern attributes like dashPattern=&quot;1 1;&quot;
2025-12-13 23:31:01 +09:00
const parsedTags = parseXmlTags ( fixed )
2025-12-13 15:00:28 +09:00
feat: add XML auto-fix with refined validation logic (#247)
* feat: add XML auto-fix and improve validator accuracy
- Add autoFixXml() to automatically repair common XML issues:
- CDATA wrapper removal
- Duplicate attribute removal
- Unescaped & and < character escaping
- Invalid entity reference fixing
- Unclosed tag completion
- Nested mxCell flattening
- Duplicate ID renaming
- Improve validateMxCellStructure() with DOM + regex approach:
- Use DOMParser for syntax error detection (94% recall)
- Add regex checks for edge cases
- Stateful parser for handling > in attribute values
- Integrate validateAndFixXml() in chat-message-display and diagram-context
- Auto-repair invalid XML before loading
- Log fixes applied for debugging
Metrics: 99.77% accuracy, 94.06% recall, 94.4% auto-fix success rate
* fix: improve XML auto-fix from 58.7% to 99% fix rate
Key improvements:
- Reorder CDATA removal to run before text-before-root check (+35 cases)
- Implement Gemini's backslash-quote fix with regex backreference
Handles attr="value", value="text\"inner\"more", and mixed patterns
- Add aggressive drop-broken-cells fix for unfixable mxCell elements
Iteratively removes cells causing DOM parse errors (up to 50)
Results on 9,411 XML dataset:
- 206 invalid XMLs detected
- 204 successfully fixed (99.0% fix rate)
- 2 unfixable (completely broken, need regeneration)
* refactor: extract XML validation/fix helpers and add constants
- Add constants: MAX_XML_SIZE (1MB), MAX_DROP_ITERATIONS (10), STRUCTURAL_ATTRS, VALID_ENTITIES
- Extract parseXmlTags helper for shared tag parsing logic
- Extract validation helpers: checkDuplicateAttributes, checkDuplicateIds, checkTagMismatches, checkCharacterReferences, checkEntityReferences, checkNestedMxCells
- Simplify validateMxCellStructure from ~200 lines to ~55 lines
- Add logging to empty catch block in DOMParser section
- Add size warning for large XML documents
- Remove unused variables (isSelfClose, duplicate idPattern)
* fix: improve XML auto-fix with malformed quote pattern
- Fix =&quot;...&quot; pattern where &quot; was used as delimiter instead of actual quotes
- Common in dashPattern attributes like dashPattern=&quot;1 1;&quot;
2025-12-13 23:31:01 +09:00
for ( const { tagName , isClosing , isSelfClosing } of parsedTags ) {
2025-12-13 15:00:28 +09:00
if ( isClosing ) {
// Find matching opening tag (may not be the last one if there's mismatch)
const lastIdx = tagStack . lastIndexOf ( tagName )
if ( lastIdx !== - 1 ) {
tagStack . splice ( lastIdx , 1 )
}
} else if ( ! isSelfClosing ) {
tagStack . push ( tagName )
}
}
// If there are unclosed tags, append closing tags in reverse order
// But first verify with simple count that they're actually unclosed
if ( tagStack . length > 0 ) {
const tagsToClose : string [ ] = [ ]
for ( const tagName of tagStack . reverse ( ) ) {
// Simple count check: only close if opens > closes
const openCount = (
fixed . match ( new RegExp ( ` < ${ tagName } [ \\ s>] ` , "gi" ) ) || [ ]
) . length
const closeCount = (
fixed . match ( new RegExp ( ` </ ${ tagName } > ` , "gi" ) ) || [ ]
) . length
if ( openCount > closeCount ) {
tagsToClose . push ( tagName )
}
}
if ( tagsToClose . length > 0 ) {
const closingTags = tagsToClose . map ( ( t ) = > ` </ ${ t } > ` ) . join ( "\n" )
fixed = fixed . trimEnd ( ) + "\n" + closingTags
fixes . push (
` Closed ${ tagsToClose . length } unclosed tag(s): ${ tagsToClose . join ( ", " ) } ` ,
)
}
}
2025-12-14 19:38:40 +09:00
// 10b. Remove extra closing tags (more closes than opens)
// Need to properly count self-closing tags (they don't need closing tags)
const tagCounts = new Map <
string ,
{ opens : number ; closes : number ; selfClosing : number }
> ( )
// Match full tags to detect self-closing by checking if ends with />
const fullTagPattern = /<(\/?[a-zA-Z][a-zA-Z0-9]*)[^>]*>/g
let tagCountMatch
while ( ( tagCountMatch = fullTagPattern . exec ( fixed ) ) !== null ) {
const fullMatch = tagCountMatch [ 0 ] // e.g., "<mxCell .../>" or "</mxCell>"
const tagPart = tagCountMatch [ 1 ] // e.g., "mxCell" or "/mxCell"
const isClosing = tagPart . startsWith ( "/" )
const isSelfClosing = fullMatch . endsWith ( "/>" )
const tagName = isClosing ? tagPart . slice ( 1 ) : tagPart
let counts = tagCounts . get ( tagName )
if ( ! counts ) {
counts = { opens : 0 , closes : 0 , selfClosing : 0 }
tagCounts . set ( tagName , counts )
}
if ( isClosing ) {
counts . closes ++
} else if ( isSelfClosing ) {
counts . selfClosing ++
} else {
counts . opens ++
}
}
// Log tag counts for debugging
for ( const [ tagName , counts ] of tagCounts ) {
if (
tagName === "mxCell" ||
tagName === "mxGeometry" ||
counts . opens !== counts . closes
) {
console . log (
` [autoFixXml] Step 10b: ${ tagName } - opens: ${ counts . opens } , closes: ${ counts . closes } , selfClosing: ${ counts . selfClosing } ` ,
)
}
}
// Find tags with extra closing tags (self-closing tags are balanced, don't need closing)
for ( const [ tagName , counts ] of tagCounts ) {
const extraCloses = counts . closes - counts . opens // Only compare opens vs closes (self-closing are balanced)
if ( extraCloses > 0 ) {
console . log (
` [autoFixXml] Step 10b: ${ tagName } has ${ counts . opens } opens, ${ counts . closes } closes, removing ${ extraCloses } extra ` ,
)
// Remove extra closing tags from the end
let removed = 0
const closeTagPattern = new RegExp ( ` </ ${ tagName } > ` , "g" )
const matches = [ . . . fixed . matchAll ( closeTagPattern ) ]
// Remove from the end (last occurrences are likely the extras)
for (
let i = matches . length - 1 ;
i >= 0 && removed < extraCloses ;
i --
) {
const match = matches [ i ]
const idx = match . index ? ? 0
fixed = fixed . slice ( 0 , idx ) + fixed . slice ( idx + match [ 0 ] . length )
removed ++
}
if ( removed > 0 ) {
console . log (
` [autoFixXml] Step 10b: Removed ${ removed } extra </ ${ tagName } > ` ,
)
fixes . push (
` Removed ${ removed } extra </ ${ tagName } > closing tag(s) ` ,
)
}
}
}
// 10c. Remove trailing garbage after last XML tag (e.g., stray backslashes, text)
// Find the last valid closing tag or self-closing tag
const closingTagPattern = /<\/[a-zA-Z][a-zA-Z0-9]*>|\/>/g
let lastValidTagEnd = - 1
let closingMatch
while ( ( closingMatch = closingTagPattern . exec ( fixed ) ) !== null ) {
lastValidTagEnd = closingMatch . index + closingMatch [ 0 ] . length
}
if ( lastValidTagEnd > 0 && lastValidTagEnd < fixed . length ) {
const trailing = fixed . slice ( lastValidTagEnd ) . trim ( )
if ( trailing ) {
fixed = fixed . slice ( 0 , lastValidTagEnd )
fixes . push ( "Removed trailing garbage after last XML tag" )
}
}
2025-12-13 15:00:28 +09:00
// 11. Fix nested mxCell by flattening
// Pattern A: <mxCell id="X">...<mxCell id="X">...</mxCell></mxCell> (duplicate ID)
// Pattern B: <mxCell id="X">...<mxCell id="Y">...</mxCell></mxCell> (different ID - true nesting)
const lines = fixed . split ( "\n" )
let newLines : string [ ] = [ ]
let nestedFixed = 0
let extraClosingToRemove = 0
// First pass: fix duplicate ID nesting (same as before)
for ( let i = 0 ; i < lines . length ; i ++ ) {
const line = lines [ i ]
const nextLine = lines [ i + 1 ]
// Check if current line and next line are both mxCell opening tags with same ID
if (
nextLine &&
/<mxCell\s/ . test ( line ) &&
/<mxCell\s/ . test ( nextLine ) &&
! line . includes ( "/>" ) &&
! nextLine . includes ( "/>" )
) {
const id1 = line . match ( /\bid\s*=\s*["']([^"']+)["']/ ) ? . [ 1 ]
const id2 = nextLine . match ( /\bid\s*=\s*["']([^"']+)["']/ ) ? . [ 1 ]
if ( id1 && id1 === id2 ) {
nestedFixed ++
extraClosingToRemove ++ // Need to remove one </mxCell> later
continue // Skip this duplicate opening line
}
}
// Remove extra </mxCell> if we have pending removals
if ( extraClosingToRemove > 0 && /^\s*<\/mxCell>\s*$/ . test ( line ) ) {
extraClosingToRemove --
continue // Skip this closing tag
}
newLines . push ( line )
}
if ( nestedFixed > 0 ) {
fixed = newLines . join ( "\n" )
fixes . push ( ` Flattened ${ nestedFixed } duplicate-ID nested mxCell(s) ` )
}
// Second pass: fix true nesting (different IDs)
// Insert </mxCell> before nested child to close parent
const lines2 = fixed . split ( "\n" )
newLines = [ ]
let trueNestedFixed = 0
let cellDepth = 0
let pendingCloseRemoval = 0
for ( let i = 0 ; i < lines2 . length ; i ++ ) {
const line = lines2 [ i ]
const trimmed = line . trim ( )
// Track mxCell depth
const isOpenCell = /<mxCell\s/ . test ( trimmed ) && ! trimmed . endsWith ( "/>" )
const isCloseCell = trimmed === "</mxCell>"
if ( isOpenCell ) {
if ( cellDepth > 0 ) {
// Found nested cell - insert closing tag for parent before this line
const indent = line . match ( /^(\s*)/ ) ? . [ 1 ] || ""
newLines . push ( indent + "</mxCell>" )
trueNestedFixed ++
pendingCloseRemoval ++ // Need to remove one </mxCell> later
}
cellDepth = 1 // Reset to 1 since we just opened a new cell
newLines . push ( line )
} else if ( isCloseCell ) {
if ( pendingCloseRemoval > 0 ) {
pendingCloseRemoval --
// Skip this extra closing tag
} else {
cellDepth = Math . max ( 0 , cellDepth - 1 )
newLines . push ( line )
}
} else {
newLines . push ( line )
}
}
if ( trueNestedFixed > 0 ) {
fixed = newLines . join ( "\n" )
fixes . push ( ` Fixed ${ trueNestedFixed } true nested mxCell(s) ` )
}
// 12. Fix duplicate IDs by appending suffix
const seenIds = new Map < string , number > ( )
const duplicateIds : string [ ] = [ ]
// First pass: find duplicates
feat: add XML auto-fix with refined validation logic (#247)
* feat: add XML auto-fix and improve validator accuracy
- Add autoFixXml() to automatically repair common XML issues:
- CDATA wrapper removal
- Duplicate attribute removal
- Unescaped & and < character escaping
- Invalid entity reference fixing
- Unclosed tag completion
- Nested mxCell flattening
- Duplicate ID renaming
- Improve validateMxCellStructure() with DOM + regex approach:
- Use DOMParser for syntax error detection (94% recall)
- Add regex checks for edge cases
- Stateful parser for handling > in attribute values
- Integrate validateAndFixXml() in chat-message-display and diagram-context
- Auto-repair invalid XML before loading
- Log fixes applied for debugging
Metrics: 99.77% accuracy, 94.06% recall, 94.4% auto-fix success rate
* fix: improve XML auto-fix from 58.7% to 99% fix rate
Key improvements:
- Reorder CDATA removal to run before text-before-root check (+35 cases)
- Implement Gemini's backslash-quote fix with regex backreference
Handles attr="value", value="text\"inner\"more", and mixed patterns
- Add aggressive drop-broken-cells fix for unfixable mxCell elements
Iteratively removes cells causing DOM parse errors (up to 50)
Results on 9,411 XML dataset:
- 206 invalid XMLs detected
- 204 successfully fixed (99.0% fix rate)
- 2 unfixable (completely broken, need regeneration)
* refactor: extract XML validation/fix helpers and add constants
- Add constants: MAX_XML_SIZE (1MB), MAX_DROP_ITERATIONS (10), STRUCTURAL_ATTRS, VALID_ENTITIES
- Extract parseXmlTags helper for shared tag parsing logic
- Extract validation helpers: checkDuplicateAttributes, checkDuplicateIds, checkTagMismatches, checkCharacterReferences, checkEntityReferences, checkNestedMxCells
- Simplify validateMxCellStructure from ~200 lines to ~55 lines
- Add logging to empty catch block in DOMParser section
- Add size warning for large XML documents
- Remove unused variables (isSelfClose, duplicate idPattern)
* fix: improve XML auto-fix with malformed quote pattern
- Fix =&quot;...&quot; pattern where &quot; was used as delimiter instead of actual quotes
- Common in dashPattern attributes like dashPattern=&quot;1 1;&quot;
2025-12-13 23:31:01 +09:00
const idPattern = /\bid\s*=\s*["']([^"']+)["']/gi
2025-12-13 15:00:28 +09:00
let idMatch
feat: add XML auto-fix with refined validation logic (#247)
* feat: add XML auto-fix and improve validator accuracy
- Add autoFixXml() to automatically repair common XML issues:
- CDATA wrapper removal
- Duplicate attribute removal
- Unescaped & and < character escaping
- Invalid entity reference fixing
- Unclosed tag completion
- Nested mxCell flattening
- Duplicate ID renaming
- Improve validateMxCellStructure() with DOM + regex approach:
- Use DOMParser for syntax error detection (94% recall)
- Add regex checks for edge cases
- Stateful parser for handling > in attribute values
- Integrate validateAndFixXml() in chat-message-display and diagram-context
- Auto-repair invalid XML before loading
- Log fixes applied for debugging
Metrics: 99.77% accuracy, 94.06% recall, 94.4% auto-fix success rate
* fix: improve XML auto-fix from 58.7% to 99% fix rate
Key improvements:
- Reorder CDATA removal to run before text-before-root check (+35 cases)
- Implement Gemini's backslash-quote fix with regex backreference
Handles attr="value", value="text\"inner\"more", and mixed patterns
- Add aggressive drop-broken-cells fix for unfixable mxCell elements
Iteratively removes cells causing DOM parse errors (up to 50)
Results on 9,411 XML dataset:
- 206 invalid XMLs detected
- 204 successfully fixed (99.0% fix rate)
- 2 unfixable (completely broken, need regeneration)
* refactor: extract XML validation/fix helpers and add constants
- Add constants: MAX_XML_SIZE (1MB), MAX_DROP_ITERATIONS (10), STRUCTURAL_ATTRS, VALID_ENTITIES
- Extract parseXmlTags helper for shared tag parsing logic
- Extract validation helpers: checkDuplicateAttributes, checkDuplicateIds, checkTagMismatches, checkCharacterReferences, checkEntityReferences, checkNestedMxCells
- Simplify validateMxCellStructure from ~200 lines to ~55 lines
- Add logging to empty catch block in DOMParser section
- Add size warning for large XML documents
- Remove unused variables (isSelfClose, duplicate idPattern)
* fix: improve XML auto-fix with malformed quote pattern
- Fix =&quot;...&quot; pattern where &quot; was used as delimiter instead of actual quotes
- Common in dashPattern attributes like dashPattern=&quot;1 1;&quot;
2025-12-13 23:31:01 +09:00
while ( ( idMatch = idPattern . exec ( fixed ) ) !== null ) {
2025-12-13 15:00:28 +09:00
const id = idMatch [ 1 ]
seenIds . set ( id , ( seenIds . get ( id ) || 0 ) + 1 )
}
// Find which IDs are duplicated
for ( const [ id , count ] of seenIds ) {
if ( count > 1 ) duplicateIds . push ( id )
}
// Second pass: rename duplicates (keep first occurrence, rename others)
if ( duplicateIds . length > 0 ) {
const idCounters = new Map < string , number > ( )
fixed = fixed . replace ( /\bid\s*=\s*["']([^"']+)["']/gi , ( match , id ) = > {
if ( ! duplicateIds . includes ( id ) ) return match
const count = idCounters . get ( id ) || 0
idCounters . set ( id , count + 1 )
if ( count === 0 ) return match // Keep first occurrence
// Rename subsequent occurrences
const newId = ` ${ id } _dup ${ count } `
return match . replace ( id , newId )
} )
fixes . push ( ` Renamed ${ duplicateIds . length } duplicate ID(s) ` )
}
// 12b. Fix empty id attributes by generating unique IDs
let emptyIdCount = 0
fixed = fixed . replace (
/<mxCell([^>]*)\sid\s*=\s*["']\s*["']([^>]*)>/g ,
feat: add XML auto-fix with refined validation logic (#247)
* feat: add XML auto-fix and improve validator accuracy
- Add autoFixXml() to automatically repair common XML issues:
- CDATA wrapper removal
- Duplicate attribute removal
- Unescaped & and < character escaping
- Invalid entity reference fixing
- Unclosed tag completion
- Nested mxCell flattening
- Duplicate ID renaming
- Improve validateMxCellStructure() with DOM + regex approach:
- Use DOMParser for syntax error detection (94% recall)
- Add regex checks for edge cases
- Stateful parser for handling > in attribute values
- Integrate validateAndFixXml() in chat-message-display and diagram-context
- Auto-repair invalid XML before loading
- Log fixes applied for debugging
Metrics: 99.77% accuracy, 94.06% recall, 94.4% auto-fix success rate
* fix: improve XML auto-fix from 58.7% to 99% fix rate
Key improvements:
- Reorder CDATA removal to run before text-before-root check (+35 cases)
- Implement Gemini's backslash-quote fix with regex backreference
Handles attr="value", value="text\"inner\"more", and mixed patterns
- Add aggressive drop-broken-cells fix for unfixable mxCell elements
Iteratively removes cells causing DOM parse errors (up to 50)
Results on 9,411 XML dataset:
- 206 invalid XMLs detected
- 204 successfully fixed (99.0% fix rate)
- 2 unfixable (completely broken, need regeneration)
* refactor: extract XML validation/fix helpers and add constants
- Add constants: MAX_XML_SIZE (1MB), MAX_DROP_ITERATIONS (10), STRUCTURAL_ATTRS, VALID_ENTITIES
- Extract parseXmlTags helper for shared tag parsing logic
- Extract validation helpers: checkDuplicateAttributes, checkDuplicateIds, checkTagMismatches, checkCharacterReferences, checkEntityReferences, checkNestedMxCells
- Simplify validateMxCellStructure from ~200 lines to ~55 lines
- Add logging to empty catch block in DOMParser section
- Add size warning for large XML documents
- Remove unused variables (isSelfClose, duplicate idPattern)
* fix: improve XML auto-fix with malformed quote pattern
- Fix =&quot;...&quot; pattern where &quot; was used as delimiter instead of actual quotes
- Common in dashPattern attributes like dashPattern=&quot;1 1;&quot;
2025-12-13 23:31:01 +09:00
( _match , before , after ) = > {
2025-12-13 15:00:28 +09:00
emptyIdCount ++
const newId = ` cell_ ${ Date . now ( ) } _ ${ emptyIdCount } `
return ` <mxCell ${ before } id=" ${ newId } " ${ after } > `
} ,
)
if ( emptyIdCount > 0 ) {
fixes . push ( ` Generated ${ emptyIdCount } missing ID(s) ` )
}
2025-12-13 16:02:56 +09:00
// 13. Aggressive: drop broken mxCell elements that can't be fixed
// Only do this if DOM parser still finds errors after all other fixes
if ( typeof DOMParser !== "undefined" ) {
let droppedCells = 0
feat: add XML auto-fix with refined validation logic (#247)
* feat: add XML auto-fix and improve validator accuracy
- Add autoFixXml() to automatically repair common XML issues:
- CDATA wrapper removal
- Duplicate attribute removal
- Unescaped & and < character escaping
- Invalid entity reference fixing
- Unclosed tag completion
- Nested mxCell flattening
- Duplicate ID renaming
- Improve validateMxCellStructure() with DOM + regex approach:
- Use DOMParser for syntax error detection (94% recall)
- Add regex checks for edge cases
- Stateful parser for handling > in attribute values
- Integrate validateAndFixXml() in chat-message-display and diagram-context
- Auto-repair invalid XML before loading
- Log fixes applied for debugging
Metrics: 99.77% accuracy, 94.06% recall, 94.4% auto-fix success rate
* fix: improve XML auto-fix from 58.7% to 99% fix rate
Key improvements:
- Reorder CDATA removal to run before text-before-root check (+35 cases)
- Implement Gemini's backslash-quote fix with regex backreference
Handles attr="value", value="text\"inner\"more", and mixed patterns
- Add aggressive drop-broken-cells fix for unfixable mxCell elements
Iteratively removes cells causing DOM parse errors (up to 50)
Results on 9,411 XML dataset:
- 206 invalid XMLs detected
- 204 successfully fixed (99.0% fix rate)
- 2 unfixable (completely broken, need regeneration)
* refactor: extract XML validation/fix helpers and add constants
- Add constants: MAX_XML_SIZE (1MB), MAX_DROP_ITERATIONS (10), STRUCTURAL_ATTRS, VALID_ENTITIES
- Extract parseXmlTags helper for shared tag parsing logic
- Extract validation helpers: checkDuplicateAttributes, checkDuplicateIds, checkTagMismatches, checkCharacterReferences, checkEntityReferences, checkNestedMxCells
- Simplify validateMxCellStructure from ~200 lines to ~55 lines
- Add logging to empty catch block in DOMParser section
- Add size warning for large XML documents
- Remove unused variables (isSelfClose, duplicate idPattern)
* fix: improve XML auto-fix with malformed quote pattern
- Fix =&quot;...&quot; pattern where &quot; was used as delimiter instead of actual quotes
- Common in dashPattern attributes like dashPattern=&quot;1 1;&quot;
2025-12-13 23:31:01 +09:00
let maxIterations = MAX_DROP_ITERATIONS
2025-12-13 16:02:56 +09:00
while ( maxIterations -- > 0 ) {
const parser = new DOMParser ( )
const doc = parser . parseFromString ( fixed , "text/xml" )
const parseError = doc . querySelector ( "parsererror" )
if ( ! parseError ) break // Valid now!
const errText = parseError . textContent || ""
const match = errText . match ( /(\d+):\d+:/ )
if ( ! match ) break
const errLine = parseInt ( match [ 1 ] , 10 ) - 1
const lines = fixed . split ( "\n" )
// Find the mxCell containing this error line
let cellStart = errLine
let cellEnd = errLine
// Go back to find <mxCell
while ( cellStart > 0 && ! lines [ cellStart ] . includes ( "<mxCell" ) ) {
cellStart --
}
// Go forward to find </mxCell> or />
while ( cellEnd < lines . length - 1 ) {
if (
lines [ cellEnd ] . includes ( "</mxCell>" ) ||
lines [ cellEnd ] . trim ( ) . endsWith ( "/>" )
) {
break
}
cellEnd ++
}
// Remove these lines
lines . splice ( cellStart , cellEnd - cellStart + 1 )
fixed = lines . join ( "\n" )
droppedCells ++
}
if ( droppedCells > 0 ) {
fixes . push ( ` Dropped ${ droppedCells } unfixable mxCell element(s) ` )
}
}
2025-12-13 15:00:28 +09:00
return { fixed , fixes }
}
/**
 * Checks an XML string with `validateMxCellStructure` and, when it is
 * rejected, runs `autoFixXml` on it and checks the repaired text again.
 *
 * @param xml - The XML string to validate and, if necessary, repair
 * @returns `valid` tells whether the original or the repaired XML passed
 * validation; `error` is the remaining validation message (or `null`);
 * `fixed` is the repaired XML (`null` when the input was already valid or
 * when no fix was applied to a still-invalid document); `fixes` lists the
 * repairs reported by `autoFixXml`.
 */
export function validateAndFixXml(xml: string): {
    valid: boolean
    error: string | null
    fixed: string | null
    fixes: string[]
} {
    // Fast path: the input is already acceptable, nothing to repair.
    const initialError = validateMxCellStructure(xml)
    if (!initialError) {
        return { valid: true, error: null, fixed: null, fixes: [] }
    }

    // Attempt a repair and report what was changed.
    const repair = autoFixXml(xml)
    console.log("[validateAndFixXml] Fixes applied:", repair.fixes)

    // Re-validate the repaired document.
    const remainingError = validateMxCellStructure(repair.fixed)
    if (remainingError) {
        console.log(
            "[validateAndFixXml] Still invalid after fix:",
            remainingError,
        )
        // Hand back the partially repaired XML (if anything was changed) so
        // callers can inspect both the applied fixes and the remaining error.
        return {
            valid: false,
            error: remainingError,
            fixed: repair.fixes.length > 0 ? repair.fixed : null,
            fixes: repair.fixes,
        }
    }

    return {
        valid: true,
        error: null,
        fixed: repair.fixed,
        fixes: repair.fixes,
    }
}
2025-03-27 06:45:38 +00:00
/**
 * Extracts the diagram XML embedded in a base64 SVG data URI.
 *
 * Decoding pipeline: data URI -> base64-decoded SVG -> `content` attribute of
 * the `<svg>` element -> text of the `<diagram>` element inside that content
 * -> base64 decode -> raw-deflate inflate (pako, `windowBits: -15`) -> UTF-8
 * decode -> URL decode.
 *
 * Requires the browser globals `DOMParser`, `document` and `atob`.
 *
 * @param xml_svg_string - SVG data URI such as
 * `data:image/svg+xml;base64,<payload>`. Any `data:` header is accepted
 * (e.g. one carrying a `charset` parameter); everything up to and including
 * the first comma is treated as the header.
 * @returns The decoded diagram XML string
 * @throws Error when the `<svg>` element, its `content` attribute, the
 * `<diagram>` element or its text is missing. Any decoding failure (base64,
 * inflate, URL decoding) is logged and re-thrown unchanged.
 */
export function extractDiagramXML(xml_svg_string: string): string {
    try {
        // 1. Strip the data-URI header. Base64 never contains a comma, so the
        //    first comma of a data URI always terminates the header. Inputs
        //    that do not look like a data URI keep the historical behaviour
        //    of dropping the first 26 characters, which is the length of
        //    "data:image/svg+xml;base64,".
        const LEGACY_HEADER_LENGTH = 26
        const headerEnd = /^data:[^,]*,/i.test(xml_svg_string)
            ? xml_svg_string.indexOf(",") + 1
            : LEGACY_HEADER_LENGTH
        const svgString = atob(xml_svg_string.slice(headerEnd))

        // 2. Parse the SVG and read its 'content' attribute
        const parser = new DOMParser()
        const svgDoc = parser.parseFromString(svgString, "image/svg+xml")
        const svgElement = svgDoc.querySelector("svg")
        if (!svgElement) {
            throw new Error("No SVG element found in the input string.")
        }

        const encodedContent = svgElement.getAttribute("content")
        if (!encodedContent) {
            throw new Error("SVG element does not have a 'content' attribute.")
        }

        // 3. Decode HTML entities by round-tripping through a <textarea>
        const textarea = document.createElement("textarea")
        textarea.innerHTML = encodedContent
        const xmlContent = textarea.value

        // 4. Parse the XML content and locate the <diagram> element
        const xmlDoc = parser.parseFromString(xmlContent, "text/xml")
        const diagramElement = xmlDoc.querySelector("diagram")
        if (!diagramElement) {
            throw new Error("No diagram element found")
        }

        // 5. The <diagram> text holds the base64-encoded, deflated payload
        const base64EncodedData = diagramElement.textContent
        if (!base64EncodedData) {
            throw new Error("No encoded data found in the diagram element")
        }

        // 6. Base64 -> binary string -> byte array
        const binaryString = atob(base64EncodedData)
        const bytes = Uint8Array.from(binaryString, (ch) => ch.charCodeAt(0))

        // 7. Inflate a raw deflate stream (equivalent to zlib.decompress with
        //    wbits=-15)
        const decompressedData = pako.inflate(bytes, { windowBits: -15 })

        // 8. Bytes -> UTF-8 text
        const decodedString = new TextDecoder("utf-8").decode(decompressedData)

        // 9. Undo URL encoding (equivalent to Python's urllib.parse.unquote)
        return decodeURIComponent(decodedString)
    } catch (error) {
        console.error("Error extracting diagram XML:", error)
        throw error // Re-throw for caller handling
    }
}