refactor: extract XML validation/fix helpers and add constants

- Add constants: MAX_XML_SIZE (1MB), MAX_DROP_ITERATIONS (10), STRUCTURAL_ATTRS, VALID_ENTITIES
- Extract parseXmlTags helper for shared tag parsing logic
- Extract validation helpers: checkDuplicateAttributes, checkDuplicateIds, checkTagMismatches, checkCharacterReferences, checkEntityReferences, checkNestedMxCells
- Simplify validateMxCellStructure from ~200 lines to ~55 lines
- Add logging to empty catch block in DOMParser section
- Add size warning for large XML documents
- Remove unused variables (isSelfClose, duplicate idPattern)
This commit is contained in:
dayuan.jiang
2025-12-13 16:31:31 +09:00
parent 6024443816
commit 39c0497762

View File

@@ -6,6 +6,95 @@ export function cn(...inputs: ClassValue[]) {
return twMerge(clsx(inputs))
}
// ============================================================================
// XML Validation/Fix Constants
// ============================================================================
/** Maximum XML size to process (1MB) - larger XMLs may cause performance issues */
const MAX_XML_SIZE = 1_000_000
/** Maximum iterations for aggressive cell dropping to prevent infinite loops */
const MAX_DROP_ITERATIONS = 10
/** Structural attributes that should not be duplicated in draw.io */
const STRUCTURAL_ATTRS = [
"edge",
"parent",
"source",
"target",
"vertex",
"connectable",
]
/** Valid XML entity names */
const VALID_ENTITIES = new Set(["lt", "gt", "amp", "quot", "apos"])
// ============================================================================
// XML Parsing Helpers
// ============================================================================
interface ParsedTag {
tag: string
tagName: string
isClosing: boolean
isSelfClosing: boolean
startIndex: number
endIndex: number
}
/**
* Parse XML tags while properly handling quoted strings
* This is a shared utility used by both validation and fixing logic
*/
function parseXmlTags(xml: string): ParsedTag[] {
const tags: ParsedTag[] = []
let i = 0
while (i < xml.length) {
const tagStart = xml.indexOf("<", i)
if (tagStart === -1) break
// Find matching > by tracking quotes
let tagEnd = tagStart + 1
let inQuote = false
let quoteChar = ""
while (tagEnd < xml.length) {
const c = xml[tagEnd]
if (inQuote) {
if (c === quoteChar) inQuote = false
} else {
if (c === '"' || c === "'") {
inQuote = true
quoteChar = c
} else if (c === ">") {
break
}
}
tagEnd++
}
if (tagEnd >= xml.length) break
const tag = xml.substring(tagStart, tagEnd + 1)
i = tagEnd + 1
const tagMatch = /^<(\/?)([a-zA-Z][a-zA-Z0-9:_-]*)/.exec(tag)
if (!tagMatch) continue
tags.push({
tag,
tagName: tagMatch[2],
isClosing: tagMatch[1] === "/",
isSelfClosing: tag.endsWith("/>"),
startIndex: tagStart,
endIndex: tagEnd,
})
}
return tags
}
/**
* Format XML string with proper indentation and line breaks
* @param xml - The XML string to format
@@ -533,48 +622,13 @@ export function replaceXMLParts(
return result
}
/**
* Validates draw.io XML structure for common issues
* Uses DOM parsing + additional regex checks for high accuracy
* @param xml - The XML string to validate
* @returns null if valid, error message string if invalid
*/
export function validateMxCellStructure(xml: string): string | null {
// 0. First use DOM parser to catch syntax errors (most accurate)
try {
const parser = new DOMParser()
const doc = parser.parseFromString(xml, "text/xml")
const parseError = doc.querySelector("parsererror")
if (parseError) {
return `Invalid XML: The XML contains syntax errors (likely unescaped special characters like <, >, & in attribute values). Please escape special characters: use &lt; for <, &gt; for >, &amp; for &, &quot; for ". Regenerate the diagram with properly escaped values.`
}
// ============================================================================
// Validation Helper Functions
// ============================================================================
// DOM-based checks for nested mxCell
const allCells = doc.querySelectorAll("mxCell")
for (const cell of allCells) {
if (cell.parentElement?.tagName === "mxCell") {
const id = cell.getAttribute("id") || "unknown"
return `Invalid XML: Found nested mxCell (id="${id}"). Cells should be siblings, not nested inside other mxCell elements.`
}
}
} catch {
// If DOMParser fails, continue with regex checks
}
// 1. Check for CDATA wrapper (invalid at document root)
if (/^\s*<!\[CDATA\[/.test(xml)) {
return "Invalid XML: XML is wrapped in CDATA section - remove <![CDATA[ from start and ]]> from end"
}
// 2. Check for duplicate structural attributes in tags
const structuralAttrs = new Set([
"edge",
"parent",
"source",
"target",
"vertex",
"connectable",
])
/** Check for duplicate structural attributes in a tag */
function checkDuplicateAttributes(xml: string): string | null {
const structuralSet = new Set(STRUCTURAL_ATTRS)
const tagPattern = /<[^>]+>/g
let tagMatch
while ((tagMatch = tagPattern.exec(xml)) !== null) {
@@ -587,24 +641,17 @@ export function validateMxCellStructure(xml: string): string | null {
attributes.set(attrName, (attributes.get(attrName) || 0) + 1)
}
const duplicates = Array.from(attributes.entries())
.filter(([name, count]) => count > 1 && structuralAttrs.has(name))
.filter(([name, count]) => count > 1 && structuralSet.has(name))
.map(([name]) => name)
if (duplicates.length > 0) {
return `Invalid XML: Duplicate structural attribute(s): ${duplicates.join(", ")}. Remove duplicate attributes.`
}
}
return null
}
// 3. Check for unescaped < in attribute values
const attrValuePattern = /=\s*"([^"]*)"/g
let attrValMatch
while ((attrValMatch = attrValuePattern.exec(xml)) !== null) {
const value = attrValMatch[1]
if (/</.test(value) && !/&lt;/.test(value)) {
return "Invalid XML: Unescaped < character in attribute values. Replace < with &lt;"
}
}
// 4. Check for duplicate IDs
/** Check for duplicate IDs in XML */
function checkDuplicateIds(xml: string): string | null {
const idPattern = /\bid\s*=\s*["']([^"']+)["']/gi
const ids = new Map<string, number>()
let idMatch
@@ -618,50 +665,16 @@ export function validateMxCellStructure(xml: string): string | null {
if (duplicateIds.length > 0) {
return `Invalid XML: Found duplicate ID(s): ${duplicateIds.slice(0, 3).join(", ")}. All id attributes must be unique.`
}
return null
}
// 5. Check for tag mismatches using stateful parser
/** Check for tag mismatches using parsed tags */
function checkTagMismatches(xml: string): string | null {
const xmlWithoutComments = xml.replace(/<!--[\s\S]*?-->/g, "")
const tags = parseXmlTags(xmlWithoutComments)
const tagStack: string[] = []
// Parse tags properly by handling quoted strings
let i = 0
while (i < xmlWithoutComments.length) {
// Find next <
const tagStart = xmlWithoutComments.indexOf("<", i)
if (tagStart === -1) break
// Find matching > by tracking quotes
let tagEnd = tagStart + 1
let inQuote = false
let quoteChar = ""
while (tagEnd < xmlWithoutComments.length) {
const c = xmlWithoutComments[tagEnd]
if (inQuote) {
if (c === quoteChar) inQuote = false
} else {
if (c === '"' || c === "'") {
inQuote = true
quoteChar = c
} else if (c === ">") {
break
}
}
tagEnd++
}
if (tagEnd >= xmlWithoutComments.length) break
const tag = xmlWithoutComments.substring(tagStart, tagEnd + 1)
i = tagEnd + 1
// Parse the tag
const tagMatch = /^<(\/?)([a-zA-Z][a-zA-Z0-9:_-]*)/.exec(tag)
if (!tagMatch) continue
const isClosing = tagMatch[1] === "/"
const tagName = tagMatch[2]
const isSelfClosing = tag.endsWith("/>")
for (const { tagName, isClosing, isSelfClosing } of tags) {
if (isClosing) {
if (tagStack.length === 0) {
return `Invalid XML: Closing tag </${tagName}> without matching opening tag`
@@ -677,8 +690,11 @@ export function validateMxCellStructure(xml: string): string | null {
if (tagStack.length > 0) {
return `Invalid XML: Document has ${tagStack.length} unclosed tag(s): ${tagStack.join(", ")}`
}
return null
}
// 6. Check invalid character references
/** Check for invalid character references */
function checkCharacterReferences(xml: string): string | null {
const charRefPattern = /&#x?[^;]+;?/g
let charMatch
while ((charMatch = charRefPattern.exec(xml)) !== null) {
@@ -701,42 +717,30 @@ export function validateMxCellStructure(xml: string): string | null {
}
}
}
return null
}
// 7. Check for invalid comment syntax (-- inside comments)
const commentPattern = /<!--([\s\S]*?)-->/g
let commentMatch
while ((commentMatch = commentPattern.exec(xml)) !== null) {
if (/--/.test(commentMatch[1])) {
return "Invalid XML: Comment contains -- (double hyphen) which is not allowed"
}
}
// 8. Check for unescaped entity references and invalid entity names
/** Check for invalid entity references */
function checkEntityReferences(xml: string): string | null {
const xmlWithoutComments = xml.replace(/<!--[\s\S]*?-->/g, "")
const bareAmpPattern = /&(?!(?:lt|gt|amp|quot|apos|#))/g
if (bareAmpPattern.test(xmlWithoutComments)) {
return "Invalid XML: Found unescaped & character(s). Replace & with &amp;"
}
const invalidEntityPattern = /&([a-zA-Z][a-zA-Z0-9]*);/g
const validEntities = new Set(["lt", "gt", "amp", "quot", "apos"])
let entityMatch
while (
(entityMatch = invalidEntityPattern.exec(xmlWithoutComments)) !== null
) {
if (!validEntities.has(entityMatch[1])) {
if (!VALID_ENTITIES.has(entityMatch[1])) {
return `Invalid XML: Invalid entity reference: &${entityMatch[1]}; - use only valid XML entities (lt, gt, amp, quot, apos)`
}
}
return null
}
// 9. Check for empty id attributes on mxCell
if (/<mxCell[^>]*\sid\s*=\s*["']\s*["'][^>]*>/g.test(xml)) {
return "Invalid XML: Found mxCell element(s) with empty id attribute"
}
// 10. Check for mxfile wrapper (warning only - may not work with URL hash loading)
// Disabled: This is just a warning, not an error
// if (xml.trim().startsWith('<mxfile')) { ... }
// 11. Check for nested mxCell tags
/** Check for nested mxCell tags using regex */
function checkNestedMxCells(xml: string): string | null {
const cellTagPattern = /<\/?mxCell[^>]*>/g
const cellStack: number[] = []
let cellMatch
@@ -755,6 +759,100 @@ export function validateMxCellStructure(xml: string): string | null {
}
}
}
return null
}
/**
* Validates draw.io XML structure for common issues
* Uses DOM parsing + additional regex checks for high accuracy
* @param xml - The XML string to validate
* @returns null if valid, error message string if invalid
*/
export function validateMxCellStructure(xml: string): string | null {
// Size check for performance
if (xml.length > MAX_XML_SIZE) {
console.warn(
`[validateMxCellStructure] XML size (${xml.length}) exceeds ${MAX_XML_SIZE} bytes, may cause performance issues`,
)
}
// 0. First use DOM parser to catch syntax errors (most accurate)
try {
const parser = new DOMParser()
const doc = parser.parseFromString(xml, "text/xml")
const parseError = doc.querySelector("parsererror")
if (parseError) {
return `Invalid XML: The XML contains syntax errors (likely unescaped special characters like <, >, & in attribute values). Please escape special characters: use &lt; for <, &gt; for >, &amp; for &, &quot; for ". Regenerate the diagram with properly escaped values.`
}
// DOM-based checks for nested mxCell
const allCells = doc.querySelectorAll("mxCell")
for (const cell of allCells) {
if (cell.parentElement?.tagName === "mxCell") {
const id = cell.getAttribute("id") || "unknown"
return `Invalid XML: Found nested mxCell (id="${id}"). Cells should be siblings, not nested inside other mxCell elements.`
}
}
} catch (error) {
// Log unexpected DOMParser errors before falling back to regex checks
console.warn(
"[validateMxCellStructure] DOMParser threw unexpected error, falling back to regex validation:",
error,
)
}
// 1. Check for CDATA wrapper (invalid at document root)
if (/^\s*<!\[CDATA\[/.test(xml)) {
return "Invalid XML: XML is wrapped in CDATA section - remove <![CDATA[ from start and ]]> from end"
}
// 2. Check for duplicate structural attributes
const dupAttrError = checkDuplicateAttributes(xml)
if (dupAttrError) return dupAttrError
// 3. Check for unescaped < in attribute values
const attrValuePattern = /=\s*"([^"]*)"/g
let attrValMatch
while ((attrValMatch = attrValuePattern.exec(xml)) !== null) {
const value = attrValMatch[1]
if (/</.test(value) && !/&lt;/.test(value)) {
return "Invalid XML: Unescaped < character in attribute values. Replace < with &lt;"
}
}
// 4. Check for duplicate IDs
const dupIdError = checkDuplicateIds(xml)
if (dupIdError) return dupIdError
// 5. Check for tag mismatches
const tagMismatchError = checkTagMismatches(xml)
if (tagMismatchError) return tagMismatchError
// 6. Check invalid character references
const charRefError = checkCharacterReferences(xml)
if (charRefError) return charRefError
// 7. Check for invalid comment syntax (-- inside comments)
const commentPattern = /<!--([\s\S]*?)-->/g
let commentMatch
while ((commentMatch = commentPattern.exec(xml)) !== null) {
if (/--/.test(commentMatch[1])) {
return "Invalid XML: Comment contains -- (double hyphen) which is not allowed"
}
}
// 8. Check for unescaped entity references and invalid entity names
const entityError = checkEntityReferences(xml)
if (entityError) return entityError
// 9. Check for empty id attributes on mxCell
if (/<mxCell[^>]*\sid\s*=\s*["']\s*["'][^>]*>/g.test(xml)) {
return "Invalid XML: Found mxCell element(s) with empty id attribute"
}
// 10. Check for nested mxCell tags
const nestedCellError = checkNestedMxCells(xml)
if (nestedCellError) return nestedCellError
return null
}
@@ -796,20 +894,11 @@ export function autoFixXml(xml: string): { fixed: string; fixes: string[] } {
}
// 2. Fix duplicate attributes (keep first occurrence, remove duplicates)
const structuralAttrsToFix = [
"edge",
"parent",
"source",
"target",
"vertex",
"connectable",
]
let dupAttrFixed = false
fixed = fixed.replace(/<[^>]+>/g, (tag) => {
const seenAttrs = new Set<string>()
let newTag = tag
for (const attr of structuralAttrsToFix) {
for (const attr of STRUCTURAL_ATTRS) {
// Find all occurrences of this attribute
const attrRegex = new RegExp(
`\\s${attr}\\s*=\\s*["'][^"']*["']`,
@@ -891,7 +980,7 @@ export function autoFixXml(xml: string): { fixed: string; fixes: string[] } {
}
if (hasUnescapedLt) {
// Replace < with &lt; inside attribute values
fixed = fixed.replace(/=\s*"([^"]*)"/g, (match, value) => {
fixed = fixed.replace(/=\s*"([^"]*)"/g, (_match, value) => {
const escaped = value.replace(/</g, "&lt;")
return `="${escaped}"`
})
@@ -977,45 +1066,11 @@ export function autoFixXml(xml: string): { fixed: string; fixes: string[] } {
}
// 10. Fix unclosed tags by appending missing closing tags
// Track open tags and close any that are left open using stateful parser
// Use parseXmlTags helper to track open tags
const tagStack: string[] = []
const parsedTags = parseXmlTags(fixed)
let idx = 0
while (idx < fixed.length) {
const tagStart = fixed.indexOf("<", idx)
if (tagStart === -1) break
// Find matching > by tracking quotes
let tagEnd = tagStart + 1
let inQuote = false
let quoteChar = ""
while (tagEnd < fixed.length) {
const c = fixed[tagEnd]
if (inQuote) {
if (c === quoteChar) inQuote = false
} else {
if (c === '"' || c === "'") {
inQuote = true
quoteChar = c
} else if (c === ">") {
break
}
}
tagEnd++
}
if (tagEnd >= fixed.length) break
const tag = fixed.substring(tagStart, tagEnd + 1)
idx = tagEnd + 1
const tagMatch2 = /^<(\/?)([a-zA-Z][a-zA-Z0-9:_-]*)/.exec(tag)
if (!tagMatch2) continue
const isClosing = tagMatch2[1] === "/"
const tagName = tagMatch2[2]
const isSelfClosing = tag.endsWith("/>")
for (const { tagName, isClosing, isSelfClosing } of parsedTags) {
if (isClosing) {
// Find matching opening tag (may not be the last one if there's mismatch)
const lastIdx = tagStack.lastIndexOf(tagName)
@@ -1112,7 +1167,6 @@ export function autoFixXml(xml: string): { fixed: string; fixes: string[] } {
// Track mxCell depth
const isOpenCell = /<mxCell\s/.test(trimmed) && !trimmed.endsWith("/>")
const isCloseCell = trimmed === "</mxCell>"
const isSelfClose = /<mxCell[^>]*\/>/.test(trimmed)
if (isOpenCell) {
if (cellDepth > 0) {
@@ -1143,14 +1197,13 @@ export function autoFixXml(xml: string): { fixed: string; fixes: string[] } {
}
// 12. Fix duplicate IDs by appending suffix
const idPattern = /\bid\s*=\s*["']([^"']+)["']/gi
const seenIds = new Map<string, number>()
const duplicateIds: string[] = []
// First pass: find duplicates
const idPattern = /\bid\s*=\s*["']([^"']+)["']/gi
let idMatch
const tempPattern = /\bid\s*=\s*["']([^"']+)["']/gi
while ((idMatch = tempPattern.exec(fixed)) !== null) {
while ((idMatch = idPattern.exec(fixed)) !== null) {
const id = idMatch[1]
seenIds.set(id, (seenIds.get(id) || 0) + 1)
}
@@ -1182,7 +1235,7 @@ export function autoFixXml(xml: string): { fixed: string; fixes: string[] } {
let emptyIdCount = 0
fixed = fixed.replace(
/<mxCell([^>]*)\sid\s*=\s*["']\s*["']([^>]*)>/g,
(match, before, after) => {
(_match, before, after) => {
emptyIdCount++
const newId = `cell_${Date.now()}_${emptyIdCount}`
return `<mxCell${before} id="${newId}"${after}>`
@@ -1196,7 +1249,7 @@ export function autoFixXml(xml: string): { fixed: string; fixes: string[] } {
// Only do this if DOM parser still finds errors after all other fixes
if (typeof DOMParser !== "undefined") {
let droppedCells = 0
let maxIterations = 50
let maxIterations = MAX_DROP_ITERATIONS
while (maxIterations-- > 0) {
const parser = new DOMParser()
const doc = parser.parseFromString(fixed, "text/xml")