Browse Source

Avoid modifying the XML string where possible

domparser
J. King 1 year ago
parent
commit
b7535c9b61
  1. 37
      lib/DOMParser.php

37
lib/DOMParser.php

@ -22,7 +22,7 @@ class DOMParser {
/^
<\?xml
(\s+version=(?:"1\.[0-9]+"|'1\.[0-9]+'))
(?:\s+encoding=(?:"[A-Za-z][A-Za-z0-9\._\-]*"|'[A-Za-z][A-Za-z0-9\._\-]*'))?
(?:\s+encoding=("[A-Za-z][A-Za-z0-9\._\-]*"|'[A-Za-z][A-Za-z0-9\._\-]*'))?
(\s+standalone=(?:"yes"|"no"|'yes'|'no'))?
(\s*)\?>
/sx
@ -63,8 +63,7 @@ XMLDECL;
// check the type for a charset parameter if there is no BOM
$charset = $t->params['charset'] ?? "";
if ($charset) {
$encoding = Encoding::matchLabel($charset);
if ($encoding) {
if ($encoding = Encoding::matchLabel($charset)) {
$charset = $encoding['name'];
}
}
@ -79,9 +78,10 @@ XMLDECL;
// or ISO 2022-JP never contain non-ASCII characters
// before encoding information is seen)
if ($charset === "UTF-16BE" || $charset === "UTF-16LE") {
// NOTE: the transcoding operation may throw an
// exception, which is why this whole operation
// is wrapped in a try block
// NOTE: the transcoding operation may throw an
// exception due to unpaired surrogates, which
// is why this whole operation is wrapped in a
// try block
$decoder = Encoding::createDecoder($charset, $string, true, false);
$string = "";
while (strlen($c = $decoder->nextChar())) {
@ -93,10 +93,27 @@ XMLDECL;
}
// look for an XML declaration
if (preg_match(self::XML_DECLARATION_PATTERN, $string, $match)) {
// substitute the information if one is found
$string = "<?xml".$match[1]." encoding=\"$charset\"".$match[2].$match[3]."?>".substr($string, strlen($match[0]));
} else {
// add a declaration if none is found
// if an existing encoding declaration is found,
// keep it only if it matches; if no encoding
// declaration is found but the encoding is UTF-8
// this is also acceptable
$keep = false;
if ($match[2]) {
$candidate = substr($match[2], 1, strlen($match[2]) - 2);
if ($encoding = Encoding::matchLabel($candidate)) {
if ($charset === $encoding['name']) {
$keep = true;
}
}
} elseif ($charset === "UTF-8") {
$keep = true;
}
// substitute the encoding declaration where necessary
if (!$keep) {
$string = "<?xml".$match[1]." encoding=\"$charset\"".$match[3].$match[4]."?>".substr($string, strlen($match[0]));
}
} elseif ($charset !== "UTF-8") {
// add a declaration if none is found and the encoding is not UTF-8
$string = "<?xml version=\"1.0\" encoding=\"$charset\" ?>".$string;
}
}

Loading…
Cancel
Save