Browse Source

Simplify UTF-16 handling in DOMParser

domparser
J. King 1 year ago
parent
commit
504d713139
  1. 42
      lib/DOMParser.php
  2. 44
      tests/cases/TestDOMParser.php

42
lib/DOMParser.php

@ -52,11 +52,13 @@ XMLDECL;
// for HTML we invoke our parser which has its own handling for everything
return $this->createDocumentHtml($string, $type);
} elseif ($t->isXml) {
// for XML we have to jump through a few hoops to deal with encoding;
// if we have a known encoding we want to make sure the XML parser
// doesn't try to do its own detection. The only way to do this is
// to convert to UTF-8 where necessary and remove any XML
// declaration encoding information
// for XML we have to jump through a few hoops to deal with
// encoding: if we have a known encoding we want to make sure
// the XML parser doesn't try to do its own detection. We can
// treat byte order marks as authoritative. In their absence we
// can add BOMs to UTF-16 documents, but for other encodings we
// must parse the XML declaration, validate any encoding
// declaration it contains, and correct it if it is wrong.
try {
// first check for a byte order mark; if one exists we can go straight to parsing
if (!Encoding::sniffBOM($string)) {
@ -70,28 +72,14 @@ XMLDECL;
// if a supported encoding was parsed from the type, act
// accordingly; otherwise skip to parsing and let the
// XML parser detect encoding
if ($charset) {
// if the string is UTF-16, transcode it to UTF-8 so
// we're always dealing with an ASCII-compatible
// encoding (XML's parsing rules ensure documents
// in semi-ASCII-compatible encodings like Shift_JIS
// or ISO 2022-JP never contain non-ASCII characters
// before encoding information is seen)
if ($charset === "UTF-16BE" || $charset === "UTF-16LE") {
// NOTE: the transcoding operation may throw an
// exception due to unpaired surrogates, which
// is why this whole operation is wrapped in a
// try block
$decoder = Encoding::createDecoder($charset, $string, true, false);
$string = "";
while (strlen($c = $decoder->nextChar())) {
$string .= $c;
$string .= $decoder->asciiSpanNot("");
}
unset($decoder);
$charset = "UTF-8";
}
// look for an XML declaration
if ($charset === "UTF-16BE") {
// if the string is UTF-16BE, adding a BOM is sufficient
$string = self::BOM_UTF16BE.$string;
} elseif ($charset === "UTF-16LE") {
// if the string is UTF-16LE, adding a BOM is sufficient
$string = self::BOM_UTF16LE.$string;
} elseif ($charset) {
// for ASCII-compatible encodings look for an XML declaration
if (preg_match(self::XML_DECLARATION_PATTERN, $string, $match)) {
// if an existing encoding declaration is found,
// keep it only if it matches; if no encoding

44
tests/cases/TestDOMParser.php

@ -16,24 +16,42 @@ class TestDOMParser extends \PHPUnit\Framework\TestCase {
public function testParseADocument(string $input, string $type, bool $parseError, string $exp): void {
// Parses $input as a document of MIME type $type and checks the root
// element's tag name and text content against the expectations.
// NOTE(review): the root-name assignment and the tag-name assertion each
// appear twice below, which looks like the removed and added sides of a
// diff shown without +/- markers -- confirm which pair is current.
$p = new DOMParser;
$document = $p->parseFromString($input, $type);
// NOTE(review): "parserror" differs from the "parsererror" spelling used
// further down; presumably this is the older, misspelled line
$root = $parseError ? "parserror" : "html";
$this->assertSame($root, $document->documentElement->tagName);
// expected root element name: "parsererror" when a parse error is expected, "html" otherwise
$root = $parseError ? "parsererror" : "html";
// the text content of the root element must equal the expected string
$this->assertSame($exp, $document->documentElement->textContent);
$this->assertSame($root, $document->documentElement->tagName);
}
public function provideDocuments(): iterable {
// Returns the list of test cases; each entry has four values which line up
// with the parameters of testParseADocument(): input string, MIME type,
// whether a parse error is expected, and the expected text content.
// NOTE(review): presumably wired up as that test's data provider; the
// annotation is not visible in this excerpt -- confirm.

// Helper which turns an ASCII string into UTF-16 by pairing every byte in
// the range 0x01-0x7F with a NUL byte: the NUL goes after the byte when
// $le is true (little-endian) and before it otherwise (big-endian). Bytes
// outside that range are left untouched, which is why byte order marks
// and the non-ASCII character are written out by hand in the inputs below.
$mkUtf16 = function(string $s, bool $le) {
$replacement = $le ? "$0\x00" : "\x00$0";
return preg_replace("/[\x{01}-\x{7F}]/s", $replacement, $s);
};
return [
// NOTE(review): the next eleven entries are repeated almost verbatim
// further down (only the last one differs, in its XML version); this
// looks like the removed side of a diff shown without +/- markers
["Test", "text/html", false, "Test"],
["Ol\xE9", "text/html", false, "Ol\u{E9}"],
["Ol\u{E9}", "text/html;charset=utf8", false, "Ol\u{E9}"],
["<meta charset=utf8>Ol\u{E9}", "text/html", false, "Ol\u{E9}"],
["<html>Test</html>", "text/xml", false, "Test"],
["<html>Ol\u{E9}</html>", "text/xml", false, "Ol\u{E9}"],
["<html>Ol\xE9</html>", "text/xml;charset=windows-1252", false, "Ol\u{E9}"],
["\u{FEFF}<html>Ol\u{E9}</html>", "text/xml;charset=windows-1252", false, "Ol\u{E9}"],
["<?xml version='1.0' encoding='windows-1252'?><html>Ol\xE9</html>", "text/xml", false, "Ol\u{E9}"],
["<html>Ol\xE9</html>", "text/xml;charset=windows-1252", false, "Ol\u{E9}"],
["<?xml version='1.2' encoding='windows-1252'?><html>Ol\u{E9}</html>", "text/xml;charset=UTF-8", false, "Ol\u{E9}"],
// HTML inputs: no charset information, charset in the type, and charset in a meta element
["Test", "text/html", false, "Test"],
["Ol\xE9", "text/html", false, "Ol\u{E9}"],
["Ol\u{E9}", "text/html;charset=utf8", false, "Ol\u{E9}"],
["<meta charset=utf8>Ol\u{E9}", "text/html", false, "Ol\u{E9}"],
// XML inputs in ASCII-compatible encodings, covering a charset in the
// type, a UTF-8 byte order mark, and XML declarations with and without
// an encoding declaration (including one contradicting the type)
["<html>Test</html>", "text/xml", false, "Test"],
["<html>Ol\u{E9}</html>", "text/xml", false, "Ol\u{E9}"],
["<html>Ol\xE9</html>", "text/xml;charset=windows-1252", false, "Ol\u{E9}"],
["\u{FEFF}<html>Ol\u{E9}</html>", "text/xml;charset=windows-1252", false, "Ol\u{E9}"],
["<?xml version='1.0' encoding='windows-1252'?><html>Ol\xE9</html>", "text/xml", false, "Ol\u{E9}"],
["<html>Ol\xE9</html>", "text/xml;charset=windows-1252", false, "Ol\u{E9}"],
["<html>Ol\u{E9}</html>", "text/xml;charset=UTF-8", false, "Ol\u{E9}"],
["<?xml version='1.1' encoding='windows-1252'?><html>Ol\u{E9}</html>", "text/xml;charset=UTF-8", false, "Ol\u{E9}"],
["<?xml version='1.1' encoding='utf8'?><html>Ol\u{E9}</html>", "text/xml;charset=UTF-8", false, "Ol\u{E9}"],
["<?xml version='1.1'?><html>Ol\u{E9}</html>", "text/xml;charset=UTF-8", false, "Ol\u{E9}"],
["<?xml version='1.1' ?><html>Ol\u{E9}</html>", "text/xml;charset=UTF-8", false, "Ol\u{E9}"],
["<?xml version='1.0' standalone='yes'?><html>Ol\u{E9}</html>", "text/xml;charset=UTF-8", false, "Ol\u{E9}"],
["<?xml version='1.0' standalone='yes'?><html>Ol\xE9</html>", "text/xml;charset=windows-1252", false, "Ol\u{E9}"],
// UTF-16 inputs, big-endian then little-endian in each pair: with a
// byte order mark only, with an encoding declaration only, with a byte
// order mark plus a contradictory UTF-8 declaration, and with the
// encoding given only by the type alongside a contradictory declaration
[$mkUtf16("\xFE\xFF<html>Ol\x00\xE9</html>", false), "text/xml", false, "Ol\u{E9}"],
[$mkUtf16("\xFF\xFE<html>Ol\xE9\x00</html>", true), "text/xml", false, "Ol\u{E9}"],
[$mkUtf16("<?xml version='1.0' encoding='UTF-16'?><html>Ol\x00\xE9</html>", false), "text/xml", false, "Ol\u{E9}"],
[$mkUtf16("<?xml version='1.0' encoding='UTF-16'?><html>Ol\xE9\x00</html>", true), "text/xml", false, "Ol\u{E9}"],
[$mkUtf16("\xFE\xFF<?xml version='1.0' encoding='UTF-8'?><html>Ol\x00\xE9</html>", false), "text/xml", false, "Ol\u{E9}"],
[$mkUtf16("\xFF\xFE<?xml version='1.0' encoding='UTF-8'?><html>Ol\xE9\x00</html>", true), "text/xml", false, "Ol\u{E9}"],
[$mkUtf16("<?xml version='1.0' encoding='UTF-8'?><html>Ol\x00\xE9</html>", false), "text/xml;charset=utf-16be", false, "Ol\u{E9}"],
[$mkUtf16("<?xml version='1.0' encoding='UTF-8'?><html>Ol\xE9\x00</html>", true), "text/xml;charset=utf-16le", false, "Ol\u{E9}"],
];
}
}

Loading…
Cancel
Save