From 504d713139947bc868a588cb3dd6243974b10a45 Mon Sep 17 00:00:00 2001 From: "J. King" Date: Sat, 1 Apr 2023 11:28:05 -0400 Subject: [PATCH] Simplify UTF-16 handling in DOMParser --- lib/DOMParser.php | 42 ++++++++++++--------------------- tests/cases/TestDOMParser.php | 44 ++++++++++++++++++++++++----------- 2 files changed, 46 insertions(+), 40 deletions(-) diff --git a/lib/DOMParser.php b/lib/DOMParser.php index 603bca5..09ce276 100644 --- a/lib/DOMParser.php +++ b/lib/DOMParser.php @@ -52,11 +52,13 @@ XMLDECL; // for HTML we invoke our parser which has its own handling for everything return $this->createDocumentHtml($string, $type); } elseif ($t->isXml) { - // for XML we have to jump through a few hoops to deal with encoding; - // if we have a known encoding we want to make sure the XML parser - // doesn't try to do its own detection. The only way to do this is - // to convert to UTF-8 where necessary and remove any XML - // declaration encoding information + // for XML we have to jump through a few hoops to deal with + // encoding; if we have a known encoding we want to make sure + // the XML parser doesn't try to do its own detection. We can + // treat byte order marks as authoritative. In their absence we + // can add BOMs to UTF-16 documents, but for other encodings we + // must parse XML declarations and validate that any encoding + // declaration is correct and change it if it is incorrect try { // first check for a byte order mark; if one exists we can go straight to parsing if (!Encoding::sniffBOM($string)) { @@ -70,28 +72,14 @@ XMLDECL; // if a supported encoding was parsed from the type, act // accordingly; otherwise skip to parsing and let the // XML parser detect encoding - if ($charset) { - // if the string is UTF-16, transcode it to UTF-8 so - // we're always dealing with an ASCII-compatible - // encoding (XML's parsing rules ensure documents - // in semi-ASCII-compatible encodings like Shift_JIS - // or ISO 2022-JP never contain non-ASCII characters - // before encoding information is seen) - if ($charset === "UTF-16BE" || $charset === "UTF-16LE") { - // NOTE: the transcoding operation may throw an - // exception due to unpaired surrogates, which - // is why this whole operation is wrapped in a - // try block - $decoder = Encoding::createDecoder($charset, $string, true, false); - $string = ""; - while (strlen($c = $decoder->nextChar())) { - $string .= $c; - $string .= $decoder->asciiSpanNot(""); - } - unset($decoder); - $charset = "UTF-8"; - } - // look for an XML declaration + if ($charset === "UTF-16BE") { + // if the string is UTF-16BE, adding a BOM is sufficient + $string = self::BOM_UTF16BE.$string; + } elseif ($charset === "UTF-16LE") { + // if the string is UTF-16LE, adding a BOM is sufficient + $string = self::BOM_UTF16LE.$string; + } elseif ($charset) { + // for ASCII-compatible encodings look for an XML declaration if (preg_match(self::XML_DECLARATION_PATTERN, $string, $match)) { // if an existing encoding declaration is found, // keep it only if it matches; if no encoding diff --git a/tests/cases/TestDOMParser.php b/tests/cases/TestDOMParser.php index bf9cc4e..fd8542c 100644 --- a/tests/cases/TestDOMParser.php +++ b/tests/cases/TestDOMParser.php @@ -16,24 +16,42 @@ class TestDOMParser extends \PHPUnit\Framework\TestCase { public function testParseADocument(string $input, string $type, bool $parseError, string $exp): void { $p = new DOMParser; $document = $p->parseFromString($input, $type); - $root = $parseError ? "parserror" : "html"; - $this->assertSame($root, $document->documentElement->tagName); + $root = $parseError ? "parsererror" : "html"; $this->assertSame($exp, $document->documentElement->textContent); + $this->assertSame($root, $document->documentElement->tagName); } public function provideDocuments(): iterable { + $mkUtf16 = function(string $s, bool $le) { + $replacement = $le ? "$0\x00" : "\x00$0"; + return preg_replace("/[\x{01}-\x{7F}]/s", $replacement, $s); + }; return [ - ["Test", "text/html", false, "Test"], - ["Ol\xE9", "text/html", false, "Ol\u{E9}"], - ["Ol\u{E9}", "text/html;charset=utf8", false, "Ol\u{E9}"], - ["Ol\u{E9}", "text/html", false, "Ol\u{E9}"], - ["Test", "text/xml", false, "Test"], - ["Ol\u{E9}", "text/xml", false, "Ol\u{E9}"], - ["Ol\xE9", "text/xml;charset=windows-1252", false, "Ol\u{E9}"], - ["\u{FEFF}Ol\u{E9}", "text/xml;charset=windows-1252", false, "Ol\u{E9}"], - ["Ol\xE9", "text/xml", false, "Ol\u{E9}"], - ["Ol\xE9", "text/xml;charset=windows-1252", false, "Ol\u{E9}"], - ["Ol\u{E9}", "text/xml;charset=UTF-8", false, "Ol\u{E9}"], + ["Test", "text/html", false, "Test"], + ["Ol\xE9", "text/html", false, "Ol\u{E9}"], + ["Ol\u{E9}", "text/html;charset=utf8", false, "Ol\u{E9}"], + ["Ol\u{E9}", "text/html", false, "Ol\u{E9}"], + ["Test", "text/xml", false, "Test"], + ["Ol\u{E9}", "text/xml", false, "Ol\u{E9}"], + ["Ol\xE9", "text/xml;charset=windows-1252", false, "Ol\u{E9}"], + ["\u{FEFF}Ol\u{E9}", "text/xml;charset=windows-1252", false, "Ol\u{E9}"], + ["Ol\xE9", "text/xml", false, "Ol\u{E9}"], + ["Ol\xE9", "text/xml;charset=windows-1252", false, "Ol\u{E9}"], + ["Ol\u{E9}", "text/xml;charset=UTF-8", false, "Ol\u{E9}"], + ["Ol\u{E9}", "text/xml;charset=UTF-8", false, "Ol\u{E9}"], + ["Ol\u{E9}", "text/xml;charset=UTF-8", false, "Ol\u{E9}"], + ["Ol\u{E9}", "text/xml;charset=UTF-8", false, "Ol\u{E9}"], + ["Ol\u{E9}", "text/xml;charset=UTF-8", false, "Ol\u{E9}"], + ["Ol\u{E9}", "text/xml;charset=UTF-8", false, "Ol\u{E9}"], + ["Ol\xE9", "text/xml;charset=windows-1252", false, "Ol\u{E9}"], + [$mkUtf16("\xFE\xFFOl\x00\xE9", false), "text/xml", false, "Ol\u{E9}"], + [$mkUtf16("\xFF\xFEOl\xE9\x00", true), "text/xml", false, "Ol\u{E9}"], + [$mkUtf16("Ol\x00\xE9", false), "text/xml", false, "Ol\u{E9}"], + [$mkUtf16("Ol\xE9\x00", true), "text/xml", false, "Ol\u{E9}"], + [$mkUtf16("\xFE\xFFOl\x00\xE9", false), "text/xml", false, "Ol\u{E9}"], + [$mkUtf16("\xFF\xFEOl\xE9\x00", true), "text/xml", false, "Ol\u{E9}"], + [$mkUtf16("Ol\x00\xE9", false), "text/xml;charset=utf-16be", false, "Ol\u{E9}"], + [$mkUtf16("Ol\xE9\x00", true), "text/xml;charset=utf-16le", false, "Ol\u{E9}"], ]; } }