From 504d713139947bc868a588cb3dd6243974b10a45 Mon Sep 17 00:00:00 2001
From: "J. King" <jking@jkingweb.ca>
Date: Sat, 1 Apr 2023 11:28:05 -0400
Subject: [PATCH] Simplify UTF-16 handling in DOMParser

---
 lib/DOMParser.php             | 42 ++++++++++++---------------------
 tests/cases/TestDOMParser.php | 44 ++++++++++++++++++++++++-----------
 2 files changed, 46 insertions(+), 40 deletions(-)

diff --git a/lib/DOMParser.php b/lib/DOMParser.php
index 603bca5..09ce276 100644
--- a/lib/DOMParser.php
+++ b/lib/DOMParser.php
@@ -52,11 +52,13 @@ XMLDECL;
             // for HTML we invoke our parser which has its own handling for everything
             return $this->createDocumentHtml($string, $type);
         } elseif ($t->isXml) {
-            // for XML we have to jump through a few hoops to deal with encoding;
-            //   if we have a known encoding we want to make sure the XML parser
-            //   doesn't try to do its own detection. The only way to do this is
-            //   to convert to UTF-8 where necessary and remove any XML
-            //   declaration encoding information
+            // for XML we have to jump through a few hoops to deal with
+            //   encoding; if we have a known encoding we want to make sure
+            //   the XML parser doesn't try to do its own detection. We can
+            //   treat byte order marks as authoritative. In their absence we
+            //   can add BOMs to UTF-16 documents, but for other encodings we
+            //   must parse XML declarations and validate that any encoding
+            //   declaration is correct and change it if it is incorrect
             try {
                 // first check for a byte order mark; if one exists we can go straight to parsing
                 if (!Encoding::sniffBOM($string)) {
@@ -70,28 +72,14 @@ XMLDECL;
                     // if a supported encoding was parsed from the type, act
                     //   accordingly; otherwise skip to parsing and let the
                     //   XML parser detect encoding
-                    if ($charset) {
-                        // if the string is UTF-16, transcode it to UTF-8 so
-                        //   we're always dealing with an ASCII-compatible
-                        //   encoding (XML's parsing rules ensure documents
-                        //   in semi-ASCII-compatible encodings like Shift_JIS
-                        //   or ISO 2022-JP never contain non-ASCII characters
-                        //   before encoding information is seen)
-                        if ($charset === "UTF-16BE" || $charset === "UTF-16LE") {
-                            // NOTE: the transcoding operation may throw an
-                            //   exception due to unpaired surrogates, which
-                            //   is why this whole operation is wrapped in a
-                            //   try block
-                            $decoder = Encoding::createDecoder($charset, $string, true, false);
-                            $string = "";
-                            while (strlen($c = $decoder->nextChar())) {
-                                $string .= $c;
-                                $string .= $decoder->asciiSpanNot("");
-                            }
-                            unset($decoder);
-                            $charset = "UTF-8";
-                        }
-                        // look for an XML declaration
+                    if ($charset === "UTF-16BE") {
+                        // if the string is UTF-16BE, adding a BOM is sufficient
+                        $string = self::BOM_UTF16BE.$string;
+                     } elseif ($charset === "UTF-16LE") {
+                        // if the string is UTF-16LE, adding a BOM is sufficient
+                        $string = self::BOM_UTF16LE.$string;
+                     } elseif ($charset) {
+                        // for ASCII-compatible encodings look for an XML declaration
                         if (preg_match(self::XML_DECLARATION_PATTERN, $string, $match)) {
                             // if an existing encoding declaration is found,
                             //   keep it only if it matches; if no encoding
diff --git a/tests/cases/TestDOMParser.php b/tests/cases/TestDOMParser.php
index bf9cc4e..fd8542c 100644
--- a/tests/cases/TestDOMParser.php
+++ b/tests/cases/TestDOMParser.php
@@ -16,24 +16,42 @@ class TestDOMParser extends \PHPUnit\Framework\TestCase {
     public function testParseADocument(string $input, string $type, bool $parseError, string $exp): void {
         $p = new DOMParser;
         $document = $p->parseFromString($input, $type);
-        $root = $parseError ? "parserror" : "html";
-        $this->assertSame($root, $document->documentElement->tagName);
+        $root = $parseError ? "parsererror" : "html";
         $this->assertSame($exp, $document->documentElement->textContent);
+        $this->assertSame($root, $document->documentElement->tagName);
     }
 
     public function provideDocuments(): iterable {
+        $mkUtf16 = function(string $s, bool $le) {
+            $replacement = $le ? "$0\x00" : "\x00$0";
+            return preg_replace("/[\x{01}-\x{7F}]/s", $replacement, $s);
+        };
         return [
-            ["Test",                                                               "text/html",                     false, "Test"],
-            ["Ol\xE9",                                                             "text/html",                     false, "Ol\u{E9}"],
-            ["Ol\u{E9}",                                                           "text/html;charset=utf8",        false, "Ol\u{E9}"],
-            ["<meta charset=utf8>Ol\u{E9}",                                        "text/html",                     false, "Ol\u{E9}"],
-            ["<html>Test</html>",                                                  "text/xml",                      false, "Test"],
-            ["<html>Ol\u{E9}</html>",                                              "text/xml",                      false, "Ol\u{E9}"],
-            ["<html>Ol\xE9</html>",                                                "text/xml;charset=windows-1252", false, "Ol\u{E9}"],
-            ["\u{FEFF}<html>Ol\u{E9}</html>",                                      "text/xml;charset=windows-1252", false, "Ol\u{E9}"],
-            ["<?xml version='1.0' encoding='windows-1252'?><html>Ol\xE9</html>",   "text/xml",                      false, "Ol\u{E9}"],
-            ["<html>Ol\xE9</html>",                                                "text/xml;charset=windows-1252", false, "Ol\u{E9}"],
-            ["<?xml version='1.2' encoding='windows-1252'?><html>Ol\u{E9}</html>", "text/xml;charset=UTF-8",        false, "Ol\u{E9}"],
+            ["Test",                                                                                   "text/html",                     false, "Test"],
+            ["Ol\xE9",                                                                                 "text/html",                     false, "Ol\u{E9}"],
+            ["Ol\u{E9}",                                                                               "text/html;charset=utf8",        false, "Ol\u{E9}"],
+            ["<meta charset=utf8>Ol\u{E9}",                                                            "text/html",                     false, "Ol\u{E9}"],
+            ["<html>Test</html>",                                                                      "text/xml",                      false, "Test"],
+            ["<html>Ol\u{E9}</html>",                                                                  "text/xml",                      false, "Ol\u{E9}"],
+            ["<html>Ol\xE9</html>",                                                                    "text/xml;charset=windows-1252", false, "Ol\u{E9}"],
+            ["\u{FEFF}<html>Ol\u{E9}</html>",                                                          "text/xml;charset=windows-1252", false, "Ol\u{E9}"],
+            ["<?xml version='1.0' encoding='windows-1252'?><html>Ol\xE9</html>",                       "text/xml",                      false, "Ol\u{E9}"],
+            ["<html>Ol\xE9</html>",                                                                    "text/xml;charset=windows-1252", false, "Ol\u{E9}"],
+            ["<html>Ol\u{E9}</html>",                                                                  "text/xml;charset=UTF-8",        false, "Ol\u{E9}"],
+            ["<?xml version='1.1' encoding='windows-1252'?><html>Ol\u{E9}</html>",                     "text/xml;charset=UTF-8",        false, "Ol\u{E9}"],
+            ["<?xml version='1.1' encoding='utf8'?><html>Ol\u{E9}</html>",                             "text/xml;charset=UTF-8",        false, "Ol\u{E9}"],
+            ["<?xml version='1.1'?><html>Ol\u{E9}</html>",                                             "text/xml;charset=UTF-8",        false, "Ol\u{E9}"],
+            ["<?xml version='1.1' ?><html>Ol\u{E9}</html>",                                            "text/xml;charset=UTF-8",        false, "Ol\u{E9}"],
+            ["<?xml version='1.0' standalone='yes'?><html>Ol\u{E9}</html>",                            "text/xml;charset=UTF-8",        false, "Ol\u{E9}"],
+            ["<?xml version='1.0' standalone='yes'?><html>Ol\xE9</html>",                              "text/xml;charset=windows-1252", false, "Ol\u{E9}"],
+            [$mkUtf16("\xFE\xFF<html>Ol\x00\xE9</html>", false),                                       "text/xml",                      false, "Ol\u{E9}"],
+            [$mkUtf16("\xFF\xFE<html>Ol\xE9\x00</html>", true),                                        "text/xml",                      false, "Ol\u{E9}"],
+            [$mkUtf16("<?xml version='1.0' encoding='UTF-16'?><html>Ol\x00\xE9</html>", false),        "text/xml",                      false, "Ol\u{E9}"],
+            [$mkUtf16("<?xml version='1.0' encoding='UTF-16'?><html>Ol\xE9\x00</html>", true),         "text/xml",                      false, "Ol\u{E9}"],
+            [$mkUtf16("\xFE\xFF<?xml version='1.0' encoding='UTF-8'?><html>Ol\x00\xE9</html>", false), "text/xml",                      false, "Ol\u{E9}"],
+            [$mkUtf16("\xFF\xFE<?xml version='1.0' encoding='UTF-8'?><html>Ol\xE9\x00</html>", true),  "text/xml",                      false, "Ol\u{E9}"],
+            [$mkUtf16("<?xml version='1.0' encoding='UTF-8'?><html>Ol\x00\xE9</html>", false),         "text/xml;charset=utf-16be",     false, "Ol\u{E9}"],
+            [$mkUtf16("<?xml version='1.0' encoding='UTF-8'?><html>Ol\xE9\x00</html>", true),          "text/xml;charset=utf-16le",     false, "Ol\u{E9}"],
         ];
     }
 }