From 9dcc37995083ef00781b039c607f9941baebc902 Mon Sep 17 00:00:00 2001
From: "J. King" <jking@jkingweb.ca>
Date: Thu, 30 Mar 2023 10:50:15 -0400
Subject: [PATCH] Allow encoding detection in DOMParser

For HTML the HTML parser's machinery is used; for XML we implement our
own logic for known encodings
---
 lib/DOMParser.php | 101 ++++++++++++++++++++++++++--------------------
 1 file changed, 57 insertions(+), 44 deletions(-)

diff --git a/lib/DOMParser.php b/lib/DOMParser.php
index 0984dff..9af7d02 100644
--- a/lib/DOMParser.php
+++ b/lib/DOMParser.php
@@ -9,65 +9,78 @@ namespace MensBeam\HTML;
 use MensBeam\Mime\MimeType;
 use MensBeam\Intl\Encoding;
 
-/** The DOMParser interface allows authors to create new DOMDocument objects by parsing strings, as either HTML or XML. */
+/** The DOMParser interface allows authors to create new DOMDocument objects by parsing strings, as either HTML or XML */
 class DOMParser {
-    protected const TYPES = [
-        "text/html",
-        "text/xml",
-        "application/xml",
-        "application/xhtml+xml",
-        "image/svg+xml"
-    ];
+    /** @var string A UTF-8 byte order mark */
+    protected const BOM_UTF8 = "\xEF\xBB\xBF";
+    /** @var string A UTF-16 (big-endian) byte order mark */
+    protected const BOM_UTF16BE = "\xFE\xFF";
+    /** @var string A UTF-16 (little-endian) byte order mark */
+    protected const BOM_UTF16LE = "\xFF\xFE";
 
-    /** Parses `$string` using either the HTML or XML parser, according to `$type`, and returns the resulting `DOMDocument`. 
+    /** Parses `$string` using either the HTML or XML parser, according to `$type`, and returns the resulting `DOMDocument`
      * 
-     * `$type` can be `"text/html"` (which will invoke the HTML parser), or any of `"text/xml"`, `"application/xml"`, 
-     * `"application/xhtml+xml"`, or `"image/svg+xml"` (which will invoke the XML parser).
+     * `$type` can be `"text/html"` (which will invoke the HTML parser), or
+     * any XML type (which will invoke the XML parser). A `charset` parameter
+     * may be included to specify the document encoding; otherwise encoding
+     * will be detected from document hints. This differs from the standard
+     * interface, which accepts only certain XML types and takes Unicode
+     * characters rather than bytes as input (and so never needs encoding
+     * detection)
      * 
-     * For the XML parser, if `$string` cannot be parsed, then the returned `DOMDocument` will contain elements describing the resulting error.
+     * For the XML parser, if `$string` cannot be parsed, then the returned
+     * `DOMDocument` will contain elements describing the resulting error
      * 
-     * Note that script elements are not evaluated during parsing, and the resulting document's encoding will always be UTF-8.
-     * 
-     * Values other than the above for `$type` will cause an `InvalidArgumentException` exception to be thrown.
-     * 
-     * Since PHP strings are bytes, `$type` may include a `charset` parameter. If no parameter is is supplied UTF-8 is assumed.
+     * If no encoding is specified and none can be detected from the document,
+     * the default encoding is Windows-1252 for HTML and UTF-8 for XML
      */
     public function parseFromString(string $string, string $type): \DOMDocument {
         // start by parsing the type
         $t = MimeType::parseBytes($type);
-        if (!in_array($t->essence, self::TYPES)) {
-            throw new \InvalidArgumentException("\$type must be one of ".implode(", ", self::TYPES));
-        }
-        $charset = $t->params['charset'] ?? "UTF-8";
-        $encoding = Encoding::matchLabel($charset);
-        if (!$encoding) {
-            throw new \InvalidArgumentException("Specified charset is not supported");
+        if (!$t->isHtml && !$t->isXml) {
+            throw new \InvalidArgumentException("\$type must be \"text/html\" or an XML type");
         }
-        $charset = $encoding['name'];
         // parse the string as either HTML or XML
-        if ($t->essence === "text/html") {
-            // for HTML we invoke our parser
-            $config = new Parser\Config;
-            $config->encodingFallback = "UTF-8";
-            $config->encodingPrescanBytes = 0;
-            return Parser::parse($string, $charset, $config)->document;
+        if ($t->isHtml) {
+            // for HTML we invoke our parser which has its own handling for everything
+            return Parser::parse($string, $type)->document;
         } else {
-            // for XML we have to jump through a few hoops to make sure the DOMDocument doesn't make a hash of things, or try to detect encoding
+            // for XML we have to jump through a few hoops to deal with encoding;
+            //   if we have a known encoding we want to make sure the XML parser
+            //   doesn't try to do its own detection. The best way to do this is
+            //   to add a Unicode byte order mark if the string doesn't have one
             $doc = new \DOMDocument();
             try {
-                if ($charset !== "UTF-8") {
-                    // transcode the string to UTF-8 where necessary
-                    $decoder = Encoding::createDecoder($charset, $string, true, false);
-                    $string = "";
-                    while (strlen($c = $decoder->nextChar())) {
-                        $string .= $c;
-                        $string .= $decoder->asciiSpanNot("");
+                // first check for a byte order mark; if one exists we can go straight to parsing
+                if (!Encoding::sniffBOM($string)) {
+                    // check the type for a charset parameter if there is no BOM
+                    $charset = $t->params['charset'] ?? "";
+                    if ($charset) {
+                        $encoding = Encoding::matchLabel($charset);
+                        if (!$encoding) {
+                            throw new \InvalidArgumentException("Specified charset is not supported");
+                        }
+                        $charset = $encoding['name'];
+                    }
+                    if ($charset) {
+                        // if the string is known to be UTF-8 or UTF-16 according to the type but has no BOM, add one
+                        if ($charset === "UTF-8") {
+                            $string = self::BOM_UTF8.$string;
+                        } elseif ($charset === "UTF-16BE") {
+                            $string = self::BOM_UTF16BE.$string;
+                        } elseif ($charset === "UTF-16LE") {
+                            $string = self::BOM_UTF16LE.$string;
+                        } else {
+                            // transcode the string to UTF-8 with a BOM where the string's encoding cannot include a BOM
+                            $decoder = Encoding::createDecoder($charset, $string, true, false);
+                            $string = self::BOM_UTF8;
+                            while (strlen($c = $decoder->nextChar())) {
+                                $string .= $c;
+                                $string .= $decoder->asciiSpanNot("");
+                            }
+                            unset($decoder);
+                        }
                     }
-                    unset($decoder);
-                }
-                // add a byte-order mark if the string doesn't have one; this serves as an authoritative encoding specifier
-                if (substr($string, 0, 3) !== "\xEF\xBB\xBF") {
-                    $string = "\xEF\xBB\xBF".$string;
                 }
                 // parse the document
                 if (!$doc->loadXML($string, \LIBXML_NONET | \LIBXML_BIGLINES | \LIBXML_COMPACT |\LIBXML_NOWARNING | \LIBXML_NOERROR)) {