Don't trust libxml to handle standard encodings

Many standard encoding labels are not understood by libxml, so we must insert our own XML declarations to add encoding information which libxml will understand.
2024-04-20 11:01:50 -04:00 · 2024-04-20 11:01:50 -04:00 · 445fd545c7
commit 445fd545c7
parent 4e41632769
3 changed files with 82 additions and 41 deletions
--- a/lib/DOMParser.php
+++ b/lib/DOMParser.php
@ -24,9 +24,35 @@ class DOMParser {
    (\s+version=(?:"1\.[0-9]+"|'1\.[0-9]+'))
    (?:\s+encoding=("[A-Za-z][A-Za-z0-9\._\-]*"|'[A-Za-z][A-Za-z0-9\._\-]*'))?
    (\s+standalone=(?:"yes"|"no"|'yes'|'no'))?
-    (\s*)\?>
+    \s*\?>
    /sx
 XMLDECL;
+	/** @var array A list of standard encoding labels which DOMDocument either does not know or does not map to the correct encoding; this is a worst-case list taken from PHP 5.6 on Windows with some exclusions for encodings which are completely unsupported */
+	const ENCODING_NAUGHTY_LIST = [
+		"unicode-1-1-utf-8", "unicode11utf8", "unicode20utf8", "x-unicode20utf8",
+		"iso88592", "iso88593", "iso88594", "iso88595", "csiso88596e",
+		"csiso88596i", "iso-8859-6-e", "iso-8859-6-i", "iso88596", "iso88597",
+		"sun_eu_greek", "csiso88598e", "iso-8859-8-e", "iso88598", "visual",
+		"csiso88598i", "iso-8859-8-i", "logical", "iso885910", "iso885913",
+		"iso885914", "csisolatin9", "iso885915", "l9", "koi", "koi8", "koi8_r",
+		"x-mac-roman", "dos-874", "iso-8859-11", "iso8859-11", "iso885911",
+		"tis-620", "x-cp1250", "x-cp1251", "ansi_x3.4-1968", "ascii", "cp819",
+		"csisolatin1", "ibm819", "iso-8859-1", "iso-ir-100", "iso8859-1",
+		"iso88591", "iso_8859-1", "iso_8859-1:1987", "l1", "latin1",
+		"us-ascii", "x-cp1252", "x-cp1253", "iso88599", "x-cp1254",
+		"x-cp1255", "x-cp1256", "x-cp1257", "cp1258", "windows-1258",
+		"x-mac-ukrainian", "chinese", "csgb2312", "csiso58gb231280", "gb2312",
+		"gb_2312", "gb_2312-80", "gbk", "iso-ir-58", "big5", "cn-big5",
+		"csbig5", "x-x-big5", "x-euc-jp", "ms932", "windows-31j", "x-sjis",
+		"cseuckr", "euc-kr", "replacement",
+	];
+	/** @var array A list of canonical encoding names DOMDocument does not understand, with aliases to labels it does understand */
+	const ENCODING_ALIAS_MAP = [
+		'windows-1258' => "x-cp1258",
+		'GBK' => "x-gbk",
+		'Big5' => "big5-hkscs",
+		'EUC-KR' => "korean",
+	];

    /** Parses `$string` using either the HTML or XML parser, according to `$type`, and returns the resulting `DOMDocument`
     * 
@ -59,50 +85,50 @@ XMLDECL;
            //   can add BOMs to UTF-16 documents, but for other encodings we
            //   must parse XML declarations and validate that any encoding
            //   declaration is correct and change it if it is incorrect
+
+            // this process is further complicated by libxml not understanding
+            //   all labels from the Encoding specification (which we try to
+            //   honour since it can be assumed to be a best practice), so we
+            //   must also rewrite some encoding declarations
            try {
                // first check for a byte order mark; if one exists we can go straight to parsing
                if (!Encoding::sniffBOM($string)) {
-                    // check the type for a charset parameter if there is no BOM
-                    $charset = $t->params['charset'] ?? "";
-                    if ($charset) {
-                        if ($encoding = Encoding::matchLabel($charset)) {
-                            $charset = $encoding['name'];
-                        }
+                    // otherwise determine the embedded encoding of the document
+                    if (preg_match(self::XML_DECLARATION_PATTERN, $string, $match)) {
+                        $match[2] = ($match[2] ?? "") ?: '"utf-8"'; // declaration without encoding is UTF-8
+                        $xmlDeclaration = $match[0];
+                        $xmlVersion = $match[1];
+                        $xmlEncoding = substr($match[2], 1, strlen($match[2]) - 2);
+                        $xmlStandalone = $match[3] ?? "";
+                        $docEnc = Encoding::matchLabel($xmlEncoding);
+                    } else {
+                        $xmlDeclaration = "";
+                        $xmlVersion = " version=\"1.0\"";
+                        $xmlEncoding = "";
+                        $xmlStandalone = "";
+                        $docEnc = Encoding::matchLabel("utf-8");
                    }
-                    // if a supported encoding was parsed from the type, act
-                    //   accordingly; otherwise skip to parsing and let the
-                    //   XML parser detect encoding
-                    if ($charset === "UTF-16BE") {
-                        // if the string is UTF-16BE, adding a BOM is sufficient
-                        $string = self::BOM_UTF16BE.$string;
-                     } elseif ($charset === "UTF-16LE") {
-                        // if the string is UTF-16LE, adding a BOM is sufficient
-                        $string = self::BOM_UTF16LE.$string;
-                     } elseif ($charset) {
-                        // for ASCII-compatible encodings look for an XML declaration
-                        if (preg_match(self::XML_DECLARATION_PATTERN, $string, $match)) {
-                            // if an existing encoding declaration is found,
-                            //   keep it only if it matches; if no encoding
-                            //   declaration is found but the encoding is UTF-8
-                            //   this is also acceptable
-                            $keep = false;
-                            if ($match[2]) {
-                                $candidate = substr($match[2], 1, strlen($match[2]) - 2);
-                                if ($encoding = Encoding::matchLabel($candidate)) {
-                                    if ($charset === $encoding['name']) {
-                                        $keep = true;
-                                    }
-                                }
-                            } elseif ($charset === "UTF-8") {
-                                $keep = true;
-                            }
-                            // substitute the encoding declaration where necessary
-                            if (!$keep) {
-                                $string = "<?xml".$match[1]." encoding=\"$charset\"".$match[3].$match[4]."?>".substr($string, strlen($match[0]));
-                            }
-                        } elseif ($charset !== "UTF-8") {
-                            // add a declaration if none is found and the encoding is not UTF-8
-                            $string = "<?xml version=\"1.0\" encoding=\"$charset\" ?>".$string;
+                    // next check the type for a charset parameter if there is one
+                    $typeEnc = Encoding::matchLabel($t->params['charset'] ?? "");
+                    // if the document encoding differs from the type encoding
+                    //   or the document encoding is not recognized by libxml,
+                    //   we need to mangle the document before parsing
+                    if (($typeEnc && $docEnc && $docEnc['name'] !== $typeEnc['name']) || ($docEnc && in_array($docEnc['label'], self::ENCODING_NAUGHTY_LIST)) || (!$docEnc && !$typeEnc)) {
+                        $charset = ($typeEnc ?? $docEnc)['name'] ?? "UTF-8";
+                        // some canonical names are not recognized by libxml, so we must use other labels
+                        $charset = self::ENCODING_ALIAS_MAP[$charset] ?? $charset;
+                        if ($charset === "UTF-8") {
+                            // if the string is UTF-8, adding a BOM is sufficient
+                            $string = self::BOM_UTF8.$string;
+                        } elseif ($charset === "UTF-16BE") {
+                            // if the string is UTF-16BE, adding a BOM is sufficient
+                            $string = self::BOM_UTF16BE.$string;
+                        } elseif ($charset === "UTF-16LE") {
+                            // if the string is UTF-16LE, adding a BOM is sufficient
+                            $string = self::BOM_UTF16LE.$string;
+                        } elseif ($charset) {
+                            // otherwise substitute the encoding declaration if any
+                            $string = "<?xml".$xmlVersion." encoding=\"$charset\"".$xmlStandalone."?>".substr($string, strlen($xmlDeclaration));
                        }
                    }
                }
--- a/tests/cases/TestDOMParser.php
+++ b/tests/cases/TestDOMParser.php
@ -43,6 +43,10 @@ class TestDOMParser extends \PHPUnit\Framework\TestCase {
            ["<?xml version='1.1' ?><html>Ol\u{E9}</html>",                                            "text/xml;charset=UTF-8",        "Ol\u{E9}"],
            ["<?xml version='1.0' standalone='yes'?><html>Ol\u{E9}</html>",                            "text/xml;charset=UTF-8",        "Ol\u{E9}"],
            ["<?xml version='1.0' standalone='yes'?><html>Ol\xE9</html>",                              "text/xml;charset=windows-1252", "Ol\u{E9}"],
+            ["<?xml version='1.0' encoding='bogus'?><html>Ol\u{E9}</html>",                            "text/xml",                      "Ol\u{E9}"],
+            ["<?xml version='1.0'?><html>Ol\u{E9}</html>",                                             "text/xml;charset=bogus",        "Ol\u{E9}"],
+            ["<?xml version='1.0' encoding='bogus'?><html>Ol\u{E9}</html>",                            "text/xml;charset=bogus",        "Ol\u{E9}"],
+            ["<html>\x81\xE9</html>",                                                                  "text/xml;charset=euc-kr",       "\u{ACF2}"],
            [$mkUtf16("\xFE\xFF<html>Ol\x00\xE9</html>", false),                                       "text/xml",                      "Ol\u{E9}"],
            [$mkUtf16("\xFF\xFE<html>Ol\xE9\x00</html>", true),                                        "text/xml",                      "Ol\u{E9}"],
            [$mkUtf16("<?xml version='1.0' encoding='UTF-16'?><html>Ol\x00\xE9</html>", false),        "text/xml",                      "Ol\u{E9}"],
@ -69,4 +73,13 @@ class TestDOMParser extends \PHPUnit\Framework\TestCase {
        $this->expectException(\InvalidArgumentException::class);
        $p->parseFromString($in, "text/plain");
    }
+
+    public function testParseWithInvalidEncoding(): void {
+        $in = "<html>Test</html>";
+        $p = new DOMParser;
+        $d = $p->parseFromString($in, "text/xml;charset=csiso2022kr");
+        $this->assertSame("parsererror", $d->documentElement->tagName);
+        $this->assertSame("http://www.mozilla.org/newlayout/xml/parsererror.xml", $d->documentElement->namespaceURI);
+        $this->assertNotSame("", trim($d->documentElement->textContent));
+    }
 }
--- a/tests/phpunit.dist.xml
+++ b/tests/phpunit.dist.xml
@ -27,6 +27,8 @@
    </testsuite>
    <testsuite name="Parser">
        <file>cases/TestParser.php</file>
+    </testsuite>
+    <testsuite name="DOMParser">
        <file>cases/TestDOMParser.php</file>
    </testsuite>
    <testsuite name="Serializer">