Browse Source

Don't trust libxml to handle standard encodings

Many standard encoding labels are not understood by libxml, so we must insert
our own XML declarations to add encoding information which libxml will
understand.
master 1.3.2
J. King 2 weeks ago
parent
commit
445fd545c7
  1. 108
      lib/DOMParser.php
  2. 13
      tests/cases/TestDOMParser.php
  3. 2
      tests/phpunit.dist.xml

108
lib/DOMParser.php

@ -24,9 +24,35 @@ class DOMParser {
(\s+version=(?:"1\.[0-9]+"|'1\.[0-9]+'))
(?:\s+encoding=("[A-Za-z][A-Za-z0-9\._\-]*"|'[A-Za-z][A-Za-z0-9\._\-]*'))?
(\s+standalone=(?:"yes"|"no"|'yes'|'no'))?
(\s*)\?>
\s*\?>
/sx
XMLDECL;
/**
 * @var array A list of standard encoding labels which DOMDocument either
 * does not know or does not map to the correct encoding.
 *
 * This is a worst-case list taken from PHP 5.6 on Windows, with some
 * exclusions for encodings which are completely unsupported. When the label
 * matched for a document's declared encoding appears in this list, the
 * document's encoding information is rewritten before it is parsed.
 */
const ENCODING_NAUGHTY_LIST = [
"unicode-1-1-utf-8", "unicode11utf8", "unicode20utf8", "x-unicode20utf8",
"iso88592", "iso88593", "iso88594", "iso88595", "csiso88596e",
"csiso88596i", "iso-8859-6-e", "iso-8859-6-i", "iso88596", "iso88597",
"sun_eu_greek", "csiso88598e", "iso-8859-8-e", "iso88598", "visual",
"csiso88598i", "iso-8859-8-i", "logical", "iso885910", "iso885913",
"iso885914", "csisolatin9", "iso885915", "l9", "koi", "koi8", "koi8_r",
"x-mac-roman", "dos-874", "iso-8859-11", "iso8859-11", "iso885911",
"tis-620", "x-cp1250", "x-cp1251", "ansi_x3.4-1968", "ascii", "cp819",
"csisolatin1", "ibm819", "iso-8859-1", "iso-ir-100", "iso8859-1",
"iso88591", "iso_8859-1", "iso_8859-1:1987", "l1", "latin1",
"us-ascii", "x-cp1252", "x-cp1253", "iso88599", "x-cp1254",
"x-cp1255", "x-cp1256", "x-cp1257", "cp1258", "windows-1258",
"x-mac-ukrainian", "chinese", "csgb2312", "csiso58gb231280", "gb2312",
"gb_2312", "gb_2312-80", "gbk", "iso-ir-58", "big5", "cn-big5",
"csbig5", "x-x-big5", "x-euc-jp", "ms932", "windows-31j", "x-sjis",
"cseuckr", "euc-kr", "replacement",
];
/**
 * @var array A list of canonical encoding names DOMDocument does not
 * understand, with aliases to labels it does understand.
 *
 * Keys are canonical encoding names; values are the substitute labels
 * written into the XML declaration in their place. Names which are not
 * listed here are used unchanged.
 */
const ENCODING_ALIAS_MAP = [
'windows-1258' => "x-cp1258",
'GBK' => "x-gbk",
'Big5' => "big5-hkscs",
'EUC-KR' => "korean",
];
/** Parses `$string` using either the HTML or XML parser, according to `$type`, and returns the resulting `DOMDocument`
*
@ -59,50 +85,50 @@ XMLDECL;
// can add BOMs to UTF-16 documents, but for other encodings we
// must parse XML declarations and validate that any encoding
// declaration is correct and change it if it is incorrect
// this process is further complicated by libxml not understanding
// all labels from the Encoding specification (which we try to
// honour since it can be assumed to be a best practice), so we
// must also rewrite some encoding declarations
try {
// first check for a byte order mark; if one exists we can go straight to parsing
if (!Encoding::sniffBOM($string)) {
// check the type for a charset parameter if there is no BOM
$charset = $t->params['charset'] ?? "";
if ($charset) {
if ($encoding = Encoding::matchLabel($charset)) {
$charset = $encoding['name'];
}
// otherwise determine the embedded encoding of the document
if (preg_match(self::XML_DECLARATION_PATTERN, $string, $match)) {
$match[2] = ($match[2] ?? "") ?: '"utf-8"'; // declaration without encoding is UTF-8
$xmlDeclaration = $match[0];
$xmlVersion = $match[1];
$xmlEncoding = substr($match[2], 1, strlen($match[2]) - 2);
$xmlStandalone = $match[3] ?? "";
$docEnc = Encoding::matchLabel($xmlEncoding);
} else {
$xmlDeclaration = "";
$xmlVersion = " version=\"1.0\"";
$xmlEncoding = "";
$xmlStandalone = "";
$docEnc = Encoding::matchLabel("utf-8");
}
// if a supported encoding was parsed from the type, act
// accordingly; otherwise skip to parsing and let the
// XML parser detect encoding
if ($charset === "UTF-16BE") {
// if the string is UTF-16BE, adding a BOM is sufficient
$string = self::BOM_UTF16BE.$string;
} elseif ($charset === "UTF-16LE") {
// if the string is UTF-16LE, adding a BOM is sufficient
$string = self::BOM_UTF16LE.$string;
} elseif ($charset) {
// for ASCII-compatible encodings look for an XML declaration
if (preg_match(self::XML_DECLARATION_PATTERN, $string, $match)) {
// if an existing encoding declaration is found,
// keep it only if it matches; if no encoding
// declaration is found but the encoding is UTF-8
// this is also acceptable
$keep = false;
if ($match[2]) {
$candidate = substr($match[2], 1, strlen($match[2]) - 2);
if ($encoding = Encoding::matchLabel($candidate)) {
if ($charset === $encoding['name']) {
$keep = true;
}
}
} elseif ($charset === "UTF-8") {
$keep = true;
}
// substitute the encoding declaration where necessary
if (!$keep) {
$string = "<?xml".$match[1]." encoding=\"$charset\"".$match[3].$match[4]."?>".substr($string, strlen($match[0]));
}
} elseif ($charset !== "UTF-8") {
// add a declaration if none is found and the encoding is not UTF-8
$string = "<?xml version=\"1.0\" encoding=\"$charset\" ?>".$string;
// next check the type for a charset parameter if there is one
$typeEnc = Encoding::matchLabel($t->params['charset'] ?? "");
// if the document encoding differs from the type encoding
// or the document encoding is not recognized by libxml,
// we need to mangle the document before parsing
if (($typeEnc && $docEnc && $docEnc['name'] !== $typeEnc['name']) || ($docEnc && in_array($docEnc['label'], self::ENCODING_NAUGHTY_LIST)) || (!$docEnc && !$typeEnc)) {
$charset = ($typeEnc ?? $docEnc)['name'] ?? "UTF-8";
// some canonical names are not recognized by libxml, so we must use other labels
$charset = self::ENCODING_ALIAS_MAP[$charset] ?? $charset;
if ($charset === "UTF-8") {
// if the string is UTF-8, adding a BOM is sufficient
$string = self::BOM_UTF8.$string;
} elseif ($charset === "UTF-16BE") {
// if the string is UTF-16BE, adding a BOM is sufficient
$string = self::BOM_UTF16BE.$string;
} elseif ($charset === "UTF-16LE") {
// if the string is UTF-16LE, adding a BOM is sufficient
$string = self::BOM_UTF16LE.$string;
} elseif ($charset) {
// otherwise substitute the encoding declaration if any
$string = "<?xml".$xmlVersion." encoding=\"$charset\"".$xmlStandalone."?>".substr($string, strlen($xmlDeclaration));
}
}
}

13
tests/cases/TestDOMParser.php

@ -43,6 +43,10 @@ class TestDOMParser extends \PHPUnit\Framework\TestCase {
["<?xml version='1.1' ?><html>Ol\u{E9}</html>", "text/xml;charset=UTF-8", "Ol\u{E9}"],
["<?xml version='1.0' standalone='yes'?><html>Ol\u{E9}</html>", "text/xml;charset=UTF-8", "Ol\u{E9}"],
["<?xml version='1.0' standalone='yes'?><html>Ol\xE9</html>", "text/xml;charset=windows-1252", "Ol\u{E9}"],
["<?xml version='1.0' encoding='bogus'?><html>Ol\u{E9}</html>", "text/xml", "Ol\u{E9}"],
["<?xml version='1.0'?><html>Ol\u{E9}</html>", "text/xml;charset=bogus", "Ol\u{E9}"],
["<?xml version='1.0' encoding='bogus'?><html>Ol\u{E9}</html>", "text/xml;charset=bogus", "Ol\u{E9}"],
["<html>\x81\xE9</html>", "text/xml;charset=euc-kr", "\u{ACF2}"],
[$mkUtf16("\xFE\xFF<html>Ol\x00\xE9</html>", false), "text/xml", "Ol\u{E9}"],
[$mkUtf16("\xFF\xFE<html>Ol\xE9\x00</html>", true), "text/xml", "Ol\u{E9}"],
[$mkUtf16("<?xml version='1.0' encoding='UTF-16'?><html>Ol\x00\xE9</html>", false), "text/xml", "Ol\u{E9}"],
@ -69,4 +73,13 @@ class TestDOMParser extends \PHPUnit\Framework\TestCase {
$this->expectException(\InvalidArgumentException::class);
$p->parseFromString($in, "text/plain");
}
/**
 * Parses a trivial XML document whose media type carries the charset label
 * "csiso2022kr" and asserts that the result is a parser-error document: the
 * root element is named "parsererror", lives in the Mozilla parser-error
 * namespace, and has non-blank text content.
 */
public function testParseWithInvalidEncoding(): void {
    $parser = new DOMParser;
    $document = $parser->parseFromString("<html>Test</html>", "text/xml;charset=csiso2022kr");
    // every assertion targets the root element, so look it up once
    $root = $document->documentElement;
    $this->assertSame("parsererror", $root->tagName);
    $this->assertSame("http://www.mozilla.org/newlayout/xml/parsererror.xml", $root->namespaceURI);
    $this->assertNotSame("", trim($root->textContent));
}
}

2
tests/phpunit.dist.xml

@ -27,6 +27,8 @@
</testsuite>
<testsuite name="Parser">
<file>cases/TestParser.php</file>
</testsuite>
<testsuite name="DOMParser">
<file>cases/TestDOMParser.php</file>
</testsuite>
<testsuite name="Serializer">

Loading…
Cancel
Save