Browse Source

Simplify UTF-16 handling in DOMParser

domparser
J. King 1 year ago
parent
commit
504d713139
  1. 42
      lib/DOMParser.php
  2. 44
      tests/cases/TestDOMParser.php

42
lib/DOMParser.php

@ -52,11 +52,13 @@ XMLDECL;
// for HTML we invoke our parser which has its own handling for everything // for HTML we invoke our parser which has its own handling for everything
return $this->createDocumentHtml($string, $type); return $this->createDocumentHtml($string, $type);
} elseif ($t->isXml) { } elseif ($t->isXml) {
// for XML we have to jump through a few hoops to deal with encoding; // for XML we have to jump through a few hoops to deal with
// if we have a known encoding we want to make sure the XML parser // encoding; if we have a known encoding we want to make sure
// doesn't try to do its own detection. The only way to do this is // the XML parser doesn't try to do its own detection. We can
// to convert to UTF-8 where necessary and remove any XML // treat byte order marks as authoritative. In their absence we
// declaration encoding information // can add BOMs to UTF-16 documents, but for other encodings we
// must parse the XML declaration and check any encoding declaration,
// keeping it if it is correct and replacing it if it is not.
try { try {
// first check for a byte order mark; if one exists we can go straight to parsing // first check for a byte order mark; if one exists we can go straight to parsing
if (!Encoding::sniffBOM($string)) { if (!Encoding::sniffBOM($string)) {
@ -70,28 +72,14 @@ XMLDECL;
// if a supported encoding was parsed from the type, act // if a supported encoding was parsed from the type, act
// accordingly; otherwise skip to parsing and let the // accordingly; otherwise skip to parsing and let the
// XML parser detect encoding // XML parser detect encoding
if ($charset) { if ($charset === "UTF-16BE") {
// if the string is UTF-16, transcode it to UTF-8 so // if the string is UTF-16BE, adding a BOM is sufficient
// we're always dealing with an ASCII-compatible $string = self::BOM_UTF16BE.$string;
// encoding (XML's parsing rules ensure documents } elseif ($charset === "UTF-16LE") {
// in semi-ASCII-compatible encodings like Shift_JIS // if the string is UTF-16LE, adding a BOM is sufficient
// or ISO 2022-JP never contain non-ASCII characters $string = self::BOM_UTF16LE.$string;
// before encoding information is seen) } elseif ($charset) {
if ($charset === "UTF-16BE" || $charset === "UTF-16LE") { // for ASCII-compatible encodings look for an XML declaration
// NOTE: the transcoding operation may throw an
// exception due to unpaired surrogates, which
// is why this whole operation is wrapped in a
// try block
$decoder = Encoding::createDecoder($charset, $string, true, false);
$string = "";
while (strlen($c = $decoder->nextChar())) {
$string .= $c;
$string .= $decoder->asciiSpanNot("");
}
unset($decoder);
$charset = "UTF-8";
}
// look for an XML declaration
if (preg_match(self::XML_DECLARATION_PATTERN, $string, $match)) { if (preg_match(self::XML_DECLARATION_PATTERN, $string, $match)) {
// if an existing encoding declaration is found, // if an existing encoding declaration is found,
// keep it only if it matches; if no encoding // keep it only if it matches; if no encoding

44
tests/cases/TestDOMParser.php

@ -16,24 +16,42 @@ class TestDOMParser extends \PHPUnit\Framework\TestCase {
public function testParseADocument(string $input, string $type, bool $parseError, string $exp): void { public function testParseADocument(string $input, string $type, bool $parseError, string $exp): void {
$p = new DOMParser; $p = new DOMParser;
$document = $p->parseFromString($input, $type); $document = $p->parseFromString($input, $type);
$root = $parseError ? "parserror" : "html"; $root = $parseError ? "parsererror" : "html";
$this->assertSame($root, $document->documentElement->tagName);
$this->assertSame($exp, $document->documentElement->textContent); $this->assertSame($exp, $document->documentElement->textContent);
$this->assertSame($root, $document->documentElement->tagName);
} }
public function provideDocuments(): iterable { public function provideDocuments(): iterable {
$mkUtf16 = function(string $s, bool $le) {
$replacement = $le ? "$0\x00" : "\x00$0";
return preg_replace("/[\x{01}-\x{7F}]/s", $replacement, $s);
};
return [ return [
["Test", "text/html", false, "Test"], ["Test", "text/html", false, "Test"],
["Ol\xE9", "text/html", false, "Ol\u{E9}"], ["Ol\xE9", "text/html", false, "Ol\u{E9}"],
["Ol\u{E9}", "text/html;charset=utf8", false, "Ol\u{E9}"], ["Ol\u{E9}", "text/html;charset=utf8", false, "Ol\u{E9}"],
["<meta charset=utf8>Ol\u{E9}", "text/html", false, "Ol\u{E9}"], ["<meta charset=utf8>Ol\u{E9}", "text/html", false, "Ol\u{E9}"],
["<html>Test</html>", "text/xml", false, "Test"], ["<html>Test</html>", "text/xml", false, "Test"],
["<html>Ol\u{E9}</html>", "text/xml", false, "Ol\u{E9}"], ["<html>Ol\u{E9}</html>", "text/xml", false, "Ol\u{E9}"],
["<html>Ol\xE9</html>", "text/xml;charset=windows-1252", false, "Ol\u{E9}"], ["<html>Ol\xE9</html>", "text/xml;charset=windows-1252", false, "Ol\u{E9}"],
["\u{FEFF}<html>Ol\u{E9}</html>", "text/xml;charset=windows-1252", false, "Ol\u{E9}"], ["\u{FEFF}<html>Ol\u{E9}</html>", "text/xml;charset=windows-1252", false, "Ol\u{E9}"],
["<?xml version='1.0' encoding='windows-1252'?><html>Ol\xE9</html>", "text/xml", false, "Ol\u{E9}"], ["<?xml version='1.0' encoding='windows-1252'?><html>Ol\xE9</html>", "text/xml", false, "Ol\u{E9}"],
["<html>Ol\xE9</html>", "text/xml;charset=windows-1252", false, "Ol\u{E9}"], ["<html>Ol\xE9</html>", "text/xml;charset=windows-1252", false, "Ol\u{E9}"],
["<?xml version='1.2' encoding='windows-1252'?><html>Ol\u{E9}</html>", "text/xml;charset=UTF-8", false, "Ol\u{E9}"], ["<html>Ol\u{E9}</html>", "text/xml;charset=UTF-8", false, "Ol\u{E9}"],
["<?xml version='1.1' encoding='windows-1252'?><html>Ol\u{E9}</html>", "text/xml;charset=UTF-8", false, "Ol\u{E9}"],
["<?xml version='1.1' encoding='utf8'?><html>Ol\u{E9}</html>", "text/xml;charset=UTF-8", false, "Ol\u{E9}"],
["<?xml version='1.1'?><html>Ol\u{E9}</html>", "text/xml;charset=UTF-8", false, "Ol\u{E9}"],
["<?xml version='1.1' ?><html>Ol\u{E9}</html>", "text/xml;charset=UTF-8", false, "Ol\u{E9}"],
["<?xml version='1.0' standalone='yes'?><html>Ol\u{E9}</html>", "text/xml;charset=UTF-8", false, "Ol\u{E9}"],
["<?xml version='1.0' standalone='yes'?><html>Ol\xE9</html>", "text/xml;charset=windows-1252", false, "Ol\u{E9}"],
[$mkUtf16("\xFE\xFF<html>Ol\x00\xE9</html>", false), "text/xml", false, "Ol\u{E9}"],
[$mkUtf16("\xFF\xFE<html>Ol\xE9\x00</html>", true), "text/xml", false, "Ol\u{E9}"],
[$mkUtf16("<?xml version='1.0' encoding='UTF-16'?><html>Ol\x00\xE9</html>", false), "text/xml", false, "Ol\u{E9}"],
[$mkUtf16("<?xml version='1.0' encoding='UTF-16'?><html>Ol\xE9\x00</html>", true), "text/xml", false, "Ol\u{E9}"],
[$mkUtf16("\xFE\xFF<?xml version='1.0' encoding='UTF-8'?><html>Ol\x00\xE9</html>", false), "text/xml", false, "Ol\u{E9}"],
[$mkUtf16("\xFF\xFE<?xml version='1.0' encoding='UTF-8'?><html>Ol\xE9\x00</html>", true), "text/xml", false, "Ol\u{E9}"],
[$mkUtf16("<?xml version='1.0' encoding='UTF-8'?><html>Ol\x00\xE9</html>", false), "text/xml;charset=utf-16be", false, "Ol\u{E9}"],
[$mkUtf16("<?xml version='1.0' encoding='UTF-8'?><html>Ol\xE9\x00</html>", true), "text/xml;charset=utf-16le", false, "Ol\u{E9}"],
]; ];
} }
} }

Loading…
Cancel
Save