Browse Source

Fix XML encoding unconditionally

This is required because the document's declared encoding might be on the naughty list even when the media type supplies no charset parameter
master 1.4.0
J. King 5 months ago
parent
commit
a5c94cc577
  1. 15
      lib/DOMParser.php
  2. 2
      tests/cases/TestDOMParser.php

15
lib/DOMParser.php

@ -79,13 +79,8 @@ XMLDECL;
return $this->createDocumentHtml($string, $type);
} elseif ($t->isXml) {
// for XML we have to jump through a few hoops to deal with
// encoding; if we have a known encoding we want to make sure
// the XML parser doesn't try to do its own detection.
if (isset($t->params['charset'])) {
$string = $this->fixXmlEncoding($string, $t->params['charset']);
}
// parse the document
return $this->createDocumentXml($string);
// encoding
return $this->createDocumentXml($this->fixXmlEncoding($string, $t->params['charset'] ?? ""));
} else {
throw new \InvalidArgumentException("\$type must be \"text/html\" or an XML type");
}
@ -150,7 +145,11 @@ XMLDOC;
// if the document encoding differs from the type encoding
// or the document encoding is not recognized by libxml,
// we need to mangle the document before parsing
if (($typeEnc && $docEnc && $docEnc['name'] !== $typeEnc['name']) || ($docEnc && in_array($docEnc['label'], self::ENCODING_NAUGHTY_LIST)) || (!$docEnc && !$typeEnc)) {
if (
($typeEnc && $docEnc && $docEnc['name'] !== $typeEnc['name'])
|| ($typeEnc && !$docEnc && $typeEnc !== "UTF-8")
|| ($docEnc && in_array($docEnc['label'], self::ENCODING_NAUGHTY_LIST))
) {
$charset = ($typeEnc ?? $docEnc)['name'] ?? "UTF-8";
// some canonical names are not recognized by libxml, so we must use other labels
$charset = self::ENCODING_ALIAS_MAP[$charset] ?? $charset;

2
tests/cases/TestDOMParser.php

@ -44,7 +44,7 @@ class TestDOMParser extends \PHPUnit\Framework\TestCase {
["<?xml version='1.0' standalone='yes'?><html>Ol\u{E9}</html>", "text/xml;charset=UTF-8", "Ol\u{E9}"],
["<?xml version='1.0' standalone='yes'?><html>Ol\xE9</html>", "text/xml;charset=windows-1252", "Ol\u{E9}"],
["<?xml version='1.0'?><html>Ol\u{E9}</html>", "text/xml;charset=bogus", "Ol\u{E9}"],
["<?xml version='1.0' encoding='bogus'?><html>Ol\u{E9}</html>", "text/xml;charset=bogus", "Ol\u{E9}"],
["<?xml version='1.0' encoding='utf-8'?><html>Ol\u{E9}</html>", "text/xml;charset=bogus", "Ol\u{E9}"],
["<html>\x81\xE9</html>", "text/xml;charset=euc-kr", "\u{ACF2}"],
[$mkUtf16("\xFE\xFF<html>Ol\x00\xE9</html>", false), "text/xml", "Ol\u{E9}"],
[$mkUtf16("\xFF\xFE<html>Ol\xE9\x00</html>", true), "text/xml", "Ol\u{E9}"],

Loading…
Cancel
Save