Use new parsers for DOMParser in PHP 8.4 when available

Because the new HTML parser uses UTF-8 as a fallback encoding, we have adjusted the configured fallback encoding of our parser to match.
This commit is contained in:
J. King 2024-12-29 09:48:58 -05:00
parent 024011fed0
commit 6fd16295df
7 changed files with 175 additions and 52 deletions

View file

@ -40,7 +40,7 @@ Like the standard interface, it will parse either HTML or XML documents. This im
- Any XML MIME content-type (e.g. `application/rss+xml`) is acceptable, not just the restricted list mandated by the interface
- MIME content-types may include a `charset` parameter to specify an authoritative encoding of the document
- If no `charset` is provided encoding will be detected from document hints; the default encoding for HTML is `windows-1252` and for XML `UTF-8`
- If no `charset` is provided, encoding will be detected from document hints; the default encoding is `UTF-8`
- `InvalidArgumentException` is thrown in place of JavaScript's `TypeError`
### Parsing into existing documents

View file

@ -6,6 +6,7 @@
declare(strict_types=1);
namespace MensBeam\HTML;
use MensBeam\HTML\Parser\Config;
use MensBeam\Mime\MimeType;
use MensBeam\Intl\Encoding;
@ -46,7 +47,7 @@ XMLDECL;
"csbig5", "x-x-big5", "x-euc-jp", "ms932", "windows-31j", "x-sjis",
"cseuckr", "euc-kr", "replacement",
];
/** @var array A List of canonical encoding names DOMDocument does not understand, with liases to labels it does understand */
/** @var array A List of canonical encoding names DOMDocument does not understand, with aliases to labels it does understand */
const ENCODING_ALIAS_MAP = [
'windows-1258' => "x-cp1258",
'GBK' => "x-gbk",
@ -65,34 +66,62 @@ XMLDECL;
* detection
*
* For the XML parser, if `$string` cannot be parsed, then the returned
* `DOMDocument` will contain elements describing the resulting error
* document will contain elements describing the resulting error
*
* If no encoding is specified and none can be detected from the document,
* the default encoding is Windows-1252 for HTML and UTF-8 for XML
* the default encoding is UTF-8 for both HTML and XML
*
* @return \DOMDocument|\Dom\HTMLDocument|\Dom\XMLDocument
*/
public function parseFromString(string $string, string $type): \DOMDocument {
// start by parsing the type
public function parseFromString(string $string, string $type) {
// parse the Content-Type
$t = MimeType::parseBytes($type);
// determine authoritative encoding from BOM or Content-Type
$encoding = Encoding::sniffBOM($string) ?? $t->params['charset'] ?? "";
$label = Encoding::matchLabel($encoding);
if ($label) {
$encoding = $label['name'];
} else {
$encoding = null;
}
// parse the string as either HTML or XML
if ($t->isHtml) {
// for HTML we invoke our parser which has its own handling for everything
return $this->createDocumentHtml($string, $type);
// if we're using PHP 8.4, we can use the modern built-in parser
if ($this->useNewParsers()) {
return \Dom\HTMLDocument::createFromString($string, \LIBXML_NOERROR | \LIBXML_COMPACT, $encoding);
}
// otherwise we invoke our parser which has its own handling for everything
$c = new Config;
$c->encodingFallback = "UTF-8";
return Parser::parse($string, $encoding, $c)->document;
} elseif ($t->isXml) {
// for XML we have to jump through a few hoops to deal with
// encoding
return $this->createDocumentXml($this->fixXmlEncoding($string, $t->params['charset'] ?? ""));
// for XML we have to jump through a few hoops to deal with errors,
// as well as with encoding, so we put this in
// another function.
return $this->createDocumentXml($string, $encoding);
} else {
throw new \InvalidArgumentException("\$type must be \"text/html\" or an XML type");
}
}
protected function createDocumentHtml(string $string, string $type): \DOMDocument {
return Parser::parse($string, $type)->document;
protected function useNewParsers(): bool {
return class_exists(\Dom\Document::class);
}
protected function createDocumentXml(string $string): \DOMDocument {
$document = new \DOMDocument;
if (!$document->loadXML($string, \LIBXML_NONET | \LIBXML_BIGLINES | \LIBXML_COMPACT |\LIBXML_NOWARNING | \LIBXML_NOERROR)) {
protected function createDocumentXml(string $string, ?string $encoding) {
$string = $this->fixXmlEncoding($string, $encoding ?? "");
try {
if ($this->useNewParsers()) {
return \Dom\XMLDocument::createFromString($string, \LIBXML_NOERROR | \LIBXML_COMPACT);
} else {
$document = new \DOMDocument;
if ($document->loadXML($string, \LIBXML_NONET | \LIBXML_BIGLINES | \LIBXML_COMPACT |\LIBXML_NOWARNING | \LIBXML_NOERROR)) {
return $document;
} else {
throw new \Exception;
}
}
} catch (\Exception $e) {
$err = libxml_get_last_error();
$message = trim(htmlspecialchars($err->message, \ENT_NOQUOTES | \ENT_SUBSTITUTE | \ENT_XML1, "UTF-8"));
$string = <<<XMLDOC
@ -104,9 +133,8 @@ XMLDECL;
column="{$err->column}"
>{$err->code}: "$message" on line {$err->line}, column {$err->column}</parsererror>
XMLDOC;
return $this->createDocumentXml($string);
return $this->createDocumentXml($string, "UTF-8");
}
return $document;
}
protected function fixXmlEncoding(string $string, string $encoding) {
@ -162,6 +190,8 @@ XMLDOC;
} elseif ($charset === "UTF-16LE") {
// if the string is UTF-16LE, adding a BOM is sufficient
return self::BOM_UTF16LE.$string;
} elseif ($charset === "replacement") {
return "\u{FFFD}";
} elseif ($charset) {
// otherwise substitute the encoding declaration if any
return "<?xml".$xmlVersion." encoding=\"$charset\"".$xmlStandalone."?>".substr($string, strlen($xmlDeclaration));

View file

@ -12,12 +12,18 @@ use MensBeam\HTML\DOMParser;
* @covers \MensBeam\HTML\DOMParser
*/
class TestDOMParser extends \PHPUnit\Framework\TestCase {
protected $p;
public function setUp(): void {
$this->p = \Phake::partialMock(DOMParser::class);
\Phake::when($this->p)->useNewParsers->thenReturn(false);
}
/** @dataProvider provideDocuments */
public function testParseADocument(string $input, string $type, string $exp): void {
$p = new DOMParser;
$document = $p->parseFromString($input, $type);
$document = $this->p->parseFromString($input, $type);
$this->assertSame($exp, $document->documentElement->textContent);
$this->assertSame("html", $document->documentElement->tagName);
$this->assertSame("html", $document->documentElement->localName);
}
public function provideDocuments(): iterable {
@ -27,7 +33,7 @@ class TestDOMParser extends \PHPUnit\Framework\TestCase {
};
return [
["Test", "text/html", "Test"],
["Ol\xE9", "text/html", "Ol\u{E9}"],
["Ol\u{E9}", "text/html", "Ol\u{E9}"],
["Ol\u{E9}", "text/html;charset=utf8", "Ol\u{E9}"],
["<meta charset=utf8>Ol\u{E9}", "text/html", "Ol\u{E9}"],
["<html>Test</html>", "text/xml", "Test"],
@ -37,10 +43,6 @@ class TestDOMParser extends \PHPUnit\Framework\TestCase {
["<?xml version='1.0' encoding='windows-1252'?><html>Ol\xE9</html>", "text/xml", "Ol\u{E9}"],
["<html>Ol\xE9</html>", "text/xml;charset=windows-1252", "Ol\u{E9}"],
["<html>Ol\u{E9}</html>", "text/xml;charset=UTF-8", "Ol\u{E9}"],
["<?xml version='1.1' encoding='windows-1252'?><html>Ol\u{E9}</html>", "text/xml;charset=UTF-8", "Ol\u{E9}"],
["<?xml version='1.1' encoding='utf8'?><html>Ol\u{E9}</html>", "text/xml;charset=UTF-8", "Ol\u{E9}"],
["<?xml version='1.1'?><html>Ol\u{E9}</html>", "text/xml;charset=UTF-8", "Ol\u{E9}"],
["<?xml version='1.1' ?><html>Ol\u{E9}</html>", "text/xml;charset=UTF-8", "Ol\u{E9}"],
["<?xml version='1.0' standalone='yes'?><html>Ol\u{E9}</html>", "text/xml;charset=UTF-8", "Ol\u{E9}"],
["<?xml version='1.0' standalone='yes'?><html>Ol\xE9</html>", "text/xml;charset=windows-1252", "Ol\u{E9}"],
["<?xml version='1.0'?><html>Ol\u{E9}</html>", "text/xml;charset=bogus", "Ol\u{E9}"],
@ -59,33 +61,29 @@ class TestDOMParser extends \PHPUnit\Framework\TestCase {
public function testFailToParseADocument(): void {
$in = "<html>Test</html><!--Test-->Test";
$p = new DOMParser;
$d = $p->parseFromString($in, "text/xml");
$this->assertSame("parsererror", $d->documentElement->tagName);
$d = $this->p->parseFromString($in, "text/xml");
$this->assertSame("parsererror", $d->documentElement->localName);
$this->assertSame("http://www.mozilla.org/newlayout/xml/parsererror.xml", $d->documentElement->namespaceURI);
$this->assertNotSame("", trim($d->documentElement->textContent));
}
public function testParseWithIncorrectType(): void {
$in = "<html>Ol\u{E9}</html>";
$p = new DOMParser;
$this->expectException(\InvalidArgumentException::class);
$p->parseFromString($in, "text/plain");
$this->p->parseFromString($in, "text/plain");
}
public function testParseWithInvalidEncodingInHeader(): void {
$in = "<html>Test</html>";
$p = new DOMParser;
$d = $p->parseFromString($in, "text/xml;charset=csiso2022kr");
$this->assertSame("parsererror", $d->documentElement->tagName);
$d = $this->p->parseFromString($in, "text/xml;charset=csiso2022kr");
$this->assertSame("parsererror", $d->documentElement->localName);
$this->assertSame("http://www.mozilla.org/newlayout/xml/parsererror.xml", $d->documentElement->namespaceURI);
$this->assertNotSame("", trim($d->documentElement->textContent));
}
public function testParseWithInvalidEncodingInDocument(): void {
$in = "<?xml version='1.0' encoding='bogus'?><html>Test</html>";
$p = new DOMParser;
$d = $p->parseFromString($in, "text/xml");
$this->assertSame("parsererror", $d->documentElement->tagName);
$d = $this->p->parseFromString($in, "text/xml");
$this->assertSame("parsererror", $d->documentElement->localName);
$this->assertSame("http://www.mozilla.org/newlayout/xml/parsererror.xml", $d->documentElement->namespaceURI);
$this->assertNotSame("", trim($d->documentElement->textContent));
}

View file

@ -0,0 +1,22 @@
<?php
/** @license MIT
* Copyright 2017 , Dustin Wilson, J. King et al.
* See LICENSE and AUTHORS files for details */
declare(strict_types=1);
namespace MensBeam\HTML\TestCase;
use MensBeam\HTML\DOMParser;
/**
 * Repeats the inherited DOMParser test cases with the parser's
 * useNewParsers() check forced to report true, so that the code paths
 * relying on PHP's built-in \Dom\HTMLDocument and \Dom\XMLDocument
 * classes are the ones exercised
 *
 * @covers \MensBeam\HTML\DOMParser
 * @requires PHP >= 8.4
 */
class TestDOMParserNew extends TestDOMParser {
    /** @var \MensBeam\HTML\DOMParser The partially mocked parser used by every test */
    protected $p;

    /** Prepares a partial mock of the parser which always opts in to the built-in parsers */
    public function setUp(): void {
        $parser = \Phake::partialMock(DOMParser::class);
        // only the feature check is stubbed; all other methods run their real implementation
        \Phake::when($parser)->useNewParsers->thenReturn(true);
        $this->p = $parser;
    }
}

View file

@ -30,6 +30,7 @@
</testsuite>
<testsuite name="DOMParser">
<file>cases/TestDOMParser.php</file>
<file>cases/TestDOMParserNew.php</file>
</testsuite>
<testsuite name="Serializer">
<file>cases/TestSerializer.php</file>

View file

@ -1,5 +1,6 @@
{
"require": {
"phpunit/phpunit": "^8.5 | ^9.0"
"phpunit/phpunit": "^8.5 | ^9.0",
"phake/phake": "^4.4"
}
}

View file

@ -4,34 +4,34 @@
"Read more about it at https://getcomposer.org/doc/01-basic-usage.md#installing-dependencies",
"This file is @generated automatically"
],
"content-hash": "1fa48e20f042190f12cc36db8c803244",
"content-hash": "b1dfc38c12e26ec3570d91cdad266647",
"packages": [
{
"name": "doctrine/instantiator",
"version": "2.0.0",
"version": "1.5.0",
"source": {
"type": "git",
"url": "https://github.com/doctrine/instantiator.git",
"reference": "c6222283fa3f4ac679f8b9ced9a4e23f163e80d0"
"reference": "0a0fa9780f5d4e507415a065172d26a98d02047b"
},
"dist": {
"type": "zip",
"url": "https://api.github.com/repos/doctrine/instantiator/zipball/c6222283fa3f4ac679f8b9ced9a4e23f163e80d0",
"reference": "c6222283fa3f4ac679f8b9ced9a4e23f163e80d0",
"url": "https://api.github.com/repos/doctrine/instantiator/zipball/0a0fa9780f5d4e507415a065172d26a98d02047b",
"reference": "0a0fa9780f5d4e507415a065172d26a98d02047b",
"shasum": ""
},
"require": {
"php": "^8.1"
"php": "^7.1 || ^8.0"
},
"require-dev": {
"doctrine/coding-standard": "^11",
"doctrine/coding-standard": "^9 || ^11",
"ext-pdo": "*",
"ext-phar": "*",
"phpbench/phpbench": "^1.2",
"phpstan/phpstan": "^1.9.4",
"phpstan/phpstan-phpunit": "^1.3",
"phpunit/phpunit": "^9.5.27",
"vimeo/psalm": "^5.4"
"phpbench/phpbench": "^0.16 || ^1",
"phpstan/phpstan": "^1.4",
"phpstan/phpstan-phpunit": "^1",
"phpunit/phpunit": "^7.5 || ^8.5 || ^9.5",
"vimeo/psalm": "^4.30 || ^5.4"
},
"type": "library",
"autoload": {
@ -58,7 +58,7 @@
],
"support": {
"issues": "https://github.com/doctrine/instantiator/issues",
"source": "https://github.com/doctrine/instantiator/tree/2.0.0"
"source": "https://github.com/doctrine/instantiator/tree/1.5.0"
},
"funding": [
{
@ -74,7 +74,7 @@
"type": "tidelift"
}
],
"time": "2022-12-30T00:23:10+00:00"
"time": "2022-12-30T00:15:36+00:00"
},
{
"name": "myclabs/deep-copy",
@ -194,6 +194,77 @@
},
"time": "2024-10-08T18:51:32+00:00"
},
{
"name": "phake/phake",
"version": "v4.5.3",
"source": {
"type": "git",
"url": "https://github.com/phake/phake.git",
"reference": "695a4feda9ff25608e5065bfd48a7d3c6add57ce"
},
"dist": {
"type": "zip",
"url": "https://api.github.com/repos/phake/phake/zipball/695a4feda9ff25608e5065bfd48a7d3c6add57ce",
"reference": "695a4feda9ff25608e5065bfd48a7d3c6add57ce",
"shasum": ""
},
"require": {
"doctrine/instantiator": "^1.4",
"php": "^7.1|^8.0",
"sebastian/comparator": "^1.1|^2.0|^3.0|^4.0|^5.0|^6.0"
},
"require-dev": {
"doctrine/annotations": "^1.13",
"hamcrest/hamcrest-php": "^1.1|^2.0",
"phpunit/phpunit": "^6.5|^7.0|^8.0|^9.0|^10.0|^11.0",
"psalm/phar": "^4.18"
},
"suggest": {
"doctrine/annotations": "Allows mock annotations to use import statements for classes.",
"hamcrest/hamcrest-php": "Use Hamcrest matchers."
},
"type": "library",
"extra": {
"branch-alias": {
"dev-4.4": "4.4.x-dev",
"dev-master": "5.0.x-dev"
}
},
"autoload": {
"files": [
"src/Phake.php"
],
"psr-4": {
"Phake\\": "src/Phake"
}
},
"notification-url": "https://packagist.org/downloads/",
"license": [
"BSD-3-Clause"
],
"authors": [
{
"name": "Mike Lively",
"email": "m@digitalsandwich.com"
}
],
"description": "The Phake mock testing library",
"homepage": "https://phake.github.io",
"keywords": [
"mock",
"phake",
"spy",
"stub",
"test-doubles",
"testing"
],
"support": {
"docs": "https://phake.github.io/doc/",
"issues": "https://github.com/phake/phake/issues",
"source": "https://github.com/phake/phake/tree/v4.5.3"
},
"time": "2024-12-09T14:46:24+00:00"
},
{
"name": "phar-io/manifest",
"version": "2.0.4",