Use new parsers for DOMParser in PHP 8.4 when available
Because the new HTML parser uses UTF-8 as a fallback encoding, we have adjusted the configured fallback encoding of our parser to match.
This commit is contained in:
parent
024011fed0
commit
6fd16295df
7 changed files with 175 additions and 52 deletions
|
@ -40,7 +40,7 @@ Like the standard interface, it will parse either HTML or XML documents. This im
|
|||
|
||||
- Any XML MIME content-type (e.g. `application/rss+xml`) is acceptable, not just the restricted list mandated by the interface
|
||||
- MIME content-types may include a `charset` parameter to specify an authoritative encoding of the document
|
||||
- If no `charset` is provided encoding will be detected from document hints; the default encoding for HTML is `windows-1252` and for XML `UTF-8`
|
||||
- If no `charset` is provided, the encoding will be detected from document hints; the default encoding is `UTF-8`
|
||||
- `InvalidArgumentException` is thrown in place of JavaScript's `TypeError`
|
||||
|
||||
### Parsing into existing documents
|
||||
|
|
|
@ -6,6 +6,7 @@
|
|||
declare(strict_types=1);
|
||||
namespace MensBeam\HTML;
|
||||
|
||||
use MensBeam\HTML\Parser\Config;
|
||||
use MensBeam\Mime\MimeType;
|
||||
use MensBeam\Intl\Encoding;
|
||||
|
||||
|
@ -46,7 +47,7 @@ XMLDECL;
|
|||
"csbig5", "x-x-big5", "x-euc-jp", "ms932", "windows-31j", "x-sjis",
|
||||
"cseuckr", "euc-kr", "replacement",
|
||||
];
|
||||
/** @var array A List of canonical encoding names DOMDocument does not understand, with liases to labels it does understand */
|
||||
/** @var array A list of canonical encoding names DOMDocument does not understand, with aliases to labels it does understand */
|
||||
const ENCODING_ALIAS_MAP = [
|
||||
'windows-1258' => "x-cp1258",
|
||||
'GBK' => "x-gbk",
|
||||
|
@ -65,34 +66,62 @@ XMLDECL;
|
|||
* detection
|
||||
*
|
||||
* For the XML parser, if `$string` cannot be parsed, then the returned
|
||||
* `DOMDocument` will contain elements describing the resulting error
|
||||
* document will contain elements describing the resulting error
|
||||
*
|
||||
* If no encoding is specified and none can be detected from the document,
|
||||
* the default encoding is Windows-1252 for HTML and UTF-8 for XML
|
||||
* the default encoding is UTF-8 for both HTML and XML
|
||||
*
|
||||
* @return \DOMDocument|\Dom\HTMLDocument|\Dom\XMLDocument
|
||||
*/
|
||||
public function parseFromString(string $string, string $type): \DOMDocument {
|
||||
// start by parsing the type
|
||||
public function parseFromString(string $string, string $type) {
|
||||
// parse the Content-Type
|
||||
$t = MimeType::parseBytes($type);
|
||||
// determine authoritative encoding from BOM or Content-Type
|
||||
$encoding = Encoding::sniffBOM($string) ?? $t->params['charset'] ?? "";
|
||||
$label = Encoding::matchLabel($encoding);
|
||||
if ($label) {
|
||||
$encoding = $label['name'];
|
||||
} else {
|
||||
$encoding = null;
|
||||
}
|
||||
// parse the string as either HTML or XML
|
||||
if ($t->isHtml) {
|
||||
// for HTML we invoke our parser which has its own handling for everything
|
||||
return $this->createDocumentHtml($string, $type);
|
||||
// if we're using PHP 8.4, we can use the modern built-in parser
|
||||
if ($this->useNewParsers()) {
|
||||
return \Dom\HTMLDocument::createFromString($string, \LIBXML_NOERROR | \LIBXML_COMPACT, $encoding);
|
||||
}
|
||||
// otherwise we invoke our parser which has its own handling for everything
|
||||
$c = new Config;
|
||||
$c->encodingFallback = "UTF-8";
|
||||
return Parser::parse($string, $encoding, $c)->document;
|
||||
} elseif ($t->isXml) {
|
||||
// for XML we have to jump through a few hoops to deal with
|
||||
// encoding
|
||||
return $this->createDocumentXml($this->fixXmlEncoding($string, $t->params['charset'] ?? ""));
|
||||
// for XML we have to jump through a few hoops to deal with errors,
|
||||
// as well as with encoding, so we put this in
|
||||
// another function.
|
||||
return $this->createDocumentXml($string, $encoding);
|
||||
} else {
|
||||
throw new \InvalidArgumentException("\$type must be \"text/html\" or an XML type");
|
||||
}
|
||||
}
|
||||
|
||||
protected function createDocumentHtml(string $string, string $type): \DOMDocument {
|
||||
return Parser::parse($string, $type)->document;
|
||||
protected function useNewParsers(): bool {
|
||||
return class_exists(\Dom\Document::class);
|
||||
}
|
||||
|
||||
protected function createDocumentXml(string $string): \DOMDocument {
|
||||
$document = new \DOMDocument;
|
||||
if (!$document->loadXML($string, \LIBXML_NONET | \LIBXML_BIGLINES | \LIBXML_COMPACT |\LIBXML_NOWARNING | \LIBXML_NOERROR)) {
|
||||
protected function createDocumentXml(string $string, ?string $encoding) {
|
||||
$string = $this->fixXmlEncoding($string, $encoding ?? "");
|
||||
try {
|
||||
if ($this->useNewParsers()) {
|
||||
return \Dom\XMLDocument::createFromString($string, \LIBXML_NOERROR | \LIBXML_COMPACT);
|
||||
} else {
|
||||
$document = new \DOMDocument;
|
||||
if ($document->loadXML($string, \LIBXML_NONET | \LIBXML_BIGLINES | \LIBXML_COMPACT |\LIBXML_NOWARNING | \LIBXML_NOERROR)) {
|
||||
return $document;
|
||||
} else {
|
||||
throw new \Exception;
|
||||
}
|
||||
}
|
||||
} catch (\Exception $e) {
|
||||
$err = libxml_get_last_error();
|
||||
$message = trim(htmlspecialchars($err->message, \ENT_NOQUOTES | \ENT_SUBSTITUTE | \ENT_XML1, "UTF-8"));
|
||||
$string = <<<XMLDOC
|
||||
|
@ -104,9 +133,8 @@ XMLDECL;
|
|||
column="{$err->column}"
|
||||
>{$err->code}: "$message" on line {$err->line}, column {$err->column}</parsererror>
|
||||
XMLDOC;
|
||||
return $this->createDocumentXml($string);
|
||||
return $this->createDocumentXml($string, "UTF-8");
|
||||
}
|
||||
return $document;
|
||||
}
|
||||
|
||||
protected function fixXmlEncoding(string $string, string $encoding) {
|
||||
|
@ -162,6 +190,8 @@ XMLDOC;
|
|||
} elseif ($charset === "UTF-16LE") {
|
||||
// if the string is UTF-16LE, adding a BOM is sufficient
|
||||
return self::BOM_UTF16LE.$string;
|
||||
} elseif ($charset === "replacement") {
|
||||
return "\u{FFFD}";
|
||||
} elseif ($charset) {
|
||||
// otherwise substitute the encoding declaration if any
|
||||
return "<?xml".$xmlVersion." encoding=\"$charset\"".$xmlStandalone."?>".substr($string, strlen($xmlDeclaration));
|
||||
|
|
|
@ -12,12 +12,18 @@ use MensBeam\HTML\DOMParser;
|
|||
* @covers \MensBeam\HTML\DOMParser
|
||||
*/
|
||||
class TestDOMParser extends \PHPUnit\Framework\TestCase {
|
||||
protected $p;
|
||||
|
||||
public function setUp(): void {
|
||||
$this->p = \Phake::partialMock(DOMParser::class);
|
||||
\Phake::when($this->p)->useNewParsers->thenReturn(false);
|
||||
}
|
||||
|
||||
/** @dataProvider provideDocuments */
|
||||
public function testParseADocument(string $input, string $type, string $exp): void {
|
||||
$p = new DOMParser;
|
||||
$document = $p->parseFromString($input, $type);
|
||||
$document = $this->p->parseFromString($input, $type);
|
||||
$this->assertSame($exp, $document->documentElement->textContent);
|
||||
$this->assertSame("html", $document->documentElement->tagName);
|
||||
$this->assertSame("html", $document->documentElement->localName);
|
||||
}
|
||||
|
||||
public function provideDocuments(): iterable {
|
||||
|
@ -27,7 +33,7 @@ class TestDOMParser extends \PHPUnit\Framework\TestCase {
|
|||
};
|
||||
return [
|
||||
["Test", "text/html", "Test"],
|
||||
["Ol\xE9", "text/html", "Ol\u{E9}"],
|
||||
["Ol\u{E9}", "text/html", "Ol\u{E9}"],
|
||||
["Ol\u{E9}", "text/html;charset=utf8", "Ol\u{E9}"],
|
||||
["<meta charset=utf8>Ol\u{E9}", "text/html", "Ol\u{E9}"],
|
||||
["<html>Test</html>", "text/xml", "Test"],
|
||||
|
@ -37,10 +43,6 @@ class TestDOMParser extends \PHPUnit\Framework\TestCase {
|
|||
["<?xml version='1.0' encoding='windows-1252'?><html>Ol\xE9</html>", "text/xml", "Ol\u{E9}"],
|
||||
["<html>Ol\xE9</html>", "text/xml;charset=windows-1252", "Ol\u{E9}"],
|
||||
["<html>Ol\u{E9}</html>", "text/xml;charset=UTF-8", "Ol\u{E9}"],
|
||||
["<?xml version='1.1' encoding='windows-1252'?><html>Ol\u{E9}</html>", "text/xml;charset=UTF-8", "Ol\u{E9}"],
|
||||
["<?xml version='1.1' encoding='utf8'?><html>Ol\u{E9}</html>", "text/xml;charset=UTF-8", "Ol\u{E9}"],
|
||||
["<?xml version='1.1'?><html>Ol\u{E9}</html>", "text/xml;charset=UTF-8", "Ol\u{E9}"],
|
||||
["<?xml version='1.1' ?><html>Ol\u{E9}</html>", "text/xml;charset=UTF-8", "Ol\u{E9}"],
|
||||
["<?xml version='1.0' standalone='yes'?><html>Ol\u{E9}</html>", "text/xml;charset=UTF-8", "Ol\u{E9}"],
|
||||
["<?xml version='1.0' standalone='yes'?><html>Ol\xE9</html>", "text/xml;charset=windows-1252", "Ol\u{E9}"],
|
||||
["<?xml version='1.0'?><html>Ol\u{E9}</html>", "text/xml;charset=bogus", "Ol\u{E9}"],
|
||||
|
@ -59,33 +61,29 @@ class TestDOMParser extends \PHPUnit\Framework\TestCase {
|
|||
|
||||
public function testFailToParseADocument(): void {
|
||||
$in = "<html>Test</html><!--Test-->Test";
|
||||
$p = new DOMParser;
|
||||
$d = $p->parseFromString($in, "text/xml");
|
||||
$this->assertSame("parsererror", $d->documentElement->tagName);
|
||||
$d = $this->p->parseFromString($in, "text/xml");
|
||||
$this->assertSame("parsererror", $d->documentElement->localName);
|
||||
$this->assertSame("http://www.mozilla.org/newlayout/xml/parsererror.xml", $d->documentElement->namespaceURI);
|
||||
$this->assertNotSame("", trim($d->documentElement->textContent));
|
||||
}
|
||||
|
||||
public function testParseWithIncorrectType(): void {
|
||||
$in = "<html>Ol\u{E9}</html>";
|
||||
$p = new DOMParser;
|
||||
$this->expectException(\InvalidArgumentException::class);
|
||||
$p->parseFromString($in, "text/plain");
|
||||
$this->p->parseFromString($in, "text/plain");
|
||||
}
|
||||
|
||||
public function testParseWithInvalidEncodingInHeader(): void {
|
||||
$in = "<html>Test</html>";
|
||||
$p = new DOMParser;
|
||||
$d = $p->parseFromString($in, "text/xml;charset=csiso2022kr");
|
||||
$this->assertSame("parsererror", $d->documentElement->tagName);
|
||||
$d = $this->p->parseFromString($in, "text/xml;charset=csiso2022kr");
|
||||
$this->assertSame("parsererror", $d->documentElement->localName);
|
||||
$this->assertSame("http://www.mozilla.org/newlayout/xml/parsererror.xml", $d->documentElement->namespaceURI);
|
||||
$this->assertNotSame("", trim($d->documentElement->textContent));
|
||||
}
|
||||
public function testParseWithInvalidEncodingInDocument(): void {
|
||||
$in = "<?xml version='1.0' encoding='bogus'?><html>Test</html>";
|
||||
$p = new DOMParser;
|
||||
$d = $p->parseFromString($in, "text/xml");
|
||||
$this->assertSame("parsererror", $d->documentElement->tagName);
|
||||
$d = $this->p->parseFromString($in, "text/xml");
|
||||
$this->assertSame("parsererror", $d->documentElement->localName);
|
||||
$this->assertSame("http://www.mozilla.org/newlayout/xml/parsererror.xml", $d->documentElement->namespaceURI);
|
||||
$this->assertNotSame("", trim($d->documentElement->textContent));
|
||||
}
|
||||
|
|
22
tests/cases/TestDOMParserNew.php
Normal file
22
tests/cases/TestDOMParserNew.php
Normal file
|
@ -0,0 +1,22 @@
|
|||
<?php
|
||||
/** @license MIT
|
||||
* Copyright 2017 , Dustin Wilson, J. King et al.
|
||||
* See LICENSE and AUTHORS files for details */
|
||||
|
||||
declare(strict_types=1);
|
||||
namespace MensBeam\HTML\TestCase;
|
||||
|
||||
use MensBeam\HTML\DOMParser;
|
||||
|
||||
/**
|
||||
* @covers \MensBeam\HTML\DOMParser
|
||||
* @requires PHP >= 8.4
|
||||
*/
|
||||
class TestDOMParserNew extends TestDOMParser {
|
||||
protected $p;
|
||||
|
||||
public function setUp(): void {
|
||||
$this->p = \Phake::partialMock(DOMParser::class);
|
||||
\Phake::when($this->p)->useNewParsers->thenReturn(true);
|
||||
}
|
||||
}
|
|
@ -30,6 +30,7 @@
|
|||
</testsuite>
|
||||
<testsuite name="DOMParser">
|
||||
<file>cases/TestDOMParser.php</file>
|
||||
<file>cases/TestDOMParserNew.php</file>
|
||||
</testsuite>
|
||||
<testsuite name="Serializer">
|
||||
<file>cases/TestSerializer.php</file>
|
||||
|
|
|
@ -1,5 +1,6 @@
|
|||
{
|
||||
"require": {
|
||||
"phpunit/phpunit": "^8.5 | ^9.0"
|
||||
"phpunit/phpunit": "^8.5 | ^9.0",
|
||||
"phake/phake": "^4.4"
|
||||
}
|
||||
}
|
||||
|
|
99
vendor-bin/phpunit/composer.lock
generated
99
vendor-bin/phpunit/composer.lock
generated
|
@ -4,34 +4,34 @@
|
|||
"Read more about it at https://getcomposer.org/doc/01-basic-usage.md#installing-dependencies",
|
||||
"This file is @generated automatically"
|
||||
],
|
||||
"content-hash": "1fa48e20f042190f12cc36db8c803244",
|
||||
"content-hash": "b1dfc38c12e26ec3570d91cdad266647",
|
||||
"packages": [
|
||||
{
|
||||
"name": "doctrine/instantiator",
|
||||
"version": "2.0.0",
|
||||
"version": "1.5.0",
|
||||
"source": {
|
||||
"type": "git",
|
||||
"url": "https://github.com/doctrine/instantiator.git",
|
||||
"reference": "c6222283fa3f4ac679f8b9ced9a4e23f163e80d0"
|
||||
"reference": "0a0fa9780f5d4e507415a065172d26a98d02047b"
|
||||
},
|
||||
"dist": {
|
||||
"type": "zip",
|
||||
"url": "https://api.github.com/repos/doctrine/instantiator/zipball/c6222283fa3f4ac679f8b9ced9a4e23f163e80d0",
|
||||
"reference": "c6222283fa3f4ac679f8b9ced9a4e23f163e80d0",
|
||||
"url": "https://api.github.com/repos/doctrine/instantiator/zipball/0a0fa9780f5d4e507415a065172d26a98d02047b",
|
||||
"reference": "0a0fa9780f5d4e507415a065172d26a98d02047b",
|
||||
"shasum": ""
|
||||
},
|
||||
"require": {
|
||||
"php": "^8.1"
|
||||
"php": "^7.1 || ^8.0"
|
||||
},
|
||||
"require-dev": {
|
||||
"doctrine/coding-standard": "^11",
|
||||
"doctrine/coding-standard": "^9 || ^11",
|
||||
"ext-pdo": "*",
|
||||
"ext-phar": "*",
|
||||
"phpbench/phpbench": "^1.2",
|
||||
"phpstan/phpstan": "^1.9.4",
|
||||
"phpstan/phpstan-phpunit": "^1.3",
|
||||
"phpunit/phpunit": "^9.5.27",
|
||||
"vimeo/psalm": "^5.4"
|
||||
"phpbench/phpbench": "^0.16 || ^1",
|
||||
"phpstan/phpstan": "^1.4",
|
||||
"phpstan/phpstan-phpunit": "^1",
|
||||
"phpunit/phpunit": "^7.5 || ^8.5 || ^9.5",
|
||||
"vimeo/psalm": "^4.30 || ^5.4"
|
||||
},
|
||||
"type": "library",
|
||||
"autoload": {
|
||||
|
@ -58,7 +58,7 @@
|
|||
],
|
||||
"support": {
|
||||
"issues": "https://github.com/doctrine/instantiator/issues",
|
||||
"source": "https://github.com/doctrine/instantiator/tree/2.0.0"
|
||||
"source": "https://github.com/doctrine/instantiator/tree/1.5.0"
|
||||
},
|
||||
"funding": [
|
||||
{
|
||||
|
@ -74,7 +74,7 @@
|
|||
"type": "tidelift"
|
||||
}
|
||||
],
|
||||
"time": "2022-12-30T00:23:10+00:00"
|
||||
"time": "2022-12-30T00:15:36+00:00"
|
||||
},
|
||||
{
|
||||
"name": "myclabs/deep-copy",
|
||||
|
@ -194,6 +194,77 @@
|
|||
},
|
||||
"time": "2024-10-08T18:51:32+00:00"
|
||||
},
|
||||
{
|
||||
"name": "phake/phake",
|
||||
"version": "v4.5.3",
|
||||
"source": {
|
||||
"type": "git",
|
||||
"url": "https://github.com/phake/phake.git",
|
||||
"reference": "695a4feda9ff25608e5065bfd48a7d3c6add57ce"
|
||||
},
|
||||
"dist": {
|
||||
"type": "zip",
|
||||
"url": "https://api.github.com/repos/phake/phake/zipball/695a4feda9ff25608e5065bfd48a7d3c6add57ce",
|
||||
"reference": "695a4feda9ff25608e5065bfd48a7d3c6add57ce",
|
||||
"shasum": ""
|
||||
},
|
||||
"require": {
|
||||
"doctrine/instantiator": "^1.4",
|
||||
"php": "^7.1|^8.0",
|
||||
"sebastian/comparator": "^1.1|^2.0|^3.0|^4.0|^5.0|^6.0"
|
||||
},
|
||||
"require-dev": {
|
||||
"doctrine/annotations": "^1.13",
|
||||
"hamcrest/hamcrest-php": "^1.1|^2.0",
|
||||
"phpunit/phpunit": "^6.5|^7.0|^8.0|^9.0|^10.0|^11.0",
|
||||
"psalm/phar": "^4.18"
|
||||
},
|
||||
"suggest": {
|
||||
"doctrine/annotations": "Allows mock annotations to use import statements for classes.",
|
||||
"hamcrest/hamcrest-php": "Use Hamcrest matchers."
|
||||
},
|
||||
"type": "library",
|
||||
"extra": {
|
||||
"branch-alias": {
|
||||
"dev-4.4": "4.4.x-dev",
|
||||
"dev-master": "5.0.x-dev"
|
||||
}
|
||||
},
|
||||
"autoload": {
|
||||
"files": [
|
||||
"src/Phake.php"
|
||||
],
|
||||
"psr-4": {
|
||||
"Phake\\": "src/Phake"
|
||||
}
|
||||
},
|
||||
"notification-url": "https://packagist.org/downloads/",
|
||||
"license": [
|
||||
"BSD-3-Clause"
|
||||
],
|
||||
"authors": [
|
||||
{
|
||||
"name": "Mike Lively",
|
||||
"email": "m@digitalsandwich.com"
|
||||
}
|
||||
],
|
||||
"description": "The Phake mock testing library",
|
||||
"homepage": "https://phake.github.io",
|
||||
"keywords": [
|
||||
"mock",
|
||||
"phake",
|
||||
"spy",
|
||||
"stub",
|
||||
"test-doubles",
|
||||
"testing"
|
||||
],
|
||||
"support": {
|
||||
"docs": "https://phake.github.io/doc/",
|
||||
"issues": "https://github.com/phake/phake/issues",
|
||||
"source": "https://github.com/phake/phake/tree/v4.5.3"
|
||||
},
|
||||
"time": "2024-12-09T14:46:24+00:00"
|
||||
},
|
||||
{
|
||||
"name": "phar-io/manifest",
|
||||
"version": "2.0.4",
|
||||
|
|
Loading…
Reference in a new issue