diff --git a/composer.json b/composer.json index 6a63624..f7076f4 100644 --- a/composer.json +++ b/composer.json @@ -4,7 +4,8 @@ "type": "library", "require": { "php": ">=7.1", - "ext-dom": "*" + "ext-dom": "*", + "mensbeam/html-parser": "dev-master" }, "scripts": { "post-install-cmd": ["@composer bin all install"], diff --git a/composer.lock b/composer.lock index de8f061..ed2175c 100644 --- a/composer.lock +++ b/composer.lock @@ -4,7 +4,7 @@ "Read more about it at https://getcomposer.org/doc/01-basic-usage.md#installing-dependencies", "This file is @generated automatically" ], - "content-hash": "0e733e74b1b163aa4cd80329ff9c71d0", + "content-hash": "0301f6145278bf69bbcca49992befa89", "packages": [], "packages-dev": [ { @@ -534,6 +534,167 @@ }, "time": "2020-12-25T05:00:37+00:00" }, + { + "name": "mensbeam/html-parser", + "version": "dev-master", + "source": { + "type": "git", + "url": "https://code.mensbeam.com/MensBeam/HTML-Parser", + "reference": "8115ac07d55b96d866da996f6329f6287f09bc49" + }, + "require": { + "ext-dom": "*", + "mensbeam/intl": ">=0.9.0", + "mensbeam/mimesniff": "^0.2.0", + "php": ">=7.1" + }, + "require-dev": { + "bamarni/composer-bin-plugin": "^1.3", + "masterminds/html5": "^2.7" + }, + "suggest": { + "ext-ctype": "Improved performance" + }, + "default-branch": true, + "type": "library", + "autoload": { + "psr-4": { + "MensBeam\\HTML\\": [ + "lib/" + ] + }, + "classmap": [ + "lib/Parser/Token.php" + ], + "files": [ + "lib/Parser/ctype.php" + ] + }, + "notification-url": "https://packagist.org/downloads/", + "license": [ + "MIT" + ], + "authors": [ + { + "name": "Dustin Wilson", + "email": "dustin@dustinwilson.com", + "homepage": "https://dustinwilson.com/" + }, + { + "name": "J. King", + "email": "jking@jkingweb.ca", + "homepage": "https://jkingweb.ca/" + } + ], + "description": "Parses modern HTML text into a PHP DOMDocument", + "time": "2021-09-26T20:09:27+00:00" + }, + { + "name": "mensbeam/intl", + "version": "0.9.0", + "source": { + "type": "git", + "url": "https://github.com/mensbeam/intl.git", + "reference": "de037b182ce99aaa90ebc09b0ee0457ddf1d07bc" + }, + "dist": { + "type": "zip", + "url": "https://api.github.com/repos/mensbeam/intl/zipball/de037b182ce99aaa90ebc09b0ee0457ddf1d07bc", + "reference": "de037b182ce99aaa90ebc09b0ee0457ddf1d07bc", + "shasum": "" + }, + "require": { + "php": ">=7.1" + }, + "require-dev": { + "bamarni/composer-bin-plugin": "*", + "ext-intl": "*" + }, + "type": "library", + "autoload": { + "psr-4": { + "MensBeam\\Intl\\": "lib/" + } + }, + "notification-url": "https://packagist.org/downloads/", + "license": [ + "MIT" + ], + "authors": [ + { + "name": "J. King", + "email": "jking@jkingweb.ca", + "homepage": "https://jkingweb.ca/" + } + ], + "description": "A set of dependency-free basic internationalization tools", + "keywords": [ + "WHATWG", + "charset", + "encoding", + "internationalization", + "intl", + "unicode", + "utf-8", + "utf8" + ], + "support": { + "issues": "https://github.com/mensbeam/intl/issues", + "source": "https://github.com/mensbeam/intl/tree/0.9.0" + }, + "time": "2021-03-25T19:08:04+00:00" + }, + { + "name": "mensbeam/mimesniff", + "version": "0.2.1", + "source": { + "type": "git", + "url": "https://github.com/mensbeam/mime.git", + "reference": "c19be2496ab1e27fbf9c3483c2a9faa2781796cd" + }, + "dist": { + "type": "zip", + "url": "https://api.github.com/repos/mensbeam/mime/zipball/c19be2496ab1e27fbf9c3483c2a9faa2781796cd", + "reference": "c19be2496ab1e27fbf9c3483c2a9faa2781796cd", + "shasum": "" + }, + "require": { + "php": ">=7.1", + "psr/http-message": "^1.0" + }, + "require-dev": { + "bamarni/composer-bin-plugin": "^1.3", + "ext-intl": "*" + }, + "type": "library", + "autoload": { + "psr-4": { + "MensBeam\\Mime\\": "lib/" + } + }, + "notification-url": "https://packagist.org/downloads/", + "license": [ + "MIT" + ], + "authors": [ + { + "name": "J. King", + "email": "jking@jkingweb.ca", + "homepage": "https://jkingweb.ca/" + } + ], + "description": "An implementation of the WHATWG MIME Sniffing specification", + "keywords": [ + "WHATWG", + "mime", + "mimesniff" + ], + "support": { + "issues": "https://github.com/mensbeam/mime/issues", + "source": "https://github.com/mensbeam/mime/tree/0.2.1" + }, + "time": "2021-03-07T03:58:00+00:00" + }, { "name": "psr/container", "version": "1.1.1", @@ -2294,7 +2455,9 @@ ], "aliases": [], "minimum-stability": "stable", - "stability-flags": [], + "stability-flags": { + "mensbeam/html-parser": 20 + }, "prefer-stable": false, "prefer-lowest": false, "platform": { diff --git a/lib/Document.php b/lib/Document.php index 54daffc..77c64e7 100644 --- a/lib/Document.php +++ b/lib/Document.php @@ -4,11 +4,17 @@ * See LICENSE and AUTHORS files for details */ declare(strict_types=1); -namespace MensBeam\HTML; +namespace MensBeam\HTML\DOM; +use MensBeam\HTML\Parser, + MensBeam\HTML\Parser\Data; + class Document extends AbstractDocument { protected ?Element $_body = null; /** Nonstandard */ + protected ?string $_documentEncoding; + protected int $_quirksMode = 0; + /** Nonstandard */ protected ?\DOMXPath $_xpath = null; // List of elements that are treated as block elements for the purposes of @@ -94,6 +100,14 @@ class Document extends AbstractDocument { $this->_body = $value; } + public function __get_documentEncoding(): ?string { + return $this->_documentEncoding; + } + + public function __get_quirksMode(): int { + return $this->_quirksMode; + } + public function __get_xpath(): \DOMXPath { if ($this->_xpath === null) { $this->_xpath = new \DOMXPath($this); @@ -102,34 +116,26 @@ class Document extends AbstractDocument { } - public function __construct($source = null, ?string $encodingOrContentType = null) { + public function __construct($source = null, ?string $encoding = null, int $quirksMode = 0) { // Because we cannot have union types until php 8... :) if ($source !== null && !$source instanceof \DOMDocument && !is_string($source)) { throw new DOMException(DOMException::ARGUMENT_TYPE_ERROR, 1, 'source', 'string|\DOMDocument', gettype($source)); - } elseif ($source instanceof self) { - return $source; } parent::__construct(); - $this->registerNodeClass('DOMDocument', '\MensBeam\HTML\Document'); - $this->registerNodeClass('DOMComment', '\MensBeam\HTML\Comment'); - $this->registerNodeClass('DOMDocumentFragment', '\MensBeam\HTML\DocumentFragment'); - $this->registerNodeClass('DOMElement', '\MensBeam\HTML\Element'); - $this->registerNodeClass('DOMProcessingInstruction', '\MensBeam\HTML\ProcessingInstruction'); - $this->registerNodeClass('DOMText', '\MensBeam\HTML\Text'); + $this->registerNodeClass('DOMDocument', '\MensBeam\HTML\DOM\Document'); + $this->registerNodeClass('DOMComment', '\MensBeam\HTML\DOM\Comment'); + $this->registerNodeClass('DOMDocumentFragment', '\MensBeam\HTML\DOM\DocumentFragment'); + $this->registerNodeClass('DOMElement', '\MensBeam\HTML\DOM\Element'); + $this->registerNodeClass('DOMProcessingInstruction', '\MensBeam\HTML\DOM\ProcessingInstruction'); + $this->registerNodeClass('DOMText', '\MensBeam\HTML\DOM\Text'); if ($source !== null) { if (is_string($source)) { - $source = Parser::parse($source, null, $encodingOrContentType); - } - - foreach ($source->childNodes as $child) { - if (!$child instanceof \DOMDocumentType) { - $this->appendChild($this->importNode($child, true)); - } else { - $this->appendChild($this->implementation->createDocumentType($child->name ?? ' ', $child->public ?? '', $child->system ?? '')); - } + $this->loadHTML($source, null, $encoding); + } else { + $this->loadDOM($source, $encoding, $quirksMode); } } } @@ -178,7 +184,7 @@ class Document extends AbstractDocument { if ($qualifiedName !== 'template' || $namespaceURI !== null) { $e = parent::createElementNS($namespaceURI, $qualifiedName, $value); } else { - $e = new TemplateElement($this, $qualifiedName, $value); + $e = new HTMLTemplateElement($this, $qualifiedName, $value); // Template elements need to have a reference kept in userland ElementMap::set($e); $e->content = $this->createDocumentFragment(); @@ -203,23 +209,38 @@ class Document extends AbstractDocument { return false; } - public function load($filename, $options = null, ?string $encodingOrContentType = null): bool { - $data = Parser::fetchFile($filename, $encodingOrContentType); + public function importNode(\DOMNode $node, bool $deep = false) { + $node = parent::importNode($node, $deep); + + if ($node instanceof \DOMElement) { + $node = $this->convertElementToSubClass($node); + } + + return $node; + } + + public function load($filename, $options = null, ?string $encoding = null): bool { + $data = Parser::fetchFile($filename, $encoding); if (!$data) { return false; } [$data, $encodingOrContentType] = $data; - Parser::parse($data, $this, $encodingOrContentType, null, (string)$filename); + $this->loadHTML($data, null, $encoding); return true; } - public function loadHTML($source, $options = null, ?string $encodingOrContentType = null): bool { - if (!is_string($source)) { - throw new DOMException(DOMException::ARGUMENT_TYPE_ERROR, 1, 'source', 'string', gettype($source)); + public function loadDOM(\DOMDocument $source, ?string $encoding = null, int $quirksMode = 0) { + if (!$source instanceof \DOMDocument) { + throw new DOMException(DOMException::ARGUMENT_TYPE_ERROR, 1, 'source', '\DOMDocument', gettype($source)); } - if (is_string($source)) { - $source = Parser::parse($source, null, $encodingOrContentType); + $this->_documentEncoding = $encoding; + $this->_quirksMode = $quirksMode; + + // If there are already existing child nodes then remove them before loading the + // DOM. + while ($this->hasChildNodes()) { + $this->removeChild($this->firstChild); } foreach ($source->childNodes as $child) { @@ -230,13 +251,32 @@ class Document extends AbstractDocument { } } - assert(is_string($source), new DOMException(DOMException::STRING_EXPECTED, 'source', gettype($source))); - Parser::parse($source, $this, $encodingOrContentType); + $templates = $this->walk(function($n) { + if ($n instanceof Element && $n->namespaceURI === null && $n->nodeName === 'template') { + return true; + } + }); + + foreach ($templates as $template) { + $template->replaceWith($this->convertElementToSubClass($template)); + } + return true; } - public function loadHTMLFile($filename, $options = null, ?string $encodingOrContentType = null): bool { - return $this->load($filename, $options, $encodingOrContentType); + public function loadHTML($source, $options = null, ?string $encoding = null): bool { + if (!is_string($source)) { + throw new DOMException(DOMException::ARGUMENT_TYPE_ERROR, 1, 'source', 'string', gettype($source)); + } + + $source = Parser::parse($source, $encoding, null); + $this->loadDOM($source->document, $source->encoding, $source->quirksMode); + + return true; + } + + public function loadHTMLFile($filename, $options = null, ?string $encoding = null): bool { + return $this->load($filename, $options, $encoding); } public function loadXML($source, $options = null): bool { @@ -248,18 +288,6 @@ class Document extends AbstractDocument { } public function saveHTML(\DOMNode $node = null): string { - return $node->serialize($node); - } - - public function saveHTMLFile($filename): int { - return $this->save($filename); - } - - public function saveXML(?\DOMNode $node = null, $options = null): bool { - return false; - } - - public function serialize(\DOMNode $node = null): string { $node = $node ?? $this; $formatOutput = $this->formatOutput; @@ -297,6 +325,14 @@ class Document extends AbstractDocument { return $this->serializeFragment($node, $formatOutput); } + public function saveHTMLFile($filename): int { + return $this->save($filename); + } + + public function saveXML(?\DOMNode $node = null, $options = null): bool { + return false; + } + public function validate(): bool { return true; } @@ -433,7 +469,7 @@ class Document extends AbstractDocument { # 3. If the node is a template element, then let the node instead be the # template element’s template contents (a DocumentFragment node). - if ($node instanceof TemplateElement) { + if ($node instanceof HTMLTemplateElement) { $node = $node->content; } @@ -749,7 +785,25 @@ class Document extends AbstractDocument { } + private function convertElementToSubClass(\DOMElement $element): \DOMElement { + if ($element->namespaceURI === null && $element->nodeName === 'template') { + $template = $this->createElement('template'); + + while ($element->attributes->length > 0) { + $template->setAttributeNode($element->attributes->item(0)); + } + while ($element->hasChildNodes()) { + $template->content->appendChild($element->firstChild); + } + + $element = $template; + } + + return $element; + } + + public function __toString() { - return $this->serialize(); + return $this->saveHTML(); } } diff --git a/lib/TemplateElement.php b/lib/HTMLTemplateElement.php similarity index 95% rename from lib/TemplateElement.php rename to lib/HTMLTemplateElement.php index 51db140..92b8e9c 100644 --- a/lib/TemplateElement.php +++ b/lib/HTMLTemplateElement.php @@ -7,7 +7,7 @@ declare(strict_types=1); namespace MensBeam\HTML\DOM; /** Class specifically for template elements to handle its content property. */ -class TemplateElement extends Element { +class HTMLTemplateElement extends Element { public $content = null; public function __construct(Document $ownerDocument, string $qualifiedName, ?string $value = null, string $namespace = '') { diff --git a/lib/traits/Moonwalk.php b/lib/traits/Moonwalk.php index ce972ee..eab4c17 100644 --- a/lib/traits/Moonwalk.php +++ b/lib/traits/Moonwalk.php @@ -15,7 +15,7 @@ trait Moonwalk { private function moonwalkGenerator(\DOMNode $node, ?\Closure $filter = null) { do { while (true) { - if ($filter === null || $filter($node)) { + if ($filter === null || $filter($node) === true) { yield $node; } diff --git a/lib/traits/MoonwalkShallow.php b/lib/traits/MoonwalkShallow.php index c52f988..fcf2e5a 100644 --- a/lib/traits/MoonwalkShallow.php +++ b/lib/traits/MoonwalkShallow.php @@ -19,7 +19,7 @@ trait MoonwalkShallow { $childNodesLength = $node->childNodes->length; for ($childNodesLength = $node->childNodes->length, $i = $childNodesLength - 1; $i >= 0; $i--) { $child = $node->childNodes[$i]; - if ($filter === null || $filter($child)) { + if ($filter === null || $filter($child) === true) { yield $child; } } diff --git a/lib/traits/ToString.php b/lib/traits/ToString.php index 898792b..922ff44 100644 --- a/lib/traits/ToString.php +++ b/lib/traits/ToString.php @@ -10,6 +10,6 @@ trait ToString { public function __toString(): string { $frag = $this->ownerDocument->createDocumentFragment(); $frag->appendChild($this->cloneNode(true)); - return $this->ownerDocument->serialize($frag); + return $this->ownerDocument->saveHTML($frag); } } diff --git a/lib/traits/Walk.php b/lib/traits/Walk.php index 3f5a397..65af4ad 100644 --- a/lib/traits/Walk.php +++ b/lib/traits/Walk.php @@ -13,7 +13,7 @@ trait Walk { } private function walkGenerator(\DOMNode $node, ?\Closure $filter = null) { - if ($filter === null || $filter($node)) { + if ($filter === null || $filter($node) === true) { yield $node; } diff --git a/lib/traits/WalkShallow.php b/lib/traits/WalkShallow.php index bfd36fe..af5cba0 100644 --- a/lib/traits/WalkShallow.php +++ b/lib/traits/WalkShallow.php @@ -16,7 +16,7 @@ trait WalkShallow { $node = (!$this instanceof TemplateElement) ? $this : $this->content; foreach ($node->childNodes as $child) { - if ($filter === null || $filter($child)) { + if ($filter === null || $filter($child) === true) { yield $child; } } diff --git a/tests/cases/TestCharset.php b/tests/cases/TestCharset.php deleted file mode 100644 index 81c3f24..0000000 --- a/tests/cases/TestCharset.php +++ /dev/null @@ -1,107 +0,0 @@ -assertSame($exp, Charset::fromCharset($in)); - } - - public function provideCharsets() { - return [ - ["UTF-8", "UTF-8"], - [" utf8 ", "UTF-8"], - ["ISO-8859-1", "windows-1252"], - ["text/html; charset=utf8", null], - ]; - } - - /** @dataProvider provideContentTypes */ - public function testDetermineEncodingFromContentType(string $input, ?string $exp) { - $this->assertSame($exp, Charset::fromTransport($input)); - } - - public function provideContentTypes() { - return [ - ["UTF-8", null], - ["charset=utf8", null], - ["text/html", null], - ["text/html charset=utf8", null], - ["text/html; charset=utf8", "UTF-8"], - ["text/html;charset=utf8", "UTF-8"], - ["text/html; charset=\"utf8\"", "UTF-8"], - ["image/svg+xml; param=value; charset=utf8", "UTF-8"], - ["image/svg+xml; charset=utf8; charset=big5", "UTF-8"], - ["image/svg+xml; charset=utf8;charset=big5", "UTF-8"], - ["text/html; charset=not-valid; charset=big5", null], - ["text/html; charset=not-valid", null], - ["text/html; charsaaet=\"a \\\"fancy\\\" encoding\"", null], - ]; - } - - /** @dataProvider provideBOMs */ - public function testDetermineEncodingFromByteOrderMark(string $input, ?string $exp) { - $this->assertSame($exp, Charset::fromBOM($input)); - } - - public function provideBOMs() { - return [ - 'UTF-8' => ["\u{FEFF}Hello world!", "UTF-8"], - 'UTF-16 (big-endian)' => ["\xFE\xFF\0H\0e\0l\0l\0o\0 \0w\0o\0r\0l\0d\0!", "UTF-16BE"], - 'UTF-16 (little-endian)' => ["\xFF\xFEH\0e\0l\0l\0o\0 \0w\0o\0r\0l\0d\0!\0", "UTF-16LE"], - 'No byte order mark' => ["Hello world!", null], - ]; - } - - /** @dataProvider provideStandardEncodingTests */ - public function testStandardEncoderTests(string $input, string $exp) { - $exp = strtolower($exp); - $this->assertSame(strtolower($exp), strtolower(Charset::fromBOM($input)?? Charset::fromPrescan($input, \PHP_INT_MAX) ?? "Windows-1252")); - } - - public function provideStandardEncodingTests() { - $tests = []; - $blacklist = []; - $files = new \AppendIterator(); - $files->append(new \GlobIterator(\MensBeam\HTML\DOM\BASE."tests/html5lib-tests/encoding/*.dat", \FilesystemIterator::SKIP_DOTS | \FilesystemIterator::CURRENT_AS_PATHNAME)); - $files->append(new \GlobIterator(\MensBeam\HTML\DOM\BASE."tests/cases/encoding/*.dat", \FilesystemIterator::SKIP_DOTS | \FilesystemIterator::CURRENT_AS_PATHNAME)); - foreach ($files as $file) { - if (!in_array(basename($file), $blacklist)) { - $tests[] = $file; - } - } - return $this->makeEncodingTests(...$tests); - } - - protected function makeEncodingTests(string ...$file): iterable { - foreach ($file as $path) { - $f = basename($path); - $test = file($path); - $end = sizeof($test); - $l = 0; - $index = 0; - while ($l < $end) { - $testId = "$f #".$index++; - $data = ""; - while ($l < $end && !preg_match("/^#data\s+$/", @$test[$l++])); - while ($l < $end && !preg_match("/^#encoding\s+$/", ($line = @$test[$l++]))) { - $data .= $line; - } - if ($l >= $end) { - return; - } - yield $testId => [trim($data, "\r\n"), trim($test[$l++])]; - } - } - } -} diff --git a/tests/cases/TestTokenizer.php b/tests/cases/TestTokenizer.php deleted file mode 100644 index 863cf80..0000000 --- a/tests/cases/TestTokenizer.php +++ /dev/null @@ -1,235 +0,0 @@ - Tokenizer::DATA_STATE, - 'PLAINTEXT state' => Tokenizer::PLAINTEXT_STATE, - 'RCDATA state' => Tokenizer::RCDATA_STATE, - 'RAWTEXT state' => Tokenizer::RAWTEXT_STATE, - 'Script data state' => Tokenizer::SCRIPT_DATA_STATE, - 'CDATA section state' => Tokenizer::CDATA_SECTION_STATE, - ]; - - /** @dataProvider provideStandardTokenizerTests */ - public function testStandardTokenizerTests(string $input, array $expected, int $state, string $open = null, array $expErrors) { - // convert parse error constants into standard symbols in specification - $errorMap = array_map(function($str) { - return strtolower(str_replace("_", "-", $str)); - }, array_flip(array_filter((new \ReflectionClass(ParseError::class))->getConstants(), function($v) { - return is_int($v); - }))); - // create a stub error handler which collects parse errors - $errors = []; - $errorHandler = $this->createStub(ParseError::class); - $errorHandler->method("emit")->willReturnCallback(function($file, $line, $col, $code) use (&$errors, $errorMap) { - $errors[] = ['code' => $errorMap[$code], 'line' => $line, 'col' => $col]; - return true; - }); - // initialize a stack of open elements, possibly with an open element - $stack = new OpenElementsStack(); - if ($open) { - $stack[] = (new \DOMDocument)->createElement($open); - } - // initialize the data stream and tokenizer - $data = new Data($input, "STDIN", $errorHandler, "UTF-8"); - $tokenizer = new Tokenizer($data, $stack, $errorHandler); - $tokenizer->state = $state; - // perform the test - $actual = []; - try { - foreach ($tokenizer->tokenize() as $t) { - assert( - (!$t instanceof CharacterToken) - || ($t instanceof NullCharacterToken && $t->data === "\0") - || ($t instanceof WhitespaceToken && strspn($t->data, Data::WHITESPACE) === strlen($t->data)) - || ($t->data !== "\0" && strspn($t->data, Data::WHITESPACE) === 0) - , new \Exception("Character token must either consist of a single null character, consist only of whitespace, or start with other than whitespace: ".get_class($t)." ".var_export($t->data ?? "''", true))); - $actual[] = $t; - } - } finally { - $actual = $this->combineCharacterTokens($actual); - $this->assertEquals($expected, $actual, $tokenizer->debugLog); - $this->assertEquals($expErrors, $errors, $tokenizer->debugLog); - } - } - - public function provideStandardTokenizerTests() { - $tests = []; - $blacklist = ["xmlViolation.test"]; - $files = new \AppendIterator(); - $files->append(new \GlobIterator(\MensBeam\HTML\DOM\BASE."tests/html5lib-tests/tokenizer/*.test", \FilesystemIterator::SKIP_DOTS | \FilesystemIterator::CURRENT_AS_PATHNAME)); - $files->append(new \GlobIterator(\MensBeam\HTML\DOM\BASE."tests/cases/tokenizer/*.test", \FilesystemIterator::SKIP_DOTS | \FilesystemIterator::CURRENT_AS_PATHNAME)); - foreach ($files as $file) { - if (!in_array(basename($file), $blacklist)) { - $tests[] = $file; - } - } - return $this->makeTokenTests(...$tests); - } - - protected function reverseDoubleEscape(string $str): string { - if (preg_match_all("/\\\\u([0-9a-f]{4})/i", $str, $matches)) { - for ($a = 0; $a < sizeof($matches[0]); $a++) { - $esc = $matches[0][$a]; - $chr = \MensBeam\Intl\Encoding\UTF8::encode(hexdec($matches[1][$a])); - $str = str_replace($esc, $chr, $str); - } - } - return $str; - } - - protected function combineCharacterTokens(array $tokens) : array { - $out = []; - $pending = null; - foreach ($tokens as $t) { - if ($t instanceof CharacterToken) { - if (!$pending) { - if ($t instanceof WhitespaceToken || $t instanceof NullCharacterToken) { - $t = new CharacterToken($t->data); - } - $pending = $t; - } else { - $pending->data .= $t->data; - } - } else { - if ($pending) { - $out[] = $pending; - $pending = null; - } - $out[] = $t; - } - } - if ($pending) { - $out[] = $pending; - } - return $out; - } - - protected function makeTokenTests(string ...$file): iterable { - foreach ($file as $path) { - $f = basename($path); - $testSet = json_decode(file_get_contents($path), true); - foreach ($testSet['tests'] ?? $testSet['xmlViolationTests'] as $index => $test) { - $testId = "$f #$index"; - if ($test['doubleEscaped'] ?? false) { - $test['input'] = $this->reverseDoubleEscape($test['input']); - for ($a = 0; $a < sizeof($test['output']); $a++) { - for ($b = 0; $b < sizeof($test['output'][$a]); $b++) { - if (is_string($test['output'][$a][$b])) { - $test['output'][$a][$b] = $this->reverseDoubleEscape($test['output'][$a][$b]); - } - } - } - } - $test['initialStates'] = $test['initialStates'] ?? ["Data state"]; - // check if a test needs a patch due to trivial differences in implementation - $this->patchTest($test); - for ($a = 0; $a < sizeof($test['initialStates']); $a++) { - $tokens = []; - foreach ($test['output'] as $token) { - switch ($token[0]) { - case "DOCTYPE": - $t = new DOCTYPEToken((string) $token[1], (string) $token[2], (string) $token[3]); - $t->forceQuirks = !$token[4]; - $tokens[] = $t; - break; - case "StartTag": - $t = new StartTagToken($token[1], $token[3] ?? false); - foreach ($token[2] ?? [] as $name => $value) { - $t->attributes[] = new TokenAttr((string) $name, $value); - } - $tokens[] = $t; - break; - case "EndTag": - $tokens[] = new EndTagToken($token[1]); - break; - case "Character": - $tokens[] = new CharacterToken($token[1]); - break; - case "Comment": - $tokens[] = new CommentToken($token[1]); - break; - default: - throw new \Exception("Token type '{$token[0]}' not implemented in standard test interpreter"); - } - unset($t); - } - $tokens[] = new EOFToken; - yield "$testId: {$test['description']} ({$test['initialStates'][$a]})" => [ - $test['input'], // input - $tokens, // output - self::STATE_MAP[$test['initialStates'][$a]], // initial state - $test['lastStartTag'] ?? null, // open element, if any - $test['errors'] ?? [], // errors, if any - ]; - } - } - } - } - - protected function patchTest(&$test): void { - $id = [$test['input'], $test['initialStates']]; - switch ($id) { - // test emits input stream error first despite peeking - case [" positions in some tests don't make sense - // https://github.com/html5lib/html5lib-tests/issues/125 - case ["", ["CDATA section state"]]: - // there is no position 2 - $test['errors'][0]['col']--; - break; - case ["\u{A}", ["CDATA section state"]]: - // the line break is, for some reason, not counted in the test - $test['errors'][0]['line']++; - $test['errors'][0]['col'] = 1; - break; - case ["
helloexcite!me!", // this one is pretty hairy with buffered characters - ])) { - $errors = false; - } - if ($errors) { - // some "old" errors are made redundant by "new" errors - $obsoleteSymbolList = implode("|", [ - "illegal-codepoint-for-numeric-entity", - "eof-in-attribute-value-double-quote", - "non-void-element-with-trailing-solidus", - "invalid-character-in-attribute-name", - "attributes-in-end-tag", - "expected-tag-name", - "unexpected-character-after-solidus-in-tag", - "expected-closing-tag-but-got-char", - "eof-in-tag-name", - "need-space-after-doctype", - "expected-doctype-name-but-got-right-bracket", - "expected-dashes-or-doctype", - "expected-space-or-right-bracket-in-doctype", - "unexpected-char-in-comment", - "eof-in-comment-double-dash", - "expected-named-entity", - "named-entity-without-semicolon", - "numeric-entity-without-semicolon", - "expected-numeric-entity", - "eof-in-attribute-name", - "unexpected-eof-in-text-mode", - "unexpected-EOF-after-solidus-in-tag", - "expected-attribute-name-but-got-eof", - "eof-in-script-in-script", - "expected-script-data-but-got-eof", - "unexpected-EOF-in-text-mode", - "expected-tag-name-but-got-question-mark", - "incorrect-comment", - "self-closing-flag-on-end-tag", - "invalid-codepoint", - "invalid-codepoint-in-body", - "invalid-codepoint-in-foreign-content", - "end-table-tag-in-caption", - "equals-in-unquoted-attribute-value", - "eof-in-numeric-entity", - "unexpected-char-in-doctype", - "unexpected-end-of-doctype", - "unexpected-dash-after-double-dash-in-comment", - "unexpected-bang-after-double-dash-in-comment", - ]); - for ($a = 0, $stop = sizeof($errors); $a < $stop; $a++) { - if (preg_match("/^\(\d+,\d+\):? ($obsoleteSymbolList)$/", $errors[$a])) { - // these errors are redundant with "new" errors - unset($errors[$a]); - } - } - $errors = array_values($errors); - // some other errors appear to document implementation details - // rather than what the specificatioon dictates, or are - // simple duplicates - for ($a = 0, $stop = sizeof($errors); $a < $stop; $a++) { - if ( - preg_match("/^\(\d+,\d+\): unexpected-end-tag-in-special-element$/", $errors[$a]) - || preg_match('/^\d+: Unclosed element “[^”]+”\.$/u', $errors[$a]) - || ($data === '"); - } elseif ($n instanceof \DOMCharacterData) { - $this->push('"'.$n->data.'"'); - } else { - throw new \Exception("Node type ".get_class($n)." not handled"); - } - } - - public function provideStandardTreeTests(): iterable { - $blacklist = []; - $files = new \AppendIterator(); - $files->append(new \GlobIterator(\MensBeam\HTML\DOM\BASE."tests/html5lib-tests/tree-construction/*.dat", \FilesystemIterator::SKIP_DOTS | \FilesystemIterator::CURRENT_AS_PATHNAME)); - $files->append(new \GlobIterator(\MensBeam\HTML\DOM\BASE."tests/cases/tree-construction/*.dat", \FilesystemIterator::SKIP_DOTS | \FilesystemIterator::CURRENT_AS_PATHNAME)); - foreach ($files as $file) { - $index = 0; - $l = 0; - if (!in_array(basename($file), $blacklist)) { - $lines = array_map(function($v) { - return rtrim($v, "\n"); - }, file($file)); - while ($l < sizeof($lines)) { - $pos = $l + 1; - assert($lines[$l] === "#data", new \Exception("Test $file #$index does not start with #data tag at line ".($l + 1))); - // collect the test input - $data = []; - for (++$l; $l < sizeof($lines); $l++) { - if ($lines[$l] === "#errors") { - break; - } - $data[] = $lines[$l]; - } - $data = implode("\n", $data); - // collect the test errors - $errors = []; - assert(($lines[$l] ?? "") === "#errors", new \Exception("Test $file #$index does not list errors at line ".($l + 1))); - for (++$l; $l < sizeof($lines); $l++) { - if ($lines[$l] === "#new-errors") { - continue; - } elseif (preg_match('/^#(document(-fragment)?|script-(on|off)|)$/', $lines[$l])) { - break; - } - $errors[] = $lines[$l]; - } - // set the script mode, if present - assert(preg_match('/^#(script-(on|off)|document(-fragment)?)$/', $lines[$l]) === 1, new \Exception("Test $file #$index follows errors with something other than script flag, document fragment, or document at line ".($l + 1))); - $script = null; - if ($lines[$l] === "#script-off") { - $script = false; - $l++; - } elseif ($lines[$l] === "#script-on") { - $script = true; - $l++; - } - // collect the document fragment, if present - assert(preg_match('/^#document(-fragment)?$/', $lines[$l]) === 1, new \Exception("Test $file #$index follows script flag with something other than document fragment or document at line ".($l + 1))); - $fragment = null; - if ($lines[$l] === "#document-fragment") { - $fragment = $lines[++$l]; - $l++; - } - // collect the output tree - $exp = []; - assert($lines[$l] === "#document", new \Exception("Test $file #$index follows document fragment with something other than document at line ".($l + 1))); - for (++$l; $l < sizeof($lines); $l++) { - if ($lines[$l] === "" && ($lines[$l + 1] ?? "") === "#data") { - break; - } elseif (($lines[$l][0] ?? "") !== "|") { - // apend the data to the previous token - $exp[sizeof($exp) - 1] .= "\n".$lines[$l]; - continue; - } - assert(preg_match('/^[^#]/', $lines[$l]) === 1, new \Exception("Test $file #$index contains unrecognized data after document at line ".($l + 1))); - $exp[] = $lines[$l]; - } - if (!$script) { - // scripting-dependent tests are skipped entirely since we will not support scripting - yield basename($file)." #$index (line $pos)" => [$data, $exp, $errors, $fragment]; - } - $l++; - $index++; - } - } - } - } -} diff --git a/tests/cases/encoding/mensbeam01.dat b/tests/cases/encoding/mensbeam01.dat deleted file mode 100644 index 0d4a8b6..0000000 --- a/tests/cases/encoding/mensbeam01.dat +++ /dev/null @@ -1,36 +0,0 @@ -#data - - -#encoding -Windows-1252 - -#data - - -#encoding -UTF-8 - -#data - - - - -#encoding -Windows-1252 - -#data - - -#encoding -Windows-1252 - -#data - - -#encoding -UTF-8 diff --git a/tests/cases/tokenizer/mensbeam01.test b/tests/cases/tokenizer/mensbeam01.test deleted file mode 100644 index 539a87f..0000000 --- a/tests/cases/tokenizer/mensbeam01.test +++ /dev/null @@ -1,27 +0,0 @@ -{ - "tests": [ - { - "description":"Whitespace character references", - "initialStates":["RCDATA state"], - "input":" ", - "output":[["Character", " "]] - }, - { - "description":"Script end tag with whitespace", - "initialStates":["Script data state"], - "lastStartTag":"script", - "input":"", - "output":[["EndTag", "script"]] - }, - { - "description":"Self-closing script end tag", - "initialStates":["Script data state"], - "lastStartTag":"script", - "input":"", - "output":[["EndTag", "script"]], - "errors":[ - { "code": "end-tag-with-trailing-solidus", "line": 1, "col": 10 } - ] - } - ] -} diff --git a/tests/cases/tree-construction/mensbeam01.dat b/tests/cases/tree-construction/mensbeam01.dat deleted file mode 100644 index 08a8f66..0000000 --- a/tests/cases/tree-construction/mensbeam01.dat +++ /dev/null @@ -1,68 +0,0 @@ -#data - -#errors -#document -| -| -| -| -| -| xmlns xmlns="http://www.w3.org/2000/svg" - -#data - -#errors -(1,58): invalid-namespace-attribute-value -#document -| -| -| -| -| - -#data - -#errors -#document -| -| -| -| -| -| xmlns xlink="http://www.w3.org/1999/xlink" - -#data - -#errors -#document -| -| -| -| -| -| xlink href="http://example.com/" - -#data - -#errors -(1,97): invalid-namespace-attribute-value -#document -| -| -| -| -| -| xlink href="http://example.com/" - -#data -

-#errors -#document -| -| -| -| -| -| xml id="proper" -|

-| xml:id="bogus" diff --git a/tests/phpunit.dist.xml b/tests/phpunit.dist.xml index 5d8d3c3..9672f19 100644 --- a/tests/phpunit.dist.xml +++ b/tests/phpunit.dist.xml @@ -19,15 +19,6 @@ cases/TestDOM.php - - cases/TestCharset.php - - - cases/TestTokenizer.php - - - cases/TestTreeConstructor.php - cases/TestSerializer.php

please!