Browse Source

Support processing instructions

More tests are needed, but basic functionality has been validated
ns
J. King 3 years ago
parent
commit
a88a3ae107
  1. 2
      README.md
  2. 2
      lib/Parser.php
  3. 2
      lib/Parser/Config.php
  4. 6
      lib/Parser/Token.php
  5. 2
      lib/Parser/Tokenizer.php
  6. 21
      lib/Parser/TreeBuilder.php
  7. 10
      tests/cases/TestTokenizer.php
  8. 32
      tests/cases/TestTreeConstructor.php
  9. 21
      tests/cases/tree-construction/pi01.dat

2
README.md

@ -43,7 +43,7 @@ This library and [masterminds/html5](https://packagist.org/packages/masterminds/
| Handling of misnested tags | Parent end tags always close children | Parent end tags always close children | [Per specification](https://html.spec.whatwg.org/multipage/parsing.html#an-introduction-to-error-handling-and-strange-cases-in-the-parser) |
| Handling of data between table cells | Left as-is | Left as-is | [Per specification](https://html.spec.whatwg.org/multipage/parsing.html#an-introduction-to-error-handling-and-strange-cases-in-the-parser) |
| Handling of omitted start tags | Elements are not inserted | Elements are not inserted | Per specification |
| Handling of processing instructions | Processing instructions are retained | Processing instructions are retained | Per specification |
| Handling of processing instructions | Retained | Retained | Per specification, configurable |
| Handling of bogus XLink namespace\* | Foreign content not supported | XLink attributes are lost if preceded by bogus namespace | Bogus namespace is ignored |
| Namespace for HTML elements | Null | Per specification, configurable | Null |
| Time needed to parse single-page HTML specification | 0.5 seconds | 2.7 seconds† | 6.0 seconds‡ |

2
lib/Parser.php

@ -50,7 +50,7 @@ class Parser {
$stack = new OpenElementsStack($fragmentContext);
$tokenizer = new Tokenizer($decoder, $stack, $errorHandler);
$tokenList = $tokenizer->tokenize();
$treeBuilder = new TreeBuilder($document, $decoder, $tokenizer, $tokenList, $errorHandler, $stack, new TemplateInsertionModesStack, $fragmentContext, $fragmentQuirks);
$treeBuilder = new TreeBuilder($document, $decoder, $tokenizer, $tokenList, $errorHandler, $stack, new TemplateInsertionModesStack, $fragmentContext, $fragmentQuirks, $config);
try {
$treeBuilder->constructTree();
} catch (EncodingChangeException $e) {

2
lib/Parser/Config.php

@ -11,4 +11,6 @@ class Config {
public $encodingFallback = null;
/** @var ?bool Whether parse errors should be recorded. Recording parse errors incurs a performance penalty. */
public $errorCollection = null;
/** @var ?bool Whether to retain processing instructions rather than parsing them into comments as the HTML specification requires. Setting this to true will yield non-standard documents. */
public $processingInstructions = null;
}

6
lib/Parser/Token.php

@ -55,6 +55,12 @@ class CommentToken extends DataToken {
}
}
/**
 * A comment token that originated from "<?" markup.
 *
 * The tokenizer creates one of these (instead of a plain CommentToken) when it
 * meets a question mark where a tag name was expected. Because the class
 * extends CommentToken, code which only understands comments keeps treating it
 * as a comment, while the tree builder can test for this class and, when so
 * configured, emit a processing instruction node instead.
 */
class ProcessingInstructionToken extends CommentToken {
/** @param string $data Initial token data, handed unchanged to the parent constructor */
public function __construct(string $data = '') {
parent::__construct($data);
}
}
abstract class TagToken extends Token {
# Start and end tag tokens have a tag name,
# a self-closing flag, and a list of attributes,

2
lib/Parser/Tokenizer.php

@ -506,7 +506,7 @@ class Tokenizer {
# Create a comment token whose data is the empty string.
# Reconsume in the bogus comment state.
$this->error(ParseError::UNEXPECTED_QUESTION_MARK_INSTEAD_OF_TAG_NAME);
$token = new CommentToken('');
$token = new ProcessingInstructionToken('');
$this->state = self::BOGUS_COMMENT_STATE;
goto Reconsume;
}

21
lib/Parser/TreeBuilder.php

@ -45,6 +45,9 @@ class TreeBuilder {
protected $mangledElements = false;
/** @var bool Flag used to track whether name mangling has been performed for attributes; this is a minor optimization */
protected $mangledAttributes = false;
/** @var bool Whether processing instructions should be retained rather than transformed into comments as the specification requires */
protected $processingInstructions = false;
/** @var int The quirks-mode setting of the document being built */
public $quirksMode = Parser::NO_QUIRKS_MODE;
@ -252,12 +255,14 @@ class TreeBuilder {
"frameset" => self::IN_FRAMESET_MODE,
];
public function __construct(\DOMDocument $dom, Data $data, Tokenizer $tokenizer, \Generator $tokenList, ?ParseError $errorHandler, OpenElementsStack $stack, TemplateInsertionModesStack $templateInsertionModes, ?\DOMElement $fragmentContext = null, ?int $fragmentQuirks = null) {
public function __construct(\DOMDocument $dom, Data $data, Tokenizer $tokenizer, \Generator $tokenList, ?ParseError $errorHandler, OpenElementsStack $stack, TemplateInsertionModesStack $templateInsertionModes, ?\DOMElement $fragmentContext, ?int $fragmentQuirks, ?Config $config) {
if ($dom->hasChildNodes() || $dom->doctype) {
throw new Exception(Exception::TREEBUILDER_NON_EMPTY_TARGET_DOCUMENT);
} elseif (!in_array($fragmentQuirks ?? Parser::NO_QUIRKS_MODE, [Parser::NO_QUIRKS_MODE, Parser::LIMITED_QUIRKS_MODE, Parser::QUIRKS_MODE])) {
throw new Exception(Exception::INVALID_QUIRKS_MODE);
}
$config = $config ?? new Config;
$this->DOM = $dom;
$this->fragmentContext = $fragmentContext;
$this->stack = $stack;
@ -267,6 +272,7 @@ class TreeBuilder {
$this->errorHandler = $errorHandler;
$this->activeFormattingElementsList = new ActiveFormattingElementsList;
$this->tokenList = $tokenList;
$this->processingInstructions = $config->processingInstructions ?? false;
# Parsing HTML fragments
if ($this->fragmentContext) {
@ -3954,8 +3960,19 @@ class TreeBuilder {
# 3. Create a Comment node whose data attribute is set to data and whose node
# document is the same as that of the node in which the adjusted insertion
# location finds itself.
// DEVIATION: This can optionally be a processing instruction
if (
$token instanceof ProcessingInstructionToken
&& $this->processingInstructions
// see https://www.w3.org/TR/xml/#d0e1188
&& preg_match('/^\?(?![Xx][Mm][Ll](?:[ \x{9}\x{D}\x{A}]|$))([:A-Z_a-z\x{C0}-\x{D6}\x{D8}-\x{F6}\x{F8}-\x{2FF}\x{370}-\x{37D}\x{37F}-\x{1FFF}\x{200C}-\x{200D}\x{2070}-\x{218F}\x{2C00}-\x{2FEF}\x{3001}-\x{D7FF}\x{F900}-\x{FDCF}\x{FDF0}-\x{FFFD}\x{10000}-\x{EFFFF}][:A-Z_a-z\x{C0}-\x{D6}\x{D8}-\x{F6}\x{F8}-\x{2FF}\x{370}-\x{37D}\x{37F}-\x{1FFF}\x{200C}-\x{200D}\x{2070}-\x{218F}\x{2C00}-\x{2FEF}\x{3001}-\x{D7FF}\x{F900}-\x{FDCF}\x{FDF0}-\x{FFFD}\x{10000}-\x{EFFFF}-\.0-9\x{B7}\x{0300}-\x{036F}\x{203F}-\x{2040}]*)(?:[ \x{9}\x{D}\x{A}](.*))?/suD', $token->data, $m)
) {
$node = $this->DOM->createProcessingInstruction($m[1], $m[2]);
} else {
$node = $this->DOM->createComment($token->data);
}
# 4. Insert the newly created node at the adjusted insertion location.
$position->appendChild($this->DOM->createComment($token->data));
$position->appendChild($node);
}
public function insertStartTagToken(StartTagToken $token, \DOMNode $intendedParent = null, string $namespace = null): \DOMElement {

10
tests/cases/TestTokenizer.php

@ -16,6 +16,7 @@ use MensBeam\HTML\Parser\CommentToken;
use MensBeam\HTML\Parser\DOCTYPEToken;
use MensBeam\HTML\Parser\EndTagToken;
use MensBeam\HTML\Parser\NullCharacterToken;
use MensBeam\HTML\Parser\ProcessingInstructionToken;
use MensBeam\HTML\Parser\StartTagToken;
use MensBeam\HTML\Parser\TokenAttr;
use MensBeam\HTML\Parser\WhitespaceToken;
@ -66,7 +67,7 @@ class TestTokenizer extends \PHPUnit\Framework\TestCase {
$actual[] = $t;
}
} finally {
$actual = $this->combineCharacterTokens($actual);
$actual = $this->normalizeTokens($actual);
$errors = $this->formatErrors($errorHandler->errors);
$this->assertEquals($expected, $actual, $tokenizer->debugLog);
$this->assertEquals($expErrors, $errors, $tokenizer->debugLog);
@ -98,7 +99,8 @@ class TestTokenizer extends \PHPUnit\Framework\TestCase {
return $str;
}
protected function combineCharacterTokens(array $tokens) : array {
/** Combines character tokens and converts processing instruction tokens to comment tokens */
protected function normalizeTokens(array $tokens) : array {
$out = [];
$pending = null;
foreach ($tokens as $t) {
@ -116,6 +118,10 @@ class TestTokenizer extends \PHPUnit\Framework\TestCase {
$out[] = $pending;
$pending = null;
}
if ($t instanceof ProcessingInstructionToken) {
// We optionally support retaining processing instructions, but the standard tokenizer tests make no distinction
$t = new CommentToken($t->data);
}
$out[] = $t;
}
}

32
tests/cases/TestTreeConstructor.php

@ -7,6 +7,7 @@ declare(strict_types=1);
namespace MensBeam\HTML\TestCase;
use MensBeam\HTML\Parser;
use MensBeam\HTML\Parser\Config;
use MensBeam\HTML\Parser\Data;
use MensBeam\HTML\Parser\LoopException;
use MensBeam\HTML\Parser\NotImplementedException;
@ -33,6 +34,29 @@ class TestTreeConstructor extends \PHPUnit\Framework\TestCase {
/**
 * Runs one standard tree-construction case without supplying a Config object,
 * so the tree builder falls back to its own defaults.
 *
 * @dataProvider provideStandardTreeTests
 */
public function testStandardTreeTests(string $data, array $exp, array $errors, $fragment): void {
    // No configuration: processing instructions are handled as comments
    $config = null;
    $this->runTreeTest($data, $exp, $errors, $fragment, $config);
}
/**
 * Supplies the standard tree-construction cases: every .dat file of the
 * html5lib test suite, followed by the project's own "mensbeam*.dat" files.
 */
public function provideStandardTreeTests(): iterable {
    // Each glob yields full path names and skips the "." and ".." entries
    $flags = \FilesystemIterator::SKIP_DOTS | \FilesystemIterator::CURRENT_AS_PATHNAME;
    $patterns = [
        "tests/html5lib-tests/tree-construction/*.dat",
        "tests/cases/tree-construction/mensbeam*.dat",
    ];
    $sources = new \AppendIterator();
    foreach ($patterns as $pattern) {
        $sources->append(new \GlobIterator(\MensBeam\HTML\Parser\BASE.$pattern, $flags));
    }
    return $this->parseTreeTest($sources);
}
/**
 * Runs one tree-construction case with retention of processing instructions
 * switched on in the parser configuration.
 *
 * @dataProvider provideProcessingInstructionTreeTests
 */
public function testProcessingInstructionTreeTests(string $data, array $exp, array $errors, $fragment): void {
    // Opt in to the non-standard behaviour under test
    $piConfig = new Config;
    $piConfig->processingInstructions = true;
    $this->runTreeTest($data, $exp, $errors, $fragment, $piConfig);
}
/**
 * Supplies the tree-construction cases found in the project's "pi*.dat"
 * files, which are meant to be run with processing instructions retained.
 */
public function provideProcessingInstructionTreeTests(): iterable {
    $pattern = \MensBeam\HTML\Parser\BASE."tests/cases/tree-construction/pi*.dat";
    // Yield full path names and skip the "." and ".." entries
    $flags = \FilesystemIterator::SKIP_DOTS | \FilesystemIterator::CURRENT_AS_PATHNAME;
    return $this->parseTreeTest(new \GlobIterator($pattern, $flags));
}
protected function runTreeTest(string $data, array $exp, array $errors, ?string $fragment, ?Config $config): void {
// certain tests need to be patched to ignore unavoidable limitations of PHP's DOM
[$exp, $errors, $patched, $skip] = $this->patchTest($data, $fragment, $errors, $exp);
if (strlen($skip)) {
@ -62,7 +86,7 @@ class TestTreeConstructor extends \PHPUnit\Framework\TestCase {
$stack = new OpenElementsStack($fragmentContext);
$tokenizer = new Tokenizer($decoder, $stack, $errorHandler);
$tokenList = $tokenizer->tokenize();
$treeBuilder = new TreeBuilder($doc, $decoder, $tokenizer, $tokenList, $errorHandler, $stack, new TemplateInsertionModesStack, $fragmentContext);
$treeBuilder = new TreeBuilder($doc, $decoder, $tokenizer, $tokenList, $errorHandler, $stack, new TemplateInsertionModesStack, $fragmentContext, 0, $config);
// run the tree builder
try {
$treeBuilder->constructTree();
@ -376,11 +400,7 @@ class TestTreeConstructor extends \PHPUnit\Framework\TestCase {
}
}
public function provideStandardTreeTests(): iterable {
$blacklist = [];
$files = new \AppendIterator();
$files->append(new \GlobIterator(\MensBeam\HTML\Parser\BASE."tests/html5lib-tests/tree-construction/*.dat", \FilesystemIterator::SKIP_DOTS | \FilesystemIterator::CURRENT_AS_PATHNAME));
$files->append(new \GlobIterator(\MensBeam\HTML\Parser\BASE."tests/cases/tree-construction/*.dat", \FilesystemIterator::SKIP_DOTS | \FilesystemIterator::CURRENT_AS_PATHNAME));
protected function parseTreeTest(iterable $files, array $blacklist = []): iterable {
foreach ($files as $file) {
$index = 0;
$l = 0;

21
tests/cases/tree-construction/pi01.dat

@ -0,0 +1,21 @@
#data
<!DOCTYPE html><html><?test moop>
#errors
(1,23): unexpected-question-mark-instead-of-tag-name
#document
| <!DOCTYPE html>
| <html>
| <?test moop>
| <head>
| <body>
#data
<!DOCTYPE html><html><?xml oops>
#errors
(1,23): unexpected-question-mark-instead-of-tag-name
#document
| <!DOCTYPE html>
| <html>
| <!-- ?xml oops -->
| <head>
| <body>
Loading…
Cancel
Save