From aeb08b5f5dab35deadb2b954c36b2e6805a2a617 Mon Sep 17 00:00:00 2001 From: "J. King" Date: Wed, 17 Feb 2021 14:39:24 -0500 Subject: [PATCH] Fix remaining failures Fragment-case tests still need to be harnessed to test all functionality --- lib/DOM/Element.php | 4 +- lib/OpenElementsStack.php | 22 +++++++--- lib/Parser.php | 8 ++++ lib/Stack.php | 30 +++++++------ lib/Tokenizer.php | 2 +- lib/TreeBuilder.php | 65 +++++++++++++---------------- tests/cases/TestTreeConstructor.php | 27 +++++++----- 7 files changed, 89 insertions(+), 69 deletions(-) diff --git a/lib/DOM/Element.php b/lib/DOM/Element.php index f3a419b..a79cdcf 100644 --- a/lib/DOM/Element.php +++ b/lib/DOM/Element.php @@ -10,7 +10,7 @@ class Element extends \DOMElement { // Used for template elements public $content = null; - protected $selfClosingElements = ['area', 'base', 'basefont', 'bgsound', 'br', 'col', 'embed', 'frame', 'hr', 'img', 'input', 'link', 'meta', 'param', 'source', 'track', 'wbr']; + protected const SELF_CLOSING_ELEMENTS = ['area', 'base', 'basefont', 'bgsound', 'br', 'col', 'embed', 'frame', 'hr', 'img', 'input', 'link', 'meta', 'param', 'source', 'track', 'wbr']; public function __construct(string $name, string $value = '', string $namespaceURI = '') { parent::__construct($name, $value, $namespaceURI); @@ -122,7 +122,7 @@ class Element extends \DOMElement { # If current node is an area, base, basefont, bgsound, br, col, embed, frame, # hr, img, input, link, meta, param, source, track or wbr element, then continue # on to the next child node at this point. - if (in_array($tagName, $this->selfClosingElements)) { + if (in_array($tagName, self::SELF_CLOSING_ELEMENTS)) { return $s; } diff --git a/lib/OpenElementsStack.php b/lib/OpenElementsStack.php index 617d618..aceb105 100644 --- a/lib/OpenElementsStack.php +++ b/lib/OpenElementsStack.php @@ -2,7 +2,7 @@ declare(strict_types=1); namespace dW\HTML5; -class OpenElementsStack extends \splStack { +class OpenElementsStack extends Stack { protected const IMPLIED_END_TAGS = [ 'dd' => true, 'dt' => true, @@ -85,8 +85,6 @@ class OpenElementsStack extends \splStack { ], ]; - - protected $fragmentCase; protected $fragmentContext; @@ -132,9 +130,9 @@ class OpenElementsStack extends \splStack { return -1; } - public function findSame(\DOMElement $node): int { + public function findSame(\DOMElement $target): int { foreach ($this as $k => $node) { - if ($node->isSameNode($node)) { + if ($node->isSameNode($target)) { return $k; } } @@ -242,6 +240,7 @@ class OpenElementsStack extends \splStack { # if the top of the stack — an html element — is reached.) } } + assert(false, new \Exception((string) $this)); } public function __get($property) { @@ -259,7 +258,7 @@ class OpenElementsStack extends \splStack { $adjustedCurrentNode = $this->__get('adjustedCurrentNode'); return (!is_null($adjustedCurrentNode)) ? $adjustedCurrentNode->namespaceURI: null; case 'currentNode': - return $this->isEmpty() ? null : $this->top(); + return $this->top(); case 'currentNodeName': $currentNode = $this->__get('currentNode'); return ($currentNode && $currentNode->nodeType) ? $currentNode->nodeName : null; @@ -270,4 +269,15 @@ class OpenElementsStack extends \splStack { return null; } } + + public function __toString(): string { + $out = []; + foreach ($this as $node) { + $ns = $node->namespaceURI ?? Parser::HTML_NAMESPACE; + $prefix = Parser::NAMESPACE_MAP[$ns] ?? "?"; + $prefix .= $prefix ? " " : ""; + $out[] = $prefix.$node->nodeName; + } + return implode(" < ", $out); + } } diff --git a/lib/Parser.php b/lib/Parser.php index cca0f47..fe111d5 100644 --- a/lib/Parser.php +++ b/lib/Parser.php @@ -47,6 +47,14 @@ class Parser { const XML_NAMESPACE = 'http://www.w3.org/XML/1998/namespace'; const XMLNS_NAMESPACE = 'http://www.w3.org/2000/xmlns/'; + const NAMESPACE_MAP = [ + self::HTML_NAMESPACE => "", + self::MATHML_NAMESPACE => "math", + self::SVG_NAMESPACE => "svg", + self::XLINK_NAMESPACE => "xlink", + self::XML_NAMESPACE => "xml", + self::XMLNS_NAMESPACE => "xmlns", + ]; // Protected construct used for creating an instance to access properties which must // be reset on every parse diff --git a/lib/Stack.php b/lib/Stack.php index 35e8895..abd4e58 100644 --- a/lib/Stack.php +++ b/lib/Stack.php @@ -2,7 +2,7 @@ declare(strict_types=1); namespace dW\HTML5; -abstract class Stack implements \ArrayAccess { +abstract class Stack implements \ArrayAccess, \Countable, \IteratorAggregate { protected $_storage = []; protected $fragmentCase; protected $fragmentContext; @@ -23,27 +23,33 @@ abstract class Stack implements \ArrayAccess { public function offsetUnset($offset) { assert($offset >= 0 && $offset < count($this->_storage), new Exception(Exception::STACK_INVALID_INDEX, $offset)); - - unset($this->_storage[$offset]); - // Reindex the array. - $this->_storage = array_values($this->_storage); + array_splice($this->_storage, $offset, 1, []); } public function offsetGet($offset) { assert($offset >= 0 && $offset < count($this->_storage), new Exception(Exception::STACK_INVALID_INDEX, $offset)); - return $this->_storage[$offset]; } + + public function count(): int { + return count($this->_storage); + } + + public function getIterator(): \Traversable { + for ($a = count($this->_storage) - 1; $a > -1; $a--) { + yield $a => $this->_storage[$a]; + } + } public function pop() { return array_pop($this->_storage); } - public function __get($property) { - switch ($property) { - case 'length': return count($this->_storage); - break; - default: return null; - } + public function isEmpty(): bool { + return !$this->_storage; + } + + public function top() { + return ($c = count($this->_storage)) > 0 ? $this->_storage[$c - 1] : null; } } diff --git a/lib/Tokenizer.php b/lib/Tokenizer.php index 43fa6bf..b2e0b80 100644 --- a/lib/Tokenizer.php +++ b/lib/Tokenizer.php @@ -2097,7 +2097,7 @@ class Tokenizer { # Create a comment token whose data is the "[CDATA[" string. # Switch to the bogus comment state. $this->data->consume(7); - if ($this->stack->adjustedCurrentNode && $this->stack->adjustedCurrentNode->namespaceURI !== Parser::HTML_NAMESPACE) { + if ($this->stack->adjustedCurrentNode && ($this->stack->adjustedCurrentNode->namespaceURI ?? Parser::HTML_NAMESPACE) !== Parser::HTML_NAMESPACE) { $this->state = self::CDATA_SECTION_STATE; } else { $this->error(ParseError::CDATA_IN_HTML_CONTENT); diff --git a/lib/TreeBuilder.php b/lib/TreeBuilder.php index 85ed3df..ac2ef72 100644 --- a/lib/TreeBuilder.php +++ b/lib/TreeBuilder.php @@ -209,7 +209,7 @@ class TreeBuilder { Parser::SVG_NAMESPACE => ['foreignObject', 'desc', 'title'], ]; - public function __construct(Document $dom, $formElement, bool $fragmentCase = false, $fragmentContext = null, OpenElementsStack $stack, TemplateInsertionModesStack $templateInsertionModes, Tokenizer $tokenizer, ParseError $errorHandler, Data $data) { + public function __construct(Document $dom, Data $data, Tokenizer $tokenizer, ParseError $errorHandler, OpenElementsStack $stack, TemplateInsertionModesStack $templateInsertionModes, ?\DOMElement $formElement = null, bool $fragmentCase = false, $fragmentContext = null) { // If the form element isn't an instance of DOMElement that has a node name of // "form" or null then there's a problem. if (!is_null($formElement) && !($formElement instanceof \DOMElement && $formElement->nodeName === 'form')) { @@ -250,7 +250,7 @@ class TreeBuilder { // Loop used for reprocessing. $iterations = 0; while (true) { - assert($iterations++ < 50, new LoopException("Probable infinite loop detected in HTML content handling")); + assert($iterations++ < 50, new LoopException("Probable infinite loop detected in HTML content handling (outer reprocessing)")); $adjustedCurrentNode = $this->stack->adjustedCurrentNode; $adjustedCurrentNodeName = $this->stack->adjustedCurrentNodeName; assert(!$adjustedCurrentNode || $adjustedCurrentNodeName, new \Exception("The adjusted current node must have a name if not null")); @@ -321,11 +321,11 @@ class TreeBuilder { protected function parseTokenInHTMLContent(Token $token, int $insertionMode = null): bool { $iterations = 0; ProcessToken: - assert($iterations++ < 50, new LoopException("Probable infinite loop detected in HTML content handling")); + assert($iterations++ < 50, new LoopException("Probable infinite loop detected in HTML content handling (inner reprocessing)")); $insertionMode = $insertionMode ?? $this->insertionMode; assert((function() use ($insertionMode) { $mode = self::INSERTION_MODE_NAMES[$insertionMode] ?? $insertionMode; - $this->debugLog .= " Mode: $mode\n"; + $this->debugLog .= " Mode: $mode (".(string) $this->stack.")\n"; return true; })()); @@ -513,7 +513,7 @@ class TreeBuilder { # Create an element for the token in the HTML namespace, with the Document as # the intended parent. Append it to the Document object. Put this element in the # stack of open elements. - $element = $this->insertStartTagToken($token, $this->DOM); + $this->insertStartTagToken($token, $this->DOM); # Switch the insertion mode to "before head". $this->insertionMode = self::BEFORE_HEAD_MODE; @@ -539,7 +539,7 @@ class TreeBuilder { } # The document element can end up being removed from the Document object, e.g., - # by scripts; nothing in particular happens in such cases, content goto ProcessTokens + # by scripts; nothing in particular happens in such cases, content continues # being appended to the nodes as described in the next section. // Good to know. There's no scripting in this implementation, though. } @@ -579,7 +579,7 @@ class TreeBuilder { # An end tag whose tag name is one of: "head", "body", "html", "br" # Act as described in the "anything else" entry below. # Any other end tag - elseif ($token instanceof EndTagToken && $token->name !== 'head' && $token->name !== 'body' && $token->name !== 'html' && $token->name === 'br') { + elseif ($token instanceof EndTagToken && $token->name !== 'head' && $token->name !== 'body' && $token->name !== 'html' && $token->name !== 'br') { # Parse error. Ignore the token. $this->error(ParseError::UNEXPECTED_END_TAG, $token->name); } @@ -1028,7 +1028,7 @@ class TreeBuilder { # A start tag whose tag name is "html" if ($token->name === 'html') { # Parse error. - $this->error(ParseError::UNEXPECTED_START_TAG, 'html'); + $this->error(ParseError::UNEXPECTED_START_TAG); # If there is a template element on the stack of open elements, then ignore the # token. if ($this->stack->find('template') === -1) { @@ -1052,17 +1052,16 @@ class TreeBuilder { # A start tag whose tag name is "body" elseif ($token->name === 'body') { # Parse error. - $this->error(ParseError::UNEXPECTED_START_TAG, 'body'); + $this->error(ParseError::UNEXPECTED_START_TAG); # If the second element on the stack of open elements is not a body element, if # the stack of open elements has only one node on it, or if there is a template # element on the stack of open elements, then ignore the token. (fragment case) - if (!($this->stack[1]->tagName !== 'body' || count($this->stack) === 1 || $this->stack->find('template') !== -1)) { + if (!(count($this->stack) === 1 || $this->stack[1]->nodeName !== 'body' || $this->stack->find('template') > -1)) { # Otherwise, set the frameset-ok flag to "not ok"; then, for each attribute on # the token, check to see if the attribute is already present on the body # element (the second element) on the stack of open elements, and if it is not, # add the attribute and its corresponding value to that element. $this->framesetOk = false; - $body = $this->stack[1]; foreach ($token->attributes as $a) { if (!$body->hasAttribute($a->name)) { @@ -1168,9 +1167,9 @@ class TreeBuilder { # A start tag whose tag name is "form" elseif ($token->name === 'form') { # If the form element pointer is not null, and there is no template element on - # the stack of open elements, then this is a parse error; ignore the token. - $templateInStack = ($this->stack->find('template') !== -1); - if (!is_null($this->formElement) && !$templateInStack) { + # the stack of open elements, then this is a parse error; ignore the token. + $templateInStack = ($this->stack->find('template') > -1); + if ($this->formElement && !$templateInStack) { $this->error(ParseError::UNEXPECTED_START_TAG, $token->name); } # Otherwise: @@ -1185,7 +1184,7 @@ class TreeBuilder { # the stack of open elements, set the form element pointer to point to the # element created. $form = $this->insertStartTagToken($token); - if ($templateInStack) { + if (!$templateInStack) { $this->formElement = $form; } } @@ -1197,10 +1196,8 @@ class TreeBuilder { # 2. Initialize node to be the current node (the bottommost node of the stack). # 3. Loop: If node is an li element, then run these substeps: - for ($i = count($this->stack) - 1; $i >= 0; $i--) { - $node = $this->stack[$i]; + foreach ($this->stack as $node) { $nodeName = $node->nodeName; - if ($nodeName === 'li') { # 1. Generate implied end tags, except for li elements. $this->stack->generateImpliedEndTags("li"); @@ -1244,8 +1241,7 @@ class TreeBuilder { $this->framesetOk = false; # 2. Initialize node to be the current node (the bottommost node of the stack). - for ($i = count($this->stack) - 1; $i >= 0; $i--) { - $node = $this->stack[$i]; + foreach ($this->stack as $node) { $nodeName = $node->nodeName; // Combining these two sets of instructions as they're identical except for the @@ -1347,8 +1343,7 @@ class TreeBuilder { # An end tag whose tag name is "template" if ($token->name === 'template') { # Process the token using the rules for the "in head" insertion mode. - $insertionMode = self::IN_HEAD_MODE; - goto ProcessToken; + return $this->parseTokenInHTMLContent($token, self::IN_HEAD_MODE); } # An end tag whose tag name is "body" # An end tag whose tag name is "html" @@ -1369,7 +1364,7 @@ class TreeBuilder { $this->error(ParseError::UNEXPECTED_END_TAG); } # Switch the insertion mode to "after body". - $this->insertionMode = self::AFTER_BODY_MODE; + $insertionMode = $this->insertionMode = self::AFTER_BODY_MODE; // The only thing different between body and html here is that when processing // an html end tag the token is reprocessed. if ($token->name === 'html') { @@ -1410,14 +1405,14 @@ class TreeBuilder { # If there is no template element on the stack of open elements, then run these # substeps: if ($this->stack->find('template') === -1) { - # 1. Let node be the element that the form element pointer is set to, or null if it - # is not set to an element. + # 1. Let node be the element that the form element pointer is set to, + # or null if it is not set to an element. $node = $this->formElement; # 2. Set the form element pointer to null. $this->formElement = null; # 3. If node is null or if the stack of open elements does not have node in # scope, then this is a parse error; return and ignore the token. - if (is_null($node) || !$this->stack->hasElementInScope($node)) { + if (!$node || !$this->stack->hasElementInScope($node)) { $this->error(ParseError::UNEXPECTED_END_TAG, $token->name); return true; } @@ -1460,8 +1455,7 @@ class TreeBuilder { elseif ($token instanceof EOFToken) { # If the stack of template insertion modes is not empty, then process the token using the rules for the "in template" insertion mode. if (count($this->templateInsertionModes) !== 0) { - $insertionMode = self::IN_TEMPLATE_MODE; - goto ProcessToken; + return $this->parseTokenInHTMLContent($token, self::IN_TEMPLATE_MODE); } # Otherwise, follow these steps: @@ -1475,7 +1469,7 @@ class TreeBuilder { } # 2. Stop parsing. - // Abort! + return true; } } // IMPLEMENTATION PENDING @@ -1716,8 +1710,8 @@ class TreeBuilder { # 1. If there was an override target specified, then let target be the override # target. - $target = (!is_null($overrideTarget)) ? $overrideTarget : $this->stack->currentNode; - + $target = $overrideTarget ?? $this->stack->currentNode; + assert(isset($target), new \Exception("Open elements stack is empty")); # 2. Determine the adjusted insertion location using the first matching steps # from the following list: If foster parenting is enabled and target is a table, # tbody, tfoot, thead, or tr element @@ -1869,10 +1863,7 @@ class TreeBuilder { } public function insertStartTagToken(StartTagToken $token, \DOMNode $intendedParent = null, string $namespace = null): Element { - if (!is_null($namespace)) { - $namespace = $token->namespace; - } - + $namespace = $namespace ?? $token->namespace; # When the steps below require the UA to create an element for a token in a # particular given namespace and with a particular intended parent, the UA must # run the following steps: @@ -1901,10 +1892,10 @@ class TreeBuilder { # 8. Append each attribute in the given token to element. foreach ($token->attributes as $a) { - if (!isset($namespace) || $namespace === Parser::HTML_NAMESPACE) { + if (!$a->namespace || $a->namespace === Parser::HTML_NAMESPACE) { $element->setAttribute($a->name, $a->value); } else { - $element->setAttributeNS($namespace, $a->name, $a->value); + $element->setAttributeNS($a->namespace, $a->name, $a->value); } } diff --git a/tests/cases/TestTreeConstructor.php b/tests/cases/TestTreeConstructor.php index 70e3147..841dfab 100644 --- a/tests/cases/TestTreeConstructor.php +++ b/tests/cases/TestTreeConstructor.php @@ -18,12 +18,6 @@ use dW\HTML5\TreeBuilder; * @covers \dW\HTML5\TreeBuilder */ class TestTreeConstructor extends \PHPUnit\Framework\TestCase { - protected const NS = [ - Parser::HTML_NAMESPACE => "", - Parser::SVG_NAMESPACE => "svg ", - Parser::MATHML_NAMESPACE => "math ", - ]; - protected $out; protected $depth; @@ -33,8 +27,10 @@ class TestTreeConstructor extends \PHPUnit\Framework\TestCase { $this->markTestIncomplete("Fragment tests still to be implemented"); } // certain tests need to be patched to ignore unavoidable limitations of PHP's DOM - [$exp, $patched] = $this->patchTest($data, $fragment, $exp); - if ($patched) { + [$exp, $patched, $skip] = $this->patchTest($data, $fragment, $exp); + if (strlen($skip)) { + $this->markTestSkipped($skip); + } elseif ($patched) { $this->markAsRisky(); } // convert parse error constants into standard symbols in specification @@ -55,16 +51,20 @@ class TestTreeConstructor extends \PHPUnit\Framework\TestCase { $stack = new OpenElementsStack; $tokenizer = new Tokenizer($decoder, $stack, $errorHandler); $doc = new Document; - $treeBuilder = new TreeBuilder($doc, null, false, null, $stack, new TemplateInsertionModesStack, $tokenizer, $errorHandler, $decoder); + $treeBuilder = new TreeBuilder($doc, $decoder, $tokenizer, $errorHandler, $stack, new TemplateInsertionModesStack); // run the tree builder try { do { $token = $tokenizer->createToken(); $treeBuilder->emitToken($token); } while (!$token instanceof EOFToken); + } catch (\DOMException $e) { + $this->markTestSkipped('Requires implementation of the "Coercing an HTML DOM into an infoset" specification section'); + return; } catch (LoopException $e) { $act = $this->serializeTree($doc); $this->assertEquals($exp, $act, $e->getMessage()."\n".$treeBuilder->debugLog); + throw $e; } catch (NotImplementedException $e) { $this->markTestSkipped($e->getMessage()); return; @@ -76,6 +76,7 @@ class TestTreeConstructor extends \PHPUnit\Framework\TestCase { protected function patchTest(string $data, $fragment, array $exp): array { $patched = false; + $skip = ""; // comments outside the root element are silently dropped by the PHP DOM for ($a = 0; $a < sizeof($exp); $a++) { if (strpos($exp[$a], "|