From 3f23040e1dce8be313599d5b5bf38113cf5a205a Mon Sep 17 00:00:00 2001 From: "J. King" Date: Wed, 10 Mar 2021 22:42:53 -0500 Subject: [PATCH] Fix most parse error counts More remain, though most have been addressed --- lib/ParseError.php | 6 ++ lib/TreeBuilder.php | 51 ++++++++---- tests/cases/TestTreeConstructor.php | 120 ++++++++++++++++++++++++++-- 3 files changed, 155 insertions(+), 22 deletions(-) diff --git a/lib/ParseError.php b/lib/ParseError.php index 1fd314c..d57685a 100644 --- a/lib/ParseError.php +++ b/lib/ParseError.php @@ -71,6 +71,9 @@ class ParseError { const UNEXPECTED_EOF = 212; const UNEXPECTED_PARENT = 213; const UNEXPECTED_ATTRIBUTE_VALUE = 214; + const FOSTERED_START_TAG = 215; + const FOSTERED_END_TAG = 216; + const FOSTERED_CHAR = 217; const MESSAGES = [ self::EXPECTED_DOCTYPE_BUT_GOT_START_TAG => 'Expected DOCTYPE but got start tag <%s>', @@ -87,6 +90,9 @@ class ParseError { self::UNEXPECTED_EOF => 'Unexpected end of file', self::UNEXPECTED_PARENT => 'Start tag <%s> not valid in parent <%s>', self::UNEXPECTED_ATTRIBUTE_VALUE => 'Unexpected value in attribute "%s"', + self::FOSTERED_START_TAG => 'Start tag <%s> moved to before table', + self::FOSTERED_END_TAG => 'End tag moved to before table', + self::FOSTERED_CHAR => 'Character moved to before table', self::ENCODING_ERROR => 'Corrupt encoding near byte position %s', self::UNEXPECTED_NULL_CHARACTER => 'Unexpected null character', diff --git a/lib/TreeBuilder.php b/lib/TreeBuilder.php index 6c76ce2..40e292e 100644 --- a/lib/TreeBuilder.php +++ b/lib/TreeBuilder.php @@ -1214,6 +1214,12 @@ class TreeBuilder { $nextToken->data = substr($nextToken->data, 1); } } + // FIXME: Don't process the next token if it's an EOFToken; + // This hack should be removed when the tree builder is + // refactored into a single function call + if ($nextToken instanceof EOFToken) { + return true; + } // Process the next token $token = $nextToken; goto ProcessToken; @@ -1516,6 +1522,12 @@ class TreeBuilder { $this->framesetOk = false; # Switch the insertion mode to "text". $insertionMode = $this->insertionMode = self::TEXT_MODE; + // FIXME: Don't process the next token if it's an EOFToken; + // This hack should be removed when the tree builder is + // refactored into a single function call + if ($nextToken instanceof EOFToken) { + return true; + } // Process the next token $token = $nextToken; goto ProcessToken; @@ -1886,7 +1898,7 @@ class TreeBuilder { # in the next entry; i.e. act as if this was a "br" start tag token with # no attributes, rather than the end tag token that it actually is. $this->error(ParseError::UNEXPECTED_END_TAG, $token->name); - return $this->parseTokenInHTMLContent(new StartTagToken("br")); + return $this->parseTokenInHTMLContent(new StartTagToken("br"), $insertionMode); } # Any other end tag else { @@ -2174,7 +2186,13 @@ class TreeBuilder { # Parse error. Enable foster parenting, process the token # using the rules for the "in body" insertion mode, and # then disable foster parenting. - // TODO: parse error + if ($token instanceof CharacterToken) { + $this->error(ParseError::FOSTERED_CHAR); + } elseif ($token instanceof StartTagToken) { + $this->error(ParseError::FOSTERED_START_TAG, $token->name); + } elseif ($token instanceof EndTagToken) { + $this->error(ParseError::FOSTERED_END_TAG, $token->name); + } $this->fosterParenting = true; $result = $this->parseTokenInHTMLContent($token, self::IN_BODY_MODE); $this->fosterParenting = false; @@ -2600,7 +2618,7 @@ class TreeBuilder { # Now, if the current node is not an HTML element with # the same tag name as the token, then this is # a parse error. - if (!$this->stack->hasElementInTableScope($token->name)) { + if ($this->stack->currentNodeName !== $token->name || $this->stack->currentNodeNamespace !== null) { $this->error(ParseError::UNEXPECTED_END_TAG, $token->name); } # Pop elements from the stack of open elements stack @@ -3723,30 +3741,33 @@ class TreeBuilder { elseif ($token instanceof EndTagToken) { # Run these steps: # - # 1. Initialize node to be the current node (the bottommost node of the stack). + # Initialize node to be the current node (the bottommost node of the stack). // We do this below in the loop - # 2. If node is not an element with the same tag name as the token, then this is - # a parse error. - if ($this->stack->currentNodeName !== $token->name) { - $this->error(ParseError::UNEXPECTED_END_TAG, $token->name); - } - # 3. Loop: If node is the topmost element in the stack of open elements, then return. (fragment case) + # If node's tag name, converted to ASCII lowercase, is not the + # same as the tag name of the token, then this is a parse error. + // DEVIATION: We only generate the parse error if we don't reach + // "Otherwise" below, to avoid reporting the parse error a second + // time in HTML content parsing + # Loop: If node is the topmost element in the stack of open elements, then return. (fragment case) $pos = count($this->stack) - 1; while ($pos > 0 && ($node = $this->stack[$pos])->namespaceURI !== null) { # If node's tag name, converted to ASCII lowercase, is the same as the # tag name of the token, pop elements from the stack of open elements until node # has been popped from the stack, and then abort these steps. if (strtolower($node->nodeName) === $token->name) { + if (strtolower($this->stack->currentNodeName) !== $token->name) { + $this->error(ParseError::UNEXPECTED_END_TAG, $token->name); + } $this->stack->popUntilSame($node); return true; } - # 4. Set node to the previous entry in the stack of open elements. + # Set node to the previous entry in the stack of open elements. $pos--; - # 5. If node is not an element in the HTML namespace, return to the step labeled - # loop. + # If node is not an element in the HTML namespace, return to the step labeled + # loop. // See loop condition above } - # 6. Otherwise, process the token according to the rules given in the section + # Otherwise, process the token according to the rules given in the section # corresponding to the current insertion mode in HTML content. return $this->parseTokenInHTMLContent($token, $this->insertionMode); } @@ -4182,7 +4203,7 @@ class TreeBuilder { $this->stack->generateImpliedEndTags(); # If the current node is not now a td element or a th element, # then this is a parse error. - if (!in_array($this->stack->currntNodeName, ["td", "th"])) { + if (!in_array($this->stack->currentNodeName, ["td", "th"])) { $this->error(ParseError::UNEXPECTED_END_TAG, $token->name); } # Pop elements from the stack of open elements stack until a td diff --git a/tests/cases/TestTreeConstructor.php b/tests/cases/TestTreeConstructor.php index 1f77ad5..cf582fd 100644 --- a/tests/cases/TestTreeConstructor.php +++ b/tests/cases/TestTreeConstructor.php @@ -30,7 +30,7 @@ class TestTreeConstructor extends \PHPUnit\Framework\TestCase { /** @dataProvider provideStandardTreeTests */ public function testStandardTreeTests(string $data, array $exp, array $errors, $fragment): void { // certain tests need to be patched to ignore unavoidable limitations of PHP's DOM - [$exp, $patched, $skip] = $this->patchTest($data, $fragment, $exp); + [$exp, $errors, $patched, $skip] = $this->patchTest($data, $fragment, $errors, $exp); if (strlen($skip)) { $this->markTestSkipped($skip); } elseif ($patched) { @@ -43,10 +43,10 @@ class TestTreeConstructor extends \PHPUnit\Framework\TestCase { return is_int($v); }))); // create a stub error handler which collects parse errors - $errors = []; + $actualErrors = []; $errorHandler = $this->createStub(ParseError::class); - $errorHandler->method("emit")->willReturnCallback(function($file, $line, $col, $code) use (&$errors, $errorMap) { - $errors[] = ['code' => $errorMap[$code], 'line' => $line, 'col' => $col]; + $errorHandler->method("emit")->willReturnCallback(function($file, $line, $col, $code) use (&$actualErrors, $errorMap) { + $actualErrors[] = ['code' => $errorMap[$code], 'line' => $line, 'col' => $col]; return true; }); // initialize the output document @@ -89,10 +89,13 @@ class TestTreeConstructor extends \PHPUnit\Framework\TestCase { } $act = $this->balanceTree($this->serializeTree($doc, (bool) $fragmentContext), $exp); $this->assertEquals($exp, $act, $treeBuilder->debugLog); - // TODO: evaluate errors + if ($errors !== false) { + // If $errors is false, the test does not include errors when there are in fact errors + $this->assertCount(sizeof($errors), $actualErrors, var_export($actualErrors, true)); + } } - protected function patchTest(string $data, $fragment, array $exp): array { + protected function patchTest(string $data, $fragment, array $errors, array $exp): array { $patched = false; $skip = ""; // comments outside the root element are silently dropped by the PHP DOM @@ -104,7 +107,110 @@ class TestTreeConstructor extends \PHPUnit\Framework\TestCase { } } } - return [$exp, $patched, $skip]; + // some tests don't document errors when they should + if (!$errors && in_array($data, [ + // template.dat + '