Browse Source

Fix most parse error counts

More remain, though most have been addressed
ns
J. King 3 years ago
parent
commit
3f23040e1d
  1. 6
      lib/ParseError.php
  2. 51
      lib/TreeBuilder.php
  3. 120
      tests/cases/TestTreeConstructor.php

6
lib/ParseError.php

@ -71,6 +71,9 @@ class ParseError {
const UNEXPECTED_EOF = 212;
const UNEXPECTED_PARENT = 213;
const UNEXPECTED_ATTRIBUTE_VALUE = 214;
const FOSTERED_START_TAG = 215;
const FOSTERED_END_TAG = 216;
const FOSTERED_CHAR = 217;
const MESSAGES = [
self::EXPECTED_DOCTYPE_BUT_GOT_START_TAG => 'Expected DOCTYPE but got start tag <%s>',
@ -87,6 +90,9 @@ class ParseError {
self::UNEXPECTED_EOF => 'Unexpected end of file',
self::UNEXPECTED_PARENT => 'Start tag <%s> not valid in parent <%s>',
self::UNEXPECTED_ATTRIBUTE_VALUE => 'Unexpected value in attribute "%s"',
self::FOSTERED_START_TAG => 'Start tag <%s> moved to before table',
self::FOSTERED_END_TAG => 'End tag </%s> moved to before table',
self::FOSTERED_CHAR => 'Character moved to before table',
self::ENCODING_ERROR => 'Corrupt encoding near byte position %s',
self::UNEXPECTED_NULL_CHARACTER => 'Unexpected null character',

51
lib/TreeBuilder.php

@ -1214,6 +1214,12 @@ class TreeBuilder {
$nextToken->data = substr($nextToken->data, 1);
}
}
// FIXME: Don't process the next token if it's an EOFToken;
// This hack should be removed when the tree builder is
// refactored into a single function call
if ($nextToken instanceof EOFToken) {
return true;
}
// Process the next token
$token = $nextToken;
goto ProcessToken;
@ -1516,6 +1522,12 @@ class TreeBuilder {
$this->framesetOk = false;
# Switch the insertion mode to "text".
$insertionMode = $this->insertionMode = self::TEXT_MODE;
// FIXME: Don't process the next token if it's an EOFToken;
// This hack should be removed when the tree builder is
// refactored into a single function call
if ($nextToken instanceof EOFToken) {
return true;
}
// Process the next token
$token = $nextToken;
goto ProcessToken;
@ -1886,7 +1898,7 @@ class TreeBuilder {
# in the next entry; i.e. act as if this was a "br" start tag token with
# no attributes, rather than the end tag token that it actually is.
$this->error(ParseError::UNEXPECTED_END_TAG, $token->name);
return $this->parseTokenInHTMLContent(new StartTagToken("br"));
return $this->parseTokenInHTMLContent(new StartTagToken("br"), $insertionMode);
}
# Any other end tag
else {
@ -2174,7 +2186,13 @@ class TreeBuilder {
# Parse error. Enable foster parenting, process the token
# using the rules for the "in body" insertion mode, and
# then disable foster parenting.
// TODO: parse error
if ($token instanceof CharacterToken) {
$this->error(ParseError::FOSTERED_CHAR);
} elseif ($token instanceof StartTagToken) {
$this->error(ParseError::FOSTERED_START_TAG, $token->name);
} elseif ($token instanceof EndTagToken) {
$this->error(ParseError::FOSTERED_END_TAG, $token->name);
}
$this->fosterParenting = true;
$result = $this->parseTokenInHTMLContent($token, self::IN_BODY_MODE);
$this->fosterParenting = false;
@ -2600,7 +2618,7 @@ class TreeBuilder {
# Now, if the current node is not an HTML element with
# the same tag name as the token, then this is
# a parse error.
if (!$this->stack->hasElementInTableScope($token->name)) {
if ($this->stack->currentNodeName !== $token->name || $this->stack->currentNodeNamespace !== null) {
$this->error(ParseError::UNEXPECTED_END_TAG, $token->name);
}
# Pop elements from the stack of open elements stack
@ -3723,30 +3741,33 @@ class TreeBuilder {
elseif ($token instanceof EndTagToken) {
# Run these steps:
#
# 1. Initialize node to be the current node (the bottommost node of the stack).
# Initialize node to be the current node (the bottommost node of the stack).
// We do this below in the loop
# 2. If node is not an element with the same tag name as the token, then this is
# a parse error.
if ($this->stack->currentNodeName !== $token->name) {
$this->error(ParseError::UNEXPECTED_END_TAG, $token->name);
}
# 3. Loop: If node is the topmost element in the stack of open elements, then return. (fragment case)
# If node's tag name, converted to ASCII lowercase, is not the
# same as the tag name of the token, then this is a parse error.
// DEVIATION: We only generate the parse error if we don't reach
// "Otherwise" below, to avoid reporting the parse error a second
// time in HTML content parsing
# Loop: If node is the topmost element in the stack of open elements, then return. (fragment case)
$pos = count($this->stack) - 1;
while ($pos > 0 && ($node = $this->stack[$pos])->namespaceURI !== null) {
# If node's tag name, converted to ASCII lowercase, is the same as the
# tag name of the token, pop elements from the stack of open elements until node
# has been popped from the stack, and then abort these steps.
if (strtolower($node->nodeName) === $token->name) {
if (strtolower($this->stack->currentNodeName) !== $token->name) {
$this->error(ParseError::UNEXPECTED_END_TAG, $token->name);
}
$this->stack->popUntilSame($node);
return true;
}
# 4. Set node to the previous entry in the stack of open elements.
# Set node to the previous entry in the stack of open elements.
$pos--;
# 5. If node is not an element in the HTML namespace, return to the step labeled
# loop.
# If node is not an element in the HTML namespace, return to the step labeled
# loop.
// See loop condition above
}
# 6. Otherwise, process the token according to the rules given in the section
# Otherwise, process the token according to the rules given in the section
# corresponding to the current insertion mode in HTML content.
return $this->parseTokenInHTMLContent($token, $this->insertionMode);
}
@ -4182,7 +4203,7 @@ class TreeBuilder {
$this->stack->generateImpliedEndTags();
# If the current node is not now a td element or a th element,
# then this is a parse error.
if (!in_array($this->stack->currntNodeName, ["td", "th"])) {
if (!in_array($this->stack->currentNodeName, ["td", "th"])) {
$this->error(ParseError::UNEXPECTED_END_TAG, $token->name);
}
# Pop elements from the stack of open elements stack until a td

120
tests/cases/TestTreeConstructor.php

@ -30,7 +30,7 @@ class TestTreeConstructor extends \PHPUnit\Framework\TestCase {
/** @dataProvider provideStandardTreeTests */
public function testStandardTreeTests(string $data, array $exp, array $errors, $fragment): void {
// certain tests need to be patched to ignore unavoidable limitations of PHP's DOM
[$exp, $patched, $skip] = $this->patchTest($data, $fragment, $exp);
[$exp, $errors, $patched, $skip] = $this->patchTest($data, $fragment, $errors, $exp);
if (strlen($skip)) {
$this->markTestSkipped($skip);
} elseif ($patched) {
@ -43,10 +43,10 @@ class TestTreeConstructor extends \PHPUnit\Framework\TestCase {
return is_int($v);
})));
// create a stub error handler which collects parse errors
$errors = [];
$actualErrors = [];
$errorHandler = $this->createStub(ParseError::class);
$errorHandler->method("emit")->willReturnCallback(function($file, $line, $col, $code) use (&$errors, $errorMap) {
$errors[] = ['code' => $errorMap[$code], 'line' => $line, 'col' => $col];
$errorHandler->method("emit")->willReturnCallback(function($file, $line, $col, $code) use (&$actualErrors, $errorMap) {
$actualErrors[] = ['code' => $errorMap[$code], 'line' => $line, 'col' => $col];
return true;
});
// initialize the output document
@ -89,10 +89,13 @@ class TestTreeConstructor extends \PHPUnit\Framework\TestCase {
}
$act = $this->balanceTree($this->serializeTree($doc, (bool) $fragmentContext), $exp);
$this->assertEquals($exp, $act, $treeBuilder->debugLog);
// TODO: evaluate errors
if ($errors !== false) {
// If $errors is false, the test does not include errors when there are in fact errors
$this->assertCount(sizeof($errors), $actualErrors, var_export($actualErrors, true));
}
}
protected function patchTest(string $data, $fragment, array $exp): array {
protected function patchTest(string $data, $fragment, array $errors, array $exp): array {
$patched = false;
$skip = "";
// comments outside the root element are silently dropped by the PHP DOM
@ -104,7 +107,110 @@ class TestTreeConstructor extends \PHPUnit\Framework\TestCase {
}
}
}
return [$exp, $patched, $skip];
// some tests don't document errors when they should
if (!$errors && in_array($data, [
// template.dat
'<template><a><table><a>',
// tests6.dat
'<!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01//EN"><html></html>',
// tests8.dat
'<table><li><li></table>',
// webkit01.dat
'<table><tr><td><svg><desc><td></desc><circle>',
// webkit02.dat
'<legend>test</legend>',
'<table><input>',
'<b><em><foo><foo><aside></b>',
'<b><em><foo><foo><aside></b></em>',
'<b><em><foo><foo><foo><aside></b>',
'<b><em><foo><foo><foo><aside></b></em>',
'<b><em><foo><foo><foo><foo><foo><foo><foo><foo><foo><foo><aside></b></em>',
'<b><em><foo><foob><foob><foob><foob><fooc><fooc><fooc><fooc><food><aside></b></em>',
'<option><XH<optgroup></optgroup>',
'<svg><foreignObject><div>foo</div><plaintext></foreignObject></svg><div>bar</div>',
'<svg><foreignObject></foreignObject><title></svg>foo',
'</foreignObject><plaintext><div>foo</div>',
])) {
$errors = false;
}
if ($errors) {
// some "old" errors are made redundant by "new" errors
$obsoleteSymbolList = implode("|", [
"illegal-codepoint-for-numeric-entity",
"eof-in-attribute-value-double-quote",
"non-void-element-with-trailing-solidus",
"invalid-character-in-attribute-name",
"attributes-in-end-tag",
"expected-tag-name",
"unexpected-character-after-solidus-in-tag",
"expected-closing-tag-but-got-char",
"eof-in-tag-name",
"need-space-after-doctype",
"expected-doctype-name-but-got-right-bracket",
"expected-dashes-or-doctype",
"expected-space-or-right-bracket-in-doctype",
"unexpected-char-in-comment",
"eof-in-comment-double-dash",
"expected-named-entity",
"named-entity-without-semicolon",
"numeric-entity-without-semicolon",
"expected-numeric-entity",
"eof-in-attribute-name",
"unexpected-eof-in-text-mode",
"unexpected-EOF-after-solidus-in-tag",
"expected-attribute-name-but-got-eof",
"eof-in-script-in-script",
"expected-script-data-but-got-eof",
"unexpected-EOF-in-text-mode",
"expected-tag-name-but-got-question-mark",
"incorrect-comment",
]);
for ($a = 0, $stop = sizeof($errors); $a < $stop; $a++) {
if (preg_match("/^\(\d+,\d+\):? ($obsoleteSymbolList)$/", $errors[$a])) {
// these errors are redundant with "new" errors
unset($errors[$a]);
}
}
$errors = array_values($errors);
// some other errors appear to document implementation details rather than what the specificatioon dictates
for ($a = 0, $stop = sizeof($errors); $a < $stop; $a++) {
if (
preg_match("/^\(\d+,\d+\): unexpected-end-tag-in-special-element$/", $errors[$a])
|| preg_match('/^\d+: Unclosed element “[^”]+”\.$/u', $errors[$a])
|| ($data === '<!---x' && $errors[$a] === "(1:7) eof-in-comment")
|| ($data === "<!DOCTYPE html><body><table><caption><math><mi>foo</mi><mi>bar</mi>baz</table><p>quux" && $errors[$a] === "(1,78) expected-one-end-tag-but-got-another")
|| ($data === "<!DOCTYPE html><!-- XXX - XXX" && $errors[$a] === "(1,29): eof-in-comment")
|| ($data === "<!DOCTYPE html><!-- X" && $errors[$a] === "(1,21): eof-in-comment")
|| ($data === "<!doctype html><math></html>" && $errors[$a] === "(1,28): expected-one-end-tag-but-got-another")
|| ($data === "</" && $errors[$a] === "(1,2): expected-closing-tag-but-got-eof")
) {
// these errors seems to simply be redundant
unset($errors[$a]);
}
}
$errors = array_values($errors);
// other errors are spurious, or are for runs of character tokens
for ($a = 0, $stop = sizeof($errors); $a < $stop; $a++) {
if (preg_match("/^\((\d+),(\d+)\):? (foster-parenting-character(?:-in-table)?|unexpected-character-in-colgroup|unexpected-char-after-frameset|unexpected-char-in-frameset|expected-eof-but-got-char)$/", $errors[$a], $m1) && preg_match("/^\((\d+),(\d+)\):? $m1[3]$/", $errors[$a + 1] ?? "", $m2)) {
// if the next error is also a character error at the next or same character position, this implies a run of characters where we only have one token
// technically we should be reporting each one, so this is properly a FIXME
if ($m1[1] == $m2[1] && ($m1[2] + 1 == $m2[2] || $m1[2] == $m2[2])) {
unset($errors[$a]);
$patched = true;
}
} elseif (preg_match("/^foster-parenting text /", $errors[$a]) && preg_match("/^foster-parenting text /", $errors[$a + 1] ?? "")) {
// template tests have a different format of error message
unset($errors[$a]);
$patched = true;
} elseif (preg_match("/^\((\d+,\d+)\):? unexpected-end-tag$/", $errors[$a], $m) && preg_match("/^\($m[1]\):? (unexpected-end-tag|end-tag-too-early|expected-one-end-tag-but-got-another)$/", $errors[$a + 1] ?? "")) {
// unexpected-end-tag errors should only be reported once for a given tag
unset($errors[$a]);
}
}
$errors = array_values($errors);
}
return [$exp, $errors, $patched, $skip];
}
protected function balanceTree(array $act, array $exp): array {

Loading…
Cancel
Save