Browse Source

Fix a few tree tests

ns
J. King 3 years ago
parent
commit
4e5fd35775
  1. 8
      lib/OpenElementsStack.php
  2. 42
      lib/ParseError.php
  3. 2
      lib/ParseErrorEmitter.php
  4. 66
      lib/TreeBuilder.php

8
lib/OpenElementsStack.php

@ -78,15 +78,9 @@ class OpenElementsStack extends Stack {
$this->_storage = array_values($this->_storage);
}
public function generateImpliedEndTags($exclude = []) {
public function generateImpliedEndTags(array $exclude = []) {
$tags = ['caption', 'colgroup', 'dd', 'dt', 'li', 'optgroup', 'option', 'p', 'rb', 'rp', 'rt', 'rtc', 'tbody', 'td', 'tfoot', 'th', 'thead', 'tr'];
if (is_string($exclude)) {
$exclude = [$exclude];
}
assert(is_array($exclude), new Exception(Exception::STACK_STRING_ARRAY_EXPECTED));
if (count($exclude) > 0) {
$modified = false;
foreach ($exclude as $e) {

42
lib/ParseError.php

@ -5,6 +5,7 @@ namespace dW\HTML5;
class ParseError {
protected $data;
// tokenization parse errors; these have been standardized
const ENCODING_ERROR = 100;
const UNEXPECTED_NULL_CHARACTER = 101;
const UNEXPECTED_QUESTION_MARK_INSTEAD_OF_TAG_NAME = 102;
@ -53,9 +54,19 @@ class ParseError {
const CONTROL_CHARACTER_REFERENCE = 145;
const SURROGATE_IN_INPUT_STREAM = 146;
const NONCHARACTER_IN_INPUT_STREAM = 147;
const CONTROL_CHARACTER_IN_INPUT_STREAM = 148;
const CONTROL_CHARACTER_IN_INPUT_STREAM = 148;
// tree construction parse errors; these have not been standardized, but html5lib's error names are likely to become standard in future
const EXPECTED_DOCTYPE_BUT_GOT_START_TAG = 200;
const EXPECTED_DOCTYPE_BUT_GOT_END_TAG = 201;
const EXPECTED_DOCTYPE_BUT_GOT_CHARS = 202;
const UNEXPECTED_END_TAG = 203; // html5lib also uses 'adoption-agency-1.2' and 'adoption-agency-1.3' for this
const MESSAGES = [
self::EXPECTED_DOCTYPE_BUT_GOT_START_TAG => 'Expected DOCTYPE but got start tag',
self::EXPECTED_DOCTYPE_BUT_GOT_END_TAG => 'Expected DOCTYPE but got end tag',
self::EXPECTED_DOCTYPE_BUT_GOT_CHARS => 'Expected DOCTYPE but got characters',
self::UNEXPECTED_END_TAG => 'Unexpected end tag',
self::ENCODING_ERROR => 'Corrupt encoding near byte position %s',
self::UNEXPECTED_NULL_CHARACTER => 'Unexpected null character',
self::UNEXPECTED_QUESTION_MARK_INSTEAD_OF_TAG_NAME => 'Unexpected "?" character instead of tag name',
@ -108,55 +119,26 @@ class ParseError {
];
const REPORT_OFFSETS = [
self::ENCODING_ERROR => 0,
self::UNEXPECTED_NULL_CHARACTER => -1,
self::UNEXPECTED_QUESTION_MARK_INSTEAD_OF_TAG_NAME => 0,
self::EOF_BEFORE_TAG_NAME => 0,
self::INVALID_FIRST_CHARACTER_OF_TAG_NAME => 0,
self::MISSING_END_TAG_NAME => -1,
self::EOF_IN_TAG => 0,
self::EOF_IN_SCRIPT_HTML_COMMENT_LIKE_TEXT => 0,
self::UNEXPECTED_EQUALS_SIGN_BEFORE_ATTRIBUTE_NAME => -1,
self::DUPLICATE_ATTRIBUTE => -1,
self::UNEXPECTED_CHARACTER_IN_ATTRIBUTE_NAME => -1,
self::MISSING_ATTRIBUTE_VALUE => -1,
self::UNEXPECTED_CHARACTER_IN_UNQUOTED_ATTRIBUTE_VALUE => -1,
self::MISSING_WHITESPACE_BETWEEN_ATTRIBUTES => 0,
self::UNEXPECTED_SOLIDUS_IN_TAG => 0,
self::CDATA_IN_HTML_CONTENT => -1,
self::INCORRECTLY_OPENED_COMMENT => 0,
self::ABRUPT_CLOSING_OF_EMPTY_COMMENT => -1,
self::EOF_IN_COMMENT => 0,
self::NESTED_COMMENT => 0,
self::INCORRECTLY_CLOSED_COMMENT => -1,
self::EOF_IN_DOCTYPE => 0,
self::MISSING_WHITESPACE_BEFORE_DOCTYPE_NAME => 0,
self::MISSING_DOCTYPE_NAME => -1,
self::INVALID_CHARACTER_SEQUENCE_AFTER_DOCTYPE_NAME => 0,
self::MISSING_WHITESPACE_AFTER_DOCTYPE_PUBLIC_KEYWORD => -1,
self::MISSING_DOCTYPE_PUBLIC_IDENTIFIER => -1,
self::MISSING_QUOTE_BEFORE_DOCTYPE_PUBLIC_IDENTIFIER => 0,
self::ABRUPT_DOCTYPE_PUBLIC_IDENTIFIER => -1,
self::MISSING_WHITESPACE_BETWEEN_DOCTYPE_PUBLIC_AND_SYSTEM_IDENTIFIERS => -1,
self::MISSING_WHITESPACE_AFTER_DOCTYPE_SYSTEM_KEYWORD => -1,
self::MISSING_DOCTYPE_SYSTEM_IDENTIFIER => -1,
self::MISSING_QUOTE_BEFORE_DOCTYPE_SYSTEM_IDENTIFIER => 0,
self::ABRUPT_DOCTYPE_SYSTEM_IDENTIFIER => -1,
self::UNEXPECTED_CHARACTER_AFTER_DOCTYPE_SYSTEM_IDENTIFIER => 0,
self::EOF_IN_CDATA => 0,
self::END_TAG_WITH_ATTRIBUTES => -1,
self::END_TAG_WITH_TRAILING_SOLIDUS => -1,
self::MISSING_SEMICOLON_AFTER_CHARACTER_REFERENCE => 0,
self::UNKNOWN_NAMED_CHARACTER_REFERENCE => 0,
self::ABSENCE_OF_DIGITS_IN_NUMERIC_CHARACTER_REFERENCE => 0,
self::NULL_CHARACTER_REFERENCE => 0,
self::CHARACTER_REFERENCE_OUTSIDE_UNICODE_RANGE => 0,
self::SURROGATE_CHARACTER_REFERENCE => 0,
self::NONCHARACTER_CHARACTER_REFERENCE => 0,
self::CONTROL_CHARACTER_REFERENCE => 0,
self::SURROGATE_IN_INPUT_STREAM => 0,
self::NONCHARACTER_IN_INPUT_STREAM => 0,
self::CONTROL_CHARACTER_IN_INPUT_STREAM => 0,
];
public function setHandler() {

2
lib/ParseErrorEmitter.php

@ -10,7 +10,7 @@ trait ParseErrorEmitter {
$data = ($this instanceof Data) ? $this : ($this->data ?? null);
assert($data instanceof Data);
assert($this->errorHandler instanceof ParseError);
list($line, $column) = $data->whereIs(ParseError::REPORT_OFFSETS[$code]);
list($line, $column) = $data->whereIs(ParseError::REPORT_OFFSETS[$code] ?? 0);
return $this->errorHandler->emit($data->filePath, $line, $column, $code, ...$arg);
}
}

66
lib/TreeBuilder.php

@ -109,12 +109,6 @@ class TreeBuilder {
$this->insertionMode = self::INITIAL_MODE;
$this->quirksMode = self::QUIRKS_MODE_OFF;
static::$instance = $this;
}
public function __destruct() {
static::$instance = null;
}
public function emitToken(Token $token) {
@ -388,14 +382,16 @@ class TreeBuilder {
# set the Document to quirks mode.
// DEVIATION: There is no iframe srcdoc document because there are no nested
// browsing contexts in this implementation.
switch (get_class($token)) {
case 'StartTagToken': $this->error(ParseError::UNEXPECTED_START_TAG, $token->name);
break;
case 'EndTagToken': $this->error(ParseError::UNEXPECTED_END_TAG, $token->name);
break;
case 'EOFToken': $this->error(ParseError::UNEXPECTED_EOF);
break;
default: throw new Exception(Exception::UNKNOWN_ERROR);
if ($token instanceof StartTagToken) {
$this->error(ParseError::EXPECTED_DOCTYPE_BUT_GOT_START_TAG);
} elseif ($token instanceof EndTagToken) {
$this->error(ParseError::EXPECTED_DOCTYPE_BUT_GOT_END_TAG);
} elseif ($token instanceof CharacterToken) {
$this->error(ParseError::EXPECTED_DOCTYPE_BUT_GOT_CHARS);
} elseif ($token instanceof EOFToken) {
$this->error(ParseError::UNEXPECTED_EOF);
} else {
throw new \Exception("Unexpected token type".get_class($token));
}
$this->quirksMode = self::QUIRKS_MODE_ON;
@ -431,7 +427,7 @@ class TreeBuilder {
# Create an element for the token in the HTML namespace, with the Document as
# the intended parent. Append it to the Document object. Put this element in the
# stack of open elements.
$element = static::insertStartTagToken($token, $this->DOM);
$element = $this->insertStartTagToken($token, $this->DOM);
# Switch the insertion mode to "before head".
$this->insertionMode = self::BEFORE_HEAD_MODE;
@ -490,7 +486,7 @@ class TreeBuilder {
# A start tag whose tag name is "head"
elseif ($token->name === 'head') {
# Insert an HTML element for the token.
$element = static::insertStartTagToken($token);
$element = $this->insertStartTagToken($token);
# Set the head element pointer to the newly created head element.
$this->headElement = $element;
@ -507,7 +503,7 @@ class TreeBuilder {
# Anything else
else {
# Insert an HTML element for a "head" start tag token with no attributes.
$element = static::insertStartTagToken(new StartTagToken('head'));
$element = $this->insertStartTagToken(new StartTagToken('head'));
# Set the head element pointer to the newly created head element.
$this->headElement = $element;
@ -551,7 +547,7 @@ class TreeBuilder {
elseif ($token->name === 'base' || $token->name === 'basefont' || $token->name === 'bgsound' || $token->name === 'link') {
# Insert an HTML element for the token. Immediately pop the current node off the
# stack of open elements.
static::insertStartTagToken($token);
$this->insertStartTagToken($token);
$this->stack->pop();
# Acknowledge the token’s *self-closing flag*, if it is set.
@ -561,7 +557,7 @@ class TreeBuilder {
elseif ($token->name === 'meta') {
# Insert an HTML element for the token. Immediately pop the current node off the
# stack of open elements.
static::insertStartTagToken($token);
$this->insertStartTagToken($token);
$this->stack->pop();
# Acknowledge the token’s *self-closing flag*, if it is set.
@ -597,7 +593,7 @@ class TreeBuilder {
// flag is always disabled.
elseif ($token->name === 'noscript') {
# Insert an HTML element for the token.
static::insertStartTagToken($token);
$this->insertStartTagToken($token);
# Switch the insertion mode to "in head noscript".
$this->insertionMode = self::IN_HEAD_NOSCRIPT_MODE;
}
@ -615,7 +611,7 @@ class TreeBuilder {
// intended parent isn't used when determining anything;
// Parser::createAndInsertElement will get the adjusted insertion location
// anyway.
static::insertStartTagToken($token);
$this->insertStartTagToken($token);
# 3. Mark the element as being "parser-inserted" and unset the element’s
# "non-blocking" flag.
@ -637,7 +633,7 @@ class TreeBuilder {
# A start tag whose tag name is "template"
elseif ($token->name === 'template') {
# Insert an HTML element for the token.
static::insertStartTagToken($token);
$this->insertStartTagToken($token);
# Insert a marker at the end of the list of active formatting elements.
$this->activeFormattingElementsList->insertMarker();
# Set the frameset-ok flag to "not ok".
@ -703,7 +699,7 @@ class TreeBuilder {
# 2. If the current node is not a template element, then this is a parse error.
if ($this->stack->currentNodeName !== 'template') {
$this->error(ParseError::UNEXPECTED_END_TAG, 'template');
$this->error(ParseError::UNEXPECTED_END_TAG);
}
# 3. Pop elements from the stack of open elements until a template element has been popped from the stack.
@ -722,7 +718,7 @@ class TreeBuilder {
# Any other end tag
else {
# Parse error.
$this->error(ParseError::UNEXPECTED_END_TAG, $token->name);
$this->error(ParseError::UNEXPECTED_END_TAG);
}
}
# Anything else
@ -1326,7 +1322,7 @@ class TreeBuilder {
}
# Switch the insertion mode to "after body".
self::$insertionMode = self::AFTER_BODY_MODE;
$this->insertionMode = self::AFTER_BODY_MODE;
// The only thing different between body and html here is that when processing
// an html end tag the token is reprocessed.
@ -1789,7 +1785,7 @@ class TreeBuilder {
# Insert a foreign element for the token, in the same namespace as the adjusted
# current node.
static::insertStartTagToken($token, null, $this->stack->adjustedCurrentNode->namespaceURI);
$this->insertStartTagToken($token, null, $this->stack->adjustedCurrentNode->namespaceURI);
# If the token has its self-closing flag set, then run the appropriate steps
# from the following list:
@ -1944,7 +1940,7 @@ class TreeBuilder {
];
}
public static function insertCharacterToken(CharacterToken $token) {
public function insertCharacterToken(CharacterToken $token) {
# 1. Let data be the characters passed to the algorithm, or, if no characters
# were explicitly specified, the character of the character token being
# processed.
@ -1952,7 +1948,7 @@ class TreeBuilder {
# 2. Let the adjusted insertion location be the appropriate place for inserting
# a node.
$location = static::$instance->appropriatePlaceForInsertingNode();
$location = $this->appropriatePlaceForInsertingNode();
$adjustedInsertionLocation = $location['node'];
$insertBefore = $location['insert before'];
@ -1998,7 +1994,7 @@ class TreeBuilder {
$adjustedInsertionLocation = $position;
$insertBefore = false;
} else {
$location = static::$instance->appropriatePlaceForInsertingNode();
$location = $this->appropriatePlaceForInsertingNode();
$adjustedInsertionLocation = $location['node'];
$insertBefore = $location['insert before'];
}
@ -2016,7 +2012,7 @@ class TreeBuilder {
}
}
public static function insertStartTagToken(StartTagToken $token, \DOMNode $intendedParent = null, string $namespace = null): Element {
public function insertStartTagToken(StartTagToken $token, \DOMNode $intendedParent = null, string $namespace = null): Element {
if (!is_null($namespace)) {
$namespace = $token->namespace;
}
@ -2042,9 +2038,9 @@ class TreeBuilder {
// DEVIATION: There is no point to setting the synchronous custom elements flag
// and custom element definition; there is no scripting in this implementation.
if ($namespace === Parser::HTML_NAMESPACE) {
$element = static::$instance->DOM->createElement($token->name);
$element = $this->DOM->createElement($token->name);
} else {
$element = static::$instance->DOM->createElementNS($namespace, $token->name);
$element = $this->DOM->createElementNS($namespace, $token->name);
}
# 8. Append each attribute in the given token to element.
@ -2108,7 +2104,7 @@ class TreeBuilder {
# 1. Let the adjusted insertion location be the appropriate place for inserting
# a node.
$location = static::$instance->appropriatePlaceForInsertingNode($intendedParent);
$location = $this->appropriatePlaceForInsertingNode($intendedParent);
$adjustedInsertionLocation = $location['node'];
$insertBefore = $location['insert before'];
@ -2136,7 +2132,7 @@ class TreeBuilder {
// DEVIATION: Unnecessary because there is no scripting in this implementation.
# 4. Push element onto the stack of open elements so that it is the new current node.
static::$instance->stack[] = $element;
$this->stack[] = $element;
# Return element.
return $element;
@ -2148,7 +2144,7 @@ class TreeBuilder {
# invoked in response to a start tag token.
# 1. Insert an HTML element for the token.
static::insertStartTagToken($token);
$this->insertStartTagToken($token);
# 2. If the algorithm that was invoked is the generic raw text element parsing
# algorithm, switch the tokenizer to the RAWTEXT state; otherwise the algorithm

Loading…
Cancel
Save