Browse Source

Fix first failure in tree builder

ns
J. King 3 years ago
parent
commit
e3a271f06b
  1. 6
      lib/LoopException.php
  2. 20
      lib/Token.php
  3. 224
      lib/TreeBuilder.php
  4. 21
      tests/cases/TestTreeConstructor.php

6
lib/LoopException.php

@ -0,0 +1,6 @@
<?php
declare(strict_types=1);
namespace dW\HTML5;
// Marker exception signalling that the tree builder appears to be stuck
// reprocessing the same token. It carries no state of its own beyond what
// \Exception provides. TreeBuilder::parseTokenInHTMLContent supplies an
// instance as the failure value of an assert() once its reprocessing loop
// counter reaches 50, so it can only surface when PHP assertions are
// evaluated and configured to throw; the tree-construction test case catches
// it in order to attach the builder's debug log to the failure message.
class LoopException extends \Exception {
}

20
lib/Token.php

@ -13,6 +13,8 @@ abstract class DataToken extends Token {
}
class DOCTYPEToken extends Token {
public const NAME = "DOCTYPE token";
# DOCTYPE tokens have a name, a public identifier,
# a system identifier, and a force-quirks flag.
# When a DOCTYPE token is created, its name,
@ -33,9 +35,13 @@ class DOCTYPEToken extends Token {
}
}
class CharacterToken extends DataToken {}
// Data token subclass for character data.
class CharacterToken extends DataToken {
// Human-readable label for this token type; TreeBuilder::emitToken looks it
// up via constant(get_class($token)."::NAME") when writing its debug log.
public const NAME = "Character token";
}
class CommentToken extends DataToken {
public const NAME = "Comment token";
public function __construct(string $data = '') {
parent::__construct($data);
}
@ -97,11 +103,17 @@ abstract class TagToken extends Token {
}
}
class StartTagToken extends TagToken {}
// Tag token subclass for start tags.
class StartTagToken extends TagToken {
// Human-readable label for this token type; TreeBuilder::emitToken looks it
// up via constant(get_class($token)."::NAME") when writing its debug log.
public const NAME = "Start tag token";
}
class EndTagToken extends TagToken {}
// Tag token subclass for end tags.
class EndTagToken extends TagToken {
// Human-readable label for this token type; TreeBuilder::emitToken looks it
// up via constant(get_class($token)."::NAME") when writing its debug log.
public const NAME = "End tag token";
}
class EOFToken extends Token {}
// Token subclass marking the end of input; the tree-construction test keeps
// requesting tokens from the tokenizer until it receives an instance of this
// class.
class EOFToken extends Token {
// Human-readable label for this token type; TreeBuilder::emitToken looks it
// up via constant(get_class($token)."::NAME") when writing its debug log.
public const NAME = "EOF token";
}
class TokenAttr {
public $name;

224
lib/TreeBuilder.php

@ -5,6 +5,8 @@ namespace dW\HTML5;
class TreeBuilder {
use ParseErrorEmitter;
public $debugLog = "";
// The list of active formatting elements, used when elements are improperly nested
protected $activeFormattingElementsList;
// The DOMDocument that is assembled by this class
@ -41,42 +43,61 @@ class TreeBuilder {
// Used to store the template insertion modes
protected $templateInsertionModes;
// Used for debugging to print out information as the tree is built.
public static $debug = false;
// Instance used with the static token insertion methods.
protected static $instance;
// Constants used for insertion modes
const INITIAL_MODE = 0;
const BEFORE_HTML_MODE = 1;
const BEFORE_HEAD_MODE = 2;
const IN_HEAD_MODE = 3;
const IN_HEAD_NOSCRIPT_MODE = 4;
const AFTER_HEAD_MODE = 5;
const IN_BODY_MODE = 6;
const TEXT_MODE = 7;
const IN_TABLE_MODE = 8;
const IN_TABLE_TEXT_MODE = 9;
const IN_CAPTION_MODE = 10;
const IN_COLUMN_GROUP_MODE = 11;
const IN_TABLE_BODY_MODE = 12;
const IN_ROW_MODE = 13;
const IN_CELL_MODE = 14;
const IN_SELECT_MODE = 15;
const IN_SELECT_IN_TABLE_MODE = 16;
const IN_TEMPLATE_MODE = 17;
const AFTER_BODY_MODE = 18;
const IN_FRAMESET_MODE = 19;
const AFTER_FRAMESET_MODE = 20;
const AFTER_AFTER_BODY_MODE = 21;
const AFTER_AFTER_FRAMESET_MODE = 22;
protected const INITIAL_MODE = 0;
protected const BEFORE_HTML_MODE = 1;
protected const BEFORE_HEAD_MODE = 2;
protected const IN_HEAD_MODE = 3;
protected const IN_HEAD_NOSCRIPT_MODE = 4;
protected const AFTER_HEAD_MODE = 5;
protected const IN_BODY_MODE = 6;
protected const TEXT_MODE = 7;
protected const IN_TABLE_MODE = 8;
protected const IN_TABLE_TEXT_MODE = 9;
protected const IN_CAPTION_MODE = 10;
protected const IN_COLUMN_GROUP_MODE = 11;
protected const IN_TABLE_BODY_MODE = 12;
protected const IN_ROW_MODE = 13;
protected const IN_CELL_MODE = 14;
protected const IN_SELECT_MODE = 15;
protected const IN_SELECT_IN_TABLE_MODE = 16;
protected const IN_TEMPLATE_MODE = 17;
protected const AFTER_BODY_MODE = 18;
protected const IN_FRAMESET_MODE = 19;
protected const AFTER_FRAMESET_MODE = 20;
protected const AFTER_AFTER_BODY_MODE = 21;
protected const AFTER_AFTER_FRAMESET_MODE = 22;
// Quirks mode constants
const QUIRKS_MODE_OFF = 0;
const QUIRKS_MODE_ON = 1;
const QUIRKS_MODE_LIMITED = 2;
protected const QUIRKS_MODE_OFF = 0;
protected const QUIRKS_MODE_ON = 1;
protected const QUIRKS_MODE_LIMITED = 2;
protected const INSERTION_MODE_NAMES = [
self::INITIAL_MODE => "Initial",
self::BEFORE_HTML_MODE => "Before html",
self::BEFORE_HEAD_MODE => "Before head",
self::IN_HEAD_MODE => "In head",
self::IN_HEAD_NOSCRIPT_MODE => "In head noscript",
self::AFTER_HEAD_MODE => "After head",
self::IN_BODY_MODE => "In body",
self::TEXT_MODE => "Text",
self::IN_TABLE_MODE => "In table",
self::IN_TABLE_TEXT_MODE => "In table text",
self::IN_CAPTION_MODE => "In caption",
self::IN_COLUMN_GROUP_MODE => "In column group",
self::IN_TABLE_BODY_MODE => "In table body",
self::IN_ROW_MODE => "In row",
self::IN_CELL_MODE => "In cell",
self::IN_SELECT_MODE => "In select",
self::IN_SELECT_IN_TABLE_MODE => "In select in table",
self::IN_TEMPLATE_MODE => "In template mode",
self::AFTER_BODY_MODE => "After body",
self::IN_FRAMESET_MODE => "In frameset",
self::AFTER_FRAMESET_MODE => "After frameset",
self::AFTER_AFTER_BODY_MODE => "After after body",
self::AFTER_AFTER_FRAMESET_MODE => "After after frameset",
];
public function __construct(Document $dom, $formElement, bool $fragmentCase = false, $fragmentContext = null, OpenElementsStack $stack, Stack $templateInsertionModes, Tokenizer $tokenizer, ParseError $errorHandler, Data $data) {
// If the form element isn't an instance of DOMElement that has a node name of
@ -112,20 +133,17 @@ class TreeBuilder {
}
public function emitToken(Token $token) {
assert((function() use ($token) {
$this->debugLog .= "EMITTED: ".constant(get_class($token)."::NAME")."\n";
return true;
})());
// Loop used for reprocessing.
while (true) {
$adjustedCurrentNode = $this->stack->adjustedCurrentNode;
$adjustedCurrentNodeName = $this->stack->adjustedCurrentNodeName;
$adjustedCurrentNodeNamespace = $this->stack->adjustedCurrentNodeNamespace;
if (self::$debug) {
echo "Node: $adjustedCurrentNodeName\n";
echo "\nToken: \n";
var_export($token);
echo "\n\n";
}
# 8.2.5 Tree construction
# 13.2.6 Tree construction
#
# As each token is emitted from the tokenizer, the user agent must follow the
# appropriate steps from the following list, known as the tree construction dispatcher:
@ -182,67 +200,20 @@ class TreeBuilder {
}
protected function parseTokenInHTMLContent(Token $token, int $insertionMode = null) {
$insertionMode = (is_null($insertionMode)) ? $this->insertionMode : $insertionMode;
$insertionMode = $insertionMode ?? $this->insertionMode;
// Loop used when processing the token under different rules; always breaks.
$iterations = 0;
while (true) {
if (self::$debug) {
switch ($insertionMode) {
case self::INITIAL_MODE: $mode = "Initial";
break;
case self::BEFORE_HTML_MODE: $mode = "Before html";
break;
case self::BEFORE_HEAD_MODE: $mode = "Before head";
break;
case self::IN_HEAD_MODE: $mode = "In head";
break;
case self::IN_HEAD_NOSCRIPT_MODE: $mode = "In head noscript";
break;
case self::AFTER_HEAD_MODE: $mode = "After head";
break;
case self::IN_BODY_MODE: $mode = "In body";
break;
case self::TEXT_MODE: $mode = "Text";
break;
case self::IN_TABLE_MODE: $mode = "In table";
break;
case self::IN_TABLE_TEXT_MODE: $mode = "In table text";
break;
case self::IN_CAPTION_MODE: $mode = "In caption";
break;
case self::IN_COLUMN_GROUP_MODE: $mode = "In column group";
break;
case self::IN_TABLE_BODY_MODE: $mode = "In table body";
break;
case self::IN_ROW_MODE: $mode = "In row";
break;
case self::IN_CELL_MODE: $mode = "In cell";
break;
case self::IN_SELECT_MODE: $mode = "In select";
break;
case self::IN_SELECT_IN_TABLE_MODE: $mode = "In select in table";
break;
case self::IN_TEMPLATE_MODE: $mode = "In template mode";
break;
case self::AFTER_BODY_MODE: $mode = "After body";
break;
case self::IN_FRAMESET_MODE: $mode = "In frameset";
break;
case self::AFTER_FRAMESET_MODE: $mode = "After frameset";
break;
case self::AFTER_AFTER_BODY_MODE: $mode = "After after body";
break;
case self::AFTER_AFTER_FRAMESET_MODE: $mode = "After after frameset";
break;
default: throw new Exception(Exception::UNKNOWN_ERROR);
}
echo "Mode: $mode\n";
}
# 8.2.5.4. The rules for parsing tokens in HTML content
assert((function() use ($insertionMode) {
$mode = self::INSERTION_MODE_NAMES[$insertionMode] ?? $insertionMode;
$this->debugLog .= " Mode: $mode\n";
return true;
})());
assert($iterations++ < 50, new LoopException("Probable infinite loop detected in HTML content handling"));
# 13.2.6.4. The rules for parsing tokens in HTML content
switch ($insertionMode) {
# 8.2.5.4.1. The "initial" insertion mode
# 13.2.6.4.1. The "initial" insertion mode
case self::INITIAL_MODE:
# A character token that is one of U+0009 CHARACTER TABULATION, U+000A LINE FEED
# (LF), U+000C FORM FEED (FF), U+000D CARRIAGE RETURN (CR), or U+0020 SPACE
@ -404,7 +375,7 @@ class TreeBuilder {
}
break;
# 8.2.5.4.2. The "before html" insertion mode
# 13.2.6.4.2. The "before html" insertion mode
case self::BEFORE_HTML_MODE:
# A DOCTYPE token
if ($token instanceof DOCTYPEToken) {
@ -458,7 +429,7 @@ class TreeBuilder {
// Good to know. There's no scripting in this implementation, though.
break;
# 8.2.5.4.3. The "before head" insertion mode
# 13.2.6.4.3. The "before head" insertion mode
case self::BEFORE_HEAD_MODE:
# A character token that is one of U+0009 CHARACTER TABULATION, U+000A LINE FEED
# (LF), U+000C FORM FEED (FF), U+000D CARRIAGE RETURN (CR), or U+0020 SPACE
@ -469,6 +440,7 @@ class TreeBuilder {
}
# A comment token
elseif ($token instanceof CommentToken) {
# Insert a comment.
$this->insertCommentToken($token);
}
# A DOCTYPE token
@ -476,27 +448,27 @@ class TreeBuilder {
# Parse error.
$this->error(ParseError::UNEXPECTED_DOCTYPE);
}
elseif ($token instanceof StartTagToken) {
# A start tag whose tag name is "html"
if ($token->name === 'html') {
# Process the token using the rules for the "in body" insertion mode.
$insertionMode = self::IN_BODY_MODE;
continue 2;
}
# A start tag whose tag name is "head"
elseif ($token->name === 'head') {
# Insert an HTML element for the token.
$element = $this->insertStartTagToken($token);
# Set the head element pointer to the newly created head element.
$this->headElement = $element;
# A start tag whose tag name is "html"
elseif ($token instanceof StartTagToken && $token->name === 'html') {
# Process the token using the rules for the "in body" insertion mode.
$insertionMode = self::IN_BODY_MODE;
continue 2;
}
# A start tag whose tag name is "head"
elseif ($token instanceof StartTagToken && $token->name === 'head') {
# Insert an HTML element for the token.
$element = $this->insertStartTagToken($token);
# Set the head element pointer to the newly created head element.
$this->headElement = $element;
# Switch the insertion mode to "in head".
$this->insertionMode = self::IN_HEAD_MODE;
}
# Switch the insertion mode to "in head".
$this->insertionMode = self::IN_HEAD_MODE;
}
# An end tag whose tag name is one of: "head", "body", "html", "br"
// See "Anything else" below
# Any other end tag
elseif ($token instanceof EndTagToken && $token->name !== 'head' && $token->name !== 'body' && $token->name !== 'html' && $token->name === 'br') {
# Parse error.
# Parse error. Ignore the token
$this->error(ParseError::UNEXPECTED_END_TAG, $token->name);
}
# An end tag whose tag name is one of: "head", "body", "html", "br"
@ -516,7 +488,7 @@ class TreeBuilder {
}
break;
# 8.2.5.4.4. The "in head" insertion mode
# 13.2.6.4.4. The "in head" insertion mode
case self::IN_HEAD_MODE:
# A character token that is one of U+0009 CHARACTER TABULATION, U+000A LINE FEED
# (LF), U+000C FORM FEED (FF), U+000D CARRIAGE RETURN (CR), or U+0020 SPACE
@ -734,7 +706,7 @@ class TreeBuilder {
}
break;
# 8.2.5.4.5. The "in head noscript" insertion mode
# 13.2.6.4.5. The "in head noscript" insertion mode
case self::IN_HEAD_NOSCRIPT_MODE:
# DOCTYPE token
if ($token instanceof DOCTYPEToken) {
@ -832,7 +804,7 @@ class TreeBuilder {
}
break;
# 8.2.5.4.6. The "after head" insertion mode
# 13.2.6.4.6. The "after head" insertion mode
case self::AFTER_HEAD_MODE:
# A character token that is one of U+0009 CHARACTER TABULATION, U+000A LINE FEED
# (LF), U+000C FORM FEED (FF), U+000D CARRIAGE RETURN (CR), or U+0020 SPACE
@ -947,7 +919,7 @@ class TreeBuilder {
}
break;
# 8.2.5.4.7. The "in body" insertion mode
# 13.2.6.4.7. The "in body" insertion mode
case self::IN_BODY_MODE:
if ($token instanceof CharacterToken) {
# A character token that is one of U+0009 CHARACTER TABULATION, U+000A LINE FEED
@ -1161,7 +1133,7 @@ class TreeBuilder {
if ($nodeName === 'li') {
# 1. Generate implied end tags, except for li elements.
$this->stack->generateImpliedEndTags('li');
$this->stack->generateImpliedEndTags(["li"]);
# 2. If the current node is not an li element, then this is a parse error.
if ($this->stack->currentNodeName !== 'li') {
@ -1451,7 +1423,7 @@ class TreeBuilder {
$currentNode = $this->stack->currentNode;
$currentNodeName = $this->stack->currentNodeName;
$currentNodeNamespace = $this->stack->currentNodeNamespace;
# 8.2.5.5 The rules for parsing tokens in foreign content
# 13.2.6.5 The rules for parsing tokens in foreign content
#
# When the user agent is to apply the rules for parsing tokens in foreign
# content, the user agent must handle the token as follows:
@ -1854,7 +1826,7 @@ class TreeBuilder {
protected function appropriatePlaceForInsertingNode(\DOMNode $overrideTarget = null): array {
$insertBefore = false;
# 8.2.5.1. Creating and inserting nodes
# 13.2.6.1. Creating and inserting nodes
#
# While the parser is processing a token, it can enable or disable foster
# parenting. This affects the following algorithm.
@ -2319,7 +2291,7 @@ class TreeBuilder {
# must run the following steps:
# 1. Generate implied end tags, except for p elements.
$this->stack->generateImpliedEndTags('p');
$this->stack->generateImpliedEndTags(["p"]);
# 2. If the current node is not a p element, then this is a parse error.
$currentNodeName = $this->stack->currentNodeName;
if ($currentNodeName !== 'p') {

21
tests/cases/TestTreeConstructor.php

@ -5,6 +5,7 @@ namespace dW\HTML5\TestCase;
use dW\HTML5\Data;
use dW\HTML5\Document;
use dW\HTML5\EOFToken;
use dW\HTML5\LoopException;
use dW\HTML5\OpenElementsStack;
use dW\HTML5\ParseError;
use dW\HTML5\Parser;
@ -52,13 +53,19 @@ class TestTreeConstructor extends \PHPUnit\Framework\TestCase {
$doc = new Document;
$treeBuilder = new TreeBuilder($doc, null, false, null, $stack, new TemplateInsertionModesStack, $tokenizer, $errorHandler, $decoder);
// run the tree builder
do {
$token = $tokenizer->createToken();
$treeBuilder->emitToken($token);
} while (!$token instanceof EOFToken);
$act = $this->serializeTree($doc);
$this->assertEquals($exp, $act);
// TODO: evaluate errors
try {
do {
$token = $tokenizer->createToken();
$treeBuilder->emitToken($token);
} while (!$token instanceof EOFToken);
} catch (LoopException $e) {
$act = $this->serializeTree($doc);
$this->assertEquals($exp, $act, $e->getMessage()."\n".$treeBuilder->debugLog);
} finally {
$act = $this->serializeTree($doc);
$this->assertEquals($exp, $act, $treeBuilder->debugLog);
// TODO: evaluate errors
}
}
protected function push(string $data): void {

Loading…
Cancel
Save