Browse Source

Moved tokenizer to its own class

• Changed the name of the parser instance variable from Parser::$self to Parser::$instance
• Added parse errors for entities to ParseError.
• Moved Parser::fixDOM to DOM::fixIdAttributes.
• Added an exception for when the tokenizer enters an invalid state (infinite looping).
• Made ParseError use Parser::$instance->data instead of a passed-around DataStream object.
split-manual
Dustin Wilson 6 years ago
parent
commit
027e5b9f58
  1. 6
      lib/ActiveFormattingElementsList.php
  2. 34
      lib/DOM.php
  3. 18
      lib/DataStream.php
  4. 18
      lib/Exception.php
  5. 40
      lib/ParseError.php
  6. 3146
      lib/Parser.php
  7. 2
      lib/Stack.php
  8. 16
      lib/Token.php
  9. 3132
      lib/Tokenizer.php

6
lib/ActiveFormattingElementsList.php

@ -132,7 +132,7 @@ class ActiveFormattingElementsList implements \ArrayAccess {
# elements is a marker, or if it is an element that is in the stack of open # elements is a marker, or if it is an element that is in the stack of open
# elements, then there is nothing to reconstruct; stop this algorithm. # elements, then there is nothing to reconstruct; stop this algorithm.
$entry = end($this->_storage); $entry = end($this->_storage);
if ($entry instanceof ActiveFormattingElementMarker || in_array($entry['element'], Parser::$self->stack)) { if ($entry instanceof ActiveFormattingElementMarker || in_array($entry['element'], Parser::$instance->stack)) {
return; return;
} }
@ -153,7 +153,7 @@ class ActiveFormattingElementsList implements \ArrayAccess {
# 6. If entry is neither a marker nor an element that is also in the stack of # 6. If entry is neither a marker nor an element that is also in the stack of
# open elements, go to the step labeled Rewind. # open elements, go to the step labeled Rewind.
if (!$entry instanceof ActiveFormattingElementMarker && !in_array($entry['element'], Parser::$self->stack)) { if (!$entry instanceof ActiveFormattingElementMarker && !in_array($entry['element'], Parser::$instance->stack)) {
goto rewind; goto rewind;
} }
@ -165,7 +165,7 @@ class ActiveFormattingElementsList implements \ArrayAccess {
# 8. Create: Insert an HTML element for the token for which the element entry # 8. Create: Insert an HTML element for the token for which the element entry
# was created, to obtain new element. # was created, to obtain new element.
create: create:
$element = Parser::$self->insertElement($entry['token']); $element = Parser::$instance->insertElement($entry['token']);
# 9. Replace the entry for entry in the list with an entry for new element. # 9. Replace the entry for entry in the list with an entry for new element.
$this->_storage[key($this->_storage)]['element'] = $element; $this->_storage[key($this->_storage)]['element'] = $element;

34
lib/DOM.php

@ -43,6 +43,40 @@ class DOM {
); );
} }
public static function fixIdAttributes(\DOMDocument $dom) {
// TODO: Accept DOMDocumentFragment, append it to a document, fix shit, and
// then poop out a fragment so selecting id attributes works on fragments.
// Fix id attributes so they may be selected by the DOM. Fix the PHP id attribute
// bug. Allows DOMDocument->getElementById() to work on id attributes.
if (!Parser::$instance->fragmentCase) {
$dom->relaxNGValidateSource('<grammar xmlns="http://relaxng.org/ns/structure/1.0" datatypeLibrary="http://www.w3.org/2001/XMLSchema-datatypes">
<start>
<element>
<anyName/>
<ref name="anythingID"/>
</element>
</start>
<define name="anythingID">
<zeroOrMore>
<choice>
<element>
<anyName/>
<ref name="anythingID"/>
</element>
<attribute name="id"><data type="ID"/></attribute>
<zeroOrMore><attribute><anyName/></attribute></zeroOrMore>
<text/>
</choice>
</zeroOrMore>
</define>
</grammar>');
}
$dom->normalize();
return $dom;
}
protected static function ancestor(mixed $needle, \DOMElement $context, bool $returnNode = true) { protected static function ancestor(mixed $needle, \DOMElement $context, bool $returnNode = true) {
while ($context = $context->parentNode) { while ($context = $context->parentNode) {
$result = static::compare($needle, $context); $result = static::compare($needle, $context);

18
lib/DataStream.php

@ -141,7 +141,7 @@ class DataStream
# unconsume the U+0023 NUMBER SIGN character and, if appropriate, the X # unconsume the U+0023 NUMBER SIGN character and, if appropriate, the X
# character). This is a parse error; nothing is returned. # character). This is a parse error; nothing is returned.
if (!$number) { if (!$number) {
ParseError::trigger(ParseError::HEX_DIGITS_EXPECTED, $this, $this->peek()); ParseError::trigger(ParseError::ENTITY_UNEXPECTED_CHARACTER, $this->peek(), 'hexadecimal digit');
$this->unconsume(2); $this->unconsume(2);
return '&'; return '&';
@ -154,7 +154,7 @@ class DataStream
# unconsume the U+0023 NUMBER SIGN character and, if appropriate, the X # unconsume the U+0023 NUMBER SIGN character and, if appropriate, the X
# character). This is a parse error; nothing is returned. # character). This is a parse error; nothing is returned.
if (!$number) { if (!$number) {
ParseError::trigger(ParseError::DIGITS_EXPECTED, $this, $this->peek()); ParseError::trigger(ParseError::ENTITY_UNEXPECTED_CHARACTER, $this->peek(), 'decimal digit');
$this->unconsume(); $this->unconsume();
return '&'; return '&';
@ -167,7 +167,7 @@ class DataStream
if ($char === ';') { if ($char === ';') {
$this->consume(); $this->consume();
} else { } else {
ParseError::trigger(ParseError::SEMICOLON_TERMINATOR_EXPECTED, $this, $char); ParseError::trigger(ParseError::ENTITY_UNEXPECTED_CHARACTER, $char, 'semicolon terminator');
} }
# If one or more characters match the range, then take them all and interpret the # If one or more characters match the range, then take them all and interpret the
@ -266,7 +266,7 @@ class DataStream
} }
if ($returnValue) { if ($returnValue) {
ParseError::trigger(Error::INVALID_NUMERIC_ENTITY); ParseError::trigger(Error::INVALID_NUMERIC_ENTITY, $number);
return $returnValue; return $returnValue;
} }
@ -274,7 +274,7 @@ class DataStream
# 0x10FFFF, then this is a parse error. Return a U+FFFD REPLACEMENT CHARACTER # 0x10FFFF, then this is a parse error. Return a U+FFFD REPLACEMENT CHARACTER
# character token. # character token.
if (($number >= 0xD800 && $number <= 0xDFFF) || $number > 0x10FFFF) { if (($number >= 0xD800 && $number <= 0xDFFF) || $number > 0x10FFFF) {
ParseError::trigger(Error::INVALID_CODEPOINT); ParseError::trigger(Error::INVALID_CODEPOINT, $number);
return '�'; return '�';
} }
@ -295,7 +295,7 @@ class DataStream
$number === 0xBFFFF || $number === 0xCFFFE || $number === 0xCFFFF || $number === 0xDFFFE || $number === 0xBFFFF || $number === 0xCFFFE || $number === 0xCFFFF || $number === 0xDFFFE ||
$number === 0xDFFFF || $number === 0xEFFFE || $number === 0xEFFFF || $number === 0xFFFFE || $number === 0xDFFFF || $number === 0xEFFFE || $number === 0xEFFFF || $number === 0xFFFFE ||
$number === 0xFFFFF || $number === 0x10FFFE || $number === 0x10FFFF) { $number === 0xFFFFF || $number === 0x10FFFE || $number === 0x10FFFF) {
ParseError::trigger(Error::INVALID_CODEPOINT); ParseError::trigger(Error::INVALID_CODEPOINT, $number);
return '&'; return '&';
} }
@ -337,7 +337,7 @@ class DataStream
$next = $this->peek(); $next = $this->peek();
if ($inAttribute && $lastChar !== ';' && ($next === '=' || ctype_alnum($next))) { if ($inAttribute && $lastChar !== ';' && ($next === '=' || ctype_alnum($next))) {
if ($next === '=') { if ($next === '=') {
ParseError::trigger(ParseError::INVALID_NAMED_ENTITY, $this); ParseError::trigger(ParseError::ENTITY_UNEXPECTED_CHARACTER, $next, 'semicolon terminator');
} }
return '&'; return '&';
@ -349,7 +349,7 @@ class DataStream
// Used for PHP's entity decoder. Described below. // Used for PHP's entity decoder. Described below.
$sequence.=';'; $sequence.=';';
ParseError::trigger(ParseError::SEMICOLON_TERMINATOR_EXPECTED, $this); ParseError::trigger(ParseError::ENTITY_UNEXPECTED_CHARACTER, $lastChar, 'semicolon terminator');
} }
# Return one or two character tokens for the character(s) corresponding to the # Return one or two character tokens for the character(s) corresponding to the
@ -367,7 +367,7 @@ class DataStream
# (&) consist of a sequence of one or more alphanumeric ASCII characters followed # (&) consist of a sequence of one or more alphanumeric ASCII characters followed
# by a U+003B SEMICOLON character (;), then this is a parse error. # by a U+003B SEMICOLON character (;), then this is a parse error.
if (preg_match('/^[A-Za-z0-9]+;/', $char)) { if (preg_match('/^[A-Za-z0-9]+;/', $char)) {
ParseError::trigger(ParseError::INVALID_NAMED_ENTITY, $this); ParseError::trigger(ParseError::INVALID_NAMED_ENTITY, $char);
} }
return '&'; return '&';

18
lib/Exception.php

@ -21,6 +21,8 @@ class Exception extends \Exception {
const DOM_DOMELEMENT_STRING_OR_CLOSURE_EXPECTED = 10501; const DOM_DOMELEMENT_STRING_OR_CLOSURE_EXPECTED = 10501;
const TOKENIZER_INVALID_STATE = 10601;
protected static $messages = [10000 => 'Invalid error code', protected static $messages = [10000 => 'Invalid error code',
10001 => 'Unknown error; escaping', 10001 => 'Unknown error; escaping',
10002 => 'Incorrect number of parameters for Exception message; %s expected', 10002 => 'Incorrect number of parameters for Exception message; %s expected',
@ -37,7 +39,9 @@ class Exception extends \Exception {
10401 => 'Data string expected; found %s', 10401 => 'Data string expected; found %s',
10402 => '%s is an invalid data consumption length; a value of 1 or above is expected', 10402 => '%s is an invalid data consumption length; a value of 1 or above is expected',
10501 => 'The first argument must either be an instance of \DOMElement, a string, or a closure; found %s']; 10501 => 'The first argument must either be an instance of \DOMElement, a string, or a closure; found %s',
10601 => 'The Tokenizer has entered an invalid state'];
public function __construct(int $code, ...$args) { public function __construct(int $code, ...$args) {
if (!isset(static::$messages[$code])) { if (!isset(static::$messages[$code])) {
@ -62,18 +66,6 @@ class Exception extends \Exception {
} }
if ($count > 0) { if ($count > 0) {
// Convert newlines and tabs in the arguments to words to better express what they
// are.
/*$args = array_map(function($value) {
switch ($value) {
case "\n": return 'Newline';
break;
case "\t": return 'Tab';
break;
default: return $value;
}
}, $args);*/
// Go through each of the arguments and run sprintf on the strings. // Go through each of the arguments and run sprintf on the strings.
$message = call_user_func_array('sprintf', array_merge([$message], $args)); $message = call_user_func_array('sprintf', array_merge([$message], $args));
} }

40
lib/ParseError.php

@ -3,10 +3,6 @@ declare(strict_types=1);
namespace dW\HTML5; namespace dW\HTML5;
class ParseError { class ParseError {
// DataStream object passed to it used to get information used in error
// reporting.
public static $data;
const TAG_NAME_EXPECTED = 0; const TAG_NAME_EXPECTED = 0;
const UNEXPECTED_EOF = 1; const UNEXPECTED_EOF = 1;
const UNEXPECTED_CHARACTER = 2; const UNEXPECTED_CHARACTER = 2;
@ -17,7 +13,11 @@ class ParseError {
const UNEXPECTED_DOCTYPE = 7; const UNEXPECTED_DOCTYPE = 7;
const INVALID_DOCTYPE = 8; const INVALID_DOCTYPE = 8;
const INVALID_CONTROL_OR_NONCHARACTERS = 9; const INVALID_CONTROL_OR_NONCHARACTERS = 9;
const INVALID_XMLNS_ATTRIBUTE_VALUE = 10; const UNEXPECTED_XMLNS_ATTRIBUTE_VALUE = 10;
const ENTITY_UNEXPECTED_CHARACTER = 11;
const INVALID_NUMERIC_ENTITY = 12;
const INVALID_NAMED_ENTITY = 14;
const INVALID_CODEPOINT = 15;
protected static $messages = ['Tag name expected; found %s', protected static $messages = ['Tag name expected; found %s',
'Unexpected end-of-file; %s expected', 'Unexpected end-of-file; %s expected',
@ -29,14 +29,18 @@ class ParseError {
'Unexpected DOCTYPE; %s expected', 'Unexpected DOCTYPE; %s expected',
'Invalid DOCTYPE', 'Invalid DOCTYPE',
'Invalid Control or Non-character; removing', 'Invalid Control or Non-character; removing',
'Invalid xmlns attribute value; %s expected']; 'Unexpected xmlns attribute value; %s expected',
'Unexpected "%s" character in entity; %s expected',
'"%s" is an invalid numeric entity',
'"%s" is an invalid name for an entity',
'"%s" is an invalid character codepoint'];
public static function errorHandler($code, $message, $file, $line, array $context) { public static function errorHandler($code, $message, $file, $line, array $context) {
if ($code === E_USER_WARNING) { if ($code === E_USER_WARNING) {
$errMsg = sprintf("HTML5 Parse Error: \"%s\" in %s", $message, static::$data->filePath); $errMsg = sprintf("HTML5 Parse Error: \"%s\" in %s", $message, Parser::$instance->data->filePath);
if (static::$data->length !== 0) { if (Parser::$instance->data->length !== 0) {
$errMsg .= sprintf(" on line %s, column %s\n", static ::$data->line, static::$data->column); $errMsg .= sprintf(" on line %s, column %s\n", Parser::$instance->data->line, Parser::$instance->data->column);
} else { } else {
$errMsg .= "\n"; $errMsg .= "\n";
} }
@ -45,13 +49,11 @@ class ParseError {
} }
} }
public static function trigger(int $code, DataStream $data, ...$args): bool { public static function trigger(int $code, ...$args): bool {
if (!isset(static::$messages[$code])) { if (!isset(static::$messages[$code])) {
throw new Exception(Exception::INVALID_CODE); throw new Exception(Exception::INVALID_CODE);
} }
static::$data = $data;
// Set the error handler and honor already-set error reporting rules. // Set the error handler and honor already-set error reporting rules.
set_error_handler('\\dW\\HTML5\\ParseError::errorHandler', error_reporting()); set_error_handler('\\dW\\HTML5\\ParseError::errorHandler', error_reporting());
@ -68,12 +70,14 @@ class ParseError {
// Convert newlines and tabs in the arguments to words to better express what they // Convert newlines and tabs in the arguments to words to better express what they
// are. // are.
$args = array_map(function($value) { $args = array_map(function($value) {
switch ($value) { if ($value === "\n") {
case "\n": return 'Newline'; return 'Newline';
break; } elseif ($value === "\t") {
case "\t": return 'Tab'; return 'Tab';
break; } elseif (is_null($value)) {
default: return $value; return "nothing";
} else {
return $value;
} }
}, $args); }, $args);

3146
lib/Parser.php

File diff suppressed because it is too large

2
lib/Stack.php

@ -76,7 +76,7 @@ class Stack implements \ArrayAccess {
# the HTML fragment parsing algorithm and the stack of open elements has only one # the HTML fragment parsing algorithm and the stack of open elements has only one
# element in it (fragment case); otherwise, the adjusted current node is the # element in it (fragment case); otherwise, the adjusted current node is the
# current node. # current node.
return (Parser::$self->fragmentCase && $this->length === 1) ? Parser::$self->fragmentContext : $this->currentNode; return (Parser::$instance->fragmentCase && $this->length === 1) ? Parser::$instance->fragmentContext : $this->currentNode;
break; break;
case 'adjustedCurrentNodeNamespace': case 'adjustedCurrentNodeNamespace':
$adjustedCurrentNode = $this->adjustedCurrentNode; $adjustedCurrentNode = $this->adjustedCurrentNode;

16
lib/Token.php

@ -7,16 +7,16 @@ abstract class Token {}
abstract class DataToken extends Token { abstract class DataToken extends Token {
public $data; public $data;
public function __construct($data) { public function __construct(string $data) {
$this->data = (string)$data; $this->data = $data;
} }
} }
abstract class TagToken extends Token { abstract class TagToken extends Token {
public $name; public $name;
public function __construct($name) { public function __construct(string $name) {
$this->name = (string)$name; $this->name = $name;
} }
} }
@ -38,7 +38,7 @@ class DOCTYPEToken extends Token {
class CharacterToken extends DataToken {} class CharacterToken extends DataToken {}
class CommentToken extends DataToken { class CommentToken extends DataToken {
public function __construct($data = '') { public function __construct(string $data = '') {
parent::__construct($data); parent::__construct($data);
} }
} }
@ -48,7 +48,7 @@ class StartTagToken extends TagToken {
public $selfClosing; public $selfClosing;
public $attributes = []; public $attributes = [];
public function __construct($name, bool $selfClosing = false, string $namespace = Parser::HTML_NAMESPACE) { public function __construct(string $name, bool $selfClosing = false, string $namespace = Parser::HTML_NAMESPACE) {
$this->selfClosing = $selfClosing; $this->selfClosing = $selfClosing;
$this->namespace = $namespace; $this->namespace = $namespace;
parent::__construct($name); parent::__construct($name);
@ -68,7 +68,7 @@ class StartTagToken extends TagToken {
unset($this->attributes[$this->getAttributeKey($name)]); unset($this->attributes[$this->getAttributeKey($name)]);
} }
public function setAttribute($name, $value, $namespace = Parser::HTML_NAMESPACE) { public function setAttribute(string $name, string $value, string $namespace = Parser::HTML_NAMESPACE) {
$key = $this->_getAttributeKey($name); $key = $this->_getAttributeKey($name);
$attribute = new TokenAttr($name, $value, $namespace); $attribute = new TokenAttr($name, $value, $namespace);
@ -79,7 +79,7 @@ class StartTagToken extends TagToken {
} }
} }
private function _getAttributeKey($name) { private function _getAttributeKey(string $name) {
$key = null; $key = null;
foreach ($this->attributes as $key => $a) { foreach ($this->attributes as $key => $a) {
if ($a->name === $name) { if ($a->name === $name) {

3132
lib/Tokenizer.php

File diff suppressed because it is too large
Loading…
Cancel
Save