Browse Source

Moved tokenizer to its own class

• Changed the name of the parser instance variable from Parser::$self to Parser::$instance
• Added parse errors for entities into ParseError.
• Moved Parser::fixDOM to DOM::fixIdAttributes.
• Added an exception for when the tokenizer enters an invalid state (infinite looping).
• Made ParseError use Parser::$instance->data instead of a passed around DataStream object.
split-manual
Dustin Wilson 6 years ago
parent
commit
027e5b9f58
  1. 6
      lib/ActiveFormattingElementsList.php
  2. 34
      lib/DOM.php
  3. 18
      lib/DataStream.php
  4. 18
      lib/Exception.php
  5. 40
      lib/ParseError.php
  6. 3146
      lib/Parser.php
  7. 2
      lib/Stack.php
  8. 16
      lib/Token.php
  9. 3132
      lib/Tokenizer.php

6
lib/ActiveFormattingElementsList.php

@ -132,7 +132,7 @@ class ActiveFormattingElementsList implements \ArrayAccess {
# elements is a marker, or if it is an element that is in the stack of open
# elements, then there is nothing to reconstruct; stop this algorithm.
$entry = end($this->_storage);
if ($entry instanceof ActiveFormattingElementMarker || in_array($entry['element'], Parser::$self->stack)) {
if ($entry instanceof ActiveFormattingElementMarker || in_array($entry['element'], Parser::$instance->stack)) {
return;
}
@ -153,7 +153,7 @@ class ActiveFormattingElementsList implements \ArrayAccess {
# 6. If entry is neither a marker nor an element that is also in the stack of
# open elements, go to the step labeled Rewind.
if (!$entry instanceof ActiveFormattingElementMarker && !in_array($entry['element'], Parser::$self->stack)) {
if (!$entry instanceof ActiveFormattingElementMarker && !in_array($entry['element'], Parser::$instance->stack)) {
goto rewind;
}
@ -165,7 +165,7 @@ class ActiveFormattingElementsList implements \ArrayAccess {
# 8. Create: Insert an HTML element for the token for which the element entry
# was created, to obtain new element.
create:
$element = Parser::$self->insertElement($entry['token']);
$element = Parser::$instance->insertElement($entry['token']);
# 9. Replace the entry for entry in the list with an entry for new element.
$this->_storage[key($this->_storage)]['element'] = $element;

34
lib/DOM.php

@ -43,6 +43,40 @@ class DOM {
);
}
/**
 * Makes id attributes in a document selectable through the DOM.
 *
 * Works around PHP's id attribute bug so that DOMDocument->getElementById()
 * works on id attributes. Unless the parser is in the fragment case, the
 * document is validated against an inline RELAX NG grammar which declares
 * every "id" attribute as having the ID datatype. The document is then
 * normalized and returned.
 *
 * Reads Parser::$instance->fragmentCase to decide whether to validate.
 * NOTE(review): this presumably requires Parser::$instance to be set before
 * this method is called -- confirm against callers.
 *
 * @param \DOMDocument $dom The document to fix; normalize() is called on it,
 *                          so it is modified in place.
 * @return \DOMDocument The same document instance that was passed in.
 */
public static function fixIdAttributes(\DOMDocument $dom) {
// TODO: Accept a DOMDocumentFragment, append it to a document, fix its id
// attributes, and then return a fragment so that selecting by id attribute
// also works on fragments.
// The return value of relaxNGValidateSource() is not used here.
// NOTE(review): presumably the validation is run only for its effect of
// typing id attributes as IDs -- confirm.
if (!Parser::$instance->fragmentCase) {
$dom->relaxNGValidateSource('<grammar xmlns="http://relaxng.org/ns/structure/1.0" datatypeLibrary="http://www.w3.org/2001/XMLSchema-datatypes">
<start>
<element>
<anyName/>
<ref name="anythingID"/>
</element>
</start>
<define name="anythingID">
<zeroOrMore>
<choice>
<element>
<anyName/>
<ref name="anythingID"/>
</element>
<attribute name="id"><data type="ID"/></attribute>
<zeroOrMore><attribute><anyName/></attribute></zeroOrMore>
<text/>
</choice>
</zeroOrMore>
</define>
</grammar>');
}
$dom->normalize();
return $dom;
}
protected static function ancestor(mixed $needle, \DOMElement $context, bool $returnNode = true) {
while ($context = $context->parentNode) {
$result = static::compare($needle, $context);

18
lib/DataStream.php

@ -141,7 +141,7 @@ class DataStream
# unconsume the U+0023 NUMBER SIGN character and, if appropriate, the X
# character). This is a parse error; nothing is returned.
if (!$number) {
ParseError::trigger(ParseError::HEX_DIGITS_EXPECTED, $this, $this->peek());
ParseError::trigger(ParseError::ENTITY_UNEXPECTED_CHARACTER, $this->peek(), 'hexadecimal digit');
$this->unconsume(2);
return '&';
@ -154,7 +154,7 @@ class DataStream
# unconsume the U+0023 NUMBER SIGN character and, if appropriate, the X
# character). This is a parse error; nothing is returned.
if (!$number) {
ParseError::trigger(ParseError::DIGITS_EXPECTED, $this, $this->peek());
ParseError::trigger(ParseError::ENTITY_UNEXPECTED_CHARACTER, $this->peek(), 'decimal digit');
$this->unconsume();
return '&';
@ -167,7 +167,7 @@ class DataStream
if ($char === ';') {
$this->consume();
} else {
ParseError::trigger(ParseError::SEMICOLON_TERMINATOR_EXPECTED, $this, $char);
ParseError::trigger(ParseError::ENTITY_UNEXPECTED_CHARACTER, $char, 'semicolon terminator');
}
# If one or more characters match the range, then take them all and interpret the
@ -266,7 +266,7 @@ class DataStream
}
if ($returnValue) {
ParseError::trigger(Error::INVALID_NUMERIC_ENTITY);
ParseError::trigger(Error::INVALID_NUMERIC_ENTITY, $number);
return $returnValue;
}
@ -274,7 +274,7 @@ class DataStream
# 0x10FFFF, then this is a parse error. Return a U+FFFD REPLACEMENT CHARACTER
# character token.
if (($number >= 0xD800 && $number <= 0xDFFF) || $number > 0x10FFFF) {
ParseError::trigger(Error::INVALID_CODEPOINT);
ParseError::trigger(Error::INVALID_CODEPOINT, $number);
return '�';
}
@ -295,7 +295,7 @@ class DataStream
$number === 0xBFFFF || $number === 0xCFFFE || $number === 0xCFFFF || $number === 0xDFFFE ||
$number === 0xDFFFF || $number === 0xEFFFE || $number === 0xEFFFF || $number === 0xFFFFE ||
$number === 0xFFFFF || $number === 0x10FFFE || $number === 0x10FFFF) {
ParseError::trigger(Error::INVALID_CODEPOINT);
ParseError::trigger(Error::INVALID_CODEPOINT, $number);
return '&';
}
@ -337,7 +337,7 @@ class DataStream
$next = $this->peek();
if ($inAttribute && $lastChar !== ';' && ($next === '=' || ctype_alnum($next))) {
if ($next === '=') {
ParseError::trigger(ParseError::INVALID_NAMED_ENTITY, $this);
ParseError::trigger(ParseError::ENTITY_UNEXPECTED_CHARACTER, $next, 'semicolon terminator');
}
return '&';
@ -349,7 +349,7 @@ class DataStream
// Used for PHP's entity decoder. Described below.
$sequence.=';';
ParseError::trigger(ParseError::SEMICOLON_TERMINATOR_EXPECTED, $this);
ParseError::trigger(ParseError::ENTITY_UNEXPECTED_CHARACTER, $lastChar, 'semicolon terminator');
}
# Return one or two character tokens for the character(s) corresponding to the
@ -367,7 +367,7 @@ class DataStream
# (&) consist of a sequence of one or more alphanumeric ASCII characters followed
# by a U+003B SEMICOLON character (;), then this is a parse error.
if (preg_match('/^[A-Za-z0-9]+;/', $char)) {
ParseError::trigger(ParseError::INVALID_NAMED_ENTITY, $this);
ParseError::trigger(ParseError::INVALID_NAMED_ENTITY, $char);
}
return '&';

18
lib/Exception.php

@ -21,6 +21,8 @@ class Exception extends \Exception {
const DOM_DOMELEMENT_STRING_OR_CLOSURE_EXPECTED = 10501;
const TOKENIZER_INVALID_STATE = 10601;
protected static $messages = [10000 => 'Invalid error code',
10001 => 'Unknown error; escaping',
10002 => 'Incorrect number of parameters for Exception message; %s expected',
@ -37,7 +39,9 @@ class Exception extends \Exception {
10401 => 'Data string expected; found %s',
10402 => '%s is an invalid data consumption length; a value of 1 or above is expected',
10501 => 'The first argument must either be an instance of \DOMElement, a string, or a closure; found %s'];
10501 => 'The first argument must either be an instance of \DOMElement, a string, or a closure; found %s',
10601 => 'The Tokenizer has entered an invalid state'];
public function __construct(int $code, ...$args) {
if (!isset(static::$messages[$code])) {
@ -62,18 +66,6 @@ class Exception extends \Exception {
}
if ($count > 0) {
// Convert newlines and tabs in the arguments to words to better express what they
// are.
/*$args = array_map(function($value) {
switch ($value) {
case "\n": return 'Newline';
break;
case "\t": return 'Tab';
break;
default: return $value;
}
}, $args);*/
// Go through each of the arguments and run sprintf on the strings.
$message = call_user_func_array('sprintf', array_merge([$message], $args));
}

40
lib/ParseError.php

@ -3,10 +3,6 @@ declare(strict_types=1);
namespace dW\HTML5;
class ParseError {
// DataStream object passed to it used to get information used in error
// reporting.
public static $data;
const TAG_NAME_EXPECTED = 0;
const UNEXPECTED_EOF = 1;
const UNEXPECTED_CHARACTER = 2;
@ -17,7 +13,11 @@ class ParseError {
const UNEXPECTED_DOCTYPE = 7;
const INVALID_DOCTYPE = 8;
const INVALID_CONTROL_OR_NONCHARACTERS = 9;
const INVALID_XMLNS_ATTRIBUTE_VALUE = 10;
const UNEXPECTED_XMLNS_ATTRIBUTE_VALUE = 10;
const ENTITY_UNEXPECTED_CHARACTER = 11;
const INVALID_NUMERIC_ENTITY = 12;
const INVALID_NAMED_ENTITY = 14;
const INVALID_CODEPOINT = 15;
protected static $messages = ['Tag name expected; found %s',
'Unexpected end-of-file; %s expected',
@ -29,14 +29,18 @@ class ParseError {
'Unexpected DOCTYPE; %s expected',
'Invalid DOCTYPE',
'Invalid Control or Non-character; removing',
'Invalid xmlns attribute value; %s expected'];
'Unexpected xmlns attribute value; %s expected',
'Unexpected "%s" character in entity; %s expected',
'"%s" is an invalid numeric entity',
'"%s" is an invalid name for an entity',
'"%s" is an invalid character codepoint'];
public static function errorHandler($code, $message, $file, $line, array $context) {
if ($code === E_USER_WARNING) {
$errMsg = sprintf("HTML5 Parse Error: \"%s\" in %s", $message, static::$data->filePath);
$errMsg = sprintf("HTML5 Parse Error: \"%s\" in %s", $message, Parser::$instance->data->filePath);
if (static::$data->length !== 0) {
$errMsg .= sprintf(" on line %s, column %s\n", static ::$data->line, static::$data->column);
if (Parser::$instance->data->length !== 0) {
$errMsg .= sprintf(" on line %s, column %s\n", Parser::$instance->data->line, Parser::$instance->data->column);
} else {
$errMsg .= "\n";
}
@ -45,13 +49,11 @@ class ParseError {
}
}
public static function trigger(int $code, DataStream $data, ...$args): bool {
public static function trigger(int $code, ...$args): bool {
if (!isset(static::$messages[$code])) {
throw new Exception(Exception::INVALID_CODE);
}
static::$data = $data;
// Set the error handler and honor already-set error reporting rules.
set_error_handler('\\dW\\HTML5\\ParseError::errorHandler', error_reporting());
@ -68,12 +70,14 @@ class ParseError {
// Convert newlines and tabs in the arguments to words to better express what they
// are.
$args = array_map(function($value) {
switch ($value) {
case "\n": return 'Newline';
break;
case "\t": return 'Tab';
break;
default: return $value;
if ($value === "\n") {
return 'Newline';
} elseif ($value === "\t") {
return 'Tab';
} elseif (is_null($value)) {
return "nothing";
} else {
return $value;
}
}, $args);

3146
lib/Parser.php

File diff suppressed because it is too large

2
lib/Stack.php

@ -76,7 +76,7 @@ class Stack implements \ArrayAccess {
# the HTML fragment parsing algorithm and the stack of open elements has only one
# element in it (fragment case); otherwise, the adjusted current node is the
# current node.
return (Parser::$self->fragmentCase && $this->length === 1) ? Parser::$self->fragmentContext : $this->currentNode;
return (Parser::$instance->fragmentCase && $this->length === 1) ? Parser::$instance->fragmentContext : $this->currentNode;
break;
case 'adjustedCurrentNodeNamespace':
$adjustedCurrentNode = $this->adjustedCurrentNode;

16
lib/Token.php

@ -7,16 +7,16 @@ abstract class Token {}
abstract class DataToken extends Token {
public $data;
public function __construct($data) {
$this->data = (string)$data;
public function __construct(string $data) {
$this->data = $data;
}
}
abstract class TagToken extends Token {
public $name;
public function __construct($name) {
$this->name = (string)$name;
public function __construct(string $name) {
$this->name = $name;
}
}
@ -38,7 +38,7 @@ class DOCTYPEToken extends Token {
class CharacterToken extends DataToken {}
class CommentToken extends DataToken {
public function __construct($data = '') {
public function __construct(string $data = '') {
parent::__construct($data);
}
}
@ -48,7 +48,7 @@ class StartTagToken extends TagToken {
public $selfClosing;
public $attributes = [];
public function __construct($name, bool $selfClosing = false, string $namespace = Parser::HTML_NAMESPACE) {
public function __construct(string $name, bool $selfClosing = false, string $namespace = Parser::HTML_NAMESPACE) {
$this->selfClosing = $selfClosing;
$this->namespace = $namespace;
parent::__construct($name);
@ -68,7 +68,7 @@ class StartTagToken extends TagToken {
unset($this->attributes[$this->getAttributeKey($name)]);
}
public function setAttribute($name, $value, $namespace = Parser::HTML_NAMESPACE) {
public function setAttribute(string $name, string $value, string $namespace = Parser::HTML_NAMESPACE) {
$key = $this->_getAttributeKey($name);
$attribute = new TokenAttr($name, $value, $namespace);
@ -79,7 +79,7 @@ class StartTagToken extends TagToken {
}
}
private function _getAttributeKey($name) {
private function _getAttributeKey(string $name) {
$key = null;
foreach ($this->attributes as $key => $a) {
if ($a->name === $name) {

3132
lib/Tokenizer.php

File diff suppressed because it is too large
Loading…
Cancel
Save