Browse Source

Explicitly index state names and error messages

split-manual
J. King 5 years ago
parent
commit
8644b6c757
  1. 32
      lib/ParseError.php
  2. 210
      lib/Tokenizer.php

32
lib/ParseError.php

@ -21,21 +21,23 @@ class ParseError {
const INVALID_NAMED_ENTITY = 13;
const INVALID_CODEPOINT = 14;
protected static $messages = ['Tag name expected',
'Unexpected end-of-file',
'Unexpected "%s" character',
'%s attribute already exists; discarding',
'Unexpected end-of-tag',
'Unexpected %s start tag',
'Unexpected %s end tag',
'Unexpected DOCTYPE',
'Invalid DOCTYPE',
'Invalid Control or Non-character; removing',
'Unexpected xmlns attribute value',
'Unexpected "%s" character in entity; %s expected',
'"%s" is an invalid numeric entity',
'"%s" is an invalid name for an entity',
'"%s" is an invalid character codepoint'];
/**
 * Message templates for parse errors, keyed explicitly by the error-code
 * constants of this class rather than by array position.
 *
 * Several templates contain one or more "%s" placeholders; presumably they
 * are filled in with printf-style formatting when an error is reported --
 * confirm against the code that consumes this table.
 */
protected static $messages = [
self::TAG_NAME_EXPECTED => 'Tag name expected',
self::UNEXPECTED_EOF => 'Unexpected end-of-file',
self::UNEXPECTED_CHARACTER => 'Unexpected "%s" character',
self::ATTRIBUTE_EXISTS => '%s attribute already exists; discarding',
self::UNEXPECTED_END_OF_TAG => 'Unexpected end-of-tag',
self::UNEXPECTED_START_TAG => 'Unexpected %s start tag',
self::UNEXPECTED_END_TAG => 'Unexpected %s end tag',
self::UNEXPECTED_DOCTYPE => 'Unexpected DOCTYPE',
self::INVALID_DOCTYPE => 'Invalid DOCTYPE',
self::INVALID_CONTROL_OR_NONCHARACTERS => 'Invalid Control or Non-character; removing',
self::UNEXPECTED_XMLNS_ATTRIBUTE_VALUE => 'Unexpected xmlns attribute value',
self::ENTITY_UNEXPECTED_CHARACTER => 'Unexpected "%s" character in entity; %s expected',
self::INVALID_NUMERIC_ENTITY => '"%s" is an invalid numeric entity',
self::INVALID_NAMED_ENTITY => '"%s" is an invalid name for an entity',
self::INVALID_CODEPOINT => '"%s" is an invalid character codepoint'
];
public function __construct(Data $data) {
$this->data = $data;

210
lib/Tokenizer.php

@ -76,6 +76,74 @@ class Tokenizer {
const BOGUS_DOCTYPE_STATE = 63;
const CDATA_SECTION_STATE = 64;
/**
 * Human-readable names of the tokenizer states, keyed by the *_STATE
 * constants of this class.
 *
 * createToken() looks the current state up here to print a "State: ..."
 * line when debugging is enabled, and treats a missing (or empty) entry as
 * an unknown state. Every state constant therefore needs a distinct,
 * non-empty label in this table.
 */
const STATE_NAMES = [
    self::DATA_STATE => "Data",
    self::RCDATA_STATE => "RCDATA",
    self::RAWTEXT_STATE => "RAWTEXT",
    self::SCRIPT_DATA_STATE => "Script data",
    self::PLAINTEXT_STATE => "PLAINTEXT",
    self::TAG_OPEN_STATE => "Tag open",
    self::END_TAG_OPEN_STATE => "End tag open",
    self::TAG_NAME_STATE => "Tag name",
    self::RCDATA_LESS_THAN_SIGN_STATE => "RCDATA less-than sign",
    self::RCDATA_END_TAG_OPEN_STATE => "RCDATA end tag open",
    self::RCDATA_END_TAG_NAME_STATE => "RCDATA end tag name",
    self::RAWTEXT_LESS_THAN_SIGN_STATE => "RAWTEXT less than sign",
    self::RAWTEXT_END_TAG_OPEN_STATE => "RAWTEXT end tag open",
    self::RAWTEXT_END_TAG_NAME_STATE => "RAWTEXT end tag name",
    self::SCRIPT_DATA_LESS_THAN_SIGN_STATE => "Script data less-than sign",
    self::SCRIPT_DATA_END_TAG_OPEN_STATE => "Script data end tag open",
    self::SCRIPT_DATA_END_TAG_NAME_STATE => "Script data end tag name",
    self::SCRIPT_DATA_ESCAPE_START_STATE => "Script data escape start",
    self::SCRIPT_DATA_ESCAPE_START_DASH_STATE => "Script data escape start dash",
    self::SCRIPT_DATA_ESCAPED_STATE => "Script data escaped",
    self::SCRIPT_DATA_ESCAPED_DASH_STATE => "Script data escaped dash",
    self::SCRIPT_DATA_ESCAPED_DASH_DASH_STATE => "Script data escaped dash dash",
    self::SCRIPT_DATA_ESCAPED_LESS_THAN_SIGN_STATE => "Script data escaped less-than sign",
    self::SCRIPT_DATA_ESCAPED_END_TAG_OPEN_STATE => "Script data escaped end tag open",
    self::SCRIPT_DATA_ESCAPED_END_TAG_NAME_STATE => "Script data escaped end tag name",
    self::SCRIPT_DATA_DOUBLE_ESCAPE_START_STATE => "Script data double escape start",
    self::SCRIPT_DATA_DOUBLE_ESCAPED_STATE => "Script data double escaped",
    self::SCRIPT_DATA_DOUBLE_ESCAPED_DASH_STATE => "Script data double escaped dash",
    self::SCRIPT_DATA_DOUBLE_ESCAPED_DASH_DASH_STATE => "Script data double escaped dash dash",
    self::SCRIPT_DATA_DOUBLE_ESCAPED_LESS_THAN_SIGN_STATE => "Script data double escaped less-than sign",
    self::SCRIPT_DATA_DOUBLE_ESCAPE_END_STATE => "Script data double escape end",
    // Was "Before attribute"; the sibling states are "Attribute name" and
    // "After attribute name", so the label was missing "name".
    self::BEFORE_ATTRIBUTE_NAME_STATE => "Before attribute name",
    self::ATTRIBUTE_NAME_STATE => "Attribute name",
    self::AFTER_ATTRIBUTE_NAME_STATE => "After attribute name",
    self::BEFORE_ATTRIBUTE_VALUE_STATE => "Before attribute value",
    self::ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE => "Attribute value (double quoted)",
    self::ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE => "Attribute value (single quoted)",
    self::ATTRIBUTE_VALUE_UNQUOTED_STATE => "Attribute value (unquoted)",
    self::AFTER_ATTRIBUTE_VALUE_QUOTED_STATE => "After attribute value (quoted)",
    self::SELF_CLOSING_START_TAG_STATE => "Self-closing start tag",
    self::BOGUS_COMMENT_STATE => "Bogus comment",
    self::MARKUP_DECLARATION_OPEN_STATE => "Markup declaration open",
    self::COMMENT_START_STATE => "Comment start",
    self::COMMENT_START_DASH_STATE => "Comment start dash",
    self::COMMENT_STATE => "Comment",
    self::COMMENT_END_DASH_STATE => "Comment end dash",
    self::COMMENT_END_STATE => "Comment end",
    self::COMMENT_END_BANG_STATE => "Comment end bang",
    self::DOCTYPE_STATE => "DOCTYPE",
    self::BEFORE_DOCTYPE_NAME_STATE => "Before DOCTYPE name",
    self::DOCTYPE_NAME_STATE => "DOCTYPE name",
    self::AFTER_DOCTYPE_NAME_STATE => "After DOCTYPE name",
    self::AFTER_DOCTYPE_PUBLIC_KEYWORD_STATE => "After DOCTYPE public keyword",
    self::BEFORE_DOCTYPE_PUBLIC_IDENTIFIER_STATE => "Before DOCTYPE public identifier",
    self::DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED_STATE => "DOCTYPE public identifier (double quoted)",
    self::DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED_STATE => "DOCTYPE public identifier (single quoted)",
    self::AFTER_DOCTYPE_PUBLIC_IDENTIFIER_STATE => "After DOCTYPE public identifier",
    self::BETWEEN_DOCTYPE_PUBLIC_AND_SYSTEM_IDENTIFIERS_STATE => "Between DOCTYPE public and system identifiers",
    self::AFTER_DOCTYPE_SYSTEM_KEYWORD_STATE => "After DOCTYPE system keyword",
    self::BEFORE_DOCTYPE_SYSTEM_IDENTIFIER_STATE => "Before DOCTYPE system identifier",
    self::DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE => "DOCTYPE system identifier (double-quoted)",
    self::DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE => "DOCTYPE system identifier (single-quoted)",
    self::AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE => "After DOCTYPE system identifier",
    // Was "Bogus comment" (copy-paste of BOGUS_COMMENT_STATE's label), which
    // made the two bogus states indistinguishable in debug output.
    self::BOGUS_DOCTYPE_STATE => "Bogus DOCTYPE",
    self::CDATA_SECTION_STATE => "CDATA section",
];
// Ctype constants
const CTYPE_ALPHA = 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz';
const CTYPE_UPPER = 'ABCDEFGHIJKLMNOPQRSTUVWXYZ';
@ -89,141 +157,13 @@ class Tokenizer {
public function createToken(): Token {
while (true) {
if (self::$debug) {
switch ($this->state) {
case self::DATA_STATE: $state = "Data";
break;
case self::RCDATA_STATE: $state = "RCDATA";
break;
case self::RAWTEXT_STATE: $state = "RAWTEXT";
break;
case self::SCRIPT_DATA_STATE: $state = "Script data";
break;
case self::PLAINTEXT_STATE: $state = "PLAINTEXT";
break;
case self::TAG_OPEN_STATE: $state = "Tag open";
break;
case self::END_TAG_OPEN_STATE: $state = "End tag open";
break;
case self::TAG_NAME_STATE: $state = "Tag name";
break;
case self::RCDATA_LESS_THAN_SIGN_STATE: $state = "RCDATA less-than sign";
break;
case self::RCDATA_END_TAG_OPEN_STATE: $state = "RCDATA end tag open";
break;
case self::RCDATA_END_TAG_NAME_STATE: $state = "RCDATA end tag name";
break;
case self::RAWTEXT_LESS_THAN_SIGN_STATE: $state = "RAWTEXT less than sign";
break;
case self::RAWTEXT_END_TAG_OPEN_STATE: $state = "RAWTEXT end tag open";
break;
case self::RAWTEXT_END_TAG_NAME_STATE: $state = "RAWTEXT end tag name";
break;
case self::SCRIPT_DATA_LESS_THAN_SIGN_STATE: $state = "Script data less-than sign";
break;
case self::SCRIPT_DATA_END_TAG_OPEN_STATE: $state = "Script data end tag open";
break;
case self::SCRIPT_DATA_END_TAG_NAME_STATE: $state = "Script data end tag name";
break;
case self::SCRIPT_DATA_ESCAPE_START_STATE: $state = "Script data escape start";
break;
case self::SCRIPT_DATA_ESCAPE_START_DASH_STATE: $state = "Script data escape start dash";
break;
case self::SCRIPT_DATA_ESCAPED_STATE: $state = "Script data escaped";
break;
case self::SCRIPT_DATA_ESCAPED_DASH_STATE: $state = "Script data escaped dash";
break;
case self::SCRIPT_DATA_ESCAPED_DASH_DASH_STATE: $state = "Script data escaped dash dash";
break;
case self::SCRIPT_DATA_ESCAPED_LESS_THAN_SIGN_STATE: $state = "Script data escaped less-than sign";
break;
case self::SCRIPT_DATA_ESCAPED_END_TAG_OPEN_STATE: $state = "Script data escaped end tag open";
break;
case self::SCRIPT_DATA_ESCAPED_END_TAG_NAME_STATE: $state = "Script data escaped end tag name";
break;
case self::SCRIPT_DATA_DOUBLE_ESCAPE_START_STATE: $state = "Script data double escape start";
break;
case self::SCRIPT_DATA_DOUBLE_ESCAPED_STATE: $state = "Script data double escaped";
break;
case self::SCRIPT_DATA_DOUBLE_ESCAPED_DASH_STATE: $state = "Script data double escaped dash";
break;
case self::SCRIPT_DATA_DOUBLE_ESCAPED_DASH_DASH_STATE: $state = "Script data double escaped dash dash";
break;
case self::SCRIPT_DATA_DOUBLE_ESCAPED_LESS_THAN_SIGN_STATE: $state = "Script data double escaped less-than sign";
break;
case self::SCRIPT_DATA_DOUBLE_ESCAPE_END_STATE: $state = "Script data double escape end";
break;
case self::BEFORE_ATTRIBUTE_NAME_STATE: $state = "Before attribute";
break;
case self::ATTRIBUTE_NAME_STATE: $state = "Attribute name";
break;
case self::AFTER_ATTRIBUTE_NAME_STATE: $state = "After attribute name";
break;
case self::BEFORE_ATTRIBUTE_VALUE_STATE: $state = "Before attribute value";
break;
case self::ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE: $state = "Attribute value (double quoted)";
break;
case self::ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE: $state = "Attribute value (single quoted)";
break;
case self::ATTRIBUTE_VALUE_UNQUOTED_STATE: $state = "Attribute value (unquoted)";
break;
case self::AFTER_ATTRIBUTE_VALUE_QUOTED_STATE: $state = "After attribute value (quoted)";
break;
case self::SELF_CLOSING_START_TAG_STATE: $state = "Self-closing start tag";
break;
case self::BOGUS_COMMENT_STATE: $state = "Bogus comment";
break;
case self::MARKUP_DECLARATION_OPEN_STATE: $state = "Markup declaration open";
break;
case self::COMMENT_START_STATE: $state = "Comment start";
break;
case self::COMMENT_START_DASH_STATE: $state = "Comment start dash";
break;
case self::COMMENT_STATE: $state = "Comment";
break;
case self::COMMENT_END_DASH_STATE: $state = "Comment end dash";
break;
case self::COMMENT_END_STATE: $state = "Comment end";
break;
case self::COMMENT_END_BANG_STATE: $state = "Comment end bang";
break;
case self::DOCTYPE_STATE: $state = "DOCTYPE";
break;
case self::BEFORE_DOCTYPE_NAME_STATE: $state = "Before DOCTYPE name";
break;
case self::DOCTYPE_NAME_STATE: $state = "DOCTYPE name";
break;
case self::AFTER_DOCTYPE_NAME_STATE: $state = "After DOCTYPE name";
break;
case self::AFTER_DOCTYPE_PUBLIC_KEYWORD_STATE: $state = "After DOCTYPE public keyword";
break;
case self::BEFORE_DOCTYPE_PUBLIC_IDENTIFIER_STATE: $state = "Before DOCTYPE public identifier";
break;
case self::DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED_STATE: $state = "DOCTYPE public identifier (double quoted)";
break;
case self::DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED_STATE: $state = "DOCTYPE public identifier (single quoted)";
break;
case self::AFTER_DOCTYPE_PUBLIC_IDENTIFIER_STATE: $state = "After DOCTYPE public identifier";
break;
case self::BETWEEN_DOCTYPE_PUBLIC_AND_SYSTEM_IDENTIFIERS_STATE: $state = "Between DOCTYPE public and system identifiers";
break;
case self::AFTER_DOCTYPE_SYSTEM_KEYWORD_STATE: $state = "After DOCTYPE system keyword";
break;
case self::BEFORE_DOCTYPE_SYSTEM_IDENTIFIER_STATE: $state = "Before DOCTYPE system identifier";
break;
case self::DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE: $state = "DOCTYPE system identifier (double-quoted)";
break;
case self::DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE: $state = "DOCTYPE system identifier (single-quoted)";
break;
case self::AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE: $state = "After DOCTYPE system identifier";
break;
case self::BOGUS_DOCTYPE_STATE: $state = "Bogus comment";
break;
case self::CDATA_SECTION_STATE: $state = "CDATA section";
break;
default: throw new Exception(Exception::UNKNOWN_ERROR);
}
echo "State: $state\n";
$state = self::STATE_NAMES[$this->state] ?? "";
if ($state) {
echo "State: $state\n";
unset($state);
} else {
throw new Exception(Exception::UNKNOWN_ERROR);
}
}
# 12.2.4.1 Data state

Loading…
Cancel
Save