From 33363ab2d31ed9a8a77024245c0781da45d754c7 Mon Sep 17 00:00:00 2001 From: Dustin Wilson Date: Mon, 27 Aug 2018 14:57:47 -0500 Subject: [PATCH] Fixed Data bug MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit • Fixed bug where Data::consumeWhile and Data::consumeUntil wouldn't move the pointer back one position if there were no matches. • Changed DataStream to Data. • Made each class have its own debug static property so each can print debug information separately. --- lib/{DataStream.php => Data.php} | 58 +++++++++++++++++++++++++++----- lib/Exception.php | 4 +-- lib/ParseError.php | 2 +- lib/Parser.php | 7 ++-- lib/Tokenizer.php | 7 ++-- lib/TreeBuilder.php | 42 ++++++++++++++--------- 6 files changed, 86 insertions(+), 34 deletions(-) rename lib/{DataStream.php => Data.php} (95%) diff --git a/lib/DataStream.php b/lib/Data.php similarity index 95% rename from lib/DataStream.php rename to lib/Data.php index f293779..08c9a8f 100644 --- a/lib/DataStream.php +++ b/lib/Data.php @@ -2,7 +2,7 @@ declare(strict_types=1); namespace dW\HTML5; -class DataStream +class Data { // Used to get the file path for error reporting. public $filePath; @@ -17,9 +17,15 @@ class DataStream // last newline. protected $newlines = []; + + // Used for debugging to print out information as data is consumed. + public static $debug = false; + + const ALPHA = 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz'; const DIGIT = '0123456789'; const HEX = '0123456789ABCDEFabcdef'; + const WHITESPACE = "\t\n\x0c\x0d "; public function __construct(string $data, string $filePath = 'STDIN') { @@ -61,7 +67,7 @@ class DataStream public function consume(int $length = 1): string { if ($length <= 0) { - throw new Exception(Exception::DATASTREAM_INVALID_DATA_CONSUMPTION_LENGTH, $length); + throw new Exception(Exception::DATA_INVALID_DATA_CONSUMPTION_LENGTH, $length); } for ($i = 0, $string = ''; $i < $length; $i++) { @@ -78,12 +84,22 @@ class DataStream $string .= $char; } + if (self::$debug) { + echo "\nConsume\n==========\n"; + echo "Length: $length\n"; + echo "Data: "; + var_export($string); + echo "\n"; + echo "Pointer: {$this->data->posChar()}\n"; + echo "==========\n\n"; + } + return $string; } public function unconsume(int $length = 1) { if ($length <= 0) { - throw new Exception(Exception::DATASTREAM_INVALID_DATA_CONSUMPTION_LENGTH, $length); + throw new Exception(Exception::DATA_INVALID_DATA_CONSUMPTION_LENGTH, $length); } $this->data->seek(0 - $length); @@ -100,6 +116,12 @@ class DataStream } else { $this->_column -= $length; } + + if (self::$debug) { + echo "\nUnconsume\n==========\n"; + echo "Pointer: {$this->data->posChar()}\n"; + echo "==========\n\n"; + } } public function consumeWhile(string $match, int $limit = 0): string { @@ -112,10 +134,21 @@ class DataStream public function peek(int $length = 1): string { if ($length <= 0) { - throw new Exception(Exception::DATASTREAM_INVALID_DATA_CONSUMPTION_LENGTH, $length); + throw new Exception(Exception::DATA_INVALID_DATA_CONSUMPTION_LENGTH, $length); } - return $this->data->peekChar($length); + $string = $this->data->peekChar($length); + + if (self::$debug) { + echo "\nPeek\n==========\n"; + echo "Data: "; + var_export($string); + echo "\n"; + echo "Pointer: {$this->data->posChar()}\n"; + echo "==========\n\n"; + } + + return $string; } public function peekWhile(string $match, int $limit = 0): string { @@ -437,11 +470,18 @@ class DataStream } } - if ($count === 0) { - return ''; + $this->data->seek(($advancePointer) ? -1 : 0 - $count - 2); + + if (self::$debug) { + echo ($advancePointer) ? "\nconsume" : "\npeek"; + echo ($while) ? 'While' : 'Until'; + echo "\n==========\nPattern: "; + var_export(str_replace(["\t", "\n", "\x0c", "\x0d"], ['\t', '\n', '\x0c', '\x0d'], implode('', $match))); + echo "\nData: "; + var_export($string); + echo "\nPointer: {$this->data->posChar()}\n==========\n\n"; } - $this->data->seek(($advancePointer) ? -1 : 0 - $count - 2); return $string; } @@ -451,6 +491,8 @@ class DataStream break; case 'line': return $this->_line; break; + case 'pointer': return $this->data->posChar(); + break; default: return null; } } diff --git a/lib/Exception.php b/lib/Exception.php index 86e82e9..c89a47a 100644 --- a/lib/Exception.php +++ b/lib/Exception.php @@ -15,8 +15,8 @@ class Exception extends \Exception { const STACK_DOMNODE_ONLY = 10202; const STACK_FRAGMENT_CONTEXT_DOMELEMENT_DOMDOCUMENT_DOMDOCUMENTFRAG_EXPECTED = 10203; - const DATASTREAM_NODATA = 10301; - const DATASTREAM_INVALID_DATA_CONSUMPTION_LENGTH = 10302; + const DATA_NODATA = 10301; + const DATA_INVALID_DATA_CONSUMPTION_LENGTH = 10302; const DOM_DOMDOCUMENT_EXPECTED = 10401; const DOM_DOMELEMENT_STRING_OR_CLOSURE_EXPECTED = 10402; diff --git a/lib/ParseError.php b/lib/ParseError.php index 8f99dab..8a3a976 100644 --- a/lib/ParseError.php +++ b/lib/ParseError.php @@ -37,7 +37,7 @@ class ParseError { '"%s" is an invalid name for an entity', '"%s" is an invalid character codepoint']; - public function __construct(DataStream $data) { + public function __construct(Data $data) { $this->data = $data; // Set the error handler and honor already-set error reporting rules. diff --git a/lib/Parser.php b/lib/Parser.php index 4deee2c..5581a56 100644 --- a/lib/Parser.php +++ b/lib/Parser.php @@ -5,7 +5,7 @@ namespace dW\HTML5; class Parser { /* Non-static properties */ - // Input data that's being parsed, uses DataStream + // Input data that's being parsed, uses Data protected $data; // The DOMDocument that is assembled by the tree builder protected $DOM; @@ -34,9 +34,6 @@ class Parser { /* Static properties */ - // For debugging - public static $debug = false; - // Property used as an instance for the non-static properties protected static $instance; @@ -72,7 +69,7 @@ class Parser { } // Process the input stream. - static::$instance->data = new DataStream(($file === true) ? '' : $data, ($file === true) ? $data : 'STDIN'); + static::$instance->data = new Data(($file === true) ? '' : $data, ($file === true) ? $data : 'STDIN'); // Set the locale for CTYPE to en_US.UTF8 so ctype functions and strtolower only // work on basic latin characters. Used extensively when tokenizing. diff --git a/lib/Tokenizer.php b/lib/Tokenizer.php index 0ae732d..433da0d 100644 --- a/lib/Tokenizer.php +++ b/lib/Tokenizer.php @@ -8,6 +8,8 @@ class Tokenizer { protected $data; protected $stack; + public static $debug = false; + const DATA_STATE = 0; const RCDATA_STATE = 1; const RAWTEXT_STATE = 2; @@ -78,7 +80,7 @@ class Tokenizer { const CTYPE_ALPHA = 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz'; const CTYPE_UPPER = 'ABCDEFGHIJKLMNOPQRSTUVWXYZ'; - public function __construct(DataStream $data, OpenElementsStack $stack) { + public function __construct(Data $data, OpenElementsStack $stack) { $this->state = self::DATA_STATE; $this->data = $data; $this->stack = $stack; @@ -86,7 +88,7 @@ class Tokenizer { public function createToken(): Token { while (true) { - if (Parser::$debug) { + if (self::$debug) { switch ($this->state) { case self::DATA_STATE: $state = "Data"; break; @@ -260,6 +262,7 @@ class Tokenizer { // OPTIMIZATION: Consume all characters that don't match what is above and emit // that as a character token instead to prevent having to loop back through here // every single time. + return new CharacterToken($char.$this->data->consumeUntil('&<')); } } diff --git a/lib/TreeBuilder.php b/lib/TreeBuilder.php index c5db58f..ddb9c59 100644 --- a/lib/TreeBuilder.php +++ b/lib/TreeBuilder.php @@ -42,6 +42,8 @@ class TreeBuilder { // Instance used with the static token insertion methods. protected static $instance; + // Used for debugging to print out information as the tree is built. + protected static $debug = false; // Constants used for insertion modes @@ -119,8 +121,11 @@ class TreeBuilder { $adjustedCurrentNodeName = $this->stack->adjustedCurrentNodeName; $adjustedCurrentNodeNamespace = $this->stack->adjustedCurrentNodeNamespace; - if (Parser::$debug) { + if (self::$debug) { echo "Node: $adjustedCurrentNodeName\n"; + echo "\nToken: \n"; + var_export($token); + echo "\n\n"; } # 8.2.5 Tree construction @@ -175,11 +180,6 @@ class TreeBuilder { } } - # TEMPORARY - echo "\n"; - var_export($token); - echo "\n\n"; - break; } } @@ -189,7 +189,7 @@ class TreeBuilder { // Loop used when processing the token under different rules; always breaks. while (true) { - if (Parser::$debug) { + if (self::$debug) { switch ($insertionMode) { case self::INITIAL_MODE: $mode = "Initial"; break; @@ -251,7 +251,7 @@ class TreeBuilder { # (LF), U+000C FORM FEED (FF), U+000D CARRIAGE RETURN (CR), or U+0020 SPACE // OPTIMIZATION: Will check for multiple space characters at once as character // tokens can contain more than one character. - if ($token instanceof CharacterToken && (strspn($token->data, "\t\n\x0c\x0d ") !== strlen($token->data))) { + if ($token instanceof CharacterToken && (strspn($token->data, Data::WHITESPACE) === strlen($token->data))) { # Ignore the token. } # A comment token @@ -425,7 +425,7 @@ class TreeBuilder { # (LF), U+000C FORM FEED (FF), U+000D CARRIAGE RETURN (CR), or U+0020 SPACE // OPTIMIZATION: Will check for multiple space characters at once as character // tokens can contain more than one character. - elseif ($token instanceof CharacterToken && (strspn($token->data, "\t\n\x0c\x0d ") === strlen($token->data))) { + elseif ($token instanceof CharacterToken && (strspn($token->data, Data::WHITESPACE) === strlen($token->data))) { # Ignore the token. } # A start tag whose tag name is "html" @@ -468,7 +468,9 @@ class TreeBuilder { case self::BEFORE_HEAD_MODE: # A character token that is one of U+0009 CHARACTER TABULATION, U+000A LINE FEED # (LF), U+000C FORM FEED (FF), U+000D CARRIAGE RETURN (CR), or U+0020 SPACE - if ($token instanceof CharacterToken && (strspn($token->data, "\t\n\x0c\x0d ") === strlen($token->data))) { + // OPTIMIZATION: Will check for multiple space characters at once as character + // tokens can contain more than one character. + if ($token instanceof CharacterToken && (strspn($token->data, Data::WHITESPACE) === strlen($token->data))) { # Ignore the token. } # A comment token @@ -524,7 +526,9 @@ class TreeBuilder { case self::IN_HEAD_MODE: # A character token that is one of U+0009 CHARACTER TABULATION, U+000A LINE FEED # (LF), U+000C FORM FEED (FF), U+000D CARRIAGE RETURN (CR), or U+0020 SPACE - if ($token instanceof CharacterToken && (strspn($token->data, "\t\n\x0c\x0d ") !== strlen($token->data))) { + // OPTIMIZATION: Will check for multiple space characters at once as character + // tokens can contain more than one character. + if ($token instanceof CharacterToken && (strspn($token->data, Data::WHITESPACE) === strlen($token->data))) { # Insert the character. $this->insertCharacterToken($token); } @@ -813,7 +817,9 @@ class TreeBuilder { # A character token that is one of U+0009 CHARACTER TABULATION, U+000A LINE FEED # (LF), U+000C FORM FEED (FF), U+000D CARRIAGE RETURN (CR), or U+0020 SPACE # A comment token - elseif (($token instanceof CharacterToken && (strspn($token->data, "\t\n\x0c\x0d ") === strlen($token->data))) || + // OPTIMIZATION: Will check for multiple space characters at once as character + // tokens can contain more than one character. + elseif (($token instanceof CharacterToken && (strspn($token->data, Data::WHITESPACE) === strlen($token->data))) || $token instanceof CommentToken) { # Process the token using the rules for the "in head" insertion mode. $insertionMode = self::IN_HEAD_MODE; @@ -838,7 +844,9 @@ class TreeBuilder { case self::AFTER_HEAD_MODE: # A character token that is one of U+0009 CHARACTER TABULATION, U+000A LINE FEED # (LF), U+000C FORM FEED (FF), U+000D CARRIAGE RETURN (CR), or U+0020 SPACE - if ($token instanceof CharacterToken && (strspn($token->data, "\t\n\x0c\x0d ") === strlen($token->data))) { + // OPTIMIZATION: Will check for multiple space characters at once as character + // tokens can contain more than one character. + if ($token instanceof CharacterToken && (strspn($token->data, Data::WHITESPACE) === strlen($token->data))) { # Insert the character. $this->insertCharacterToken($token); } @@ -962,7 +970,9 @@ class TreeBuilder { # Insert the token’s character. $this->insertCharacterToken($token); - if (strspn($token->data, "\t\n\x0c\x0d ") !== strlen($token->data)) { + // OPTIMIZATION: Will check for multiple space characters at once as character + // tokens can contain more than one character. + if (strspn($token->data, Data::WHITESPACE) !== strlen($token->data)) { # Set the frameset-ok flag to "not ok". $this->framesetOk = false; } @@ -1074,7 +1084,7 @@ class TreeBuilder { } protected function parseTokenInForeignContent(Token $token): bool { - if (Parser::$debug) { + if (self::$debug) { echo "Foreign Content\n"; } @@ -1092,7 +1102,7 @@ class TreeBuilder { # Any other character token // OPTIMIZATION: Will check for multiple space characters at once as character // tokens can contain more than one character. - if (strspn($token->data, "\t\n\x0c\x0d ") !== strlen($token->data)) { + if (strspn($token->data, Data::WHITESPACE) !== strlen($token->data)) { # Set the frameset-ok flag to "not ok". $this->$framesetOk = false; }