Fixed Data bug

• Fixed a bug where Data::consumeWhile and Data::consumeUntil wouldn't move the pointer back one position when there were no matches.
• Renamed the DataStream class to Data.
• Gave each class its own static debug property so that each one can print its debug information separately.
6 years ago · 33363ab2d3
6 changed files with 86 additions and 34 deletions
--- a/lib/DataStream.php
+++ b/lib/DataStream.php
@ -2,7 +2,7 @@
 declare(strict_types=1);
 namespace dW\HTML5;
-class DataStream
+class Data
 {
    // Used to get the file path for error reporting.
    public $filePath;
@ -17,9 +17,15 @@ class DataStream
    // last newline.
    protected $newlines = [];
    // Used for debugging to print out information as data is consumed.
    public static $debug = false;
    const ALPHA = 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz';
    const DIGIT = '0123456789';
    const HEX = '0123456789ABCDEFabcdef';
    const WHITESPACE = "\t\n\x0c\x0d ";
    public function __construct(string $data, string $filePath = 'STDIN') {
@ -61,7 +67,7 @@ class DataStream
    public function consume(int $length = 1): string {
        if ($length <= 0) {
-            throw new Exception(Exception::DATASTREAM_INVALID_DATA_CONSUMPTION_LENGTH, $length);
+            throw new Exception(Exception::DATA_INVALID_DATA_CONSUMPTION_LENGTH, $length);
        }
        for ($i = 0, $string = ''; $i < $length; $i++) {
@ -78,12 +84,22 @@ class DataStream
            $string .= $char;
        }
        if (self::$debug) {
            echo "\nConsume\n==========\n";
            echo "Length: $length\n";
            echo "Data: ";
            var_export($string);
            echo "\n";
            echo "Pointer: {$this->data->posChar()}\n";
            echo "==========\n\n";
        }
        return $string;
    }
    public function unconsume(int $length = 1) {
        if ($length <= 0) {
-            throw new Exception(Exception::DATASTREAM_INVALID_DATA_CONSUMPTION_LENGTH, $length);
+            throw new Exception(Exception::DATA_INVALID_DATA_CONSUMPTION_LENGTH, $length);
        }
        $this->data->seek(0 - $length);
@ -100,6 +116,12 @@ class DataStream
        } else {
            $this->_column -= $length;
        }
        if (self::$debug) {
            echo "\nUnconsume\n==========\n";
            echo "Pointer: {$this->data->posChar()}\n";
            echo "==========\n\n";
        }
    }
    public function consumeWhile(string $match, int $limit = 0): string {
@ -112,10 +134,21 @@ class DataStream
    public function peek(int $length = 1): string {
        if ($length <= 0) {
-            throw new Exception(Exception::DATASTREAM_INVALID_DATA_CONSUMPTION_LENGTH, $length);
+            throw new Exception(Exception::DATA_INVALID_DATA_CONSUMPTION_LENGTH, $length);
        }
-        return $this->data->peekChar($length);
+        $string = $this->data->peekChar($length);
        if (self::$debug) {
            echo "\nPeek\n==========\n";
            echo "Data: ";
            var_export($string);
            echo "\n";
            echo "Pointer: {$this->data->posChar()}\n";
            echo "==========\n\n";
        }
        return $string;
    }
    public function peekWhile(string $match, int $limit = 0): string {
@ -437,11 +470,18 @@ class DataStream
            }
        }
-        if ($count === 0) {
+        $this->data->seek(($advancePointer) ? -1 : 0 - $count - 2);
-            return '';
+
        if (self::$debug) {
            echo ($advancePointer) ? "\nconsume" : "\npeek";
            echo ($while) ? 'While' : 'Until';
            echo "\n==========\nPattern: ";
            var_export(str_replace(["\t", "\n", "\x0c", "\x0d"], ['\t', '\n', '\x0c', '\x0d'], implode('', $match)));
            echo "\nData: ";
            var_export($string);
            echo "\nPointer: {$this->data->posChar()}\n==========\n\n";
        }
        $this->data->seek(($advancePointer) ? -1 : 0 - $count - 2);
        return $string;
    }
@ -451,6 +491,8 @@ class DataStream
            break;
            case 'line': return $this->_line;
            break;
            case 'pointer': return $this->data->posChar();
            break;
            default: return null;
        }
    }
--- a/lib/Exception.php
+++ b/lib/Exception.php
@ -15,8 +15,8 @@ class Exception extends \Exception {
    const STACK_DOMNODE_ONLY = 10202;
    const STACK_FRAGMENT_CONTEXT_DOMELEMENT_DOMDOCUMENT_DOMDOCUMENTFRAG_EXPECTED = 10203;
-    const DATASTREAM_NODATA = 10301;
+    const DATA_NODATA = 10301;
-    const DATASTREAM_INVALID_DATA_CONSUMPTION_LENGTH = 10302;
+    const DATA_INVALID_DATA_CONSUMPTION_LENGTH = 10302;
    const DOM_DOMDOCUMENT_EXPECTED = 10401;
    const DOM_DOMELEMENT_STRING_OR_CLOSURE_EXPECTED = 10402;
--- a/lib/ParseError.php
+++ b/lib/ParseError.php
@ -37,7 +37,7 @@ class ParseError {
                                  '"%s" is an invalid name for an entity',
                                  '"%s" is an invalid character codepoint'];
-    public function __construct(DataStream $data) {
+    public function __construct(Data $data) {
        $this->data = $data;
        // Set the error handler and honor already-set error reporting rules.
--- a/lib/Parser.php
+++ b/lib/Parser.php
@ -5,7 +5,7 @@ namespace dW\HTML5;
 class Parser {
    /* Non-static properties */
-    // Input data that's being parsed, uses DataStream
+    // Input data that's being parsed, uses Data
    protected $data;
    // The DOMDocument that is assembled by the tree builder
    protected $DOM;
@ -34,9 +34,6 @@ class Parser {
    /* Static properties */
    // For debugging
    public static $debug = false;
    // Property used as an instance for the non-static properties
    protected static $instance;
@ -72,7 +69,7 @@ class Parser {
        }
        // Process the input stream.
-        static::$instance->data = new DataStream(($file === true) ? '' : $data, ($file === true) ? $data : 'STDIN');
+        static::$instance->data = new Data(($file === true) ? '' : $data, ($file === true) ? $data : 'STDIN');
        // Set the locale for CTYPE to en_US.UTF8 so ctype functions and strtolower only
        // work on basic latin characters. Used extensively when tokenizing.
--- a/lib/Tokenizer.php
+++ b/lib/Tokenizer.php
@ -8,6 +8,8 @@ class Tokenizer {
    protected $data;
    protected $stack;
    public static $debug = false;
    const DATA_STATE = 0;
    const RCDATA_STATE = 1;
    const RAWTEXT_STATE = 2;
@ -78,7 +80,7 @@ class Tokenizer {
    const CTYPE_ALPHA = 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz';
    const CTYPE_UPPER = 'ABCDEFGHIJKLMNOPQRSTUVWXYZ';
-    public function __construct(DataStream $data, OpenElementsStack $stack) {
+    public function __construct(Data $data, OpenElementsStack $stack) {
        $this->state = self::DATA_STATE;
        $this->data = $data;
        $this->stack = $stack;
@ -86,7 +88,7 @@ class Tokenizer {
    public function createToken(): Token {
        while (true) {
-            if (Parser::$debug) {
+            if (self::$debug) {
                switch ($this->state) {
                    case self::DATA_STATE: $state = "Data";
                    break;
@ -260,6 +262,7 @@ class Tokenizer {
                    // OPTIMIZATION: Consume all characters that don't match what is above and emit
                    // that as a character token instead to prevent having to loop back through here
                    // every single time.
                    return new CharacterToken($char.$this->data->consumeUntil('&<'));
                }
            }
--- a/lib/TreeBuilder.php
+++ b/lib/TreeBuilder.php
@ -42,6 +42,8 @@ class TreeBuilder {
    // Instance used with the static token insertion methods.
    protected static $instance;
    // Used for debugging to print out information as the tree is built.
    protected static $debug = false;
    // Constants used for insertion modes
@ -119,8 +121,11 @@ class TreeBuilder {
            $adjustedCurrentNodeName = $this->stack->adjustedCurrentNodeName;
            $adjustedCurrentNodeNamespace = $this->stack->adjustedCurrentNodeNamespace;
-            if (Parser::$debug) {
+            if (self::$debug) {
                echo "Node: $adjustedCurrentNodeName\n";
                echo "\nToken: \n";
                var_export($token);
                echo "\n\n";
            }
            # 8.2.5 Tree construction
@ -175,11 +180,6 @@ class TreeBuilder {
                }
            }
            # TEMPORARY
            echo "\n";
            var_export($token);
            echo "\n\n";
            break;
        }
    }
@ -189,7 +189,7 @@ class TreeBuilder {
        // Loop used when processing the token under different rules; always breaks.
        while (true) {
-            if (Parser::$debug) {
+            if (self::$debug) {
                switch ($insertionMode) {
                    case self::INITIAL_MODE: $mode = "Initial";
                    break;
@ -251,7 +251,7 @@ class TreeBuilder {
                    # (LF), U+000C FORM FEED (FF), U+000D CARRIAGE RETURN (CR), or U+0020 SPACE
                    // OPTIMIZATION: Will check for multiple space characters at once as character
                    // tokens can contain more than one character.
-                    if ($token instanceof CharacterToken && (strspn($token->data, "\t\n\x0c\x0d ") !== strlen($token->data))) {
+                    if ($token instanceof CharacterToken && (strspn($token->data, Data::WHITESPACE) === strlen($token->data))) {
                        # Ignore the token.
                    }
                    # A comment token
@ -425,7 +425,7 @@ class TreeBuilder {
                    # (LF), U+000C FORM FEED (FF), U+000D CARRIAGE RETURN (CR), or U+0020 SPACE
                    // OPTIMIZATION: Will check for multiple space characters at once as character
                    // tokens can contain more than one character.
-                    elseif ($token instanceof CharacterToken && (strspn($token->data, "\t\n\x0c\x0d ") === strlen($token->data))) {
+                    elseif ($token instanceof CharacterToken && (strspn($token->data, Data::WHITESPACE) === strlen($token->data))) {
                        # Ignore the token.
                    }
                    # A start tag whose tag name is "html"
@ -468,7 +468,9 @@ class TreeBuilder {
                case self::BEFORE_HEAD_MODE:
                    # A character token that is one of U+0009 CHARACTER TABULATION, U+000A LINE FEED
                    # (LF), U+000C FORM FEED (FF), U+000D CARRIAGE RETURN (CR), or U+0020 SPACE
-                    if ($token instanceof CharacterToken && (strspn($token->data, "\t\n\x0c\x0d ") === strlen($token->data))) {
+                    // OPTIMIZATION: Will check for multiple space characters at once as character
                    // tokens can contain more than one character.
                    if ($token instanceof CharacterToken && (strspn($token->data, Data::WHITESPACE) === strlen($token->data))) {
                        # Ignore the token.
                    }
                    # A comment token
@ -524,7 +526,9 @@ class TreeBuilder {
                case self::IN_HEAD_MODE:
                    # A character token that is one of U+0009 CHARACTER TABULATION, U+000A LINE FEED
                    # (LF), U+000C FORM FEED (FF), U+000D CARRIAGE RETURN (CR), or U+0020 SPACE
-                    if ($token instanceof CharacterToken && (strspn($token->data, "\t\n\x0c\x0d ") !== strlen($token->data))) {
+                    // OPTIMIZATION: Will check for multiple space characters at once as character
                    // tokens can contain more than one character.
                    if ($token instanceof CharacterToken && (strspn($token->data, Data::WHITESPACE) === strlen($token->data))) {
                        # Insert the character.
                        $this->insertCharacterToken($token);
                    }
@ -813,7 +817,9 @@ class TreeBuilder {
                    # A character token that is one of U+0009 CHARACTER TABULATION, U+000A LINE FEED
                    # (LF), U+000C FORM FEED (FF), U+000D CARRIAGE RETURN (CR), or U+0020 SPACE
                    # A comment token
-                    elseif (($token instanceof CharacterToken && (strspn($token->data, "\t\n\x0c\x0d ") === strlen($token->data))) ||
+                    // OPTIMIZATION: Will check for multiple space characters at once as character
                    // tokens can contain more than one character.
                    elseif (($token instanceof CharacterToken && (strspn($token->data, Data::WHITESPACE) === strlen($token->data))) ||
                         $token instanceof CommentToken) {
                        # Process the token using the rules for the "in head" insertion mode.
                        $insertionMode = self::IN_HEAD_MODE;
@ -838,7 +844,9 @@ class TreeBuilder {
                case self::AFTER_HEAD_MODE:
                    # A character token that is one of U+0009 CHARACTER TABULATION, U+000A LINE FEED
                    # (LF), U+000C FORM FEED (FF), U+000D CARRIAGE RETURN (CR), or U+0020 SPACE
-                    if ($token instanceof CharacterToken && (strspn($token->data, "\t\n\x0c\x0d ") === strlen($token->data))) {
+                    // OPTIMIZATION: Will check for multiple space characters at once as character
                    // tokens can contain more than one character.
                    if ($token instanceof CharacterToken && (strspn($token->data, Data::WHITESPACE) === strlen($token->data))) {
                        # Insert the character.
                        $this->insertCharacterToken($token);
                    }
@ -962,7 +970,9 @@ class TreeBuilder {
                        # Insert the token’s character.
                        $this->insertCharacterToken($token);
-                        if (strspn($token->data, "\t\n\x0c\x0d ") !== strlen($token->data)) {
+                        // OPTIMIZATION: Will check for multiple space characters at once as character
                        // tokens can contain more than one character.
                        if (strspn($token->data, Data::WHITESPACE) !== strlen($token->data)) {
                            # Set the frameset-ok flag to "not ok".
                            $this->framesetOk = false;
                        }
@ -1074,7 +1084,7 @@ class TreeBuilder {
    }
    protected function parseTokenInForeignContent(Token $token): bool {
-        if (Parser::$debug) {
+        if (self::$debug) {
            echo "Foreign Content\n";
        }
@ -1092,7 +1102,7 @@ class TreeBuilder {
            # Any other character token
            // OPTIMIZATION: Will check for multiple space characters at once as character
            // tokens can contain more than one character.
-            if (strspn($token->data, "\t\n\x0c\x0d ") !== strlen($token->data)) {
+            if (strspn($token->data, Data::WHITESPACE) !== strlen($token->data)) {
                # Set the frameset-ok flag to "not ok".
                $this->$framesetOk = false;
            }