Character consumption cleanup

- Newline normalization now done on-the-fly - Consequently, original input string is used as-is - Byte order mark is not supposed to be skipped - Use more straightforward method of tracking column position - Simplify backtracking when spanning - Genericize character interpretation: this will be expanded to emit illegal-character parse errors when appropriate
5 years ago · 9560358021
1 changed files with 69 additions and 58 deletions
--- a/lib/Data.php
+++ b/lib/Data.php
@ -14,8 +14,9 @@ class Data {
    protected $_line = 1;
    // Used for error reporting to display column number.
    protected $_column = 0;
-    // Used for error reporting when unconsuming to calculate column number from
-    // last newline.
+    // array of normalized CR+LF pairs, denoted by the character offset of the LF
+    protected $normalized = [];
+    // Holds the character position and column number of each newline
    protected $newlines = [];
    // Whether the EOF imaginary character has been consumed
    protected $eof = false;
@ -44,62 +45,82 @@ class Data {
        // encoding. At this moment this implementation won't determine a character
        // encoding and will just assume UTF-8.

-
-        // Normalize line breaks. Convert CRLF and CR to LF.
-        // Break the string up into a traversable object.
-        $this->data = new \MensBeam\Intl\Encoding\UTF8(str_replace(["\r\n", "\r"], "\n", $data), false, true);
-
-        # One leading U+FEFF BYTE ORDER MARK character must be ignored if any are present
-        # in the input stream.
-
-        if ($this->data->nextChar() !== '\xEF\xBB\xBF') {
-            // rewind to the start of the string if the first character was not a BOM
-            $this->data->rewind();
-        }
+        $this->data = new \MensBeam\Intl\Encoding\UTF8($data, false, true);
    }

-    public function consume(int $length = 1): string {
+    public function consume(int $length = 1, $advancePointer = true): string {
        assert($length > 0, new Exception(Exception::DATA_INVALID_DATA_CONSUMPTION_LENGTH, $length));

        for ($i = 0, $string = ''; $i < $length; $i++) {
            $char = $this->data->nextChar();

-            if ($char === "\n") {
-                $this->newlines[] = $this->data->posChar();
-                $this->_column = 1;
-                $this->_line++;
-            } else {
-                $this->_column++;
+            # Before the tokenization stage, the input stream must be 
+            #   preprocessed by normalizing newlines.
+            # Thus, newlines in HTML DOMs are represented by U+000A LF characters, 
+            #   and there are never any U+000D CR characters in the input to the tokenization stage.
+            if ($char === "\r") {
+                // if this is a CR+LF pair, skip the CR and note the normalization
+                if ($this->data->peekChar() === "\n") {
+                    $char = $this->data->nextChar();
+                    $this->normalized[$this->data->posChar()] = true;
+                }
+                // otherwise just silently change the character to LF; 
+                // the bare CR will be trivial to process when seeking backwards
+                else {
+                    $char = "\n";
+                }
            }
-
+            // append the character to the output string
            $string .= $char;
+            // unless we're peeking, track line and column position, and whether we've hit EOF
+            if ($advancePointer) {
+                if (!$this->checkChar($char)) {
+                    break;
+                }
+            }
        }
+        return $string;
+    }

-        if ($char === '') {
+    protected function checkChar(string $char): bool {
+        if ($char === "\n") {
+            $this->newlines[$this->data->posChar()] = $this->_column;
+            $this->_column = 1;
+            $this->_line++;
+        } elseif ($char === '') {
            $this->eof = true;
+            $this->_column++;
+            return false;
+        } else {
+            $this->_column++;
        }
-
-        return $string;
+        return true;
    }

-    public function unconsume(int $length = 1) {
+    public function unconsume(int $length = 1, bool $retreatPointer = true): void {
        assert($length > 0, new Exception(Exception::DATA_INVALID_DATA_CONSUMPTION_LENGTH, $length));

-        if (!$this->eof) {
-            $this->data->seek(0 - $length);
-
-            $string = $this->data->peekChar($length);
-            $numOfNewlines = substr_count($string, "\n");
-
-            if ($numOfNewlines > 0) {
-                $this->_line -= $numOfNewlines;
-
-                $count = $this->newlines;
-                $index = count($this->newlines) - ($numOfNewlines - 1);
-                $this->_column = 1 + (($count > 0 && isset($this->newlines[$index])) ? $this->data->posChar() - $this->newlines[$index] : $this->data->posChar());
-            } else {
-                $this->_column -= $length;
+        if ($this->eof) {
+            $length--;
+            $this->eof = false;
+        }
+        while ($length-- > 0) {
+            $here = $this->data->posChar();
+            // if the previous character was a normalized CR+LF pair, we need to go back two
+            if (isset($this->normalized[$here])) {
+                $this->data->seek(-1);
+            }
+            // recalculate line and column positions, if requested
+            if ($retreatPointer) {
+                $col = $this->newlines[$here] ?? 0;
+                if ($col) {
+                    $this->_column = $col;
+                    $this->_line--;
+                } else {
+                    $this->_column--;
+                }
            }
+            $this->data->seek(-1);
        }
    }

@ -131,11 +152,11 @@ class Data {
        // Break the matching characters into an array of characters. Unicode friendly.
        $match = preg_split('/(?<!^)(?!$)/Su', $match);

+        $start = $this->data->posChar();
        $count = 0;
        $string = '';
        while (true) {
-            $char = $this->data->nextChar();
-            $count++;
+            $char = $this->consume(1, false);

            if ($char === '') {
                break;
@ -145,38 +166,28 @@ class Data {

            // strspn
            if ($while && !$inArray) {
+                $this->unconsume(1, false);
                break;
            }
            // strcspn
            elseif (!$while && $inArray) {
+                $this->unconsume(1, false);
                break;
            }

            if ($advancePointer) {
-                if ($char === "\n") {
-                    $this->newlines[] = $this->data->posChar();
-                    $this->_column = 1;
-                    $this->_line++;
-                } else {
-                    $this->_column++;
-                }
+                $this->checkChar($char);
            }

+            $count++;
            $string .= $char;
            if ($count === $limit) {
                break;
            }
        }

-        // If the end (or limit) is reached the pointer isn't moved when the last character
-        // is checked, so it only needs to be moved backwards if not wanting the
-        // pointer to move.
-        if ($char === '' || $count === $limit) {
-            if (!$advancePointer) {
-                $this->data->seek(0 - $count - 1);
-            }
-        } else {
-            $this->data->seek(($advancePointer) ? -1 : 0 - $count - 2);
+        if (!$advancePointer && $count) {
+            $this->data->seek(-($this->data->posChar - $start));
        }

        return $string;