From fb0441809e0922aa5689faeb5fe214607f9f6493 Mon Sep 17 00:00:00 2001
From: Dustin Wilson <dustin@dustinwilson.com>
Date: Mon, 13 Sep 2021 16:55:16 -0500
Subject: [PATCH] Minor optimizations to tokenizer

---
 lib/Data.php      | 25 ++++++++++++------------
 lib/Grammar.php   |  8 ++++++--
 lib/Tokenizer.php | 50 +++++++++++++++++++++++------------------------
 3 files changed, 44 insertions(+), 39 deletions(-)

diff --git a/lib/Data.php b/lib/Data.php
index 2d78b6a..3c40817 100644
--- a/lib/Data.php
+++ b/lib/Data.php
@@ -11,31 +11,32 @@ class Data {
     use FauxReadOnly;
     // True if on the first line
     protected bool $_firstLine = true;
-    protected \Generator $generator;
+    // The generator that yields each input line keyed by its 1-based line number
+    protected \Generator $_generator;
     // True if on the last line.
     protected bool $_lastLine = false;
     // Some matches will check for the last line before the final newline, so this
     // will be true if on the line before the final newline or if on the last line
     // if there isn't an extra newline at the end of the string.
     protected bool $_lastLineBeforeFinalNewLine = false;
+    // The input string split into an array by newline
+    protected array $lines = [];
+    // The number of lines in the lines array
+    protected int $linesLength = 0;
 
 
     public function __construct(string $data) {
-        $this->generator = $this->lineGenerator($data);
+        $this->lines = explode("\n", $data);
+        $this->linesLength = count($this->lines);
+        $this->_generator = $this->lineGenerator();
     }
 
 
-    public function get(): \Generator {
-        return $this->generator;
-    }
-
-
-    protected function lineGenerator(string $string): \Generator {
-        $string = explode("\n", $string);
-        $lastLineIndex = count($string) - 1;
-        $lastLineBeforeFinalNewLineIndex = ($string[$lastLineIndex] === '') ? $lastLineIndex - 1 : $lastLineIndex;
+    protected function lineGenerator(): \Generator {
+        $lastLineIndex = $this->linesLength - 1;
+        $lastLineBeforeFinalNewLineIndex = ($this->lines[$lastLineIndex] === '') ? $lastLineIndex - 1 : $lastLineIndex;
 
-        foreach ($string as $lineNumber => $line) {
+        foreach ($this->lines as $lineNumber => $line) {
             $this->_lastLine = ($lineNumber === $lastLineIndex);
             $this->_lastLineBeforeFinalNewLine = ($lineNumber === $lastLineBeforeFinalNewLineIndex);
             yield $lineNumber + 1 => $line;
diff --git a/lib/Grammar.php b/lib/Grammar.php
index 5afaeaf..5a8bdb7 100644
--- a/lib/Grammar.php
+++ b/lib/Grammar.php
@@ -29,6 +29,9 @@ class Grammar {
     protected ?array $_repository;
     protected ?string $_scopeName;
 
+    protected const ESCAPE_SLASHES_REGEX = '/(?<!\\\)\//S';
+    protected const LONG_CHARACTER_CODE_REGEX = '/\\\x\{([0-9A-Fa-f]+)\}/S';
+
 
     public function __construct(?string $scopeName = null, ?array $patterns = null, ?string $name = null, ?array $injections = null, ?array $repository = null) {
         $this->_name = $name;
@@ -198,11 +201,12 @@ class Grammar {
                     $p['beginPattern'] = true;
                 case 'match':
                     // Escape forward slashes that aren't escaped in regexes.
-                    $value = preg_replace('/(?<!\\\)\//', '\/', $value);
+                    $value = preg_replace(self::ESCAPE_SLASHES_REGEX, '\/', $value);
                     // Fix oniguruma long character codes.
-                    $value = preg_replace_callback('/\\\x\{([0-9A-Fa-f]+)\}/', function($matches) {
+                    $value = preg_replace_callback(self::LONG_CHARACTER_CODE_REGEX, function($matches) {
                         return "\x{" . (((int)base_convert($matches[1], 16, 10) > 0x10ffff) ? '10ffff' : $matches[1]) . "}";
                     }, $value);
+
                     $p['match'] = "/$value/Su";
 
                     $modified = true;
diff --git a/lib/Tokenizer.php b/lib/Tokenizer.php
index db0a858..f1d4010 100644
--- a/lib/Tokenizer.php
+++ b/lib/Tokenizer.php
@@ -43,7 +43,7 @@ class Tokenizer {
 
 
     public function tokenize(): \Generator {
-        foreach ($this->data->get() as $lineNumber => $line) {
+        foreach ($this->data->generator as $lineNumber => $line) {
             $this->lineNumber = $lineNumber;
             $this->line = $line;
             assert($this->debugLine());
@@ -76,7 +76,7 @@ class Tokenizer {
 
     protected function resolveScopeName(string $scopeName, array $match): string {
         return preg_replace_callback(self::SCOPE_RESOLVE_REGEX, function($m) use($match) {
-            $replacement = $match[(int)$m[1]][0] ?? $m[1];
+            $replacement = trim($match[(int)$m[1]][0] ?? $m[1]);
             $command = $m[2] ?? null;
             switch ($command) {
                 case 'downcase': return strtolower($replacement);
@@ -88,7 +88,7 @@ class Tokenizer {
         }, $scopeName);
     }
 
-    protected function tokenizeLine(int $lineLength): array {
+    protected function tokenizeLine(int $stopOffset): array {
         $tokens = [];
 
         while (true) {
@@ -117,25 +117,25 @@ class Tokenizer {
 
                     // If the rule is a Pattern
                     if ($rule instanceof Pattern) {
-                        // Throw out pattern regexes with anchors that shouldn't match the current line.
-                        // This is necessary because the tokenizer is fed data line by line and
-                        // therefore anchors that match the beginning of the document and the end won't
-                        // do anything.
-                        if (preg_match(self::ANCHOR_CHECK_REGEX, $rule->match, $validRegexMatch) === 1) {
-                            if (
-                                // \A anchors match the beginning of the whole string, not just this line
-                                ($validRegexMatch[1] === 'A' && !$this->data->firstLine) ||
-                                // \z anchors match the end of the whole string, not just this line
-                                ($validRegexMatch[1] === 'z' && !$this->data->lastLine) ||
-                                // \Z anchors match the end of the whole string or before the final newline if
-                                // there's a trailing newline in the string
-                                ($validRegexMatch[1] === 'Z' && !$this->data->lastLineBeforeFinalNewLine)
-                            ) {
+                        if (preg_match($rule->match, $this->line . ((!$this->data->lastLine) ? "\n" : ''), $match, PREG_OFFSET_CAPTURE, $this->offset) === 1) {
+                            // Throw out pattern regexes with anchors that shouldn't match the current line.
+                            // This is necessary because the tokenizer is fed data line by line and
+                            // therefore anchors that match the beginning of the document and the end won't
+                            // do anything.
+                            if (preg_match(
+                                    self::ANCHOR_CHECK_REGEX, $rule->match, $validRegexMatch) === 1 && (
+                                        // \A anchors match the beginning of the whole string, not just this line
+                                        ($validRegexMatch[1] === 'A' && !$this->data->firstLine) ||
+                                        // \z anchors match the end of the whole string, not just this line
+                                        ($validRegexMatch[1] === 'z' && !$this->data->lastLine) ||
+                                        // \Z anchors match the end of the whole string or before the final newline if
+                                        // there's a trailing newline in the string
+                                        ($validRegexMatch[1] === 'Z' && !$this->data->lastLineBeforeFinalNewLine)
+                                    )
+                                ) {
                                 continue 2;
                             }
-                        }
 
-                        if (preg_match($rule->match, "{$this->line}\n", $match, PREG_OFFSET_CAPTURE, $this->offset)) {
                             // If the match's offset is the same as the current offset then it is the
                             // closest match. There's no need to iterate anymore through the patterns.
                             if ($match[0][1] === $this->offset) {
@@ -351,13 +351,13 @@ class Tokenizer {
                 $this->ruleStack[] = $pattern;
 
                 // If the rule has patterns process tokens from its subpatterns.
-                if ($pattern->patterns !== null && $this->offset < $lineLength) {
+                if ($pattern->patterns !== null && $this->offset < $stopOffset) {
                     // If the pattern has just a regular match (meaning neither a begin nor an end
                     // pattern) but has subpatterns then only tokenize the part of the line that's
-                    // within the match. Otherwise, tokenize up to the line's length. Because of
-                    // recursion, the line length could be set by this step before or within the
+                    // within the match. Otherwise, tokenize up to the stop offset. Because of
+                    // recursion, the stop offset could be set by this step before or within the
                     // capture tokenization process.
-                    $tokens = [ ...$tokens, ...$this->tokenizeLine((!$pattern->beginPattern && !$pattern->endPattern) ? strlen($match[0][0]) : $lineLength) ];
+                    $tokens = [ ...$tokens, ...$this->tokenizeLine((!$pattern->beginPattern && !$pattern->endPattern) ? strlen($match[0][0]) : $stopOffset) ];
                 }
 
                 // If the offset is before the end of the match then create a token from the
@@ -406,7 +406,7 @@ class Tokenizer {
                 }
 
                 // If the offset isn't at the end of the line then look for more matches.
-                if ($this->offset < $lineLength) {
+                if ($this->offset < $stopOffset) {
                     continue;
                 }
             }
@@ -419,7 +419,7 @@ class Tokenizer {
                         $this->ruleStack[] = $injection;
                         $this->activeInjection = $injection;
 
-                        if ($this->offset < $lineLength) {
+                        if ($this->offset < $stopOffset) {
                             continue 2;
                         }
                     }