Minor optimizations to tokenizer

3 years ago · fb0441809e
3 changed files with 44 additions and 39 deletions
--- a/lib/Data.php
+++ b/lib/Data.php
@ -11,31 +11,32 @@ class Data {
    use FauxReadOnly;
    // True if on the first line
    protected bool $_firstLine = true;
-    protected \Generator $generator;
+    // The stored generator
+    protected \Generator $_generator;
    // True if on the last line.
    protected bool $_lastLine = false;
    // Some matches will check for the last line before the final newline, so this
    // will be true if on the line before the final newline or if on the last line
    // if there isn't an extra newline at the end of the string.
    protected bool $_lastLineBeforeFinalNewLine = false;
+    // The input string split into an array by newline
+    protected array $lines = [];
+    // The length of the data array
+    protected int $linesLength = 0;


    public function __construct(string $data) {
-        $this->generator = $this->lineGenerator($data);
+        $this->lines = explode("\n", $data);
+        $this->linesLength = count($this->lines);
+        $this->_generator = $this->lineGenerator();
    }


-    public function get(): \Generator {
-        return $this->generator;
-    }
-
-
-    protected function lineGenerator(string $string): \Generator {
-        $string = explode("\n", $string);
-        $lastLineIndex = count($string) - 1;
-        $lastLineBeforeFinalNewLineIndex = ($string[$lastLineIndex] === '') ? $lastLineIndex - 1 : $lastLineIndex;
+    protected function lineGenerator(): \Generator {
+        $lastLineIndex = $this->linesLength - 1;
+        $lastLineBeforeFinalNewLineIndex = ($this->lines[$lastLineIndex] === '') ? $lastLineIndex - 1 : $lastLineIndex;

-        foreach ($string as $lineNumber => $line) {
+        foreach ($this->lines as $lineNumber => $line) {
            $this->_lastLine = ($lineNumber === $lastLineIndex);
            $this->_lastLineBeforeFinalNewLine = ($lineNumber === $lastLineBeforeFinalNewLineIndex);
            yield $lineNumber + 1 => $line;
--- a/lib/Grammar.php
+++ b/lib/Grammar.php
@ -29,6 +29,9 @@ class Grammar {
    protected ?array $_repository;
    protected ?string $_scopeName;

+    protected const ESCAPE_SLASHES_REGEX = '/(?<!\\\)\//S';
+    protected const LONG_CHARACTER_CODE_REGEX = '/\\\x\{([0-9A-Fa-f]+)\}/S';
+

    public function __construct(?string $scopeName = null, ?array $patterns = null, ?string $name = null, ?array $injections = null, ?array $repository = null) {
        $this->_name = $name;
@ -198,11 +201,12 @@ class Grammar {
                    $p['beginPattern'] = true;
                case 'match':
                    // Escape forward slashes that aren't escaped in regexes.
-                    $value = preg_replace('/(?<!\\\)\//', '\/', $value);
+                    $value = preg_replace(self::ESCAPE_SLASHES_REGEX, '\/', $value);
                    // Fix oniguruma long character codes.
-                    $value = preg_replace_callback('/\\\x\{([0-9A-Fa-f]+)\}/', function($matches) {
+                    $value = preg_replace_callback(self::LONG_CHARACTER_CODE_REGEX, function($matches) {
                        return "\x{" . (((int)base_convert($matches[1], 16, 10) > 0x10ffff) ? '10ffff' : $matches[1]) . "}";
                    }, $value);
+
                    $p['match'] = "/$value/Su";

                    $modified = true;
--- a/lib/Tokenizer.php
+++ b/lib/Tokenizer.php
@ -43,7 +43,7 @@ class Tokenizer {


    public function tokenize(): \Generator {
-        foreach ($this->data->get() as $lineNumber => $line) {
+        foreach ($this->data->generator as $lineNumber => $line) {
            $this->lineNumber = $lineNumber;
            $this->line = $line;
            assert($this->debugLine());
@ -76,7 +76,7 @@ class Tokenizer {

    protected function resolveScopeName(string $scopeName, array $match): string {
        return preg_replace_callback(self::SCOPE_RESOLVE_REGEX, function($m) use($match) {
-            $replacement = $match[(int)$m[1]][0] ?? $m[1];
+            $replacement = trim($match[(int)$m[1]][0] ?? $m[1]);
            $command = $m[2] ?? null;
            switch ($command) {
                case 'downcase': return strtolower($replacement);
@ -88,7 +88,7 @@ class Tokenizer {
        }, $scopeName);
    }

-    protected function tokenizeLine(int $lineLength): array {
+    protected function tokenizeLine(int $stopOffset): array {
        $tokens = [];

        while (true) {
@ -117,25 +117,25 @@ class Tokenizer {

                    // If the rule is a Pattern
                    if ($rule instanceof Pattern) {
-                        // Throw out pattern regexes with anchors that shouldn't match the current line.
-                        // This is necessary because the tokenizer is fed data line by line and
-                        // therefore anchors that match the beginning of the document and the end won't
-                        // do anything.
-                        if (preg_match(self::ANCHOR_CHECK_REGEX, $rule->match, $validRegexMatch) === 1) {
-                            if (
-                                // \A anchors match the beginning of the whole string, not just this line
-                                ($validRegexMatch[1] === 'A' && !$this->data->firstLine) ||
-                                // \z anchors match the end of the whole string, not just this line
-                                ($validRegexMatch[1] === 'z' && !$this->data->lastLine) ||
-                                // \Z anchors match the end of the whole string or before the final newline if
-                                // there's a trailing newline in the string
-                                ($validRegexMatch[1] === 'Z' && !$this->data->lastLineBeforeFinalNewLine)
-                            ) {
+                        if (preg_match($rule->match, $this->line . ((!$this->data->lastLine) ? "\n" : ''), $match, PREG_OFFSET_CAPTURE, $this->offset) === 1) {
+                            // Throw out pattern regexes with anchors that shouldn't match the current line.
+                            // This is necessary because the tokenizer is fed data line by line and
+                            // therefore anchors that match the beginning of the document and the end won't
+                            // do anything.
+                            if (preg_match(
+                                    self::ANCHOR_CHECK_REGEX, $rule->match, $validRegexMatch) === 1 && (
+                                        // \A anchors match the beginning of the whole string, not just this line
+                                        ($validRegexMatch[1] === 'A' && !$this->data->firstLine) ||
+                                        // \z anchors match the end of the whole string, not just this line
+                                        ($validRegexMatch[1] === 'z' && !$this->data->lastLine) ||
+                                        // \Z anchors match the end of the whole string or before the final newline if
+                                        // there's a trailing newline in the string
+                                        ($validRegexMatch[1] === 'Z' && !$this->data->lastLineBeforeFinalNewLine)
+                                    )
+                                ) {
                                continue 2;
                            }
-                        }

-                        if (preg_match($rule->match, "{$this->line}\n", $match, PREG_OFFSET_CAPTURE, $this->offset)) {
                            // If the match's offset is the same as the current offset then it is the
                            // closest match. There's no need to iterate anymore through the patterns.
                            if ($match[0][1] === $this->offset) {
@ -351,13 +351,13 @@ class Tokenizer {
                $this->ruleStack[] = $pattern;

                // If the rule has patterns process tokens from its subpatterns.
-                if ($pattern->patterns !== null && $this->offset < $lineLength) {
+                if ($pattern->patterns !== null && $this->offset < $stopOffset) {
                    // If the pattern has just a regular match (meaning neither a begin nor an end
                    // pattern) but has subpatterns then only tokenize the part of the line that's
-                    // within the match. Otherwise, tokenize up to the line's length. Because of
-                    // recursion, the line length could be set by this step before or within the
+                    // within the match. Otherwise, tokenize up to the stop offset. Because of
+                    // recursion, the stop offset could be set by this step before or within the
                    // capture tokenization process.
-                    $tokens = [ ...$tokens, ...$this->tokenizeLine((!$pattern->beginPattern && !$pattern->endPattern) ? strlen($match[0][0]) : $lineLength) ];
+                    $tokens = [ ...$tokens, ...$this->tokenizeLine((!$pattern->beginPattern && !$pattern->endPattern) ? strlen($match[0][0]) : $stopOffset) ];
                }

                // If the offset is before the end of the match then create a token from the
@ -406,7 +406,7 @@ class Tokenizer {
                }

                // If the offset isn't at the end of the line then look for more matches.
-                if ($this->offset < $lineLength) {
+                if ($this->offset < $stopOffset) {
                    continue;
                }
            }
@ -419,7 +419,7 @@ class Tokenizer {
                        $this->ruleStack[] = $injection;
                        $this->activeInjection = $injection;

-                        if ($this->offset < $lineLength) {
+                        if ($this->offset < $stopOffset) {
                            continue 2;
                        }
                    }