Fixed back references in end patterns, negative lookahead differences

2022-01-12 16:58:56 -06:00 · 2022-01-12 16:58:56 -06:00 · 3e07ac45af
commit 3e07ac45af
parent 1e61057caf
2 changed files with 185 additions and 132 deletions
--- a/lib/Grammar/Exception.php
+++ b/lib/Grammar/Exception.php
@ -5,7 +5,7 @@

 declare(strict_types=1);
 namespace MensBeam\Lit\Grammar;
-use MensBeam\Framework\Exception;
+use MensBeam\Framework\Exception as FrameworkException;

 class Exception extends FrameworkException {
    const JSON_INVALID_FILE = 300;
--- a/lib/Tokenizer.php
+++ b/lib/Tokenizer.php
@ -45,8 +45,11 @@ class Tokenizer {
    // The stack of scopes
    protected array $scopeStack;

+    protected array $previousMatches = [];
+
    protected const SCOPE_RESOLVE_REGEX = '/\$(\d+)|\${(\d+):\/(downcase|upcase)}/S';
    protected const ANCHOR_CHECK_REGEX = '/(?<!\\\)\\\([AGZz])/S';
+    protected const BACK_REFERENCE_REGEX = '/\\\\(\d+)/S';


    public function __construct(Data $data, Grammar $grammar) {
@ -122,136 +125,10 @@ class Tokenizer {

    protected function tokenizeLine(int $stopOffset): array {
        $tokens = [];
-        $injected = false;

        while (true) {
-            // Grab the current rule list from the cache if available to prevent having to
-            // splice in references repeatedly.
-            $cacheIndex = array_search(end($this->ruleStack)->patterns, $this->ruleCacheIndexes);
-            if ($cacheIndex !== false) {
-                $currentRules = $this->ruleCacheValues[$cacheIndex];
-            } else {
-                $currentRules = end($this->ruleStack)->patterns;
-
-                if (!$this->activeInjection && $this->grammar->injections !== null) {
-                    foreach ($this->grammar->injections as $selector => $injection) {
-                        $selector = ScopeParser::parseSelector($selector);
-                        if ($selector->matches($this->scopeStack)) {
-                            $prefix = $selector->getPrefix($this->scopeStack);
-                            if ($prefix === Filter::PREFIX_LEFT || $prefix === Filter::PREFIX_BOTH) {
-                                $currentRules = [ ...$injection->patterns, ...$currentRules ];
-                                if ($prefix === Filter::PREFIX_LEFT) {
-                                    break;
-                                }
-                            }
-                            if ($prefix === null || $prefix === Filter::PREFIX_RIGHT || $prefix === Filter::PREFIX_BOTH) {
-                                $currentRules = [ ...$currentRules, ...$injection->patterns ];
-                            }
-
-                            $injected = true;
-                            break;
-                        }
-                    }
-                }
-            }
-
-            $currentRulesCount = count($currentRules);
-            $closestMatch = null;
-
-            // Iterate through the rules to find matches for the line at the current offset.
-            for ($i = 0; $i < $currentRulesCount; $i++) {
-                while (true) {
-                    $rule = $currentRules[$i];
-
-                    // Grammar references can return false if the grammar does not exist, so
-                    // continue on if the current rule is false.
-                    if ($rule === false) {
-                        continue 2;
-                    }
-
-                    // If the rule is a Pattern
-                    if ($rule instanceof Pattern) {
-                        if (preg_match($rule->match, $this->line . ((!$this->data->lastLine) ? "\n" : ''), $match, PREG_OFFSET_CAPTURE, $this->offset) === 1) {
-                            // Throw out pattern regexes with anchors that shouldn't match the current line.
-                            // This is necessary because the tokenizer is fed data line by line and
-                            // therefore anchors that match the beginning of the document and the end won't
-                            // do anything.
-                            if (preg_match(
-                                    self::ANCHOR_CHECK_REGEX, $rule->match, $validRegexMatch) === 1 && (
-                                        // \A anchors match the beginning of the whole string, not just this line
-                                        ($validRegexMatch[1] === 'A' && !$this->data->firstLine) ||
-                                        // \z anchors match the end of the whole string, not just this line
-                                        ($validRegexMatch[1] === 'z' && !$this->data->lastLine) ||
-                                        // \Z anchors match the end of the whole string or before the final newline if
-                                        // there's a trailing newline in the string
-                                        ($validRegexMatch[1] === 'Z' && !$this->data->lastLineBeforeFinalNewLine)
-                                    )
-                                ) {
-                                continue 2;
-                            }
-
-                            // If the match's offset is the same as the current offset then it is the
-                            // closest match. There's no need to iterate anymore through the patterns.
-                            if ($match[0][1] === $this->offset) {
-                                $closestMatch = [
-                                    'match' => $match,
-                                    'pattern' => $rule
-                                ];
-                                break 2;
-                            }
-                            // Otherwise, if the closest match is currently null or the match's offset is
-                            // less than the closest match's offset then set the match as the closest match
-                            // and continue looking for a closer one.
-                            elseif ($closestMatch === null || $match[0][1] < $closestMatch['match'][0][1]) {
-                                $closestMatch = [
-                                    'match' => $match,
-                                    'pattern' => $rule
-                                ];
-                            }
-                        }
-                    }
-                    // Otherwise, if the rule is a Reference then retrieve its patterns, splice into
-                    // the rule list, and reprocess the rule.
-                    elseif ($rule instanceof Reference) {
-                        if (!$rule instanceof BaseReference) {
-                            $obj = $rule->get();
-                            if ($obj instanceof Grammar || ($rule instanceof RepositoryReference && $obj->match === null)) {
-                                $obj = $obj->patterns;
-                            }
-                        } else {
-                            $obj = $this->grammar->patterns;
-                        }
-
-                        array_splice($currentRules, $i, 1, ($obj instanceof Pattern) ? [ $obj ] : $obj);
-                        $currentRulesCount = count($currentRules);
-
-                        // When the current rule list changes write it to the cache.
-                        if ($cacheIndex === false) {
-                            $this->ruleCacheIndexes[] = end($this->ruleStack)->patterns;
-                            $cacheIndex = count($this->ruleCacheIndexes) - 1;
-                        }
-
-                        if ($injected) {
-                            // Injections need to be re-evaluated against the scope stack every time they're
-                            // injected so don't cache them.
-                            $temp = $currentRules;
-                            foreach ($temp as $k => $r) {
-                                if ($r instanceof Pattern && $r->injection) {
-                                    unset($temp[$k]);
-                                }
-                            }
-                            $this->ruleCacheValues[$cacheIndex] = array_values($temp);
-                        } else {
-                            $this->ruleCacheValues[$cacheIndex] = $currentRules;
-                        }
-
-                        continue;
-                    }
-
-                    break;
-                }
-            }
-
+            $closestMatch = $this->findClosestMatch(end($this->ruleStack));
+            $this->previousMatches[] = $closestMatch;
            assert($this->debugClosestMatch($closestMatch));

            // If there were a match above...
@ -261,7 +138,10 @@ class Tokenizer {

                // If the subpattern begins after the offset then create a token from the bits
                // of the line in-between the last token and the one(s) about to be created.
-                if ($match[0][1] > $this->offset) {
+                // However, don't do this if the pattern is an end pattern and its match
+                // contains a negative lookahead for the offset. This is due to a difference in
+                // how PCRE works versus the original Oniguruma.
+                if ($match[0][1] > $this->offset && !($pattern->endPattern && preg_match('/\(\?!\\\G\)/', $pattern->match) === 1)) {
                    $tokens[] = [
                        'scopes' => $this->scopeStack,
                        'text' => substr($this->line, $this->offset, $match[0][1] - $this->offset)
@ -451,9 +331,12 @@ class Tokenizer {
                }

                // If the offset is before the end of the match then create a token from the
-                // bits of the match from the offset until the end of the match.
+                // bits of the match from the offset until the end of the match. However, don't
+                // do this if the pattern is an end pattern and its match contains a negative
+                // lookahead for the offset. This is due to a difference in how PCRE works
+                // versus the original Oniguruma.
                $endOffset = $match[0][1] + strlen($match[0][0]);
-                if ($endOffset > $this->offset) {
+                if ($endOffset > $this->offset && !($pattern->endPattern && preg_match('/\(\?!\\\G\)/', $pattern->match) === 1)) {
                    $tokens[] = [
                        'scopes' => $this->scopeStack,
                        'text' => substr($this->line, $this->offset, $endOffset - $this->offset)
@ -502,6 +385,176 @@ class Tokenizer {
        return $tokens;
    }

+    // Finds the rule among $pattern's child patterns whose regex matches closest to
+    // the current offset on the current line.
+    //
+    // The rule list comes from the rule cache when an expanded copy for
+    // $pattern->patterns is stored there; otherwise it is $pattern->patterns,
+    // combined with the patterns of the first injection whose selector matches the
+    // current scope stack. References met while iterating are spliced into the list
+    // in place and the expanded list is written back to the cache.
+    //
+    // Returns an array with the keys 'match' (the preg_match() result captured with
+    // PREG_OFFSET_CAPTURE) and 'pattern' (the matching Pattern), or null when no
+    // rule matches.
+    protected function findClosestMatch(Grammar|Pattern $pattern): ?array {
+        $injected = false;
+        // Grab the current rule list from the cache if available to prevent having to
+        // splice in references repeatedly.
+        $cacheIndex = array_search($pattern->patterns, $this->ruleCacheIndexes);
+        if ($cacheIndex !== false) {
+            $currentRules = $this->ruleCacheValues[$cacheIndex];
+        } else {
+            $currentRules = $pattern->patterns;
+
+            if (!$this->activeInjection && $this->grammar->injections !== null) {
+                foreach ($this->grammar->injections as $selector => $injection) {
+                    $selector = ScopeParser::parseSelector($selector);
+                    if ($selector->matches($this->scopeStack)) {
+                        $prefix = $selector->getPrefix($this->scopeStack);
+                        if ($prefix === Filter::PREFIX_LEFT || $prefix === Filter::PREFIX_BOTH) {
+                            $currentRules = [ ...$injection->patterns, ...$currentRules ];
+                            if ($prefix === Filter::PREFIX_LEFT) {
+                                break;
+                            }
+                        }
+                        if ($prefix === null || $prefix === Filter::PREFIX_RIGHT || $prefix === Filter::PREFIX_BOTH) {
+                            $currentRules = [ ...$currentRules, ...$injection->patterns ];
+                        }
+
+                        $injected = true;
+                        break;
+                    }
+                }
+            }
+        }
+
+        $closestMatch = null;
+        // Iterate through the rules to find matches for the line at the current offset.
+        for ($i = 0, $currentRulesCount = count($currentRules); $i < $currentRulesCount; $i++) {
+            while (true) {
+                $rule = $currentRules[$i];
+
+                // Grammar references can return false if the grammar does not exist, so
+                // continue on if the current rule is false.
+                if ($rule === false) {
+                    continue 2;
+                }
+
+                // If the rule is a Pattern
+                if ($rule instanceof Pattern) {
+                    $ruleMatch = $rule->match;
+
+                    // If the rule is an end pattern with a back reference then it expects to be
+                    // able to reference a subpattern from its begin pattern. Replace the reference
+                    // with the matched subpattern and then match with it below.
+                    if ($rule->endPattern && preg_match(self::BACK_REFERENCE_REGEX, $ruleMatch) === 1) {
+                        $beginMatch = null;
+                        // Walk the previous matches from newest to oldest. This uses its own index
+                        // variable; reusing $i here would clobber the index of the enclosing rule
+                        // loop and cause rules to be skipped or processed again.
+                        for ($j = count($this->previousMatches) - 1; $j >= 0; $j--) {
+                            $cur = $this->previousMatches[$j];
+                            if ($cur !== null && $cur['pattern']->beginPattern) {
+                                foreach ($cur['pattern']->patterns as $p) {
+                                    if ($p === $rule) {
+                                        $beginMatch = $cur['match'];
+                                        break 2;
+                                    }
+                                }
+                            }
+                        }
+
+                        if ($beginMatch !== null) {
+                            // The captured text must match literally, so quote it before splicing it
+                            // into the regex. The first character of the regex is passed as the
+                            // delimiter to escape -- assumes the regex has no leading whitespace
+                            // before its delimiter; TODO confirm against how Pattern::match is built.
+                            $delimiter = $ruleMatch[0];
+                            $ruleMatch = preg_replace_callback(self::BACK_REFERENCE_REGEX, function($reference) use ($beginMatch, $delimiter) {
+                                $index = (int)$reference[1];
+                                // Leave the back reference untouched when the begin match has no such
+                                // subpattern.
+                                return (isset($beginMatch[$index][0])) ? preg_quote($beginMatch[$index][0], $delimiter) : $reference[0];
+                            }, $ruleMatch);
+                        }
+                    }
+
+                    if (preg_match($ruleMatch, $this->line . ((!$this->data->lastLine) ? "\n" : ''), $match, PREG_OFFSET_CAPTURE, $this->offset) === 1) {
+                        // Throw out pattern regexes with anchors that shouldn't match the current line.
+                        // This is necessary because the tokenizer is fed data line by line and
+                        // therefore anchors that match the beginning of the document and the end won't
+                        // do anything.
+                        if (preg_match(
+                                self::ANCHOR_CHECK_REGEX, $ruleMatch, $validRegexMatch) === 1 && (
+                                    // \A anchors match the beginning of the whole string, not just this line
+                                    ($validRegexMatch[1] === 'A' && !$this->data->firstLine) ||
+                                    // \z anchors match the end of the whole string, not just this line
+                                    ($validRegexMatch[1] === 'z' && !$this->data->lastLine) ||
+                                    // \Z anchors match the end of the whole string or before the final newline if
+                                    // there's a trailing newline in the string
+                                    ($validRegexMatch[1] === 'Z' && !$this->data->lastLineBeforeFinalNewLine)
+                                )
+                            ) {
+                            continue 2;
+                        }
+
+                        // If there is an existing match that doesn't match at the offset, the current
+                        // matched rule is a begin pattern that doesn't capture anything, its end
+                        // pattern would be the next match, and its end pattern also doesn't capture
+                        // anything then discard this match. If this wasn't here it would continuously
+                        // match this begin pattern followed by its end pattern, creating an infinite
+                        // loop because the offset never moves forward.
+                        if ($closestMatch !== null && $rule->beginPattern && $match[0][0] === '') {
+                            $m = $this->findClosestMatch($rule);
+                            if ($m !== null && $m['pattern']->endPattern && $m['match'][0][0] === '') {
+                                continue 2;
+                            }
+                        }
+
+                        // If the match's offset is the same as the current offset then it is the
+                        // closest match. There's no need to iterate anymore through the patterns.
+                        if ($match[0][1] === $this->offset) {
+                            $closestMatch = [
+                                'match' => $match,
+                                'pattern' => $rule
+                            ];
+                            break 2;
+                        }
+                        // Otherwise, if the closest match is currently null or the match's offset is
+                        // less than the closest match's offset then set the match as the closest match
+                        // and continue looking for a closer one.
+                        elseif ($closestMatch === null || $match[0][1] < $closestMatch['match'][0][1]) {
+                            $closestMatch = [
+                                'match' => $match,
+                                'pattern' => $rule
+                            ];
+                        }
+                    }
+                }
+                // Otherwise, if the rule is a Reference then retrieve its patterns, splice into
+                // the rule list, and reprocess the rule.
+                elseif ($rule instanceof Reference) {
+                    if (!$rule instanceof BaseReference) {
+                        $obj = $rule->get();
+                        if ($obj instanceof Grammar || ($rule instanceof RepositoryReference && $obj->match === null)) {
+                            $obj = $obj->patterns;
+                        }
+                    } else {
+                        $obj = $this->grammar->patterns;
+                    }
+
+                    array_splice($currentRules, $i, 1, ($obj instanceof Pattern) ? [ $obj ] : $obj);
+                    $currentRulesCount = count($currentRules);
+
+                    // When the current rule list changes write it to the cache. The key must be
+                    // the same value the lookup above searches for ($pattern->patterns); this
+                    // method is also called with a pattern that is not on top of the rule stack.
+                    if ($cacheIndex === false) {
+                        $this->ruleCacheIndexes[] = $pattern->patterns;
+                        $cacheIndex = count($this->ruleCacheIndexes) - 1;
+                    }
+
+                    if ($injected) {
+                        // Injections need to be re-evaluated against the scope stack every time they're
+                        // injected so don't cache them.
+                        $temp = $currentRules;
+                        foreach ($temp as $k => $r) {
+                            if ($r instanceof Pattern && $r->injection) {
+                                unset($temp[$k]);
+                            }
+                        }
+                        $this->ruleCacheValues[$cacheIndex] = array_values($temp);
+                    } else {
+                        $this->ruleCacheValues[$cacheIndex] = $currentRules;
+                    }
+
+                    continue;
+                }
+
+                break;
+            }
+        }
+
+        return $closestMatch;
+    }
+

    private function debugClosestMatch(?array $closestMatch): bool {
        if (self::$debug) {