diff --git a/lib/Grammar/Exception.php b/lib/Grammar/Exception.php index 7fce23a..27b300f 100644 --- a/lib/Grammar/Exception.php +++ b/lib/Grammar/Exception.php @@ -5,7 +5,7 @@ declare(strict_types=1); namespace MensBeam\Lit\Grammar; -use MensBeam\Framework\Exception; +use MensBeam\Framework\Exception as FrameworkException; class Exception extends FrameworkException { const JSON_INVALID_FILE = 300; diff --git a/lib/Tokenizer.php b/lib/Tokenizer.php index ba62e25..6d5ee30 100644 --- a/lib/Tokenizer.php +++ b/lib/Tokenizer.php @@ -45,8 +45,11 @@ class Tokenizer { // The stack of scopes protected array $scopeStack; + protected array $previousMatches = []; + protected const SCOPE_RESOLVE_REGEX = '/\$(\d+)|\${(\d+):\/(downcase|upcase)}/S'; protected const ANCHOR_CHECK_REGEX = '/(?ruleStack)->patterns, $this->ruleCacheIndexes); - if ($cacheIndex !== false) { - $currentRules = $this->ruleCacheValues[$cacheIndex]; - } else { - $currentRules = end($this->ruleStack)->patterns; - - if (!$this->activeInjection && $this->grammar->injections !== null) { - foreach ($this->grammar->injections as $selector => $injection) { - $selector = ScopeParser::parseSelector($selector); - if ($selector->matches($this->scopeStack)) { - $prefix = $selector->getPrefix($this->scopeStack); - if ($prefix === Filter::PREFIX_LEFT || $prefix === Filter::PREFIX_BOTH) { - $currentRules = [ ...$injection->patterns, ...$currentRules ]; - if ($prefix === Filter::PREFIX_LEFT) { - break; - } - } - if ($prefix === null || $prefix === Filter::PREFIX_RIGHT || $prefix === Filter::PREFIX_BOTH) { - $currentRules = [ ...$currentRules, ...$injection->patterns ]; - } - - $injected = true; - break; - } - } - } - } - - $currentRulesCount = count($currentRules); - $closestMatch = null; - - // Iterate through the rules to find matches for the line at the current offset. 
- for ($i = 0; $i < $currentRulesCount; $i++) { - while (true) { - $rule = $currentRules[$i]; - - // Grammar references can return false if the grammar does not exist, so - // continue on if the current rule is false. - if ($rule === false) { - continue 2; - } - - // If the rule is a Pattern - if ($rule instanceof Pattern) { - if (preg_match($rule->match, $this->line . ((!$this->data->lastLine) ? "\n" : ''), $match, PREG_OFFSET_CAPTURE, $this->offset) === 1) { - // Throw out pattern regexes with anchors that shouldn't match the current line. - // This is necessary because the tokenizer is fed data line by line and - // therefore anchors that match the beginning of the document and the end won't - // do anything. - if (preg_match( - self::ANCHOR_CHECK_REGEX, $rule->match, $validRegexMatch) === 1 && ( - // \A anchors match the beginning of the whole string, not just this line - ($validRegexMatch[1] === 'A' && !$this->data->firstLine) || - // \z anchors match the end of the whole string, not just this line - ($validRegexMatch[1] === 'z' && !$this->data->lastLine) || - // \Z anchors match the end of the whole string or before the final newline if - // there's a trailing newline in the string - ($validRegexMatch[1] === 'Z' && !$this->data->lastLineBeforeFinalNewLine) - ) - ) { - continue 2; - } - - // If the match's offset is the same as the current offset then it is the - // closest match. There's no need to iterate anymore through the patterns. - if ($match[0][1] === $this->offset) { - $closestMatch = [ - 'match' => $match, - 'pattern' => $rule - ]; - break 2; - } - // Otherwise, if the closest match is currently null or the match's offset is - // less than the closest match's offset then set the match as the closest match - // and continue looking for a closer one. 
- elseif ($closestMatch === null || $match[0][1] < $closestMatch['match'][0][1]) { - $closestMatch = [ - 'match' => $match, - 'pattern' => $rule - ]; - } - } - } - // Otherwise, if the rule is a Reference then retrieve its patterns, splice into - // the rule list, and reprocess the rule. - elseif ($rule instanceof Reference) { - if (!$rule instanceof BaseReference) { - $obj = $rule->get(); - if ($obj instanceof Grammar || ($rule instanceof RepositoryReference && $obj->match === null)) { - $obj = $obj->patterns; - } - } else { - $obj = $this->grammar->patterns; - } - - array_splice($currentRules, $i, 1, ($obj instanceof Pattern) ? [ $obj ] : $obj); - $currentRulesCount = count($currentRules); - - // When the current rule list changes write it to the cache. - if ($cacheIndex === false) { - $this->ruleCacheIndexes[] = end($this->ruleStack)->patterns; - $cacheIndex = count($this->ruleCacheIndexes) - 1; - } - - if ($injected) { - // Injections need to be re-evaluated against the scope stack every time they're - // injected so don't cache them. - $temp = $currentRules; - foreach ($temp as $k => $r) { - if ($r instanceof Pattern && $r->injection) { - unset($temp[$k]); - } - } - $this->ruleCacheValues[$cacheIndex] = array_values($temp); - } else { - $this->ruleCacheValues[$cacheIndex] = $currentRules; - } - - continue; - } - - break; - } - } - + $closestMatch = $this->findClosestMatch(end($this->ruleStack)); + $this->previousMatches[] = $closestMatch; assert($this->debugClosestMatch($closestMatch)); // If there were a match above... @@ -261,7 +138,10 @@ class Tokenizer { // If the subpattern begins after the offset then create a token from the bits // of the line in-between the last token and the one(s) about to be created. - if ($match[0][1] > $this->offset) { + // However, don't do this if the pattern is an end pattern and its match + // contains a negative lookahead for the offset. This is due to a difference in + // how PCRE works versus the original Oniguruma. 
+ if ($match[0][1] > $this->offset && !($pattern->endPattern && preg_match('/\(\?!\\\G\)/', $pattern->match) === 1)) { $tokens[] = [ 'scopes' => $this->scopeStack, 'text' => substr($this->line, $this->offset, $match[0][1] - $this->offset) @@ -451,9 +331,12 @@ class Tokenizer { } // If the offset is before the end of the match then create a token from the - // bits of the match from the offset until the end of the match. + // bits of the match from the offset until the end of the match. However, don't + // do this if the pattern is an end pattern and its match contains a negative + // lookahead for the offset. This is due to a difference in how PCRE works + // versus the original Oniguruma. $endOffset = $match[0][1] + strlen($match[0][0]); - if ($endOffset > $this->offset) { + if ($endOffset > $this->offset && !($pattern->endPattern && preg_match('/\(\?!\\\G\)/', $pattern->match) === 1)) { $tokens[] = [ 'scopes' => $this->scopeStack, 'text' => substr($this->line, $this->offset, $endOffset - $this->offset)
@@ -502,6 +385,204 @@ class Tokenizer {
         return $tokens;
     }
 
+    /**
+     * Finds the rule match closest to the current offset.
+     *
+     * The candidate rules are $pattern->patterns, or the cached copy of that list
+     * with its references already spliced in. On a cache miss, and when no
+     * injection is active, the first grammar injection whose selector matches the
+     * scope stack is merged into the list as well.
+     *
+     * @param Grammar|Pattern $pattern The grammar or pattern whose rules are searched
+     * @return array|null An array with the keys 'match' (the preg_match() result,
+     *     captured with PREG_OFFSET_CAPTURE) and 'pattern' (the Pattern that
+     *     produced it), or null when none of the rules match
+     */
+    protected function findClosestMatch(Grammar|Pattern $pattern): ?array {
+        $injected = false;
+        // Grab the current rule list from the cache if available to prevent having to
+        // splice in references repeatedly.
+        $cacheIndex = array_search($pattern->patterns, $this->ruleCacheIndexes);
+        if ($cacheIndex !== false) {
+            $currentRules = $this->ruleCacheValues[$cacheIndex];
+        } else {
+            $currentRules = $pattern->patterns;
+
+            if (!$this->activeInjection && $this->grammar->injections !== null) {
+                foreach ($this->grammar->injections as $selector => $injection) {
+                    $selector = ScopeParser::parseSelector($selector);
+                    if (!$selector->matches($this->scopeStack)) {
+                        continue;
+                    }
+
+                    // Flag the list before merging so that every prefix, including the
+                    // left one, keeps the injected patterns out of the cache below.
+                    $injected = true;
+                    $prefix = $selector->getPrefix($this->scopeStack);
+                    if ($prefix === Filter::PREFIX_LEFT || $prefix === Filter::PREFIX_BOTH) {
+                        $currentRules = [ ...$injection->patterns, ...$currentRules ];
+                    }
+                    if ($prefix === null || $prefix === Filter::PREFIX_RIGHT || $prefix === Filter::PREFIX_BOTH) {
+                        $currentRules = [ ...$currentRules, ...$injection->patterns ];
+                    }
+
+                    // Only the first matching injection is applied.
+                    break;
+                }
+            }
+        }
+
+        // The subject is the same for every rule, so build it once.
+        $subject = $this->line . ((!$this->data->lastLine) ? "\n" : '');
+        $closestMatch = null;
+        for ($i = 0, $currentRulesCount = count($currentRules); $i < $currentRulesCount; $i++) {
+            while (true) {
+                // Splicing in a reference that resolves to nothing can leave no rule at
+                // this index.
+                if (!isset($currentRules[$i])) {
+                    break;
+                }
+
+                $rule = $currentRules[$i];
+
+                // Grammar references can return false if the grammar does not exist, so
+                // continue on if the current rule is false.
+                if ($rule === false) {
+                    continue 2;
+                }
+
+                // If the rule is a Pattern
+                if ($rule instanceof Pattern) {
+                    $ruleMatch = $rule->match;
+
+                    // If the rule is an end pattern with a back reference then it expects to be
+                    // able to reference a subpattern from its begin pattern. Replace the reference
+                    // with the matched subpattern and then match with it below.
+                    if ($rule->endPattern && preg_match(self::BACK_REFERENCE_REGEX, $ruleMatch) === 1) {
+                        // Walk the previous matches from newest to oldest using an index of
+                        // its own; reusing $i here would corrupt the rule loop's position.
+                        $beginMatch = null;
+                        for ($j = count($this->previousMatches) - 1; $j >= 0; $j--) {
+                            $previous = $this->previousMatches[$j];
+                            if ($previous !== null && $previous['pattern']->beginPattern) {
+                                foreach ($previous['pattern']->patterns as $p) {
+                                    if ($p === $rule) {
+                                        $beginMatch = $previous['match'];
+                                        break 2;
+                                    }
+                                }
+                            }
+                        }
+
+                        if ($beginMatch !== null) {
+                            // The captured text is inserted literally, so escape it (and the
+                            // rule's own delimiter) to keep the resulting regex valid.
+                            $delimiter = ltrim($ruleMatch)[0];
+                            $ruleMatch = preg_replace_callback(self::BACK_REFERENCE_REGEX, static function(array $m) use ($beginMatch, $delimiter): string {
+                                $index = (int)$m[1];
+                                return (isset($beginMatch[$index][0])) ? preg_quote($beginMatch[$index][0], $delimiter) : $m[0];
+                            }, $ruleMatch);
+                        }
+                    }
+
+                    if (preg_match($ruleMatch, $subject, $match, PREG_OFFSET_CAPTURE, $this->offset) === 1) {
+                        // Throw out pattern regexes with anchors that shouldn't match the current line.
+                        // This is necessary because the tokenizer is fed data line by line and
+                        // therefore anchors that match the beginning of the document and the end won't
+                        // do anything.
+                        if (
+                            preg_match(self::ANCHOR_CHECK_REGEX, $ruleMatch, $validRegexMatch) === 1 && (
+                                // \A anchors match the beginning of the whole string, not just this line
+                                ($validRegexMatch[1] === 'A' && !$this->data->firstLine) ||
+                                // \z anchors match the end of the whole string, not just this line
+                                ($validRegexMatch[1] === 'z' && !$this->data->lastLine) ||
+                                // \Z anchors match the end of the whole string or before the final newline if
+                                // there's a trailing newline in the string
+                                ($validRegexMatch[1] === 'Z' && !$this->data->lastLineBeforeFinalNewLine)
+                            )
+                        ) {
+                            continue 2;
+                        }
+
+                        // If there is an existing match that doesn't match at the offset, the current
+                        // matched rule is a begin pattern that doesn't capture anything, its end
+                        // pattern would be the next match, and its end pattern also doesn't capture
+                        // anything then discard this match. If this wasn't here it would continuously
+                        // match this begin pattern followed by its end pattern, creating an infinite
+                        // loop because the offset never moves forward.
+                        if ($closestMatch !== null && $rule->beginPattern && $match[0][0] === '') {
+                            $next = $this->findClosestMatch($rule);
+                            if ($next !== null && $next['pattern']->endPattern && $next['match'][0][0] === '') {
+                                continue 2;
+                            }
+                        }
+
+                        // If the match's offset is the same as the current offset then it is the
+                        // closest match. There's no need to iterate anymore through the patterns.
+                        if ($match[0][1] === $this->offset) {
+                            $closestMatch = [
+                                'match' => $match,
+                                'pattern' => $rule
+                            ];
+                            break 2;
+                        }
+                        // Otherwise, if the closest match is currently null or the match's offset is
+                        // less than the closest match's offset then set the match as the closest match
+                        // and continue looking for a closer one.
+                        elseif ($closestMatch === null || $match[0][1] < $closestMatch['match'][0][1]) {
+                            $closestMatch = [
+                                'match' => $match,
+                                'pattern' => $rule
+                            ];
+                        }
+                    }
+                }
+                // Otherwise, if the rule is a Reference then retrieve its patterns, splice into
+                // the rule list, and reprocess the rule.
+                elseif ($rule instanceof Reference) {
+                    if (!$rule instanceof BaseReference) {
+                        $obj = $rule->get();
+                        if ($obj instanceof Grammar || ($rule instanceof RepositoryReference && $obj->match === null)) {
+                            $obj = $obj->patterns;
+                        }
+                    } else {
+                        $obj = $this->grammar->patterns;
+                    }
+
+                    array_splice($currentRules, $i, 1, ($obj instanceof Pattern) ? [ $obj ] : $obj);
+                    $currentRulesCount = count($currentRules);
+
+                    // When the current rule list changes write it to the cache, keyed by the
+                    // same list that was looked up above.
+                    if ($cacheIndex === false) {
+                        $this->ruleCacheIndexes[] = $pattern->patterns;
+                        $cacheIndex = count($this->ruleCacheIndexes) - 1;
+                    }
+
+                    if ($injected) {
+                        // Injections need to be re-evaluated against the scope stack every time they're
+                        // injected so don't cache them.
+                        $temp = $currentRules;
+                        foreach ($temp as $k => $r) {
+                            if ($r instanceof Pattern && $r->injection) {
+                                unset($temp[$k]);
+                            }
+                        }
+                        $this->ruleCacheValues[$cacheIndex] = array_values($temp);
+                    } else {
+                        $this->ruleCacheValues[$cacheIndex] = $currentRules;
+                    }
+
+                    continue;
+                }
+
+                break;
+            }
+        }
+
+        return $closestMatch;
+    }
+
     private function debugClosestMatch(?array $closestMatch): bool {
         if (self::$debug) {