|
@ -45,8 +45,11 @@ class Tokenizer { |
|
|
// The stack of scopes |
|
|
// The stack of scopes |
|
|
protected array $scopeStack; |
|
|
protected array $scopeStack; |
|
|
|
|
|
|
|
|
|
|
|
protected array $previousMatches = []; |
|
|
|
|
|
|
|
|
protected const SCOPE_RESOLVE_REGEX = '/\$(\d+)|\${(\d+):\/(downcase|upcase)}/S'; |
|
|
protected const SCOPE_RESOLVE_REGEX = '/\$(\d+)|\${(\d+):\/(downcase|upcase)}/S'; |
|
|
protected const ANCHOR_CHECK_REGEX = '/(?<!\\\)\\\([AGZz])/S'; |
|
|
protected const ANCHOR_CHECK_REGEX = '/(?<!\\\)\\\([AGZz])/S'; |
|
|
|
|
|
protected const BACK_REFERENCE_REGEX = '/\\\\(\d+)/S'; |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
public function __construct(Data $data, Grammar $grammar) { |
|
|
public function __construct(Data $data, Grammar $grammar) { |
|
@ -122,136 +125,10 @@ class Tokenizer { |
|
|
|
|
|
|
|
|
protected function tokenizeLine(int $stopOffset): array { |
|
|
protected function tokenizeLine(int $stopOffset): array { |
|
|
$tokens = []; |
|
|
$tokens = []; |
|
|
$injected = false; |
|
|
|
|
|
|
|
|
|
|
|
while (true) { |
|
|
while (true) { |
|
|
// Grab the current rule list from the cache if available to prevent having to |
|
|
$closestMatch = $this->findClosestMatch(end($this->ruleStack)); |
|
|
// splice in references repeatedly. |
|
|
$this->previousMatches[] = $closestMatch; |
|
|
$cacheIndex = array_search(end($this->ruleStack)->patterns, $this->ruleCacheIndexes); |
|
|
|
|
|
if ($cacheIndex !== false) { |
|
|
|
|
|
$currentRules = $this->ruleCacheValues[$cacheIndex]; |
|
|
|
|
|
} else { |
|
|
|
|
|
$currentRules = end($this->ruleStack)->patterns; |
|
|
|
|
|
|
|
|
|
|
|
if (!$this->activeInjection && $this->grammar->injections !== null) { |
|
|
|
|
|
foreach ($this->grammar->injections as $selector => $injection) { |
|
|
|
|
|
$selector = ScopeParser::parseSelector($selector); |
|
|
|
|
|
if ($selector->matches($this->scopeStack)) { |
|
|
|
|
|
$prefix = $selector->getPrefix($this->scopeStack); |
|
|
|
|
|
if ($prefix === Filter::PREFIX_LEFT || $prefix === Filter::PREFIX_BOTH) { |
|
|
|
|
|
$currentRules = [ ...$injection->patterns, ...$currentRules ]; |
|
|
|
|
|
if ($prefix === Filter::PREFIX_LEFT) { |
|
|
|
|
|
break; |
|
|
|
|
|
} |
|
|
|
|
|
} |
|
|
|
|
|
if ($prefix === null || $prefix === Filter::PREFIX_RIGHT || $prefix === Filter::PREFIX_BOTH) { |
|
|
|
|
|
$currentRules = [ ...$currentRules, ...$injection->patterns ]; |
|
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
|
|
$injected = true; |
|
|
|
|
|
break; |
|
|
|
|
|
} |
|
|
|
|
|
} |
|
|
|
|
|
} |
|
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
|
|
$currentRulesCount = count($currentRules); |
|
|
|
|
|
$closestMatch = null; |
|
|
|
|
|
|
|
|
|
|
|
// Iterate through the rules to find matches for the line at the current offset. |
|
|
|
|
|
for ($i = 0; $i < $currentRulesCount; $i++) { |
|
|
|
|
|
while (true) { |
|
|
|
|
|
$rule = $currentRules[$i]; |
|
|
|
|
|
|
|
|
|
|
|
// Grammar references can return false if the grammar does not exist, so |
|
|
|
|
|
// continue on if the current rule is false. |
|
|
|
|
|
if ($rule === false) { |
|
|
|
|
|
continue 2; |
|
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
|
|
// If the rule is a Pattern |
|
|
|
|
|
if ($rule instanceof Pattern) { |
|
|
|
|
|
if (preg_match($rule->match, $this->line . ((!$this->data->lastLine) ? "\n" : ''), $match, PREG_OFFSET_CAPTURE, $this->offset) === 1) { |
|
|
|
|
|
// Throw out pattern regexes with anchors that shouldn't match the current line. |
|
|
|
|
|
// This is necessary because the tokenizer is fed data line by line and |
|
|
|
|
|
// therefore anchors that match the beginning of the document and the end won't |
|
|
|
|
|
// do anything. |
|
|
|
|
|
if (preg_match( |
|
|
|
|
|
self::ANCHOR_CHECK_REGEX, $rule->match, $validRegexMatch) === 1 && ( |
|
|
|
|
|
// \A anchors match the beginning of the whole string, not just this line |
|
|
|
|
|
($validRegexMatch[1] === 'A' && !$this->data->firstLine) || |
|
|
|
|
|
// \z anchors match the end of the whole string, not just this line |
|
|
|
|
|
($validRegexMatch[1] === 'z' && !$this->data->lastLine) || |
|
|
|
|
|
// \Z anchors match the end of the whole string or before the final newline if |
|
|
|
|
|
// there's a trailing newline in the string |
|
|
|
|
|
($validRegexMatch[1] === 'Z' && !$this->data->lastLineBeforeFinalNewLine) |
|
|
|
|
|
) |
|
|
|
|
|
) { |
|
|
|
|
|
continue 2; |
|
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
|
|
// If the match's offset is the same as the current offset then it is the |
|
|
|
|
|
// closest match. There's no need to iterate anymore through the patterns. |
|
|
|
|
|
if ($match[0][1] === $this->offset) { |
|
|
|
|
|
$closestMatch = [ |
|
|
|
|
|
'match' => $match, |
|
|
|
|
|
'pattern' => $rule |
|
|
|
|
|
]; |
|
|
|
|
|
break 2; |
|
|
|
|
|
} |
|
|
|
|
|
// Otherwise, if the closest match is currently null or the match's offset is |
|
|
|
|
|
// less than the closest match's offset then set the match as the closest match |
|
|
|
|
|
// and continue looking for a closer one. |
|
|
|
|
|
elseif ($closestMatch === null || $match[0][1] < $closestMatch['match'][0][1]) { |
|
|
|
|
|
$closestMatch = [ |
|
|
|
|
|
'match' => $match, |
|
|
|
|
|
'pattern' => $rule |
|
|
|
|
|
]; |
|
|
|
|
|
} |
|
|
|
|
|
} |
|
|
|
|
|
} |
|
|
|
|
|
// Otherwise, if the rule is a Reference then retrieve its patterns, splice into |
|
|
|
|
|
// the rule list, and reprocess the rule. |
|
|
|
|
|
elseif ($rule instanceof Reference) { |
|
|
|
|
|
if (!$rule instanceof BaseReference) { |
|
|
|
|
|
$obj = $rule->get(); |
|
|
|
|
|
if ($obj instanceof Grammar || ($rule instanceof RepositoryReference && $obj->match === null)) { |
|
|
|
|
|
$obj = $obj->patterns; |
|
|
|
|
|
} |
|
|
|
|
|
} else { |
|
|
|
|
|
$obj = $this->grammar->patterns; |
|
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
|
|
array_splice($currentRules, $i, 1, ($obj instanceof Pattern) ? [ $obj ] : $obj); |
|
|
|
|
|
$currentRulesCount = count($currentRules); |
|
|
|
|
|
|
|
|
|
|
|
// When the current rule list changes write it to the cache. |
|
|
|
|
|
if ($cacheIndex === false) { |
|
|
|
|
|
$this->ruleCacheIndexes[] = end($this->ruleStack)->patterns; |
|
|
|
|
|
$cacheIndex = count($this->ruleCacheIndexes) - 1; |
|
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
|
|
if ($injected) { |
|
|
|
|
|
// Injections need to be re-evaluated against the scope stack every time they're |
|
|
|
|
|
// injected so don't cache them. |
|
|
|
|
|
$temp = $currentRules; |
|
|
|
|
|
foreach ($temp as $k => $r) { |
|
|
|
|
|
if ($r instanceof Pattern && $r->injection) { |
|
|
|
|
|
unset($temp[$k]); |
|
|
|
|
|
} |
|
|
|
|
|
} |
|
|
|
|
|
$this->ruleCacheValues[$cacheIndex] = array_values($temp); |
|
|
|
|
|
} else { |
|
|
|
|
|
$this->ruleCacheValues[$cacheIndex] = $currentRules; |
|
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
|
|
continue; |
|
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
|
|
break; |
|
|
|
|
|
} |
|
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
|
|
assert($this->debugClosestMatch($closestMatch)); |
|
|
assert($this->debugClosestMatch($closestMatch)); |
|
|
|
|
|
|
|
|
// If there were a match above... |
|
|
// If there were a match above... |
|
@ -261,7 +138,10 @@ class Tokenizer { |
|
|
|
|
|
|
|
|
// If the subpattern begins after the offset then create a token from the bits |
|
|
// If the subpattern begins after the offset then create a token from the bits |
|
|
// of the line in-between the last token and the one(s) about to be created. |
|
|
// of the line in-between the last token and the one(s) about to be created. |
|
|
if ($match[0][1] > $this->offset) { |
|
|
// However, don't do this if the pattern is an end pattern and its match |
|
|
|
|
|
// contains a negative lookahead for the offset. This is due to a difference in |
|
|
|
|
|
// how PCRE works versus the original Oniguruma. |
|
|
|
|
|
if ($match[0][1] > $this->offset && !($pattern->endPattern && preg_match('/\(\?!\\\G\)/', $pattern->match) === 1)) { |
|
|
$tokens[] = [ |
|
|
$tokens[] = [ |
|
|
'scopes' => $this->scopeStack, |
|
|
'scopes' => $this->scopeStack, |
|
|
'text' => substr($this->line, $this->offset, $match[0][1] - $this->offset) |
|
|
'text' => substr($this->line, $this->offset, $match[0][1] - $this->offset) |
|
@ -451,9 +331,12 @@ class Tokenizer { |
|
|
} |
|
|
} |
|
|
|
|
|
|
|
|
// If the offset is before the end of the match then create a token from the |
|
|
// If the offset is before the end of the match then create a token from the |
|
|
// bits of the match from the offset until the end of the match. |
|
|
// bits of the match from the offset until the end of the match. However, don't |
|
|
|
|
|
// do this if the pattern is an end pattern and its match contains a negative |
|
|
|
|
|
// lookahead for the offset. This is due to a difference in how PCRE works |
|
|
|
|
|
// versus the original Oniguruma. |
|
|
$endOffset = $match[0][1] + strlen($match[0][0]); |
|
|
$endOffset = $match[0][1] + strlen($match[0][0]); |
|
|
if ($endOffset > $this->offset) { |
|
|
if ($endOffset > $this->offset && !($pattern->endPattern && preg_match('/\(\?!\\\G\)/', $pattern->match) === 1)) { |
|
|
$tokens[] = [ |
|
|
$tokens[] = [ |
|
|
'scopes' => $this->scopeStack, |
|
|
'scopes' => $this->scopeStack, |
|
|
'text' => substr($this->line, $this->offset, $endOffset - $this->offset) |
|
|
'text' => substr($this->line, $this->offset, $endOffset - $this->offset) |
|
@ -502,6 +385,176 @@ class Tokenizer { |
|
|
return $tokens; |
|
|
return $tokens; |
|
|
} |
|
|
} |
|
|
|
|
|
|
|
|
|
|
|
/**
 * Finds the rule in a grammar's or pattern's rule list whose regular
 * expression matches the current line closest to the current offset.
 *
 * The rule list is read from the rule cache when one was stored for
 * `$pattern->patterns`; otherwise it starts from `$pattern->patterns` with any
 * grammar injections whose selector matches the current scope stack added to
 * it. References met while iterating are replaced in place by the rules they
 * resolve to, and the expanded list is written back to the cache.
 *
 * This method calls itself for zero-width begin patterns (see the guard in the
 * loop below), so it must not depend on `$pattern` being the rule on top of
 * the rule stack.
 *
 * @param Grammar|Pattern $pattern The grammar or pattern whose `patterns` list is searched
 * @return array{match: array, pattern: Pattern}|null The `preg_match()` result (captured
 *         with `PREG_OFFSET_CAPTURE`) and the rule which produced it, or null when no
 *         rule matched
 */
protected function findClosestMatch(Grammar|Pattern $pattern): ?array {
    $injected = false;

    // Grab the current rule list from the cache if available to prevent having to
    // splice in references repeatedly.
    $cacheIndex = array_search($pattern->patterns, $this->ruleCacheIndexes);
    if ($cacheIndex !== false) {
        $currentRules = $this->ruleCacheValues[$cacheIndex];
    } else {
        $currentRules = $pattern->patterns;

        if (!$this->activeInjection && $this->grammar->injections !== null) {
            foreach ($this->grammar->injections as $selector => $injection) {
                $selector = ScopeParser::parseSelector($selector);
                if (!$selector->matches($this->scopeStack)) {
                    continue;
                }

                // Left-prefixed injections go before the existing rules, unprefixed and
                // right-prefixed ones after them, and "both" on either side.
                $prefix = $selector->getPrefix($this->scopeStack);
                if ($prefix === Filter::PREFIX_LEFT || $prefix === Filter::PREFIX_BOTH) {
                    $currentRules = [ ...$injection->patterns, ...$currentRules ];
                }
                if ($prefix === null || $prefix === Filter::PREFIX_RIGHT || $prefix === Filter::PREFIX_BOTH) {
                    $currentRules = [ ...$currentRules, ...$injection->patterns ];
                }

                // Flag every kind of injection, left-prefixed ones included, so that none
                // of the injected patterns end up in the cache below. Only the first
                // matching injection is used.
                $injected = true;
                break;
            }
        }
    }

    // The string being matched is the same for every rule, so build it once.
    $subject = $this->line . ((!$this->data->lastLine) ? "\n" : '');

    $closestMatch = null;
    for ($i = 0, $currentRulesCount = count($currentRules); $i < $currentRulesCount; $i++) {
        while (true) {
            // A reference can expand to nothing, which may leave the index past the end
            // of the list when it was the last rule.
            if ($i >= $currentRulesCount) {
                break 2;
            }

            $rule = $currentRules[$i];

            // Grammar references can return false if the grammar does not exist, so
            // continue on if the current rule is false.
            if ($rule === false) {
                continue 2;
            }

            // If the rule is a Pattern
            if ($rule instanceof Pattern) {
                $ruleMatch = $rule->match;
                $offset = $this->offset;

                // If the rule is an end pattern with a back reference then it expects to be
                // able to reference a subpattern from its begin pattern. Replace the reference
                // with the matched subpattern and then match with it below.
                if ($rule->endPattern && preg_match(self::BACK_REFERENCE_REGEX, $ruleMatch) === 1) {
                    // Walk the previous matches from newest to oldest looking for the begin
                    // pattern which owns this end pattern. This uses its own index variable;
                    // $i belongs to the rule list loop and must not be touched here.
                    $beginMatch = null;
                    for ($j = count($this->previousMatches) - 1; $j >= 0; $j--) {
                        $previous = $this->previousMatches[$j];
                        if ($previous !== null && $previous['pattern']->beginPattern) {
                            foreach ($previous['pattern']->patterns as $candidate) {
                                if ($candidate === $rule) {
                                    $beginMatch = $previous['match'];
                                    break 2;
                                }
                            }
                        }
                    }

                    if ($beginMatch !== null) {
                        // The captured text must match literally, so quote it before it
                        // becomes part of the regex. The first character of the pattern is
                        // taken to be its delimiter.
                        $delimiter = $ruleMatch[0];
                        $ruleMatch = preg_replace_callback(self::BACK_REFERENCE_REGEX, static function(array $backReference) use ($beginMatch, $delimiter): string {
                            $captured = $beginMatch[(int)$backReference[1]][0] ?? null;
                            // Leave references to groups the begin pattern doesn't have as-is.
                            return ($captured !== null) ? preg_quote($captured, $delimiter) : $backReference[0];
                        }, $ruleMatch);
                    }
                }

                if (preg_match($ruleMatch, $subject, $match, PREG_OFFSET_CAPTURE, $offset) === 1) {
                    // Throw out pattern regexes with anchors that shouldn't match the current line.
                    // This is necessary because the tokenizer is fed data line by line and
                    // therefore anchors that match the beginning of the document and the end won't
                    // do anything.
                    if (
                        preg_match(self::ANCHOR_CHECK_REGEX, $ruleMatch, $validRegexMatch) === 1 && (
                            // \A anchors match the beginning of the whole string, not just this line
                            ($validRegexMatch[1] === 'A' && !$this->data->firstLine) ||
                            // \z anchors match the end of the whole string, not just this line
                            ($validRegexMatch[1] === 'z' && !$this->data->lastLine) ||
                            // \Z anchors match the end of the whole string or before the final newline if
                            // there's a trailing newline in the string
                            ($validRegexMatch[1] === 'Z' && !$this->data->lastLineBeforeFinalNewLine)
                        )
                    ) {
                        continue 2;
                    }

                    // If there is an existing match that doesn't match at the offset, the current
                    // matched rule is a begin pattern that doesn't capture anything, its end
                    // pattern would be the next match, and its end pattern also doesn't capture
                    // anything then discard this match. If this wasn't here it would continuously
                    // match this begin pattern followed by its end pattern, creating an infinite
                    // loop because the offset never moves forward.
                    if ($closestMatch !== null && $rule->beginPattern && $match[0][0] === '') {
                        $nextMatch = $this->findClosestMatch($rule);
                        if ($nextMatch !== null && $nextMatch['pattern']->endPattern && $nextMatch['match'][0][0] === '') {
                            continue 2;
                        }
                    }

                    // If the match's offset is the same as the current offset then it is the
                    // closest match. There's no need to iterate anymore through the patterns.
                    if ($match[0][1] === $this->offset) {
                        $closestMatch = [
                            'match' => $match,
                            'pattern' => $rule
                        ];
                        break 2;
                    }
                    // Otherwise, if the closest match is currently null or the match's offset is
                    // less than the closest match's offset then set the match as the closest match
                    // and continue looking for a closer one.
                    elseif ($closestMatch === null || $match[0][1] < $closestMatch['match'][0][1]) {
                        $closestMatch = [
                            'match' => $match,
                            'pattern' => $rule
                        ];
                    }
                }
            }
            // Otherwise, if the rule is a Reference then retrieve its patterns, splice into
            // the rule list, and reprocess the rule.
            elseif ($rule instanceof Reference) {
                if (!$rule instanceof BaseReference) {
                    $obj = $rule->get();
                    if ($obj instanceof Grammar || ($rule instanceof RepositoryReference && $obj->match === null)) {
                        $obj = $obj->patterns;
                    }
                } else {
                    $obj = $this->grammar->patterns;
                }

                array_splice($currentRules, $i, 1, ($obj instanceof Pattern) ? [ $obj ] : $obj);
                $currentRulesCount = count($currentRules);

                // When the current rule list changes write it to the cache. The entry is
                // registered under the same key it is looked up with at the top of this
                // method; during a recursive call that is not the rule on top of the rule
                // stack.
                if ($cacheIndex === false) {
                    $this->ruleCacheIndexes[] = $pattern->patterns;
                    $cacheIndex = count($this->ruleCacheIndexes) - 1;
                }

                if ($injected) {
                    // Injections need to be re-evaluated against the scope stack every time they're
                    // injected so don't cache them.
                    $temp = $currentRules;
                    foreach ($temp as $k => $r) {
                        if ($r instanceof Pattern && $r->injection) {
                            unset($temp[$k]);
                        }
                    }

                    $this->ruleCacheValues[$cacheIndex] = array_values($temp);
                } else {
                    $this->ruleCacheValues[$cacheIndex] = $currentRules;
                }

                // Reprocess whatever rule now sits at the current index.
                continue;
            }

            break;
        }
    }

    return $closestMatch;
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
private function debugClosestMatch(?array $closestMatch): bool { |
|
|
private function debugClosestMatch(?array $closestMatch): bool { |
|
|
if (self::$debug) { |
|
|
if (self::$debug) { |
|
|