Fixed back references in end patterns, negative lookahead differences
This commit is contained in:
parent
1e61057caf
commit
3e07ac45af
2 changed files with 185 additions and 132 deletions
|
@ -5,7 +5,7 @@
|
||||||
|
|
||||||
declare(strict_types=1);
|
declare(strict_types=1);
|
||||||
namespace MensBeam\Lit\Grammar;
|
namespace MensBeam\Lit\Grammar;
|
||||||
use MensBeam\Framework\Exception;
|
use MensBeam\Framework\Exception as FrameworkException;
|
||||||
|
|
||||||
class Exception extends FrameworkException {
|
class Exception extends FrameworkException {
|
||||||
const JSON_INVALID_FILE = 300;
|
const JSON_INVALID_FILE = 300;
|
||||||
|
|
|
@ -45,8 +45,11 @@ class Tokenizer {
|
||||||
// The stack of scopes
|
// The stack of scopes
|
||||||
protected array $scopeStack;
|
protected array $scopeStack;
|
||||||
|
|
||||||
|
protected array $previousMatches = [];
|
||||||
|
|
||||||
protected const SCOPE_RESOLVE_REGEX = '/\$(\d+)|\${(\d+):\/(downcase|upcase)}/S';
|
protected const SCOPE_RESOLVE_REGEX = '/\$(\d+)|\${(\d+):\/(downcase|upcase)}/S';
|
||||||
protected const ANCHOR_CHECK_REGEX = '/(?<!\\\)\\\([AGZz])/S';
|
protected const ANCHOR_CHECK_REGEX = '/(?<!\\\)\\\([AGZz])/S';
|
||||||
|
protected const BACK_REFERENCE_REGEX = '/\\\\(\d+)/S';
|
||||||
|
|
||||||
|
|
||||||
public function __construct(Data $data, Grammar $grammar) {
|
public function __construct(Data $data, Grammar $grammar) {
|
||||||
|
@ -122,136 +125,10 @@ class Tokenizer {
|
||||||
|
|
||||||
protected function tokenizeLine(int $stopOffset): array {
|
protected function tokenizeLine(int $stopOffset): array {
|
||||||
$tokens = [];
|
$tokens = [];
|
||||||
$injected = false;
|
|
||||||
|
|
||||||
while (true) {
|
while (true) {
|
||||||
// Grab the current rule list from the cache if available to prevent having to
|
$closestMatch = $this->findClosestMatch(end($this->ruleStack));
|
||||||
// splice in references repeatedly.
|
$this->previousMatches[] = $closestMatch;
|
||||||
$cacheIndex = array_search(end($this->ruleStack)->patterns, $this->ruleCacheIndexes);
|
|
||||||
if ($cacheIndex !== false) {
|
|
||||||
$currentRules = $this->ruleCacheValues[$cacheIndex];
|
|
||||||
} else {
|
|
||||||
$currentRules = end($this->ruleStack)->patterns;
|
|
||||||
|
|
||||||
if (!$this->activeInjection && $this->grammar->injections !== null) {
|
|
||||||
foreach ($this->grammar->injections as $selector => $injection) {
|
|
||||||
$selector = ScopeParser::parseSelector($selector);
|
|
||||||
if ($selector->matches($this->scopeStack)) {
|
|
||||||
$prefix = $selector->getPrefix($this->scopeStack);
|
|
||||||
if ($prefix === Filter::PREFIX_LEFT || $prefix === Filter::PREFIX_BOTH) {
|
|
||||||
$currentRules = [ ...$injection->patterns, ...$currentRules ];
|
|
||||||
if ($prefix === Filter::PREFIX_LEFT) {
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
if ($prefix === null || $prefix === Filter::PREFIX_RIGHT || $prefix === Filter::PREFIX_BOTH) {
|
|
||||||
$currentRules = [ ...$currentRules, ...$injection->patterns ];
|
|
||||||
}
|
|
||||||
|
|
||||||
$injected = true;
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
$currentRulesCount = count($currentRules);
|
|
||||||
$closestMatch = null;
|
|
||||||
|
|
||||||
// Iterate through the rules to find matches for the line at the current offset.
|
|
||||||
for ($i = 0; $i < $currentRulesCount; $i++) {
|
|
||||||
while (true) {
|
|
||||||
$rule = $currentRules[$i];
|
|
||||||
|
|
||||||
// Grammar references can return false if the grammar does not exist, so
|
|
||||||
// continue on if the current rule is false.
|
|
||||||
if ($rule === false) {
|
|
||||||
continue 2;
|
|
||||||
}
|
|
||||||
|
|
||||||
// If the rule is a Pattern
|
|
||||||
if ($rule instanceof Pattern) {
|
|
||||||
if (preg_match($rule->match, $this->line . ((!$this->data->lastLine) ? "\n" : ''), $match, PREG_OFFSET_CAPTURE, $this->offset) === 1) {
|
|
||||||
// Throw out pattern regexes with anchors that shouldn't match the current line.
|
|
||||||
// This is necessary because the tokenizer is fed data line by line and
|
|
||||||
// therefore anchors that match the beginning of the document and the end won't
|
|
||||||
// do anything.
|
|
||||||
if (preg_match(
|
|
||||||
self::ANCHOR_CHECK_REGEX, $rule->match, $validRegexMatch) === 1 && (
|
|
||||||
// \A anchors match the beginning of the whole string, not just this line
|
|
||||||
($validRegexMatch[1] === 'A' && !$this->data->firstLine) ||
|
|
||||||
// \z anchors match the end of the whole string, not just this line
|
|
||||||
($validRegexMatch[1] === 'z' && !$this->data->lastLine) ||
|
|
||||||
// \Z anchors match the end of the whole string or before the final newline if
|
|
||||||
// there's a trailing newline in the string
|
|
||||||
($validRegexMatch[1] === 'Z' && !$this->data->lastLineBeforeFinalNewLine)
|
|
||||||
)
|
|
||||||
) {
|
|
||||||
continue 2;
|
|
||||||
}
|
|
||||||
|
|
||||||
// If the match's offset is the same as the current offset then it is the
|
|
||||||
// closest match. There's no need to iterate anymore through the patterns.
|
|
||||||
if ($match[0][1] === $this->offset) {
|
|
||||||
$closestMatch = [
|
|
||||||
'match' => $match,
|
|
||||||
'pattern' => $rule
|
|
||||||
];
|
|
||||||
break 2;
|
|
||||||
}
|
|
||||||
// Otherwise, if the closest match is currently null or the match's offset is
|
|
||||||
// less than the closest match's offset then set the match as the closest match
|
|
||||||
// and continue looking for a closer one.
|
|
||||||
elseif ($closestMatch === null || $match[0][1] < $closestMatch['match'][0][1]) {
|
|
||||||
$closestMatch = [
|
|
||||||
'match' => $match,
|
|
||||||
'pattern' => $rule
|
|
||||||
];
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
// Otherwise, if the rule is a Reference then retrieve its patterns, splice into
|
|
||||||
// the rule list, and reprocess the rule.
|
|
||||||
elseif ($rule instanceof Reference) {
|
|
||||||
if (!$rule instanceof BaseReference) {
|
|
||||||
$obj = $rule->get();
|
|
||||||
if ($obj instanceof Grammar || ($rule instanceof RepositoryReference && $obj->match === null)) {
|
|
||||||
$obj = $obj->patterns;
|
|
||||||
}
|
|
||||||
} else {
|
|
||||||
$obj = $this->grammar->patterns;
|
|
||||||
}
|
|
||||||
|
|
||||||
array_splice($currentRules, $i, 1, ($obj instanceof Pattern) ? [ $obj ] : $obj);
|
|
||||||
$currentRulesCount = count($currentRules);
|
|
||||||
|
|
||||||
// When the current rule list changes write it to the cache.
|
|
||||||
if ($cacheIndex === false) {
|
|
||||||
$this->ruleCacheIndexes[] = end($this->ruleStack)->patterns;
|
|
||||||
$cacheIndex = count($this->ruleCacheIndexes) - 1;
|
|
||||||
}
|
|
||||||
|
|
||||||
if ($injected) {
|
|
||||||
// Injections need to be re-evaluated against the scope stack every time they're
|
|
||||||
// injected so don't cache them.
|
|
||||||
$temp = $currentRules;
|
|
||||||
foreach ($temp as $k => $r) {
|
|
||||||
if ($r instanceof Pattern && $r->injection) {
|
|
||||||
unset($temp[$k]);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
$this->ruleCacheValues[$cacheIndex] = array_values($temp);
|
|
||||||
} else {
|
|
||||||
$this->ruleCacheValues[$cacheIndex] = $currentRules;
|
|
||||||
}
|
|
||||||
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
assert($this->debugClosestMatch($closestMatch));
|
assert($this->debugClosestMatch($closestMatch));
|
||||||
|
|
||||||
// If there were a match above...
|
// If there were a match above...
|
||||||
|
@ -261,7 +138,10 @@ class Tokenizer {
|
||||||
|
|
||||||
// If the subpattern begins after the offset then create a token from the bits
|
// If the subpattern begins after the offset then create a token from the bits
|
||||||
// of the line in-between the last token and the one(s) about to be created.
|
// of the line in-between the last token and the one(s) about to be created.
|
||||||
if ($match[0][1] > $this->offset) {
|
// However, don't do this if the pattern is an end pattern and its match
|
||||||
|
// contains a negative lookahead for the offset. This is due to a difference in
|
||||||
|
// how PCRE works versus the original Oniguruma.
|
||||||
|
if ($match[0][1] > $this->offset && !($pattern->endPattern && preg_match('/\(\?!\\\G\)/', $pattern->match) === 1)) {
|
||||||
$tokens[] = [
|
$tokens[] = [
|
||||||
'scopes' => $this->scopeStack,
|
'scopes' => $this->scopeStack,
|
||||||
'text' => substr($this->line, $this->offset, $match[0][1] - $this->offset)
|
'text' => substr($this->line, $this->offset, $match[0][1] - $this->offset)
|
||||||
|
@ -451,9 +331,12 @@ class Tokenizer {
|
||||||
}
|
}
|
||||||
|
|
||||||
// If the offset is before the end of the match then create a token from the
|
// If the offset is before the end of the match then create a token from the
|
||||||
// bits of the match from the offset until the end of the match.
|
// bits of the match from the offset until the end of the match. However, don't
|
||||||
|
// do this if the pattern is an end pattern and its match contains a negative
|
||||||
|
// lookahead for the offset. This is due to a difference in how PCRE works
|
||||||
|
// versus the original Oniguruma.
|
||||||
$endOffset = $match[0][1] + strlen($match[0][0]);
|
$endOffset = $match[0][1] + strlen($match[0][0]);
|
||||||
if ($endOffset > $this->offset) {
|
if ($endOffset > $this->offset && !($pattern->endPattern && preg_match('/\(\?!\\\G\)/', $pattern->match) === 1)) {
|
||||||
$tokens[] = [
|
$tokens[] = [
|
||||||
'scopes' => $this->scopeStack,
|
'scopes' => $this->scopeStack,
|
||||||
'text' => substr($this->line, $this->offset, $endOffset - $this->offset)
|
'text' => substr($this->line, $this->offset, $endOffset - $this->offset)
|
||||||
|
@ -502,6 +385,176 @@ class Tokenizer {
|
||||||
return $tokens;
|
return $tokens;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
 * Finds the rule of the supplied grammar or pattern whose regex matches the
 * current line closest to the current offset.
 *
 * References found in the rule list are resolved and spliced in place; the
 * expanded list is cached, keyed on the supplied pattern's own rule list.
 *
 * @param Grammar|Pattern $pattern The grammar or pattern whose rules are searched
 * @return array|null An array with the keys 'match' (the preg_match() result
 *     captured with PREG_OFFSET_CAPTURE) and 'pattern' (the rule that matched),
 *     or null when no rule matched
 */
protected function findClosestMatch(Grammar|Pattern $pattern): ?array {
    $injected = false;

    // Grab the current rule list from the cache if available to prevent having to
    // splice in references repeatedly.
    // NOTE(review): on a cache hit the injections below are not evaluated at all;
    // confirm that this is intended.
    $cacheIndex = array_search($pattern->patterns, $this->ruleCacheIndexes);
    if ($cacheIndex !== false) {
        $currentRules = $this->ruleCacheValues[$cacheIndex];
    } else {
        $currentRules = $pattern->patterns;

        if (!$this->activeInjection && $this->grammar->injections !== null) {
            foreach ($this->grammar->injections as $selector => $injection) {
                $selector = ScopeParser::parseSelector($selector);
                if (!$selector->matches($this->scopeStack)) {
                    continue;
                }

                $prefix = $selector->getPrefix($this->scopeStack);
                if ($prefix === Filter::PREFIX_LEFT || $prefix === Filter::PREFIX_BOTH) {
                    $currentRules = [ ...$injection->patterns, ...$currentRules ];
                }
                if ($prefix === null || $prefix === Filter::PREFIX_RIGHT || $prefix === Filter::PREFIX_BOTH) {
                    $currentRules = [ ...$currentRules, ...$injection->patterns ];
                }

                // Flag the injection for every prefix, left-prefixed ones included, so
                // that injected patterns are always stripped before the list is cached.
                $injected = true;
                break;
            }
        }
    }

    $closestMatch = null;
    $subject = $this->line . ((!$this->data->lastLine) ? "\n" : '');

    // Iterate through the rules to find matches for the line at the current offset.
    for ($i = 0, $currentRulesCount = count($currentRules); $i < $currentRulesCount; $i++) {
        while (true) {
            // Grammar references can return false if the grammar does not exist, so
            // continue on if the current rule is false. Splicing in an empty rule list
            // can also leave the index past the end of the list; treat that the same.
            $rule = $currentRules[$i] ?? false;
            if ($rule === false) {
                continue 2;
            }

            // If the rule is a Pattern
            if ($rule instanceof Pattern) {
                $ruleMatch = $rule->match;

                // If the rule is an end pattern with a back reference then it expects to be
                // able to reference a subpattern from its begin pattern. Replace the reference
                // with the matched subpattern and then match with it below.
                if ($rule->endPattern && preg_match(self::BACK_REFERENCE_REGEX, $ruleMatch) === 1) {
                    $beginMatch = null;
                    // Walk backwards through the previous matches using an index of its own;
                    // the outer rule index must not be disturbed.
                    for ($j = count($this->previousMatches) - 1; $j >= 0; $j--) {
                        $previous = $this->previousMatches[$j];
                        if ($previous === null || !$previous['pattern']->beginPattern) {
                            continue;
                        }

                        foreach ($previous['pattern']->patterns as $candidate) {
                            if ($candidate === $rule) {
                                $beginMatch = $previous['match'];
                                break 2;
                            }
                        }
                    }

                    if ($beginMatch !== null) {
                        // The captured text must be matched literally, so quote it (including the
                        // regex's delimiter, which is the first character of the pattern string).
                        $delimiter = $ruleMatch[0];
                        $ruleMatch = preg_replace_callback(
                            self::BACK_REFERENCE_REGEX,
                            static function (array $reference) use ($beginMatch, $delimiter): string {
                                $index = (int)$reference[1];
                                return isset($beginMatch[$index][0])
                                    ? preg_quote($beginMatch[$index][0], $delimiter)
                                    : $reference[0];
                            },
                            $ruleMatch
                        );
                    }
                }

                if (preg_match($ruleMatch, $subject, $match, PREG_OFFSET_CAPTURE, $this->offset) === 1) {
                    // Throw out pattern regexes with anchors that shouldn't match the current line.
                    // This is necessary because the tokenizer is fed data line by line and
                    // therefore anchors that match the beginning of the document and the end won't
                    // do anything.
                    if (
                        preg_match(self::ANCHOR_CHECK_REGEX, $ruleMatch, $validRegexMatch) === 1 && (
                            // \A anchors match the beginning of the whole string, not just this line
                            ($validRegexMatch[1] === 'A' && !$this->data->firstLine) ||
                            // \z anchors match the end of the whole string, not just this line
                            ($validRegexMatch[1] === 'z' && !$this->data->lastLine) ||
                            // \Z anchors match the end of the whole string or before the final newline if
                            // there's a trailing newline in the string
                            ($validRegexMatch[1] === 'Z' && !$this->data->lastLineBeforeFinalNewLine)
                        )
                    ) {
                        continue 2;
                    }

                    // If there is an existing match that doesn't match at the offset, the current
                    // matched rule is a begin pattern that doesn't capture anything, its end
                    // pattern would be the next match, and its end pattern also doesn't capture
                    // anything then discard this match. If this wasn't here it would continuously
                    // match this begin pattern followed by its end pattern, creating an infinite
                    // loop because the offset never moves forward.
                    if ($closestMatch !== null && $rule->beginPattern && $match[0][0] === '') {
                        $next = $this->findClosestMatch($rule);
                        if ($next !== null && $next['pattern']->endPattern && $next['match'][0][0] === '') {
                            continue 2;
                        }
                    }

                    // If the match's offset is the same as the current offset then it is the
                    // closest match. There's no need to iterate anymore through the patterns.
                    if ($match[0][1] === $this->offset) {
                        $closestMatch = [
                            'match' => $match,
                            'pattern' => $rule
                        ];
                        break 2;
                    }
                    // Otherwise, if the closest match is currently null or the match's offset is
                    // less than the closest match's offset then set the match as the closest match
                    // and continue looking for a closer one.
                    elseif ($closestMatch === null || $match[0][1] < $closestMatch['match'][0][1]) {
                        $closestMatch = [
                            'match' => $match,
                            'pattern' => $rule
                        ];
                    }
                }
            }
            // Otherwise, if the rule is a Reference then retrieve its patterns, splice into
            // the rule list, and reprocess the rule.
            elseif ($rule instanceof Reference) {
                if (!$rule instanceof BaseReference) {
                    $obj = $rule->get();
                    if ($obj instanceof Grammar || ($rule instanceof RepositoryReference && $obj->match === null)) {
                        $obj = $obj->patterns;
                    }
                } else {
                    $obj = $this->grammar->patterns;
                }

                array_splice($currentRules, $i, 1, ($obj instanceof Pattern) ? [ $obj ] : $obj);
                $currentRulesCount = count($currentRules);

                // When the current rule list changes write it to the cache. The key must be
                // the rule list this method was asked to search -- the same value used for the
                // lookup above -- and not whatever is on top of the rule stack, because this
                // method is also invoked recursively for rules that are not on the stack.
                if ($cacheIndex === false) {
                    $this->ruleCacheIndexes[] = $pattern->patterns;
                    $cacheIndex = count($this->ruleCacheIndexes) - 1;
                }

                if ($injected) {
                    // Injections need to be re-evaluated against the scope stack every time they're
                    // injected so don't cache them.
                    $temp = $currentRules;
                    foreach ($temp as $k => $r) {
                        if ($r instanceof Pattern && $r->injection) {
                            unset($temp[$k]);
                        }
                    }
                    $this->ruleCacheValues[$cacheIndex] = array_values($temp);
                } else {
                    $this->ruleCacheValues[$cacheIndex] = $currentRules;
                }

                continue;
            }

            break;
        }
    }

    return $closestMatch;
}
|
||||||
|
|
||||||
|
|
||||||
private function debugClosestMatch(?array $closestMatch): bool {
|
private function debugClosestMatch(?array $closestMatch): bool {
|
||||||
if (self::$debug) {
|
if (self::$debug) {
|
||||||
|
|
Loading…
Reference in a new issue