From c04e54d5ed0a698e56d09e8eb83b19788de5cc5c Mon Sep 17 00:00:00 2001 From: Dustin Wilson Date: Thu, 26 Aug 2021 00:28:27 -0500 Subject: [PATCH] Minor tokenization bug fixes MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit • When calculating the offset after handling overlapping tokens it is now aware of invalid capture offsets (meaning they matched nothing). • Tokenizer::tokenizeLine now correctly does not continue looking for new matches when the newly tokenized pattern was an end pattern. • Grammars no longer have beginCaptures incorrectly applied to end patterns. --- lib/Grammar.php | 12 ++++++------ lib/Highlight.php | 4 +--- lib/Tokenizer.php | 28 +++++++++++++++++++++++----- 3 files changed, 30 insertions(+), 14 deletions(-) diff --git a/lib/Grammar.php b/lib/Grammar.php index 97185b7..5afaeaf 100644 --- a/lib/Grammar.php +++ b/lib/Grammar.php @@ -131,12 +131,6 @@ class Grammar { $modified = true; - if (isset($pattern['beginCaptures'])) { - $pattern['captures'] = $pattern['beginCaptures']; - } elseif (isset($pattern['captures'])) { - $pattern['captures'] = $pattern['captures']; - } - $endCaptures = null; if (isset($pattern['endCaptures'])) { $endCaptures = $pattern['endCaptures']; @@ -144,6 +138,12 @@ class Grammar { $endCaptures = $pattern['captures']; } + if (isset($pattern['beginCaptures'])) { + $pattern['captures'] = $pattern['beginCaptures']; + } elseif (isset($pattern['captures'])) { + $pattern['captures'] = $pattern['captures']; + } + $endPattern = [ 'match' => $pattern['end'], 'endPattern' => true diff --git a/lib/Highlight.php b/lib/Highlight.php index 3a42957..2fb8f6d 100644 --- a/lib/Highlight.php +++ b/lib/Highlight.php @@ -24,9 +24,7 @@ class Highlight { $tokenList = $tokenizer->tokenize(); foreach ($tokenList as $lineNumber => $tokens) { - if ($lineNumber === 26) { - //var_export($tokens); - //echo "\n"; + if ($lineNumber === 38) { die(); } } diff --git a/lib/Tokenizer.php b/lib/Tokenizer.php index 
35dad45..81935a8 100644 --- a/lib/Tokenizer.php +++ b/lib/Tokenizer.php @@ -21,9 +21,11 @@ class Tokenizer { public static bool $debug = false; protected Data $data; + protected int $debugCount = 0; protected Grammar $grammar; protected int $offset = 0; protected ?Pattern $activeInjection = null; + protected int $lineNumber = 1; protected array $ruleStack; protected array $scopeStack; @@ -43,6 +45,7 @@ class Tokenizer { foreach ($this->data->get() as $lineNumber => $line) { assert($this->debugLine($lineNumber, $line)); + $this->lineNumber = $lineNumber; $this->offset = 0; $lineLength = strlen($line); @@ -218,6 +221,7 @@ class Tokenizer { 'scopes' => $this->scopeStack, 'text' => substr($line, $this->offset, $m[1] - $this->offset) ]; + $this->offset = $m[1]; } @@ -302,17 +306,25 @@ class Tokenizer { } array_splice($tokens, $i, 1, $t); - $this->offset = $match[$k - 1][1] + strlen($match[$k - 1][0]); + + // Find the nearest index to the match that doesn't have an invalid offset value + // (meaning that particular capture matched nothing) and set the offset to the + // end of that match. + $j = count($match) - 2; + while ($match[$j][1] === -1 || $match[$j][1] === null) { + $j--; + } + + $this->offset = $match[$j][1] + strlen($match[$j][0]); break; } } - - $this->debugCount = count($tokens); } else { $tokens[] = [ 'scopes' => $this->scopeStack, 'text' => $m[0] ]; + $this->offset = $m[1] + strlen($m[0]); } } @@ -365,6 +377,7 @@ class Tokenizer { if (!$pattern->beginPattern) { if ($pattern->endPattern) { + // Pop everything off both stacks until a begin pattern is reached. while (!end($this->ruleStack)->beginPattern) { $popped = array_pop($this->ruleStack); @@ -396,8 +409,9 @@ class Tokenizer { } } - // If the offset isn't at the end of the line then look for more matches. - if ($this->offset !== $lineLength) { + // If the offset isn't at the end of the line then look for more matches but + // only if the currently tokenized pattern wasn't an end pattern. 
+ if (!$pattern->endPattern && $this->offset < $lineLength) { continue; } } @@ -427,16 +441,20 @@ class Tokenizer { private function debugClosestMatch(?array $closestMatch): bool { if (self::$debug) { $message = <<offset, $closestMatch['pattern']->match ?? 'NULL', $closestMatch['pattern']->name ?? 'NULL', + ($closestMatch !== null && $closestMatch['pattern']->captures !== null) ? 'yes' : 'no', var_export($closestMatch['pattern']->beginPattern ?? null, true), var_export($closestMatch['pattern']->endPattern ?? null, true), var_export($closestMatch['match'] ?? null, true)