Browse Source

Minor tokenization bug fixes

• When calculating the offset after handling overlapping tokens, the tokenizer is now aware of invalid capture offsets (meaning those captures matched nothing).
• Tokenizer::tokenizeLine no longer continues looking for new matches when the newly tokenized pattern was an end pattern.
• Grammars no longer have beginCaptures incorrectly applied to end patterns.
main
Dustin Wilson 3 years ago
parent
commit
c04e54d5ed
  1. 12
      lib/Grammar.php
  2. 4
      lib/Highlight.php
  3. 28
      lib/Tokenizer.php

12
lib/Grammar.php

@ -131,12 +131,6 @@ class Grammar {
$modified = true; $modified = true;
if (isset($pattern['beginCaptures'])) {
$pattern['captures'] = $pattern['beginCaptures'];
} elseif (isset($pattern['captures'])) {
$pattern['captures'] = $pattern['captures'];
}
$endCaptures = null; $endCaptures = null;
if (isset($pattern['endCaptures'])) { if (isset($pattern['endCaptures'])) {
$endCaptures = $pattern['endCaptures']; $endCaptures = $pattern['endCaptures'];
@ -144,6 +138,12 @@ class Grammar {
$endCaptures = $pattern['captures']; $endCaptures = $pattern['captures'];
} }
if (isset($pattern['beginCaptures'])) {
$pattern['captures'] = $pattern['beginCaptures'];
} elseif (isset($pattern['captures'])) {
$pattern['captures'] = $pattern['captures'];
}
$endPattern = [ $endPattern = [
'match' => $pattern['end'], 'match' => $pattern['end'],
'endPattern' => true 'endPattern' => true

4
lib/Highlight.php

@ -24,9 +24,7 @@ class Highlight {
$tokenList = $tokenizer->tokenize(); $tokenList = $tokenizer->tokenize();
foreach ($tokenList as $lineNumber => $tokens) { foreach ($tokenList as $lineNumber => $tokens) {
if ($lineNumber === 26) { if ($lineNumber === 38) {
//var_export($tokens);
//echo "\n";
die(); die();
} }
} }

28
lib/Tokenizer.php

@ -21,9 +21,11 @@ class Tokenizer {
public static bool $debug = false; public static bool $debug = false;
protected Data $data; protected Data $data;
protected int $debugCount = 0;
protected Grammar $grammar; protected Grammar $grammar;
protected int $offset = 0; protected int $offset = 0;
protected ?Pattern $activeInjection = null; protected ?Pattern $activeInjection = null;
protected int $lineNumber = 1;
protected array $ruleStack; protected array $ruleStack;
protected array $scopeStack; protected array $scopeStack;
@ -43,6 +45,7 @@ class Tokenizer {
foreach ($this->data->get() as $lineNumber => $line) { foreach ($this->data->get() as $lineNumber => $line) {
assert($this->debugLine($lineNumber, $line)); assert($this->debugLine($lineNumber, $line));
$this->lineNumber = $lineNumber;
$this->offset = 0; $this->offset = 0;
$lineLength = strlen($line); $lineLength = strlen($line);
@ -218,6 +221,7 @@ class Tokenizer {
'scopes' => $this->scopeStack, 'scopes' => $this->scopeStack,
'text' => substr($line, $this->offset, $m[1] - $this->offset) 'text' => substr($line, $this->offset, $m[1] - $this->offset)
]; ];
$this->offset = $m[1]; $this->offset = $m[1];
} }
@ -302,17 +306,25 @@ class Tokenizer {
} }
array_splice($tokens, $i, 1, $t); array_splice($tokens, $i, 1, $t);
$this->offset = $match[$k - 1][1] + strlen($match[$k - 1][0]);
// Find the nearest index to the match that doesn't have an invalid offset value
// (meaning that particular capture matched nothing) and set the offset to the
// end of that match.
$j = count($match) - 2;
while ($match[$j][1] === -1 || $match[$j][1] === null) {
$j--;
}
$this->offset = $match[$j][1] + strlen($match[$j][0]);
break; break;
} }
} }
$this->debugCount = count($tokens);
} else { } else {
$tokens[] = [ $tokens[] = [
'scopes' => $this->scopeStack, 'scopes' => $this->scopeStack,
'text' => $m[0] 'text' => $m[0]
]; ];
$this->offset = $m[1] + strlen($m[0]); $this->offset = $m[1] + strlen($m[0]);
} }
} }
@ -365,6 +377,7 @@ class Tokenizer {
if (!$pattern->beginPattern) { if (!$pattern->beginPattern) {
if ($pattern->endPattern) { if ($pattern->endPattern) {
// Pop everything off both stacks until a begin pattern is reached.
while (!end($this->ruleStack)->beginPattern) { while (!end($this->ruleStack)->beginPattern) {
$popped = array_pop($this->ruleStack); $popped = array_pop($this->ruleStack);
@ -396,8 +409,9 @@ class Tokenizer {
} }
} }
// If the offset isn't at the end of the line then look for more matches. // If the offset isn't at the end of the line then look for more matches but
if ($this->offset !== $lineLength) { // only if the currently tokenized pattern wasn't an end pattern.
if (!$pattern->endPattern && $this->offset < $lineLength) {
continue; continue;
} }
} }
@ -427,16 +441,20 @@ class Tokenizer {
private function debugClosestMatch(?array $closestMatch): bool { private function debugClosestMatch(?array $closestMatch): bool {
if (self::$debug) { if (self::$debug) {
$message = <<<DEBUG $message = <<<DEBUG
Offset: %s
Regex: %s Regex: %s
Scope: %s Scope: %s
HasCaptures: %s
BeginPattern: %s BeginPattern: %s
EndPattern: %s EndPattern: %s
Match: %s Match: %s
DEBUG; DEBUG;
$message = sprintf($message, $message = sprintf($message,
$this->offset,
$closestMatch['pattern']->match ?? 'NULL', $closestMatch['pattern']->match ?? 'NULL',
$closestMatch['pattern']->name ?? 'NULL', $closestMatch['pattern']->name ?? 'NULL',
($closestMatch !== null && $closestMatch['pattern']->captures !== null) ? 'yes' : 'no',
var_export($closestMatch['pattern']->beginPattern ?? null, true), var_export($closestMatch['pattern']->beginPattern ?? null, true),
var_export($closestMatch['pattern']->endPattern ?? null, true), var_export($closestMatch['pattern']->endPattern ?? null, true),
var_export($closestMatch['match'] ?? null, true) var_export($closestMatch['match'] ?? null, true)

Loading…
Cancel
Save