Browse Source

Fixed tokenization of pattern and capture leftovers

main
Dustin Wilson 3 years ago
parent
commit
c055e9f3ba
  1. 10
      lib/Grammar.php
  2. 4
      lib/Highlight.php
  3. 41
      lib/Tokenizer.php

10
lib/Grammar.php

@@ -234,10 +234,12 @@ class Grammar {
case 'match':
// Escape forward slashes that aren't escaped in regexes.
$value = preg_replace('/(?<!\\\)\//', '\/', $value);
// Truncate Unicode character codes that are too long.
$value = preg_replace_callback('/\\\x\{([0-9A-Fa-f]{6,})\}/', function($matches) {
$code = ((int)base_convert($matches[1], 16, 10) > 0x10ffff) ? '10ffff' : $matches[1];
return "\\x{"."$code}";
// Fix Oniguruma long character codes.
$value = preg_replace_callback('/\\\x\{(7[0-9A-Fa-f]+)\}/', function($matches) {
// Remove the 7
$code = substr($matches[1], 1);
$code = ((int)base_convert($matches[1], 16, 10) > 0x10ffff) ? '10ffff' : $code;
return "\\x$code";
}, $value);
$p['match'] = "/$value/u";

4
lib/Highlight.php

@@ -28,9 +28,9 @@ class Highlight {
$tokenList = $tokenizer->tokenize();
foreach ($tokenList as $lineNumber => $tokens) {
//var_export($tokens);
//echo "\n";
if ($lineNumber === 7) {
var_export($tokens);
echo "\n";
die();
}
}

41
lib/Tokenizer.php

@@ -143,11 +143,6 @@ class Tokenizer {
$match = $closestMatch['match'];
$pattern = $closestMatch['pattern'];
if ($this->debug === 7) {
var_export($closestMatch);
echo "\n";
}
// **¡TEMPORARY!** Haven't implemented begin and end line
// anchors, so let's toss patterns with them completely for now.
//if (preg_match('/\\\(?:A|G|Z)/', $rule->match)) {
@@ -178,6 +173,17 @@ class Tokenizer {
continue;
}
// If the capture begins after the offset then create a token from the bits of
// the line in-between the last token and the one(s) about to be created.
if ($k > 0 && $m[1] > $this->offset) {
$tokens[] = new Token(
$this->scopeStack,
substr($line, $this->offset, $m[1] - $this->offset)
);
$this->debugCount++;
$this->offset = $m[1];
}
// If the capture has a name add it to the scope stack.
if ($pattern->captures[$k]->name !== null) {
$this->scopeStack[] = $this->resolveScopeName($pattern->captures[$k]->name, $match);
@@ -188,6 +194,19 @@ class Tokenizer {
if ($pattern->captures[$k]->patterns !== null) {
$this->ruleStack[] = $pattern->captures[$k];
$tokens = [ ...$tokens, ...$this->tokenizeLine($line) ];
// If the offset is before the end of the capture then create a token from the
// bits of the capture from the offset until the end of the capture.
$endOffset = $m[1] + strlen($m[0]);
if ($endOffset > $this->offset) {
$tokens[] = new Token(
$this->scopeStack,
substr($line, $this->offset, $endOffset - $this->offset)
);
$this->debugCount++;
$this->offset = $endOffset;
}
array_pop($this->ruleStack);
}
// Otherwise, create a token for the capture.
@@ -232,6 +251,18 @@ class Tokenizer {
$tokens = [ ...$tokens, ...$this->tokenizeLine($line) ];
}
// If the offset is before the end of the match then create a token from the
// bits of the match from the offset until the end of the match.
$endOffset = $match[0][1] + strlen($match[0][0]);
if ($endOffset > $this->offset) {
$tokens[] = new Token(
$this->scopeStack,
substr($line, $this->offset, $endOffset - $this->offset)
);
$this->debugCount++;
$this->offset = $endOffset;
}
if (!$pattern->beginPattern) {
if ($pattern->endPattern) {
while (!end($this->ruleStack)->beginPattern) {

Loading…
Cancel
Save