Browse Source

Subpattern tokenization now maintains line length

main
Dustin Wilson 3 years ago
parent
commit
b2ae3be4a7
  1. 2
      lib/Grammar.php
  2. 2
      lib/Highlight.php
  3. 17
      lib/Tokenizer.php

2
lib/Grammar.php

@ -203,7 +203,7 @@ class Grammar {
$value = preg_replace_callback('/\\\x\{([0-9A-Fa-f]+)\}/', function($matches) { $value = preg_replace_callback('/\\\x\{([0-9A-Fa-f]+)\}/', function($matches) {
return "\x{" . (((int)base_convert($matches[1], 16, 10) > 0x10ffff) ? '10ffff' : $matches[1]) . "}"; return "\x{" . (((int)base_convert($matches[1], 16, 10) > 0x10ffff) ? '10ffff' : $matches[1]) . "}";
}, $value); }, $value);
$p['match'] = "/$value/u"; $p['match'] = "/$value/Su";
$modified = true; $modified = true;
break; break;

2
lib/Highlight.php

@ -24,7 +24,7 @@ class Highlight {
$tokenList = $tokenizer->tokenize(); $tokenList = $tokenizer->tokenize();
foreach ($tokenList as $lineNumber => $tokens) { foreach ($tokenList as $lineNumber => $tokens) {
if ($lineNumber === 20) { if ($lineNumber === 24) {
var_export($tokens); var_export($tokens);
echo "\n"; echo "\n";
die(); die();

17
lib/Tokenizer.php

@ -118,8 +118,10 @@ class Tokenizer {
// If the rule is a Pattern // If the rule is a Pattern
if ($rule instanceof Pattern) { if ($rule instanceof Pattern) {
// Throw out pattern regexes with anchors that should match the current line. // Throw out pattern regexes with anchors that shouldn't match the current line.
// This is necessary because the tokenizer is fed data line by line. // This is necessary because the tokenizer is fed data line by line, so
// anchors meant to match the beginning or the end of the whole document
// cannot work as intended.
if (preg_match(self::ANCHOR_CHECK_REGEX, $rule->match, $validRegexMatch) === 1) { if (preg_match(self::ANCHOR_CHECK_REGEX, $rule->match, $validRegexMatch) === 1) {
if ( if (
// \A anchors match the beginning of the whole string, not just this line // \A anchors match the beginning of the whole string, not just this line
@ -201,7 +203,10 @@ class Tokenizer {
// create tokens from the captures. // create tokens from the captures.
if ($pattern->captures !== null) { if ($pattern->captures !== null) {
foreach ($match as $k => $m) { foreach ($match as $k => $m) {
if ($m[0] === '' || ($k === 0 && !isset($pattern->captures[0]))) { // Continue onto the next match if the capture match is empty, its offset is
// negative, there's no pattern capture for this match's index, or the match
// being processed is the first one and there is no capture for it.
if ($m[0] === '' || $m[1] < 0 || !isset($pattern->captures[$k]) || ($k === 0 && !isset($pattern->captures[0]))) {
continue; continue;
} }
@ -345,8 +350,10 @@ class Tokenizer {
if ($pattern->patterns !== null && $this->offset < $lineLength) { if ($pattern->patterns !== null && $this->offset < $lineLength) {
// If the pattern has just a regular match (meaning neither a begin nor an end // If the pattern has just a regular match (meaning neither a begin nor an end
// pattern) but has subpatterns then only tokenize the part of the line that's // pattern) but has subpatterns then only tokenize the part of the line that's
// within the match. // within the match. Otherwise, tokenize up to the line's length. Because of
$tokens = [ ...$tokens, ...$this->tokenizeLine($line, (!$pattern->beginPattern && !$pattern->endPattern) ? strlen($match[0][0]) : 0) ]; // recursion, the line length could be set by this step before or within the
// capture tokenization process.
$tokens = [ ...$tokens, ...$this->tokenizeLine($line, (!$pattern->beginPattern && !$pattern->endPattern) ? strlen($match[0][0]) : $lineLength) ];
} }
// If the offset is before the end of the match then create a token from the // If the offset is before the end of the match then create a token from the

Loading…
Cancel
Save