diff --git a/lib/Grammar.php b/lib/Grammar.php index 74dbdc8..f4167c0 100644 --- a/lib/Grammar.php +++ b/lib/Grammar.php @@ -200,11 +200,8 @@ class Grammar { // Escape forward slashes that aren't escaped in regexes. $value = preg_replace('/(? 0x10ffff) ? '10ffff' : $code; - return "\\x$code"; + $value = preg_replace_callback('/\\\x\{([0-9A-Fa-f]+)\}/', function($matches) { + return "\x{" . (((int)base_convert($matches[1], 16, 10) > 0x10ffff) ? '10ffff' : $matches[1]) . "}"; }, $value); $p['match'] = "/$value/u"; diff --git a/lib/Highlight.php b/lib/Highlight.php index 4b1cf2f..82e666f 100644 --- a/lib/Highlight.php +++ b/lib/Highlight.php @@ -24,7 +24,7 @@ class Highlight { $tokenList = $tokenizer->tokenize(); foreach ($tokenList as $lineNumber => $tokens) { - if ($lineNumber === 19) { + if ($lineNumber === 20) { var_export($tokens); echo "\n"; die(); diff --git a/lib/Tokenizer.php b/lib/Tokenizer.php index 7c9623e..bc3ca33 100644 --- a/lib/Tokenizer.php +++ b/lib/Tokenizer.php @@ -224,6 +224,10 @@ class Tokenizer { // If the capture has patterns of its own add the capture to the rule stack, // process the patterns, and then pop the capture off the stack. if ($pattern->captures[$k]->patterns !== null) { + if ($m[1] < $this->offset) { + die("MOTHERFUCKER!\n"); + } + $this->ruleStack[] = $pattern->captures[$k]; // Only tokenize the part of the line that's contains the match. $captureLength = $m[1] + strlen($m[0]); @@ -242,22 +246,79 @@ class Tokenizer { } array_pop($this->ruleStack); + $this->offset = $m[1] + strlen($m[0]); } // Otherwise, create a token for the capture. else { - $tokens[] = [ - 'scopes' => $this->scopeStack, - 'text' => $m[0] - ]; - $this->debugCount++; + // If the capture's offset is before the current offset then the new token needs + // to be spliced within previously emitted ones. + if ($m[1] < $this->offset) { + $curOffset = $this->offset; + // Go backwards through the tokens, find the token the current capture is + // within, and splice new tokens into the token array + for ($tokensLength = count($tokens), $i = $tokensLength - 1; $i >= 0; $i--) { + $cur = $tokens[$i]; + $curOffset -= strlen($cur['text']); + if ($m[1] >= $curOffset) { + // If the length of the new capture would put part of it outside the previous + // token then toss the token. + if ($m[1] + strlen($m[0]) > $curOffset + strlen($cur['text'])) { + // TODO: trigger a warning or something here maybe? + break; + } + + $t = []; + + // Add in token for anything before the new capture token within the token being + // spliced + $preMatchText = substr($cur['text'], 0, $m[1] - $curOffset); + if ($preMatchText !== '') { + $t[] = [ + 'scopes' => $cur['scopes'], + 'text' => $preMatchText + ]; + } + + // The new capture's scope needs to be added to the prior token's scope stack to + // make the stack for the new one. + $scopeStack = $cur['scopes']; + $scopeStack[] = $pattern->captures[$k]->name; + $t[] = [ + 'scopes' => $scopeStack, + 'text' => $m[0] + ]; + + // Add in token for anything after the new capture token within the token being + // spliced + $postMatchText = substr($cur['text'], $m[1] - $curOffset + strlen($m[0])); + if ($postMatchText !== '') { + $t[] = [ + 'scopes' => $cur['scopes'], + 'text' => $postMatchText + ]; + } + + array_splice($tokens, $i, 1, $t); + $this->offset = $match[$k - 1][1] + strlen($match[$k - 1][0]); + break; + } + } + + $this->debugCount = count($tokens); + } else { + $tokens[] = [ + 'scopes' => $this->scopeStack, + 'text' => $m[0] + ]; + $this->debugCount++; + $this->offset = $m[1] + strlen($m[0]); + } } // Pop the capture's name off the scope stack. if ($pattern->captures[$k]->name !== null) { array_pop($this->scopeStack); } - - $this->offset = $m[1] + strlen($m[0]); } } // Otherwise, if the rule doesn't have captures then a token is created from the