From 434c29a03f717cf56c5cb2f6e22765c3f8503de6 Mon Sep 17 00:00:00 2001 From: Dustin Wilson Date: Mon, 13 Sep 2021 14:10:19 -0500 Subject: [PATCH] Tokenization looks to be working... VERY SLOWLY --- lib/Highlight.php | 3 --- lib/Tokenizer.php | 43 +++++++++++++++++++++---------------------- 2 files changed, 21 insertions(+), 25 deletions(-) diff --git a/lib/Highlight.php b/lib/Highlight.php index 2fb8f6d..4539dde 100644 --- a/lib/Highlight.php +++ b/lib/Highlight.php @@ -24,9 +24,6 @@ class Highlight { $tokenList = $tokenizer->tokenize(); foreach ($tokenList as $lineNumber => $tokens) { - if ($lineNumber === 38) { - die(); - } } } } \ No newline at end of file diff --git a/lib/Tokenizer.php b/lib/Tokenizer.php index 81935a8..db0a858 100644 --- a/lib/Tokenizer.php +++ b/lib/Tokenizer.php @@ -25,6 +25,7 @@ class Tokenizer { protected Grammar $grammar; protected int $offset = 0; protected ?Pattern $activeInjection = null; + protected string $line = ''; protected int $lineNumber = 1; protected array $ruleStack; protected array $scopeStack; @@ -43,13 +44,13 @@ class Tokenizer { public function tokenize(): \Generator { foreach ($this->data->get() as $lineNumber => $line) { - assert($this->debugLine($lineNumber, $line)); - $this->lineNumber = $lineNumber; + $this->line = $line; + assert($this->debugLine()); $this->offset = 0; $lineLength = strlen($line); - $tokens = ($lineLength > 0) ? $this->tokenizeLine($line) : []; + $tokens = ($lineLength > 0) ? $this->tokenizeLine($lineLength) : []; // Output a token for everything else contained on the line including the // newline or just a newline if there weren't any spare characters left on the @@ -68,7 +69,6 @@ class Tokenizer { } assert($this->debugTokens($tokens)); - yield $lineNumber => $tokens; } } @@ -88,12 +88,8 @@ class Tokenizer { }, $scopeName); } - protected function tokenizeLine(string $line, int $lineLength = 0): array { + protected function tokenizeLine(int $lineLength): array { $tokens = []; - // When processing subpatterns a linelength is specified based upon the parent - // match's string length (like with captures), otherwise set the line length to - // the entire line. - $lineLength = ($lineLength === 0) ? strlen($line) : $lineLength; while (true) { if ($this->activeInjection === null && $this->grammar->injections !== null) { @@ -139,7 +135,7 @@ class Tokenizer { } } - if (preg_match($rule->match, "$line\n", $match, PREG_OFFSET_CAPTURE, $this->offset)) { + if (preg_match($rule->match, "{$this->line}\n", $match, PREG_OFFSET_CAPTURE, $this->offset)) { // If the match's offset is the same as the current offset then it is the // closest match. There's no need to iterate anymore through the patterns. if ($match[0][1] === $this->offset) { @@ -193,7 +189,7 @@ class Tokenizer { if ($match[0][1] > $this->offset) { $tokens[] = [ 'scopes' => $this->scopeStack, - 'text' => substr($line, $this->offset, $match[0][1] - $this->offset) + 'text' => substr($this->line, $this->offset, $match[0][1] - $this->offset) ]; $this->offset = $match[0][1]; } @@ -219,7 +215,7 @@ class Tokenizer { if ($k > 0 && $m[1] > $this->offset) { $tokens[] = [ 'scopes' => $this->scopeStack, - 'text' => substr($line, $this->offset, $m[1] - $this->offset) + 'text' => substr($this->line, $this->offset, $m[1] - $this->offset) ]; $this->offset = $m[1]; @@ -240,14 +236,14 @@ class Tokenizer { $this->ruleStack[] = $pattern->captures[$k]; // Only tokenize the part of the line that's contains the match. $captureEndOffset = $m[1] + strlen($m[0]); - $tokens = [ ...$tokens, ...$this->tokenizeLine($line, $captureEndOffset) ]; + $tokens = [ ...$tokens, ...$this->tokenizeLine($captureEndOffset) ]; // If the offset is before the end of the capture then create a token from the // bits of the capture from the offset until the end of the capture. if ($captureEndOffset > $this->offset) { $tokens[] = [ 'scopes' => $this->scopeStack, - 'text' => substr($line, $this->offset, $captureEndOffset - $this->offset) + 'text' => substr($this->line, $this->offset, $captureEndOffset - $this->offset) ]; $this->offset = $captureEndOffset; } @@ -361,7 +357,7 @@ class Tokenizer { // within the match. Otherwise, tokenize up to the line's length. Because of // recursion, the line length could be set by this step before or within the // capture tokenization process. - $tokens = [ ...$tokens, ...$this->tokenizeLine($line, (!$pattern->beginPattern && !$pattern->endPattern) ? strlen($match[0][0]) : $lineLength) ]; + $tokens = [ ...$tokens, ...$this->tokenizeLine((!$pattern->beginPattern && !$pattern->endPattern) ? strlen($match[0][0]) : $lineLength) ]; } // If the offset is before the end of the match then create a token from the @@ -370,7 +366,7 @@ class Tokenizer { if ($endOffset > $this->offset) { $tokens[] = [ 'scopes' => $this->scopeStack, - 'text' => substr($line, $this->offset, $endOffset - $this->offset) + 'text' => substr($this->line, $this->offset, $endOffset - $this->offset) ]; $this->offset = $endOffset; } @@ -409,9 +405,8 @@ class Tokenizer { } } - // If the offset isn't at the end of the line then look for more matches but - // only if the currently tokenized pattern wasn't an end pattern. - if (!$pattern->endPattern && $this->offset < $lineLength) { + // If the offset isn't at the end of the line then look for more matches. + if ($this->offset < $lineLength) { continue; } } @@ -481,7 +476,7 @@ class Tokenizer { return ($count > 0) ? preg_replace('/^/m', str_repeat('|', $count) . ' ', $message) : $message; } - private function debugLine(int $lineNumber, string $line): bool { + private function debugLine(): bool { if (self::$debug) { $message = <<lineNumber} ", 80, '-'), + preg_replace('/\\\\{2}/', '\\', var_export($this->line, true)) + ); } return true; @@ -498,7 +497,7 @@ class Tokenizer { public function debugTokens(array $tokens): bool { if (self::$debug) { - echo 'Tokens: ' . var_export($tokens, true) . "\n\n"; + echo 'Tokens: ' . preg_replace('/\\\\{2}/', '\\', var_export($tokens, true)) . "\n\n"; } return true;