From adf7cd733195944d7a3c545e2629e15ebb51d7aa Mon Sep 17 00:00:00 2001 From: Dustin Wilson Date: Wed, 18 Aug 2021 22:45:14 -0500 Subject: [PATCH] Misunderstood matching process, still broken lol MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit • Previously, the first pattern whose regex matched the line was processed into tokens. This is apparently incorrect. Instead, the pattern whose match begins closest to the current offset wins. Changes reflect this. --- lib/Highlight.php | 2 +- lib/Token.php | 6 +- lib/Tokenizer.php | 261 ++++++++++++++++++++++++++-------------------- 3 files changed, 154 insertions(+), 115 deletions(-) diff --git a/lib/Highlight.php b/lib/Highlight.php index bad5b55..4d3b923 100644 --- a/lib/Highlight.php +++ b/lib/Highlight.php @@ -30,7 +30,7 @@ class Highlight { foreach ($tokenList as $lineNumber => $tokens) { var_export($tokens); echo "\n"; - if ($lineNumber === 2) { + if ($lineNumber === 6) { die(); } } diff --git a/lib/Token.php b/lib/Token.php index 072abe4..83518ef 100644 --- a/lib/Token.php +++ b/lib/Token.php @@ -9,11 +9,11 @@ namespace dW\Lit; class Token { use FauxReadOnly; protected array $_scopes; - protected string $_string; + protected string $_text; - public function __construct(array $scopes, string $string) { + public function __construct(array $scopes, string $text) { $this->_scopes = $scopes; - $this->string = $string; + $this->text = $text; } } \ No newline at end of file diff --git a/lib/Tokenizer.php b/lib/Tokenizer.php index 4a5935e..af96c15 100644 --- a/lib/Tokenizer.php +++ b/lib/Tokenizer.php @@ -24,6 +24,7 @@ class Tokenizer { protected array $ruleStack; protected array $scopeStack; protected int $debug = 0; + protected int $debugCount = 0; public function __construct(\Generator $data, Grammar $grammar) { @@ -38,16 +39,20 @@ class Tokenizer { foreach ($this->data as $lineNumber => $line) { $this->debug = $lineNumber; $this->offset = 0; - $tokens = 
$this->tokenizeLine($line); - // If after tokenizing the line the entire line still hasn't been tokenized then - // create a token of the rest of the line. $lineLength = strlen($line); + $tokens = ($lineLength > 0) ? $this->tokenizeLine($line) : []; + + // Output a token for everything else contained on the line including the + // newline or just a newline if there weren't any spare characters left on the + // line. $tokens[] = new Token( $this->scopeStack, - ($this->offset < $lineLength) ? substr($line, $this->offset, $lineLength) . "\n" : "\n" + ($this->offset < $lineLength) ? substr($line, $this->offset, $lineLength - $this->offset) . "\n" : "\n" ); + $this->debugCount++; + yield $lineNumber => $tokens; } } @@ -71,131 +76,159 @@ class Tokenizer { $tokens = []; $lineLength = strlen($line); - if ($this->activeInjection === null && $this->grammar->injections !== null) { - foreach ($this->grammar->injections as $selector => $injection) { - $selector = ScopeParser::parseSelector($selector); - if ($selector->matches($this->scopeStack)) { - $prefix = $selector->getPrefix($this->scopeStack); - if ($prefix === Filter::PREFIX_LEFT || $prefix === Filter::PREFIX_BOTH) { - $this->scopeStack[] = $injection; - $this->activeInjection = $injection; - break; + while (true) { + if ($this->activeInjection === null && $this->grammar->injections !== null) { + foreach ($this->grammar->injections as $selector => $injection) { + $selector = ScopeParser::parseSelector($selector); + if ($selector->matches($this->scopeStack)) { + $prefix = $selector->getPrefix($this->scopeStack); + if ($prefix === Filter::PREFIX_LEFT || $prefix === Filter::PREFIX_BOTH) { + $this->scopeStack[] = $injection; + $this->activeInjection = $injection; + break; + } } } } - } - while (true) { $currentRules = end($this->ruleStack)->patterns; $currentRulesCount = count($currentRules); + $nextMatch = null; for ($i = 0; $i < $currentRulesCount; $i++) { while (true) { $rule = $currentRules[$i]; - if ($rule instanceof 
Pattern) { - echo "Match: {$rule->match}\n\n"; + if ($this->debug === 6 && $this->debugCount === 12) { + if ($rule instanceof Pattern) { + echo "Match: {$rule->match}\n"; + } } - // If the rule is a Pattern and matches the line at the offset then tokenize the - // matches. + // If the rule is a Pattern and matches the line at the offset then... if ($rule instanceof Pattern && preg_match($rule->match, $line, $match, PREG_OFFSET_CAPTURE, $this->offset)) { - // ¡TEMPORARY! Haven't implemented begin and end line - // anchors, so let's toss them completely. - if (preg_match('/\\\(?:A|G|Z)/', $rule->match)) { - continue 2; + $match = [ + 'match' => $match, + 'pattern' => $rule + ]; + + if ($match['match'][0][1] === $this->offset) { + $nextMatch = $match; + break 2; + } elseif ($match['match'][0][1] < $nextMatch['match'][0][1]) { + $nextMatch = $match; } - - // Add the name and contentName to the scope stack - // if present. - if ($rule->name !== null) { - $this->scopeStack[] = $this->resolveScopeName($rule->name, $match); + } + // Otherwise, if the rule is a Reference then retrieve its patterns, splice into + // the rule list, and reprocess the rule. + elseif ($rule instanceof Reference && $obj = $rule->get()) { + if ($obj instanceof Grammar || ($rule instanceof RepositoryReference && $obj->match === null)) { + $obj = $obj->patterns; } - if ($rule->captures !== null) { - // Iterate through each of the matched subpatterns and create tokens from the - // captures. - foreach ($match as $k => $m) { - if ($m[0] === '' || ($k === 0 && !isset($rule->captures[0]))) { - continue; - } - - // If the subpattern begins after the offset then create a token from the bits - // of the line in-between the last token and the one about to be created. 
- if ($m[1] > $this->offset) { - $scopeStack = $this->scopeStack; - // If this is the first capture, then the scopes added to the stack need to be - // removed from this token's scope stack as this will grab everything before - // this match began. - if ($k === 0 && $rule->name !== null) { - array_pop($scopeStack); - } - - $tokens[] = new Token( - $scopeStack, - substr($line, $this->offset, $m[1]) - ); - $this->offset = $m[1]; - } - - if ($rule->captures[$k]->name !== null) { - $this->scopeStack[] = $this->resolveScopeName($rule->captures[$k]->name, $match); - } - - if ($rule->captures[$k]->patterns !== null) { - $this->ruleStack[] = $rule->captures[$k]; - $tokens = [ ...$tokens, ...$this->tokenizeLine($line) ]; - array_pop($this->ruleStack); - } else { - $tokens[] = new Token( - $this->scopeStack, - $m[0] - ); - } - - if ($rule->captures[$k]->name !== null) { - array_pop($this->scopeStack); - } - - $this->offset = $m[1] + strlen($m[0]); - } - } + array_splice($currentRules, $i, 1, $obj); + $currentRulesCount = count($currentRules); + continue; + } + + break; + } + } + + // If there were a match above... + if ($nextMatch !== null) { + $match = $nextMatch['match']; + $pattern = $nextMatch['pattern']; + + // **¡TEMPORARY!** Haven't implemented begin and end line + // anchors, so let's toss patterns with them completely for now. + if (preg_match('/\\\(?:A|G|Z)/', $rule->match)) { + continue; + } - // If the pattern is a begin pattern and has a content name then add that to the - // scope stack before processing the children. - if ($rule->beginPattern && $rule->contentName !== null) { - $this->scopeStack[] = $this->resolveScopeName($rule->contentName, $match); + // If the subpattern begins after the offset then create a token from the bits + // of the line in-between the last token and the one(s) about to be created. 
+ if ($match[0][1] > $this->offset) { + $tokens[] = new Token( + $this->scopeStack, + substr($line, $this->offset, $match[0][1] - $this->offset) + ); + $this->debugCount++; + $this->offset = $match[0][1]; + } + + // Add the name to the scope stack if present. + if ($pattern->name !== null) { + $this->scopeStack[] = $this->resolveScopeName($pattern->name, $match); + } + + // If a rule has captures iterate through each of the matched subpatterns and + // create tokens from the captures. + if ($pattern->captures !== null) { + foreach ($match as $k => $m) { + if ($m[0] === '' || ($k === 0 && !isset($pattern->captures[0]))) { + continue; } - $this->ruleStack[] = $rule; + // If the capture has a name add it to the scope stack. + if ($pattern->captures[$k]->name !== null) { + $this->scopeStack[] = $this->resolveScopeName($pattern->captures[$k]->name, $match); + } - if ($rule->patterns !== null && $this->offset < $lineLength) { + // If the capture has patterns of its own add the capture to the rule stack, + // process the patterns, and then pop the capture off the stack. + if ($pattern->captures[$k]->patterns !== null) { + $this->ruleStack[] = $pattern->captures[$k]; $tokens = [ ...$tokens, ...$this->tokenizeLine($line) ]; + array_pop($this->ruleStack); + } + // Otherwise, create a token for the capture. + else { + $tokens[] = new Token( + $this->scopeStack, + $m[0] + ); + $this->debugCount++; + } + + // Pop the capture's name off the scope stack. + if ($pattern->captures[$k]->name !== null) { + array_pop($this->scopeStack); } - if (!$rule->beginPattern) { - if ($rule->endPattern) { - while (!end($this->ruleStack)->beginPattern) { - $popped = array_pop($this->ruleStack); + $this->offset = $m[1] + strlen($m[0]); + } + } + // Otherwise, if the rule doesn't have captures then a token is created from the + // entire match. 
+ else { + $tokens[] = new Token( + $this->scopeStack, + $match[0][0] + ); + + $this->offset = $match[0][1] + strlen($match[0][0]); + $this->debugCount++; + } + + // If the pattern is a begin pattern and has a content name then add that to the + // scope stack before processing the children. + if ($pattern->beginPattern && $pattern->contentName !== null) { + $this->scopeStack[] = $this->resolveScopeName($pattern->contentName, $match); + } - if ($popped->name !== null) { - array_pop($this->scopeStack); - } + $this->ruleStack[] = $pattern; - // If what was just popped is the active injection then remove it, too. - if ($popped === $this->activeInjection) { - $this->activeInjection = null; - } - } - } + // If the rule has patterns process tokens from its subpatterns. + if ($pattern->patterns !== null && $this->offset < $lineLength) { + $tokens = [ ...$tokens, ...$this->tokenizeLine($line) ]; + } + if (!$pattern->beginPattern) { + if ($pattern->endPattern) { + while (!end($this->ruleStack)->beginPattern) { $popped = array_pop($this->ruleStack); - // If what was just popped is a begin pattern and has a content name pop it off - // the scope stack. - if ($popped->beginPattern && $popped->contentName !== null) { - array_pop($this->scopeStack); - } if ($popped->name !== null) { array_pop($this->scopeStack); } @@ -205,25 +238,31 @@ class Tokenizer { $this->activeInjection = null; } } + } - break 2; + $popped = array_pop($this->ruleStack); + + // If what was just popped is a begin pattern and has a content name pop it off + // the scope stack. + if ($popped->beginPattern && $popped->contentName !== null) { + array_pop($this->scopeStack); + } + if ($popped->name !== null) { + array_pop($this->scopeStack); } - // Otherwise, if the rule is a Reference then retrieve its patterns, splice into - // the rule list, and reprocess the rule. 
- elseif ($rule instanceof Reference && $obj = $rule->get()) { - if ($obj instanceof Grammar || ($rule instanceof RepositoryReference && $obj->match === null)) { - $obj = $obj->patterns; - } - array_splice($currentRules, $i, 1, $obj); - $currentRulesCount = count($currentRules); - continue; + // If what was just popped is the active injection then remove it, too. + if ($popped === $this->activeInjection) { + $this->activeInjection = null; } + } - break; + if ($this->offset !== $lineLength) { + continue; } } + if ($this->activeInjection === null && $this->grammar->injections !== null) { foreach ($this->grammar->injections as $selector => $injection) { $selector = ScopeParser::parseSelector($selector);