diff --git a/lib/Highlight.php b/lib/Highlight.php index bad5b55..4d3b923 100644 --- a/lib/Highlight.php +++ b/lib/Highlight.php @@ -30,7 +30,7 @@ class Highlight { foreach ($tokenList as $lineNumber => $tokens) { var_export($tokens); echo "\n"; - if ($lineNumber === 2) { + if ($lineNumber === 6) { die(); } } diff --git a/lib/Token.php b/lib/Token.php index 072abe4..83518ef 100644 --- a/lib/Token.php +++ b/lib/Token.php @@ -9,11 +9,11 @@ namespace dW\Lit; class Token { use FauxReadOnly; protected array $_scopes; - protected string $_string; + protected string $_text; - public function __construct(array $scopes, string $string) { + public function __construct(array $scopes, string $text) { $this->_scopes = $scopes; - $this->string = $string; + $this->text = $text; } } \ No newline at end of file diff --git a/lib/Tokenizer.php b/lib/Tokenizer.php index 4a5935e..af96c15 100644 --- a/lib/Tokenizer.php +++ b/lib/Tokenizer.php @@ -24,6 +24,7 @@ class Tokenizer { protected array $ruleStack; protected array $scopeStack; protected int $debug = 0; + protected int $debugCount = 0; public function __construct(\Generator $data, Grammar $grammar) { @@ -38,16 +39,20 @@ class Tokenizer { foreach ($this->data as $lineNumber => $line) { $this->debug = $lineNumber; $this->offset = 0; - $tokens = $this->tokenizeLine($line); - // If after tokenizing the line the entire line still hasn't been tokenized then - // create a token of the rest of the line. $lineLength = strlen($line); + $tokens = ($lineLength > 0) ? $this->tokenizeLine($line) : []; + + // Output a token for everything else contained on the line including the + // newline or just a newline if there weren't any spare characters left on the + // line. $tokens[] = new Token( $this->scopeStack, - ($this->offset < $lineLength) ? substr($line, $this->offset, $lineLength) . "\n" : "\n" + ($this->offset < $lineLength) ? substr($line, $this->offset, $lineLength - $this->offset) . "\n" : "\n" ); + $this->debugCount++; + yield $lineNumber => $tokens; } } @@ -71,131 +76,159 @@ class Tokenizer { $tokens = []; $lineLength = strlen($line); - if ($this->activeInjection === null && $this->grammar->injections !== null) { - foreach ($this->grammar->injections as $selector => $injection) { - $selector = ScopeParser::parseSelector($selector); - if ($selector->matches($this->scopeStack)) { - $prefix = $selector->getPrefix($this->scopeStack); - if ($prefix === Filter::PREFIX_LEFT || $prefix === Filter::PREFIX_BOTH) { - $this->scopeStack[] = $injection; - $this->activeInjection = $injection; - break; + while (true) { + if ($this->activeInjection === null && $this->grammar->injections !== null) { + foreach ($this->grammar->injections as $selector => $injection) { + $selector = ScopeParser::parseSelector($selector); + if ($selector->matches($this->scopeStack)) { + $prefix = $selector->getPrefix($this->scopeStack); + if ($prefix === Filter::PREFIX_LEFT || $prefix === Filter::PREFIX_BOTH) { + $this->scopeStack[] = $injection; + $this->activeInjection = $injection; + break; + } } } } - } - while (true) { $currentRules = end($this->ruleStack)->patterns; $currentRulesCount = count($currentRules); + $nextMatch = null; for ($i = 0; $i < $currentRulesCount; $i++) { while (true) { $rule = $currentRules[$i]; - if ($rule instanceof Pattern) { - echo "Match: {$rule->match}\n\n"; + if ($this->debug === 6 && $this->debugCount === 12) { + if ($rule instanceof Pattern) { + echo "Match: {$rule->match}\n"; + } } - // If the rule is a Pattern and matches the line at the offset then tokenize the - // matches. + // If the rule is a Pattern and matches the line at the offset then... if ($rule instanceof Pattern && preg_match($rule->match, $line, $match, PREG_OFFSET_CAPTURE, $this->offset)) { - // ¡TEMPORARY! Haven't implemented begin and end line - // anchors, so let's toss them completely. - if (preg_match('/\\\(?:A|G|Z)/', $rule->match)) { - continue 2; + $match = [ + 'match' => $match, + 'pattern' => $rule + ]; + + if ($match['match'][0][1] === $this->offset) { + $nextMatch = $match; + break 2; + } elseif ($match['match'][0][1] < $nextMatch['match'][0][1]) { + $nextMatch = $match; } - - // Add the name and contentName to the scope stack - // if present. - if ($rule->name !== null) { - $this->scopeStack[] = $this->resolveScopeName($rule->name, $match); + } + // Otherwise, if the rule is a Reference then retrieve its patterns, splice into + // the rule list, and reprocess the rule. + elseif ($rule instanceof Reference && $obj = $rule->get()) { + if ($obj instanceof Grammar || ($rule instanceof RepositoryReference && $obj->match === null)) { + $obj = $obj->patterns; } - if ($rule->captures !== null) { - // Iterate through each of the matched subpatterns and create tokens from the - // captures. - foreach ($match as $k => $m) { - if ($m[0] === '' || ($k === 0 && !isset($rule->captures[0]))) { - continue; - } - - // If the subpattern begins after the offset then create a token from the bits - // of the line in-between the last token and the one about to be created. - if ($m[1] > $this->offset) { - $scopeStack = $this->scopeStack; - // If this is the first capture, then the scopes added to the stack need to be - // removed from this token's scope stack as this will grab everything before - // this match began. - if ($k === 0 && $rule->name !== null) { - array_pop($scopeStack); - } - - $tokens[] = new Token( - $scopeStack, - substr($line, $this->offset, $m[1]) - ); - $this->offset = $m[1]; - } - - if ($rule->captures[$k]->name !== null) { - $this->scopeStack[] = $this->resolveScopeName($rule->captures[$k]->name, $match); - } - - if ($rule->captures[$k]->patterns !== null) { - $this->ruleStack[] = $rule->captures[$k]; - $tokens = [ ...$tokens, ...$this->tokenizeLine($line) ]; - array_pop($this->ruleStack); - } else { - $tokens[] = new Token( - $this->scopeStack, - $m[0] - ); - } - - if ($rule->captures[$k]->name !== null) { - array_pop($this->scopeStack); - } - - $this->offset = $m[1] + strlen($m[0]); - } - } + array_splice($currentRules, $i, 1, $obj); + $currentRulesCount = count($currentRules); + continue; + } + + break; + } + } + + // If there were a match above... + if ($nextMatch !== null) { + $match = $nextMatch['match']; + $pattern = $nextMatch['pattern']; + + // **¡TEMPORARY!** Haven't implemented begin and end line + // anchors, so let's toss patterns with them completely for now. + if (preg_match('/\\\(?:A|G|Z)/', $rule->match)) { + continue; + } - // If the pattern is a begin pattern and has a content name then add that to the - // scope stack before processing the children. - if ($rule->beginPattern && $rule->contentName !== null) { - $this->scopeStack[] = $this->resolveScopeName($rule->contentName, $match); + // If the subpattern begins after the offset then create a token from the bits + // of the line in-between the last token and the one(s) about to be created. + if ($match[0][1] > $this->offset) { + $tokens[] = new Token( + $this->scopeStack, + substr($line, $this->offset, $match[0][1] - $this->offset) + ); + $this->debugCount++; + $this->offset = $match[0][1]; + } + + // Add the name to the scope stack if present. + if ($pattern->name !== null) { + $this->scopeStack[] = $this->resolveScopeName($pattern->name, $match); + } + + // If a rule has captures iterate through each of the matched subpatterns and + // create tokens from the captures. + if ($pattern->captures !== null) { + foreach ($match as $k => $m) { + if ($m[0] === '' || ($k === 0 && !isset($pattern->captures[0]))) { + continue; } - $this->ruleStack[] = $rule; + // If the capture has a name add it to the scope stack. + if ($pattern->captures[$k]->name !== null) { + $this->scopeStack[] = $this->resolveScopeName($pattern->captures[$k]->name, $match); + } - if ($rule->patterns !== null && $this->offset < $lineLength) { + // If the capture has patterns of its own add the capture to the rule stack, + // process the patterns, and then pop the capture off the stack. + if ($pattern->captures[$k]->patterns !== null) { + $this->ruleStack[] = $pattern->captures[$k]; $tokens = [ ...$tokens, ...$this->tokenizeLine($line) ]; + array_pop($this->ruleStack); + } + // Otherwise, create a token for the capture. + else { + $tokens[] = new Token( + $this->scopeStack, + $m[0] + ); + $this->debugCount++; + } + + // Pop the capture's name off the scope stack. + if ($pattern->captures[$k]->name !== null) { + array_pop($this->scopeStack); } - if (!$rule->beginPattern) { - if ($rule->endPattern) { - while (!end($this->ruleStack)->beginPattern) { - $popped = array_pop($this->ruleStack); + $this->offset = $m[1] + strlen($m[0]); + } + } + // Otherwise, if the rule doesn't have captures then a token is created from the + // entire match. + else { + $tokens[] = new Token( + $this->scopeStack, + $match[0][0] + ); + + $this->offset = $match[0][1] + strlen($match[0][0]); + $this->debugCount++; + } + + // If the pattern is a begin pattern and has a content name then add that to the + // scope stack before processing the children. + if ($pattern->beginPattern && $pattern->contentName !== null) { + $this->scopeStack[] = $this->resolveScopeName($pattern->contentName, $match); + } - if ($popped->name !== null) { - array_pop($this->scopeStack); - } + $this->ruleStack[] = $pattern; - // If what was just popped is the active injection then remove it, too. - if ($popped === $this->activeInjection) { - $this->activeInjection = null; - } - } - } + // If the rule has patterns process tokens from its subpatterns. + if ($pattern->patterns !== null && $this->offset < $lineLength) { + $tokens = [ ...$tokens, ...$this->tokenizeLine($line) ]; + } + if (!$pattern->beginPattern) { + if ($pattern->endPattern) { + while (!end($this->ruleStack)->beginPattern) { $popped = array_pop($this->ruleStack); - // If what was just popped is a begin pattern and has a content name pop it off - // the scope stack. - if ($popped->beginPattern && $popped->contentName !== null) { - array_pop($this->scopeStack); - } if ($popped->name !== null) { array_pop($this->scopeStack); } @@ -205,25 +238,31 @@ class Tokenizer { $this->activeInjection = null; } } + } - break 2; + $popped = array_pop($this->ruleStack); + + // If what was just popped is a begin pattern and has a content name pop it off + // the scope stack. + if ($popped->beginPattern && $popped->contentName !== null) { + array_pop($this->scopeStack); + } + if ($popped->name !== null) { + array_pop($this->scopeStack); } - // Otherwise, if the rule is a Reference then retrieve its patterns, splice into - // the rule list, and reprocess the rule. - elseif ($rule instanceof Reference && $obj = $rule->get()) { - if ($obj instanceof Grammar || ($rule instanceof RepositoryReference && $obj->match === null)) { - $obj = $obj->patterns; - } - array_splice($currentRules, $i, 1, $obj); - $currentRulesCount = count($currentRules); - continue; + // If what was just popped is the active injection then remove it, too. + if ($popped === $this->activeInjection) { + $this->activeInjection = null; } + } - break; + if ($this->offset !== $lineLength) { + continue; } } + if ($this->activeInjection === null && $this->grammar->injections !== null) { foreach ($this->grammar->injections as $selector => $injection) { $selector = ScopeParser::parseSelector($selector);