data = $data;
        $this->grammar = $grammar;
        $this->ruleStack = [ $this->grammar ];
        // Grammar scope names can contain spaces (eg: source.js.rails
        // source.js.jquery), meaning more than one needs to be added to the stack. It
        // doesn't look like anything does it elsewhere, so spaces in scope names only
        // work here.
        $this->scopeStack = preg_split('/\s+/', $this->grammar->scopeName);
    }


    /**
     * Receives lines from the Data object and yields an array of tokens per line.
     *
     * Each yielded value is keyed by the line number and is a list of tokens,
     * where a token is an array with a 'scopes' entry (a snapshot of the scope
     * stack) and a 'text' entry (the text covered by the token).
     *
     * @return \Generator Line number => list of tokens
     */
    public function tokenize(): \Generator {
        foreach ($this->data->get() as $lineNumber => $line) {
            $this->lineNumber = $lineNumber;
            $this->line = $line;

            // Because of how this tokenizes if the final line is just a new line it will
            // yield an empty token set; just end the generator instead.
            if ($this->data->lastLine && $line === '') {
                return;
            }

            assert($this->debugLine());

            $this->offset = 0;
            $lineLength = strlen($line);
            $tokens = ($lineLength > 0) ? $this->tokenizeLine($lineLength) : [];

            // Output a token for everything else contained on the line including the
            // newline or just a newline if there weren't any spare characters left on the
            // line. If it is the last line, and there's nothing else remaining on the line
            // then output no additional token.
            if ($this->offset < $lineLength) {
                $tokens[] = [
                    'scopes' => $this->scopeStack,
                    'text' => substr($line, $this->offset, $lineLength - $this->offset) . ((!$this->data->lastLine) ? "\n" : '')
                ];
            } elseif (!$this->data->lastLine) {
                // Some matches can grab the new line, so check to see if it's already present
                // before appending another token.
                $lastToken = end($tokens);
                if ($lastToken === false || substr($lastToken['text'], -1) !== "\n") {
                    $tokens[] = [
                        'scopes' => $this->scopeStack,
                        'text' => "\n"
                    ];
                }
            }

            assert($this->debugTokens($tokens));
            yield $lineNumber => $tokens;
        }
    }


    /**
     * Expands the placeholders matched by SCOPE_RESOLVE_REGEX inside a scope
     * name using the text of the supplied regex match.
     *
     * @param string $scopeName The scope name, possibly containing placeholders
     * @param array $match A preg_match result captured with PREG_OFFSET_CAPTURE
     * @return string The scope name with its placeholders replaced
     */
    protected function resolveScopeName(string $scopeName, array $match): string {
        return preg_replace_callback(self::SCOPE_RESOLVE_REGEX, function($m) use ($match) {
            // NOTE(review): $m[2] is read both as the fallback capture index and as the
            // command name below; confirm this agrees with the group layout of
            // SCOPE_RESOLVE_REGEX, which is not visible from here.
            $replacement = trim($match[(int)(($m[1] !== '') ? $m[1] : $m[2])][0] ?? $m[1]);
            $command = $m[2] ?? null;

            return match ($command) {
                'downcase' => strtolower($replacement),
                'upcase' => strtoupper($replacement),
                default => $replacement
            };
        }, $scopeName);
    }


    /**
     * Tokenizes the current line from the current offset up to a stop offset.
     *
     * Repeatedly asks findClosestMatch() for the nearest match among the rules
     * of the rule on top of the rule stack, emits tokens for the matched text
     * and its captures, and keeps the rule and scope stacks in step. It recurses
     * for captures and rules that carry subpatterns.
     *
     * @param int $stopOffset Offset in the line at which to stop looking for more matches
     * @return array List of tokens, each with a 'scopes' and a 'text' entry
     * @throws \LogicException If a capture with subpatterns begins before the current offset
     */
    protected function tokenizeLine(int $stopOffset): array {
        $tokens = [];

        while (true) {
            $closestMatch = $this->findClosestMatch(end($this->ruleStack));
            $this->previousMatches[] = $closestMatch;
            assert($this->debugClosestMatch($closestMatch));

            // If there were a match above...
            if ($closestMatch !== null) {
                $match = $closestMatch['match'];
                $pattern = $closestMatch['pattern'];

                // If the subpattern begins after the offset then create a token from the bits
                // of the line in-between the last token and the one(s) about to be created.
                // However, don't do this if the pattern is an end pattern and its match
                // contains a negative lookahead for the offset. This is due to a difference in
                // how PCRE works versus the original Oniguruma.
                if ($match[0][1] > $this->offset && !($pattern->endPattern && preg_match('/\(\?!\\\G\)/', $pattern->match) === 1)) {
                    $tokens[] = [
                        'scopes' => $this->scopeStack,
                        'text' => substr($this->line, $this->offset, $match[0][1] - $this->offset)
                    ];
                    $this->offset = $match[0][1];
                }

                // If the pattern is an end pattern and its corresponding begin pattern has a
                // content name remove that from the scope stack before continuing.
                if ($pattern->endPattern) {
                    $previousRule = end($this->ruleStack);
                    if ($previousRule->beginPattern && $previousRule->contentName !== null) {
                        array_pop($this->scopeStack);
                    }
                }

                // Add the name to the scope stack if present.
                if ($pattern->name !== null) {
                    $this->scopeStack[] = $this->resolveScopeName($pattern->name, $match);
                }

                // If a rule has captures iterate through each of the matched subpatterns and
                // create tokens from the captures.
                if ($pattern->captures !== null) {
                    foreach ($match as $k => $m) {
                        // If either the capture match is empty, there's no pattern capture for this
                        // match, or the match being processed is the first one and there are no
                        // captures for it then continue onto the next one.
                        if ($m[0] === '' || $m[1] < 0 || !isset($pattern->captures[$k]) || ($k === 0 && !isset($pattern->captures[0]))) {
                            continue;
                        }

                        // If the capture begins after the offset then create a token from the bits of
                        // the line in-between the last token and the one(s) about to be created.
                        if ($k > 0 && $m[1] > $this->offset) {
                            $tokens[] = [
                                'scopes' => $this->scopeStack,
                                'text' => substr($this->line, $this->offset, $m[1] - $this->offset)
                            ];
                            $this->offset = $m[1];
                        }

                        // If the capture has a name add it to the scope stack.
                        if ($pattern->captures[$k]->name !== null) {
                            $this->scopeStack[] = $this->resolveScopeName($pattern->captures[$k]->name, $match);
                        }

                        // If the capture has patterns of its own add the capture to the rule stack,
                        // process the patterns, and then pop the capture off the stack.
                        if ($pattern->captures[$k]->patterns !== null) {
                            // A capture with subpatterns that starts before the current offset cannot
                            // be tokenized. Fail with a catchable, descriptive error instead of
                            // terminating the whole process.
                            if ($m[1] < $this->offset) {
                                throw new \LogicException(sprintf(
                                    'Capture %s begins at offset %d, which is before the current tokenizer offset %d; its subpatterns cannot be tokenized',
                                    (string)$k,
                                    $m[1],
                                    $this->offset
                                ));
                            }

                            $this->ruleStack[] = $pattern->captures[$k];
                            // Don't do injections on capture lists.
                            $activeInjection = $this->activeInjection;
                            $this->activeInjection = true;

                            // Only tokenize the part of the line that contains the match.
                            $captureEndOffset = $m[1] + strlen($m[0]);
                            $tokens = [ ...$tokens, ...$this->tokenizeLine($captureEndOffset) ];

                            // If the offset is before the end of the capture then create a token from the
                            // bits of the capture from the offset until the end of the capture.
                            if ($captureEndOffset > $this->offset) {
                                $tokens[] = [
                                    'scopes' => $this->scopeStack,
                                    'text' => substr($this->line, $this->offset, $captureEndOffset - $this->offset)
                                ];
                                $this->offset = $captureEndOffset;
                            }

                            array_pop($this->ruleStack);
                            // Return to the original active injection state.
                            $this->activeInjection = $activeInjection;
                            // Pin the offset to the end of the capture regardless of where the
                            // recursive call left it.
                            $this->offset = $captureEndOffset;
                        }
                        // Otherwise, create a token for the capture.
                        else {
                            // If the capture's offset is before the current offset then the new token needs
                            // to be spliced within previously emitted ones.
                            if ($m[1] < $this->offset) {
                                $curOffset = $this->offset;

                                // Go backwards through the tokens, find the token the current capture is
                                // within, and splice new tokens into the token array
                                for ($tokensLength = count($tokens), $i = $tokensLength - 1; $i >= 0; $i--) {
                                    $cur = $tokens[$i];
                                    $curOffset -= strlen($cur['text']);

                                    if ($m[1] >= $curOffset) {
                                        // If the length of the new capture would put part of it outside the previous
                                        // token then toss the token.
                                        if ($m[1] + strlen($m[0]) > $curOffset + strlen($cur['text'])) {
                                            // TODO: trigger a warning or something here maybe?
                                            break;
                                        }

                                        $t = [];

                                        // Add in token for anything before the new capture token within the token being
                                        // spliced
                                        $preMatchText = substr($cur['text'], 0, $m[1] - $curOffset);
                                        if ($preMatchText !== '') {
                                            $t[] = [
                                                'scopes' => $cur['scopes'],
                                                'text' => $preMatchText
                                            ];
                                        }

                                        // The new capture's scope needs to be added to the prior token's scope stack to
                                        // make the stack for the new one. As everywhere else in this method the name
                                        // is only added when present and is resolved first, so neither a null entry
                                        // nor an unexpanded placeholder ends up in the token's scopes.
                                        $scopeStack = $cur['scopes'];
                                        if ($pattern->captures[$k]->name !== null) {
                                            $scopeStack[] = $this->resolveScopeName($pattern->captures[$k]->name, $match);
                                        }

                                        $t[] = [
                                            'scopes' => $scopeStack,
                                            'text' => $m[0]
                                        ];

                                        // Add in token for anything after the new capture token within the token being
                                        // spliced
                                        $postMatchText = substr($cur['text'], $m[1] - $curOffset + strlen($m[0]));
                                        if ($postMatchText !== '') {
                                            $t[] = [
                                                'scopes' => $cur['scopes'],
                                                'text' => $postMatchText
                                            ];
                                        }

                                        array_splice($tokens, $i, 1, $t);
                                        $this->offset = max($this->offset, $m[1] + strlen($m[0]));
                                        break;
                                    }
                                }
                            } else {
                                $tokens[] = [
                                    'scopes' => $this->scopeStack,
                                    'text' => $m[0]
                                ];
                                $this->offset = $m[1] + strlen($m[0]);
                            }
                        }

                        // Pop the capture's name off the scope stack.
                        if ($pattern->captures[$k]->name !== null) {
                            array_pop($this->scopeStack);
                        }
                    }
                }
                // Otherwise, if the rule doesn't have captures then a token is created from the
                // entire match, but only if the matched text isn't empty.
                elseif ($match[0][0] !== '') {
                    $tokens[] = [
                        'scopes' => $this->scopeStack,
                        'text' => $match[0][0]
                    ];
                    $this->offset = $match[0][1] + strlen($match[0][0]);
                }

                // If the pattern is a begin pattern and has a content name then add that to the
                // scope stack before processing the children.
                if ($pattern->beginPattern && $pattern->contentName !== null) {
                    $this->scopeStack[] = $this->resolveScopeName($pattern->contentName, $match);
                }

                $this->ruleStack[] = $pattern;
                $this->activeInjection = ($pattern->injection);

                // If the rule has patterns process tokens from its subpatterns.
                if ($pattern->patterns !== null && $this->offset < $stopOffset) {
                    // If the pattern has just a regular match (meaning neither a begin nor an end
                    // pattern) but has subpatterns then only tokenize the part of the line that's
                    // within the match. Otherwise, tokenize up to the stop offset. Because of
                    // recursion, the stop offset could be set by this step before or within the
                    // capture tokenization process.
                    // NOTE(review): the regular-match branch passes the match's *length* where
                    // the capture branch above passes an absolute end offset; confirm whether
                    // $match[0][1] + strlen($match[0][0]) is what is intended here.
                    $tokens = [ ...$tokens, ...$this->tokenizeLine((!$pattern->beginPattern && !$pattern->endPattern) ? strlen($match[0][0]) : $stopOffset) ];
                }

                // If the offset is before the end of the match then create a token from the
                // bits of the match from the offset until the end of the match. However, don't
                // do this if the pattern is an end pattern and its match contains a negative
                // lookahead for the offset. This is due to a difference in how PCRE works
                // versus the original Oniguruma.
                $endOffset = $match[0][1] + strlen($match[0][0]);
                if ($endOffset > $this->offset && !($pattern->endPattern && preg_match('/\(\?!\\\G\)/', $pattern->match) === 1)) {
                    $tokens[] = [
                        'scopes' => $this->scopeStack,
                        'text' => substr($this->line, $this->offset, $endOffset - $this->offset)
                    ];
                    $this->offset = $endOffset;
                }

                if (!$pattern->beginPattern) {
                    if ($pattern->endPattern) {
                        // Pop everything off both stacks until a begin pattern is reached.
                        while (!end($this->ruleStack)->beginPattern) {
                            $popped = array_pop($this->ruleStack);
                            if ($popped->name !== null) {
                                array_pop($this->scopeStack);
                            }
                            // If what was just popped is the active injection then remove it, too.
                            if ($popped->injection) {
                                $this->activeInjection = false;
                            }
                        }
                    }

                    $popped = array_pop($this->ruleStack);
                    // Pop the rule's name from the stack.
                    if ($popped->name !== null) {
                        array_pop($this->scopeStack);
                    }

                    // This seems to be necessary sometimes because of how references are built
                    // in this implementation. They're sometimes made into patterns with a
                    // single subpattern. This causes some issues when popping off the stacks, so
                    // work around it below by also popping such a wrapper when its end pattern
                    // is the next closest match.
                    $end = end($this->ruleStack);
                    if ($end instanceof Pattern && !$end->beginPattern && !$end->endPattern && $end->match === null && $end->patterns[0] instanceof RepositoryReference) {
                        $rep = $end->patterns[0]->get();
                        if ($rep->patterns[0]->endPattern) {
                            $mmm = $this->findClosestMatch($rep);
                            // findClosestMatch() returns null when nothing matches; guard before
                            // reading its 'pattern' entry.
                            if ($mmm !== null && $mmm['pattern'] === $rep->patterns[0]) {
                                array_pop($this->ruleStack);
                                if ($rep->name !== null) {
                                    array_pop($this->scopeStack);
                                }
                            }
                        }
                    }

                    // If what was just popped is the active injection then remove it, too.
                    if ($popped->injection) {
                        $this->activeInjection = false;
                    }
                }

                // If the offset isn't at the end of the line then look for more matches.
                if ($this->offset < $stopOffset) {
                    continue;
                }
            }

            break;
        }

        return $tokens;
    }


    /**
     * Finds the match closest to the current offset among the rules of the
     * supplied grammar or pattern.
     *
     * References found in the rule list are replaced in place by what they
     * resolve to, and the expanded list is cached. A match that starts exactly
     * at the current offset ends the search immediately.
     *
     * @param Grammar|Pattern $pattern The grammar or pattern whose rules are searched
     * @return array|null An array with 'match' (a PREG_OFFSET_CAPTURE match) and 'pattern' entries, or null when nothing matched
     */
    protected function findClosestMatch(Grammar|Pattern $pattern): ?array {
        $injected = false;

        // Grab the current rule list from the cache if available to prevent having to
        // splice in references repeatedly.
        // NOTE(review): this lookup is non-strict, so arrays of objects are compared
        // by value; confirm whether a strict lookup is what is wanted.
        $cacheIndex = array_search($pattern->patterns, $this->ruleCacheIndexes);
        if ($cacheIndex !== false) {
            $currentRules = $this->ruleCacheValues[$cacheIndex];
        } else {
            $currentRules = $pattern->patterns;

            if (!$this->activeInjection && $this->grammar->injections !== null) {
                foreach ($this->grammar->injections as $selector => $injection) {
                    $selector = ScopeParser::parseSelector($selector);
                    if ($selector->matches($this->scopeStack)) {
                        $prefix = $selector->getPrefix($this->scopeStack);
                        if ($prefix === Filter::PREFIX_LEFT || $prefix === Filter::PREFIX_BOTH) {
                            $currentRules = [ ...$injection->patterns, ...$currentRules ];
                            // NOTE(review): leaving here skips the $injected flag below, so a
                            // left-prefixed injection is cached together with the rule list;
                            // confirm that this is intended.
                            if ($prefix === Filter::PREFIX_LEFT) {
                                break;
                            }
                        }

                        if ($prefix === null || $prefix === Filter::PREFIX_RIGHT || $prefix === Filter::PREFIX_BOTH) {
                            $currentRules = [ ...$currentRules, ...$injection->patterns ];
                        }

                        $injected = true;
                        break;
                    }
                }
            }
        }

        $closestMatch = null;
        for ($i = 0, $currentRulesCount = count($currentRules); $i < $currentRulesCount; $i++) {
            while (true) {
                $rule = $currentRules[$i];

                // Grammar references can return false if the grammar does not exist, so
                // continue on if the current rule is false.
                if ($rule === false) {
                    continue 2;
                }

                // If the rule is a Pattern
                if ($rule instanceof Pattern) {
                    $ruleMatch = $rule->match;
                    $offset = $this->offset;

                    // If the rule is an end pattern with a back reference then it expects to be
                    // able to reference a subpattern from its begin pattern. Replace the reference
                    // with the matched subpattern and then match with it below.
                    if ($rule->endPattern && preg_match(self::BACK_REFERENCE_REGEX, $ruleMatch) === 1) {
                        $beginMatch = null;
                        // Walk the previous matches from newest to oldest. This uses its own
                        // index variable: reusing $i would overwrite the position in the rule
                        // list that the enclosing loop depends on.
                        for ($j = count($this->previousMatches) - 1; $j >= 0; $j--) {
                            $cur = $this->previousMatches[$j];
                            if ($cur !== null && $cur['pattern']->beginPattern) {
                                foreach ($cur['pattern']->patterns as $p) {
                                    if ($p === $rule) {
                                        $beginMatch = $cur['match'];
                                        break 2;
                                    }
                                }
                            }
                        }

                        if ($beginMatch !== null) {
                            $ruleMatch = preg_replace_callback(self::BACK_REFERENCE_REGEX, function($m) use ($beginMatch) {
                                $index = (int)$m[1];
                                return $beginMatch[$index][0] ?? $m[0];
                            }, $ruleMatch);
                        }
                    }

                    if (preg_match($ruleMatch, $this->line . ((!$this->data->lastLine) ? "\n" : ''), $match, PREG_OFFSET_CAPTURE, $offset) === 1) {
                        // Throw out pattern regexes with anchors that shouldn't match the current line.
                        // This is necessary because the tokenizer is fed data line by line and
                        // therefore anchors that match the beginning of the document and the end won't
                        // do anything.
                        if (preg_match(self::ANCHOR_CHECK_REGEX, $ruleMatch, $validRegexMatch) === 1 && (
                            // \A anchors match the beginning of the whole string, not just this line
                            ($validRegexMatch[1] === 'A' && !$this->data->firstLine) ||
                            // \z anchors match the end of the whole string, not just this line
                            ($validRegexMatch[1] === 'z' && !$this->data->lastLine) ||
                            // \Z anchors match the end of the whole string or before the final newline if
                            // there's a trailing newline in the string
                            ($validRegexMatch[1] === 'Z' && !$this->data->lastLineBeforeFinalNewLine)
                        )) {
                            continue 2;
                        }

                        // If there is an existing match that doesn't match at the offset, the current
                        // matched rule is a begin pattern that doesn't capture anything, its end
                        // pattern would be the next match, and its end pattern also doesn't capture
                        // anything then discard this match. If this wasn't here it would continuously
                        // match this begin pattern followed by its end pattern, creating an infinite
                        // loop because the offset never moves forward.
                        if ($closestMatch !== null && $rule->beginPattern && $match[0][0] === '') {
                            $next = $this->findClosestMatch($rule);
                            if ($next !== null && $next['pattern']->endPattern && $next['match'][0][0] === '') {
                                continue 2;
                            }
                        }

                        // If the match's offset is the same as the current offset then it is the
                        // closest match. There's no need to iterate anymore through the patterns.
                        if ($match[0][1] === $this->offset) {
                            $closestMatch = [
                                'match' => $match,
                                'pattern' => $rule
                            ];
                            break 2;
                        }
                        // Otherwise, if the closest match is currently null or the match's offset is
                        // less than the closest match's offset then set the match as the closest match
                        // and continue looking for a closer one.
                        elseif ($closestMatch === null || $match[0][1] < $closestMatch['match'][0][1]) {
                            $closestMatch = [
                                'match' => $match,
                                'pattern' => $rule
                            ];
                        }
                    }
                }
                // Otherwise, if the rule is a Reference then retrieve its patterns, splice into
                // the rule list, and reprocess the rule.
                elseif ($rule instanceof Reference) {
                    if (!$rule instanceof BaseReference) {
                        $obj = $rule->get();
                        if ($obj instanceof Grammar || ($rule instanceof RepositoryReference && $obj->match === null)) {
                            $obj = $obj->patterns;
                        }
                    } else {
                        $obj = $this->grammar->patterns;
                    }

                    array_splice($currentRules, $i, 1, ($obj instanceof Pattern) ? [ $obj ] : $obj);
                    $currentRulesCount = count($currentRules);

                    // When the current rule list changes write it to the cache.
                    // NOTE(review): the cache is read with $pattern->patterns as its key but
                    // written with the patterns of the rule on top of the rule stack. The two
                    // differ whenever this method is called with something other than the top
                    // of the stack; confirm which key is intended.
                    if ($cacheIndex === false) {
                        $this->ruleCacheIndexes[] = end($this->ruleStack)->patterns;
                        $cacheIndex = count($this->ruleCacheIndexes) - 1;
                    }

                    if ($injected) {
                        // Injections need to be re-evaluated against the scope stack every time they're
                        // injected so don't cache them.
                        $temp = $currentRules;
                        foreach ($temp as $k => $r) {
                            if ($r instanceof Pattern && $r->injection) {
                                unset($temp[$k]);
                            }
                        }

                        $this->ruleCacheValues[$cacheIndex] = array_values($temp);
                    } else {
                        $this->ruleCacheValues[$cacheIndex] = $currentRules;
                    }

                    continue;
                }

                break;
            }
        }

        return $closestMatch;
    }


    /**
     * Prints details about the closest match when debugging is enabled.
     *
     * Always returns true so that it can be called from within assert().
     *
     * @param array|null $closestMatch The result of findClosestMatch()
     */
    private function debugClosestMatch(?array $closestMatch): bool {
        if (self::$debug) {
            // NOTE(review): the original heredoc template is not recoverable from the
            // text under review. This one was rebuilt from the seven sprintf arguments
            // that follow it; confirm the labels and layout against version control.
            $message = <<<DEBUG
            Offset: %s
            Regex: %s
            Name: %s
            Captures: %s
            BeginPattern: %s
            EndPattern: %s
            Match: %s
            DEBUG;

            $message = sprintf($message,
                $this->offset,
                $closestMatch['pattern']->match ?? 'NULL',
                $closestMatch['pattern']->name ?? 'NULL',
                ($closestMatch !== null && $closestMatch['pattern']->captures !== null) ? 'yes' : 'no',
                var_export($closestMatch['pattern']->beginPattern ?? null, true),
                var_export($closestMatch['pattern']->endPattern ?? null, true),
                var_export($closestMatch['match'] ?? null, true)
            );

            echo $this->debug_indentLines($message) . "\n\n";
        }

        return true;
    }


    /**
     * Prefixes every line of a debug message with one '|' per level of
     * tokenizeLine() recursion beyond the first, as counted in the backtrace.
     *
     * @param string $message The message to indent
     * @return string The message, indented when the recursion depth calls for it
     */
    private function debug_indentLines(string $message): string {
        $backtrace = debug_backtrace();
        // Drop the two innermost frames before counting.
        array_shift($backtrace);
        array_shift($backtrace);

        // Start at -1 so that the outermost tokenizeLine() call isn't indented.
        $count = -1;
        foreach ($backtrace as $b) {
            if ($b['function'] === 'tokenizeLine') {
                $count++;
            }
        }

        return ($count > 0) ? preg_replace('/^/m', str_repeat('|', $count) . ' ', $message) : $message;
    }


    /**
     * Prints the current line number and line content when debugging is enabled.
     *
     * Always returns true so that it can be called from within assert().
     */
    private function debugLine(): bool {
        if (self::$debug) {
            // NOTE(review): the original heredoc template, the start of the call that
            // consumes it, and any text preceding the line number inside the padded
            // string are not recoverable from the text under review. They were rebuilt
            // from the two arguments that remain; confirm against version control.
            $message = <<<DEBUG
            %s
            Line: %s


            DEBUG;

            echo sprintf($message,
                str_pad("{$this->lineNumber} ", 80, '-'),
                preg_replace('/\\\\{2}/', '\\', var_export($this->line, true))
            );
        }

        return true;
    }


    /**
     * Prints the supplied tokens when debugging is enabled.
     *
     * Always returns true so that it can be called from within assert().
     *
     * @param array $tokens The tokens to print
     */
    public function debugTokens(array $tokens): bool {
        if (self::$debug) {
            echo 'Tokens: ' . preg_replace('/\\\\{2}/', '\\', var_export($tokens, true)) . "\n\n";
        }

        return true;
    }
}