diff --git a/lib/Grammar.php b/lib/Grammar.php index 47ab36a..fc1daa1 100644 --- a/lib/Grammar.php +++ b/lib/Grammar.php @@ -29,7 +29,7 @@ use dW\Lit\Grammar\{ */ class Grammar { use FauxReadOnly; - protected ?string $_contentRegex; + protected ?string $_contentName; protected ?string $_firstLineMatch; protected ?InjectionList $_injections; protected ?string $_name; @@ -37,14 +37,12 @@ class Grammar { protected ?PatternList $_patterns; protected ?Repository $_repository; protected ?string $_scopeName; - protected ?string $_contentScopeName; - public function __construct(?string $scopeName = null, ?string $contentScopeName = null, ?PatternList $patterns = null, ?string $name = null, ?string $contentRegex = null, ?string $firstLineMatch = null, ?InjectionList $injections = null, ?Repository $repository = null, ?Grammar $ownerGrammar = null) { + public function __construct(?string $scopeName = null, ?PatternList $patterns = null, ?string $name = null, ?string $firstLineMatch = null, ?InjectionList $injections = null, ?Repository $repository = null, ?Grammar $ownerGrammar = null) { $this->_name = $name; $this->_scopeName = $scopeName; $this->_patterns = $patterns; - $this->_contentRegex = $contentRegex; $this->_firstLineMatch = $firstLineMatch; $this->_injections = $injections; $this->_repository = $repository; @@ -98,14 +96,6 @@ class Grammar { $this->_name = $json['name'] ?? null; $this->_scopeName = $json['scopeName']; - $this->_contentScopeName = $json['contentScopeName'] ?? null; - - if (isset($json['contentRegex'])) { - $value = str_replace('/', '\/', $json['contentRegex']); - $this->_contentRegex = $value; - } else { - $this->_contentRegex = null; - } if (isset($json['firstLineMatch'])) { $value = str_replace('/', '\/', $json['firstLineMatch']); @@ -164,7 +154,6 @@ class Grammar { $p = [ 'ownerGrammar' => $this, 'name' => null, - 'contentName' => null, 'match' => null, 'patterns' => null, 'captures' => null, @@ -231,7 +220,6 @@ class Grammar { foreach ($pattern as $key => $value) { switch ($key) { case 'name': - case 'contentName': $p[$key] = $value; $modified = true; break; diff --git a/lib/Tokenizer.php b/lib/Tokenizer.php index 0235edb..117c274 100644 --- a/lib/Tokenizer.php +++ b/lib/Tokenizer.php @@ -24,10 +24,6 @@ class Tokenizer { $this->grammar = $grammar; $this->ruleStack = [ $this->grammar ]; $this->scopeStack = [ $this->grammar->scopeName ]; - - if ($this->grammar->contentScopeName !== null) { - $this->scopeStack[] = $this->grammar->contentScopeName; - } } @@ -60,55 +56,173 @@ class Tokenizer { return $match; } - protected function _tokenize(string $inputLine, int $offset = 0): array { - $currentRules = end($this->ruleStack)->patterns->getIterator(); - $currentRulesCount = count($currentRules); - $results = []; - $line = $inputLine; + protected function _tokenize(string $line, int &$offset = 0): array { + $tokens = []; $lineLength = strlen($line); - for ($i = 0; $i < $currentRulesCount; $i++) { - while (true) { - $rule = $currentRules[$i]; - if ($rule instanceof Pattern) { - if ($match = $this->getMatch($rule->match, $line, $offset)) { - $tokens = []; - unset($match[0]); - foreach ($match as $k => $m) { - if ($m[1] > $offset) { - $tokens[] = [ - 'scope' => $this->scopeStack, - 'string' => substr($line, $offset, $m[1]) - ]; - $offset = $m[1]; + while (true) { + $currentRules = end($this->ruleStack)->patterns->getIterator(); + $currentRulesCount = count($currentRules); + + for ($i = 0; $i < $currentRulesCount; $i++) { + while (true) { + $rule = $currentRules[$i]; + // If the rule is a Pattern and matches the line at the offset then tokenize the + // matches. + if ($rule instanceof Pattern && $match = $this->getMatch($rule->match, $line, $offset)) { + // First, remove the first entry in the match, the full + // match, leaving only the subpatterns. + //unset($match[0]); + + // Add the name and contentName to the scope stack + // if present. + if ($rule->name !== null) { + $this->scopeStack[] = $rule->name; + } + if ($rule->contentName !== null) { + $this->scopeStack[] = $rule->contentName; + } + + $wholeMatchCaptureScopeCount = 0; + if ($rule->captures !== null) { + // Iterate through each of the matched subpatterns and create tokens from the + // captures. + foreach ($match as $k => $m) { + if ($m[0] === '') { + continue; + } + + // If the subpattern begins after the offset then create a token from the bits + // of the line in-between. + if ($m[1] > $offset) { + $scopeStack = $this->scopeStack; + // If this is the first capture, then the scopes added to the stack need to be + // removed from this token's scope stack as this will grab everything before + // this match began. + if ($k === 0) { + if ($rule->contentName !== null) { + array_pop($scopeStack); + } + if ($rule->name !== null) { + array_pop($scopeStack); + } + } + + $tokens[] = [ + 'scopes' => $scopeStack, + 'string' => substr($line, $offset, $m[1]) + ]; + $offset = $m[1]; + } + + // The first match is the whole match, and if there are captures for it the name + // and contentName should be added to the stack regardless of whether it has + // patterns or not. However, keep count of how many were added to the stack so + // they may be removed when this rule has finished tokenizing. + if ($k === 0) { + if (!isset($rule->captures[0])) { + continue; + } + + if ($rule->captures[0]->name !== null) { + $this->scopeStack[] = $rule->captures[0]->name; + $wholeMatchCaptureScopeCount++; + } + if ($rule->captures[0]->contentName !== null) { + $this->scopeStack[] = $rule->captures[0]->contentName; + $wholeMatchCaptureScopeCount++; + } + } + + // If the capture rule has patterns of its own then + // those must be matched, too. + if ($rule->captures[$k]->patterns !== null) { + $this->ruleStack[] = $rule->captures[$k]; + + // The scope stack for the whole match is handled above, so only handle that for + // other captures. + if ($k !== 0) { + if ($rule->captures->name !== null) { + $this->scopeStack[] = $rule->captures[$k]->name; + } + if ($rule->captures->contentName !== null) { + $this->scopeStack[] = $rule->captures[$k]->contentName; + } + } + + $tokens = [ ...$tokens, ...$this->_tokenize($line, $offset) ]; + + // The scope stack for the whole match is handled above, so only handle that for + // other captures. + if ($k !== 0) { + if ($rule->captures[$k]->contentName !== null) { + array_pop($this->scopeStack); + } + if ($rule->captures[$k]->name !== null) { + array_pop($this->scopeStack); + } + } + + array_pop($this->ruleStack); + } else { + $tokens[] = [ + 'scopes' => [ ...$this->scopeStack, $rule->captures[$k]->name ], + 'string' => $m[0] + ]; + } + + $offset = $m[1] + strlen($m[0]); + $firstCapture = false; } + } + + if ($rule->patterns !== null) { + $tokens = [ ...$tokens, ...$this->_tokenize($line, $offset) ]; + } - $tokens[] = [ - 'scope' => [ ...$this->scopeStack, $rule->captures[$k]->name ], - 'string' => $m[0] - ]; - $offset = $m[1] + strlen($m[0]); + // Remove the name and contentName from the scope stack if present. + if ($rule->contentName !== null) { + array_pop($this->scopeStack); + } + if ($rule->name !== null) { + array_pop($this->scopeStack); + } + + // If the rule has a whole match capture (0) then remove its name and + // contentName, too. + $j = 0; + while ($j++ < $wholeMatchCaptureScopeCount) { + array_pop($this->scopeStack); } + // And remove the rule from the rule stack, too. + array_pop($this->ruleStack); + echo "\n"; die(var_export($tokens)); + break 2; } - } elseif ($rule instanceof Reference && $obj = $rule->get()) { - if ($obj instanceof PatternList) { - $obj = $obj->getIterator(); - } elseif ($obj instanceof Grammar) { - $obj = $obj->patterns->getIterator(); + // Otherwise, if the rule is a Reference then retrieve its patterns, splice into + // the rule list, and reprocess the rule. + elseif ($rule instanceof Reference && $obj = $rule->get()) { + if ($obj instanceof PatternList) { + $obj = $obj->getIterator(); + } elseif ($obj instanceof Grammar) { + $obj = $obj->patterns->getIterator(); + } + + array_splice($currentRules, $i, 1, $obj); + $currentRulesCount = count($currentRules); + continue; } - array_splice($currentRules, $i, 1, $obj); - $currentRulesCount = count($currentRules); - continue; + break; } - - break; } + + break; } - return $inputLine; + return $tokens; } } \ No newline at end of file