@@ -24,6 +24,7 @@ class Tokenizer {
protected array $ruleStack;
protected array $scopeStack;
protected int $debug = 0;
protected int $debugCount = 0;
public function __construct(\Generator $data, Grammar $grammar) {
@@ -38,16 +39,20 @@ class Tokenizer {
foreach ($this->data as $lineNumber => $line) {
$this->debug = $lineNumber;
$this->offset = 0;
$tokens = $this->tokenizeLine($line);
// If after tokenizing the line the entire line still hasn't been tokenized then
// create a token of the rest of the line.
$lineLength = strlen($line);
$tokens = ($lineLength > 0) ? $this->tokenizeLine($line) : [];
// Output a token for everything else contained on the line including the
// newline or just a newline if there weren't any spare characters left on the
// line.
$tokens[] = new Token(
$this->scopeStack,
($this->offset < $lineLength) ? substr($line, $this->offset, $lineLength) . "\n" : "\n"
($this->offset < $lineLength) ? substr($line, $this->offset, $lineLength - $this->offset ) . "\n" : "\n"
);
$this->debugCount++;
yield $lineNumber => $tokens;
}
}
@@ -71,131 +76,159 @@ class Tokenizer {
$tokens = [];
$lineLength = strlen($line);
if ($this->activeInjection === null & & $this->grammar->injections !== null) {
foreach ($this->grammar->injections as $selector => $injection) {
$selector = ScopeParser::parseSelector($selector);
if ($selector->matches($this->scopeStack)) {
$prefix = $selector->getPrefix($this->scopeStack);
if ($prefix === Filter::PREFIX_LEFT || $prefix === Filter::PREFIX_BOTH) {
$this->scopeStack[] = $injection;
$this->activeInjection = $injection;
break;
while (true) {
if ($this->activeInjection === null & & $this->grammar->injections !== null) {
foreach ($this->grammar->injections as $selector => $injection) {
$selector = ScopeParser::parseSelector($selector);
if ($selector->matches($this->scopeStack)) {
$prefix = $selector->getPrefix($this->scopeStack);
if ($prefix === Filter::PREFIX_LEFT || $prefix === Filter::PREFIX_BOTH) {
$this->scopeStack[] = $injection;
$this->activeInjection = $injection;
break;
}
}
}
}
}
while (true) {
$currentRules = end($this->ruleStack)->patterns;
$currentRulesCount = count($currentRules);
$nextMatch = null;
for ($i = 0; $i < $currentRulesCount; $i++) {
while (true) {
$rule = $currentRules[$i];
if ($rule instanceof Pattern) {
echo "Match: {$rule->match}\n\n";
if ($this->debug === 6 & & $this->debugCount === 12) {
if ($rule instanceof Pattern) {
echo "Match: {$rule->match}\n";
}
}
// If the rule is a Pattern and matches the line at the offset then tokenize the
// matches.
// If the rule is a Pattern and matches the line at the offset then...
if ($rule instanceof Pattern & & preg_match($rule->match, $line, $match, PREG_OFFSET_CAPTURE, $this->offset)) {
// ¡TEMPORARY! Haven't implemented begin and end line
// anchors, so let's toss them completely.
if (preg_match('/\\\(?:A|G|Z)/', $rule->match)) {
continue 2;
$match = [
'match' => $match,
'pattern' => $rule
];
if ($match['match'][0][1] === $this->offset) {
$nextMatch = $match;
break 2;
} elseif ($match['match'][0][1] < $nextMatch['match'][0][1]) {
$nextMatch = $match;
}
// Add the name and contentName to the scope stack
// if present.
if ($rule->name !== null) {
$this->scopeStack[] = $this->resolveScopeName($rule->name, $match);
}
// Otherwise, if the rule is a Reference then retrieve its patterns, splice into
// the rule list, and reprocess the rule.
elseif ($rule instanceof Reference & & $obj = $rule->get()) {
if ($obj instanceof Grammar || ($rule instanceof RepositoryReference & & $obj->match === null)) {
$obj = $obj->patterns;
}
if ($rule->captures !== null) {
// Iterate through each of the matched subpatterns and create tokens from the
// captures.
foreach ($match as $k => $m) {
if ($m[0] === '' || ($k === 0 & & !isset($rule->captures[0]))) {
continue;
}
// If the subpattern begins after the offset then create a token from the bits
// of the line in-between the last token and the one about to be created.
if ($m[1] > $this->offset) {
$scopeStack = $this->scopeStack;
// If this is the first capture, then the scopes added to the stack need to be
// removed from this token's scope stack as this will grab everything before
// this match began.
if ($k === 0 & & $rule->name !== null) {
array_pop($scopeStack);
}
$tokens[] = new Token(
$scopeStack,
substr($line, $this->offset, $m[1])
);
$this->offset = $m[1];
}
if ($rule->captures[$k]->name !== null) {
$this->scopeStack[] = $this->resolveScopeName($rule->captures[$k]->name, $match);
}
if ($rule->captures[$k]->patterns !== null) {
$this->ruleStack[] = $rule->captures[$k];
$tokens = [ ...$tokens, ...$this->tokenizeLine($line) ];
array_pop($this->ruleStack);
} else {
$tokens[] = new Token(
$this->scopeStack,
$m[0]
);
}
if ($rule->captures[$k]->name !== null) {
array_pop($this->scopeStack);
}
$this->offset = $m[1] + strlen($m[0]);
}
}
array_splice($currentRules, $i, 1, $obj);
$currentRulesCount = count($currentRules);
continue;
}
break;
}
}
// If there were a match above...
if ($nextMatch !== null) {
$match = $nextMatch['match'];
$pattern = $nextMatch['pattern'];
// **¡TEMPORARY!** Haven't implemented begin and end line
// anchors, so let's toss patterns with them completely for now.
if (preg_match('/\\\(?:A|G|Z)/', $rule->match)) {
continue;
}
// If the pattern is a begin pattern and has a content name then add that to the
// scope stack before processing the children.
if ($rule->beginPattern & & $rule->contentName !== null) {
$this->scopeStack[] = $this->resolveScopeName($rule->contentName, $match);
// If the subpattern begins after the offset then create a token from the bits
// of the line in-between the last token and the one(s) about to be created.
if ($match[0][1] > $this->offset) {
$tokens[] = new Token(
$this->scopeStack,
substr($line, $this->offset, $match[0][1] - $this->offset)
);
$this->debugCount++;
$this->offset = $match[0][1];
}
// Add the name to the scope stack if present.
if ($pattern->name !== null) {
$this->scopeStack[] = $this->resolveScopeName($pattern->name, $match);
}
// If a rule has captures iterate through each of the matched subpatterns and
// create tokens from the captures.
if ($pattern->captures !== null) {
foreach ($match as $k => $m) {
if ($m[0] === '' || ($k === 0 & & !isset($pattern->captures[0]))) {
continue;
}
$this->ruleStack[] = $rule;
// If the capture has a name add it to the scope stack.
if ($pattern->captures[$k]->name !== null) {
$this->scopeStack[] = $this->resolveScopeName($pattern->captures[$k]->name, $match);
}
if ($rule->patterns !== null & & $this->offset < $lineLength) {
// If the capture has patterns of its own add the capture to the rule stack,
// process the patterns, and then pop the capture off the stack.
if ($pattern->captures[$k]->patterns !== null) {
$this->ruleStack[] = $pattern->captures[$k];
$tokens = [ ...$tokens, ...$this->tokenizeLine($line) ];
array_pop($this->ruleStack);
}
// Otherwise, create a token for the capture.
else {
$tokens[] = new Token(
$this->scopeStack,
$m[0]
);
$this->debugCount++;
}
// Pop the capture's name off the scope stack.
if ($pattern->captures[$k]->name !== null) {
array_pop($this->scopeStack);
}
if (!$rule->beginPattern) {
if ($rule->endPattern) {
while (!end($this->ruleStack)->beginPattern) {
$popped = array_pop($this->ruleStack);
$this->offset = $m[1] + strlen($m[0]);
}
}
// Otherwise, if the rule doesn't have captures then a token is created from the
// entire match.
else {
$tokens[] = new Token(
$this->scopeStack,
$match[0][0]
);
$this->offset = $match[0][1] + strlen($match[0][0]);
$this->debugCount++;
}
// If the pattern is a begin pattern and has a content name then add that to the
// scope stack before processing the children.
if ($pattern->beginPattern & & $pattern->contentName !== null) {
$this->scopeStack[] = $this->resolveScopeName($pattern->contentName, $match);
}
if ($popped->name !== null) {
array_pop($this->scopeStack);
}
$this->ruleStack[] = $pattern;
// If what was just popped is the active injection then remove it, too.
if ($popped === $this->activeInjection) {
$this->activeInjection = null;
}
}
}
// If the rule has patterns process tokens from its subpatterns.
if ($pattern->patterns !== null & & $this->offset < $lineLength) {
$tokens = [ ...$tokens, ...$this->tokenizeLine($line) ];
}
if (!$pattern->beginPattern) {
if ($pattern->endPattern) {
while (!end($this->ruleStack)->beginPattern) {
$popped = array_pop($this->ruleStack);
// If what was just popped is a begin pattern and has a content name pop it off
// the scope stack.
if ($popped->beginPattern & & $popped->contentName !== null) {
array_pop($this->scopeStack);
}
if ($popped->name !== null) {
array_pop($this->scopeStack);
}
@@ -205,25 +238,31 @@ class Tokenizer {
$this->activeInjection = null;
}
}
}
break 2;
$popped = array_pop($this->ruleStack);
// If what was just popped is a begin pattern and has a content name pop it off
// the scope stack.
if ($popped->beginPattern & & $popped->contentName !== null) {
array_pop($this->scopeStack);
}
if ($popped->name !== null) {
array_pop($this->scopeStack);
}
// Otherwise, if the rule is a Reference then retrieve its patterns, splice into
// the rule list, and reprocess the rule.
elseif ($rule instanceof Reference & & $obj = $rule->get()) {
if ($obj instanceof Grammar || ($rule instanceof RepositoryReference & & $obj->match === null)) {
$obj = $obj->patterns;
}
array_splice($currentRules, $i, 1, $obj);
$currentRulesCount = count($currentRules);
continue ;
// If what was just popped is the active injection then remove it, too.
if ($popped === $this->activeInjection) {
$this->activeInjection = null;
}
}
break;
if ($this->offset !== $lineLength) {
continue;
}
}
if ($this->activeInjection === null & & $this->grammar->injections !== null) {
foreach ($this->grammar->injections as $selector => $injection) {
$selector = ScopeParser::parseSelector($selector);