@ -24,6 +24,7 @@ class Tokenizer {
// Stack of rules currently being processed. The tokenizer reads the patterns of
// the top entry and pushes/pops matched patterns and captures as it descends
// into and returns from them.
protected array $ruleStack;
// Stack of resolved scope names. Names of matched patterns and captures are
// pushed while they are active, and the whole stack is handed to every Token
// that is created.
protected array $scopeStack;
// Debugging aid: holds the key (line number) of the line currently being
// tokenized; it is reassigned at the start of every iteration over the data.
protected int $debug = 0;
// Debugging aid: counter that is incremented alongside token creation, so a
// specific token on a specific line can be targeted by debug output.
protected int $debugCount = 0;
public function __construct(\Generator $data, Grammar $grammar) {
@ -38,16 +39,20 @@ class Tokenizer {
foreach ($this->data as $lineNumber => $line) {
$this->debug = $lineNumber;
$this->offset = 0;
$tokens = $this->tokenizeLine($line);
// If after tokenizing the line the entire line still hasn't been tokenized then
// create a token of the rest of the line.
$lineLength = strlen($line);
$tokens = ($lineLength > 0) ? $this->tokenizeLine($line) : [];
// Output a token for everything else contained on the line including the
// newline or just a newline if there weren't any spare characters left on the
// line.
$tokens[] = new Token(
$this->scopeStack,
($this->offset < $lineLength) ? substr($line, $this->offset, $lineLength) . "\n" : "\n"
($this->offset < $lineLength) ? substr($line, $this->offset, $lineLength - $this->offset ) . "\n" : "\n"
);
$this->debugCount++;
yield $lineNumber => $tokens;
}
}
@ -71,6 +76,7 @@ class Tokenizer {
$tokens = [];
$lineLength = strlen($line);
while (true) {
if ($this->activeInjection === null & & $this->grammar->injections !== null) {
foreach ($this->grammar->injections as $selector => $injection) {
$selector = ScopeParser::parseSelector($selector);
@ -85,96 +91,141 @@ class Tokenizer {
}
}
while (true) {
$currentRules = end($this->ruleStack)->patterns;
$currentRulesCount = count($currentRules);
$nextMatch = null;
for ($i = 0; $i < $currentRulesCount; $i++) {
while (true) {
$rule = $currentRules[$i];
if ($this->debug === 6 & & $this->debugCount === 12) {
if ($rule instanceof Pattern) {
echo "Match: {$rule->match}\n\n";
echo "Match: {$rule->match}\n";
}
}
// If the rule is a Pattern and matches the line at the offset then tokenize the
// matches.
// If the rule is a Pattern and matches the line at the offset then...
if ($rule instanceof Pattern & & preg_match($rule->match, $line, $match, PREG_OFFSET_CAPTURE, $this->offset)) {
// ¡TEMPORARY! Haven't implemented begin and end line
// anchors, so let's toss them completely.
if (preg_match('/\\\(?:A|G|Z)/', $rule->match)) {
continue 2;
}
$match = [
'match' => $match,
'pattern' => $rule
];
// Add the name and contentName to the scope stack
// if present.
if ($rule->name !== null) {
$this->scopeStack[] = $this->resolveScopeName($rule->name, $match);
if ($match['match'][0][1] === $this->offset) {
$nextMatch = $match;
break 2;
} elseif ($match['match'][0][1] < $nextMatch['match'][0][1]) {
$nextMatch = $match;
}
}
// Otherwise, if the rule is a Reference then retrieve its patterns, splice into
// the rule list, and reprocess the rule.
elseif ($rule instanceof Reference & & $obj = $rule->get()) {
if ($obj instanceof Grammar || ($rule instanceof RepositoryReference & & $obj->match === null)) {
$obj = $obj->patterns;
}
if ($rule->captures !== null) {
// Iterate through each of the matched subpatterns and create tokens from the
// captures.
foreach ($match as $k => $m) {
if ($m[0] === '' || ($k === 0 & & !isset($rule->captures[0]))) {
array_splice($currentRules, $i, 1, $obj);
$currentRulesCount = count($currentRules);
continue;
}
// If the subpattern begins after the offset then create a token from the bits
// of the line in-between the last token and the one about to be created.
if ($m[1] > $this->offset) {
$scopeStack = $this->scopeStack;
// If this is the first capture, then the scopes added to the stack need to be
// removed from this token's scope stack as this will grab everything before
// this match began.
if ($k === 0 & & $rule->name !== null) {
array_pop($scopeStack);
break;
}
}
// If there were a match above...
if ($nextMatch !== null) {
$match = $nextMatch['match'];
$pattern = $nextMatch['pattern'];
// **¡TEMPORARY!** Haven't implemented begin and end line
// anchors, so let's toss patterns with them completely for now.
if (preg_match('/\\\(?:A|G|Z)/', $rule->match)) {
continue;
}
// If the subpattern begins after the offset then create a token from the bits
// of the line in-between the last token and the one(s) about to be created.
if ($match[0][1] > $this->offset) {
$tokens[] = new Token(
$scopeStack,
substr($line, $this->offset, $m[1])
$this-> scopeStack,
substr($line, $this->offset, $match[0] [1] - $this->offset )
);
$this->offset = $m[1];
$this->debugCount++;
$this->offset = $match[0][1];
}
// Add the name to the scope stack if present.
if ($pattern->name !== null) {
$this->scopeStack[] = $this->resolveScopeName($pattern->name, $match);
}
// If a rule has captures iterate through each of the matched subpatterns and
// create tokens from the captures.
if ($pattern->captures !== null) {
foreach ($match as $k => $m) {
if ($m[0] === '' || ($k === 0 & & !isset($pattern->captures[0]))) {
continue;
}
if ($rule->captures[$k]->name !== null) {
$this->scopeStack[] = $this->resolveScopeName($rule->captures[$k]->name, $match);
// If the capture has a name add it to the scope stack.
if ($pattern->captures[$k]->name !== null) {
$this->scopeStack[] = $this->resolveScopeName($pattern->captures[$k]->name, $match);
}
if ($rule->captures[$k]->patterns !== null) {
$this->ruleStack[] = $rule->captures[$k];
// If the capture has patterns of its own add the capture to the rule stack,
// process the patterns, and then pop the capture off the stack.
if ($pattern->captures[$k]->patterns !== null) {
$this->ruleStack[] = $pattern->captures[$k];
$tokens = [ ...$tokens, ...$this->tokenizeLine($line) ];
array_pop($this->ruleStack);
} else {
}
// Otherwise, create a token for the capture.
else {
$tokens[] = new Token(
$this->scopeStack,
$m[0]
);
$this->debugCount++;
}
if ($rule->captures[$k]->name !== null) {
// Pop the capture's name off the scope stack.
if ($pattern->captures[$k]->name !== null) {
array_pop($this->scopeStack);
}
$this->offset = $m[1] + strlen($m[0]);
}
}
// Otherwise, if the rule doesn't have captures then a token is created from the
// entire match.
else {
$tokens[] = new Token(
$this->scopeStack,
$match[0][0]
);
$this->offset = $match[0][1] + strlen($match[0][0]);
$this->debugCount++;
}
// If the pattern is a begin pattern and has a content name then add that to the
// scope stack before processing the children.
if ($rule->beginPattern & & $rule->contentName !== null) {
$this->scopeStack[] = $this->resolveScopeName($rule->contentName, $match);
if ($pattern->beginPattern & & $pattern ->contentName !== null) {
$this->scopeStack[] = $this->resolveScopeName($pattern ->contentName, $match);
}
$this->ruleStack[] = $rule;
$this->ruleStack[] = $pattern ;
if ($rule->patterns !== null & & $this->offset < $lineLength) {
// If the rule has patterns process tokens from its subpatterns.
if ($pattern->patterns !== null & & $this->offset < $lineLength) {
$tokens = [ ...$tokens, ...$this->tokenizeLine($line) ];
}
if (!$rule ->beginPattern) {
if ($rule ->endPattern) {
if (!$pattern ->beginPattern) {
if ($pattern ->endPattern) {
while (!end($this->ruleStack)->beginPattern) {
$popped = array_pop($this->ruleStack);
@ -206,24 +257,12 @@ class Tokenizer {
}
}
break 2;
}
// Otherwise, if the rule is a Reference then retrieve its patterns, splice into
// the rule list, and reprocess the rule.
elseif ($rule instanceof Reference & & $obj = $rule->get()) {
if ($obj instanceof Grammar || ($rule instanceof RepositoryReference & & $obj->match === null)) {
$obj = $obj->patterns;
}
array_splice($currentRules, $i, 1, $obj);
$currentRulesCount = count($currentRules);
if ($this->offset !== $lineLength) {
continue;
}
break;
}
}
if ($this->activeInjection === null & & $this->grammar->injections !== null) {
foreach ($this->grammar->injections as $selector => $injection) {
$selector = ScopeParser::parseSelector($selector);