|
|
|
<?php
|
|
|
|
/** @license MIT
|
|
|
|
* Copyright 2021 Dustin Wilson et al.
|
|
|
|
* See LICENSE file for details */
|
|
|
|
|
|
|
|
declare(strict_types=1);
|
|
|
|
namespace dW\Lit;
|
|
|
|
use dW\Lit\Scope\Parser as ScopeParser;
|
|
|
|
use dW\Lit\Grammar\{
|
|
|
|
Pattern,
|
|
|
|
Reference
|
|
|
|
};
|
|
|
|
|
|
|
|
|
|
|
|
class Tokenizer {
|
|
|
|
protected \Generator $data;
|
|
|
|
protected Grammar $grammar;
|
|
|
|
protected int $offset = 0;
|
|
|
|
protected array $ruleStack;
|
|
|
|
protected array $scopeStack;
|
|
|
|
protected $debug = false;
|
|
|
|
|
|
|
|
|
|
|
|
public function __construct(\Generator $data, Grammar $grammar) {
|
|
|
|
$this->data = $data;
|
|
|
|
$this->grammar = $grammar;
|
|
|
|
$this->ruleStack = [ $this->grammar ];
|
|
|
|
$this->scopeStack = [ $this->grammar->scopeName ];
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
public function tokenize(): \Generator {
|
|
|
|
foreach ($this->data as $lineNumber => $line) {
|
|
|
|
$this->offset = 0;
|
|
|
|
$tokens = $this->tokenizeLine($line);
|
|
|
|
|
|
|
|
// If after tokenizing the line the entire line still hasn't been tokenized then
|
|
|
|
// create a token of the rest of the line.
|
|
|
|
$lineLength = strlen($line);
|
|
|
|
if ($this->offset < $lineLength) {
|
|
|
|
$tokens[] = new Token(
|
|
|
|
$this->scopeStack,
|
|
|
|
substr($line, $this->offset, $lineLength)
|
|
|
|
);
|
|
|
|
}
|
|
|
|
|
|
|
|
yield $lineNumber => $tokens;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
protected function getMatch(string $regex, string $line): ?array {
|
|
|
|
if (preg_match($regex, $line, $match, PREG_OFFSET_CAPTURE, $this->offset) !== 1) {
|
|
|
|
return null;
|
|
|
|
}
|
|
|
|
|
|
|
|
return $match;
|
|
|
|
}
|
|
|
|
|
|
|
|
protected function tokenizeLine(string $line): array {
|
|
|
|
$tokens = [];
|
|
|
|
|
|
|
|
while (true) {
|
|
|
|
$currentRules = end($this->ruleStack)->patterns->getIterator();
|
|
|
|
$currentRulesCount = count($currentRules);
|
|
|
|
|
|
|
|
for ($i = 0; $i < $currentRulesCount; $i++) {
|
|
|
|
while (true) {
|
|
|
|
$rule = $currentRules[$i];
|
|
|
|
// If the rule is a Pattern and matches the line at the offset then tokenize the
|
|
|
|
// matches.
|
|
|
|
if ($rule instanceof Pattern && $match = $this->getMatch($rule->match, $line)) {
|
|
|
|
// Add the name and contentName to the scope stack
|
|
|
|
// if present.
|
|
|
|
if ($rule->name !== null) {
|
|
|
|
$this->scopeStack[] = $rule->name;
|
|
|
|
}
|
|
|
|
if ($rule->contentName !== null) {
|
|
|
|
$this->scopeStack[] = $rule->contentName;
|
|
|
|
}
|
|
|
|
|
|
|
|
$wholeMatchCaptureScopeCount = 0;
|
|
|
|
if ($rule->captures !== null) {
|
|
|
|
// Iterate through each of the matched subpatterns and create tokens from the
|
|
|
|
// captures.
|
|
|
|
foreach ($match as $k => $m) {
|
|
|
|
if ($m[0] === '') {
|
|
|
|
continue;
|
|
|
|
}
|
|
|
|
|
|
|
|
// If the subpattern begins after the offset then create a token from the bits
|
|
|
|
// of the line in-between the last token and the one about to be created.
|
|
|
|
if ($m[1] > $this->offset) {
|
|
|
|
$scopeStack = $this->scopeStack;
|
|
|
|
// If this is the first capture, then the scopes added to the stack need to be
|
|
|
|
// removed from this token's scope stack as this will grab everything before
|
|
|
|
// this match began.
|
|
|
|
if ($k === 0) {
|
|
|
|
if ($rule->contentName !== null) {
|
|
|
|
array_pop($scopeStack);
|
|
|
|
}
|
|
|
|
if ($rule->name !== null) {
|
|
|
|
array_pop($scopeStack);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
$tokens[] = new Token(
|
|
|
|
$scopeStack,
|
|
|
|
substr($line, $this->offset, $m[1])
|
|
|
|
);
|
|
|
|
$this->offset = $m[1];
|
|
|
|
}
|
|
|
|
|
|
|
|
// The first match is the whole match, and if there are captures for it the name
|
|
|
|
// and contentName should be added to the stack regardless of whether it has
|
|
|
|
// patterns or not. However, keep count of how many were added to the stack so
|
|
|
|
// they may be removed when this rule has finished tokenizing.
|
|
|
|
if ($k === 0) {
|
|
|
|
if (!isset($rule->captures[0])) {
|
|
|
|
continue;
|
|
|
|
}
|
|
|
|
|
|
|
|
if ($rule->captures[0]->name !== null) {
|
|
|
|
$this->scopeStack[] = $rule->captures[0]->name;
|
|
|
|
$wholeMatchCaptureScopeCount++;
|
|
|
|
}
|
|
|
|
if ($rule->captures[0]->contentName !== null) {
|
|
|
|
$this->scopeStack[] = $rule->captures[0]->contentName;
|
|
|
|
$wholeMatchCaptureScopeCount++;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
// If the capture rule has patterns of its own then
|
|
|
|
// those must be matched, too.
|
|
|
|
if ($rule->captures[$k]->patterns !== null) {
|
|
|
|
$this->ruleStack[] = $rule->captures[$k];
|
|
|
|
|
|
|
|
// The scope stack for the whole match is handled above, so only handle that for
|
|
|
|
// other captures.
|
|
|
|
if ($k !== 0) {
|
|
|
|
if ($rule->captures->name !== null) {
|
|
|
|
$this->scopeStack[] = $rule->captures[$k]->name;
|
|
|
|
}
|
|
|
|
if ($rule->captures->contentName !== null) {
|
|
|
|
$this->scopeStack[] = $rule->captures[$k]->contentName;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
$tokens = [ ...$tokens, ...$this->tokenizeLine($line) ];
|
|
|
|
|
|
|
|
// The scope stack for the whole match is handled above, so only handle that for
|
|
|
|
// other captures.
|
|
|
|
if ($k !== 0) {
|
|
|
|
if ($rule->captures[$k]->contentName !== null) {
|
|
|
|
array_pop($this->scopeStack);
|
|
|
|
}
|
|
|
|
if ($rule->captures[$k]->name !== null) {
|
|
|
|
array_pop($this->scopeStack);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
array_pop($this->ruleStack);
|
|
|
|
} else {
|
|
|
|
$tokens[] = new Token(
|
|
|
|
[ ...$this->scopeStack, $rule->captures[$k]->name ],
|
|
|
|
$m[0]
|
|
|
|
);
|
|
|
|
}
|
|
|
|
|
|
|
|
$this->offset = $m[1] + strlen($m[0]);
|
|
|
|
$firstCapture = false;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
$this->ruleStack[] = $rule;
|
|
|
|
|
|
|
|
if ($rule->patterns !== null) {
|
|
|
|
$tokens = [ ...$tokens, ...$this->tokenizeLine($line) ];
|
|
|
|
}
|
|
|
|
|
|
|
|
// Remove the name and contentName from the scope stack if present.
|
|
|
|
if ($rule->contentName !== null) {
|
|
|
|
array_pop($this->scopeStack);
|
|
|
|
}
|
|
|
|
if ($rule->name !== null) {
|
|
|
|
array_pop($this->scopeStack);
|
|
|
|
}
|
|
|
|
|
|
|
|
// If the rule has a whole match capture (0) then remove its name and
|
|
|
|
// contentName, too.
|
|
|
|
$j = 0;
|
|
|
|
while ($j++ < $wholeMatchCaptureScopeCount) {
|
|
|
|
array_pop($this->scopeStack);
|
|
|
|
}
|
|
|
|
|
|
|
|
// And remove the rule from the rule stack, too.
|
|
|
|
array_pop($this->ruleStack);
|
|
|
|
break 2;
|
|
|
|
}
|
|
|
|
// Otherwise, if the rule is a Reference then retrieve its patterns, splice into
|
|
|
|
// the rule list, and reprocess the rule.
|
|
|
|
elseif ($rule instanceof Reference && $obj = $rule->get()) {
|
|
|
|
if ($obj instanceof PatternList) {
|
|
|
|
$obj = $obj->getIterator();
|
|
|
|
} elseif ($obj instanceof Grammar) {
|
|
|
|
$obj = $obj->patterns->getIterator();
|
|
|
|
}
|
|
|
|
|
|
|
|
array_splice($currentRules, $i, 1, $obj);
|
|
|
|
$currentRulesCount = count($currentRules);
|
|
|
|
continue;
|
|
|
|
}
|
|
|
|
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
|
|
|
|
return $tokens;
|
|
|
|
}
|
|
|
|
}
|