|
|
@ -24,10 +24,6 @@ class Tokenizer { |
|
|
|
$this->grammar = $grammar; |
|
|
|
$this->ruleStack = [ $this->grammar ]; |
|
|
|
$this->scopeStack = [ $this->grammar->scopeName ]; |
|
|
|
|
|
|
|
if ($this->grammar->contentScopeName !== null) { |
|
|
|
$this->scopeStack[] = $this->grammar->contentScopeName; |
|
|
|
} |
|
|
|
} |
|
|
|
|
|
|
|
|
|
|
@ -60,55 +56,173 @@ class Tokenizer { |
|
|
|
return $match; |
|
|
|
} |
|
|
|
|
|
|
|
protected function _tokenize(string $inputLine, int $offset = 0): array { |
|
|
|
$currentRules = end($this->ruleStack)->patterns->getIterator(); |
|
|
|
$currentRulesCount = count($currentRules); |
|
|
|
$results = []; |
|
|
|
$line = $inputLine; |
|
|
|
protected function _tokenize(string $line, int &$offset = 0): array { |
|
|
|
$tokens = []; |
|
|
|
$lineLength = strlen($line); |
|
|
|
|
|
|
|
for ($i = 0; $i < $currentRulesCount; $i++) { |
|
|
|
while (true) { |
|
|
|
$rule = $currentRules[$i]; |
|
|
|
if ($rule instanceof Pattern) { |
|
|
|
if ($match = $this->getMatch($rule->match, $line, $offset)) { |
|
|
|
$tokens = []; |
|
|
|
unset($match[0]); |
|
|
|
foreach ($match as $k => $m) { |
|
|
|
if ($m[1] > $offset) { |
|
|
|
$tokens[] = [ |
|
|
|
'scope' => $this->scopeStack, |
|
|
|
'string' => substr($line, $offset, $m[1]) |
|
|
|
]; |
|
|
|
$offset = $m[1]; |
|
|
|
while (true) { |
|
|
|
$currentRules = end($this->ruleStack)->patterns->getIterator(); |
|
|
|
$currentRulesCount = count($currentRules); |
|
|
|
|
|
|
|
for ($i = 0; $i < $currentRulesCount; $i++) { |
|
|
|
while (true) { |
|
|
|
$rule = $currentRules[$i]; |
|
|
|
// If the rule is a Pattern and matches the line at the offset then tokenize the |
|
|
|
// matches. |
|
|
|
if ($rule instanceof Pattern && $match = $this->getMatch($rule->match, $line, $offset)) { |
|
|
|
// First, remove the first entry in the match, the full |
|
|
|
// match, leaving only the subpatterns. |
|
|
|
//unset($match[0]); |
|
|
|
|
|
|
|
// Add the name and contentName to the scope stack |
|
|
|
// if present. |
|
|
|
if ($rule->name !== null) { |
|
|
|
$this->scopeStack[] = $rule->name; |
|
|
|
} |
|
|
|
if ($rule->contentName !== null) { |
|
|
|
$this->scopeStack[] = $rule->contentName; |
|
|
|
} |
|
|
|
|
|
|
|
$wholeMatchCaptureScopeCount = 0; |
|
|
|
if ($rule->captures !== null) { |
|
|
|
// Iterate through each of the matched subpatterns and create tokens from the |
|
|
|
// captures. |
|
|
|
foreach ($match as $k => $m) { |
|
|
|
if ($m[0] === '') { |
|
|
|
continue; |
|
|
|
} |
|
|
|
|
|
|
|
// If the subpattern begins after the offset then create a token from the bits |
|
|
|
// of the line in-between. |
|
|
|
if ($m[1] > $offset) { |
|
|
|
$scopeStack = $this->scopeStack; |
|
|
|
// If this is the first capture, then the scopes added to the stack need to be |
|
|
|
// removed from this token's scope stack as this will grab everything before |
|
|
|
// this match began. |
|
|
|
if ($k === 0) { |
|
|
|
if ($rule->contentName !== null) { |
|
|
|
array_pop($scopeStack); |
|
|
|
} |
|
|
|
if ($rule->name !== null) { |
|
|
|
array_pop($scopeStack); |
|
|
|
} |
|
|
|
} |
|
|
|
|
|
|
|
$tokens[] = [ |
|
|
|
'scopes' => $scopeStack, |
|
|
|
'string' => substr($line, $offset, $m[1]) |
|
|
|
]; |
|
|
|
$offset = $m[1]; |
|
|
|
} |
|
|
|
|
|
|
|
// The first match is the whole match, and if there are captures for it the name |
|
|
|
// and contentName should be added to the stack regardless of whether it has |
|
|
|
// patterns or not. However, keep count of how many were added to the stack so |
|
|
|
// they may be removed when this rule has finished tokenizing. |
|
|
|
if ($k === 0) { |
|
|
|
if (!isset($rule->captures[0])) { |
|
|
|
continue; |
|
|
|
} |
|
|
|
|
|
|
|
if ($rule->captures[0]->name !== null) { |
|
|
|
$this->scopeStack[] = $rule->captures[0]->name; |
|
|
|
$wholeMatchCaptureScopeCount++; |
|
|
|
} |
|
|
|
if ($rule->captures[0]->contentName !== null) { |
|
|
|
$this->scopeStack[] = $rule->captures[0]->contentName; |
|
|
|
$wholeMatchCaptureScopeCount++; |
|
|
|
} |
|
|
|
} |
|
|
|
|
|
|
|
// If the capture rule has patterns of its own then |
|
|
|
// those must be matched, too. |
|
|
|
if ($rule->captures[$k]->patterns !== null) { |
|
|
|
$this->ruleStack[] = $rule->captures[$k]; |
|
|
|
|
|
|
|
// The scope stack for the whole match is handled above, so only handle that for |
|
|
|
// other captures. |
|
|
|
if ($k !== 0) { |
|
|
|
if ($rule->captures->name !== null) { |
|
|
|
$this->scopeStack[] = $rule->captures[$k]->name; |
|
|
|
} |
|
|
|
if ($rule->captures->contentName !== null) { |
|
|
|
$this->scopeStack[] = $rule->captures[$k]->contentName; |
|
|
|
} |
|
|
|
} |
|
|
|
|
|
|
|
$tokens = [ ...$tokens, ...$this->_tokenize($line, $offset) ]; |
|
|
|
|
|
|
|
// The scope stack for the whole match is handled above, so only handle that for |
|
|
|
// other captures. |
|
|
|
if ($k !== 0) { |
|
|
|
if ($rule->captures[$k]->contentName !== null) { |
|
|
|
array_pop($this->scopeStack); |
|
|
|
} |
|
|
|
if ($rule->captures[$k]->name !== null) { |
|
|
|
array_pop($this->scopeStack); |
|
|
|
} |
|
|
|
} |
|
|
|
|
|
|
|
array_pop($this->ruleStack); |
|
|
|
} else { |
|
|
|
$tokens[] = [ |
|
|
|
'scopes' => [ ...$this->scopeStack, $rule->captures[$k]->name ], |
|
|
|
'string' => $m[0] |
|
|
|
]; |
|
|
|
} |
|
|
|
|
|
|
|
$offset = $m[1] + strlen($m[0]); |
|
|
|
$firstCapture = false; |
|
|
|
} |
|
|
|
} |
|
|
|
|
|
|
|
if ($rule->patterns !== null) { |
|
|
|
$tokens = [ ...$tokens, ...$this->_tokenize($line, $offset) ]; |
|
|
|
} |
|
|
|
|
|
|
|
$tokens[] = [ |
|
|
|
'scope' => [ ...$this->scopeStack, $rule->captures[$k]->name ], |
|
|
|
'string' => $m[0] |
|
|
|
]; |
|
|
|
$offset = $m[1] + strlen($m[0]); |
|
|
|
// Remove the name and contentName from the scope stack if present. |
|
|
|
if ($rule->contentName !== null) { |
|
|
|
array_pop($this->scopeStack); |
|
|
|
} |
|
|
|
if ($rule->name !== null) { |
|
|
|
array_pop($this->scopeStack); |
|
|
|
} |
|
|
|
|
|
|
|
// If the rule has a whole match capture (0) then remove its name and |
|
|
|
// contentName, too. |
|
|
|
$j = 0; |
|
|
|
while ($j++ < $wholeMatchCaptureScopeCount) { |
|
|
|
array_pop($this->scopeStack); |
|
|
|
} |
|
|
|
|
|
|
|
// And remove the rule from the rule stack, too. |
|
|
|
array_pop($this->ruleStack); |
|
|
|
|
|
|
|
echo "\n"; |
|
|
|
die(var_export($tokens)); |
|
|
|
break 2; |
|
|
|
} |
|
|
|
} elseif ($rule instanceof Reference && $obj = $rule->get()) { |
|
|
|
if ($obj instanceof PatternList) { |
|
|
|
$obj = $obj->getIterator(); |
|
|
|
} elseif ($obj instanceof Grammar) { |
|
|
|
$obj = $obj->patterns->getIterator(); |
|
|
|
// Otherwise, if the rule is a Reference then retrieve its patterns, splice into |
|
|
|
// the rule list, and reprocess the rule. |
|
|
|
elseif ($rule instanceof Reference && $obj = $rule->get()) { |
|
|
|
if ($obj instanceof PatternList) { |
|
|
|
$obj = $obj->getIterator(); |
|
|
|
} elseif ($obj instanceof Grammar) { |
|
|
|
$obj = $obj->patterns->getIterator(); |
|
|
|
} |
|
|
|
|
|
|
|
array_splice($currentRules, $i, 1, $obj); |
|
|
|
$currentRulesCount = count($currentRules); |
|
|
|
continue; |
|
|
|
} |
|
|
|
|
|
|
|
array_splice($currentRules, $i, 1, $obj); |
|
|
|
$currentRulesCount = count($currentRules); |
|
|
|
continue; |
|
|
|
break; |
|
|
|
} |
|
|
|
|
|
|
|
break; |
|
|
|
} |
|
|
|
|
|
|
|
break; |
|
|
|
} |
|
|
|
|
|
|
|
return $inputLine; |
|
|
|
return $tokens; |
|
|
|
} |
|
|
|
} |