Browse Source

Tokenization progress

main
Dustin Wilson 3 years ago
parent
commit
ad23bf4c4d
  1. 16
      lib/Grammar.php
  2. 192
      lib/Tokenizer.php

16
lib/Grammar.php

@ -29,7 +29,7 @@ use dW\Lit\Grammar\{
*/ */
class Grammar { class Grammar {
use FauxReadOnly; use FauxReadOnly;
protected ?string $_contentRegex; protected ?string $_contentName;
protected ?string $_firstLineMatch; protected ?string $_firstLineMatch;
protected ?InjectionList $_injections; protected ?InjectionList $_injections;
protected ?string $_name; protected ?string $_name;
@ -37,14 +37,12 @@ class Grammar {
protected ?PatternList $_patterns; protected ?PatternList $_patterns;
protected ?Repository $_repository; protected ?Repository $_repository;
protected ?string $_scopeName; protected ?string $_scopeName;
protected ?string $_contentScopeName;
public function __construct(?string $scopeName = null, ?string $contentScopeName = null, ?PatternList $patterns = null, ?string $name = null, ?string $contentRegex = null, ?string $firstLineMatch = null, ?InjectionList $injections = null, ?Repository $repository = null, ?Grammar $ownerGrammar = null) { public function __construct(?string $scopeName = null, ?PatternList $patterns = null, ?string $name = null, ?string $firstLineMatch = null, ?InjectionList $injections = null, ?Repository $repository = null, ?Grammar $ownerGrammar = null) {
$this->_name = $name; $this->_name = $name;
$this->_scopeName = $scopeName; $this->_scopeName = $scopeName;
$this->_patterns = $patterns; $this->_patterns = $patterns;
$this->_contentRegex = $contentRegex;
$this->_firstLineMatch = $firstLineMatch; $this->_firstLineMatch = $firstLineMatch;
$this->_injections = $injections; $this->_injections = $injections;
$this->_repository = $repository; $this->_repository = $repository;
@ -98,14 +96,6 @@ class Grammar {
$this->_name = $json['name'] ?? null; $this->_name = $json['name'] ?? null;
$this->_scopeName = $json['scopeName']; $this->_scopeName = $json['scopeName'];
$this->_contentScopeName = $json['contentScopeName'] ?? null;
if (isset($json['contentRegex'])) {
$value = str_replace('/', '\/', $json['contentRegex']);
$this->_contentRegex = $value;
} else {
$this->_contentRegex = null;
}
if (isset($json['firstLineMatch'])) { if (isset($json['firstLineMatch'])) {
$value = str_replace('/', '\/', $json['firstLineMatch']); $value = str_replace('/', '\/', $json['firstLineMatch']);
@ -164,7 +154,6 @@ class Grammar {
$p = [ $p = [
'ownerGrammar' => $this, 'ownerGrammar' => $this,
'name' => null, 'name' => null,
'contentName' => null,
'match' => null, 'match' => null,
'patterns' => null, 'patterns' => null,
'captures' => null, 'captures' => null,
@ -231,7 +220,6 @@ class Grammar {
foreach ($pattern as $key => $value) { foreach ($pattern as $key => $value) {
switch ($key) { switch ($key) {
case 'name': case 'name':
case 'contentName':
$p[$key] = $value; $p[$key] = $value;
$modified = true; $modified = true;
break; break;

192
lib/Tokenizer.php

@ -24,10 +24,6 @@ class Tokenizer {
$this->grammar = $grammar; $this->grammar = $grammar;
$this->ruleStack = [ $this->grammar ]; $this->ruleStack = [ $this->grammar ];
$this->scopeStack = [ $this->grammar->scopeName ]; $this->scopeStack = [ $this->grammar->scopeName ];
if ($this->grammar->contentScopeName !== null) {
$this->scopeStack[] = $this->grammar->contentScopeName;
}
} }
@ -60,55 +56,173 @@ class Tokenizer {
return $match; return $match;
} }
protected function _tokenize(string $inputLine, int $offset = 0): array { protected function _tokenize(string $line, int &$offset = 0): array {
$currentRules = end($this->ruleStack)->patterns->getIterator(); $tokens = [];
$currentRulesCount = count($currentRules);
$results = [];
$line = $inputLine;
$lineLength = strlen($line); $lineLength = strlen($line);
for ($i = 0; $i < $currentRulesCount; $i++) { while (true) {
while (true) { $currentRules = end($this->ruleStack)->patterns->getIterator();
$rule = $currentRules[$i]; $currentRulesCount = count($currentRules);
if ($rule instanceof Pattern) {
if ($match = $this->getMatch($rule->match, $line, $offset)) { for ($i = 0; $i < $currentRulesCount; $i++) {
$tokens = []; while (true) {
unset($match[0]); $rule = $currentRules[$i];
foreach ($match as $k => $m) { // If the rule is a Pattern and matches the line at the offset then tokenize the
if ($m[1] > $offset) { // matches.
$tokens[] = [ if ($rule instanceof Pattern && $match = $this->getMatch($rule->match, $line, $offset)) {
'scope' => $this->scopeStack, // First, remove the first entry in the match, the full
'string' => substr($line, $offset, $m[1]) // match, leaving only the subpatterns.
]; //unset($match[0]);
$offset = $m[1];
// Add the name and contentName to the scope stack
// if present.
if ($rule->name !== null) {
$this->scopeStack[] = $rule->name;
}
if ($rule->contentName !== null) {
$this->scopeStack[] = $rule->contentName;
}
$wholeMatchCaptureScopeCount = 0;
if ($rule->captures !== null) {
// Iterate through each of the matched subpatterns and create tokens from the
// captures.
foreach ($match as $k => $m) {
if ($m[0] === '') {
continue;
}
// If the subpattern begins after the offset then create a token from the bits
// of the line in-between.
if ($m[1] > $offset) {
$scopeStack = $this->scopeStack;
// If this is the first capture, then the scopes added to the stack need to be
// removed from this token's scope stack as this will grab everything before
// this match began.
if ($k === 0) {
if ($rule->contentName !== null) {
array_pop($scopeStack);
}
if ($rule->name !== null) {
array_pop($scopeStack);
}
}
$tokens[] = [
'scopes' => $scopeStack,
'string' => substr($line, $offset, $m[1])
];
$offset = $m[1];
}
// The first match is the whole match, and if there are captures for it the name
// and contentName should be added to the stack regardless of whether it has
// patterns or not. However, keep count of how many were added to the stack so
// they may be removed when this rule has finished tokenizing.
if ($k === 0) {
if (!isset($rule->captures[0])) {
continue;
}
if ($rule->captures[0]->name !== null) {
$this->scopeStack[] = $rule->captures[0]->name;
$wholeMatchCaptureScopeCount++;
}
if ($rule->captures[0]->contentName !== null) {
$this->scopeStack[] = $rule->captures[0]->contentName;
$wholeMatchCaptureScopeCount++;
}
}
// If the capture rule has patterns of its own then
// those must be matched, too.
if ($rule->captures[$k]->patterns !== null) {
$this->ruleStack[] = $rule->captures[$k];
// The scope stack for the whole match is handled above, so only handle that for
// other captures.
if ($k !== 0) {
if ($rule->captures->name !== null) {
$this->scopeStack[] = $rule->captures[$k]->name;
}
if ($rule->captures->contentName !== null) {
$this->scopeStack[] = $rule->captures[$k]->contentName;
}
}
$tokens = [ ...$tokens, ...$this->_tokenize($line, $offset) ];
// The scope stack for the whole match is handled above, so only handle that for
// other captures.
if ($k !== 0) {
if ($rule->captures[$k]->contentName !== null) {
array_pop($this->scopeStack);
}
if ($rule->captures[$k]->name !== null) {
array_pop($this->scopeStack);
}
}
array_pop($this->ruleStack);
} else {
$tokens[] = [
'scopes' => [ ...$this->scopeStack, $rule->captures[$k]->name ],
'string' => $m[0]
];
}
$offset = $m[1] + strlen($m[0]);
$firstCapture = false;
} }
}
if ($rule->patterns !== null) {
$tokens = [ ...$tokens, ...$this->_tokenize($line, $offset) ];
}
$tokens[] = [ // Remove the name and contentName from the scope stack if present.
'scope' => [ ...$this->scopeStack, $rule->captures[$k]->name ], if ($rule->contentName !== null) {
'string' => $m[0] array_pop($this->scopeStack);
]; }
$offset = $m[1] + strlen($m[0]); if ($rule->name !== null) {
array_pop($this->scopeStack);
}
// If the rule has a whole match capture (0) then remove its name and
// contentName, too.
$j = 0;
while ($j++ < $wholeMatchCaptureScopeCount) {
array_pop($this->scopeStack);
} }
// And remove the rule from the rule stack, too.
array_pop($this->ruleStack);
echo "\n"; echo "\n";
die(var_export($tokens)); die(var_export($tokens));
break 2;
} }
} elseif ($rule instanceof Reference && $obj = $rule->get()) { // Otherwise, if the rule is a Reference then retrieve its patterns, splice into
if ($obj instanceof PatternList) { // the rule list, and reprocess the rule.
$obj = $obj->getIterator(); elseif ($rule instanceof Reference && $obj = $rule->get()) {
} elseif ($obj instanceof Grammar) { if ($obj instanceof PatternList) {
$obj = $obj->patterns->getIterator(); $obj = $obj->getIterator();
} elseif ($obj instanceof Grammar) {
$obj = $obj->patterns->getIterator();
}
array_splice($currentRules, $i, 1, $obj);
$currentRulesCount = count($currentRules);
continue;
} }
array_splice($currentRules, $i, 1, $obj); break;
$currentRulesCount = count($currentRules);
continue;
} }
break;
} }
break;
} }
return $inputLine; return $tokens;
} }
} }
Loading…
Cancel
Save