Tokenization progress

3 years ago · ad23bf4c4d
2 changed files with 155 additions and 53 deletions
--- a/lib/Grammar.php
+++ b/lib/Grammar.php
@ -29,7 +29,7 @@ use dW\Lit\Grammar\{
 */
 class Grammar {
    use FauxReadOnly;
-    protected ?string $_contentRegex;
+    protected ?string $_contentName;
    protected ?string $_firstLineMatch;
    protected ?InjectionList $_injections;
    protected ?string $_name;
@ -37,14 +37,12 @@ class Grammar {
    protected ?PatternList $_patterns;
    protected ?Repository $_repository;
    protected ?string $_scopeName;
    protected ?string $_contentScopeName;
-    public function __construct(?string $scopeName = null, ?string $contentScopeName = null, ?PatternList $patterns = null, ?string $name = null, ?string $contentRegex = null, ?string $firstLineMatch = null, ?InjectionList $injections = null, ?Repository $repository = null, ?Grammar $ownerGrammar = null) {
+    public function __construct(?string $scopeName = null, ?PatternList $patterns = null, ?string $name = null, ?string $firstLineMatch = null, ?InjectionList $injections = null, ?Repository $repository = null, ?Grammar $ownerGrammar = null) {
        $this->_name = $name;
        $this->_scopeName = $scopeName;
        $this->_patterns = $patterns;
        $this->_contentRegex = $contentRegex;
        $this->_firstLineMatch = $firstLineMatch;
        $this->_injections = $injections;
        $this->_repository = $repository;
@ -98,14 +96,6 @@ class Grammar {
        $this->_name = $json['name'] ?? null;
        $this->_scopeName = $json['scopeName'];
        $this->_contentScopeName = $json['contentScopeName'] ?? null;
        if (isset($json['contentRegex'])) {
            $value = str_replace('/', '\/', $json['contentRegex']);
            $this->_contentRegex = $value;
        } else {
            $this->_contentRegex = null;
        }
        if (isset($json['firstLineMatch'])) {
            $value = str_replace('/', '\/', $json['firstLineMatch']);
@ -164,7 +154,6 @@ class Grammar {
        $p = [
            'ownerGrammar' => $this,
            'name' => null,
            'contentName' => null,
            'match' => null,
            'patterns' => null,
            'captures' => null,
@ -231,7 +220,6 @@ class Grammar {
        foreach ($pattern as $key => $value) {
            switch ($key) {
                case 'name':
                case 'contentName':
                    $p[$key] = $value;
                    $modified = true;
                break;
--- a/lib/Tokenizer.php
+++ b/lib/Tokenizer.php
@ -24,10 +24,6 @@ class Tokenizer {
        $this->grammar = $grammar;
        $this->ruleStack = [ $this->grammar ];
        $this->scopeStack = [ $this->grammar->scopeName ];
        if ($this->grammar->contentScopeName !== null) {
            $this->scopeStack[] = $this->grammar->contentScopeName;
        }
    }
@ -60,40 +56,155 @@ class Tokenizer {
        return $match;
    }
-    protected function _tokenize(string $inputLine, int $offset = 0): array {
+    protected function _tokenize(string $line, int &$offset = 0): array {
        $tokens = [];
        $lineLength = strlen($line);
        while (true) {
            $currentRules = end($this->ruleStack)->patterns->getIterator();
            $currentRulesCount = count($currentRules);
        $results = [];
        $line = $inputLine;
        $lineLength = strlen($line);
            for ($i = 0; $i < $currentRulesCount; $i++) {
                while (true) {
                    $rule = $currentRules[$i];
-                if ($rule instanceof Pattern) {
+                    // If the rule is a Pattern and matches the line at the offset then tokenize the
-                    if ($match = $this->getMatch($rule->match, $line, $offset)) {
+                    // matches.
-                        $tokens = [];
+                    if ($rule instanceof Pattern && $match = $this->getMatch($rule->match, $line, $offset)) {
-                        unset($match[0]);
+                        // First, remove the first entry in the match, the full
                        // match, leaving only the subpatterns.
                        //unset($match[0]);
                        // Add the name and contentName to the scope stack
                        // if present.
                        if ($rule->name !== null) {
                            $this->scopeStack[] = $rule->name;
                        }
                        if ($rule->contentName !== null) {
                            $this->scopeStack[] = $rule->contentName;
                        }
                        $wholeMatchCaptureScopeCount = 0;
                        if ($rule->captures !== null) {
                            // Iterate through each of the matched subpatterns and create tokens from the
                            // captures.
                            foreach ($match as $k => $m) {
                                if ($m[0] === '') {
                                    continue;
                                }
                                // If the subpattern begins after the offset then create a token from the bits
                                // of the line in-between.
                                if ($m[1] > $offset) {
                                    $scopeStack = $this->scopeStack;
                                    // If this is the first capture, then the scopes added to the stack need to be
                                    // removed from this token's scope stack as this will grab everything before
                                    // this match began.
                                    if ($k === 0) {
                                        if ($rule->contentName !== null) {
                                            array_pop($scopeStack);
                                        }
                                        if ($rule->name !== null) {
                                            array_pop($scopeStack);
                                        }
                                    }
                                    $tokens[] = [
-                                    'scope' => $this->scopeStack,
+                                        'scopes' => $scopeStack,
                                        'string' => substr($line, $offset, $m[1])
                                    ];
                                    $offset = $m[1];
                                }
                                // The first match is the whole match, and if there are captures for it the name
                                // and contentName should be added to the stack regardless of whether it has
                                // patterns or not. However, keep count of how many were added to the stack so
                                // they may be removed when this rule has finished tokenizing.
                                if ($k === 0) {
                                    if (!isset($rule->captures[0])) {
                                        continue;
                                    }
                                    if ($rule->captures[0]->name !== null) {
                                        $this->scopeStack[] = $rule->captures[0]->name;
                                        $wholeMatchCaptureScopeCount++;
                                    }
                                    if ($rule->captures[0]->contentName !== null) {
                                        $this->scopeStack[] = $rule->captures[0]->contentName;
                                        $wholeMatchCaptureScopeCount++;
                                    }
                                }
                                // If the capture rule has patterns of its own then
                                // those must be matched, too.
                                if ($rule->captures[$k]->patterns !== null) {
                                    $this->ruleStack[] = $rule->captures[$k];
                                    // The scope stack for the whole match is handled above, so only handle that for
                                    // other captures.
                                    if ($k !== 0) {
                                        if ($rule->captures->name !== null) {
                                            $this->scopeStack[] = $rule->captures[$k]->name;
                                        }
                                        if ($rule->captures->contentName !== null) {
                                            $this->scopeStack[] = $rule->captures[$k]->contentName;
                                        }
                                    }
                                    $tokens = [ ...$tokens, ...$this->_tokenize($line, $offset) ];
                                    // The scope stack for the whole match is handled above, so only handle that for
                                    // other captures.
                                    if ($k !== 0) {
                                        if ($rule->captures[$k]->contentName !== null) {
                                            array_pop($this->scopeStack);
                                        }
                                        if ($rule->captures[$k]->name !== null) {
                                            array_pop($this->scopeStack);
                                        }
                                    }
                                    array_pop($this->ruleStack);
                                } else {
                                    $tokens[] = [
-                                'scope' => [ ...$this->scopeStack, $rule->captures[$k]->name ],
+                                        'scopes' => [ ...$this->scopeStack, $rule->captures[$k]->name ],
                                        'string' => $m[0]
                                    ];
                                }
                                $offset = $m[1] + strlen($m[0]);
                                $firstCapture = false;
                            }
                        }
                        if ($rule->patterns !== null) {
                            $tokens = [ ...$tokens, ...$this->_tokenize($line, $offset) ];
                        }
                        // Remove the name and contentName from the scope stack if present.
                        if ($rule->contentName !== null) {
                            array_pop($this->scopeStack);
                        }
                        if ($rule->name !== null) {
                            array_pop($this->scopeStack);
                        }
                        // If the rule has a whole match capture (0) then remove its name and
                        // contentName, too.
                        $j = 0;
                        while ($j++ < $wholeMatchCaptureScopeCount) {
                            array_pop($this->scopeStack);
                        }
                        // And remove the rule from the rule stack, too.
                        array_pop($this->ruleStack);
                        echo "\n";
                        die(var_export($tokens));
                        break 2;
                    }
-                } elseif ($rule instanceof Reference && $obj = $rule->get()) {
+                    // Otherwise, if the rule is a Reference then retrieve its patterns, splice into
                    // the rule list, and reprocess the rule.
                    elseif ($rule instanceof Reference && $obj = $rule->get()) {
                        if ($obj instanceof PatternList) {
                            $obj = $obj->getIterator();
                        } elseif ($obj instanceof Grammar) {
@ -109,6 +220,9 @@ class Tokenizer {
                }
            }
-        return $inputLine;
+            break;
        }
        return $tokens;
    }
 }