Tokenization progress

3 years ago · ad23bf4c4d
2 changed files with 155 additions and 53 deletions
--- a/lib/Grammar.php
+++ b/lib/Grammar.php
@ -29,7 +29,7 @@ use dW\Lit\Grammar\{
 */
 class Grammar {
    use FauxReadOnly;
-    protected ?string $_contentRegex;
+    protected ?string $_contentName;
    protected ?string $_firstLineMatch;
    protected ?InjectionList $_injections;
    protected ?string $_name;
@ -37,14 +37,12 @@ class Grammar {
    protected ?PatternList $_patterns;
    protected ?Repository $_repository;
    protected ?string $_scopeName;
-    protected ?string $_contentScopeName;


-    public function __construct(?string $scopeName = null, ?string $contentScopeName = null, ?PatternList $patterns = null, ?string $name = null, ?string $contentRegex = null, ?string $firstLineMatch = null, ?InjectionList $injections = null, ?Repository $repository = null, ?Grammar $ownerGrammar = null) {
+    public function __construct(?string $scopeName = null, ?PatternList $patterns = null, ?string $name = null, ?string $firstLineMatch = null, ?InjectionList $injections = null, ?Repository $repository = null, ?Grammar $ownerGrammar = null) {
        $this->_name = $name;
        $this->_scopeName = $scopeName;
        $this->_patterns = $patterns;
-        $this->_contentRegex = $contentRegex;
        $this->_firstLineMatch = $firstLineMatch;
        $this->_injections = $injections;
        $this->_repository = $repository;
@ -98,14 +96,6 @@ class Grammar {

        $this->_name = $json['name'] ?? null;
        $this->_scopeName = $json['scopeName'];
-        $this->_contentScopeName = $json['contentScopeName'] ?? null;
-
-        if (isset($json['contentRegex'])) {
-            $value = str_replace('/', '\/', $json['contentRegex']);
-            $this->_contentRegex = $value;
-        } else {
-            $this->_contentRegex = null;
-        }

        if (isset($json['firstLineMatch'])) {
            $value = str_replace('/', '\/', $json['firstLineMatch']);
@ -164,7 +154,6 @@ class Grammar {
        $p = [
            'ownerGrammar' => $this,
            'name' => null,
-            'contentName' => null,
            'match' => null,
            'patterns' => null,
            'captures' => null,
@ -231,7 +220,6 @@ class Grammar {
        foreach ($pattern as $key => $value) {
            switch ($key) {
                case 'name':
-                case 'contentName':
                    $p[$key] = $value;
                    $modified = true;
                break;
--- a/lib/Tokenizer.php
+++ b/lib/Tokenizer.php
@ -24,10 +24,6 @@ class Tokenizer {
        $this->grammar = $grammar;
        $this->ruleStack = [ $this->grammar ];
        $this->scopeStack = [ $this->grammar->scopeName ];
-
-        if ($this->grammar->contentScopeName !== null) {
-            $this->scopeStack[] = $this->grammar->contentScopeName;
-        }
    }


@ -60,55 +56,173 @@ class Tokenizer {
        return $match;
    }

-    protected function _tokenize(string $inputLine, int $offset = 0): array {
-        $currentRules = end($this->ruleStack)->patterns->getIterator();
-        $currentRulesCount = count($currentRules);
-        $results = [];
-        $line = $inputLine;
+    protected function _tokenize(string $line, int &$offset = 0): array { // Tokenize $line from $offset (advanced by reference) with the rules on top of the rule stack.
+        $tokens = []; // Accumulated tokens; each entry has 'scopes' and 'string' keys.
        $lineLength = strlen($line);

-        for ($i = 0; $i < $currentRulesCount; $i++) {
-            while (true) {
-                $rule = $currentRules[$i];
-                if ($rule instanceof Pattern) {
-                    if ($match = $this->getMatch($rule->match, $line, $offset)) {
-                        $tokens = [];
-                        unset($match[0]);
-                        foreach ($match as $k => $m) {
-                            if ($m[1] > $offset) {
-                                $tokens[] = [
-                                    'scope' => $this->scopeStack,
-                                    'string' => substr($line, $offset, $m[1])
-                                ];
-                                $offset = $m[1];
+        while (true) {
+            $currentRules = end($this->ruleStack)->patterns->getIterator();
+            $currentRulesCount = count($currentRules);
+
+            for ($i = 0; $i < $currentRulesCount; $i++) {
+                while (true) {
+                    $rule = $currentRules[$i];
+                    // If the rule is a Pattern and matches the line at the offset then tokenize the
+                    // matches.
+                    if ($rule instanceof Pattern && $match = $this->getMatch($rule->match, $line, $offset)) {
+                        // The full match (index 0) is deliberately kept: the capture loop below
+                        // treats it as the whole-match capture, so the unset stays disabled.
+                        //unset($match[0]);
+
+                        // Add the name and contentName to the scope stack
+                        // if present.
+                        if ($rule->name !== null) {
+                            $this->scopeStack[] = $rule->name;
+                        }
+                        if ($rule->contentName !== null) {
+                            $this->scopeStack[] = $rule->contentName;
+                        }
+
+                        $wholeMatchCaptureScopeCount = 0;
+                        if ($rule->captures !== null) {
+                            // Iterate through each of the matched subpatterns and create tokens from the
+                            // captures. Each $m is [matched string, offset of the match in $line].
+                            foreach ($match as $k => $m) {
+                                if ($m[0] === '') {
+                                    continue;
+                                }
+
+                                // If the subpattern begins after the offset then create a token from the bits
+                                // of the line in-between.
+                                if ($m[1] > $offset) {
+                                    $scopeStack = $this->scopeStack;
+                                    // If this is the first capture, then the scopes added to the stack need to be
+                                    // removed from this token's scope stack as this will grab everything before
+                                    // this match began.
+                                    if ($k === 0) {
+                                        if ($rule->contentName !== null) {
+                                            array_pop($scopeStack);
+                                        }
+                                        if ($rule->name !== null) {
+                                            array_pop($scopeStack);
+                                        }
+                                    }
+
+                                    $tokens[] = [
+                                        'scopes' => $scopeStack,
+                                        'string' => substr($line, $offset, $m[1] - $offset) // substr() takes a length, not an end offset.
+                                    ];
+                                    $offset = $m[1];
+                                }
+
+                                // The first match is the whole match, and if there are captures for it the name
+                                // and contentName should be added to the stack regardless of whether it has
+                                // patterns or not. However, keep count of how many were added to the stack so
+                                // they may be removed when this rule has finished tokenizing.
+                                if ($k === 0) {
+                                    if (!isset($rule->captures[0])) {
+                                        continue;
+                                    }
+
+                                    if ($rule->captures[0]->name !== null) {
+                                        $this->scopeStack[] = $rule->captures[0]->name;
+                                        $wholeMatchCaptureScopeCount++;
+                                    }
+                                    if ($rule->captures[0]->contentName !== null) {
+                                        $this->scopeStack[] = $rule->captures[0]->contentName;
+                                        $wholeMatchCaptureScopeCount++;
+                                    }
+                                }
+
+                                // If the capture rule has patterns of its own then
+                                // those must be matched, too.
+                                if ($rule->captures[$k]->patterns !== null) {
+                                    $this->ruleStack[] = $rule->captures[$k];
+
+                                    // The scope stack for the whole match is handled above, so only handle that for
+                                    // other captures. The checks must mirror the pops after the recursive call.
+                                    if ($k !== 0) {
+                                        if ($rule->captures[$k]->name !== null) {
+                                            $this->scopeStack[] = $rule->captures[$k]->name;
+                                        }
+                                        if ($rule->captures[$k]->contentName !== null) {
+                                            $this->scopeStack[] = $rule->captures[$k]->contentName;
+                                        }
+                                    }
+
+                                    $tokens = [ ...$tokens, ...$this->_tokenize($line, $offset) ]; // $offset is shared by reference with the nested call.
+
+                                    // The scope stack for the whole match is handled above, so only handle that for
+                                    // other captures.
+                                    if ($k !== 0) {
+                                        if ($rule->captures[$k]->contentName !== null) {
+                                            array_pop($this->scopeStack);
+                                        }
+                                        if ($rule->captures[$k]->name !== null) {
+                                            array_pop($this->scopeStack);
+                                        }
+                                    }
+
+                                    array_pop($this->ruleStack);
+                                } else {
+                                    $tokens[] = [
+                                        'scopes' => [ ...$this->scopeStack, $rule->captures[$k]->name ],
+                                        'string' => $m[0]
+                                    ];
+                                }
+
+                                $offset = $m[1] + strlen($m[0]);
+                                $firstCapture = false; // NOTE(review): never read anywhere in this function.
                            }
+                        }
+
+                        if ($rule->patterns !== null) {
+                            $tokens = [ ...$tokens, ...$this->_tokenize($line, $offset) ];
+                        }

-                            $tokens[] = [
-                                'scope' => [ ...$this->scopeStack, $rule->captures[$k]->name ],
-                                'string' => $m[0]
-                            ];
-                            $offset = $m[1] + strlen($m[0]);
+                        // Remove the name and contentName from the scope stack if present.
+                        if ($rule->contentName !== null) {
+                            array_pop($this->scopeStack);
+                        }
+                        if ($rule->name !== null) {
+                            array_pop($this->scopeStack);
+                        }
+
+                        // If the rule has a whole match capture (0) then remove its name and
+                        // contentName, too.
+                        $j = 0;
+                        while ($j++ < $wholeMatchCaptureScopeCount) {
+                            array_pop($this->scopeStack);
                        }

+                        // And remove the rule from the rule stack, too. NOTE(review): $rule itself is never pushed in this function — confirm this pop is balanced.
+                        array_pop($this->ruleStack);
+
                        echo "\n";
                        die(var_export($tokens));
+                        break 2; // NOTE(review): unreachable while the debugging die() above remains.
                    }
-                } elseif ($rule instanceof Reference && $obj = $rule->get()) {
-                    if ($obj instanceof PatternList) {
-                        $obj = $obj->getIterator();
-                    } elseif ($obj instanceof Grammar) {
-                        $obj = $obj->patterns->getIterator();
+                    // Otherwise, if the rule is a Reference then retrieve its patterns, splice into
+                    // the rule list, and reprocess the rule.
+                    elseif ($rule instanceof Reference && $obj = $rule->get()) {
+                        if ($obj instanceof PatternList) {
+                            $obj = $obj->getIterator();
+                        } elseif ($obj instanceof Grammar) {
+                            $obj = $obj->patterns->getIterator();
+                        }
+
+                        array_splice($currentRules, $i, 1, $obj);
+                        $currentRulesCount = count($currentRules);
+                        continue;
                    }

-                    array_splice($currentRules, $i, 1, $obj);
-                    $currentRulesCount = count($currentRules);
-                    continue;
+                    break;
                }
-
-                break;
            }
+
+            break;
        }

-        return $inputLine;
+        return $tokens;
    }
 }