Lit/lib/Tokenizer.php

<?php
/** @license MIT
 * Copyright 2021 Dustin Wilson et al.
 * See LICENSE file for details */

declare(strict_types=1);
namespace dW\Lit;
use dW\Lit\Scope\Parser as ScopeParser;
use dW\Lit\Grammar\{
        Pattern,
        Reference
};


/**
 * Splits lines of text into scoped tokens by applying the patterns of a
 * grammar.
 *
 * Lines are pulled from a generator one at a time. While a line is being
 * processed two stacks are maintained: the rule stack (whose top entry supplies
 * the patterns currently being tried) and the scope stack (the scope names
 * attached to each token that is created).
 */
class Tokenizer {
    /** Source of the lines to tokenize; its keys are reused as the keys yielded by tokenize(). */
    protected \Generator $data;
    /** Grammar supplying the root patterns and the root scope name. */
    protected Grammar $grammar;
    /** Byte offset into the current line up to which text has been consumed. */
    protected int $offset = 0;
    /** Stack of rules being applied; the last entry provides the patterns tried by tokenizeLine(). */
    protected array $ruleStack;
    /** Stack of scope names that is handed to every token that is created. */
    protected array $scopeStack;
    /** Not read anywhere in this class. */
    protected $debug = false;


    /**
     * @param \Generator $data Generator producing the lines to tokenize
     * @param Grammar $grammar Grammar whose patterns are applied; it becomes the
     *                         bottom of the rule stack and its scopeName the
     *                         bottom of the scope stack
     */
    public function __construct(\Generator $data, Grammar $grammar) {
        $this->data = $data;
        $this->grammar = $grammar;
        $this->ruleStack = [ $this->grammar ];
        $this->scopeStack = [ $this->grammar->scopeName ];
    }


    /**
     * Tokenizes every line produced by the data generator.
     *
     * @return \Generator Yields, for each input line, the input key mapped to
     *                    the list of Token objects created for that line
     */
    public function tokenize(): \Generator {
        foreach ($this->data as $lineNumber => $line) {
            $this->offset = 0;
            $tokens = $this->tokenizeLine($line);

            // If after tokenizing the line the entire line still hasn't been tokenized then
            // create a token of the rest of the line.
            if ($this->offset < strlen($line)) {
                $tokens[] = new Token(
                    $this->scopeStack,
                    substr($line, $this->offset)
                );
            }

            yield $lineNumber => $tokens;
        }
    }


    /**
     * Runs a regular expression against the line, starting at the current
     * offset.
     *
     * @param string $regex Delimited PCRE pattern
     * @param string $line The line being tokenized
     * @return array|null The preg_match() result in PREG_OFFSET_CAPTURE form
     *                    (each entry is [ text, byte offset ]), or null when the
     *                    pattern does not match or preg_match() fails
     */
    protected function getMatch(string $regex, string $line): ?array {
        if (preg_match($regex, $line, $match, PREG_OFFSET_CAPTURE, $this->offset) !== 1) {
            return null;
        }

        return $match;
    }

    /**
     * Tries the patterns of the rule on top of the rule stack against the line
     * and creates tokens for the first pattern that matches.
     *
     * References found in the pattern list are replaced in place by what they
     * resolve to and then re-examined. The method calls itself for captures and
     * rules which have patterns of their own. It advances $this->offset as
     * captures are consumed and leaves both stacks as it found them.
     *
     * @param string $line The line being tokenized
     * @return array The tokens that were created (empty when nothing matched)
     */
    protected function tokenizeLine(string $line): array {
        $tokens = [];

        // NOTE(review): as written this loop body runs exactly once -- every path
        // ends in the `break` at its bottom -- so at most one pattern is matched per
        // call. Confirm whether restarting after a match was intended.
        while (true) {
            // NOTE(review): count(), index access and array_splice() are all applied
            // to the value returned by getIterator(), so it is presumably a plain
            // list -- confirm.
            $currentRules = end($this->ruleStack)->patterns->getIterator();
            $currentRulesCount = count($currentRules);

            for ($i = 0; $i < $currentRulesCount; $i++) {
                while (true) {
                    // Splicing in an empty replacement below can shrink the list to the
                    // point where there is nothing left at this index.
                    if ($i >= $currentRulesCount) {
                        break;
                    }

                    $rule = $currentRules[$i];
                    // If the rule is a Pattern and matches the line at the offset then tokenize the
                    // matches.
                    if ($rule instanceof Pattern && $match = $this->getMatch($rule->match, $line)) {
                        // Add the name and contentName to the scope stack
                        // if present.
                        if ($rule->name !== null) {
                            $this->scopeStack[] = $rule->name;
                        }
                        if ($rule->contentName !== null) {
                            $this->scopeStack[] = $rule->contentName;
                        }

                        $wholeMatchCaptureScopeCount = 0;
                        if ($rule->captures !== null) {
                            // Iterate through each of the matched subpatterns and create tokens from the
                            // captures.
                            foreach ($match as $k => $m) {
                                // Groups which matched nothing or did not participate in the match have
                                // an empty string here.
                                if ($m[0] === '') {
                                    continue;
                                }

                                // If the subpattern begins after the offset then create a token from the bits
                                // of the line in-between the last token and the one about to be created.
                                if ($m[1] > $this->offset) {
                                    $scopeStack = $this->scopeStack;
                                    // If this is the whole match, then the scopes added to the stack need to be
                                    // removed from this token's scope stack as this will grab everything before
                                    // this match began.
                                    if ($k === 0) {
                                        if ($rule->contentName !== null) {
                                            array_pop($scopeStack);
                                        }
                                        if ($rule->name !== null) {
                                            array_pop($scopeStack);
                                        }
                                    }

                                    // substr() takes a length, not an end position.
                                    $tokens[] = new Token(
                                        $scopeStack,
                                        substr($line, $this->offset, $m[1] - $this->offset)
                                    );
                                    $this->offset = $m[1];
                                }

                                // Groups without a capture rule (this includes the string keys preg_match()
                                // adds for named groups) contribute nothing of their own; their text is
                                // picked up as in-between text by a later capture.
                                if (!isset($rule->captures[$k])) {
                                    continue;
                                }
                                $capture = $rule->captures[$k];

                                // The first match is the whole match, and if there are captures for it the name
                                // and contentName should be added to the stack regardless of whether it has
                                // patterns or not. However, keep count of how many were added to the stack so
                                // they may be removed when this rule has finished tokenizing.
                                if ($k === 0) {
                                    if ($capture->name !== null) {
                                        $this->scopeStack[] = $capture->name;
                                        $wholeMatchCaptureScopeCount++;
                                    }
                                    if ($capture->contentName !== null) {
                                        $this->scopeStack[] = $capture->contentName;
                                        $wholeMatchCaptureScopeCount++;
                                    }
                                }

                                // If the capture rule has patterns of its own then
                                // those must be matched, too.
                                if ($capture->patterns !== null) {
                                    $this->ruleStack[] = $capture;

                                    // The scope stack for the whole match is handled above, so only handle that for
                                    // other captures. The conditions here must mirror the ones used when popping
                                    // below or the scope stack becomes unbalanced.
                                    if ($k !== 0) {
                                        if ($capture->name !== null) {
                                            $this->scopeStack[] = $capture->name;
                                        }
                                        if ($capture->contentName !== null) {
                                            $this->scopeStack[] = $capture->contentName;
                                        }
                                    }

                                    $tokens = [ ...$tokens, ...$this->tokenizeLine($line) ];

                                    if ($k !== 0) {
                                        if ($capture->contentName !== null) {
                                            array_pop($this->scopeStack);
                                        }
                                        if ($capture->name !== null) {
                                            array_pop($this->scopeStack);
                                        }
                                    }

                                    array_pop($this->ruleStack);
                                } else {
                                    // The whole match capture already has its name on the scope stack; for
                                    // the others add the name just for this token, and only when there is one.
                                    $tokenScopes = $this->scopeStack;
                                    if ($k !== 0 && $capture->name !== null) {
                                        $tokenScopes[] = $capture->name;
                                    }

                                    $tokens[] = new Token($tokenScopes, $m[0]);
                                }

                                $this->offset = $m[1] + strlen($m[0]);
                            }
                        }

                        $this->ruleStack[] = $rule;

                        if ($rule->patterns !== null) {
                            $tokens = [ ...$tokens, ...$this->tokenizeLine($line) ];
                        }

                        // Remove the name and contentName from the scope stack if present.
                        if ($rule->contentName !== null) {
                            array_pop($this->scopeStack);
                        }
                        if ($rule->name !== null) {
                            array_pop($this->scopeStack);
                        }

                        // If the rule has a whole match capture (0) then remove its name and
                        // contentName, too.
                        for ($j = 0; $j < $wholeMatchCaptureScopeCount; $j++) {
                            array_pop($this->scopeStack);
                        }

                        // And remove the rule from the rule stack, too.
                        array_pop($this->ruleStack);
                        // Leave both the reprocessing loop and the loop over the rules.
                        break 2;
                    }
                    // Otherwise, if the rule is a Reference then retrieve its patterns, splice into
                    // the rule list, and reprocess the rule.
                    elseif ($rule instanceof Reference && $obj = $rule->get()) {
                        // NOTE(review): PatternList is not among this file's use statements, so
                        // it resolves to dW\Lit\PatternList -- confirm that is the intended class.
                        if ($obj instanceof PatternList) {
                            $obj = $obj->getIterator();
                        } elseif ($obj instanceof Grammar) {
                            $obj = $obj->patterns->getIterator();
                        } elseif ($obj instanceof Pattern || $obj instanceof Reference) {
                            // array_splice() casts a bare object replacement to an array of its
                            // properties, so a single rule has to be wrapped.
                            $obj = [ $obj ];
                        }

                        array_splice($currentRules, $i, 1, $obj);
                        $currentRulesCount = count($currentRules);
                        continue;
                    }

                    break;
                }
            }

            break;
        }

        return $tokens;
    }
}
Changed Project name to Fukkus 3 years ago			`<?php`
			`/** @license MIT`
			`* Copyright 2021 Dustin Wilson et al.`
Cleaning up a bit 3 years ago			`* See LICENSE file for details */`
Changed Project name to Fukkus 3 years ago
			`declare(strict_types=1);`
Trying to figure out structure 3 years ago			`namespace dW\Lit;`
Tokenizing stuff... maybe? :) 3 years ago			`use dW\Lit\Scope\Parser as ScopeParser;`
			`use dW\Lit\Grammar\{`
			`Pattern,`
			`Reference`
			`};`
Changed Project name to Fukkus 3 years ago
Trying to start code tokenization 3 years ago
Working on Grammars 3 years ago			`class Tokenizer {`
			`protected \Generator $data;`
Changed Pattern to Rule to be consistent with other implementations 3 years ago			`protected Grammar $grammar;`
One full line tokenizes lol 3 years ago			`protected int $offset = 0;`
Trying to start code tokenization 3 years ago			`protected array $ruleStack;`
Tokenizing stuff... maybe? :) 3 years ago			`protected array $scopeStack;`
One full line tokenizes lol 3 years ago			`protected $debug = false;`
Reorganized data folder 3 years ago
Many changes • Lines are now converted to UTF-32 while tokenizing so that byte offsets may be cleanly converted to character offsets • Now when grammars are parsed into Grammar objects begin and end matches are converted to regular matches by adding end matches to the pattern's pattern list to simplify tokenization. • Highlight::withFile and Highlight::withString now accept an encoding parameter which defaults to UTF-8. 3 years ago
Reverting to using UTF-8 and preg_match. mb_ereg is garbage 3 years ago			`public function __construct(\Generator $data, Grammar $grammar) {`
Working on Grammars 3 years ago			`$this->data = $data;`
Changed Pattern to Rule to be consistent with other implementations 3 years ago			`$this->grammar = $grammar;`
Trying to start code tokenization 3 years ago			`$this->ruleStack = [ $this->grammar ];`
Tokenizing stuff... maybe? :) 3 years ago			`$this->scopeStack = [ $this->grammar->scopeName ];`
Working on Grammars 3 years ago			`}`

Changed Pattern to Rule to be consistent with other implementations 3 years ago
Working on Grammars 3 years ago			`public function tokenize(): \Generator {`
One full line tokenizes lol 3 years ago			`foreach ($this->data as $lineNumber => $line) {`
			`$this->offset = 0;`
			`$tokens = $this->tokenizeLine($line);`

			`// If after tokenizing the line the entire line still hasn't been tokenized then`
			`// create a token of the rest of the line.`
			`$lineLength = strlen($line);`
			`if ($this->offset < $lineLength) {`
			`$tokens[] = new Token(`
			`$this->scopeStack,`
			`substr($line, $this->offset, $lineLength)`
			`);`
			`}`

			`yield $lineNumber => $tokens;`
Working on Grammars 3 years ago			`}`
			`}`
Tokenizing stuff... maybe? :) 3 years ago

One full line tokenizes lol 3 years ago			`protected function getMatch(string $regex, string $line): ?array {`
			`if (preg_match($regex, $line, $match, PREG_OFFSET_CAPTURE, $this->offset) !== 1) {`
Tokenizing stuff... maybe? :) 3 years ago			`return null;`
			`}`

			`return $match;`
			`}`
Setting up Tokenizer for recursion 3 years ago
One full line tokenizes lol 3 years ago			`protected function tokenizeLine(string $line): array {`
Tokenization progress 3 years ago			`$tokens = [];`
Setting up Tokenizer for recursion 3 years ago
Tokenization progress 3 years ago			`while (true) {`
			`$currentRules = end($this->ruleStack)->patterns->getIterator();`
			`$currentRulesCount = count($currentRules);`

			`for ($i = 0; $i < $currentRulesCount; $i++) {`
			`while (true) {`
			`$rule = $currentRules[$i];`
			`// If the rule is a Pattern and matches the line at the offset then tokenize the`
			`// matches.`
One full line tokenizes lol 3 years ago			`if ($rule instanceof Pattern && $match = $this->getMatch($rule->match, $line)) {`
Tokenization progress 3 years ago			`// Add the name and contentName to the scope stack`
			`// if present.`
			`if ($rule->name !== null) {`
			`$this->scopeStack[] = $rule->name;`
			`}`
			`if ($rule->contentName !== null) {`
			`$this->scopeStack[] = $rule->contentName;`
			`}`

			`$wholeMatchCaptureScopeCount = 0;`
			`if ($rule->captures !== null) {`
			`// Iterate through each of the matched subpatterns and create tokens from the`
			`// captures.`
			`foreach ($match as $k => $m) {`
			`if ($m[0] === '') {`
			`continue;`
			`}`

			`// If the subpattern begins after the offset then create a token from the bits`
One full line tokenizes lol 3 years ago			`// of the line in-between the last token and the one about to be created.`
			`if ($m[1] > $this->offset) {`
Tokenization progress 3 years ago			`$scopeStack = $this->scopeStack;`
			`// If this is the first capture, then the scopes added to the stack need to be`
			`// removed from this token's scope stack as this will grab everything before`
			`// this match began.`
			`if ($k === 0) {`
			`if ($rule->contentName !== null) {`
			`array_pop($scopeStack);`
			`}`
			`if ($rule->name !== null) {`
			`array_pop($scopeStack);`
			`}`
			`}`

One full line tokenizes lol 3 years ago			`$tokens[] = new Token(`
			`$scopeStack,`
			`substr($line, $this->offset, $m[1])`
			`);`
			`$this->offset = $m[1];`
Tokenization progress 3 years ago			`}`

			`// The first match is the whole match, and if there are captures for it the name`
			`// and contentName should be added to the stack regardless of whether it has`
			`// patterns or not. However, keep count of how many were added to the stack so`
			`// they may be removed when this rule has finished tokenizing.`
			`if ($k === 0) {`
			`if (!isset($rule->captures[0])) {`
			`continue;`
			`}`

			`if ($rule->captures[0]->name !== null) {`
			`$this->scopeStack[] = $rule->captures[0]->name;`
			`$wholeMatchCaptureScopeCount++;`
			`}`
			`if ($rule->captures[0]->contentName !== null) {`
			`$this->scopeStack[] = $rule->captures[0]->contentName;`
			`$wholeMatchCaptureScopeCount++;`
			`}`
			`}`

			`// If the capture rule has patterns of its own then`
			`// those must be matched, too.`
			`if ($rule->captures[$k]->patterns !== null) {`
			`$this->ruleStack[] = $rule->captures[$k];`

			`// The scope stack for the whole match is handled above, so only handle that for`
			`// other captures.`
			`if ($k !== 0) {`
			`if ($rule->captures->name !== null) {`
			`$this->scopeStack[] = $rule->captures[$k]->name;`
			`}`
			`if ($rule->captures->contentName !== null) {`
			`$this->scopeStack[] = $rule->captures[$k]->contentName;`
			`}`
			`}`

One full line tokenizes lol 3 years ago			`$tokens = [ ...$tokens, ...$this->tokenizeLine($line) ];`
Tokenization progress 3 years ago
			`// The scope stack for the whole match is handled above, so only handle that for`
			`// other captures.`
			`if ($k !== 0) {`
			`if ($rule->captures[$k]->contentName !== null) {`
			`array_pop($this->scopeStack);`
			`}`
			`if ($rule->captures[$k]->name !== null) {`
			`array_pop($this->scopeStack);`
			`}`
			`}`

			`array_pop($this->ruleStack);`
			`} else {`
One full line tokenizes lol 3 years ago			`$tokens[] = new Token(`
			`[ ...$this->scopeStack, $rule->captures[$k]->name ],`
			`$m[0]`
			`);`
Tokenization progress 3 years ago			`}`

One full line tokenizes lol 3 years ago			`$this->offset = $m[1] + strlen($m[0]);`
Tokenization progress 3 years ago			`$firstCapture = false;`
Reverting to using UTF-8 and preg_match. mb_ereg is garbage 3 years ago			`}`
Tokenization progress 3 years ago			`}`

One full line tokenizes lol 3 years ago			`$this->ruleStack[] = $rule;`

Tokenization progress 3 years ago			`if ($rule->patterns !== null) {`
One full line tokenizes lol 3 years ago			`$tokens = [ ...$tokens, ...$this->tokenizeLine($line) ];`
Tokenization progress 3 years ago			`}`
Reverting to using UTF-8 and preg_match. mb_ereg is garbage 3 years ago
Tokenization progress 3 years ago			`// Remove the name and contentName from the scope stack if present.`
			`if ($rule->contentName !== null) {`
			`array_pop($this->scopeStack);`
			`}`
			`if ($rule->name !== null) {`
			`array_pop($this->scopeStack);`
			`}`

			`// If the rule has a whole match capture (0) then remove its name and`
			`// contentName, too.`
			`$j = 0;`
			`while ($j++ < $wholeMatchCaptureScopeCount) {`
			`array_pop($this->scopeStack);`
Reverting to using UTF-8 and preg_match. mb_ereg is garbage 3 years ago			`}`

Tokenization progress 3 years ago			`// And remove the rule from the rule stack, too.`
			`array_pop($this->ruleStack);`
			`break 2;`
Setting up Tokenizer for recursion 3 years ago			`}`
Tokenization progress 3 years ago			`// Otherwise, if the rule is a Reference then retrieve its patterns, splice into`
			`// the rule list, and reprocess the rule.`
			`elseif ($rule instanceof Reference && $obj = $rule->get()) {`
			`if ($obj instanceof PatternList) {`
			`$obj = $obj->getIterator();`
			`} elseif ($obj instanceof Grammar) {`
			`$obj = $obj->patterns->getIterator();`
			`}`

			`array_splice($currentRules, $i, 1, $obj);`
			`$currentRulesCount = count($currentRules);`
			`continue;`
Setting up Tokenizer for recursion 3 years ago			`}`

Tokenization progress 3 years ago			`break;`
Setting up Tokenizer for recursion 3 years ago			`}`
			`}`
Tokenization progress 3 years ago
			`break;`
Setting up Tokenizer for recursion 3 years ago			`}`
Many changes • Lines are now converted to UTF-32 while tokenizing so that byte offsets may be cleanly converted to character offsets • Now when grammars are parsed into Grammar objects begin and end matches are converted to regular matches by adding end matches to the pattern's pattern list to simplify tokenization. • Highlight::withFile and Highlight::withString now accept an encoding parameter which defaults to UTF-8. 3 years ago
Tokenization progress 3 years ago			`return $tokens;`
Setting up Tokenizer for recursion 3 years ago			`}`
Working on Grammars 3 years ago			`}`