Lit/lib/Tokenizer.php

<?php
/** @license MIT
 * Copyright 2021 Dustin Wilson et al.
 * See LICENSE file for details */

declare(strict_types=1);
namespace dW\Lit;
use dW\Lit\Scope\Parser as ScopeParser;
use dW\Lit\Grammar\{
        Pattern,
        Reference
};


/**
 * Turns lines of text into lists of scoped tokens using a Grammar's patterns.
 *
 * Every token is an array of the form
 * `[ 'scopes' => string[], 'string' => string ]`. All offsets handled by this
 * class are byte offsets because they come straight from preg_match(); strlen()
 * and substr() are therefore used on purpose instead of their mb_* variants.
 */
class Tokenizer {
    /** Generator supplying the input; its keys are passed through by tokenize(). */
    protected \Generator $data;
    /** Grammar whose patterns are used as the root rule list. */
    protected Grammar $grammar;
    /** Stack of rules; the patterns of the last entry are the ones tried by _tokenize(). */
    protected array $ruleStack;
    /** Stack of scope names that is copied onto every token that gets created. */
    protected array $scopeStack;


    /**
     * @param \Generator $data    Generator yielding the lines to tokenize.
     * @param Grammar    $grammar Grammar used as the root rule; its scopeName
     *                            becomes the first entry of the scope stack.
     */
    public function __construct(\Generator $data, Grammar $grammar) {
        $this->data = $data;
        $this->grammar = $grammar;
        $this->ruleStack = [ $this->grammar ];
        $this->scopeStack = [ $this->grammar->scopeName ];
    }


    /**
     * Tokenizes the input one line at a time.
     *
     * @return \Generator Yields, for every key/line pair produced by the data
     *                    generator, that key mapped to the tokens of the line.
     */
    public function tokenize(): \Generator {
        foreach ($this->data as $lineNumber => $inputLine) {
            yield $lineNumber => $this->_tokenize($inputLine);
        }
    }


    /**
     * Runs a regular expression against a line, starting at a byte offset.
     *
     * @param string $regex  Complete PCRE pattern, delimiters included.
     * @param string $line   Subject string.
     * @param int    $offset Byte offset at which the search starts.
     *
     * @return array|null The preg_match() result in PREG_OFFSET_CAPTURE form
     *                    (each entry is `[ text, byte offset ]`), or null when
     *                    the pattern did not match or preg_match() failed.
     */
    protected function getMatch(string $regex, string $line, int $offset = 0): ?array {
        $match = [];
        if (preg_match($regex, $line, $match, PREG_OFFSET_CAPTURE, $offset) !== 1) {
            return null;
        }

        return $match;
    }

    /**
     * Tokenizes the first rule of the current rule list that matches the line.
     *
     * The rule list is taken from the rule on top of the rule stack. Rules are
     * tried in order; References are replaced in place by the rules they resolve
     * to. At most one match is processed per call; when no rule matches an empty
     * array is returned and $offset is left untouched.
     *
     * @param string $line   Line being tokenized.
     * @param int    $offset Byte offset to start matching at; advanced past the
     *                       text that was tokenized.
     *
     * @return array[] List of tokens (`[ 'scopes' => string[], 'string' => string ]`).
     */
    protected function _tokenize(string $line, int &$offset = 0): array {
        $rules = $this->toRuleArray(end($this->ruleStack)->patterns->getIterator());
        // References already expanded during this call, keyed by object id. The
        // objects themselves are stored so their ids cannot be reused meanwhile.
        $expanded = [];

        $i = 0;
        while ($i < count($rules)) {
            $rule = $rules[$i];

            if ($rule instanceof Reference) {
                $id = spl_object_id($rule);
                // A Reference object that was already expanded is not expanded again:
                // everything it resolves to is in the list already, and expanding it
                // again would never terminate when its target contains that same
                // Reference object and nothing matches.
                $target = isset($expanded[$id]) ? null : $rule->get();
                if ($target) {
                    $expanded[$id] = $rule;

                    // NOTE(review): PatternList is not among the imports visible at
                    // the top of this file, so it resolves relative to the current
                    // namespace -- confirm that is where the class lives.
                    if ($target instanceof PatternList) {
                        $target = $target->getIterator();
                    } elseif ($target instanceof Grammar) {
                        $target = $target->patterns->getIterator();
                    }

                    // The replacement must be a real array: array_splice() casts a bare
                    // object to an array of its properties. The index is not advanced so
                    // that the first spliced-in rule is processed next.
                    array_splice($rules, $i, 1, $this->toRuleArray($target));
                    continue;
                }

                $i++;
                continue;
            }

            if ($rule instanceof Pattern) {
                $match = $this->getMatch($rule->match, $line, $offset);
                if ($match !== null) {
                    return $this->tokenizeMatch($rule, $match, $line, $offset);
                }
            }

            $i++;
        }

        return [];
    }

    /**
     * Creates the tokens for one successful match of a Pattern.
     *
     * Text between $offset and the start of the match is emitted with the scopes
     * that were active before the rule; the rule's name and contentName are then
     * pushed, its captures are tokenized, any part of the match the captures did
     * not cover is emitted, and finally the rule's own patterns (if any) are
     * tried on what follows. Rule and scope stacks are restored before returning.
     *
     * @param Pattern $rule   The rule that matched.
     * @param array   $match  Result of getMatch() for the rule.
     * @param string  $line   Line being tokenized.
     * @param int     $offset Current byte offset; advanced to at least the end of
     *                        the match.
     *
     * @return array[] List of tokens.
     */
    protected function tokenizeMatch(Pattern $rule, array $match, string $line, int &$offset): array {
        $tokens = [];
        $matchStart = $match[0][1];
        $matchEnd = $matchStart + strlen($match[0][0]);

        // Text in front of the match is not part of the rule, so it is emitted
        // before any of the rule's scopes are pushed.
        if ($matchStart > $offset) {
            $tokens[] = [
                'scopes' => $this->scopeStack,
                'string' => substr($line, $offset, $matchStart - $offset)
            ];
            $offset = $matchStart;
        }

        // Remember the depth so every scope pushed below can be dropped at once.
        $scopeDepth = count($this->scopeStack);
        if ($rule->name !== null) {
            $this->scopeStack[] = $rule->name;
        }
        if ($rule->contentName !== null) {
            $this->scopeStack[] = $rule->contentName;
        }

        if ($rule->captures !== null) {
            foreach ($match as $k => $m) {
                [ $text, $start ] = $m;

                // Skip groups that matched nothing and groups without a capture rule.
                if ($text === '' || !isset($rule->captures[$k])) {
                    continue;
                }

                $capture = $rule->captures[$k];
                $end = $start + strlen($text);

                // Capture 0 is the whole match: its scopes stay on the stack until the
                // rule is finished so they apply to every token created from the match.
                // No token is created here; the other captures and the remainder
                // handling below cover the text.
                if ($k === 0) {
                    if ($capture->name !== null) {
                        $this->scopeStack[] = $capture->name;
                    }
                    if ($capture->contentName !== null) {
                        $this->scopeStack[] = $capture->contentName;
                    }

                    if ($capture->patterns !== null) {
                        $this->ruleStack[] = $capture;
                        $tokens = [ ...$tokens, ...$this->_tokenize($line, $offset) ];
                        array_pop($this->ruleStack);
                    }

                    continue;
                }

                // A capture starting before the offset overlaps text that was already
                // tokenized (e.g. a nested group); skipping it avoids emitting the same
                // text twice.
                if ($start < $offset) {
                    continue;
                }

                // Text between the previous token and this capture.
                if ($start > $offset) {
                    $tokens[] = [
                        'scopes' => $this->scopeStack,
                        'string' => substr($line, $offset, $start - $offset)
                    ];
                    $offset = $start;
                }

                if ($capture->patterns !== null) {
                    // The capture has patterns of its own: make it the current rule, push
                    // its scopes and tokenize with its patterns.
                    $captureDepth = count($this->scopeStack);
                    $this->ruleStack[] = $capture;
                    if ($capture->name !== null) {
                        $this->scopeStack[] = $capture->name;
                    }
                    if ($capture->contentName !== null) {
                        $this->scopeStack[] = $capture->contentName;
                    }

                    $tokens = [ ...$tokens, ...$this->_tokenize($line, $offset) ];

                    // Whatever part of the capture the nested patterns left over still
                    // belongs to the capture.
                    if ($offset < $end) {
                        $tokens[] = [
                            'scopes' => $this->scopeStack,
                            'string' => substr($line, $offset, $end - $offset)
                        ];
                        $offset = $end;
                    }

                    array_splice($this->scopeStack, $captureDepth);
                    array_pop($this->ruleStack);
                } else {
                    $scopes = $this->scopeStack;
                    if ($capture->name !== null) {
                        $scopes[] = $capture->name;
                    }

                    $tokens[] = [
                        'scopes' => $scopes,
                        'string' => $text
                    ];
                    $offset = $end;
                }
            }
        }

        // Emit the part of the match no capture covered (the entire match when the
        // rule has no captures) and make sure the offset ends up behind the match.
        if ($offset < $matchEnd) {
            $tokens[] = [
                'scopes' => $this->scopeStack,
                'string' => substr($line, $offset, $matchEnd - $offset)
            ];
            $offset = $matchEnd;
        }

        // The rule's own patterns are tried with the rule on top of the rule stack
        // so the nested call uses them instead of the enclosing rule list.
        if ($rule->patterns !== null) {
            $this->ruleStack[] = $rule;
            $tokens = [ ...$tokens, ...$this->_tokenize($line, $offset) ];
            array_pop($this->ruleStack);
        }

        // Drop every scope this rule (and its whole-match capture) pushed.
        array_splice($this->scopeStack, $scopeDepth);

        return $tokens;
    }

    /**
     * Normalizes a rule collection to a zero-indexed array.
     *
     * @param mixed $rules An array, a Traversable, or a single rule object.
     *
     * @return array Zero-indexed list of rules.
     */
    protected function toRuleArray($rules): array {
        if (is_array($rules)) {
            return array_values($rules);
        }
        if ($rules instanceof Pattern || $rules instanceof Reference) {
            return [ $rules ];
        }
        if ($rules instanceof \Traversable) {
            return iterator_to_array($rules, false);
        }

        return [ $rules ];
    }
}
Changed Project name to Fukkus 3 years ago			`<?php`
			`/** @license MIT`
			`* Copyright 2021 Dustin Wilson et al.`
Cleaning up a bit 3 years ago			`* See LICENSE file for details */`
Changed Project name to Fukkus 3 years ago
			`declare(strict_types=1);`
Trying to figure out structure 3 years ago			`namespace dW\Lit;`
Tokenizing stuff... maybe? :) 3 years ago			`use dW\Lit\Scope\Parser as ScopeParser;`
			`use dW\Lit\Grammar\{`
			`Pattern,`
			`Reference`
			`};`
Changed Project name to Fukkus 3 years ago
Trying to start code tokenization 3 years ago
Working on Grammars 3 years ago			`class Tokenizer {`
			`protected \Generator $data;`
Changed Pattern to Rule to be consistent with other implementations 3 years ago			`protected Grammar $grammar;`
Trying to start code tokenization 3 years ago			`protected array $ruleStack;`
Tokenizing stuff... maybe? :) 3 years ago			`protected array $scopeStack;`
Reorganized data folder 3 years ago
Many changes • Lines are now converted to UTF-32 while tokenizing so that byte offsets may be cleanly converted to character offsets • Now when grammars are parsed into Grammar objects begin and end matches are converted to regular matches by adding end matches to the pattern's pattern list to simplify tokenization. • Highlight::withFile and Highlight::withString now accept an encoding parameter which defaults to UTF-8. 3 years ago
Reverting to using UTF-8 and preg_match. mb_ereg is garbage 3 years ago			`public function __construct(\Generator $data, Grammar $grammar) {`
Working on Grammars 3 years ago			`$this->data = $data;`
Changed Pattern to Rule to be consistent with other implementations 3 years ago			`$this->grammar = $grammar;`
Trying to start code tokenization 3 years ago			`$this->ruleStack = [ $this->grammar ];`
Tokenizing stuff... maybe? :) 3 years ago			`$this->scopeStack = [ $this->grammar->scopeName ];`
Working on Grammars 3 years ago			`}`

Changed Pattern to Rule to be consistent with other implementations 3 years ago
Working on Grammars 3 years ago			`public function tokenize(): \Generator {`
Breaking Tokenizer 3 years ago			`$appendNewLine = true;`
			`foreach ($this->data as $lineNumber => $inputLine) {`
Many changes • Lines are now converted to UTF-32 while tokenizing so that byte offsets may be cleanly converted to character offsets • Now when grammars are parsed into Grammar objects begin and end matches are converted to regular matches by adding end matches to the pattern's pattern list to simplify tokenization. • Highlight::withFile and Highlight::withString now accept an encoding parameter which defaults to UTF-8. 3 years ago			`yield $lineNumber => $this->_tokenize($inputLine);`
			`/*$line = $inputLine;`
Breaking Tokenizer 3 years ago			`$lineWithNewLine = ($appendNewLine) ? "$line\n" : $line;`
			`$initialStackRuleLength = count($this->ruleStack);`
			`$position = 0;`
			`$tokenCount = 0;`

			`while (true) {`
			`$initialStackRuleLength = count($this->ruleStack);`
			`$previousPosition = $position;`
			`if ($position > mb_strlen($line)) {`
			`break;`
			`}`
Many changes • Lines are now converted to UTF-32 while tokenizing so that byte offsets may be cleanly converted to character offsets • Now when grammars are parsed into Grammar objects begin and end matches are converted to regular matches by adding end matches to the pattern's pattern list to simplify tokenization. • Highlight::withFile and Highlight::withString now accept an encoding parameter which defaults to UTF-8. 3 years ago			`}*/`
Working on Grammars 3 years ago			`}`
			`}`
Tokenizing stuff... maybe? :) 3 years ago

Many changes • Lines are now converted to UTF-32 while tokenizing so that byte offsets may be cleanly converted to character offsets • Now when grammars are parsed into Grammar objects begin and end matches are converted to regular matches by adding end matches to the pattern's pattern list to simplify tokenization. • Highlight::withFile and Highlight::withString now accept an encoding parameter which defaults to UTF-8. 3 years ago			`protected function getMatch(string $regex, string $line, int $offset = 0): ?array {`
Reverting to using UTF-8 and preg_match. mb_ereg is garbage 3 years ago			`if (preg_match($regex, $line, $match, PREG_OFFSET_CAPTURE, $offset) !== 1) {`
Tokenizing stuff... maybe? :) 3 years ago			`return null;`
			`}`

			`return $match;`
			`}`
Setting up Tokenizer for recursion 3 years ago
Tokenization progress 3 years ago			`protected function _tokenize(string $line, int &$offset = 0): array {`
			`$tokens = [];`
Reverting to using UTF-8 and preg_match. mb_ereg is garbage 3 years ago			`$lineLength = strlen($line);`
Setting up Tokenizer for recursion 3 years ago
Tokenization progress 3 years ago			`while (true) {`
			`$currentRules = end($this->ruleStack)->patterns->getIterator();`
			`$currentRulesCount = count($currentRules);`

			`for ($i = 0; $i < $currentRulesCount; $i++) {`
			`while (true) {`
			`$rule = $currentRules[$i];`
			`// If the rule is a Pattern and matches the line at the offset then tokenize the`
			`// matches.`
			`if ($rule instanceof Pattern && $match = $this->getMatch($rule->match, $line, $offset)) {`
			`// First, remove the first entry in the match, the full`
			`// match, leaving only the subpatterns.`
			`//unset($match[0]);`

			`// Add the name and contentName to the scope stack`
			`// if present.`
			`if ($rule->name !== null) {`
			`$this->scopeStack[] = $rule->name;`
			`}`
			`if ($rule->contentName !== null) {`
			`$this->scopeStack[] = $rule->contentName;`
			`}`

			`$wholeMatchCaptureScopeCount = 0;`
			`if ($rule->captures !== null) {`
			`// Iterate through each of the matched subpatterns and create tokens from the`
			`// captures.`
			`foreach ($match as $k => $m) {`
			`if ($m[0] === '') {`
			`continue;`
			`}`

			`// If the subpattern begins after the offset then create a token from the bits`
			`// of the line in-between.`
			`if ($m[1] > $offset) {`
			`$scopeStack = $this->scopeStack;`
			`// If this is the first capture, then the scopes added to the stack need to be`
			`// removed from this token's scope stack as this will grab everything before`
			`// this match began.`
			`if ($k === 0) {`
			`if ($rule->contentName !== null) {`
			`array_pop($scopeStack);`
			`}`
			`if ($rule->name !== null) {`
			`array_pop($scopeStack);`
			`}`
			`}`

			`$tokens[] = [`
			`'scopes' => $scopeStack,`
			`'string' => substr($line, $offset, $m[1])`
			`];`
			`$offset = $m[1];`
			`}`

			`// The first match is the whole match, and if there are captures for it the name`
			`// and contentName should be added to the stack regardless of whether it has`
			`// patterns or not. However, keep count of how many were added to the stack so`
			`// they may be removed when this rule has finished tokenizing.`
			`if ($k === 0) {`
			`if (!isset($rule->captures[0])) {`
			`continue;`
			`}`

			`if ($rule->captures[0]->name !== null) {`
			`$this->scopeStack[] = $rule->captures[0]->name;`
			`$wholeMatchCaptureScopeCount++;`
			`}`
			`if ($rule->captures[0]->contentName !== null) {`
			`$this->scopeStack[] = $rule->captures[0]->contentName;`
			`$wholeMatchCaptureScopeCount++;`
			`}`
			`}`

			`// If the capture rule has patterns of its own then`
			`// those must be matched, too.`
			`if ($rule->captures[$k]->patterns !== null) {`
			`$this->ruleStack[] = $rule->captures[$k];`

			`// The scope stack for the whole match is handled above, so only handle that for`
			`// other captures.`
			`if ($k !== 0) {`
			`if ($rule->captures->name !== null) {`
			`$this->scopeStack[] = $rule->captures[$k]->name;`
			`}`
			`if ($rule->captures->contentName !== null) {`
			`$this->scopeStack[] = $rule->captures[$k]->contentName;`
			`}`
			`}`

			`$tokens = [ ...$tokens, ...$this->_tokenize($line, $offset) ];`

			`// The scope stack for the whole match is handled above, so only handle that for`
			`// other captures.`
			`if ($k !== 0) {`
			`if ($rule->captures[$k]->contentName !== null) {`
			`array_pop($this->scopeStack);`
			`}`
			`if ($rule->captures[$k]->name !== null) {`
			`array_pop($this->scopeStack);`
			`}`
			`}`

			`array_pop($this->ruleStack);`
			`} else {`
			`$tokens[] = [`
			`'scopes' => [ ...$this->scopeStack, $rule->captures[$k]->name ],`
			`'string' => $m[0]`
			`];`
			`}`

			`$offset = $m[1] + strlen($m[0]);`
			`$firstCapture = false;`
Reverting to using UTF-8 and preg_match. mb_ereg is garbage 3 years ago			`}`
Tokenization progress 3 years ago			`}`

			`if ($rule->patterns !== null) {`
			`$tokens = [ ...$tokens, ...$this->_tokenize($line, $offset) ];`
			`}`
Reverting to using UTF-8 and preg_match. mb_ereg is garbage 3 years ago
Tokenization progress 3 years ago			`// Remove the name and contentName from the scope stack if present.`
			`if ($rule->contentName !== null) {`
			`array_pop($this->scopeStack);`
			`}`
			`if ($rule->name !== null) {`
			`array_pop($this->scopeStack);`
			`}`

			`// If the rule has a whole match capture (0) then remove its name and`
			`// contentName, too.`
			`$j = 0;`
			`while ($j++ < $wholeMatchCaptureScopeCount) {`
			`array_pop($this->scopeStack);`
Reverting to using UTF-8 and preg_match. mb_ereg is garbage 3 years ago			`}`

Tokenization progress 3 years ago			`// And remove the rule from the rule stack, too.`
			`array_pop($this->ruleStack);`

Reverting to using UTF-8 and preg_match. mb_ereg is garbage 3 years ago			`echo "\n";`
			`die(var_export($tokens));`
Tokenization progress 3 years ago			`break 2;`
Setting up Tokenizer for recursion 3 years ago			`}`
Tokenization progress 3 years ago			`// Otherwise, if the rule is a Reference then retrieve its patterns, splice into`
			`// the rule list, and reprocess the rule.`
			`elseif ($rule instanceof Reference && $obj = $rule->get()) {`
			`if ($obj instanceof PatternList) {`
			`$obj = $obj->getIterator();`
			`} elseif ($obj instanceof Grammar) {`
			`$obj = $obj->patterns->getIterator();`
			`}`

			`array_splice($currentRules, $i, 1, $obj);`
			`$currentRulesCount = count($currentRules);`
			`continue;`
Setting up Tokenizer for recursion 3 years ago			`}`

Tokenization progress 3 years ago			`break;`
Setting up Tokenizer for recursion 3 years ago			`}`
			`}`
Tokenization progress 3 years ago
			`break;`
Setting up Tokenizer for recursion 3 years ago			`}`
Many changes • Lines are now converted to UTF-32 while tokenizing so that byte offsets may be cleanly converted to character offsets • Now when grammars are parsed into Grammar objects begin and end matches are converted to regular matches by adding end matches to the pattern's pattern list to simplify tokenization. • Highlight::withFile and Highlight::withString now accept an encoding parameter which defaults to UTF-8. 3 years ago
Tokenization progress 3 years ago			`return $tokens;`
Setting up Tokenizer for recursion 3 years ago			`}`
Working on Grammars 3 years ago			`}`