data = $data;
        $this->encoding = $encoding;
        $this->grammar = $grammar;
        $this->ruleStack = [ $this->grammar ];
        $this->scopeStack = [ $this->grammar->scopeName ];

        if ($this->grammar->contentScopeName !== null) {
            $this->scopeStack[] = $this->grammar->contentScopeName;
        }
    }

    /**
     * Lazily tokenizes the input data one line at a time.
     *
     * @return \Generator Yields, for every entry of the input data, its key
     *                    mapped to the result of _tokenize() for that line.
     */
    public function tokenize(): \Generator
    {
        foreach ($this->data as $lineNumber => $inputLine) {
            yield $lineNumber => $this->_tokenize($inputLine);
        }
    }

    /**
     * Runs a single regular expression against a line using mbstring's regex
     * search functions.
     *
     * @param string $regex  The pattern to search for.
     * @param string $line   The subject string.
     * @param int    $offset Character offset at which the search starts.
     *
     * @return array|null The captured groups as returned by
     *                    mb_ereg_search_getregs(), converted to $this->encoding,
     *                    plus an 'offset' entry of the form
     *                    ['start' => int, 'end' => int] (character offsets);
     *                    null when nothing matched.
     */
    protected function getMatch(string $regex, string $line, int $offset = 0): ?array
    {
        // Using mbstring's regular expressions because it truly supports multibyte
        // strings but also because the original implementation used Oniguruma.
        // NOTE(review): the offset arithmetic below assumes $line is UTF-32 and
        // that mbstring's regex encoding has been set to match -- confirm at the
        // call sites.
        mb_ereg_search_init($line, mb_convert_encoding($regex, 'UTF-32'));

        if ($offset !== 0) {
            // UTF-32 uses 4 bytes for every character; multiply by 4 to convert from
            // character offset to byte offset.
            mb_ereg_search_setpos($offset * 4);
        }

        $pos = mb_ereg_search_pos();
        if ($pos === false) {
            return null;
        }

        // UTF-32 uses 4 bytes for every character; divide by 4 to get character
        // offsets. intdiv() keeps the results integers so they can be fed back
        // into the int $offset parameter.
        [$byteStart, $byteLength] = $pos;
        $start = intdiv($byteStart, 4);
        $offsets = [
            'start' => $start,
            'end' => $start + intdiv($byteLength, 4),
        ];

        $match = mb_ereg_search_getregs();
        if ($match === false) {
            return null;
        }

        // Convert the matches back to the original encoding.
        foreach ($match as &$m) {
            $m = mb_convert_encoding($m, $this->encoding, 'UTF-32');
        }
        // Break the reference so later writes to $m cannot alter the last group.
        unset($m);

        $match['offset'] = $offsets;

        return $match;
    }

    /**
     * Applies the rules on top of the rule stack to a single line.
     *
     * References are expanded in place: a reference that resolves to a
     * PatternList or a Grammar is replaced by that object's patterns, anything
     * else it resolves to replaces the reference as a single rule.
     *
     * NOTE(review): scope/token assembly is not implemented yet. The raw
     * matches are returned so the declared array return type is honoured;
     * previously the input string was returned, which raised a TypeError.
     *
     * @param string $inputLine The line to tokenize.
     * @param int    $offset    Character offset at which matching starts.
     *
     * @return array List of match arrays (see getMatch()) in the order found.
     */
    protected function _tokenize(string $inputLine, int $offset = 0): array
    {
        // count(), integer indexing and array_splice() below all need a real,
        // zero-indexed array, so every rule collection is normalised first.
        $toList = static function ($rules): array {
            return is_array($rules) ? array_values($rules) : iterator_to_array($rules, false);
        };

        $currentRules = $toList(end($this->ruleStack)->patterns->getIterator());
        $currentRulesCount = count($currentRules);
        $results = [];

        for ($i = 0; $i < $currentRulesCount; $i++) {
            // The bound protects against a reference that expanded to nothing
            // while sitting at the end of the list.
            while ($i < $currentRulesCount) {
                $rule = $currentRules[$i];

                if ($rule instanceof Pattern) {
                    $match = $this->getMatch($rule->match, $inputLine, $offset);
                    if ($match !== null) {
                        // Continue after the text that was just consumed.
                        $offset = $match['offset']['end'];
                        $results[] = $match;
                    }
                } elseif ($rule instanceof Reference && $obj = $rule->get()) {
                    if ($obj instanceof PatternList) {
                        $replacement = $toList($obj->getIterator());
                    } elseif ($obj instanceof Grammar) {
                        $replacement = $toList($obj->patterns->getIterator());
                    } else {
                        // array_splice() casts a non-array replacement with
                        // (array), which would explode an object into its
                        // properties; wrap it to insert it as one rule.
                        $replacement = [ $obj ];
                    }

                    array_splice($currentRules, $i, 1, $replacement);
                    $currentRulesCount = count($currentRules);

                    // Re-examine the same index: it now holds the first
                    // expanded rule.
                    continue;
                }

                break;
            }
        }

        return $results;
    }
}