Browse Source

Minor optimizations to tokenizer

Dustin Wilson 3 years ago
  1. 25
  2. 8
  3. 50


@ -11,31 +11,32 @@ class Data {
use FauxReadOnly;
// True if on the first line
protected bool $_firstLine = true;
protected \Generator $generator;
// The stored generator
protected \Generator $_generator;
// True if on the last line.
protected bool $_lastLine = false;
// Some matches check for the last line before the final newline, so this is
// true on the line before the final newline or, when the string has no
// trailing newline, on the last line itself.
protected bool $_lastLineBeforeFinalNewLine = false;
// The input string split into an array by newline
protected array $lines = [];
// The number of entries in the lines array
protected int $linesLength = 0;
public function __construct(string $data) {
$this->generator = $this->lineGenerator($data);
$this->lines = explode("\n", $data);
$this->linesLength = count($this->lines);
$this->_generator = $this->lineGenerator();
public function get(): \Generator {
return $this->generator;
protected function lineGenerator(string $string): \Generator {
$string = explode("\n", $string);
$lastLineIndex = count($string) - 1;
$lastLineBeforeFinalNewLineIndex = ($string[$lastLineIndex] === '') ? $lastLineIndex - 1 : $lastLineIndex;
protected function lineGenerator(): \Generator {
$lastLineIndex = $this->linesLength - 1;
$lastLineBeforeFinalNewLineIndex = ($this->lines[$lastLineIndex] === '') ? $lastLineIndex - 1 : $lastLineIndex;
foreach ($string as $lineNumber => $line) {
foreach ($this->lines as $lineNumber => $line) {
$this->_lastLine = ($lineNumber === $lastLineIndex);
$this->_lastLineBeforeFinalNewLine = ($lineNumber === $lastLineBeforeFinalNewLineIndex);
yield $lineNumber + 1 => $line;


@ -29,6 +29,9 @@ class Grammar {
protected ?array $_repository;
protected ?string $_scopeName;
protected const ESCAPE_SLASHES_REGEX = '/(?<!\\\)\//S';
protected const LONG_CHARACTER_CODE_REGEX = '/\\\x\{([0-9A-Fa-f]+)\}/S';
public function __construct(?string $scopeName = null, ?array $patterns = null, ?string $name = null, ?array $injections = null, ?array $repository = null) {
$this->_name = $name;
@ -198,11 +201,12 @@ class Grammar {
$p['beginPattern'] = true;
case 'match':
// Escape forward slashes that aren't escaped in regexes.
$value = preg_replace('/(?<!\\\)\//', '\/', $value);
$value = preg_replace(self::ESCAPE_SLASHES_REGEX, '\/', $value);
// Fix oniguruma long character codes.
$value = preg_replace_callback('/\\\x\{([0-9A-Fa-f]+)\}/', function($matches) {
$value = preg_replace_callback(self::LONG_CHARACTER_CODE_REGEX, function($matches) {
return "\x{" . (((int)base_convert($matches[1], 16, 10) > 0x10ffff) ? '10ffff' : $matches[1]) . "}";
}, $value);
$p['match'] = "/$value/Su";
$modified = true;


@ -43,7 +43,7 @@ class Tokenizer {
public function tokenize(): \Generator {
foreach ($this->data->get() as $lineNumber => $line) {
foreach ($this->data->generator as $lineNumber => $line) {
$this->lineNumber = $lineNumber;
$this->line = $line;
@ -76,7 +76,7 @@ class Tokenizer {
protected function resolveScopeName(string $scopeName, array $match): string {
return preg_replace_callback(self::SCOPE_RESOLVE_REGEX, function($m) use($match) {
$replacement = $match[(int)$m[1]][0] ?? $m[1];
$replacement = trim($match[(int)$m[1]][0] ?? $m[1]);
$command = $m[2] ?? null;
switch ($command) {
case 'downcase': return strtolower($replacement);
@ -88,7 +88,7 @@ class Tokenizer {
}, $scopeName);
protected function tokenizeLine(int $lineLength): array {
protected function tokenizeLine(int $stopOffset): array {
$tokens = [];
while (true) {
@ -117,25 +117,25 @@ class Tokenizer {
// If the rule is a Pattern
if ($rule instanceof Pattern) {
// Throw out pattern regexes with anchors that shouldn't match the current line.
// This is necessary because the tokenizer is fed data line by line and
// therefore anchors that match the beginning of the document and the end won't
// do anything.
if (preg_match(self::ANCHOR_CHECK_REGEX, $rule->match, $validRegexMatch) === 1) {
if (
// \A anchors match the beginning of the whole string, not just this line
($validRegexMatch[1] === 'A' && !$this->data->firstLine) ||
// \z anchors match the end of the whole string, not just this line
($validRegexMatch[1] === 'z' && !$this->data->lastLine) ||
// \Z anchors match the end of the whole string or before the final newline if
// there's a trailing newline in the string
($validRegexMatch[1] === 'Z' && !$this->data->lastLineBeforeFinalNewLine)
) {
if (preg_match($rule->match, $this->line . ((!$this->data->lastLine) ? "\n" : ''), $match, PREG_OFFSET_CAPTURE, $this->offset) === 1) {
// Throw out pattern regexes with anchors that shouldn't match the current line.
// This is necessary because the tokenizer is fed data line by line and
// therefore anchors that match the beginning of the document and the end won't
// do anything.
if (preg_match(
self::ANCHOR_CHECK_REGEX, $rule->match, $validRegexMatch) === 1 && (
// \A anchors match the beginning of the whole string, not just this line
($validRegexMatch[1] === 'A' && !$this->data->firstLine) ||
// \z anchors match the end of the whole string, not just this line
($validRegexMatch[1] === 'z' && !$this->data->lastLine) ||
// \Z anchors match the end of the whole string or before the final newline if
// there's a trailing newline in the string
($validRegexMatch[1] === 'Z' && !$this->data->lastLineBeforeFinalNewLine)
) {
continue 2;
if (preg_match($rule->match, "{$this->line}\n", $match, PREG_OFFSET_CAPTURE, $this->offset)) {
// If the match's offset is the same as the current offset then it is the
// closest match. There's no need to iterate anymore through the patterns.
if ($match[0][1] === $this->offset) {
@ -351,13 +351,13 @@ class Tokenizer {
$this->ruleStack[] = $pattern;
// If the rule has patterns process tokens from its subpatterns.
if ($pattern->patterns !== null && $this->offset < $lineLength) {
if ($pattern->patterns !== null && $this->offset < $stopOffset) {
// If the pattern has just a regular match (meaning neither a begin nor an end
// pattern) but has subpatterns then only tokenize the part of the line that's
// within the match. Otherwise, tokenize up to the line's length. Because of
// recursion, the line length could be set by this step before or within the
// within the match. Otherwise, tokenize up to the stop offset. Because of
// recursion, the stop offset could be set by this step before or within the
// capture tokenization process.
$tokens = [ ...$tokens, ...$this->tokenizeLine((!$pattern->beginPattern && !$pattern->endPattern) ? strlen($match[0][0]) : $lineLength) ];
$tokens = [ ...$tokens, ...$this->tokenizeLine((!$pattern->beginPattern && !$pattern->endPattern) ? strlen($match[0][0]) : $stopOffset) ];
// If the offset is before the end of the match then create a token from the
@ -406,7 +406,7 @@ class Tokenizer {
// If the offset isn't at the end of the line then look for more matches.
if ($this->offset < $lineLength) {
if ($this->offset < $stopOffset) {
@ -419,7 +419,7 @@ class Tokenizer {
$this->ruleStack[] = $injection;
$this->activeInjection = $injection;
if ($this->offset < $lineLength) {
if ($this->offset < $stopOffset) {
continue 2;
