From cba093dd68270bde8cd36f261fbb1aef1b780081 Mon Sep 17 00:00:00 2001 From: Dustin Wilson Date: Sat, 21 Aug 2021 21:27:07 -0500 Subject: [PATCH] Removed getting data from file MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit • Added pattern match anchor support. • Data is now an instanced class with support only for string input. • Data now has firstLine, lastLine, and lastLineBeforeFinalNewLine properties to facilitate anchoring. • Highlight now has a static toDOM method for highlighting to a DOM tree instead of the withFile and withString methods for accepting different kinds of input. • Tokenizer now only outputs newline tokens if not on the last line. • Tokenizer now throws out pattern match regexes if their anchors are invalid for the current line. • Tokenizer now won't mistakenly emit empty string tokens. --- lib/Data.php | 38 +++++++++++++------ lib/Highlight.php | 14 +++---- lib/Tokenizer.php | 94 +++++++++++++++++++++++++++++------------------ 3 files changed, 90 insertions(+), 56 deletions(-) diff --git a/lib/Data.php b/lib/Data.php index 16e755a..2d78b6a 100644 --- a/lib/Data.php +++ b/lib/Data.php @@ -8,22 +8,38 @@ namespace dW\Lit; class Data { - public static function fileToGenerator(string $filepath, string $encoding = 'UTF-8'): \Generator { - $lineNumber = 0; - $fp = fopen($filepath, 'r'); - try { - while ($line = fgets($fp)) { - yield ++$lineNumber => $line; - } - } finally { - fclose($fp); - } + use FauxReadOnly; + // True if on the first line + protected bool $_firstLine = true; + protected \Generator $generator; + // True if on the last line. + protected bool $_lastLine = false; + // Some matches will check for the last line before the final newline, so this + // will be true if on the line before the final newline or if on the last line + // if there isn't an extra newline at the end of the string. 
+ protected bool $_lastLineBeforeFinalNewLine = false; + + + public function __construct(string $data) { + $this->generator = $this->lineGenerator($data); + } + + + public function get(): \Generator { + return $this->generator; } - public static function stringToGenerator(string $string, string $encoding = 'UTF-8'): \Generator { + + protected function lineGenerator(string $string): \Generator { $string = explode("\n", $string); + $lastLineIndex = count($string) - 1; + $lastLineBeforeFinalNewLineIndex = ($string[$lastLineIndex] === '') ? $lastLineIndex - 1 : $lastLineIndex; + foreach ($string as $lineNumber => $line) { + $this->_lastLine = ($lineNumber === $lastLineIndex); + $this->_lastLineBeforeFinalNewLine = ($lineNumber === $lastLineBeforeFinalNewLineIndex); yield $lineNumber + 1 => $line; + $this->_firstLine = false; } } } \ No newline at end of file diff --git a/lib/Highlight.php b/lib/Highlight.php index 5c7ea3f..4b1cf2f 100644 --- a/lib/Highlight.php +++ b/lib/Highlight.php @@ -9,26 +9,22 @@ use dW\Lit\Grammar\Exception; class Highlight { - public static function withFile(string $filepath, string $scopeName) { - return self::highlight(Data::fileToGenerator($filepath), $scopeName); - } - - public static function withString(string $string, string $scopeName) { - return self::highlight(Data::stringToGenerator($string), $scopeName); + public static function toDOM(string $data, string $scopeName) { + self::highlight($data, $scopeName); } - protected static function highlight(\Generator $data, string $scopeName) { + protected static function highlight(string $data, string $scopeName) { $grammar = GrammarRegistry::get($scopeName); if ($grammar === false) { throw new Exception(Exception::GRAMMAR_MISSING, $scopeName); } - $tokenizer = new Tokenizer($data, $grammar); + $tokenizer = new Tokenizer(new Data($data), $grammar); $tokenList = $tokenizer->tokenize(); foreach ($tokenList as $lineNumber => $tokens) { - if ($lineNumber === 7) { + if ($lineNumber === 19) { 
var_export($tokens); echo "\n"; die(); diff --git a/lib/Tokenizer.php b/lib/Tokenizer.php index c50f226..819ae63 100644 --- a/lib/Tokenizer.php +++ b/lib/Tokenizer.php @@ -17,7 +17,7 @@ use dW\Lit\Scope\{ class Tokenizer { - protected \Generator $data; + protected Data $data; protected Grammar $grammar; protected int $offset = 0; protected ?Pattern $activeInjection = null; @@ -26,8 +26,11 @@ class Tokenizer { protected int $debug = 0; protected int $debugCount = 0; + protected const SCOPE_RESOLVE_REGEX = '/\$(\d+)|\${(\d+):\/(downcase|upcase)}/S'; + protected const ANCHOR_CHECK_REGEX = '/(?data = $data; $this->grammar = $grammar; $this->ruleStack = [ $this->grammar ]; @@ -36,7 +39,7 @@ class Tokenizer { public function tokenize(): \Generator { - foreach ($this->data as $lineNumber => $line) { + foreach ($this->data->get() as $lineNumber => $line) { $this->debug = $lineNumber; $this->debugCount = 0; $this->offset = 0; @@ -46,11 +49,19 @@ class Tokenizer { // Output a token for everything else contained on the line including the // newline or just a newline if there weren't any spare characters left on the - // line. - $tokens[] = new Token( - $this->scopeStack, - ($this->offset < $lineLength) ? substr($line, $this->offset, $lineLength - $this->offset) . "\n" : "\n" - ); + // line. If it is the last line, and there's nothing else remaining on the line + // then output no additional token. + if ($this->offset < $lineLength) { + $tokens[] = new Token( + $this->scopeStack, + substr($line, $this->offset, $lineLength - $this->offset) . ((!$this->data->lastLine) ? 
"\n" : '') + ); + } elseif (!$this->data->lastLine) { + $tokens[] = new Token( + $this->scopeStack, + "\n" + ); + } $this->debugCount++; @@ -60,7 +71,7 @@ class Tokenizer { protected function resolveScopeName(string $scopeName, array $match): string { - return preg_replace_callback('/\$(\d+)|\${(\d+):\/(downcase|upcase)}/', function($m) use ($match) { + return preg_replace_callback(self::SCOPE_RESOLVE_REGEX, function($m) use($match) { $replacement = $match[(int)$m[1]][0] ?? $m[1]; $command = $m[2] ?? null; switch ($command) { @@ -101,25 +112,42 @@ class Tokenizer { while (true) { $rule = $currentRules[$i]; - // If the rule is a Pattern and matches the line at the offset then... - if ($rule instanceof Pattern && preg_match($rule->match, $line, $match, PREG_OFFSET_CAPTURE, $this->offset)) { - // If the match's offset is the same as the current offset then it is the - // closest match. There's no need to iterate anymore through the patterns. - if ($match[0][1] === $this->offset) { - $closestMatch = [ - 'match' => $match, - 'pattern' => $rule - ]; - break 2; + // If the rule is a Pattern + if ($rule instanceof Pattern) { + // Throw out pattern regexes with anchors that cannot match the current line. 
+ if (preg_match(self::ANCHOR_CHECK_REGEX, $rule->match, $validRegexMatch) === 1) { + if ( + // \A anchors match the beginning of the whole string, not just this line + ($validRegexMatch[1] === 'A' && !$this->data->firstLine) || + // \z anchors match the end of the whole string, not just this line + ($validRegexMatch[1] === 'z' && !$this->data->lastLine) || + // \Z anchors match the end of the whole string or before the final newline if + // there's a trailing newline in the string + ($validRegexMatch[1] === 'Z' && !$this->data->lastLineBeforeFinalNewLine) + ) { + continue 2; + } } - // Otherwise, if the closest match is currently null or the match's offset is - // less than the closest match's offset then set the match as the closest match - // and continue looking for a closer one. - elseif ($closestMatch === null || $match[0][1] < $closestMatch['match'][0][1]) { - $closestMatch = [ - 'match' => $match, - 'pattern' => $rule - ]; + + if (preg_match($rule->match, "$line\n", $match, PREG_OFFSET_CAPTURE, $this->offset)) { + // If the match's offset is the same as the current offset then it is the + // closest match. There's no need to iterate anymore through the patterns. + if ($match[0][1] === $this->offset) { + $closestMatch = [ + 'match' => $match, + 'pattern' => $rule + ]; + break 2; + } + // Otherwise, if the closest match is currently null or the match's offset is + // less than the closest match's offset then set the match as the closest match + // and continue looking for a closer one. + elseif ($closestMatch === null || $match[0][1] < $closestMatch['match'][0][1]) { + $closestMatch = [ + 'match' => $match, + 'pattern' => $rule + ]; + } } } // Otherwise, if the rule is a Reference then retrieve its patterns, splice into @@ -129,7 +157,7 @@ class Tokenizer { $obj = $obj->patterns; } - array_splice($currentRules, $i, 1, $obj); + array_splice($currentRules, $i, 1, ($obj instanceof Pattern) ? 
[ $obj ] : $obj); $currentRulesCount = count($currentRules); continue; } @@ -143,12 +171,6 @@ class Tokenizer { $match = $closestMatch['match']; $pattern = $closestMatch['pattern']; - // **¡TEMPORARY!** Haven't implemented begin and end line - // anchors, so let's toss patterns with them completely for now. - //if (preg_match('/\\\(?:A|G|Z)/', $rule->match)) { - // continue; - //} - // If the subpattern begins after the offset then create a token from the bits // of the line in-between the last token and the one(s) about to be created. if ($match[0][1] > $this->offset) { @@ -227,8 +249,8 @@ class Tokenizer { } } // Otherwise, if the rule doesn't have captures then a token is created from the - // entire match. - else { + // entire match, but only if the matched text isn't empty. + elseif ($match[0][0] !== '') { $tokens[] = new Token( $this->scopeStack, $match[0][0]