From cba093dd68270bde8cd36f261fbb1aef1b780081 Mon Sep 17 00:00:00 2001
From: Dustin Wilson <dustin@dustinwilson.com>
Date: Sat, 21 Aug 2021 21:27:07 -0500
Subject: [PATCH] Removed getting data from file
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

• Added pattern match anchor support.
• Data is now an instanced class with support only for string input.
• Data now has firstLine, lastLine, and lastLineBeforeFinalNewLine properties to facilitate anchoring.
• Highlight now has a static toDOM method for highlighting to a DOM tree; it replaces the withFile and withString methods, which accepted different kinds of input.
• Tokenizer now outputs newline tokens only when not on the last line.
• Tokenizer now throws out pattern match regexes if their anchors are invalid for the current line.
• Tokenizer now won't mistakenly emit empty string tokens.
---
 lib/Data.php      | 38 +++++++++++++------
 lib/Highlight.php | 14 +++----
 lib/Tokenizer.php | 94 +++++++++++++++++++++++++++++------------------
 3 files changed, 90 insertions(+), 56 deletions(-)

diff --git a/lib/Data.php b/lib/Data.php
index 16e755a..2d78b6a 100644
--- a/lib/Data.php
+++ b/lib/Data.php
@@ -8,22 +8,38 @@ namespace dW\Lit;
 
 
 class Data {
-    public static function fileToGenerator(string $filepath, string $encoding = 'UTF-8'): \Generator {
-        $lineNumber = 0;
-        $fp = fopen($filepath, 'r');
-        try {
-            while ($line = fgets($fp)) {
-                yield ++$lineNumber => $line;
-            }
-        } finally {
-            fclose($fp);
-        }
+    use FauxReadOnly;
+    // True if on the first line
+    protected bool $_firstLine = true;
+    protected \Generator $generator;
+    // True if on the last line.
+    protected bool $_lastLine = false;
+    // Some matches will check for the last line before the final newline, so this
+    // will be true if on the line before the final newline or if on the last line
+    // if there isn't an extra newline at the end of the string.
+    protected bool $_lastLineBeforeFinalNewLine = false;
+
+
+    public function __construct(string $data) {
+        $this->generator = $this->lineGenerator($data);
+    }
+
+
+    public function get(): \Generator {
+        return $this->generator;
     }
 
-    public static function stringToGenerator(string $string, string $encoding = 'UTF-8'): \Generator {
+
+    protected function lineGenerator(string $string): \Generator {
         $string = explode("\n", $string);
+        $lastLineIndex = count($string) - 1;
+        $lastLineBeforeFinalNewLineIndex = ($string[$lastLineIndex] === '') ? $lastLineIndex - 1 : $lastLineIndex;
+
         foreach ($string as $lineNumber => $line) {
+            $this->_lastLine = ($lineNumber === $lastLineIndex);
+            $this->_lastLineBeforeFinalNewLine = ($lineNumber === $lastLineBeforeFinalNewLineIndex);
             yield $lineNumber + 1 => $line;
+            $this->_firstLine = false;
         }
     }
 }
\ No newline at end of file
diff --git a/lib/Highlight.php b/lib/Highlight.php
index 5c7ea3f..4b1cf2f 100644
--- a/lib/Highlight.php
+++ b/lib/Highlight.php
@@ -9,26 +9,22 @@ use dW\Lit\Grammar\Exception;
 
 
 class Highlight {
-    public static function withFile(string $filepath, string $scopeName) {
-        return self::highlight(Data::fileToGenerator($filepath), $scopeName);
-    }
-
-    public static function withString(string $string, string $scopeName) {
-        return self::highlight(Data::stringToGenerator($string), $scopeName);
+    public static function toDOM(string $data, string $scopeName) {
+        self::highlight($data, $scopeName);
     }
 
 
-    protected static function highlight(\Generator $data, string $scopeName) {
+    protected static function highlight(string $data, string $scopeName) {
         $grammar = GrammarRegistry::get($scopeName);
         if ($grammar === false) {
             throw new Exception(Exception::GRAMMAR_MISSING, $scopeName);
         }
 
-        $tokenizer = new Tokenizer($data, $grammar);
+        $tokenizer = new Tokenizer(new Data($data), $grammar);
         $tokenList = $tokenizer->tokenize();
 
         foreach ($tokenList as $lineNumber => $tokens) {
-            if ($lineNumber === 7) {
+            if ($lineNumber === 19) {
                 var_export($tokens);
                 echo "\n";
                 die();
diff --git a/lib/Tokenizer.php b/lib/Tokenizer.php
index c50f226..819ae63 100644
--- a/lib/Tokenizer.php
+++ b/lib/Tokenizer.php
@@ -17,7 +17,7 @@ use dW\Lit\Scope\{
 
 
 class Tokenizer {
-    protected \Generator $data;
+    protected Data $data;
     protected Grammar $grammar;
     protected int $offset = 0;
     protected ?Pattern $activeInjection = null;
@@ -26,8 +26,11 @@ class Tokenizer {
     protected int $debug = 0;
     protected int $debugCount = 0;
 
+    protected const SCOPE_RESOLVE_REGEX = '/\$(\d+)|\${(\d+):\/(downcase|upcase)}/S';
+    protected const ANCHOR_CHECK_REGEX = '/(?<!\\\)\\\([AGzZ])/S';
 
-    public function __construct(\Generator $data, Grammar $grammar) {
+
+    public function __construct(Data $data, Grammar $grammar) {
         $this->data = $data;
         $this->grammar = $grammar;
         $this->ruleStack = [ $this->grammar ];
@@ -36,7 +39,7 @@ class Tokenizer {
 
 
     public function tokenize(): \Generator {
-        foreach ($this->data as $lineNumber => $line) {
+        foreach ($this->data->get() as $lineNumber => $line) {
             $this->debug = $lineNumber;
             $this->debugCount = 0;
             $this->offset = 0;
@@ -46,11 +49,19 @@ class Tokenizer {
 
             // Output a token for everything else contained on the line including the
             // newline or just a newline if there weren't any spare characters left on the
-            // line.
-            $tokens[] = new Token(
-                $this->scopeStack,
-                ($this->offset < $lineLength) ? substr($line, $this->offset, $lineLength - $this->offset) . "\n" : "\n"
-            );
+            // line. If it is the last line, and there's nothing else remaining on the line
+            // then output no additional token.
+            if ($this->offset < $lineLength) {
+                $tokens[] = new Token(
+                    $this->scopeStack,
+                    substr($line, $this->offset, $lineLength - $this->offset) . ((!$this->data->lastLine) ? "\n" : '')
+                );
+            } elseif (!$this->data->lastLine) {
+                $tokens[] = new Token(
+                    $this->scopeStack,
+                    "\n"
+                );
+            }
 
             $this->debugCount++;
 
@@ -60,7 +71,7 @@ class Tokenizer {
 
 
     protected function resolveScopeName(string $scopeName, array $match): string {
-        return preg_replace_callback('/\$(\d+)|\${(\d+):\/(downcase|upcase)}/', function($m) use ($match) {
+        return preg_replace_callback(self::SCOPE_RESOLVE_REGEX, function($m) use($match) {
             $replacement = $match[(int)$m[1]][0] ?? $m[1];
             $command = $m[2] ?? null;
             switch ($command) {
@@ -101,25 +112,42 @@ class Tokenizer {
                 while (true) {
                     $rule = $currentRules[$i];
 
-                    // If the rule is a Pattern and matches the line at the offset then...
-                    if ($rule instanceof Pattern && preg_match($rule->match, $line, $match, PREG_OFFSET_CAPTURE, $this->offset)) {
-                        // If the match's offset is the same as the current offset then it is the
-                        // closest match. There's no need to iterate anymore through the patterns.
-                        if ($match[0][1] === $this->offset) {
-                            $closestMatch = [
-                                'match' => $match,
-                                'pattern' => $rule
-                            ];
-                            break 2;
+                    // If the rule is a Pattern
+                    if ($rule instanceof Pattern) {
+                        // Throw out pattern regexes with anchors that cannot match the current line.
+                        if (preg_match(self::ANCHOR_CHECK_REGEX, $rule->match, $validRegexMatch) === 1) {
+                            if (
+                                // \A anchors match the beginning of the whole string, not just this line
+                                ($validRegexMatch[1] === 'A' && !$this->data->firstLine) ||
+                                // \z anchors match the end of the whole string, not just this line
+                                ($validRegexMatch[1] === 'z' && !$this->data->lastLine) ||
+                                // \Z anchors match the end of the whole string or before the final newline if
+                                // there's a trailing newline in the string
+                                ($validRegexMatch[1] === 'Z' && !$this->data->lastLineBeforeFinalNewLine)
+                            ) {
+                                continue 2;
+                            }
                         }
-                        // Otherwise, if the closest match is currently null or the match's offset is
-                        // less than the closest match's offset then set the match as the closest match
-                        // and continue looking for a closer one.
-                        elseif ($closestMatch === null || $match[0][1] < $closestMatch['match'][0][1]) {
-                            $closestMatch = [
-                                'match' => $match,
-                                'pattern' => $rule
-                            ];
+
+                        if (preg_match($rule->match, "$line\n", $match, PREG_OFFSET_CAPTURE, $this->offset)) {
+                            // If the match's offset is the same as the current offset then it is the
+                            // closest match. There's no need to iterate anymore through the patterns.
+                            if ($match[0][1] === $this->offset) {
+                                $closestMatch = [
+                                    'match' => $match,
+                                    'pattern' => $rule
+                                ];
+                                break 2;
+                            }
+                            // Otherwise, if the closest match is currently null or the match's offset is
+                            // less than the closest match's offset then set the match as the closest match
+                            // and continue looking for a closer one.
+                            elseif ($closestMatch === null || $match[0][1] < $closestMatch['match'][0][1]) {
+                                $closestMatch = [
+                                    'match' => $match,
+                                    'pattern' => $rule
+                                ];
+                            }
                         }
                     }
                     // Otherwise, if the rule is a Reference then retrieve its patterns, splice into
@@ -129,7 +157,7 @@ class Tokenizer {
                             $obj = $obj->patterns;
                         }
 
-                        array_splice($currentRules, $i, 1, $obj);
+                        array_splice($currentRules, $i, 1, ($obj instanceof Pattern) ? [ $obj ] : $obj);
                         $currentRulesCount = count($currentRules);
                         continue;
                     }
@@ -143,12 +171,6 @@ class Tokenizer {
                 $match = $closestMatch['match'];
                 $pattern = $closestMatch['pattern'];
 
-                // **¡TEMPORARY!** Haven't implemented begin and end line
-                // anchors, so let's toss patterns with them completely for now.
-                //if (preg_match('/\\\(?:A|G|Z)/', $rule->match)) {
-                //    continue;
-                //}
-
                 // If the subpattern begins after the offset then create a token from the bits
                 // of the line in-between the last token and the one(s) about to be created.
                 if ($match[0][1] > $this->offset) {
@@ -227,8 +249,8 @@ class Tokenizer {
                     }
                 }
                 // Otherwise, if the rule doesn't have captures then a token is created from the
-                // entire match.
-                else {
+                // entire match, but only if the matched text isn't empty.
+                elseif ($match[0][0] !== '') {
                     $tokens[] = new Token(
                         $this->scopeStack,
                         $match[0][0]