Removed support for getting data from a file

• Added pattern match anchor support.
• Data is now an instantiated class that supports only string input.
• Data now has firstLine, lastLine, and lastLineBeforeFinalNewLine properties to facilitate anchoring.
• Highlight now has a static toDOM method for highlighting to a DOM tree instead of the withFile and withString methods for accepting different kinds of input.
• Tokenizer now only outputs a newline token if the current line is not the last line.
• Tokenizer now throws out pattern match regexes if their anchors are invalid for the current line.
• Tokenizer now won't mistakenly emit empty string tokens.
3 years ago · cba093dd68
3 changed files with 90 additions and 56 deletions
--- a/lib/Data.php
+++ b/lib/Data.php
@ -8,22 +8,38 @@ namespace dW\Lit;


 class Data {
-    public static function fileToGenerator(string $filepath, string $encoding = 'UTF-8'): \Generator {
-        $lineNumber = 0;
-        $fp = fopen($filepath, 'r');
-        try {
-            while ($line = fgets($fp)) {
-                yield ++$lineNumber => $line;
-            }
-        } finally {
-            fclose($fp);
-        }
+    use FauxReadOnly;
+    // True if on the first line
+    protected bool $_firstLine = true;
+    protected \Generator $generator;
+    // True if on the last line.
+    protected bool $_lastLine = false;
+    // Some matches will check for the last line before the final newline, so this
+    // will be true if on the line before the final newline or if on the last line
+    // if there isn't an extra newline at the end of the string.
+    protected bool $_lastLineBeforeFinalNewLine = false;
+
+
+    public function __construct(string $data) {
+        $this->generator = $this->lineGenerator($data);
+    }
+
+
+    public function get(): \Generator {
+        return $this->generator;
    }

-    public static function stringToGenerator(string $string, string $encoding = 'UTF-8'): \Generator {
+
+    protected function lineGenerator(string $string): \Generator {
        $string = explode("\n", $string);
+        $lastLineIndex = count($string) - 1;
+        $lastLineBeforeFinalNewLineIndex = ($string[$lastLineIndex] === '') ? $lastLineIndex - 1 : $lastLineIndex;
+
        foreach ($string as $lineNumber => $line) {
+            $this->_lastLine = ($lineNumber === $lastLineIndex);
+            $this->_lastLineBeforeFinalNewLine = ($lineNumber === $lastLineBeforeFinalNewLineIndex);
            yield $lineNumber + 1 => $line;
+            $this->_firstLine = false;
        }
    }
 }
--- a/lib/Highlight.php
+++ b/lib/Highlight.php
@ -9,26 +9,22 @@ use dW\Lit\Grammar\Exception;


 class Highlight {
-    public static function withFile(string $filepath, string $scopeName) {
-        return self::highlight(Data::fileToGenerator($filepath), $scopeName);
-    }
-
-    public static function withString(string $string, string $scopeName) {
-        return self::highlight(Data::stringToGenerator($string), $scopeName);
+    public static function toDOM(string $data, string $scopeName) {
+        self::highlight($data, $scopeName);
    }


-    protected static function highlight(\Generator $data, string $scopeName) {
+    protected static function highlight(string $data, string $scopeName) {
        $grammar = GrammarRegistry::get($scopeName);
        if ($grammar === false) {
            throw new Exception(Exception::GRAMMAR_MISSING, $scopeName);
        }

-        $tokenizer = new Tokenizer($data, $grammar);
+        $tokenizer = new Tokenizer(new Data($data), $grammar);
        $tokenList = $tokenizer->tokenize();

        foreach ($tokenList as $lineNumber => $tokens) {
-            if ($lineNumber === 7) {
+            if ($lineNumber === 19) {
                var_export($tokens);
                echo "\n";
                die();
--- a/lib/Tokenizer.php
+++ b/lib/Tokenizer.php
@ -17,7 +17,7 @@ use dW\Lit\Scope\{


 class Tokenizer {
-    protected \Generator $data;
+    protected Data $data;
    protected Grammar $grammar;
    protected int $offset = 0;
    protected ?Pattern $activeInjection = null;
@ -26,8 +26,11 @@ class Tokenizer {
    protected int $debug = 0;
    protected int $debugCount = 0;

+    protected const SCOPE_RESOLVE_REGEX = '/\$(\d+)|\${(\d+):\/(downcase|upcase)}/S';
+    protected const ANCHOR_CHECK_REGEX = '/(?<!\\\)\\\([AGzZ])/S';

-    public function __construct(\Generator $data, Grammar $grammar) {
+
+    public function __construct(Data $data, Grammar $grammar) {
        $this->data = $data;
        $this->grammar = $grammar;
        $this->ruleStack = [ $this->grammar ];
@ -36,7 +39,7 @@ class Tokenizer {


    public function tokenize(): \Generator {
-        foreach ($this->data as $lineNumber => $line) {
+        foreach ($this->data->get() as $lineNumber => $line) {
            $this->debug = $lineNumber;
            $this->debugCount = 0;
            $this->offset = 0;
@ -46,11 +49,19 @@ class Tokenizer {

            // Output a token for everything else contained on the line including the
            // newline or just a newline if there weren't any spare characters left on the
-            // line.
-            $tokens[] = new Token(
-                $this->scopeStack,
-                ($this->offset < $lineLength) ? substr($line, $this->offset, $lineLength - $this->offset) . "\n" : "\n"
-            );
+            // line. If it is the last line, and there's nothing else remaining on the line
+            // then output no additional token.
+            if ($this->offset < $lineLength) {
+                $tokens[] = new Token(
+                    $this->scopeStack,
+                    substr($line, $this->offset, $lineLength - $this->offset) . ((!$this->data->lastLine) ? "\n" : '')
+                );
+            } elseif (!$this->data->lastLine) {
+                $tokens[] = new Token(
+                    $this->scopeStack,
+                    "\n"
+                );
+            }

            $this->debugCount++;

@ -60,7 +71,7 @@ class Tokenizer {


    protected function resolveScopeName(string $scopeName, array $match): string {
-        return preg_replace_callback('/\$(\d+)|\${(\d+):\/(downcase|upcase)}/', function($m) use ($match) {
+        return preg_replace_callback(self::SCOPE_RESOLVE_REGEX, function($m) use($match) {
            $replacement = $match[(int)$m[1]][0] ?? $m[1];
            $command = $m[2] ?? null;
            switch ($command) {
@ -101,25 +112,42 @@ class Tokenizer {
                while (true) {
                    $rule = $currentRules[$i];

-                    // If the rule is a Pattern and matches the line at the offset then...
-                    if ($rule instanceof Pattern && preg_match($rule->match, $line, $match, PREG_OFFSET_CAPTURE, $this->offset)) {
-                        // If the match's offset is the same as the current offset then it is the
-                        // closest match. There's no need to iterate anymore through the patterns.
-                        if ($match[0][1] === $this->offset) {
-                            $closestMatch = [
-                                'match' => $match,
-                                'pattern' => $rule
-                            ];
-                            break 2;
+                    // If the rule is a Pattern
+                    if ($rule instanceof Pattern) {
+                        // Throw out pattern regexes with anchors that cannot match the current line.
+                        if (preg_match(self::ANCHOR_CHECK_REGEX, $rule->match, $validRegexMatch) === 1) {
+                            if (
+                                // \A anchors match the beginning of the whole string, not just this line
+                                ($validRegexMatch[1] === 'A' && !$this->data->firstLine) ||
+                                // \z anchors match the end of the whole string, not just this line
+                                ($validRegexMatch[1] === 'z' && !$this->data->lastLine) ||
+                                // \Z anchors match the end of the whole string or before the final newline if
+                                // there's a trailing newline in the string
+                                ($validRegexMatch[1] === 'Z' && !$this->data->lastLineBeforeFinalNewLine)
+                            ) {
+                                continue 2;
+                            }
                        }
-                        // Otherwise, if the closest match is currently null or the match's offset is
-                        // less than the closest match's offset then set the match as the closest match
-                        // and continue looking for a closer one.
-                        elseif ($closestMatch === null || $match[0][1] < $closestMatch['match'][0][1]) {
-                            $closestMatch = [
-                                'match' => $match,
-                                'pattern' => $rule
-                            ];
+
+                        if (preg_match($rule->match, "$line\n", $match, PREG_OFFSET_CAPTURE, $this->offset)) {
+                            // If the match's offset is the same as the current offset then it is the
+                            // closest match. There's no need to iterate anymore through the patterns.
+                            if ($match[0][1] === $this->offset) {
+                                $closestMatch = [
+                                    'match' => $match,
+                                    'pattern' => $rule
+                                ];
+                                break 2;
+                            }
+                            // Otherwise, if the closest match is currently null or the match's offset is
+                            // less than the closest match's offset then set the match as the closest match
+                            // and continue looking for a closer one.
+                            elseif ($closestMatch === null || $match[0][1] < $closestMatch['match'][0][1]) {
+                                $closestMatch = [
+                                    'match' => $match,
+                                    'pattern' => $rule
+                                ];
+                            }
                        }
                    }
                    // Otherwise, if the rule is a Reference then retrieve its patterns, splice into
@ -129,7 +157,7 @@ class Tokenizer {
                            $obj = $obj->patterns;
                        }

-                        array_splice($currentRules, $i, 1, $obj);
+                        array_splice($currentRules, $i, 1, ($obj instanceof Pattern) ? [ $obj ] : $obj);
                        $currentRulesCount = count($currentRules);
                        continue;
                    }
@ -143,12 +171,6 @@ class Tokenizer {
                $match = $closestMatch['match'];
                $pattern = $closestMatch['pattern'];

-                // **¡TEMPORARY!** Haven't implemented begin and end line
-                // anchors, so let's toss patterns with them completely for now.
-                //if (preg_match('/\\\(?:A|G|Z)/', $rule->match)) {
-                //    continue;
-                //}
-
                // If the subpattern begins after the offset then create a token from the bits
                // of the line in-between the last token and the one(s) about to be created.
                if ($match[0][1] > $this->offset) {
@ -227,8 +249,8 @@ class Tokenizer {
                    }
                }
                // Otherwise, if the rule doesn't have captures then a token is created from the
-                // entire match.
-                else {
+                // entire match, but only if the matched text isn't empty.
+                elseif ($match[0][0] !== '') {
                    $tokens[] = new Token(
                        $this->scopeStack,
                        $match[0][0]