From b7e1353821fea8286a41eeef42d99b5615e95d88 Mon Sep 17 00:00:00 2001
From: Dustin Wilson <dustin@dustinwilson.com>
Date: Mon, 23 Aug 2021 09:36:46 -0500
Subject: [PATCH] Subpatterns now limited to their parent pattern's length (if
 necessary)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

• Removed Token class in favor of associative arrays in anticipation of
token manipulation in captures. (ugh)
---
 lib/Token.php     | 19 -----------
 lib/Tokenizer.php | 82 ++++++++++++++++++++++++++---------------------
 2 files changed, 45 insertions(+), 56 deletions(-)
 delete mode 100644 lib/Token.php

diff --git a/lib/Token.php b/lib/Token.php
deleted file mode 100644
index 83518ef..0000000
--- a/lib/Token.php
+++ /dev/null
@@ -1,19 +0,0 @@
-<?php
-/** @license MIT
- * Copyright 2021 Dustin Wilson et al.
- * See LICENSE file for details */
-
-declare(strict_types=1);
-namespace dW\Lit;
-
-class Token {
-    use FauxReadOnly;
-    protected array $_scopes;
-    protected string $_text;
-
-
-    public function __construct(array $scopes, string $text) {
-        $this->_scopes = $scopes;
-        $this->text = $text;
-    }
-}
\ No newline at end of file
diff --git a/lib/Tokenizer.php b/lib/Tokenizer.php
index 44e86a4..7c9623e 100644
--- a/lib/Tokenizer.php
+++ b/lib/Tokenizer.php
@@ -53,16 +53,16 @@ class Tokenizer {
             // line. If it is the last line, and there's nothing else remaining on the line
             // then output no additional token.
             if ($this->offset < $lineLength) {
-                $tokens[] = new Token(
-                    $this->scopeStack,
-                    substr($line, $this->offset, $lineLength - $this->offset) . ((!$this->data->lastLine) ? "\n" : '')
-                );
+                $tokens[] = [
+                    'scopes' => $this->scopeStack,
+                    'text' => substr($line, $this->offset, $lineLength - $this->offset) . ((!$this->data->lastLine) ? "\n" : '')
+                ];
                 $this->debugCount++;
             } elseif (!$this->data->lastLine) {
-                $tokens[] = new Token(
-                    $this->scopeStack,
-                    "\n"
-                );
+                $tokens[] = [
+                    'scopes' => $this->scopeStack,
+                    'text' => "\n"
+                ];
                 $this->debugCount++;
             }
 
@@ -85,9 +85,12 @@ class Tokenizer {
         }, $scopeName);
     }
 
-    protected function tokenizeLine(string $line): array {
+    protected function tokenizeLine(string $line, int $lineLength = 0): array {
         $tokens = [];
-        $lineLength = strlen($line);
+        // When processing subpatterns a line length is specified based upon the parent
+        // match's string length (like with captures); otherwise (when 0 is passed) the
+        // line length is set to the length of the entire line.
+        $lineLength = ($lineLength === 0) ? strlen($line) : $lineLength;
 
         while (true) {
             if ($this->activeInjection === null && $this->grammar->injections !== null) {
@@ -181,10 +184,10 @@ class Tokenizer {
                 // If the subpattern begins after the offset then create a token from the bits
                 // of the line in-between the last token and the one(s) about to be created.
                 if ($match[0][1] > $this->offset) {
-                    $tokens[] = new Token(
-                        $this->scopeStack,
-                        substr($line, $this->offset, $match[0][1] - $this->offset)
-                    );
+                    $tokens[] = [
+                        'scopes' => $this->scopeStack,
+                        'text' => substr($line, $this->offset, $match[0][1] - $this->offset)
+                    ];
                     $this->debugCount++;
                     $this->offset = $match[0][1];
                 }
@@ -205,10 +208,10 @@ class Tokenizer {
                         // If the capture begins after the offset then create a token from the bits of
                         // the line in-between the last token and the one(s) about to be created.
                         if ($k > 0 && $m[1] > $this->offset) {
-                            $tokens[] = new Token(
-                                $this->scopeStack,
-                                substr($line, $this->offset, $m[1] - $this->offset)
-                            );
+                            $tokens[] = [
+                                'scopes' => $this->scopeStack,
+                                'text' => substr($line, $this->offset, $m[1] - $this->offset)
+                            ];
                             $this->debugCount++;
                             $this->offset = $m[1];
                         }
@@ -222,16 +225,18 @@ class Tokenizer {
                         // process the patterns, and then pop the capture off the stack.
                         if ($pattern->captures[$k]->patterns !== null) {
                             $this->ruleStack[] = $pattern->captures[$k];
-                            $tokens = [ ...$tokens, ...$this->tokenizeLine($line) ];
+                            // Only tokenize the part of the line that contains the match.
+                            $captureLength = $m[1] + strlen($m[0]);
+                            $tokens = [ ...$tokens, ...$this->tokenizeLine($line, $captureLength) ];
 
                             // If the offset is before the end of the capture then create a token from the
                             // bits of the capture from the offset until the end of the capture.
-                            $endOffset = $m[1] + strlen($m[0]);
+                            $endOffset = $captureLength;
                             if ($endOffset > $this->offset) {
-                                $tokens[] = new Token(
-                                    $this->scopeStack,
-                                    substr($line, $this->offset, $endOffset - $this->offset)
-                                );
+                                $tokens[] = [
+                                    'scopes' => $this->scopeStack,
+                                    'text' => substr($line, $this->offset, $endOffset - $this->offset)
+                                ];
                                 $this->debugCount++;
                                 $this->offset = $endOffset;
                             }
@@ -240,10 +245,10 @@ class Tokenizer {
                         }
                         // Otherwise, create a token for the capture.
                         else {
-                            $tokens[] = new Token(
-                                $this->scopeStack,
-                                $m[0]
-                            );
+                            $tokens[] = [
+                                'scopes' => $this->scopeStack,
+                                'text' => $m[0]
+                            ];
                             $this->debugCount++;
                         }
 
@@ -258,10 +263,10 @@ class Tokenizer {
                 // Otherwise, if the rule doesn't have captures then a token is created from the
                 // entire match, but only if the matched text isn't empty.
                 elseif ($match[0][0] !== '') {
-                    $tokens[] = new Token(
-                        $this->scopeStack,
-                        $match[0][0]
-                    );
+                    $tokens[] = [
+                        'scopes' => $this->scopeStack,
+                        'text' => $match[0][0]
+                    ];
 
                     $this->offset = $match[0][1] + strlen($match[0][0]);
                     $this->debugCount++;
@@ -277,17 +282,20 @@ class Tokenizer {
 
                 // If the rule has patterns process tokens from its subpatterns.
                 if ($pattern->patterns !== null && $this->offset < $lineLength) {
-                    $tokens = [ ...$tokens, ...$this->tokenizeLine($line) ];
+                    // A pattern with only a regular match (neither begin nor end) but with
+                    // subpatterns is tokenized only within the match. NOTE(review): this passes the
+                    // match length; the capture path passes an end offset -- confirm intent.
+                    $tokens = [ ...$tokens, ...$this->tokenizeLine($line, (!$pattern->beginPattern && !$pattern->endPattern) ? strlen($match[0][0]) : 0) ];
                 }
 
                 // If the offset is before the end of the match then create a token from the
                 // bits of the match from the offset until the end of the match.
                 $endOffset = $match[0][1] + strlen($match[0][0]);
                 if ($endOffset > $this->offset) {
-                    $tokens[] = new Token(
-                        $this->scopeStack,
-                        substr($line, $this->offset, $endOffset - $this->offset)
-                    );
+                    $tokens[] = [
+                        'scopes' => $this->scopeStack,
+                        'text' => substr($line, $this->offset, $endOffset - $this->offset)
+                    ];
                     $this->debugCount++;
                     $this->offset = $endOffset;
                 }