Browse Source

Subpatterns now limited to their parent pattern's length (if necessary)

• Removed Token class in favor of associative arrays in anticipation of 
token manipulation in captures. (ugh)
main
Dustin Wilson 3 years ago
parent
commit
b7e1353821
  1. 19
      lib/Token.php
  2. 82
      lib/Tokenizer.php

19
lib/Token.php

@ -1,19 +0,0 @@
<?php
/** @license MIT
* Copyright 2021 Dustin Wilson et al.
* See LICENSE file for details */
declare(strict_types=1);
namespace dW\Lit;
/**
 * A single token emitted by the tokenizer: a run of text paired with the scope
 * stack that was active when the text was matched.
 *
 * Both values live in underscore-prefixed protected properties.
 * NOTE(review): the FauxReadOnly trait is not visible from here; presumably it
 * exposes these as read-only `scopes` and `text` -- confirm against the trait.
 */
class Token {
    use FauxReadOnly;

    /** Scope stack supplied at construction. */
    protected array $_scopes;

    /** Token text supplied at construction. */
    protected string $_text;

    /**
     * @param array  $scopes Scope stack that applies to this token.
     * @param string $text   The text covered by this token.
     */
    public function __construct(array $scopes, string $text) {
        $this->_scopes = $scopes;
        // Write to the declared backing property. Assigning to `$this->text`
        // would leave the typed `$_text` property uninitialised and put the
        // value on an undeclared property instead.
        $this->_text = $text;
    }
}

82
lib/Tokenizer.php

@ -53,16 +53,16 @@ class Tokenizer {
// line. If it is the last line, and there's nothing else remaining on the line // line. If it is the last line, and there's nothing else remaining on the line
// then output no additional token. // then output no additional token.
if ($this->offset < $lineLength) { if ($this->offset < $lineLength) {
$tokens[] = new Token( $tokens[] = [
$this->scopeStack, 'scopes' => $this->scopeStack,
substr($line, $this->offset, $lineLength - $this->offset) . ((!$this->data->lastLine) ? "\n" : '') 'text' => substr($line, $this->offset, $lineLength - $this->offset) . ((!$this->data->lastLine) ? "\n" : '')
); ];
$this->debugCount++; $this->debugCount++;
} elseif (!$this->data->lastLine) { } elseif (!$this->data->lastLine) {
$tokens[] = new Token( $tokens[] = [
$this->scopeStack, 'scopes' => $this->scopeStack,
"\n" 'text' => "\n"
); ];
$this->debugCount++; $this->debugCount++;
} }
@ -85,9 +85,12 @@ class Tokenizer {
}, $scopeName); }, $scopeName);
} }
protected function tokenizeLine(string $line): array { protected function tokenizeLine(string $line, int $lineLength = 0): array {
$tokens = []; $tokens = [];
$lineLength = strlen($line); // When processing subpatterns a linelength is specified based upon the parent
// match's string length (like with captures), otherwise set the line length to
// the entire line.
$lineLength = ($lineLength === 0) ? strlen($line) : $lineLength;
while (true) { while (true) {
if ($this->activeInjection === null && $this->grammar->injections !== null) { if ($this->activeInjection === null && $this->grammar->injections !== null) {
@ -181,10 +184,10 @@ class Tokenizer {
// If the subpattern begins after the offset then create a token from the bits // If the subpattern begins after the offset then create a token from the bits
// of the line in-between the last token and the one(s) about to be created. // of the line in-between the last token and the one(s) about to be created.
if ($match[0][1] > $this->offset) { if ($match[0][1] > $this->offset) {
$tokens[] = new Token( $tokens[] = [
$this->scopeStack, 'scopes' => $this->scopeStack,
substr($line, $this->offset, $match[0][1] - $this->offset) 'text' => substr($line, $this->offset, $match[0][1] - $this->offset)
); ];
$this->debugCount++; $this->debugCount++;
$this->offset = $match[0][1]; $this->offset = $match[0][1];
} }
@ -205,10 +208,10 @@ class Tokenizer {
// If the capture begins after the offset then create a token from the bits of // If the capture begins after the offset then create a token from the bits of
// the line in-between the last token and the one(s) about to be created. // the line in-between the last token and the one(s) about to be created.
if ($k > 0 && $m[1] > $this->offset) { if ($k > 0 && $m[1] > $this->offset) {
$tokens[] = new Token( $tokens[] = [
$this->scopeStack, 'scopes' => $this->scopeStack,
substr($line, $this->offset, $m[1] - $this->offset) 'text' => substr($line, $this->offset, $m[1] - $this->offset)
); ];
$this->debugCount++; $this->debugCount++;
$this->offset = $m[1]; $this->offset = $m[1];
} }
@ -222,16 +225,18 @@ class Tokenizer {
// process the patterns, and then pop the capture off the stack. // process the patterns, and then pop the capture off the stack.
if ($pattern->captures[$k]->patterns !== null) { if ($pattern->captures[$k]->patterns !== null) {
$this->ruleStack[] = $pattern->captures[$k]; $this->ruleStack[] = $pattern->captures[$k];
$tokens = [ ...$tokens, ...$this->tokenizeLine($line) ]; // Only tokenize the part of the line that contains the match.
$captureLength = $m[1] + strlen($m[0]);
$tokens = [ ...$tokens, ...$this->tokenizeLine($line, $captureLength) ];
// If the offset is before the end of the capture then create a token from the // If the offset is before the end of the capture then create a token from the
// bits of the capture from the offset until the end of the capture. // bits of the capture from the offset until the end of the capture.
$endOffset = $m[1] + strlen($m[0]); $endOffset = $captureLength;
if ($endOffset > $this->offset) { if ($endOffset > $this->offset) {
$tokens[] = new Token( $tokens[] = [
$this->scopeStack, 'scopes' => $this->scopeStack,
substr($line, $this->offset, $endOffset - $this->offset) 'text' => substr($line, $this->offset, $endOffset - $this->offset)
); ];
$this->debugCount++; $this->debugCount++;
$this->offset = $endOffset; $this->offset = $endOffset;
} }
@ -240,10 +245,10 @@ class Tokenizer {
} }
// Otherwise, create a token for the capture. // Otherwise, create a token for the capture.
else { else {
$tokens[] = new Token( $tokens[] = [
$this->scopeStack, 'scopes' => $this->scopeStack,
$m[0] 'text' => $m[0]
); ];
$this->debugCount++; $this->debugCount++;
} }
@ -258,10 +263,10 @@ class Tokenizer {
// Otherwise, if the rule doesn't have captures then a token is created from the // Otherwise, if the rule doesn't have captures then a token is created from the
// entire match, but only if the matched text isn't empty. // entire match, but only if the matched text isn't empty.
elseif ($match[0][0] !== '') { elseif ($match[0][0] !== '') {
$tokens[] = new Token( $tokens[] = [
$this->scopeStack, 'scopes' => $this->scopeStack,
$match[0][0] 'text' => $match[0][0]
); ];
$this->offset = $match[0][1] + strlen($match[0][0]); $this->offset = $match[0][1] + strlen($match[0][0]);
$this->debugCount++; $this->debugCount++;
@ -277,17 +282,20 @@ class Tokenizer {
// If the rule has patterns process tokens from its subpatterns. // If the rule has patterns process tokens from its subpatterns.
if ($pattern->patterns !== null && $this->offset < $lineLength) { if ($pattern->patterns !== null && $this->offset < $lineLength) {
$tokens = [ ...$tokens, ...$this->tokenizeLine($line) ]; // If the pattern has just a regular match (meaning neither a begin nor an end
// pattern) but has subpatterns then only tokenize the part of the line that's
// within the match.
$tokens = [ ...$tokens, ...$this->tokenizeLine($line, (!$pattern->beginPattern && !$pattern->endPattern) ? strlen($match[0][0]) : 0) ];
} }
// If the offset is before the end of the match then create a token from the // If the offset is before the end of the match then create a token from the
// bits of the match from the offset until the end of the match. // bits of the match from the offset until the end of the match.
$endOffset = $match[0][1] + strlen($match[0][0]); $endOffset = $match[0][1] + strlen($match[0][0]);
if ($endOffset > $this->offset) { if ($endOffset > $this->offset) {
$tokens[] = new Token( $tokens[] = [
$this->scopeStack, 'scopes' => $this->scopeStack,
substr($line, $this->offset, $endOffset - $this->offset) 'text' => substr($line, $this->offset, $endOffset - $this->offset)
); ];
$this->debugCount++; $this->debugCount++;
$this->offset = $endOffset; $this->offset = $endOffset;
} }

Loading…
Cancel
Save