diff --git a/lib/Grammar.php b/lib/Grammar.php index fd9173f..d7cf970 100644 --- a/lib/Grammar.php +++ b/lib/Grammar.php @@ -148,6 +148,7 @@ class Grammar { 'match' => null, 'patterns' => null, 'captures' => null, + 'beginPattern' => false, 'endPattern' => (isset($pattern['endPattern']) && $pattern['endPattern']) ]; @@ -238,6 +239,7 @@ class Grammar { $modified = true; break; case 'begin': + $p['beginPattern'] = true; case 'match': $value = str_replace('/', '\/', $value); $value = preg_replace_callback('/\\\(x|o)\{([0-9a-fA-F]{5,})\}/', function($matches) { diff --git a/lib/Grammar/Pattern.php b/lib/Grammar/Pattern.php index 3afe0df..955edb0 100644 --- a/lib/Grammar/Pattern.php +++ b/lib/Grammar/Pattern.php @@ -10,6 +10,7 @@ use dW\Lit\Grammar; /** Contains patterns responsible for matching a portion of the document */ class Pattern extends Rule { + protected bool $_beginPattern = false; protected ?array $_captures; protected ?string $_contentName; protected bool $_endPattern = false; @@ -19,7 +20,8 @@ class Pattern extends Rule { protected ?array $_patterns; - public function __construct(Grammar $ownerGrammar, ?string $name = null, ?string $contentName = null, ?string $match = null, ?array $patterns = null, ?array $captures = null, bool $endPattern = false) { + public function __construct(Grammar $ownerGrammar, ?string $name = null, ?string $contentName = null, ?string $match = null, ?array $patterns = null, ?array $captures = null, bool $beginPattern = false, bool $endPattern = false) { + $this->_beginPattern = $beginPattern; $this->_name = $name; $this->_contentName = $contentName; $this->_match = $match; @@ -37,7 +39,7 @@ class Pattern extends Rule { $p = $p->withOwnerGrammar($ownerGrammar); } } - + return $new; } } \ No newline at end of file diff --git a/lib/Highlight.php b/lib/Highlight.php index 684f198..bad5b55 100644 --- a/lib/Highlight.php +++ b/lib/Highlight.php @@ -28,8 +28,11 @@ class Highlight { $tokenList = $tokenizer->tokenize(); foreach ($tokenList as $lineNumber => $tokens) { - die(var_export($tokens)); - //echo "$lineNumber: $line\n"; + var_export($tokens); + echo "\n"; + if ($lineNumber === 2) { + die(); + } } } } \ No newline at end of file diff --git a/lib/Tokenizer.php b/lib/Tokenizer.php index cf7e8cc..00b20f8 100644 --- a/lib/Tokenizer.php +++ b/lib/Tokenizer.php @@ -23,7 +23,7 @@ class Tokenizer { protected ?Pattern $activeInjection = null; protected array $ruleStack; protected array $scopeStack; - protected $debug = false; + protected int $debug = 0; public function __construct(\Generator $data, Grammar $grammar) { @@ -36,6 +36,7 @@ class Tokenizer { public function tokenize(): \Generator { foreach ($this->data as $lineNumber => $line) { + $this->debug = $lineNumber; $this->offset = 0; $tokens = $this->tokenizeLine($line); @@ -97,6 +98,12 @@ class Tokenizer { // If the rule is a Pattern and matches the line at the offset then tokenize the // matches. if ($rule instanceof Pattern && preg_match($rule->match, $line, $match, PREG_OFFSET_CAPTURE, $this->offset)) { + // ¡TEMPORARY! Haven't implemented begin and end line + // anchors, so let's toss them completely. + if (preg_match('/\\\(?:A|G)/', $rule->match)) { + continue 2; + } + // Add the name and contentName to the scope stack // if present. if ($rule->name !== null) { @@ -106,7 +113,6 @@ class Tokenizer { $this->scopeStack[] = $this->resolveScopeName($rule->contentName, $match); } - $wholeMatchCaptureScopeCount = 0; if ($rule->captures !== null) { // Iterate through each of the matched subpatterns and create tokens from the // captures. @@ -140,8 +146,7 @@ class Tokenizer { // The first match is the whole match, and if there are captures for it the name // and contentName should be added to the stack regardless of whether it has - // patterns or not. However, keep count of how many were added to the stack so - // they may be removed when this rule has finished tokenizing. + // patterns or not. if ($k === 0) { if (!isset($rule->captures[0])) { continue; @@ -149,11 +154,9 @@ class Tokenizer { if ($rule->captures[0]->name !== null) { $this->scopeStack[] = $this->resolveScopeName($rule->captures[0]->name, $match); - $wholeMatchCaptureScopeCount++; } if ($rule->captures[0]->contentName !== null) { $this->scopeStack[] = $this->resolveScopeName($rule->captures[0]->contentName, $match); - $wholeMatchCaptureScopeCount++; } } @@ -164,7 +167,7 @@ class Tokenizer { // The scope stack for the whole match is handled above, so only handle that for // other captures. - if ($k !== 0) { + if ($k > 0) { if ($rule->captures[$k]->name !== null) { $this->scopeStack[] = $this->resolveScopeName($rule->captures[$k]->name, $match); } @@ -177,7 +180,7 @@ class Tokenizer { // The scope stack for the whole match is handled above, so only handle that for // other captures. - if ($k !== 0) { + if ($k > 0) { if ($rule->captures[$k]->contentName !== null) { array_pop($this->scopeStack); } @@ -188,8 +191,21 @@ class Tokenizer { array_pop($this->ruleStack); } else { + // If it's not the 0 capture and a capture without any patterns add the name + // and content names if they exist to the token's scope stack but not to the + // global one. + $scopeStack = $this->scopeStack; + if ($k > 0) { + if ($rule->captures[$k]->name !== null) { + $scopeStack[] = $this->resolveScopeName($rule->captures[$k]->name, $match); + } + if ($rule->captures[$k]->contentName !== null) { + $scopeStack[] = $this->resolveScopeName($rule->captures[$k]->contentName, $match); + } + } + $tokens[] = new Token( - [ ...$this->scopeStack, $this->resolveScopeName($rule->captures[$k]->name, $match) ], + $scopeStack, $m[0] ); } @@ -201,10 +217,63 @@ class Tokenizer { $this->ruleStack[] = $rule; - if ($rule->patterns !== null) { + if ($rule->patterns !== null && $this->offset < $lineLength) { $tokens = [ ...$tokens, ...$this->tokenizeLine($line) ]; } + if (!$rule->beginPattern) { + if ($rule->endPattern) { + while (!end($this->ruleStack)->beginPattern) { + $popped = array_pop($this->ruleStack); + + if ($popped->captures !== null && isset($popped->captures[0])) { + if ($popped->captures[0]->contentName !== null) { + array_pop($this->scopeStack); + } + if ($popped->captures[0]->name !== null) { + array_pop($this->scopeStack); + } + } + + if ($popped->contentName !== null) { + array_pop($this->scopeStack); + } + if ($popped->name !== null) { + array_pop($this->scopeStack); + } + + // If what was just popped is the active injection then remove it, too + if ($popped === $this->activeInjection) { + $this->activeInjection = null; + } + } + } + + $popped = array_pop($this->ruleStack); + + if ($popped->captures !== null && isset($popped->captures[0])) { + if ($popped->captures[0]->contentName !== null) { + array_pop($this->scopeStack); + } + if ($popped->captures[0]->name !== null) { + array_pop($this->scopeStack); + } + } + + if ($popped->contentName !== null) { + array_pop($this->scopeStack); + } + if ($popped->name !== null) { + array_pop($this->scopeStack); + } + + // If what was just popped is the active injection then remove it, too. + if ($popped === $this->activeInjection) { + $this->activeInjection = null; + } + } + + /* // Remove the name and contentName from the scope stack if present. if ($rule->contentName !== null) { array_pop($this->scopeStack); @@ -222,22 +291,22 @@ class Tokenizer { // And remove the rule from the rule stack, too. $popped = array_pop($this->ruleStack); + // If what was just popped is the active injection then remove it, too. if ($popped === $this->activeInjection) { $this->activeInjection = null; - } + }*/ break 2; } // Otherwise, if the rule is a Reference then retrieve its patterns, splice into // the rule list, and reprocess the rule. elseif ($rule instanceof Reference && $obj = $rule->get()) { - if ($obj instanceof Grammar || $rule instanceof RepositoryReference) { + if ($obj instanceof Grammar || ($rule instanceof RepositoryReference && $obj->match === null)) { $obj = $obj->patterns; } array_splice($currentRules, $i, 1, $obj); - $currentRulesCount = count($currentRules); continue; }