Browse Source

Minor fixes, still broken lol

• When parsing JSON grammars, only forward slashes that are not already 
escaped are now escaped in match regexes.
• When parsing JSON grammars, Unicode character codes in match regexes 
that are larger than 0x10ffff are now clamped to 0x10ffff, the largest 
possible Unicode code point.
• Content names are now only applied to what is between begin/end 
patterns. This might still need a fix so that they are not applied to the end patterns themselves.
main
Dustin Wilson 3 years ago
parent
commit
699aeebf93
  1. 10
      lib/Grammar.php
  2. 127
      lib/Tokenizer.php

10
lib/Grammar.php

@ -241,10 +241,12 @@ class Grammar {
case 'begin':
$p['beginPattern'] = true;
case 'match':
$value = str_replace('/', '\/', $value);
$value = preg_replace_callback('/\\\(x|o)\{([0-9a-fA-F]{5,})\}/', function($matches) {
$code = substr($matches[2], 0, 4);
return "\\{$matches[1]}{"."$code}";
// Escape forward slashes that aren't escaped in regexes.
$value = preg_replace('/(?<!\\\)\//', '\/', $value);
// Truncate unicode character codes that are too long.
$value = preg_replace_callback('/\\\x\{([0-9A-Fa-f]{6,})\}/', function($matches) {
$code = ((int)base_convert($matches[1], 16, 10) > 0x10ffff) ? '10ffff' : $matches[1];
return "\\x{"."$code}";
}, $value);
$p['match'] = "/$value/u";

127
lib/Tokenizer.php

@ -95,12 +95,16 @@ class Tokenizer {
while (true) {
$rule = $currentRules[$i];
if ($rule instanceof Pattern) {
echo "Match: {$rule->match}\n\n";
}
// If the rule is a Pattern and matches the line at the offset then tokenize the
// matches.
if ($rule instanceof Pattern && preg_match($rule->match, $line, $match, PREG_OFFSET_CAPTURE, $this->offset)) {
// ¡TEMPORARY! Haven't implemented begin and end line
// anchors, so let's toss them completely.
if (preg_match('/\\\(?:A|G)/', $rule->match)) {
if (preg_match('/\\\(?:A|G|Z)/', $rule->match)) {
continue 2;
}
@ -109,9 +113,6 @@ class Tokenizer {
if ($rule->name !== null) {
$this->scopeStack[] = $this->resolveScopeName($rule->name, $match);
}
if ($rule->contentName !== null) {
$this->scopeStack[] = $this->resolveScopeName($rule->contentName, $match);
}
if ($rule->captures !== null) {
// Iterate through each of the matched subpatterns and create tokens from the
@ -128,13 +129,8 @@ class Tokenizer {
// If this is the first capture, then the scopes added to the stack need to be
// removed from this token's scope stack as this will grab everything before
// this match began.
if ($k === 0) {
if ($rule->contentName !== null) {
array_pop($scopeStack);
}
if ($rule->name !== null) {
array_pop($scopeStack);
}
if ($k === 0 && $rule->name !== null) {
array_pop($scopeStack);
}
$tokens[] = new Token(
@ -144,20 +140,8 @@ class Tokenizer {
$this->offset = $m[1];
}
// The first match is the whole match, and if there are captures for it the name
// and contentName should be added to the stack regardless of whether it has
// patterns or not.
if ($k === 0) {
if (!isset($rule->captures[0])) {
continue;
}
if ($rule->captures[0]->name !== null) {
$this->scopeStack[] = $this->resolveScopeName($rule->captures[0]->name, $match);
}
if ($rule->captures[0]->contentName !== null) {
$this->scopeStack[] = $this->resolveScopeName($rule->captures[0]->contentName, $match);
}
if ($k === 0 && !isset($rule->captures[0])) {
continue;
}
// If the capture rule has patterns of its own then
@ -165,43 +149,20 @@ class Tokenizer {
if ($rule->captures[$k]->patterns !== null) {
$this->ruleStack[] = $rule->captures[$k];
// The scope stack for the whole match is handled above, so only handle that for
// other captures.
if ($k > 0) {
if ($rule->captures[$k]->name !== null) {
$this->scopeStack[] = $this->resolveScopeName($rule->captures[$k]->name, $match);
}
if ($rule->captures[$k]->contentName !== null) {
$this->scopeStack[] = $this->resolveScopeName($rule->captures[$k]->contentName, $match);
}
if ($rule->captures[$k]->name !== null) {
$this->scopeStack[] = $this->resolveScopeName($rule->captures[$k]->name, $match);
}
$tokens = [ ...$tokens, ...$this->tokenizeLine($line) ];
// The scope stack for the whole match is handled above, so only handle that for
// other captures.
if ($k > 0) {
if ($rule->captures[$k]->contentName !== null) {
array_pop($this->scopeStack);
}
if ($rule->captures[$k]->name !== null) {
array_pop($this->scopeStack);
}
}
array_pop($this->ruleStack);
} else {
// If it's not the 0 capture and a capture without any patterns add the name
// and content names if they exist to the token's scope stack but not to the
// global one.
$scopeStack = $this->scopeStack;
if ($k > 0) {
if ($rule->captures[$k]->name !== null) {
$scopeStack[] = $this->resolveScopeName($rule->captures[$k]->name, $match);
}
if ($rule->captures[$k]->contentName !== null) {
$scopeStack[] = $this->resolveScopeName($rule->captures[$k]->contentName, $match);
}
if ($rule->captures[$k]->name !== null) {
$scopeStack[] = $this->resolveScopeName($rule->captures[$k]->name, $match);
}
$tokens[] = new Token(
@ -210,11 +171,20 @@ class Tokenizer {
);
}
if ($rule->captures[$k]->name !== null) {
array_pop($this->scopeStack);
}
$this->offset = $m[1] + strlen($m[0]);
$firstCapture = false;
}
}
// If the pattern is a begin pattern and has a content name then add that to the
// scope stack before processing the children.
if ($rule->beginPattern && $rule->contentName !== null) {
$this->scopeStack[] = $this->resolveScopeName($rule->contentName, $match);
}
$this->ruleStack[] = $rule;
if ($rule->patterns !== null && $this->offset < $lineLength) {
@ -226,23 +196,11 @@ class Tokenizer {
while (!end($this->ruleStack)->beginPattern) {
$popped = array_pop($this->ruleStack);
if ($popped->captures !== null && isset($popped->captures[0])) {
if ($popped->captures[0]->contentName !== null) {
array_pop($this->scopeStack);
}
if ($popped->captures[0]->name !== null) {
array_pop($this->scopeStack);
}
}
if ($popped->contentName !== null) {
array_pop($this->scopeStack);
}
if ($popped->name !== null) {
array_pop($this->scopeStack);
}
// If what was just popped is the active injection then remove it, too
// If what was just popped is the active injection then remove it, too.
if ($popped === $this->activeInjection) {
$this->activeInjection = null;
}
@ -251,16 +209,9 @@ class Tokenizer {
$popped = array_pop($this->ruleStack);
if ($popped->captures !== null && isset($popped->captures[0])) {
if ($popped->captures[0]->contentName !== null) {
array_pop($this->scopeStack);
}
if ($popped->captures[0]->name !== null) {
array_pop($this->scopeStack);
}
}
if ($popped->contentName !== null) {
// If the original rule was an end pattern the one just popped will be its begin
// pattern. Pop its content name if it exists.
if ($rule->endPattern && $popped->contentName !== null) {
array_pop($this->scopeStack);
}
if ($popped->name !== null) {
@ -273,30 +224,6 @@ class Tokenizer {
}
}
/*
// Remove the name and contentName from the scope stack if present.
if ($rule->contentName !== null) {
array_pop($this->scopeStack);
}
if ($rule->name !== null) {
array_pop($this->scopeStack);
}
// If the rule has a whole match capture (0) then remove its name and
// contentName, too.
$j = 0;
while ($j++ < $wholeMatchCaptureScopeCount) {
array_pop($this->scopeStack);
}
// And remove the rule from the rule stack, too.
$popped = array_pop($this->ruleStack);
// If what was just popped is the active injection then remove it, too.
if ($popped === $this->activeInjection) {
$this->activeInjection = null;
}*/
break 2;
}
// Otherwise, if the rule is a Reference then retrieve its patterns, splice into

Loading…
Cancel
Save