Browse Source

Reverting to using UTF-8 and preg_match. mb_ereg is garbage

main
Dustin Wilson 3 years ago
parent
commit
4ed8ffcd26
  1. 8
      lib/Data.php
  2. 10
      lib/Grammar.php
  3. 16
      lib/Highlight.php
  4. 54
      lib/Tokenizer.php

8
lib/Data.php

@ -13,9 +13,7 @@ class Data {
$fp = fopen($filepath, 'r'); $fp = fopen($filepath, 'r');
try { try {
while ($line = fgets($fp)) { while ($line = fgets($fp)) {
// Lines are converted to UTF-32 because everything in UTF-32 is 4 bytes, making
// converting byte offsets to character offsets as easy as dividing by 4.
yield ++$lineNumber => mb_convert_encoding($line, 'UTF-32', $encoding); yield ++$lineNumber => $line;
} }
} finally { } finally {
fclose($fp); fclose($fp);
@ -25,9 +23,7 @@ class Data {
public static function stringToGenerator(string $string, string $encoding = 'UTF-8'): \Generator { public static function stringToGenerator(string $string, string $encoding = 'UTF-8'): \Generator {
$string = explode("\n", $string); $string = explode("\n", $string);
foreach ($string as $lineNumber => $line) { foreach ($string as $lineNumber => $line) {
// Lines are converted to UTF-32 because everything in UTF-32 is 4 bytes, making
// converting byte offsets to character offsets as easy as dividing by 4.
yield $lineNumber + 1 => mb_convert_encoding($line, 'UTF-32', $encoding); yield $lineNumber + 1 => $line;
} }
} }
} }

10
lib/Grammar.php

@ -191,8 +191,8 @@ class Grammar {
throw new Exception(Exception::JSON_MISSING_PROPERTY, $filename, 'end'); throw new Exception(Exception::JSON_MISSING_PROPERTY, $filename, 'end');
} }
$begin = $pattern['begin'];//str_replace('/', '\/', $pattern['begin']); $begin = str_replace('/', '\/', $pattern['begin']);
$p['match'] = $begin;//"/$begin/"; $p['match'] = "/$begin/u";
$modified = true; $modified = true;
if (isset($pattern['beginCaptures'])) { if (isset($pattern['beginCaptures'])) {
@ -209,7 +209,7 @@ class Grammar {
} }
$endPattern = [ $endPattern = [
'match' => $pattern['end'],//"/" . str_replace('/', '\/', $pattern['end']) . "/", 'match' => '/' . str_replace('/', '\/', $pattern['end']) . '/u',
'endPattern' => true 'endPattern' => true
]; ];
@ -236,8 +236,8 @@ class Grammar {
$modified = true; $modified = true;
break; break;
case 'match': case 'match':
//$value = str_replace('/', '\/', $value); $value = str_replace('/', '\/', $value);
$p['match'] = $value;//"/$value/"; $p['match'] = "/$value/u";
$modified = true; $modified = true;
break; break;
case 'captures': case 'captures':

16
lib/Highlight.php

@ -9,30 +9,26 @@ use dW\Lit\Grammar\Exception;
class Highlight { class Highlight {
public static function withFile(string $filepath, string $scopeName, string $encoding = 'UTF-8') { public static function withFile(string $filepath, string $scopeName) {
return self::highlight(Data::fileToGenerator($filepath, $encoding), $scopeName, $encoding); return self::highlight(Data::fileToGenerator($filepath), $scopeName);
} }
public static function withString(string $string, string $scopeName, string $encoding = 'UTF-8') { public static function withString(string $string, string $scopeName) {
return self::highlight(Data::stringToGenerator($string, $encoding), $scopeName, $encoding); return self::highlight(Data::stringToGenerator($string), $scopeName);
} }
protected static function highlight(\Generator $data, string $scopeName, string $encoding) { protected static function highlight(\Generator $data, string $scopeName) {
$grammar = GrammarRegistry::get($scopeName); $grammar = GrammarRegistry::get($scopeName);
if ($grammar === false) { if ($grammar === false) {
throw new Exception(Exception::GRAMMAR_MISSING, $scopeName); throw new Exception(Exception::GRAMMAR_MISSING, $scopeName);
} }
mb_regex_encoding('UTF-32'); $tokenizer = new Tokenizer($data, $grammar);
$tokenizer = new Tokenizer($data, $grammar, $encoding);
$tokenList = $tokenizer->tokenize(); $tokenList = $tokenizer->tokenize();
foreach ($tokenList as $lineNumber => $line) { foreach ($tokenList as $lineNumber => $line) {
echo "$lineNumber: $line\n"; echo "$lineNumber: $line\n";
} }
mb_regex_encoding();
} }
} }

54
lib/Tokenizer.php

@ -14,15 +14,13 @@ use dW\Lit\Grammar\{
class Tokenizer { class Tokenizer {
protected \Generator $data; protected \Generator $data;
protected string $encoding;
protected Grammar $grammar; protected Grammar $grammar;
protected array $ruleStack; protected array $ruleStack;
protected array $scopeStack; protected array $scopeStack;
public function __construct(\Generator $data, Grammar $grammar, string $encoding) { public function __construct(\Generator $data, Grammar $grammar) {
$this->data = $data; $this->data = $data;
$this->encoding = $encoding;
$this->grammar = $grammar; $this->grammar = $grammar;
$this->ruleStack = [ $this->grammar ]; $this->ruleStack = [ $this->grammar ];
$this->scopeStack = [ $this->grammar->scopeName ]; $this->scopeStack = [ $this->grammar->scopeName ];
@ -55,36 +53,10 @@ class Tokenizer {
protected function getMatch(string $regex, string $line, int $offset = 0): ?array { protected function getMatch(string $regex, string $line, int $offset = 0): ?array {
// Using mbstring's regular expressions because it truly supports multibyte
// strings but also because the original implementation used Oniguruma.
mb_ereg_search_init($line, mb_convert_encoding($regex, 'UTF-32'));
if ($offset !== 0) {
// UTF-32 uses 4 bytes for every character; multiply by 4 to convert from
// character offset to byte offset.
mb_ereg_search_setpos($offset * 4);
}
$pos = mb_ereg_search_pos();
if ($pos === false) { if (preg_match($regex, $line, $match, PREG_OFFSET_CAPTURE, $offset) !== 1) {
return null; return null;
} }
// UTF-32 uses 4 bytes for every character; divide by 4 to get character
// offsets.
$length = $pos[1] / 4;
$pos = [
'start' => $pos[0] / 4,
];
$pos['end'] = $pos['start'] + $length;
$match = mb_ereg_search_getregs();
// Convert the matches back to the original encoding.
foreach ($match as &$m) {
$m = mb_convert_encoding($m, $this->encoding, 'UTF-32');
}
$match['offset'] = $pos;
return $match; return $match;
} }
@ -93,13 +65,33 @@ class Tokenizer {
$currentRulesCount = count($currentRules); $currentRulesCount = count($currentRules);
$results = []; $results = [];
$line = $inputLine; $line = $inputLine;
$lineLength = strlen($line);
for ($i = 0; $i < $currentRulesCount; $i++) { for ($i = 0; $i < $currentRulesCount; $i++) {
while (true) { while (true) {
$rule = $currentRules[$i]; $rule = $currentRules[$i];
if ($rule instanceof Pattern) { if ($rule instanceof Pattern) {
if ($match = $this->getMatch($rule->match, $line, $offset)) { if ($match = $this->getMatch($rule->match, $line, $offset)) {
$offset = $match['offset']['end']; $tokens = [];
unset($match[0]);
foreach ($match as $k => $m) {
if ($m[1] > $offset) {
$tokens[] = [
'scope' => $this->scopeStack,
'string' => substr($line, $offset, $m[1])
];
$offset = $m[1];
}
$tokens[] = [
'scope' => [ ...$this->scopeStack, $rule->captures[$k]->name ],
'string' => $m[0]
];
$offset = $m[1] + strlen($m[0]);
}
echo "\n";
die(var_export($tokens));
} }
} elseif ($rule instanceof Reference && $obj = $rule->get()) { } elseif ($rule instanceof Reference && $obj = $rule->get()) {
if ($obj instanceof PatternList) { if ($obj instanceof PatternList) {

Loading…
Cancel
Save