Browse Source

Reverting to using UTF-8 and preg_match. mb_ereg is garbage

main
Dustin Wilson 3 years ago
parent
commit
4ed8ffcd26
  1. 8
      lib/Data.php
  2. 10
      lib/Grammar.php
  3. 16
      lib/Highlight.php
  4. 54
      lib/Tokenizer.php

8
lib/Data.php

@ -13,9 +13,7 @@ class Data {
$fp = fopen($filepath, 'r');
try {
while ($line = fgets($fp)) {
// Lines are converted to UTF-32 because everything in UTF-32 is 4 bytes, making
// converting byte offsets to character offsets as easy as dividing by 4.
yield ++$lineNumber => mb_convert_encoding($line, 'UTF-32', $encoding);
yield ++$lineNumber => $line;
}
} finally {
fclose($fp);
@ -25,9 +23,7 @@ class Data {
public static function stringToGenerator(string $string, string $encoding = 'UTF-8'): \Generator {
$string = explode("\n", $string);
foreach ($string as $lineNumber => $line) {
// Lines are converted to UTF-32 because everything in UTF-32 is 4 bytes, making
// converting byte offsets to character offsets as easy as dividing by 4.
yield $lineNumber + 1 => mb_convert_encoding($line, 'UTF-32', $encoding);
yield $lineNumber + 1 => $line;
}
}
}

10
lib/Grammar.php

@ -191,8 +191,8 @@ class Grammar {
throw new Exception(Exception::JSON_MISSING_PROPERTY, $filename, 'end');
}
$begin = $pattern['begin'];//str_replace('/', '\/', $pattern['begin']);
$p['match'] = $begin;//"/$begin/";
$begin = str_replace('/', '\/', $pattern['begin']);
$p['match'] = "/$begin/u";
$modified = true;
if (isset($pattern['beginCaptures'])) {
@ -209,7 +209,7 @@ class Grammar {
}
$endPattern = [
'match' => $pattern['end'],//"/" . str_replace('/', '\/', $pattern['end']) . "/",
'match' => '/' . str_replace('/', '\/', $pattern['end']) . '/u',
'endPattern' => true
];
@ -236,8 +236,8 @@ class Grammar {
$modified = true;
break;
case 'match':
//$value = str_replace('/', '\/', $value);
$p['match'] = $value;//"/$value/";
$value = str_replace('/', '\/', $value);
$p['match'] = "/$value/u";
$modified = true;
break;
case 'captures':

16
lib/Highlight.php

@ -9,30 +9,26 @@ use dW\Lit\Grammar\Exception;
class Highlight {
public static function withFile(string $filepath, string $scopeName, string $encoding = 'UTF-8') {
return self::highlight(Data::fileToGenerator($filepath, $encoding), $scopeName, $encoding);
public static function withFile(string $filepath, string $scopeName) {
return self::highlight(Data::fileToGenerator($filepath), $scopeName);
}
public static function withString(string $string, string $scopeName, string $encoding = 'UTF-8') {
return self::highlight(Data::stringToGenerator($string, $encoding), $scopeName, $encoding);
public static function withString(string $string, string $scopeName) {
return self::highlight(Data::stringToGenerator($string), $scopeName);
}
protected static function highlight(\Generator $data, string $scopeName, string $encoding) {
protected static function highlight(\Generator $data, string $scopeName) {
$grammar = GrammarRegistry::get($scopeName);
if ($grammar === false) {
throw new Exception(Exception::GRAMMAR_MISSING, $scopeName);
}
mb_regex_encoding('UTF-32');
$tokenizer = new Tokenizer($data, $grammar, $encoding);
$tokenizer = new Tokenizer($data, $grammar);
$tokenList = $tokenizer->tokenize();
foreach ($tokenList as $lineNumber => $line) {
echo "$lineNumber: $line\n";
}
mb_regex_encoding();
}
}

54
lib/Tokenizer.php

@ -14,15 +14,13 @@ use dW\Lit\Grammar\{
class Tokenizer {
protected \Generator $data;
protected string $encoding;
protected Grammar $grammar;
protected array $ruleStack;
protected array $scopeStack;
public function __construct(\Generator $data, Grammar $grammar, string $encoding) {
public function __construct(\Generator $data, Grammar $grammar) {
$this->data = $data;
$this->encoding = $encoding;
$this->grammar = $grammar;
$this->ruleStack = [ $this->grammar ];
$this->scopeStack = [ $this->grammar->scopeName ];
@ -55,36 +53,10 @@ class Tokenizer {
protected function getMatch(string $regex, string $line, int $offset = 0): ?array {
// Using mbstring's regular expressions because it truly supports multibyte
// strings but also because the original implementation used Oniguruma.
mb_ereg_search_init($line, mb_convert_encoding($regex, 'UTF-32'));
if ($offset !== 0) {
// UTF-32 uses 4 bytes for every character; multiply by 4 to convert from
// character offset to byte offset.
mb_ereg_search_setpos($offset * 4);
}
$pos = mb_ereg_search_pos();
if ($pos === false) {
if (preg_match($regex, $line, $match, PREG_OFFSET_CAPTURE, $offset) !== 1) {
return null;
}
// UTF-32 uses 4 bytes for every character; divide by 4 to get character
// offsets.
$length = $pos[1] / 4;
$pos = [
'start' => $pos[0] / 4,
];
$pos['end'] = $pos['start'] + $length;
$match = mb_ereg_search_getregs();
// Convert the matches back to the original encoding.
foreach ($match as &$m) {
$m = mb_convert_encoding($m, $this->encoding, 'UTF-32');
}
$match['offset'] = $pos;
return $match;
}
@ -93,13 +65,33 @@ class Tokenizer {
$currentRulesCount = count($currentRules);
$results = [];
$line = $inputLine;
$lineLength = strlen($line);
for ($i = 0; $i < $currentRulesCount; $i++) {
while (true) {
$rule = $currentRules[$i];
if ($rule instanceof Pattern) {
if ($match = $this->getMatch($rule->match, $line, $offset)) {
$offset = $match['offset']['end'];
$tokens = [];
unset($match[0]);
foreach ($match as $k => $m) {
if ($m[1] > $offset) {
$tokens[] = [
'scope' => $this->scopeStack,
'string' => substr($line, $offset, $m[1])
];
$offset = $m[1];
}
$tokens[] = [
'scope' => [ ...$this->scopeStack, $rule->captures[$k]->name ],
'string' => $m[0]
];
$offset = $m[1] + strlen($m[0]);
}
echo "\n";
die(var_export($tokens));
}
} elseif ($rule instanceof Reference && $obj = $rule->get()) {
if ($obj instanceof PatternList) {

Loading…
Cancel
Save