Reverting to using UTF-8 and preg_match. mb_ereg is garbage

3 years ago · 4ed8ffcd26
4 changed files with 36 additions and 52 deletions
--- a/lib/Data.php
+++ b/lib/Data.php
@@ -13,9 +13,7 @@ class Data {
        $fp = fopen($filepath, 'r');
        try {
            while ($line = fgets($fp)) {
-                // Lines are converted to UTF-32 because everything in UTF-32 is 4 bytes, making
-                // converting byte offsets to character offsets as easy as dividing by 4.
-                yield ++$lineNumber => mb_convert_encoding($line, 'UTF-32', $encoding);
+                yield ++$lineNumber => $line;
            }
        } finally {
            fclose($fp);
@@ -25,9 +23,7 @@ class Data {
    public static function stringToGenerator(string $string, string $encoding = 'UTF-8'): \Generator {
        $string = explode("\n", $string);
        foreach ($string as $lineNumber => $line) {
-            // Lines are converted to UTF-32 because everything in UTF-32 is 4 bytes, making
-            // converting byte offsets to character offsets as easy as dividing by 4.
-            yield $lineNumber + 1 => mb_convert_encoding($line, 'UTF-32', $encoding);
+            yield $lineNumber + 1 => $line;
        }
    }
 }
--- a/lib/Grammar.php
+++ b/lib/Grammar.php
@@ -191,8 +191,8 @@ class Grammar {
                throw new Exception(Exception::JSON_MISSING_PROPERTY, $filename, 'end');
            }

-            $begin = $pattern['begin'];//str_replace('/', '\/', $pattern['begin']);
-            $p['match'] = $begin;//"/$begin/";
+            $begin = str_replace('/', '\/', $pattern['begin']);
+            $p['match'] = "/$begin/u";
            $modified = true;

            if (isset($pattern['beginCaptures'])) {
@@ -209,7 +209,7 @@ class Grammar {
            }

            $endPattern = [
-                'match' => $pattern['end'],//"/" . str_replace('/', '\/', $pattern['end']) . "/",
+                'match' => '/' . str_replace('/', '\/', $pattern['end']) . '/u',
                'endPattern' => true
            ];

@@ -236,8 +236,8 @@ class Grammar {
                    $modified = true;
                break;
                case 'match':
-                    //$value = str_replace('/', '\/', $value);
-                    $p['match'] = $value;//"/$value/";
+                    $value = str_replace('/', '\/', $value);
+                    $p['match'] = "/$value/u";
                    $modified = true;
                break;
                case 'captures':
--- a/lib/Highlight.php
+++ b/lib/Highlight.php
@@ -9,30 +9,26 @@ use dW\Lit\Grammar\Exception;


 class Highlight {
-    public static function withFile(string $filepath, string $scopeName, string $encoding = 'UTF-8') {
-        return self::highlight(Data::fileToGenerator($filepath, $encoding), $scopeName, $encoding);
+    public static function withFile(string $filepath, string $scopeName) {
+        return self::highlight(Data::fileToGenerator($filepath), $scopeName);
    }

-    public static function withString(string $string, string $scopeName, string $encoding = 'UTF-8') {
-        return self::highlight(Data::stringToGenerator($string, $encoding), $scopeName, $encoding);
+    public static function withString(string $string, string $scopeName) {
+        return self::highlight(Data::stringToGenerator($string), $scopeName);
    }


-    protected static function highlight(\Generator $data, string $scopeName, string $encoding) {
+    protected static function highlight(\Generator $data, string $scopeName) {
        $grammar = GrammarRegistry::get($scopeName);
        if ($grammar === false) {
            throw new Exception(Exception::GRAMMAR_MISSING, $scopeName);
        }

-        mb_regex_encoding('UTF-32');
-
-        $tokenizer = new Tokenizer($data, $grammar, $encoding);
+        $tokenizer = new Tokenizer($data, $grammar);
        $tokenList = $tokenizer->tokenize();

        foreach ($tokenList as $lineNumber => $line) {
            echo "$lineNumber: $line\n";
        }
-
-        mb_regex_encoding();
    }
 }
--- a/lib/Tokenizer.php
+++ b/lib/Tokenizer.php
@@ -14,15 +14,13 @@ use dW\Lit\Grammar\{

 class Tokenizer {
    protected \Generator $data;
-    protected string $encoding;
    protected Grammar $grammar;
    protected array $ruleStack;
    protected array $scopeStack;


-    public function __construct(\Generator $data, Grammar $grammar, string $encoding) {
+    public function __construct(\Generator $data, Grammar $grammar) {
        $this->data = $data;
-        $this->encoding = $encoding;
        $this->grammar = $grammar;
        $this->ruleStack = [ $this->grammar ];
        $this->scopeStack = [ $this->grammar->scopeName ];
@@ -55,36 +53,10 @@ class Tokenizer {


    protected function getMatch(string $regex, string $line, int $offset = 0): ?array {
-        // Using mbstring's regular expressions because it truly supports multibyte
-        // strings but also because the original implementation used Oniguruma.
-        mb_ereg_search_init($line, mb_convert_encoding($regex, 'UTF-32'));
-
-        if ($offset !== 0) {
-            // UTF-32 uses 4 bytes for every character; multiply by 4 to convert from
-            // character offset to byte offset.
-            mb_ereg_search_setpos($offset * 4);
-        }
-
-        $pos = mb_ereg_search_pos();
-        if ($pos === false) {
+        if (preg_match($regex, $line, $match, PREG_OFFSET_CAPTURE, $offset) !== 1) {
            return null;
        }

-        // UTF-32 uses 4 bytes for every character; divide by 4 to get character
-        // offsets.
-        $length = $pos[1] / 4;
-        $pos = [
-            'start' => $pos[0] / 4,
-        ];
-        $pos['end'] = $pos['start'] + $length;
-
-        $match = mb_ereg_search_getregs();
-        // Convert the matches back to the original encoding.
-        foreach ($match as &$m) {
-            $m = mb_convert_encoding($m, $this->encoding, 'UTF-32');
-        }
-
-        $match['offset'] = $pos;
        return $match;
    }

@@ -93,13 +65,33 @@ class Tokenizer {
        $currentRulesCount = count($currentRules);
        $results = [];
        $line = $inputLine;
+        $lineLength = strlen($line);

        for ($i = 0; $i < $currentRulesCount; $i++) {
            while (true) {
                $rule = $currentRules[$i];
                if ($rule instanceof Pattern) {
                    if ($match = $this->getMatch($rule->match, $line, $offset)) {
-                        $offset = $match['offset']['end'];
+                        $tokens = [];
+                        unset($match[0]);
+                        foreach ($match as $k => $m) {
+                            if ($m[1] > $offset) {
+                                $tokens[] = [
+                                    'scope' => $this->scopeStack,
+                                    'string' => substr($line, $offset, $m[1])
+                                ];
+                                $offset = $m[1];
+                            }
+
+                            $tokens[] = [
+                                'scope' => [ ...$this->scopeStack, $rule->captures[$k]->name ],
+                                'string' => $m[0]
+                            ];
+                            $offset = $m[1] + strlen($m[0]);
+                        }
+
+                        echo "\n";
+                        die(var_export($tokens));
                    }
                } elseif ($rule instanceof Reference && $obj = $rule->get()) {
                    if ($obj instanceof PatternList) {