Many changes

• Lines are now converted to UTF-32 while tokenizing so that byte offsets can be cleanly converted to character offsets.
• When grammars are parsed into Grammar objects, begin and end matches are now converted to regular matches: the end match is added to the pattern's pattern list, which simplifies tokenization.
• Highlight::withFile and Highlight::withString now accept an encoding parameter, which defaults to UTF-8.
3 years ago · 5a3322a0cb
7 changed files with 128 additions and 76 deletions
--- a/composer.json
+++ b/composer.json
@ -12,9 +12,10 @@
    ],
    "require": {
        "php": "^7.4 || ^8.0",
+        "ext-dom": "*",
        "ext-intl": "*",
        "ext-json": "*",
-        "ext-dom": "*",
+        "ext-mbstring": "*",
        "docopt/docopt": "^1.0"
    },
    "autoload": {
--- a/composer.lock
+++ b/composer.lock
@ -4,7 +4,7 @@
        "Read more about it at https://getcomposer.org/doc/01-basic-usage.md#installing-dependencies",
        "This file is @generated automatically"
    ],
-    "content-hash": "eb84c086d7c773cf5f8ad5ad2b9e546e",
+    "content-hash": "7f3c88aa5023ebb6ebad3e513973d927",
    "packages": [
        {
            "name": "docopt/docopt",
@ -65,10 +65,11 @@
    "prefer-lowest": false,
    "platform": {
        "php": "^7.4 || ^8.0",
+        "ext-dom": "*",
        "ext-intl": "*",
        "ext-json": "*",
-        "ext-dom": "*"
+        "ext-mbstring": "*"
    },
    "platform-dev": [],
-    "plugin-api-version": "2.1.0"
+    "plugin-api-version": "2.0.0"
 }
--- a/lib/Data.php
+++ b/lib/Data.php
@ -8,22 +8,26 @@ namespace dW\Lit;


 class Data {
-    public static function fileToGenerator(string $filepath): \Generator {
+    public static function fileToGenerator(string $filepath, string $encoding = 'UTF-8'): \Generator {
        $lineNumber = 0;
        $fp = fopen($filepath, 'r');
        try {
            while ($line = fgets($fp)) {
-                yield ++$lineNumber => $line;
+                // Lines are converted to UTF-32 because everything in UTF-32 is 4 bytes, making
+                // converting byte offsets to character offsets as easy as dividing by 4.
+                yield ++$lineNumber => mb_convert_encoding($line, 'UTF-32', $encoding);
            }
        } finally {
            fclose($fp);
        }
    }

-    public static function stringToGenerator(string $string): \Generator {
+    public static function stringToGenerator(string $string, string $encoding = 'UTF-8'): \Generator {
        $string = explode("\n", $string);
        foreach ($string as $lineNumber => $line) {
-            yield $lineNumber + 1 => $line;
+            // Lines are converted to UTF-32 because everything in UTF-32 is 4 bytes, making
+            // converting byte offsets to character offsets as easy as dividing by 4.
+            yield $lineNumber + 1 => mb_convert_encoding($line, 'UTF-32', $encoding);
        }
    }
 }
--- a/lib/Grammar.php
+++ b/lib/Grammar.php
@ -148,7 +148,7 @@ class Grammar {
    }


-    protected function parseJSONPattern(array $pattern, string $filename): Pattern|Reference|\WeakReference|null {
+    protected function parseJSONPattern(array $pattern, string $filename): Pattern|Reference|null {
        if (isset($pattern['include'])) {
            if ($pattern['include'][0] === '#') {
                return new RepositoryReference(substr($pattern['include'], 1), $this);
@ -165,40 +165,82 @@ class Grammar {
            'ownerGrammar' => $this,
            'name' => null,
            'contentName' => null,
-            'begin' => null,
-            'end' => null,
            'match' => null,
            'patterns' => null,
            'captures' => null,
-            'beginCaptures' => null,
-            'endCaptures' => null,
-            'applyEndPatternLast' => false
+            'endPattern' => false
        ];

        $modified = false;
+
+        $applyEndPatternLast = false;
+        if (isset($pattern['applyEndPatternLast'])) {
+            $applyEndPatternLast = $pattern['applyEndPatternLast'];
+            if (!is_bool($applyEndPatternLast) || (!is_int($applyEndPatternLast) && ($applyEndPatternLast !== 0 && $applyEndPatternLast !== 1))) {
+                throw new Exception(Exception::JSON_INVALID_TYPE, 'Boolean, 0, or 1', 'applyEndPatternLast', gettype($applyEndPatternLast), $filename);
+            }
+
+            $applyEndPatternLast = (bool)$applyEndPatternLast;
+        }
+
+        // Begin and end matches are handled in this implementation by parsing begin
+        // matches as regular matches and appending the end match as a pattern to the
+        // end of the pattern's patterns.
+        if (isset($pattern['begin'])) {
+            if (!isset($pattern['end'])) {
+                throw new Exception(Exception::JSON_MISSING_PROPERTY, $filename, 'end');
+            }
+
+            $begin = $pattern['begin'];//str_replace('/', '\/', $pattern['begin']);
+            $p['match'] = $begin;//"/$begin/";
+            $modified = true;
+
+            if (isset($pattern['beginCaptures'])) {
+                $pattern['captures'] = $pattern['beginCaptures'];
+            } elseif (isset($pattern['captures'])) {
+                $pattern['captures'] = $pattern['captures'];
+            }
+
+            $endCaptures = null;
+            if (isset($pattern['endCaptures'])) {
+                $endCaptures = $pattern['endCaptures'];
+            } elseif (isset($pattern['captures'])) {
+                $endCaptures = $pattern['captures'];
+            }
+
+            $endPattern = [
+                'match' => $pattern['end'],//"/" . str_replace('/', '\/', $pattern['end']) . "/",
+                'endPattern' => true
+            ];
+
+            if ($endCaptures !== null) {
+                $endPattern['captures'] = $endCaptures;
+            }
+
+            if (isset($pattern['patterns'])) {
+                if ($applyEndPatternLast) {
+                    $pattern['patterns'][] = $endPattern;
+                } else {
+                    array_unshift($pattern['patterns'], $endPattern);
+                }
+            } else {
+                $pattern['patterns'] = [ $endPattern ];
+            }
+        }
+
        foreach ($pattern as $key => $value) {
            switch ($key) {
-                case 'applyEndPatternLast':
-                    if (!is_bool($value) || (!is_int($value) && ($value !== 0 && $value !== 1))) {
-                        throw new Exception(Exception::JSON_INVALID_TYPE, 'Boolean, 0, or 1', 'applyEndPatternLast', gettype($value), $filename);
-                    }
-
-                    $value = (bool)$value;
                case 'name':
                case 'contentName':
                    $p[$key] = $value;
                    $modified = true;
                break;
-                case 'begin':
-                case 'end':
                case 'match':
-                    $value = str_replace('/', '\/', $value);
-                    $p[$key] = "/$value/";
+                    //$value = str_replace('/', '\/', $value);
+                    $p['match'] = $value;//"/$value/";
                    $modified = true;
                break;
                case 'captures':
-                case 'beginCaptures':
-                case 'endCaptures':
                    if (!is_array($value)) {
                        throw new Exception(Exception::JSON_INVALID_TYPE, 'Array', $key, gettype($value), $filename);
                    }
@ -240,7 +282,7 @@ class Grammar {
        return ($modified) ? new Pattern(...$p) : null;
    }

-    protected function parseJSONPatternList(array $list, string $filename): Pattern|PatternList|null {
+    protected function parseJSONPatternList(array $list, string $filename): ?PatternList {
        $result = [];
        foreach ($list as $pattern) {
            $p = $this->parseJSONPattern($pattern, $filename);
--- a/lib/Grammar/Pattern.php
+++ b/lib/Grammar/Pattern.php
@ -11,29 +11,22 @@ use dW\Lit\Grammar;
 /** Contains patterns responsible for matching a portion of the document */
 class Pattern extends Rule {
    protected bool $_applyEndPatternLast = false;
-    protected ?string $_begin;
-    protected ?CaptureList $_beginCaptures;
    protected ?CaptureList $_captures;
    protected ?string $_contentName;
-    protected ?string $_end;
-    protected ?CaptureList $_endCaptures;
+    protected bool $_endPattern = false;
    protected ?string $_match;
    protected ?string $_name;
    protected \WeakReference $_ownerGrammar;
    protected ?PatternList $_patterns;


-    public function __construct(Grammar $ownerGrammar, ?string $name = null, ?string $contentName = null, ?string $begin = null, ?string $end = null, ?string $match = null, ?PatternList $patterns = null, ?CaptureList $captures = null, ?CaptureList $beginCaptures = null, ?CaptureList $endCaptures = null, bool $applyEndPatternLast = false) {
+    public function __construct(Grammar $ownerGrammar, ?string $name = null, ?string $contentName = null, ?string $match = null, ?PatternList $patterns = null, ?CaptureList $captures = null, bool $endPattern = false) {
        $this->_name = $name;
        $this->_contentName = $contentName;
-        $this->_begin = $begin;
-        $this->_end = $end;
        $this->_match = $match;
        $this->_patterns = $patterns;
        $this->_captures = $captures;
-        $this->_beginCaptures = $beginCaptures;
-        $this->_endCaptures = $endCaptures;
-        $this->_applyEndPatternLast = $applyEndPatternLast;
+        $this->_endPattern = $endPattern;
        $this->_ownerGrammar = ($ownerGrammar === null) ? null : \WeakReference::create($ownerGrammar);
    }
 }
--- a/lib/Highlight.php
+++ b/lib/Highlight.php
@ -9,26 +9,30 @@ use dW\Lit\Grammar\Exception;


 class Highlight {
-    public static function withFile(string $filepath, string $scopeName) {
-        return self::highlight(Data::fileToGenerator($filepath), $scopeName);
+    public static function withFile(string $filepath, string $scopeName, string $encoding = 'UTF-8') {
+        return self::highlight(Data::fileToGenerator($filepath, $encoding), $scopeName, $encoding);
    }

-    public static function withString(string $string, string $scopeName) {
-        return self::highlight(Data::stringToGenerator($string), $scopeName);
+    public static function withString(string $string, string $scopeName, string $encoding = 'UTF-8') {
+        return self::highlight(Data::stringToGenerator($string, $encoding), $scopeName, $encoding);
    }


-    protected static function highlight(\Generator $data, string $scopeName) {
+    protected static function highlight(\Generator $data, string $scopeName, string $encoding) {
        $grammar = GrammarRegistry::get($scopeName);
        if ($grammar === false) {
            throw new Exception(Exception::GRAMMAR_MISSING, $scopeName);
        }

-        $tokenizer = new Tokenizer($data, $grammar);
+        mb_regex_encoding('UTF-32');
+
+        $tokenizer = new Tokenizer($data, $grammar, $encoding);
        $tokenList = $tokenizer->tokenize();

        foreach ($tokenList as $lineNumber => $line) {
            echo "$lineNumber: $line\n";
        }
+
+        mb_regex_encoding();
    }
 }
--- a/lib/Tokenizer.php
+++ b/lib/Tokenizer.php
@ -14,13 +14,15 @@ use dW\Lit\Grammar\{

 class Tokenizer {
    protected \Generator $data;
+    protected string $encoding;
    protected Grammar $grammar;
    protected array $ruleStack;
    protected array $scopeStack;
-    

-    public function __construct(\Generator $data, Grammar $grammar) {
+
+    public function __construct(\Generator $data, Grammar $grammar, string $encoding) {
        $this->data = $data;
+        $this->encoding = $encoding;
        $this->grammar = $grammar;
        $this->ruleStack = [ $this->grammar ];
        $this->scopeStack = [ $this->grammar->scopeName ];
@ -33,9 +35,9 @@ class Tokenizer {

    public function tokenize(): \Generator {
        $appendNewLine = true;
-
        foreach ($this->data as $lineNumber => $inputLine) {
-            $line = $inputLine;
+            yield $lineNumber => $this->_tokenize($inputLine);
+            /*$line = $inputLine;
            $lineWithNewLine = ($appendNewLine) ? "$line\n" : $line;
            $initialStackRuleLength = count($this->ruleStack);
            $position = 0;
@ -47,20 +49,46 @@ class Tokenizer {
                if ($position > mb_strlen($line)) {
                    break;
                }
-            }
+            }*/
        }
    }


-    protected function getMatch(string $regex, string $line): ?array {
-        if (preg_match($regex, $line, $match, PREG_OFFSET_CAPTURE) !== 1) {
+    protected function getMatch(string $regex, string $line, int $offset = 0): ?array {
+        // Using mbstring's regular expressions because it truly supports multibyte
+        // strings but also because the original implementation used Oniguruma.
+        mb_ereg_search_init($line, mb_convert_encoding($regex, 'UTF-32'));
+
+        if ($offset !== 0) {
+            // UTF-32 uses 4 bytes for every character; multiply by 4 to convert from
+            // character offset to byte offset.
+            mb_ereg_search_setpos($offset * 4);
+        }
+
+        $pos = mb_ereg_search_pos();
+        if ($pos === false) {
            return null;
        }

+        // UTF-32 uses 4 bytes for every character; divide by 4 to get character
+        // offsets.
+        $length = $pos[1] / 4;
+        $pos = [
+            'start' => $pos[0] / 4,
+        ];
+        $pos['end'] = $pos['start'] + $length;
+
+        $match = mb_ereg_search_getregs();
+        // Convert the matches back to the original encoding.
+        foreach ($match as &$m) {
+            $m = mb_convert_encoding($m, $this->encoding, 'UTF-32');
+        }
+
+        $match['offset'] = $pos;
        return $match;
    }

-    protected function tokenizeLine(string $inputLine): array {
+    protected function _tokenize(string $inputLine, int $offset = 0): array {
        $currentRules = end($this->ruleStack)->patterns->getIterator();
        $currentRulesCount = count($currentRules);
        $results = [];
@ -70,31 +98,8 @@ class Tokenizer {
            while (true) {
                $rule = $currentRules[$i];
                if ($rule instanceof Pattern) {
-                    $matchMode = null;
-                    $regex = null;
-                    if ($rule->match !== null) {
-                        $regex = $rule->match;
-                        $matchMode = self::MATCH_MODE_SINGLE;
-                    } elseif ($rule->begin !== null) {
-                        $regex = $rule->begin;
-                        $matchMode = self::MATCH_MODE_BEGINEND;
-                    }
-
-                    if ($matchMode !== null && $match = $this->getMatch($regex, $line)) {
-                        $scopeStack = $this->scopeStack;
-                        if ($rule->name !== null) {
-                            $scopeStack[] = $rule->name;
-                        }
-                        if ($rule->contentName !== null) {
-                            $scopeStack[] = $rule->contentName;
-                        }
-
-                        die(var_export($rule));
-
-                        if ($matchMode === self::MATCH_MODE_BEGINEND) {
-                            $this->ruleStack[] = $rule;
-                            $this->scopeStack[] = $scopeStack;
-                        }
+                    if ($match = $this->getMatch($rule->match, $line, $offset)) {
+                        $offset = $match['offset']['end'];
                    }
                } elseif ($rule instanceof Reference && $obj = $rule->get()) {
                    if ($obj instanceof PatternList) {
@ -111,5 +116,7 @@ class Tokenizer {
                break;
            }
        }
+
+        return $inputLine;
    }
 }