diff --git a/composer.json b/composer.json index 8e7dd20..ce2898a 100644 --- a/composer.json +++ b/composer.json @@ -12,9 +12,10 @@ ], "require": { "php": "^7.4 || ^8.0", + "ext-dom": "*", "ext-intl": "*", "ext-json": "*", - "ext-dom": "*", + "ext-mbstring": "*", "docopt/docopt": "^1.0" }, "autoload": { diff --git a/composer.lock b/composer.lock index adf82a0..2fa9735 100644 --- a/composer.lock +++ b/composer.lock @@ -4,7 +4,7 @@ "Read more about it at https://getcomposer.org/doc/01-basic-usage.md#installing-dependencies", "This file is @generated automatically" ], - "content-hash": "eb84c086d7c773cf5f8ad5ad2b9e546e", + "content-hash": "7f3c88aa5023ebb6ebad3e513973d927", "packages": [ { "name": "docopt/docopt", @@ -65,10 +65,11 @@ "prefer-lowest": false, "platform": { "php": "^7.4 || ^8.0", + "ext-dom": "*", "ext-intl": "*", "ext-json": "*", - "ext-dom": "*" + "ext-mbstring": "*" }, "platform-dev": [], - "plugin-api-version": "2.1.0" + "plugin-api-version": "2.0.0" } diff --git a/lib/Data.php b/lib/Data.php index 54ffe3c..dd1dc35 100644 --- a/lib/Data.php +++ b/lib/Data.php @@ -8,22 +8,26 @@ namespace dW\Lit; class Data { - public static function fileToGenerator(string $filepath): \Generator { + public static function fileToGenerator(string $filepath, string $encoding = 'UTF-8'): \Generator { $lineNumber = 0; $fp = fopen($filepath, 'r'); try { while ($line = fgets($fp)) { - yield ++$lineNumber => $line; + // Lines are converted to UTF-32 because everything in UTF-32 is 4 bytes, making + // converting byte offsets to character offsets as easy as dividing by 4. + yield ++$lineNumber => mb_convert_encoding($line, 'UTF-32', $encoding); } } finally { fclose($fp); } } - public static function stringToGenerator(string $string): \Generator { + public static function stringToGenerator(string $string, string $encoding = 'UTF-8'): \Generator { $string = explode("\n", $string); foreach ($string as $lineNumber => $line) { - yield $lineNumber + 1 => $line; + // Lines are converted to UTF-32 because everything in UTF-32 is 4 bytes, making + // converting byte offsets to character offsets as easy as dividing by 4. + yield $lineNumber + 1 => mb_convert_encoding($line, 'UTF-32', $encoding); } } } \ No newline at end of file diff --git a/lib/Grammar.php b/lib/Grammar.php index 6af9875..7c74b12 100644 --- a/lib/Grammar.php +++ b/lib/Grammar.php @@ -148,7 +148,7 @@ class Grammar { } - protected function parseJSONPattern(array $pattern, string $filename): Pattern|Reference|\WeakReference|null { + protected function parseJSONPattern(array $pattern, string $filename): Pattern|Reference|null { if (isset($pattern['include'])) { if ($pattern['include'][0] === '#') { return new RepositoryReference(substr($pattern['include'], 1), $this); @@ -165,40 +165,82 @@ class Grammar { 'ownerGrammar' => $this, 'name' => null, 'contentName' => null, - 'begin' => null, - 'end' => null, 'match' => null, 'patterns' => null, 'captures' => null, - 'beginCaptures' => null, - 'endCaptures' => null, - 'applyEndPatternLast' => false + 'endPattern' => false ]; $modified = false; + + $applyEndPatternLast = false; + if (isset($pattern['applyEndPatternLast'])) { + $applyEndPatternLast = $pattern['applyEndPatternLast']; + if (!is_bool($applyEndPatternLast) || (!is_int($applyEndPatternLast) && ($applyEndPatternLast !== 0 && $applyEndPatternLast !== 1))) { + throw new Exception(Exception::JSON_INVALID_TYPE, 'Boolean, 0, or 1', 'applyEndPatternLast', gettype($applyEndPatternLast), $filename); + } + + $applyEndPatternLast = (bool)$applyEndPatternLast; + } + + // Begin and end matches are handled in this implementation by parsing begin + // matches as regular matches and appending the end match as a pattern to the + // end of the pattern's patterns. + if (isset($pattern['begin'])) { + if (!isset($pattern['end'])) { + throw new Exception(Exception::JSON_MISSING_PROPERTY, $filename, 'end'); + } + + $begin = $pattern['begin'];//str_replace('/', '\/', $pattern['begin']); + $p['match'] = $begin;//"/$begin/"; + $modified = true; + + if (isset($pattern['beginCaptures'])) { + $pattern['captures'] = $pattern['beginCaptures']; + } elseif (isset($pattern['captures'])) { + $pattern['captures'] = $pattern['captures']; + } + + $endCaptures = null; + if (isset($pattern['endCaptures'])) { + $endCaptures = $pattern['endCaptures']; + } elseif (isset($pattern['captures'])) { + $endCaptures = $pattern['captures']; + } + + $endPattern = [ + 'match' => $pattern['end'],//"/" . str_replace('/', '\/', $pattern['end']) . "/", + 'endPattern' => true + ]; + + if ($endCaptures !== null) { + $endPattern['captures'] = $endCaptures; + } + + if (isset($pattern['patterns'])) { + if ($applyEndPatternLast) { + $pattern['patterns'][] = $endPattern; + } else { + array_unshift($pattern['patterns'], $endPattern); + } + } else { + $pattern['patterns'] = [ $endPattern ]; + } + } + foreach ($pattern as $key => $value) { switch ($key) { - case 'applyEndPatternLast': - if (!is_bool($value) || (!is_int($value) && ($value !== 0 && $value !== 1))) { - throw new Exception(Exception::JSON_INVALID_TYPE, 'Boolean, 0, or 1', 'applyEndPatternLast', gettype($value), $filename); - } - - $value = (bool)$value; case 'name': case 'contentName': $p[$key] = $value; $modified = true; break; - case 'begin': - case 'end': case 'match': - $value = str_replace('/', '\/', $value); - $p[$key] = "/$value/"; + //$value = str_replace('/', '\/', $value); + $p['match'] = $value;//"/$value/"; $modified = true; break; case 'captures': - case 'beginCaptures': - case 'endCaptures': if (!is_array($value)) { throw new Exception(Exception::JSON_INVALID_TYPE, 'Array', $key, gettype($value), $filename); } @@ -240,7 +282,7 @@ class Grammar { return ($modified) ? new Pattern(...$p) : null; } - protected function parseJSONPatternList(array $list, string $filename): Pattern|PatternList|null { + protected function parseJSONPatternList(array $list, string $filename): ?PatternList { $result = []; foreach ($list as $pattern) { $p = $this->parseJSONPattern($pattern, $filename); diff --git a/lib/Grammar/Pattern.php b/lib/Grammar/Pattern.php index 976b8d6..c4434d4 100644 --- a/lib/Grammar/Pattern.php +++ b/lib/Grammar/Pattern.php @@ -11,29 +11,22 @@ use dW\Lit\Grammar; /** Contains patterns responsible for matching a portion of the document */ class Pattern extends Rule { protected bool $_applyEndPatternLast = false; - protected ?string $_begin; - protected ?CaptureList $_beginCaptures; protected ?CaptureList $_captures; protected ?string $_contentName; - protected ?string $_end; - protected ?CaptureList $_endCaptures; + protected bool $_endPattern = false; protected ?string $_match; protected ?string $_name; protected \WeakReference $_ownerGrammar; protected ?PatternList $_patterns; - public function __construct(Grammar $ownerGrammar, ?string $name = null, ?string $contentName = null, ?string $begin = null, ?string $end = null, ?string $match = null, ?PatternList $patterns = null, ?CaptureList $captures = null, ?CaptureList $beginCaptures = null, ?CaptureList $endCaptures = null, bool $applyEndPatternLast = false) { + public function __construct(Grammar $ownerGrammar, ?string $name = null, ?string $contentName = null, ?string $match = null, ?PatternList $patterns = null, ?CaptureList $captures = null, bool $endPattern = false) { $this->_name = $name; $this->_contentName = $contentName; - $this->_begin = $begin; - $this->_end = $end; $this->_match = $match; $this->_patterns = $patterns; $this->_captures = $captures; - $this->_beginCaptures = $beginCaptures; - $this->_endCaptures = $endCaptures; - $this->_applyEndPatternLast = $applyEndPatternLast; + $this->_endPattern = $endPattern; $this->_ownerGrammar = ($ownerGrammar === null) ? null : \WeakReference::create($ownerGrammar); } } \ No newline at end of file diff --git a/lib/Highlight.php b/lib/Highlight.php index 5ff651d..eb3e56f 100644 --- a/lib/Highlight.php +++ b/lib/Highlight.php @@ -9,26 +9,30 @@ use dW\Lit\Grammar\Exception; class Highlight { - public static function withFile(string $filepath, string $scopeName) { - return self::highlight(Data::fileToGenerator($filepath), $scopeName); + public static function withFile(string $filepath, string $scopeName, string $encoding = 'UTF-8') { + return self::highlight(Data::fileToGenerator($filepath, $encoding), $scopeName, $encoding); } - public static function withString(string $string, string $scopeName) { - return self::highlight(Data::stringToGenerator($string), $scopeName); + public static function withString(string $string, string $scopeName, string $encoding = 'UTF-8') { + return self::highlight(Data::stringToGenerator($string, $encoding), $scopeName, $encoding); } - protected static function highlight(\Generator $data, string $scopeName) { + protected static function highlight(\Generator $data, string $scopeName, string $encoding) { $grammar = GrammarRegistry::get($scopeName); if ($grammar === false) { throw new Exception(Exception::GRAMMAR_MISSING, $scopeName); } - $tokenizer = new Tokenizer($data, $grammar); + mb_regex_encoding('UTF-32'); + + $tokenizer = new Tokenizer($data, $grammar, $encoding); $tokenList = $tokenizer->tokenize(); foreach ($tokenList as $lineNumber => $line) { echo "$lineNumber: $line\n"; } + + mb_regex_encoding(); } } \ No newline at end of file diff --git a/lib/Tokenizer.php b/lib/Tokenizer.php index 2edd0c8..c80da82 100644 --- a/lib/Tokenizer.php +++ b/lib/Tokenizer.php @@ -14,13 +14,15 @@ use dW\Lit\Grammar\{ class Tokenizer { protected \Generator $data; + protected string $encoding; protected Grammar $grammar; protected array $ruleStack; protected array $scopeStack; - - public function __construct(\Generator $data, Grammar $grammar) { + + public function __construct(\Generator $data, Grammar $grammar, string $encoding) { $this->data = $data; + $this->encoding = $encoding; $this->grammar = $grammar; $this->ruleStack = [ $this->grammar ]; $this->scopeStack = [ $this->grammar->scopeName ]; @@ -33,9 +35,9 @@ class Tokenizer { public function tokenize(): \Generator { $appendNewLine = true; - foreach ($this->data as $lineNumber => $inputLine) { - $line = $inputLine; + yield $lineNumber => $this->_tokenize($inputLine); + /*$line = $inputLine; $lineWithNewLine = ($appendNewLine) ? "$line\n" : $line; $initialStackRuleLength = count($this->ruleStack); $position = 0; @@ -47,20 +49,46 @@ class Tokenizer { if ($position > mb_strlen($line)) { break; } - } + }*/ } } - protected function getMatch(string $regex, string $line): ?array { - if (preg_match($regex, $line, $match, PREG_OFFSET_CAPTURE) !== 1) { + protected function getMatch(string $regex, string $line, int $offset = 0): ?array { + // Using mbstring's regular expressions because it truly supports multibyte + // strings but also because the original implementation used Oniguruma. + mb_ereg_search_init($line, mb_convert_encoding($regex, 'UTF-32')); + + if ($offset !== 0) { + // UTF-32 uses 4 bytes for every character; multiply by 4 to convert from + // character offset to byte offset. + mb_ereg_search_setpos($offset * 4); + } + + $pos = mb_ereg_search_pos(); + if ($pos === false) { return null; } + // UTF-32 uses 4 bytes for every character; divide by 4 to get character + // offsets. + $length = $pos[1] / 4; + $pos = [ + 'start' => $pos[0] / 4, + ]; + $pos['end'] = $pos['start'] + $length; + + $match = mb_ereg_search_getregs(); + // Convert the matches back to the original encoding. + foreach ($match as &$m) { + $m = mb_convert_encoding($m, $this->encoding, 'UTF-32'); + } + + $match['offset'] = $pos; return $match; } - protected function tokenizeLine(string $inputLine): array { + protected function _tokenize(string $inputLine, int $offset = 0): array { $currentRules = end($this->ruleStack)->patterns->getIterator(); $currentRulesCount = count($currentRules); $results = []; @@ -70,31 +98,8 @@ class Tokenizer { while (true) { $rule = $currentRules[$i]; if ($rule instanceof Pattern) { - $matchMode = null; - $regex = null; - if ($rule->match !== null) { - $regex = $rule->match; - $matchMode = self::MATCH_MODE_SINGLE; - } elseif ($rule->begin !== null) { - $regex = $rule->begin; - $matchMode = self::MATCH_MODE_BEGINEND; - } - - if ($matchMode !== null && $match = $this->getMatch($regex, $line)) { - $scopeStack = $this->scopeStack; - if ($rule->name !== null) { - $scopeStack[] = $rule->name; - } - if ($rule->contentName !== null) { - $scopeStack[] = $rule->contentName; - } - - die(var_export($rule)); - - if ($matchMode === self::MATCH_MODE_BEGINEND) { - $this->ruleStack[] = $rule; - $this->scopeStack[] = $scopeStack; - } + if ($match = $this->getMatch($rule->match, $line, $offset)) { + $offset = $match['offset']['end']; } } elseif ($rule instanceof Reference && $obj = $rule->get()) { if ($obj instanceof PatternList) { @@ -111,5 +116,7 @@ class Tokenizer { break; } } + + return $inputLine; } } \ No newline at end of file