Browse Source

Many changes

• Lines are now converted to UTF-32 while tokenizing so that byte 
offsets may be cleanly converted to character offsets (every UTF-32 character is exactly 4 bytes)
• When grammars are parsed into Grammar objects, begin and end 
matches are now converted to regular matches: the end match is added to the 
pattern's pattern list, which simplifies tokenization.
• Highlight::withFile and Highlight::withString now accept an encoding 
parameter, which defaults to UTF-8.
main
Dustin Wilson 3 years ago
parent
commit
5a3322a0cb
  1. 3
      composer.json
  2. 7
      composer.lock
  3. 12
      lib/Data.php
  4. 80
      lib/Grammar.php
  5. 13
      lib/Grammar/Pattern.php
  6. 16
      lib/Highlight.php
  7. 73
      lib/Tokenizer.php

3
composer.json

@ -12,9 +12,10 @@
], ],
"require": { "require": {
"php": "^7.4 || ^8.0", "php": "^7.4 || ^8.0",
"ext-dom": "*",
"ext-intl": "*", "ext-intl": "*",
"ext-json": "*", "ext-json": "*",
"ext-dom": "*", "ext-mbstring": "*",
"docopt/docopt": "^1.0" "docopt/docopt": "^1.0"
}, },
"autoload": { "autoload": {

7
composer.lock

@ -4,7 +4,7 @@
"Read more about it at https://getcomposer.org/doc/01-basic-usage.md#installing-dependencies", "Read more about it at https://getcomposer.org/doc/01-basic-usage.md#installing-dependencies",
"This file is @generated automatically" "This file is @generated automatically"
], ],
"content-hash": "eb84c086d7c773cf5f8ad5ad2b9e546e", "content-hash": "7f3c88aa5023ebb6ebad3e513973d927",
"packages": [ "packages": [
{ {
"name": "docopt/docopt", "name": "docopt/docopt",
@ -65,10 +65,11 @@
"prefer-lowest": false, "prefer-lowest": false,
"platform": { "platform": {
"php": "^7.4 || ^8.0", "php": "^7.4 || ^8.0",
"ext-dom": "*",
"ext-intl": "*", "ext-intl": "*",
"ext-json": "*", "ext-json": "*",
"ext-dom": "*" "ext-mbstring": "*"
}, },
"platform-dev": [], "platform-dev": [],
"plugin-api-version": "2.1.0" "plugin-api-version": "2.0.0"
} }

12
lib/Data.php

@ -8,22 +8,26 @@ namespace dW\Lit;
class Data { class Data {
public static function fileToGenerator(string $filepath): \Generator { public static function fileToGenerator(string $filepath, string $encoding = 'UTF-8'): \Generator {
$lineNumber = 0; $lineNumber = 0;
$fp = fopen($filepath, 'r'); $fp = fopen($filepath, 'r');
try { try {
while ($line = fgets($fp)) { while ($line = fgets($fp)) {
yield ++$lineNumber => $line; // Lines are converted to UTF-32 because everything in UTF-32 is 4 bytes, making
// converting byte offsets to character offsets as easy as dividing by 4.
yield ++$lineNumber => mb_convert_encoding($line, 'UTF-32', $encoding);
} }
} finally { } finally {
fclose($fp); fclose($fp);
} }
} }
public static function stringToGenerator(string $string): \Generator { public static function stringToGenerator(string $string, string $encoding = 'UTF-8'): \Generator {
$string = explode("\n", $string); $string = explode("\n", $string);
foreach ($string as $lineNumber => $line) { foreach ($string as $lineNumber => $line) {
yield $lineNumber + 1 => $line; // Lines are converted to UTF-32 because everything in UTF-32 is 4 bytes, making
// converting byte offsets to character offsets as easy as dividing by 4.
yield $lineNumber + 1 => mb_convert_encoding($line, 'UTF-32', $encoding);
} }
} }
} }

80
lib/Grammar.php

@ -148,7 +148,7 @@ class Grammar {
} }
protected function parseJSONPattern(array $pattern, string $filename): Pattern|Reference|\WeakReference|null { protected function parseJSONPattern(array $pattern, string $filename): Pattern|Reference|null {
if (isset($pattern['include'])) { if (isset($pattern['include'])) {
if ($pattern['include'][0] === '#') { if ($pattern['include'][0] === '#') {
return new RepositoryReference(substr($pattern['include'], 1), $this); return new RepositoryReference(substr($pattern['include'], 1), $this);
@ -165,40 +165,82 @@ class Grammar {
'ownerGrammar' => $this, 'ownerGrammar' => $this,
'name' => null, 'name' => null,
'contentName' => null, 'contentName' => null,
'begin' => null,
'end' => null,
'match' => null, 'match' => null,
'patterns' => null, 'patterns' => null,
'captures' => null, 'captures' => null,
'beginCaptures' => null, 'endPattern' => false
'endCaptures' => null,
'applyEndPatternLast' => false
]; ];
$modified = false; $modified = false;
$applyEndPatternLast = false;
if (isset($pattern['applyEndPatternLast'])) {
$applyEndPatternLast = $pattern['applyEndPatternLast'];
if (!is_bool($applyEndPatternLast) || (!is_int($applyEndPatternLast) && ($applyEndPatternLast !== 0 && $applyEndPatternLast !== 1))) {
throw new Exception(Exception::JSON_INVALID_TYPE, 'Boolean, 0, or 1', 'applyEndPatternLast', gettype($applyEndPatternLast), $filename);
}
$applyEndPatternLast = (bool)$applyEndPatternLast;
}
// Begin and end matches are handled in this implementation by parsing begin
// matches as regular matches and appending the end match as a pattern to the
// end of the pattern's patterns.
if (isset($pattern['begin'])) {
if (!isset($pattern['end'])) {
throw new Exception(Exception::JSON_MISSING_PROPERTY, $filename, 'end');
}
$begin = $pattern['begin'];//str_replace('/', '\/', $pattern['begin']);
$p['match'] = $begin;//"/$begin/";
$modified = true;
if (isset($pattern['beginCaptures'])) {
$pattern['captures'] = $pattern['beginCaptures'];
} elseif (isset($pattern['captures'])) {
$pattern['captures'] = $pattern['captures'];
}
$endCaptures = null;
if (isset($pattern['endCaptures'])) {
$endCaptures = $pattern['endCaptures'];
} elseif (isset($pattern['captures'])) {
$endCaptures = $pattern['captures'];
}
$endPattern = [
'match' => $pattern['end'],//"/" . str_replace('/', '\/', $pattern['end']) . "/",
'endPattern' => true
];
if ($endCaptures !== null) {
$endPattern['captures'] = $endCaptures;
}
if (isset($pattern['patterns'])) {
if ($applyEndPatternLast) {
$pattern['patterns'][] = $endPattern;
} else {
array_unshift($pattern['patterns'], $endPattern);
}
} else {
$pattern['patterns'] = [ $endPattern ];
}
}
foreach ($pattern as $key => $value) { foreach ($pattern as $key => $value) {
switch ($key) { switch ($key) {
case 'applyEndPatternLast':
if (!is_bool($value) || (!is_int($value) && ($value !== 0 && $value !== 1))) {
throw new Exception(Exception::JSON_INVALID_TYPE, 'Boolean, 0, or 1', 'applyEndPatternLast', gettype($value), $filename);
}
$value = (bool)$value;
case 'name': case 'name':
case 'contentName': case 'contentName':
$p[$key] = $value; $p[$key] = $value;
$modified = true; $modified = true;
break; break;
case 'begin':
case 'end':
case 'match': case 'match':
$value = str_replace('/', '\/', $value); //$value = str_replace('/', '\/', $value);
$p[$key] = "/$value/"; $p['match'] = $value;//"/$value/";
$modified = true; $modified = true;
break; break;
case 'captures': case 'captures':
case 'beginCaptures':
case 'endCaptures':
if (!is_array($value)) { if (!is_array($value)) {
throw new Exception(Exception::JSON_INVALID_TYPE, 'Array', $key, gettype($value), $filename); throw new Exception(Exception::JSON_INVALID_TYPE, 'Array', $key, gettype($value), $filename);
} }
@ -240,7 +282,7 @@ class Grammar {
return ($modified) ? new Pattern(...$p) : null; return ($modified) ? new Pattern(...$p) : null;
} }
protected function parseJSONPatternList(array $list, string $filename): Pattern|PatternList|null { protected function parseJSONPatternList(array $list, string $filename): ?PatternList {
$result = []; $result = [];
foreach ($list as $pattern) { foreach ($list as $pattern) {
$p = $this->parseJSONPattern($pattern, $filename); $p = $this->parseJSONPattern($pattern, $filename);

13
lib/Grammar/Pattern.php

@ -11,29 +11,22 @@ use dW\Lit\Grammar;
/** Contains patterns responsible for matching a portion of the document */ /** Contains patterns responsible for matching a portion of the document */
class Pattern extends Rule { class Pattern extends Rule {
protected bool $_applyEndPatternLast = false; protected bool $_applyEndPatternLast = false;
protected ?string $_begin;
protected ?CaptureList $_beginCaptures;
protected ?CaptureList $_captures; protected ?CaptureList $_captures;
protected ?string $_contentName; protected ?string $_contentName;
protected ?string $_end; protected bool $_endPattern = false;
protected ?CaptureList $_endCaptures;
protected ?string $_match; protected ?string $_match;
protected ?string $_name; protected ?string $_name;
protected \WeakReference $_ownerGrammar; protected \WeakReference $_ownerGrammar;
protected ?PatternList $_patterns; protected ?PatternList $_patterns;
public function __construct(Grammar $ownerGrammar, ?string $name = null, ?string $contentName = null, ?string $begin = null, ?string $end = null, ?string $match = null, ?PatternList $patterns = null, ?CaptureList $captures = null, ?CaptureList $beginCaptures = null, ?CaptureList $endCaptures = null, bool $applyEndPatternLast = false) { public function __construct(Grammar $ownerGrammar, ?string $name = null, ?string $contentName = null, ?string $match = null, ?PatternList $patterns = null, ?CaptureList $captures = null, bool $endPattern = false) {
$this->_name = $name; $this->_name = $name;
$this->_contentName = $contentName; $this->_contentName = $contentName;
$this->_begin = $begin;
$this->_end = $end;
$this->_match = $match; $this->_match = $match;
$this->_patterns = $patterns; $this->_patterns = $patterns;
$this->_captures = $captures; $this->_captures = $captures;
$this->_beginCaptures = $beginCaptures; $this->_endPattern = $endPattern;
$this->_endCaptures = $endCaptures;
$this->_applyEndPatternLast = $applyEndPatternLast;
$this->_ownerGrammar = ($ownerGrammar === null) ? null : \WeakReference::create($ownerGrammar); $this->_ownerGrammar = ($ownerGrammar === null) ? null : \WeakReference::create($ownerGrammar);
} }
} }

16
lib/Highlight.php

@ -9,26 +9,30 @@ use dW\Lit\Grammar\Exception;
class Highlight { class Highlight {
public static function withFile(string $filepath, string $scopeName) { public static function withFile(string $filepath, string $scopeName, string $encoding = 'UTF-8') {
return self::highlight(Data::fileToGenerator($filepath), $scopeName); return self::highlight(Data::fileToGenerator($filepath, $encoding), $scopeName, $encoding);
} }
public static function withString(string $string, string $scopeName) { public static function withString(string $string, string $scopeName, string $encoding = 'UTF-8') {
return self::highlight(Data::stringToGenerator($string), $scopeName); return self::highlight(Data::stringToGenerator($string, $encoding), $scopeName, $encoding);
} }
protected static function highlight(\Generator $data, string $scopeName) { protected static function highlight(\Generator $data, string $scopeName, string $encoding) {
$grammar = GrammarRegistry::get($scopeName); $grammar = GrammarRegistry::get($scopeName);
if ($grammar === false) { if ($grammar === false) {
throw new Exception(Exception::GRAMMAR_MISSING, $scopeName); throw new Exception(Exception::GRAMMAR_MISSING, $scopeName);
} }
$tokenizer = new Tokenizer($data, $grammar); mb_regex_encoding('UTF-32');
$tokenizer = new Tokenizer($data, $grammar, $encoding);
$tokenList = $tokenizer->tokenize(); $tokenList = $tokenizer->tokenize();
foreach ($tokenList as $lineNumber => $line) { foreach ($tokenList as $lineNumber => $line) {
echo "$lineNumber: $line\n"; echo "$lineNumber: $line\n";
} }
mb_regex_encoding();
} }
} }

73
lib/Tokenizer.php

@ -14,13 +14,15 @@ use dW\Lit\Grammar\{
class Tokenizer { class Tokenizer {
protected \Generator $data; protected \Generator $data;
protected string $encoding;
protected Grammar $grammar; protected Grammar $grammar;
protected array $ruleStack; protected array $ruleStack;
protected array $scopeStack; protected array $scopeStack;
public function __construct(\Generator $data, Grammar $grammar) {
public function __construct(\Generator $data, Grammar $grammar, string $encoding) {
$this->data = $data; $this->data = $data;
$this->encoding = $encoding;
$this->grammar = $grammar; $this->grammar = $grammar;
$this->ruleStack = [ $this->grammar ]; $this->ruleStack = [ $this->grammar ];
$this->scopeStack = [ $this->grammar->scopeName ]; $this->scopeStack = [ $this->grammar->scopeName ];
@ -33,9 +35,9 @@ class Tokenizer {
public function tokenize(): \Generator { public function tokenize(): \Generator {
$appendNewLine = true; $appendNewLine = true;
foreach ($this->data as $lineNumber => $inputLine) { foreach ($this->data as $lineNumber => $inputLine) {
$line = $inputLine; yield $lineNumber => $this->_tokenize($inputLine);
/*$line = $inputLine;
$lineWithNewLine = ($appendNewLine) ? "$line\n" : $line; $lineWithNewLine = ($appendNewLine) ? "$line\n" : $line;
$initialStackRuleLength = count($this->ruleStack); $initialStackRuleLength = count($this->ruleStack);
$position = 0; $position = 0;
@ -47,20 +49,46 @@ class Tokenizer {
if ($position > mb_strlen($line)) { if ($position > mb_strlen($line)) {
break; break;
} }
} }*/
} }
} }
protected function getMatch(string $regex, string $line): ?array { protected function getMatch(string $regex, string $line, int $offset = 0): ?array {
if (preg_match($regex, $line, $match, PREG_OFFSET_CAPTURE) !== 1) { // Using mbstring's regular expressions because it truly supports multibyte
// strings but also because the original implementation used Oniguruma.
mb_ereg_search_init($line, mb_convert_encoding($regex, 'UTF-32'));
if ($offset !== 0) {
// UTF-32 uses 4 bytes for every character; multiply by 4 to convert from
// character offset to byte offset.
mb_ereg_search_setpos($offset * 4);
}
$pos = mb_ereg_search_pos();
if ($pos === false) {
return null; return null;
} }
// UTF-32 uses 4 bytes for every character; divide by 4 to get character
// offsets.
$length = $pos[1] / 4;
$pos = [
'start' => $pos[0] / 4,
];
$pos['end'] = $pos['start'] + $length;
$match = mb_ereg_search_getregs();
// Convert the matches back to the original encoding.
foreach ($match as &$m) {
$m = mb_convert_encoding($m, $this->encoding, 'UTF-32');
}
$match['offset'] = $pos;
return $match; return $match;
} }
protected function tokenizeLine(string $inputLine): array { protected function _tokenize(string $inputLine, int $offset = 0): array {
$currentRules = end($this->ruleStack)->patterns->getIterator(); $currentRules = end($this->ruleStack)->patterns->getIterator();
$currentRulesCount = count($currentRules); $currentRulesCount = count($currentRules);
$results = []; $results = [];
@ -70,31 +98,8 @@ class Tokenizer {
while (true) { while (true) {
$rule = $currentRules[$i]; $rule = $currentRules[$i];
if ($rule instanceof Pattern) { if ($rule instanceof Pattern) {
$matchMode = null; if ($match = $this->getMatch($rule->match, $line, $offset)) {
$regex = null; $offset = $match['offset']['end'];
if ($rule->match !== null) {
$regex = $rule->match;
$matchMode = self::MATCH_MODE_SINGLE;
} elseif ($rule->begin !== null) {
$regex = $rule->begin;
$matchMode = self::MATCH_MODE_BEGINEND;
}
if ($matchMode !== null && $match = $this->getMatch($regex, $line)) {
$scopeStack = $this->scopeStack;
if ($rule->name !== null) {
$scopeStack[] = $rule->name;
}
if ($rule->contentName !== null) {
$scopeStack[] = $rule->contentName;
}
die(var_export($rule));
if ($matchMode === self::MATCH_MODE_BEGINEND) {
$this->ruleStack[] = $rule;
$this->scopeStack[] = $scopeStack;
}
} }
} elseif ($rule instanceof Reference && $obj = $rule->get()) { } elseif ($rule instanceof Reference && $obj = $rule->get()) {
if ($obj instanceof PatternList) { if ($obj instanceof PatternList) {
@ -111,5 +116,7 @@ class Tokenizer {
break; break;
} }
} }
return $inputLine;
} }
} }
Loading…
Cancel
Save