Browse Source

Many changes

• Lines are now converted to UTF-32 while tokenizing so that byte 
offsets may be cleanly converted to character offsets
• When grammars are parsed into Grammar objects, begin and end matches are
now converted to regular matches: the end match is added to the pattern's
pattern list, which simplifies tokenization.
• Highlight::withFile and Highlight::withString now accept an encoding 
parameter which defaults to UTF-8.
main
Dustin Wilson 3 years ago
parent
commit
5a3322a0cb
  1. 3
      composer.json
  2. 7
      composer.lock
  3. 12
      lib/Data.php
  4. 80
      lib/Grammar.php
  5. 13
      lib/Grammar/Pattern.php
  6. 16
      lib/Highlight.php
  7. 73
      lib/Tokenizer.php

3
composer.json

@ -12,9 +12,10 @@
],
"require": {
"php": "^7.4 || ^8.0",
"ext-dom": "*",
"ext-intl": "*",
"ext-json": "*",
"ext-dom": "*",
"ext-mbstring": "*",
"docopt/docopt": "^1.0"
},
"autoload": {

7
composer.lock

@ -4,7 +4,7 @@
"Read more about it at https://getcomposer.org/doc/01-basic-usage.md#installing-dependencies",
"This file is @generated automatically"
],
"content-hash": "eb84c086d7c773cf5f8ad5ad2b9e546e",
"content-hash": "7f3c88aa5023ebb6ebad3e513973d927",
"packages": [
{
"name": "docopt/docopt",
@ -65,10 +65,11 @@
"prefer-lowest": false,
"platform": {
"php": "^7.4 || ^8.0",
"ext-dom": "*",
"ext-intl": "*",
"ext-json": "*",
"ext-dom": "*"
"ext-mbstring": "*"
},
"platform-dev": [],
"plugin-api-version": "2.1.0"
"plugin-api-version": "2.0.0"
}

12
lib/Data.php

@ -8,22 +8,26 @@ namespace dW\Lit;
class Data {
public static function fileToGenerator(string $filepath): \Generator {
public static function fileToGenerator(string $filepath, string $encoding = 'UTF-8'): \Generator {
$lineNumber = 0;
$fp = fopen($filepath, 'r');
try {
while ($line = fgets($fp)) {
yield ++$lineNumber => $line;
// Lines are converted to UTF-32 because everything in UTF-32 is 4 bytes, making
// converting byte offsets to character offsets as easy as dividing by 4.
yield ++$lineNumber => mb_convert_encoding($line, 'UTF-32', $encoding);
}
} finally {
fclose($fp);
}
}
public static function stringToGenerator(string $string): \Generator {
public static function stringToGenerator(string $string, string $encoding = 'UTF-8'): \Generator {
$string = explode("\n", $string);
foreach ($string as $lineNumber => $line) {
yield $lineNumber + 1 => $line;
// Lines are converted to UTF-32 because everything in UTF-32 is 4 bytes, making
// converting byte offsets to character offsets as easy as dividing by 4.
yield $lineNumber + 1 => mb_convert_encoding($line, 'UTF-32', $encoding);
}
}
}

80
lib/Grammar.php

@ -148,7 +148,7 @@ class Grammar {
}
protected function parseJSONPattern(array $pattern, string $filename): Pattern|Reference|\WeakReference|null {
protected function parseJSONPattern(array $pattern, string $filename): Pattern|Reference|null {
if (isset($pattern['include'])) {
if ($pattern['include'][0] === '#') {
return new RepositoryReference(substr($pattern['include'], 1), $this);
@ -165,40 +165,82 @@ class Grammar {
'ownerGrammar' => $this,
'name' => null,
'contentName' => null,
'begin' => null,
'end' => null,
'match' => null,
'patterns' => null,
'captures' => null,
'beginCaptures' => null,
'endCaptures' => null,
'applyEndPatternLast' => false
'endPattern' => false
];
$modified = false;
$applyEndPatternLast = false;
if (isset($pattern['applyEndPatternLast'])) {
$applyEndPatternLast = $pattern['applyEndPatternLast'];
if (!is_bool($applyEndPatternLast) || (!is_int($applyEndPatternLast) && ($applyEndPatternLast !== 0 && $applyEndPatternLast !== 1))) {
throw new Exception(Exception::JSON_INVALID_TYPE, 'Boolean, 0, or 1', 'applyEndPatternLast', gettype($applyEndPatternLast), $filename);
}
$applyEndPatternLast = (bool)$applyEndPatternLast;
}
// Begin and end matches are handled in this implementation by parsing begin
// matches as regular matches and appending the end match as a pattern to the
// end of the pattern's patterns.
if (isset($pattern['begin'])) {
if (!isset($pattern['end'])) {
throw new Exception(Exception::JSON_MISSING_PROPERTY, $filename, 'end');
}
$begin = $pattern['begin'];//str_replace('/', '\/', $pattern['begin']);
$p['match'] = $begin;//"/$begin/";
$modified = true;
if (isset($pattern['beginCaptures'])) {
$pattern['captures'] = $pattern['beginCaptures'];
} elseif (isset($pattern['captures'])) {
$pattern['captures'] = $pattern['captures'];
}
$endCaptures = null;
if (isset($pattern['endCaptures'])) {
$endCaptures = $pattern['endCaptures'];
} elseif (isset($pattern['captures'])) {
$endCaptures = $pattern['captures'];
}
$endPattern = [
'match' => $pattern['end'],//"/" . str_replace('/', '\/', $pattern['end']) . "/",
'endPattern' => true
];
if ($endCaptures !== null) {
$endPattern['captures'] = $endCaptures;
}
if (isset($pattern['patterns'])) {
if ($applyEndPatternLast) {
$pattern['patterns'][] = $endPattern;
} else {
array_unshift($pattern['patterns'], $endPattern);
}
} else {
$pattern['patterns'] = [ $endPattern ];
}
}
foreach ($pattern as $key => $value) {
switch ($key) {
case 'applyEndPatternLast':
if (!is_bool($value) || (!is_int($value) && ($value !== 0 && $value !== 1))) {
throw new Exception(Exception::JSON_INVALID_TYPE, 'Boolean, 0, or 1', 'applyEndPatternLast', gettype($value), $filename);
}
$value = (bool)$value;
case 'name':
case 'contentName':
$p[$key] = $value;
$modified = true;
break;
case 'begin':
case 'end':
case 'match':
$value = str_replace('/', '\/', $value);
$p[$key] = "/$value/";
//$value = str_replace('/', '\/', $value);
$p['match'] = $value;//"/$value/";
$modified = true;
break;
case 'captures':
case 'beginCaptures':
case 'endCaptures':
if (!is_array($value)) {
throw new Exception(Exception::JSON_INVALID_TYPE, 'Array', $key, gettype($value), $filename);
}
@ -240,7 +282,7 @@ class Grammar {
return ($modified) ? new Pattern(...$p) : null;
}
protected function parseJSONPatternList(array $list, string $filename): Pattern|PatternList|null {
protected function parseJSONPatternList(array $list, string $filename): ?PatternList {
$result = [];
foreach ($list as $pattern) {
$p = $this->parseJSONPattern($pattern, $filename);

13
lib/Grammar/Pattern.php

@ -11,29 +11,22 @@ use dW\Lit\Grammar;
/** Contains patterns responsible for matching a portion of the document */
class Pattern extends Rule {
protected bool $_applyEndPatternLast = false;
protected ?string $_begin;
protected ?CaptureList $_beginCaptures;
protected ?CaptureList $_captures;
protected ?string $_contentName;
protected ?string $_end;
protected ?CaptureList $_endCaptures;
protected bool $_endPattern = false;
protected ?string $_match;
protected ?string $_name;
protected \WeakReference $_ownerGrammar;
protected ?PatternList $_patterns;
public function __construct(Grammar $ownerGrammar, ?string $name = null, ?string $contentName = null, ?string $begin = null, ?string $end = null, ?string $match = null, ?PatternList $patterns = null, ?CaptureList $captures = null, ?CaptureList $beginCaptures = null, ?CaptureList $endCaptures = null, bool $applyEndPatternLast = false) {
public function __construct(Grammar $ownerGrammar, ?string $name = null, ?string $contentName = null, ?string $match = null, ?PatternList $patterns = null, ?CaptureList $captures = null, bool $endPattern = false) {
$this->_name = $name;
$this->_contentName = $contentName;
$this->_begin = $begin;
$this->_end = $end;
$this->_match = $match;
$this->_patterns = $patterns;
$this->_captures = $captures;
$this->_beginCaptures = $beginCaptures;
$this->_endCaptures = $endCaptures;
$this->_applyEndPatternLast = $applyEndPatternLast;
$this->_endPattern = $endPattern;
$this->_ownerGrammar = ($ownerGrammar === null) ? null : \WeakReference::create($ownerGrammar);
}
}

16
lib/Highlight.php

@ -9,26 +9,30 @@ use dW\Lit\Grammar\Exception;
class Highlight {
public static function withFile(string $filepath, string $scopeName) {
return self::highlight(Data::fileToGenerator($filepath), $scopeName);
public static function withFile(string $filepath, string $scopeName, string $encoding = 'UTF-8') {
return self::highlight(Data::fileToGenerator($filepath, $encoding), $scopeName, $encoding);
}
public static function withString(string $string, string $scopeName) {
return self::highlight(Data::stringToGenerator($string), $scopeName);
public static function withString(string $string, string $scopeName, string $encoding = 'UTF-8') {
return self::highlight(Data::stringToGenerator($string, $encoding), $scopeName, $encoding);
}
protected static function highlight(\Generator $data, string $scopeName) {
protected static function highlight(\Generator $data, string $scopeName, string $encoding) {
$grammar = GrammarRegistry::get($scopeName);
if ($grammar === false) {
throw new Exception(Exception::GRAMMAR_MISSING, $scopeName);
}
$tokenizer = new Tokenizer($data, $grammar);
mb_regex_encoding('UTF-32');
$tokenizer = new Tokenizer($data, $grammar, $encoding);
$tokenList = $tokenizer->tokenize();
foreach ($tokenList as $lineNumber => $line) {
echo "$lineNumber: $line\n";
}
mb_regex_encoding();
}
}

73
lib/Tokenizer.php

@ -14,13 +14,15 @@ use dW\Lit\Grammar\{
class Tokenizer {
protected \Generator $data;
protected string $encoding;
protected Grammar $grammar;
protected array $ruleStack;
protected array $scopeStack;
public function __construct(\Generator $data, Grammar $grammar) {
public function __construct(\Generator $data, Grammar $grammar, string $encoding) {
$this->data = $data;
$this->encoding = $encoding;
$this->grammar = $grammar;
$this->ruleStack = [ $this->grammar ];
$this->scopeStack = [ $this->grammar->scopeName ];
@ -33,9 +35,9 @@ class Tokenizer {
public function tokenize(): \Generator {
$appendNewLine = true;
foreach ($this->data as $lineNumber => $inputLine) {
$line = $inputLine;
yield $lineNumber => $this->_tokenize($inputLine);
/*$line = $inputLine;
$lineWithNewLine = ($appendNewLine) ? "$line\n" : $line;
$initialStackRuleLength = count($this->ruleStack);
$position = 0;
@ -47,20 +49,46 @@ class Tokenizer {
if ($position > mb_strlen($line)) {
break;
}
}
}*/
}
}
protected function getMatch(string $regex, string $line): ?array {
if (preg_match($regex, $line, $match, PREG_OFFSET_CAPTURE) !== 1) {
protected function getMatch(string $regex, string $line, int $offset = 0): ?array {
// Using mbstring's regular expressions because it truly supports multibyte
// strings but also because the original implementation used Oniguruma.
mb_ereg_search_init($line, mb_convert_encoding($regex, 'UTF-32'));
if ($offset !== 0) {
// UTF-32 uses 4 bytes for every character; multiply by 4 to convert from
// character offset to byte offset.
mb_ereg_search_setpos($offset * 4);
}
$pos = mb_ereg_search_pos();
if ($pos === false) {
return null;
}
// UTF-32 uses 4 bytes for every character; divide by 4 to get character
// offsets.
$length = $pos[1] / 4;
$pos = [
'start' => $pos[0] / 4,
];
$pos['end'] = $pos['start'] + $length;
$match = mb_ereg_search_getregs();
// Convert the matches back to the original encoding.
foreach ($match as &$m) {
$m = mb_convert_encoding($m, $this->encoding, 'UTF-32');
}
$match['offset'] = $pos;
return $match;
}
protected function tokenizeLine(string $inputLine): array {
protected function _tokenize(string $inputLine, int $offset = 0): array {
$currentRules = end($this->ruleStack)->patterns->getIterator();
$currentRulesCount = count($currentRules);
$results = [];
@ -70,31 +98,8 @@ class Tokenizer {
while (true) {
$rule = $currentRules[$i];
if ($rule instanceof Pattern) {
$matchMode = null;
$regex = null;
if ($rule->match !== null) {
$regex = $rule->match;
$matchMode = self::MATCH_MODE_SINGLE;
} elseif ($rule->begin !== null) {
$regex = $rule->begin;
$matchMode = self::MATCH_MODE_BEGINEND;
}
if ($matchMode !== null && $match = $this->getMatch($regex, $line)) {
$scopeStack = $this->scopeStack;
if ($rule->name !== null) {
$scopeStack[] = $rule->name;
}
if ($rule->contentName !== null) {
$scopeStack[] = $rule->contentName;
}
die(var_export($rule));
if ($matchMode === self::MATCH_MODE_BEGINEND) {
$this->ruleStack[] = $rule;
$this->scopeStack[] = $scopeStack;
}
if ($match = $this->getMatch($rule->match, $line, $offset)) {
$offset = $match['offset']['end'];
}
} elseif ($rule instanceof Reference && $obj = $rule->get()) {
if ($obj instanceof PatternList) {
@ -111,5 +116,7 @@ class Tokenizer {
break;
}
}
return $inputLine;
}
}
Loading…
Cancel
Save