Browse Source

Removed support for reading data from a file

• Added pattern match anchor support.
• Data is now an instantiable class that supports only string input.
• Data now has firstLine, lastLine, and lastLineBeforeFinalNewLine properties to facilitate anchoring
• Highlight now has a static toDOM method for highlighting to a DOM tree; it replaces the withFile and withString methods, which accepted different kinds of input
• Tokenizer now outputs a newline token only when the current line is not the last line
• Tokenizer now throws out pattern match regexes if their anchors are invalid for the current line.
• Tokenizer no longer mistakenly emits empty string tokens.
Dustin Wilson 3 years ago
  1. 38
  2. 14
  3. 94


@ -8,22 +8,38 @@ namespace dW\Lit;
class Data {
public static function fileToGenerator(string $filepath, string $encoding = 'UTF-8'): \Generator {
$lineNumber = 0;
$fp = fopen($filepath, 'r');
try {
while ($line = fgets($fp)) {
yield ++$lineNumber => $line;
} finally {
use FauxReadOnly;
// True if on the first line.
protected bool $_firstLine = true;
protected \Generator $generator;
// True if on the last line.
protected bool $_lastLine = false;
// Some matches check for the last line before the final newline, so this is
// true when on the line just before a trailing final newline, or when on the
// last line if the string does not end with a newline.
protected bool $_lastLineBeforeFinalNewLine = false;
public function __construct(string $data) {
$this->generator = $this->lineGenerator($data);
public function get(): \Generator {
return $this->generator;
public static function stringToGenerator(string $string, string $encoding = 'UTF-8'): \Generator {
protected function lineGenerator(string $string): \Generator {
$string = explode("\n", $string);
$lastLineIndex = count($string) - 1;
$lastLineBeforeFinalNewLineIndex = ($string[$lastLineIndex] === '') ? $lastLineIndex - 1 : $lastLineIndex;
foreach ($string as $lineNumber => $line) {
$this->_lastLine = ($lineNumber === $lastLineIndex);
$this->_lastLineBeforeFinalNewLine = ($lineNumber === $lastLineBeforeFinalNewLineIndex);
yield $lineNumber + 1 => $line;
$this->_firstLine = false;


@ -9,26 +9,22 @@ use dW\Lit\Grammar\Exception;
class Highlight {
public static function withFile(string $filepath, string $scopeName) {
return self::highlight(Data::fileToGenerator($filepath), $scopeName);
public static function withString(string $string, string $scopeName) {
return self::highlight(Data::stringToGenerator($string), $scopeName);
public static function toDOM(string $data, string $scopeName) {
self::highlight($data, $scopeName);
protected static function highlight(\Generator $data, string $scopeName) {
protected static function highlight(string $data, string $scopeName) {
$grammar = GrammarRegistry::get($scopeName);
if ($grammar === false) {
throw new Exception(Exception::GRAMMAR_MISSING, $scopeName);
$tokenizer = new Tokenizer($data, $grammar);
$tokenizer = new Tokenizer(new Data($data), $grammar);
$tokenList = $tokenizer->tokenize();
foreach ($tokenList as $lineNumber => $tokens) {
if ($lineNumber === 7) {
if ($lineNumber === 19) {
echo "\n";


@ -17,7 +17,7 @@ use dW\Lit\Scope\{
class Tokenizer {
protected \Generator $data;
protected Data $data;
protected Grammar $grammar;
protected int $offset = 0;
protected ?Pattern $activeInjection = null;
@ -26,8 +26,11 @@ class Tokenizer {
protected int $debug = 0;
protected int $debugCount = 0;
protected const SCOPE_RESOLVE_REGEX = '/\$(\d+)|\${(\d+):\/(downcase|upcase)}/S';
protected const ANCHOR_CHECK_REGEX = '/(?<!\\\)\\\([AGzZ])/S';
public function __construct(\Generator $data, Grammar $grammar) {
public function __construct(Data $data, Grammar $grammar) {
$this->data = $data;
$this->grammar = $grammar;
$this->ruleStack = [ $this->grammar ];
@ -36,7 +39,7 @@ class Tokenizer {
public function tokenize(): \Generator {
foreach ($this->data as $lineNumber => $line) {
foreach ($this->data->get() as $lineNumber => $line) {
$this->debug = $lineNumber;
$this->debugCount = 0;
$this->offset = 0;
@ -46,11 +49,19 @@ class Tokenizer {
// Output a token for everything else contained on the line including the
// newline or just a newline if there weren't any spare characters left on the
// line.
$tokens[] = new Token(
($this->offset < $lineLength) ? substr($line, $this->offset, $lineLength - $this->offset) . "\n" : "\n"
// line. If it is the last line, and there's nothing else remaining on the line
// then output no additional token.
if ($this->offset < $lineLength) {
$tokens[] = new Token(
substr($line, $this->offset, $lineLength - $this->offset) . ((!$this->data->lastLine) ? "\n" : '')
} elseif (!$this->data->lastLine) {
$tokens[] = new Token(
@ -60,7 +71,7 @@ class Tokenizer {
protected function resolveScopeName(string $scopeName, array $match): string {
return preg_replace_callback('/\$(\d+)|\${(\d+):\/(downcase|upcase)}/', function($m) use ($match) {
return preg_replace_callback(self::SCOPE_RESOLVE_REGEX, function($m) use($match) {
$replacement = $match[(int)$m[1]][0] ?? $m[1];
$command = $m[2] ?? null;
switch ($command) {
@ -101,25 +112,42 @@ class Tokenizer {
while (true) {
$rule = $currentRules[$i];
// If the rule is a Pattern and matches the line at the offset then...
if ($rule instanceof Pattern && preg_match($rule->match, $line, $match, PREG_OFFSET_CAPTURE, $this->offset)) {
// If the match's offset is the same as the current offset then it is the
// closest match. There's no need to iterate anymore through the patterns.
if ($match[0][1] === $this->offset) {
$closestMatch = [
'match' => $match,
'pattern' => $rule
break 2;
// If the rule is a Pattern
if ($rule instanceof Pattern) {
// Throw out pattern regexes with anchors that cannot match the current line.
if (preg_match(self::ANCHOR_CHECK_REGEX, $rule->match, $validRegexMatch) === 1) {
if (
// \A anchors match the beginning of the whole string, not just this line
($validRegexMatch[1] === 'A' && !$this->data->firstLine) ||
// \z anchors match the end of the whole string, not just this line
($validRegexMatch[1] === 'z' && !$this->data->lastLine) ||
// \Z anchors match the end of the whole string or before the final newline if
// there's a trailing newline in the string
($validRegexMatch[1] === 'Z' && !$this->data->lastLineBeforeFinalNewLine)
) {
continue 2;
// Otherwise, if the closest match is currently null or the match's offset is
// less than the closest match's offset then set the match as the closest match
// and continue looking for a closer one.
elseif ($closestMatch === null || $match[0][1] < $closestMatch['match'][0][1]) {
$closestMatch = [
'match' => $match,
'pattern' => $rule
if (preg_match($rule->match, "$line\n", $match, PREG_OFFSET_CAPTURE, $this->offset)) {
// If the match's offset is the same as the current offset then it is the
// closest match. There's no need to iterate anymore through the patterns.
if ($match[0][1] === $this->offset) {
$closestMatch = [
'match' => $match,
'pattern' => $rule
break 2;
// Otherwise, if the closest match is currently null or the match's offset is
// less than the closest match's offset then set the match as the closest match
// and continue looking for a closer one.
elseif ($closestMatch === null || $match[0][1] < $closestMatch['match'][0][1]) {
$closestMatch = [
'match' => $match,
'pattern' => $rule
// Otherwise, if the rule is a Reference then retrieve its patterns, splice into
@ -129,7 +157,7 @@ class Tokenizer {
$obj = $obj->patterns;
array_splice($currentRules, $i, 1, $obj);
array_splice($currentRules, $i, 1, ($obj instanceof Pattern) ? [ $obj ] : $obj);
$currentRulesCount = count($currentRules);
@ -143,12 +171,6 @@ class Tokenizer {
$match = $closestMatch['match'];
$pattern = $closestMatch['pattern'];
// **¡TEMPORARY!** Haven't implemented begin and end line
// anchors, so let's toss patterns with them completely for now.
//if (preg_match('/\\\(?:A|G|Z)/', $rule->match)) {
// continue;
// If the subpattern begins after the offset then create a token from the bits
// of the line in-between the last token and the one(s) about to be created.
if ($match[0][1] > $this->offset) {
@ -227,8 +249,8 @@ class Tokenizer {
// Otherwise, if the rule doesn't have captures then a token is created from the
// entire match.
else {
// entire match, but only if the matched text isn't empty.
elseif ($match[0][0] !== '') {
$tokens[] = new Token(
