Browse Source

Playing around with parsing using VSCode-style tokenization

main
Dustin Wilson 3 years ago
parent
commit
010c0ddca5
  1. 68
      lib/Scope/Data.php
  2. 119
      lib/Scope/Parser.php
  3. 40
      lib/Scope/Tokenizer.php

68
lib/Scope/Data.php

@ -9,79 +9,37 @@ namespace dW\Highlighter\Scope;
/**
 * Holds the selector input together with a read cursor and hands pieces of it
 * to the parser.
 *
 * NOTE(review): this span is a rendered diff hunk. Superseded and replacement
 * lines sit back-to-back without +/- markers (two position properties, two
 * consume()/peek() headers), so it does not parse as written.
 */
class Data {
protected string $data;
protected int $_position = 0;
protected int $position = 0;
protected int $endPosition;
public function __construct(string $data) {
$this->data = $data;
$this->endPosition = strlen($data);
// NOTE(review): preg_match() stops at the first match, so $matches[0] is a
// single string (or the [] fallback). count() on a string throws a TypeError
// on PHP 8, and [] cannot be stored in the string-typed $data. Presumably
// preg_match_all() and an array-typed property were intended; confirm.
preg_match('/[LRB]:|[A-Za-z0-9-+_\*\.]+|[\,\|\-\(\)]/', $data, $matches);
$this->data = $matches[0] ?? [];
$this->endPosition = count($this->data);
}
public function consume(int $length = 1): string|bool {
if ($this->_position === $this->endPosition) {
public function consume(): string|bool {
// Nothing left to read once the cursor has reached the end.
if ($this->position === $this->endPosition) {
return false;
}
$stop = $this->_position + $length - 1;
if ($stop > $this->endPosition) {
$stop = $this->endPosition;
}
$result = '';
while ($this->_position <= $stop) {
$result .= $this->data[$this->_position++];
}
return $result;
}
public function consumeIf(string $match): string|bool {
return $this->consumeWhile($match, 1);
}
public function consumeWhile(string $match, $limit = null): string|bool {
if ($this->_position === $this->endPosition) {
return false;
}
// strspn() measures the run of characters from $match starting at the cursor.
$length = strspn($this->data, $match, $this->_position, $limit);
if ($length === 0) {
return '';
}
return $this->consume($length);
// One-entry-at-a-time variant: returns the entry at the cursor, then advances by one.
return $this->data[$this->position++];
}
public function peek(int $length = 1): string|bool {
if ($this->_position === $this->endPosition) {
public function peek(): string|bool {
if ($this->position === $this->endPosition) {
return false;
}
$stop = $this->_position + $length - 1;
if ($stop >= $this->endPosition) {
$stop = $this->endPosition;
}
$output = '';
for ($i = $this->_position; $i <= $stop; $i++) {
$output .= $this->data[$i];
}
return $output;
// NOTE(review): reads position + 1 rather than position. When the cursor is
// on the last index this looks past the end of the data. Confirm whether the
// entry at the cursor itself was meant.
return $this->data[$this->position + 1];
}
public function unconsumeTo(int $position = 1): bool {
if ($position < 0 || $position > $this->endPosition) {
public function unconsume(): bool {
// NOTE(review): the guard runs before the decrement and only rejects negative
// values, so a cursor at 0 is moved to -1. Confirm whether `<= 0` was intended.
if ($this->position < 0) {
return false;
}
$this->_position = $position;
$this->position--;
return true;
}
// Read-only access to the cursor: exposes $_position under the name 'position';
// any other property name falls through and yields null.
public function __get(string $name) {
if ($name === 'position') {
return $this->_position;
}
}
}

119
lib/Scope/Parser.php

@ -7,113 +7,58 @@ declare(strict_types=1);
namespace dW\Highlighter\Scope;
/**
 * Static entry point that turns a scope selector string into matcher objects.
 *
 * NOTE(review): this span is a rendered diff hunk. Lines of the Tokenizer-based
 * implementation and of the Data-based replacement are interleaved without +/-
 * markers (two parse() headers, two constructors), so it does not parse as
 * written. The notes below are reviewer observations to confirm against the
 * real file.
 */
class Parser {
protected Data $data;
protected string $token;
protected Tokenizer $tokenizer;
protected static Parser $instance;
// A complete token consisting of L, R or B followed by a colon.
protected const PREFIX_REGEX = '/^[LRB]:$/S';
// A complete token made of letters, digits, '-', '+', '_' and '.'; ':' is not in the class.
protected const SCOPE_REGEX = '/^[A-Za-z0-9-+_\.]+$/S';
protected function __construct(string $selector) {
$this->tokenizer = new Tokenizer($selector);
}
public static function parse(string $selector): array {
public static function parse(string $selector): Matcher|false {
self::$instance = new self($selector);
$result = [];
// NOTE(review): the loop condition relies on truthiness, so a token equal to
// "0" ends the loop just like the false end-of-input marker. Confirm whether
// `!== false` was intended.
while (self::$instance->token = self::$instance->tokenizer->next()) {
$priority = 0;
// A two-character token ending in ':' is a priority prefix: R maps to 1, L to -1.
if (strlen(self::$instance->token) === 2 && self::$instance->token[1] === ':') {
switch (self::$instance->token[0]) {
case 'R': $priority = 1;
break;
case 'L': $priority = -1;
break;
default: die('OOK!');
}
self::$instance->token = self::$instance->tokenizer->next();
if (self::$instance->token === false) {
break;
}
}
$matcher = self::parseConjunction();
if ($matcher === false) {
$matcher = self::parseOperand();
}
$result[] = [
'matcher' => $matcher,
'priority' => $priority
];
// Only a comma continues the selector list; anything else ends parsing.
if (self::$instance->token !== ',') {
break;
}
}
return $result;
return self::parseSelector();
}
protected static function parseConjunction(): AndMatcher|false {
$matchers = [];
while ($matcher = self::parseOperand()) {
$matchers[] = $matcher;
}
// NOTE(review): only the first two collected matchers are passed on; any further ones are dropped.
return (count($matchers) > 1) ? new AndMatcher($matchers[0], $matchers[1]) : false;
protected function __construct(string $selector) {
$this->data = new Data($selector);
}
protected static function parseInnerExpression(): Matcher|false {
$matchers = [];
while ($matcher = self::parseConjunction()) {
$matchers[] = $matcher;
if (self::$instance->token === '|' || self::$instance->token === ',') {
do {
self::$instance->token = self::$instance->tokenizer->next();
} while (self::$instance->token === '|' || self::$instance->token === ',');
protected static function parseSelector(): Matcher {
// NOTE(review): truthiness test again, so a "0" token would stop the loop. Confirm.
while (self::$instance->token = self::$instance->data->consume()) {
if (preg_match(self::PREFIX_REGEX, self::$instance->token)) {
$peek = self::$instance->data->peek();
if ($peek === '(') {
$result = self::parseGroup();
// NOTE(review): this branch re-tests the current token, which has just matched
// PREFIX_REGEX and therefore contains ':'; SCOPE_REGEX cannot match it, so the
// branch looks unreachable. Presumably $peek was meant; confirm.
// parseGroup() and parsePath() are not defined in the visible hunk.
} elseif (preg_match(self::SCOPE_REGEX, self::$instance->token)) {
$result = self::parsePath();
} else {
die('Group or path expected.');
}
} elseif (preg_match(self::SCOPE_REGEX, self::$instance->token)) {
$result = self::parseScope();
} elseif (self::$instance->token === '(') {
continue;
} else {
break;
die('Group, path, or scope expected.');
}
}
// NOTE(review): as in parseConjunction(), only the first two matchers are used.
return (count($matchers) > 1) ? new OrMatcher($matchers[0], $matchers[1]) : false;
}
protected static function parseOperand(): Matcher|false {
// A leading '-' negates the operand that follows it.
if (self::$instance->token === '-') {
self::$instance->token = self::$instance->tokenizer->next();
$matcher = self::parseOperand();
if ($matcher === false) {
die('OH SHIT');
}
return new NegateMatcher($matcher);
return $result;
}
}
// Parenthesised sub-expression: parse the inside, then step over a closing ')'.
if (self::$instance->token === '(') {
self::$instance->token = self::$instance->tokenizer->next();
$expressionInParents = self::parseInnerExpression();
if (self::$instance->token === ')') {
self::$instance->token = self::$instance->tokenizer->next();
}
return $expressionInParents;
protected static function parseScope(): Matcher {
// The scope must be dot-separated segments, each either a name or a lone '*'.
if (!preg_match('/^(?:[A-Za-z0-9-_]+|\*)(?:\.(?:[A-Za-z0-9-+_]+|\*))*$/S', self::$instance->token)) {
die('Invalid scope');
}
if (self::$instance->tokenizer->tokenIsIdentifier()) {
$identifiers = [];
do {
$identifiers[] = self::$instance->token;
self::$instance->token = self::$instance->tokenizer->next();
} while (self::$instance->tokenizer->tokenIsIdentifier());
return new ScopeMatcher(...$identifiers);
// NOTE(review): $token is not assigned anywhere in the visible lines of this
// method; presumably self::$instance->token was meant. Confirm.
$segments = explode('.', $token);
// Each '*' segment becomes a TrueMatcher; every other segment becomes a SegmentMatcher.
foreach ($segments as $index => $segment) {
$segments[$index] = ($segment !== '*') ? new SegmentMatcher($segment) : new TrueMatcher();
}
return false;
return new ScopeMatcher(...$segments);
}
}

40
lib/Scope/Tokenizer.php

@ -1,40 +0,0 @@
<?php
/** @license MIT
* Copyright 2021 Dustin Wilson et al.
* See LICENSE and AUTHORS files for details */
declare(strict_types=1);
namespace dW\Highlighter\Scope;
/**
 * Splits a scope selector string into tokens and hands them out one at a time.
 *
 * A token is an "L:" or "R:" prefix, a run of word characters, dots and colons
 * (hyphens allowed after the first character), or one of the single characters
 * , | - ( ). Anything else in the input, such as whitespace, is skipped.
 */
class Tokenizer {
    /** @var list<string> Tokens in the order they appear in the input. */
    protected array $matches = [];

    /** Index of the token that the next call to next() will return. */
    protected int $position = 0;

    /**
     * @param string $scope The scope selector to tokenize.
     */
    public function __construct(string $scope) {
        preg_match_all('/([LR]:|[\w\.:][\w\.:\-]*|[\,\|\-\(\)])/', $scope, $matches);
        $this->matches = $matches[1];
    }

    /**
     * Returns the token at the cursor and advances past it.
     *
     * @return string|false The token, or false once every token has been handed out.
     */
    public function next(): string|false {
        if (!isset($this->matches[$this->position])) {
            return false;
        }

        return $this->matches[$this->position++];
    }

    /**
     * Reports whether the token at the cursor (the one next() would return)
     * contains identifier characters.
     *
     * @return bool False when no token is left or the token has no word
     *              character, dot or colon in it.
     */
    public function tokenIsIdentifier(): bool {
        if (!isset($this->matches[$this->position])) {
            return false;
        }

        // Every captured token is a non-empty string, so no truthiness test is
        // needed; casting to bool would wrongly reject the valid token "0".
        return preg_match('/[\w\.:]+/', $this->matches[$this->position]) === 1;
    }
}
Loading…
Cancel
Save