Browse Source

Added DOM creation

• Fixed bug where nonexistent grammars would cause tokenizer to fail.
• Added mensbeam/html as a dependency, removed docopt/docopt and 
ext-mbstring.
• Discovered a bug that occurs when injections are removed from the stack 
during tokenizing; still investigating.
main
Dustin Wilson 3 years ago
parent
commit
f944ca9b9c
  1. 14
      composer.json
  2. 233
      composer.lock
  3. 2
      lib/Grammar/GrammarReference.php
  4. 2
      lib/GrammarRegistry.php
  5. 57
      lib/Highlight.php
  6. 8
      lib/Tokenizer.php

14
composer.json

@ -1,5 +1,5 @@
{
"name": "dw/highlighter",
"name": "dw/lit",
"type": "library",
"description": "TextMate-like syntax highlighting in PHP",
"license": "MIT",
@ -10,18 +10,22 @@
"homepage": "https://dustinwilson.com/"
}
],
"repositories": [
{
"type": "vcs",
"url": "https://code.mensbeam.com/MensBeam/HTML"
}
],
"require": {
"php": "^7.4 || ^8.0",
"ext-dom": "*",
"ext-intl": "*",
"ext-json": "*",
"ext-mbstring": "*",
"docopt/docopt": "^1.0"
"mensbeam/html": "dev-master"
},
"autoload": {
"psr-4": {
"dW\\Lit\\": "lib/",
"dW\\Lit\\Scope\\": "lib/Scope/Matchers/"
"dW\\Lit\\": "lib/"
}
}
}

233
composer.lock

@ -4,33 +4,212 @@
"Read more about it at https://getcomposer.org/doc/01-basic-usage.md#installing-dependencies",
"This file is @generated automatically"
],
"content-hash": "7f3c88aa5023ebb6ebad3e513973d927",
"content-hash": "4adc23361d14315afd30b4dd2846557e",
"packages": [
{
"name": "docopt/docopt",
"version": "1.0.4",
"name": "mensbeam/html",
"version": "dev-master",
"source": {
"type": "git",
"url": "https://github.com/docopt/docopt.php.git",
"reference": "bf3683a16e09fa1665e493eb4d5a29469e132a4f"
"url": "https://code.mensbeam.com/MensBeam/HTML",
"reference": "4a7511dd49e85ad1865a3252176047266d62b087"
},
"require": {
"ext-dom": "*",
"mensbeam/intl": ">=0.9.0",
"mensbeam/mimesniff": "^0.2.0",
"php": ">=7.1"
},
"require-dev": {
"bamarni/composer-bin-plugin": "^1.3",
"daux/daux.io": "^0.16.0",
"masterminds/html5": "^2.7"
},
"suggest": {
"ext-ctype": "Improved performance"
},
"default-branch": true,
"type": "library",
"autoload": {
"psr-4": {
"MensBeam\\HTML\\": [
"lib/",
"lib/DOM",
"lib/DOM/traits"
]
},
"classmap": [
"lib/Token.php"
],
"files": [
"lib/ctype.php"
]
},
"autoload-dev": {
"psr-4": {
"MensBeam\\HTML\\Test\\": "tests/lib/",
"MensBeam\\HTML\\TestCase\\": "tests/cases/"
}
},
"scripts": {
"post-install-cmd": [
"@composer bin all install"
],
"post-update-cmd": [
"@composer bin all update"
]
},
"license": [
"MIT"
],
"authors": [
{
"name": "Dustin Wilson",
"email": "dustin@dustinwilson.com",
"homepage": "https://dustinwilson.com/"
},
{
"name": "J. King",
"email": "jking@jkingweb.ca",
"homepage": "https://jkingweb.ca/"
}
],
"description": "Parses modern HTML text into a PHP DOMDocument",
"time": "2021-04-26T21:07:13+00:00"
},
{
"name": "mensbeam/intl",
"version": "0.9.0",
"source": {
"type": "git",
"url": "https://github.com/mensbeam/intl.git",
"reference": "de037b182ce99aaa90ebc09b0ee0457ddf1d07bc"
},
"dist": {
"type": "zip",
"url": "https://api.github.com/repos/docopt/docopt.php/zipball/bf3683a16e09fa1665e493eb4d5a29469e132a4f",
"reference": "bf3683a16e09fa1665e493eb4d5a29469e132a4f",
"url": "https://api.github.com/repos/mensbeam/intl/zipball/de037b182ce99aaa90ebc09b0ee0457ddf1d07bc",
"reference": "de037b182ce99aaa90ebc09b0ee0457ddf1d07bc",
"shasum": ""
},
"require": {
"php": ">=5.3.0"
"php": ">=7.1"
},
"require-dev": {
"phpunit/phpunit": "4.1.*"
"bamarni/composer-bin-plugin": "*",
"ext-intl": "*"
},
"type": "library",
"autoload": {
"classmap": [
"src/docopt.php"
]
"psr-4": {
"MensBeam\\Intl\\": "lib/"
}
},
"notification-url": "https://packagist.org/downloads/",
"license": [
"MIT"
],
"authors": [
{
"name": "J. King",
"email": "jking@jkingweb.ca",
"homepage": "https://jkingweb.ca/"
}
],
"description": "A set of dependency-free basic internationalization tools",
"keywords": [
"WHATWG",
"charset",
"encoding",
"internationalization",
"intl",
"unicode",
"utf-8",
"utf8"
],
"support": {
"issues": "https://github.com/mensbeam/intl/issues",
"source": "https://github.com/mensbeam/intl/tree/0.9.0"
},
"time": "2021-03-25T19:08:04+00:00"
},
{
"name": "mensbeam/mimesniff",
"version": "0.2.1",
"source": {
"type": "git",
"url": "https://github.com/mensbeam/mime.git",
"reference": "c19be2496ab1e27fbf9c3483c2a9faa2781796cd"
},
"dist": {
"type": "zip",
"url": "https://api.github.com/repos/mensbeam/mime/zipball/c19be2496ab1e27fbf9c3483c2a9faa2781796cd",
"reference": "c19be2496ab1e27fbf9c3483c2a9faa2781796cd",
"shasum": ""
},
"require": {
"php": ">=7.1",
"psr/http-message": "^1.0"
},
"require-dev": {
"bamarni/composer-bin-plugin": "^1.3",
"ext-intl": "*"
},
"type": "library",
"autoload": {
"psr-4": {
"MensBeam\\Mime\\": "lib/"
}
},
"notification-url": "https://packagist.org/downloads/",
"license": [
"MIT"
],
"authors": [
{
"name": "J. King",
"email": "jking@jkingweb.ca",
"homepage": "https://jkingweb.ca/"
}
],
"description": "An implementation of the WHATWG MIME Sniffing specification",
"keywords": [
"WHATWG",
"mime",
"mimesniff"
],
"support": {
"issues": "https://github.com/mensbeam/mime/issues",
"source": "https://github.com/mensbeam/mime/tree/0.2.1"
},
"time": "2021-03-07T03:58:00+00:00"
},
{
"name": "psr/http-message",
"version": "1.0.1",
"source": {
"type": "git",
"url": "https://github.com/php-fig/http-message.git",
"reference": "f6561bf28d520154e4b0ec72be95418abe6d9363"
},
"dist": {
"type": "zip",
"url": "https://api.github.com/repos/php-fig/http-message/zipball/f6561bf28d520154e4b0ec72be95418abe6d9363",
"reference": "f6561bf28d520154e4b0ec72be95418abe6d9363",
"shasum": ""
},
"require": {
"php": ">=5.3.0"
},
"type": "library",
"extra": {
"branch-alias": {
"dev-master": "1.0.x-dev"
}
},
"autoload": {
"psr-4": {
"Psr\\Http\\Message\\": "src/"
}
},
"notification-url": "https://packagist.org/downloads/",
"license": [
@ -38,37 +217,39 @@
],
"authors": [
{
"name": "Blake Williams",
"email": "code@shabbyrobe.org",
"homepage": "http://docopt.org/",
"role": "Developer"
"name": "PHP-FIG",
"homepage": "http://www.php-fig.org/"
}
],
"description": "Port of Python's docopt for PHP >=5.3",
"homepage": "http://github.com/docopt/docopt.php",
"description": "Common interface for HTTP messages",
"homepage": "https://github.com/php-fig/http-message",
"keywords": [
"cli",
"docs"
"http",
"http-message",
"psr",
"psr-7",
"request",
"response"
],
"support": {
"issues": "https://github.com/docopt/docopt.php/issues",
"source": "https://github.com/docopt/docopt.php/tree/1.0.4"
"source": "https://github.com/php-fig/http-message/tree/master"
},
"time": "2019-12-03T02:48:46+00:00"
"time": "2016-08-06T14:39:51+00:00"
}
],
"packages-dev": [],
"aliases": [],
"minimum-stability": "stable",
"stability-flags": [],
"stability-flags": {
"mensbeam/html": 20
},
"prefer-stable": false,
"prefer-lowest": false,
"platform": {
"php": "^7.4 || ^8.0",
"ext-dom": "*",
"ext-intl": "*",
"ext-json": "*",
"ext-mbstring": "*"
"ext-json": "*"
},
"platform-dev": [],
"plugin-api-version": "2.0.0"

2
lib/Grammar/GrammarReference.php

@ -24,7 +24,7 @@ class GrammarReference extends Reference {
}
public function get(): Grammar {
/**
 * Resolves this reference through the grammar registry.
 *
 * @return Grammar|false The grammar stored under this reference's scope
 *                       name, or false when the registry has none.
 */
public function get(): Grammar|false {
    $grammar = GrammarRegistry::get($this->_scopeName);

    return $grammar;
}
}

2
lib/GrammarRegistry.php

@ -16,7 +16,7 @@ class GrammarRegistry {
return true;
}
public static function get(string $scopeName): Grammar|bool {
public static function get(string $scopeName): Grammar|false {
if (array_key_exists($scopeName, self::$storage)) {
return self::$storage[$scopeName];
} else {

57
lib/Highlight.php

@ -6,15 +6,23 @@
declare(strict_types=1);
namespace dW\Lit;
use dW\Lit\Grammar\Exception;
use MensBeam\HTML\{
Document,
Element
};
class Highlight {
public static function toDOM(string $data, string $scopeName) {
self::highlight($data, $scopeName);
/**
 * Highlights source text and returns the result as a DOM element.
 *
 * Thin public wrapper that forwards every argument to highlight().
 *
 * @param string        $data      The source text to highlight.
 * @param string        $scopeName Scope name of the grammar to tokenize with.
 * @param Document|null $document  Document that will own the created nodes;
 *                                 when null, highlight() creates a new one.
 * @param string        $encoding  Encoding assigned to the document that
 *                                 highlight() creates when $document is null.
 *
 * @return Element The <pre> element built by highlight().
 *
 * @throws Exception When no grammar is registered for $scopeName.
 */
public static function toDOM(string $data, string $scopeName, ?Document $document = null, string $encoding = 'windows-1252'): Element {
    $pre = self::highlight($data, $scopeName, $document, $encoding);

    return $pre;
}
/**
 * Highlights source text and returns the result as a string.
 *
 * Always lets highlight() create its own document (null is passed for the
 * document argument) and casts the returned element to string.
 * NOTE(review): the exact string form depends on Element's string
 * conversion, presumably the serialized markup of the <pre> element;
 * confirm against mensbeam/html.
 *
 * @param string $data      The source text to highlight.
 * @param string $scopeName Scope name of the grammar to tokenize with.
 * @param string $encoding  Encoding assigned to the newly created document.
 *
 * @return string The highlighted element cast to string.
 *
 * @throws Exception When no grammar is registered for $scopeName.
 */
public static function toString(string $data, string $scopeName, string $encoding = 'windows-1252'): string {
    $pre = self::highlight($data, $scopeName, null, $encoding);

    return (string)$pre;
}
protected static function highlight(string $data, string $scopeName) {
protected static function highlight(string $data, string $scopeName, ?Document $document = null, string $encoding = 'windows-1252'): Element {
$grammar = GrammarRegistry::get($scopeName);
if ($grammar === false) {
throw new Exception(Exception::GRAMMAR_MISSING, $scopeName);
@ -23,7 +31,50 @@ class Highlight {
$tokenizer = new Tokenizer(new Data($data), $grammar);
$tokenList = $tokenizer->tokenize();
if ($document === null) {
$document = new Document();
$document->encoding = $encoding;
}
$pre = $document->createElement('pre');
$code = $document->createElement('code');
$code->setAttribute('class', str_replace('.', ' ', $scopeName));
$pre->appendChild($code);
$elementStack = [ $code ];
$scopeStack = [ $scopeName ];
// Turn the token list into nested <span> elements. Each scope of a token is
// represented by a span nested inside the span of the scope before it, and the
// token's text is appended to the element of its innermost scope.
// $scopeStack and $elementStack are kept in lockstep: entry N of one always
// describes entry N of the other, with entry 0 being the root <code> element.
//
// A stray `continue;` used to be the first statement of the outer loop, which
// skipped every line and left the returned <pre><code> empty; it is removed so
// the tokens are actually rendered.
//
// NOTE(review): assumes each token's first scope equals $scopeName (the root
// stack entry); otherwise the stacks would be emptied below — TODO confirm
// against Tokenizer output.
foreach ($tokenList as $tokens) {
    foreach ($tokens as $token) {
        $lastKey = count($token['scopes']) - 1;

        foreach ($token['scopes'] as $key => $scope) {
            $keyExists = array_key_exists($key, $scopeStack);

            if (!$keyExists || $scopeStack[$key] !== $scope) {
                // A different scope is open at this depth: close it and
                // everything nested inside it before opening the new one.
                if ($keyExists) {
                    $scopeStack = array_slice($scopeStack, 0, $key);
                    $elementStack = array_slice($elementStack, 0, $key);
                }

                $span = $document->createElement('span');
                $span->setAttribute('class', str_replace('.', ' ', $scope));
                end($elementStack)->appendChild($span);

                $scopeStack[] = $scope;
                $elementStack[] = $span;
            }

            if ($key === $lastKey) {
                // The token is shallower than what is currently open: close
                // the deeper scopes so the text lands in the right element.
                if (array_key_exists($key + 1, $scopeStack)) {
                    $scopeStack = array_slice($scopeStack, 0, $key + 1);
                    $elementStack = array_slice($elementStack, 0, $key + 1);
                }

                end($elementStack)->appendChild($document->createTextNode($token['text']));
            }
        }
    }
}
return $pre;
}
}

8
lib/Tokenizer.php

@ -110,7 +110,7 @@ class Tokenizer {
if ($selector->matches($this->scopeStack)) {
$prefix = $selector->getPrefix($this->scopeStack);
if ($prefix === Filter::PREFIX_LEFT || $prefix === Filter::PREFIX_BOTH) {
$this->scopeStack[] = $injection;
$this->ruleStack[] = $injection;
$this->activeInjection = $injection;
break;
}
@ -131,6 +131,12 @@ class Tokenizer {
while (true) {
$rule = $currentRules[$i];
// Grammar references can return false if the grammar does not exist, so
// continue on if the current rule is false.
if ($rule === false) {
continue 2;
}
// If the rule is a Pattern
if ($rule instanceof Pattern) {
if (preg_match($rule->match, $this->line . ((!$this->data->lastLine) ? "\n" : ''), $match, PREG_OFFSET_CAPTURE, $this->offset) === 1) {

Loading…
Cancel
Save