From f944ca9b9cad6098fdacb8085a8e8a40cf5c6e69 Mon Sep 17 00:00:00 2001 From: Dustin Wilson Date: Tue, 14 Sep 2021 13:27:05 -0500 Subject: [PATCH] Added DOM creation MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit • Fixed bug where nonexistent grammars would cause tokenizer to fail. • Added mensbeam/html as a dependency, removed docopt/docopt and ext-mbstring. • Discovered bug when injections are removed from the stack when tokenizing, investigating. --- composer.json | 14 +- composer.lock | 233 +++++++++++++++++++++++++++---- lib/Grammar/GrammarReference.php | 2 +- lib/GrammarRegistry.php | 2 +- lib/Highlight.php | 57 +++++++- lib/Tokenizer.php | 8 +- 6 files changed, 279 insertions(+), 37 deletions(-) diff --git a/composer.json b/composer.json index ce2898a..ac93423 100644 --- a/composer.json +++ b/composer.json @@ -1,5 +1,5 @@ { - "name": "dw/highlighter", + "name": "dw/lit", "type": "library", "description": "TextMate-like syntax highlighting in PHP", "license": "MIT", @@ -10,18 +10,22 @@ "homepage": "https://dustinwilson.com/" } ], + "repositories": [ + { + "type": "vcs", + "url": "https://code.mensbeam.com/MensBeam/HTML" + } + ], "require": { "php": "^7.4 || ^8.0", "ext-dom": "*", "ext-intl": "*", "ext-json": "*", - "ext-mbstring": "*", - "docopt/docopt": "^1.0" + "mensbeam/html": "dev-master" }, "autoload": { "psr-4": { - "dW\\Lit\\": "lib/", - "dW\\Lit\\Scope\\": "lib/Scope/Matchers/" + "dW\\Lit\\": "lib/" } } } diff --git a/composer.lock b/composer.lock index 2fa9735..7d50761 100644 --- a/composer.lock +++ b/composer.lock @@ -4,33 +4,212 @@ "Read more about it at https://getcomposer.org/doc/01-basic-usage.md#installing-dependencies", "This file is @generated automatically" ], - "content-hash": "7f3c88aa5023ebb6ebad3e513973d927", + "content-hash": "4adc23361d14315afd30b4dd2846557e", "packages": [ { - "name": "docopt/docopt", - "version": "1.0.4", + "name": "mensbeam/html", + "version": "dev-master", "source": { "type": "git", - "url": "https://github.com/docopt/docopt.php.git", - "reference": "bf3683a16e09fa1665e493eb4d5a29469e132a4f" + "url": "https://code.mensbeam.com/MensBeam/HTML", + "reference": "4a7511dd49e85ad1865a3252176047266d62b087" + }, + "require": { + "ext-dom": "*", + "mensbeam/intl": ">=0.9.0", + "mensbeam/mimesniff": "^0.2.0", + "php": ">=7.1" + }, + "require-dev": { + "bamarni/composer-bin-plugin": "^1.3", + "daux/daux.io": "^0.16.0", + "masterminds/html5": "^2.7" + }, + "suggest": { + "ext-ctype": "Improved performance" + }, + "default-branch": true, + "type": "library", + "autoload": { + "psr-4": { + "MensBeam\\HTML\\": [ + "lib/", + "lib/DOM", + "lib/DOM/traits" + ] + }, + "classmap": [ + "lib/Token.php" + ], + "files": [ + "lib/ctype.php" + ] + }, + "autoload-dev": { + "psr-4": { + "MensBeam\\HTML\\Test\\": "tests/lib/", + "MensBeam\\HTML\\TestCase\\": "tests/cases/" + } + }, + "scripts": { + "post-install-cmd": [ + "@composer bin all install" + ], + "post-update-cmd": [ + "@composer bin all update" + ] + }, + "license": [ + "MIT" + ], + "authors": [ + { + "name": "Dustin Wilson", + "email": "dustin@dustinwilson.com", + "homepage": "https://dustinwilson.com/" + }, + { + "name": "J. King", + "email": "jking@jkingweb.ca", + "homepage": "https://jkingweb.ca/" + } + ], + "description": "Parses modern HTML text into a PHP DOMDocument", + "time": "2021-04-26T21:07:13+00:00" + }, + { + "name": "mensbeam/intl", + "version": "0.9.0", + "source": { + "type": "git", + "url": "https://github.com/mensbeam/intl.git", + "reference": "de037b182ce99aaa90ebc09b0ee0457ddf1d07bc" }, "dist": { "type": "zip", - "url": "https://api.github.com/repos/docopt/docopt.php/zipball/bf3683a16e09fa1665e493eb4d5a29469e132a4f", - "reference": "bf3683a16e09fa1665e493eb4d5a29469e132a4f", + "url": "https://api.github.com/repos/mensbeam/intl/zipball/de037b182ce99aaa90ebc09b0ee0457ddf1d07bc", + "reference": "de037b182ce99aaa90ebc09b0ee0457ddf1d07bc", "shasum": "" }, "require": { - "php": ">=5.3.0" + "php": ">=7.1" }, "require-dev": { - "phpunit/phpunit": "4.1.*" + "bamarni/composer-bin-plugin": "*", + "ext-intl": "*" }, "type": "library", "autoload": { - "classmap": [ - "src/docopt.php" - ] + "psr-4": { + "MensBeam\\Intl\\": "lib/" + } + }, + "notification-url": "https://packagist.org/downloads/", + "license": [ + "MIT" + ], + "authors": [ + { + "name": "J. King", + "email": "jking@jkingweb.ca", + "homepage": "https://jkingweb.ca/" + } + ], + "description": "A set of dependency-free basic internationalization tools", + "keywords": [ + "WHATWG", + "charset", + "encoding", + "internationalization", + "intl", + "unicode", + "utf-8", + "utf8" + ], + "support": { + "issues": "https://github.com/mensbeam/intl/issues", + "source": "https://github.com/mensbeam/intl/tree/0.9.0" + }, + "time": "2021-03-25T19:08:04+00:00" + }, + { + "name": "mensbeam/mimesniff", + "version": "0.2.1", + "source": { + "type": "git", + "url": "https://github.com/mensbeam/mime.git", + "reference": "c19be2496ab1e27fbf9c3483c2a9faa2781796cd" + }, + "dist": { + "type": "zip", + "url": "https://api.github.com/repos/mensbeam/mime/zipball/c19be2496ab1e27fbf9c3483c2a9faa2781796cd", + "reference": "c19be2496ab1e27fbf9c3483c2a9faa2781796cd", + "shasum": "" + }, + "require": { + "php": ">=7.1", + "psr/http-message": "^1.0" + }, + "require-dev": { + "bamarni/composer-bin-plugin": "^1.3", + "ext-intl": "*" + }, + "type": "library", + "autoload": { + "psr-4": { + "MensBeam\\Mime\\": "lib/" + } + }, + "notification-url": "https://packagist.org/downloads/", + "license": [ + "MIT" + ], + "authors": [ + { + "name": "J. King", + "email": "jking@jkingweb.ca", + "homepage": "https://jkingweb.ca/" + } + ], + "description": "An implementation of the WHATWG MIME Sniffing specification", + "keywords": [ + "WHATWG", + "mime", + "mimesniff" + ], + "support": { + "issues": "https://github.com/mensbeam/mime/issues", + "source": "https://github.com/mensbeam/mime/tree/0.2.1" + }, + "time": "2021-03-07T03:58:00+00:00" + }, + { + "name": "psr/http-message", + "version": "1.0.1", + "source": { + "type": "git", + "url": "https://github.com/php-fig/http-message.git", + "reference": "f6561bf28d520154e4b0ec72be95418abe6d9363" + }, + "dist": { + "type": "zip", + "url": "https://api.github.com/repos/php-fig/http-message/zipball/f6561bf28d520154e4b0ec72be95418abe6d9363", + "reference": "f6561bf28d520154e4b0ec72be95418abe6d9363", + "shasum": "" + }, + "require": { + "php": ">=5.3.0" + }, + "type": "library", + "extra": { + "branch-alias": { + "dev-master": "1.0.x-dev" + } + }, + "autoload": { + "psr-4": { + "Psr\\Http\\Message\\": "src/" + } }, "notification-url": "https://packagist.org/downloads/", "license": [ @@ -38,37 +217,39 @@ ], "authors": [ { - "name": "Blake Williams", - "email": "code@shabbyrobe.org", - "homepage": "http://docopt.org/", - "role": "Developer" + "name": "PHP-FIG", + "homepage": "http://www.php-fig.org/" } ], - "description": "Port of Python's docopt for PHP >=5.3", - "homepage": "http://github.com/docopt/docopt.php", + "description": "Common interface for HTTP messages", + "homepage": "https://github.com/php-fig/http-message", "keywords": [ - "cli", - "docs" + "http", + "http-message", + "psr", + "psr-7", + "request", + "response" ], "support": { - "issues": "https://github.com/docopt/docopt.php/issues", - "source": "https://github.com/docopt/docopt.php/tree/1.0.4" + "source": "https://github.com/php-fig/http-message/tree/master" }, - "time": "2019-12-03T02:48:46+00:00" + "time": "2016-08-06T14:39:51+00:00" } ], "packages-dev": [], "aliases": [], "minimum-stability": "stable", - "stability-flags": [], + "stability-flags": { + "mensbeam/html": 20 + }, "prefer-stable": false, "prefer-lowest": false, "platform": { "php": "^7.4 || ^8.0", "ext-dom": "*", "ext-intl": "*", - "ext-json": "*", - "ext-mbstring": "*" + "ext-json": "*" }, "platform-dev": [], "plugin-api-version": "2.0.0" diff --git a/lib/Grammar/GrammarReference.php b/lib/Grammar/GrammarReference.php index ba68f2a..d066da2 100644 --- a/lib/Grammar/GrammarReference.php +++ b/lib/Grammar/GrammarReference.php @@ -24,7 +24,7 @@ class GrammarReference extends Reference { } - public function get(): Grammar { + public function get(): Grammar|false { return GrammarRegistry::get($this->_scopeName); } } \ No newline at end of file diff --git a/lib/GrammarRegistry.php b/lib/GrammarRegistry.php index 7f4c0d3..ecee453 100644 --- a/lib/GrammarRegistry.php +++ b/lib/GrammarRegistry.php @@ -16,7 +16,7 @@ class GrammarRegistry { return true; } - public static function get(string $scopeName): Grammar|bool { + public static function get(string $scopeName): Grammar|false { if (array_key_exists($scopeName, self::$storage)) { return self::$storage[$scopeName]; } else { diff --git a/lib/Highlight.php b/lib/Highlight.php index 4539dde..9b9557e 100644 --- a/lib/Highlight.php +++ b/lib/Highlight.php @@ -6,15 +6,23 @@ declare(strict_types=1); namespace dW\Lit; use dW\Lit\Grammar\Exception; +use MensBeam\HTML\{ + Document, + Element +}; class Highlight { - public static function toDOM(string $data, string $scopeName) { - self::highlight($data, $scopeName); + public static function toDOM(string $data, string $scopeName, ?Document $document = null, string $encoding = 'windows-1252'): Element { + return self::highlight($data, $scopeName, $document, $encoding); + } + + public static function toString(string $data, string $scopeName, string $encoding = 'windows-1252'): string { + return (string)self::highlight($data, $scopeName, null, $encoding); } - protected static function highlight(string $data, string $scopeName) { + protected static function highlight(string $data, string $scopeName, ?Document $document = null, string $encoding = 'windows-1252'): Element { $grammar = GrammarRegistry::get($scopeName); if ($grammar === false) { throw new Exception(Exception::GRAMMAR_MISSING, $scopeName); @@ -23,7 +31,50 @@ class Highlight { $tokenizer = new Tokenizer(new Data($data), $grammar); $tokenList = $tokenizer->tokenize(); + if ($document === null) { + $document = new Document(); + $document->encoding = $encoding; + } + + $pre = $document->createElement('pre'); + $code = $document->createElement('code'); + $code->setAttribute('class', str_replace('.', ' ', $scopeName)); + $pre->appendChild($code); + + $elementStack = [ $code ]; + $scopeStack = [ $scopeName ]; + foreach ($tokenList as $lineNumber => $tokens) { + continue; + foreach ($tokens as $token) { + $lastKey = count($token['scopes']) - 1; + foreach ($token['scopes'] as $key => $scope) { + $keyExists = array_key_exists($key, $scopeStack); + if (!$keyExists || $scopeStack[$key] !== $scope) { + if ($keyExists && $scopeStack[$key] !== $scope) { + $scopeStack = array_slice($scopeStack, 0, $key); + $elementStack = array_slice($elementStack, 0, $key); + } + + $span = $document->createElement('span'); + $span->setAttribute('class', str_replace('.', ' ', $scope)); + end($elementStack)->appendChild($span); + $scopeStack[] = $scope; + $elementStack[] = $span; + } + + if ($key === $lastKey) { + if (array_key_exists($key + 1, $scopeStack)) { + $scopeStack = array_slice($scopeStack, 0, $key + 1); + $elementStack = array_slice($elementStack, 0, $key + 1); + } + + end($elementStack)->appendChild($document->createTextNode($token['text'])); + } + } + } } + + return $pre; } } \ No newline at end of file diff --git a/lib/Tokenizer.php b/lib/Tokenizer.php index 081c808..8060e62 100644 --- a/lib/Tokenizer.php +++ b/lib/Tokenizer.php @@ -110,7 +110,7 @@ class Tokenizer { if ($selector->matches($this->scopeStack)) { $prefix = $selector->getPrefix($this->scopeStack); if ($prefix === Filter::PREFIX_LEFT || $prefix === Filter::PREFIX_BOTH) { - $this->scopeStack[] = $injection; + $this->ruleStack[] = $injection; $this->activeInjection = $injection; break; } @@ -131,6 +131,12 @@ class Tokenizer { while (true) { $rule = $currentRules[$i]; + // Grammar references can return false if the grammar does not exist, so + // continue on if the current rule is false. + if ($rule === false) { + continue 2; + } + // If the rule is a Pattern if ($rule instanceof Pattern) { if (preg_match($rule->match, $this->line . ((!$this->data->lastLine) ? "\n" : ''), $match, PREG_OFFSET_CAPTURE, $this->offset) === 1) {