Beginning Rewrite
This commit is contained in:
parent
06cffb2e25
commit
a89f6c9f09
11 changed files with 4167 additions and 10018 deletions
2
.gitignore
vendored
2
.gitignore
vendored
|
@ -62,3 +62,5 @@ $RECYCLE.BIN/
|
|||
|
||||
# .nfs files are created when an open file is removed but is still being accessed
|
||||
.nfs*
|
||||
|
||||
/vendor/
|
||||
|
|
28
composer.json
Normal file
28
composer.json
Normal file
|
@ -0,0 +1,28 @@
|
|||
{
|
||||
"name": "dw/html5",
|
||||
"description": "Parses HTML5 text into a php DOMDocument",
|
||||
"type": "library",
|
||||
"require": {
|
||||
"php": "^7.0",
|
||||
"ext-intl": "*",
|
||||
"ext-mcrypt": "*",
|
||||
"ext-hash": "*"
|
||||
},
|
||||
"license": "MIT",
|
||||
"authors": [
|
||||
{
|
||||
"name": "Dustin Wilson",
|
||||
"email": "dustin@dustinwilson.com",
|
||||
"homepage": "https://dustinwilson.com/"
|
||||
}
|
||||
],
|
||||
"autoload": {
|
||||
"psr-4": {
|
||||
"dW\\HTML5\\": "lib/"
|
||||
},
|
||||
"classmap": ["lib/Token.php"]
|
||||
},
|
||||
"autoload-dev": {
|
||||
"files": ["lib/Token.php"]
|
||||
}
|
||||
}
|
22
composer.lock
generated
Normal file
22
composer.lock
generated
Normal file
|
@ -0,0 +1,22 @@
|
|||
{
|
||||
"_readme": [
|
||||
"This file locks the dependencies of your project to a known state",
|
||||
"Read more about it at https://getcomposer.org/doc/01-basic-usage.md#composer-lock-the-lock-file",
|
||||
"This file is @generated automatically"
|
||||
],
|
||||
"content-hash": "9ae6b8a7d4d830c471b21ccefc1ee8fb",
|
||||
"packages": [],
|
||||
"packages-dev": [],
|
||||
"aliases": [],
|
||||
"minimum-stability": "stable",
|
||||
"stability-flags": [],
|
||||
"prefer-stable": false,
|
||||
"prefer-lowest": false,
|
||||
"platform": {
|
||||
"php": "^7.0",
|
||||
"ext-intl": "*",
|
||||
"ext-mcrypt": "*",
|
||||
"ext-hash": "*"
|
||||
},
|
||||
"platform-dev": []
|
||||
}
|
10023
html5.php
Normal file → Executable file
10023
html5.php
Normal file → Executable file
File diff suppressed because it is too large
Load diff
74
lib/DOM.php
Normal file
74
lib/DOM.php
Normal file
|
@ -0,0 +1,74 @@
|
|||
<?php
|
||||
declare(strict_types=1);
|
||||
namespace dW\HTML5;
|
||||
|
||||
/**
 * Static helpers for searching up (ancestors) or down (descendants) a DOM tree.
 *
 * In every method $needle selects what to look for and may be:
 *  - a string, matched against the nodeName of element nodes only;
 *  - a \DOMNode, matched by identity (isSameNode);
 *  - a \Closure, called with the candidate node; it matches only when the
 *    closure returns exactly true.
 * Any other needle type causes an Exception to be thrown as soon as a
 * candidate node is compared.
 *
 * Type information lives in the docblocks rather than in declarations because
 * `mixed` and nullable return types are not available on PHP 7.0.
 */
class DOM {
    /**
     * Returns the nearest ancestor of $context that matches $needle.
     *
     * @param string|\DOMNode|\Closure $needle
     * @param \DOMElement $context Element whose ancestors are searched.
     * @return \DOMNode|null The matching ancestor, or null when there is none.
     * @throws Exception When $needle is not a string, \DOMNode or \Closure.
     */
    public static function getAncestor($needle, \DOMElement $context) {
        return static::ancestor($needle, $context, true);
    }

    /**
     * Tells whether $context has an ancestor that matches $needle.
     *
     * @param string|\DOMNode|\Closure $needle
     * @param \DOMElement $context Element whose ancestors are searched.
     * @return bool
     * @throws Exception When $needle is not a string, \DOMNode or \Closure.
     */
    public static function hasAncestor($needle, \DOMElement $context): bool {
        return static::ancestor($needle, $context, false);
    }

    /**
     * Returns the first descendant of $context (depth-first, document order)
     * that matches $needle.
     *
     * @param string|\DOMNode|\Closure $needle
     * @param \DOMElement $context Element whose subtree is searched.
     * @return \DOMNode|null The matching descendant, or null when there is none.
     * @throws Exception When $needle is not a string, \DOMNode or \Closure.
     */
    public static function getDescendant($needle, \DOMElement $context) {
        return static::descendant($needle, $context, true);
    }

    /**
     * Tells whether $context has a descendant that matches $needle.
     *
     * @param string|\DOMNode|\Closure $needle
     * @param \DOMElement $context Element whose subtree is searched.
     * @return bool
     * @throws Exception When $needle is not a string, \DOMNode or \Closure.
     */
    public static function hasDescendant($needle, \DOMElement $context): bool {
        return static::descendant($needle, $context, false);
    }

    /**
     * Depth-first search of the subtree below $context.
     *
     * $context is typed as \DOMNode (not \DOMElement) because the search
     * recurses into every child, and children may be text or comment nodes.
     *
     * @param string|\DOMNode|\Closure $needle
     * @param \DOMNode $context Node whose children are searched.
     * @param bool $returnNode True to get the node (or null); false to get a bool.
     * @return \DOMNode|bool|null
     * @throws Exception When $needle is not a string, \DOMNode or \Closure.
     */
    public static function descendant($needle, \DOMNode $context, bool $returnNode = true) {
        for ($child = $context->firstChild; $child !== null; $child = $child->nextSibling) {
            // Check the child itself first, then everything beneath it, before
            // moving on to the next sibling.
            $match = static::compare($needle, $child) ?? static::descendant($needle, $child, true);
            if ($match !== null) {
                return $returnNode ? $match : true;
            }
        }

        return $returnNode ? null : false;
    }

    /**
     * Walks the parentNode chain of $context looking for a match.
     *
     * @param string|\DOMNode|\Closure $needle
     * @param \DOMElement $context Element whose ancestors are searched.
     * @param bool $returnNode True to get the node (or null); false to get a bool.
     * @return \DOMNode|bool|null
     * @throws Exception When $needle is not a string, \DOMNode or \Closure.
     */
    protected static function ancestor($needle, \DOMElement $context, bool $returnNode = true) {
        for ($node = $context->parentNode; $node !== null; $node = $node->parentNode) {
            $match = static::compare($needle, $node);
            if ($match !== null) {
                return $returnNode ? $match : true;
            }
        }

        return $returnNode ? null : false;
    }

    /**
     * Tests a single node against $needle.
     *
     * @param string|\DOMNode|\Closure $needle
     * @param \DOMNode $context Candidate node.
     * @return \DOMNode|null $context when it matches, null otherwise.
     * @throws Exception When $needle is not a string, \DOMNode or \Closure.
     */
    protected static function compare($needle, \DOMNode $context) {
        if (is_string($needle)) {
            // Names are only compared against elements, never text/comment nodes.
            $matches = $context instanceof \DOMElement && $context->nodeName === $needle;
        } elseif ($needle instanceof \DOMNode) {
            $matches = $context->isSameNode($needle);
        } elseif ($needle instanceof \Closure) {
            $matches = $needle($context) === true;
        } else {
            throw new Exception(Exception::DOM_DOMELEMENT_STRING_OR_CLOSURE_EXPECTED, gettype($needle));
        }

        return $matches ? $context : null;
    }
}
|
439
lib/DataStream.php
Normal file
439
lib/DataStream.php
Normal file
File diff suppressed because one or more lines are too long
79
lib/Exception.php
Normal file
79
lib/Exception.php
Normal file
|
@ -0,0 +1,79 @@
|
|||
<?php
|
||||
declare(strict_types=1);
|
||||
namespace dW\HTML5;
|
||||
|
||||
/**
 * Library exception whose message is looked up from a numeric code.
 *
 * Each code maps to a message template in $messages; every "%s" in the
 * template must be supplied as a constructor argument.
 */
class Exception extends \Exception {
    const INVALID_CODE = 10000;
    const UNKNOWN_ERROR = 10001;
    const INCORRECT_PARAMETERS_FOR_MESSAGE = 10002;

    const PARSER_DOMDOCUMENT_EXPECTED = 10101;
    const PARSER_DOMELEMENT_DOMDOCUMENT_DOMDOCUMENTFRAG_EXPECTED = 10102;
    const PARSER_DOMNODE_EXPECTED = 10103;

    const STACK_INVALID_INDEX = 10201;
    const STACK_DOMNODE_ONLY = 10202;

    const DATASTREAM_NODATA = 10301;
    const DATASTREAM_INVALID_DATA_CONSUMPTION_LENGTH = 10302;

    const DOM_DOMELEMENT_STRING_OR_CLOSURE_EXPECTED = 10401;

    /** @var array<int, string> Message templates keyed by error code. */
    protected static $messages = [
        10000 => 'Invalid error code',
        10001 => 'Unknown error; escaping',
        10002 => 'Incorrect number of parameters for Exception message; %s expected',

        10101 => 'DOMDocument expected; found %s',
        10102 => 'DOMElement, DOMDocument, or DOMDocumentFrag expected; found %s',
        10103 => 'DOMNode expected; found %s',

        10201 => '%s is an invalid index',
        10202 => 'Instances of DOMNode are the only types allowed in an HTML5Stack',

        10301 => 'Data string expected; found %s',
        10302 => '%s is an invalid data consumption length; a value of 1 or above is expected',

        10401 => 'The first argument must either be an instance of \DOMElement, a string, or a closure; found %s',
    ];

    /**
     * @param int   $code    One of the class constants.
     * @param mixed ...$args Values for the "%s" placeholders of the message.
     *                       A \Throwable passed as the first or last argument
     *                       is taken as the previous exception instead.
     * @throws Exception With INVALID_CODE when $code has no message, or with
     *                   INCORRECT_PARAMETERS_FOR_MESSAGE when the number of
     *                   arguments does not match the number of placeholders.
     */
    public function __construct(int $code, ...$args) {
        if (!isset(static::$messages[$code])) {
            throw new Exception(static::INVALID_CODE);
        }

        $message = static::$messages[$code];
        $previous = null;

        // Grab a previous exception if there is one. The emptiness guard
        // matters: many messages take no arguments at all, and reading
        // $args[0] on an empty list raises an undefined-offset notice.
        if ($args !== []) {
            if ($args[0] instanceof \Throwable) {
                $previous = array_shift($args);
            } elseif (end($args) instanceof \Throwable) {
                $previous = array_pop($args);
            }
        }

        // The number of placeholders must match the number of arguments left.
        $count = substr_count($message, '%s');
        if (count($args) !== $count) {
            throw new Exception(static::INCORRECT_PARAMETERS_FOR_MESSAGE, $count);
        }

        if ($count > 0) {
            $message = sprintf($message, ...$args);
        }

        parent::__construct($message, $code, $previous);
    }
}
|
86
lib/ParseError.php
Normal file
86
lib/ParseError.php
Normal file
|
@ -0,0 +1,86 @@
|
|||
<?php
|
||||
declare(strict_types=1);
|
||||
namespace dW\HTML5;
|
||||
|
||||
/**
 * Reports HTML5 parse errors as E_USER_WARNING errors.
 *
 * trigger() builds the message for a parse error code and raises it through
 * trigger_error() while errorHandler() is temporarily installed; the handler
 * prints the message together with position information read from $data.
 */
class ParseError {
    // DataStream object passed to it used to get information used in error
    // reporting.
    public static $data;

    const TAG_NAME_EXPECTED = 0;
    const UNEXPECTED_EOF = 1;
    const UNEXPECTED_CHARACTER = 2;
    const ATTRIBUTE_EXISTS = 3;
    const UNEXPECTED_TAG_END = 4;
    const UNEXPECTED_START_TAG = 5;
    const UNEXPECTED_END_TAG = 6;
    const UNEXPECTED_DOCTYPE = 7;
    const INVALID_DOCTYPE = 8;
    const INVALID_CONTROL_OR_NONCHARACTERS = 9;

    /** @var string[] Message templates indexed by the constants above. */
    protected static $messages = [
        'Tag name expected; found %s',
        'Unexpected end-of-file; %s expected',
        'Unexpected "%s" character; %s expected',
        '%s attribute already exists; discarding',
        'Unexpected tag end; %s expected',
        'Unexpected %s start tag; %s expected',
        'Unexpected %s end tag; %s expected',
        'Unexpected DOCTYPE; %s expected',
        'Invalid DOCTYPE',
        'Invalid Control or Non-character; removing',
    ];

    /**
     * Error handler installed by trigger(); prints E_USER_WARNING messages.
     *
     * @param int    $code    Error level.
     * @param string $message Error message.
     * @param string $file    Unused.
     * @param int    $line    Unused.
     * @param array  $context Unused. Optional because PHP 8 no longer passes
     *                        a fifth argument to error handlers.
     * @return bool True when the error was printed here; false to hand any
     *              other error level back to PHP's standard handler instead
     *              of silently swallowing it.
     */
    public static function errorHandler($code, $message, $file, $line, array $context = []) {
        if ($code !== E_USER_WARNING) {
            return false;
        }

        $errMsg = sprintf("HTML5 Parse Error: \"%s\" in %s", $message, static::$data->filePath);

        if (static::$data->length !== 0) {
            $errMsg .= sprintf(" on line %s, column %s\n", static::$data->line, static::$data->column);
        } else {
            $errMsg .= "\n";
        }

        echo $errMsg;

        return true;
    }

    /**
     * Raises the parse error identified by $code.
     *
     * @param int        $code    One of the class constants.
     * @param DataStream $data    Stream the error relates to; stored in
     *                            static::$data for errorHandler().
     * @param mixed      ...$args Values for the "%s" placeholders of the message.
     * @return bool The return value of trigger_error().
     * @throws Exception With INVALID_CODE for an unknown code, or with
     *                   INCORRECT_PARAMETERS_FOR_MESSAGE when the argument
     *                   count does not match the placeholders.
     */
    public static function trigger(int $code, DataStream $data, ...$args): bool {
        if (!isset(static::$messages[$code])) {
            throw new Exception(Exception::INVALID_CODE);
        }

        $message = static::$messages[$code];

        // Validate before touching global state so that a thrown exception
        // cannot leave our error handler installed.
        $count = substr_count($message, '%s');
        if (count($args) !== $count) {
            throw new Exception(Exception::INCORRECT_PARAMETERS_FOR_MESSAGE, $count);
        }

        if ($count > 0) {
            // Convert newlines and tabs in the arguments to words to better
            // express what they are. Strict comparison keeps non-string
            // arguments (e.g. integer 0) from being mistaken for them.
            $args = array_map(function ($value) {
                if ($value === "\n") {
                    return 'Newline';
                }
                if ($value === "\t") {
                    return 'Tab';
                }

                return $value;
            }, $args);

            $message = sprintf($message, ...$args);
        }

        static::$data = $data;

        // Set the error handler and honor already-set error reporting rules.
        set_error_handler([static::class, 'errorHandler'], error_reporting());
        try {
            return trigger_error($message, E_USER_WARNING);
        } finally {
            restore_error_handler();
        }
    }
}
|
3259
lib/Parser.php
Normal file
3259
lib/Parser.php
Normal file
File diff suppressed because it is too large
Load diff
90
lib/Stack.php
Normal file
90
lib/Stack.php
Normal file
|
@ -0,0 +1,90 @@
|
|||
<?php
|
||||
declare(strict_types=1);
|
||||
namespace dW\HTML5;
|
||||
|
||||
/**
 * Array-like stack of nodes with a few computed, read-only properties
 * (length, currentNode, adjustedCurrentNode, currentNodeName) exposed
 * through __get().
 */
class Stack implements \ArrayAccess {
    protected $storage = [];

    // Temporarily change this from DOMNode to HTML5StartTagToken for the purposes of
    // testing the tokenizer.
    /**
     * Stores $value at $offset, or appends it when $offset is null ($stack[] = ...).
     *
     * @param int|null $offset
     * @param mixed    $value
     * @throws Exception With STACK_INVALID_INDEX for a negative offset.
     */
    public function offsetSet($offset, $value) {
        if (is_null($offset)) {
            $this->storage[] = $value;
            return;
        }

        if ($offset < 0) {
            // The message has one placeholder, so the offending index must be passed.
            throw new Exception(Exception::STACK_INVALID_INDEX, $offset);
        }

        $this->storage[$offset] = $value;
    }

    /**
     * @param int $offset
     * @return bool Whether an entry is stored at $offset.
     */
    public function offsetExists($offset) {
        return isset($this->storage[$offset]);
    }

    /**
     * Removes the entry at $offset.
     *
     * @param int $offset
     * @throws Exception With STACK_INVALID_INDEX for a negative offset.
     */
    public function offsetUnset($offset) {
        if ($offset < 0) {
            throw new Exception(Exception::STACK_INVALID_INDEX, $offset);
        }

        unset($this->storage[$offset]);
    }

    /**
     * @param int $offset
     * @return mixed The entry stored at $offset.
     * @throws Exception With STACK_INVALID_INDEX for a negative offset.
     */
    public function offsetGet($offset) {
        if ($offset < 0) {
            throw new Exception(Exception::STACK_INVALID_INDEX, $offset);
        }

        return $this->storage[$offset];
    }

    /**
     * Removes and returns the last entry.
     *
     * @return mixed|null The removed entry, or null when the stack is empty.
     */
    public function pop() {
        return array_pop($this->storage);
    }

    /**
     * Finds the index of the first entry matching $needle.
     *
     * This is an instance method: it reads $this->storage, so the previous
     * `static` declaration could never have worked. It carries no declared
     * return type because it returns false when nothing matches.
     *
     * @param \DOMNode|string $needle A node (matched by identity) or a node
     *                                name (matched against nodeName).
     * @return int|false Index of the first match, or false when not found,
     *                   when $needle is empty, or when it is of another type.
     */
    public function search($needle) {
        if (!$needle) {
            return false;
        }

        if ($needle instanceof \DOMNode) {
            foreach ($this->storage as $key => $value) {
                if ($value instanceof \DOMNode && $value->isSameNode($needle)) {
                    return $key;
                }
            }
        } elseif (is_string($needle)) {
            foreach ($this->storage as $key => $value) {
                if ($value->nodeName === $needle) {
                    return $key;
                }
            }
        }

        return false;
    }

    /**
     * Computed read-only properties.
     *
     * @param string $property
     * @return mixed|null Null for unknown property names.
     */
    public function __get($property) {
        switch ($property) {
            case 'length':
                return count($this->storage);
            case 'currentNode':
                // end() yields false on an empty stack; expose that as null.
                $currentNode = end($this->storage);
                return ($currentNode) ? $currentNode : null;
            case 'adjustedCurrentNode':
                # The adjusted current node is the context element if the parser was created by
                # the HTML fragment parsing algorithm and the stack of open elements has only one
                # element in it (fragment case); otherwise, the adjusted current node is the
                # current node.
                return (Parser::$self->fragmentCase && $this->length === 1) ? Parser::$self->fragmentContext : $this->currentNode;
            case 'currentNodeName':
                $currentNode = $this->currentNode;
                return ($currentNode && $currentNode->nodeType) ? $currentNode->nodeName : null;
            default:
                return null;
        }
    }
}
|
83
lib/Token.php
Normal file
83
lib/Token.php
Normal file
|
@ -0,0 +1,83 @@
|
|||
<?php
|
||||
declare(strict_types=1);
|
||||
namespace dW\HTML5;
|
||||
|
||||
/**
 * Common base type of all token classes declared in this file.
 * It declares no members of its own.
 */
abstract class Token
{
}
|
||||
|
||||
/**
 * Base class for tokens that carry a single string payload.
 */
abstract class DataToken extends Token {
    /** @var string The token's payload, always stored as a string. */
    public $data;

    /**
     * @param mixed $data Payload; converted to a string before being stored.
     */
    public function __construct($data) {
        $payload = strval($data);
        $this->data = $payload;
    }
}
|
||||
|
||||
/**
 * Base class for tokens identified by a name.
 */
abstract class TagToken extends Token {
    /** @var string The token's name, always stored as a string. */
    public $name;

    /**
     * @param mixed $name Name; converted to a string before being stored.
     */
    public function __construct($name) {
        $label = strval($name);
        $this->name = $label;
    }
}
|
||||
|
||||
/**
 * Token type without any payload; presumably emitted when the input ends
 * (inferred from the name only).
 */
class EOFToken extends Token
{
}
|
||||
|
||||
/**
 * Token holding the parts of a DOCTYPE: name, public identifier and system
 * identifier, plus a force-quirks flag.
 */
class DOCTYPEToken extends Token {
    /** @var bool Force-quirks flag; false until something sets it. */
    public $forceQuirks = false;
    /**
     * @var string DOCTYPE name. Declared explicitly: the constructor assigns
     *             it, and an undeclared property would be created dynamically.
     */
    public $name;
    /** @var string Public identifier. */
    public $public;
    /** @var string System identifier. */
    public $system;

    /**
     * All three values are cast to string, so an omitted (null) value is
     * stored as the empty string.
     *
     * @param mixed $name
     * @param mixed $public
     * @param mixed $system
     */
    public function __construct($name = null, $public = null, $system = null) {
        $this->name = (string)$name;
        $this->public = (string)$public;
        $this->system = (string)$system;
    }
}
|
||||
|
||||
/**
 * Concrete DataToken; adds nothing to the behavior inherited from DataToken.
 */
class CharacterToken extends DataToken
{
}
|
||||
|
||||
/**
 * DataToken whose payload is optional and defaults to the empty string.
 */
class CommentToken extends DataToken {
    /**
     * @param mixed $data Payload; forwarded unchanged to DataToken's constructor.
     */
    public function __construct($data = '')
    {
        parent::__construct($data);
    }
}
|
||||
|
||||
/**
 * Tag token that additionally carries a namespace, a self-closing flag and a
 * map of attributes (name => string value).
 */
class StartTagToken extends TagToken {
    /** @var string Namespace given to the constructor. */
    public $namespace;
    /** @var bool Self-closing flag given to the constructor. */
    public $selfClosing;

    /** @var array<string, string>|null Attribute values keyed by name; null until the first setAttribute(). */
    protected $_attributes;

    /**
     * @param mixed  $name        Tag name; cast to string by TagToken.
     * @param bool   $selfClosing
     * @param string $namespace   Defaults to Parser::HTML_NAMESPACE.
     */
    public function __construct($name, bool $selfClosing = false, string $namespace = \dW\HTML5\Parser::HTML_NAMESPACE) {
        $this->selfClosing = $selfClosing;
        $this->namespace = $namespace;
        parent::__construct($name);
    }

    /**
     * Returns the value of an attribute.
     *
     * No return type is declared: values are stored as strings by
     * setAttribute() (never as \DOMAttr objects) and null is returned for a
     * missing attribute. The null-coalescing lookup also keeps empty-string
     * and "0" values from being reported as missing.
     *
     * @param string $name
     * @return string|null The attribute value, or null when it is not set.
     */
    public function getAttribute(string $name) {
        return $this->_attributes[$name] ?? null;
    }

    /**
     * @param string $name
     * @return bool Whether the attribute has been set.
     */
    public function hasAttribute(string $name): bool {
        return isset($this->_attributes[$name]);
    }

    /**
     * Removes an attribute; does nothing when it is not set.
     *
     * @param string $name
     */
    public function removeAttribute(string $name) {
        unset($this->_attributes[$name]);
    }

    /**
     * Sets an attribute; both name and value are cast to string.
     *
     * @param mixed $name
     * @param mixed $value
     */
    public function setAttribute($name, $value) {
        $this->_attributes[(string)$name] = (string)$value;
    }

    /**
     * Exposes the attribute map read-only as ->attributes.
     *
     * @param string $property
     * @return array<string, string>|null The attribute map for 'attributes'
     *                                    (null if none were ever set); null
     *                                    for any other property name.
     */
    public function __get($property) {
        if ($property === 'attributes') {
            return $this->_attributes;
        }

        return null;
    }
}
|
||||
|
||||
/**
 * Concrete TagToken; adds nothing to the behavior inherited from TagToken.
 */
class EndTagToken extends TagToken
{
}
|
Loading…
Reference in a new issue