Dustin Wilson
6 years ago
11 changed files with 4167 additions and 10018 deletions
@ -0,0 +1,28 @@ |
|||
{ |
|||
"name": "dw/html5", |
|||
"description": "Parses HTML5 text into a PHP DOMDocument", |
|||
"type": "library", |
|||
"require": { |
|||
"php": "^7.0", |
|||
"ext-intl": "*", |
|||
"ext-mcrypt": "*", |
|||
"ext-hash": "*" |
|||
}, |
|||
"license": "MIT", |
|||
"authors": [ |
|||
{ |
|||
"name": "Dustin Wilson", |
|||
"email": "dustin@dustinwilson.com", |
|||
"homepage": "https://dustinwilson.com/" |
|||
} |
|||
], |
|||
"autoload": { |
|||
"psr-4": { |
|||
"dW\\HTML5\\": "lib/" |
|||
}, |
|||
"classmap": ["lib/Token.php"] |
|||
}, |
|||
"autoload-dev": { |
|||
"files": ["lib/Token.php"] |
|||
} |
|||
} |
@ -0,0 +1,22 @@ |
|||
{ |
|||
"_readme": [ |
|||
"This file locks the dependencies of your project to a known state", |
|||
"Read more about it at https://getcomposer.org/doc/01-basic-usage.md#composer-lock-the-lock-file", |
|||
"This file is @generated automatically" |
|||
], |
|||
"content-hash": "9ae6b8a7d4d830c471b21ccefc1ee8fb", |
|||
"packages": [], |
|||
"packages-dev": [], |
|||
"aliases": [], |
|||
"minimum-stability": "stable", |
|||
"stability-flags": [], |
|||
"prefer-stable": false, |
|||
"prefer-lowest": false, |
|||
"platform": { |
|||
"php": "^7.0", |
|||
"ext-intl": "*", |
|||
"ext-mcrypt": "*", |
|||
"ext-hash": "*" |
|||
}, |
|||
"platform-dev": [] |
|||
} |
File diff suppressed because it is too large
@ -0,0 +1,74 @@ |
|||
<?php |
|||
declare(strict_types=1); |
|||
namespace dW\HTML5; |
|||
|
|||
/**
 * Static helpers for locating nodes relative to a context element.
 *
 * Every method takes a "needle" describing the node to look for:
 *  - a string is matched against the nodeName of \DOMElement candidates;
 *  - a \DOMNode is matched by identity using isSameNode();
 *  - a \Closure is called with the candidate node and matches when it returns
 *    exactly true.
 *
 * Any other needle type makes compare() throw an Exception with code
 * Exception::DOM_DOMELEMENT_STRING_OR_CLOSURE_EXPECTED.
 *
 * The needle parameters are intentionally left without a native type: `mixed`
 * is not a type keyword before PHP 8.0 and would be resolved as a class name.
 * Likewise, methods that may return null carry no native return type because
 * nullable return types are unavailable in PHP 7.0.
 */
class DOM {
    /**
     * Finds the nearest ancestor of $context that matches $needle.
     *
     * @param string|\DOMNode|\Closure $needle
     * @param \DOMElement $context Element whose parent chain is walked.
     * @return \DOMNode|null The matching ancestor, or null when none matches.
     */
    public static function getAncestor($needle, \DOMElement $context) {
        return static::ancestor($needle, $context, true);
    }

    /**
     * Tells whether any ancestor of $context matches $needle.
     *
     * @param string|\DOMNode|\Closure $needle
     * @param \DOMElement $context Element whose parent chain is walked.
     */
    public static function hasAncestor($needle, \DOMElement $context): bool {
        return static::ancestor($needle, $context, false);
    }

    /**
     * Finds the first descendant of $context that matches $needle.
     *
     * @param string|\DOMNode|\Closure $needle
     * @param \DOMElement $context Element whose subtree is searched.
     * @return \DOMNode|null The matching descendant, or null when none matches.
     */
    public static function getDescendant($needle, \DOMElement $context) {
        return static::descendant($needle, $context, true);
    }

    /**
     * Tells whether any descendant of $context matches $needle.
     *
     * @param string|\DOMNode|\Closure $needle
     * @param \DOMElement $context Element whose subtree is searched.
     */
    public static function hasDescendant($needle, \DOMElement $context): bool {
        return static::descendant($needle, $context, false);
    }

    /**
     * Searches the subtree below $context depth-first: each child is compared
     * first, then its own subtree, before moving on to the next sibling.
     *
     * @param string|\DOMNode|\Closure $needle
     * @param \DOMElement $context Element whose subtree is searched.
     * @param bool $returnNode When true the matching node (or null) is returned;
     *                         when false a boolean is returned instead.
     * @return \DOMNode|bool|null
     */
    public static function descendant($needle, \DOMElement $context, bool $returnNode = true) {
        for ($child = $context->firstChild; $child !== null; $child = $child->nextSibling) {
            $match = static::compare($needle, $child);

            // Only elements satisfy this method's $context type, so other node
            // kinds are compared above but never passed to the recursive call.
            if ($match === null && $child instanceof \DOMElement) {
                $match = static::descendant($needle, $child, true);
            }

            if ($match !== null) {
                return $returnNode ? $match : true;
            }
        }

        return $returnNode ? null : false;
    }

    /**
     * Walks up the parentNode chain of $context and compares every ancestor
     * against $needle, nearest ancestor first.
     *
     * @param string|\DOMNode|\Closure $needle
     * @param \DOMElement $context Element whose parent chain is walked.
     * @param bool $returnNode When true the matching node (or null) is returned;
     *                         when false a boolean is returned instead.
     * @return \DOMNode|bool|null
     */
    protected static function ancestor($needle, \DOMElement $context, bool $returnNode = true) {
        for ($node = $context->parentNode; $node !== null; $node = $node->parentNode) {
            $match = static::compare($needle, $node);

            if ($match !== null) {
                return $returnNode ? $match : true;
            }
        }

        return $returnNode ? null : false;
    }

    /**
     * Tests a single candidate node against $needle.
     *
     * @param string|\DOMNode|\Closure $needle
     * @param \DOMNode $context Candidate node.
     * @return \DOMNode|null $context when it matches, otherwise null.
     * @throws Exception When $needle is not a string, \DOMNode or \Closure.
     */
    protected static function compare($needle, \DOMNode $context) {
        if (is_string($needle)) {
            // Strings only ever match element nodes, by node name.
            $matches = ($context instanceof \DOMElement && $context->nodeName === $needle);
        } elseif ($needle instanceof \DOMNode) {
            $matches = $context->isSameNode($needle);
        } elseif ($needle instanceof \Closure) {
            // Only a strict boolean true counts as a match.
            $matches = ($needle($context) === true);
        } else {
            throw new Exception(Exception::DOM_DOMELEMENT_STRING_OR_CLOSURE_EXPECTED, gettype($needle));
        }

        return $matches ? $context : null;
    }
}
File diff suppressed because one or more lines are too long
@ -0,0 +1,79 @@ |
|||
<?php |
|||
declare(strict_types=1); |
|||
namespace dW\HTML5; |
|||
|
|||
/**
 * Library exception whose message is looked up from a numeric code.
 *
 * Each code maps to a message template in $messages. Templates may contain
 * `%s` placeholders; the constructor requires exactly one argument per
 * placeholder and fills them in with sprintf().
 */
class Exception extends \Exception {
    const INVALID_CODE = 10000;
    const UNKNOWN_ERROR = 10001;
    const INCORRECT_PARAMETERS_FOR_MESSAGE = 10002;

    const PARSER_DOMDOCUMENT_EXPECTED = 10101;
    const PARSER_DOMELEMENT_DOMDOCUMENT_DOMDOCUMENTFRAG_EXPECTED = 10102;
    const PARSER_DOMNODE_EXPECTED = 10103;

    const STACK_INVALID_INDEX = 10201;
    const STACK_DOMNODE_ONLY = 10202;

    const DATASTREAM_NODATA = 10301;
    const DATASTREAM_INVALID_DATA_CONSUMPTION_LENGTH = 10302;

    const DOM_DOMELEMENT_STRING_OR_CLOSURE_EXPECTED = 10401;

    /** Message templates keyed by exception code. */
    protected static $messages = [
        10000 => 'Invalid error code',
        10001 => 'Unknown error; escaping',
        10002 => 'Incorrect number of parameters for Exception message; %s expected',

        10101 => 'DOMDocument expected; found %s',
        10102 => 'DOMElement, DOMDocument, or DOMDocumentFrag expected; found %s',
        10103 => 'DOMNode expected; found %s',

        10201 => '%s is an invalid index',
        10202 => 'Instances of DOMNode are the only types allowed in an HTML5Stack',

        10301 => 'Data string expected; found %s',
        10302 => '%s is an invalid data consumption length; a value of 1 or above is expected',

        10401 => 'The first argument must either be an instance of \DOMElement, a string, or a closure; found %s',
    ];

    /**
     * @param int $code One of the class constants; must be a key of $messages.
     * @param mixed ...$args Values for the `%s` placeholders of the message.
     *                       A \Throwable passed as the first or the last
     *                       argument is removed from the list and used as the
     *                       previous exception.
     * @throws Exception With code INVALID_CODE when $code has no message, or
     *                   INCORRECT_PARAMETERS_FOR_MESSAGE when the number of
     *                   remaining arguments differs from the placeholder count.
     */
    public function __construct(int $code, ...$args) {
        if (!isset(static::$messages[$code])) {
            throw new self(static::INVALID_CODE);
        }

        $message = static::$messages[$code];
        $previous = null;

        // Grab a previous exception if there is one. The isset() guard matters:
        // codes without placeholders are constructed with no extra arguments,
        // and reading $args[0] then would raise an undefined-offset error.
        if (isset($args[0]) && $args[0] instanceof \Throwable) {
            $previous = array_shift($args);
        } elseif (end($args) instanceof \Throwable) {
            $previous = array_pop($args);
        }

        // The number of placeholders must match the number of arguments left.
        $count = substr_count($message, '%s');
        if (count($args) !== $count) {
            throw new self(static::INCORRECT_PARAMETERS_FOR_MESSAGE, $count);
        }

        if ($count > 0) {
            $message = sprintf($message, ...$args);
        }

        parent::__construct($message, $code, $previous);
    }
}
@ -0,0 +1,86 @@ |
|||
<?php |
|||
declare(strict_types=1); |
|||
namespace dW\HTML5; |
|||
|
|||
/**
 * Reports HTML5 parse errors as E_USER_WARNING errors.
 *
 * trigger() formats a message from a numeric code, temporarily installs
 * errorHandler() as the PHP error handler, raises the warning and restores
 * the previous handler afterwards.
 */
class ParseError {
    // DataStream object passed to it used to get information used in error
    // reporting.
    public static $data;

    const TAG_NAME_EXPECTED = 0;
    const UNEXPECTED_EOF = 1;
    const UNEXPECTED_CHARACTER = 2;
    const ATTRIBUTE_EXISTS = 3;
    const UNEXPECTED_TAG_END = 4;
    const UNEXPECTED_START_TAG = 5;
    const UNEXPECTED_END_TAG = 6;
    const UNEXPECTED_DOCTYPE = 7;
    const INVALID_DOCTYPE = 8;
    const INVALID_CONTROL_OR_NONCHARACTERS = 9;

    /** Message templates indexed by the constants above. */
    protected static $messages = [
        'Tag name expected; found %s',
        'Unexpected end-of-file; %s expected',
        'Unexpected "%s" character; %s expected',
        '%s attribute already exists; discarding',
        'Unexpected tag end; %s expected',
        'Unexpected %s start tag; %s expected',
        'Unexpected %s end tag; %s expected',
        'Unexpected DOCTYPE; %s expected',
        'Invalid DOCTYPE',
        'Invalid Control or Non-character; removing',
    ];

    /**
     * PHP error-handler callback that prints parse errors.
     *
     * For E_USER_WARNING it echoes the message together with the file path
     * and, when the stream's length is not 0, the line and column read from
     * static::$data. Any other error code is left to the next handler.
     *
     * $file and $line are accepted to satisfy the handler signature but are
     * not used. $context is optional because PHP 8 no longer passes a fifth
     * argument to error handlers.
     *
     * @return bool true when the error was printed here, false otherwise.
     */
    public static function errorHandler($code, $message, $file, $line, array $context = []) {
        if ($code !== E_USER_WARNING) {
            // Returning false lets PHP's normal error handling continue
            // instead of silently discarding the error.
            return false;
        }

        $errMsg = sprintf("HTML5 Parse Error: \"%s\" in %s", $message, static::$data->filePath);

        if (static::$data->length !== 0) {
            $errMsg .= sprintf(" on line %s, column %s\n", static::$data->line, static::$data->column);
        } else {
            $errMsg .= "\n";
        }

        echo $errMsg;

        return true;
    }

    /**
     * Raises the parse error identified by $code.
     *
     * @param int $code One of the class constants.
     * @param DataStream $data Stream stored in static::$data for errorHandler().
     * @param mixed ...$args One value per `%s` placeholder in the message. The
     *                       strings "\n" and "\t" are shown as "Newline" and "Tab".
     * @return bool The return value of trigger_error().
     * @throws Exception With code INVALID_CODE when $code has no message, or
     *                   INCORRECT_PARAMETERS_FOR_MESSAGE when the argument
     *                   count differs from the placeholder count.
     */
    public static function trigger(int $code, DataStream $data, ...$args): bool {
        if (!isset(static::$messages[$code])) {
            throw new Exception(Exception::INVALID_CODE);
        }

        $message = static::$messages[$code];

        // The number of placeholders must match the number of arguments. This
        // is validated before the handler is installed so that a thrown
        // exception cannot leave the handler registered.
        $count = substr_count($message, '%s');
        if (count($args) !== $count) {
            throw new Exception(Exception::INCORRECT_PARAMETERS_FOR_MESSAGE, $count);
        }

        if ($count > 0) {
            // Convert newlines and tabs in the arguments to words to better
            // express what they are. Strict comparison keeps non-string
            // arguments (e.g. true or 0) from being mistaken for a newline.
            $args = array_map(function ($value) {
                if ($value === "\n") {
                    return 'Newline';
                }
                if ($value === "\t") {
                    return 'Tab';
                }
                return $value;
            }, $args);

            $message = sprintf($message, ...$args);
        }

        static::$data = $data;

        // Set the error handler and honor already-set error reporting rules.
        set_error_handler([self::class, 'errorHandler'], error_reporting());

        try {
            return trigger_error($message, E_USER_WARNING);
        } finally {
            restore_error_handler();
        }
    }
}
File diff suppressed because it is too large
@ -0,0 +1,90 @@ |
|||
<?php |
|||
declare(strict_types=1); |
|||
namespace dW\HTML5; |
|||
|
|||
/**
 * Array-backed stack with array-style access and a few computed properties.
 *
 * Items are appended with `$stack[] = $item` and removed with pop(). The
 * magic properties `length`, `currentNode`, `adjustedCurrentNode` and
 * `currentNodeName` are provided through __get().
 *
 * The `#[\ReturnTypeWillChange]` lines are attributes on PHP 8 and plain
 * comments on PHP 7; they silence the PHP 8.1 deprecation notice for
 * ArrayAccess methods declared without return types.
 */
class Stack implements \ArrayAccess {
    protected $storage = [];

    // Temporarily change this from DOMNode to HTML5StartTagToken for the purposes of
    // testing the tokenizer.
    /**
     * Stores $value at $offset, or appends it when $offset is null.
     *
     * @throws Exception With code STACK_INVALID_INDEX for a negative offset.
     */
    #[\ReturnTypeWillChange]
    public function offsetSet($offset, $value) {
        if (is_null($offset)) {
            $this->storage[] = $value;
            return;
        }

        if ($offset < 0) {
            // The message for this code has a placeholder for the offending index.
            throw new Exception(Exception::STACK_INVALID_INDEX, $offset);
        }

        $this->storage[$offset] = $value;
    }

    /**
     * Tells whether an item is stored at $offset.
     */
    #[\ReturnTypeWillChange]
    public function offsetExists($offset) {
        return isset($this->storage[$offset]);
    }

    /**
     * Removes the item stored at $offset, if any.
     *
     * @throws Exception With code STACK_INVALID_INDEX for a negative offset.
     */
    #[\ReturnTypeWillChange]
    public function offsetUnset($offset) {
        if ($offset < 0) {
            throw new Exception(Exception::STACK_INVALID_INDEX, $offset);
        }

        unset($this->storage[$offset]);
    }

    /**
     * Returns the item stored at $offset, or null when nothing is stored there.
     *
     * @throws Exception With code STACK_INVALID_INDEX for a negative offset.
     */
    #[\ReturnTypeWillChange]
    public function offsetGet($offset) {
        if ($offset < 0) {
            throw new Exception(Exception::STACK_INVALID_INDEX, $offset);
        }

        return $this->storage[$offset] ?? null;
    }

    /**
     * Removes and returns the last item; returns null when the stack is empty.
     */
    public function pop() {
        return array_pop($this->storage);
    }

    /**
     * Finds the index of the first stored item that matches $needle.
     *
     * A \DOMNode needle is matched by identity using isSameNode(); a string
     * needle is compared with each item's nodeName. Items are examined in
     * storage order.
     *
     * This is an instance method: it reads the instance's storage, which a
     * static method cannot do.
     *
     * @param string|\DOMNode $needle
     * @return int The index of the match, or -1 when the needle is empty, of an
     *             unsupported type, or not found.
     */
    public function search($needle): int {
        if (!$needle) {
            return -1;
        }

        if ($needle instanceof \DOMNode) {
            foreach ($this->storage as $key => $value) {
                if ($value instanceof \DOMNode && $value->isSameNode($needle)) {
                    return $key;
                }
            }
        } elseif (is_string($needle)) {
            foreach ($this->storage as $key => $value) {
                if ($value->nodeName === $needle) {
                    return $key;
                }
            }
        }

        return -1;
    }

    /**
     * Provides the computed read-only properties.
     *
     *  - length: number of stored items.
     *  - currentNode: the last stored item, or null when it is empty/falsy.
     *  - adjustedCurrentNode: see the quoted specification text below.
     *  - currentNodeName: nodeName of currentNode, or null.
     *
     * Any other property name yields null.
     */
    public function __get($property) {
        switch ($property) {
            case 'length':
                return count($this->storage);

            case 'currentNode':
                $currentNode = end($this->storage);
                return ($currentNode) ? $currentNode : null;

            case 'adjustedCurrentNode':
                # The adjusted current node is the context element if the parser was created by
                # the HTML fragment parsing algorithm and the stack of open elements has only one
                # element in it (fragment case); otherwise, the adjusted current node is the
                # current node.
                return (Parser::$self->fragmentCase && count($this->storage) === 1)
                    ? Parser::$self->fragmentContext
                    : $this->currentNode;

            case 'currentNodeName':
                $currentNode = $this->currentNode;
                return ($currentNode && $currentNode->nodeType) ? $currentNode->nodeName : null;

            default:
                return null;
        }
    }
}
@ -0,0 +1,83 @@ |
|||
<?php |
|||
declare(strict_types=1); |
|||
namespace dW\HTML5; |
|||
|
|||
/**
 * Abstract root of the token class hierarchy; declares no members itself.
 */
abstract class Token {
}
|||
|
|||
/**
 * Abstract token that carries a single string payload.
 */
abstract class DataToken extends Token {
    /** Payload; always stored as a string. */
    public $data;

    /**
     * @param mixed $data Value to store; it is cast to string.
     */
    public function __construct($data) {
        $text = (string)$data;
        $this->data = $text;
    }
}
|||
|
|||
/**
 * Abstract token identified by a tag name.
 */
abstract class TagToken extends Token {
    /** Tag name; always stored as a string. */
    public $name;

    /**
     * @param mixed $name Value to store; it is cast to string.
     */
    public function __construct($name) {
        $tagName = (string)$name;
        $this->name = $tagName;
    }
}
|||
|
|||
/**
 * Token type without any payload (presumably marks the end of the input —
 * confirm against the tokenizer).
 */
class EOFToken extends Token {
}
|||
|
|||
/**
 * Token describing a DOCTYPE: its name plus public and system identifiers.
 */
class DOCTYPEToken extends Token {
    public $forceQuirks = false;
    // Declared explicitly: the constructor assigns it, and creating it as a
    // dynamic property is deprecated as of PHP 8.2.
    public $name;
    public $public;
    public $system;

    /**
     * All three values are cast to string, so an omitted (null) argument is
     * stored as an empty string.
     *
     * @param mixed $name
     * @param mixed $public
     * @param mixed $system
     */
    public function __construct($name = null, $public = null, $system = null) {
        $this->name = (string)$name;

        $this->public = (string)$public;
        $this->system = (string)$system;
    }
}
|||
|
|||
/**
 * Concrete DataToken; adds nothing beyond the inherited string payload.
 */
class CharacterToken extends DataToken {
}
|||
|
|||
/**
 * Concrete DataToken whose payload may be omitted on construction.
 */
class CommentToken extends DataToken {
    /**
     * @param mixed $data Payload handed to the parent constructor; defaults to
     *                    an empty string.
     */
    public function __construct($data = '') {
        parent::__construct($data);
    }
}
|||
|
|||
/**
 * Tag token that additionally carries a namespace, a self-closing flag and a
 * set of attributes.
 *
 * Attributes are kept as a name => value map of strings and are exposed
 * read-only through the magic `attributes` property.
 */
class StartTagToken extends TagToken {
    public $namespace;
    public $selfClosing;

    // Name => value map; stays null until the first setAttribute() call.
    protected $_attributes;

    /**
     * @param mixed $name Tag name, passed to the parent constructor.
     * @param bool $selfClosing
     * @param string $namespace Defaults to Parser::HTML_NAMESPACE.
     */
    public function __construct($name, bool $selfClosing = false, string $namespace = \dW\HTML5\Parser::HTML_NAMESPACE) {
        $this->selfClosing = $selfClosing;
        $this->namespace = $namespace;
        parent::__construct($name);
    }

    /**
     * Returns the value of the named attribute.
     *
     * No native return type is declared: values are stored as strings by
     * setAttribute(), and a missing attribute yields null.
     *
     * @return string|null The attribute value (which may be an empty string),
     *                     or null when the attribute is not set.
     */
    public function getAttribute(string $name) {
        return $this->_attributes[$name] ?? null;
    }

    /**
     * Tells whether the named attribute has been set.
     */
    public function hasAttribute(string $name): bool {
        return (isset($this->_attributes[$name]));
    }

    /**
     * Removes the named attribute; does nothing when it is not set.
     */
    public function removeAttribute(string $name) {
        unset($this->_attributes[$name]);
    }

    /**
     * Sets an attribute; both name and value are cast to string.
     */
    public function setAttribute($name, $value) {
        $this->_attributes[(string)$name] = (string)$value;
    }

    /**
     * Exposes the attribute map as the read-only property `attributes`; any
     * other property name yields null.
     */
    public function __get($property) {
        if ($property === 'attributes') {
            return $this->_attributes;
        }

        return null;
    }
}
|||
|
|||
/**
 * Concrete TagToken; adds nothing beyond the inherited tag name.
 */
class EndTagToken extends TagToken {
}
Loading…
Reference in new issue