Browse Source

Beginning Rewrite

ns
Dustin Wilson 6 years ago
parent
commit
a89f6c9f09
  1. 2
      .gitignore
  2. 28
      composer.json
  3. 22
      composer.lock
  4. 10023
      html5.php
  5. 74
      lib/DOM.php
  6. 439
      lib/DataStream.php
  7. 79
      lib/Exception.php
  8. 86
      lib/ParseError.php
  9. 3259
      lib/Parser.php
  10. 90
      lib/Stack.php
  11. 83
      lib/Token.php

2
.gitignore

@ -62,3 +62,5 @@ $RECYCLE.BIN/
# .nfs files are created when an open file is removed but is still being accessed
.nfs*
/vendor/

28
composer.json

@ -0,0 +1,28 @@
{
"name": "dw/html5",
"description": "Parses HTML5 text into a PHP DOMDocument",
"type": "library",
"require": {
"php": "^7.0",
"ext-intl": "*",
"ext-mcrypt": "*",
"ext-hash": "*"
},
"license": "MIT",
"authors": [
{
"name": "Dustin Wilson",
"email": "dustin@dustinwilson.com",
"homepage": "https://dustinwilson.com/"
}
],
"autoload": {
"psr-4": {
"dW\\HTML5\\": "lib/"
},
"classmap": ["lib/Token.php"]
},
"autoload-dev": {
"files": ["lib/Token.php"]
}
}

22
composer.lock

@ -0,0 +1,22 @@
{
"_readme": [
"This file locks the dependencies of your project to a known state",
"Read more about it at https://getcomposer.org/doc/01-basic-usage.md#composer-lock-the-lock-file",
"This file is @generated automatically"
],
"content-hash": "9ae6b8a7d4d830c471b21ccefc1ee8fb",
"packages": [],
"packages-dev": [],
"aliases": [],
"minimum-stability": "stable",
"stability-flags": [],
"prefer-stable": false,
"prefer-lowest": false,
"platform": {
"php": "^7.0",
"ext-intl": "*",
"ext-mcrypt": "*",
"ext-hash": "*"
},
"platform-dev": []
}

10023
html5.php

File diff suppressed because it is too large

74
lib/DOM.php

@ -0,0 +1,74 @@
<?php
declare(strict_types=1);
namespace dW\HTML5;
/**
 * Static helpers for searching a DOM tree upwards (ancestors) or downwards
 * (descendants) from a context element.
 *
 * Every search takes a "needle" that selects the node being looked for:
 *  - a string matches a \DOMElement whose nodeName is identical to it;
 *  - a \DOMNode matches that very node (checked with isSameNode());
 *  - a \Closure is called with each candidate node and matches when it
 *    returns exactly true.
 * Any other needle type makes compare() throw an Exception.
 *
 * The needle parameters deliberately carry no native type declaration:
 * "mixed" is only a built-in type from PHP 8.0 on; on PHP 7 it is resolved as
 * a class name, which would reject every string/closure needle. Methods that
 * may return null likewise carry no native return type and document their
 * result in PHPDoc instead.
 */
class DOM {
    /**
     * Finds the nearest ancestor of $context that matches $needle.
     *
     * @param string|\DOMNode|\Closure $needle What to look for (see class docs).
     * @param \DOMElement $context Element whose ancestors are searched.
     * @return \DOMNode|null The matching ancestor, or null when there is none.
     * @throws Exception When $needle is of an unsupported type.
     */
    public static function getAncestor($needle, \DOMElement $context) {
        return static::ancestor($needle, $context, true);
    }

    /**
     * Tells whether $context has an ancestor matching $needle.
     *
     * @param string|\DOMNode|\Closure $needle What to look for (see class docs).
     * @param \DOMElement $context Element whose ancestors are searched.
     * @throws Exception When $needle is of an unsupported type.
     */
    public static function hasAncestor($needle, \DOMElement $context): bool {
        return static::ancestor($needle, $context, false);
    }

    /**
     * Finds the first descendant of $context (depth-first, document order)
     * that matches $needle.
     *
     * @param string|\DOMNode|\Closure $needle What to look for (see class docs).
     * @param \DOMElement $context Element whose subtree is searched.
     * @return \DOMNode|null The matching descendant, or null when there is none.
     * @throws Exception When $needle is of an unsupported type.
     */
    public static function getDescendant($needle, \DOMElement $context) {
        return static::descendant($needle, $context, true);
    }

    /**
     * Tells whether $context has a descendant matching $needle.
     *
     * @param string|\DOMNode|\Closure $needle What to look for (see class docs).
     * @param \DOMElement $context Element whose subtree is searched.
     * @throws Exception When $needle is of an unsupported type.
     */
    public static function hasDescendant($needle, \DOMElement $context): bool {
        return static::descendant($needle, $context, false);
    }

    /**
     * Depth-first search through the subtree below $context.
     *
     * Each child is compared first, then its own subtree is searched, then
     * the next sibling is visited.
     *
     * @param string|\DOMNode|\Closure $needle What to look for (see class docs).
     * @param \DOMElement $context Element whose subtree is searched.
     * @param bool $returnNode True to return the node, false to return a boolean.
     * @return \DOMNode|bool|null With $returnNode true: the match or null.
     *                            With $returnNode false: true or false.
     * @throws Exception When $needle is of an unsupported type.
     */
    public static function descendant($needle, \DOMElement $context, bool $returnNode = true) {
        for ($child = $context->firstChild; $child !== null; $child = $child->nextSibling) {
            $match = static::compare($needle, $child);

            // Only elements are descended into: text, comment and similar
            // nodes do not satisfy this method's \DOMElement parameter.
            if ($match === null && $child instanceof \DOMElement) {
                $match = static::descendant($needle, $child, true);
            }

            if ($match !== null) {
                return ($returnNode === true) ? $match : true;
            }
        }

        return ($returnNode === true) ? null : false;
    }

    /**
     * Walks up the parentNode chain of $context looking for a match.
     *
     * @param string|\DOMNode|\Closure $needle What to look for (see class docs).
     * @param \DOMElement $context Element whose ancestors are searched.
     * @param bool $returnNode True to return the node, false to return a boolean.
     * @return \DOMNode|bool|null With $returnNode true: the match or null.
     *                            With $returnNode false: true or false.
     * @throws Exception When $needle is of an unsupported type.
     */
    protected static function ancestor($needle, \DOMElement $context, bool $returnNode = true) {
        for ($node = $context->parentNode; $node !== null; $node = $node->parentNode) {
            $match = static::compare($needle, $node);
            if ($match !== null) {
                return ($returnNode === true) ? $match : true;
            }
        }

        return ($returnNode === true) ? null : false;
    }

    /**
     * Tests a single node against the needle.
     *
     * @param string|\DOMNode|\Closure $needle What to look for (see class docs).
     * @param \DOMNode $context The candidate node.
     * @return \DOMNode|null $context when it matches, null otherwise.
     * @throws Exception When $needle is not a string, \DOMNode or \Closure.
     */
    protected static function compare($needle, \DOMNode $context) {
        if (is_string($needle)) {
            return ($context instanceof \DOMElement && $context->nodeName === $needle) ? $context : null;
        }

        if ($needle instanceof \DOMNode) {
            return $context->isSameNode($needle) ? $context : null;
        }

        if ($needle instanceof \Closure) {
            return ($needle($context) === true) ? $context : null;
        }

        throw new Exception(Exception::DOM_DOMELEMENT_STRING_OR_CLOSURE_EXPECTED, gettype($needle));
    }
}

439
lib/DataStream.php

File diff suppressed because one or more lines are too long

79
lib/Exception.php

@ -0,0 +1,79 @@
<?php
declare(strict_types=1);
namespace dW\HTML5;
/**
 * Library exception whose message is looked up from a numeric code.
 *
 * Codes are grouped by the component named in the constant: general
 * (1000x), Parser (101xx), Stack (102xx), DataStream (103xx) and DOM (104xx).
 * Each message may contain "%s" placeholders which are filled from the extra
 * constructor arguments.
 */
class Exception extends \Exception {
    const INVALID_CODE = 10000;
    const UNKNOWN_ERROR = 10001;
    const INCORRECT_PARAMETERS_FOR_MESSAGE = 10002;

    const PARSER_DOMDOCUMENT_EXPECTED = 10101;
    const PARSER_DOMELEMENT_DOMDOCUMENT_DOMDOCUMENTFRAG_EXPECTED = 10102;
    const PARSER_DOMNODE_EXPECTED = 10103;

    const STACK_INVALID_INDEX = 10201;
    const STACK_DOMNODE_ONLY = 10202;

    const DATASTREAM_NODATA = 10301;
    const DATASTREAM_INVALID_DATA_CONSUMPTION_LENGTH = 10302;

    const DOM_DOMELEMENT_STRING_OR_CLOSURE_EXPECTED = 10401;

    /** @var array<int, string> Message templates keyed by error code. */
    protected static $messages = [
        10000 => 'Invalid error code',
        10001 => 'Unknown error; escaping',
        10002 => 'Incorrect number of parameters for Exception message; %s expected',

        10101 => 'DOMDocument expected; found %s',
        10102 => 'DOMElement, DOMDocument, or DOMDocumentFrag expected; found %s',
        10103 => 'DOMNode expected; found %s',

        10201 => '%s is an invalid index',
        10202 => 'Instances of DOMNode are the only types allowed in an HTML5Stack',

        10301 => 'Data string expected; found %s',
        10302 => '%s is an invalid data consumption length; a value of 1 or above is expected',

        10401 => 'The first argument must either be an instance of \DOMElement, a string, or a closure; found %s',
    ];

    /**
     * @param int $code One of this class's error code constants.
     * @param mixed ...$args Values for the message's "%s" placeholders. A
     *                       previous \Throwable may additionally be passed as
     *                       the first or the last of these arguments.
     * @throws Exception With INVALID_CODE when $code has no message, or with
     *                   INCORRECT_PARAMETERS_FOR_MESSAGE when the number of
     *                   arguments does not match the number of placeholders.
     */
    public function __construct(int $code, ...$args) {
        if (!isset(static::$messages[$code])) {
            throw new Exception(self::INVALID_CODE);
        }

        $message = static::$messages[$code];
        $previous = null;

        // Grab a previous exception if there is one. The null-coalescing
        // read keeps a call without extra arguments from touching an
        // undefined offset.
        if (($args[0] ?? null) instanceof \Throwable) {
            $previous = array_shift($args);
        } elseif (end($args) instanceof \Throwable) {
            $previous = array_pop($args);
        }

        // The remaining arguments must match the placeholders one to one.
        $count = substr_count($message, '%s');
        if (count($args) !== $count) {
            throw new Exception(self::INCORRECT_PARAMETERS_FOR_MESSAGE, $count);
        }

        if ($count > 0) {
            $message = sprintf($message, ...$args);
        }

        parent::__construct($message, $code, $previous);
    }
}

86
lib/ParseError.php

@ -0,0 +1,86 @@
<?php
declare(strict_types=1);
namespace dW\HTML5;
/**
 * Reports HTML5 parse errors as E_USER_WARNING errors.
 *
 * trigger() builds the message for a parse error code and raises it with
 * trigger_error() while errorHandler() is temporarily installed; the handler
 * prints the message together with the file path and, when available, the
 * line and column read from the current data stream.
 */
class ParseError {
    // DataStream object passed to it used to get information used in error
    // reporting.
    public static $data;

    const TAG_NAME_EXPECTED = 0;
    const UNEXPECTED_EOF = 1;
    const UNEXPECTED_CHARACTER = 2;
    const ATTRIBUTE_EXISTS = 3;
    const UNEXPECTED_TAG_END = 4;
    const UNEXPECTED_START_TAG = 5;
    const UNEXPECTED_END_TAG = 6;
    const UNEXPECTED_DOCTYPE = 7;
    const INVALID_DOCTYPE = 8;
    const INVALID_CONTROL_OR_NONCHARACTERS = 9;

    /** @var string[] Message templates indexed by parse error code. */
    protected static $messages = [
        'Tag name expected; found %s',
        'Unexpected end-of-file; %s expected',
        'Unexpected "%s" character; %s expected',
        '%s attribute already exists; discarding',
        'Unexpected tag end; %s expected',
        'Unexpected %s start tag; %s expected',
        'Unexpected %s end tag; %s expected',
        'Unexpected DOCTYPE; %s expected',
        'Invalid DOCTYPE',
        'Invalid Control or Non-character; removing',
    ];

    /**
     * Error handler callback that prints E_USER_WARNING parse errors.
     *
     * $context is optional because PHP 8 stopped passing the fifth
     * ($errcontext) argument to error handlers; it is not used here.
     *
     * @param int $code Error level of the raised error.
     * @param string $message The error message.
     * @param string $file File the error was raised in (unused).
     * @param int $line Line the error was raised on (unused).
     * @param array $context Legacy variable context (unused).
     * @return bool True when the error was printed; false for any other
     *              error level so PHP's standard handler deals with it.
     */
    public static function errorHandler($code, $message, $file, $line, array $context = []) {
        if ($code !== E_USER_WARNING) {
            return false;
        }

        $errMsg = sprintf("HTML5 Parse Error: \"%s\" in %s", $message, static::$data->filePath);
        if (static::$data->length !== 0) {
            $errMsg .= sprintf(" on line %s, column %s\n", static::$data->line, static::$data->column);
        } else {
            $errMsg .= "\n";
        }

        echo $errMsg;
        return true;
    }

    /**
     * Raises a parse error.
     *
     * @param int $code One of this class's parse error constants.
     * @param DataStream $data Stream the error refers to; stored in self::$data
     *                         for use by errorHandler().
     * @param mixed ...$args Values for the message's "%s" placeholders. The
     *                       strings "\n" and "\t" are shown as "Newline" and
     *                       "Tab".
     * @return bool The result of trigger_error().
     * @throws Exception With INVALID_CODE when $code has no message, or with
     *                   INCORRECT_PARAMETERS_FOR_MESSAGE when the number of
     *                   arguments does not match the number of placeholders.
     */
    public static function trigger(int $code, DataStream $data, ...$args): bool {
        if (!isset(static::$messages[$code])) {
            throw new Exception(Exception::INVALID_CODE);
        }

        $message = static::$messages[$code];

        // Validate and format before touching the error handler so that a
        // thrown exception cannot leave our handler installed.
        $count = substr_count($message, '%s');
        if (count($args) !== $count) {
            // The constant lives on Exception, not on this class.
            throw new Exception(Exception::INCORRECT_PARAMETERS_FOR_MESSAGE, $count);
        }

        if ($count > 0) {
            // Convert newlines and tabs in the arguments to words to better
            // express what they are. Strict comparison keeps non-string
            // values (e.g. integer 0) from being mistaken for them.
            $args = array_map(function ($value) {
                if ($value === "\n") {
                    return 'Newline';
                }
                if ($value === "\t") {
                    return 'Tab';
                }
                return $value;
            }, $args);

            $message = sprintf($message, ...$args);
        }

        static::$data = $data;

        // Set the error handler and honor already-set error reporting rules.
        set_error_handler([self::class, 'errorHandler'], error_reporting());
        try {
            return trigger_error($message, E_USER_WARNING);
        } finally {
            restore_error_handler();
        }
    }
}

3259
lib/Parser.php

File diff suppressed because it is too large

90
lib/Stack.php

@ -0,0 +1,90 @@
<?php
declare(strict_types=1);
namespace dW\HTML5;
/**
 * Array-like stack of nodes with a few read-only virtual properties.
 *
 * Virtual properties (served by __get()):
 *  - length: number of stored items;
 *  - currentNode: the last stored item, or null when the stack is empty;
 *  - adjustedCurrentNode: the parser's fragment context in the fragment
 *    case, otherwise the current node;
 *  - currentNodeName: nodeName of the current node, or null.
 */
class Stack implements \ArrayAccess {
    /** @var array The stored items, bottom of the stack first. */
    protected $storage = [];

    // Temporarily change this from DOMNode to HTML5StartTagToken for the purposes of
    // testing the tokenizer.
    /**
     * Stores $value at $offset, or appends it when $offset is null.
     *
     * @throws Exception With STACK_INVALID_INDEX when $offset is negative.
     */
    #[\ReturnTypeWillChange]
    public function offsetSet($offset, $value) {
        if ($offset === null) {
            $this->storage[] = $value;
            return;
        }

        if ($offset < 0) {
            throw new Exception(Exception::STACK_INVALID_INDEX, $offset);
        }

        $this->storage[$offset] = $value;
    }

    /**
     * Tells whether an item is stored at $offset.
     */
    public function offsetExists($offset): bool {
        return isset($this->storage[$offset]);
    }

    /**
     * Removes the item stored at $offset, if any.
     *
     * @throws Exception With STACK_INVALID_INDEX when $offset is negative.
     */
    #[\ReturnTypeWillChange]
    public function offsetUnset($offset) {
        if ($offset < 0) {
            throw new Exception(Exception::STACK_INVALID_INDEX, $offset);
        }

        unset($this->storage[$offset]);
    }

    /**
     * Returns the item stored at $offset, or null when there is none.
     *
     * @throws Exception With STACK_INVALID_INDEX when $offset is negative.
     */
    #[\ReturnTypeWillChange]
    public function offsetGet($offset) {
        if ($offset < 0) {
            throw new Exception(Exception::STACK_INVALID_INDEX, $offset);
        }

        return $this->storage[$offset] ?? null;
    }

    /**
     * Removes and returns the last item; null when the stack is empty.
     */
    public function pop() {
        return array_pop($this->storage);
    }

    /**
     * Finds the key of the first stored item matching $needle.
     *
     * This is an instance method because it reads the instance's storage. It
     * has no native return type because "not found" is reported as false.
     *
     * @param \DOMNode|string $needle A node (matched with isSameNode()) or a
     *                                node name (matched against nodeName).
     * @return int|string|false The key of the match, or false when $needle
     *                          is empty, unsupported or not found.
     */
    public function search($needle) {
        if (!$needle) {
            return false;
        }

        if ($needle instanceof \DOMNode) {
            foreach ($this->storage as $key => $value) {
                if ($value instanceof \DOMNode && $value->isSameNode($needle)) {
                    return $key;
                }
            }
        } elseif (is_string($needle)) {
            foreach ($this->storage as $key => $value) {
                if ($value->nodeName === $needle) {
                    return $key;
                }
            }
        }

        return false;
    }

    /**
     * Serves the virtual properties listed in the class documentation.
     *
     * @return mixed The property value, or null for an unknown property.
     */
    public function __get($property) {
        switch ($property) {
            case 'length':
                return count($this->storage);

            case 'currentNode':
                $currentNode = end($this->storage);
                return ($currentNode) ? $currentNode : null;

            case 'adjustedCurrentNode':
                # The adjusted current node is the context element if the parser was created by
                # the HTML fragment parsing algorithm and the stack of open elements has only one
                # element in it (fragment case); otherwise, the adjusted current node is the
                # current node.
                return (Parser::$self->fragmentCase && $this->length === 1) ? Parser::$self->fragmentContext : $this->currentNode;

            case 'currentNodeName':
                $currentNode = $this->currentNode;
                return ($currentNode && $currentNode->nodeType) ? $currentNode->nodeName : null;

            default:
                return null;
        }
    }
}

83
lib/Token.php

@ -0,0 +1,83 @@
<?php
declare(strict_types=1);
namespace dW\HTML5;
/**
 * Base type of every token handled by this library.
 */
abstract class Token {}

/**
 * Token that carries a string payload in $data.
 */
abstract class DataToken extends Token {
    /** @var string */
    public $data;

    /**
     * @param mixed $data Payload; cast to string.
     */
    public function __construct($data) {
        $this->data = (string)$data;
    }
}

/**
 * Token that carries a tag name in $name.
 */
abstract class TagToken extends Token {
    /** @var string */
    public $name;

    /**
     * @param mixed $name Tag name; cast to string.
     */
    public function __construct($name) {
        $this->name = (string)$name;
    }
}

/**
 * End-of-file token; carries no data.
 */
class EOFToken extends Token {}

/**
 * DOCTYPE token with its name, public and system values.
 */
class DOCTYPEToken extends Token {
    /** @var bool */
    public $forceQuirks = false;
    /** @var string Declared explicitly so the constructor does not create a dynamic property. */
    public $name;
    /** @var string */
    public $public;
    /** @var string */
    public $system;

    /**
     * All three values are cast to string, so an omitted (null) value is
     * stored as the empty string.
     */
    public function __construct($name = null, $public = null, $system = null) {
        $this->name = (string)$name;
        $this->public = (string)$public;
        $this->system = (string)$system;
    }
}

/**
 * Character data token.
 */
class CharacterToken extends DataToken {}

/**
 * Comment token; its data defaults to the empty string.
 */
class CommentToken extends DataToken {
    public function __construct($data = '') {
        parent::__construct($data);
    }
}

/**
 * Start tag token with a namespace, a self-closing flag and attributes.
 *
 * Attributes are kept as a name => value map of strings and are exposed
 * read-only through the virtual "attributes" property.
 */
class StartTagToken extends TagToken {
    /** @var string */
    public $namespace;
    /** @var bool */
    public $selfClosing;
    /** @var array<string, string> Attribute values keyed by attribute name. */
    protected $_attributes = [];

    public function __construct($name, bool $selfClosing = false, string $namespace = \dW\HTML5\Parser::HTML_NAMESPACE) {
        $this->selfClosing = $selfClosing;
        $this->namespace = $namespace;
        parent::__construct($name);
    }

    /**
     * Returns the value of the named attribute.
     *
     * There is no native return type: values are stored as strings by
     * setAttribute(), and null is returned for a missing attribute.
     *
     * @return string|null The value (possibly ''), or null when not set.
     */
    public function getAttribute(string $name) {
        return $this->_attributes[$name] ?? null;
    }

    /**
     * Tells whether the named attribute is set.
     */
    public function hasAttribute(string $name): bool {
        return isset($this->_attributes[$name]);
    }

    /**
     * Removes the named attribute, if set.
     */
    public function removeAttribute(string $name) {
        unset($this->_attributes[$name]);
    }

    /**
     * Sets an attribute; both name and value are cast to string.
     */
    public function setAttribute($name, $value) {
        $this->_attributes[(string)$name] = (string)$value;
    }

    /**
     * Serves the virtual "attributes" property.
     *
     * @return array<string, string>|null The attribute map for "attributes",
     *                                    null for any other property.
     */
    public function __get($property) {
        if ($property === 'attributes') {
            return $this->_attributes;
        }

        return null;
    }
}

/**
 * End tag token.
 */
class EndTagToken extends TagToken {}
Loading…
Cancel
Save