diff --git a/.gitignore b/.gitignore
index ab58724..39bf42e 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,5 +1,5 @@
 # html5-parser specific
-test.php
+test*.php
 
 # General
 *.DS_Store
diff --git a/lib/DOM/DOMException.php b/lib/DOM/DOMException.php
new file mode 100644
index 0000000..6e8fad0
--- /dev/null
+++ b/lib/DOM/DOMException.php
@@ -0,0 +1,54 @@
+        'Modification not allowed here',
+
+        100 => 'Element, Document, or DOMDocumentFragment expected; found %s',
+        101 => 'The first argument must either be an instance of \DOMNode, a string, or a closure; found %s',
+        102 => 'Failed to set the "outerHTML" property; the element does not have a parent node'
+    ];
+
+    public function __construct(int $code, ...$args) {
+        if (!isset(static::$messages[$code])) {
+            throw new Exception(Exception::INVALID_CODE);
+        }
+
+        $message = static::$messages[$code];
+        $previous = null;
+
+        if ($args) {
+            // Grab a previous exception if there is one.
+            if ($args[0] instanceof \Throwable) {
+                $previous = array_shift($args);
+            } elseif (end($args) instanceof \Throwable) {
+                $previous = array_pop($args);
+            }
+        }
+
+        // Count the number of replacements needed in the message.
+        preg_match_all('/(\%(?:\d+\$)?s)/', $message, $matches);
+        $count = count(array_unique($matches[1]));
+
+        // If the number of replacements doesn't match the arguments then oops.
+        if (count($args) !== $count) {
+            throw new Exception(Exception::INCORRECT_PARAMETERS_FOR_MESSAGE, $count);
+        }
+
+        if ($count > 0) {
+            // Go through each of the arguments and run sprintf on the strings.
+            $message = call_user_func_array('sprintf', array_merge([$message], $args));
+        }
+
+        parent::__construct($message, $code, $previous);
+    }
+}
diff --git a/lib/DOM/Element.php b/lib/DOM/Element.php
index 6b7ea83..1e3acf7 100644
--- a/lib/DOM/Element.php
+++ b/lib/DOM/Element.php
@@ -10,8 +10,6 @@ class Element extends \DOMElement {
     // Used for template elements
     public $content = null;
 
-    protected const SELF_CLOSING_ELEMENTS = ['area', 'base', 'basefont', 'bgsound', 'br', 'col', 'embed', 'frame', 'hr', 'img', 'input', 'link', 'meta', 'param', 'source', 'track', 'wbr'];
-
     public function setAttribute($name, $value) {
         try {
             parent::setAttribute($name, $value);
@@ -24,7 +22,7 @@ class Element extends \DOMElement {
             parent::setAttribute($name, $value);
         }
     }
-    
+
     public function setAttributeNS($namespaceURI, $qualifiedName, $value) {
         try {
             parent::setAttributeNS($namespaceURI, $qualifiedName, $value);
@@ -38,6 +36,120 @@ class Element extends \DOMElement {
         }
     }
 
+    public function __get(string $prop) {
+        switch ($prop) {
+            ### DOM Parsing Specification ###
+            # 2.3 The InnerHTML mixin
+            #
+            # On getting, return the result of invoking the fragment serializing algorithm
+            # on the context object providing true for the require well-formed flag (this
+            # might throw an exception instead of returning a string).
+            // DEVIATION: Parsing of XML documents will not be handled by this
+            // implementation, so there's no need for the well-formed flag.
+            case 'innerHTML': return $this->serialize($this);
+            break;
+            ### DOM Parsing Specification ###
+            # 2.4 Extensions to the Element interface
+            # outerHTML
+            #
+            # On getting, return the result of invoking the fragment serializing algorithm on a fictional node whose only child is the context object providing true for the require well-formed flag (this might throw an exception instead of returning a string).
+            // DEVIATION: Parsing of XML documents will not be handled by this
+            // implementation, so there's no need for the well-formed flag.
+            // OPTIMIZATION: When following the instructions above the fragment serializing
+            // algorithm (Element::serialize) would invoke Element::__toString, so just
+            // doing that instead of multiple function calls.
+            case 'outerHTML': return $this->__toString();
+            break;
+        }
+    }
+
+    public function __set(string $prop, $value) {
+        switch ($prop) {
+            case 'innerHTML':
+                ### DOM Parsing Specification ###
+                # 2.3 The InnerHTML mixin
+                #
+                # On setting, these steps must be run:
+                # 1. Let context element be the context object's host if the context object is a
+                # ShadowRoot object, or the context object otherwise.
+                // DEVIATION: There is no scripting in this implementation.
+
+                # 2. Let fragment be the result of invoking the fragment parsing algorithm with
+                # the new value as markup, and with context element.
+                $frag = Parser::parse($value, $this->ownerDocument, $this->ownerDocument->documentEncoding, $this);
+
+                # 3. If the context object is a template element, then let context object be the
+                # template's template contents (a DocumentFragment).
+                if ($this->nodeName === 'template') {
+                    $this->content = $frag;
+                }
+                # 4. Replace all with fragment within the context object.
+                else {
+                    # To replace all with a node within a parent, run these steps:
+                    #
+                    # 1. Let removedNodes be parent’s children.
+                    // DEVIATION: removedNodes is used below for scripting. There is no scripting in
+                    // this implementation.
+
+                    # 2. Let addedNodes be the empty set.
+                    // DEVIATION: addedNodes is used below for scripting. There is no scripting in
+                    // this implementation.
+
+                    # 3. If node is a DocumentFragment node, then set addedNodes to node’s
+                    # children.
+
+                    // DEVIATION: Again, there is no scripting in this implementation.
+                    # 4. Otherwise, if node is non-null, set addedNodes to « node ».
+                    // DEVIATION: Yet again, there is no scripting in this implementation.
+
+                    # 5. Remove all parent’s children, in tree order, with the suppress observers
+                    # flag set.
+                    // DEVIATION: There are no observers to suppress as there is no scripting in
+                    // this implementation.
+                    while ($this->hasChildNodes()) {
+                        $this->removeChild($this->firstChild);
+                    }
+
+                    # 6. If node is non-null, then insert node into parent before null with the
+                    # suppress observers flag set.
+                    // DEVIATION: Yet again, there is no scripting in this implementation; the
+                    // insertion itself is performed by the append at the end of this branch.
+
+                    # 7. If either addedNodes or removedNodes is not empty, then queue a tree
+                    # mutation record for parent with addedNodes, removedNodes, null, and null.
+                    // DEVIATION: Normally the tree mutation record would do the actual replacement,
+                    // but there is no scripting in this implementation. Going to simply append the
+                    // fragment instead.
+                    $this->appendChild($frag);
+                }
+            break;
+
+            case 'outerHTML':
+                ### DOM Parsing Specification ###
+                # 2.4 Extensions to the Element interface
+                # outerHTML
+                #
+                # On setting, the following steps must be run:
+                # 1. Let parent be the context object's parent.
+                $parent = $this->parentNode;
+
+                # 2. If parent is null, terminate these steps. There would be no way to obtain a
+                # reference to the nodes created even if the remaining steps were run.
+                // The spec is unclear here as to what to do. What do you return? Most browsers
+                // throw an exception here, so that's what we're going to do.
+                if ($parent === null) {
+                    throw new DOMException(DOMException::OUTER_HTML_FAILED_NOPARENT);
+                }
+                # 3. If parent is a Document, throw a "NoModificationAllowedError" DOMException.
+                elseif ($parent instanceof Document) {
+                    throw new DOMException(DOMException::NO_MODIFICATION_ALLOWED);
+                }
+                # 4. If parent is a DocumentFragment, let parent be a new Element with:
+                // TODO: The setter stops here; nothing is parsed or replaced after the checks above.
+            break;
+        }
+    }
+
     public function __toString(): string {
         # If current node is an element in the HTML namespace, the MathML namespace,
         # or the SVG namespace, then let tagname be current node’s local name.
@@ -48,9 +160,21 @@ class Element extends \DOMElement {
             $tagName = $this->nodeName;
         }
 
+        // Since tag names can contain characters that are invalid in PHP's XML DOM,
+        // uncoerce the name when printing.
+        if (strpos($tagName, 'U') !== false) {
+            $tagName = $this->uncoerceName($tagName);
+        }
+
         # Append a U+003C LESS-THAN SIGN character (<), followed by tagname.
         $s = "<$tagName";
 
+        # If current node's is value is not null, and the element does not have an is
+        # attribute in its attribute list, then append the string " is="", followed by
+        # current node's is value escaped as described below in attribute mode, followed
+        # by a U+0022 QUOTATION MARK character (").
+        // DEVIATION: There is no scripting support in this implementation.
+
         # For each attribute that the element has, append a U+0020 SPACE character,
         # the attribute’s serialized name as described below, a U+003D EQUALS SIGN
         # character (=), a U+0022 QUOTATION MARK character ("), the attribute’s value,
@@ -113,10 +237,9 @@ class Element extends \DOMElement {
         # Append a U+003E GREATER-THAN SIGN character (>).
         $s .= '>';
 
-        # If current node is an area, base, basefont, bgsound, br, col, embed, frame,
-        # hr, img, input, link, meta, param, source, track or wbr element, then continue
-        # on to the next child node at this point.
-        if (in_array($tagName, self::SELF_CLOSING_ELEMENTS)) {
+        # If current node serializes as void, then continue on to the next child node at
+        # this point.
+        if ($this->serializesAsVoid()) {
             return $s;
         }
 
diff --git a/lib/DOM/traits/Compare.php b/lib/DOM/traits/Compare.php
index 852bc83..831b431 100644
--- a/lib/DOM/traits/Compare.php
+++ b/lib/DOM/traits/Compare.php
@@ -17,7 +17,7 @@ trait Compare {
                 return $context;
             }
         } else {
-            throw new Exception(Exception::DOM_DOMNODE_STRING_OR_CLOSURE_EXPECTED, gettype($needle));
+            throw new DOMException(DOMException::STRING_OR_CLOSURE_EXPECTED, gettype($needle));
         }
 
         return null;
diff --git a/lib/DOM/traits/EscapeString.php b/lib/DOM/traits/EscapeString.php
index 7b7f98c..7a9cdeb 100644
--- a/lib/DOM/traits/EscapeString.php
+++ b/lib/DOM/traits/EscapeString.php
@@ -18,11 +18,7 @@ trait EscapeString {
         # 4. If the algorithm was not invoked in the attribute mode, replace any
         # occurrences of the "<" character by the string "&lt;", and any
         # occurrences of the ">" character by the string "&gt;".
-        if ($attribute) {
-            $string = str_replace(['"', '<', '>'], ['&quot;', '&lt;', '&gt;'], $string);
-        }
-
-        return $string;
+        return ($attribute) ? str_replace('"', '&quot;', $string) : str_replace(['<', '>'], ['&lt;', '&gt;'], $string);
     }
 
     protected function coerceName(string $name): string {
diff --git a/lib/DOM/traits/Serialize.php b/lib/DOM/traits/Serialize.php
index b576fa7..bee3164 100644
--- a/lib/DOM/traits/Serialize.php
+++ b/lib/DOM/traits/Serialize.php
@@ -3,21 +3,35 @@ declare(strict_types=1);
 namespace dW\HTML5;
 
 trait Serialize {
+    protected function serializesAsVoid(\DOMNode $node = null): bool {
+        $name = ($node ?? $this)->nodeName;
+        if (in_array($name, ['area', 'base', 'basefont', 'bgsound', 'br', 'col', 'embed', 'frame', 'hr', 'img', 'input', 'keygen', 'link', 'meta', 'param', 'source', 'track', 'wbr'], true)) {
+            return true;
+        }
+
+        return false;
+    }
+
     protected function serialize(\DOMNode $node = null): string {
         if (is_null($node)) {
             $node = $this;
         }
 
         if (!$node instanceof Element && !$node instanceof Document && !$node instanceof DocumentFragment) {
-            throw new Exception(Exception::DOM_ELEMENT_DOCUMENT_DOCUMENTFRAG_EXPECTED, gettype($node));
+            throw new DOMException(DOMException::DOCUMENT_DOCUMENTFRAG_EXPECTED, get_class($node));
         }
 
-        # 8.3. Serializing HTML fragments
+        # 13.3. Serializing HTML fragments
         #
-        # 1. Let s be a string, and initialize it to the empty string.
+        # 1. If the node serializes as void, then return the empty string.
+        if ($this->serializesAsVoid($node)) {
+            return '';
+        }
+
+        # 2. Let s be a string, and initialize it to the empty string.
         $s = '';
 
-        # 2. If the node is a template element, then let the node instead be the
+        # 3. If the node is a template element, then let the node instead be the
         # template element’s template contents (a DocumentFragment node).
         if ($node instanceof Element && $node->nodeName === 'template') {
             $node = $node->content;
@@ -43,7 +57,7 @@ trait Serialize {
             $start = 1;
         }
 
-        # 3. For each child node of the node, in tree order, run the following steps:
+        # 4. For each child node of the node, in tree order, run the following steps:
         for ($i = $start; $i < $nodesLength; $i++) {
             # 1. Let current node be the child node being processed.
             # 2. Append the appropriate string from the following list to s:
@@ -51,7 +65,7 @@ trait Serialize {
             }
         }
 
-        # 4. The result of the algorithm is the string s.
+        # 5. Return s.
         return $s;
     }
 }
\ No newline at end of file
diff --git a/lib/Exception.php b/lib/Exception.php
index 5acc025..259cafa 100644
--- a/lib/Exception.php
+++ b/lib/Exception.php
@@ -17,41 +17,35 @@ class Exception extends \Exception {
     const DATA_NODATA = 10301;
     const DATA_INVALID_DATA_CONSUMPTION_LENGTH = 10302;
 
-    const DOM_DOMNODE_STRING_OR_CLOSURE_EXPECTED = 10401;
-    const DOM_ELEMENT_DOCUMENT_DOCUMENTFRAG_EXPECTED = 10402;
+    const TOKENIZER_INVALID_STATE = 10401;
 
-    const TOKENIZER_INVALID_STATE = 10501;
+    const TREEBUILDER_FORMELEMENT_EXPECTED = 10501;
+    const TREEBUILDER_DOCUMENTFRAG_ELEMENT_DOCUMENT_DOCUMENTFRAG_EXPECTED = 10502;
+    const TREEBUILDER_UNEXPECTED_END_OF_FILE = 10503;
 
-    const TREEBUILDER_FORMELEMENT_EXPECTED = 10601;
-    const TREEBUILDER_DOCUMENTFRAG_ELEMENT_DOCUMENT_DOCUMENTFRAG_EXPECTED = 10602;
-    const TREEBUILDER_UNEXPECTED_END_OF_FILE = 10603;
+    protected static $messages = [
+        10000 => 'Invalid error code',
+        10001 => 'Unknown error; escaping',
+        10002 => 'Incorrect number of parameters for Exception message; %s expected',
 
-    const DOM_DISABLED_METHOD = 10701;
+        10101 => 'Non-empty Document supplied as argument for Parser',
 
-    protected static $messages = [10000 => 'Invalid error code',
-                                  10001 => 'Unknown error; escaping',
-                                  10002 => 'Incorrect number of parameters for Exception message; %s expected',
+        10201 => '%s is an invalid Stack index',
+        10202 => 'Element, Document, or DOMDocumentFragment expected for fragment context',
+        10203 => 'Element, string, or array expected',
+        10203 => 'String or array expected',
 
-                                  10101 => 'Non-empty Document supplied as argument for Parser',
+        10301 => 'Data string expected; found %s',
+        10302 => '%s is an invalid data consumption length; a value of 1 or above is expected',
 
-                                  10201 => '%s is an invalid Stack index',
-                                  10202 => 'Element, Document, or DOMDocumentFragment expected for fragment context',
-                                  10203 => 'Element, string, or array expected',
-                                  10203 => 'String or array expected',
+        10401 => 'The Tokenizer has entered an invalid state',
 
-                                  10301 => 'Data string expected; found %s',
-                                  10302 => '%s is an invalid data consumption length; a value of 1 or above is expected',
+        10501 => 'Form element expected, found %s',
+        10502 => 'Element, Document, or DOMDocumentFragment expected; found %s',
+        10503 => 'Unexpected end of file',
 
-                                  10401 => 'The first argument must either be an instance of \DOMNode, a string, or a closure; found %s',
-                                  10402 => 'Element, Document, or DOMDocumentFragment expected; found %s',
-
-                                  10501 => 'The Tokenizer has entered an invalid state',
-
-                                  10601 => 'Form element expected, found %s',
-                                  10602 => 'Element, Document, or DOMDocumentFragment expected; found %s',
-                                  10603 => 'Unexpected end of file',
-
-                                  10701 => 'Method %1$s::%2$s has been disabled from %1$s'];
+        10601 => 'Method %1$s::%2$s has been disabled from %1$s'
+    ];
 
     public function __construct(int $code, ...$args) {
         if (!isset(static::$messages[$code])) {
@@ -78,7 +72,7 @@ class Exception extends \Exception {
         if (count($args) !== $count) {
             throw new Exception(self::INCORRECT_PARAMETERS_FOR_MESSAGE, $count);
         }
-        
+
         if ($count > 0) {
             // Go through each of the arguments and run sprintf on the strings.
             $message = call_user_func_array('sprintf', array_merge([$message], $args));