2021-10-13 15:03:28 -04:00
|
|
|
<?php
|
|
|
|
/** @license MIT
|
|
|
|
* Copyright 2017, Dustin Wilson, J. King et al.
|
|
|
|
* See LICENSE and AUTHORS files for details */
|
|
|
|
|
|
|
|
declare(strict_types=1);
|
|
|
|
namespace MensBeam\HTML\Parser;
|
|
|
|
|
|
|
|
use MensBeam\HTML\Parser;
|
|
|
|
|
|
|
|
/**
 * Serializes DOM nodes to HTML text.
 *
 * The two public entry points mirror the DOM's outerHTML (serialize) and
 * innerHTML (serializeInner) getters. Comments beginning with "#" quote the
 * HTML fragment serialization algorithm being implemented; comments
 * beginning with "//" are implementation notes.
 */
abstract class Serializer {
    use NameCoercion;

    /** @var string[] Local names of HTML elements which are written without content or an end tag */
    protected const VOID_ELEMENTS = ["basefont", "bgsound", "frame", "keygen", "area", "base", "br", "col", "embed", "hr", "img", "input", "link", "meta", "param", "source", "track", "wbr"];
    /** @var string[] Local names of HTML elements whose text children are written literally, without escaping */
    protected const RAWTEXT_ELEMENTS = ["style", "script", "xmp", "iframe", "noembed", "noframes", "plaintext"];
    /**
     * Attributes whose value may be omitted when boolean attribute values are
     * not being serialized. Each key is an attribute name; the value is either
     * `true` (applies to every HTML element) or the list of HTML element names
     * the rule applies to.
     *
     * @var array<string, true|string[]>
     */
    protected const BOOLEAN_ATTRIBUTES = [
        'allowfullscreen' => ["iframe"],
        'async'           => ["script"],
        'autofocus'       => true,
        'autoplay'        => ["audio", "video"],
        'checked'         => ["input"],
        'controls'        => ["audio", "video"],
        'default'         => ["track"],
        'defer'           => ["script"],
        'disabled'        => ["button", "fieldset", "input", "link", "optgroup", "option", "select", "textarea"],
        'formnovalidate'  => ["button", "input"],
        'hidden'          => true,
        'ismap'           => ["img"],
        'itemscope'       => true,
        'loop'            => ["audio", "video"],
        'multiple'        => ["input", "select"],
        'muted'           => ["audio", "video"],
        'nomodule'        => ["script"],
        'novalidate'      => ["form"],
        'open'            => ["details", "dialog"],
        'playsinline'     => ["video"],
        'readonly'        => ["input", "textarea"],
        'required'        => ["input", "select", "textarea"],
        'reversed'        => ["ol"],
        'selected'        => ["option"],
    ];

    /** Serializes an HTML DOM node to a string. This is equivalent to the outerHTML getter
     *
     * The subtree is walked iteratively: the end tag of every element which
     * has children is pushed onto a stack and written out once the walk
     * climbs back out of that element.
     *
     * @param \DOMDocument|\DOMElement|\DOMText|\DOMComment|\DOMProcessingInstruction|\DOMDocumentFragment|\DOMDocumentType $node The node to serialize
     * @param \MensBeam\HTML\Parser\Config|null $config The configuration parameters to use, if any
     * @return string The serialized markup
     * @throws \MensBeam\HTML\Parser\Exception If a node of an unsupported type is encountered
     */
    public static function serialize(\DOMNode $node, ?Config $config = null): string {
        $config = $config ?? new Config;
        // Both options default to true when left unset in the configuration
        $boolAttr = $config->serializeBooleanAttributeValues ?? true;
        $endTags = $config->serializeForeignVoidEndTags ?? true;

        $s = "";
        $stack = [];
        $n = $node;
        do {
            # If current node is an Element
            if ($n instanceof \DOMElement) {
                // A null namespace is treated as the HTML namespace
                $namespace = $n->namespaceURI ?? Parser::HTML_NAMESPACE;
                $htmlElement = $namespace === Parser::HTML_NAMESPACE;
                # If current node is an element in the HTML namespace,
                #   the MathML namespace, or the SVG namespace, then let
                #   tagname be current node's local name. Otherwise, let
                #   tagname be current node's qualified name.
                if (in_array($namespace, [Parser::HTML_NAMESPACE, Parser::SVG_NAMESPACE, Parser::MATHML_NAMESPACE], true)) {
                    $tagName = self::uncoerceName($n->localName);
                } else {
                    $tagName = self::uncoerceName($n->tagName);
                }
                # Append a U+003C LESS-THAN SIGN character (<), followed by tagname.
                $s .= "<$tagName";
                # If current node's is value is not null, and the element does
                #   not have an is attribute in its attribute list, then
                #   append the string " is="", followed by current node's is
                #   value escaped as described below in attribute mode,
                #   followed by a U+0022 QUOTATION MARK character (").
                // DEVIATION: We don't support custom elements
                # For each attribute that the element has, append a
                #   U+0020 SPACE character, the attribute's serialized name as
                #   described below, a U+003D EQUALS SIGN character (=), a
                #   U+0022 QUOTATION MARK character ("), the attribute's
                #   value, escaped as described below in attribute mode, and
                #   a second U+0022 QUOTATION MARK character (").
                foreach ($n->attributes as $a) {
                    $name = self::serializeAttributeName($a);
                    // retrieve the attribute value
                    $value = self::escapeString((string) $a->value, true);
                    if (
                        $boolAttr
                        || !$htmlElement
                        || !isset(self::BOOLEAN_ATTRIBUTES[$name])
                        || (is_array(self::BOOLEAN_ATTRIBUTES[$name]) && !in_array($tagName, self::BOOLEAN_ATTRIBUTES[$name], true))
                        || (strlen($value) && strtolower($value) !== $name)
                    ) {
                        // print the attribute value unless the stars align
                        $s .= " $name=\"$value\"";
                    } else {
                        // omit the value if the stars do align: a known
                        // boolean attribute on a matching HTML element whose
                        // value is empty or repeats the attribute's name
                        $s .= " $name";
                    }
                }
                # Append a U+003E GREATER-THAN SIGN character (>).
                // If we're minimizing void foreign elements, insert a slash first where appropriate
                if (!$endTags && !$htmlElement && !$n->hasChildNodes()) {
                    $s .= "/>";
                } else {
                    $s .= ">";
                    # If current node serializes as void, then continue on to the
                    #   next child node at this point.
                    # Append the value of running the HTML fragment serialization
                    #   algorithm on the current node element (thus recursing into
                    #   this algorithm for that element), followed by a
                    #   U+003C LESS-THAN SIGN character (<), a U+002F SOLIDUS
                    #   character (/), tagname again, and finally a
                    #   U+003E GREATER-THAN SIGN character (>).
                    if (!$htmlElement || !in_array($tagName, self::VOID_ELEMENTS, true)) {
                        # If the node is a template element, then let the node instead
                        #   be the template element's template contents
                        #   (a DocumentFragment node).
                        if (
                            $htmlElement
                            // The local name is used so that a namespace prefix
                            // does not change which element type is detected
                            && $n->localName === "template"
                            && property_exists($n, "content")
                            && $n->content instanceof \DOMDocumentFragment
                        ) {
                            // NOTE: Treat template content as any other document
                            //   fragment and just invoke the inner serializer
                            $s .= self::serializeInner($n->content, $config)."</$tagName>";
                        } elseif ($n->hasChildNodes()) {
                            // If the element has children, store its tag name and
                            //   continue the loop with its first child; its end
                            //   tag will be written out further down
                            $stack[] = $tagName;
                            $n = $n->firstChild;
                            continue;
                        } else {
                            // Otherwise just append the end tag now
                            $s .= "</$tagName>";
                        }
                    }
                }
            }
            # If current node is a Text node
            elseif ($n instanceof \DOMText) {
                # If the parent of current node is a style, script, xmp,
                #   iframe, noembed, noframes, or plaintext element, or
                #   if the parent of current node is a noscript element
                #   and scripting is enabled for the node, then append
                #   the value of current node's data IDL attribute literally.
                $p = $n->parentNode;
                // The parent is matched by local name, the same way the tag
                // name written for an HTML element is chosen above
                if ($p instanceof \DOMElement && ($p->namespaceURI ?? Parser::HTML_NAMESPACE) === Parser::HTML_NAMESPACE && in_array($p->localName, self::RAWTEXT_ELEMENTS, true)) {
                    // NOTE: scripting is assumed not to be enabled
                    $s .= $n->data;
                }
                # Otherwise, append the value of current node's data IDL attribute, escaped as described below.
                else {
                    $s .= self::escapeString($n->data);
                }
            }
            # If current node is a Comment
            elseif ($n instanceof \DOMComment) {
                # Append the literal string "<!--" (U+003C LESS-THAN SIGN,
                #   U+0021 EXCLAMATION MARK, U+002D HYPHEN-MINUS,
                #   U+002D HYPHEN-MINUS), followed by the value of current
                #   node's data IDL attribute, followed by the literal
                #   string "-->" (U+002D HYPHEN-MINUS, U+002D HYPHEN-MINUS,
                #   U+003E GREATER-THAN SIGN).
                $s .= "<!--".$n->data."-->";
            }
            # If current node is a ProcessingInstruction
            elseif ($n instanceof \DOMProcessingInstruction) {
                # Append the literal string "<?" (U+003C LESS-THAN SIGN,
                #   U+003F QUESTION MARK), followed by the value of
                #   current node's target IDL attribute, followed by a
                #   single U+0020 SPACE character, followed by the value
                #   of current node's data IDL attribute, followed by a
                #   single U+003E GREATER-THAN SIGN character (>).
                $s .= "<?".self::uncoerceName($n->target)." ".$n->data.">";
            }
            # If current node is a DocumentType
            elseif ($n instanceof \DOMDocumentType) {
                # Append the literal string "<!DOCTYPE" (U+003C LESS-THAN SIGN,
                #   U+0021 EXCLAMATION MARK, U+0044 LATIN CAPITAL LETTER D,
                #   U+004F LATIN CAPITAL LETTER O, U+0043 LATIN CAPITAL LETTER C,
                #   U+0054 LATIN CAPITAL LETTER T, U+0059 LATIN CAPITAL LETTER Y,
                #   U+0050 LATIN CAPITAL LETTER P, U+0045 LATIN CAPITAL LETTER E),
                #   followed by a space (U+0020 SPACE), followed by the value
                #   of current node's name IDL attribute, followed by the
                #   literal string ">" (U+003E GREATER-THAN SIGN).
                $s .= "<!DOCTYPE ".trim($n->name).">";
            }
            // NOTE: Documents and document fragments have no outer content,
            //   so we can just serialize the inner content
            elseif ($n instanceof \DOMDocument || $n instanceof \DOMDocumentFragment) {
                return self::serializeInner($n, $config);
            } else {
                throw new Exception(Exception::UNSUPPORTED_NODE_TYPE, [get_class($n)]);
            }
            // If the current node has no more siblings, go up the tree till a
            //   sibling is found or we've reached the original node
            while (!$n->nextSibling && $stack) {
                // Write out the stored end tag each time we go up the tree
                $tagName = array_pop($stack);
                $s .= "</$tagName>";
                $n = $n->parentNode;
            }
            $n = $n->nextSibling;
        } while ($stack); // Loop until we have traversed the subtree of the target node in full
        return $s;
    }

    /** Serializes the children of an HTML DOM node to a string. This is equivalent to the innerHTML getter
     *
     * @param \DOMDocument|\DOMElement|\DOMDocumentFragment $node The node to serialize
     * @param \MensBeam\HTML\Parser\Config|null $config The configuration parameters to use, if any
     * @return string The serialized markup of the node's children; the empty string for a void HTML element
     * @throws \MensBeam\HTML\Parser\Exception If the node is not an element, document, or document fragment
     */
    public static function serializeInner(\DOMNode $node, ?Config $config = null): string {
        # Let s be a string, and initialize it to the empty string.
        $s = "";

        if ($node instanceof \DOMElement && ($node->namespaceURI ?? Parser::HTML_NAMESPACE) === Parser::HTML_NAMESPACE) {
            // Element types are matched by local name so that the outcome
            // agrees with how "serialize" classifies the same element
            # If the node serializes as void, then return the empty string.
            if (in_array($node->localName, self::VOID_ELEMENTS, true)) {
                return "";
            }
            # If the node is a template element, then let the node instead
            #   be the template element's template contents
            #   (a DocumentFragment node).
            elseif ($node->localName === "template" && property_exists($node, "content") && $node->content instanceof \DOMDocumentFragment) {
                // NOTE: template elements won't necessarily have a content
                //   property because PHP's DOM does not support this natively
                $node = $node->content;
            }
        }
        if ($node instanceof \DOMElement || $node instanceof \DOMDocument || $node instanceof \DOMDocumentFragment) {
            # For each child node of the node, in tree order, run the following steps:
            // NOTE: the steps in question are implemented in the "serialize" routine
            foreach ($node->childNodes as $n) {
                $s .= self::serialize($n, $config);
            }
        } else {
            throw new Exception(Exception::UNSUPPORTED_NODE_TYPE, [get_class($node)]);
        }
        return $s;
    }

    /** Determines the serialized name of an attribute from its namespace and local name
     *
     * @param \DOMAttr $a The attribute whose name is to be serialized
     * @return string The name to write into the start tag
     */
    private static function serializeAttributeName(\DOMAttr $a): string {
        # An attribute's serialized name for the purposes of the previous
        #   paragraph must be determined as follows:

        # If the attribute has no namespace
        if ($a->namespaceURI === null) {
            # The attribute's serialized name is the attribute's local name.
            return self::uncoerceName($a->localName);
        }
        # If the attribute is in the XML namespace
        elseif ($a->namespaceURI === Parser::XML_NAMESPACE) {
            # The attribute's serialized name is the string "xml:" followed
            #   by the attribute's local name.
            return "xml:".self::uncoerceName($a->localName);
        }
        # If the attribute is in the XMLNS namespace...
        elseif ($a->namespaceURI === Parser::XMLNS_NAMESPACE) {
            # ... and the attribute's local name is xmlns
            if ($a->localName === "xmlns") {
                # The attribute's serialized name is the string "xmlns".
                return "xmlns";
            }
            # ... and the attribute's local name is not xmlns
            # The attribute's serialized name is the string "xmlns:"
            #   followed by the attribute's local name.
            return "xmlns:".self::uncoerceName($a->localName);
        }
        # If the attribute is in the XLink namespace
        elseif ($a->namespaceURI === Parser::XLINK_NAMESPACE) {
            # The attribute's serialized name is the string "xlink:"
            #   followed by the attribute's local name.
            return "xlink:".self::uncoerceName($a->localName);
        }
        # If the attribute is in some other namespace
        # The attribute's serialized name is the attribute's qualified name.
        return ($a->prefix !== "") ? $a->prefix.":".$a->name : $a->name;
    }
}
|