Browse Source

Trying a rewrite with recursion to simplify pretty printing

pretty-print
Dustin Wilson 3 years ago
parent
commit
4cd3a9b03d
  1. 616
      lib/Parser/Serializer-old.php
  2. 588
      lib/Parser/Serializer.php

616
lib/Parser/Serializer-old.php

@ -0,0 +1,616 @@
<?php
/** @license MIT
* Copyright 2017 , Dustin Wilson, J. King et al.
* See LICENSE and AUTHORS files for details */
declare(strict_types=1);
namespace MensBeam\HTML\Parser;
use MensBeam\HTML\Parser;
abstract class Serializer {
use NameCoercion;
// Elements treated as block elements when reformatting whitespace
protected const BLOCK_ELEMENTS = [ 'address', 'article', 'aside', 'blockquote', 'base', 'body', 'details', 'dialog', 'dd', 'div', 'dl', 'dt', 'fieldset', 'figcaption', 'figure', 'footer', 'form', 'frame', 'frameset', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'head', 'header', 'hr', 'html', 'isindex', 'li', 'link', 'main', 'meta', 'nav', 'ol', 'p', 'picture', 'pre', 'section', 'script', 'source', 'style', 'table', 'td', 'tfoot', 'th', 'thead', 'title', 'tr', 'ul' ];
// List of h-elements which are used to determine element grouping for the
// purposes of reformatting whitespace
protected const H_ELEMENTS = [ 'h1', 'h2', 'h3', 'h4', 'h5', 'h6' ];
// List of preformatted elements where content is ignored for the purposes of
// reformatting whitespace
protected const PREFORMATTED_ELEMENTS = [ 'iframe', 'listing', 'noembed', 'noframes', 'noscript', 'plaintext', 'pre', 'style', 'script', 'textarea', 'title', 'xmp' ];
protected const VOID_ELEMENTS = ["basefont", "bgsound", "frame", "keygen", "area", "base", "br", "col", "embed", "hr", "img", "input", "link", "meta", "param", "source", "track", "wbr"];
protected const RAWTEXT_ELEMENTS = ["style", "script", "xmp", "iframe", "noembed", "noframes", "plaintext"];
protected const BOOLEAN_ATTRIBUTES = [
'allowfullscreen' => ["iframe"],
'async' => ["script"],
'autofocus' => true,
'autoplay' => ["audio", "video"],
'checked' => ["input"],
'compact' => ["dir", "dl", "menu", "ol", "ul"],
'controls' => ["audio", "video"],
'declare' => ["object"],
'default' => ["track"],
'defer' => ["script"],
'disabled' => ["button", "fieldset", "input", "link", "optgroup", "option", "select", "textarea"],
'formnovalidate' => ["button", "input"],
'hidden' => true,
'ismap' => ["img"],
'itemscope' => true,
'loop' => ["audio", "video"],
'multiple' => ["input", "select"],
'muted' => ["audio", "video"],
'nohref' => ["area"],
'nomodule' => ["script"],
'noresize' => ["frame"],
'noshade' => ["hr"],
'novalidate' => ["form"],
'nowrap' => ["td", "th"],
'open' => ["details", "dialog"],
'playsinline' => ["video"],
'readonly' => ["input", "textarea"],
'required' => ["input", "select", "textarea"],
'reversed' => ["ol"],
'selected' => ["option"],
];
// Used when reformatting whitespace when nodes are checked for being treated as block.
protected const BLOCK_QUERY = 'count(.//*[namespace-uri()="" or namespace-uri()="http://www.w3.org/1999/xhtml"][not(ancestor::iframe[namespace-uri()="" or namespace-uri()="http://www.w3.org/1999/xhtml"] or ancestor::listing[namespace-uri()="" or namespace-uri()="http://www.w3.org/1999/xhtml"] or ancestor::noembed[namespace-uri()="" or namespace-uri()="http://www.w3.org/1999/xhtml"] or ancestor::noframes[namespace-uri()="" or namespace-uri()="http://www.w3.org/1999/xhtml"] or ancestor::noscript[namespace-uri()="" or namespace-uri()="http://www.w3.org/1999/xhtml"] or ancestor::plaintext[namespace-uri()="" or namespace-uri()="http://www.w3.org/1999/xhtml"] or ancestor::pre[namespace-uri()="" or namespace-uri()="http://www.w3.org/1999/xhtml"] or ancestor::style[namespace-uri()="" or namespace-uri()="http://www.w3.org/1999/xhtml"] or ancestor::script[namespace-uri()="" or namespace-uri()="http://www.w3.org/1999/xhtml"] or ancestor::textarea[namespace-uri()="" or namespace-uri()="http://www.w3.org/1999/xhtml"] or ancestor::title[namespace-uri()="" or namespace-uri()="http://www.w3.org/1999/xhtml"] or ancestor::xmp[namespace-uri()="" or namespace-uri()="http://www.w3.org/1999/xhtml"])][name()="address" or name()="article" or name()="aside" or name()="blockquote" or name()="base" or name()="body" or name()="details" or name()="dialog" or name()="dd" or name()="div" or name()="dl" or name()="dt" or name()="fieldset" or name()="figcaption" or name()="figure" or name()="footer" or name()="form" or name()="frame" or name()="frameset" or name()="h1" or name()="h2" or name()="h3" or name()="h4" or name()="h5" or name()="h6" or name()="head" or name()="header" or name()="hr" or name()="html" or name()="isindex" or name()="li" or name()="link" or name()="main" or name()="meta" or name()="nav" or name()="ol" or name()="p" or name()="picture" or name()="pre" or name()="section" or name()="script" or name()="source" or name()="style" or name()="table" or name()="td" or name()="tfoot" or name()="th" or name()="thead" or name()="title" or name()="tr" or name()="ul"][1])';
/** Serializes an HTML DOM node to a string. This is equivalent to the outerHTML getter
*
* @param \DOMDocument|\DOMElement|\DOMText|\DOMComment|\DOMProcessingInstruction|\DOMDocumentFragment|\DOMDocumentType $node The node to serialize
* @param \MensBeam\HTML\Parser\Config|null $config The configuration parameters to use, if any
*/
public static function serialize(\DOMNode $node, ?Config $config = null): string {
$config = $config ?? new Config;
$boolAttr = $config->serializeBooleanAttributeValues ?? true;
$endTags = $config->serializeForeignVoidEndTags ?? true;
$reformatWhitespace = $config->reformatWhitespace ?? false;
if ($reformatWhitespace) {
$indentStep = $config->indentStep ?? 1;
$indentChar = ($config->indentWithSpaces ?? true) ? ' ' : "\t";
}
$s = "";
$stack = [];
$n = $node;
if ($reformatWhitespace) {
$first = true;
$indentionLevel = 0;
$modifyStack = [];
}
do {
# If current node is an Element
if ($n instanceof \DOMElement) {
$htmlElement = ($n->namespaceURI ?? Parser::HTML_NAMESPACE) === Parser::HTML_NAMESPACE;
# If current node is an element in the HTML namespace,
# the MathML namespace, or the SVG namespace, then let
# tagname be current node's local name. Otherwise, let
# tagname be current node's qualified name.
if (in_array($n->namespaceURI ?? Parser::HTML_NAMESPACE, [Parser::HTML_NAMESPACE, Parser::SVG_NAMESPACE, Parser::MATHML_NAMESPACE])) {
$tagName = self::uncoerceName($n->localName);
} else {
$tagName = self::uncoerceName($n->tagName);
}
if ($reformatWhitespace) {
$hasChildNodes = $n->hasChildNodes();
$modify = false;
// Start off by finding the first non-text node child in the document or fragment.
$firstNonTextNodeChild = null;
// If the parent node is null this means the element itself is being serialized.
// It is the first non-text node child.
if ($n->parentNode === null) {
$firstNonTextNodeChild = $n;
}
// Otherwise, if the node's parent node is a Document or a DocumentFragment then
// iterate through that parent node's children and get the first non-text node
// child.
elseif (($n->parentNode instanceof \DOMDocument || $n->parentNode instanceof \DOMDocumentFragment)) {
$t = $n->parentNode->firstChild;
do {
if (!$t instanceof \DOMText) {
$firstNonTextNodeChild = $t;
break;
}
} while ($t = $t->nextSibling);
}
// If the node is an HTML element...
if ($htmlElement) {
// If the element is to be treated as block then we need to modify whitespace.
if (self::treatAsBlock($n->parentNode)) {
$modify = true;
}
}
// If the node is not an HTML element...
else {
// If the parent node is null then we need to modify whitespace.
if ($n->parentNode === null) {
$modify = true;
}
// If a foreign element with an html element parent
elseif (($n->parentNode->namespaceURI ?? Parser::HTML_NAMESPACE) === Parser::HTML_NAMESPACE) {
// If the foreign element should be treated as block then we need to modify
// whitespace
$modify = self::treatAsBlock($n->parentNode);
}
// Otherwise, walk up the DOM and find the root foreign ancestor. If that
// ancestor is to be treated as block then we need to modify whitespace.
else {
$modify = self::treatForeignRootAsBlock($n);
}
}
// Only modify the whitespace here if the current node is not the first non-text
// node child. This is to prevent newlines from being printed when elements
// themsleves are serialized or if they're the first node in the tree when a
// Document or DocumentFragment.
if ($modify && $firstNonTextNodeChild !== $n) {
$previousNonTextNodeSiblingName = null;
$nn = $n;
while ($nn = $nn->previousSibling) {
if (!$nn instanceof \DOMText) {
$previousNonTextNodeSiblingName = $nn->nodeName;
break;
}
}
// If the previous non text node sibling doesn't have the same name as the
// current node and neither are h1-h6 elements then add an additional newline.
if ($previousNonTextNodeSiblingName !== null && $previousNonTextNodeSiblingName !== $tagName && count(array_intersect([ $previousNonTextNodeSiblingName, $tagName ], self::H_ELEMENTS)) !== 2) {
$s .= "\n";
}
$s .= "\n" . str_repeat($indentChar, $indentionLevel * $indentStep);
}
}
# Append a U+003C LESS-THAN SIGN character (<), followed by tagname.
$s .= "<$tagName";
# If current node's is value is not null, and the element does
# not have an is attribute in its attribute list, then
# append the string " is="", followed by current node's is
# value escaped as described below in attribute mode,
# followed by a U+0022 QUOTATION MARK character (").
// DEVIATION: We don't support custom elements
# For each attribute that the element has, append a
# U+0020 SPACE character, the attribute's serialized name as
# described below, a U+003D EQUALS SIGN character (=), a
# U+0022 QUOTATION MARK character ("), the attribute's
# value, escaped as described below in attribute mode, and
# a second U+0022 QUOTATION MARK character (").
foreach ($n->attributes as $a) {
# An attribute's serialized name for the purposes of the previous
# paragraph must be determined as follows:
# If the attribute has no namespace
if ($a->namespaceURI === null) {
# The attribute's serialized name is the attribute's local name.
$name = self::uncoerceName($a->localName);
}
# If the attribute is in the XML namespace
elseif ($a->namespaceURI === Parser::XML_NAMESPACE) {
# The attribute's serialized name is the string "xml:" followed
# by the attribute's local name.
$name = "xml:".self::uncoerceName($a->localName);
}
# If the attribute is in the XMLNS namespace...
elseif ($a->namespaceURI === Parser::XMLNS_NAMESPACE) {
# ... and the attribute's local name is xmlns
if ($a->localName === "xmlns") {
# The attribute's serialized name is the string "xmlns".
$name = "xmlns";
}
# ... and the attribute's local name is not xmlns
else {
# The attribute's serialized name is the string "xmlns:"
# followed by the attribute's local name.
$name = "xmlns:".self::uncoerceName($a->localName);
}
}
# If the attribute is in the XLink namespace
elseif ($a->namespaceURI === Parser::XLINK_NAMESPACE) {
# The attribute's serialized name is the string "xlink:"
# followed by the attribute's local name.
$name = "xlink:".self::uncoerceName($a->localName);
}
# If the attribute is in some other namespace
else {
# The attribute's serialized name is the attribute's qualified name.
$name = ($a->prefix !== "") ? $a->prefix.":".$a->name : $a->name;
}
// retrieve the attribute value
$value = self::escapeString((string) $a->value, true);
if (
$boolAttr
|| !$htmlElement
|| !isset(self::BOOLEAN_ATTRIBUTES[$name])
|| is_array(self::BOOLEAN_ATTRIBUTES[$name]) && !in_array($tagName, self::BOOLEAN_ATTRIBUTES[$name])
|| (strlen($value) && strtolower($value) !== $name)
) {
// print the attribute value unless the stars align
$s .= " $name=\"$value\"";
} else {
// omit the value if the stars do align
$s .= " $name";
}
}
# Append a U+003E GREATER-THAN SIGN character (>).
// If we're minimizing void foreign elements, insert a slash first where appropriate
if (!$endTags && !$htmlElement && !$n->hasChildNodes()) {
$s .= "/>";
} else {
$s .= ">";
# If current node serializes as void, then continue on to the next child node at
# this point.
# Append the value of running the HTML fragment serialization algorithm on the
# current node element (thus recursing into this algorithm for that element),
# followed by a U+003C LESS-THAN SIGN character (<), a U+002F SOLIDUS character (/),
# tagname again, and finally a U+003E GREATER-THAN SIGN character (>).
if (($n->namespaceURI ?? Parser::HTML_NAMESPACE) !== Parser::HTML_NAMESPACE || !in_array($tagName, self::VOID_ELEMENTS)) {
# If the node is a template element, then let the node instead be the template
# element's template contents (a DocumentFragment node).
if ($htmlElement && $tagName === "template") {
// Disable pretty printing when serializing templates in preformatted content
$templateConfig = $config;
$isPreformattedContent = self::isPreformattedContent($n);
if ($reformatWhitespace && $isPreformattedContent) {
$templateConfig->reformatWhitespace = false;
}
$nn = self::getTemplateContent($n);
$ss = '';
# For each child node of the node, in tree order, run the following steps:
foreach ($nn->childNodes as $nnn) {
$ss .= self::serialize($nnn, $config);
}
if ($reformatWhitespace) {
if (!$isPreformattedContent && $indentionLevel > 0) {
// If the template's content is to be treated as block content then post-indent
// newlines at 1 + the current indention level in the serialized template
// contents. Then append a newline followed by another indention at the current
// indention level for the end tag.
if (self::treatAsBlock($n)) {
$ss = str_replace("\n", "\n" . str_repeat($indentChar, ($indentionLevel + 1) * $indentStep), $ss) . "\n" . str_repeat($indentChar, $indentionLevel * $indentStep);
}
}
}
$s .= $ss;
} elseif ($n->hasChildNodes()) {
if ($reformatWhitespace) {
// If formatting output and the element's whitespace has already been modified
// increment the indention level
$indentionLevel++;
$prettyPrintStack[] = $n;
}
// If the element has children, store its tag name and continue the loop with
// its first child; its end tag will be written out further down
$stack[] = $tagName;
$n = $n->firstChild;
continue;
}
// Otherwise just append the end tag now
$s .= "</$tagName>";
}
}
}
# If current node is a Text node
elseif ($n instanceof \DOMText) {
# If the parent of current node is a style, script, xmp,
# iframe, noembed, noframes, or plaintext element, or
# if the parent of current node is a noscript element
# and scripting is enabled for the node, then append
# the value of current node's data IDL attribute literally.
$p = $n->parentNode;
if ($p instanceof \DOMElement && ($p->namespaceURI ?? Parser::HTML_NAMESPACE) === Parser::HTML_NAMESPACE && in_array($p->tagName, self::RAWTEXT_ELEMENTS)) {
// NOTE: scripting is assumed not to be enabled
$s .= $n->data;
}
# Otherwise, append the value of current node's data IDL attribute, escaped as described below.
else {
$t = $n->data;
if ($reformatWhitespace && !self::isPreformattedContent($n)) {
// If the node's parent node is to be treated as block or if it is not an HTML
// element and its root foreign element is to be treated as block...
if (self::treatAsBlock($n->parentNode) || (($n->namespaceURI ?? Parser::HTML_NAMESPACE) !== Parser::HTML_NAMESPACE && self::treatForeignRootAsBlock($n))) {
// If the text node's data is made up of only whitespace characters continue
// onto the next node
if (strspn($t, Data::WHITESPACE) === strlen($t)) {
// FIXME: this is temporary
goto next;
}
}
// Condense spaces and tabs into a single space.
$t = preg_replace('/ +/', ' ', str_replace("\t", ' ', $t));
}
$s .= self::escapeString($t);
}
}
# If current node is a Comment
elseif ($n instanceof \DOMComment) {
if ($reformatWhitespace && !self::isPreformattedContent($n)) {
$modify = false;
if (($n->parentNode->namespaceURI ?? Parser::HTML_NAMESPACE) !== Parser::HTML_NAMESPACE) {
if (self::treatAsBlock($n->parentNode)) {
$modify = true;
}
} else {
if ($n->parentNode->parentNode !== null && ($n->parentNode->parentNode->namespaceURI ?? Parser::HTML_NAMESPACE) === Parser::HTML_NAMESPACE) {
if (self::treatAsBlock($n->parentNode)) {
$modify = true;
}
} elseif (self::treatForeignRootAsBlock($n)) {
$modify = true;
}
}
if ($modify) {
$previousNonTextNodeSiblingName = null;
$nn = $n;
while ($nn = $nn->previousSibling) {
if (!$nn instanceof \DOMText) {
$previousNonTextNodeSiblingName = $nn->nodeName;
break;
}
}
// Add an additional newline if the previous sibling wasn't a comment.
if ($previousNonTextNodeSiblingName !== null && $previousNonTextNodeSiblingName !== $n->nodeName) {
$s .= "\n";
}
$s .= "\n" . str_repeat($indentChar, $indentionLevel * $indentStep);
}
}
# Append the literal string "<!--" (U+003C LESS-THAN SIGN,
# U+0021 EXCLAMATION MARK, U+002D HYPHEN-MINUS,
# U+002D HYPHEN-MINUS), followed by the value of current
# node's data IDL attribute, followed by the literal
# string "-->" (U+002D HYPHEN-MINUS, U+002D HYPHEN-MINUS,
# U+003E GREATER-THAN SIGN).
$s .= "<!--".$n->data."-->";
}
# If current node is a ProcessingInstruction
elseif ($n instanceof \DOMProcessingInstruction) {
if ($reformatWhitespace && !self::isPreformattedContent($n)) {
$modify = false;
if (($n->parentNode->namespaceURI ?? Parser::HTML_NAMESPACE) !== Parser::HTML_NAMESPACE) {
if (self::treatAsBlock($n->parentNode)) {
$modify = true;
}
} else {
if ($n->parentNode->parentNode !== null && ($n->parentNode->parentNode->namespaceURI ?? Parser::HTML_NAMESPACE) === Parser::HTML_NAMESPACE) {
if (self::treatAsBlock($n->parentNode)) {
$modify = true;
}
} elseif (self::treatForeignRootAsBlock($n)) {
$modify = true;
}
}
if ($modify) {
$previousNonTextNodeSiblingName = null;
$nn = $n;
while ($nn = $nn->previousSibling) {
if (!$nn instanceof \DOMText) {
$previousNonTextNodeSiblingName = $nn->nodeName;
break;
}
}
// Add an additional newline if the previous sibling wasn't a comment.
if ($previousNonTextNodeSiblingName !== null && $previousNonTextNodeSiblingName !== $n->nodeName) {
$s .= "\n";
}
$s .= "\n" . str_repeat($indentChar, $indentionLevel * $indentStep);
}
}
# Append the literal string "<?" (U+003C LESS-THAN SIGN,
# U+003F QUESTION MARK), followed by the value of
# current node's target IDL attribute, followed by a
# single U+0020 SPACE character, followed by the value
# of current node's data IDL attribute, followed by a
# single U+003E GREATER-THAN SIGN character (>).
$s .= "<?".self::uncoerceName($n->target)." ".$n->data.">";
}
# If current node is a DocumentType
elseif ($n instanceof \DOMDocumentType) {
# Append the literal string "<!DOCTYPE" (U+003C LESS-THAN SIGN,
# U+0021 EXCLAMATION MARK, U+0044 LATIN CAPITAL LETTER D,
# U+004F LATIN CAPITAL LETTER O, U+0043 LATIN CAPITAL LETTER C,
# U+0054 LATIN CAPITAL LETTER T, U+0059 LATIN CAPITAL LETTER Y,
# U+0050 LATIN CAPITAL LETTER P, U+0045 LATIN CAPITAL LETTER E),
# followed by a space (U+0020 SPACE), followed by the value
# of current node's name IDL attribute, followed by the
# literal string ">" (U+003E GREATER-THAN SIGN).
$s .= "<!DOCTYPE ".trim($n->name).">";
}
// NOTE: Documents and document fragments have no outer content,
// so we can just serialize the inner content
elseif ($n instanceof \DOMDocument || $n instanceof \DOMDocumentFragment) {
return self::serializeInner($n, $config);
} else {
throw new Exception(Exception::UNSUPPORTED_NODE_TYPE, [get_class($n)]);
}
next:
// If the current node has no more siblings, go up the tree till a
// sibling is found or we've reached the original node
while (!$n->nextSibling && $stack) {
// Write out the stored end tag each time we go up the tree
$tagName = array_pop($stack);
if ($reformatWhitespace) {
$indentionLevel--;
$tag = array_pop($prettyPrintStack);
$modify = false;
// If the element popped off the stack isn't a preformatted element...
if (!self::isPreformattedContent($n)) {
// If it is in the HTML namespace and is to be treated as block then we need to
// modify whitespace.
if (($tag->namespaceURI ?? Parser::HTML_NAMESPACE) === Parser::HTML_NAMESPACE) {
if (self::treatAsBlock($tag)) {
$modify = true;
}
} else {
$firstElementChild = null;
if (property_exists($tag, 'firstElementChild')) {
$firstElementChild = $tag->firstElementChild;
} else {
$t = $tag->firstChild;
do {
if ($t instanceof \DOMElement) {
$firstElementChild = $t;
break;
}
} while ($t = $t->nextSibling);
}
// Otherwise, if foreign and has a child element...
if ($firstElementChild !== null) {
// If the element popped off the stack has an HTML element parent and its parent
// is to be treated as block then we need to modify whitespace.
if ($tag->parentNode !== null && ($tag->parentNode->namespaceURI ?? Parser::HTML_NAMESPACE) === Parser::HTML_NAMESPACE) {
if (self::treatAsBlock($tag->parentNode)) {
$modify = true;
}
// Otherwise, if the element's foreign root is to be treated as block we need to
// modify whitespace, too.
} elseif ($tag->parentNode === null || self::treatForeignRootAsBlock($tag)) {
$modify = true;
}
}
}
}
if ($modify) {
$s .= "\n" . str_repeat($indentChar, $indentionLevel * $indentStep);
}
}
$s .= "</$tagName>";
$n = $n->parentNode;
}
$n = $n->nextSibling;
} while ($stack); // Loop until we have traversed the subtree of the target node in full
return $s;
}
/** Serializes the children of an HTML DOM node to a string. This is equivalent to the innerHTML getter
*
* @param \DOMDocument|\DOMElement|\DOMDocumentFragment $node The node to serialize
* @param \MensBeam\HTML\Parser\Config|null $config The configuration parameters to use, if any
*/
public static function serializeInner(\DOMNode $node, ?Config $config = null): string {
$reformatWhitespace = $config->reformatWhitespace ?? false;
# Let s be a string, and initialize it to the empty string.
$s = "";
if ($node instanceof \DOMElement && ($node->namespaceURI ?? Parser::HTML_NAMESPACE) === Parser::HTML_NAMESPACE) {
# If the node serializes as void, then return the empty string.
if (in_array($node->tagName, self::VOID_ELEMENTS)) {
return "";
}
# If the node is a template element, then let the node instead be the template
# element's template contents (a DocumentFragment node).
elseif ($node->tagName === "template") {
$n = self::getTemplateContent($n);
# For each child node of the node, in tree order, run the following steps:
// NOTE: the steps in question are implemented in the "serialize" routine
foreach ($n->childNodes as $nn) {
$s .= self::serialize($nn, $config);
}
return $s;
}
}
if ($node instanceof \DOMElement || $node instanceof \DOMDocument || $node instanceof \DOMDocumentFragment) {
# For each child node of the node, in tree order, run the following steps:
// NOTE: the steps in question are implemented in the "serialize" routine
foreach ($node->childNodes as $n) {
$s .= self::serialize($n, $config);
}
} else {
throw new Exception(Exception::UNSUPPORTED_NODE_TYPE, [get_class($node)]);
}
return $s;
}
protected static function getTemplateContent(\DOMElement $node, ?Config $config = null): \DOMNode {
// NOTE: PHP's DOM does not support the content property on template elements
// natively. This method exists purely so implementors of userland PHP DOM
// solutions may extend this method to get template contents how they need them.
return $node;
}
protected static function isPreformattedContent(\DOMNode $node): bool {
// NOTE: This method is used only when pretty printing. Implementors of userland
// PHP DOM solutions with template contents will need to extend this method to
// be able to moonwalk through document fragment hosts.
$n = $node;
do {
if ($n instanceof \DOMElement && ($n->namespaceURI ?? Parser::HTML_NAMESPACE) === Parser::HTML_NAMESPACE && in_array($n->tagName, self::PREFORMATTED_ELEMENTS)) {
return true;
}
} while ($n = $n->parentNode);
return false;
}
protected static function treatAsBlock(\DOMNode $node): bool {
// NOTE: This method is used only when pretty printing. Implementors of userland
// PHP DOM solutions with template contents will need to extend this method to
// check for any templates and look within their content fragments for "block"
// content.
if ($node instanceof \DOMDocument || $node instanceof \DOMDocumentFragment) {
return true;
}
$xpath = new \DOMXPath($node->ownerDocument);
return ($xpath->evaluate(self::BLOCK_QUERY, $node) > 0);
}
protected static function treatForeignRootAsBlock(\DOMNode $node): bool {
// NOTE: This method is used only when pretty printing. Implementors of userland
// PHP DOM solutions with template contents will need to extend this method to
// be able to moonwalk through document fragment hosts.
$n = $node;
while ($n = $n->parentNode) {
if ($n instanceof \DOMDocument || $n instanceof \DOMDocumentFragment || ($n instanceof \DOMElement && $n->parentNode === null)) {
return true;
} elseif (($n->parentNode->namespaceURI ?? Parser::HTML_NAMESPACE) === Parser::HTML_NAMESPACE) {
if (self::treatAsBlock($n->parentNode)) {
return true;
}
break;
}
}
return false;
}
}

588
lib/Parser/Serializer.php

@ -54,8 +54,7 @@ abstract class Serializer {
'selected' => ["option"],
];
// Used when reformatting whitespace when nodes are checked for being treated as block.
/* Used when reformatting whitespace when nodes are checked for being treated as block. */
protected const BLOCK_QUERY = 'count(.//*[namespace-uri()="" or namespace-uri()="http://www.w3.org/1999/xhtml"][not(ancestor::iframe[namespace-uri()="" or namespace-uri()="http://www.w3.org/1999/xhtml"] or ancestor::listing[namespace-uri()="" or namespace-uri()="http://www.w3.org/1999/xhtml"] or ancestor::noembed[namespace-uri()="" or namespace-uri()="http://www.w3.org/1999/xhtml"] or ancestor::noframes[namespace-uri()="" or namespace-uri()="http://www.w3.org/1999/xhtml"] or ancestor::noscript[namespace-uri()="" or namespace-uri()="http://www.w3.org/1999/xhtml"] or ancestor::plaintext[namespace-uri()="" or namespace-uri()="http://www.w3.org/1999/xhtml"] or ancestor::pre[namespace-uri()="" or namespace-uri()="http://www.w3.org/1999/xhtml"] or ancestor::style[namespace-uri()="" or namespace-uri()="http://www.w3.org/1999/xhtml"] or ancestor::script[namespace-uri()="" or namespace-uri()="http://www.w3.org/1999/xhtml"] or ancestor::textarea[namespace-uri()="" or namespace-uri()="http://www.w3.org/1999/xhtml"] or ancestor::title[namespace-uri()="" or namespace-uri()="http://www.w3.org/1999/xhtml"] or ancestor::xmp[namespace-uri()="" or namespace-uri()="http://www.w3.org/1999/xhtml"])][name()="address" or name()="article" or name()="aside" or name()="blockquote" or name()="base" or name()="body" or name()="details" or name()="dialog" or name()="dd" or name()="div" or name()="dl" or name()="dt" or name()="fieldset" or name()="figcaption" or name()="figure" or name()="footer" or name()="form" or name()="frame" or name()="frameset" or name()="h1" or name()="h2" or name()="h3" or name()="h4" or name()="h5" or name()="h6" or name()="head" or name()="header" or name()="hr" or name()="html" or name()="isindex" or name()="li" or name()="link" or name()="main" or name()="meta" or name()="nav" or name()="ol" or name()="p" or name()="picture" or name()="pre" or name()="section" or name()="script" or name()="source" or name()="style" or name()="table" or name()="td" or name()="tfoot" or name()="th" or name()="thead" or name()="title" or name()="tr" or name()="ul"][1])';
/** Serializes an HTML DOM node to a string. This is equivalent to the outerHTML getter
@ -74,445 +73,178 @@ abstract class Serializer {
$indentChar = ($config->indentWithSpaces ?? true) ? ' ' : "\t";
}
# 2. Let s be a string, and initialize it to the empty string.
$s = "";
$stack = [];
$n = $node;
if ($reformatWhitespace) {
$first = true;
$indentionLevel = 0;
$modifyStack = [];
}
do {
# If current node is an Element
if ($n instanceof \DOMElement) {
$htmlElement = ($n->namespaceURI ?? Parser::HTML_NAMESPACE) === Parser::HTML_NAMESPACE;
# If current node is an element in the HTML namespace,
# the MathML namespace, or the SVG namespace, then let
# tagname be current node's local name. Otherwise, let
# tagname be current node's qualified name.
if (in_array($n->namespaceURI ?? Parser::HTML_NAMESPACE, [Parser::HTML_NAMESPACE, Parser::SVG_NAMESPACE, Parser::MATHML_NAMESPACE])) {
$tagName = self::uncoerceName($n->localName);
} else {
$tagName = self::uncoerceName($n->tagName);
}
if ($reformatWhitespace) {
$hasChildNodes = $n->hasChildNodes();
$modify = false;
// Start off by finding the first non-text node child in the document or fragment.
$firstNonTextNodeChild = null;
// If the parent node is null this means the element itself is being serialized.
// It is the first non-text node child.
if ($n->parentNode === null) {
$firstNonTextNodeChild = $n;
}
// Otherwise, if the node's parent node is a Document or a DocumentFragment then
// iterate through that parent node's children and get the first non-text node
// child.
elseif (($n->parentNode instanceof \DOMDocument || $n->parentNode instanceof \DOMDocumentFragment)) {
$t = $n->parentNode->firstChild;
do {
if (!$t instanceof \DOMText) {
$firstNonTextNodeChild = $t;
break;
}
} while ($t = $t->nextSibling);
}
// If the node is an HTML element...
if ($htmlElement) {
// If the element is to be treated as block then we need to modify whitespace.
if (self::treatAsBlock($n->parentNode)) {
$modify = true;
}
}
// If the node is not an HTML element...
else {
// If the parent node is null then we need to modify whitespace.
if ($n->parentNode === null) {
$modify = true;
}
// If a foreign element with an html element parent
elseif (($n->parentNode->namespaceURI ?? Parser::HTML_NAMESPACE) === Parser::HTML_NAMESPACE) {
// If the foreign element should be treated as block then we need to modify
// whitespace
$modify = self::treatAsBlock($n->parentNode);
}
// Otherwise, walk up the DOM and find the root foreign ancestor. If that
// ancestor is to be treated as block then we need to modify whitespace.
else {
$modify = self::treatForeignRootAsBlock($n);
}
}
// Only modify the whitespace here if the current node is not the first non-text
// node child. This is to prevent newlines from being printed when elements
// themsleves are serialized or if they're the first node in the tree when a
// Document or DocumentFragment.
if ($modify && $firstNonTextNodeChild !== $n) {
$previousNonTextNodeSiblingName = null;
$nn = $n;
while ($nn = $nn->previousSibling) {
if (!$nn instanceof \DOMText) {
$previousNonTextNodeSiblingName = $nn->nodeName;
break;
}
}
# 3. If the node is a template element, then let the node instead be the
# template element’s template contents (a DocumentFragment node).
if ($node instanceof \DOMElement) {
$htmlElement = ($node->namespaceURI ?? Parser::HTML_NAMESPACE) === Parser::HTML_NAMESPACE;
// If the previous non text node sibling doesn't have the same name as the
// current node and neither are h1-h6 elements then add an additional newline.
if ($previousNonTextNodeSiblingName !== null && $previousNonTextNodeSiblingName !== $tagName && count(array_intersect([ $previousNonTextNodeSiblingName, $tagName ], self::H_ELEMENTS)) !== 2) {
$s .= "\n";
}
if ($htmlElement && $node->tagName === 'template') {
$node = $node->content;
}
}
# If current node is an Element
if ($node instanceof \DOMElement) {
# If current node is an element in the HTML namespace, the MathML namespace, or
# the SVG namespace, then let tagname be current node's local name.
if (in_array($node->namespaceURI ?? Parser::HTML_NAMESPACE, [Parser::HTML_NAMESPACE, Parser::SVG_NAMESPACE, Parser::MATHML_NAMESPACE])) {
$tagName = self::uncoerceName($node->localName);
}
# Otherwise, let tagname be current node's qualified name.
else {
$tagName = self::uncoerceName($node->tagName);
}
$s .= "\n" . str_repeat($indentChar, $indentionLevel * $indentStep);
}
# Append a U+003C LESS-THAN SIGN character (<), followed by tagname.
$s .= "<$tagName";
# If current node's is value is not null, and the element does not have an is
# attribute in its attribute list, then append the string " is="", followed by
# current node's is value escaped as described below in attribute mode, followed
# by a U+0022 QUOTATION MARK character (").
// DEVIATION: We don't support custom elements
# For each attribute that the element has, append a U+0020 SPACE character, the
# attribute's serialized name as described below, a U+003D EQUALS SIGN character (=),
# a U+0022 QUOTATION MARK character ("), the attribute's value, escaped as
# described below in attribute mode, and a second U+0022 QUOTATION MARK
# character (").
foreach ($node->attributes as $a) {
# An attribute's serialized name for the purposes of the previous paragraph must
# be determined as follows:
# If the attribute has no namespace
if ($a->namespaceURI === null) {
# The attribute's serialized name is the attribute's local name.
$name = self::uncoerceName($a->localName);
}
# Append a U+003C LESS-THAN SIGN character (<), followed by tagname.
$s .= "<$tagName";
# If current node's is value is not null, and the element does
# not have an is attribute in its attribute list, then
# append the string " is="", followed by current node's is
# value escaped as described below in attribute mode,
# followed by a U+0022 QUOTATION MARK character (").
// DEVIATION: We don't support custom elements
# For each attribute that the element has, append a
# U+0020 SPACE character, the attribute's serialized name as
# described below, a U+003D EQUALS SIGN character (=), a
# U+0022 QUOTATION MARK character ("), the attribute's
# value, escaped as described below in attribute mode, and
# a second U+0022 QUOTATION MARK character (").
foreach ($n->attributes as $a) {
# An attribute's serialized name for the purposes of the previous
# paragraph must be determined as follows:
# If the attribute has no namespace
if ($a->namespaceURI === null) {
# The attribute's serialized name is the attribute's local name.
$name = self::uncoerceName($a->localName);
}
# If the attribute is in the XML namespace
elseif ($a->namespaceURI === Parser::XML_NAMESPACE) {
# The attribute's serialized name is the string "xml:" followed
# by the attribute's local name.
$name = "xml:".self::uncoerceName($a->localName);
}
# If the attribute is in the XMLNS namespace...
elseif ($a->namespaceURI === Parser::XMLNS_NAMESPACE) {
# ... and the attribute's local name is xmlns
if ($a->localName === "xmlns") {
# The attribute's serialized name is the string "xmlns".
$name = "xmlns";
}
# ... and the attribute's local name is not xmlns
else {
# The attribute's serialized name is the string "xmlns:"
# followed by the attribute's local name.
$name = "xmlns:".self::uncoerceName($a->localName);
}
}
# If the attribute is in the XLink namespace
elseif ($a->namespaceURI === Parser::XLINK_NAMESPACE) {
# The attribute's serialized name is the string "xlink:"
# followed by the attribute's local name.
$name = "xlink:".self::uncoerceName($a->localName);
# If the attribute is in the XML namespace
elseif ($a->namespaceURI === Parser::XML_NAMESPACE) {
# The attribute's serialized name is the string "xml:" followed
# by the attribute's local name.
$name = "xml:".self::uncoerceName($a->localName);
}
# If the attribute is in the XMLNS namespace...
elseif ($a->namespaceURI === Parser::XMLNS_NAMESPACE) {
# ... and the attribute's local name is xmlns
if ($a->localName === "xmlns") {
# The attribute's serialized name is the string "xmlns".
$name = "xmlns";
}
# If the attribute is in some other namespace
# ... and the attribute's local name is not xmlns
else {
# The attribute's serialized name is the attribute's qualified name.
$name = ($a->prefix !== "") ? $a->prefix.":".$a->name : $a->name;
}
// retrieve the attribute value
$value = self::escapeString((string) $a->value, true);
if (
$boolAttr
|| !$htmlElement
|| !isset(self::BOOLEAN_ATTRIBUTES[$name])
|| is_array(self::BOOLEAN_ATTRIBUTES[$name]) && !in_array($tagName, self::BOOLEAN_ATTRIBUTES[$name])
|| (strlen($value) && strtolower($value) !== $name)
) {
// print the attribute value unless the stars align
$s .= " $name=\"$value\"";
} else {
// omit the value if the stars do align
$s .= " $name";
}
}
# Append a U+003E GREATER-THAN SIGN character (>).
// If we're minimizing void foreign elements, insert a slash first where appropriate
if (!$endTags && !$htmlElement && !$n->hasChildNodes()) {
$s .= "/>";
} else {
$s .= ">";
# If current node serializes as void, then continue on to the next child node at
# this point.
# Append the value of running the HTML fragment serialization algorithm on the
# current node element (thus recursing into this algorithm for that element),
# followed by a U+003C LESS-THAN SIGN character (<), a U+002F SOLIDUS character (/),
# tagname again, and finally a U+003E GREATER-THAN SIGN character (>).
if (($n->namespaceURI ?? Parser::HTML_NAMESPACE) !== Parser::HTML_NAMESPACE || !in_array($tagName, self::VOID_ELEMENTS)) {
# If the node is a template element, then let the node instead be the template
# element's template contents (a DocumentFragment node).
if ($htmlElement && $tagName === "template") {
// Disable pretty printing when serializing templates in preformatted content
$templateConfig = $config;
$isPreformattedContent = self::isPreformattedContent($n);
if ($reformatWhitespace && $isPreformattedContent) {
$templateConfig->reformatWhitespace = false;
}
$nn = self::getTemplateContent($n);
$ss = '';
# For each child node of the node, in tree order, run the following steps:
foreach ($nn->childNodes as $nnn) {
$ss .= self::serialize($nnn, $config);
}
if ($reformatWhitespace) {
if (!$isPreformattedContent && $indentionLevel > 0) {
// If the template's content is to be treated as block content then post-indent
// newlines at 1 + the current indention level in the serialized template
// contents. Then append a newline followed by another indention at the current
// indention level for the end tag.
if (self::treatAsBlock($n)) {
$ss = str_replace("\n", "\n" . str_repeat($indentChar, ($indentionLevel + 1) * $indentStep), $ss) . "\n" . str_repeat($indentChar, $indentionLevel * $indentStep);
}
}
}
$s .= $ss;
} elseif ($n->hasChildNodes()) {
if ($reformatWhitespace) {
// If formatting output and the element's whitespace has already been modified
// increment the indention level
$indentionLevel++;
$prettyPrintStack[] = $n;
}
// If the element has children, store its tag name and continue the loop with
// its first child; its end tag will be written out further down
$stack[] = $tagName;
$n = $n->firstChild;
continue;
}
// Otherwise just append the end tag now
$s .= "</$tagName>";
# The attribute's serialized name is the string "xmlns:"
# followed by the attribute's local name.
$name = "xmlns:".self::uncoerceName($a->localName);
}
}
}
# If current node is a Text node
elseif ($n instanceof \DOMText) {
# If the parent of current node is a style, script, xmp,
# iframe, noembed, noframes, or plaintext element, or
# if the parent of current node is a noscript element
# and scripting is enabled for the node, then append
# the value of current node's data IDL attribute literally.
$p = $n->parentNode;
if ($p instanceof \DOMElement && ($p->namespaceURI ?? Parser::HTML_NAMESPACE) === Parser::HTML_NAMESPACE && in_array($p->tagName, self::RAWTEXT_ELEMENTS)) {
// NOTE: scripting is assumed not to be enabled
$s .= $n->data;
# If the attribute is in the XLink namespace
elseif ($a->namespaceURI === Parser::XLINK_NAMESPACE) {
# The attribute's serialized name is the string "xlink:"
# followed by the attribute's local name.
$name = "xlink:".self::uncoerceName($a->localName);
}
# Otherwise, append the value of current node's data IDL attribute, escaped as described below.
# If the attribute is in some other namespace
else {
$t = $n->data;
if ($reformatWhitespace && !self::isPreformattedContent($n)) {
// If the node's parent node is to be treated as block or if it is not an HTML
// element and its root foreign element is to be treated as block...
if (self::treatAsBlock($n->parentNode) || (($n->namespaceURI ?? Parser::HTML_NAMESPACE) !== Parser::HTML_NAMESPACE && self::treatForeignRootAsBlock($n))) {
// If the text node's data is made up of only whitespace characters continue
// onto the next node
if (strspn($t, Data::WHITESPACE) === strlen($t)) {
// FIXME: this is temporary
goto next;
}
}
// Condense spaces and tabs into a single space.
$t = preg_replace('/ +/', ' ', str_replace("\t", ' ', $t));
}
$s .= self::escapeString($t);
# The attribute's serialized name is the attribute's qualified name.
$name = ($a->prefix !== "") ? $a->prefix.":".$a->name : $a->name;
}
}
# If current node is a Comment
elseif ($n instanceof \DOMComment) {
if ($reformatWhitespace && !self::isPreformattedContent($n)) {
$modify = false;
if (($n->parentNode->namespaceURI ?? Parser::HTML_NAMESPACE) !== Parser::HTML_NAMESPACE) {
if (self::treatAsBlock($n->parentNode)) {
$modify = true;
}
} else {
if ($n->parentNode->parentNode !== null && ($n->parentNode->parentNode->namespaceURI ?? Parser::HTML_NAMESPACE) === Parser::HTML_NAMESPACE) {
if (self::treatAsBlock($n->parentNode)) {
$modify = true;
}
} elseif (self::treatForeignRootAsBlock($n)) {
$modify = true;
}
}
if ($modify) {
$previousNonTextNodeSiblingName = null;
$nn = $n;
while ($nn = $nn->previousSibling) {
if (!$nn instanceof \DOMText) {
$previousNonTextNodeSiblingName = $nn->nodeName;
break;
}
}
// Add an additional newline if the previous sibling wasn't a comment.
if ($previousNonTextNodeSiblingName !== null && $previousNonTextNodeSiblingName !== $n->nodeName) {
$s .= "\n";
}
$s .= "\n" . str_repeat($indentChar, $indentionLevel * $indentStep);
}
// retrieve the attribute value
$value = self::escapeString((string) $a->value, true);
if (
$boolAttr
|| !$htmlElement
|| !isset(self::BOOLEAN_ATTRIBUTES[$name])
|| is_array(self::BOOLEAN_ATTRIBUTES[$name]) && !in_array($tagName, self::BOOLEAN_ATTRIBUTES[$name])
|| (strlen($value) && strtolower($value) !== $name)
) {
// print the attribute value unless the stars align
$s .= " $name=\"$value\"";
} else {
// omit the value if the stars do align
$s .= " $name";
}
# Append the literal string "<!--" (U+003C LESS-THAN SIGN,
# U+0021 EXCLAMATION MARK, U+002D HYPHEN-MINUS,
# U+002D HYPHEN-MINUS), followed by the value of current
# node's data IDL attribute, followed by the literal
# string "-->" (U+002D HYPHEN-MINUS, U+002D HYPHEN-MINUS,
# U+003E GREATER-THAN SIGN).
$s .= "<!--".$n->data."-->";
}
# If current node is a ProcessingInstruction
elseif ($n instanceof \DOMProcessingInstruction) {
if ($reformatWhitespace && !self::isPreformattedContent($n)) {
$modify = false;
if (($n->parentNode->namespaceURI ?? Parser::HTML_NAMESPACE) !== Parser::HTML_NAMESPACE) {
if (self::treatAsBlock($n->parentNode)) {
$modify = true;
}
} else {
if ($n->parentNode->parentNode !== null && ($n->parentNode->parentNode->namespaceURI ?? Parser::HTML_NAMESPACE) === Parser::HTML_NAMESPACE) {
if (self::treatAsBlock($n->parentNode)) {
$modify = true;
}
} elseif (self::treatForeignRootAsBlock($n)) {
$modify = true;
}
}
if ($modify) {
$previousNonTextNodeSiblingName = null;
$nn = $n;
while ($nn = $nn->previousSibling) {
if (!$nn instanceof \DOMText) {
$previousNonTextNodeSiblingName = $nn->nodeName;
break;
}
}
// Add an additional newline if the previous sibling wasn't a comment.
if ($previousNonTextNodeSiblingName !== null && $previousNonTextNodeSiblingName !== $n->nodeName) {
$s .= "\n";
}
if (!$endTags && !$htmlElement && !$node->hasChildNodes()) {
// Printing XML-based content such as SVG as if it's HTML might be practical
// when a browser is serializing, but it's not in this library's usage. So, if
// the element is foreign and doesn't contain any children close the element
// instead and return s.
$s .= '/>';
return $s;
}
$s .= "\n" . str_repeat($indentChar, $indentionLevel * $indentStep);
}
}
# Append a U+003E GREATER-THAN SIGN character (>).
$s .= '>';
# Append the literal string "<?" (U+003C LESS-THAN SIGN,
# U+003F QUESTION MARK), followed by the value of
# current node's target IDL attribute, followed by a
# single U+0020 SPACE character, followed by the value
# of current node's data IDL attribute, followed by a
# single U+003E GREATER-THAN SIGN character (>).
$s .= "<?".self::uncoerceName($n->target)." ".$n->data.">";
# If current node serializes as void, then continue on to the next child node at
# this point.
if ($htmlElement && in_array($tagName, self::VOID_ELEMENTS)) {
return $s;
}
# If current node is a DocumentType
elseif ($n instanceof \DOMDocumentType) {
# Append the literal string "<!DOCTYPE" (U+003C LESS-THAN SIGN,
# U+0021 EXCLAMATION MARK, U+0044 LATIN CAPITAL LETTER D,
# U+004F LATIN CAPITAL LETTER O, U+0043 LATIN CAPITAL LETTER C,
# U+0054 LATIN CAPITAL LETTER T, U+0059 LATIN CAPITAL LETTER Y,
# U+0050 LATIN CAPITAL LETTER P, U+0045 LATIN CAPITAL LETTER E),
# followed by a space (U+0020 SPACE), followed by the value
# of current node's name IDL attribute, followed by the
# literal string ">" (U+003E GREATER-THAN SIGN).
$s .= "<!DOCTYPE ".trim($n->name).">";
# Append the value of running the HTML fragment serialization algorithm on the
# current node element (thus recursing into this algorithm for that element),
# followed by a U+003C LESS-THAN SIGN character (<), a U+002F SOLIDUS character (/),
# tagname again, and finally a U+003E GREATER-THAN SIGN character (>).
$s .= self::serializeInner($node, $config) . "</$tagName>";
}
# If current node is a Text node
elseif ($node instanceof \DOMText) {
# If the parent of current node is a style, script, xmp,
# iframe, noembed, noframes, or plaintext element, or
# if the parent of current node is a noscript element
# and scripting is enabled for the node, then append
# the value of current node's data IDL attribute literally.
$p = $node->parentNode;
if ($p instanceof \DOMElement && ($p->namespaceURI ?? Parser::HTML_NAMESPACE) === Parser::HTML_NAMESPACE && in_array($p->tagName, self::RAWTEXT_ELEMENTS)) {
// NOTE: scripting is assumed not to be enabled
$s .= $node->data;
}
// NOTE: Documents and document fragments have no outer content,
// so we can just serialize the inner content
elseif ($n instanceof \DOMDocument || $n instanceof \DOMDocumentFragment) {
return self::serializeInner($n, $config);
} else {
throw new Exception(Exception::UNSUPPORTED_NODE_TYPE, [get_class($n)]);
# Otherwise, append the value of current node's data IDL attribute, escaped as described below.
else {
$s .= self::escapeString($node->data);
}
}
# If current node is a Comment
elseif ($node instanceof \DOMComment) {
# Append the literal string "<!--" (U+003C LESS-THAN SIGN, U+0021 EXCLAMATION
# MARK, U+002D HYPHEN-MINUS, U+002D HYPHEN-MINUS), followed by the value of
# current node’s data IDL attribute, followed by the literal string "-->"
# (U+002D HYPHEN-MINUS, U+002D HYPHEN-MINUS, U+003E GREATER-THAN SIGN).
$s .= "<!--{$node->data}-->";
}
# If current node is a ProcessingInstruction
elseif ($node instanceof \DOMProcessingInstruction) {
# Append the literal string "<?" (U+003C LESS-THAN SIGN, U+003F QUESTION MARK),
# followed by the value of current node’s target IDL attribute, followed by a
# single U+0020 SPACE character, followed by the value of current node’s data
# IDL attribute, followed by a single U+003E GREATER-THAN SIGN character (>).
$s .= '<?' . self::uncoerceName($node->target) . " {$n->data}>";
}
# If current node is a DocumentType
elseif ($node instanceof \DOMDocumentType) {
# Append the literal string "<!DOCTYPE" (U+003C LESS-THAN SIGN,
# U+0021 EXCLAMATION MARK, U+0044 LATIN CAPITAL LETTER D,
# U+004F LATIN CAPITAL LETTER O, U+0043 LATIN CAPITAL LETTER C,
# U+0054 LATIN CAPITAL LETTER T, U+0059 LATIN CAPITAL LETTER Y,
# U+0050 LATIN CAPITAL LETTER P, U+0045 LATIN CAPITAL LETTER E),
# followed by a space (U+0020 SPACE), followed by the value
# of current node's name IDL attribute, followed by the
# literal string ">" (U+003E GREATER-THAN SIGN).
$s .= '<!DOCTYPE ' . trim($node->name) . '>';
}
// NOTE: Documents and document fragments have no outer content,
// so we can just serialize the inner content
elseif ($node instanceof \DOMDocument || $node instanceof \DOMDocumentFragment) {
return self::serializeInner($node, $config);
} else {
throw new Exception(Exception::UNSUPPORTED_NODE_TYPE, [get_class($node)]);
}
next:
// If the current node has no more siblings, go up the tree till a
// sibling is found or we've reached the original node
while (!$n->nextSibling && $stack) {
// Write out the stored end tag each time we go up the tree
$tagName = array_pop($stack);
if ($reformatWhitespace) {
$indentionLevel--;
$tag = array_pop($prettyPrintStack);
$modify = false;
// If the element popped off the stack isn't a preformatted element...
if (!self::isPreformattedContent($n)) {
// If it is in the HTML namespace and is to be treated as block then we need to
// modify whitespace.
if (($tag->namespaceURI ?? Parser::HTML_NAMESPACE) === Parser::HTML_NAMESPACE) {
if (self::treatAsBlock($tag)) {
$modify = true;
}
} else {
$firstElementChild = null;
if (property_exists($tag, 'firstElementChild')) {
$firstElementChild = $tag->firstElementChild;
} else {
$t = $tag->firstChild;
do {
if ($t instanceof \DOMElement) {
$firstElementChild = $t;
break;
}
} while ($t = $t->nextSibling);
}
// Otherwise, if foreign and has a child element...
if ($firstElementChild !== null) {
// If the element popped off the stack has an HTML element parent and its parent
// is to be treated as block then we need to modify whitespace.
if ($tag->parentNode !== null && ($tag->parentNode->namespaceURI ?? Parser::HTML_NAMESPACE) === Parser::HTML_NAMESPACE) {
if (self::treatAsBlock($tag->parentNode)) {
$modify = true;
}
// Otherwise, if the element's foreign root is to be treated as block we need to
// modify whitespace, too.
} elseif ($tag->parentNode === null || self::treatForeignRootAsBlock($tag)) {
$modify = true;
}
}
}
}
if ($modify) {
$s .= "\n" . str_repeat($indentChar, $indentionLevel * $indentStep);
}
}
$s .= "</$tagName>";
$n = $n->parentNode;
}
$n = $n->nextSibling;
} while ($stack); // Loop until we have traversed the subtree of the target node in full
return $s;
}
@ -522,28 +254,18 @@ abstract class Serializer {
* @param \MensBeam\HTML\Parser\Config|null $config The configuration parameters to use, if any
*/
public static function serializeInner(\DOMNode $node, ?Config $config = null): string {
$reformatWhitespace = $config->reformatWhitespace ?? false;
# Let s be a string, and initialize it to the empty string.
$s = "";
$s = '';
if ($node instanceof \DOMElement && ($node->namespaceURI ?? Parser::HTML_NAMESPACE) === Parser::HTML_NAMESPACE) {
# If the node serializes as void, then return the empty string.
if (in_array($node->tagName, self::VOID_ELEMENTS)) {
return "";
return '';
}
# If the node is a template element, then let the node instead be the template
# element's template contents (a DocumentFragment node).
elseif ($node->tagName === "template") {
$n = self::getTemplateContent($n);
# For each child node of the node, in tree order, run the following steps:
// NOTE: the steps in question are implemented in the "serialize" routine
foreach ($n->childNodes as $nn) {
$s .= self::serialize($nn, $config);
}
return $s;
elseif ($node->tagName === 'template') {
$node = self::getTemplateContent($node);
}
}
if ($node instanceof \DOMElement || $node instanceof \DOMDocument || $node instanceof \DOMDocumentFragment) {

Loading…
Cancel
Save