From e6d908a0ed491e0919b897aff2e8494ae6857475 Mon Sep 17 00:00:00 2001 From: "J. King" Date: Wed, 13 Oct 2021 15:03:28 -0400 Subject: [PATCH 01/15] Start on serializer --- lib/Parser/Serializer.php | 34 ++++++++++++++++++++++++++++++++++ 1 file changed, 34 insertions(+) create mode 100644 lib/Parser/Serializer.php diff --git a/lib/Parser/Serializer.php b/lib/Parser/Serializer.php new file mode 100644 index 0000000..bee01f9 --- /dev/null +++ b/lib/Parser/Serializer.php @@ -0,0 +1,34 @@ +namespaceURI ?? Parser::HTML_NAMESPACE, [Parser::HTML_NAMESPACE, Parser::SVG_NAMESPACE, Parser::MATHML_NAMESPACE])) { + $tagName = $n->localName; + } else { + $tagName = $n->tagName; + } + } + } while (false); + return $s; + } +} From e74d0d0f3116e7a5a744be6218c9df541321fb70 Mon Sep 17 00:00:00 2001 From: "J. King" Date: Wed, 13 Oct 2021 22:52:54 -0400 Subject: [PATCH 02/15] Prototype of serializer Templates will need further handling --- lib/Parser/Exception.php | 3 + lib/Parser/NameCoercion.php | 8 +- lib/Parser/Serializer.php | 164 +++++++++++++++++++++++++++- lib/Parser/TreeConstructor.php | 12 +- tests/cases/TestTreeConstructor.php | 4 +- 5 files changed, 175 insertions(+), 16 deletions(-) diff --git a/lib/Parser/Exception.php b/lib/Parser/Exception.php index b9a1164..10b48ee 100644 --- a/lib/Parser/Exception.php +++ b/lib/Parser/Exception.php @@ -11,10 +11,13 @@ class Exception extends \Exception { public const FAILED_CREATING_DOCUMENT = 102; public const INVALID_DOCUMENT_CLASS = 103; + public const UNSUPPORTED_NODE_TYPE = 201; + protected static $messages = [ 101 => 'Fragment\'s quirks mode must be one of Parser::NO_QUIRKS_MODE, Parser::LIMITED_QUIRKS_MODE, or Parser::QUIRKS_MODE', 102 => 'Unable to create instance of configured document class "%s"', 103 => 'Configured document class "%s" must be a subclass of \DOMDocument', + 201 => 'Unable to serialize unsupported node type %s', ]; public function __construct(int $code, array $args = [], \Throwable $previous = null) { diff --git a/lib/Parser/NameCoercion.php b/lib/Parser/NameCoercion.php index eba16bf..b137480 100644 --- a/lib/Parser/NameCoercion.php +++ b/lib/Parser/NameCoercion.php @@ -10,7 +10,7 @@ use MensBeam\Intl\Encoding\UTF8; trait NameCoercion { /** @codeCoverageIgnore */ - protected function coerceNameFifthEdition(string $name): string { + protected static function coerceNameFifthEdition(string $name): string { // This matches the inverse of the production of NameChar in XML 1.0 Fifth Edition, // with the added exclusion of ":" from allowed characters // See https://www.w3.org/TR/REC-xml/#NT-NameStartChar @@ -30,7 +30,7 @@ trait NameCoercion { return $name; } - protected function coerceName(string $name): string { + protected static function coerceName(string $name): string { // This matches the inverse of the production of Name in XML 1.0 Fourth Edition, // with the added exclusion of ":" from allowed characters // See https://www.w3.org/TR/2006/REC-xml-20060816/#NT-NameChar @@ -50,7 +50,7 @@ trait NameCoercion { return $name; } - protected function uncoerceName(string $name): string { + protected static function uncoerceName(string $name): string { preg_match_all('/U[0-9A-F]{6}/', $name, $m); foreach (array_unique($m[0], \SORT_STRING) as $o) { $c = UTF8::encode(hexdec(substr($o, 1))); @@ -59,7 +59,7 @@ trait NameCoercion { return $name; } - protected function escapeString(string $string, bool $attribute = false): string { + protected static function escapeString(string $string, bool $attribute = false): string { # Escaping a string (for the purposes of the algorithm above) consists of # running the following steps: diff --git a/lib/Parser/Serializer.php b/lib/Parser/Serializer.php index bee01f9..5c19c48 100644 --- a/lib/Parser/Serializer.php +++ b/lib/Parser/Serializer.php @@ -9,11 +9,14 @@ namespace MensBeam\HTML\Parser; use MensBeam\HTML\Parser; abstract class Serializer { + use NameCoercion; + protected const VOID_ELEMENTS = ["basefont", "bgsound", "frame", "keygen", "area", "base", "br", "col", "embed", "hr", "img", "input", "link", "meta", "param", "source", "track", "wbr"]; + protected const RAWTEXT_ELEMENTS = ["style", "script", "xmp", "iframe", "noembed", "noframes", "plaintext"]; public function seerializeOuter(\DOMNode $node): string { $s = ""; - $depth = 0; + $stack = []; $n = $node; do { # If current node is an Element @@ -23,12 +26,165 @@ abstract class Serializer { # tagname be current node's local name. Otherwise, let # tagname be current node's qualified name. if (in_array($n->namespaceURI ?? Parser::HTML_NAMESPACE, [Parser::HTML_NAMESPACE, Parser::SVG_NAMESPACE, Parser::MATHML_NAMESPACE])) { - $tagName = $n->localName; + $tagName = self::uncoerceName($n->localName); } else { - $tagName = $n->tagName; + $tagName = self::uncoerceName($n->tagName); + } + # Append a U+003C LESS-THAN SIGN character (<), followed by tagname. + $s .= "<$tagName"; + # If current node's is value is not null, and the element does + # not have an is attribute in its attribute list, then + # append the string " is="", followed by current node's is + # value escaped as described below in attribute mode, + # followed by a U+0022 QUOTATION MARK character ("). + // DEVIATION: We don't support custom elements + # For each attribute that the element has, append a + # U+0020 SPACE character, the attribute's serialized name as + # described below, a U+003D EQUALS SIGN character (=), a + # U+0022 QUOTATION MARK character ("), the attribute's + # value, escaped as described below in attribute mode, and + # a second U+0022 QUOTATION MARK character ("). + foreach ($n->attributes as $a) { + $s .= " ".self::serializeAttribute($a); + } + # Append a U+003E GREATER-THAN SIGN character (>). + $s .= ">"; + # If current node serializes as void, then continue on to the + # next child node at this point. + # Append the value of running the HTML fragment serialization + # algorithm on the current node element (thus recursing into + # this algorithm for that element), followed by a + # U+003C LESS-THAN SIGN character (<), a U+002F SOLIDUS + # character (/), tagname again, and finally a + # U+003E GREATER-THAN SIGN character (>). + if (($n->namespaceURI ?? Parser::HTML_NAMESPACE) === Parser::HTML_NAMESPACE && !in_array($tagName, self::VOID_ELEMENTS)) { + if ($n->hasChildNodes()) { + $stack[] = $tagName; + $n = $n->firstChild; + continue; + } else { + $s .= ""; + } + } + } + # If current node is a Text node + elseif ($n instanceof \DOMText) { + # If the parent of current node is a style, script, xmp, + # iframe, noembed, noframes, or plaintext element, or + # if the parent of current node is a noscript element + # and scripting is enabled for the node, then append + # the value of current node's data IDL attribute literally. + if (($n->namespaceURI ?? Parser::HTML_NAMESPACE) === Parser::HTML_NAMESPACE && in_array($n->parentNode->tagName, self::RAWTEXT_ELEMENTS)) { + // NOTE: scripting is assumed not to be enabled + $s .= $n->data; } + # Otherwise, append the value of current node's data IDL attribute, escaped as described below. + else { + $s .= self::escapeString($n->data); + } + } + # If current node is a Comment + elseif ($n instanceof \DOMComment) { + # Append the literal string "" (U+002D HYPHEN-MINUS, U+002D HYPHEN-MINUS, + # U+003E GREATER-THAN SIGN). + $s .= ""; + } + # If current node is a ProcessingInstruction + elseif ($n instanceof \DOMProcessingInstruction) { + # Append the literal string "). + $s .= "target)." ".$n->data.">"; + } + # If current node is a DocumentType + elseif ($n instanceof \DOMDocumentType) { + # Append the literal string "" (U+003E GREATER-THAN SIGN). + $s .= "name).">"; } - } while (false); + // NOTE: Documents and document fragments have no outer content, + // so we can just serialize the inner content + elseif ($n instanceof \DOMDocument || $n instanceof \DOMDocumentFragment) { + return self::serializeInner($n); + } else { + throw new Exception(Exception::UNSUPPORTED_NODE_TYPE, [get_class($n)]); + } + while (!$n->nextSibling && $stack) { + $tagName = array_pop($stack); + $s .= ""; + $n = $n->parentNode; + } + if (!$stack && $n->isSameNode($node)) { + break; + } + $n = $n->nextSibling; + } while (true); return $s; } + + protected static function serializeAttribute(\DOMAttr $a): string { + # For each attribute that the element has, append a + # U+0020 SPACE character, the attribute's serialized name as + # described below, a U+003D EQUALS SIGN character (=), a + # U+0022 QUOTATION MARK character ("), the attribute's + # value, escaped as described below in attribute mode, and + # a second U+0022 QUOTATION MARK character ("). + // NOTE: We won't add the space here; it's only appropriate + // if serializing an element. + + # An attribute's serialized name for the purposes of the previous + # paragraph must be determined as follows: + + # If the attribute has no namespace + if ($a->namespaceURI === null) { + # The attribute's serialized name is the attribute's local name. + $name = self::uncoerceName($a->localName); + } + # If the attribute is in the XML namespace + elseif ($a->namespaceURI === Parser::XML_NAMESPACE) { + # The attribute's serialized name is the string "xml:" followed + # by the attribute's local name. + $name = "xml:".self::uncoerceName($a->localName); + } + # If the attribute is in the XMLNS namespace... + elseif ($a->namespaceURI === Parser::XMLNS_NAMESPACE) { + # ... and the attribute's local name is xmlns + if ($a->localName === "xmlns") { + # The attribute's serialized name is the string "xmlns". + $a = "xmlns"; + } + # ... and the attribute's local name is not xmlns + else { + # The attribute's serialized name is the string "xmlns:" + # followed by the attribute's local name. + $name = "xmlns:".self::uncoerceName($a->localName); + } + } + # If the attribute is in the XLink namespace + elseif ($a->namespaceURI === Parser::XLINK_NAMESPACE) { + # The attribute's serialized name is the string "xlink:" + # followed by the attribute's local name. + $name = "xlink:".self::uncoerceName($a->localName); + } + # If the attribute is in some other namespace + else { + # The attribute's serialized name is the attribute's qualified name. + $name = $a->name; + } + $value = self::escapeString($a->value); + return "$name=\"$value\"';" + } } diff --git a/lib/Parser/TreeConstructor.php b/lib/Parser/TreeConstructor.php index f0b2a22..39b7738 100644 --- a/lib/Parser/TreeConstructor.php +++ b/lib/Parser/TreeConstructor.php @@ -338,7 +338,7 @@ class TreeConstructor { // If element name coercison has occurred at some earlier point, // we must coerce all end tag names to match mangled start tags if ($token instanceof EndTagToken && $this->mangledElements) { - $token->name = $this->coerceName($token->name); + $token->name = self::coerceName($token->name); } # 13.2.6 Tree construction @@ -401,7 +401,7 @@ class TreeConstructor { // If attribute name coercison has occurred at some earlier point, // we must coerce all attributes on html and body start tags in // case they are relocated to existing elements - $attrName = $this->mangledAttributes ? $this->coerceName($a->name) : $a->name; + $attrName = $this->mangledAttributes ? self::coerceName($a->name) : $a->name; if (!$top->hasAttributeNS(null, $attrName)) { $this->elementSetAttribute($top, null, $attrName, $a->value); } @@ -433,7 +433,7 @@ class TreeConstructor { // If attribute name coercison has occurred at some earlier point, // we must coerce all attributes on html and body start tags in // case they are relocated to existing elements - $attrName = $this->mangledAttributes ? $this->coerceName($a->name) : $a->name; + $attrName = $this->mangledAttributes ? self::coerceName($a->name) : $a->name; if (!$body->hasAttributeNS(null, $attrName)) { $this->elementSetAttribute($body, null, $attrName, $a->value); } @@ -4219,7 +4219,7 @@ class TreeConstructor { if ($namespace !== $this->htmlNamespace) { $qualifiedName = implode(":", array_map([$this, "coerceName"], explode(":", $token->name, 2))); } else { - $qualifiedName = $this->coerceName($token->name); + $qualifiedName = self::coerceName($token->name); } $element = $this->DOM->createElementNS($namespace, $qualifiedName); $this->mangledElements = true; @@ -4264,7 +4264,7 @@ class TreeConstructor { $a = $element->ownerDocument->createAttributeNS($namespaceURI, $qualifiedName); $element->ownerDocument->removeChild($element); } - $a->value = $this->escapeString($value, true); + $a->value = self::escapeString($value, true); $element->setAttributeNodeNS($a); } else { try { @@ -4276,7 +4276,7 @@ class TreeConstructor { if ($namespaceURI !== null) { $qualifiedName = implode(":", array_map([$this, "coerceName"], explode(":", $qualifiedName, 2))); } else { - $qualifiedName = $this->coerceName($qualifiedName); + $qualifiedName = self::coerceName($qualifiedName); } $element->setAttributeNS($namespaceURI, $qualifiedName, $value); $this->mangledAttributes = true; diff --git a/tests/cases/TestTreeConstructor.php b/tests/cases/TestTreeConstructor.php index bf377d4..b250d0f 100644 --- a/tests/cases/TestTreeConstructor.php +++ b/tests/cases/TestTreeConstructor.php @@ -180,7 +180,7 @@ class TestTreeConstructor extends \PHPUnit\Framework\TestCase { $prefix = "null "; } } - $localName = $this->uncoerceName($e->localName); + $localName = self::uncoerceName($e->localName); $this->push("<".$prefix.$localName.">"); $this->depth++; $attr = []; @@ -191,7 +191,7 @@ class TestTreeConstructor extends \PHPUnit\Framework\TestCase { assert((bool) $prefix, new \Exception("Prefix for namespace {$a->namespaceURI} is not defined")); $prefix .= " "; } - $attr[$prefix.$this->uncoerceName($a->name)] = $a->value; + $attr[$prefix.self::uncoerceName($a->name)] = $a->value; } ksort($attr, \SORT_STRING); foreach ($attr as $k => $v) { From c82127c61fbee01b0d15e2c03538d5c522e86074 Mon Sep 17 00:00:00 2001 From: "J. King" Date: Thu, 14 Oct 2021 11:08:34 -0400 Subject: [PATCH 03/15] Hopefully complete serializer implementation --- lib/Parser/Serializer.php | 132 +++++++++++++++++++++++--------------- 1 file changed, 80 insertions(+), 52 deletions(-) diff --git a/lib/Parser/Serializer.php b/lib/Parser/Serializer.php index 5c19c48..11b969a 100644 --- a/lib/Parser/Serializer.php +++ b/lib/Parser/Serializer.php @@ -14,7 +14,7 @@ abstract class Serializer { protected const VOID_ELEMENTS = ["basefont", "bgsound", "frame", "keygen", "area", "base", "br", "col", "embed", "hr", "img", "input", "link", "meta", "param", "source", "track", "wbr"]; protected const RAWTEXT_ELEMENTS = ["style", "script", "xmp", "iframe", "noembed", "noframes", "plaintext"]; - public function seerializeOuter(\DOMNode $node): string { + public static function serializeOuter(\DOMNode $node): string { $s = ""; $stack = []; $n = $node; @@ -45,7 +45,47 @@ abstract class Serializer { # value, escaped as described below in attribute mode, and # a second U+0022 QUOTATION MARK character ("). foreach ($n->attributes as $a) { - $s .= " ".self::serializeAttribute($a); + # An attribute's serialized name for the purposes of the previous + # paragraph must be determined as follows: + + # If the attribute has no namespace + if ($a->namespaceURI === null) { + # The attribute's serialized name is the attribute's local name. + $name = self::uncoerceName($a->localName); + } + # If the attribute is in the XML namespace + elseif ($a->namespaceURI === Parser::XML_NAMESPACE) { + # The attribute's serialized name is the string "xml:" followed + # by the attribute's local name. + $name = "xml:".self::uncoerceName($a->localName); + } + # If the attribute is in the XMLNS namespace... + elseif ($a->namespaceURI === Parser::XMLNS_NAMESPACE) { + # ... and the attribute's local name is xmlns + if ($a->localName === "xmlns") { + # The attribute's serialized name is the string "xmlns". + $a = "xmlns"; + } + # ... and the attribute's local name is not xmlns + else { + # The attribute's serialized name is the string "xmlns:" + # followed by the attribute's local name. + $name = "xmlns:".self::uncoerceName($a->localName); + } + } + # If the attribute is in the XLink namespace + elseif ($a->namespaceURI === Parser::XLINK_NAMESPACE) { + # The attribute's serialized name is the string "xlink:" + # followed by the attribute's local name. + $name = "xlink:".self::uncoerceName($a->localName); + } + # If the attribute is in some other namespace + else { + # The attribute's serialized name is the attribute's qualified name. + $name = $a->name; + } + $value = self::escapeString($a->value); + $s .= " $name=\"$value\""; } # Append a U+003E GREATER-THAN SIGN character (>). $s .= ">"; @@ -58,7 +98,19 @@ abstract class Serializer { # character (/), tagname again, and finally a # U+003E GREATER-THAN SIGN character (>). if (($n->namespaceURI ?? Parser::HTML_NAMESPACE) === Parser::HTML_NAMESPACE && !in_array($tagName, self::VOID_ELEMENTS)) { - if ($n->hasChildNodes()) { + # If the node is a template element, then let the node instead + # be the template element's template contents + # (a DocumentFragment node). + if ( + $n instanceof \DOMElement + && ($n->namespaceURI ?? Parser::HTML_NAMESPACE) === Parser::HTML_NAMESPACE + && $n->tagName === "template" + && property_exists($n, "content") + && $n->content instanceof \DOMDocumentFragment + ) { + // NOTE: Treat template contents as any other document fragment and just invoke the inner serializer + $s .= self::serializeInner($n->content).""; + } elseif ($n->hasChildNodes()) { $stack[] = $tagName; $n = $n->firstChild; continue; @@ -74,7 +126,8 @@ abstract class Serializer { # if the parent of current node is a noscript element # and scripting is enabled for the node, then append # the value of current node's data IDL attribute literally. - if (($n->namespaceURI ?? Parser::HTML_NAMESPACE) === Parser::HTML_NAMESPACE && in_array($n->parentNode->tagName, self::RAWTEXT_ELEMENTS)) { + $p = $n->parentNode; + if ($p instanceof \DOMElement && ($p->namespaceURI ?? Parser::HTML_NAMESPACE) === Parser::HTML_NAMESPACE && in_array($p->tagName, self::RAWTEXT_ELEMENTS)) { // NOTE: scripting is assumed not to be enabled $s .= $n->data; } @@ -135,56 +188,31 @@ abstract class Serializer { return $s; } - protected static function serializeAttribute(\DOMAttr $a): string { - # For each attribute that the element has, append a - # U+0020 SPACE character, the attribute's serialized name as - # described below, a U+003D EQUALS SIGN character (=), a - # U+0022 QUOTATION MARK character ("), the attribute's - # value, escaped as described below in attribute mode, and - # a second U+0022 QUOTATION MARK character ("). - // NOTE: We won't add the space here; it's only appropriate - // if serializing an element. - - # An attribute's serialized name for the purposes of the previous - # paragraph must be determined as follows: - - # If the attribute has no namespace - if ($a->namespaceURI === null) { - # The attribute's serialized name is the attribute's local name. - $name = self::uncoerceName($a->localName); - } - # If the attribute is in the XML namespace - elseif ($a->namespaceURI === Parser::XML_NAMESPACE) { - # The attribute's serialized name is the string "xml:" followed - # by the attribute's local name. - $name = "xml:".self::uncoerceName($a->localName); - } - # If the attribute is in the XMLNS namespace... - elseif ($a->namespaceURI === Parser::XMLNS_NAMESPACE) { - # ... and the attribute's local name is xmlns - if ($a->localName === "xmlns") { - # The attribute's serialized name is the string "xmlns". - $a = "xmlns"; - } - # ... and the attribute's local name is not xmlns - else { - # The attribute's serialized name is the string "xmlns:" - # followed by the attribute's local name. - $name = "xmlns:".self::uncoerceName($a->localName); + public static function serializeInner(\DOMNode $node): string { + # Let s be a string, and initialize it to the empty string. + $s = ""; + # If the node serializes as void, then return the empty string. + # If the node is a template element, then let the node instead + # be the template element's template contents + # (a DocumentFragment node). + if ($node instanceof \DOMElement && ($node->namespaceURI ?? Parser::HTML_NAMESPACE) === Parser::HTML_NAMESPACE) { + if (!in_array($node->tagName, self::VOID_ELEMENTS)) { + return ""; + } elseif ($node->tagName === "template" && property_exists($node, "content") && $node->content instanceof \DOMDocumentFragment) { + // NOTE: template elements won't necessarily have a content + // property because PHP's DOM does not support this natively + $node = $node->content; } } - # If the attribute is in the XLink namespace - elseif ($a->namespaceURI === Parser::XLINK_NAMESPACE) { - # The attribute's serialized name is the string "xlink:" - # followed by the attribute's local name. - $name = "xlink:".self::uncoerceName($a->localName); - } - # If the attribute is in some other namespace - else { - # The attribute's serialized name is the attribute's qualified name. - $name = $a->name; + if ($node instanceof \DOMElement || $node instanceof \DOMDocument || $node instanceof \DOMDocumentFragment) { + # For each child node of the node, in tree order, run the following steps: + // NOTE: the steps in question are implemented in the "serializeOuter" routine + foreach ($node->childNodes as $n) { + $s .= self::serializeOuter($n); + } + } else { + throw new Exception(Exception::UNSUPPORTED_NODE_TYPE, [get_class($node)]); } - $value = self::escapeString($a->value); - return "$name=\"$value\"';" + return $s; } } From 180dcd3e512802242fc4a51be5373c6113ebb3c8 Mon Sep 17 00:00:00 2001 From: "J. King" Date: Thu, 14 Oct 2021 12:27:01 -0400 Subject: [PATCH 04/15] Various corrections --- lib/Parser/Serializer.php | 32 +++++++++++++++++++------------- 1 file changed, 19 insertions(+), 13 deletions(-) diff --git a/lib/Parser/Serializer.php b/lib/Parser/Serializer.php index 11b969a..ac749e4 100644 --- a/lib/Parser/Serializer.php +++ b/lib/Parser/Serializer.php @@ -97,24 +97,28 @@ abstract class Serializer { # U+003C LESS-THAN SIGN character (<), a U+002F SOLIDUS # character (/), tagname again, and finally a # U+003E GREATER-THAN SIGN character (>). - if (($n->namespaceURI ?? Parser::HTML_NAMESPACE) === Parser::HTML_NAMESPACE && !in_array($tagName, self::VOID_ELEMENTS)) { + if (($n->namespaceURI ?? Parser::HTML_NAMESPACE) !== Parser::HTML_NAMESPACE || !in_array($tagName, self::VOID_ELEMENTS)) { # If the node is a template element, then let the node instead # be the template element's template contents # (a DocumentFragment node). if ( - $n instanceof \DOMElement - && ($n->namespaceURI ?? Parser::HTML_NAMESPACE) === Parser::HTML_NAMESPACE + ($n->namespaceURI ?? Parser::HTML_NAMESPACE) === Parser::HTML_NAMESPACE && $n->tagName === "template" && property_exists($n, "content") && $n->content instanceof \DOMDocumentFragment ) { - // NOTE: Treat template contents as any other document fragment and just invoke the inner serializer + // NOTE: Treat template content as any other document + // fragment and just invoke the inner serializer $s .= self::serializeInner($n->content).""; } elseif ($n->hasChildNodes()) { + // If the element has children, store its tag name and + // continue the loop with its first child; its end + // tag will be written out further down $stack[] = $tagName; $n = $n->firstChild; continue; } else { + // Otherwise just append the end tag now $s .= ""; } } @@ -175,30 +179,32 @@ abstract class Serializer { } else { throw new Exception(Exception::UNSUPPORTED_NODE_TYPE, [get_class($n)]); } + // If the current node has no more siblings, go up the tree till a + // sibling is found or we've reached the original node while (!$n->nextSibling && $stack) { + // Write out the stored end tag each time we go up the tree $tagName = array_pop($stack); $s .= ""; $n = $n->parentNode; } - if (!$stack && $n->isSameNode($node)) { - break; - } $n = $n->nextSibling; - } while (true); + } while ($stack); // Loop until we have traversed the subtree of the target node in full return $s; } public static function serializeInner(\DOMNode $node): string { # Let s be a string, and initialize it to the empty string. $s = ""; - # If the node serializes as void, then return the empty string. - # If the node is a template element, then let the node instead - # be the template element's template contents - # (a DocumentFragment node). + if ($node instanceof \DOMElement && ($node->namespaceURI ?? Parser::HTML_NAMESPACE) === Parser::HTML_NAMESPACE) { + # If the node serializes as void, then return the empty string. if (!in_array($node->tagName, self::VOID_ELEMENTS)) { return ""; - } elseif ($node->tagName === "template" && property_exists($node, "content") && $node->content instanceof \DOMDocumentFragment) { + } + # If the node is a template element, then let the node instead + # be the template element's template contents + # (a DocumentFragment node). + elseif ($node->tagName === "template" && property_exists($node, "content") && $node->content instanceof \DOMDocumentFragment) { // NOTE: template elements won't necessarily have a content // property because PHP's DOM does not support this natively $node = $node->content; From b8d4636664795564885dee94a7c1d5fde3ca01af Mon Sep 17 00:00:00 2001 From: "J. King" Date: Thu, 14 Oct 2021 15:44:14 -0400 Subject: [PATCH 05/15] Re-import of serializer tests Some errors remain --- tests/cases/TestSerializer.php | 160 +++++ tests/cases/serializer/README.md | 23 + tests/cases/serializer/mensbeam01.dat | 33 + tests/cases/serializer/mensbeam02.dat | 34 + tests/cases/serializer/wpt01.dat | 913 ++++++++++++++++++++++++++ tests/phpunit.dist.xml | 3 + 6 files changed, 1166 insertions(+) create mode 100644 tests/cases/TestSerializer.php create mode 100644 tests/cases/serializer/README.md create mode 100644 tests/cases/serializer/mensbeam01.dat create mode 100644 tests/cases/serializer/mensbeam02.dat create mode 100644 tests/cases/serializer/wpt01.dat diff --git a/tests/cases/TestSerializer.php b/tests/cases/TestSerializer.php new file mode 100644 index 0000000..29cfcf9 --- /dev/null +++ b/tests/cases/TestSerializer.php @@ -0,0 +1,160 @@ +append(new \GlobIterator(\MensBeam\HTML\Parser\BASE."tests/cases/serializer/*.dat", \FilesystemIterator::SKIP_DOTS | \FilesystemIterator::CURRENT_AS_PATHNAME)); + foreach ($files as $file) { + if (!in_array(basename($file), $blacklist)) { + yield from $this->parseTreeTestFile($file); + } + } + } + + /** + * @dataProvider provideStandardTreeTests + * @covers \MensBeam\HTML\Parser\Serializer + */ + public function testStandardTreeTests(array $data, bool $fragment, string $exp): void { + $node = $this->buildTree($data, $fragment); + $this->assertSame($exp, Serializer::serializeOuter($node)); + } + + protected function buildTree(array $data, bool $fragment, bool $formatOutput = false): \DOMNode { + $document = new \DOMDocument; + $document->formatOutput = $formatOutput; + if ($fragment) { + $document->appendChild($document->createElement("html")); + $out = $document->createDocumentFragment(); + } else { + $out = $document; + } + $cur = $out; + $pad = 2; + // process each line in turn + for ($l = 0; $l < sizeof($data); $l++) { + preg_match('/^(\|\s+)(.+)/', $data[$l], $m); + // pop any parents as long as the padding of the line is less than the expected padding + $p = strlen((string) $m[1]); + assert($p >= 2 && $p <= $pad && !($p % 2), new \Exception("Input data is invalid on line ".($l + 1))); + while ($p < $pad) { + $pad -= 2; + $cur = $cur->parentNode; + } + // act based upon what the rest of the line looks like + $d = $m[2]; + if (preg_match('/^$/', $d, $m)) { + // comment + $cur->appendChild($document->createComment($m[1])); + } elseif (preg_match('/^]*)(?: "([^"]*)" "([^"]*)")?)?>$/', $d, $m)) { + // doctype + $name = strlen((string) ($m[1] ?? "")) ? $m[1] : " "; + $public = strlen((string) ($m[2] ?? "")) ? $m[2] : ""; + $system = strlen((string) ($m[3] ?? "")) ? $m[3] : ""; + $cur->appendChild($document->implementation->createDocumentType($name, $public, $system)); + } elseif (preg_match('/^<\?([^ ]+) ([^>]*)>$/', $d, $m)) { + // processing instruction + $cur->appendChild($document->createProcessingInstruction($m[1], $m[2])); + } elseif (preg_match('/^<(?:([^ ]+) )?([^>]+)>$/', $d, $m)) { + // element + $ns = strlen((string) $m[1]) ? (array_flip(Parser::NAMESPACE_MAP)[$m[1]] ?? $m[1]) : null; + $cur = $cur->appendChild($document->createElementNS($ns, $m[2])); + $pad += 2; + } elseif (preg_match('/^(?:([^" ]+) )?([^"=]+)="((?:[^"]|"(?!$))*)"$/', $d, $m)) { + // attribute + $ns = strlen((string) $m[1]) ? (array_flip(Parser::NAMESPACE_MAP)[$m[1]] ?? $m[1]) : ""; + + if ($ns === '') { + $cur->setAttribute($m[2], $m[3]); + } else { + $cur->setAttributeNS($ns, $m[2], $m[3]); + } + } elseif (preg_match('/^"((?:[^"]|"(?!$))*)("?)$/', $d, $m)) { + // text + $t = $m[1]; + while (!strlen((string) $m[2])) { + preg_match('/^((?:[^"]|"(?!$))*)("?)$/', $data[++$l], $m); + $t .= "\n".$m[1]; + } + $cur->appendChild($document->createTextNode($t)); + } else { + throw new \Exception("Input data is invalid on line ".($l + 1)); + } + } + return $out; + } + + protected function parseTreeTestFile(string $file): \Generator { + $index = 0; + $l = 0; + $lines = array_map(function($v) { + return rtrim($v, "\n"); + }, file($file)); + while ($l < sizeof($lines)) { + $pos = $l + 1; + assert(in_array($lines[$l], ["#document", "#fragment"]), new \Exception("Test $file #$index does not start with #document or #fragment tag at line ".($l + 1))); + $fragment = $lines[$l] === "#fragment"; + // collect the test input + $data = []; + for (++$l; $l < sizeof($lines); $l++) { + if (preg_match('/^#(script-(on|off)|output)$/', $lines[$l])) { + break; + } + $data[] = $lines[$l]; + } + // set the script mode, if present + assert(preg_match('/^#(script-(on|off)|output)$/', $lines[$l]) === 1, new \Exception("Test $file #$index follows data with something other than script flag or output at line ".($l + 1))); + $script = null; + if ($lines[$l] === "#script-off") { + $script = false; + $l++; + } elseif ($lines[$l] === "#script-on") { + $script = true; + $l++; + } + // collect the output string + $exp = []; + assert($lines[$l] === "#output", new \Exception("Test $file #$index follows input with something other than output at line ".($l + 1))); + for (++$l; $l < sizeof($lines); $l++) { + if ($lines[$l] === "" && in_array(($lines[$l + 1] ?? ""), ["#document", "#fragment"])) { + break; + } + assert(preg_match('/^([^#]|$)/', $lines[$l]) === 1, new \Exception("Test $file #$index contains unrecognized data after output at line ".($l + 1))); + $exp[] = $lines[$l]; + } + $exp = implode("\n", $exp); + if (!$script) { + yield basename($file)." #$index (line $pos)" => [$data, $fragment, $exp]; + } + $l++; + $index++; + } + } +} diff --git a/tests/cases/serializer/README.md b/tests/cases/serializer/README.md new file mode 100644 index 0000000..25e9326 --- /dev/null +++ b/tests/cases/serializer/README.md @@ -0,0 +1,23 @@ +HTML DOM serialization tests +============================ + +The format of these tests is essentially the format of html5lib's tree construction tests in reverse. There are, however, important differences, so the format is documented in full here. + +Each file containing tree construction tests consists of any number of +tests separated by two newlines (LF) and a single newline before the end +of the file. For instance: + + [TEST]LF + LF + [TEST]LF + LF + [TEST]LF + +Where [TEST] is the following format: + +Each test begins with a line reading "#document" or "#fragment"; subsequent +lines represent the document or document fragment (respectively) used as +input, until a line is encountered which reads "#output", "#script-on", +or "#script-off". + + diff --git a/tests/cases/serializer/mensbeam01.dat b/tests/cases/serializer/mensbeam01.dat new file mode 100644 index 0000000..c317644 --- /dev/null +++ b/tests/cases/serializer/mensbeam01.dat @@ -0,0 +1,33 @@ +#fragment +| +#output + + +#fragment +| +| test💩test="test" +#output + + +#fragment +| +| "You should not see this text." +#output + + +#fragment +| +| class="test" +#output + + +#fragment +| +#output + + +#fragment +| +| poop💩="soccer" +#output + diff --git a/tests/cases/serializer/mensbeam02.dat b/tests/cases/serializer/mensbeam02.dat new file mode 100644 index 0000000..7760020 --- /dev/null +++ b/tests/cases/serializer/mensbeam02.dat @@ -0,0 +1,34 @@ +#document +| +#output + + +#document +| +| +#output + + +#document +| +| +#output + + +#document +| +| +#output + + +#document +| +| +#output + + +#document +| +| +#output + diff --git a/tests/cases/serializer/wpt01.dat b/tests/cases/serializer/wpt01.dat new file mode 100644 index 0000000..0074d36 --- /dev/null +++ b/tests/cases/serializer/wpt01.dat @@ -0,0 +1,913 @@ +#fragment +| +#output + + +#fragment +| +| +#output + + +#fragment +| +| +| b="c" +#output + + +#fragment +| +| +| b="&" +#output + + +#fragment +| +| +| b=" " +#output + + +#fragment +| +| +| b=""" +#output + + +#fragment +| +| +| b="<" +#output + + +#fragment +| +| +| b=">" +#output + + +#fragment +| +| +| href="javascript:"<>"" +#output + + +#fragment +| +| +| xlink xlink:href="a" +#output + + +#fragment +| +| +| xmlns xmlns:svg="test" +#output + + +#fragment +| +| "a" +#output +a + +#fragment +| +| "&" +#output +& + +#fragment +| +| " " +#output +  + +#fragment +| +| "<" +#output +< + +#fragment +| +| ">" +#output +> + +#fragment +| +| """ +#output +" + +#fragment +| +| + +#fragment +| +| + +#fragment +| + +#fragment +| +| +| "<&>" +#output +<span><xmp><&> + +#fragment +| +| + +#fragment +| +| +| "<&>" +#output +<span><noembed><&> + +#fragment +| +| +| "<&>" +#output +<span><noframes><&> + +#fragment +| +|