diff --git a/tests/cases/TestSerializer.php b/tests/cases/TestSerializer.php
new file mode 100644
index 0000000..29cfcf9
--- /dev/null
+++ b/tests/cases/TestSerializer.php
@@ -0,0 +1,160 @@
+append(new \GlobIterator(\MensBeam\HTML\Parser\BASE."tests/cases/serializer/*.dat", \FilesystemIterator::SKIP_DOTS | \FilesystemIterator::CURRENT_AS_PATHNAME));
+        foreach ($files as $file) {
+            if (!in_array(basename($file), $blacklist)) {
+                yield from $this->parseTreeTestFile($file);
+            }
+        }
+    }
+
+    /**
+     * Builds the tree described by one data set, serializes it with
+     * Serializer::serializeOuter(), and compares the result to the expected string.
+     *
+     * @dataProvider provideStandardTreeTests
+     * @covers \MensBeam\HTML\Parser\Serializer
+     */
+    public function testStandardTreeTests(array $data, bool $fragment, string $exp): void {
+        $node = $this->buildTree($data, $fragment);
+        $this->assertSame($exp, Serializer::serializeOuter($node));
+    }
+
+    /**
+     * Builds a DOM tree from the "| "-prefixed tree lines of a single test.
+     *
+     * Nesting is expressed by the width of the "| " prefix: every level adds two
+     * characters. Each line is a comment, doctype, processing instruction,
+     * element, attribute or text node; a text node may continue over several
+     * lines until its closing quotation mark.
+     *
+     * @param array $data The tree lines, one string per line
+     * @param bool $fragment Whether to build into a document fragment instead of the document itself
+     * @param bool $formatOutput Value assigned to the document's formatOutput property
+     * @return \DOMNode The document, or the document fragment when $fragment is true
+     * @throws \Exception When a line cannot be interpreted
+     */
+    protected function buildTree(array $data, bool $fragment, bool $formatOutput = false): \DOMNode {
+        $document = new \DOMDocument;
+        $document->formatOutput = $formatOutput;
+        if ($fragment) {
+            $document->appendChild($document->createElement("html"));
+            $out = $document->createDocumentFragment();
+        } else {
+            $out = $document;
+        }
+        // the token found in the test data is looked up among the values of the
+        // namespace map, so flip the map once rather than once per node
+        $namespaces = array_flip(Parser::NAMESPACE_MAP);
+        $count = count($data);
+        $cur = $out;
+        $pad = 2;
+        // process each line in turn
+        for ($l = 0; $l < $count; $l++) {
+            // validate explicitly rather than with assert(), which does nothing when assertions are disabled
+            if (!preg_match('/^(\|\s+)(.+)/', $data[$l], $m)) {
+                throw new \Exception("Input data is invalid on line ".($l + 1));
+            }
+            $p = strlen($m[1]);
+            if ($p < 2 || $p > $pad || $p % 2) {
+                throw new \Exception("Input data is invalid on line ".($l + 1));
+            }
+            // pop any parents as long as the padding of the line is less than the expected padding
+            while ($p < $pad) {
+                $pad -= 2;
+                $cur = $cur->parentNode;
+            }
+            // act based upon what the rest of the line looks like
+            $d = $m[2];
+            if (preg_match('/^<!-- (.*?) -->$/', $d, $m)) {
+                // comment
+                $cur->appendChild($document->createComment($m[1]));
+            } elseif (preg_match('/^<!DOCTYPE(?: ([^ >]*)(?: "([^"]*)" "([^"]*)")?)?>$/', $d, $m)) {
+                // doctype; an absent or empty name is represented by a single space
+                $name = strlen((string) ($m[1] ?? "")) ? $m[1] : " ";
+                $public = strlen((string) ($m[2] ?? "")) ? $m[2] : "";
+                $system = strlen((string) ($m[3] ?? "")) ? $m[3] : "";
+                $cur->appendChild($document->implementation->createDocumentType($name, $public, $system));
+            } elseif (preg_match('/^<\?([^ ]+) ([^>]*)>$/', $d, $m)) {
+                // processing instruction
+                $cur->appendChild($document->createProcessingInstruction($m[1], $m[2]));
+            } elseif (preg_match('/^<(?:([^ ]+) )?([^>]+)>$/', $d, $m)) {
+                // element; it becomes the parent of the lines which follow at the next depth
+                $ns = ($m[1] !== "") ? ($namespaces[$m[1]] ?? $m[1]) : null;
+                $cur = $cur->appendChild($document->createElementNS($ns, $m[2]));
+                $pad += 2;
+            } elseif (preg_match('/^(?:([^" ]+) )?([^"=]+)="((?:[^"]|"(?!$))*)"$/', $d, $m)) {
+                // attribute; only an element can carry one
+                if (!($cur instanceof \DOMElement)) {
+                    throw new \Exception("Input data is invalid on line ".($l + 1));
+                }
+                $ns = ($m[1] !== "") ? ($namespaces[$m[1]] ?? $m[1]) : "";
+                if ($ns === '') {
+                    $cur->setAttribute($m[2], $m[3]);
+                } else {
+                    $cur->setAttributeNS($ns, $m[2], $m[3]);
+                }
+            } elseif (preg_match('/^"((?:[^"]|"(?!$))*)("?)$/', $d, $m)) {
+                // text; keep consuming lines until one ends with the closing quotation mark
+                $start = $l + 1;
+                $t = $m[1];
+                while (!strlen($m[2])) {
+                    // without this guard a missing closing quotation mark would read past the end of the data and never terminate
+                    if (++$l >= $count || !preg_match('/^((?:[^"]|"(?!$))*)("?)$/', $data[$l], $m)) {
+                        throw new \Exception("Input data ends inside the text node started on line ".$start);
+                    }
+                    $t .= "\n".$m[1];
+                }
+                $cur->appendChild($document->createTextNode($t));
+            } else {
+                throw new \Exception("Input data is invalid on line ".($l + 1));
+            }
+        }
+        return $out;
+    }
+
+    /**
+     * Reads a test file and yields one data set per test it contains.
+     *
+     * Each data set is [tree lines, fragment flag, expected output] and is keyed
+     * by the file's base name, the index of the test within the file, and the
+     * line on which the test starts. Tests flagged "#script-on" are parsed but
+     * not yielded.
+     *
+     * @param string $file Path of the file to read
+     * @return \Generator
+     * @throws \Exception When the file cannot be read or does not follow the expected layout
+     */
+    protected function parseTreeTestFile(string $file): \Generator {
+        $lines = file($file);
+        if ($lines === false) {
+            throw new \Exception("Test file $file could not be read");
+        }
+        $lines = array_map(static function($v) {
+            return rtrim($v, "\n");
+        }, $lines);
+        $count = count($lines);
+        $index = 0;
+        $l = 0;
+        while ($l < $count) {
+            $pos = $l + 1;
+            if (!in_array($lines[$l], ["#document", "#fragment"], true)) {
+                throw new \Exception("Test $file #$index does not start with #document or #fragment tag at line ".($l + 1));
+            }
+            $fragment = $lines[$l] === "#fragment";
+            // collect the test input
+            $data = [];
+            for (++$l; $l < $count; $l++) {
+                if (preg_match('/^#(script-(on|off)|output)$/', $lines[$l])) {
+                    break;
+                }
+                $data[] = $lines[$l];
+            }
+            // running off the end of the file means no script flag or output marker followed the input
+            if ($l >= $count) {
+                throw new \Exception("Test $file #$index follows data with something other than script flag or output at line ".($l + 1));
+            }
+            // set the script mode, if present
+            $script = null;
+            if ($lines[$l] === "#script-off") {
+                $script = false;
+                $l++;
+            } elseif ($lines[$l] === "#script-on") {
+                $script = true;
+                $l++;
+            }
+            // collect the output string
+            if (($lines[$l] ?? null) !== "#output") {
+                throw new \Exception("Test $file #$index follows input with something other than output at line ".($l + 1));
+            }
+            $exp = [];
+            for (++$l; $l < $count; $l++) {
+                // an empty line followed by the start of another test ends the output
+                if ($lines[$l] === "" && in_array(($lines[$l + 1] ?? ""), ["#document", "#fragment"], true)) {
+                    break;
+                }
+                if (preg_match('/^([^#]|$)/', $lines[$l]) !== 1) {
+                    throw new \Exception("Test $file #$index contains unrecognized data after output at line ".($l + 1));
+                }
+                $exp[] = $lines[$l];
+            }
+            $exp = implode("\n", $exp);
+            // only tests without a script flag, or flagged "#script-off", are yielded
+            if (!$script) {
+                yield basename($file)." #$index (line $pos)" => [$data, $fragment, $exp];
+            }
+            // step over the empty separator line
+            $l++;
+            $index++;
+        }
+    }
+}
diff --git a/tests/cases/serializer/README.md b/tests/cases/serializer/README.md
new file mode 100644
index 0000000..25e9326
--- /dev/null
+++ b/tests/cases/serializer/README.md
@@ -0,0 +1,23 @@
+HTML DOM serialization tests
+============================
+
+The format of these tests is essentially the format of html5lib's tree construction tests in reverse. There are, however, important differences, so the format is documented in full here.
+
+Each file containing serialization tests consists of any number of
+tests separated by two newlines (LF) and a single newline before the end
+of the file. For instance:
+
+    [TEST]LF
+    LF
+    [TEST]LF
+    LF
+    [TEST]LF
+
+Where [TEST] is the following format:
+
+Each test begins with a line reading "#document" or "#fragment"; subsequent
+lines represent the document or document fragment (respectively) used as
+input, until a line is encountered which reads "#output", "#script-on",
+or "#script-off".
+ + diff --git a/tests/cases/serializer/mensbeam01.dat b/tests/cases/serializer/mensbeam01.dat new file mode 100644 index 0000000..c317644 --- /dev/null +++ b/tests/cases/serializer/mensbeam01.dat @@ -0,0 +1,33 @@ +#fragment +| +#output + + +#fragment +| +| test💩test="test" +#output + + +#fragment +| +| "You should not see this text." +#output + + +#fragment +| +| class="test" +#output + + +#fragment +| +#output + + +#fragment +| +| poop💩="soccer" +#output + diff --git a/tests/cases/serializer/mensbeam02.dat b/tests/cases/serializer/mensbeam02.dat new file mode 100644 index 0000000..7760020 --- /dev/null +++ b/tests/cases/serializer/mensbeam02.dat @@ -0,0 +1,34 @@ +#document +| +#output + + +#document +| +| +#output + + +#document +| +| +#output + + +#document +| +| +#output + + +#document +| +| +#output + + +#document +| +| +#output + diff --git a/tests/cases/serializer/wpt01.dat b/tests/cases/serializer/wpt01.dat new file mode 100644 index 0000000..0074d36 --- /dev/null +++ b/tests/cases/serializer/wpt01.dat @@ -0,0 +1,913 @@ +#fragment +| +#output + + +#fragment +| +| +#output + + +#fragment +| +| +| b="c" +#output + + +#fragment +| +| +| b="&" +#output + + +#fragment +| +| +| b=" " +#output + + +#fragment +| +| +| b=""" +#output + + +#fragment +| +| +| b="<" +#output + + +#fragment +| +| +| b=">" +#output + + +#fragment +| +| +| href="javascript:"<>"" +#output + + +#fragment +| +| +| xlink xlink:href="a" +#output + + +#fragment +| +| +| xmlns xmlns:svg="test" +#output + + +#fragment +| +| "a" +#output +a + +#fragment +| +| "&" +#output +& + +#fragment +| +| " " +#output +  + +#fragment +| +| "<" +#output +< + +#fragment +| +| ">" +#output +> + +#fragment +| +| """ +#output +" + +#fragment +| +| + +#fragment +| +| + +#fragment +| + +#fragment +| +| +| "<&>" +#output +<span><xmp><&> + +#fragment +| +| + +#fragment +| +| +| "<&>" +#output +<span><noembed><&> + +#fragment +| +| +| "<&>" +#output +<span><noframes><&> + +#fragment +| +|