Browse Source

Merge branch 'serialize'

serialize
J. King 3 years ago
parent
commit
bb8d49fb6a
  1. 50
      lib/Parser/AttributeSetter.php
  2. 60
      lib/Parser/Data.php
  3. 3
      lib/Parser/Exception.php
  4. 47
      lib/Parser/NameCoercion.php
  5. 4
      lib/Parser/ParseErrorEmitter.php
  6. 232
      lib/Parser/Serializer.php
  7. 116
      lib/Parser/TreeConstructor.php
  8. 2
      lib/Parser/ctype.php
  9. 22
      tests/cases/TestCharset.php
  10. 10
      tests/cases/TestParser.php
  11. 258
      tests/cases/TestSerializer.php
  12. 25
      tests/cases/TestTokenizer.php
  13. 9
      tests/cases/TestTreeConstructor.php
  14. 99
      tests/cases/serializer/README.md
  15. 33
      tests/cases/serializer/mensbeam01.dat
  16. 34
      tests/cases/serializer/mensbeam02.dat
  17. 913
      tests/cases/serializer/wpt01.dat
  18. 12
      tests/cases/tree-construction/mensbeam02.dat
  19. 129
      tests/cases/tree-construction/mensbeam03.dat
  20. 3
      tests/phpunit.dist.xml

50
lib/Parser/AttributeSetter.php

@ -0,0 +1,50 @@
<?php
/** @license MIT
* Copyright 2017, Dustin Wilson, J. King et al.
* See LICENSE and AUTHORS files for details */
declare(strict_types=1);
namespace MensBeam\HTML\Parser;
use MensBeam\HTML\Parser;
trait AttributeSetter {
    /** Sets an attribute on an element, coercing the attribute's name if the DOM rejects it
     *
     * Attributes in the XMLNS namespace are created as attribute nodes in
     * a temporary document and then imported into the element's document;
     * all other attributes are set directly on the element. An
     * un-namespaced attribute named "id" is additionally registered as an
     * ID attribute.
     *
     * @param \DOMElement $element The element on which to set the attribute
     * @param string|null $namespaceURI The namespace of the attribute, or null if it has none
     * @param string $qualifiedName The qualified name of the attribute
     * @param string $value The value of the attribute
     */
    public function elementSetAttribute(\DOMElement $element, ?string $namespaceURI, string $qualifiedName, string $value): void {
        if ($namespaceURI !== Parser::XMLNS_NAMESPACE) {
            // The common case: set the attribute directly on the element
            try {
                $element->setAttributeNS($namespaceURI, $qualifiedName, $value);
            } catch (\DOMException $failure) {
                // The attribute name is invalid for XML 1.0 Second Edition
                // Replace any offending characters with "UHHHHHH" where H are the
                // uppercase hexadecimal digits of the character's code point
                $hasNamespace = ($namespaceURI !== null);
                $qualifiedName = self::coerceName($qualifiedName, $hasNamespace);
                $element->setAttributeNS($namespaceURI, $qualifiedName, $value);
                // Record that at least one attribute name had to be coerced
                $this->mangledAttributes = true;
            }
            // Un-namespaced "id" attributes are flagged as ID attributes
            if ($namespaceURI === null && $qualifiedName === "id") {
                $element->setIdAttribute($qualifiedName, true);
            }
            return;
        }

        // NOTE: We create attribute nodes so that xmlns attributes
        // don't get lost; otherwise they cannot be serialized.
        // Furthermore we create the attribute node in a temporary
        // document to avoid some related PHP bugs
        $scratch = new \DOMDocument();
        $root = $scratch->createElement("html");
        $scratch->appendChild($root);
        try {
            $attr = $scratch->createAttributeNS($namespaceURI, $qualifiedName);
        // @codeCoverageIgnoreStart
        } catch (\DOMException $failure) {
            // The attribute name is invalid for XML 1.0 Second Edition
            // Replace any offending characters with "UHHHHHH" where H are the
            // uppercase hexadecimal digits of the character's code point
            // NOTE: This case is never encountered by the parser
            $qualifiedName = self::coerceName($qualifiedName, true);
            $attr = $scratch->createAttributeNS($namespaceURI, $qualifiedName);
        }
        // @codeCoverageIgnoreEnd
        // The value is escaped in attribute mode before being assigned to the node
        $attr->value = self::escapeString($value, true);
        // Bring the node over from the temporary document before attaching it
        $imported = $element->ownerDocument->importNode($attr);
        $element->setAttributeNodeNS($imported);
    }
}

60
lib/Parser/Data.php

@ -112,15 +112,17 @@ class Data {
else {
$char = "\n";
}
} elseif ($char === '') {
$this->eof = true;
}
// unless we're peeking, track line and column position, and whether we've hit EOF
if ($this->track) {
if ($char === "\n") {
if ($char === '') {
// do nothing
} elseif ($char === "\n") {
$this->newlines[$this->data->posChar()] = $this->_column;
$this->_column = 0;
$this->_line++;
} elseif ($char === '') {
$this->eof = true;
} else {
$this->_column++;
$len = strlen($char);
@ -155,7 +157,7 @@ class Data {
$this->error(ParseError::NONCHARACTER_IN_INPUT_STREAM);
$this->lastError = $here;
} elseif ($tail === 0xBFBD && $this->data->posErr === $here) {
$this->error(ParseError::NONCHARACTER_IN_INPUT_STREAM, $this->data->posByte);
$this->error(ParseError::NONCHARACTER_IN_INPUT_STREAM);
$this->lastError = $here;
}
}
@ -184,20 +186,21 @@ class Data {
$here = $this->data->posChar();
// if the previous character was a normalized CR+LF pair, we need to go back two
if (isset($this->normalized[$here])) {
$this->data->seek(-1);
// NOTE: This case is never encountered by the parser
$this->data->seek(-1); // @codeCoverageIgnore
}
// recalculate line and column positions, if requested
if ($retreatPointer && $this->track) {
$col = $this->newlines[$here] ?? 0;
if ($col) {
$this->_column = $col;
// NOTE: These cases are never encountered by the parser
// @codeCoverageIgnoreStart
if ($col = $this->newlines[$here] ?? 0) {
$this->_column = $col + 1;
$this->_line--;
} else {
} elseif ($this->astrals[$here] ?? false) {
$this->_column--;
if ($this->astrals[$here] ?? false) {
$this->_column--;
}
}
// @codeCoverageIgnoreEnd
$this->_column--;
}
$this->data->seek(-1);
}
@ -252,21 +255,22 @@ class Data {
do {
// If the current position is the start of a line,
// get the column position of the end of the previous line
// NOTE: These cases are never encountered by the parser
// @codeCoverageIgnoreStart
if (isset($this->newlines[$pos])) {
$line--;
$col = $this->newlines[$pos];
$col = $this->newlines[$pos] + 1;
// If the newline was a normalized CR+LF pair,
// go back one extra character
if (isset($this->normalized[$pos])) {
$pos--;
}
} else {
$col--;
} elseif ($this->astrals[$pos] ?? false) {
// supplementary plane characters count as two
if ($this->astrals[$pos] ?? false) {
$this->_column--;
}
$col--;
}
// @codeCoverageIgnoreEnd
$col--;
$pos--;
} while (++$relativePos < 0);
return [$line, $col];
@ -274,19 +278,23 @@ class Data {
return [$this->_line, $this->_column + $relativePos];
}
} else {
return [0, 0];
return [0, 0]; // @codeCoverageIgnore
}
}
public function __get($property) {
switch ($property) {
case 'column': return $this->_column;
break;
case 'line': return $this->_line;
break;
case 'pointer': return $this->data->posChar();
break;
default: return null;
case 'column':
return $this->_column; // @codeCoverageIgnore
break;
case 'line':
return $this->_line; // @codeCoverageIgnore
break;
case 'pointer':
return $this->data->posChar();
break;
default:
return null; // @codeCoverageIgnore
}
}

3
lib/Parser/Exception.php

@ -11,10 +11,13 @@ class Exception extends \Exception {
public const FAILED_CREATING_DOCUMENT = 102;
public const INVALID_DOCUMENT_CLASS = 103;
public const UNSUPPORTED_NODE_TYPE = 201;
protected static $messages = [
101 => 'Fragment\'s quirks mode must be one of Parser::NO_QUIRKS_MODE, Parser::LIMITED_QUIRKS_MODE, or Parser::QUIRKS_MODE',
102 => 'Unable to create instance of configured document class "%s"',
103 => 'Configured document class "%s" must be a subclass of \DOMDocument',
201 => 'Unable to serialize unsupported node type %s',
];
public function __construct(int $code, array $args = [], \Throwable $previous = null) {

47
lib/Parser/NameCoercion.php

@ -10,7 +10,7 @@ use MensBeam\Intl\Encoding\UTF8;
trait NameCoercion {
/** @codeCoverageIgnore */
protected function coerceNameFifthEdition(string $name): string {
protected static function coerceNameFifthEdition(string $name): string {
// This matches the inverse of the production of NameChar in XML 1.0 Fifth Edition,
// with the added exclusion of ":" from allowed characters
// See https://www.w3.org/TR/REC-xml/#NT-NameStartChar
@ -30,27 +30,34 @@ trait NameCoercion {
return $name;
}
protected function coerceName(string $name): string {
// This matches the inverse of the production of Name in XML 1.0 Fourth Edition,
// with the added exclusion of ":" from allowed characters
// See https://www.w3.org/TR/2006/REC-xml-20060816/#NT-NameChar
preg_match_all('/[^_\.\-\x{41}-\x{5A}\x{61}-\x{7A}\x{C0}-\x{D6}\x{D8}-\x{F6}\x{F8}-\x{FF}\x{100}-\x{131}\x{134}-\x{13E}\x{141}-\x{148}\x{14A}-\x{17E}\x{180}-\x{1C3}\x{1CD}-\x{1F0}\x{1F4}-\x{1F5}\x{1FA}-\x{217}\x{250}-\x{2A8}\x{2BB}-\x{2C1}\x{386}\x{388}-\x{38A}\x{38C}\x{38E}-\x{3A1}\x{3A3}-\x{3CE}\x{3D0}-\x{3D6}\x{3DA}\x{3DC}\x{3DE}\x{3E0}\x{3E2}-\x{3F3}\x{401}-\x{40C}\x{40E}-\x{44F}\x{451}-\x{45C}\x{45E}-\x{481}\x{490}-\x{4C4}\x{4C7}-\x{4C8}\x{4CB}-\x{4CC}\x{4D0}-\x{4EB}\x{4EE}-\x{4F5}\x{4F8}-\x{4F9}\x{531}-\x{556}\x{559}\x{561}-\x{586}\x{5D0}-\x{5EA}\x{5F0}-\x{5F2}\x{621}-\x{63A}\x{641}-\x{64A}\x{671}-\x{6B7}\x{6BA}-\x{6BE}\x{6C0}-\x{6CE}\x{6D0}-\x{6D3}\x{6D5}\x{6E5}-\x{6E6}\x{905}-\x{939}\x{93D}\x{958}-\x{961}\x{985}-\x{98C}\x{98F}-\x{990}\x{993}-\x{9A8}\x{9AA}-\x{9B0}\x{9B2}\x{9B6}-\x{9B9}\x{9DC}-\x{9DD}\x{9DF}-\x{9E1}\x{9F0}-\x{9F1}\x{A05}-\x{A0A}\x{A0F}-\x{A10}\x{A13}-\x{A28}\x{A2A}-\x{A30}\x{A32}-\x{A33}\x{A35}-\x{A36}\x{A38}-\x{A39}\x{A59}-\x{A5C}\x{A5E}\x{A72}-\x{A74}\x{A85}-\x{A8B}\x{A8D}\x{A8F}-\x{A91}\x{A93}-\x{AA8}\x{AAA}-\x{AB0}\x{AB2}-\x{AB3}\x{AB5}-\x{AB9}\x{ABD}\x{AE0}\x{B05}-\x{B0C}\x{B0F}-\x{B10}\x{B13}-\x{B28}\x{B2A}-\x{B30}\x{B32}-\x{B33}\x{B36}-\x{B39}\x{B3D}\x{B5C}-\x{B5D}\x{B5F}-\x{B61}\x{B85}-\x{B8A}\x{B8E}-\x{B90}\x{B92}-\x{B95}\x{B99}-\x{B9A}\x{B9C}\x{B9E}-\x{B9F}\x{BA3}-\x{BA4}\x{BA8}-\x{BAA}\x{BAE}-\x{BB5}\x{BB7}-\x{BB9}\x{C05}-\x{C0C}\x{C0E}-\x{C10}\x{C12}-\x{C28}\x{C2A}-\x{C33}\x{C35}-\x{C39}\x{C60}-\x{C61}\x{C85}-\x{C8C}\x{C8E}-\x{C90}\x{C92}-\x{CA8}\x{CAA}-\x{CB3}\x{CB5}-\x{CB9}\x{CDE}\x{CE0}-\x{CE1}\x{D05}-\x{D0C}\x{D0E}-\x{D10}\x{D12}-\x{D28}\x{D2A}-\x{D39}\x{D60}-\x{D61}\x{E01}-\x{E2E}\x{E30}\x{E32}-\x{E33}\x{E40}-\x{E45}\x{E81}-\x{E82}\x{E84}\x{E87}-\x{E88}\x{E8A}\x{E8D}\x{E94}-\x{E97}\x{E99}-\x{E9F}\x{EA1}-\x{EA3}\x{EA5}\x{EA7}\x{EAA}-\x{EAB}\x{EAD}-\x{EAE}\x{EB0}\x{EB2}-\x{EB3}\x{EBD}\x{EC0}-\x{EC4}\x{F40}-\x{F47}\x{F49}-\x{F69}\x{10A0}-\x{10C5}\x{10D0}-\x{10F6}\x{1100}\x{1102}-\x{1103}\x{1105}-\x{1107}\x{1109}\x{110B}-\x{110C}
\x{110E}-\x{1112}\x{113C}\x{113E}\x{1140}\x{114C}\x{114E}\x{1150}\x{1154}-\x{1155}\x{1159}\x{115F}-\x{1161}\x{1163}\x{1165}\x{1167}\x{1169}\x{116D}-\x{116E}\x{1172}-\x{1173}\x{1175}\x{119E}\x{11A8}\x{11AB}\x{11AE}-\x{11AF}\x{11B7}-\x{11B8}\x{11BA}\x{11BC}-\x{11C2}\x{11EB}\x{11F0}\x{11F9}\x{1E00}-\x{1E9B}\x{1EA0}-\x{1EF9}\x{1F00}-\x{1F15}\x{1F18}-\x{1F1D}\x{1F20}-\x{1F45}\x{1F48}-\x{1F4D}\x{1F50}-\x{1F57}\x{1F59}\x{1F5B}\x{1F5D}\x{1F5F}-\x{1F7D}\x{1F80}-\x{1FB4}\x{1FB6}-\x{1FBC}\x{1FBE}\x{1FC2}-\x{1FC4}\x{1FC6}-\x{1FCC}\x{1FD0}-\x{1FD3}\x{1FD6}-\x{1FDB}\x{1FE0}-\x{1FEC}\x{1FF2}-\x{1FF4}\x{1FF6}-\x{1FFC}\x{2126}\x{212A}-\x{212B}\x{212E}\x{2180}-\x{2182}\x{3041}-\x{3094}\x{30A1}-\x{30FA}\x{3105}-\x{312C}\x{AC00}-\x{D7A3}\x{4E00}-\x{9FA5}\x{3007}\x{3021}-\x{3029}\x{30}-\x{39}\x{660}-\x{669}\x{6F0}-\x{6F9}\x{966}-\x{96F}\x{9E6}-\x{9EF}\x{A66}-\x{A6F}\x{AE6}-\x{AEF}\x{B66}-\x{B6F}\x{BE7}-\x{BEF}\x{C66}-\x{C6F}\x{CE6}-\x{CEF}\x{D66}-\x{D6F}\x{E50}-\x{E59}\x{ED0}-\x{ED9}\x{F20}-\x{F29}\x{300}-\x{345}\x{360}-\x{361}\x{483}-\x{486}\x{591}-\x{5A1}\x{5A3}-\x{5B9}\x{5BB}-\x{5BD}\x{5BF}\x{5C1}-\x{5C2}\x{5C4}\x{64B}-\x{652}\x{670}\x{6D6}-\x{6DC}\x{6DD}-\x{6DF}\x{6E0}-\x{6E4}\x{6E7}-\x{6E8}\x{6EA}-\x{6ED}\x{901}-\x{903}\x{93C}\x{93E}-\x{94C}\x{94D}\x{951}-\x{954}\x{962}-\x{963}\x{981}-\x{983}\x{9BC}\x{9BE}\x{9BF}\x{9C0}-\x{9C4}\x{9C7}-\x{9C8}\x{9CB}-\x{9CD}\x{9D7}\x{9E2}-\x{9E3}\x{A02}\x{A3C}\x{A3E}\x{A3F}\x{A40}-\x{A42}\x{A47}-\x{A48}\x{A4B}-\x{A4D}\x{A70}-\x{A71}\x{A81}-\x{A83}\x{ABC}\x{ABE}-\x{AC5}\x{AC7}-\x{AC9}\x{ACB}-\x{ACD}\x{B01}-\x{B03}\x{B3C}\x{B3E}-\x{B43}\x{B47}-\x{B48}\x{B4B}-\x{B4D}\x{B56}-\x{B57}\x{B82}-\x{B83}\x{BBE}-\x{BC2}\x{BC6}-\x{BC8}\x{BCA}-\x{BCD}\x{BD7}\x{C01}-\x{C03}\x{C3E}-\x{C44}\x{C46}-\x{C48}\x{C4A}-\x{C4D}\x{C55}-\x{C56}\x{C82}-\x{C83}\x{CBE}-\x{CC4}\x{CC6}-\x{CC8}\x{CCA}-\x{CCD}\x{CD5}-\x{CD6}\x{D02}-\x{D03}\x{D3E}-\x{D43}\x{D46}-\x{D48}\x{D4A}-\x{D4D}\x{D57}\x{E31}\x{E34}-\x{E3A}\x{E47}-\x{E4E}\x{EB1}\x{EB4}-\x{EB9}\x{EBB}-\x{EBC}\x{EC8}-\x{ECD}\x{F1
8}-\x{F19}\x{F35}\x{F37}\x{F39}\x{F3E}\x{F3F}\x{F71}-\x{F84}\x{F86}-\x{F8B}\x{F90}-\x{F95}\x{F97}\x{F99}-\x{FAD}\x{FB1}-\x{FB7}\x{FB9}\x{20D0}-\x{20DC}\x{20E1}\x{302A}-\x{302F}\x{3099}\x{309A}\x{B7}\x{2D0}\x{2D1}\x{387}\x{640}\x{E46}\x{EC6}\x{3005}\x{3031}-\x{3035}\x{309D}-\x{309E}\x{30FC}-\x{30FE}]/u', $name, $m);
foreach (array_unique($m[0], \SORT_STRING) as $c) {
$o = (new UTF8($c))->nextCode();
$esc = "U".str_pad(strtoupper(dechex($o)), 6, "0", \STR_PAD_LEFT);
$name = str_replace($c, $esc, $name);
}
// Apply stricter rules to the first character
if (preg_match('/^[^_\x{41}-\x{5A}\x{61}-\x{7A}\x{C0}-\x{D6}\x{D8}-\x{F6}\x{F8}-\x{FF}\x{100}-\x{131}\x{134}-\x{13E}\x{141}-\x{148}\x{14A}-\x{17E}\x{180}-\x{1C3}\x{1CD}-\x{1F0}\x{1F4}-\x{1F5}\x{1FA}-\x{217}\x{250}-\x{2A8}\x{2BB}-\x{2C1}\x{386}\x{388}-\x{38A}\x{38C}\x{38E}-\x{3A1}\x{3A3}-\x{3CE}\x{3D0}-\x{3D6}\x{3DA}\x{3DC}\x{3DE}\x{3E0}\x{3E2}-\x{3F3}\x{401}-\x{40C}\x{40E}-\x{44F}\x{451}-\x{45C}\x{45E}-\x{481}\x{490}-\x{4C4}\x{4C7}-\x{4C8}\x{4CB}-\x{4CC}\x{4D0}-\x{4EB}\x{4EE}-\x{4F5}\x{4F8}-\x{4F9}\x{531}-\x{556}\x{559}\x{561}-\x{586}\x{5D0}-\x{5EA}\x{5F0}-\x{5F2}\x{621}-\x{63A}\x{641}-\x{64A}\x{671}-\x{6B7}\x{6BA}-\x{6BE}\x{6C0}-\x{6CE}\x{6D0}-\x{6D3}\x{6D5}\x{6E5}-\x{6E6}\x{905}-\x{939}\x{93D}\x{958}-\x{961}\x{985}-\x{98C}\x{98F}-\x{990}\x{993}-\x{9A8}\x{9AA}-\x{9B0}\x{9B2}\x{9B6}-\x{9B9}\x{9DC}-\x{9DD}\x{9DF}-\x{9E1}\x{9F0}-\x{9F1}\x{A05}-\x{A0A}\x{A0F}-\x{A10}\x{A13}-\x{A28}\x{A2A}-\x{A30}\x{A32}-\x{A33}\x{A35}-\x{A36}\x{A38}-\x{A39}\x{A59}-\x{A5C}\x{A5E}\x{A72}-\x{A74}\x{A85}-\x{A8B}\x{A8D}\x{A8F}-\x{A91}\x{A93}-\x{AA8}\x{AAA}-\x{AB0}\x{AB2}-\x{AB3}\x{AB5}-\x{AB9}\x{ABD}\x{AE0}\x{B05}-\x{B0C}\x{B0F}-\x{B10}\x{B13}-\x{B28}\x{B2A}-\x{B30}\x{B32}-\x{B33}\x{B36}-\x{B39}\x{B3D}\x{B5C}-\x{B5D}\x{B5F}-\x{B61}\x{B85}-\x{B8A}\x{B8E}-\x{B90}\x{B92}-\x{B95}\x{B99}-\x{B9A}\x{B9C}\x{B9E}-\x{B9F}\x{BA3}-\x{BA4}\x{BA8}-\x{BAA}\x{BAE}-\x{BB5}\x{BB7}-\x{BB9}\x{C05}-\x{C0C}\x{C0E}-\x{C10}\x{C12}-\x{C28}\x{C2A}-\x{C33}\x{C35}-\x{C39}\x{C60}-\x{C61}\x{C85}-\x{C8C}\x{C8E}-\x{C90}\x{C92}-\x{CA8}\x{CAA}-\x{CB3}\x{CB5}-\x{CB9}\x{CDE}\x{CE0}-\x{CE1}\x{D05}-\x{D0C}\x{D0E}-\x{D10}\x{D12}-\x{D28}\x{D2A}-\x{D39}\x{D60}-\x{D61}\x{E01}-\x{E2E}\x{E30}\x{E32}-\x{E33}\x{E40}-\x{E45}\x{E81}-\x{E82}\x{E84}\x{E87}-\x{E88}\x{E8A}\x{E8D}\x{E94}-\x{E97}\x{E99}-\x{E9F}\x{EA1}-\x{EA3}\x{EA5}\x{EA7}\x{EAA}-\x{EAB}\x{EAD}-\x{EAE}\x{EB0}\x{EB2}-\x{EB3}\x{EBD}\x{EC0}-\x{EC4}\x{F40}-\x{F47}\x{F49}-\x{F69}\x{10A0}-\x{10C5}\x{10D0}-\x{10F6}\x{1100}\x{1102}-\x{1103}\x{1105}-\x{1107}\x{1109}\x{110B}-\x{110C}\x{
110E}-\x{1112}\x{113C}\x{113E}\x{1140}\x{114C}\x{114E}\x{1150}\x{1154}-\x{1155}\x{1159}\x{115F}-\x{1161}\x{1163}\x{1165}\x{1167}\x{1169}\x{116D}-\x{116E}\x{1172}-\x{1173}\x{1175}\x{119E}\x{11A8}\x{11AB}\x{11AE}-\x{11AF}\x{11B7}-\x{11B8}\x{11BA}\x{11BC}-\x{11C2}\x{11EB}\x{11F0}\x{11F9}\x{1E00}-\x{1E9B}\x{1EA0}-\x{1EF9}\x{1F00}-\x{1F15}\x{1F18}-\x{1F1D}\x{1F20}-\x{1F45}\x{1F48}-\x{1F4D}\x{1F50}-\x{1F57}\x{1F59}\x{1F5B}\x{1F5D}\x{1F5F}-\x{1F7D}\x{1F80}-\x{1FB4}\x{1FB6}-\x{1FBC}\x{1FBE}\x{1FC2}-\x{1FC4}\x{1FC6}-\x{1FCC}\x{1FD0}-\x{1FD3}\x{1FD6}-\x{1FDB}\x{1FE0}-\x{1FEC}\x{1FF2}-\x{1FF4}\x{1FF6}-\x{1FFC}\x{2126}\x{212A}-\x{212B}\x{212E}\x{2180}-\x{2182}\x{3041}-\x{3094}\x{30A1}-\x{30FA}\x{3105}-\x{312C}\x{AC00}-\x{D7A3}\x{4E00}-\x{9FA5}\x{3007}\x{3021}-\x{3029}]/u', $name, $m)) {
$c = (string) $m[0];
$o = (new UTF8($c))->nextCode();
$esc = "U".str_pad(strtoupper(dechex($o)), 6, "0", \STR_PAD_LEFT);
$name = $esc.substr($name, strlen($c));
protected static function coerceName(string $name, bool $prefixed = false): string {
if ($prefixed) {
$name = explode(":", $name, 2);
} else {
$name = [$name];
}
return $name;
return implode(":", array_map(function($name) {
// This matches the inverse of the production of Name in XML 1.0 Fourth Edition,
// with the added exclusion of ":" from allowed characters
// See https://www.w3.org/TR/2006/REC-xml-20060816/#NT-NameChar
preg_match_all('/[^_\.\-\x{41}-\x{5A}\x{61}-\x{7A}\x{C0}-\x{D6}\x{D8}-\x{F6}\x{F8}-\x{FF}\x{100}-\x{131}\x{134}-\x{13E}\x{141}-\x{148}\x{14A}-\x{17E}\x{180}-\x{1C3}\x{1CD}-\x{1F0}\x{1F4}-\x{1F5}\x{1FA}-\x{217}\x{250}-\x{2A8}\x{2BB}-\x{2C1}\x{386}\x{388}-\x{38A}\x{38C}\x{38E}-\x{3A1}\x{3A3}-\x{3CE}\x{3D0}-\x{3D6}\x{3DA}\x{3DC}\x{3DE}\x{3E0}\x{3E2}-\x{3F3}\x{401}-\x{40C}\x{40E}-\x{44F}\x{451}-\x{45C}\x{45E}-\x{481}\x{490}-\x{4C4}\x{4C7}-\x{4C8}\x{4CB}-\x{4CC}\x{4D0}-\x{4EB}\x{4EE}-\x{4F5}\x{4F8}-\x{4F9}\x{531}-\x{556}\x{559}\x{561}-\x{586}\x{5D0}-\x{5EA}\x{5F0}-\x{5F2}\x{621}-\x{63A}\x{641}-\x{64A}\x{671}-\x{6B7}\x{6BA}-\x{6BE}\x{6C0}-\x{6CE}\x{6D0}-\x{6D3}\x{6D5}\x{6E5}-\x{6E6}\x{905}-\x{939}\x{93D}\x{958}-\x{961}\x{985}-\x{98C}\x{98F}-\x{990}\x{993}-\x{9A8}\x{9AA}-\x{9B0}\x{9B2}\x{9B6}-\x{9B9}\x{9DC}-\x{9DD}\x{9DF}-\x{9E1}\x{9F0}-\x{9F1}\x{A05}-\x{A0A}\x{A0F}-\x{A10}\x{A13}-\x{A28}\x{A2A}-\x{A30}\x{A32}-\x{A33}\x{A35}-\x{A36}\x{A38}-\x{A39}\x{A59}-\x{A5C}\x{A5E}\x{A72}-\x{A74}\x{A85}-\x{A8B}\x{A8D}\x{A8F}-\x{A91}\x{A93}-\x{AA8}\x{AAA}-\x{AB0}\x{AB2}-\x{AB3}\x{AB5}-\x{AB9}\x{ABD}\x{AE0}\x{B05}-\x{B0C}\x{B0F}-\x{B10}\x{B13}-\x{B28}\x{B2A}-\x{B30}\x{B32}-\x{B33}\x{B36}-\x{B39}\x{B3D}\x{B5C}-\x{B5D}\x{B5F}-\x{B61}\x{B85}-\x{B8A}\x{B8E}-\x{B90}\x{B92}-\x{B95}\x{B99}-\x{B9A}\x{B9C}\x{B9E}-\x{B9F}\x{BA3}-\x{BA4}\x{BA8}-\x{BAA}\x{BAE}-\x{BB5}\x{BB7}-\x{BB9}\x{C05}-\x{C0C}\x{C0E}-\x{C10}\x{C12}-\x{C28}\x{C2A}-\x{C33}\x{C35}-\x{C39}\x{C60}-\x{C61}\x{C85}-\x{C8C}\x{C8E}-\x{C90}\x{C92}-\x{CA8}\x{CAA}-\x{CB3}\x{CB5}-\x{CB9}\x{CDE}\x{CE0}-\x{CE1}\x{D05}-\x{D0C}\x{D0E}-\x{D10}\x{D12}-\x{D28}\x{D2A}-\x{D39}\x{D60}-\x{D61}\x{E01}-\x{E2E}\x{E30}\x{E32}-\x{E33}\x{E40}-\x{E45}\x{E81}-\x{E82}\x{E84}\x{E87}-\x{E88}\x{E8A}\x{E8D}\x{E94}-\x{E97}\x{E99}-\x{E9F}\x{EA1}-\x{EA3}\x{EA5}\x{EA7}\x{EAA}-\x{EAB}\x{EAD}-\x{EAE}\x{EB0}\x{EB2}-\x{EB3}\x{EBD}\x{EC0}-\x{EC4}\x{F40}-\x{F47}\x{F49}-\x{F69}\x{10A0}-\x{10C5}\x{10D0}-\x{10F6}\x{1100}\x{1102}-\x{1103}\x{1105}-\x{1107}\x{1109}\x{110B}-\x{110C}
\x{110E}-\x{1112}\x{113C}\x{113E}\x{1140}\x{114C}\x{114E}\x{1150}\x{1154}-\x{1155}\x{1159}\x{115F}-\x{1161}\x{1163}\x{1165}\x{1167}\x{1169}\x{116D}-\x{116E}\x{1172}-\x{1173}\x{1175}\x{119E}\x{11A8}\x{11AB}\x{11AE}-\x{11AF}\x{11B7}-\x{11B8}\x{11BA}\x{11BC}-\x{11C2}\x{11EB}\x{11F0}\x{11F9}\x{1E00}-\x{1E9B}\x{1EA0}-\x{1EF9}\x{1F00}-\x{1F15}\x{1F18}-\x{1F1D}\x{1F20}-\x{1F45}\x{1F48}-\x{1F4D}\x{1F50}-\x{1F57}\x{1F59}\x{1F5B}\x{1F5D}\x{1F5F}-\x{1F7D}\x{1F80}-\x{1FB4}\x{1FB6}-\x{1FBC}\x{1FBE}\x{1FC2}-\x{1FC4}\x{1FC6}-\x{1FCC}\x{1FD0}-\x{1FD3}\x{1FD6}-\x{1FDB}\x{1FE0}-\x{1FEC}\x{1FF2}-\x{1FF4}\x{1FF6}-\x{1FFC}\x{2126}\x{212A}-\x{212B}\x{212E}\x{2180}-\x{2182}\x{3041}-\x{3094}\x{30A1}-\x{30FA}\x{3105}-\x{312C}\x{AC00}-\x{D7A3}\x{4E00}-\x{9FA5}\x{3007}\x{3021}-\x{3029}\x{30}-\x{39}\x{660}-\x{669}\x{6F0}-\x{6F9}\x{966}-\x{96F}\x{9E6}-\x{9EF}\x{A66}-\x{A6F}\x{AE6}-\x{AEF}\x{B66}-\x{B6F}\x{BE7}-\x{BEF}\x{C66}-\x{C6F}\x{CE6}-\x{CEF}\x{D66}-\x{D6F}\x{E50}-\x{E59}\x{ED0}-\x{ED9}\x{F20}-\x{F29}\x{300}-\x{345}\x{360}-\x{361}\x{483}-\x{486}\x{591}-\x{5A1}\x{5A3}-\x{5B9}\x{5BB}-\x{5BD}\x{5BF}\x{5C1}-\x{5C2}\x{5C4}\x{64B}-\x{652}\x{670}\x{6D6}-\x{6DC}\x{6DD}-\x{6DF}\x{6E0}-\x{6E4}\x{6E7}-\x{6E8}\x{6EA}-\x{6ED}\x{901}-\x{903}\x{93C}\x{93E}-\x{94C}\x{94D}\x{951}-\x{954}\x{962}-\x{963}\x{981}-\x{983}\x{9BC}\x{9BE}\x{9BF}\x{9C0}-\x{9C4}\x{9C7}-\x{9C8}\x{9CB}-\x{9CD}\x{9D7}\x{9E2}-\x{9E3}\x{A02}\x{A3C}\x{A3E}\x{A3F}\x{A40}-\x{A42}\x{A47}-\x{A48}\x{A4B}-\x{A4D}\x{A70}-\x{A71}\x{A81}-\x{A83}\x{ABC}\x{ABE}-\x{AC5}\x{AC7}-\x{AC9}\x{ACB}-\x{ACD}\x{B01}-\x{B03}\x{B3C}\x{B3E}-\x{B43}\x{B47}-\x{B48}\x{B4B}-\x{B4D}\x{B56}-\x{B57}\x{B82}-\x{B83}\x{BBE}-\x{BC2}\x{BC6}-\x{BC8}\x{BCA}-\x{BCD}\x{BD7}\x{C01}-\x{C03}\x{C3E}-\x{C44}\x{C46}-\x{C48}\x{C4A}-\x{C4D}\x{C55}-\x{C56}\x{C82}-\x{C83}\x{CBE}-\x{CC4}\x{CC6}-\x{CC8}\x{CCA}-\x{CCD}\x{CD5}-\x{CD6}\x{D02}-\x{D03}\x{D3E}-\x{D43}\x{D46}-\x{D48}\x{D4A}-\x{D4D}\x{D57}\x{E31}\x{E34}-\x{E3A}\x{E47}-\x{E4E}\x{EB1}\x{EB4}-\x{EB9}\x{EBB}-\x{EBC}\x{EC8}-\x{ECD}\x{F1
8}-\x{F19}\x{F35}\x{F37}\x{F39}\x{F3E}\x{F3F}\x{F71}-\x{F84}\x{F86}-\x{F8B}\x{F90}-\x{F95}\x{F97}\x{F99}-\x{FAD}\x{FB1}-\x{FB7}\x{FB9}\x{20D0}-\x{20DC}\x{20E1}\x{302A}-\x{302F}\x{3099}\x{309A}\x{B7}\x{2D0}\x{2D1}\x{387}\x{640}\x{E46}\x{EC6}\x{3005}\x{3031}-\x{3035}\x{309D}-\x{309E}\x{30FC}-\x{30FE}]/u', $name, $m);
foreach (array_unique($m[0], \SORT_STRING) as $c) {
$o = (new UTF8($c))->nextCode();
$esc = "U".str_pad(strtoupper(dechex($o)), 6, "0", \STR_PAD_LEFT);
$name = str_replace($c, $esc, $name);
}
// Apply stricter rules to the first character
if (preg_match('/^[^_\x{41}-\x{5A}\x{61}-\x{7A}\x{C0}-\x{D6}\x{D8}-\x{F6}\x{F8}-\x{FF}\x{100}-\x{131}\x{134}-\x{13E}\x{141}-\x{148}\x{14A}-\x{17E}\x{180}-\x{1C3}\x{1CD}-\x{1F0}\x{1F4}-\x{1F5}\x{1FA}-\x{217}\x{250}-\x{2A8}\x{2BB}-\x{2C1}\x{386}\x{388}-\x{38A}\x{38C}\x{38E}-\x{3A1}\x{3A3}-\x{3CE}\x{3D0}-\x{3D6}\x{3DA}\x{3DC}\x{3DE}\x{3E0}\x{3E2}-\x{3F3}\x{401}-\x{40C}\x{40E}-\x{44F}\x{451}-\x{45C}\x{45E}-\x{481}\x{490}-\x{4C4}\x{4C7}-\x{4C8}\x{4CB}-\x{4CC}\x{4D0}-\x{4EB}\x{4EE}-\x{4F5}\x{4F8}-\x{4F9}\x{531}-\x{556}\x{559}\x{561}-\x{586}\x{5D0}-\x{5EA}\x{5F0}-\x{5F2}\x{621}-\x{63A}\x{641}-\x{64A}\x{671}-\x{6B7}\x{6BA}-\x{6BE}\x{6C0}-\x{6CE}\x{6D0}-\x{6D3}\x{6D5}\x{6E5}-\x{6E6}\x{905}-\x{939}\x{93D}\x{958}-\x{961}\x{985}-\x{98C}\x{98F}-\x{990}\x{993}-\x{9A8}\x{9AA}-\x{9B0}\x{9B2}\x{9B6}-\x{9B9}\x{9DC}-\x{9DD}\x{9DF}-\x{9E1}\x{9F0}-\x{9F1}\x{A05}-\x{A0A}\x{A0F}-\x{A10}\x{A13}-\x{A28}\x{A2A}-\x{A30}\x{A32}-\x{A33}\x{A35}-\x{A36}\x{A38}-\x{A39}\x{A59}-\x{A5C}\x{A5E}\x{A72}-\x{A74}\x{A85}-\x{A8B}\x{A8D}\x{A8F}-\x{A91}\x{A93}-\x{AA8}\x{AAA}-\x{AB0}\x{AB2}-\x{AB3}\x{AB5}-\x{AB9}\x{ABD}\x{AE0}\x{B05}-\x{B0C}\x{B0F}-\x{B10}\x{B13}-\x{B28}\x{B2A}-\x{B30}\x{B32}-\x{B33}\x{B36}-\x{B39}\x{B3D}\x{B5C}-\x{B5D}\x{B5F}-\x{B61}\x{B85}-\x{B8A}\x{B8E}-\x{B90}\x{B92}-\x{B95}\x{B99}-\x{B9A}\x{B9C}\x{B9E}-\x{B9F}\x{BA3}-\x{BA4}\x{BA8}-\x{BAA}\x{BAE}-\x{BB5}\x{BB7}-\x{BB9}\x{C05}-\x{C0C}\x{C0E}-\x{C10}\x{C12}-\x{C28}\x{C2A}-\x{C33}\x{C35}-\x{C39}\x{C60}-\x{C61}\x{C85}-\x{C8C}\x{C8E}-\x{C90}\x{C92}-\x{CA8}\x{CAA}-\x{CB3}\x{CB5}-\x{CB9}\x{CDE}\x{CE0}-\x{CE1}\x{D05}-\x{D0C}\x{D0E}-\x{D10}\x{D12}-\x{D28}\x{D2A}-\x{D39}\x{D60}-\x{D61}\x{E01}-\x{E2E}\x{E30}\x{E32}-\x{E33}\x{E40}-\x{E45}\x{E81}-\x{E82}\x{E84}\x{E87}-\x{E88}\x{E8A}\x{E8D}\x{E94}-\x{E97}\x{E99}-\x{E9F}\x{EA1}-\x{EA3}\x{EA5}\x{EA7}\x{EAA}-\x{EAB}\x{EAD}-\x{EAE}\x{EB0}\x{EB2}-\x{EB3}\x{EBD}\x{EC0}-\x{EC4}\x{F40}-\x{F47}\x{F49}-\x{F69}\x{10A0}-\x{10C5}\x{10D0}-\x{10F6}\x{1100}\x{1102}-\x{1103}\x{1105}-\x{1107}\x{1109}\x{110B}-\x{110C}\x{
110E}-\x{1112}\x{113C}\x{113E}\x{1140}\x{114C}\x{114E}\x{1150}\x{1154}-\x{1155}\x{1159}\x{115F}-\x{1161}\x{1163}\x{1165}\x{1167}\x{1169}\x{116D}-\x{116E}\x{1172}-\x{1173}\x{1175}\x{119E}\x{11A8}\x{11AB}\x{11AE}-\x{11AF}\x{11B7}-\x{11B8}\x{11BA}\x{11BC}-\x{11C2}\x{11EB}\x{11F0}\x{11F9}\x{1E00}-\x{1E9B}\x{1EA0}-\x{1EF9}\x{1F00}-\x{1F15}\x{1F18}-\x{1F1D}\x{1F20}-\x{1F45}\x{1F48}-\x{1F4D}\x{1F50}-\x{1F57}\x{1F59}\x{1F5B}\x{1F5D}\x{1F5F}-\x{1F7D}\x{1F80}-\x{1FB4}\x{1FB6}-\x{1FBC}\x{1FBE}\x{1FC2}-\x{1FC4}\x{1FC6}-\x{1FCC}\x{1FD0}-\x{1FD3}\x{1FD6}-\x{1FDB}\x{1FE0}-\x{1FEC}\x{1FF2}-\x{1FF4}\x{1FF6}-\x{1FFC}\x{2126}\x{212A}-\x{212B}\x{212E}\x{2180}-\x{2182}\x{3041}-\x{3094}\x{30A1}-\x{30FA}\x{3105}-\x{312C}\x{AC00}-\x{D7A3}\x{4E00}-\x{9FA5}\x{3007}\x{3021}-\x{3029}]/u', $name, $m)) {
$c = (string) $m[0];
$o = (new UTF8($c))->nextCode();
$esc = "U".str_pad(strtoupper(dechex($o)), 6, "0", \STR_PAD_LEFT);
$name = $esc.substr($name, strlen($c));
}
return $name;
}, $name));
}
protected function uncoerceName(string $name): string {
protected static function uncoerceName(string $name): string {
preg_match_all('/U[0-9A-F]{6}/', $name, $m);
foreach (array_unique($m[0], \SORT_STRING) as $o) {
$c = UTF8::encode(hexdec(substr($o, 1)));
@ -59,7 +66,7 @@ trait NameCoercion {
return $name;
}
protected function escapeString(string $string, bool $attribute = false): string {
protected static function escapeString(string $string, bool $attribute = false): string {
# Escaping a string (for the purposes of the algorithm above) consists of
# running the following steps:

4
lib/Parser/ParseErrorEmitter.php

@ -78,7 +78,7 @@ trait ParseErrorEmitter {
// Count the number of replacements needed in the message.
$count = substr_count($message, '%s');
// If the number of replacements don't match the arguments then oops.
assert(count($arg) === $count, new \Exception("Parse error message expects $count parameters"));
assert(count($arg) === $count, new \Exception("Message of parse error $code expects $count parameters"));
if ($count > 0) {
// Convert newlines and tabs in the arguments to words to better
@ -89,7 +89,7 @@ trait ParseErrorEmitter {
} elseif ($value === "\t") {
return 'Tab';
} elseif ($value === null) {
return 'nothing';
return 'nothing'; // @codeCoverageIgnore
} else {
return $value;
}

232
lib/Parser/Serializer.php

@ -0,0 +1,232 @@
<?php
/** @license MIT
* Copyright 2017, Dustin Wilson, J. King et al.
* See LICENSE and AUTHORS files for details */
declare(strict_types=1);
namespace MensBeam\HTML\Parser;
use MensBeam\HTML\Parser;
abstract class Serializer {
use NameCoercion;
protected const VOID_ELEMENTS = ["basefont", "bgsound", "frame", "keygen", "area", "base", "br", "col", "embed", "hr", "img", "input", "link", "meta", "param", "source", "track", "wbr"];
protected const RAWTEXT_ELEMENTS = ["style", "script", "xmp", "iframe", "noembed", "noframes", "plaintext"];
/** Serializes an HTML DOM node to a string. This is equivalent to the outerHTML getter
*
* @param \DOMDocument|\DOMElement|\DOMText|\DOMComment|\DOMProcessingInstruction|\DOMDocumentFragment|\DOMDocumentType $node The node to serialize
*/
public static function serializeOuter(\DOMNode $node): string {
$s = "";
$stack = [];
$n = $node;
do {
# If current node is an Element
if ($n instanceof \DOMElement) {
# If current node is an element in the HTML namespace,
# the MathML namespace, or the SVG namespace, then let
# tagname be current node's local name. Otherwise, let
# tagname be current node's qualified name.
if (in_array($n->namespaceURI ?? Parser::HTML_NAMESPACE, [Parser::HTML_NAMESPACE, Parser::SVG_NAMESPACE, Parser::MATHML_NAMESPACE])) {
$tagName = self::uncoerceName($n->localName);
} else {
$tagName = self::uncoerceName($n->tagName);
}
# Append a U+003C LESS-THAN SIGN character (<), followed by tagname.
$s .= "<$tagName";
# If current node's is value is not null, and the element does
# not have an is attribute in its attribute list, then
# append the string " is="", followed by current node's is
# value escaped as described below in attribute mode,
# followed by a U+0022 QUOTATION MARK character (").
// DEVIATION: We don't support custom elements
# For each attribute that the element has, append a
# U+0020 SPACE character, the attribute's serialized name as
# described below, a U+003D EQUALS SIGN character (=), a
# U+0022 QUOTATION MARK character ("), the attribute's
# value, escaped as described below in attribute mode, and
# a second U+0022 QUOTATION MARK character (").
foreach ($n->attributes as $a) {
# An attribute's serialized name for the purposes of the previous
# paragraph must be determined as follows:
# If the attribute has no namespace
if ($a->namespaceURI === null) {
# The attribute's serialized name is the attribute's local name.
$name = self::uncoerceName($a->localName);
}
# If the attribute is in the XML namespace
elseif ($a->namespaceURI === Parser::XML_NAMESPACE) {
# The attribute's serialized name is the string "xml:" followed
# by the attribute's local name.
$name = "xml:".self::uncoerceName($a->localName);
}
# If the attribute is in the XMLNS namespace...
elseif ($a->namespaceURI === Parser::XMLNS_NAMESPACE) {
# ... and the attribute's local name is xmlns
if ($a->localName === "xmlns") {
# The attribute's serialized name is the string "xmlns".
$name = "xmlns";
}
# ... and the attribute's local name is not xmlns
else {
# The attribute's serialized name is the string "xmlns:"
# followed by the attribute's local name.
$name = "xmlns:".self::uncoerceName($a->localName);
}
}
# If the attribute is in the XLink namespace
elseif ($a->namespaceURI === Parser::XLINK_NAMESPACE) {
# The attribute's serialized name is the string "xlink:"
# followed by the attribute's local name.
$name = "xlink:".self::uncoerceName($a->localName);
}
# If the attribute is in some other namespace
else {
# The attribute's serialized name is the attribute's qualified name.
$name = ($a->prefix !== "") ? $a->prefix.":".$a->name : $a->name;
}
$value = self::escapeString((string) $a->value, true);
$s .= " $name=\"$value\"";
}
# Append a U+003E GREATER-THAN SIGN character (>).
$s .= ">";
# If current node serializes as void, then continue on to the
# next child node at this point.
# Append the value of running the HTML fragment serialization
# algorithm on the current node element (thus recursing into
# this algorithm for that element), followed by a
# U+003C LESS-THAN SIGN character (<), a U+002F SOLIDUS
# character (/), tagname again, and finally a
# U+003E GREATER-THAN SIGN character (>).
if (($n->namespaceURI ?? Parser::HTML_NAMESPACE) !== Parser::HTML_NAMESPACE || !in_array($tagName, self::VOID_ELEMENTS)) {
# If the node is a template element, then let the node instead
# be the template element's template contents
# (a DocumentFragment node).
if (
($n->namespaceURI ?? Parser::HTML_NAMESPACE) === Parser::HTML_NAMESPACE
&& $n->tagName === "template"
&& property_exists($n, "content")
&& $n->content instanceof \DOMDocumentFragment
) {
// NOTE: Treat template content as any other document
// fragment and just invoke the inner serializer
$s .= self::serializeInner($n->content)."</$tagName>";
} elseif ($n->hasChildNodes()) {
// If the element has children, store its tag name and
// continue the loop with its first child; its end
// tag will be written out further down
$stack[] = $tagName;
$n = $n->firstChild;
continue;
} else {
// Otherwise just append the end tag now
$s .= "</$tagName>";
}
}
}
# If current node is a Text node
elseif ($n instanceof \DOMText) {
# If the parent of current node is a style, script, xmp,
# iframe, noembed, noframes, or plaintext element, or
# if the parent of current node is a noscript element
# and scripting is enabled for the node, then append
# the value of current node's data IDL attribute literally.
$p = $n->parentNode;
if ($p instanceof \DOMElement && ($p->namespaceURI ?? Parser::HTML_NAMESPACE) === Parser::HTML_NAMESPACE && in_array($p->tagName, self::RAWTEXT_ELEMENTS)) {
// NOTE: scripting is assumed not to be enabled
$s .= $n->data;
}
# Otherwise, append the value of current node's data IDL attribute, escaped as described below.
else {
$s .= self::escapeString($n->data);
}
}
# If current node is a Comment
elseif ($n instanceof \DOMComment) {
# Append the literal string "<!--" (U+003C LESS-THAN SIGN,
# U+0021 EXCLAMATION MARK, U+002D HYPHEN-MINUS,
# U+002D HYPHEN-MINUS), followed by the value of current
# node's data IDL attribute, followed by the literal
# string "-->" (U+002D HYPHEN-MINUS, U+002D HYPHEN-MINUS,
# U+003E GREATER-THAN SIGN).
$s .= "<!--".$n->data."-->";
}
# If current node is a ProcessingInstruction
elseif ($n instanceof \DOMProcessingInstruction) {
# Append the literal string "<?" (U+003C LESS-THAN SIGN,
# U+003F QUESTION MARK), followed by the value of
# current node's target IDL attribute, followed by a
# single U+0020 SPACE character, followed by the value
# of current node's data IDL attribute, followed by a
# single U+003E GREATER-THAN SIGN character (>).
$s .= "<?".self::uncoerceName($n->target)." ".$n->data.">";
}
# If current node is a DocumentType
elseif ($n instanceof \DOMDocumentType) {
# Append the literal string "<!DOCTYPE" (U+003C LESS-THAN SIGN,
# U+0021 EXCLAMATION MARK, U+0044 LATIN CAPITAL LETTER D,
# U+004F LATIN CAPITAL LETTER O, U+0043 LATIN CAPITAL LETTER C,
# U+0054 LATIN CAPITAL LETTER T, U+0059 LATIN CAPITAL LETTER Y,
# U+0050 LATIN CAPITAL LETTER P, U+0045 LATIN CAPITAL LETTER E),
# followed by a space (U+0020 SPACE), followed by the value
# of current node's name IDL attribute, followed by the
# literal string ">" (U+003E GREATER-THAN SIGN).
$s .= "<!DOCTYPE ".trim($n->name).">";
}
// NOTE: Documents and document fragments have no outer content,
// so we can just serialize the inner content
elseif ($n instanceof \DOMDocument || $n instanceof \DOMDocumentFragment) {
return self::serializeInner($n);
} else {
throw new Exception(Exception::UNSUPPORTED_NODE_TYPE, [get_class($n)]);
}
// If the current node has no more siblings, go up the tree till a
// sibling is found or we've reached the original node
while (!$n->nextSibling && $stack) {
// Write out the stored end tag each time we go up the tree
$tagName = array_pop($stack);
$s .= "</$tagName>";
$n = $n->parentNode;
}
$n = $n->nextSibling;
} while ($stack); // Loop until we have traversed the subtree of the target node in full
return $s;
}
/** Serializes the children of an HTML DOM node to a string. This is equivalent to the innerHTML getter
 *
 * Void HTML elements always yield an empty string, and an HTML template
 * element which carries a "content" document fragment has the children of
 * that fragment serialized in place of its own children
 *
 * @param \DOMDocument|\DOMElement|\DOMDocumentFragment $node The node to serialize
 * @throws Exception if the node is not a document, element, or document fragment
 */
public static function serializeInner(\DOMNode $node): string {
    // NOTE: elements without a namespace are treated as HTML elements
    if ($node instanceof \DOMElement && ($node->namespaceURI ?? Parser::HTML_NAMESPACE) === Parser::HTML_NAMESPACE) {
        # If the node serializes as void, then return the empty string.
        // NOTE: the comparison is strict so that tag names are only ever matched as strings
        if (in_array($node->tagName, self::VOID_ELEMENTS, true)) {
            return "";
        }
        # If the node is a template element, then let the node instead
        # be the template element's template contents
        # (a DocumentFragment node).
        if ($node->tagName === "template" && property_exists($node, "content") && $node->content instanceof \DOMDocumentFragment) {
            // NOTE: template elements won't necessarily have a content
            // property because PHP's DOM does not support this natively
            $node = $node->content;
        }
    }
    // Only nodes which can contain children are acceptable
    if (!($node instanceof \DOMElement || $node instanceof \DOMDocument || $node instanceof \DOMDocumentFragment)) {
        throw new Exception(Exception::UNSUPPORTED_NODE_TYPE, [get_class($node)]);
    }
    # Let s be a string, and initialize it to the empty string.
    $s = "";
    # For each child node of the node, in tree order, run the following steps:
    // NOTE: the steps in question are implemented in the "serializeOuter" routine
    foreach ($node->childNodes as $n) {
        $s .= self::serializeOuter($n);
    }
    return $s;
}
}

116
lib/Parser/TreeConstructor.php

@ -9,7 +9,7 @@ namespace MensBeam\HTML\Parser;
use MensBeam\HTML\Parser;
class TreeConstructor {
use ParseErrorEmitter, NameCoercion;
use ParseErrorEmitter, NameCoercion, AttributeSetter;
public $debugLog = "";
@ -335,12 +335,6 @@ class TreeConstructor {
$iterations = 0;
$insertionMode = $this->insertionMode;
// If element name coercison has occurred at some earlier point,
// we must coerce all end tag names to match mangled start tags
if ($token instanceof EndTagToken && $this->mangledElements) {
$token->name = $this->coerceName($token->name);
}
# 13.2.6 Tree construction
#
# As each token is emitted from the tokenizer, the user agent must follow the
@ -379,6 +373,12 @@ class TreeConstructor {
return true;
})());
// If element name coercion has occurred at some earlier point,
// we must coerce all end tag names to match mangled start tags
if ($this->mangledElements && $token instanceof EndTagToken) {
$token->name = self::coerceName($token->name);
}
# 13.2.6.4. The rules for parsing tokens in HTML content
// OPTIMIZATION: Evaluating the "in body" mode first is
// faster for typical documents
@ -401,7 +401,7 @@ class TreeConstructor {
// If attribute name coercion has occurred at some earlier point,
// we must coerce all attributes on html and body start tags in
// case they are relocated to existing elements
$attrName = $this->mangledAttributes ? $this->coerceName($a->name) : $a->name;
$attrName = $this->mangledAttributes ? self::coerceName($a->name) : $a->name;
if (!$top->hasAttributeNS(null, $attrName)) {
$this->elementSetAttribute($top, null, $attrName, $a->value);
}
@ -433,7 +433,7 @@ class TreeConstructor {
// If attribute name coercion has occurred at some earlier point,
// we must coerce all attributes on html and body start tags in
// case they are relocated to existing elements
$attrName = $this->mangledAttributes ? $this->coerceName($a->name) : $a->name;
$attrName = $this->mangledAttributes ? self::coerceName($a->name) : $a->name;
if (!$body->hasAttributeNS(null, $attrName)) {
$this->elementSetAttribute($body, null, $attrName, $a->value);
}
@ -521,7 +521,8 @@ class TreeConstructor {
if (strlen($nextToken->data) === 1 && $nextToken->data === "\n") {
continue;
} elseif (strpos($nextToken->data, "\n") === 0) {
$nextToken->data = substr($nextToken->data, 1);
// NOTE: This case is not currently encountered by the parser due to special handling of newlines
$nextToken->data = substr($nextToken->data, 1); // @codeCoverageIgnore
}
}
// Process the next token
@ -818,7 +819,8 @@ class TreeConstructor {
if (strlen($nextToken->data) === 1 && $nextToken->data === "\n") {
continue;
} elseif (strpos($nextToken->data, "\n") === 0) {
$nextToken->data = substr($nextToken->data, 1);
// NOTE: This case is not currently encountered by the parser due to special handling of newlines
$nextToken->data = substr($nextToken->data, 1); // @codeCoverageIgnore
}
}
# Let the original insertion mode be the current insertion mode.
@ -1065,7 +1067,7 @@ class TreeConstructor {
else {
# 1. If the stack of open elements does not have a form element in scope, then
# this is a parse error; return and ignore the token.
if ($this->stack->hasElementInScope('form')) {
if (!$this->stack->hasElementInScope('form')) {
$this->error(ParseError::UNEXPECTED_END_TAG, $token->name);
continue;
}
@ -1918,10 +1920,6 @@ class TreeConstructor {
elseif ($token->name === 'noframes' || $token->name === 'style') {
$this->parseGenericRawText($token);
}
elseif ($token->name === 'noscript') {
$this->insertStartTagToken($token);
$this->insertionMode = self::IN_HEAD_NOSCRIPT_MODE;
}
elseif ($token->name === 'script') {
$this->insertStartTagToken($token);
$this->tokenizer->state = Tokenizer::SCRIPT_DATA_STATE;
@ -2714,7 +2712,9 @@ class TreeConstructor {
# element in table scope, then this is a parse error;
# ignore the token. (fragment case)
if (!$this->stack->hasElementInTableScope("td", "th")) {
$this->error(ParseError::UNEXPECTED_START_TAG, $token->name);
// NOTE: This case appears to be unreachable
// See https://github.com/whatwg/html/issues/7242
$this->error(ParseError::UNEXPECTED_START_TAG, $token->name); //@codeCoverageIgnore
}
# Otherwise, close the cell (see below) and reprocess the token.
else {
@ -2861,7 +2861,7 @@ class TreeConstructor {
# An end tag...
elseif ($token instanceof EndTagToken) {
# An end tag whose tag name is "template"
if ($token->name === "tenplate") {
if ($token->name === "template") {
# Process the token using the rules for the "in head" insertion mode.
$insertionMode = self::IN_HEAD_MODE;
goto ProcessToken;
@ -3262,10 +3262,8 @@ class TreeConstructor {
# Anything else
else {
# Parse error. Ignore the token.
assert($token instanceof CharacterToken || $token instanceof TagToken, new \Exception("Invalid token class: ".get_class($token)));
if ($token instanceof StartTagToken) {
$this->error(ParseError::UNEXPECTED_START_TAG, $token->name);
} elseif ($token instanceof EndTagToken) {
assert($token instanceof CharacterToken || $token instanceof EndTagToken, new \Exception("Invalid token class: ".get_class($token)));
if ($token instanceof EndTagToken) {
$this->error(ParseError::UNEXPECTED_END_TAG, $token->name);
} elseif ($token instanceof CharacterToken) {
$this->error(ParseError::UNEXPECTED_CHAR, $token->data, "exclude whitespace");
@ -3434,6 +3432,12 @@ class TreeConstructor {
return true;
})());
// If element name coercion has occurred at some earlier point,
// we must coerce all end tag names to match mangled start tags
if ($this->mangledElements && $token instanceof EndTagToken) {
$token->name = self::coerceName($token->name, true);
}
# 13.2.6.5 The rules for parsing tokens in foreign content
#
# When the user agent is to apply the rules for parsing tokens in foreign
@ -3587,7 +3591,8 @@ class TreeConstructor {
$node = $this->stack[$pos];
# If node's tag name, converted to ASCII lowercase, is not the
# same as the tag name of the token, then this is a parse error.
if (strtolower($node->nodeName) !== $token->name) {
$nodeName = self::coerceName(strtolower(self::uncoerceName($node->nodeName)), true);
if ($nodeName !== $token->name) {
$this->error(ParseError::UNEXPECTED_END_TAG, $token->name);
}
do {
@ -3598,7 +3603,8 @@ class TreeConstructor {
# If node's tag name, converted to ASCII lowercase, is the same as the
# tag name of the token, pop elements from the stack of open elements until node
# has been popped from the stack, and then abort these steps.
if (strtolower($node->nodeName) === $token->name) {
$nodeName = self::coerceName(strtolower(self::uncoerceName($node->nodeName)), true);
if ($nodeName === $token->name) {
$this->stack->popUntilSame($node);
continue 2;
}
@ -3673,14 +3679,8 @@ class TreeConstructor {
// NOTE: The "entry above" refers to the "in body" insertion mode
// Changes here should be mirrored there
foreach ($this->stack as $node) {
if ($node->nodeName === $token->name && $node->namespaceURI === $this->htmlNamespace) {
$this->stack->generateImpliedEndTags($token->name);
if (!$node->isSameNode($this->stack->currentNode)) {
$this->error($errorCode, $token->name);
}
$this->stack->popUntilSame($node);
return;
} elseif ($this->isElementSpecial($node)) {
// NOTE: Only the "is special" case is possible here
if ($this->isElementSpecial($node)) {
$this->error($errorCode, $token->name);
return;
}
@ -3890,12 +3890,15 @@ class TreeConstructor {
// Abort!
}
else {
// NOTE: This is an edge case only possible via scripting
// @codeCoverageIgnoreStart
# 6. Let previous element be the element immediately above last table in the
# stack of open elements.
$previousElement = $this->stack[$lastTableIndex - 1];
# 7. Let adjusted insertion location be inside previous element, after its last
# child (if any).
$insertionLocation = $previousElement;
// @codeCoverageIgnoreEnd
}
}
# Otherwise let adjusted insertion location be inside target, after its last
@ -4150,7 +4153,7 @@ class TreeConstructor {
# 17. Let node now be the node before node in the stack of open elements.
# 18. Return to the step labeled Loop.
}
}
} // @codeCoverageIgnore
protected function closePElement(TagToken $token) {
# When the steps above say the UA is to close a p element, it means that the UA
@ -4216,11 +4219,7 @@ class TreeConstructor {
// The element name is invalid for XML
// Replace any offending characters with "UHHHHHH" where H are the
// uppercase hexadecimal digits of the character's code point
if ($namespace !== $this->htmlNamespace) {
$qualifiedName = implode(":", array_map([$this, "coerceName"], explode(":", $token->name, 2)));
} else {
$qualifiedName = $this->coerceName($token->name);
}
$qualifiedName = self::coerceName($token->name, ($namespace !== $this->htmlNamespace));
$element = $this->DOM->createElementNS($namespace, $qualifiedName);
$this->mangledElements = true;
}
@ -4246,47 +4245,6 @@ class TreeConstructor {
return $element;
}
public function elementSetAttribute(\DOMElement $element, ?string $namespaceURI, string $qualifiedName, string $value): void {
if ($namespaceURI === Parser::XMLNS_NAMESPACE) {
// NOTE: We create attribute nodes so that xmlns attributes
// don't get lost; otherwise they cannot be serialized
try {
$a = @$element->ownerDocument->createAttributeNS($namespaceURI, $qualifiedName);
} catch (\DOMException $e) {
// FIXME: PHP has a fit here if the document element has a namespace and no prefix
// A workaround does not seem to exist
return;
}
if ($a === false) {
// The document element does not exist yet, so we need
// to insert this element into the document
$element->ownerDocument->appendChild($element);
$a = $element->ownerDocument->createAttributeNS($namespaceURI, $qualifiedName);
$element->ownerDocument->removeChild($element);
}
$a->value = $this->escapeString($value, true);
$element->setAttributeNodeNS($a);
} else {
try {
$element->setAttributeNS($namespaceURI, $qualifiedName, $value);
} catch (\DOMException $e) {
// The attribute name is invalid for XML
// Replace any offending characters with "UHHHHHH" where H are the
// uppercase hexadecimal digits of the character's code point
if ($namespaceURI !== null) {
$qualifiedName = implode(":", array_map([$this, "coerceName"], explode(":", $qualifiedName, 2)));
} else {
$qualifiedName = $this->coerceName($qualifiedName);
}
$element->setAttributeNS($namespaceURI, $qualifiedName, $value);
$this->mangledAttributes = true;
}
if ($qualifiedName === "id" && $namespaceURI === null) {
$element->setIdAttribute($qualifiedName, true);
}
}
}
/** Reports whether an element is a MathML text integration point
 *
 * @param \DOMElement $e The element to examine
 * @return bool True if the element is in the MathML namespace and its node name is one of "mi", "mo", "mn", "ms", or "mtext"
 */
public function isMathMLTextIntegrationPoint(\DOMElement $e): bool {
    // NOTE: the name comparison is strict so that names are only ever matched as strings
    return $e->namespaceURI === Parser::MATHML_NAMESPACE
        && in_array($e->nodeName, ['mi', 'mo', 'mn', 'ms', 'mtext'], true);
}

2
lib/Parser/ctype.php

@ -11,6 +11,7 @@ namespace MensBeam\HTML\Parser;
// replacement, as they are designed only to evaluate
// single characters
// @codeCoverageIgnoreStart
if (!extension_loaded("ctype")) {
function ctype_alnum(string $str): bool {
return ["a"=>true,"b"=>true,"c"=>true,"d"=>true,"e"=>true,"f"=>true,"g"=>true,"h"=>true,"i"=>true,"j"=>true,"k"=>true,"l"=>true,"m"=>true,"n"=>true,"o"=>true,"p"=>true,"q"=>true,"r"=>true,"s"=>true,"t"=>true,"u"=>true,"v"=>true,"w"=>true,"x"=>true,"y"=>true,"z"=>true,"A"=>true,"B"=>true,"C"=>true,"D"=>true,"E"=>true,"F"=>true,"G"=>true,"H"=>true,"I"=>true,"J"=>true,"K"=>true,"L"=>true,"M"=>true,"N"=>true,"O"=>true,"P"=>true,"Q"=>true,"R"=>true,"S"=>true,"T"=>true,"U"=>true,"V"=>true,"W"=>true,"X"=>true,"Y"=>true,"Z"=>true,"0"=>true,"1"=>true,"2"=>true,"3"=>true,"4"=>true,"5"=>true,"6"=>true,"7"=>true,"8"=>true,"9"=>true][$str] ?? false;
@ -32,3 +33,4 @@ if (!extension_loaded("ctype")) {
return ["a"=>true,"b"=>true,"c"=>true,"d"=>true,"e"=>true,"f"=>true,"A"=>true,"B"=>true,"C"=>true,"D"=>true,"E"=>true,"F"=>true,"0"=>true,"1"=>true,"2"=>true,"3"=>true,"4"=>true,"5"=>true,"6"=>true,"7"=>true,"8"=>true,"9"=>true][$str] ?? false;
}
}
// @codeCoverageIgnoreEnd

22
tests/cases/TestCharset.php

@ -117,6 +117,28 @@ class TestCharset extends \PHPUnit\Framework\TestCase {
$this->assertSame($exp, $act->encoding);
}
/**
 * Parses the supplied data with a given prescan length and fallback
 * encoding, then checks the encoding reported by the resulting document
 *
 * @dataProvider provideNonstandardDeclarationTests
 * @covers \MensBeam\HTML\Parser\Data::__construct
 */
public function testNonstandardDeclarationTests(string $data, ?string $charset, ?string $fallback, int $bytesToScan, string $exp): void {
    // Configure the parser for this case
    $settings = new Config;
    $settings->encodingFallback = $fallback;
    $settings->encodingPrescanBytes = $bytesToScan;
    // Parse, and compare the detected encoding against the expectation
    $result = Parser::parse($data, $charset, $settings);
    $this->assertSame($exp, $result->encoding);
}
/** Supplies the cases for testNonstandardDeclarationTests
 *
 * Each case is a list of: the input data, the charset argument (or null),
 * the fallback encoding (or null), the number of bytes to prescan, and the
 * encoding the parsed document is expected to report
 */
public function provideNonstandardDeclarationTests(): iterable {
    // every case uses the same prescan length
    $scan = 1024;
    $cases = [
        ["<?xml".str_repeat(" ", $scan).">", null,                      null,    $scan, "windows-1252"],
        ["<?xml ",                           null,                      null,    $scan, "windows-1252"],
        ["",                                 "text/html;charset=utf-8", null,    $scan, "UTF-8"],
        ["<meta charset='UTF-8'>",           null,                      null,    $scan, "UTF-8"],
        ["",                                 null,                      "UTF-8", $scan, "UTF-8"],
        ["",                                 null,                      "UTF-7", $scan, "windows-1252"],
    ];
    return $cases;
}
public function provideStandardDeclarationTests() {
$tests = [];
$blacklist = ["xmldecl-3.html"];

10
tests/cases/TestParser.php

@ -13,6 +13,7 @@ use MensBeam\HTML\Parser\Exception;
/**
* @covers \MensBeam\HTML\Parser
* @covers \MensBeam\HTML\Parser\Exception
*/
class TestParser extends \PHPUnit\Framework\TestCase {
public function testParseADocument(): void {
@ -33,6 +34,15 @@ class TestParser extends \PHPUnit\Framework\TestCase {
$this->assertInstanceOf(\DOMDocumentFragment::class, $out);
}
/**
 * Checks that parsing a fragment with a second argument of -1 fails with
 * an INVALID_QUIRKS_MODE exception
 *
 * @covers \MensBeam\HTML\Parser\TreeConstructor::__construct
 */
public function testParseAFragmentWithBogusQuirksMode(): void {
    $input = "hello world!";
    $document = new \DOMDocument();
    $contextElement = $document->createElement("div");
    // The expectation must be registered before the failing call is made
    $expected = new Exception(Exception::INVALID_QUIRKS_MODE);
    $this->expectExceptionObject($expected);
    Parser::parseFragment($contextElement, -1, $input, "tex/html; charset=utf8");
}
public function testParseADocumentReportingErrors(): void {
$in = "hello world!";
$conf = new Config;

258
tests/cases/TestSerializer.php

@ -0,0 +1,258 @@
<?php
/**
* @license MIT
* Copyright 2017, Dustin Wilson, J. King et al.
* See LICENSE and AUTHORS files for details
*/
declare(strict_types=1);
namespace MensBeam\HTML\DOM\TestCase;
use MensBeam\HTML\Parser\Exception;
use MensBeam\HTML\Parser;
use MensBeam\HTML\Parser\AttributeSetter;
use MensBeam\HTML\Parser\NameCoercion;
use MensBeam\HTML\Parser\Serializer;
/** @covers \MensBeam\HTML\Parser\Serializer */
class TestSerializer extends \PHPUnit\Framework\TestCase {
use NameCoercion, AttributeSetter;
/**
 * Builds a DOM tree from a test's input lines and checks that its outer
 * serialization matches the expected output
 *
 * @dataProvider provideStandardTreeTests
 */
public function testStandardTreeTests(array $data, bool $fragment, string $exp): void {
    $actual = Serializer::serializeOuter($this->buildTree($data, $fragment));
    $this->assertSame($exp, $actual);
}
/** Supplies the cases for testStandardTreeTests
 *
 * Every ".dat" file in the serializer test directory is parsed in turn,
 * except for files whose base name appears in the blacklist
 *
 * @return iterable The data sets yielded by parseTreeTestFile for each file
 */
public function provideStandardTreeTests(): iterable {
    // Base names of test files which should not be run
    $blacklist = [];
    $files = new \AppendIterator();
    $files->append(new \GlobIterator(\MensBeam\HTML\Parser\BASE."tests/cases/serializer/*.dat", \FilesystemIterator::SKIP_DOTS | \FilesystemIterator::CURRENT_AS_PATHNAME));
    foreach ($files as $file) {
        // NOTE: file names are compared strictly, as strings
        if (in_array(basename($file), $blacklist, true)) {
            continue;
        }
        yield from $this->parseTreeTestFile($file);
    }
}
/**
 * Serializes a template element which has a text child reading "EEK" and,
 * optionally, a "content" property which is either null or a document
 * fragment (itself optionally holding a text node reading "OOK")
 *
 * @dataProvider provideTemplateTests
 */
public function testSerializeADecoratedTemplate(?string $ns, bool $content, bool $fragment, bool $text, string $exp): void {
    $document = new \DOMDocument;
    $template = $document->createElementNS($ns, "template");
    $template->appendChild($document->createTextNode("EEK"));
    if ($content) {
        // Work out what the content property should hold before assigning it
        $contents = null;
        if ($fragment) {
            $contents = $document->createDocumentFragment();
            if ($text) {
                $contents->appendChild($document->createTextNode("OOK"));
            }
        }
        $template->content = $contents;
    }
    // The inner serialization is the expectation itself; the outer
    // serialization is the same wrapped in the template's tags
    $this->assertSame($exp, Serializer::serializeInner($template));
    $this->assertSame("<template>$exp</template>", Serializer::serializeOuter($template));
}
/** Supplies the cases for testSerializeADecoratedTemplate
 *
 * Each case is a list of: the element namespace, whether a content
 * property is set, whether that property is a document fragment, whether
 * the fragment holds text, and the expected inner serialization. The same
 * four cases are produced for the null namespace and then for the HTML namespace
 */
public function provideTemplateTests(): iterable {
    $cases = [];
    foreach ([null, Parser::HTML_NAMESPACE] as $ns) {
        $cases[] = [$ns, false, false, false, "EEK"];
        $cases[] = [$ns, true,  false, false, "EEK"];
        $cases[] = [$ns, true,  true,  false, ""];
        $cases[] = [$ns, true,  true,  true,  "OOK"];
    }
    return $cases;
}
/**
 * Gives an element a text child reading "EEK" and checks the element's
 * inner serialization against the expectation
 *
 * @dataProvider provideEmptyElementTests
 */
public function testInnerSerializeEmptyElement(string $tagName, ?string $ns, string $exp): void {
    $document = new \DOMDocument;
    $element = $document->createElementNS($ns, $tagName);
    $element->appendChild($document->createTextNode("EEK"));
    $actual = Serializer::serializeInner($element);
    $this->assertSame($exp, $actual);
}
/** Supplies the cases for testInnerSerializeEmptyElement
 *
 * Each case is a list of: a tag name, a namespace, and the expected inner
 * serialization. Every tag name is paired with the null namespace and the
 * HTML namespace (both expecting an empty string), and then with the SVG
 * namespace (expecting the text "EEK")
 */
public function provideEmptyElementTests(): iterable {
    $tagNames = [
        "basefont", "bgsound", "frame", "keygen", "area", "base",
        "br", "col", "embed", "hr", "img", "input",
        "link", "meta", "param", "source", "track", "wbr",
    ];
    // Namespaces in the order their cases are listed, with the output expected for each
    $expectations = [
        [null, ""],
        [Parser::HTML_NAMESPACE, ""],
        [Parser::SVG_NAMESPACE, "EEK"],
    ];
    $cases = [];
    foreach ($expectations as [$ns, $exp]) {
        foreach ($tagNames as $tagName) {
            $cases[] = [$tagName, $ns, $exp];
        }
    }
    return $cases;
}
/** Checks that outer serialization of an attribute node fails with an UNSUPPORTED_NODE_TYPE exception */
public function testOuterSerializeAnInvalidNode(): void {
    // The expectation must be registered before the failing call is made
    $expected = new Exception(Exception::UNSUPPORTED_NODE_TYPE, [\DOMAttr::class]);
    $this->expectExceptionObject($expected);
    $document = new \DOMDocument;
    $attribute = $document->createAttribute("oops");
    Serializer::serializeOuter($attribute);
}
/** Checks that inner serialization of a text node fails with an UNSUPPORTED_NODE_TYPE exception */
public function testInnerSerializeAnInvalidNode(): void {
    // The expectation must be registered before the failing call is made
    $expected = new Exception(Exception::UNSUPPORTED_NODE_TYPE, [\DOMText::class]);
    $this->expectExceptionObject($expected);
    $document = new \DOMDocument;
    $textNode = $document->createTextNode("OOPS");
    Serializer::serializeInner($textNode);
}
/** Builds a DOM tree from the input lines of a serializer test
 *
 * Each node is introduced by a line starting with "|" followed by
 * whitespace; the width of that prefix determines the depth of the node.
 * Text nodes may continue over subsequent lines until a line ending with
 * a quotation mark is seen
 *
 * @param array $data The lines of the test's input block
 * @param bool $fragment Whether to build a document fragment rather than a document
 * @param bool $formatOutput The value to assign to the document's formatOutput property
 * @return \DOMNode The document or document fragment holding the tree
 * @throws \Exception if a line cannot be interpreted, or if a text node is never terminated
 */
protected function buildTree(array $data, bool $fragment, bool $formatOutput = false): \DOMNode {
    $document = new \DOMDocument;
    $document->formatOutput = $formatOutput;
    if ($fragment) {
        $document->appendChild($document->createElement("html"));
        $out = $document->createDocumentFragment();
    } else {
        $out = $document;
    }
    // OPTIMIZATION: the short-name-to-URL namespace lookup never changes, so compute it once
    $namespaces = array_flip(Parser::NAMESPACE_MAP);
    $lineCount = count($data);
    $cur = $out;
    $pad = 2;
    // process each line in turn
    for ($l = 0; $l < $lineCount; $l++) {
        if (!preg_match('/^(\|\s+)(.+)/', $data[$l], $m)) {
            // A line which does not introduce a node, and was not consumed
            // as the continuation of a text node, cannot be interpreted
            throw new \Exception("Input data is invalid on line ".($l + 1));
        }
        // pop any parents as long as the padding of the line is less than the expected padding
        $p = strlen($m[1]);
        assert($p >= 2 && $p <= $pad && !($p % 2), new \Exception("Input data is invalid on line ".($l + 1)));
        while ($p < $pad) {
            $pad -= 2;
            $cur = $cur->parentNode;
        }
        // act based upon what the rest of the line looks like
        $d = $m[2];
        if (preg_match('/^<!-- (.*?) -->$/', $d, $m)) {
            // comment
            $cur->appendChild($document->createComment($m[1]));
        } elseif (preg_match('/^<!DOCTYPE(?: ([^ >]*)(?: "([^"]*)" "([^"]*)")?)?>$/', $d, $m)) {
            // doctype
            // NOTE: a missing name is replaced by a single space
            $name = strlen((string) ($m[1] ?? "")) ? $m[1] : " ";
            $public = strlen((string) ($m[2] ?? "")) ? $m[2] : "";
            $system = strlen((string) ($m[3] ?? "")) ? $m[3] : "";
            $cur->appendChild($document->implementation->createDocumentType($name, $public, $system));
        } elseif (preg_match('/^<\?([^ ]+) ([^>]*)>$/', $d, $m)) {
            // processing instruction
            $cur->appendChild($document->createProcessingInstruction($m[1], $m[2]));
        } elseif (preg_match('/^<(?:([^ ]+) )?([^>]+)>$/', $d, $m)) {
            // element; namespaces not found in the map are used literally
            $ns = strlen((string) $m[1]) ? ($namespaces[$m[1]] ?? $m[1]) : null;
            $cur = $cur->appendChild($document->createElementNS($ns, self::coerceName($m[2])));
            // subsequent lines may be children of this element
            $pad += 2;
        } elseif (preg_match('/^(?:([^" ]+) )?([^"=]+)="((?:[^"]|"(?!$))*)"$/', $d, $m)) {
            // attribute
            $ns = strlen((string) $m[1]) ? ($namespaces[$m[1]] ?? $m[1]) : "";
            $this->elementSetAttribute($cur, $ns, $m[2], $m[3]);
        } elseif (preg_match('/^"((?:[^"]|"(?!$))*)("?)$/', $d, $m)) {
            // text; this continues on following lines until a closing quotation mark is found
            $start = $l;
            $t = $m[1];
            while (!strlen((string) $m[2])) {
                if (++$l >= $lineCount) {
                    // The input ended before the text node was terminated
                    throw new \Exception("Input data is invalid on line ".($start + 1));
                }
                preg_match('/^((?:[^"]|"(?!$))*)("?)$/', $data[$l], $m);
                $t .= "\n".$m[1];
            }
            $cur->appendChild($document->createTextNode($t));
        } else {
            throw new \Exception("Input data is invalid on line ".($l + 1));
        }
    }
    return $out;
}
/** Parses a serializer test file into data sets for PHPUnit
 *
 * Each test consists of a "#document" or "#fragment" line, the input
 * lines, an optional "#script-on" or "#script-off" line, and an "#output"
 * line followed by the expected output. Tests marked "#script-on" are
 * not yielded
 *
 * @param string $file The path of the test file
 * @return \Generator Data sets keyed by file name, test index, and starting line; each is a list of the input lines, whether the input is a fragment, and the expected output
 * @throws \Exception if the file cannot be read
 */
protected function parseTreeTestFile(string $file): \Generator {
    $raw = file($file);
    if ($raw === false) {
        throw new \Exception("Test file $file could not be read");
    }
    $lines = array_map(function($v) {
        return rtrim($v, "\n");
    }, $raw);
    $lineCount = count($lines);
    // Pattern for the lines which may terminate the input block
    $inputEnd = '/^#(script-(on|off)|output)$/';
    $index = 0;
    $l = 0;
    while ($l < $lineCount) {
        $pos = $l + 1;
        assert(in_array($lines[$l], ["#document", "#fragment"], true), new \Exception("Test $file #$index does not start with #document or #fragment tag at line ".($l + 1)));
        $fragment = $lines[$l] === "#fragment";
        // collect the test input
        $data = [];
        for (++$l; $l < $lineCount; $l++) {
            if (preg_match($inputEnd, $lines[$l])) {
                break;
            }
            $data[] = $lines[$l];
        }
        // NOTE: a truncated file leaves the position past the last line,
        // so lines are read defensively from here on
        $marker = $lines[$l] ?? "";
        // set the script mode, if present
        assert(preg_match($inputEnd, $marker) === 1, new \Exception("Test $file #$index follows data with something other than script flag or output at line ".($l + 1)));
        $script = null;
        if ($marker === "#script-off") {
            $script = false;
            $l++;
        } elseif ($marker === "#script-on") {
            $script = true;
            $l++;
        }
        // collect the output string
        $exp = [];
        assert(($lines[$l] ?? "") === "#output", new \Exception("Test $file #$index follows input with something other than output at line ".($l + 1)));
        for (++$l; $l < $lineCount; $l++) {
            // a blank line followed by the start of another test ends the output
            if ($lines[$l] === "" && in_array(($lines[$l + 1] ?? ""), ["#document", "#fragment"], true)) {
                break;
            }
            assert(preg_match('/^([^#]|$)/', $lines[$l]) === 1, new \Exception("Test $file #$index contains unrecognized data after output at line ".($l + 1)));
            $exp[] = $lines[$l];
        }
        $exp = implode("\n", $exp);
        // only tests which do not require scripting are yielded
        if (!$script) {
            yield basename($file)." #$index (line $pos)" => [$data, $fragment, $exp];
        }
        // skip the blank line separating tests
        $l++;
        $index++;
    }
}
}

25
tests/cases/TestTokenizer.php

@ -44,10 +44,10 @@ class TestTokenizer extends \PHPUnit\Framework\TestCase {
];
/** @dataProvider provideStandardTokenizerTests */
public function testStandardTokenizerTests(string $input, array $expected, int $state, string $open = null, array $expErrors) {
public function testStandardTokenizerTests(string $input, array $expected, int $state, ?string $open, ?array $expErrors) {
$config = new Config;
$config->encodingFallback = "UTF-8";
$errorHandler = new ParseError;
$errorHandler = ($expErrors !== null) ? new ParseError : null;
// initialize a stack of open elements, possibly with an open element
$stack = new OpenElementsStack(null);
if ($open) {
@ -71,12 +71,31 @@ class TestTokenizer extends \PHPUnit\Framework\TestCase {
}
} finally {
$actual = $this->normalizeTokens($actual);
$errors = $this->formatErrors($errorHandler->errors);
$this->assertEquals($expected, $actual, $tokenizer->debugLog);
$errors = ($expErrors !== null) ? $this->formatErrors($errorHandler->errors) : null;
$this->assertEquals($expErrors, $errors, $tokenizer->debugLog);
}
}
/**
 * Re-runs the standard tokenizer tests with null passed in place of the
 * expected errors, so that no error handler is created for the run
 *
 * @dataProvider provideStandardTokenizerTests
 * @depends testStandardTokenizerTests
 */
public function testStandardTokenizerTestsWithoutErrorReporting(string $input, array $expected, int $state, ?string $open, array $expErrors) {
    // The expected errors from the data set are deliberately discarded
    $noErrors = null;
    $this->testStandardTokenizerTests($input, $expected, $state, $open, $noErrors);
}
/**
 * Runs the project's own tokenizer cases through the same procedure as
 * the standard tokenizer tests
 *
 * @dataProvider provideNonstandardTokenizerTests
 */
public function testNonstandardTokenizerTests(string $input, array $expected, int $state, ?string $open, array $expErrors) {
    $arguments = [$input, $expected, $state, $open, $expErrors];
    $this->testStandardTokenizerTests(...$arguments);
}
/** Supplies the cases for testNonstandardTokenizerTests
 *
 * Each case is a list of: the input, the expected tokens, the initial
 * tokenizer state, the name of an open element (or an empty string), and
 * the expected errors
 */
public function provideNonstandardTokenizerTests(): iterable {
    // A lone 0xFF byte is expected to produce a replacement character and a single error
    $tokens = [new CharacterToken("\u{FFFD}"), new EOFToken];
    $errors = [
        ['code' => "noncharacter-in-input-stream", 'line' => 1, 'col' => 1],
    ];
    return [
        ["\xFF", $tokens, Tokenizer::DATA_STATE, "", $errors],
    ];
}
public function provideStandardTokenizerTests() {
$tests = [];
$blacklist = ["xmlViolation.test"];

9
tests/cases/TestTreeConstructor.php

@ -16,6 +16,7 @@ use MensBeam\HTML\Parser\Tokenizer;
use MensBeam\HTML\Parser\TreeConstructor;
/**
* @covers \MensBeam\HTML\Parser\Data
* @covers \MensBeam\HTML\Parser\Tokenizer
* @covers \MensBeam\HTML\Parser\TreeConstructor
* @covers \MensBeam\HTML\Parser\ActiveFormattingElementsList
@ -122,11 +123,11 @@ class TestTreeConstructor extends \PHPUnit\Framework\TestCase {
}
protected function patchTest(string $data, $fragment, array $errors, array $exp): array {
// When using the HTML namespace, xmlns attribute cannot be inserted due to a PHP limitation
// When using the HTML namespace, xmlns attributes lose their namespace due to a PHP limitation
if ($this->ns) {
for ($a = 0; $a < sizeof($exp); $a++) {
if (preg_match('/^\|\s+xmlns xmlns=/', $exp[$a])) {
array_splice($exp, $a--, 1);
$exp[$a] = preg_replace('/^\|(\s+)xmlns xmlns=/', "|$1xmlns=", $exp[$a]);
}
}
}
@ -180,7 +181,7 @@ class TestTreeConstructor extends \PHPUnit\Framework\TestCase {
$prefix = "null ";
}
}
$localName = $this->uncoerceName($e->localName);
$localName = self::uncoerceName($e->localName);
$this->push("<".$prefix.$localName.">");
$this->depth++;
$attr = [];
@ -191,7 +192,7 @@ class TestTreeConstructor extends \PHPUnit\Framework\TestCase {
assert((bool) $prefix, new \Exception("Prefix for namespace {$a->namespaceURI} is not defined"));
$prefix .= " ";
}
$attr[$prefix.$this->uncoerceName($a->name)] = $a->value;
$attr[$prefix.self::uncoerceName($a->name)] = $a->value;
}
ksort($attr, \SORT_STRING);
foreach ($attr as $k => $v) {

99
tests/cases/serializer/README.md

@ -0,0 +1,99 @@
HTML DOM serialization tests
============================
The format of these tests is essentially the format of html5lib's tree
construction tests in reverse. There are, however, important differences,
so the format is documented in full here.
Each file containing serialization tests consists of any number of
tests separated by two newlines (LF) and a single newline before the end
of the file. For instance:
[TEST]LF
LF
[TEST]LF
LF
[TEST]LF
Where [TEST] is the following format:
Each test begins with a line reading `#document` or `#fragment`; subsequent
lines represent the document or document fragment (respectively) used as
input, until a line is encountered which reads `#output`, `#script-on`,
or `#script-off`.
Each DOM node in the input is written on its own line beginning with the
characters "| " (a vertical bar followed by a single space); lines which begin
with other characters are a continuation of the previous line. Attributes
are treated as distinct nodes and have their own entries. There is no escape
mechanism: all input is literal, including newlines and quotation marks. Two
spaces are used to denote each level of nesting. For example:
| node
| child node
continuation of child node
| grandchild node
| child node
| attribute node of child
| grandchild node
The different types of nodes are:
- Element nodes in the form `<body>` for an element in the HTML namespace,
or `<svg svg>` for an element in a foreign namespace. Qualified names are
written as usual e.g. `<math math:math>`, though such elements are not
produced by the parser
- Attribute nodes in the form `id="value"` or e.g. `xml xml:id="value"`, with
a quotation mark immediately followed by a newline marking the end of the
attribute value (in other words, attribute values may contain literal
quotation marks)
- Text nodes in the form `"text data"`; like attributes, only a quotation mark
followed by a newline marks the end of text data
- Comment nodes of the form `<!-- comment data -->`; the space characters are
padding and are not part of the comment data
- Document type nodes in the form `<!DOCTYPE html "public" "system">`, or
`<!DOCTYPE html>` or simply `<!DOCTYPE>` depending on its contents
- Processing instructions in the form `<?target PI data>`. Processing
instructions are not generated by the HTML parser, but may appear in
documents by other means
Namespaces are represented by the following short names:
| Name | URL |
|-------|--------------------------------------|
| xml | http://www.w3.org/XML/1998/namespace |
| xmlns | http://www.w3.org/2000/xmlns/ |
| xlink | http://www.w3.org/1999/xlink |
| math | http://www.w3.org/1998/Math/MathML |
| svg | http://www.w3.org/2000/svg |
Other namespaces may also appear; these should be interpreted as literal URLs.
After the input block either `#script-on` or `#script-off` may appear. These
signal that the test should be run with scripting on or off, respectively. If
neither line is present, the test should be run in both modes.
Finally, `#output` marks the beginning of output. All subsequent text is
literal characters until two consecutive newlines followed by either
`#document` or `#fragment` are seen.
Below is a complete example:
#document
| <!-- This is longer than most tests -->
| <!DOCTYPE html "-//W3C//DTD HTML 4.01//EN" "http://www.w3.org/TR/html4/strict.dtd">
| <html>
| lang="en"
| <head>
| <body>
| style="font-family: "Times New Roman""
| <svg svg>
| xml xml:id="image"
| <div>
| "This is a text node.
It has an embedded newline. It is in fact pretty "busy" and has
multiple newlines.
And even a blank line."
| <!-- This comment also
has a newline -->

33
tests/cases/serializer/mensbeam01.dat

@ -0,0 +1,33 @@
#fragment
| <fake_ns test:test>
#output
<test:test></test:test>
#fragment
| <span>
| test💩test="test"
#output
<span test💩test="test"></span>
#fragment
| <wbr>
| "You should not see this text."
#output
<wbr>
#fragment
| <wbr>
| class="test"
#output
<wbr class="test">
#fragment
| <poop💩>
#output
<poop💩></poop💩>
#fragment
| <test>
| poop💩="soccer"
#output
<test poop💩="soccer"></test>

34
tests/cases/serializer/mensbeam02.dat

@ -0,0 +1,34 @@
#document
| <html>
#output
<html></html>
#document
| <!DOCTYPE html>
| <html>
#output
<!DOCTYPE html><html></html>
#document
| <!DOCTYPE html "public" "system">
| <html>
#output
<!DOCTYPE html><html></html>
#document
| <!DOCTYPE test>
| <html>
#output
<!DOCTYPE test><html></html>
#document
| <!DOCTYPE>
| <html>
#output
<!DOCTYPE ><html></html>
#document
| <html>
| <?php echo "Hello world!"; ?>
#output
<html><?php echo "Hello world!"; ?></html>

913
tests/cases/serializer/wpt01.dat

@ -0,0 +1,913 @@
#fragment
| <span>
#output
<span></span>
#fragment
| <span>
| <a>
#output
<span><a></a></span>
#fragment
| <span>
| <a>
| b="c"
#output
<span><a b="c"></a></span>
#fragment
| <span>
| <a>
| b="&"
#output
<span><a b="&amp;"></a></span>
#fragment
| <span>
| <a>
| b=" "
#output
<span><a b="&nbsp;"></a></span>
#fragment
| <span>
| <a>
| b="""
#output
<span><a b="&quot;"></a></span>
#fragment
| <span>
| <a>
| b="<"
#output
<span><a b="<"></a></span>
#fragment
| <span>
| <a>
| b=">"
#output
<span><a b=">"></a></span>
#fragment
| <span>
| <a>
| href="javascript:"<>""
#output
<span><a href="javascript:&quot;<>&quot;"></a></span>
#fragment
| <span>
| <svg svg>
| xlink xlink:href="a"
#output
<span><svg xlink:href="a"></svg></span>
#fragment
| <span>
| <svg svg>
| xmlns xmlns:svg="test"
#output
<span><svg xmlns:svg="test"></svg></span>
#fragment
| <span>
| "a"
#output
<span>a</span>
#fragment
| <span>
| "&"
#output
<span>&amp;</span>
#fragment
| <span>
| " "
#output
<span>&nbsp;</span>
#fragment
| <span>
| "<"
#output
<span>&lt;</span>
#fragment
| <span>
| ">"
#output
<span>&gt;</span>
#fragment
| <span>
| """
#output
<span>"</span>
#fragment
| <span>
| <style>
| "<&>"
#output
<span><style><&></style></span>
#fragment
| <span>
| <script>
| type="test"
| "<&>"
#output
<span><script type="test"><&></script></span>
#fragment
| <script>
| type="test"
| "<&>"
#output
<script type="test"><&></script>
#fragment
| <span>
| <xmp>
| "<&>"
#output
<span><xmp><&></xmp></span>
#fragment
| <span>
| <iframe>
| "<&>"
#output
<span><iframe><&></iframe></span>
#fragment
| <span>
| <noembed>
| "<&>"
#output
<span><noembed><&></noembed></span>
#fragment
| <span>
| <noframes>
| "<&>"
#output
<span><noframes><&></noframes></span>
#fragment
| <span>
| <noscript>
| "<&>"
#script-off
#output
<span><noscript>&lt;&amp;&gt;</noscript></span>
#fragment
| <span>
| <noscript>
| "<&>"
#script-on
#output
<span><noscript><&></noscript></span>
#fragment
| <span>
| <!-- data -->
#output
<span><!--data--></span>
#fragment
| <span>
| <a>
| <b>
| <c>
| <d>
| "e"
| <f>
| <g>
| "h"
#output
<span><a><b><c></c></b><d>e</d><f><g>h</g></f></a></span>
#fragment
| <span>
| b="c"
#output
<span b="c"></span>
#fragment
| <span>
| <svg svg>
| xml xml:foo="test"
#output
<span><svg xml:foo="test"></svg></span>
#fragment
| <span>
| <svg svg>
| xml abc:foo="test"
#output
<span><svg xml:foo="test"></svg></span>
#fragment
| <span>
| <svg svg>
| xmlns xmlns:foo="test"
#output
<span><svg xmlns:foo="test"></svg></span>
#fragment
| <span>
| <svg svg>
| xmlns xmlns="test"
#output
<span><svg xmlns="test"></svg></span>
#fragment
| <span>
| <svg svg>
| fake_ns abc:def="test"
#output
<span><svg abc:def="test"></svg></span>
#fragment
| <pre>
| "
"
#output
<pre>
</pre>
#fragment
| <pre>
| "a
"
#output
<pre>a
</pre>
#fragment
| <span>
| <pre>
| "
"
#output
<span><pre>
</pre></span>
#fragment
| <span>
| <pre>
| "a
"
#output
<span><pre>a
</pre></span>
#fragment
| <textarea>
| "
"
#output
<textarea>
</textarea>
#fragment
| <textarea>
| "a
"
#output
<textarea>a
</textarea>
#fragment
| <span>
| <textarea>
| "
"
#output
<span><textarea>
</textarea></span>
#fragment
| <span>
| <textarea>
| "a
"
#output
<span><textarea>a
</textarea></span>
#fragment
| <listing>
| "
"
#output
<listing>
</listing>
#fragment
| <listing>
| "a
"
#output
<listing>a
</listing>
#fragment
| <span>
| <listing>
| "
"
#output
<span><listing>
</listing></span>
#fragment
| <span>
| <listing>
| "a
"
#output
<span><listing>a
</listing></span>
#fragment
| <area>
#output
<area>
#fragment
| <span>
| <area>
| <a>
| "test"
| <b>
#output
<span><area><a>test</a><b></b></span>
#fragment
| <span>
| <a>
| "test"
| <area>
| <b>
#output
<span><a>test</a><area><b></b></span>
#fragment
| <span>
| <a>
| "test"
| <b>
| <area>
#output
<span><a>test</a><b></b><area></span>
#fragment
| <base>
#output
<base>
#fragment
| <span>
| <base>
| <a>
| "test"
| <b>
#output
<span><base><a>test</a><b></b></span>
#fragment
| <span>
| <a>
| "test"
| <base>
| <b>
#output
<span><a>test</a><base><b></b></span>
#fragment
| <span>
| <a>
| "test"
| <b>
| <base>
#output
<span><a>test</a><b></b><base></span>
#fragment
| <basefont>
#output
<basefont>
#fragment
| <span>
| <basefont>
| <a>
| "test"
| <b>
#output
<span><basefont><a>test</a><b></b></span>
#fragment
| <span>
| <a>
| "test"
| <basefont>
| <b>
#output
<span><a>test</a><basefont><b></b></span>
#fragment
| <span>
| <a>
| "test"
| <b>
| <basefont>
#output
<span><a>test</a><b></b><basefont></span>
#fragment
| <bgsound>
#output
<bgsound>
#fragment
| <span>
| <bgsound>
| <a>
| "test"
| <b>
#output
<span><bgsound><a>test</a><b></b></span>
#fragment
| <span>
| <a>
| "test"
| <bgsound>
| <b>
#output
<span><a>test</a><bgsound><b></b></span>
#fragment
| <span>
| <a>
| "test"
| <b>
| <bgsound>
#output
<span><a>test</a><b></b><bgsound></span>
#fragment
| <br>
#output
<br>
#fragment
| <span>
| <br>
| <a>
| "test"
| <b>
#output
<span><br><a>test</a><b></b></span>
#fragment
| <span>
| <a>
| "test"
| <br>
| <b>
#output
<span><a>test</a><br><b></b></span>
#fragment
| <span>
| <a>
| "test"
| <b>
| <br>
#output
<span><a>test</a><b></b><br></span>
#fragment
| <col>
#output
<col>
#fragment
| <span>
| <col>
| <a>
| "test"
| <b>
#output
<span><col><a>test</a><b></b></span>
#fragment
| <span>
| <a>
| "test"
| <col>
| <b>
#output
<span><a>test</a><col><b></b></span>
#fragment
| <span>
| <a>
| "test"
| <b>
| <col>
#output
<span><a>test</a><b></b><col></span>
#fragment
| <embed>
#output
<embed>
#fragment
| <span>
| <embed>
| <a>
| "test"
| <b>
#output
<span><embed><a>test</a><b></b></span>
#fragment
| <span>
| <a>
| "test"
| <embed>
| <b>
#output
<span><a>test</a><embed><b></b></span>
#fragment
| <span>
| <a>
| "test"
| <b>
| <embed>
#output
<span><a>test</a><b></b><embed></span>
#fragment
| <frame>
#output
<frame>
#fragment
| <span>
| <frame>
| <a>
| "test"
| <b>
#output
<span><frame><a>test</a><b></b></span>
#fragment
| <span>
| <a>
| "test"
| <frame>
| <b>
#output
<span><a>test</a><frame><b></b></span>
#fragment
| <span>
| <a>
| "test"
| <b>
| <frame>
#output
<span><a>test</a><b></b><frame></span>
#fragment
| <hr>
#output
<hr>
#fragment
| <span>
| <hr>
| <a>
| "test"
| <b>
#output
<span><hr><a>test</a><b></b></span>
#fragment
| <span>
| <a>
| "test"
| <hr>
| <b>
#output
<span><a>test</a><hr><b></b></span>
#fragment
| <span>
| <a>
| "test"
| <b>
| <hr>
#output
<span><a>test</a><b></b><hr></span>
#fragment
| <img>
#output
<img>
#fragment
| <span>
| <img>
| <a>
| "test"
| <b>
#output
<span><img><a>test</a><b></b></span>
#fragment
| <span>
| <a>
| "test"
| <img>
| <b>
#output
<span><a>test</a><img><b></b></span>
#fragment
| <span>
| <a>
| "test"
| <b>
| <img>
#output
<span><a>test</a><b></b><img></span>
#fragment
| <input>
#output
<input>
#fragment
| <span>
| <input>
| <a>
| "test"
| <b>
#output
<span><input><a>test</a><b></b></span>
#fragment
| <span>
| <a>
| "test"
| <input>
| <b>
#output
<span><a>test</a><input><b></b></span>
#fragment
| <span>
| <a>
| "test"
| <b>
| <input>
#output
<span><a>test</a><b></b><input></span>
#fragment
| <keygen>
#output
<keygen>
#fragment
| <span>
| <keygen>
| <a>
| "test"
| <b>
#output
<span><keygen><a>test</a><b></b></span>
#fragment
| <span>
| <a>
| "test"
| <keygen>
| <b>
#output
<span><a>test</a><keygen><b></b></span>
#fragment
| <span>
| <a>
| "test"
| <b>
| <keygen>
#output
<span><a>test</a><b></b><keygen></span>
#fragment
| <link>
#output
<link>
#fragment
| <span>
| <link>
| <a>
| "test"
| <b>
#output
<span><link><a>test</a><b></b></span>
#fragment
| <span>
| <a>
| "test"
| <link>
| <b>
#output
<span><a>test</a><link><b></b></span>
#fragment
| <span>
| <a>
| "test"
| <b>
| <link>
#output
<span><a>test</a><b></b><link></span>
#fragment
| <meta>
#output
<meta>
#fragment
| <span>
| <meta>
| <a>
| "test"
| <b>
#output
<span><meta><a>test</a><b></b></span>
#fragment
| <span>
| <a>
| "test"
| <meta>
| <b>
#output
<span><a>test</a><meta><b></b></span>
#fragment
| <span>
| <a>
| "test"
| <b>
| <meta>
#output
<span><a>test</a><b></b><meta></span>
#fragment
| <param>
#output
<param>
#fragment
| <span>
| <param>
| <a>
| "test"
| <b>
#output
<span><param><a>test</a><b></b></span>
#fragment
| <span>
| <a>
| "test"
| <param>
| <b>
#output
<span><a>test</a><param><b></b></span>
#fragment
| <span>
| <a>
| "test"
| <b>
| <param>
#output
<span><a>test</a><b></b><param></span>
#fragment
| <source>
#output
<source>
#fragment
| <span>
| <source>
| <a>
| "test"
| <b>
#output
<span><source><a>test</a><b></b></span>
#fragment
| <span>
| <a>
| "test"
| <source>
| <b>
#output
<span><a>test</a><source><b></b></span>
#fragment
| <span>
| <a>
| "test"
| <b>
| <source>
#output
<span><a>test</a><b></b><source></span>
#fragment
| <track>
#output
<track>
#fragment
| <span>
| <track>
| <a>
| "test"
| <b>
#output
<span><track><a>test</a><b></b></span>
#fragment
| <span>
| <a>
| "test"
| <track>
| <b>
#output
<span><a>test</a><track><b></b></span>
#fragment
| <span>
| <a>
| "test"
| <b>
| <track>
#output
<span><a>test</a><b></b><track></span>
#fragment
| <wbr>
#output
<wbr>
#fragment
| <span>
| <wbr>
| <a>
| "test"
| <b>
#output
<span><wbr><a>test</a><b></b></span>
#fragment
| <span>
| <a>
| "test"
| <wbr>
| <b>
#output
<span><a>test</a><wbr><b></b></span>
#fragment
| <span>
| <a>
| "test"
| <b>
| <wbr>
#output
<span><a>test</a><b></b><wbr></span>

12
tests/cases/tree-construction/mensbeam02.dat

@ -9,6 +9,18 @@
| <poop💩>
| "PHP does not support Fifth Edition XML, hence name coercion"
#data
<!DOCTYPE html><svg><poop💩>PHP does not support Fifth Edition XML, hence name coercion</poop💩></svg>
#errors
#document
| <!DOCTYPE html>
| <html>
| <head>
| <body>
| <svg svg>
| <svg poop💩>
| "PHP does not support Fifth Edition XML, hence name coercion"
#data
<!DOCTYPE html><a poop💩="PHP does not support Fifth Edition XML, hence name coercion"></a>
#errors

129
tests/cases/tree-construction/mensbeam03.dat

@ -15,4 +15,131 @@ X
#document-fragment
template
#document
| "X"
| "X"
#data
<input type="text">
#errors
#document-fragment
form
#document
| <input>
| type="text"
#data
<rb>X
#errors
#document-fragment
ruby
#document
| <rb>
| "X"
#data
<!DOCTYPE html><ruby><div><rb>X</div></ruby>
#errors
(1,30): unexpected-parent-in-ruby
#document
| <!DOCTYPE html>
| <html>
| <head>
| <body>
| <ruby>
| <div>
| <rb>
| "X"
#data
<math definitionurl="http://example.com/"/>
#errors
#document-fragment
div
#document
| <math math>
| definitionURL="http://example.com/"
#data
<!DOCTYPE html><body><form><template><div></form></div></template></form>
#errors
(1,49): unexpected-end-tag
#document
| <!DOCTYPE html>
| <html>
| <head>
| <body>
| <form>
| <template>
| content
| <div>
#data
<!DOCTYPE html><body><template><form><div></form></template>
#errors
(1,49): unexpected-end-tag
#document
| <!DOCTYPE html>
| <html>
| <head>
| <body>
| <template>
| content
| <form>
| <div>
#data
<dd><div></dd>
#errors
(1,14): unexpected-end-tag
#document-fragment
dl
#document
| <dd>
| <div>
#data
<!DOCTYPE html><head></head></template>
#errors
(1,39): unexpected-end-tag
#document
| <!DOCTYPE html>
| <html>
| <head>
| <body>
#data
<!DOCTYPE html><select></template></select>
#errors
(1,34): unexpected-end-tag
#document
| <!DOCTYPE html>
| <html>
| <head>
| <body>
| <select>
#data
<!DOCTYPE html><body><template><select><template>
#errors
(1,50): unexpected-eof
(1,50): unexpected-eof
#document
| <!DOCTYPE html>
| <html>
| <head>
| <body>
| <template>
| content
| <select>
| <template>
| content
#data
<tr>X
#errors
(1,5): fostered-char
#document-fragment
table
#document
| <tbody>
| <tr>
| "X"

3
tests/phpunit.dist.xml

@ -28,6 +28,9 @@
<testsuite name="Parser">
<file>cases/TestParser.php</file>
</testsuite>
<testsuite name="Serializer">
<file>cases/TestSerializer.php</file>
</testsuite>
<testsuite name="Encoding change">
<file>cases/TestEncodingChange.php</file>
</testsuite>

Loading…
Cancel
Save