diff --git a/lib/Parser/AttributeSetter.php b/lib/Parser/AttributeSetter.php new file mode 100644 index 0000000..9ad63f1 --- /dev/null +++ b/lib/Parser/AttributeSetter.php @@ -0,0 +1,50 @@ +appendChild($d->createElement("html")); + try { + $a = $d->createAttributeNS($namespaceURI, $qualifiedName); + // @codeCoverageIgnoreStart + } catch (\DOMException $e) { + // The attribute name is invalid for XML 1.0 Second Edition + // Replace any offending characters with "UHHHHHH" where H are the + // uppercase hexadecimal digits of the character's code point + // NOTE: This case is never encountered by the parser + $qualifiedName = self::coerceName($qualifiedName, true); + $a = $d->createAttributeNS($namespaceURI, $qualifiedName); + } + // @codeCoverageIgnoreEnd + $a->value = self::escapeString($value, true); + $element->setAttributeNodeNS($element->ownerDocument->importNode($a)); + } else { + try { + $element->setAttributeNS($namespaceURI, $qualifiedName, $value); + } catch (\DOMException $e) { + // The attribute name is invalid for XML 1.0 Second Edition + // Replace any offending characters with "UHHHHHH" where H are the + // uppercase hexadecimal digits of the character's code point + $qualifiedName = self::coerceName($qualifiedName, ($namespaceURI !== null)); + $element->setAttributeNS($namespaceURI, $qualifiedName, $value); + $this->mangledAttributes = true; + } + if ($qualifiedName === "id" && $namespaceURI === null) { + $element->setIdAttribute($qualifiedName, true); + } + } + } +} diff --git a/lib/Parser/Data.php b/lib/Parser/Data.php index 181c3e9..d0fa6e7 100644 --- a/lib/Parser/Data.php +++ b/lib/Parser/Data.php @@ -112,15 +112,17 @@ class Data { else { $char = "\n"; } + } elseif ($char === '') { + $this->eof = true; } // unless we're peeking, track line and column position, and whether we've hit EOF if ($this->track) { - if ($char === "\n") { + if ($char === '') { + // do nothing + } elseif ($char === "\n") { $this->newlines[$this->data->posChar()] = 
$this->_column; $this->_column = 0; $this->_line++; - } elseif ($char === '') { - $this->eof = true; } else { $this->_column++; $len = strlen($char); @@ -155,7 +157,7 @@ class Data { $this->error(ParseError::NONCHARACTER_IN_INPUT_STREAM); $this->lastError = $here; } elseif ($tail === 0xBFBD && $this->data->posErr === $here) { - $this->error(ParseError::NONCHARACTER_IN_INPUT_STREAM, $this->data->posByte); + $this->error(ParseError::NONCHARACTER_IN_INPUT_STREAM); $this->lastError = $here; } } @@ -184,20 +186,21 @@ class Data { $here = $this->data->posChar(); // if the previous character was a normalized CR+LF pair, we need to go back two if (isset($this->normalized[$here])) { - $this->data->seek(-1); + // NOTE: This case is never encountered by the parser + $this->data->seek(-1); // @codeCoverageIgnore } // recalculate line and column positions, if requested if ($retreatPointer && $this->track) { - $col = $this->newlines[$here] ?? 0; - if ($col) { - $this->_column = $col; + // NOTE: These cases are never encountered by the parser + // @codeCoverageIgnoreStart + if ($col = $this->newlines[$here] ?? 0) { + $this->_column = $col + 1; $this->_line--; - } else { + } elseif ($this->astrals[$here] ?? false) { $this->_column--; - if ($this->astrals[$here] ?? false) { - $this->_column--; - } } + // @codeCoverageIgnoreEnd + $this->_column--; } $this->data->seek(-1); } @@ -252,21 +255,22 @@ class Data { do { // If the current position is the start of a line, // get the column position of the end of the previous line + // NOTE: These cases are never encountered by the parser + // @codeCoverageIgnoreStart if (isset($this->newlines[$pos])) { $line--; - $col = $this->newlines[$pos]; + $col = $this->newlines[$pos] + 1; // If the newline was a normalized CR+LF pair, // go back one extra character if (isset($this->normalized[$pos])) { $pos--; } - } else { - $col--; + } elseif ($this->astrals[$pos] ?? false) { // supplementary plane characters count as two - if ($this->astrals[$pos] ?? 
false) { - $this->_column--; - } + $col--; } + // @codeCoverageIgnoreEnd + $col--; $pos--; } while (++$relativePos < 0); return [$line, $col]; @@ -274,19 +278,23 @@ class Data { return [$this->_line, $this->_column + $relativePos]; } } else { - return [0, 0]; + return [0, 0]; // @codeCoverageIgnore } } public function __get($property) { switch ($property) { - case 'column': return $this->_column; - break; - case 'line': return $this->_line; - break; - case 'pointer': return $this->data->posChar(); - break; - default: return null; + case 'column': + return $this->_column; // @codeCoverageIgnore + break; + case 'line': + return $this->_line; // @codeCoverageIgnore + break; + case 'pointer': + return $this->data->posChar(); + break; + default: + return null; // @codeCoverageIgnore } } diff --git a/lib/Parser/Exception.php b/lib/Parser/Exception.php index b9a1164..10b48ee 100644 --- a/lib/Parser/Exception.php +++ b/lib/Parser/Exception.php @@ -11,10 +11,13 @@ class Exception extends \Exception { public const FAILED_CREATING_DOCUMENT = 102; public const INVALID_DOCUMENT_CLASS = 103; + public const UNSUPPORTED_NODE_TYPE = 201; + protected static $messages = [ 101 => 'Fragment\'s quirks mode must be one of Parser::NO_QUIRKS_MODE, Parser::LIMITED_QUIRKS_MODE, or Parser::QUIRKS_MODE', 102 => 'Unable to create instance of configured document class "%s"', 103 => 'Configured document class "%s" must be a subclass of \DOMDocument', + 201 => 'Unable to serialize unsupported node type %s', ]; public function __construct(int $code, array $args = [], \Throwable $previous = null) { diff --git a/lib/Parser/NameCoercion.php b/lib/Parser/NameCoercion.php index eba16bf..9a0dc7b 100644 --- a/lib/Parser/NameCoercion.php +++ b/lib/Parser/NameCoercion.php @@ -10,7 +10,7 @@ use MensBeam\Intl\Encoding\UTF8; trait NameCoercion { /** @codeCoverageIgnore */ - protected function coerceNameFifthEdition(string $name): string { + protected static function coerceNameFifthEdition(string $name): string 
{ // This matches the inverse of the production of NameChar in XML 1.0 Fifth Edition, // with the added exclusion of ":" from allowed characters // See https://www.w3.org/TR/REC-xml/#NT-NameStartChar @@ -30,27 +30,34 @@ trait NameCoercion { return $name; } - protected function coerceName(string $name): string { - // This matches the inverse of the production of Name in XML 1.0 Fourth Edition, - // with the added exclusion of ":" from allowed characters - // See https://www.w3.org/TR/2006/REC-xml-20060816/#NT-NameChar - preg_match_all('/[^_\.\-\x{41}-\x{5A}\x{61}-\x{7A}\x{C0}-\x{D6}\x{D8}-\x{F6}\x{F8}-\x{FF}\x{100}-\x{131}\x{134}-\x{13E}\x{141}-\x{148}\x{14A}-\x{17E}\x{180}-\x{1C3}\x{1CD}-\x{1F0}\x{1F4}-\x{1F5}\x{1FA}-\x{217}\x{250}-\x{2A8}\x{2BB}-\x{2C1}\x{386}\x{388}-\x{38A}\x{38C}\x{38E}-\x{3A1}\x{3A3}-\x{3CE}\x{3D0}-\x{3D6}\x{3DA}\x{3DC}\x{3DE}\x{3E0}\x{3E2}-\x{3F3}\x{401}-\x{40C}\x{40E}-\x{44F}\x{451}-\x{45C}\x{45E}-\x{481}\x{490}-\x{4C4}\x{4C7}-\x{4C8}\x{4CB}-\x{4CC}\x{4D0}-\x{4EB}\x{4EE}-\x{4F5}\x{4F8}-\x{4F9}\x{531}-\x{556}\x{559}\x{561}-\x{586}\x{5D0}-\x{5EA}\x{5F0}-\x{5F2}\x{621}-\x{63A}\x{641}-\x{64A}\x{671}-\x{6B7}\x{6BA}-\x{6BE}\x{6C0}-\x{6CE}\x{6D0}-\x{6D3}\x{6D5}\x{6E5}-\x{6E6}\x{905}-\x{939}\x{93D}\x{958}-\x{961}\x{985}-\x{98C}\x{98F}-\x{990}\x{993}-\x{9A8}\x{9AA}-\x{9B0}\x{9B2}\x{9B6}-\x{9B9}\x{9DC}-\x{9DD}\x{9DF}-\x{9E1}\x{9F0}-\x{9F1}\x{A05}-\x{A0A}\x{A0F}-\x{A10}\x{A13}-\x{A28}\x{A2A}-\x{A30}\x{A32}-\x{A33}\x{A35}-\x{A36}\x{A38}-\x{A39}\x{A59}-\x{A5C}\x{A5E}\x{A72}-\x{A74}\x{A85}-\x{A8B}\x{A8D}\x{A8F}-\x{A91}\x{A93}-\x{AA8}\x{AAA}-\x{AB0}\x{AB2}-\x{AB3}\x{AB5}-\x{AB9}\x{ABD}\x{AE0}\x{B05}-\x{B0C}\x{B0F}-\x{B10}\x{B13}-\x{B28}\x{B2A}-\x{B30}\x{B32}-\x{B33}\x{B36}-\x{B39}\x{B3D}\x{B5C}-\x{B5D}\x{B5F}-\x{B61}\x{B85}-\x{B8A}\x{B8E}-\x{B90}\x{B92}-\x{B95}\x{B99}-\x{B9A}\x{B9C}\x{B9E}-\x{B9F}\x{BA3}-\x{BA4}\x{BA8}-\x{BAA}\x{BAE}-\x{BB5}\x{BB7}-\x{BB9}\x{C05}-\x{C0C}\x{C0E}-\x{C10}\x{C12}-\x{C28}\x{C2A}-\x{C33}\x{C35}-\x{C39}\x{C60}-\x{C61}\x{C85}-\x{C8C
}\x{C8E}-\x{C90}\x{C92}-\x{CA8}\x{CAA}-\x{CB3}\x{CB5}-\x{CB9}\x{CDE}\x{CE0}-\x{CE1}\x{D05}-\x{D0C}\x{D0E}-\x{D10}\x{D12}-\x{D28}\x{D2A}-\x{D39}\x{D60}-\x{D61}\x{E01}-\x{E2E}\x{E30}\x{E32}-\x{E33}\x{E40}-\x{E45}\x{E81}-\x{E82}\x{E84}\x{E87}-\x{E88}\x{E8A}\x{E8D}\x{E94}-\x{E97}\x{E99}-\x{E9F}\x{EA1}-\x{EA3}\x{EA5}\x{EA7}\x{EAA}-\x{EAB}\x{EAD}-\x{EAE}\x{EB0}\x{EB2}-\x{EB3}\x{EBD}\x{EC0}-\x{EC4}\x{F40}-\x{F47}\x{F49}-\x{F69}\x{10A0}-\x{10C5}\x{10D0}-\x{10F6}\x{1100}\x{1102}-\x{1103}\x{1105}-\x{1107}\x{1109}\x{110B}-\x{110C}\x{110E}-\x{1112}\x{113C}\x{113E}\x{1140}\x{114C}\x{114E}\x{1150}\x{1154}-\x{1155}\x{1159}\x{115F}-\x{1161}\x{1163}\x{1165}\x{1167}\x{1169}\x{116D}-\x{116E}\x{1172}-\x{1173}\x{1175}\x{119E}\x{11A8}\x{11AB}\x{11AE}-\x{11AF}\x{11B7}-\x{11B8}\x{11BA}\x{11BC}-\x{11C2}\x{11EB}\x{11F0}\x{11F9}\x{1E00}-\x{1E9B}\x{1EA0}-\x{1EF9}\x{1F00}-\x{1F15}\x{1F18}-\x{1F1D}\x{1F20}-\x{1F45}\x{1F48}-\x{1F4D}\x{1F50}-\x{1F57}\x{1F59}\x{1F5B}\x{1F5D}\x{1F5F}-\x{1F7D}\x{1F80}-\x{1FB4}\x{1FB6}-\x{1FBC}\x{1FBE}\x{1FC2}-\x{1FC4}\x{1FC6}-\x{1FCC}\x{1FD0}-\x{1FD3}\x{1FD6}-\x{1FDB}\x{1FE0}-\x{1FEC}\x{1FF2}-\x{1FF4}\x{1FF6}-\x{1FFC}\x{2126}\x{212A}-\x{212B}\x{212E}\x{2180}-\x{2182}\x{3041}-\x{3094}\x{30A1}-\x{30FA}\x{3105}-\x{312C}\x{AC00}-\x{D7A3}\x{4E00}-\x{9FA5}\x{3007}\x{3021}-\x{3029}\x{30}-\x{39}\x{660}-\x{669}\x{6F0}-\x{6F9}\x{966}-\x{96F}\x{9E6}-\x{9EF}\x{A66}-\x{A6F}\x{AE6}-\x{AEF}\x{B66}-\x{B6F}\x{BE7}-\x{BEF}\x{C66}-\x{C6F}\x{CE6}-\x{CEF}\x{D66}-\x{D6F}\x{E50}-\x{E59}\x{ED0}-\x{ED9}\x{F20}-\x{F29}\x{300}-\x{345}\x{360}-\x{361}\x{483}-\x{486}\x{591}-\x{5A1}\x{5A3}-\x{5B9}\x{5BB}-\x{5BD}\x{5BF}\x{5C1}-\x{5C2}\x{5C4}\x{64B}-\x{652}\x{670}\x{6D6}-\x{6DC}\x{6DD}-\x{6DF}\x{6E0}-\x{6E4}\x{6E7}-\x{6E8}\x{6EA}-\x{6ED}\x{901}-\x{903}\x{93C}\x{93E}-\x{94C}\x{94D}\x{951}-\x{954}\x{962}-\x{963}\x{981}-\x{983}\x{9BC}\x{9BE}\x{9BF}\x{9C0}-\x{9C4}\x{9C7}-\x{9C8}\x{9CB}-\x{9CD}\x{9D7}\x{9E2}-\x{9E3}\x{A02}\x{A3C}\x{A3E}\x{A3F}\x{A40}-\x{A42}\x{A47}-\x{A48}\x{A4B}-\x{A4D}\x{A70}-\x{A71}\x
{A81}-\x{A83}\x{ABC}\x{ABE}-\x{AC5}\x{AC7}-\x{AC9}\x{ACB}-\x{ACD}\x{B01}-\x{B03}\x{B3C}\x{B3E}-\x{B43}\x{B47}-\x{B48}\x{B4B}-\x{B4D}\x{B56}-\x{B57}\x{B82}-\x{B83}\x{BBE}-\x{BC2}\x{BC6}-\x{BC8}\x{BCA}-\x{BCD}\x{BD7}\x{C01}-\x{C03}\x{C3E}-\x{C44}\x{C46}-\x{C48}\x{C4A}-\x{C4D}\x{C55}-\x{C56}\x{C82}-\x{C83}\x{CBE}-\x{CC4}\x{CC6}-\x{CC8}\x{CCA}-\x{CCD}\x{CD5}-\x{CD6}\x{D02}-\x{D03}\x{D3E}-\x{D43}\x{D46}-\x{D48}\x{D4A}-\x{D4D}\x{D57}\x{E31}\x{E34}-\x{E3A}\x{E47}-\x{E4E}\x{EB1}\x{EB4}-\x{EB9}\x{EBB}-\x{EBC}\x{EC8}-\x{ECD}\x{F18}-\x{F19}\x{F35}\x{F37}\x{F39}\x{F3E}\x{F3F}\x{F71}-\x{F84}\x{F86}-\x{F8B}\x{F90}-\x{F95}\x{F97}\x{F99}-\x{FAD}\x{FB1}-\x{FB7}\x{FB9}\x{20D0}-\x{20DC}\x{20E1}\x{302A}-\x{302F}\x{3099}\x{309A}\x{B7}\x{2D0}\x{2D1}\x{387}\x{640}\x{E46}\x{EC6}\x{3005}\x{3031}-\x{3035}\x{309D}-\x{309E}\x{30FC}-\x{30FE}]/u', $name, $m); - foreach (array_unique($m[0], \SORT_STRING) as $c) { - $o = (new UTF8($c))->nextCode(); - $esc = "U".str_pad(strtoupper(dechex($o)), 6, "0", \STR_PAD_LEFT); - $name = str_replace($c, $esc, $name); - } - // Apply stricter rules to the first character - if 
(preg_match('/^[^_\x{41}-\x{5A}\x{61}-\x{7A}\x{C0}-\x{D6}\x{D8}-\x{F6}\x{F8}-\x{FF}\x{100}-\x{131}\x{134}-\x{13E}\x{141}-\x{148}\x{14A}-\x{17E}\x{180}-\x{1C3}\x{1CD}-\x{1F0}\x{1F4}-\x{1F5}\x{1FA}-\x{217}\x{250}-\x{2A8}\x{2BB}-\x{2C1}\x{386}\x{388}-\x{38A}\x{38C}\x{38E}-\x{3A1}\x{3A3}-\x{3CE}\x{3D0}-\x{3D6}\x{3DA}\x{3DC}\x{3DE}\x{3E0}\x{3E2}-\x{3F3}\x{401}-\x{40C}\x{40E}-\x{44F}\x{451}-\x{45C}\x{45E}-\x{481}\x{490}-\x{4C4}\x{4C7}-\x{4C8}\x{4CB}-\x{4CC}\x{4D0}-\x{4EB}\x{4EE}-\x{4F5}\x{4F8}-\x{4F9}\x{531}-\x{556}\x{559}\x{561}-\x{586}\x{5D0}-\x{5EA}\x{5F0}-\x{5F2}\x{621}-\x{63A}\x{641}-\x{64A}\x{671}-\x{6B7}\x{6BA}-\x{6BE}\x{6C0}-\x{6CE}\x{6D0}-\x{6D3}\x{6D5}\x{6E5}-\x{6E6}\x{905}-\x{939}\x{93D}\x{958}-\x{961}\x{985}-\x{98C}\x{98F}-\x{990}\x{993}-\x{9A8}\x{9AA}-\x{9B0}\x{9B2}\x{9B6}-\x{9B9}\x{9DC}-\x{9DD}\x{9DF}-\x{9E1}\x{9F0}-\x{9F1}\x{A05}-\x{A0A}\x{A0F}-\x{A10}\x{A13}-\x{A28}\x{A2A}-\x{A30}\x{A32}-\x{A33}\x{A35}-\x{A36}\x{A38}-\x{A39}\x{A59}-\x{A5C}\x{A5E}\x{A72}-\x{A74}\x{A85}-\x{A8B}\x{A8D}\x{A8F}-\x{A91}\x{A93}-\x{AA8}\x{AAA}-\x{AB0}\x{AB2}-\x{AB3}\x{AB5}-\x{AB9}\x{ABD}\x{AE0}\x{B05}-\x{B0C}\x{B0F}-\x{B10}\x{B13}-\x{B28}\x{B2A}-\x{B30}\x{B32}-\x{B33}\x{B36}-\x{B39}\x{B3D}\x{B5C}-\x{B5D}\x{B5F}-\x{B61}\x{B85}-\x{B8A}\x{B8E}-\x{B90}\x{B92}-\x{B95}\x{B99}-\x{B9A}\x{B9C}\x{B9E}-\x{B9F}\x{BA3}-\x{BA4}\x{BA8}-\x{BAA}\x{BAE}-\x{BB5}\x{BB7}-\x{BB9}\x{C05}-\x{C0C}\x{C0E}-\x{C10}\x{C12}-\x{C28}\x{C2A}-\x{C33}\x{C35}-\x{C39}\x{C60}-\x{C61}\x{C85}-\x{C8C}\x{C8E}-\x{C90}\x{C92}-\x{CA8}\x{CAA}-\x{CB3}\x{CB5}-\x{CB9}\x{CDE}\x{CE0}-\x{CE1}\x{D05}-\x{D0C}\x{D0E}-\x{D10}\x{D12}-\x{D28}\x{D2A}-\x{D39}\x{D60}-\x{D61}\x{E01}-\x{E2E}\x{E30}\x{E32}-\x{E33}\x{E40}-\x{E45}\x{E81}-\x{E82}\x{E84}\x{E87}-\x{E88}\x{E8A}\x{E8D}\x{E94}-\x{E97}\x{E99}-\x{E9F}\x{EA1}-\x{EA3}\x{EA5}\x{EA7}\x{EAA}-\x{EAB}\x{EAD}-\x{EAE}\x{EB0}\x{EB2}-\x{EB3}\x{EBD}\x{EC0}-\x{EC4}\x{F40}-\x{F47}\x{F49}-\x{F69}\x{10A0}-\x{10C5}\x{10D0}-\x{10F6}\x{1100}\x{1102}-\x{1103}\x{1105}-\x{1107}\x{1109}\x{110B}-\x{110C}\x{110
E}-\x{1112}\x{113C}\x{113E}\x{1140}\x{114C}\x{114E}\x{1150}\x{1154}-\x{1155}\x{1159}\x{115F}-\x{1161}\x{1163}\x{1165}\x{1167}\x{1169}\x{116D}-\x{116E}\x{1172}-\x{1173}\x{1175}\x{119E}\x{11A8}\x{11AB}\x{11AE}-\x{11AF}\x{11B7}-\x{11B8}\x{11BA}\x{11BC}-\x{11C2}\x{11EB}\x{11F0}\x{11F9}\x{1E00}-\x{1E9B}\x{1EA0}-\x{1EF9}\x{1F00}-\x{1F15}\x{1F18}-\x{1F1D}\x{1F20}-\x{1F45}\x{1F48}-\x{1F4D}\x{1F50}-\x{1F57}\x{1F59}\x{1F5B}\x{1F5D}\x{1F5F}-\x{1F7D}\x{1F80}-\x{1FB4}\x{1FB6}-\x{1FBC}\x{1FBE}\x{1FC2}-\x{1FC4}\x{1FC6}-\x{1FCC}\x{1FD0}-\x{1FD3}\x{1FD6}-\x{1FDB}\x{1FE0}-\x{1FEC}\x{1FF2}-\x{1FF4}\x{1FF6}-\x{1FFC}\x{2126}\x{212A}-\x{212B}\x{212E}\x{2180}-\x{2182}\x{3041}-\x{3094}\x{30A1}-\x{30FA}\x{3105}-\x{312C}\x{AC00}-\x{D7A3}\x{4E00}-\x{9FA5}\x{3007}\x{3021}-\x{3029}]/u', $name, $m)) { - $c = (string) $m[0]; - $o = (new UTF8($c))->nextCode(); - $esc = "U".str_pad(strtoupper(dechex($o)), 6, "0", \STR_PAD_LEFT); - $name = $esc.substr($name, strlen($c)); + protected static function coerceName(string $name, bool $prefixed = false): string { + if ($prefixed) { + $name = explode(":", $name, 2); + } else { + $name = [$name]; } - return $name; + return implode(":", array_map(function($name) { + // This matches the inverse of the production of Name in XML 1.0 Fourth Edition, + // with the added exclusion of ":" from allowed characters + // See https://www.w3.org/TR/2006/REC-xml-20060816/#NT-NameChar + 
preg_match_all('/[^_\.\-\x{41}-\x{5A}\x{61}-\x{7A}\x{C0}-\x{D6}\x{D8}-\x{F6}\x{F8}-\x{FF}\x{100}-\x{131}\x{134}-\x{13E}\x{141}-\x{148}\x{14A}-\x{17E}\x{180}-\x{1C3}\x{1CD}-\x{1F0}\x{1F4}-\x{1F5}\x{1FA}-\x{217}\x{250}-\x{2A8}\x{2BB}-\x{2C1}\x{386}\x{388}-\x{38A}\x{38C}\x{38E}-\x{3A1}\x{3A3}-\x{3CE}\x{3D0}-\x{3D6}\x{3DA}\x{3DC}\x{3DE}\x{3E0}\x{3E2}-\x{3F3}\x{401}-\x{40C}\x{40E}-\x{44F}\x{451}-\x{45C}\x{45E}-\x{481}\x{490}-\x{4C4}\x{4C7}-\x{4C8}\x{4CB}-\x{4CC}\x{4D0}-\x{4EB}\x{4EE}-\x{4F5}\x{4F8}-\x{4F9}\x{531}-\x{556}\x{559}\x{561}-\x{586}\x{5D0}-\x{5EA}\x{5F0}-\x{5F2}\x{621}-\x{63A}\x{641}-\x{64A}\x{671}-\x{6B7}\x{6BA}-\x{6BE}\x{6C0}-\x{6CE}\x{6D0}-\x{6D3}\x{6D5}\x{6E5}-\x{6E6}\x{905}-\x{939}\x{93D}\x{958}-\x{961}\x{985}-\x{98C}\x{98F}-\x{990}\x{993}-\x{9A8}\x{9AA}-\x{9B0}\x{9B2}\x{9B6}-\x{9B9}\x{9DC}-\x{9DD}\x{9DF}-\x{9E1}\x{9F0}-\x{9F1}\x{A05}-\x{A0A}\x{A0F}-\x{A10}\x{A13}-\x{A28}\x{A2A}-\x{A30}\x{A32}-\x{A33}\x{A35}-\x{A36}\x{A38}-\x{A39}\x{A59}-\x{A5C}\x{A5E}\x{A72}-\x{A74}\x{A85}-\x{A8B}\x{A8D}\x{A8F}-\x{A91}\x{A93}-\x{AA8}\x{AAA}-\x{AB0}\x{AB2}-\x{AB3}\x{AB5}-\x{AB9}\x{ABD}\x{AE0}\x{B05}-\x{B0C}\x{B0F}-\x{B10}\x{B13}-\x{B28}\x{B2A}-\x{B30}\x{B32}-\x{B33}\x{B36}-\x{B39}\x{B3D}\x{B5C}-\x{B5D}\x{B5F}-\x{B61}\x{B85}-\x{B8A}\x{B8E}-\x{B90}\x{B92}-\x{B95}\x{B99}-\x{B9A}\x{B9C}\x{B9E}-\x{B9F}\x{BA3}-\x{BA4}\x{BA8}-\x{BAA}\x{BAE}-\x{BB5}\x{BB7}-\x{BB9}\x{C05}-\x{C0C}\x{C0E}-\x{C10}\x{C12}-\x{C28}\x{C2A}-\x{C33}\x{C35}-\x{C39}\x{C60}-\x{C61}\x{C85}-\x{C8C}\x{C8E}-\x{C90}\x{C92}-\x{CA8}\x{CAA}-\x{CB3}\x{CB5}-\x{CB9}\x{CDE}\x{CE0}-\x{CE1}\x{D05}-\x{D0C}\x{D0E}-\x{D10}\x{D12}-\x{D28}\x{D2A}-\x{D39}\x{D60}-\x{D61}\x{E01}-\x{E2E}\x{E30}\x{E32}-\x{E33}\x{E40}-\x{E45}\x{E81}-\x{E82}\x{E84}\x{E87}-\x{E88}\x{E8A}\x{E8D}\x{E94}-\x{E97}\x{E99}-\x{E9F}\x{EA1}-\x{EA3}\x{EA5}\x{EA7}\x{EAA}-\x{EAB}\x{EAD}-\x{EAE}\x{EB0}\x{EB2}-\x{EB3}\x{EBD}\x{EC0}-\x{EC4}\x{F40}-\x{F47}\x{F49}-\x{F69}\x{10A0}-\x{10C5}\x{10D0}-\x{10F6}\x{1100}\x{1102}-\x{1103}\x{1105}-\x{1107}\x{1109}\x{110B}-\x{110C}
\x{110E}-\x{1112}\x{113C}\x{113E}\x{1140}\x{114C}\x{114E}\x{1150}\x{1154}-\x{1155}\x{1159}\x{115F}-\x{1161}\x{1163}\x{1165}\x{1167}\x{1169}\x{116D}-\x{116E}\x{1172}-\x{1173}\x{1175}\x{119E}\x{11A8}\x{11AB}\x{11AE}-\x{11AF}\x{11B7}-\x{11B8}\x{11BA}\x{11BC}-\x{11C2}\x{11EB}\x{11F0}\x{11F9}\x{1E00}-\x{1E9B}\x{1EA0}-\x{1EF9}\x{1F00}-\x{1F15}\x{1F18}-\x{1F1D}\x{1F20}-\x{1F45}\x{1F48}-\x{1F4D}\x{1F50}-\x{1F57}\x{1F59}\x{1F5B}\x{1F5D}\x{1F5F}-\x{1F7D}\x{1F80}-\x{1FB4}\x{1FB6}-\x{1FBC}\x{1FBE}\x{1FC2}-\x{1FC4}\x{1FC6}-\x{1FCC}\x{1FD0}-\x{1FD3}\x{1FD6}-\x{1FDB}\x{1FE0}-\x{1FEC}\x{1FF2}-\x{1FF4}\x{1FF6}-\x{1FFC}\x{2126}\x{212A}-\x{212B}\x{212E}\x{2180}-\x{2182}\x{3041}-\x{3094}\x{30A1}-\x{30FA}\x{3105}-\x{312C}\x{AC00}-\x{D7A3}\x{4E00}-\x{9FA5}\x{3007}\x{3021}-\x{3029}\x{30}-\x{39}\x{660}-\x{669}\x{6F0}-\x{6F9}\x{966}-\x{96F}\x{9E6}-\x{9EF}\x{A66}-\x{A6F}\x{AE6}-\x{AEF}\x{B66}-\x{B6F}\x{BE7}-\x{BEF}\x{C66}-\x{C6F}\x{CE6}-\x{CEF}\x{D66}-\x{D6F}\x{E50}-\x{E59}\x{ED0}-\x{ED9}\x{F20}-\x{F29}\x{300}-\x{345}\x{360}-\x{361}\x{483}-\x{486}\x{591}-\x{5A1}\x{5A3}-\x{5B9}\x{5BB}-\x{5BD}\x{5BF}\x{5C1}-\x{5C2}\x{5C4}\x{64B}-\x{652}\x{670}\x{6D6}-\x{6DC}\x{6DD}-\x{6DF}\x{6E0}-\x{6E4}\x{6E7}-\x{6E8}\x{6EA}-\x{6ED}\x{901}-\x{903}\x{93C}\x{93E}-\x{94C}\x{94D}\x{951}-\x{954}\x{962}-\x{963}\x{981}-\x{983}\x{9BC}\x{9BE}\x{9BF}\x{9C0}-\x{9C4}\x{9C7}-\x{9C8}\x{9CB}-\x{9CD}\x{9D7}\x{9E2}-\x{9E3}\x{A02}\x{A3C}\x{A3E}\x{A3F}\x{A40}-\x{A42}\x{A47}-\x{A48}\x{A4B}-\x{A4D}\x{A70}-\x{A71}\x{A81}-\x{A83}\x{ABC}\x{ABE}-\x{AC5}\x{AC7}-\x{AC9}\x{ACB}-\x{ACD}\x{B01}-\x{B03}\x{B3C}\x{B3E}-\x{B43}\x{B47}-\x{B48}\x{B4B}-\x{B4D}\x{B56}-\x{B57}\x{B82}-\x{B83}\x{BBE}-\x{BC2}\x{BC6}-\x{BC8}\x{BCA}-\x{BCD}\x{BD7}\x{C01}-\x{C03}\x{C3E}-\x{C44}\x{C46}-\x{C48}\x{C4A}-\x{C4D}\x{C55}-\x{C56}\x{C82}-\x{C83}\x{CBE}-\x{CC4}\x{CC6}-\x{CC8}\x{CCA}-\x{CCD}\x{CD5}-\x{CD6}\x{D02}-\x{D03}\x{D3E}-\x{D43}\x{D46}-\x{D48}\x{D4A}-\x{D4D}\x{D57}\x{E31}\x{E34}-\x{E3A}\x{E47}-\x{E4E}\x{EB1}\x{EB4}-\x{EB9}\x{EBB}-\x{EBC}\x{EC8}-\x{ECD}\x{F1
8}-\x{F19}\x{F35}\x{F37}\x{F39}\x{F3E}\x{F3F}\x{F71}-\x{F84}\x{F86}-\x{F8B}\x{F90}-\x{F95}\x{F97}\x{F99}-\x{FAD}\x{FB1}-\x{FB7}\x{FB9}\x{20D0}-\x{20DC}\x{20E1}\x{302A}-\x{302F}\x{3099}\x{309A}\x{B7}\x{2D0}\x{2D1}\x{387}\x{640}\x{E46}\x{EC6}\x{3005}\x{3031}-\x{3035}\x{309D}-\x{309E}\x{30FC}-\x{30FE}]/u', $name, $m); + foreach (array_unique($m[0], \SORT_STRING) as $c) { + $o = (new UTF8($c))->nextCode(); + $esc = "U".str_pad(strtoupper(dechex($o)), 6, "0", \STR_PAD_LEFT); + $name = str_replace($c, $esc, $name); + } + // Apply stricter rules to the first character + if (preg_match('/^[^_\x{41}-\x{5A}\x{61}-\x{7A}\x{C0}-\x{D6}\x{D8}-\x{F6}\x{F8}-\x{FF}\x{100}-\x{131}\x{134}-\x{13E}\x{141}-\x{148}\x{14A}-\x{17E}\x{180}-\x{1C3}\x{1CD}-\x{1F0}\x{1F4}-\x{1F5}\x{1FA}-\x{217}\x{250}-\x{2A8}\x{2BB}-\x{2C1}\x{386}\x{388}-\x{38A}\x{38C}\x{38E}-\x{3A1}\x{3A3}-\x{3CE}\x{3D0}-\x{3D6}\x{3DA}\x{3DC}\x{3DE}\x{3E0}\x{3E2}-\x{3F3}\x{401}-\x{40C}\x{40E}-\x{44F}\x{451}-\x{45C}\x{45E}-\x{481}\x{490}-\x{4C4}\x{4C7}-\x{4C8}\x{4CB}-\x{4CC}\x{4D0}-\x{4EB}\x{4EE}-\x{4F5}\x{4F8}-\x{4F9}\x{531}-\x{556}\x{559}\x{561}-\x{586}\x{5D0}-\x{5EA}\x{5F0}-\x{5F2}\x{621}-\x{63A}\x{641}-\x{64A}\x{671}-\x{6B7}\x{6BA}-\x{6BE}\x{6C0}-\x{6CE}\x{6D0}-\x{6D3}\x{6D5}\x{6E5}-\x{6E6}\x{905}-\x{939}\x{93D}\x{958}-\x{961}\x{985}-\x{98C}\x{98F}-\x{990}\x{993}-\x{9A8}\x{9AA}-\x{9B0}\x{9B2}\x{9B6}-\x{9B9}\x{9DC}-\x{9DD}\x{9DF}-\x{9E1}\x{9F0}-\x{9F1}\x{A05}-\x{A0A}\x{A0F}-\x{A10}\x{A13}-\x{A28}\x{A2A}-\x{A30}\x{A32}-\x{A33}\x{A35}-\x{A36}\x{A38}-\x{A39}\x{A59}-\x{A5C}\x{A5E}\x{A72}-\x{A74}\x{A85}-\x{A8B}\x{A8D}\x{A8F}-\x{A91}\x{A93}-\x{AA8}\x{AAA}-\x{AB0}\x{AB2}-\x{AB3}\x{AB5}-\x{AB9}\x{ABD}\x{AE0}\x{B05}-\x{B0C}\x{B0F}-\x{B10}\x{B13}-\x{B28}\x{B2A}-\x{B30}\x{B32}-\x{B33}\x{B36}-\x{B39}\x{B3D}\x{B5C}-\x{B5D}\x{B5F}-\x{B61}\x{B85}-\x{B8A}\x{B8E}-\x{B90}\x{B92}-\x{B95}\x{B99}-\x{B9A}\x{B9C}\x{B9E}-\x{B9F}\x{BA3}-\x{BA4}\x{BA8}-\x{BAA}\x{BAE}-\x{BB5}\x{BB7}-\x{BB9}\x{C05}-\x{C0C}\x{C0E}-\x{C10}\x{C12}-\x{C28}\x{C2A}-\x{C33}\x
{C35}-\x{C39}\x{C60}-\x{C61}\x{C85}-\x{C8C}\x{C8E}-\x{C90}\x{C92}-\x{CA8}\x{CAA}-\x{CB3}\x{CB5}-\x{CB9}\x{CDE}\x{CE0}-\x{CE1}\x{D05}-\x{D0C}\x{D0E}-\x{D10}\x{D12}-\x{D28}\x{D2A}-\x{D39}\x{D60}-\x{D61}\x{E01}-\x{E2E}\x{E30}\x{E32}-\x{E33}\x{E40}-\x{E45}\x{E81}-\x{E82}\x{E84}\x{E87}-\x{E88}\x{E8A}\x{E8D}\x{E94}-\x{E97}\x{E99}-\x{E9F}\x{EA1}-\x{EA3}\x{EA5}\x{EA7}\x{EAA}-\x{EAB}\x{EAD}-\x{EAE}\x{EB0}\x{EB2}-\x{EB3}\x{EBD}\x{EC0}-\x{EC4}\x{F40}-\x{F47}\x{F49}-\x{F69}\x{10A0}-\x{10C5}\x{10D0}-\x{10F6}\x{1100}\x{1102}-\x{1103}\x{1105}-\x{1107}\x{1109}\x{110B}-\x{110C}\x{110E}-\x{1112}\x{113C}\x{113E}\x{1140}\x{114C}\x{114E}\x{1150}\x{1154}-\x{1155}\x{1159}\x{115F}-\x{1161}\x{1163}\x{1165}\x{1167}\x{1169}\x{116D}-\x{116E}\x{1172}-\x{1173}\x{1175}\x{119E}\x{11A8}\x{11AB}\x{11AE}-\x{11AF}\x{11B7}-\x{11B8}\x{11BA}\x{11BC}-\x{11C2}\x{11EB}\x{11F0}\x{11F9}\x{1E00}-\x{1E9B}\x{1EA0}-\x{1EF9}\x{1F00}-\x{1F15}\x{1F18}-\x{1F1D}\x{1F20}-\x{1F45}\x{1F48}-\x{1F4D}\x{1F50}-\x{1F57}\x{1F59}\x{1F5B}\x{1F5D}\x{1F5F}-\x{1F7D}\x{1F80}-\x{1FB4}\x{1FB6}-\x{1FBC}\x{1FBE}\x{1FC2}-\x{1FC4}\x{1FC6}-\x{1FCC}\x{1FD0}-\x{1FD3}\x{1FD6}-\x{1FDB}\x{1FE0}-\x{1FEC}\x{1FF2}-\x{1FF4}\x{1FF6}-\x{1FFC}\x{2126}\x{212A}-\x{212B}\x{212E}\x{2180}-\x{2182}\x{3041}-\x{3094}\x{30A1}-\x{30FA}\x{3105}-\x{312C}\x{AC00}-\x{D7A3}\x{4E00}-\x{9FA5}\x{3007}\x{3021}-\x{3029}]/u', $name, $m)) { + $c = (string) $m[0]; + $o = (new UTF8($c))->nextCode(); + $esc = "U".str_pad(strtoupper(dechex($o)), 6, "0", \STR_PAD_LEFT); + $name = $esc.substr($name, strlen($c)); + } + return $name; + }, $name)); } - protected function uncoerceName(string $name): string { + protected static function uncoerceName(string $name): string { preg_match_all('/U[0-9A-F]{6}/', $name, $m); foreach (array_unique($m[0], \SORT_STRING) as $o) { $c = UTF8::encode(hexdec(substr($o, 1))); @@ -59,7 +66,7 @@ trait NameCoercion { return $name; } - protected function escapeString(string $string, bool $attribute = false): string { + protected static function 
escapeString(string $string, bool $attribute = false): string { # Escaping a string (for the purposes of the algorithm above) consists of # running the following steps: diff --git a/lib/Parser/ParseErrorEmitter.php b/lib/Parser/ParseErrorEmitter.php index 68e5beb..316d344 100644 --- a/lib/Parser/ParseErrorEmitter.php +++ b/lib/Parser/ParseErrorEmitter.php @@ -78,7 +78,7 @@ trait ParseErrorEmitter { // Count the number of replacements needed in the message. $count = substr_count($message, '%s'); // If the number of replacements don't match the arguments then oops. - assert(count($arg) === $count, new \Exception("Parse error message expects $count parameters")); + assert(count($arg) === $count, new \Exception("Message of parse error $code expects $count parameters")); if ($count > 0) { // Convert newlines and tabs in the arguments to words to better @@ -89,7 +89,7 @@ trait ParseErrorEmitter { } elseif ($value === "\t") { return 'Tab'; } elseif ($value === null) { - return 'nothing'; + return 'nothing'; // @codeCoverageIgnore } else { return $value; } diff --git a/lib/Parser/Serializer.php b/lib/Parser/Serializer.php new file mode 100644 index 0000000..a3c430e --- /dev/null +++ b/lib/Parser/Serializer.php @@ -0,0 +1,232 @@ +namespaceURI ?? Parser::HTML_NAMESPACE, [Parser::HTML_NAMESPACE, Parser::SVG_NAMESPACE, Parser::MATHML_NAMESPACE])) { + $tagName = self::uncoerceName($n->localName); + } else { + $tagName = self::uncoerceName($n->tagName); + } + # Append a U+003C LESS-THAN SIGN character (<), followed by tagname. + $s .= "<$tagName"; + # If current node's is value is not null, and the element does + # not have an is attribute in its attribute list, then + # append the string " is="", followed by current node's is + # value escaped as described below in attribute mode, + # followed by a U+0022 QUOTATION MARK character ("). 
+ // DEVIATION: We don't support custom elements + # For each attribute that the element has, append a + # U+0020 SPACE character, the attribute's serialized name as + # described below, a U+003D EQUALS SIGN character (=), a + # U+0022 QUOTATION MARK character ("), the attribute's + # value, escaped as described below in attribute mode, and + # a second U+0022 QUOTATION MARK character ("). + foreach ($n->attributes as $a) { + # An attribute's serialized name for the purposes of the previous + # paragraph must be determined as follows: + + # If the attribute has no namespace + if ($a->namespaceURI === null) { + # The attribute's serialized name is the attribute's local name. + $name = self::uncoerceName($a->localName); + } + # If the attribute is in the XML namespace + elseif ($a->namespaceURI === Parser::XML_NAMESPACE) { + # The attribute's serialized name is the string "xml:" followed + # by the attribute's local name. + $name = "xml:".self::uncoerceName($a->localName); + } + # If the attribute is in the XMLNS namespace... + elseif ($a->namespaceURI === Parser::XMLNS_NAMESPACE) { + # ... and the attribute's local name is xmlns + if ($a->localName === "xmlns") { + # The attribute's serialized name is the string "xmlns". + $name = "xmlns"; + } + # ... and the attribute's local name is not xmlns + else { + # The attribute's serialized name is the string "xmlns:" + # followed by the attribute's local name. + $name = "xmlns:".self::uncoerceName($a->localName); + } + } + # If the attribute is in the XLink namespace + elseif ($a->namespaceURI === Parser::XLINK_NAMESPACE) { + # The attribute's serialized name is the string "xlink:" + # followed by the attribute's local name. + $name = "xlink:".self::uncoerceName($a->localName); + } + # If the attribute is in some other namespace + else { + # The attribute's serialized name is the attribute's qualified name. + $name = ($a->prefix !== "") ? 
$a->prefix.":".$a->name : $a->name; + } + $value = self::escapeString((string) $a->value, true); + $s .= " $name=\"$value\""; + } + # Append a U+003E GREATER-THAN SIGN character (>). + $s .= ">"; + # If current node serializes as void, then continue on to the + # next child node at this point. + # Append the value of running the HTML fragment serialization + # algorithm on the current node element (thus recursing into + # this algorithm for that element), followed by a + # U+003C LESS-THAN SIGN character (<), a U+002F SOLIDUS + # character (/), tagname again, and finally a + # U+003E GREATER-THAN SIGN character (>). + if (($n->namespaceURI ?? Parser::HTML_NAMESPACE) !== Parser::HTML_NAMESPACE || !in_array($tagName, self::VOID_ELEMENTS)) { + # If the node is a template element, then let the node instead + # be the template element's template contents + # (a DocumentFragment node). + if ( + ($n->namespaceURI ?? Parser::HTML_NAMESPACE) === Parser::HTML_NAMESPACE + && $n->tagName === "template" + && property_exists($n, "content") + && $n->content instanceof \DOMDocumentFragment + ) { + // NOTE: Treat template content as any other document + // fragment and just invoke the inner serializer + $s .= self::serializeInner($n->content).""; + } elseif ($n->hasChildNodes()) { + // If the element has children, store its tag name and + // continue the loop with its first child; its end + // tag will be written out further down + $stack[] = $tagName; + $n = $n->firstChild; + continue; + } else { + // Otherwise just append the end tag now + $s .= ""; + } + } + } + # If current node is a Text node + elseif ($n instanceof \DOMText) { + # If the parent of current node is a style, script, xmp, + # iframe, noembed, noframes, or plaintext element, or + # if the parent of current node is a noscript element + # and scripting is enabled for the node, then append + # the value of current node's data IDL attribute literally. 
+ $p = $n->parentNode; + if ($p instanceof \DOMElement && ($p->namespaceURI ?? Parser::HTML_NAMESPACE) === Parser::HTML_NAMESPACE && in_array($p->tagName, self::RAWTEXT_ELEMENTS)) { + // NOTE: scripting is assumed not to be enabled + $s .= $n->data; + } + # Otherwise, append the value of current node's data IDL attribute, escaped as described below. + else { + $s .= self::escapeString($n->data); + } + } + # If current node is a Comment + elseif ($n instanceof \DOMComment) { + # Append the literal string "" (U+002D HYPHEN-MINUS, U+002D HYPHEN-MINUS, + # U+003E GREATER-THAN SIGN). + $s .= ""; + } + # If current node is a ProcessingInstruction + elseif ($n instanceof \DOMProcessingInstruction) { + # Append the literal string "). + $s .= "target)." ".$n->data.">"; + } + # If current node is a DocumentType + elseif ($n instanceof \DOMDocumentType) { + # Append the literal string "" (U+003E GREATER-THAN SIGN). + $s .= "name).">"; + } + // NOTE: Documents and document fragments have no outer content, + // so we can just serialize the inner content + elseif ($n instanceof \DOMDocument || $n instanceof \DOMDocumentFragment) { + return self::serializeInner($n); + } else { + throw new Exception(Exception::UNSUPPORTED_NODE_TYPE, [get_class($n)]); + } + // If the current node has no more siblings, go up the tree till a + // sibling is found or we've reached the original node + while (!$n->nextSibling && $stack) { + // Write out the stored end tag each time we go up the tree + $tagName = array_pop($stack); + $s .= ""; + $n = $n->parentNode; + } + $n = $n->nextSibling; + } while ($stack); // Loop until we have traversed the subtree of the target node in full + return $s; + } + + /** Serializes the children of an HTML DOM node to a string. 
This is equivalent to the innerHTML getter + * + * @param \DOMDocument|\DOMElement|\DOMDocumentFragment $node The node to serialize + */ + public static function serializeInner(\DOMNode $node): string { + # Let s be a string, and initialize it to the empty string. + $s = ""; + + if ($node instanceof \DOMElement && ($node->namespaceURI ?? Parser::HTML_NAMESPACE) === Parser::HTML_NAMESPACE) { + # If the node serializes as void, then return the empty string. + if (in_array($node->tagName, self::VOID_ELEMENTS)) { + return ""; + } + # If the node is a template element, then let the node instead + # be the template element's template contents + # (a DocumentFragment node). + elseif ($node->tagName === "template" && property_exists($node, "content") && $node->content instanceof \DOMDocumentFragment) { + // NOTE: template elements won't necessarily have a content + // property because PHP's DOM does not support this natively + $node = $node->content; + } + } + if ($node instanceof \DOMElement || $node instanceof \DOMDocument || $node instanceof \DOMDocumentFragment) { + # For each child node of the node, in tree order, run the following steps: + // NOTE: the steps in question are implemented in the "serializeOuter" routine + foreach ($node->childNodes as $n) { + $s .= self::serializeOuter($n); + } + } else { + throw new Exception(Exception::UNSUPPORTED_NODE_TYPE, [get_class($node)]); + } + return $s; + } +} diff --git a/lib/Parser/TreeConstructor.php b/lib/Parser/TreeConstructor.php index f0b2a22..b8fa91f 100644 --- a/lib/Parser/TreeConstructor.php +++ b/lib/Parser/TreeConstructor.php @@ -9,7 +9,7 @@ namespace MensBeam\HTML\Parser; use MensBeam\HTML\Parser; class TreeConstructor { - use ParseErrorEmitter, NameCoercion; + use ParseErrorEmitter, NameCoercion, AttributeSetter; public $debugLog = ""; @@ -335,12 +335,6 @@ class TreeConstructor { $iterations = 0; $insertionMode = $this->insertionMode; - // If element name coercison has occurred at some earlier point, - // we 
must coerce all end tag names to match mangled start tags - if ($token instanceof EndTagToken && $this->mangledElements) { - $token->name = $this->coerceName($token->name); - } - # 13.2.6 Tree construction # # As each token is emitted from the tokenizer, the user agent must follow the @@ -379,6 +373,12 @@ class TreeConstructor { return true; })()); + // If element name coercison has occurred at some earlier point, + // we must coerce all end tag names to match mangled start tags + if ($this->mangledElements && $token instanceof EndTagToken) { + $token->name = self::coerceName($token->name); + } + # 13.2.6.4. The rules for parsing tokens in HTML content // OPTIMIZATION: Evaluation the "in body" mode first is // faster for typical documents @@ -401,7 +401,7 @@ class TreeConstructor { // If attribute name coercison has occurred at some earlier point, // we must coerce all attributes on html and body start tags in // case they are relocated to existing elements - $attrName = $this->mangledAttributes ? $this->coerceName($a->name) : $a->name; + $attrName = $this->mangledAttributes ? self::coerceName($a->name) : $a->name; if (!$top->hasAttributeNS(null, $attrName)) { $this->elementSetAttribute($top, null, $attrName, $a->value); } @@ -433,7 +433,7 @@ class TreeConstructor { // If attribute name coercison has occurred at some earlier point, // we must coerce all attributes on html and body start tags in // case they are relocated to existing elements - $attrName = $this->mangledAttributes ? $this->coerceName($a->name) : $a->name; + $attrName = $this->mangledAttributes ? 
self::coerceName($a->name) : $a->name; if (!$body->hasAttributeNS(null, $attrName)) { $this->elementSetAttribute($body, null, $attrName, $a->value); } @@ -521,7 +521,8 @@ class TreeConstructor { if (strlen($nextToken->data) === 1 && $nextToken->data === "\n") { continue; } elseif (strpos($nextToken->data, "\n") === 0) { - $nextToken->data = substr($nextToken->data, 1); + // NOTE: This case is not currently encountered by the parser due to special handling of newlines + $nextToken->data = substr($nextToken->data, 1); // @codeCoverageIgnore } } // Process the next token @@ -818,7 +819,8 @@ class TreeConstructor { if (strlen($nextToken->data) === 1 && $nextToken->data === "\n") { continue; } elseif (strpos($nextToken->data, "\n") === 0) { - $nextToken->data = substr($nextToken->data, 1); + // NOTE: This case is not currently encountered by the parser due to special handling of newlines + $nextToken->data = substr($nextToken->data, 1); // @codeCoverageIgnore } } # Let the original insertion mode be the current insertion mode. @@ -1065,7 +1067,7 @@ class TreeConstructor { else { # 1. If the stack of open elements does not have a form element in scope, then # this is a parse error; return and ignore the token. - if ($this->stack->hasElementInScope('form')) { + if (!$this->stack->hasElementInScope('form')) { $this->error(ParseError::UNEXPECTED_END_TAG, $token->name); continue; } @@ -1918,10 +1920,6 @@ class TreeConstructor { elseif ($token->name === 'noframes' || $token->name === 'style') { $this->parseGenericRawText($token); } - elseif ($token->name === 'noscript') { - $this->insertStartTagToken($token); - $this->insertionMode = self::IN_HEAD_NOSCRIPT_MODE; - } elseif ($token->name === 'script') { $this->insertStartTagToken($token); $this->tokenizer->state = Tokenizer::SCRIPT_DATA_STATE; @@ -2714,7 +2712,9 @@ class TreeConstructor { # element in table scope, then this is a parse error; # ignore the token. 
(fragment case) if (!$this->stack->hasElementInTableScope("td", "th")) { - $this->error(ParseError::UNEXPECTED_START_TAG, $token->name); + // NOTE: This case appears to be unreachable + // See https://github.com/whatwg/html/issues/7242 + $this->error(ParseError::UNEXPECTED_START_TAG, $token->name); //@codeCoverageIgnore } # Otherwise, close the cell (see below) and reprocess the token. else { @@ -2861,7 +2861,7 @@ class TreeConstructor { # An end tag... elseif ($token instanceof EndTagToken) { # An end tag whose tag name is "template" - if ($token->name === "tenplate") { + if ($token->name === "template") { # Process the token using the rules for the "in head" insertion mode. $insertionMode = self::IN_HEAD_MODE; goto ProcessToken; @@ -3262,10 +3262,8 @@ class TreeConstructor { # Anything else else { # Parse error. Ignore the token. - assert($token instanceof CharacterToken || $token instanceof TagToken, new \Exception("Invalid token class: ".get_class($token))); - if ($token instanceof StartTagToken) { - $this->error(ParseError::UNEXPECTED_START_TAG, $token->name); - } elseif ($token instanceof EndTagToken) { + assert($token instanceof CharacterToken || $token instanceof EndTagToken, new \Exception("Invalid token class: ".get_class($token))); + if ($token instanceof EndTagToken) { $this->error(ParseError::UNEXPECTED_END_TAG, $token->name); } elseif ($token instanceof CharacterToken) { $this->error(ParseError::UNEXPECTED_CHAR, $token->data, "exclude whitespace"); @@ -3434,6 +3432,12 @@ class TreeConstructor { return true; })()); + // If element name coercison has occurred at some earlier point, + // we must coerce all end tag names to match mangled start tags + if ($this->mangledElements && $token instanceof EndTagToken) { + $token->name = self::coerceName($token->name, true); + } + # 13.2.6.5 The rules for parsing tokens in foreign content # # When the user agent is to apply the rules for parsing tokens in foreign @@ -3587,7 +3591,8 @@ class TreeConstructor { $node 
= $this->stack[$pos]; # If node's tag name, converted to ASCII lowercase, is not the # same as the tag name of the token, then this is a parse error. - if (strtolower($node->nodeName) !== $token->name) { + $nodeName = self::coerceName(strtolower(self::uncoerceName($node->nodeName)), true); + if ($nodeName !== $token->name) { $this->error(ParseError::UNEXPECTED_END_TAG, $token->name); } do { @@ -3598,7 +3603,8 @@ class TreeConstructor { # If node's tag name, converted to ASCII lowercase, is the same as the # tag name of the token, pop elements from the stack of open elements until node # has been popped from the stack, and then abort these steps. - if (strtolower($node->nodeName) === $token->name) { + $nodeName = self::coerceName(strtolower(self::uncoerceName($node->nodeName)), true); + if ($nodeName === $token->name) { $this->stack->popUntilSame($node); continue 2; } @@ -3673,14 +3679,8 @@ class TreeConstructor { // NOTE: The "entry above" refers to the "in body" insertion mode // Changes here should be mirrored there foreach ($this->stack as $node) { - if ($node->nodeName === $token->name && $node->namespaceURI === $this->htmlNamespace) { - $this->stack->generateImpliedEndTags($token->name); - if (!$node->isSameNode($this->stack->currentNode)) { - $this->error($errorCode, $token->name); - } - $this->stack->popUntilSame($node); - return; - } elseif ($this->isElementSpecial($node)) { + // NOTE: Only the "is special" case is possible here + if ($this->isElementSpecial($node)) { $this->error($errorCode, $token->name); return; } @@ -3890,12 +3890,15 @@ class TreeConstructor { // Abort! } else { + // NOTE: This is an edge case only possible via scripting + // @codeCoverageIgnoreStart # 6. Let previous element be the element immediately above last table in the # stack of open elements. $previousElement = $this->stack[$lastTableIndex - 1]; # 7. Let adjusted insertion location be inside previous element, after its last # child (if any). 
$insertionLocation = $previousElement; + // @codeCoverageIgnoreEnd } } # Otherwise let adjusted insertion location be inside target, after its last @@ -4150,7 +4153,7 @@ class TreeConstructor { # 17. Let node now be the node before node in the stack of open elements. # 18. Return to the step labeled Loop. } - } + } // @codeCoverageIgnore protected function closePElement(TagToken $token) { # When the steps above say the UA is to close a p element, it means that the UA @@ -4216,11 +4219,7 @@ class TreeConstructor { // The element name is invalid for XML // Replace any offending characters with "UHHHHHH" where H are the // uppercase hexadecimal digits of the character's code point - if ($namespace !== $this->htmlNamespace) { - $qualifiedName = implode(":", array_map([$this, "coerceName"], explode(":", $token->name, 2))); - } else { - $qualifiedName = $this->coerceName($token->name); - } + $qualifiedName = self::coerceName($token->name, ($namespace !== $this->htmlNamespace)); $element = $this->DOM->createElementNS($namespace, $qualifiedName); $this->mangledElements = true; } @@ -4246,47 +4245,6 @@ class TreeConstructor { return $element; } - public function elementSetAttribute(\DOMElement $element, ?string $namespaceURI, string $qualifiedName, string $value): void { - if ($namespaceURI === Parser::XMLNS_NAMESPACE) { - // NOTE: We create attribute nodes so that xmlns attributes - // don't get lost; otherwise they cannot be serialized - try { - $a = @$element->ownerDocument->createAttributeNS($namespaceURI, $qualifiedName); - } catch (\DOMException $e) { - // FIXME: PHP has a fit here if the document element has a namespace and no prefix - // A workaround does not seem to exist - return; - } - if ($a === false) { - // The document element does not exist yet, so we need - // to insert this element into the document - $element->ownerDocument->appendChild($element); - $a = $element->ownerDocument->createAttributeNS($namespaceURI, $qualifiedName); - 
$element->ownerDocument->removeChild($element); - } - $a->value = $this->escapeString($value, true); - $element->setAttributeNodeNS($a); - } else { - try { - $element->setAttributeNS($namespaceURI, $qualifiedName, $value); - } catch (\DOMException $e) { - // The attribute name is invalid for XML - // Replace any offending characters with "UHHHHHH" where H are the - // uppercase hexadecimal digits of the character's code point - if ($namespaceURI !== null) { - $qualifiedName = implode(":", array_map([$this, "coerceName"], explode(":", $qualifiedName, 2))); - } else { - $qualifiedName = $this->coerceName($qualifiedName); - } - $element->setAttributeNS($namespaceURI, $qualifiedName, $value); - $this->mangledAttributes = true; - } - if ($qualifiedName === "id" && $namespaceURI === null) { - $element->setIdAttribute($qualifiedName, true); - } - } - } - public function isMathMLTextIntegrationPoint(\DOMElement $e): bool { return ($e->namespaceURI === Parser::MATHML_NAMESPACE && (in_array($e->nodeName, ['mi', 'mo', 'mn', 'ms', 'mtext']))); } diff --git a/lib/Parser/ctype.php b/lib/Parser/ctype.php index cd9d177..bae6e7b 100644 --- a/lib/Parser/ctype.php +++ b/lib/Parser/ctype.php @@ -11,6 +11,7 @@ namespace MensBeam\HTML\Parser; // replacement, as they are designed only to evaluate // single characters +// @codeCoverageIgnoreStart if (!extension_loaded("ctype")) { function ctype_alnum(string $str): bool { return 
["a"=>true,"b"=>true,"c"=>true,"d"=>true,"e"=>true,"f"=>true,"g"=>true,"h"=>true,"i"=>true,"j"=>true,"k"=>true,"l"=>true,"m"=>true,"n"=>true,"o"=>true,"p"=>true,"q"=>true,"r"=>true,"s"=>true,"t"=>true,"u"=>true,"v"=>true,"w"=>true,"x"=>true,"y"=>true,"z"=>true,"A"=>true,"B"=>true,"C"=>true,"D"=>true,"E"=>true,"F"=>true,"G"=>true,"H"=>true,"I"=>true,"J"=>true,"K"=>true,"L"=>true,"M"=>true,"N"=>true,"O"=>true,"P"=>true,"Q"=>true,"R"=>true,"S"=>true,"T"=>true,"U"=>true,"V"=>true,"W"=>true,"X"=>true,"Y"=>true,"Z"=>true,"0"=>true,"1"=>true,"2"=>true,"3"=>true,"4"=>true,"5"=>true,"6"=>true,"7"=>true,"8"=>true,"9"=>true][$str] ?? false; @@ -32,3 +33,4 @@ if (!extension_loaded("ctype")) { return ["a"=>true,"b"=>true,"c"=>true,"d"=>true,"e"=>true,"f"=>true,"A"=>true,"B"=>true,"C"=>true,"D"=>true,"E"=>true,"F"=>true,"0"=>true,"1"=>true,"2"=>true,"3"=>true,"4"=>true,"5"=>true,"6"=>true,"7"=>true,"8"=>true,"9"=>true][$str] ?? false; } } +// @codeCoverageIgnoreEnd diff --git a/tests/cases/TestCharset.php b/tests/cases/TestCharset.php index 144fcfb..639062e 100644 --- a/tests/cases/TestCharset.php +++ b/tests/cases/TestCharset.php @@ -117,6 +117,28 @@ class TestCharset extends \PHPUnit\Framework\TestCase { $this->assertSame($exp, $act->encoding); } + /** + * @dataProvider provideNonstandardDeclarationTests + * @covers \MensBeam\HTML\Parser\Data::__construct */ + public function testNonstandardDeclarationTests(string $data, ?string $charset, ?string $fallback, int $bytesToScan, string $exp): void { + $config = new Config; + $config->encodingPrescanBytes = $bytesToScan; + $config->encodingFallback = $fallback; + $act = Parser::parse($data, $charset, $config); + $this->assertSame($exp, $act->encoding); + } + + public function provideNonstandardDeclarationTests(): iterable { + return [ + ["", null, null, 1024, "windows-1252"], + ["", null, null, 1024, "UTF-8"], + ["", null, "UTF-8", 1024, "UTF-8"], + ["", null, "UTF-7", 1024, "windows-1252"], + ]; + } + public function 
provideStandardDeclarationTests() { $tests = []; $blacklist = ["xmldecl-3.html"]; diff --git a/tests/cases/TestParser.php b/tests/cases/TestParser.php index e49d99c..9ccf580 100644 --- a/tests/cases/TestParser.php +++ b/tests/cases/TestParser.php @@ -13,6 +13,7 @@ use MensBeam\HTML\Parser\Exception; /** * @covers \MensBeam\HTML\Parser + * @covers \MensBeam\HTML\Parser\Exception */ class TestParser extends \PHPUnit\Framework\TestCase { public function testParseADocument(): void { @@ -33,6 +34,15 @@ class TestParser extends \PHPUnit\Framework\TestCase { $this->assertInstanceOf(\DOMDocumentFragment::class, $out); } + /** @covers \MensBeam\HTML\Parser\TreeConstructor::__construct */ + public function testParseAFragmentWithBogusQuirksMode(): void { + $doc = new \DOMDocument(); + $context = $doc->createElement("div"); + $in = "hello world!"; + $this->expectExceptionObject(new Exception(Exception::INVALID_QUIRKS_MODE)); + Parser::parseFragment($context, -1, $in, "tex/html; charset=utf8"); + } + public function testParseADocumentReportingErrors(): void { $in = "hello world!"; $conf = new Config; diff --git a/tests/cases/TestSerializer.php b/tests/cases/TestSerializer.php new file mode 100644 index 0000000..522867d --- /dev/null +++ b/tests/cases/TestSerializer.php @@ -0,0 +1,258 @@ +buildTree($data, $fragment); + $this->assertSame($exp, Serializer::serializeOuter($node)); + } + + public function provideStandardTreeTests(): iterable { + $blacklist = []; + $files = new \AppendIterator(); + $files->append(new \GlobIterator(\MensBeam\HTML\Parser\BASE."tests/cases/serializer/*.dat", \FilesystemIterator::SKIP_DOTS | \FilesystemIterator::CURRENT_AS_PATHNAME)); + foreach ($files as $file) { + if (!in_array(basename($file), $blacklist)) { + yield from $this->parseTreeTestFile($file); + } + } + } + + /** @dataProvider provideTemplateTests */ + public function testSerializeADecoratedTemplate(?string $ns, bool $content, bool $fragment, bool $text, string $exp): void { + $d = new 
\DOMDocument; + $t = $d->createElementNS($ns, "template"); + $t->appendChild($d->createTextNode("EEK")); + if ($content) { + $t->content = null; + if ($fragment) { + $f = $d->createDocumentFragment(); + $t->content = $f; + if ($text) { + $f->appendChild($d->createTextNode("OOK")); + } + } + } + $exp1 = $exp; + $exp2 = ""; + $this->assertSame($exp1, Serializer::serializeInner($t)); + $this->assertSame($exp2, Serializer::serializeOuter($t)); + } + + public function provideTemplateTests(): iterable { + return [ + [null, false, false, false, "EEK"], + [null, true, false, false, "EEK"], + [null, true, true, false, ""], + [null, true, true, true, "OOK"], + [Parser::HTML_NAMESPACE, false, false, false, "EEK"], + [Parser::HTML_NAMESPACE, true, false, false, "EEK"], + [Parser::HTML_NAMESPACE, true, true, false, ""], + [Parser::HTML_NAMESPACE, true, true, true, "OOK"], + ]; + } + + /** @dataProvider provideEmptyElementTests */ + public function testInnerSerializeEmptyElement(string $tagName, ?string $ns, string $exp): void { + $d = new \DOMDocument; + $e = $d->createElementNS($ns, $tagName); + $e->appendChild($d->createTextNode("EEK")); + $this->assertSame($exp, Serializer::serializeInner($e)); + } + + public function provideEmptyElementTests(): iterable { + return [ + ["basefont", null, ""], + ["bgsound", null, ""], + ["frame", null, ""], + ["keygen", null, ""], + ["area", null, ""], + ["base", null, ""], + ["br", null, ""], + ["col", null, ""], + ["embed", null, ""], + ["hr", null, ""], + ["img", null, ""], + ["input", null, ""], + ["link", null, ""], + ["meta", null, ""], + ["param", null, ""], + ["source", null, ""], + ["track", null, ""], + ["wbr", null, ""], + ["basefont", Parser::HTML_NAMESPACE, ""], + ["bgsound", Parser::HTML_NAMESPACE, ""], + ["frame", Parser::HTML_NAMESPACE, ""], + ["keygen", Parser::HTML_NAMESPACE, ""], + ["area", Parser::HTML_NAMESPACE, ""], + ["base", Parser::HTML_NAMESPACE, ""], + ["br", Parser::HTML_NAMESPACE, ""], + ["col", 
Parser::HTML_NAMESPACE, ""], + ["embed", Parser::HTML_NAMESPACE, ""], + ["hr", Parser::HTML_NAMESPACE, ""], + ["img", Parser::HTML_NAMESPACE, ""], + ["input", Parser::HTML_NAMESPACE, ""], + ["link", Parser::HTML_NAMESPACE, ""], + ["meta", Parser::HTML_NAMESPACE, ""], + ["param", Parser::HTML_NAMESPACE, ""], + ["source", Parser::HTML_NAMESPACE, ""], + ["track", Parser::HTML_NAMESPACE, ""], + ["wbr", Parser::HTML_NAMESPACE, ""], + ["basefont", Parser::SVG_NAMESPACE, "EEK"], + ["bgsound", Parser::SVG_NAMESPACE, "EEK"], + ["frame", Parser::SVG_NAMESPACE, "EEK"], + ["keygen", Parser::SVG_NAMESPACE, "EEK"], + ["area", Parser::SVG_NAMESPACE, "EEK"], + ["base", Parser::SVG_NAMESPACE, "EEK"], + ["br", Parser::SVG_NAMESPACE, "EEK"], + ["col", Parser::SVG_NAMESPACE, "EEK"], + ["embed", Parser::SVG_NAMESPACE, "EEK"], + ["hr", Parser::SVG_NAMESPACE, "EEK"], + ["img", Parser::SVG_NAMESPACE, "EEK"], + ["input", Parser::SVG_NAMESPACE, "EEK"], + ["link", Parser::SVG_NAMESPACE, "EEK"], + ["meta", Parser::SVG_NAMESPACE, "EEK"], + ["param", Parser::SVG_NAMESPACE, "EEK"], + ["source", Parser::SVG_NAMESPACE, "EEK"], + ["track", Parser::SVG_NAMESPACE, "EEK"], + ["wbr", Parser::SVG_NAMESPACE, "EEK"], + ]; + } + + public function testOuterSerializeAnInvalidNode(): void { + $d = new \DOMDocument; + $a = $d->createAttribute("oops"); + $this->expectExceptionObject(new Exception(Exception::UNSUPPORTED_NODE_TYPE, [\DOMAttr::class])); + Serializer::serializeOuter($a); + } + + public function testInnerSerializeAnInvalidNode(): void { + $d = new \DOMDocument; + $t = $d->createTextNode("OOPS"); + $this->expectExceptionObject(new Exception(Exception::UNSUPPORTED_NODE_TYPE, [\DOMText::class])); + Serializer::serializeInner($t); + } + + protected function buildTree(array $data, bool $fragment, bool $formatOutput = false): \DOMNode { + $document = new \DOMDocument; + $document->formatOutput = $formatOutput; + if ($fragment) { + $document->appendChild($document->createElement("html")); + $out = 
$document->createDocumentFragment(); + } else { + $out = $document; + } + $cur = $out; + $pad = 2; + // process each line in turn + for ($l = 0; $l < sizeof($data); $l++) { + preg_match('/^(\|\s+)(.+)/', $data[$l], $m); + // pop any parents as long as the padding of the line is less than the expected padding + $p = strlen((string) $m[1]); + assert($p >= 2 && $p <= $pad && !($p % 2), new \Exception("Input data is invalid on line ".($l + 1))); + while ($p < $pad) { + $pad -= 2; + $cur = $cur->parentNode; + } + // act based upon what the rest of the line looks like + $d = $m[2]; + if (preg_match('/^$/', $d, $m)) { + // comment + $cur->appendChild($document->createComment($m[1])); + } elseif (preg_match('/^]*)(?: "([^"]*)" "([^"]*)")?)?>$/', $d, $m)) { + // doctype + $name = strlen((string) ($m[1] ?? "")) ? $m[1] : " "; + $public = strlen((string) ($m[2] ?? "")) ? $m[2] : ""; + $system = strlen((string) ($m[3] ?? "")) ? $m[3] : ""; + $cur->appendChild($document->implementation->createDocumentType($name, $public, $system)); + } elseif (preg_match('/^<\?([^ ]+) ([^>]*)>$/', $d, $m)) { + // processing instruction + $cur->appendChild($document->createProcessingInstruction($m[1], $m[2])); + } elseif (preg_match('/^<(?:([^ ]+) )?([^>]+)>$/', $d, $m)) { + // element + $ns = strlen((string) $m[1]) ? (array_flip(Parser::NAMESPACE_MAP)[$m[1]] ?? $m[1]) : null; + $cur = $cur->appendChild($document->createElementNS($ns, self::coerceName($m[2]))); + $pad += 2; + } elseif (preg_match('/^(?:([^" ]+) )?([^"=]+)="((?:[^"]|"(?!$))*)"$/', $d, $m)) { + // attribute + $ns = strlen((string) $m[1]) ? (array_flip(Parser::NAMESPACE_MAP)[$m[1]] ?? 
$m[1]) : ""; + $this->elementSetAttribute($cur, $ns, $m[2], $m[3]); + } elseif (preg_match('/^"((?:[^"]|"(?!$))*)("?)$/', $d, $m)) { + // text + $t = $m[1]; + while (!strlen((string) $m[2])) { + preg_match('/^((?:[^"]|"(?!$))*)("?)$/', $data[++$l], $m); + $t .= "\n".$m[1]; + } + $cur->appendChild($document->createTextNode($t)); + } else { + throw new \Exception("Input data is invalid on line ".($l + 1)); + } + } + return $out; + } + + protected function parseTreeTestFile(string $file): \Generator { + $index = 0; + $l = 0; + $lines = array_map(function($v) { + return rtrim($v, "\n"); + }, file($file)); + while ($l < sizeof($lines)) { + $pos = $l + 1; + assert(in_array($lines[$l], ["#document", "#fragment"]), new \Exception("Test $file #$index does not start with #document or #fragment tag at line ".($l + 1))); + $fragment = $lines[$l] === "#fragment"; + // collect the test input + $data = []; + for (++$l; $l < sizeof($lines); $l++) { + if (preg_match('/^#(script-(on|off)|output)$/', $lines[$l])) { + break; + } + $data[] = $lines[$l]; + } + // set the script mode, if present + assert(preg_match('/^#(script-(on|off)|output)$/', $lines[$l]) === 1, new \Exception("Test $file #$index follows data with something other than script flag or output at line ".($l + 1))); + $script = null; + if ($lines[$l] === "#script-off") { + $script = false; + $l++; + } elseif ($lines[$l] === "#script-on") { + $script = true; + $l++; + } + // collect the output string + $exp = []; + assert($lines[$l] === "#output", new \Exception("Test $file #$index follows input with something other than output at line ".($l + 1))); + for (++$l; $l < sizeof($lines); $l++) { + if ($lines[$l] === "" && in_array(($lines[$l + 1] ?? 
""), ["#document", "#fragment"])) { + break; + } + assert(preg_match('/^([^#]|$)/', $lines[$l]) === 1, new \Exception("Test $file #$index contains unrecognized data after output at line ".($l + 1))); + $exp[] = $lines[$l]; + } + $exp = implode("\n", $exp); + if (!$script) { + yield basename($file)." #$index (line $pos)" => [$data, $fragment, $exp]; + } + $l++; + $index++; + } + } +} diff --git a/tests/cases/TestTokenizer.php b/tests/cases/TestTokenizer.php index f76f1a6..5771cf6 100644 --- a/tests/cases/TestTokenizer.php +++ b/tests/cases/TestTokenizer.php @@ -44,10 +44,10 @@ class TestTokenizer extends \PHPUnit\Framework\TestCase { ]; /** @dataProvider provideStandardTokenizerTests */ - public function testStandardTokenizerTests(string $input, array $expected, int $state, string $open = null, array $expErrors) { + public function testStandardTokenizerTests(string $input, array $expected, int $state, ?string $open, ?array $expErrors) { $config = new Config; $config->encodingFallback = "UTF-8"; - $errorHandler = new ParseError; + $errorHandler = ($expErrors !== null) ? new ParseError : null; // initialize a stack of open elements, possibly with an open element $stack = new OpenElementsStack(null); if ($open) { @@ -71,12 +71,31 @@ class TestTokenizer extends \PHPUnit\Framework\TestCase { } } finally { $actual = $this->normalizeTokens($actual); - $errors = $this->formatErrors($errorHandler->errors); $this->assertEquals($expected, $actual, $tokenizer->debugLog); + $errors = ($expErrors !== null) ? 
$this->formatErrors($errorHandler->errors) : null; $this->assertEquals($expErrors, $errors, $tokenizer->debugLog); } } + /** + * @dataProvider provideStandardTokenizerTests + * @depends testStandardTokenizerTests + */ + public function testStandardTokenizerTestsWithoutErrorReporting(string $input, array $expected, int $state, ?string $open, array $expErrors) { + $this->testStandardTokenizerTests($input, $expected, $state, $open, null); + } + + /** @dataProvider provideNonstandardTokenizerTests */ + public function testNonstandardTokenizerTests(string $input, array $expected, int $state, ?string $open, array $expErrors) { + $this->testStandardTokenizerTests($input, $expected, $state, $open, $expErrors); + } + + public function provideNonstandardTokenizerTests(): iterable { + return [ + ["\xFF", [new CharacterToken("\u{FFFD}"), new EOFToken], Tokenizer::DATA_STATE, "", [['code' => "noncharacter-in-input-stream", 'line' => 1, 'col' => 1]]], + ]; + } + public function provideStandardTokenizerTests() { $tests = []; $blacklist = ["xmlViolation.test"]; diff --git a/tests/cases/TestTreeConstructor.php b/tests/cases/TestTreeConstructor.php index bf377d4..fcd06e6 100644 --- a/tests/cases/TestTreeConstructor.php +++ b/tests/cases/TestTreeConstructor.php @@ -16,6 +16,7 @@ use MensBeam\HTML\Parser\Tokenizer; use MensBeam\HTML\Parser\TreeConstructor; /** + * @covers \MensBeam\HTML\Parser\Data * @covers \MensBeam\HTML\Parser\Tokenizer * @covers \MensBeam\HTML\Parser\TreeConstructor * @covers \MensBeam\HTML\Parser\ActiveFormattingElementsList @@ -122,11 +123,11 @@ class TestTreeConstructor extends \PHPUnit\Framework\TestCase { } protected function patchTest(string $data, $fragment, array $errors, array $exp): array { - // When using the HTML namespace, xmlns attribute cannot be inserted due to a PHP limitation + // When using the HTML namespace, xmlns attributes lose their namespace due to a PHP limitation if ($this->ns) { for ($a = 0; $a < sizeof($exp); $a++) { if 
(preg_match('/^\|\s+xmlns xmlns=/', $exp[$a])) { - array_splice($exp, $a--, 1); + $exp[$a] = preg_replace('/^\|(\s+)xmlns xmlns=/', "|$1xmlns=", $exp[$a]); } } } @@ -180,7 +181,7 @@ class TestTreeConstructor extends \PHPUnit\Framework\TestCase { $prefix = "null "; } } - $localName = $this->uncoerceName($e->localName); + $localName = self::uncoerceName($e->localName); $this->push("<".$prefix.$localName.">"); $this->depth++; $attr = []; @@ -191,7 +192,7 @@ class TestTreeConstructor extends \PHPUnit\Framework\TestCase { assert((bool) $prefix, new \Exception("Prefix for namespace {$a->namespaceURI} is not defined")); $prefix .= " "; } - $attr[$prefix.$this->uncoerceName($a->name)] = $a->value; + $attr[$prefix.self::uncoerceName($a->name)] = $a->value; } ksort($attr, \SORT_STRING); foreach ($attr as $k => $v) { diff --git a/tests/cases/serializer/README.md b/tests/cases/serializer/README.md new file mode 100644 index 0000000..824bf7e --- /dev/null +++ b/tests/cases/serializer/README.md @@ -0,0 +1,99 @@ +HTML DOM serialization tests +============================ + +The format of these tests is essentially the format of html5lib's tree +construction tests in reverse. There are, however, important differences, +so the format is documented in full here. + +Each file containing tree construction tests consists of any number of +tests separated by two newlines (LF) and a single newline before the end +of the file. For instance: + + [TEST]LF + LF + [TEST]LF + LF + [TEST]LF + +Where [TEST] is the following format: + +Each test begins with a line reading `#document` or `#fragment`; subsequent +lines represent the document or document fragment (respectively) used as +input, until a line is encountered which reads `#output`, `#script-on`, +or `#script-off`. + +Each DOM node in the input is written on its own line beginning with the +characters "| " (a vertical bar followed by a single space); lines which begin +with other characters are a continuation of the previous line. 
Attributes +are treated as distinct nodes and have their own entries. There is no escape +mechanism: all input is literal, including newlines and quotation marks. Two +spaces are used to denote each level of nesting. For example: + + | node + | child node + continuation of child node + | grandchild node + | child node + | attribute node of child + | grandchild node + +The different types of nodes are: + +- Element nodes in the form `<name>` for an element in the HTML namespace, + or `<ns name>` for an element in a foreign namespace. Qualified names are + written as usual e.g. `<ns prefix:name>`, though such elements are not + produced by the parser +- Attribute nodes in the form `id="value"` or e.g. `xml xml:id="value"`, with + a quotation mark immediately followed by a newline marking the end of the + attribute value (in other words, attribute values may contain literal + quotation marks) +- Text nodes in the form `"text data"`; like attributes, only a quotation mark + followed by a newline marks the end of text data +- Comment nodes of the form `<!-- comment data -->`; the space characters are + padding and are not part of the comment data +- Document type nodes in the form `<!DOCTYPE name "public" "system">`, or + `<!DOCTYPE name>` or simply `<!DOCTYPE>` depending on its contents +- Processing instructions in the form `<?target data>`. Processing + instructions are not generated by the HTML parser, but may appear in + documents by other means + +Namespaces are represented by the following short names: + +| Name | URL | +|-------|--------------------------------------| +| xml | http://www.w3.org/XML/1998/namespace | +| xmlns | http://www.w3.org/2000/xmlns/ | +| xlink | http://www.w3.org/1999/xlink | +| math | http://www.w3.org/1998/Math/MathML | +| svg | http://www.w3.org/2000/svg | + +Other namespaces may also appear; these should be interpreted as literal URLs. + +After the input block either `#script-on` or `#script-off` may appear. These +signal that the test should be run with scripting on or off, respectively. If +neither line is present, the test should be run in both modes. 
+ +Finally, `#output` marks the beginning of output. All subsequent text is +literal characters until two consecutive newlines followed by either +`#document` or `#fragment` are seen. + +Below is a complete example: + + #document + | + | + | + | lang="en" + | + | + | style="font-family: "Times New Roman"" + | + | xml xml:id="image" + |
+ | "This is a text node. + It has an embedded newline. It is in fact pretty "busy" and has + multiple newlines. + + And even a blank line." + | diff --git a/tests/cases/serializer/mensbeam01.dat b/tests/cases/serializer/mensbeam01.dat new file mode 100644 index 0000000..c317644 --- /dev/null +++ b/tests/cases/serializer/mensbeam01.dat @@ -0,0 +1,33 @@ +#fragment +| +#output + + +#fragment +| +| test💩test="test" +#output + + +#fragment +| +| "You should not see this text." +#output + + +#fragment +| +| class="test" +#output + + +#fragment +| +#output + + +#fragment +| +| poop💩="soccer" +#output + diff --git a/tests/cases/serializer/mensbeam02.dat b/tests/cases/serializer/mensbeam02.dat new file mode 100644 index 0000000..7760020 --- /dev/null +++ b/tests/cases/serializer/mensbeam02.dat @@ -0,0 +1,34 @@ +#document +| +#output + + +#document +| +| +#output + + +#document +| +| +#output + + +#document +| +| +#output + + +#document +| +| +#output + + +#document +| +| +#output + diff --git a/tests/cases/serializer/wpt01.dat b/tests/cases/serializer/wpt01.dat new file mode 100644 index 0000000..0074d36 --- /dev/null +++ b/tests/cases/serializer/wpt01.dat @@ -0,0 +1,913 @@ +#fragment +| +#output + + +#fragment +| +| +#output + + +#fragment +| +| +| b="c" +#output + + +#fragment +| +| +| b="&" +#output + + +#fragment +| +| +| b=" " +#output + + +#fragment +| +| +| b=""" +#output + + +#fragment +| +| +| b="<" +#output + + +#fragment +| +| +| b=">" +#output + + +#fragment +| +| +| href="javascript:"<>"" +#output + + +#fragment +| +| +| xlink xlink:href="a" +#output + + +#fragment +| +| +| xmlns xmlns:svg="test" +#output + + +#fragment +| +| "a" +#output +a + +#fragment +| +| "&" +#output +& + +#fragment +| +| " " +#output +  + +#fragment +| +| "<" +#output +< + +#fragment +| +| ">" +#output +> + +#fragment +| +| """ +#output +" + +#fragment +| +| + +#fragment +| +| + +#fragment +| + +#fragment +| +| +| "<&>" +#output +<span><xmp><&> + +#fragment +| +| + +#fragment +| +| +| 
"<&>" +#output +<span><noembed><&> + +#fragment +| +| +| "<&>" +#output +<span><noframes><&> + +#fragment +| +|