diff --git a/lib/DOM/Document.php b/lib/DOM/Document.php index 4b3c349..f41a6e9 100644 --- a/lib/DOM/Document.php +++ b/lib/DOM/Document.php @@ -96,12 +96,13 @@ class Document extends \DOMDocument { $e->content = $this->createDocumentFragment(); } return $e; - } catch (\DOMException) { + } catch (\DOMException $e) { // The element name is invalid for XML // Replace any offending characters with "UHHHHHH" where H is the // uppercase hexadecimal digits of the character's code point $this->mangledElements = true; - $name = $this->CoerceName($name); + $name = $this->coerceName($name); + return parent::createElement($name, $value); } } @@ -113,13 +114,39 @@ class Document extends \DOMDocument { $e->content = $this->createDocumentFragment(); } return $e; - } catch (\DOMException) { - throw $e; + } catch (\DOMException $e) { // The element name is invalid for XML - // Replace any offending characters with "UHHHHHH" where H is the + // Replace any offending characters with "UHHHHHH" where H are the // uppercase hexadecimal digits of the character's code point $this->mangledElements = true; - $qualifiedName = $this->CoerceName($qualifiedName); + $qualifiedName = $this->coerceName($qualifiedName); + return parent::createElementNS($namespaceURI, $qualifiedName, $value); + } + } + + public function createAttribute($name) { + try { + return parent::createAttribute($name); + } catch (\DOMException $e) { + // The attribute name is invalid for XML + // Replace any offending characters with "UHHHHHH" where H are the + // uppercase hexadecimal digits of the character's code point + $this->mangledAttributes = true; + $name = $this->coerceName($name); + return parent::createAttribute($name); + } + } + + public function createAttributeNS($namespaceURI, $qualifiedName) { + try { + return parent::createAttributeNS($namespaceURI, $qualifiedName); + } catch (\DOMException $e) { + // The attribute name is invalid for XML + // Replace any offending characters with "UHHHHHH" where H are the + // 
uppercase hexadecimal digits of the character's code point + $this->mangledAttributes = true; + $qualifiedName = $this->coerceName($qualifiedName); + return parent::createAttributeNS($namespaceURI, $qualifiedName); } } diff --git a/lib/DOM/Element.php b/lib/DOM/Element.php index 789c4e4..306d384 100644 --- a/lib/DOM/Element.php +++ b/lib/DOM/Element.php @@ -12,6 +12,32 @@ class Element extends \DOMElement { protected const SELF_CLOSING_ELEMENTS = ['area', 'base', 'basefont', 'bgsound', 'br', 'col', 'embed', 'frame', 'hr', 'img', 'input', 'link', 'meta', 'param', 'source', 'track', 'wbr']; + public function setAttribute($name, $value) { + try { + parent::setAttribute($name, $value); + } catch (\DOMException $e) { + // The attribute name is invalid for XML + // Replace any offending characters with "UHHHHHH" where H are the + // uppercase hexadecimal digits of the character's code point + $this->ownerDocument->mangledAttributes = true; + $name = $this->coerceName($name); + parent::setAttribute($name, $value); + } + } + + public function setAttributeNS($namespaceURI, $qualifiedName, $value) { + try { + parent::setAttributeNS($namespaceURI, $qualifiedName, $value); + } catch (\DOMException $e) { + // The attribute name is invalid for XML + // Replace any offending characters with "UHHHHHH" where H are the + // uppercase hexadecimal digits of the character's code point + $this->ownerDocument->mangledAttributes = true; + $qualifiedName = $this->coerceName($qualifiedName); + parent::setAttributeNS($namespaceURI, $qualifiedName, $value); + } + } + public function isMathMLTextIntegrationPoint(): bool { return ( $this->namespaceURI === Parser::MATHML_NAMESPACE && ( diff --git a/lib/DOM/traits/EscapeString.php b/lib/DOM/traits/EscapeString.php index 7fc1a27..d504f36 100644 --- a/lib/DOM/traits/EscapeString.php +++ b/lib/DOM/traits/EscapeString.php @@ -23,12 +23,30 @@ trait EscapeString { return $string; } - protected function CoerceName(string $name): string { + protected 
function coerceName(string $name): string { // This matches the inverse of the production of NameChar in XML 1.0, // with the added exclusion of ":" from allowed characters // See https://www.w3.org/TR/REC-xml/#NT-NameStartChar - preg_match_all('/[^\-\.0-9\x{B7}\x{300}-\x{36F}\x{203F}-\x{2040}A-Za-z_\x{C0}-\x{D6}\x{D8}-\x{F6}\x{F8}-\x{2FF}\x{370}-\x{37D}\x{37F}-\x{1FFF}\x{200C}-\x{200D}\x{2070}-\x{218F}\x{2C00}-\x{2FEF}\x{3001}-\x{D7FF}\x{F900}-\x{FDCF}\x{FDF0}-\x{FFFD}\x{10000}-\x{EFFFF}]/u', $name, $m, \PREG_OFFSET_CAPTURE); - var_export($m); - exit; + preg_match_all('/[^\-\.0-9\x{B7}\x{300}-\x{36F}\x{203F}-\x{2040}A-Za-z_\x{C0}-\x{D6}\x{D8}-\x{F6}\x{F8}-\x{2FF}\x{370}-\x{37D}\x{37F}-\x{1FFF}\x{200C}-\x{200D}\x{2070}-\x{218F}\x{2C00}-\x{2FEF}\x{3001}-\x{D7FF}\x{F900}-\x{FDCF}\x{FDF0}-\x{FFFD}\x{10000}-\x{EFFFF}]/u', $name, $m); + foreach (array_unique($m[0], \SORT_STRING) as $c) { + $esc = "U".str_pad(strtoupper(dechex(\IntlChar::ord($c))), 6, "0", \STR_PAD_LEFT); + $name = str_replace($c, $esc, $name); + } + // Apply stricter rules to the first character + if (preg_match('/^[^A-Za-z_\x{C0}-\x{D6}\x{D8}-\x{F6}\x{F8}-\x{2FF}\x{370}-\x{37D}\x{37F}-\x{1FFF}\x{200C}-\x{200D}\x{2070}-\x{218F}\x{2C00}-\x{2FEF}\x{3001}-\x{D7FF}\x{F900}-\x{FDCF}\x{FDF0}-\x{FFFD}\x{10000}-\x{EFFFF}]/u', $name, $m)) { + $c = $m[0]; + $esc = "U".str_pad(strtoupper(dechex(\IntlChar::ord($c))), 6, "0", \STR_PAD_LEFT); + $name = $esc.substr($name, strlen($c)); + } + return $name; + } + + protected function uncoerceName(string $name): string { + preg_match_all('/U[0-9A-F]{6}/', $name, $m); + foreach (array_unique($m[0], \SORT_STRING) as $o) { + $c = \IntlChar::chr(hexdec(substr($o, 1))); + $name = str_replace($o, $c, $name); + } + return $name; } } diff --git a/lib/TreeBuilder.php b/lib/TreeBuilder.php index 559212a..548bc44 100644 --- a/lib/TreeBuilder.php +++ b/lib/TreeBuilder.php @@ -3,7 +3,7 @@ declare(strict_types=1); namespace dW\HTML5; class TreeBuilder { - use ParseErrorEmitter; + use 
ParseErrorEmitter, EscapeString; public $debugLog = ""; @@ -292,6 +292,12 @@ class TreeBuilder { $this->debugLog .= "EMITTED: ".constant(get_class($token)."::NAME")."\n"; return true; })()); + + // If element name coercion has occurred at some earlier point, + // we must coerce all end tag names to match mangled start tags + if ($token instanceof EndTagToken && $this->DOM->mangledElements) { + $token->name = $this->coerceName($token->name); + } // Loop used for reprocessing. $iterations = 0; while (true) { @@ -373,6 +379,14 @@ class TreeBuilder { return true; })()); + // If attribute name coercion has occurred at some earlier point, + // we must coerce all attributes on html and body start tags in + // case they are relocated to existing elements + if ($token instanceof StartTagToken && $this->DOM->mangledAttributes && in_array($token->name, ["html", "body"])) { + foreach ($token->attributes as $attr) { + $attr->name = $this->coerceName($attr->name); + } + } # 13.2.6.4. The rules for parsing tokens in HTML content # 13.2.6.4.1. The "initial" insertion mode if ($insertionMode === self::INITIAL_MODE) { @@ -4198,7 +4212,18 @@ class TreeBuilder { $element = $document->createElementNS($namespace, $localName); # Append each attribute in the given token to element. 
foreach ($token->attributes as $attr) { - $element->setAttributeNS(null, $attr->name, $attr->value); + $ns = null; + if ($namespace) { + // Determine the namespace URI for the prefix, if any + if (strpos($attr->name, "xml:") === 0) { + $ns = Parser::XML_NAMESPACE; + } elseif (strpos($attr->name, "xmlns:") === 0) { + $ns = Parser::XMLNS_NAMESPACE; + } elseif (strpos($attr->name, "xlink:") === 0) { + $ns = Parser::XLINK_NAMESPACE; + } + } + $element->setAttributeNS($ns, $attr->name, $attr->value); } # If element has an xmlns attribute in the XMLNS namespace whose value # is not exactly the same as the element's namespace, that is a diff --git a/tests/cases/TestTreeConstructor.php b/tests/cases/TestTreeConstructor.php index 51c687c..00550ef 100644 --- a/tests/cases/TestTreeConstructor.php +++ b/tests/cases/TestTreeConstructor.php @@ -22,6 +22,8 @@ use dW\HTML5\TreeBuilder; * @covers \dW\HTML5\Stack */ class TestTreeConstructor extends \PHPUnit\Framework\TestCase { + use \dW\HTML5\EscapeString; + protected $out; protected $depth; @@ -102,19 +104,6 @@ class TestTreeConstructor extends \PHPUnit\Framework\TestCase { } } } - if (in_array($data, [ - //'', - //'', - //'', - //'', - //'bar', - //'', - //'', - //'', - //'bar', - ])) { - $skip = 'Requires implementation of the "Coercing an HTML DOM into an infoset" specification section'; - } return [$exp, $patched, $skip]; } @@ -167,13 +156,20 @@ class TestTreeConstructor extends \PHPUnit\Framework\TestCase { } else { $prefix = ""; } - $this->push("<".$prefix.$e->localName.">"); + $localName = $this->uncoerceName($e->localName); + $this->push("<".$prefix.$localName.">"); $this->depth++; $attr = []; foreach ($e->attributes as $a) { - $attr[$a->name] = $a->value; + $prefix = ""; + if ($a->namespaceURI) { + $prefix = Parser::NAMESPACE_MAP[$a->namespaceURI]; + assert((bool) $prefix, new \Exception("Prefix for namespace {$a->namespaceURI} is not defined")); + $prefix .= " "; + } + $attr[$prefix.$this->uncoerceName($a->name)] = 
$a->value; } - ksort($attr); + ksort($attr, \SORT_STRING); foreach ($attr as $k => $v) { $this->push($k.'="'.$v.'"'); }