diff --git a/RoboFile.php b/RoboFile.php
index c44ad91..341297b 100644
--- a/RoboFile.php
+++ b/RoboFile.php
@@ -6,6 +6,7 @@ const BASE = __DIR__.\DIRECTORY_SEPARATOR;
 const BASE_TEST = BASE."tests".\DIRECTORY_SEPARATOR;
 define("IS_WIN", defined("PHP_WINDOWS_VERSION_MAJOR"));
 define("IS_MAC", php_uname("s") === "Darwin");
+error_reporting(0);
 
 function norm(string $path): string {
     $out = realpath($path);
diff --git a/lib/DOM/Document.php b/lib/DOM/Document.php
index c6ebd4b..4b3c349 100644
--- a/lib/DOM/Document.php
+++ b/lib/DOM/Document.php
@@ -3,7 +3,7 @@ declare(strict_types=1);
 namespace dW\HTML5;
 
 class Document extends \DOMDocument {
-    use Descendant, Serialize;
+    use Descendant, Serialize, EscapeString;
 
     // Quirks mode constants
     public const NO_QUIRKS_MODE = 0;
@@ -11,6 +11,8 @@ class Document extends \DOMDocument {
     public const LIMITED_QUIRKS_MODE = 2;
 
     public $quirksMode = self::NO_QUIRKS_MODE;
+    public $mangledElements = false;
+    public $mangledAttributes = false;
 
     // An array of all template elements created in the document
     // This exists because values of properties on derived DOM classes
@@ -87,21 +89,41 @@ class Document extends \DOMDocument {
     }
 
     public function createElement($name, $value = "") {
-        $e = parent::createElement($name, $value);
-        if ($name === "template") {
-            $this->templateElements[] = $e;
-            $e->content = $this->createDocumentFragment();
+        try {
+            $e = parent::createElement($name, $value);
+            if ($name === "template") {
+                $this->templateElements[] = $e;
+                $e->content = $this->createDocumentFragment();
+            }
+            return $e;
+        } catch (\DOMException) {
+            // The element name is invalid for XML
+            // Replace any offending characters with "UHHHHHH" where H is the
+            // uppercase hexadecimal digits of the character's code point
+            $this->mangledElements = true;
+            $name = $this->CoerceName($name);
+            // Retry with the coerced name; a second failure reaches the caller
+            return parent::createElement($name, $value);
         }
-        return $e;
     }
 
     public function createElementNS($namespaceURI, $qualifiedName, $value = "") {
-        $e = parent::createElementNS($namespaceURI, $qualifiedName, $value);
-        if ($qualifiedName === "template" && $namespaceURI === null) {
-            $this->templateElements[] = $e;
-            $e->content = $this->createDocumentFragment();
+        try {
+            $e = parent::createElementNS($namespaceURI, $qualifiedName, $value);
+            if ($qualifiedName === "template" && $namespaceURI === null) {
+                $this->templateElements[] = $e;
+                $e->content = $this->createDocumentFragment();
+            }
+            return $e;
+        } catch (\DOMException) {
+            // The element name is invalid for XML
+            // Replace any offending characters with "UHHHHHH" where H is the
+            // uppercase hexadecimal digits of the character's code point
+            $this->mangledElements = true;
+            $qualifiedName = $this->CoerceName($qualifiedName);
+            // Retry with the coerced name; a second failure reaches the caller
+            return parent::createElementNS($namespaceURI, $qualifiedName, $value);
         }
-        return $e;
     }
 
     public function __toString() {
diff --git a/lib/DOM/traits/EscapeString.php b/lib/DOM/traits/EscapeString.php
index 898ee06..7fc1a27 100644
--- a/lib/DOM/traits/EscapeString.php
+++ b/lib/DOM/traits/EscapeString.php
@@ -22,4 +22,19 @@ trait EscapeString {
 
         return $string;
     }
-}
\ No newline at end of file
+
+    protected function CoerceName(string $name): string {
+        // This matches the inverse of the production of NameChar in XML 1.0,
+        // with the added exclusion of ":" from allowed characters
+        // See https://www.w3.org/TR/REC-xml/#NT-NameStartChar
+        $invalid = '/[^\-\.0-9\x{B7}\x{300}-\x{36F}\x{203F}-\x{2040}A-Za-z_\x{C0}-\x{D6}\x{D8}-\x{F6}\x{F8}-\x{2FF}\x{370}-\x{37D}\x{37F}-\x{1FFF}\x{200C}-\x{200D}\x{2070}-\x{218F}\x{2C00}-\x{2FEF}\x{3001}-\x{D7FF}\x{F900}-\x{FDCF}\x{FDF0}-\x{FFFD}\x{10000}-\x{EFFFF}]/u';
+        // Each offending character becomes "U" plus its code point as six
+        // uppercase hexadecimal digits, e.g. "=" becomes "U00003D"
+        $coerced = preg_replace_callback($invalid, static function(array $match): string {
+            return sprintf("U%06X", mb_ord($match[0], "UTF-8"));
+        }, $name);
+        // A null result signals a PCRE failure (e.g. malformed UTF-8); hand
+        // the name back untouched so the caller's retry reports the error
+        return $coerced ?? $name;
+    }
+}
diff --git a/lib/TreeBuilder.php b/lib/TreeBuilder.php
index cbfbde3..559212a 100644
--- a/lib/TreeBuilder.php
+++ b/lib/TreeBuilder.php
@@ -3347,6 +3347,7 @@ class TreeBuilder {
             $formattingElementIndex = $this->activeFormattingElementsList->findToMarker($subject);
             if ($formattingElementIndex > -1) {
                 $formattingElement = $this->activeFormattingElementsList[$formattingElementIndex]['element'];
+                $formattingToken = $this->activeFormattingElementsList[$formattingElementIndex]['token'];
             } else {
                 $formattingElement = null;
             }
@@ -3465,8 +3466,9 @@ class TreeBuilder {
                 $element = $this->createElementForToken($token, null, $commonAncestor);
                 $this->activeFormattingElementsList[$nodeListPos] = ['token' => $nodeToken, 'element' => $element];
                 $this->stack[$nodeIndex] = $element;
+                $node = $element;
                 # If last node is furthest block, then move the aforementioned
-                # to be immediately after the new node in the list of
+                # bookmark to be immediately after the new node in the list of
                 # active formatting elements.
                 if ($lastNode->isSameNode($furthestBlock)) {
                     $bookmark = $nodeListPos + 1;
@@ -3494,7 +3496,6 @@ class TreeBuilder {
             # Create an element for the token for which formatting element was
             # created, in the HTML namespace, with furthest block as the
             # intended parent.
-            $formattingToken = $this->activeFormattingElementsList[$formattingElementIndex]['token'];
             $element = $this->createElementForToken($formattingToken, null, $furthestBlock);
             # Take all of the child nodes of furthest block and append them to
             # the element created in the last step.
@@ -3511,9 +3512,9 @@ class TreeBuilder {
             # Remove formatting element from the stack of open elements, and
             # insert the new element into the stack of open elements
             # immediately below the position of furthest block in that stack.
-            $this->stack->insert($element, $this->stack->findSame($furthestBlock));
             assert($stackIndex > 0, new \Exception("Attempting to delete root element from stack"));
-            unset($this->stack[$this->stack->findSame($formattingElement)]);
+            $this->stack->removeSame($formattingElement);
+            $this->stack->insert($element, $this->stack->findSame($furthestBlock) + 1);
             # Jump back to the step labeled outer loop.
             goto OuterLoop;
         }