From 607cd69fd2eb2d9074eac494f1a9a4aa8439660b Mon Sep 17 00:00:00 2001
From: "J. King"
Date: Thu, 20 Apr 2023 19:22:09 -0400
Subject: [PATCH] Start on actual sanitization

---
 lib/AbstractSanitizer.php | 155 ++++++++++++++++++++++++++++++++++++++
 1 file changed, 155 insertions(+)

diff --git a/lib/AbstractSanitizer.php b/lib/AbstractSanitizer.php
index b8b51f2..6a35142 100644
--- a/lib/AbstractSanitizer.php
+++ b/lib/AbstractSanitizer.php
@@ -21,6 +21,12 @@ namespace MensBeam\HTML;
  * @see https://github.com/WICG/sanitizer-api/issues
  */
 abstract class AbstractSanitizer {
+    /** @var int The sanitizer action to keep a node */
+    protected const ACTION_KEEP = 0;
+    /** @var int The sanitizer action to drop a node, which removes it and its children */
+    protected const ACTION_DROP = 1;
+    /** @var int The sanitizer action to block a node, which removes it while (possibly) keeping its children; this only applies to element nodes */
+    protected const ACTION_BLOCK = 2;
     /** @var string The HTML namespace */
     protected const HTML_NAMESPACE = "http://www.w3.org/1999/xhtml";
     /** @var string The MathML namespace */
@@ -609,4 +615,153 @@ abstract class AbstractSanitizer {
         // return the transformed configuration
         return $out;
     }
+
+    /** Sanitizes a document or document fragment according to the loaded configuration
+     *
+     * The returned document fragment contains clones of all valid nodes, leaving the original input untouched
+     *
+     * @param \DOMDocument|\DOMDocumentFragment $input The document or document fragment to sanitize
+     * @throws \InvalidArgumentException If $input is neither a document nor a document fragment
+     */
+    public function sanitize(\DOMNode $input): \DOMDocumentFragment {
+        if (!($input instanceof \DOMDocument || $input instanceof \DOMDocumentFragment)) {
+            throw new \InvalidArgumentException("Argument \$input must be of type \DOMDocument|\DOMDocumentFragment");
+        }
+        # To sanitize a given input of type Document or DocumentFragment
+        #   run these steps:
+        # Let fragment be the result of running the create a document fragment
+        #   algorithm on input.
+        $fragment = $this->createDocumentFragment($input);
+        # Run the sanitize a document fragment algorithm on fragment.
+        $this->sanitizeDocumentFragment($fragment);
+        # Return fragment.
+        return $fragment;
+    }
+
+    /** Creates a copy of the input as a document fragment, which can then be safely manipulated
+     *
+     * A document without a document element yields an empty fragment
+     *
+     * @param \DOMDocument|\DOMDocumentFragment $input The input to clone
+     */
+    protected function createDocumentFragment(\DOMNode $input): \DOMDocumentFragment {
+        assert($input instanceof \DOMDocument || $input instanceof \DOMDocumentFragment, new \Exception("Parameter \$input must be of type \DOMDocument|\DOMDocumentFragment"));
+        # To create a document fragment named fragment from an input of type
+        #   Document or DocumentFragment, run these steps:
+        # Let node be null.
+        $node = null;
+        # Switch based on input’s type:
+        #   If input is of type DocumentFragment, then:
+        #       Set node to input.
+        #   If input is of type Document, then:
+        #       Set node to input’s documentElement.
+        if ($input instanceof \DOMDocumentFragment) {
+            $node = $input;
+        } elseif ($input instanceof \DOMDocument) {
+            $node = $input->documentElement;
+            if ($node === null) {
+                // A document without a document element leaves nothing to
+                //   clone; the result is simply an empty fragment
+                return $input->createDocumentFragment();
+            }
+        }
+        # Let clone be the result of running clone a node on node with the
+        #   clone children flag set.
+        $clone = $node->cloneNode(true);
+        # Let fragment be a new DocumentFragment whose node document is node’s
+        #   node document.
+        $fragment = $node->ownerDocument->createDocumentFragment();
+        # Append the node clone to fragment.
+        // The clone of an empty fragment has nothing to contribute, and
+        //   appending one triggers a warning on some PHP versions
+        if (!($clone instanceof \DOMDocumentFragment) || $clone->hasChildNodes()) {
+            $fragment->appendChild($clone);
+        }
+        # Return fragment.
+        return $fragment;
+    }
+
+    /** Iterates over a document fragment and sanitizes its nodes
+     *
+     * The fragment is modified in place: dropped nodes are removed together with their descendants, while blocked nodes are replaced by their children
+     *
+     * @param \DOMDocumentFragment $fragment The document fragment to sanitize
+     */
+    protected function sanitizeDocumentFragment(\DOMDocumentFragment $fragment): void {
+        # To sanitize a document fragment named fragment with a Sanitizer
+        #   sanitizer run these steps:
+        # Let m be a map that maps nodes to a sanitize action.
+        # Let nodes be a list containing the inclusive descendants of fragment, in tree order.
+        // Basically we won't be doing things this way. Rather than treating
+        //   all of the fragment's nodes as a flat list, we'll walk the
+        //   fragment and decide as we go, until we have visited every node
+        $node = $fragment->firstChild;
+        # For each node in nodes:
+        while ($node) {
+            # Let action be the result of running the sanitize a node algorithm on node with sanitizer.
+            $action = $this->sanitizeNode($node);
+            assert(in_array($action, [self::ACTION_BLOCK, self::ACTION_DROP, self::ACTION_KEEP], true), new \Exception("The result of sanitizing a node must be one of the three actions"));
+            # Set m[node] to action.
+            # For each node in nodes:
+            // Again, we're taking action one node at a time; m[node] is simply $action
+            if ($action === self::ACTION_DROP) {
+                # If m[node] is drop, remove node.
+                // The next node is located before removal (skipping the
+                //   victim's children), as a detached node has no
+                //   sibling or parent from which to continue
+                $victim = $node;
+                $node = $this->nextNode($node, false);
+                $victim->parentNode->removeChild($victim);
+            } elseif ($action === self::ACTION_BLOCK) {
+                # If m[node] is block, create a DocumentFragment fragment,
+                #   append all of node’s children to fragment, and replace
+                #   node within node’s parent with fragment.
+                // Rather than using an intermediate fragment, the children
+                //   are moved directly in front of the blocked node, which
+                //   is then removed; the resulting tree is the same, and a
+                //   childless node needs no special handling
+                $victim = $node;
+                $node = $this->nextNode($node, true);
+                $parent = $victim->parentNode;
+                while ($victim->firstChild) {
+                    $parent->insertBefore($victim->firstChild, $victim);
+                }
+                $parent->removeChild($victim);
+            } else {
+                # If m[node] is keep, do nothing.
+                $node = $this->nextNode($node, true);
+            }
+        }
+    }
+
+    /** Decides which action to take for a single node
+     *
+     * NOTE(review): this is currently an empty stub; since it declares an int return type but returns nothing, calling it raises a TypeError until it is implemented
+     *
+     * @param \DOMNode $node The node to evaluate
+     * @return int Expected to be one of the ACTION_* constants
+     */
+    protected function sanitizeNode(\DOMNode $node): int {
+    }
+
+    /** Finds the next node in tree order after $node, if any
+     *
+     * @param \DOMNode $node The context node
+     * @param bool $considerChildren Whether or not child nodes are valid next nodes
+     * @return \DOMNode|null The next node, or null if there are no more nodes
+     */
+    protected function nextNode(\DOMNode $node, bool $considerChildren): ?\DOMNode {
+        if ($considerChildren && $node->hasChildNodes()) {
+            return $node->firstChild;
+        }
+        // Otherwise the next node is the nearest following sibling of the
+        //   node itself or, failing that, of its closest ancestor
+        while ($node) {
+            if ($node->nextSibling) {
+                return $node->nextSibling;
+            }
+            $node = $node->parentNode;
+        }
+        return null;
+    }
 }