diff --git a/RoboFile.php b/RoboFile.php
index 56cfc82..29c6c1f 100644
--- a/RoboFile.php
+++ b/RoboFile.php
@@ -160,28 +160,57 @@ class RoboFile extends \Robo\Tasks {
     public function constants(): Result {
         $c = $this->collectionBuilder()->addCode(function() {
             $elems = [];
+            $attrs = [];
             // retrieve the single-page HTML specification (this is around 15MB in size)
             $spec = file_get_contents("https://html.spec.whatwg.org/");
             // parse it (this may take several seconds
             if ($spec) {
                 $p = new DOMParser;
                 $document = $p->parseFromString($spec, "text/html;charset=utf-8");
-                // pick out element definitions from the specification
+                // pick out element and attribute definitions from the specification
                 foreach ($document->getElementsByTagName("dfn") as $el) {
-                    if ($el->getAttribute("data-dfn-type") === "element") {
+                    $type = $el->getAttribute("data-dfn-type");
+                    if ($type === "element") {
                         $elems[] = trim($el->textContent);
+                    } elseif ($type === "element-attr") {
+                        $name = trim($el->textContent);
+                        if (preg_match('/\s/', $name)) {
+                            // skip the definition if the name is not valid
+                            continue;
+                        }
+                        // a name may be seen more than once; accumulate its contexts across definitions
+                        $attrs[$name] = $attrs[$name] ?? [];
+                        $context = $el->getAttribute("data-dfn-for") ?? "";
+                        if (strlen($context)) {
+                            $context = explode(",", $context);
+                            $attrs[$name] = array_merge($attrs[$name], $context);
+                        }
                     }
                 }
             }
             // sort and filter the results for unqiueness
             sort($elems);
+            ksort($attrs);
             $elems = array_unique($elems);
-            // output the list of elements as a PHP array
+            // de-duplicate and sort the contexts collected for each attribute
+            $attrs = array_map(function($v) {
+                $v = array_unique($v);
+                sort($v);
+                return $v;
+            }, $attrs);
+            // format the lists as PHP arrays
             $elems = array_map(function($e) {
                 return "'$e' => true";
             }, $elems);
             $elems = implode(", ", $elems);
+            $attrList = [];
+            foreach ($attrs as $name => $context) {
+                $attrList[] = "'$name' => [\"".implode("\", \"", $context)."\"]";
+            }
+            $attrs = implode(", ", $attrList);
             echo "protected const KNOWN_ELEMENTS_HTML = [".$elems."];\n";
+            echo "protected const KNOWN_ATTRIBUTES_HTML = [".$attrs."];\n";
+
         });
         return $c->run();
     }
diff --git a/lib/AbstractSanitizer.php b/lib/AbstractSanitizer.php
index 6a35142..391c8c3 100644
--- a/lib/AbstractSanitizer.php
+++ b/lib/AbstractSanitizer.php
@@ -43,6 +43,8 @@ abstract class AbstractSanitizer {
     protected const NULL_NAMESPACE = "";
     /** @var array The set of known HTML elements, used to determine what the `allowUnknownMarkup` setting applies to */
     protected const KNOWN_ELEMENTS_HTML = ['a' => true, 'abbr' => true, 'acronym' => true, 'address' => true, 'applet' => true, 'area' => true, 'article' => true, 'aside' => true, 'audio' => true, 'b' => true, 'base' => true, 'basefont' => true, 'bdi' => true, 'bdo' => true, 'bgsound' => true, 'big' => true, 'blink' => true, 'blockquote' => true, 'body' => true, 'br' => true, 'button' => true, 'canvas' => true, 'caption' => true, 'center' => true, 'cite' => true, 'code' => true, 'col' => true, 'colgroup' => true, 'data' => true, 'datalist' => true, 'dd' => true, 'del' => true, 'details' => true, 'dfn' => true, 'dialog' => true, 'dir' => true, 'div' => true, 'dl' => true, 'dt' => true, 'em' => true, 'embed' => true, 'fieldset' => true, 'figcaption' => true, 'figure' => true, 'font' => true, 'footer' 
=> true, 'form' => true, 'frame' => true, 'frameset' => true, 'h1' => true, 'h2' => true, 'h3' => true, 'h4' => true, 'h5' => true, 'h6' => true, 'head' => true, 'header' => true, 'hgroup' => true, 'hr' => true, 'html' => true, 'i' => true, 'iframe' => true, 'img' => true, 'input' => true, 'ins' => true, 'isindex' => true, 'kbd' => true, 'keygen' => true, 'label' => true, 'legend' => true, 'li' => true, 'link' => true, 'listing' => true, 'main' => true, 'map' => true, 'mark' => true, 'marquee' => true, 'menu' => true, 'menuitem' => true, 'meta' => true, 'meter' => true, 'multicol' => true, 'nav' => true, 'nextid' => true, 'nobr' => true, 'noembed' => true, 'noframes' => true, 'noscript' => true, 'object' => true, 'ol' => true, 'optgroup' => true, 'option' => true, 'output' => true, 'p' => true, 'param' => true, 'picture' => true, 'plaintext' => true, 'pre' => true, 'progress' => true, 'q' => true, 'rb' => true, 'rp' => true, 'rt' => true, 'rtc' => true, 'ruby' => true, 's' => true, 'samp' => true, 'script' => true, 'search' => true, 'section' => true, 'select' => true, 'slot' => true, 'small' => true, 'source' => true, 'spacer' => true, 'span' => true, 'strike' => true, 'strong' => true, 'style' => true, 'sub' => true, 'summary' => true, 'sup' => true, 'table' => true, 'tbody' => true, 'td' => true, 'template' => true, 'textarea' => true, 'tfoot' => true, 'th' => true, 'thead' => true, 'time' => true, 'title' => true, 'tr' => true, 'track' => true, 'tt' => true, 'u' => true, 'ul' => true, 'var' => true, 'video' => true, 'wbr' => true, 'xmp' => true];
+    protected const KNOWN_ATTRIBUTES_HTML = ['abbr' => ["td", "th"], 'accept' => ["form", "input"], 'accept-charset' => ["form"], 'accesskey' => ["*"], 'action' => ["button", "form"], 'align' => ["caption", "col", "div", "embed", "h1", "h2", "h3", "h4", "h5", "h6", "hr", "iframe", "img", "input", "legend", "object", "p", "table", "tbody", "td", "th", "tr"], 'alink' => ["body"], 'allow' => ["iframe"], 'allowfullscreen' => ["iframe"], 'allowtransparency' => ["iframe"], 'alt' => ["area", "img", "input"], 'archive' => ["object"], 'as' => ["link"], 'async' => ["script"], 'autocapitalize' => ["*"], 'autocomplete' => ["button", "fieldset", "form", "input", "object", "output", "select", "textarea"], 'autofocus' => ["*"], 'autoplay' => ["audio", "video"], 'axis' => ["td", "th"], 'background' => [""], 'behavior' => ["marquee"], 'bgcolor' => ["body", "table", "td", "th", "tr"], 'blocking' => ["link", "script", "style"], 'border' => ["img", "input", "object", "table"], 'bordercolor' => ["table"], 'bottommargin' => ["body"], 'cellpadding' => ["table"], 'cellspacing' => ["table"], 'char' => ["col", "tbody", "td", "th", "tr"], 'charoff' => ["col", "tbody", "td", "th", "tr"], 'charset' => ["a", "link", "meta", "script"], 'checked' => ["input"], 'cite' => ["blockquote", "del", "ins", "q"], 'class' => ["*"], 'classid' => ["object"], 'clear' => ["br"], 'code' => ["object"], 'codebase' => ["object"], 'codetype' => ["object"], 'color' => ["hr", "link"], 'cols' => ["textarea"], 'colspan' => ["td", "th"], 'compact' => ["dl", "menu", "ol", "ul"], 'content' => ["meta"], 'contenteditable' => ["*"], 'contextmenu' => [""], 'controls' => ["audio", "video"], 'coords' => ["a", "area"], 'crossorigin' => ["audio", "img", "link", "script", "video"], 'data' => ["object"], 'datafld' => [""], 'dataformatas' => [""], 'datapagesize' => ["table"], 'datasrc' => [""], 'datetime' => ["del", "ins", "time"], 'declare' => ["object"], 'decoding' => ["img"], 'default' => ["track"], 'defer' => ["script"], 'dir' => ["*"], 'direction' => ["marquee"], 'dirname' => ["button", "fieldset", "input", "object", "output", "select", "textarea"], 'disabled' => ["button", "fieldset", "input", "link", "object", "optgroup", "option", "output", "select", "textarea"], 'download' => ["a", "area"], 'draggable' => ["*"], 'dropzone' => [""], 'enctype' => ["button", "form"], 'enterkeyhint' => ["*"], 'event' => ["script"], 'fetchpriority' => ["img", "link", "script"], 'for' => ["label", "output", "script"], 'form' => ["button", "fieldset", "input", "object", "output", "select", "textarea"], 'formaction' => ["button", "form"], 'formenctype' => ["button", "form"], 'formmethod' => ["button", "form"], 'formnovalidate' => ["button", "form"], 'formtarget' => ["button", "form"], 'frame' => ["table"], 'frameborder' => ["iframe"], 'framespacing' => ["iframe"], 'headers' => ["td", "th"], 'height' => ["canvas", "embed", "iframe", "img", "object", "source", "table", "tbody", "td", "th", "tr", "video"], 'hidden' => ["*"], 'high' => ["meter"], 'href' => ["a", "area", "base", "link"], 'hreflang' => ["a", "area", "link"], 'hspace' => ["embed", "iframe", "img", "input", "object"], 'http-equiv' => ["meta"], 'id' => ["*"], 'imagesizes' => ["link"], 'imagesrcset' => ["link"], 'inputmode' => ["*"], 'integrity' => ["link", "script"], 'is' => ["*"], 'ismap' => ["img", "input"], 'itemid' => ["*"], 'itemprop' => ["*"], 'itemref' => ["*"], 'itemscope' => ["*"], 'itemtype' => ["*"], 'kind' => ["track"], 'label' => ["menu", "optgroup", "option", "track"], 'lang' => ["*"], 'language' => ["script"], 'leftmargin' => ["body"], 'link' => ["body"], 'list' => ["input"], 'loading' => ["iframe", "img"], 'longdesc' => ["iframe", "img"], 'loop' => ["audio", "marquee", "video"], 'low' => ["meter"], 'lowsrc' => ["img"], 'manifest' => ["html"], 'marginheight' => ["body", "iframe"], 'marginwidth' => ["body", "iframe"], 'max' => ["input", "meter", "progress"], 'maxlength' => ["input", "textarea"], 'media' => ["link", "meta", "source", "style"], 'method' => ["button", "form"], 'methods' => ["a", "link"], 'min' => ["input", "meter"], 'minlength' => ["input", "textarea"], 'multiple' => ["input", "select"], 'muted' => ["audio", "video"], 'name' => ["a", "button", "embed", "fieldset", "form", "iframe", "img", "input", "map", "meta", "object", "option", "output", "select", "slot", "textarea"], 'nohref' => ["area"], 'nomodule' => ["script"], 'nonce' => ["*"], 'noshade' => ["hr"], 'novalidate' => ["button", "form"], 'nowrap' => ["td", "th"], 'open' => ["details", "dialog"], 'optimum' => ["meter"], 'pattern' => ["input"], 'ping' => ["a", "area"], 'placeholder' => ["input", "textarea"], 'playsinline' => ["video"], 'popover' => ["*"], 'popovertarget' => ["*"], 'popovertargetaction' => ["*"], 'poster' => ["video"], 'preload' => ["audio", "video"], 'profile' => ["head"], 'readonly' => ["form-associated custom elements", "input", "textarea"], 'referrerpolicy' => ["a", "area", "iframe", "img", "link", "script"], 'rel' => ["a", "area", "form", "link"], 'required' => ["input", "select", "textarea"], 'rev' => ["a", "link"], 'reversed' => ["ol"], 'rightmargin' => ["body"], 'role' => [""], 'rows' => ["textarea"], 'rowspan' => ["td", "th"], 'rules' => ["table"], 'sandbox' => ["iframe"], 'scheme' => ["meta"], 'scope' => ["td", "th"], 'scrolling' => ["iframe"], 'selected' => ["option"], 'shape' => ["a", "area"], 'size' => ["hr", "input", "select"], 'sizes' => ["img", "link", "source"], 'slot' => ["*"], 'span' => ["col", "colgroup"], 'spellcheck' => ["*"], 'src' => ["audio", "embed", "iframe", "img", "input", "script", "source", "track", "video"], 'srcdoc' => ["iframe"], 'srclang' => ["track"], 'srcset' => ["img", "source"], 'standby' => ["object"], 'start' => ["ol"], 'step' => ["input"], 'style' => ["*"], 'summary' => ["table"], 'tabindex' => ["*"], 'target' => ["a", "area", "base", "button", "form", "link"], 'text' => ["body"], 'title' => ["abbr", "dfn", "*", "input", "link", "style"], 'topmargin' => ["body"], 'translate' => ["*"], 'truespeed' => ["marquee"], 'type' => ["a", "area", "button", "embed", "input", "li", "link", "menu", "object", "ol", "script", "source", "style", "ul"], 'typemustmatch' => ["object"], 'urn' => ["a", "link"], 'usemap' => ["img", "input", "object"], 'valign' => ["col", "tbody", "td", "th", "tr"], 'value' => ["button", "data", "input", "li", "meter", "option", "progress"], 'version' 
=> ["html"], 'vlink' => ["body"], 'vspace' => ["embed", "iframe", "img", "input", "object"], 'width' => ["canvas", "col", "embed", "hr", "iframe", "img", "object", "pre", "source", "table", "td", "th", "video"], 'wrap' => ["textarea"]];
+
     /** @var array The default configuration structure */
     protected const DEFAULT_CONF = [
         'allowElements' => [
@@ -716,7 +718,85 @@ abstract class AbstractSanitizer {
         }
     }
 
+    /** Examines a node and returns one of the sanitization actions
+     *
+     * Attributes may be removed as part of this function
+     *
+     * @param \DOMNode $node The node to examine
+     * @return int self::ACTION_KEEP, self::ACTION_DROP, or whatever sanitizeElement() returns for an element
+     */
     protected function sanitizeNode(\DOMNode $node): int {
+        # To sanitize a node named node with sanitizer run these steps:
+        # Assert: node is not a Document or DocumentFragment or Attr
+        # or DocumentType node.
+        // instanceof is used so that subclasses of the excluded node types are rejected as well
+        assert(!($node instanceof \DOMDocument || $node instanceof \DOMDocumentFragment || $node instanceof \DOMAttr || $node instanceof \DOMDocumentType), new \Exception("Invalid node Type ".get_class($node)));
+        if ($node instanceof \DOMElement) {
+            # If node is an element node:
+            # Let element be node.
+            $element = $node;
+            # For each attr in element’s attribute list:
+            // iterate over a snapshot: the attribute map is live, so removing
+            // attributes while walking it directly would skip entries
+            foreach (iterator_to_array($element->attributes, false) as $attr) {
+                # Let attr action be the result of running the sanitize action
+                # for an attribute algorithm on attr and element.
+                $attrAction = $this->sanitizeAttribute($attr);
+                # If attr action is different from keep, remove an attribute
+                # supplying attr.
+                if ($attrAction !== self::ACTION_KEEP) {
+                    $element->removeAttributeNode($attr);
+                }
+            }
+            # Run the steps to handle funky elements on element.
+            $this->handleFunkyElement($element);
+            # Let action be the result of running the sanitize action for an
+            # element on element.
+            # Return action.
+            return $this->sanitizeElement($element);
+        } elseif ($node instanceof \DOMComment) {
+            # If node is a Comment node:
+            # Let config be sanitizer’s configuration dictionary, or the default configuration if no configuration dictionary was given.
+            # If config’s allow comments option exists and |config|[allowComments] is true: Return keep.
+            if ($this->config['allowComments'] ?? self::DEFAULT_CONF['allowComments']) {
+                return self::ACTION_KEEP;
+            }
+            # Return drop.
+            return self::ACTION_DROP;
+        } elseif ($node instanceof \DOMText) {
+            # If node is a Text node: Return keep.
+            return self::ACTION_KEEP;
+        } else {
+            # Assert: node is a ProcessingInstruction
+            # Return drop.
+            //DEVIATION: we allow processing instructions by configuration just like comments
+            if ($this->config['allowProcessingInstructions'] ?? self::DEFAULT_CONF['allowProcessingInstructions']) {
+                return self::ACTION_KEEP;
+            }
+            return self::ACTION_DROP;
+        }
+    }
+
+    /** Determines the sanitization action for an attribute
+     *
+     * NOTE(review): only the specification steps are recorded here so far;
+     * with no return statement, any call fails with a TypeError because of
+     * the declared int return type, so the steps still need implementing
+     *
+     * @param \DOMAttr $attr The attribute to examine
+     */
+    protected function sanitizeAttribute(\DOMAttr $attr): int {
+        # To determine the sanitize action for an attribute given a Sanitizer
+        # configuration dictionary config, run these steps:
+        # Let kind be attribute’s attribute kind.
+        # If kind is unknown and if config["allowUnknownMarkup"] does not exist or it config["allowUnknownMarkup"] is false: Return drop.
+        # If kind is regular and attribute’s local name does not match any name in the baseline attribute allow list: Return drop.
+        # If attribute matches any attribute match list in config’s attribute drop list: Return drop.
+        # If attribute allow list exists in config:
+        # Then let allow list be |config|["allowAttributes"].
+        # Otherwise: Let allow list be the default configuration's attribute allow list.
+        # If attribute does not match any attribute match list in allow list: Return drop.
+        # Return keep.
     }
 
     /** Finds the next node in tree order after $node, if any
@@ -729,9 +809,6 @@ abstract class AbstractSanitizer {
             return $node->firstChild;
         }
         $next = $node->nextSibling;
-        if ($next) {
-            return $next;
-        }
         while (!$next) {
             $node = $node->parentNode;
             if (!$node) {