<?php
/** @license MIT
* Copyright 2023 J. King
* See LICENSE and AUTHORS files for details */
declare(strict_types=1);
namespace MensBeam\HTML;
/** An implementation of the W3C HTML Sanitizer API.
*
* The class implements the following methods:
*
* - constructor
* - sanitize
* - sanitizeFor
* - getConfiguration
* - getDefaultConfiguration
*
* @see https://wicg.github.io/sanitizer-api/
* @see https://github.com/WICG/sanitizer-api/issues
*/
abstract class AbstractSanitizer {
/** @var int The sanitizer action to keep a node: the node is left in the tree untouched */
protected const ACTION_KEEP = 0;
/** @var int The sanitizer action to drop a node, which removes it and its children */
protected const ACTION_DROP = 1;
/** @var int The sanitizer action to block a node, which removes it while (possibly) keeping its children; this only applies to element nodes */
protected const ACTION_BLOCK = 2;
/** @var int The "unknown" kind for elements and attributes, i.e. markup which is not listed in the sets of known elements and attributes */
protected const KIND_UNKNOWN = 0;
/** @var int The "regular" kind for elements and attributes, i.e. markup which is listed in the sets of known elements and attributes */
protected const KIND_REGULAR = 1;
/** @var string The HTML namespace; element names given as bare strings in a configuration are assumed to be in this namespace */
protected const HTML_NAMESPACE = "http://www.w3.org/1999/xhtml";
/** @var string The MathML namespace */
protected const MATHML_NAMESPACE = "http://www.w3.org/1998/Math/MathML";
/** @var string The SVG namespace */
protected const SVG_NAMESPACE = "http://www.w3.org/2000/svg";
/** @var string The XLink namespace */
protected const XLINK_NAMESPACE = "http://www.w3.org/1999/xlink";
/** @var string The XML namespace */
protected const XML_NAMESPACE = "http://www.w3.org/XML/1998/namespace";
/** @var string The XMLNS namespace */
protected const XMLNS_NAMESPACE = "http://www.w3.org/2000/xmlns/";
/** @var string The null namespace; we use the empty string because it is explicitly not a valid namespace name (and so can serve as an array key without colliding with a real namespace); see https://www.w3.org/TR/REC-xml-names/#iri-use */
protected const NULL_NAMESPACE = "";
/** @var array The set of known elements, used to determine what the `allowUnknownMarkup` setting applies to */
protected const KNOWN_ELEMENTS = [self::SVG_NAMESPACE => ["a", "animate", "animateMotion", "animateTransform", "audio", "canvas", "circle", "clipPath", "defs", "desc", "discard", "ellipse", "feBlend", "feColorMatrix", "feComponentTransfer", "feComposite", "feConvolveMatrix", "feDiffuseLighting", "feDisplacementMap", "feDistantLight", "feDropShadow", "feFlood", "feFuncA", "feFuncB", "feFuncG", "feFuncR", "feGaussianBlur", "feImage", "feMerge", "feMergeNode", "feMorphology", "feOffset", "fePointLight", "feSpecularLighting", "feSpotLight", "feTile", "feTurbulence", "filter", "foreignObject", "g", "iframe", "image", "line", "linearGradient", "marker", "mask", "metadata", "mpath", "path", "pattern", "polygon", "polyline", "radialGradient", "rect", "script", "set", "stop", "style", "svg", "switch", "symbol", "text", "textPath", "title", "tspan", "unknown", "use", "video", "view"], self::HTML_NAMESPACE => ["a", "abbr", "acronym", "address", "applet", "area", "article", "aside", "audio", "b", "base", "basefont", "bdi", "bdo", "bgsound", "big", "blink", "blockquote", "body", "br", "button", "canvas", "caption", "center", "cite", "code", "col", "colgroup", "data", "datalist", "dd", "del", "details", "dfn", "dialog", "dir", "div", "dl", "dt", "em", "embed", "fieldset", "figcaption", "figure", "font", "footer", "form", "frame", "frameset", "h1", "h2", "h3", "h4", "h5", "h6", "head", "header", "hgroup", "hr", "html", "i", "iframe", "img", "input", "ins", "isindex", "kbd", "keygen", "label", "legend", "li", "link", "listing", "main", "map", "mark", "marquee", "menu", "menuitem", "meta", "meter", "multicol", "nav", "nextid", "nobr", "noembed", "noframes", "noscript", "object", "ol", "optgroup", "option", "output", "p", "param", "picture", "plaintext", "pre", "progress", "q", "rb", "rp", "rt", "rtc", "ruby", "s", "samp", "script", "search", "section", "select", "slot", "small", "source", "spacer", "span", "strike", "strong", "style", "sub", "summary", "sup", "table", "tbody", 
"td", "template", "textarea", "tfoot", "th", "thead", "time", "title", "tr", "track", "tt", "u", "ul", "var", "video", "wbr", "xmp"]];
/** @var array The set of known attributes, used to determine what the `allowUnknownMarkup` setting applies to */
protected const KNOWN_ATTRIBUTES = [self::NULL_NAMESPACE => ['abbr' => [self::HTML_NAMESPACE => ["td", "th"]], 'accept' => [self::HTML_NAMESPACE => ["form", "input"]], 'accept-charset' => [self::HTML_NAMESPACE => ["form"]], 'accesskey' => [self::HTML_NAMESPACE => ["*"]], 'accumulate' => [self::SVG_NAMESPACE => ["animate", "animateMotion", "animateTransform"]], 'action' => [self::HTML_NAMESPACE => ["button", "form"]], 'additive' => [self::SVG_NAMESPACE => ["animate", "animateMotion", "animateTransform"]], 'align' => [self::HTML_NAMESPACE => ["caption", "col", "div", "embed", "h1", "h2", "h3", "h4", "h5", "h6", "hr", "iframe", "img", "input", "legend", "object", "p", "table", "tbody", "td", "th", "tr"]], 'alignment-baseline' => [self::SVG_NAMESPACE => ["*"]], 'alink' => [self::HTML_NAMESPACE => ["body"]], 'allow' => [self::HTML_NAMESPACE => ["iframe"]], 'allowfullscreen' => [self::HTML_NAMESPACE => ["iframe"]], 'allowtransparency' => [self::HTML_NAMESPACE => ["iframe"]], 'alt' => [self::HTML_NAMESPACE => ["area", "img", "input"]], 'amplitude' => [self::SVG_NAMESPACE => ["feFuncA", "feFuncB", "feFuncG", "feFuncR"]], 'archive' => [self::HTML_NAMESPACE => ["object"]], 'aria-activedescendant' => [self::HTML_NAMESPACE => ["*"], self::SVG_NAMESPACE => ["a", "audio", "canvas", "circle", "discard", "ellipse", "foreignObject", "g", "iframe", "image", "line", "path", "polygon", "polyline", "rect", "svg", "switch", "symbol", "text", "textPath", "tspan", "unknown", "use", "video", "view"]], 'aria-atomic' => [self::HTML_NAMESPACE => ["*"], self::SVG_NAMESPACE => ["a", "audio", "canvas", "circle", "discard", "ellipse", "foreignObject", "g", "iframe", "image", "line", "path", "polygon", "polyline", "rect", "svg", "switch", "symbol", "text", "textPath", "tspan", "unknown", "use", "video", "view"]], 'aria-autocomplete' => [self::HTML_NAMESPACE => ["*"], self::SVG_NAMESPACE => ["a", "audio", "canvas", "circle", "discard", "ellipse", "foreignObject", "g", "iframe", "image", "line", 
"path", "polygon", "polyline", "rect", "svg", "switch", "symbol", "text", "textPath", "tspan", "unknown", "use", "video", "view"]], 'aria-busy' => [self::HTML_NAMESPACE => ["*"], self::SVG_NAMESPACE => ["a", "audio", "canvas", "circle", "discard", "ellipse", "foreignObject", "g", "iframe", "image", "line", "path", "polygon", "polyline", "rect", "svg", "switch", "symbol", "text", "textPath", "tspan", "unknown", "use", "video", "view"]], 'aria-checked' => [self::HTML_NAMESPACE => ["*"], self::SVG_NAMESPACE => ["a", "audio", "canvas", "circle", "discard", "ellipse", "foreignObject", "g", "iframe", "image", "line", "path", "polygon", "polyline", "rect", "svg", "switch", "symbol", "text", "textPath", "tspan", "unknown", "use", "video", "view"]], 'aria-colcount' => [self::HTML_NAMESPACE => ["*"], self::SVG_NAMESPACE => ["a", "audio", "canvas", "circle", "discard", "ellipse", "foreignObject", "g", "iframe", "image", "line", "path", "polygon", "polyline", "rect", "svg", "switch", "symbol", "text", "textPath", "tspan", "unknown", "use", "video", "view"]], 'aria-colindex' => [self::HTML_NAMESPACE => ["*"], self::SVG_NAMESPACE => ["a", "audio", "canvas", "circle", "discard", "ellipse", "foreignObject", "g", "iframe", "image", "line", "path", "polygon", "polyline", "rect", "svg", "switch", "symbol", "text", "textPath", "tspan", "unknown", "use", "video", "view"]], 'aria-colspan' => [self::HTML_NAMESPACE => ["*"], self::SVG_NAMESPACE => ["a", "audio", "canvas", "circle", "discard", "ellipse", "foreignObject", "g", "iframe", "image", "line", "path", "polygon", "polyline", "rect", "svg", "switch", "symbol", "text", "textPath", "tspan", "unknown", "use", "video", "view"]], 'aria-controls' => [self::HTML_NAMESPACE => ["*"], self::SVG_NAMESPACE => ["a", "audio", "canvas", "circle", "discard", "ellipse", "foreignObject", "g", "iframe", "image", "line", "path", "polygon", "polyline", "rect", "svg", "switch", "symbol", "text", "textPath", "tspan", "unknown", "use", "video", "view"]], 
'aria-current' => [self::HTML_NAMESPACE => ["*"], self::SVG_NAMESPACE => ["a", "audio", "canvas",
/** @var array The baseline list of allowed attributes */
protected const BASELINE_ATTR = [
self::HTML_NAMESPACE => [
'abbr' => true,
'accept' => true,
'accept-charset' => true,
'accesskey' => true,
'action' => true,
'align' => true,
'alink' => true,
'allow' => true,
'allowfullscreen' => true,
'allowpaymentrequest' => true,
'alt' => true,
'anchor' => true,
'archive' => true,
'as' => true,
'async' => true,
'autocapitalize' => true,
'autocomplete' => true,
'autocorrect' => true,
'autofocus' => true,
'autopictureinpicture' => true,
'autoplay' => true,
'axis' => true,
'background' => true,
'behavior' => true,
'bgcolor' => true,
'border' => true,
'bordercolor' => true,
'capture' => true,
'cellpadding' => true,
'cellspacing' => true,
'challenge' => true,
'char' => true,
'charoff' => true,
'charset' => true,
'checked' => true,
'cite' => true,
'class' => true,
'classid' => true,
'clear' => true,
'code' => true,
'codebase' => true,
'codetype' => true,
'color' => true,
'cols' => true,
'colspan' => true,
'compact' => true,
'content' => true,
'contenteditable' => true,
'controls' => true,
'controlslist' => true,
'conversiondestination' => true,
'coords' => true,
'crossorigin' => true,
'csp' => true,
'data' => true,
'datetime' => true,
'declare' => true,
'decoding' => true,
'default' => true,
'defer' => true,
'dir' => true,
'direction' => true,
'dirname' => true,
'disabled' => true,
'disablepictureinpicture' => true,
'disableremoteplayback' => true,
'disallowdocumentaccess' => true,
'download' => true,
'draggable' => true,
'elementtiming' => true,
'enctype' => true,
'end' => true,
'enterkeyhint' => true,
'event' => true,
'exportparts' => true,
'face' => true,
'for' => true,
'form' => true,
'formaction' => true,
'formenctype' => true,
'formmethod' => true,
'formnovalidate' => true,
'formtarget' => true,
'frame' => true,
'frameborder' => true,
'headers' => true,
'height' => true,
'hidden' => true,
'high' => true,
'href' => true,
'hreflang' => true,
'hreftranslate' => true,
'hspace' => true,
'http-equiv' => true,
'id' => true,
'imagesizes' => true,
'imagesrcset' => true,
'importance' => true,
'impressiondata' => true,
'impressionexpiry' => true,
'incremental' => true,
'inert' => true,
'inputmode' => true,
'integrity' => true,
'invisible' => true,
'is' => true,
'ismap' => true,
'keytype' => true,
'kind' => true,
'label' => true,
'lang' => true,
'language' => true,
'latencyhint' => true,
'leftmargin' => true,
'link' => true,
'list' => true,
'loading' => true,
'longdesc' => true,
'loop' => true,
'low' => true,
'lowsrc' => true,
'manifest' => true,
'marginheight' => true,
'marginwidth' => true,
'max' => true,
'maxlength' => true,
'mayscript' => true,
'media' => true,
'method' => true,
'min' => true,
'minlength' => true,
'multiple' => true,
'muted' => true,
'name' => true,
'nohref' => true,
'nomodule' => true,
'nonce' => true,
'noresize' => true,
'noshade' => true,
'novalidate' => true,
'nowrap' => true,
'object' => true,
'open' => true,
'optimum' => true,
'part' => true,
'pattern' => true,
'ping' => true,
'placeholder' => true,
'playsinline' => true,
'policy' => true,
'poster' => true,
'preload' => true,
'pseudo' => true,
'readonly' => true,
'referrerpolicy' => true,
'rel' => true,
'reportingorigin' => true,
'required' => true,
'resources' => true,
'rev' => true,
'reversed' => true,
'role' => true,
'rows' => true,
'rowspan' => true,
'rules' => true,
'sandbox' => true,
'scheme' => true,
'scope' => true,
'scopes' => true,
'scrollamount' => true,
'scrolldelay' => true,
'scrolling' => true,
'select' => true,
'selected' => true,
'shadowroot' => true,
'shadowrootdelegatesfocus' => true,
'shape' => true,
'size' => true,
'sizes' => true,
'slot' => true,
'span' => true,
'spellcheck' => true,
'src' => true,
'srcdoc' => true,
'srclang' => true,
'srcset' => true,
'standby' => true,
'start' => true,
'step' => true,
'style' => true,
'summary' => true,
'tabindex' => true,
'target' => true,
'text' => true,
'title' => true,
'topmargin' => true,
'translate' => true,
'truespeed' => true,
'trusttoken' => true,
'type' => true,
'usemap' => true,
'valign' => true,
'value' => true,
'valuetype' => true,
'version' => true,
'virtualkeyboardpolicy' => true,
'vlink' => true,
'vspace' => true,
'webkitdirectory' => true,
'width' => true,
'wrap' => true,
],
];
/** @var array The default configuration structure */
protected const DEFAULT_CONF = [
'allowElements' => [
self::HTML_NAMESPACE => [
'a' => true,
'abbr' => true,
'acronym' => true,
'address' => true,
'area' => true,
'article' => true,
'aside' => true,
'audio' => true,
'b' => true,
'bdi' => true,
'bdo' => true,
'bgsound' => true,
'big' => true,
'blockquote' => true,
'body' => true,
'br' => true,
'button' => true,
'canvas' => true,
'caption' => true,
'center' => true,
'cite' => true,
'code' => true,
'col' => true,
'colgroup' => true,
'datalist' => true,
'dd' => true,
'del' => true,
'details' => true,
'dfn' => true,
'dialog' => true,
'dir' => true,
'div' => true,
'dl' => true,
'dt' => true,
'em' => true,
'fieldset' => true,
'figcaption' => true,
'figure' => true,
'font' => true,
'footer' => true,
'form' => true,
'h1' => true,
'h2' => true,
'h3' => true,
'h4' => true,
'h5' => true,
'h6' => true,
'head' => true,
'header' => true,
'hgroup' => true,
'hr' => true,
'html' => true,
'i' => true,
'img' => true,
'input' => true,
'ins' => true,
'kbd' => true,
'keygen' => true,
'label' => true,
'layer' => true,
'legend' => true,
'li' => true,
'link' => true,
'listing' => true,
'main' => true,
'map' => true,
'mark' => true,
'marquee' => true,
'menu' => true,
'meta' => true,
'meter' => true,
'nav' => true,
'nobr' => true,
'ol' => true,
'optgroup' => true,
'option' => true,
'output' => true,
'p' => true,
'picture' => true,
'popup' => true,
'pre' => true,
'progress' => true,
'q' => true,
'rb' => true,
'rp' => true,
'rt' => true,
'rtc' => true,
'ruby' => true,
's' => true,
'samp' => true,
'search' => true,
'section' => true,
'select' => true,
'selectmenu' => true,
'small' => true,
'source' => true,
'span' => true,
'strike' => true,
'strong' => true,
'style' => true,
'sub' => true,
'summary' => true,
'sup' => true,
'table' => true,
'tbody' => true,
'td' => true,
'tfoot' => true,
'th' => true,
'thead' => true,
'time' => true,
'tr' => true,
'track' => true,
'tt' => true,
'u' => true,
'ul' => true,
'var' => true,
'video' => true,
'wbr' => true,
]
],
'allowAttributes' => [
self::NULL_NAMESPACE => [
'abbr' => "*",
'accept' => "*",
'accept-charset' => "*",
'accesskey' => "*",
'action' => "*",
'align' => "*",
'alink' => "*",
'allow' => "*",
'allowfullscreen' => "*",
'alt' => "*",
'anchor' => "*",
'archive' => "*",
'as' => "*",
'async' => "*",
'autocapitalize' => "*",
'autocomplete' => "*",
'autocorrect' => "*",
'autofocus' => "*",
'autopictureinpicture' => "*",
'autoplay' => "*",
'axis' => "*",
'background' => "*",
'behavior' => "*",
'bgcolor' => "*",
'border' => "*",
'bordercolor' => "*",
'capture' => "*",
'cellpadding' => "*",
'cellspacing' => "*",
'challenge' => "*",
'char' => "*",
'charoff' => "*",
'charset' => "*",
'checked' => "*",
'cite' => "*",
'class' => "*",
'classid' => "*",
'clear' => "*",
'code' => "*",
'codebase' => "*",
'codetype' => "*",
'color' => "*",
'cols' => "*",
'colspan' => "*",
'compact' => "*",
'content' => "*",
'contenteditable' => "*",
'controls' => "*",
'controlslist' => "*",
'conversiondestination' => "*",
'coords' => "*",
'crossorigin' => "*",
'csp' => "*",
'data' => "*",
'datetime' => "*",
'declare' => "*",
'decoding' => "*",
'default' => "*",
'defer' => "*",
'dir' => "*",
'direction' => "*",
'dirname' => "*",
'disabled' => "*",
'disablepictureinpicture' => "*",
'disableremoteplayback' => "*",
'disallowdocumentaccess' => "*",
'download' => "*",
'draggable' => "*",
'elementtiming' => "*",
'enctype' => "*",
'end' => "*",
'enterkeyhint' => "*",
'event' => "*",
'exportparts' => "*",
'face' => "*",
'for' => "*",
'form' => "*",
'formaction' => "*",
'formenctype' => "*",
'formmethod' => "*",
'formnovalidate' => "*",
'formtarget' => "*",
'frame' => "*",
'frameborder' => "*",
'headers' => "*",
'height' => "*",
'hidden' => "*",
'high' => "*",
'href' => "*",
'hreflang' => "*",
'hreftranslate' => "*",
'hspace' => "*",
'http-equiv' => "*",
'id' => "*",
'imagesizes' => "*",
'imagesrcset' => "*",
'importance' => "*",
'impressiondata' => "*",
'impressionexpiry' => "*",
'incremental' => "*",
'inert' => "*",
'inputmode' => "*",
'integrity' => "*",
'invisible' => "*",
'is' => "*",
'ismap' => "*",
'keytype' => "*",
'kind' => "*",
'label' => "*",
'lang' => "*",
'language' => "*",
'latencyhint' => "*",
'leftmargin' => "*",
'link' => "*",
'list' => "*",
'loading' => "*",
'longdesc' => "*",
'loop' => "*",
'low' => "*",
'lowsrc' => "*",
'manifest' => "*",
'marginheight' => "*",
'marginwidth' => "*",
'max' => "*",
'maxlength' => "*",
'mayscript' => "*",
'media' => "*",
'method' => "*",
'min' => "*",
'minlength' => "*",
'multiple' => "*",
'muted' => "*",
'name' => "*",
'nohref' => "*",
'nomodule' => "*",
'nonce' => "*",
'noresize' => "*",
'noshade' => "*",
'novalidate' => "*",
'nowrap' => "*",
'object' => "*",
'open' => "*",
'optimum' => "*",
'part' => "*",
'pattern' => "*",
'ping' => "*",
'placeholder' => "*",
'playsinline' => "*",
'policy' => "*",
'poster' => "*",
'preload' => "*",
'pseudo' => "*",
'readonly' => "*",
'referrerpolicy' => "*",
'rel' => "*",
'reportingorigin' => "*",
'required' => "*",
'resources' => "*",
'rev' => "*",
'reversed' => "*",
'role' => "*",
'rows' => "*",
'rowspan' => "*",
'rules' => "*",
'sandbox' => "*",
'scheme' => "*",
'scope' => "*",
'scopes' => "*",
'scrollamount' => "*",
'scrolldelay' => "*",
'scrolling' => "*",
'select' => "*",
'selected' => "*",
'shadowroot' => "*",
'shadowrootdelegatesfocus' => "*",
'shape' => "*",
'size' => "*",
'sizes' => "*",
'slot' => "*",
'span' => "*",
'spellcheck' => "*",
'src' => "*",
'srcdoc' => "*",
'srclang' => "*",
'srcset' => "*",
'standby' => "*",
'start' => "*",
'step' => "*",
'style' => "*",
'summary' => "*",
'tabindex' => "*",
'target' => "*",
'text' => "*",
'title' => "*",
'topmargin' => "*",
'translate' => "*",
'truespeed' => "*",
'trusttoken' => "*",
'type' => "*",
'usemap' => "*",
'valign' => "*",
'value' => "*",
'valuetype' => "*",
'version' => "*",
'virtualkeyboardpolicy' => "*",
'vlink' => "*",
'vspace' => "*",
'webkitdirectory' => "*",
'width' => "*",
'wrap' => "*",
],
],
'allowCustomElements' => false,
'allowUnknownMarkup' => false,
'allowComments' => false,
'allowProcessingInstructions' => false,
'nullNamespaceAsHtml' => true,
];
/** @var array The parsed configuration, as used for processing
 *
 * This is the internal (normalized) representation built by the constructor:
 *
 * - element lists are maps of the form `[namespace => [localName => true]]`
 * - attribute lists are maps of the form `[namespace => [localName => elements]]`,
 *   where `elements` is either the string `"*"` or an element map as above
 * - boolean options are stored as booleans
 *
 * Options which were not supplied (or contained no valid entries) are absent
 */
protected $config;
/** Initializes a sanitizer with the provided configuration, or the default configuration if no configuration is provided
 *
 * The configuration array may contain any of the following keys:
 *
 * - `allowElements`: an indexed array of elements to retain in the tree. Elements not in this list will be treated as if they were included in the `blockElements` list
 * - `blockElements`: an indexed array of elements to remove from the tree while retaining their children
 * - `dropElements`: an indexed array of elements to remove from the tree along with their children
 * - `allowAttributes`: an indexed array of attributes to allow on certain elements. Attributes not in this list will be dropped
 * - `dropAttributes`: an indexed array of attributes to remove from certain elements
 * - `allowCustomElements`: Whether to allow custom elements, false by default. For the purposes of this implementation these are HTML elements with names containing dashes. If true, elements are still subject to the allow, block, and drop lists
 * - `allowUnknownMarkup`: Whether to allow non-standard elements which are not custom elements, false by default. If true, elements are still subject to the allow, block, and drop lists
 * - `allowComments`: Whether to retain comments, false by default
 * - `allowProcessingInstructions`: Whether to retain processing instructions, false by default. Processing instructions do not normally appear in HTML documents. This option is an extension to the specification
 * - `nullNamespaceAsHtml`: Whether to interpret elements from the tree in the null namespace as being in the HTML namespace, true by default. Per standard behaviour HTML elements have a namespace URI, but not all PHP-based parsers do this. This may be set to false when sanitizing XML documents. This option is an extension to the specification
 *
 * The entries in element lists may be strings, in which case these are interpreted as local names in the HTML namespace, or an array with the following keys:
 *
 * - `name`: The localName of the element
 * - `namespace`: The namespaceURI of the element, a string or null. If omitted the HTML namespace is assumed
 *
 * The entries in attribute lists are arrays with the following keys
 *
 * - `name`: The localName of the attribute
 * - `namespace`: The namespaceURI of the attribute. If omitted the null namespace is assumed
 * - `elements`: An indexed array of elements on which to allow the attribute, in the same format as other element lists. The string `"*"` may be supplied instead of an array to mean all elements
 *
 * Invalid entries (wrong type, empty or missing name, non-string namespace,
 * missing or empty element list) are silently ignored. A list which ends up
 * with no valid entries is omitted from the parsed configuration entirely.
 *
 * @param array|null $config A configuration to use instead of the default one
 */
public function __construct(?array $config = null) {
    if ($config === null) {
        // use the default configuration if none is specified
        $this->config = self::DEFAULT_CONF;
        return;
    }
    // otherwise validate the configuration; the specification provides
    // no clue as to what happens when the configuration is invalid,
    // so we'll just have to do our best
    $out = [];
    // start with the element lists
    foreach (["allowElements", "blockElements", "dropElements"] as $opt) {
        if (!isset($config[$opt]) || !is_array($config[$opt])) {
            continue;
        }
        $set = self::normalizeElementList($config[$opt]);
        if ($set) {
            // only record the list if it has at least one valid element
            $out[$opt] = $set;
        }
    }
    // continue with attribute lists
    foreach (["allowAttributes", "dropAttributes"] as $opt) {
        if (!isset($config[$opt]) || !is_array($config[$opt])) {
            continue;
        }
        foreach ($config[$opt] as $attr) {
            // an attribute entry must be an array with a non-empty string
            // name and an element list; each condition is tested on its
            // own because "&&" binds more tightly than "??"
            if (!is_array($attr) || !is_string($attr['name'] ?? null) || $attr['name'] === "" || !isset($attr['elements'])) {
                continue;
            }
            $name = $attr['name'];
            if (!isset($attr['namespace'])) {
                // the null namespace is assumed
                $ns = self::NULL_NAMESPACE;
            } elseif (is_string($attr['namespace'])) {
                // only use the namespace if it's a string
                $ns = $attr['namespace'];
            } else {
                // ignore any other value for the namespace (this is invalid)
                continue;
            }
            // now check the list of elements
            if ($attr['elements'] === "*") {
                // the special string "*" means any element
                $list = "*";
            } elseif (is_array($attr['elements'])) {
                // otherwise the element list is like the top-level element lists
                $list = self::normalizeElementList($attr['elements']);
            } else {
                // ignore any other value for the elements list (this is invalid)
                continue;
            }
            if (!$list) {
                // an empty element list is non-functional, so we can skip the attribute
                continue;
            }
            // add the attribute; intermediate arrays are created on demand
            $out[$opt][$ns][$name] = $list;
        }
    }
    // finally handle the boolean options
    foreach (["allowCustomElements", "allowUnknownMarkup", "allowComments", "allowProcessingInstructions", "nullNamespaceAsHtml"] as $opt) {
        if (isset($config[$opt])) {
            $out[$opt] = (bool) $config[$opt];
        }
    }
    // use the normalized configuration
    $this->config = $out;
}

/** Converts an element list from the external format to a map of the form `[namespace => [localName => true]]`
 *
 * Entries which are not valid element designations are skipped outright, so
 * that they can never inherit the namespace or name of a preceding entry
 *
 * @param array $list The user-supplied list of element names or name/namespace arrays
 */
private static function normalizeElementList(array $list): array {
    $out = [];
    foreach ($list as $el) {
        if (is_string($el) && $el !== "") {
            // strings are assumed to be in the HTML namespace
            $ns = self::HTML_NAMESPACE;
            $name = $el;
        } elseif (is_array($el) && is_string($el['name'] ?? null) && $el['name'] !== "") {
            $name = $el['name'];
            if (!array_key_exists("namespace", $el)) {
                // the namespace key being missing means the HTML namespace
                $ns = self::HTML_NAMESPACE;
            } elseif ($el['namespace'] === null) {
                // the null namespace is also possible (but will never match in HTML documents)
                $ns = self::NULL_NAMESPACE;
            } elseif (is_string($el['namespace'])) {
                // only use the namespace if it's a string
                $ns = $el['namespace'];
            } else {
                // ignore any other value for the namespace (this is invalid)
                continue;
            }
        } else {
            // anything else is not an element designation at all
            continue;
        }
        $out[$ns][$name] = true;
    }
    return $out;
}
/** Reports the configuration this sanitizer operates with
 *
 * The parsed configuration held by the instance is converted from the
 * internal representation to the external one before being returned
 */
public function getConfiguration(): array {
    $effective = $this->config;
    return $this->exportConfiguration($effective);
}
/** Reports the configuration used when none is supplied to the constructor
 *
 * The built-in default is converted from the internal representation to the
 * external one before being returned
 */
public function getDefaultConfiguration(): array {
    $default = self::DEFAULT_CONF;
    return $this->exportConfiguration($default);
}
/** Converts a configuration from the internal representation to the external representation
 *
 * Element maps of the form `[namespace => [localName => true]]` become indexed
 * lists, sorted by namespace and then by name, in which HTML elements are
 * plain strings and all other elements are `name`/`namespace` arrays.
 * Attribute maps become indexed lists of arrays with `name`, `elements` and
 * (for attributes outside the null namespace) `namespace` keys. The wildcard
 * element list `"*"` is exported unchanged. Boolean options are copied as-is.
 *
 * @param array $config The configuration in its internal representation
 */
protected function exportConfiguration(array $config): array {
    // element maps appear both at the top level and inside attribute
    // entries, so the conversion is shared between the two
    $exportElements = static function (array $sets): array {
        $list = [];
        ksort($sets);
        foreach ($sets as $ns => $set) {
            ksort($set);
            foreach ($set as $el => $t) {
                // array keys which look like integers are converted by PHP; undo that
                $el = (string) $el;
                if ((string) $ns === self::HTML_NAMESPACE) {
                    $list[] = $el;
                } else {
                    $list[] = ['name' => $el, 'namespace' => (string) $ns];
                }
            }
        }
        return $list;
    };
    $out = [];
    // start with the element lists
    foreach (["allowElements", "blockElements", "dropElements"] as $opt) {
        if (isset($config[$opt])) {
            $out[$opt] = $exportElements($config[$opt]);
        }
    }
    // continue with attribute lists
    foreach (["allowAttributes", "dropAttributes"] as $opt) {
        if (isset($config[$opt])) {
            $out[$opt] = [];
            ksort($config[$opt]);
            foreach ($config[$opt] as $ns => $set) {
                ksort($set);
                foreach ($set as $attr => $elems) {
                    // the element list is either a map of elements or the
                    // wildcard string "*", which cannot be sorted or iterated
                    $list = is_array($elems) ? $exportElements($elems) : "*";
                    if ((string) $ns === self::NULL_NAMESPACE) {
                        $out[$opt][] = ['name' => (string) $attr, 'elements' => $list];
                    } else {
                        $out[$opt][] = ['name' => (string) $attr, 'namespace' => (string) $ns, 'elements' => $list];
                    }
                }
            }
        }
    }
    // finally handle the boolean options
    foreach (["allowCustomElements", "allowUnknownMarkup", "allowComments", "allowProcessingInstructions", "nullNamespaceAsHtml"] as $opt) {
        if (isset($config[$opt])) {
            $out[$opt] = $config[$opt];
        }
    }
    // return the transformed configuration
    return $out;
}
/** Produces a sanitized copy of a document or document fragment, using the loaded configuration
 *
 * The input itself is never modified: its nodes are cloned into a new
 * document fragment, and it is that fragment which is sanitized and returned
 *
 * @param \DOMDocument|\DOMDocumentFragment $input The document or document fragment to sanitize
 * @throws \InvalidArgumentException if $input is any other kind of node
 */
public function sanitize(\DOMNode $input): \DOMDocumentFragment {
    $acceptable = $input instanceof \DOMDocument || $input instanceof \DOMDocumentFragment;
    if (!$acceptable) {
        throw new \InvalidArgumentException("Argument \$input must be of type \DOMDocument|\DOMDocumentFragment");
    }
    # To sanitize a given input of type Document or DocumentFragment:
    # 1. Create a document fragment from the input;
    $copy = $this->createDocumentFragment($input);
    # 2. Run the sanitize a document fragment algorithm on it;
    $this->sanitizeDocumentFragment($copy);
    # 3. Return the fragment.
    return $copy;
}
/** Creates a copy of the input as a document fragment, which can then be safely manipulated
 *
 * For a document the copy contains a deep clone of the document element; for
 * a document fragment it contains deep clones of the fragment's children.
 * A document without a document element, or a fragment without children,
 * yields an empty fragment.
 *
 * @param \DOMDocument|\DOMDocumentFragment $input The input to clone
 */
protected function createDocumentFragment(\DOMNode $input): \DOMDocumentFragment {
    assert($input instanceof \DOMDocument || $input instanceof \DOMDocumentFragment, new \Exception("Parameter \$input must be of type \DOMDocument|\DOMDocumentFragment"));
    # To create a document fragment named fragment from an input of type
    #   Document or DocumentFragment, run these steps:
    # Let node be null.
    # Switch based on input’s type:
    #   If input is of type DocumentFragment, then:
    #     Set node to input.
    #   If input is of type Document, then:
    #     Set node to input’s documentElement.
    if ($input instanceof \DOMDocument) {
        $node = $input->documentElement;
        $document = $input;
    } else {
        $node = $input;
        $document = $input->ownerDocument;
    }
    # Let fragment be a new DocumentFragment whose node document is node’s
    #   node document.
    // The fragment is created first so that there is something to return
    //   when the document has no document element
    $fragment = $document->createDocumentFragment();
    if ($node === null) {
        return $fragment;
    }
    # Let clone be the result of running clone a node on node with the
    #   clone children flag set.
    $clone = $node->cloneNode(true);
    # Append the node clone to fragment.
    // Appending a fragment moves its children; an empty fragment has
    //   nothing to move, and appending one is reported as an error by
    //   some versions of PHP, so that case is skipped
    if (!($clone instanceof \DOMDocumentFragment) || $clone->hasChildNodes()) {
        $fragment->appendChild($clone);
    }
    # Return fragment.
    return $fragment;
}
/** Iterates over a document fragment and sanitizes its nodes
 *
 * The fragment is modified in place: dropped nodes are removed along with
 * their descendants, while blocked nodes are removed and their children
 * take their place (and are then examined in turn)
 *
 * @param \DOMDocumentFragment $fragment The document fragment to sanitize
 */
protected function sanitizeDocumentFragment(\DOMDocumentFragment $fragment): void {
    # To sanitize a document fragment named fragment with a Sanitizer
    #   sanitizer run these steps:
    # Let m be a map that maps nodes to a sanitize action.
    # Let nodes be a list containing the inclusive descendants of fragment, in tree order.
    // Basically we won't be doing things this way. Rather than treating
    //   all of the fragment's nodes as a flat list, we'll walk the
    //   fragment and decide as we go, till we have visited every node
    $node = $fragment->firstChild;
    # For each node in nodes:
    while ($node) {
        # Let action be the result of running the sanitize a node algorithm on node with sanitizer.
        $action = $this->sanitizeNode($node);
        assert(in_array($action, [self::ACTION_BLOCK, self::ACTION_DROP, self::ACTION_KEEP], true), new \Exception("The result of sanitizing a node must be one of the three actions"));
        # Set m[node] to action.
        # For each node in nodes:
        // Again, we're taking action one node at a time; m[node] is simply $action
        if ($action === self::ACTION_DROP) {
            # If m[node] is drop, remove node.
            // The next node must be found before the removal, while the
            //   victim still has a parent and siblings; its children are
            //   going away with it, so they are not candidates
            $victim = $node;
            $node = $this->nextNode($node, false);
            $victim->parentNode->removeChild($victim);
        } elseif ($action === self::ACTION_BLOCK) {
            # If m[node] is block, create a DocumentFragment fragment,
            #   append all of node’s children to fragment, and replace
            #   node within node’s parent with fragment.
            // We get the same result without the intermediate fragment by
            //   moving each child in front of the victim and then removing
            //   the victim; this also works when there are no children.
            //   The next node (the victim's first child, if any) is found
            //   beforehand, and ends up in the victim's former position
            $victim = $node;
            $node = $this->nextNode($node, true);
            $parent = $victim->parentNode;
            while ($victim->firstChild) {
                $parent->insertBefore($victim->firstChild, $victim);
            }
            $parent->removeChild($victim);
        } else {
            # If m[node] is keep, do nothing.
            $node = $this->nextNode($node, true);
        }
    }
}
/** Examines a node and returns one of the sanitization actions
 *
 * Attributes may be removed as part of this function
 *
 * @param \DOMNode $node The node to examine; this must not be a document, document fragment, attribute, or document type
 * @return int One of the ACTION_* constants
 */
protected function sanitizeNode(\DOMNode $node): int {
    # To sanitize a node named node with sanitizer run these steps:
    # Assert: node is not a Document or DocumentFragment or Attr
    #   or DocumentType node.
    assert(!($node instanceof \DOMDocument || $node instanceof \DOMDocumentFragment || $node instanceof \DOMAttr || $node instanceof \DOMDocumentType), new \Exception("Invalid node Type ".get_class($node)));
    if ($node instanceof \DOMElement) {
        # If node is an element node:
        # Let element be node.
        $element = $node;
        # For each attr in element’s attribute list:
        // The attribute list is live, so removing attributes while
        //   iterating over it would cause some to be skipped; we iterate
        //   over a snapshot instead. Keys are discarded so that no two
        //   attributes can collide
        foreach (iterator_to_array($element->attributes, false) as $attr) {
            # Let attr action be the result of running the sanitize action
            #   for an attribute algorithm on attr and element.
            $attrAction = $this->sanitizeAttribute($attr);
            # If attr action is different from keep, remove an attribute
            #   supplying attr.
            if ($attrAction !== self::ACTION_KEEP) {
                $element->removeAttributeNode($attr);
            }
        }
        # Run the steps to handle funky elements on element.
        $this->handleFunkyElement($element);
        # Let action be the result of running the sanitize action for an
        #   element on element.
        # Return action.
        return $this->sanitizeElement($element);
    } elseif ($node instanceof \DOMComment) {
        # If node is a Comment node:
        # Let config be sanitizer’s configuration dictionary, or the default configuration if no configuration dictionary was given.
        # If config’s allow comments option exists and |config|[allowComments] is true: Return keep.
        if ($this->config['allowComments'] ?? self::DEFAULT_CONF['allowComments']) {
            return self::ACTION_KEEP;
        }
        # Return drop.
        return self::ACTION_DROP;
    } elseif ($node instanceof \DOMText) {
        # If node is a Text node: Return keep.
        return self::ACTION_KEEP;
    } else {
        # Assert: node is a ProcessingInstruction
        # Return drop.
        //DEVIATION: we allow processing instructions by configuration just like comments
        if ($this->config['allowProcessingInstructions'] ?? self::DEFAULT_CONF['allowProcessingInstructions']) {
            return self::ACTION_KEEP;
        }
        return self::ACTION_DROP;
    }
}
/** Examines an attribute and returns either the "keep" or the "drop" sanitization action
 *
 * The attribute itself is not modified; acting on the result is up to the caller
 *
 * @param \DOMAttr $attr The attribute to examine; it is expected to belong to an element
 * @return int Either ACTION_KEEP or ACTION_DROP
 */
protected function sanitizeAttribute(\DOMAttr $attr): int {
    # To determine the sanitize action for an attribute given a Sanitizer
    #   configuration dictionary config, run these steps:
    // The DOM reports the null namespace as null, whereas our tables use
    //   the NULL_NAMESPACE constant as a key
    $ns = $attr->namespaceURI ?? self::NULL_NAMESPACE;
    $name = $attr->localName;
    $eNs = $attr->ownerElement->namespaceURI;
    $eName = $attr->ownerElement->localName;
    if ($eNs === null) {
        // NOTE(review): this applies the documented "nullNamespaceAsHtml"
        //   option to the attribute's element; confirm that elements
        //   themselves are matched the same way
        $asHtml = $this->config['nullNamespaceAsHtml'] ?? self::DEFAULT_CONF['nullNamespaceAsHtml'];
        $eNs = $asHtml ? self::HTML_NAMESPACE : self::NULL_NAMESPACE;
    }
    # Let kind be attribute’s attribute kind.
    $set = self::KNOWN_ATTRIBUTES[$ns][$name][$eNs] ?? [];
    if (!array_intersect($set, [$eName, "*"])) {
        # The attribute kind of an attribute is one of regular
        #   or unknown. Let attribute kind be:
        # - unknown, if the [HTML] specification does not assign any
        #   meaning to attribute’s name.
        $kind = self::KIND_UNKNOWN;
    } else {
        # - regular, otherwise.
        $kind = self::KIND_REGULAR;
    }
    if ($kind === self::KIND_UNKNOWN) {
        # If kind is unknown and if config["allowUnknownMarkup"] does not exist or it config["allowUnknownMarkup"] is false: Return drop.
        if (!($this->config['allowUnknownMarkup'] ?? self::DEFAULT_CONF['allowUnknownMarkup'])) {
            return self::ACTION_DROP;
        }
    } else {
        # If kind is regular and attribute’s local name does not match any name in the baseline attribute allow list: Return drop.
        // Only the local name is compared, as the step above dictates,
        //   so every group of names in the baseline list is consulted
        $inBaseline = false;
        foreach (self::BASELINE_ATTR as $names) {
            if (isset($names[$name])) {
                $inBaseline = true;
                break;
            }
        }
        if (!$inBaseline) {
            return self::ACTION_DROP;
        }
    }
    // An attribute matches a list if the list has an entry for its
    //   namespace and name which covers either all elements ("*") or
    //   the attribute's element specifically
    $matches = static function (array $list) use ($ns, $name, $eNs, $eName): bool {
        $elements = $list[$ns][$name] ?? null;
        return $elements === "*" || (is_array($elements) && isset($elements[$eNs][$eName]));
    };
    # If attribute matches any attribute match list in config’s attribute drop list: Return drop.
    if ($matches($this->config['dropAttributes'] ?? [])) {
        return self::ACTION_DROP;
    }
    # If attribute allow list exists in config:
    #   Then let allow list be |config|["allowAttributes"].
    #   Otherwise: Let allow list be the default configuration's attribute allow list.
    $allowList = $this->config['allowAttributes'] ?? self::DEFAULT_CONF['allowAttributes'];
    # If attribute does not match any attribute match list in allow list: Return drop.
    if (!$matches($allowList)) {
        return self::ACTION_DROP;
    }
    # Return keep.
    return self::ACTION_KEEP;
}
/** Locates the node which follows $node in tree order, or null if there is none
 *
 * The first child of $node is the next node if it has one and children are
 * to be considered. Otherwise it is the nearest following sibling of $node
 * or, failing that, of its closest ancestor which has one.
 *
 * @param \DOMNode $node The context node
 * @param bool $considerChildren Whether or not child nodes are valid next nodes
 */
protected function nextNode(\DOMNode $node, bool $considerChildren): ?\DOMNode {
    // descend if we may and can
    if ($considerChildren && $node->hasChildNodes()) {
        return $node->firstChild;
    }
    // otherwise look sideways, climbing one level each time there is nothing there
    for ($current = $node; $current !== null; $current = $current->parentNode) {
        $sibling = $current->nextSibling;
        if ($sibling !== null) {
            return $sibling;
        }
    }
    // we ran out of ancestors: $node was the last node in the tree
    return null;
}
}