You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
 
 
 
 

534 lines
22 KiB

<?php
/** @license MIT
* Copyright 2023 J. King
* See LICENSE and AUTHORS files for details */
declare(strict_types=1);
namespace MensBeam\HTML;
/** An implementation of the W3C HTML Sanitizer API.
*
* The class implements the following methods:
*
* - constructor
* - sanitize
* - sanitizeFor
* - getConfiguration
* - getDefaultConfiguration
*
* @see https://wicg.github.io/sanitizer-api/
* @see https://github.com/WICG/sanitizer-api/issues
*/
abstract class AbstractSanitizer {
    /** @var string The HTML namespace */
    protected const HTML_NAMESPACE = "http://www.w3.org/1999/xhtml";
    /** @var string The MathML namespace */
    protected const MATHML_NAMESPACE = "http://www.w3.org/1998/Math/MathML";
    /** @var string The SVG namespace */
    protected const SVG_NAMESPACE = "http://www.w3.org/2000/svg";
    /** @var string The XLink namespace */
    protected const XLINK_NAMESPACE = "http://www.w3.org/1999/xlink";
    /** @var string The XML namespace */
    protected const XML_NAMESPACE = "http://www.w3.org/XML/1998/namespace";
    /** @var string The XMLNS namespace */
    protected const XMLNS_NAMESPACE = "http://www.w3.org/2000/xmlns/";
    /** @var string The null namespace; we use a null character as this is a hashable value in PHP (null is not), while it is an illegal character in XML and thus will not appear in a legitimate namespace URI */
    protected const NULL_NAMESPACE = "\x00";
    /** @var array The default configuration structure
     *
     * Element lists are keyed by namespace URI and then by local name, with
     * `true` as the value. Attribute lists are keyed by namespace URI and
     * then by local name, with either the string `"*"` (any element) or an
     * element list as the value. This is the same normalized shape the
     * constructor produces from a user-supplied configuration.
     */
    protected const DEFAULT_CONF = [
        'allowElements' => [
            self::HTML_NAMESPACE => [
                'a' => true,
                'abbr' => true,
                'acronym' => true,
                'address' => true,
                'area' => true,
                'article' => true,
                'aside' => true,
                'audio' => true,
                'b' => true,
                'bdi' => true,
                'bdo' => true,
                'bgsound' => true,
                'big' => true,
                'blockquote' => true,
                'body' => true,
                'br' => true,
                'button' => true,
                'canvas' => true,
                'caption' => true,
                'center' => true,
                'cite' => true,
                'code' => true,
                'col' => true,
                'colgroup' => true,
                'datalist' => true,
                'dd' => true,
                'del' => true,
                'details' => true,
                'dfn' => true,
                'dialog' => true,
                'dir' => true,
                'div' => true,
                'dl' => true,
                'dt' => true,
                'em' => true,
                'fieldset' => true,
                'figcaption' => true,
                'figure' => true,
                'font' => true,
                'footer' => true,
                'form' => true,
                'h1' => true,
                'h2' => true,
                'h3' => true,
                'h4' => true,
                'h5' => true,
                'h6' => true,
                'head' => true,
                'header' => true,
                'hgroup' => true,
                'hr' => true,
                'html' => true,
                'i' => true,
                'img' => true,
                'input' => true,
                'ins' => true,
                'kbd' => true,
                'keygen' => true,
                'label' => true,
                'layer' => true,
                'legend' => true,
                'li' => true,
                'link' => true,
                'listing' => true,
                'main' => true,
                'map' => true,
                'mark' => true,
                'marquee' => true,
                'menu' => true,
                'meta' => true,
                'meter' => true,
                'nav' => true,
                'nobr' => true,
                'ol' => true,
                'optgroup' => true,
                'option' => true,
                'output' => true,
                'p' => true,
                'picture' => true,
                'popup' => true,
                'pre' => true,
                'progress' => true,
                'q' => true,
                'rb' => true,
                'rp' => true,
                'rt' => true,
                'rtc' => true,
                'ruby' => true,
                's' => true,
                'samp' => true,
                'section' => true,
                'select' => true,
                'selectmenu' => true,
                'small' => true,
                'source' => true,
                'span' => true,
                'strike' => true,
                'strong' => true,
                'style' => true,
                'sub' => true,
                'summary' => true,
                'sup' => true,
                'table' => true,
                'tbody' => true,
                'td' => true,
                'tfoot' => true,
                'th' => true,
                'thead' => true,
                'time' => true,
                'tr' => true,
                'track' => true,
                'tt' => true,
                'u' => true,
                'ul' => true,
                'var' => true,
                'video' => true,
                'wbr' => true,
            ]
        ],
        'allowAttributes' => [
            self::NULL_NAMESPACE => [
                'abbr' => "*",
                'accept' => "*",
                'accept-charset' => "*",
                'accesskey' => "*",
                'action' => "*",
                'align' => "*",
                'alink' => "*",
                'allow' => "*",
                'allowfullscreen' => "*",
                'alt' => "*",
                'anchor' => "*",
                'archive' => "*",
                'as' => "*",
                'async' => "*",
                'autocapitalize' => "*",
                'autocomplete' => "*",
                'autocorrect' => "*",
                'autofocus' => "*",
                'autopictureinpicture' => "*",
                'autoplay' => "*",
                'axis' => "*",
                'background' => "*",
                'behavior' => "*",
                'bgcolor' => "*",
                'border' => "*",
                'bordercolor' => "*",
                'capture' => "*",
                'cellpadding' => "*",
                'cellspacing' => "*",
                'challenge' => "*",
                'char' => "*",
                'charoff' => "*",
                'charset' => "*",
                'checked' => "*",
                'cite' => "*",
                'class' => "*",
                'classid' => "*",
                'clear' => "*",
                'code' => "*",
                'codebase' => "*",
                'codetype' => "*",
                'color' => "*",
                'cols' => "*",
                'colspan' => "*",
                'compact' => "*",
                'content' => "*",
                'contenteditable' => "*",
                'controls' => "*",
                'controlslist' => "*",
                'conversiondestination' => "*",
                'coords' => "*",
                'crossorigin' => "*",
                'csp' => "*",
                'data' => "*",
                'datetime' => "*",
                'declare' => "*",
                'decoding' => "*",
                'default' => "*",
                'defer' => "*",
                'dir' => "*",
                'direction' => "*",
                'dirname' => "*",
                'disabled' => "*",
                'disablepictureinpicture' => "*",
                'disableremoteplayback' => "*",
                'disallowdocumentaccess' => "*",
                'download' => "*",
                'draggable' => "*",
                'elementtiming' => "*",
                'enctype' => "*",
                'end' => "*",
                'enterkeyhint' => "*",
                'event' => "*",
                'exportparts' => "*",
                'face' => "*",
                'for' => "*",
                'form' => "*",
                'formaction' => "*",
                'formenctype' => "*",
                'formmethod' => "*",
                'formnovalidate' => "*",
                'formtarget' => "*",
                'frame' => "*",
                'frameborder' => "*",
                'headers' => "*",
                'height' => "*",
                'hidden' => "*",
                'high' => "*",
                'href' => "*",
                'hreflang' => "*",
                'hreftranslate' => "*",
                'hspace' => "*",
                'http-equiv' => "*",
                'id' => "*",
                'imagesizes' => "*",
                'imagesrcset' => "*",
                'importance' => "*",
                'impressiondata' => "*",
                'impressionexpiry' => "*",
                'incremental' => "*",
                'inert' => "*",
                'inputmode' => "*",
                'integrity' => "*",
                'invisible' => "*",
                'is' => "*",
                'ismap' => "*",
                'keytype' => "*",
                'kind' => "*",
                'label' => "*",
                'lang' => "*",
                'language' => "*",
                'latencyhint' => "*",
                'leftmargin' => "*",
                'link' => "*",
                'list' => "*",
                'loading' => "*",
                'longdesc' => "*",
                'loop' => "*",
                'low' => "*",
                'lowsrc' => "*",
                'manifest' => "*",
                'marginheight' => "*",
                'marginwidth' => "*",
                'max' => "*",
                'maxlength' => "*",
                'mayscript' => "*",
                'media' => "*",
                'method' => "*",
                'min' => "*",
                'minlength' => "*",
                'multiple' => "*",
                'muted' => "*",
                'name' => "*",
                'nohref' => "*",
                'nomodule' => "*",
                'nonce' => "*",
                'noresize' => "*",
                'noshade' => "*",
                'novalidate' => "*",
                'nowrap' => "*",
                'object' => "*",
                'open' => "*",
                'optimum' => "*",
                'part' => "*",
                'pattern' => "*",
                'ping' => "*",
                'placeholder' => "*",
                'playsinline' => "*",
                'policy' => "*",
                'poster' => "*",
                'preload' => "*",
                'pseudo' => "*",
                'readonly' => "*",
                'referrerpolicy' => "*",
                'rel' => "*",
                'reportingorigin' => "*",
                'required' => "*",
                'resources' => "*",
                'rev' => "*",
                'reversed' => "*",
                'role' => "*",
                'rows' => "*",
                'rowspan' => "*",
                'rules' => "*",
                'sandbox' => "*",
                'scheme' => "*",
                'scope' => "*",
                'scopes' => "*",
                'scrollamount' => "*",
                'scrolldelay' => "*",
                'scrolling' => "*",
                'select' => "*",
                'selected' => "*",
                'shadowroot' => "*",
                'shadowrootdelegatesfocus' => "*",
                'shape' => "*",
                'size' => "*",
                'sizes' => "*",
                'slot' => "*",
                'span' => "*",
                'spellcheck' => "*",
                'src' => "*",
                'srcdoc' => "*",
                'srclang' => "*",
                'srcset' => "*",
                'standby' => "*",
                'start' => "*",
                'step' => "*",
                'style' => "*",
                'summary' => "*",
                'tabindex' => "*",
                'target' => "*",
                'text' => "*",
                'title' => "*",
                'topmargin' => "*",
                'translate' => "*",
                'truespeed' => "*",
                'trusttoken' => "*",
                'type' => "*",
                'usemap' => "*",
                'valign' => "*",
                'value' => "*",
                'valuetype' => "*",
                'version' => "*",
                'virtualkeyboardpolicy' => "*",
                'vlink' => "*",
                'vspace' => "*",
                'webkitdirectory' => "*",
                'width' => "*",
                'wrap' => "*",
            ],
        ],
        'allowCustomElements' => false,
        'allowUnknownMarkup' => false,
        'allowComments' => false,
        'allowProcessingInstructions' => false,
        'nullNamespaceAsHtml' => true,
    ];

    /** @var array The parsed configuration, as used for processing */
    protected $config;

    /** Initializes a sanitizer with the provided configuration, or the default configuration if no configuration is provided
     *
     * The configuration array may contain any of the following keys:
     *
     * - `allowElements`: an indexed array of elements to retain in the tree. Elements not in this list will be treated as if they were included in the `blockElements` list
     * - `blockElements`: an indexed array of elements to remove from the tree while retaining their children
     * - `dropElements`: an indexed array of elements to remove from the tree along with their children
     * - `allowAttributes`: an indexed array of attributes to allow on certain elements. Attributes not in this list will be dropped
     * - `dropAttributes`: an indexed array of attributes to remove from certain elements
     * - `allowCustomElements`: Whether to allow custom elements, false by default. For the purposes of this implementation these are HTML elements with names containing dashes. If true, elements are still subject to the allow, block, and drop lists
     * - `allowUnknownMarkup`: Whether to allow non-standard elements which are not custom elements, false by default. If true, elements are still subject to the allow, block, and drop lists
     * - `allowComments`: Whether to retain comments, false by default
     * - `allowProcessingInstructions`: Whether to retain processing instructions, false by default. Processing instructions do not normally appear in HTML documents. This option is an extension to the specification
     * - `nullNamespaceAsHtml`: Whether to interpret elements from the tree in the null namespace as being in the HTML namespace, true by default. Per standard behaviour HTML elements have a namespace URI, but not all PHP-based parsers do this. This may be set to false when sanitizing XML documents. This option is an extension to the specification
     *
     * The entries in element lists may be strings, in which case these are interpreted as local names in the HTML namespace, or an array with the following keys:
     *
     * - `name`: The localName of the element
     * - `namespace`: The namespaceURI of the element, a string or null. If omitted the HTML namespace is assumed
     *
     * The entries in attribute lists are arrays with the following keys
     *
     * - `name`: The localName of the attribute
     * - `namespace`: The namespaceURI of the attribute. If omitted the null namespace is assumed
     * - `elements`: An indexed array of elements on which to allow the attribute, in the same format as other element lists. The string `"*"` may be supplied instead of an array to mean all elements
     *
     * Invalid entries (wrong type, empty or non-string name, non-string
     * namespace, missing or malformed `elements`) are silently ignored. A
     * boolean option which is absent or null takes its default value.
     *
     * @param array|null $config A configuration to use instead of the default one
     */
    public function __construct(?array $config = null) {
        if ($config === null) {
            // use the default configuration if none is specified
            $this->config = self::DEFAULT_CONF;
            return;
        }
        // otherwise validate the configuration; the specification provides
        // no clue as to what happens when the configuration is invalid,
        // so we'll just have to do our best
        $out = [];
        // start with the element lists
        foreach (["allowElements", "blockElements", "dropElements"] as $opt) {
            if (isset($config[$opt]) && is_array($config[$opt])) {
                $list = self::normalizeConfigElementList($config[$opt]);
                // a list is only recorded if it has at least one valid entry
                if ($list) {
                    $out[$opt] = $list;
                }
            }
        }
        // continue with attribute lists
        foreach (["allowAttributes", "dropAttributes"] as $opt) {
            if (!isset($config[$opt]) || !is_array($config[$opt])) {
                continue;
            }
            foreach ($config[$opt] as $attr) {
                // an attribute entry must be an array with a non-empty
                // string name and an element list of some kind
                if (
                    !is_array($attr)
                    || !is_string($attr['name'] ?? null)
                    || $attr['name'] === ""
                    || !isset($attr['elements'])
                ) {
                    continue;
                }
                if (!isset($attr['namespace'])) {
                    // the null namespace is assumed
                    $ns = self::NULL_NAMESPACE;
                } elseif (is_string($attr['namespace'])) {
                    // only use the namespace if it's a string
                    $ns = $attr['namespace'];
                } else {
                    // ignore any other value for the namespace (this is invalid)
                    continue;
                }
                // now check the list of elements
                if ($attr['elements'] === "*") {
                    // the special string "*" means any element
                    $list = "*";
                } elseif (is_array($attr['elements'])) {
                    // otherwise the element list is like the element lists handled above
                    $list = self::normalizeConfigElementList($attr['elements']);
                } else {
                    // ignore any other value for the elements list (this is invalid)
                    continue;
                }
                // add the attribute; intermediate arrays are created as needed
                $out[$opt][$ns][$attr['name']] = $list;
            }
        }
        // finally handle the boolean options; the presence test must come
        // before the cast, otherwise a missing option is cast to false and
        // the default is never consulted
        foreach (["allowCustomElements", "allowUnknownMarkup", "allowComments", "allowProcessingInstructions", "nullNamespaceAsHtml"] as $opt) {
            $out[$opt] = isset($config[$opt]) ? (bool) $config[$opt] : self::DEFAULT_CONF[$opt];
        }
        // use the normalized configuration
        $this->config = $out;
    }

    /** Normalizes a user-supplied element list into a map of namespace URI to local name to `true`
     *
     * Entries which are not valid element descriptions are skipped.
     *
     * @param array $list The indexed array of element entries from the configuration
     * @return array The normalized list; empty if no entry was valid
     */
    private static function normalizeConfigElementList(array $list): array {
        $out = [];
        foreach ($list as $el) {
            $pair = self::normalizeConfigElement($el);
            if ($pair !== null) {
                [$ns, $name] = $pair;
                $out[$ns][$name] = true;
            }
        }
        return $out;
    }

    /** Normalizes a single element entry into a namespace/local-name pair
     *
     * @param mixed $el The entry: either a non-empty string, or an array with a `name` key and optional `namespace` key
     * @return array|null A two-member indexed array of namespace URI and local name, or null if the entry is invalid
     */
    private static function normalizeConfigElement($el): ?array {
        if (is_string($el) && $el !== "") {
            // strings are assumed to be in the HTML namespace
            return [self::HTML_NAMESPACE, $el];
        }
        if (!is_array($el) || !is_string($el['name'] ?? null) || $el['name'] === "") {
            // anything else without a usable name is invalid
            return null;
        }
        if (!array_key_exists("namespace", $el)) {
            // the namespace key being missing means the HTML namespace
            $ns = self::HTML_NAMESPACE;
        } elseif ($el['namespace'] === null) {
            // the null namespace is also possible (but will never match in HTML documents)
            $ns = self::NULL_NAMESPACE;
        } elseif (is_string($el['namespace'])) {
            // only use the namespace if it's a string
            $ns = $el['namespace'];
        } else {
            // ignore any other value for the namespace (this is invalid)
            return null;
        }
        return [$ns, $el['name']];
    }
}