A modern, accurate HTML parser and serializer for PHP
You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

205 lines
9.2 KiB

<?php
declare(strict_types=1);
namespace dW\HTML5;
/**
 * Entry point of the HTML parser.
 *
 * The public API is static (parse() and parseFragment()). Each call keeps its
 * working state in a short-lived instance of this class, reachable through
 * static::$instance while the parse is running and torn down when it ends.
 */
class Parser {
    /* Non-static properties */

    // Input data that's being parsed, uses Data
    protected $data;
    // The DOMDocument that is assembled by the tree builder
    protected $DOM;
    // If parsed as a fragment a fragment is assembled instead
    protected $DOMFragment;
    // The form element pointer points to the last form element that was opened and
    // whose end tag has not yet been seen. It is used to make form controls associate
    // with forms in the face of dramatically bad markup, for historical reasons. It is
    // ignored inside template elements
    protected $formElement;
    // Flag that shows whether the content that's being parsed is a fragment or not
    protected $fragmentCase = false;
    // Context element for fragments
    protected $fragmentContext;
    // Tokenizer state selected by parseFragment() from the context node's name. It
    // is stored here because the tokenizer itself is only constructed in parse().
    protected $fragmentTokenizerState;
    // Used for the instance of ParseError
    protected $parseError;
    // The stack of open elements, uses OpenElementsStack
    protected $stack;
    // Used to store the template insertion modes, uses TemplateInsertionModesStack
    protected $templateInsertionModes;
    // Instance of the Tokenizer class used for creating tokens
    protected $tokenizer;
    // Instance of the TreeBuilder class used for building the document
    protected $treeBuilder;

    /* Static properties */

    public static $fallbackEncoding = "UTF-8";
    // Property used as an instance for the non-static properties
    protected static $instance;

    // Namespace constants
    const HTML_NAMESPACE = 'http://www.w3.org/1999/xhtml';
    const MATHML_NAMESPACE = 'http://www.w3.org/1998/Math/MathML';
    const SVG_NAMESPACE = 'http://www.w3.org/2000/svg';
    const XLINK_NAMESPACE = 'http://www.w3.org/1999/xlink';
    const XML_NAMESPACE = 'http://www.w3.org/XML/1998/namespace';
    const XMLNS_NAMESPACE = 'http://www.w3.org/2000/xmlns/';

    // Maps each namespace URI above to a short name ("" for the HTML namespace)
    const NAMESPACE_MAP = [
        self::HTML_NAMESPACE   => "",
        self::MATHML_NAMESPACE => "math",
        self::SVG_NAMESPACE    => "svg",
        self::XLINK_NAMESPACE  => "xlink",
        self::XML_NAMESPACE    => "xml",
        self::XMLNS_NAMESPACE  => "xmlns",
    ];

    /**
     * Protected construct used for creating an instance to access properties
     * which must be reset on every parse. The new object registers itself as
     * the current instance.
     */
    protected function __construct() {
        static::$instance = $this;
    }

    /**
     * Releases the per-parse state.
     *
     * parse() calls this explicitly and PHP calls it again when the object is
     * finally freed, so it has to be safe to run more than once and on an
     * instance whose tree builder was never created.
     */
    public function __destruct() {
        if ($this->treeBuilder !== null) {
            $this->treeBuilder->__destruct();
            $this->treeBuilder = null;
        }
        // Only unregister ourselves; a newer instance may already have taken the
        // slot (the constructor overwrites it before the old object is freed).
        if (static::$instance === $this) {
            static::$instance = null;
        }
    }

    /**
     * Parses a string (or file) of HTML into a document.
     *
     * @param string $data The markup itself, or a file path when $file is true
     * @param Document|null $document An empty document to parse into; a new
     *        Document is created when null. Ignored when called by
     *        parseFragment(), which has already chosen the document.
     * @param bool $file Whether $data is a path rather than markup
     * @return Document The document that was parsed into
     * @throws Exception If the supplied document already has child nodes
     */
    public static function parse(string $data, Document $document = null, bool $file = false) {
        // If parse() is called by parseFragment() then don't create an instance. It
        // has already been created, together with its DOM, context and form pointer.
        $parser = static::$instance;
        if (!($parser instanceof self && $parser->fragmentCase)) {
            // Validate before creating the instance so a rejected document doesn't
            // leave a half-initialized instance behind.
            if (!is_null($document) && $document->hasChildNodes()) {
                throw new Exception(Exception::PARSER_NONEMPTY_DOCUMENT);
            }

            $parser = new self;
            $parser->DOM = $document ?? new Document();
        }

        // Initialize the parse error handler.
        $parser->parseError = new ParseError;
        $parser->parseError->setHandler();

        try {
            // Process the input stream.
            $parser->data = new Data(($file === true) ? '' : $data, ($file === true) ? $data : 'STDIN', $parser->parseError);

            // Set the locale for CTYPE to en_US.UTF8 so ctype functions and strtolower only
            // work on basic latin characters. Used extensively when tokenizing.
            setlocale(LC_CTYPE, 'en_US.UTF8');

            // Initialize the stack of open elements.
            $parser->stack = new OpenElementsStack($parser->fragmentCase, $parser->fragmentContext);

            // Initialize the template insertion modes stack if necessary.
            if (is_null($parser->templateInsertionModes)) {
                $parser->templateInsertionModes = new TemplateInsertionModesStack();
            }

            // Initialize the tokenizer.
            $parser->tokenizer = new Tokenizer($parser->data, $parser->stack, $parser->parseError);

            // Apply the tokenizer state parseFragment() picked for the context element.
            if ($parser->fragmentCase && !is_null($parser->fragmentTokenizerState)) {
                $parser->tokenizer->state = $parser->fragmentTokenizerState;
            }

            // Initialize the tree builder.
            $parser->treeBuilder = new TreeBuilder($parser->DOM, $parser->formElement, $parser->fragmentCase, $parser->fragmentContext, $parser->stack, $parser->templateInsertionModes, $parser->tokenizer, $parser->parseError, $parser->data);

            if ($parser->fragmentCase) {
                # Reset the parser's insertion mode appropriately.
                // DEVIATION: The insertion mode will be always 'in body', not 'before head' if
                // there isn't a context. There isn't a need to reconstruct a valid HTML
                // document when using a DOMDocumentFragment.
                // NOTE(review): assumes this static call acts on the tree builder
                // constructed just above -- confirm against TreeBuilder.
                TreeBuilder::resetInsertionMode();
            }

            // Run the tokenizer. Tokenizer runs until after the EOF token is emitted.
            do {
                $token = $parser->tokenizer->createToken();
                $parser->treeBuilder->emitToken($token);
            } while (!$token instanceof EOFToken);

            // Fix id attributes before outputting.
            $parser->DOM->fixIdAttributes();

            $dom = $parser->DOM;
        } finally {
            // Use the local reference here: the teardown below clears
            // static::$instance, so it can't be used to reach the error handler.
            $parser->parseError->clearHandler();
            // The Parser instance has no need to exist when finished, whether the
            // parse succeeded or not. Leaving a fragment instance registered would
            // make the next parse() call mistake itself for a fragment parse.
            $parser->__destruct();
        }

        return $dom;
    }

    /**
     * Parses a string (or file) of HTML as a fragment.
     *
     * @param string $data The markup itself, or a file path when $file is true
     * @param Element|null $context The context element; when null the document
     *        fragment itself serves as the context
     * @param bool $file Whether $data is a path rather than markup
     * @return \DOMDocumentFragment The fragment created for this parse
     */
    public static function parseFragment(string $data, Element $context = null, bool $file = false): \DOMDocumentFragment {
        // Create an instance of this class to use the non static properties.
        $parser = new self;
        $parser->DOM = (is_null($context)) ? new Document() : $context->ownerDocument;
        $parser->DOMFragment = $parser->DOM->createDocumentFragment();

        // DEVIATION: The spec says to let the document be in quirks mode if the
        // DOMDocument is in quirks mode. Cannot check whether the context element is in
        // quirks mode, so going to assume it isn't.

        // DEVIATION: The spec's version of parsing fragments isn't remotely useful in
        // the context this library is intended for use in. This implementation uses a
        // DOMDocumentFragment for inserting nodes into. There's no need to have a
        // different process for when there isn't a context. There will always be one:
        // the DOMDocumentFragment.
        $parser->fragmentContext = $context ?? $parser->DOMFragment;
        $name = $parser->fragmentContext->nodeName;

        # Set the state of the HTML parser's tokenization stage as follows:
        // The tokenizer doesn't exist yet, so only record the state; parse() applies
        // it right after constructing the tokenizer.
        switch ($name) {
            case 'title':
            case 'textarea':
                $parser->fragmentTokenizerState = Tokenizer::RCDATA_STATE;
                break;
            case 'style':
            case 'xmp':
            case 'iframe':
            case 'noembed':
            case 'noframes':
                $parser->fragmentTokenizerState = Tokenizer::RAWTEXT_STATE;
                break;
            case 'script':
                $parser->fragmentTokenizerState = Tokenizer::SCRIPT_STATE;
                break;
            case 'noscript':
                $parser->fragmentTokenizerState = Tokenizer::NOSCRIPT_STATE;
                break;
            case 'plaintext':
                $parser->fragmentTokenizerState = Tokenizer::PLAINTEXT_STATE;
                break;
            default:
                $parser->fragmentTokenizerState = Tokenizer::DATA_STATE;
        }

        // DEVIATION: Since this implementation uses a DOMDocumentFragment for insertion
        // there is no need to create an html element for inserting stuff into.

        # If the context element is a template element, push "in template" onto the
        # stack of template insertion modes so that it is the new current template
        # insertion mode.
        if ($context instanceof Element && $name === 'template') {
            // Same stack class parse() creates, so the tree builder always receives
            // the same type.
            $parser->templateInsertionModes = new TemplateInsertionModesStack();
            $parser->templateInsertionModes[] = TreeBuilder::IN_TEMPLATE_MODE;
        }

        # Set the parser's form element pointer to the nearest node to the context element
        # that is a form element (going straight up the ancestor chain, and including the
        # element itself, if it is a form element), if any. (If there is no such form
        # element, the form element pointer keeps its initial value, null.)
        if (!is_null($context)) {
            $parser->formElement = ($name === 'form') ? $context : DOM::getAncestor('form', $context);
        }

        # Start the parser and let it run until it has consumed all the characters just
        # inserted into the input stream.
        $parser->fragmentCase = true;
        // parse() tears the instance down when it finishes, so take the fragment now.
        $fragment = $parser->DOMFragment;
        static::parse($data, null, $file);

        # If there is a context element, return the child nodes of root, in tree order.
        # Otherwise, return the children of the Document object, in tree order.
        // DEVIATION: This method will always return a DOMDocumentFragment.
        return $fragment;
    }
}