First pass at high-level API

J. King 3 years ago
@ -60,34 +60,23 @@ class Document extends \DOMDocument {
public function load($source, $options = null): bool {
Parser::parse((string)$source, $this, true);
/**
 * Fetches the resource named by $source and parses it into this document.
 *
 * The resource is read with Parser::fetchFile(), which yields the raw data
 * together with any encoding it could determine; both are then handed to
 * Parser::parse() along with $source as the file name.
 *
 * @param mixed $source Location of the resource to load
 * @param mixed $options Not used by this implementation
 * @param string|null $encodingOrContentType Optional encoding hint forwarded to Parser::fetchFile()
 * @return bool False when Parser::fetchFile() yields no result, true otherwise
 */
public function load($source, $options = null, ?string $encodingOrContentType = null): bool {
// NOTE(review): $source is passed uncast here but cast to string for parse() below — confirm fetchFile() is never given a non-string
$data = Parser::fetchFile($source, $encodingOrContentType);
if (!$data) {
// Nothing could be fetched, so report failure without touching the document
return false;
// Replace the caller's hint with whatever encoding fetchFile() settled on
[$data, $encodingOrContentType] = $data;
Parser::parse($data, $this, $encodingOrContentType, null, (string) $source);
return true;
public function loadHTML($source, $options = null): bool {
Parser::parse((string)$source, $this);
/**
 * Parses a string of markup into this document.
 *
 * @param mixed $source The markup to parse; cast to string before parsing
 * @param mixed $options Not used by this implementation
 * @param string|null $encodingOrContentType Optional encoding hint forwarded to Parser::parse()
 * @return bool Always true when Parser::parse() returns normally
 */
public function loadHTML($source, $options = null, ?string $encodingOrContentType = null): bool {
Parser::parse((string)$source, $this, $encodingOrContentType);
return true;
/**
 * Not available on this class: calling it always throws.
 *
 * @throws Exception Always, with code Exception::DOM_DISABLED_METHOD and this class and method name
 */
public function loadXML($source, $options = null) {
throw new Exception(Exception::DOM_DISABLED_METHOD, __CLASS__, __FUNCTION__);
/**
 * Not available on this class: calling it always throws.
 *
 * @throws Exception Always, with code Exception::DOM_DISABLED_METHOD and this class and method name
 */
public function save($filename, $options = null) {
throw new Exception(Exception::DOM_DISABLED_METHOD, __CLASS__, __FUNCTION__);
/**
 * Returns the string produced by this document's serialize() method.
 *
 * @param \DOMNode|null $node Forwarded unchanged to serialize(); presumably null means the whole document — TODO confirm against serialize()
 * @return string Whatever serialize() returns
 */
public function saveHTML(\DOMNode $node = null): string {
return $this->serialize($node);
/**
 * Currently a no-op: the body is empty, so nothing is written and null is returned.
 *
 * NOTE(review): unlike save() and saveXML() this neither throws nor does any work — confirm whether it is meant to be disabled or is still to be implemented.
 */
public function saveHTMLFile($filename) {}
/**
 * Not available on this class: calling it always throws.
 *
 * @throws Exception Always, with code Exception::DOM_DISABLED_METHOD and this class and method name
 */
public function saveXML(\DOMNode $node = null, $options = null) {
throw new Exception(Exception::DOM_DISABLED_METHOD, __CLASS__, __FUNCTION__);
public function createElement($name, $value = "") {
try {
$e = parent::createElement($name, $value);


@ -10,6 +10,7 @@ class Data {
// Used to get the file path for error reporting.
public $filePath;
// Whether the encoding is certain or tentative; this is a feature of the specification, but not relevant for this implementation
public $encodingCertain = false;
// Internal storage for the Intl data object.
@ -37,12 +38,7 @@ class Data {
public function __construct(string $data, string $filePath = 'STDIN', ParseError $errorHandler = null, string $encodingOrContentType = '') {
$this->errorHandler = $errorHandler ?? new ParseError;
if ($filePath !== 'STDIN') {
$this->filePath = realpath($filePath);
$data = file_get_contents($this->filePath);
} else {
$this->filePath = $filePath;
$this->filePath = $filePath;
if ($encoding = Charset::fromBOM($data)) {
// encoding determined from Unicode byte order mark
@ -55,7 +51,7 @@ class Data {
// Encoding is tentative
} else {
// Encoding is tentative; fall back to the configured default encoding
$encoding = Parser::$fallbackEncoding;
$encoding = Charset::fromCharset(Parser::$fallbackEncoding) ?? "windows-1252";
$this->data = Encoding::createDecoder($encoding, $data, false, true);


@ -3,41 +3,7 @@ declare(strict_types=1);
namespace dW\HTML5;
class Parser {
/* Non-static properties */
// Input data that's being parsed, uses Data
protected $data;
// The DOMDocument that is assembled by the tree builder
protected $DOM;
// When parsing a fragment, a document fragment is assembled here instead of a full document
protected $DOMFragment;
// The form element pointer points to the last form element that was opened and
// whose end tag has not yet been seen. It is used to make form controls associate
// with forms in the face of dramatically bad markup, for historical reasons. It is
// ignored inside template elements
protected $formElement;
// Flag that shows whether the content that's being parsed is a fragment or not
protected $fragmentCase = false;
// Context element for fragments
protected $fragmentContext;
// Used for the instance of ParseError
protected $parseError;
// The stack of open elements, uses Stack
protected $stack;
// Used to store the template insertion modes
protected $templateInsertionModes;
// Instance of the Tokenizer class used for creating tokens
protected $tokenizer;
// Instance of the TreeBuilder class used for building the document
protected $treeBuilder;
/* Static properties */
public static $fallbackEncoding = "UTF-8";
// Instance of this class that holds the non-static properties for the current parse
protected static $instance;
public static $fallbackEncoding = "windows-1252";
// Namespace constants
const HTML_NAMESPACE = '';
@ -56,149 +22,68 @@ class Parser {
self::XMLNS_NAMESPACE => "xmlns",
// Protected construct used for creating an instance to access properties which must
// be reset on every parse
protected function __construct() {
static::$instance = $this;
public function __destruct() {
static::$instance = null;
public static function parse(string $data, Document $document = null, bool $file = false) {
// If parse() is called by parseFragment() then don't create an instance. It has
// already been created.
$c = __CLASS__;
if (!(static::$instance instanceof $c && !static::$instance->fragmentCase)) {
static::$instance = new $c;
if (is_null($document)) {
static::$instance->DOM = new Document();
} else {
if ($document->hasChildNodes()) {
throw new Exception(Exception::PARSER_NONEMPTY_DOCUMENT);
static::$instance->DOM = $document;
// Initialize the parse error handler.
static::$instance->parseError = new ParseError;
public static function parse(string $data, ?Document $document = null, ?string $encodingOrContentType = null, ?\DOMElement $fragmentContext = null, ?String $file = null): Document {
// Initialize the various classes needed for parsing
$document = $document ?? new Document;
$errorHandler = new ParseError;
$decoder = new Data($data, $file ?? "STDIN", $errorHandler, $encodingOrContentType);
$stack = new OpenElementsStack($fragmentContext);
$tokenizer = new Tokenizer($decoder, $stack, $errorHandler);
$treeBuilder = new TreeBuilder($document, $decoder, $tokenizer, $errorHandler, $stack, new TemplateInsertionModesStack, $fragmentContext);
// Override error handling
try {
// Process the input stream.
static::$instance->data = new Data(($file === true) ? '' : $data, ($file === true) ? $data : 'STDIN', static::$instance->parseError);
// Set the locale for CTYPE to en_US.UTF8 so ctype functions and strtolower only
// work on basic latin characters. Used extensively when tokenizing.
setlocale(LC_CTYPE, 'en_US.UTF8');
// Initialize the stack of open elements.
static::$instance->stack = new OpenElementsStack(static::$instance->fragmentCase, static::$instance->fragmentContext);
// Initialize the template insertion modes stack if necessary.
if (is_null(static::$instance->templateInsertionModes)) {
static::$instance->templateInsertionModes = new TemplateInsertionModesStack();
// Initialize the tokenizer.
static::$instance->tokenizer = new Tokenizer(static::$instance->data, static::$instance->stack, static::$instance->parseError);
// Initialize the tree builder.
static::$instance->treeBuilder = new TreeBuilder(static::$instance->DOM, static::$instance->formElement, static::$instance->fragmentCase, static::$instance->fragmentContext, static::$instance->stack, static::$instance->templateInsertionModes, static::$instance->tokenizer, static::$instance->parseError, static::$instance->data);
// Run the tokenizer. Tokenizer runs until after the EOF token is emitted.
// run the parser to completion
do {
$token = static::$instance->tokenizer->createToken();
$token = $tokenizer->createToken();
} while (!$token instanceof EOFToken);
// Fix id attributes before outputting.
// The Parser instance has no need to exist when finished.
$dom = static::$instance->DOM;
} finally {
// Restore error handling
return $dom;
return $document;
public static function parseFragment(string $data, Element $context = null, bool $file = false): \DOMDocument {
// Create an instance of this class to use the non static properties.
$c = __CLASS__;
static::$instance = new $c;
static::$instance->DOM = (is_null($context)) ? new Document() : $context->ownerDocument;
static::$instance->DOMFragment = static::$instance->DOM->createDocumentFragment();
// DEVIATION: The spec says to let the document be in quirks mode if the
// DOMDocument is in quirks mode. Cannot check whether the context element is in
// quirks mode, so going to assume it isn't.
// DEVIATION: The spec's version of parsing fragments isn't remotely useful in
// the context this library is intended for use in. This implementation uses a
// DOMDocumentFragment for inserting nodes into. There's no need to have a
// different process for when there isn't a context. There will always be one:
// the DOMDocumentFragment.
static::$instance->fragmentContext = (!is_null($context)) ? $context : static::$instance->DOMFragment;
$name = static::$instance->fragmentContext->nodeName;
# Set the state of the HTML parser's tokenization stage as follows:
switch($name) {
case 'title':
case 'textarea': static::$instance->tokenizer->state = Tokenizer::RCDATA_STATE;
case 'style':
case 'xmp':
case 'iframe':
case 'noembed':
case 'noframes': static::$instance->tokenizer->state = Tokenizer::RAWTEXT_STATE;
case 'script': static::$instance->tokenizer->state = Tokenizer::SCRIPT_STATE;
case 'noscript': static::$instance->tokenizer->state = Tokenizer::NOSCRIPT_STATE;
case 'plaintext': static::$instance->tokenizer->state = Tokenizer::PLAINTEXT_STATE;
default: static::$instance->tokenizer->state = Tokenizer::DATA_STATE;
/**
 * Parses a string of markup as a fragment and returns the resulting nodes in
 * a document fragment created by the target document.
 *
 * The input is parsed into a throwaway Document via parse(), using
 * $fragmentContext as the context element. The children of the temporary
 * document's root element are then deep-copied into $document and appended
 * to a newly created fragment.
 *
 * @param string $data The markup to parse
 * @param Document|null $document The document used to create the returned fragment; a new Document is created when null
 * @param string|null $encodingOrContentType Forwarded unchanged to parse()
 * @param \DOMElement|null $fragmentContext The context element; when null a "div" element created from $document is used
 * @param string|null $file Forwarded unchanged to parse()
 * @return DocumentFragment The fragment holding copies of the parsed nodes
 */
public static function parseFragment(string $data, ?Document $document = null, ?string $encodingOrContentType = null, ?\DOMElement $fragmentContext = null, ?string $file = null): DocumentFragment {
    // Create the requisite parsing context if none was supplied
    $document = $document ?? new Document;
    $tempDocument = new Document;
    $fragmentContext = $fragmentContext ?? $document->createElement("div");
    // Parse the fragment into the temporary document
    self::parse($data, $tempDocument, $encodingOrContentType, $fragmentContext, $file);
    // Extract the nodes from the temporary document into a fragment
    $fragment = $document->createDocumentFragment();
    $root = $tempDocument->documentElement;
    if ($root !== null) {
        foreach ($root->childNodes as $node) {
            // importNode() only returns a copy associated with $document; the
            // copy must be appended explicitly or the fragment stays empty.
            $fragment->appendChild($document->importNode($node, true));
        }
    }
    return $fragment;
}
// DEVIATION: Since this implementation uses a DOMDocumentFragment for insertion
// there is no need to create an html element for inserting stuff into.
# If the context element is a template element, push "in template" onto the
# stack of template insertion modes so that it is the new current template
# insertion mode.
if ($context instanceof Element && $context->nodeName === 'template') {
static::$templateInsertionModes = new Stack();
static::$templateInsertionModes[] = TreeBuilder::IN_TEMPLATE_MODE;
public static function fetchFile(string $file, ?string $encodingOrContentType = null): ?array {
$f = fopen($file, "r");
if (!$f) {
return null;
# Reset the parser's insertion mode appropriately.
// DEVIATION: The insertion mode will be always 'in body', not 'before head' if
// there isn't a context. There isn't a need to reconstruct a valid HTML
// document when using a DOMDocumentFragment.
# Set the parser's form element pointer to the nearest node to the context element
# that is a form element (going straight up the ancestor chain, and including the
# element itself, if it is a form element), if any. (If there is no such form
# element, the form element pointer keeps its initial value, null.)
static::$instance->formElement = ($name === 'form') ? $context : DOM::getAncestor('form', $context);
# Start the parser and let it run until it has consumed all the characters just
# inserted into the input stream.
static::$instance->fragmentCase = true;
static::parse($data, $file);
# If there is a context element, return the child nodes of root, in tree order.
# Otherwise, return the children of the Document object, in tree order.
// DEVIATION: This method will always return a DOMDocumentFragment.
return static::$instance->DOMFragment;
$data = stream_get_contents($f);
$encoding = Charset::fromCharset((string) $encodingOrContentType) ?? Charset::fromTransport((string) $encodingOrContentType);
if (!$encoding) {
$meta = stream_get_meta_data($f);
if ($meta['wrapper_type'] === "http") {
// Try to find a Content-Type header-field
foreach ($meta['wrapper_data'] as $h) {
$h = explode(":", $h, 2);
if (sizeof($h) === 2) {
if (preg_match("/^\s*Content-Type\s*$/i", $h[0])) {
// Try to get an encoding from it
$encoding = Charset::fromTransport($h[1]);
return [$data, $encoding];
