Now uses mensbeam/html-parser.

• Document::__construct now accepts \DOMDocument|string|null as its source. • Document::serialize has been removed in favor of Document::saveHTML, bringing it more in line with PHP's DOM. • Added Document::loadDOM for loading a vanilla PHP \DOMDocument into this library. As mentioned above, Document::__construct also accepts a \DOMDocument. • Document::importNode now correctly converts an imported template element to an HTMLTemplateElement. • TemplateElement is now HTMLTemplateElement. • DOM walker methods now correctly check that the closure's return value is strictly true. • Unnecessary test cases removed.
3 years ago · 15aa960174
16 changed files with 273 additions and 992 deletions
--- a/composer.json
+++ b/composer.json
@ -4,7 +4,8 @@
    "type": "library",
    "require": {
        "php": ">=7.1",
-        "ext-dom": "*"
+        "ext-dom": "*",
+        "mensbeam/html-parser": "dev-master"
    },
    "scripts": {
        "post-install-cmd": ["@composer bin all install"],
--- a/composer.lock
+++ b/composer.lock
@ -4,7 +4,7 @@
        "Read more about it at https://getcomposer.org/doc/01-basic-usage.md#installing-dependencies",
        "This file is @generated automatically"
    ],
-    "content-hash": "0e733e74b1b163aa4cd80329ff9c71d0",
+    "content-hash": "0301f6145278bf69bbcca49992befa89",
    "packages": [],
    "packages-dev": [
        {
@ -534,6 +534,167 @@
            },
            "time": "2020-12-25T05:00:37+00:00"
        },
+        {
+            "name": "mensbeam/html-parser",
+            "version": "dev-master",
+            "source": {
+                "type": "git",
+                "url": "https://code.mensbeam.com/MensBeam/HTML-Parser",
+                "reference": "8115ac07d55b96d866da996f6329f6287f09bc49"
+            },
+            "require": {
+                "ext-dom": "*",
+                "mensbeam/intl": ">=0.9.0",
+                "mensbeam/mimesniff": "^0.2.0",
+                "php": ">=7.1"
+            },
+            "require-dev": {
+                "bamarni/composer-bin-plugin": "^1.3",
+                "masterminds/html5": "^2.7"
+            },
+            "suggest": {
+                "ext-ctype": "Improved performance"
+            },
+            "default-branch": true,
+            "type": "library",
+            "autoload": {
+                "psr-4": {
+                    "MensBeam\\HTML\\": [
+                        "lib/"
+                    ]
+                },
+                "classmap": [
+                    "lib/Parser/Token.php"
+                ],
+                "files": [
+                    "lib/Parser/ctype.php"
+                ]
+            },
+            "notification-url": "https://packagist.org/downloads/",
+            "license": [
+                "MIT"
+            ],
+            "authors": [
+                {
+                    "name": "Dustin Wilson",
+                    "email": "dustin@dustinwilson.com",
+                    "homepage": "https://dustinwilson.com/"
+                },
+                {
+                    "name": "J. King",
+                    "email": "jking@jkingweb.ca",
+                    "homepage": "https://jkingweb.ca/"
+                }
+            ],
+            "description": "Parses modern HTML text into a PHP DOMDocument",
+            "time": "2021-09-26T20:09:27+00:00"
+        },
+        {
+            "name": "mensbeam/intl",
+            "version": "0.9.0",
+            "source": {
+                "type": "git",
+                "url": "https://github.com/mensbeam/intl.git",
+                "reference": "de037b182ce99aaa90ebc09b0ee0457ddf1d07bc"
+            },
+            "dist": {
+                "type": "zip",
+                "url": "https://api.github.com/repos/mensbeam/intl/zipball/de037b182ce99aaa90ebc09b0ee0457ddf1d07bc",
+                "reference": "de037b182ce99aaa90ebc09b0ee0457ddf1d07bc",
+                "shasum": ""
+            },
+            "require": {
+                "php": ">=7.1"
+            },
+            "require-dev": {
+                "bamarni/composer-bin-plugin": "*",
+                "ext-intl": "*"
+            },
+            "type": "library",
+            "autoload": {
+                "psr-4": {
+                    "MensBeam\\Intl\\": "lib/"
+                }
+            },
+            "notification-url": "https://packagist.org/downloads/",
+            "license": [
+                "MIT"
+            ],
+            "authors": [
+                {
+                    "name": "J. King",
+                    "email": "jking@jkingweb.ca",
+                    "homepage": "https://jkingweb.ca/"
+                }
+            ],
+            "description": "A set of dependency-free basic internationalization tools",
+            "keywords": [
+                "WHATWG",
+                "charset",
+                "encoding",
+                "internationalization",
+                "intl",
+                "unicode",
+                "utf-8",
+                "utf8"
+            ],
+            "support": {
+                "issues": "https://github.com/mensbeam/intl/issues",
+                "source": "https://github.com/mensbeam/intl/tree/0.9.0"
+            },
+            "time": "2021-03-25T19:08:04+00:00"
+        },
+        {
+            "name": "mensbeam/mimesniff",
+            "version": "0.2.1",
+            "source": {
+                "type": "git",
+                "url": "https://github.com/mensbeam/mime.git",
+                "reference": "c19be2496ab1e27fbf9c3483c2a9faa2781796cd"
+            },
+            "dist": {
+                "type": "zip",
+                "url": "https://api.github.com/repos/mensbeam/mime/zipball/c19be2496ab1e27fbf9c3483c2a9faa2781796cd",
+                "reference": "c19be2496ab1e27fbf9c3483c2a9faa2781796cd",
+                "shasum": ""
+            },
+            "require": {
+                "php": ">=7.1",
+                "psr/http-message": "^1.0"
+            },
+            "require-dev": {
+                "bamarni/composer-bin-plugin": "^1.3",
+                "ext-intl": "*"
+            },
+            "type": "library",
+            "autoload": {
+                "psr-4": {
+                    "MensBeam\\Mime\\": "lib/"
+                }
+            },
+            "notification-url": "https://packagist.org/downloads/",
+            "license": [
+                "MIT"
+            ],
+            "authors": [
+                {
+                    "name": "J. King",
+                    "email": "jking@jkingweb.ca",
+                    "homepage": "https://jkingweb.ca/"
+                }
+            ],
+            "description": "An implementation of the WHATWG MIME Sniffing specification",
+            "keywords": [
+                "WHATWG",
+                "mime",
+                "mimesniff"
+            ],
+            "support": {
+                "issues": "https://github.com/mensbeam/mime/issues",
+                "source": "https://github.com/mensbeam/mime/tree/0.2.1"
+            },
+            "time": "2021-03-07T03:58:00+00:00"
+        },
        {
            "name": "psr/container",
            "version": "1.1.1",
@ -2294,7 +2455,9 @@
    ],
    "aliases": [],
    "minimum-stability": "stable",
-    "stability-flags": [],
+    "stability-flags": {
+        "mensbeam/html-parser": 20
+    },
    "prefer-stable": false,
    "prefer-lowest": false,
    "platform": {
--- a/lib/Document.php
+++ b/lib/Document.php
@ -4,11 +4,17 @@
 * See LICENSE and AUTHORS files for details */

 declare(strict_types=1);
-namespace MensBeam\HTML;
+namespace MensBeam\HTML\DOM;
+use MensBeam\HTML\Parser,
+    MensBeam\HTML\Parser\Data;
+

 class Document extends AbstractDocument {
    protected ?Element $_body = null;
    /** Nonstandard */
+    protected ?string $_documentEncoding;
+    protected int $_quirksMode = 0;
+    /** Nonstandard */
    protected ?\DOMXPath $_xpath = null;

    // List of elements that are treated as block elements for the purposes of
@ -94,6 +100,14 @@ class Document extends AbstractDocument {
        $this->_body = $value;
    }

+    public function __get_documentEncoding(): ?string {
+        return $this->_documentEncoding;
+    }
+
+    public function __get_quirksMode(): int {
+        return $this->_quirksMode;
+    }
+
    public function __get_xpath(): \DOMXPath {
        if ($this->_xpath === null) {
            $this->_xpath = new \DOMXPath($this);
@ -102,34 +116,26 @@ class Document extends AbstractDocument {
    }


-    public function __construct($source = null, ?string $encodingOrContentType = null) {
+    public function __construct($source = null, ?string $encoding = null, int $quirksMode = 0) {
        // Because we cannot have union types until php 8... :)
        if ($source !== null && !$source instanceof \DOMDocument && !is_string($source)) {
            throw new DOMException(DOMException::ARGUMENT_TYPE_ERROR, 1, 'source', 'string|\DOMDocument', gettype($source));
-        } elseif ($source instanceof self) {
-            return $source;
        }

        parent::__construct();

-        $this->registerNodeClass('DOMDocument', '\MensBeam\HTML\Document');
-        $this->registerNodeClass('DOMComment', '\MensBeam\HTML\Comment');
-        $this->registerNodeClass('DOMDocumentFragment', '\MensBeam\HTML\DocumentFragment');
-        $this->registerNodeClass('DOMElement', '\MensBeam\HTML\Element');
-        $this->registerNodeClass('DOMProcessingInstruction', '\MensBeam\HTML\ProcessingInstruction');
-        $this->registerNodeClass('DOMText', '\MensBeam\HTML\Text');
+        $this->registerNodeClass('DOMDocument', '\MensBeam\HTML\DOM\Document');
+        $this->registerNodeClass('DOMComment', '\MensBeam\HTML\DOM\Comment');
+        $this->registerNodeClass('DOMDocumentFragment', '\MensBeam\HTML\DOM\DocumentFragment');
+        $this->registerNodeClass('DOMElement', '\MensBeam\HTML\DOM\Element');
+        $this->registerNodeClass('DOMProcessingInstruction', '\MensBeam\HTML\DOM\ProcessingInstruction');
+        $this->registerNodeClass('DOMText', '\MensBeam\HTML\DOM\Text');

        if ($source !== null) {
            if (is_string($source)) {
-                $source = Parser::parse($source, null, $encodingOrContentType);
-            }
-
-            foreach ($source->childNodes as $child) {
-                if (!$child instanceof \DOMDocumentType) {
-                    $this->appendChild($this->importNode($child, true));
-                } else {
-                    $this->appendChild($this->implementation->createDocumentType($child->name ?? ' ', $child->public ?? '', $child->system ?? ''));
-                }
+                $this->loadHTML($source, null, $encoding);
+            } else {
+                $this->loadDOM($source, $encoding, $quirksMode);
            }
        }
    }
@ -178,7 +184,7 @@ class Document extends AbstractDocument {
            if ($qualifiedName !== 'template' || $namespaceURI !== null) {
                $e = parent::createElementNS($namespaceURI, $qualifiedName, $value);
            } else {
-                $e = new TemplateElement($this, $qualifiedName, $value);
+                $e = new HTMLTemplateElement($this, $qualifiedName, $value);
                // Template elements need to have a reference kept in userland
                ElementMap::set($e);
                $e->content = $this->createDocumentFragment();
@ -203,23 +209,38 @@ class Document extends AbstractDocument {
        return false;
    }

-    public function load($filename, $options = null, ?string $encodingOrContentType = null): bool {
-        $data = Parser::fetchFile($filename, $encodingOrContentType);
+    public function importNode(\DOMNode $node, bool $deep = false) {
+        $node = parent::importNode($node, $deep);
+
+        if ($node instanceof \DOMElement) {
+            $node = $this->convertElementToSubClass($node);
+        }
+
+        return $node;
+    }
+
+    public function load($filename, $options = null, ?string $encoding = null): bool {
+        $data = Parser::fetchFile($filename, $encoding);
        if (!$data) {
            return false;
        }
        [$data, $encodingOrContentType] = $data;
-        Parser::parse($data, $this, $encodingOrContentType, null, (string)$filename);
+        $this->loadHTML($data, null, $encoding);
        return true;
    }

-    public function loadHTML($source, $options = null, ?string $encodingOrContentType = null): bool {
-        if (!is_string($source)) {
-            throw new DOMException(DOMException::ARGUMENT_TYPE_ERROR, 1, 'source', 'string', gettype($source));
+    public function loadDOM(\DOMDocument $source, ?string $encoding = null, int $quirksMode = 0) {
+        if (!$source instanceof \DOMDocument) {
+            throw new DOMException(DOMException::ARGUMENT_TYPE_ERROR, 1, 'source', '\DOMDocument', gettype($source));
        }

-        if (is_string($source)) {
-            $source = Parser::parse($source, null, $encodingOrContentType);
+        $this->_documentEncoding = $encoding;
+        $this->_quirksMode = $quirksMode;
+
+        // If there are already existing child nodes then remove them before loading the
+        // DOM.
+        while ($this->hasChildNodes()) {
+            $this->removeChild($this->firstChild);
        }

        foreach ($source->childNodes as $child) {
@ -230,13 +251,32 @@ class Document extends AbstractDocument {
            }
        }

-        assert(is_string($source), new DOMException(DOMException::STRING_EXPECTED, 'source', gettype($source)));
-        Parser::parse($source, $this, $encodingOrContentType);
+        $templates = $this->walk(function($n) {
+            if ($n instanceof Element && $n->namespaceURI === null && $n->nodeName === 'template') {
+                return true;
+            }
+        });
+
+        foreach ($templates as $template) {
+            $template->replaceWith($this->convertElementToSubClass($template));
+        }
+
        return true;
    }

-    public function loadHTMLFile($filename, $options = null, ?string $encodingOrContentType = null): bool {
-        return $this->load($filename, $options, $encodingOrContentType);
+    public function loadHTML($source, $options = null, ?string $encoding = null): bool {
+        if (!is_string($source)) {
+            throw new DOMException(DOMException::ARGUMENT_TYPE_ERROR, 1, 'source', 'string', gettype($source));
+        }
+
+        $source = Parser::parse($source, $encoding, null);
+        $this->loadDOM($source->document, $source->encoding, $source->quirksMode);
+
+        return true;
+    }
+
+    public function loadHTMLFile($filename, $options = null, ?string $encoding = null): bool {
+        return $this->load($filename, $options, $encoding);
    }

    public function loadXML($source, $options = null): bool {
@ -248,18 +288,6 @@ class Document extends AbstractDocument {
    }

    public function saveHTML(\DOMNode $node = null): string {
-        return $node->serialize($node);
-    }
-
-    public function saveHTMLFile($filename): int {
-        return $this->save($filename);
-    }
-
-    public function saveXML(?\DOMNode $node = null, $options = null): bool {
-        return false;
-    }
-
-    public function serialize(\DOMNode $node = null): string {
        $node = $node ?? $this;
        $formatOutput = $this->formatOutput;

@ -297,6 +325,14 @@ class Document extends AbstractDocument {
        return $this->serializeFragment($node, $formatOutput);
    }

+    public function saveHTMLFile($filename): int {
+        return $this->save($filename);
+    }
+
+    public function saveXML(?\DOMNode $node = null, $options = null): bool {
+        return false;
+    }
+
    public function validate(): bool {
        return true;
    }
@ -433,7 +469,7 @@ class Document extends AbstractDocument {

        # 3. If the node is a template element, then let the node instead be the
        # template element’s template contents (a DocumentFragment node).
-        if ($node instanceof TemplateElement) {
+        if ($node instanceof HTMLTemplateElement) {
            $node = $node->content;
        }

@ -749,7 +785,25 @@ class Document extends AbstractDocument {
    }


+    private function convertElementToSubClass(\DOMElement $element): \DOMElement {
+        if ($element->namespaceURI === null && $element->nodeName === 'template') {
+            $template = $this->createElement('template');
+
+            while ($element->attributes->length > 0) {
+                $template->setAttributeNode($element->attributes->item(0));
+            }
+            while ($element->hasChildNodes()) {
+                $template->content->appendChild($element->firstChild);
+            }
+
+            $element = $template;
+        }
+
+        return $element;
+    }
+
+
    public function __toString() {
-        return $this->serialize();
+        return $this->saveHTML();
    }
 }
--- a/lib/HTMLTemplateElement.php
+++ b/lib/HTMLTemplateElement.php
@ -7,7 +7,7 @@ declare(strict_types=1);
 namespace MensBeam\HTML\DOM;

 /** Class specifically for template elements to handle its content property. */
-class TemplateElement extends Element {
+class HTMLTemplateElement extends Element {
    public $content = null;

    public function __construct(Document $ownerDocument, string $qualifiedName, ?string $value = null, string $namespace = '') {
--- a/lib/traits/Moonwalk.php
+++ b/lib/traits/Moonwalk.php
@ -15,7 +15,7 @@ trait Moonwalk {
    private function moonwalkGenerator(\DOMNode $node, ?\Closure $filter = null) {
        do {
            while (true) {
-                if ($filter === null || $filter($node)) {
+                if ($filter === null || $filter($node) === true) {
                    yield $node;
                }

--- a/lib/traits/MoonwalkShallow.php
+++ b/lib/traits/MoonwalkShallow.php
@ -19,7 +19,7 @@ trait MoonwalkShallow {
        $childNodesLength = $node->childNodes->length;
        for ($childNodesLength = $node->childNodes->length, $i = $childNodesLength - 1; $i >= 0; $i--) {
            $child = $node->childNodes[$i];
-            if ($filter === null || $filter($child)) {
+            if ($filter === null || $filter($child) === true) {
                yield $child;
            }
        }
--- a/lib/traits/ToString.php
+++ b/lib/traits/ToString.php
@ -10,6 +10,6 @@ trait ToString {
    public function __toString(): string {
        $frag = $this->ownerDocument->createDocumentFragment();
        $frag->appendChild($this->cloneNode(true));
-        return $this->ownerDocument->serialize($frag);
+        return $this->ownerDocument->saveHTML($frag);
    }
 }
--- a/lib/traits/Walk.php
+++ b/lib/traits/Walk.php
@ -13,7 +13,7 @@ trait Walk {
    }

    private function walkGenerator(\DOMNode $node, ?\Closure $filter = null) {
-        if ($filter === null || $filter($node)) {
+        if ($filter === null || $filter($node) === true) {
            yield $node;
        }

--- a/lib/traits/WalkShallow.php
+++ b/lib/traits/WalkShallow.php
@ -16,7 +16,7 @@ trait WalkShallow {
        $node = (!$this instanceof TemplateElement) ? $this : $this->content;

        foreach ($node->childNodes as $child) {
-            if ($filter === null || $filter($child)) {
+            if ($filter === null || $filter($child) === true) {
                yield $child;
            }
        }
--- a/tests/cases/TestCharset.php
+++ b/tests/cases/TestCharset.php
@ -1,107 +0,0 @@
-<?php
-/** @license MIT
- * Copyright 2017 , Dustin Wilson, J. King et al.
- * See LICENSE and AUTHORS files for details */
-
-declare(strict_types=1);
-namespace MensBeam\HTML\DOM\TestCase;
-
-use MensBeam\HTML\DOM\Charset;
-
-/** 
- * @covers \MensBeam\HTML\DOM\Charset
- */
-class TestCharset extends \PHPUnit\Framework\TestCase {
-    /** @dataProvider provideCharsets */
-    public function testDetermineEncodingFromEncodingLabel(string $in, ?string $exp) {
-        $this->assertSame($exp, Charset::fromCharset($in));
-    }
-
-    public function provideCharsets() {
-        return [
-            ["UTF-8",                   "UTF-8"],
-            ["  utf8  ",                "UTF-8"],
-            ["ISO-8859-1",              "windows-1252"],
-            ["text/html; charset=utf8", null],
-        ];
-    }
-
-    /** @dataProvider provideContentTypes */
-    public function testDetermineEncodingFromContentType(string $input, ?string $exp) {
-        $this->assertSame($exp, Charset::fromTransport($input));
-    }
-
-    public function provideContentTypes() {
-        return [
-            ["UTF-8",                                             null],
-            ["charset=utf8",                                      null],
-            ["text/html",                                         null],
-            ["text/html charset=utf8",                            null],
-            ["text/html; charset=utf8",                           "UTF-8"],
-            ["text/html;charset=utf8",                            "UTF-8"],
-            ["text/html; charset=\"utf8\"",                       "UTF-8"],
-            ["image/svg+xml; param=value; charset=utf8",          "UTF-8"],
-            ["image/svg+xml; charset=utf8; charset=big5",         "UTF-8"],
-            ["image/svg+xml; charset=utf8;charset=big5",          "UTF-8"],
-            ["text/html; charset=not-valid; charset=big5",        null],
-            ["text/html; charset=not-valid",                      null],
-            ["text/html; charsaaet=\"a \\\"fancy\\\" encoding\"", null],
-        ];
-    }
-
-    /** @dataProvider provideBOMs */
-    public function testDetermineEncodingFromByteOrderMark(string $input, ?string $exp) {
-        $this->assertSame($exp, Charset::fromBOM($input));
-    }
-    
-    public function provideBOMs() {
-        return [
-            'UTF-8'                  => ["\u{FEFF}Hello world!", "UTF-8"],
-            'UTF-16 (big-endian)'    => ["\xFE\xFF\0H\0e\0l\0l\0o\0 \0w\0o\0r\0l\0d\0!", "UTF-16BE"],
-            'UTF-16 (little-endian)' => ["\xFF\xFEH\0e\0l\0l\0o\0 \0w\0o\0r\0l\0d\0!\0", "UTF-16LE"],
-            'No byte order mark'     => ["Hello world!", null],
-        ];
-    }
-
-    /** @dataProvider provideStandardEncodingTests */
-    public function testStandardEncoderTests(string $input, string $exp) {
-        $exp = strtolower($exp);
-        $this->assertSame(strtolower($exp), strtolower(Charset::fromBOM($input)?? Charset::fromPrescan($input, \PHP_INT_MAX) ?? "Windows-1252"));
-    }
-
-    public function provideStandardEncodingTests() {
-        $tests = [];
-        $blacklist = [];
-        $files = new \AppendIterator();
-        $files->append(new \GlobIterator(\MensBeam\HTML\DOM\BASE."tests/html5lib-tests/encoding/*.dat", \FilesystemIterator::SKIP_DOTS | \FilesystemIterator::CURRENT_AS_PATHNAME));
-        $files->append(new \GlobIterator(\MensBeam\HTML\DOM\BASE."tests/cases/encoding/*.dat", \FilesystemIterator::SKIP_DOTS | \FilesystemIterator::CURRENT_AS_PATHNAME));
-        foreach ($files as $file) {
-            if (!in_array(basename($file), $blacklist)) {
-                $tests[] = $file;
-            }
-        }
-        return $this->makeEncodingTests(...$tests);
-    }
-
-    protected function makeEncodingTests(string ...$file): iterable {
-        foreach ($file as $path) {
-            $f = basename($path);
-            $test = file($path);
-            $end = sizeof($test);
-            $l = 0;
-            $index = 0;
-            while ($l < $end) {
-                $testId = "$f #".$index++;
-                $data = "";
-                while ($l < $end && !preg_match("/^#data\s+$/", @$test[$l++]));
-                while ($l < $end && !preg_match("/^#encoding\s+$/", ($line = @$test[$l++]))) {
-                    $data .= $line;
-                }
-                if ($l >= $end) {
-                    return;
-                }
-                yield $testId => [trim($data, "\r\n"), trim($test[$l++])];
-            }
-        }
-    }
-}
--- a/tests/cases/TestTokenizer.php
+++ b/tests/cases/TestTokenizer.php
@ -1,235 +0,0 @@
-<?php
-/** @license MIT
- * Copyright 2017 , Dustin Wilson, J. King et al.
- * See LICENSE and AUTHORS files for details */
-
-declare(strict_types=1);
-namespace MensBeam\HTML\DOM\TestCase;
-
-use MensBeam\HTML\DOM\Data;
-use MensBeam\HTML\DOM\EOFToken;
-use MensBeam\HTML\DOM\OpenElementsStack;
-use MensBeam\HTML\DOM\ParseError;
-use MensBeam\HTML\DOM\Tokenizer;
-use MensBeam\HTML\DOM\CharacterToken;
-use MensBeam\HTML\DOM\CommentToken;
-use MensBeam\HTML\DOM\DOCTYPEToken;
-use MensBeam\HTML\DOM\EndTagToken;
-use MensBeam\HTML\DOM\NullCharacterToken;
-use MensBeam\HTML\DOM\StartTagToken;
-use MensBeam\HTML\DOM\TokenAttr;
-use MensBeam\HTML\DOM\WhitespaceToken;
-
-/** 
- * @covers \MensBeam\HTML\DOM\Data
- * @covers \MensBeam\HTML\DOM\Tokenizer
- * @covers \MensBeam\HTML\DOM\CharacterToken
- * @covers \MensBeam\HTML\DOM\CommentToken
- * @covers \MensBeam\HTML\DOM\DataToken
- * @covers \MensBeam\HTML\DOM\TagToken
- * @covers \MensBeam\HTML\DOM\DOCTYPEToken
- * @covers \MensBeam\HTML\DOM\TokenAttr
- */
-class TestTokenizer extends \PHPUnit\Framework\TestCase {
-    const STATE_MAP = [
-        'Data state'          => Tokenizer::DATA_STATE,
-        'PLAINTEXT state'     => Tokenizer::PLAINTEXT_STATE,
-        'RCDATA state'        => Tokenizer::RCDATA_STATE,
-        'RAWTEXT state'       => Tokenizer::RAWTEXT_STATE,
-        'Script data state'   => Tokenizer::SCRIPT_DATA_STATE,
-        'CDATA section state' => Tokenizer::CDATA_SECTION_STATE,
-    ];
-
-    /** @dataProvider provideStandardTokenizerTests */
-    public function testStandardTokenizerTests(string $input, array $expected, int $state, string $open = null, array $expErrors) {
-        // convert parse error constants into standard symbols in specification
-        $errorMap = array_map(function($str) {
-            return strtolower(str_replace("_", "-", $str));
-        }, array_flip(array_filter((new \ReflectionClass(ParseError::class))->getConstants(), function($v) {
-            return is_int($v);
-        })));
-        // create a stub error handler which collects parse errors
-        $errors = [];
-        $errorHandler = $this->createStub(ParseError::class);
-        $errorHandler->method("emit")->willReturnCallback(function($file, $line, $col, $code) use (&$errors, $errorMap) {
-            $errors[] = ['code' => $errorMap[$code], 'line' => $line, 'col' => $col];
-            return true;
-        });
-        // initialize a stack of open elements, possibly with an open element
-        $stack = new OpenElementsStack();
-        if ($open) {
-            $stack[] = (new \DOMDocument)->createElement($open);
-        }
-        // initialize the data stream and tokenizer
-        $data = new Data($input, "STDIN", $errorHandler, "UTF-8");
-        $tokenizer = new Tokenizer($data, $stack, $errorHandler);
-        $tokenizer->state = $state;
-        // perform the test
-        $actual = [];
-        try {
-            foreach ($tokenizer->tokenize() as $t) {
-                assert(
-                    (!$t instanceof CharacterToken)
-                    || ($t instanceof NullCharacterToken && $t->data === "\0")
-                    || ($t instanceof WhitespaceToken && strspn($t->data, Data::WHITESPACE) === strlen($t->data))
-                    || ($t->data !== "\0" && strspn($t->data, Data::WHITESPACE) === 0)
-                , new \Exception("Character token must either consist of a single null character, consist only of whitespace, or start with other than whitespace: ".get_class($t)." ".var_export($t->data ?? "''", true)));
-                $actual[] = $t;
-            }
-        } finally {
-            $actual = $this->combineCharacterTokens($actual);
-            $this->assertEquals($expected, $actual, $tokenizer->debugLog);
-            $this->assertEquals($expErrors, $errors, $tokenizer->debugLog);
-        }
-    }
-
-    public function provideStandardTokenizerTests() {
-        $tests = [];
-        $blacklist = ["xmlViolation.test"];
-        $files = new \AppendIterator();
-        $files->append(new \GlobIterator(\MensBeam\HTML\DOM\BASE."tests/html5lib-tests/tokenizer/*.test", \FilesystemIterator::SKIP_DOTS | \FilesystemIterator::CURRENT_AS_PATHNAME));
-        $files->append(new \GlobIterator(\MensBeam\HTML\DOM\BASE."tests/cases/tokenizer/*.test", \FilesystemIterator::SKIP_DOTS | \FilesystemIterator::CURRENT_AS_PATHNAME));
-        foreach ($files as $file) {
-            if (!in_array(basename($file), $blacklist)) {
-                $tests[] = $file;
-            }
-        }
-        return $this->makeTokenTests(...$tests);
-    }
-
-    protected function reverseDoubleEscape(string $str): string {
-        if (preg_match_all("/\\\\u([0-9a-f]{4})/i", $str, $matches)) {
-            for ($a = 0; $a < sizeof($matches[0]); $a++) {
-                $esc = $matches[0][$a];
-                $chr = \MensBeam\Intl\Encoding\UTF8::encode(hexdec($matches[1][$a]));
-                $str = str_replace($esc, $chr, $str);
-            }
-        }
-        return $str;
-    }
-
-    protected function combineCharacterTokens(array $tokens) : array {
-        $out = [];
-        $pending = null;
-        foreach ($tokens as $t) {
-            if ($t instanceof CharacterToken) {
-                if (!$pending) {
-                    if ($t instanceof WhitespaceToken || $t instanceof NullCharacterToken) {
-                        $t = new CharacterToken($t->data);
-                    }
-                    $pending = $t;
-                } else {
-                    $pending->data .= $t->data;
-                }
-            } else {
-                if ($pending) {
-                    $out[] = $pending;
-                    $pending = null;
-                }
-                $out[] = $t;
-            }
-        }
-        if ($pending) {
-            $out[] = $pending;
-        }
-        return $out;
-    }
-
-    protected function makeTokenTests(string ...$file): iterable {
-        foreach ($file as $path) {
-            $f = basename($path);
-            $testSet = json_decode(file_get_contents($path), true);
-            foreach ($testSet['tests'] ?? $testSet['xmlViolationTests'] as $index => $test) {
-                $testId = "$f #$index";
-                if ($test['doubleEscaped'] ?? false) {
-                    $test['input'] = $this->reverseDoubleEscape($test['input']);
-                    for ($a = 0; $a < sizeof($test['output']); $a++) {
-                        for ($b = 0; $b < sizeof($test['output'][$a]); $b++) {
-                            if (is_string($test['output'][$a][$b])) {
-                                $test['output'][$a][$b] = $this->reverseDoubleEscape($test['output'][$a][$b]);
-                            }
-                        }
-                    }
-                }
-                $test['initialStates'] = $test['initialStates'] ?? ["Data state"];
-                // check if a test needs a patch due to trivial differences in implementation
-                $this->patchTest($test);
-                for ($a = 0; $a < sizeof($test['initialStates']); $a++) {
-                    $tokens = [];
-                    foreach ($test['output'] as $token) {
-                        switch ($token[0]) {
-                            case "DOCTYPE":
-                                $t = new DOCTYPEToken((string) $token[1], (string) $token[2], (string) $token[3]);
-                                $t->forceQuirks = !$token[4];
-                                $tokens[] = $t;
-                                break;
-                            case "StartTag":
-                                $t = new StartTagToken($token[1], $token[3] ?? false);
-                                foreach ($token[2] ?? [] as $name => $value) {
-                                    $t->attributes[] = new TokenAttr((string) $name, $value);
-                                }
-                                $tokens[] = $t;
-                                break;
-                            case "EndTag":
-                                $tokens[] = new EndTagToken($token[1]);
-                                break;
-                            case "Character":
-                                $tokens[] = new CharacterToken($token[1]);
-                                break;
-                            case "Comment":
-                                $tokens[] = new CommentToken($token[1]);
-                                break;
-                            default:
-                                throw new \Exception("Token type '{$token[0]}' not implemented in standard test interpreter");
-                        }
-                        unset($t);
-                    }
-                    $tokens[] = new EOFToken;
-                    yield "$testId: {$test['description']} ({$test['initialStates'][$a]})" => [
-                        $test['input'],                                 // input
-                        $tokens,                                        // output
-                        self::STATE_MAP[$test['initialStates'][$a]],    // initial state
-                        $test['lastStartTag'] ?? null,                  // open element, if any
-                        $test['errors'] ?? [],                          // errors, if any
-                    ];
-                }
-            }
-        }
-    }
-
-    protected function patchTest(&$test): void {
-        $id = [$test['input'], $test['initialStates']];
-        switch ($id) {
-            // test emits input stream error first despite peeking 
-            case ["<!\u{B}", ["Data state"]]:
-                $test['errors'] = array_reverse($test['errors']);
-                break;
-            // eof-in-<whatever> positions in some tests don't make sense
-            // https://github.com/html5lib/html5lib-tests/issues/125
-            case ["", ["CDATA section state"]]:
-                // there is no position 2
-                $test['errors'][0]['col']--;
-                break;
-            case ["\u{A}", ["CDATA section state"]]:
-                // the line break is, for some reason, not counted in the test
-                $test['errors'][0]['line']++;
-                $test['errors'][0]['col'] = 1;
-                break;
-            case ["<!----!\r\n>", ["Data state"]]:
-            case ["<!----!\n>", ["Data state"]]:
-            case ["<!----!\r>", ["Data state"]]:
-                // the line break is, for some reason, not counted in the test
-                $test['errors'][0]['line']++;
-                $test['errors'][0]['col'] = 2;
-                break;
-            case ["<!----! >", ["Data state"]]:
-                $test['errors'][0]['col']++;
-                break;
-            case [hex2bin("f4808080"), ["CDATA section state"]]:
-            case [hex2bin("3bf4808080"), ["CDATA section state"]]:
-                // malpaired surrogates count as two characters
-                $test['errors'][0]['col']++;
-                break;
-        }
-    }
-}
--- a/tests/cases/TestTreeConstructor.php
+++ b/tests/cases/TestTreeConstructor.php
@ -1,455 +0,0 @@
-<?php
-/** @license MIT
- * Copyright 2017 , Dustin Wilson, J. King et al.
- * See LICENSE and AUTHORS files for details */
-
-declare(strict_types=1);
-namespace MensBeam\HTML\DOM\TestCase;
-
-use MensBeam\HTML\DOM\Data;
-use MensBeam\HTML\DOM\LoopException;
-use MensBeam\HTML\DOM\NotImplementedException;
-use MensBeam\HTML\DOM\OpenElementsStack;
-use MensBeam\HTML\DOM\ParseError;
-use MensBeam\HTML\DOM\Parser;
-use MensBeam\HTML\DOM\TemplateInsertionModesStack;
-use MensBeam\HTML\DOM\Tokenizer;
-use MensBeam\HTML\DOM\TreeBuilder;
-
-/** 
- * @covers \MensBeam\HTML\DOM\Document
- * @covers \MensBeam\HTML\DOM\Element
- * @covers \MensBeam\HTML\DOM\Tokenizer
- * @covers \MensBeam\HTML\DOM\TreeBuilder
- * @covers \MensBeam\HTML\DOM\ActiveFormattingElementsList
- * @covers \MensBeam\HTML\DOM\TemplateInsertionModesStack
- * @covers \MensBeam\HTML\DOM\OpenElementsStack
- * @covers \MensBeam\HTML\DOM\Stack
- * @covers \MensBeam\HTML\DOM\TagToken
- */
-class TestTreeConstructor extends \PHPUnit\Framework\TestCase {
-    use \MensBeam\HTML\DOM\EscapeString;
-
-    protected $out;
-    protected $depth;
-
-    /** @dataProvider provideStandardTreeTests */
-    public function testStandardTreeTests(string $data, array $exp, array $errors, $fragment): void {
-        // certain tests need to be patched to ignore unavoidable limitations of PHP's DOM
-        [$exp, $errors, $patched,  $skip] = $this->patchTest($data, $fragment, $errors, $exp);
-        if (strlen($skip)) {
-            $this->markTestSkipped($skip);
-        } elseif ($patched) {
-            $this->markAsRisky();
-        }
-        // convert parse error constants into standard symbols in specification
-        $errorMap = array_map(function($str) {
-            return strtolower(str_replace("_", "-", $str));
-        }, array_flip(array_filter((new \ReflectionClass(ParseError::class))->getConstants(), function($v) {
-            return is_int($v);
-        })));
-        // create a stub error handler which collects parse errors
-        $actualErrors = [];
-        $errorHandler = $this->createStub(ParseError::class);
-        $errorHandler->method("emit")->willReturnCallback(function($file, $line, $col, $code) use (&$actualErrors, $errorMap) {
-            $actualErrors[] = ['code' => $errorMap[$code], 'line' => $line, 'col' => $col];
-            return true;
-        });
-        // initialize the output document
-        $doc = new \DOMDocument;
-        // prepare the fragment context, if any
-        if ($fragment) {
-            $fragment = explode(" ", $fragment);
-            assert(sizeof($fragment) < 3);
-            if (sizeof($fragment) === 1) {
-                $fragmentContext = $doc->createElement($fragment[0]);
-            } else {
-                $ns = array_flip(Parser::NAMESPACE_MAP)[$fragment[0]] ?? null;
-                assert(isset($ns));
-                $fragmentContext = $doc->createElementNS($ns, $fragment[1]);
-            }
-        } else {
-            $fragmentContext = null;
-        }
-        // initialize the other classes we need
-        $decoder = new Data($data, "STDIN", $errorHandler, "UTF-8");
-        $stack = new OpenElementsStack($fragmentContext);
-        $tokenizer = new Tokenizer($decoder, $stack, $errorHandler);
-        $tokenList = $tokenizer->tokenize();
-        $treeBuilder = new TreeBuilder($doc, $decoder, $tokenizer, $tokenList, $errorHandler, $stack, new TemplateInsertionModesStack, $fragmentContext);
-        // run the tree builder
-        try {
-            $treeBuilder->constructTree();
-        } catch (LoopException $e) {
-            $act = $this->balanceTree($this->serializeTree($doc, (bool) $fragmentContext), $exp);
-            $this->assertEquals($exp, $act, $e->getMessage()."\n".$treeBuilder->debugLog);
-            throw $e;
-        } catch (NotImplementedException $e) {
-            $this->markTestSkipped($e->getMessage());
-            return;
-        }
-        $act = $this->balanceTree($this->serializeTree($doc, (bool) $fragmentContext), $exp);
-        $this->assertEquals($exp, $act, $treeBuilder->debugLog);
-        if ($errors !== false) {
-            // If $errors is false, the test does not include errors when there are in fact errors
-            $this->assertCount(sizeof($errors), $actualErrors, var_export($errors, true).var_export($actualErrors, true));
-        }
-    }
-
-    protected function patchTest(string $data, $fragment, array $errors, array $exp): array {
-        $patched = false;
-        $skip = "";
-        // comments outside the root element are silently dropped by the PHP DOM
-        if (!$fragment) {
-            for ($a = 0; $a < sizeof($exp); $a++) {
-                if (strpos($exp[$a], "| <!--") === 0) {
-                    array_splice($exp, $a--, 1);
-                    $patched = true;
-                }
-            }
-        }
-        // some tests don't document errors when they should
-        if (!$errors && in_array($data, [
-            // math.dat
-            '<math><tr><td><mo><tr>',
-            '<math><thead><mo><tbody>',
-            '<math><tfoot><mo><tbody>',
-            '<math><tbody><mo><tfoot>',
-            '<math><tbody><mo></table>',
-            '<math><thead><mo></table>',
-            '<math><tfoot><mo></table>',
-            // namespace-sensitivity.dat
-            '<body><table><tr><td><svg><td><foreignObject><span></td>Foo',
-            // svg.dat
-            '<svg><tr><td><title><tr>',
-            '<svg><thead><title><tbody>',
-            '<svg><tfoot><title><tbody>',
-            '<svg><tbody><title><tfoot>',
-            '<svg><tbody><title></table>',
-            '<svg><thead><title></table>',
-            '<svg><tfoot><title></table>',
-            // template.dat
-            '<template><a><table><a>',
-            // tests6.dat
-            '<!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01//EN"><html></html>',
-            // tests8.dat
-            '<table><li><li></table>',
-            // webkit01.dat
-            '<table><tr><td><svg><desc><td></desc><circle>',
-            // webkit02.dat
-            '<legend>test</legend>',
-            '<table><input>',
-            '<b><em><foo><foo><aside></b>',
-            '<b><em><foo><foo><aside></b></em>',
-            '<b><em><foo><foo><foo><aside></b>',
-            '<b><em><foo><foo><foo><aside></b></em>',
-            '<b><em><foo><foo><foo><foo><foo><foo><foo><foo><foo><foo><aside></b></em>',
-            '<b><em><foo><foob><foob><foob><foob><fooc><fooc><fooc><fooc><food><aside></b></em>',
-            '<option><XH<optgroup></optgroup>',
-            '<svg><foreignObject><div>foo</div><plaintext></foreignObject></svg><div>bar</div>',
-            '<svg><foreignObject></foreignObject><title></svg>foo',
-            '</foreignObject><plaintext><div>foo</div>',
-        ])) {
-            $errors = false;
-        }
-        // other tests do list errors, but they are plainly incorrect in missing some
-        if (in_array($data, [
-            // doctype01.dat"
-            "<!DOCTYPE HTML PUBLIC \"-//W3C//DTD HTML 4.01//EN\"\n   \"http://www.w3.org/TR/html4/strict.dtd\">Hello",
-            '<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01//EN""http://www.w3.org/TR/html4/strict.dtd">',
-            '<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01//EN"\'http://www.w3.org/TR/html4/strict.dtd\'>',
-            '<!DOCTYPE HTML PUBLIC"-//W3C//DTD HTML 4.01//EN"\'http://www.w3.org/TR/html4/strict.dtd\'>',
-            "<!DOCTYPE HTML PUBLIC'-//W3C//DTD HTML 4.01//EN''http://www.w3.org/TR/html4/strict.dtd'>",
-            // entities02.dat
-            "<div>ZZ&prod=23</div>",
-            "<div>ZZ&AElig=</div>",
-            // foreign-fragment.dat
-            "<body><foo>",
-            "<p><foo>",
-            "<p></p><foo>",
-            // ruby.dat
-            "<html><ruby>a<rtc>b<span></ruby></html>",
-            // test1.dat
-            "<!-----><font><div>hello<table>excite!<b>me!<th><i>please!</tr><!--X-->", // this one is pretty hairy with buffered characters
-        ])) {
-            $errors = false;
-        }        
-        if ($errors) {
-            // some "old" errors are made redundant by "new" errors
-            $obsoleteSymbolList = implode("|", [
-                "illegal-codepoint-for-numeric-entity",
-                "eof-in-attribute-value-double-quote",
-                "non-void-element-with-trailing-solidus",
-                "invalid-character-in-attribute-name",
-                "attributes-in-end-tag",
-                "expected-tag-name",
-                "unexpected-character-after-solidus-in-tag",
-                "expected-closing-tag-but-got-char",
-                "eof-in-tag-name",
-                "need-space-after-doctype",
-                "expected-doctype-name-but-got-right-bracket",
-                "expected-dashes-or-doctype",
-                "expected-space-or-right-bracket-in-doctype",
-                "unexpected-char-in-comment",
-                "eof-in-comment-double-dash",
-                "expected-named-entity",
-                "named-entity-without-semicolon",
-                "numeric-entity-without-semicolon",
-                "expected-numeric-entity",
-                "eof-in-attribute-name",
-                "unexpected-eof-in-text-mode",
-                "unexpected-EOF-after-solidus-in-tag",
-                "expected-attribute-name-but-got-eof",
-                "eof-in-script-in-script",
-                "expected-script-data-but-got-eof",
-                "unexpected-EOF-in-text-mode",
-                "expected-tag-name-but-got-question-mark",
-                "incorrect-comment",
-                "self-closing-flag-on-end-tag",
-                "invalid-codepoint",
-                "invalid-codepoint-in-body",
-                "invalid-codepoint-in-foreign-content",
-                "end-table-tag-in-caption",
-                "equals-in-unquoted-attribute-value",
-                "eof-in-numeric-entity",
-                "unexpected-char-in-doctype",
-                "unexpected-end-of-doctype",
-                "unexpected-dash-after-double-dash-in-comment",
-                "unexpected-bang-after-double-dash-in-comment",
-            ]);
-            for ($a = 0, $stop = sizeof($errors); $a < $stop; $a++) {
-                if (preg_match("/^\(\d+,\d+\):? ($obsoleteSymbolList)$/", $errors[$a])) {
-                    // these errors are redundant with "new" errors
-                    unset($errors[$a]);
-                }
-            }
-            $errors = array_values($errors);
-            // some other errors appear to document implementation details
-            //   rather than what the specificatioon dictates, or are
-            //   simple duplicates
-            for ($a = 0, $stop = sizeof($errors); $a < $stop; $a++) {
-                if (
-                    preg_match("/^\(\d+,\d+\): unexpected-end-tag-in-special-element$/", $errors[$a])
-                    || preg_match('/^\d+: Unclosed element “[^”]+”\.$/u', $errors[$a])
-                    || ($data === '<!---x' && $errors[$a] === "(1:7) eof-in-comment")
-                    || ($data === "<!DOCTYPE html><body><table><caption><math><mi>foo</mi><mi>bar</mi>baz</table><p>quux" && $errors[$a] === "(1,78) expected-one-end-tag-but-got-another")
-                    || ($data === "<!DOCTYPE html><!-- XXX - XXX" && $errors[$a] === "(1,29): eof-in-comment")
-                    || ($data === "<!DOCTYPE html><!-- X" && $errors[$a] === "(1,21): eof-in-comment")
-                    || ($data === "<!doctype html><math></html>" && $errors[$a] === "(1,28): expected-one-end-tag-but-got-another")
-                    || ($data === "</" && $errors[$a] === "(1,2): expected-closing-tag-but-got-eof")
-                    || ($data === "<div foo=`bar`>" && $errors[$a] === "(1,14): unexpected-character-in-unquoted-attribute-value")
-                    || (
-                        $errors[$a] === "51: Self-closing syntax (“/>”) used on a non-void HTML element. Ignoring the slash and treating as a start tag."
-                        && (
-                            $data === "<b></b><mglyph/><i></i><malignmark/><u></u><mtext/>X"
-                            || $data === "<b></b><mglyph/><i></i><malignmark/><u></u><mi/>X"
-                            || $data === "<b></b><mglyph/><i></i><malignmark/><u></u><mo/>X"
-                            || $data === "<b></b><mglyph/><i></i><malignmark/><u></u><mn/>X"
-                            || $data === "<b></b><mglyph/><i></i><malignmark/><u></u><ms/>X"
-                        )
-                    )
-                    || ($data === "&ammmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmp;" && $errors[$a] === "(1,950): unknown-named-character-reference")
-                    || ($data === "&ammmp;" && $errors[$a] === "(1,7): unknown-named-character-reference")
-                    || ($data === "FOO<!-- BAR -- <QUX> -- MUX -- >BAZ" && $errors[$a] === "(1,35): eof-in-comment")
-                    || ($data === "FOO<!-- BAR --   >BAZ" && $errors[$a] === "(1,21): eof-in-comment")
-                ) {
-                    // these errors seems to simply be redundant
-                    unset($errors[$a]);
-                }
-            }
-            $errors = array_values($errors);
-            // other errors are spurious, or are for runs of character tokens
-            for ($a = 0, $stop = sizeof($errors); $a < $stop; $a++) {
-                if (preg_match("/^\((\d+),(\d+)\):? (foster-parenting-character(?:-in-table)?|unexpected-character-in-colgroup|unexpected-char-after-frameset|unexpected-char-in-frameset|expected-eof-but-got-char)$/", $errors[$a], $m1) && preg_match("/^\((\d+),(\d+)\):? $m1[3]$/", $errors[$a + 1] ?? "", $m2)) {
-                    // if the next error is also a character error at the next or same character position, this implies a run of characters where we only have one token
-                    // technically we should be reporting each one, so this is properly a FIXME
-                    if ($m1[1] == $m2[1] && ($m1[2] + 1 == $m2[2] || $m1[2] == $m2[2])) {
-                        unset($errors[$a]);
-                        $patched = true;
-                    }
-                } elseif (preg_match("/^foster-parenting text /", $errors[$a]) && preg_match("/^foster-parenting text /", $errors[$a + 1] ?? "")) {
-                    // template tests have a different format of error message
-                    unset($errors[$a]);
-                    $patched = true;
-                } elseif (preg_match("/^\((\d+,\d+)\):? unexpected-end-tag$/", $errors[$a], $m) && preg_match("/^\($m[1]\):? (unexpected-end-tag|end-tag-too-early|expected-one-end-tag-but-got-another|adoption-agency-1.3)$/", $errors[$a + 1] ?? "")) {
-                    // unexpected-end-tag errors should only be reported once for a given tag
-                    unset($errors[$a]);
-                }
-            }
-            $errors = array_values($errors);
-        }
-        return [$exp, $errors, $patched, $skip];
-    }
-
-    protected function balanceTree(array $act, array $exp): array {
-        // makes sure that the actual tree contain the same number of lines as the expected tree
-        // lines are inserted where the two trees diverge, until the end of the actual tree is reached
-        // this usually results in cleaner PHPUnit comparison failure output
-        for ($a = 0; $a < sizeof($act) && sizeof($act) < sizeof($exp); $a++) {
-            if (!isset($act[$a]) || $exp[$a] !== $act[$a]) {
-                array_splice($act, $a, 0, [""]);
-            }
-        }
-        return $act;
-    }
-
-    protected function push(string $data): void {
-        $this->out[] = "| ".str_repeat("  ", $this->depth).$data;
-    }
-
-    protected function serializeTree(\DOMDocument $d, bool $fragment): array {
-        $this->out = [];
-        $this->depth = 0;
-        if ($fragment){
-            foreach ($d->documentElement->childNodes as $n) {
-                $this->serializeNode($n);
-            }
-        } else {
-            if ($d->doctype) {
-                $dt = "<!DOCTYPE ";
-                $dt .= ($d->doctype->name !== " ") ? $d->doctype->name : "";
-                if (strlen($d->doctype->publicId) || strlen($d->doctype->systemId)) {
-                    $dt .= ' "'.$d->doctype->publicId.'"';
-                    $dt .= ' "'.$d->doctype->systemId.'"';
-                }
-                $dt .= ">";
-                $this->push($dt);
-            }
-            if ($d->documentElement) {
-                $this->serializeElement($d->documentElement);
-            }
-        }
-        return $this->out;
-    }
-
-    protected function serializeElement(\DOMElement $e): void {
-        if ($e->namespaceURI) {
-            $prefix = Parser::NAMESPACE_MAP[$e->namespaceURI];
-            assert((bool) $prefix, new \Exception("Prefix for namespace {$e->namespaceURI} is not defined"));
-            $prefix .= " ";
-        } else {
-            $prefix = "";
-        }
-        $localName = $this->uncoerceName($e->localName);
-        $this->push("<".$prefix.$localName.">");
-        $this->depth++;
-        $attr = [];
-        foreach ($e->attributes as $a) {
-            $prefix = "";
-            if ($a->namespaceURI) {
-                $prefix = Parser::NAMESPACE_MAP[$a->namespaceURI];
-                assert((bool) $prefix, new \Exception("Prefix for namespace {$a->namespaceURI} is not defined"));
-                $prefix .= " ";
-            }
-            $attr[$prefix.$this->uncoerceName($a->name)] = $a->value;
-        }
-        ksort($attr, \SORT_STRING);
-        foreach ($attr as $k => $v) {
-            $this->push($k.'="'.$v.'"');
-        }
-        if ($e->localName === "template" && $e->namespaceURI === null) {
-            $this->push("content");
-            $this->depth++;
-            foreach ($e->childNodes as $n) {
-                $this->serializeNode($n);
-            }
-            $this->depth--;
-        } else {
-            foreach ($e->childNodes as $n) {
-                $this->serializeNode($n);
-            }
-        }
-        $this->depth--;
-    }
-
-    protected function serializeNode(\DOMNode $n): void {
-        if ($n instanceof \DOMElement) {
-            $this->serializeElement($n);
-        } elseif ($n instanceof \DOMProcessingInstruction) {
-            $this->push("<?".$n->target." ".$n->data.">");
-        } elseif ($n instanceof \DOMComment) {
-            $this->push("<!-- ".$n->data." -->");
-        } elseif ($n instanceof \DOMCharacterData) {
-            $this->push('"'.$n->data.'"');
-        } else {
-            throw new \Exception("Node type ".get_class($n)." not handled");
-        }
-    }
-
-    public function provideStandardTreeTests(): iterable {
-        $blacklist = [];
-        $files = new \AppendIterator();
-        $files->append(new \GlobIterator(\MensBeam\HTML\DOM\BASE."tests/html5lib-tests/tree-construction/*.dat", \FilesystemIterator::SKIP_DOTS | \FilesystemIterator::CURRENT_AS_PATHNAME));
-        $files->append(new \GlobIterator(\MensBeam\HTML\DOM\BASE."tests/cases/tree-construction/*.dat", \FilesystemIterator::SKIP_DOTS | \FilesystemIterator::CURRENT_AS_PATHNAME));
-        foreach ($files as $file) {
-            $index = 0;
-            $l = 0;
-            if (!in_array(basename($file), $blacklist)) {
-                $lines = array_map(function($v) {
-                    return rtrim($v, "\n");
-                }, file($file));
-                while ($l < sizeof($lines)) {
-                    $pos = $l + 1;
-                    assert($lines[$l] === "#data", new \Exception("Test $file #$index does not start with #data tag at line ".($l + 1)));
-                    // collect the test input
-                    $data = [];
-                    for (++$l; $l < sizeof($lines); $l++) {
-                        if ($lines[$l] === "#errors") {
-                            break;
-                        }
-                        $data[] = $lines[$l];
-                    }
-                    $data = implode("\n", $data);
-                    // collect the test errors
-                    $errors = [];
-                    assert(($lines[$l] ?? "") === "#errors", new \Exception("Test $file #$index does not list errors at line ".($l + 1)));
-                    for (++$l; $l < sizeof($lines); $l++) {
-                        if ($lines[$l] === "#new-errors") {
-                            continue;
-                        } elseif (preg_match('/^#(document(-fragment)?|script-(on|off)|)$/', $lines[$l])) {
-                            break;
-                        }
-                        $errors[] = $lines[$l];
-                    }
-                    // set the script mode, if present
-                    assert(preg_match('/^#(script-(on|off)|document(-fragment)?)$/', $lines[$l]) === 1, new \Exception("Test $file #$index follows errors with something other than script flag, document fragment, or document at line ".($l + 1)));
-                    $script = null;
-                    if ($lines[$l] === "#script-off") {
-                        $script = false;
-                        $l++;
-                    } elseif ($lines[$l] === "#script-on") {
-                        $script = true;
-                        $l++;
-                    }
-                    // collect the document fragment, if present
-                    assert(preg_match('/^#document(-fragment)?$/', $lines[$l]) === 1, new \Exception("Test $file #$index follows script flag with something other than document fragment or document at line ".($l + 1)));
-                    $fragment = null;
-                    if ($lines[$l] === "#document-fragment") {
-                        $fragment = $lines[++$l];
-                        $l++;
-                    }
-                    // collect the output tree
-                    $exp = [];
-                    assert($lines[$l] === "#document", new \Exception("Test $file #$index follows document fragment with something other than document at line ".($l + 1)));
-                    for (++$l; $l < sizeof($lines); $l++) {
-                        if ($lines[$l] === "" && ($lines[$l + 1] ?? "") === "#data") {
-                            break;
-                        } elseif (($lines[$l][0] ?? "") !== "|") {
-                            // apend the data to the previous token
-                            $exp[sizeof($exp) - 1] .= "\n".$lines[$l];
-                            continue;
-                        }
-                        assert(preg_match('/^[^#]/', $lines[$l]) === 1, new \Exception("Test $file #$index contains unrecognized data after document at line ".($l + 1)));
-                        $exp[] = $lines[$l];
-                    }
-                    if (!$script) {
-                        // scripting-dependent tests are skipped entirely since we will not support scripting
-                        yield basename($file)." #$index (line $pos)" => [$data, $exp, $errors, $fragment];
-                    }
-                    $l++;
-                    $index++;
-                }
-            }
-        }
-    }
-}
--- a/tests/cases/encoding/mensbeam01.dat
+++ b/tests/cases/encoding/mensbeam01.dat
@ -1,36 +0,0 @@
-#data
-<!DOCTYPE HTML>
-<meta charset="x-user-defined">
-#encoding
-Windows-1252
-
-#data
-<!DOCTYPE HTML>
-<meta charset="utf-8" charset="windows-1252">
-#encoding
-UTF-8
-
-#data
-<!DOCTYPE HTML>
-<!-- There is a space at the end of the next line. This must not be removed. -->
-<meta charset 
-#encoding
-Windows-1252
-
-#data
-<!DOCTYPE HTML>
-<meta charset=>
-#encoding
-Windows-1252
-
-#data
-<!DOCTYPE HTML>
-<meta http-equiv="Content-Type" content="text/html; charset charset=">
-#encoding
-Windows-1252
-
-#data
-<!DOCTYPE HTML>
-<meta http-equiv="Content-Type" content="text/html; charset charset=utf-8">
-#encoding
-UTF-8
--- a/tests/cases/tokenizer/mensbeam01.test
+++ b/tests/cases/tokenizer/mensbeam01.test
@ -1,27 +0,0 @@
-{
-    "tests": [
-        {
-            "description":"Whitespace character references",
-            "initialStates":["RCDATA state"],
-            "input":"&#x20;",
-            "output":[["Character", " "]]
-        },
-        {
-            "description":"Script end tag with whitespace",
-            "initialStates":["Script data state"],
-            "lastStartTag":"script",
-            "input":"</script >",
-            "output":[["EndTag", "script"]]
-        },
-        {
-            "description":"Self-closing script end tag",
-            "initialStates":["Script data state"],
-            "lastStartTag":"script",
-            "input":"</script/>",
-            "output":[["EndTag", "script"]],
-            "errors":[
-                { "code": "end-tag-with-trailing-solidus", "line": 1, "col": 10 }
-            ]
-        }
-    ]
-}
--- a/tests/cases/tree-construction/mensbeam01.dat
+++ b/tests/cases/tree-construction/mensbeam01.dat
@ -1,68 +0,0 @@
-#data
-<!DOCTYPE html><svg xmlns="http://www.w3.org/2000/svg"/>
-#errors
-#document
-| <!DOCTYPE html>
-| <html>
-|   <head>
-|   <body>
-|     <svg svg>
-|       xmlns xmlns="http://www.w3.org/2000/svg"
-
-#data
-<!DOCTYPE html><svg xmlns="http://www.w3.org/1999/xlink"/>
-#errors
-(1,58): invalid-namespace-attribute-value
-#document
-| <!DOCTYPE html>
-| <html>
-|   <head>
-|   <body>
-|     <svg svg>
-
-#data
-<!DOCTYPE html><svg xmlns:xlink="http://www.w3.org/1999/xlink"/>
-#errors
-#document
-| <!DOCTYPE html>
-| <html>
-|   <head>
-|   <body>
-|     <svg svg>
-|       xmlns xlink="http://www.w3.org/1999/xlink"
-
-#data
-<!DOCTYPE html><svg xlink:href="http://example.com/"/>
-#errors
-#document
-| <!DOCTYPE html>
-| <html>
-|   <head>
-|   <body>
-|     <svg svg>
-|       xlink href="http://example.com/"
-
-#data
-<!DOCTYPE html><svg xmlns:xlink="http://www.w3.org/1999/xhtml" xlink:href="http://example.com/"/>
-#errors
-(1,97): invalid-namespace-attribute-value
-#document
-| <!DOCTYPE html>
-| <html>
-|   <head>
-|   <body>
-|     <svg svg>
-|       xlink href="http://example.com/"
-
-#data
-<!DOCTYPE html><svg xml:id="proper"/><p xml:id="bogus">
-#errors
-#document
-| <!DOCTYPE html>
-| <html>
-|   <head>
-|   <body>
-|     <svg svg>
-|       xml id="proper"
-|     <p>
-|       xml:id="bogus"
--- a/tests/phpunit.dist.xml
+++ b/tests/phpunit.dist.xml
@ -19,15 +19,6 @@
    <testsuite name="DOM">
        <file>cases/TestDOM.php</file>
    </testsuite>
-    <testsuite name="Charset">
-        <file>cases/TestCharset.php</file>
-    </testsuite>
-    <testsuite name="Tokenizer">
-        <file>cases/TestTokenizer.php</file>
-    </testsuite>
-    <testsuite name="Tree">
-        <file>cases/TestTreeConstructor.php</file>
-    </testsuite>
    <testsuite name="Serializer">
        <file>cases/TestSerializer.php</file>
    </testsuite>