Browse Source

Re-integrated serializer, more tests

wrapper-classes
Dustin Wilson 3 years ago
parent
commit
5e187634d2
  1. 1
      README.md
  2. 6
      composer.json
  3. 69
      composer.lock
  4. 66
      lib/Document.php
  5. 17
      lib/InnerNode/Document.php
  6. 88
      lib/Node.php
  7. 26
      tests/cases/TestNode.php
  8. 14
      vendor-bin/phpunit/composer.lock
  9. 12
      vendor-bin/robo/composer.lock

1
README.md

@ -67,4 +67,5 @@ The primary aim of this library is accuracy. However, due either to limitations
5. CDATA section nodes, text nodes, and document fragments per the specification can be instantiated by their constructors independent of the `Document::createCDATASectionNode`, `Document::createTextNode`, and `Document::createDocumentFragment` methods respectively. This is not possible currently with this library and probably never will be due to the difficulty of implementing it and the awkwardness of their being different from every other node type in this respect.
6. This implementation will not implement the `NodeIterator` and `TreeWalker` APIs. They are horribly conceived and impractical APIs that few people actually use because it's literally easier to write recursive loops to walk through the DOM than it is to use those APIs. They have instead been replaced with the `ParentNode::walk` generator.
7. All of the `Range` APIs will also not be implemented due to the sheer complexity of creating them in userland and how it adds undue difficulty to node manipulation in the "core" DOM. Numerous operations reference in excruciating detail what to do with Ranges when manipulating nodes and would have to be added here to be compliant or mostly so -- slowing everything else down in the process on an already front-heavy library.
8. The `DOMParser` and `XMLSerializer` APIs will not be implemented because they are ridiculous and limited in their scope. For instance, `DOMParser::parseFromString` won't set a document's character set to anything but UTF-8. This library needs to be able to print to other encodings due to the nature of how it is used. `Document::__construct` will accept optional `$source` and `$charset` arguments, and there are both `Document::loadHTML` and `Document::loadFile` methods for loading DOM from a string or a file respectively.
9. Aside from `HTMLElement`, `HTMLTemplateElement`, `MathMLElement`, and `SVGElement`, none of the specific derived element classes (such as `HTMLAnchorElement` or `SVGSVGElement`) are implemented. The focus of this library will be on the core DOM before moving on to those. They may or may not be implemented in the future.

6
composer.json

@ -5,7 +5,7 @@
"require": {
"php": ">=8.0",
"ext-dom": "*",
"mensbeam/html-parser": ">=1.0",
"mensbeam/html-parser": "dev-master",
"mensbeam/framework": "dev-main",
"symfony/css-selector": "^5.3"
},
@ -53,6 +53,10 @@
{
"type": "git",
"url": "mensbeam-gitea:MensBeam/Framework.git"
},
{
"type": "git",
"url": "mensbeam-gitea:MensBeam/HTML-Parser.git"
}
]
}

69
composer.lock

@ -4,7 +4,7 @@
"Read more about it at https://getcomposer.org/doc/01-basic-usage.md#installing-dependencies",
"This file is @generated automatically"
],
"content-hash": "3897d645c52d2c74808975ebcdddff81",
"content-hash": "64e0c3baf9ecd7960acc83f89ab580bd",
"packages": [
{
"name": "mensbeam/framework",
@ -59,11 +59,11 @@
},
{
"name": "mensbeam/html-parser",
"version": "1.1.1",
"version": "dev-master",
"source": {
"type": "git",
"url": "https://code.mensbeam.com/MensBeam/HTML-Parser",
"reference": "3fc966226b7d45ab51a6c20cb8cafeedcf54ccec"
"url": "mensbeam-gitea:MensBeam/HTML-Parser.git",
"reference": "5698a93c01f38f1236fd83d6913dff41d844ee72"
},
"require": {
"ext-dom": "*",
@ -77,6 +77,7 @@
"suggest": {
"ext-ctype": "Improved performance"
},
"default-branch": true,
"type": "library",
"autoload": {
"psr-4": {
@ -91,7 +92,20 @@
"lib/Parser/ctype.php"
]
},
"notification-url": "https://packagist.org/downloads/",
"autoload-dev": {
"psr-4": {
"MensBeam\\HTML\\Test\\": "tests/lib/",
"MensBeam\\HTML\\TestCase\\": "tests/cases/"
}
},
"scripts": {
"post-install-cmd": [
"@composer bin all install"
],
"post-update-cmd": [
"@composer bin all update"
]
},
"license": [
"MIT"
],
@ -109,14 +123,14 @@
],
"description": "Parser and serializer for modern HTML documents",
"keywords": [
"HTML5",
"WHATWG",
"dom",
"html",
"html5",
"parser",
"parsing"
"parsing",
"whatwg"
],
"time": "2021-10-24T17:24:48+00:00"
"time": "2021-10-31T16:11:06+00:00"
},
{
"name": "mensbeam/intl",
@ -1370,16 +1384,16 @@
},
{
"name": "symfony/console",
"version": "v5.3.7",
"version": "v5.3.10",
"source": {
"type": "git",
"url": "https://github.com/symfony/console.git",
"reference": "8b1008344647462ae6ec57559da166c2bfa5e16a"
"reference": "d4e409d9fbcfbf71af0e5a940abb7b0b4bad0bd3"
},
"dist": {
"type": "zip",
"url": "https://api.github.com/repos/symfony/console/zipball/8b1008344647462ae6ec57559da166c2bfa5e16a",
"reference": "8b1008344647462ae6ec57559da166c2bfa5e16a",
"url": "https://api.github.com/repos/symfony/console/zipball/d4e409d9fbcfbf71af0e5a940abb7b0b4bad0bd3",
"reference": "d4e409d9fbcfbf71af0e5a940abb7b0b4bad0bd3",
"shasum": ""
},
"require": {
@ -1449,7 +1463,7 @@
"terminal"
],
"support": {
"source": "https://github.com/symfony/console/tree/v5.3.7"
"source": "https://github.com/symfony/console/tree/v5.3.10"
},
"funding": [
{
@ -1465,7 +1479,7 @@
"type": "tidelift"
}
],
"time": "2021-08-25T20:02:16+00:00"
"time": "2021-10-26T09:30:15+00:00"
},
{
"name": "symfony/deprecation-contracts",
@ -1536,16 +1550,16 @@
},
{
"name": "symfony/http-foundation",
"version": "v5.3.7",
"version": "v5.3.10",
"source": {
"type": "git",
"url": "https://github.com/symfony/http-foundation.git",
"reference": "e36c8e5502b4f3f0190c675f1c1f1248a64f04e5"
"reference": "9f34f02e8a5fdc7a56bafe011cea1ce97300e54c"
},
"dist": {
"type": "zip",
"url": "https://api.github.com/repos/symfony/http-foundation/zipball/e36c8e5502b4f3f0190c675f1c1f1248a64f04e5",
"reference": "e36c8e5502b4f3f0190c675f1c1f1248a64f04e5",
"url": "https://api.github.com/repos/symfony/http-foundation/zipball/9f34f02e8a5fdc7a56bafe011cea1ce97300e54c",
"reference": "9f34f02e8a5fdc7a56bafe011cea1ce97300e54c",
"shasum": ""
},
"require": {
@ -1589,7 +1603,7 @@
"description": "Defines an object-oriented layer for the HTTP specification",
"homepage": "https://symfony.com",
"support": {
"source": "https://github.com/symfony/http-foundation/tree/v5.3.7"
"source": "https://github.com/symfony/http-foundation/tree/v5.3.10"
},
"funding": [
{
@ -1605,7 +1619,7 @@
"type": "tidelift"
}
],
"time": "2021-08-27T11:20:35+00:00"
"time": "2021-10-11T15:41:55+00:00"
},
{
"name": "symfony/mime",
@ -2486,16 +2500,16 @@
},
{
"name": "symfony/string",
"version": "v5.3.7",
"version": "v5.3.10",
"source": {
"type": "git",
"url": "https://github.com/symfony/string.git",
"reference": "8d224396e28d30f81969f083a58763b8b9ceb0a5"
"reference": "d70c35bb20bbca71fc4ab7921e3c6bda1a82a60c"
},
"dist": {
"type": "zip",
"url": "https://api.github.com/repos/symfony/string/zipball/8d224396e28d30f81969f083a58763b8b9ceb0a5",
"reference": "8d224396e28d30f81969f083a58763b8b9ceb0a5",
"url": "https://api.github.com/repos/symfony/string/zipball/d70c35bb20bbca71fc4ab7921e3c6bda1a82a60c",
"reference": "d70c35bb20bbca71fc4ab7921e3c6bda1a82a60c",
"shasum": ""
},
"require": {
@ -2549,7 +2563,7 @@
"utf8"
],
"support": {
"source": "https://github.com/symfony/string/tree/v5.3.7"
"source": "https://github.com/symfony/string/tree/v5.3.10"
},
"funding": [
{
@ -2565,7 +2579,7 @@
"type": "tidelift"
}
],
"time": "2021-08-26T08:00:08+00:00"
"time": "2021-10-27T18:21:46+00:00"
},
{
"name": "symfony/yaml",
@ -2718,6 +2732,7 @@
"aliases": [],
"minimum-stability": "stable",
"stability-flags": {
"mensbeam/html-parser": 20,
"mensbeam/framework": 20
},
"prefer-stable": false,

66
lib/Document.php

@ -14,7 +14,9 @@ use MensBeam\HTML\DOM\InnerNode\{
use MensBeam\HTML\Parser;
use MensBeam\HTML\Parser\{
Charset,
Data
Data,
Config as ParserConfig,
Serializer
};
@ -88,7 +90,7 @@ class Document extends Node {
$this->_implementation = new DOMImplementation($this);
if ($source !== null) {
$this->loadHTML($source, $charset ?? 'windows-1252');
$this->loadHTML($source, $charset);
} elseif ($charset !== 'UTF-8') {
$this->_characterSet = Charset::fromCharset((string)$charset) ?? 'UTF-8';
}
@ -256,12 +258,50 @@ class Document extends Node {
return $this->cloneWrapperNode($node, $this, $deep);
}
/**
 * Loads HTML from a file or stream wrapper URL into this document.
 *
 * The stream is read in full, its handle is released, and the data is then
 * handed to loadHTML(). Nothing is loaded when the stream cannot be opened or
 * read.
 *
 * @param string $filename Path or URL to open with fopen()
 * @param ?string $charset Optional encoding hint; it is resolved through
 *                         Charset::fromCharset() and then
 *                         Charset::fromTransport()
 */
public function loadFile(string $filename, ?string $charset = null): void {
    $f = fopen($filename, 'r');
    if (!$f) {
        return;
    }

    // Grab everything that is needed from the stream up front so the handle can
    // be released before parsing instead of staying open for the whole load.
    $data = stream_get_contents($f);
    $meta = stream_get_meta_data($f);
    fclose($f);

    // A failed read is treated like a failed open: nothing is loaded.
    if ($data === false) {
        return;
    }

    $charset = Charset::fromCharset((string)$charset) ?? Charset::fromTransport((string)$charset);
    $wrapperType = $meta['wrapper_type'];
    if (!$charset && $wrapperType === 'http') {
        // Try to find a Content-Type header field
        foreach ($meta['wrapper_data'] as $h) {
            $h = explode(':', $h, 2);
            if (count($h) === 2 && preg_match("/^\s*Content-Type\s*$/i", $h[0])) {
                // Try to get an encoding from it
                $charset = Charset::fromTransport($h[1]);
                break;
            }
        }
    }

    // Local files are recorded as file:// URLs built from their resolved path;
    // anything else keeps the name it was opened with.
    if ($wrapperType === 'plainfile') {
        $filename = realpath($filename);
        $this->_URL = "file://$filename";
    } else {
        $this->_URL = $filename;
    }

    $this->loadHTML($data, $charset);
}
public function loadHTML(string $source = null, ?string $charset = null): void {
if ($this->hasChildNodes()) {
throw new DOMException(DOMException::NO_MODIFICATION_ALLOWED);
}
$source = Parser::parse($source, $charset ?? 'windows-1252');
$config = null;
if ($charset !== null) {
$config = new ParserConfig();
$config->fallbackEncoding = Charset::fromCharset($charset);
}
$source = Parser::parse($source, null, $config);
$this->_characterSet = $source->encoding;
$this->_compatMode = ($source->quirksMode === Parser::NO_QUIRKS_MODE || $source->$quirksMode === Parser::LIMITED_QUIRKS_MODE) ? 'CSS1Compat' : 'BackCompat';
@ -272,6 +312,21 @@ class Document extends Node {
}
}
/**
 * Serializes this document, or one of the nodes it owns, to a string.
 *
 * @param Comment|Document|DocumentFragment|DocumentType|Element|ProcessingInstruction|Text|null $node
 *        The node to serialize; null or this document serializes the whole
 *        document
 * @throws DOMException WRONG_DOCUMENT when $node is not owned by this document
 */
public function saveHTML(Comment|Document|DocumentFragment|DocumentType|Element|ProcessingInstruction|Text|null $node = null): string {
    // No node, or this document itself: serialize the document's own inner node.
    if ($node === null || $node === $this) {
        return Serializer::serialize($this->innerNode);
    }

    // Only nodes owned by this document may be serialized through it.
    if ($node->ownerDocument !== $this) {
        throw new DOMException(DOMException::WRONG_DOCUMENT);
    }

    return Serializer::serialize($this->getInnerNode($node));
}
protected function __createAttribute(?string $namespace, string $qualifiedName): Attr {
// Before we do the next step we need to work around a PHP DOM bug. PHP DOM
@ -305,4 +360,9 @@ class Document extends Node {
return $this->innerNode->getWrapperNode($attr);
}
/** Casting the document to a string yields the result of saveHTML() with no arguments. */
public function __toString() {
    $html = $this->saveHTML();
    return $html;
}
}

17
lib/InnerNode/Document.php

@ -10,6 +10,7 @@ namespace MensBeam\HTML\DOM\InnerNode;
use MensBeam\Framework\MagicProperties;
use MensBeam\HTML\DOM\{
Document as WrapperDocument,
DOMException,
Node as WrapperNode,
XMLDocument as WrapperXMLDocument
};
@ -48,6 +49,22 @@ class Document extends \DOMDocument {
}
/**
 * Returns the inner PHP DOM node that backs a wrapper node.
 *
 * @param ?WrapperNode $node The wrapper node to look up
 * @return ?\DOMNode Null when $node is null, this inner document when $node is
 *                   its wrapper document, otherwise whatever the node map
 *                   holds for $node
 * @throws DOMException WRONG_DOCUMENT when $node is a different wrapper document
 */
public function getInnerNode(?WrapperNode $node = null): ?\DOMNode {
    if ($node === null) {
        return null;
    }

    // $node is a wrapper node, so it can never be this inner document itself;
    // compare it against this document's wrapper instead.
    if ($node === $this->wrapperNode) {
        return $this;
    }

    // Any other wrapper document belongs to a different inner document.
    if ($node instanceof WrapperDocument) {
        throw new DOMException(DOMException::WRONG_DOCUMENT);
    }

    return $this->nodeMap->get($node);
}
public function getWrapperNode(?\DOMNode $node = null): ?WrapperNode {
if ($node === null) {
return null;

88
lib/Node.php

@ -8,6 +8,7 @@
declare(strict_types=1);
namespace MensBeam\HTML\DOM;
use MensBeam\Framework\MagicProperties,
MensBeam\HTML\Parser,
MensBeam\HTML\Parser\NameCoercion;
use MensBeam\HTML\DOM\InnerNode\{
Document as InnerDocument,
@ -455,7 +456,7 @@ abstract class Node {
# pre-inserting node into this before child.
// Aside from pre-insertion validity PHP's DOM does this correctly already.
$this->preInsertionValidity($node, $child);
$this->innerNode->insertBefore($this->getInnerNode($node));
$this->innerNode->insertBefore($this->getInnerNode($node), $this->getInnerNode($child));
return $node;
}
@ -472,7 +473,7 @@ abstract class Node {
# 2. Let defaultNamespace be the result of running locate a namespace for this
# using null.
# 3. Return true if defaultNamespace is the same as namespace; otherwise false.
return ($this->locateNamespace($this, null) === $namespace);
return ($this->locateNamespace($this->innerNode, null) === $namespace);
}
public function isEqualNode(?Node $otherNode) {
@ -752,12 +753,28 @@ abstract class Node {
# contents, with document set to copy's template contents's node document, and
# with the clone children flag set.
# 3. Append copied contents to copy's template contents.
// Create a wrapper node for the cloned template regardless and then append a
// clone of its DocumentFragment to its content. Template contents are stored in
// the wrapper nodes.
$copyWrapper = $copy->ownerDocument->getWrapperNode($copy);
$nodeWrapper = $node->ownerDocument->getWrapperNode($node);
$copyWrapper->content->appendChild($this->cloneWrapperNode($nodeWrapper, $document->wrapperNode, true));
// Template contents are stored in the wrapper nodes.
$copyWrapperContent = $copy->ownerDocument->getWrapperNode($copy)->content;
// Need to check to see if what is being cloned is a MensBeam inner node or not.
// Most of the time this will be the case, but if a document is being parsed
// that has template elements it won't be; instead the template element's
// children need to be appended to the inner content DOMDocumentFragment.
if ($node->ownerDocument instanceof InnerDocument) {
$nodeWrapperContent = $node->ownerDocument->getWrapperNode($node)->content;
if ($nodeWrapperContent->hasChildNodes()) {
$copyWrapperContent->appendChild($this->cloneWrapperNode($nodeWrapperContent, $document->wrapperNode, true));
}
} else {
$copyContent = $this->getInnerNode($copyWrapperContent);
$childNodes = $node->childNodes;
foreach ($childNodes as $child) {
$copyContent->appendChild($this->cloneInnerNode($child, $document, true));
}
// Step #6 isn't necessary now; just return the copy.
return $copy;
}
}
# 6. If the clone children flag is set, clone all the children of node and append
@ -917,6 +934,14 @@ abstract class Node {
return $this->innerNode;
}
// If the node isn't a Document node and its document is the same as $this'
// document then get the inner node from the inner document's node map cache.
$doc = ($this instanceof Document) ? $this->innerNode : $this->innerNode->ownerDocument;
if (!$node instanceof Document && $node->ownerDocument === $doc) {
return $doc->getInnerNode($node);
}
// Otherwise, use reflection to get the innerNode protected property.
return Reflection::getProtectedProperty($node, 'innerNode');
}
@ -988,16 +1013,29 @@ abstract class Node {
return true;
}
protected function locateNamespace(Node $node, ?string $prefix = null): ?string {
protected function locateNamespace(\DOMNode $node, ?string $prefix = null): ?string {
# To locate a namespace for a node using prefix, switch on the interface node
# implements:
#
# ↪ Element
if ($node instanceof Element) {
if ($node instanceof \DOMElement) {
// Work around PHP DOM HTML namespace bug
if ($node->namespaceURI === null && !$node->ownerDocument->getWrapperNode($node->ownerDocument) instanceof XMLDocument) {
$namespace = Parser::HTML_NAMESPACE;
} else {
$namespace = $node->namespaceURI;
}
// Work around another PHP DOM bug where \DOMNode::prefix returns an empty string if empty instead of null
$nodePrefix = $node->prefix;
if ($nodePrefix === '') {
$nodePrefix = null;
}
# 1. If its namespace is non-null and its namespace prefix is prefix, then return
# namespace.
if ($node->namespaceURI !== null && $node->prefix === $prefix) {
return $node->namespaceURI;
if ($namespace !== null && $nodePrefix === $prefix) {
return $namespace;
}
# 2. If it has an attribute whose namespace is the XMLNS namespace, namespace prefix
@ -1005,7 +1043,7 @@ abstract class Node {
# attribute whose namespace is the XMLNS namespace, namespace prefix is null, and
# local name is "xmlns", then return its value if it is not the empty string, and
# null otherwise.
$attributes = $this->getInnerNode($node)->attributes;
$attributes = $node->attributes;
// Have to check for null because PHP DOM violates the spec and returns null when empty
if ($attributes !== null) {
foreach ($attributes as $attr) {
@ -1015,10 +1053,9 @@ abstract class Node {
}
}
$parentElement = $node->parentElement;
# 3. If its parent element is null, then return null.
if ($parentElement === null) {
$parentElement = $node->parentNode;
if (!$parentElement instanceof \DOMElement) {
return null;
}
@ -1028,7 +1065,7 @@ abstract class Node {
}
# ↪ Document
elseif ($node instanceof Document) {
elseif ($node instanceof InnerDocument) {
# 1. If its document element is null, then return null.
if ($node->documentElement === null) {
return null;
@ -1041,13 +1078,13 @@ abstract class Node {
# ↪ DocumentType
# ↪ DocumentFragment
elseif ($node instanceof DocumentType || $node instanceof DocumentFragment) {
elseif ($node instanceof \DOMDocumentType || $node instanceof \DOMDocumentFragment) {
# Return null.
return null;
}
# ↪ Attr
elseif ($node instanceof Attr) {
elseif ($node instanceof \DOMAttr) {
# 1. If its element is null, then return null.
if ($node->ownerElement === null) {
return null;
@ -1060,9 +1097,11 @@ abstract class Node {
# ↪ Otherwise
# 1. If its parent element is null, then return null.
$parentElement = $node->parentElement;
if ($parentElement === null) {
return null;
else {
$parentElement = $node->parentNode;
if (!$parentElement instanceof \DOMElement) {
return null;
}
}
# 2. Return the result of running locate a namespace on its parent element using
@ -1252,4 +1291,9 @@ abstract class Node {
}
}
}
/** Casting a node to a string serializes it through its owner document's saveHTML(). */
public function __toString() {
    $owner = $this->ownerDocument;
    return $owner->saveHTML($this);
}
}

26
tests/cases/TestNode.php

@ -264,6 +264,32 @@ class TestNode extends \PHPUnit\Framework\TestCase {
}
/** @covers \MensBeam\HTML\DOM\Node::insertBefore */
public function testMethod_insertBefore(): void {
    // Build a minimal html > body > div tree to insert into.
    $d = new Document();
    $d->appendChild($d->createElement('html'));
    $d->documentElement->appendChild($d->createElement('body'));
    $div = $d->body->appendChild($d->createElement('div'));

    // Inserting before an existing child places the new node ahead of it.
    $ook = $d->body->insertBefore($d->createTextNode('ook'), $div);
    $this->assertSame('<body>ook<div></div></body>', (string)$d->body);

    // The node handed to insertBefore is also what it returns.
    $template = $d->createElement('template');
    $t = $d->body->insertBefore($template, $ook);
    $this->assertSame($template, $t);
    $this->assertSame('<body><template></template>ook<div></div></body>', (string)$d->body);
}
/** @covers \MensBeam\HTML\DOM\Node::isDefaultNamespace */
public function testMethod_isDefaultNamespace(): void {
    // Minimal html > body document created through the DOM API.
    $doc = new Document();
    $doc->appendChild($doc->createElement('html'));
    $doc->documentElement->appendChild($doc->createElement('body'));

    // The body reports the HTML namespace as its default, and not the empty string.
    $isHtmlDefault = $doc->body->isDefaultNamespace(Parser::HTML_NAMESPACE);
    $this->assertTrue($isHtmlDefault);
    $isEmptyDefault = $doc->body->isDefaultNamespace('');
    $this->assertFalse($isEmptyDefault);
}
/**
* @covers \MensBeam\HTML\DOM\Node::__get_childNodes
*

14
vendor-bin/phpunit/composer.lock

@ -529,23 +529,23 @@
},
{
"name": "phpunit/php-code-coverage",
"version": "9.2.7",
"version": "9.2.8",
"source": {
"type": "git",
"url": "https://github.com/sebastianbergmann/php-code-coverage.git",
"reference": "d4c798ed8d51506800b441f7a13ecb0f76f12218"
"reference": "cf04e88a2e3c56fc1a65488afd493325b4c1bc3e"
},
"dist": {
"type": "zip",
"url": "https://api.github.com/repos/sebastianbergmann/php-code-coverage/zipball/d4c798ed8d51506800b441f7a13ecb0f76f12218",
"reference": "d4c798ed8d51506800b441f7a13ecb0f76f12218",
"url": "https://api.github.com/repos/sebastianbergmann/php-code-coverage/zipball/cf04e88a2e3c56fc1a65488afd493325b4c1bc3e",
"reference": "cf04e88a2e3c56fc1a65488afd493325b4c1bc3e",
"shasum": ""
},
"require": {
"ext-dom": "*",
"ext-libxml": "*",
"ext-xmlwriter": "*",
"nikic/php-parser": "^4.12.0",
"nikic/php-parser": "^4.13.0",
"php": ">=7.3",
"phpunit/php-file-iterator": "^3.0.3",
"phpunit/php-text-template": "^2.0.2",
@ -594,7 +594,7 @@
],
"support": {
"issues": "https://github.com/sebastianbergmann/php-code-coverage/issues",
"source": "https://github.com/sebastianbergmann/php-code-coverage/tree/9.2.7"
"source": "https://github.com/sebastianbergmann/php-code-coverage/tree/9.2.8"
},
"funding": [
{
@ -602,7 +602,7 @@
"type": "github"
}
],
"time": "2021-09-17T05:39:03+00:00"
"time": "2021-10-30T08:01:38+00:00"
},
{
"name": "phpunit/php-file-iterator",

12
vendor-bin/robo/composer.lock

@ -1838,16 +1838,16 @@
},
{
"name": "symfony/string",
"version": "v5.3.7",
"version": "v5.3.10",
"source": {
"type": "git",
"url": "https://github.com/symfony/string.git",
"reference": "8d224396e28d30f81969f083a58763b8b9ceb0a5"
"reference": "d70c35bb20bbca71fc4ab7921e3c6bda1a82a60c"
},
"dist": {
"type": "zip",
"url": "https://api.github.com/repos/symfony/string/zipball/8d224396e28d30f81969f083a58763b8b9ceb0a5",
"reference": "8d224396e28d30f81969f083a58763b8b9ceb0a5",
"url": "https://api.github.com/repos/symfony/string/zipball/d70c35bb20bbca71fc4ab7921e3c6bda1a82a60c",
"reference": "d70c35bb20bbca71fc4ab7921e3c6bda1a82a60c",
"shasum": ""
},
"require": {
@ -1901,7 +1901,7 @@
"utf8"
],
"support": {
"source": "https://github.com/symfony/string/tree/v5.3.7"
"source": "https://github.com/symfony/string/tree/v5.3.10"
},
"funding": [
{
@ -1917,7 +1917,7 @@
"type": "tidelift"
}
],
"time": "2021-08-26T08:00:08+00:00"
"time": "2021-10-27T18:21:46+00:00"
},
{
"name": "symfony/yaml",

Loading…
Cancel
Save