From d093e43066b653731afe158a3ed1544c791a0a4b Mon Sep 17 00:00:00 2001 From: Dustin Wilson Date: Thu, 30 Sep 2021 16:47:00 -0500 Subject: [PATCH] Rewrote Document::createElement and Document::createElementNS MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit • Both now more closely follow the spec --- composer.json | 6 ++ composer.lock | 25 ++++-- lib/DOMException.php | 10 +++ lib/Document.php | 133 ++++++++++++++++++++++++++++--- lib/ElementSet.php | 10 ++- lib/HTMLTemplateElement.php | 4 +- tests/cases/TestDocument.php | 89 +++++++++++++++++++++ vendor-bin/phpunit/composer.lock | 2 +- vendor-bin/robo/composer.lock | 2 +- 9 files changed, 254 insertions(+), 27 deletions(-) diff --git a/composer.json b/composer.json index f7076f4..4c28921 100644 --- a/composer.json +++ b/composer.json @@ -38,6 +38,12 @@ "MensBeam\\HTML\\DOM\\TestCase\\": "tests/cases/" } }, + "repositories": [ + { + "type": "git", + "url": "mensbeam-gitea:MensBeam/HTML-Parser.git" + } + ], "require-dev": { "bamarni/composer-bin-plugin": "^1.3", "daux/daux.io": "^0.16.0" diff --git a/composer.lock b/composer.lock index 2ee37f2..aaa0b41 100644 --- a/composer.lock +++ b/composer.lock @@ -4,15 +4,15 @@ "Read more about it at https://getcomposer.org/doc/01-basic-usage.md#installing-dependencies", "This file is @generated automatically" ], - "content-hash": "1077b4324237a83eab2f418e6eb3f2d4", + "content-hash": "fa97f8577cdf49f5f1dc348f28cf141b", "packages": [ { "name": "mensbeam/html-parser", "version": "dev-master", "source": { "type": "git", - "url": "https://code.mensbeam.com/MensBeam/HTML-Parser", - "reference": "49764a456743ac69074b4f64619cf87af254c071" + "url": "mensbeam-gitea:MensBeam/HTML-Parser.git", + "reference": "c6c3f725d09f4162fb58c6943f980af34c7435b2" }, "require": { "ext-dom": "*", @@ -42,7 +42,20 @@ "lib/Parser/ctype.php" ] }, - "notification-url": "https://packagist.org/downloads/", + "autoload-dev": { + "psr-4": { + "MensBeam\\HTML\\Test\\": "tests/lib/", + "MensBeam\\HTML\\TestCase\\": "tests/cases/" + } + }, + "scripts": { + "post-install-cmd": [ + "@composer bin all install" + ], + "post-update-cmd": [ + "@composer bin all update" + ] + }, "license": [ "MIT" ], @@ -59,7 +72,7 @@ } ], "description": "Parses modern HTML text into a PHP DOMDocument", - "time": "2021-09-28T20:15:07+00:00" + "time": "2021-09-29T22:39:46+00:00" }, { "name": "mensbeam/intl", @@ -2466,5 +2479,5 @@ "ext-dom": "*" }, "platform-dev": [], - "plugin-api-version": "2.1.0" + "plugin-api-version": "2.0.0" } diff --git a/lib/DOMException.php b/lib/DOMException.php index b1fc0d5..effd087 100644 --- a/lib/DOMException.php +++ b/lib/DOMException.php @@ -15,7 +15,12 @@ class DOMException extends \Exception { const INVALID_CHARACTER = 5; const NO_MODIFICATION_ALLOWED = 7; const NOT_FOUND = 8; + const NOT_SUPPORTED = 9; const SYNTAX_ERROR = 12; + const INVALID_MODIFICATION_ERROR = 13; + const NAMESPACE_ERROR = 14; + const INVALID_ACCESS_ERROR = 15; + const VALIDATION_ERROR = 16; const OUTER_HTML_FAILED_NOPARENT = 101; @@ -26,7 +31,12 @@ class DOMException extends \Exception { 5 => 'Invalid character', 7 => 'Modification not allowed here', 8 => 'Not found error', + 9 => 'Feature is not supported because %s', 12 => 'Syntax error', + 13 => 'Invalid modification error', + 14 => 'Namespace error', + 15 => 'Invalid access error', + 16 => 'Validation error', 101 => 'Failed to set the "outerHTML" property; the element does not have a parent node' ]; diff --git a/lib/Document.php b/lib/Document.php index 5cc22e7..a86c3e9 100644 --- a/lib/Document.php +++ b/lib/Document.php @@ -177,37 +177,144 @@ class Document extends AbstractDocument { } } - public function createElement($name, $value = "") { - return $this->createElementNS(null, $name, $value); + public function createElement($name, $value = null): Element { + # The createElement(localName, options) method steps are: + // DEVIATION: We cannot follow the createElement parameters per the DOM spec + // because we cannot change the parameters from \DOMDOcument. This is okay + // because $options is currently just for the is attribute for custom elements. + // Since this implementation does not have support for scripting that argument + // would be useless anyway. Equally, the $value parameter from PHP's DOM is + // useless, so it is disabled in this implementation as it doesn't exist in the + // DOM spec. + if ($value !== null) { + throw new DOMException(DOMException::NOT_SUPPORTED, 'the value parameter is not in the official DOM specification; create a text node and append instead'); + } + + # 1. If localName does not match the Name production, then throw an + # "InvalidCharacterError" DOMException. + if (preg_match('/^[:A-Z_a-z\x{C0}-\x{D6}\x{D8}-\x{F6}\x{F8}-\x{2FF}\x{370}-\x{37D}\x{37F}-\x{1FFF}\x{200C}-\x{200D}\x{2070}-\x{218F}\x{2C00}-\x{2FEF}\x{3001}-\x{D7FF}\x{F900}-\x{FDCF}\x{FDF0}-\x{FFFD}\x{10000}-\x{EFFFF}][:A-Z_a-z\x{C0}-\x{D6}\x{D8}-\x{F6}\x{F8}-\x{2FF}\x{370}-\x{37D}\x{37F}-\x{1FFF}\x{200C}-\x{200D}\x{2070}-\x{218F}\x{2C00}-\x{2FEF}\x{3001}-\x{D7FF}\x{F900}-\x{FDCF}\x{FDF0}-\x{FFFD}\x{10000}-\x{EFFFF}-\.0-9\x{B7}\x{0300}-\x{036F}\x{203F}-\x{2040}]*$/u', $name) !== 1) { + throw new DOMException(DOMException::INVALID_CHARACTER); + } + + # 2. If this is an HTML document, then set localName to localName in ASCII + # lowercase. + // This will always be an HTML document + $name = strtolower($name); + + # 3. Let is be null. + # 4. If options is a dictionary and options["is"] exists, then set is to it. + # 5. Let namespace be the HTML namespace, if this is an HTML document or this’s + # content type is "application/xhtml+xml"; otherwise null. + # 6. Return the result of creating an element given this, localName, namespace, + # null, is, and with the synchronous custom elements flag set. + // DEVIATION: There is no scripting in this implementation. + + try { + if ($name !== 'template') { + $e = parent::createElement($name); + } else { + $e = new HTMLTemplateElement($this, $name); + } + + return $e; + } catch (\DOMException $e) { + // The element name is invalid for XML + // Replace any offending characters with "UHHHHHH" where H are the + // uppercase hexadecimal digits of the character's code point + return parent::createElement($this->coerceName($name)); + } } - public function createElementNS($namespaceURI, $qualifiedName, $value = "") { - // Normalize the element name and namespace URI per modern DOM specifications. - if ($namespaceURI !== null) { - $namespaceURI = trim($namespaceURI); - $namespaceURI = ($namespaceURI === Parser::HTML_NAMESPACE) ? null : $namespaceURI; + public function createElementNS($namespaceURI, $qualifiedName, $value = null): Element { + # The internal createElementNS steps, given document, namespace, qualifiedName, + # and options, are as follows: + // DEVIATION: We cannot follow the createElement parameters per the DOM spec + // because we cannot change the parameters from \DOMDOcument. This is okay + // because $options is currently just for the is attribute for custom elements. + // Since this implementation does not have support for scripting that argument + // would be useless anyway. Equally, the $value parameter from PHP's DOM is + // useless, so it is disabled in this implementation as it doesn't exist in the + // DOM spec. + + if ($value !== null) { + throw new DOMException(DOMException::NOT_SUPPORTED, 'the value parameter is not in the official DOM specification; create a text node and append instead'); } - $qualifiedName = ($namespaceURI === null) ? strtolower(trim($qualifiedName)) : trim($qualifiedName); + + # 1. Let namespace, prefix, and localName be the result of passing namespace and + # qualifiedName to validate and extract. + + ## To validate and extract a namespace and qualifiedName, run these steps: + ## 1. If namespace is the empty string, set it to null. + if ($namespaceURI === '') { + $namespaceURI = null; + } + + ## 2. Validate qualifiedName. + ### To validate a qualifiedName, throw an "InvalidCharacterError" DOMException if + ### qualifiedName does not match the QName production. + if (preg_match('/^([A-Z_a-z\x{C0}-\x{D6}\x{D8}-\x{F6}\x{F8}-\x{2FF}\x{370}-\x{37D}\x{37F}-\x{1FFF}\x{200C}-\x{200D}\x{2070}-\x{218F}\x{2C00}-\x{2FEF}\x{3001}-\x{D7FF}\x{F900}-\x{FDCF}\x{FDF0}-\x{FFFD}\x{10000}-\x{EFFFF}][A-Z_a-z\x{C0}-\x{D6}\x{D8}-\x{F6}\x{F8}-\x{2FF}\x{370}-\x{37D}\x{37F}-\x{1FFF}\x{200C}-\x{200D}\x{2070}-\x{218F}\x{2C00}-\x{2FEF}\x{3001}-\x{D7FF}\x{F900}-\x{FDCF}\x{FDF0}-\x{FFFD}\x{10000}-\x{EFFFF}-\.0-9\x{B7}\x{0300}-\x{036F}\x{203F}-\x{2040}]*:)?[A-Z_a-z\x{C0}-\x{D6}\x{D8}-\x{F6}\x{F8}-\x{2FF}\x{370}-\x{37D}\x{37F}-\x{1FFF}\x{200C}-\x{200D}\x{2070}-\x{218F}\x{2C00}-\x{2FEF}\x{3001}-\x{D7FF}\x{F900}-\x{FDCF}\x{FDF0}-\x{FFFD}\x{10000}-\x{EFFFF}][A-Z_a-z\x{C0}-\x{D6}\x{D8}-\x{F6}\x{F8}-\x{2FF}\x{370}-\x{37D}\x{37F}-\x{1FFF}\x{200C}-\x{200D}\x{2070}-\x{218F}\x{2C00}-\x{2FEF}\x{3001}-\x{D7FF}\x{F900}-\x{FDCF}\x{FDF0}-\x{FFFD}\x{10000}-\x{EFFFF}-\.0-9\x{B7}\x{0300}-\x{036F}\x{203F}-\x{2040}]*$/u', $qualifiedName) !== 1) { + throw new DOMException(DOMException::INVALID_CHARACTER); + } + + ## 3. Let prefix be null. + $prefix = null; + + ## 4. Let localName be qualifiedName. + $localName = $qualifiedName; + + ## 5. If qualifiedName contains a ":" (U+003E), then split the string on it and + ## set prefix to the part before and localName to the part after. + if (strpos($qualifiedName, ':') !== false) { + $temp = explode(':', $qualifiedName, 2); + $prefix = $temp[0]; + $prefix = ($prefix !== '') ? $prefix : null; + $localName = $temp[1]; + } + + ## 6. If prefix is non-null and namespace is null, then throw a "NamespaceError" DOMException. + ## 7. If prefix is "xml" and namespace is not the XML namespace, then throw a "NamespaceError" DOMException. + ## 8. If either qualifiedName or prefix is "xmlns" and namespace is not the XMLNS + ## namespace, then throw a "NamespaceError" DOMException. + ## 9. If namespace is the XMLNS namespace and neither qualifiedName nor prefix is + ## "xmlns", then throw a "NamespaceError" DOMException. + if ( + ($prefix !== null && $namespaceURI === null) || + ($prefix === 'xml' && $namespaceURI !== Parser::XML_NAMESPACE) || + (($qualifiedName === 'xmlns' || $prefix === 'xmlns') && $namespaceURI !== Parser::XMLNS_NAMESPACE) || + ($namespaceURI === Parser::XMLNS_NAMESPACE && $qualifiedName !== 'xmlns' && $prefix !== 'xmlns') + ) { + throw new DOMException(DOMException::NAMESPACE_ERROR); + } + + ## 10. Return namespace, prefix, and localName. + // Right-o. + + # 2. Let is be null. + # 3. If options is a dictionary and options["is"] exists, then set is to it. + # 4. Return the result of creating an element given document, localName, namespace, + # prefix, is, and with the synchronous custom elements flag set. + // DEVIATION: There is no scripting in this implementation. try { if ($qualifiedName !== 'template' || $namespaceURI !== null) { - $e = parent::createElementNS($namespaceURI, $qualifiedName, $value); + $e = parent::createElementNS($namespaceURI, $qualifiedName); } else { - $e = new HTMLTemplateElement($this, $qualifiedName, $value); + $e = new HTMLTemplateElement($this, $qualifiedName); } return $e; } catch (\DOMException $e) { // The element name is invalid for XML // Replace any offending characters with "UHHHHHH" where H are the - // uppercase hexadecimal digits of the character's code point + // uppercase hexadecimal digits of the character's code point $this->mangledElements = true; if ($namespaceURI !== null) { - $qualifiedName = implode(":", array_map([$this, "coerceName"], explode(":", $qualifiedName, 2))); + $qualifiedName = implode(':', array_map([ $this, 'coerceName' ], explode(':', $qualifiedName, 2))); } else { $qualifiedName = $this->coerceName($qualifiedName); } - return parent::createElementNS($namespaceURI, $qualifiedName, $value); + + return parent::createElementNS($namespaceURI, $qualifiedName); } } diff --git a/lib/ElementSet.php b/lib/ElementSet.php index 4b3c926..39d556b 100644 --- a/lib/ElementSet.php +++ b/lib/ElementSet.php @@ -8,10 +8,12 @@ declare(strict_types=1); namespace MensBeam\HTML\DOM; -// This is a write-only set of elements which need to be kept in memory; it -// exists because values of properties on derived DOM classes are lost unless at -// least one PHP reference is kept for the element somewhere in userspace. This -// is that somewhere. It is at present only used for template elements. +// This is a set of elements which need to be kept in memory; it exists because +// of the peculiar way PHP works. Derived DOM classes (such as +// HTMLTemplateElement) won't remain as such in the DOM (meaning they will +// revert to being what is registered for elements in Document) unless at least +// one reference is kept for the element somewhere in userspace. This is that +// somewhere. class ElementSet { protected static $_storage = []; diff --git a/lib/HTMLTemplateElement.php b/lib/HTMLTemplateElement.php index 9309a06..8c222be 100644 --- a/lib/HTMLTemplateElement.php +++ b/lib/HTMLTemplateElement.php @@ -12,8 +12,8 @@ namespace MensBeam\HTML\DOM; class HTMLTemplateElement extends Element { public $content = null; - public function __construct(Document $ownerDocument, string $qualifiedName, ?string $value = null, string $namespace = '') { - parent::__construct($qualifiedName, $value, $namespace); + public function __construct(Document $ownerDocument, string $qualifiedName, ?string $namespace = null) { + parent::__construct($qualifiedName, null, $namespace); // Elements that are created by their constructor in PHP aren't owned by any // document and are readonly until owned by one. Temporarily append to a diff --git a/tests/cases/TestDocument.php b/tests/cases/TestDocument.php index 37e8a0b..912ee84 100644 --- a/tests/cases/TestDocument.php +++ b/tests/cases/TestDocument.php @@ -17,6 +17,39 @@ use MensBeam\HTML\Parser; class TestDocument extends \PHPUnit\Framework\TestCase { + public function provideAttributeNodes(): iterable { + return [ + [null, 'test', 'test', ''], + [null, 'test:test', 'testU00003Atest', ''], + [null, 'test', 'test', ''], + [null, 'TEST:TEST', 'TESTU00003ATEST', ''], + ['fake_ns', 'test', 'test', ''], + ['fake_ns', 'test:test', 'test', 'test'], + ['fake_ns', 'TEST:TEST', 'TEST', 'TEST'], + ['fake_ns', 'test:test:test', 'testU00003Atest', 'test'], + ['fake_ns', 'TEST:TEST:TEST', 'TESTU00003ATEST', 'TEST'], + ]; + } + + /** + * @dataProvider provideAttributeNodes + * @covers \MensBeam\HTML\DOM\Document::createAttribute + * @covers \MensBeam\HTML\DOM\Document::createAttributeNS + */ + public function testAttributeNodeCreation(?string $nsIn, string $nameIn, string $local, string $prefix): void { + $d = new Document(); + $d->appendChild($d->createElement('html')); + $a = $d->createAttributeNS($nsIn, $nameIn); + $this->assertSame($local, $a->localName); + $this->assertSame($nsIn, $a->namespaceURI); + $this->assertSame($prefix, $a->prefix); + + $d = new Document(); + $d->appendChild($d->createElement('html')); + $a = $d->createAttribute($nameIn); + $this->assertSame(($prefix !== '') ? "{$prefix}U00003A{$local}" : $local, $a->nodeName); + } + /** * @covers \MensBeam\HTML\DOM\Document::__construct * @covers \MensBeam\HTML\DOM\Document::loadDOM @@ -39,4 +72,60 @@ class TestDocument extends \PHPUnit\Framework\TestCase { $this->assertSame('MensBeam\HTML\DOM\Element', $d->firstChild::class); $this->assertSame('html', $d->firstChild->nodeName); } + + + public function provideElements(): iterable { + return [ + // HTML element + [ '', null, 'div', 'div', Element::class ], + // HTML element and having to normalize local name + [ '', null, ' DIV', 'div', Element::class ], + // HTML element with a null namespace + [ null, null, 'div', 'div', Element::class ], + // HTML element with a null namespace and having to normalize local name + [ null, null, 'DIV ', 'div', Element::class ], + // HTML element with an HTML namespace + [ Parser::HTML_NAMESPACE, null, 'div', 'div', Element::class ], + // HTML element with an HTML namespace and having to normalize local name + [ Parser::HTML_NAMESPACE, null, ' DIV ', 'div', Element::class ], + // HTML element with an HTML namespace and having to normalize namespace URI & + // local name + [ strtoupper(Parser::HTML_NAMESPACE), null, ' DIV ', 'div', Element::class ], + // Template element + [ '', null, 'template', 'template', HTMLTemplateElement::class ], + // Template element and having to normalize local name + [ '', null, ' TEMPLATE', 'template', HTMLTemplateElement::class ], + // Template element with a null namespace + [ null, null, 'template', 'template', HTMLTemplateElement::class ], + // Template element with a null namespace and having to normalize local name + [ null, null, 'TEMPLATE ', 'template', HTMLTemplateElement::class ], + // Template element with an HTML namespace + [ Parser::HTML_NAMESPACE, null, 'template', 'template', HTMLTemplateElement::class ], + // Template element with an HTML namespace and having to normalize local name + [ Parser::HTML_NAMESPACE, null, ' TEMPLATE ', 'template', HTMLTemplateElement::class ], + // Template element with an HTML namespace and having to normalize namespace URI & + // local name + [ strtoupper(Parser::HTML_NAMESPACE), null, ' TEMPLATE ', 'template', HTMLTemplateElement::class ], + // SVG element with SVG namespace + [ Parser::SVG_NAMESPACE, Parser::SVG_NAMESPACE, 'svg', 'svg', Element::class ], + // SVG element with SVG namespace and having to normalize local name + [ Parser::SVG_NAMESPACE, Parser::SVG_NAMESPACE, ' SVG', 'SVG', Element::class ], + // SVG element with SVG namespace and having to normalize local name + [ strtoupper(Parser::SVG_NAMESPACE), Parser::SVG_NAMESPACE, ' SVG ', 'SVG', Element::class ] + ]; + } + + /** + * @dataProvider provideElements + * @covers \MensBeam\HTML\DOM\Document::createElement + * @covers \MensBeam\HTML\DOM\Document::createElementNS + */ + public function testElementCreation(?string $nsIn, ?string $nsOut, string $localIn, string $localOut, string $class): void { + $d = new Document; + $n = ($nsIn !== '') ? $d->createElementNS($nsIn, $localIn) : $d->createElement($localIn); + $this->assertInstanceOf($class, $n); + $this->assertNotNull($n->ownerDocument); + $this->assertSame($nsOut, $n->namespaceURI); + $this->assertSame($localOut, $n->localName); + } } diff --git a/vendor-bin/phpunit/composer.lock b/vendor-bin/phpunit/composer.lock index d095529..3ba079a 100644 --- a/vendor-bin/phpunit/composer.lock +++ b/vendor-bin/phpunit/composer.lock @@ -2107,5 +2107,5 @@ "prefer-lowest": false, "platform": [], "platform-dev": [], - "plugin-api-version": "2.1.0" + "plugin-api-version": "2.0.0" } diff --git a/vendor-bin/robo/composer.lock b/vendor-bin/robo/composer.lock index d3af439..73e278c 100644 --- a/vendor-bin/robo/composer.lock +++ b/vendor-bin/robo/composer.lock @@ -2003,5 +2003,5 @@ "prefer-lowest": false, "platform": [], "platform-dev": [], - "plugin-api-version": "2.1.0" + "plugin-api-version": "2.0.0" }