From 991b5ae5401deae2233f95ecee24fe50e32e9c85 Mon Sep 17 00:00:00 2001 From: Dustin Wilson Date: Sat, 6 Nov 2021 15:57:41 -0500 Subject: [PATCH] Added Parser::parseInto --- README.md | 27 +++++++++++++++----- RoboFile.php | 10 ++++---- lib/Parser.php | 50 ++++++++++++++++++++++++++------------ lib/Parser/Exception.php | 2 ++ lib/Parser/Serializer.php | 2 +- tests/cases/TestParser.php | 30 +++++++++++++++++++---- 6 files changed, 88 insertions(+), 33 deletions(-) diff --git a/README.md b/README.md index 0e3e3ea..3daf8ea 100644 --- a/README.md +++ b/README.md @@ -8,36 +8,51 @@ A modern, accurate HTML parser and serializer for PHP. ```php public static MensBeam\HTML\Parser::parse( - string $data, - ?string $encodingOrContentType = null. + string $data, + ?string $encodingOrContentType = null, ?MensBeam\HTML\Parser\Config $config = null ): MensBeam\HTML\Parser\Output ``` The `MensBeam\HTML\Parser::parse` static method is used to parse documents. An arbitrary string (and optional encoding) are taken as input, and a `MensBeam\HTML\Parser\Output` object is returned as output. The `Output` object has the following properties: -- `document`: A `DOMDocument` object representing the parsed document +- `document`: A `DOMDocument` object representing the parsed document - `encoding`: The original character encoding of the document, as supplied by the user or otherwise detected during parsing - `quirksMode`: The detected "quirks mode" property of the document. This will be one of `Parser::NO_QURIKS_MODE` (`0`), `Parser::QUIRKS_MODE` (`1`), or `Parser::LIMITED_QUIRKS_MODE` (`2`) - `errors`: An array containing the list of parse errors emitted during processing if parse error reporting was turned on (see **Configuration** below), or `null` otherwise Extra configuration parameters may be given to the parser by passing a `MensBeam\HTML\Parser\Config` object as the final `$config` argument. See the **Configuration** section below for more details. 
+### Parsing into existing documents + +```php +public static MensBeam\HTML\Parser::parseInto( + string $data, + \DOMDocument $document, + ?string $encodingOrContentType = null, + ?MensBeam\HTML\Parser\Config $config = null +): MensBeam\HTML\Parser\Output +``` + +The `MensBeam\HTML\Parser::parseInto` static method is used to parse into an existing document. The supplied document must be an instance of (or derived from) `\DOMDocument` and also must be empty. All other arguments are identical to those used when parsing documents normally. + +*NOTE:* The `documentClass` configuration option has no effect when using this method. + ### Parsing fragments ```php public static MensBeam\HTML\Parser::parse( DOMElement $contextElement, int $quirksMode, - string $data, - ?string $encodingOrContentType = null. + string $data, + ?string $encodingOrContentType = null, ?MensBeam\HTML\Parser\Config $config = null ): DOMDocumentFragment ``` The `MensBeam\HTML\Parser::parseFragment` static method is used to parse document fragments. The primary use case for this method is in the implementation of the `innerHTML` setter of HTML elements. Consequently a context element is required, as well as the "quirks mode" property of the context element's document (which must be one of `Parser::NO_QURIKS_MODE` (`0`), `Parser::QUIRKS_MODE` (`1`), or `Parser::LIMITED_QUIRKS_MODE` (`2`)). The further arguments are identical to those used when parsing documents. -If the "quirks mode" property of the document is not know, using `Parser::NO_QUIRKS_MODE` (`0`) is usually the best choice. +If the "quirks mode" property of the document is not known, using `Parser::NO_QUIRKS_MODE` (`0`) is usually the best choice. Unlike the `parse()` method, the `parseFragment()` method returns a `DOMDocumentFragment` object belonging to `$contextElement`'s owner document. 
diff --git a/RoboFile.php b/RoboFile.php index 19fd031..229fe78 100644 --- a/RoboFile.php +++ b/RoboFile.php @@ -99,17 +99,17 @@ class RoboFile extends \Robo\Tasks { protected function findCoverageEngine(): string { $dir = rtrim(ini_get("extension_dir"), "/").\DIRECTORY_SEPARATOR; - $ext = IS_WIN ? "dll" : (IS_MAC ? "dylib" : "so"); + $ext = IS_WIN ? "dll" : "so"; $php = escapeshellarg(\PHP_BINARY); $code = escapeshellarg(BASE."lib"); if (extension_loaded("pcov")) { - return "$php -d pcov.enabled=1 -d pcov.directory=$code"; + return "$php -d opcache.enable_cli=0 -d pcov.enabled=1 -d pcov.directory=$code"; } elseif (extension_loaded("xdebug")) { - return "$php -d xdebug.mode=coverage"; + return "$php -d opcache.enable_cli=0 -d xdebug.mode=coverage"; } elseif (file_exists($dir."pcov.$ext")) { - return "$php -d extension=pcov.$ext -d pcov.enabled=1 -d pcov.directory=$code"; + return "$php -d opcache.enable_cli=0 -d extension=pcov.$ext -d pcov.enabled=1 -d pcov.directory=$code"; } elseif (file_exists($dir."xdebug.$ext")) { - return "$php -d zend_extension=xdebug.$ext -d xdebug.mode=coverage"; + return "$php -d opcache.enable_cli=0 -d zend_extension=xdebug.$ext -d xdebug.mode=coverage"; } else { if (IS_WIN) { $dbg = dirname(\PHP_BINARY)."\\phpdbg.exe"; diff --git a/lib/Parser.php b/lib/Parser.php index 03259c3..e88d34a 100644 --- a/lib/Parser.php +++ b/lib/Parser.php @@ -41,17 +41,17 @@ class Parser extends Serializer { ]; /** Parses a string to produce a document object - * + * * @param string $data The string to parse. This may be in any valid encoding * @param string|null $encodingOrContentType The document encoding, or HTTP Content-Type header value, if known. 
If no provided encoding detection will be attempted * @param \MensBeam\HTML\Parser\Config|null $config The configuration parameters to use, if any */ public static function parse(string $data, ?string $encodingOrContentType = null, ?Config $config = null): Output { - return static::parseDocumentOrFragment($data, $encodingOrContentType, null, null, $config ?? new Config); + return static::parseDocumentOrFragment($data, $encodingOrContentType, null, null, null, $config ?? new Config); } /** Parses a string to produce a partial document (a document fragment) - * + * * @param \DOMElement $contextElement The context element. The fragment will be pparsed as if it is a collection of children of this element * @param int|null $quirksMode The "quirks mode" property of the context element's document. Must be one of Parser::NO_QUIRKS_MODE, Parser::LIMITED_QUIRKS_MODE, or Parser::QUIRKS_MODE * @param string $data The string to parse. This may be in any valid encoding @@ -60,7 +60,7 @@ class Parser extends Serializer { */ public static function parseFragment(\DOMElement $contextElement, ?int $quirksMode, string $data, ?string $encodingOrContentType = null, ?Config $config = null): \DOMDocumentFragment { // parse the fragment into a temporary document - $out = self::parseDocumentOrFragment($data, $encodingOrContentType, $contextElement, $quirksMode, $config ?? new Config); + $out = self::parseDocumentOrFragment($data, $encodingOrContentType, null, $contextElement, $quirksMode, $config ?? 
new Config); $document = $out->document; // extract the nodes from the temporary document into a fragment belonging to the context element's document $fragment = $contextElement->ownerDocument->createDocumentFragment(); @@ -71,20 +71,38 @@ class Parser extends Serializer { return $fragment; } - protected static function parseDocumentOrFragment(string $data, ?string $encodingOrContentType, ?\DOMElement $fragmentContext, ?int $fragmentQuirks, Config $config): Output { - // check the document class - if (isset($config->documentClass)) { - try { - $document = new $config->documentClass; - } catch (\Throwable $e) { - throw new Exception(Exception::FAILED_CREATING_DOCUMENT, [$config->documentClass], $e); - } - if (!$document instanceof \DOMDocument) { - throw new Exception(Exception::INVALID_DOCUMENT_CLASS, [get_class($document)]); + /** Parses a string into an existing document object + * + * @param string $data The string to parse. This may be in any valid encoding + * @param \DOMDocument $document The document to parse into. Must be an instance of or derived from \DOMDocument and must be empty + * @param string|null $encodingOrContentType The document encoding, or HTTP Content-Type header value, if known. If not provided, encoding detection will be attempted + * @param \MensBeam\HTML\Parser\Config|null $config The configuration parameters to use, if any + */ + public static function parseInto(string $data, \DOMDocument $document, ?string $encodingOrContentType = null, ?Config $config = null): Output { + return static::parseDocumentOrFragment($data, $encodingOrContentType, $document, null, null, $config ?? 
new Config); + } + + protected static function parseDocumentOrFragment(string $data, ?string $encodingOrContentType, ?\DOMDocument $document, ?\DOMElement $fragmentContext, ?int $fragmentQuirks, Config $config): Output { + if ($document === null) { + // check the document class + if (isset($config->documentClass)) { + try { + $document = new $config->documentClass; + } catch (\Throwable $e) { + throw new Exception(Exception::FAILED_CREATING_DOCUMENT, [$config->documentClass], $e); + } + if (!$document instanceof \DOMDocument) { + throw new Exception(Exception::INVALID_DOCUMENT_CLASS, [get_class($document)]); + } + } else { + $document = new \DOMDocument(); } } else { - $document = new \DOMDocument(); + if ($document->hasChildNodes()) { + throw new Exception(Exception::NON_EMPTY_DOCUMENT); + } } + // sort out other needed configuration $htmlNamespace = ($config->htmlNamespace) ? self::HTML_NAMESPACE : null; // Initialize the various classes needed for parsing @@ -110,7 +128,7 @@ class Parser extends Serializer { // Destroy our existing objects unset($errorHandler, $decoder, $stack, $tokenizer, $tokenList, $treeConstructor); // Parse a second time - return static::parseDocumentOrFragment($data, $encoding, $fragmentContext, $fragmentQuirks, $config); + return static::parseDocumentOrFragment($data, $encoding, $document, $fragmentContext, $fragmentQuirks, $config); } // prepare the output $out = new Output; diff --git a/lib/Parser/Exception.php b/lib/Parser/Exception.php index 10b48ee..02ff103 100644 --- a/lib/Parser/Exception.php +++ b/lib/Parser/Exception.php @@ -10,6 +10,7 @@ class Exception extends \Exception { public const INVALID_QUIRKS_MODE = 101; public const FAILED_CREATING_DOCUMENT = 102; public const INVALID_DOCUMENT_CLASS = 103; + public const NON_EMPTY_DOCUMENT = 104; public const UNSUPPORTED_NODE_TYPE = 201; @@ -17,6 +18,7 @@ class Exception extends \Exception { 101 => 'Fragment\'s quirks mode must be one of Parser::NO_QUIRKS_MODE, 
Parser::LIMITED_QUIRKS_MODE, or Parser::QUIRKS_MODE', 102 => 'Unable to create instance of configured document class "%s"', 103 => 'Configured document class "%s" must be a subclass of \DOMDocument', + 104 => 'Supplied document is not empty', 201 => 'Unable to serialize unsupported node type %s', ]; diff --git a/lib/Parser/Serializer.php b/lib/Parser/Serializer.php index 7f6674a..85f0ae9 100644 --- a/lib/Parser/Serializer.php +++ b/lib/Parser/Serializer.php @@ -273,7 +273,7 @@ abstract class Serializer { // the DOM occurs within its inner DOM, template contents are entirely in the // userland wrapper class, so that must be accounted for. elseif ($node->ownerDocument instanceof \MensBeam\HTML\DOM\InnerNode\Document) { - $node = $node->ownerDocument->getInnerNode($node->ownerDocument->getWrapperNode($node)->content); + $node = $node->ownerDocument->getInnerNode($node->ownerDocument->getWrapperNode($node)->content); // @codeCoverageIgnore } } } diff --git a/tests/cases/TestParser.php b/tests/cases/TestParser.php index 8b428bf..53360ec 100644 --- a/tests/cases/TestParser.php +++ b/tests/cases/TestParser.php @@ -11,14 +11,14 @@ use MensBeam\HTML\Parser\Output; use MensBeam\HTML\Parser\Config; use MensBeam\HTML\Parser\Exception; -/** +/** * @covers \MensBeam\HTML\Parser * @covers \MensBeam\HTML\Parser\Exception */ class TestParser extends \PHPUnit\Framework\TestCase { public function testParseADocument(): void { $in = "hello world!"; - $out = Parser::parse($in, "tex/html; charset=utf8"); + $out = Parser::parse($in, "text/html; charset=utf8"); $this->assertInstanceOf(Output::class, $out); $this->assertInstanceOf(\DOMDocument::class, $out->document); $this->assertSame("UTF-8", $out->encoding); @@ -26,11 +26,23 @@ class TestParser extends \PHPUnit\Framework\TestCase { $this->assertNull($out->errors); } + public function testParseIntoExistingDocument(): void { + $in = "hello world!"; + $d = new \DOMDocument(); + $out = Parser::parseInto($in, $d, "text/html; charset=utf8"); + 
$this->assertInstanceOf(Output::class, $out); + $this->assertSame($d, $out->document); + $this->assertInstanceOf(\DOMDocument::class, $out->document); + $this->assertSame("UTF-8", $out->encoding); + $this->assertSame(Parser::QUIRKS_MODE, $out->quirksMode); + $this->assertNull($out->errors); + } + public function testParseAFragment(): void { $doc = new \DOMDocument(); $context = $doc->createElement("div"); $in = "hello world!"; - $out = Parser::parseFragment($context, 0, $in, "tex/html; charset=utf8"); + $out = Parser::parseFragment($context, 0, $in, "text/html; charset=utf8"); $this->assertInstanceOf(\DOMDocumentFragment::class, $out); } @@ -40,14 +52,14 @@ class TestParser extends \PHPUnit\Framework\TestCase { $context = $doc->createElement("div"); $in = "hello world!"; $this->expectExceptionObject(new Exception(Exception::INVALID_QUIRKS_MODE)); - Parser::parseFragment($context, -1, $in, "tex/html; charset=utf8"); + Parser::parseFragment($context, -1, $in, "text/html; charset=utf8"); } public function testParseADocumentReportingErrors(): void { $in = "hello world!"; $conf = new Config; $conf->errorCollection = true; - $out = Parser::parse($in, "tex/html; charset=utf8", $conf); + $out = Parser::parse($in, "text/html; charset=utf8", $conf); $this->assertInstanceOf(Output::class, $out); $this->assertInstanceOf(\DOMDocument::class, $out->document); $this->assertSame("UTF-8", $out->encoding); @@ -91,6 +103,14 @@ class TestParser extends \PHPUnit\Framework\TestCase { Parser::parse($in, "utf8", $conf); } + public function testParseIntoExistingDocumentWithANonEmptyDocument(): void { + $in = "hello world!"; + $d = new \DOMDocument(); + $d->appendChild($d->createElement('fail')); + $this->expectExceptionCode(Exception::NON_EMPTY_DOCUMENT); + Parser::parseInto($in, $d, "utf8"); + } + public function testParseCheckingAttributeCoercion(): void { $document = Parser::parse('
No coercion should occur here
', "UTF-8")->document; $act = [];