Browse Source

Added Parser::parseInto

pretty-print
Dustin Wilson 2 years ago
parent
commit
991b5ae540
  1. 27
      README.md
  2. 10
      RoboFile.php
  3. 50
      lib/Parser.php
  4. 2
      lib/Parser/Exception.php
  5. 2
      lib/Parser/Serializer.php
  6. 30
      tests/cases/TestParser.php

27
README.md

@ -8,36 +8,51 @@ A modern, accurate HTML parser and serializer for PHP.
```php
public static MensBeam\HTML\Parser::parse(
string $data,
?string $encodingOrContentType = null,
string $data,
?string $encodingOrContentType = null,
?MensBeam\HTML\Parser\Config $config = null
): MensBeam\HTML\Parser\Output
```
The `MensBeam\HTML\Parser::parse` static method is used to parse documents. An arbitrary string (and optional encoding) are taken as input, and a `MensBeam\HTML\Parser\Output` object is returned as output. The `Output` object has the following properties:
- `document`: A `DOMDocument` object representing the parsed document
- `documentClass`: A string `DOMDocument` object representing the parsed document
- `encoding`: The original character encoding of the document, as supplied by the user or otherwise detected during parsing
- `quirksMode`: The detected "quirks mode" property of the document. This will be one of `Parser::NO_QUIRKS_MODE` (`0`), `Parser::QUIRKS_MODE` (`1`), or `Parser::LIMITED_QUIRKS_MODE` (`2`)
- `errors`: An array containing the list of parse errors emitted during processing if parse error reporting was turned on (see **Configuration** below), or `null` otherwise
Extra configuration parameters may be given to the parser by passing a `MensBeam\HTML\Parser\Config` object as the final `$config` argument. See the **Configuration** section below for more details.
### Parsing into existing documents
```php
public static MensBeam\HTML\Parser::parseInto(
string $data,
\DOMDocument $document,
?string $encodingOrContentType = null,
?MensBeam\HTML\Parser\Config $config = null
): MensBeam\HTML\Parser\Output
```
The `MensBeam\HTML\Parser::parseInto` static method is used to parse into an existing document. The supplied document must be an instance of (or derived from) `\DOMDocument` and also must be empty. All other arguments are identical to those used when parsing documents normally.
*NOTE:* The `documentClass` configuration option has no effect when using this method.
### Parsing fragments
```php
public static MensBeam\HTML\Parser::parseFragment(
DOMElement $contextElement,
int $quirksMode,
string $data,
?string $encodingOrContentType = null,
string $data,
?string $encodingOrContentType = null,
?MensBeam\HTML\Parser\Config $config = null
): DOMDocumentFragment
```
The `MensBeam\HTML\Parser::parseFragment` static method is used to parse document fragments. The primary use case for this method is in the implementation of the `innerHTML` setter of HTML elements. Consequently a context element is required, as well as the "quirks mode" property of the context element's document (which must be one of `Parser::NO_QUIRKS_MODE` (`0`), `Parser::QUIRKS_MODE` (`1`), or `Parser::LIMITED_QUIRKS_MODE` (`2`)). The further arguments are identical to those used when parsing documents.
If the "quirks mode" property of the document is not know, using `Parser::NO_QUIRKS_MODE` (`0`) is usually the best choice.
If the "quirks mode" property of the document is not known, using `Parser::NO_QUIRKS_MODE` (`0`) is usually the best choice.
Unlike the `parse()` method, the `parseFragment()` method returns a `DOMDocumentFragment` object belonging to `$contextElement`'s owner document.

10
RoboFile.php

@ -99,17 +99,17 @@ class RoboFile extends \Robo\Tasks {
protected function findCoverageEngine(): string {
$dir = rtrim(ini_get("extension_dir"), "/").\DIRECTORY_SEPARATOR;
$ext = IS_WIN ? "dll" : (IS_MAC ? "dylib" : "so");
$ext = IS_WIN ? "dll" : "so";
$php = escapeshellarg(\PHP_BINARY);
$code = escapeshellarg(BASE."lib");
if (extension_loaded("pcov")) {
return "$php -d pcov.enabled=1 -d pcov.directory=$code";
return "$php -d opcache.enable_cli=0 -d pcov.enabled=1 -d pcov.directory=$code";
} elseif (extension_loaded("xdebug")) {
return "$php -d xdebug.mode=coverage";
return "$php -d opcache.enable_cli=0 -d xdebug.mode=coverage";
} elseif (file_exists($dir."pcov.$ext")) {
return "$php -d extension=pcov.$ext -d pcov.enabled=1 -d pcov.directory=$code";
return "$php -d opcache.enable_cli=0 -d extension=pcov.$ext -d pcov.enabled=1 -d pcov.directory=$code";
} elseif (file_exists($dir."xdebug.$ext")) {
return "$php -d zend_extension=xdebug.$ext -d xdebug.mode=coverage";
return "$php -d opcache.enable_cli=0 -d zend_extension=xdebug.$ext -d xdebug.mode=coverage";
} else {
if (IS_WIN) {
$dbg = dirname(\PHP_BINARY)."\\phpdbg.exe";

50
lib/Parser.php

@ -41,17 +41,17 @@ class Parser extends Serializer {
];
/** Parses a string to produce a document object
*
*
* @param string $data The string to parse. This may be in any valid encoding
* @param string|null $encodingOrContentType The document encoding, or HTTP Content-Type header value, if known. If not provided, encoding detection will be attempted
* @param \MensBeam\HTML\Parser\Config|null $config The configuration parameters to use, if any
*/
public static function parse(string $data, ?string $encodingOrContentType = null, ?Config $config = null): Output {
return static::parseDocumentOrFragment($data, $encodingOrContentType, null, null, $config ?? new Config);
return static::parseDocumentOrFragment($data, $encodingOrContentType, null, null, null, $config ?? new Config);
}
/** Parses a string to produce a partial document (a document fragment)
*
*
* @param \DOMElement $contextElement The context element. The fragment will be parsed as if it is a collection of children of this element
* @param int|null $quirksMode The "quirks mode" property of the context element's document. Must be one of Parser::NO_QUIRKS_MODE, Parser::LIMITED_QUIRKS_MODE, or Parser::QUIRKS_MODE
* @param string $data The string to parse. This may be in any valid encoding
@ -60,7 +60,7 @@ class Parser extends Serializer {
*/
public static function parseFragment(\DOMElement $contextElement, ?int $quirksMode, string $data, ?string $encodingOrContentType = null, ?Config $config = null): \DOMDocumentFragment {
// parse the fragment into a temporary document
$out = self::parseDocumentOrFragment($data, $encodingOrContentType, $contextElement, $quirksMode, $config ?? new Config);
$out = self::parseDocumentOrFragment($data, $encodingOrContentType, null, $contextElement, $quirksMode, $config ?? new Config);
$document = $out->document;
// extract the nodes from the temporary document into a fragment belonging to the context element's document
$fragment = $contextElement->ownerDocument->createDocumentFragment();
@ -71,20 +71,38 @@ class Parser extends Serializer {
return $fragment;
}
protected static function parseDocumentOrFragment(string $data, ?string $encodingOrContentType, ?\DOMElement $fragmentContext, ?int $fragmentQuirks, Config $config): Output {
// check the document class
if (isset($config->documentClass)) {
try {
$document = new $config->documentClass;
} catch (\Throwable $e) {
throw new Exception(Exception::FAILED_CREATING_DOCUMENT, [$config->documentClass], $e);
}
if (!$document instanceof \DOMDocument) {
throw new Exception(Exception::INVALID_DOCUMENT_CLASS, [get_class($document)]);
/** Parses a string into an existing document object
*
* The supplied document itself is populated and exposed as the `document`
* property of the returned output; no new document is created.
*
* @param string $data The string to parse. This may be in any valid encoding
* @param \DOMDocument $document The document to parse into. Must be an instance of or derived from \DOMDocument and must be empty
* @param string|null $encodingOrContentType The document encoding, or HTTP Content-Type header value, if known. If not provided, encoding detection will be attempted
* @param \MensBeam\HTML\Parser\Config|null $config The configuration parameters to use, if any
* @return \MensBeam\HTML\Parser\Output The parse result
* @throws \MensBeam\HTML\Parser\Exception With code Exception::NON_EMPTY_DOCUMENT if the supplied document has child nodes
*/
public static function parseInto(string $data, \DOMDocument $document, ?string $encodingOrContentType = null, ?Config $config = null): Output {
// Full-document parse: the fragment context and fragment quirks mode are both null, and a default Config is used when none is supplied
return static::parseDocumentOrFragment($data, $encodingOrContentType, $document, null, null, $config ?? new Config);
}
protected static function parseDocumentOrFragment(string $data, ?string $encodingOrContentType, ?\DOMDocument $document, ?\DOMElement $fragmentContext, ?int $fragmentQuirks, Config $config): Output {
if ($document === null) {
// check the document class
if (isset($config->documentClass)) {
try {
$document = new $config->documentClass;
} catch (\Throwable $e) {
throw new Exception(Exception::FAILED_CREATING_DOCUMENT, [$config->documentClass], $e);
}
if (!$document instanceof \DOMDocument) {
throw new Exception(Exception::INVALID_DOCUMENT_CLASS, [get_class($document)]);
}
} else {
$document = new \DOMDocument();
}
} else {
$document = new \DOMDocument();
if ($document->hasChildNodes()) {
throw new Exception(Exception::NON_EMPTY_DOCUMENT);
}
}
// sort out other needed configuration
$htmlNamespace = ($config->htmlNamespace) ? self::HTML_NAMESPACE : null;
// Initialize the various classes needed for parsing
@ -110,7 +128,7 @@ class Parser extends Serializer {
// Destroy our existing objects
unset($errorHandler, $decoder, $stack, $tokenizer, $tokenList, $treeConstructor);
// Parse a second time
return static::parseDocumentOrFragment($data, $encoding, $fragmentContext, $fragmentQuirks, $config);
return static::parseDocumentOrFragment($data, $encoding, $document, $fragmentContext, $fragmentQuirks, $config);
}
// prepare the output
$out = new Output;

2
lib/Parser/Exception.php

@ -10,6 +10,7 @@ class Exception extends \Exception {
public const INVALID_QUIRKS_MODE = 101;
public const FAILED_CREATING_DOCUMENT = 102;
public const INVALID_DOCUMENT_CLASS = 103;
public const NON_EMPTY_DOCUMENT = 104;
public const UNSUPPORTED_NODE_TYPE = 201;
@ -17,6 +18,7 @@ class Exception extends \Exception {
101 => 'Fragment\'s quirks mode must be one of Parser::NO_QUIRKS_MODE, Parser::LIMITED_QUIRKS_MODE, or Parser::QUIRKS_MODE',
102 => 'Unable to create instance of configured document class "%s"',
103 => 'Configured document class "%s" must be a subclass of \DOMDocument',
104 => 'Supplied document is not empty',
201 => 'Unable to serialize unsupported node type %s',
];

2
lib/Parser/Serializer.php

@ -273,7 +273,7 @@ abstract class Serializer {
// the DOM occurs within its inner DOM, template contents are entirely in the
// userland wrapper class, so that must be accounted for.
elseif ($node->ownerDocument instanceof \MensBeam\HTML\DOM\InnerNode\Document) {
$node = $node->ownerDocument->getInnerNode($node->ownerDocument->getWrapperNode($node)->content);
$node = $node->ownerDocument->getInnerNode($node->ownerDocument->getWrapperNode($node)->content); // @codeCoverageIgnore
}
}
}

30
tests/cases/TestParser.php

@ -11,14 +11,14 @@ use MensBeam\HTML\Parser\Output;
use MensBeam\HTML\Parser\Config;
use MensBeam\HTML\Parser\Exception;
/**
/**
* @covers \MensBeam\HTML\Parser
* @covers \MensBeam\HTML\Parser\Exception
*/
class TestParser extends \PHPUnit\Framework\TestCase {
public function testParseADocument(): void {
$in = "hello world!";
$out = Parser::parse($in, "tex/html; charset=utf8");
$out = Parser::parse($in, "text/html; charset=utf8");
$this->assertInstanceOf(Output::class, $out);
$this->assertInstanceOf(\DOMDocument::class, $out->document);
$this->assertSame("UTF-8", $out->encoding);
@ -26,11 +26,23 @@ class TestParser extends \PHPUnit\Framework\TestCase {
$this->assertNull($out->errors);
}
/** Checks that Parser::parseInto() parses into, and returns, the document it was given */
public function testParseIntoExistingDocument(): void {
$in = "hello world!";
// a freshly constructed document has no children, so it satisfies the "must be empty" requirement
$d = new \DOMDocument();
$out = Parser::parseInto($in, $d, "text/html; charset=utf8");
$this->assertInstanceOf(Output::class, $out);
// the output must expose the very same object that was passed in, not a copy or a new document
$this->assertSame($d, $out->document);
$this->assertInstanceOf(\DOMDocument::class, $out->document);
$this->assertSame("UTF-8", $out->encoding);
$this->assertSame(Parser::QUIRKS_MODE, $out->quirksMode);
// no Config enabling error collection was supplied, so no error list is expected
$this->assertNull($out->errors);
}
public function testParseAFragment(): void {
$doc = new \DOMDocument();
$context = $doc->createElement("div");
$in = "hello world!";
$out = Parser::parseFragment($context, 0, $in, "tex/html; charset=utf8");
$out = Parser::parseFragment($context, 0, $in, "text/html; charset=utf8");
$this->assertInstanceOf(\DOMDocumentFragment::class, $out);
}
@ -40,14 +52,14 @@ class TestParser extends \PHPUnit\Framework\TestCase {
$context = $doc->createElement("div");
$in = "hello world!";
$this->expectExceptionObject(new Exception(Exception::INVALID_QUIRKS_MODE));
Parser::parseFragment($context, -1, $in, "tex/html; charset=utf8");
Parser::parseFragment($context, -1, $in, "text/html; charset=utf8");
}
public function testParseADocumentReportingErrors(): void {
$in = "hello world!";
$conf = new Config;
$conf->errorCollection = true;
$out = Parser::parse($in, "tex/html; charset=utf8", $conf);
$out = Parser::parse($in, "text/html; charset=utf8", $conf);
$this->assertInstanceOf(Output::class, $out);
$this->assertInstanceOf(\DOMDocument::class, $out->document);
$this->assertSame("UTF-8", $out->encoding);
@ -91,6 +103,14 @@ class TestParser extends \PHPUnit\Framework\TestCase {
Parser::parse($in, "utf8", $conf);
}
/** Checks that Parser::parseInto() rejects a document which already has child nodes */
public function testParseIntoExistingDocumentWithANonEmptyDocument(): void {
$in = "hello world!";
$d = new \DOMDocument();
// give the document a child so that it is no longer empty
$d->appendChild($d->createElement('fail'));
// the expectation must be registered before the call which is meant to throw
$this->expectExceptionCode(Exception::NON_EMPTY_DOCUMENT);
Parser::parseInto($in, $d, "utf8");
}
public function testParseCheckingAttributeCoercion(): void {
$document = Parser::parse('<!DOCTYPE html><article xmlns:xlink="http://www.w3.org/1999/xlink">No coercion should occur here</article>', "UTF-8")->document;
$act = [];

Loading…
Cancel
Save