Browse Source

Document serializer

serialize
J. King 3 years ago
parent
commit
2e2563cf15
  1. 44
      README.md
  2. 71
      composer.lock
  3. 5
      lib/Parser.php
  4. 6
      lib/Parser/Serializer.php
  5. 6
      tests/cases/TestSerializer.php
  6. 15
      vendor-bin/phpunit/composer.lock

44
README.md

@ -1,13 +1,13 @@
# HTML-Parser # HTML-Parser
A modern, accurate HTML parser for PHP. A modern, accurate HTML parser and serializer for PHP.
## Usage ## Usage
### Parsing documents ### Parsing documents
```php ```php
public MensBeam\HTML\Parser::parse( public static MensBeam\HTML\Parser::parse(
string $data, string $data,
?string $encodingOrContentType = null, ?string $encodingOrContentType = null,
?MensBeam\HTML\Parser\Config $config = null ?MensBeam\HTML\Parser\Config $config = null
@ -26,7 +26,7 @@ Extra configuration parameters may be given to the parser by passing a `MensBeam
### Parsing fragments ### Parsing fragments
```php ```php
public MensBeam\HTML\Parser::parseFragment( public static MensBeam\HTML\Parser::parseFragment(
DOMElement $contextElement, DOMElement $contextElement,
int $quirksMode, int $quirksMode,
string $data, string $data,
@ -41,7 +41,33 @@ If the "quirks mode" property of the document is not known, using `Parser::NO_QUI
Unlike the `parse()` method, the `parseFragment()` method returns a `DOMDocumentFragment` object belonging to `$contextElement`'s owner document. Unlike the `parse()` method, the `parseFragment()` method returns a `DOMDocumentFragment` object belonging to `$contextElement`'s owner document.
### Examples ### Serializing nodes
```php
public static MensBeam\HTML\Parser::serialize(DOMNode $node): string
```
```php
public static MensBeam\HTML\Parser::serializeInner(DOMNode $node): string
```
The `MensBeam\HTML\Parser::serialize` method can be used to convert most `DOMNode` objects into strings, using the basic algorithm defined in the HTML specification. Nodes of the following types can be successfully serialized:
- `DOMDocument`
- `DOMElement`
- `DOMText`
- `DOMComment`
- `DOMDocumentFragment`
- `DOMDocumentType`
- `DOMProcessingInstruction`
Similarly, the `MensBeam\HTML\Parser::serializeInner` method can be used to convert the children of non-leaf `DOMNode` objects into strings, using the basic algorithm defined in the HTML specification. Children of nodes of the following types can be successfully serialized:
- `DOMDocument`
- `DOMElement`
- `DOMDocumentFragment`
## Examples
- Parsing a document with unknown encoding: - Parsing a document with unknown encoding:
@ -110,6 +136,16 @@ Unlike the `parse()` method, the `parseFragment()` method returns a `DOMDocument
echo $mathFragment->firstChild->namespaceURI; // prints "http://www.w3.org/1998/Math/MathML" echo $mathFragment->firstChild->namespaceURI; // prints "http://www.w3.org/1998/Math/MathML"
``` ```
- Serializing documents and elements:
```php
use MensBeam\HTML\Parser;
$document = Parser::parse("<!DOCTYPE html><a>Ook<p>Eek</a>");
echo Parser::serialize($document); // prints "<html><head></head><body><a>Ook</a><p><a>Eek</a></p></body></html>"
echo Parser::serializeInner($document->getElementsByTagName("body")[0]); // prints "<a>Ook</a><p><a>Eek</a></p>"
```
## Configuration ## Configuration
The `MensBeam\HTML\Parser\Config` class is used as a container for configuration parameters for the parser. We have tried to use rational defaults, but some parameters are nevertheless configurable: The `MensBeam\HTML\Parser\Config` class is used as a container for configuration parameters for the parser. We have tried to use rational defaults, but some parameters are nevertheless configurable:

71
composer.lock

@ -4,7 +4,7 @@
"Read more about it at https://getcomposer.org/doc/01-basic-usage.md#installing-dependencies", "Read more about it at https://getcomposer.org/doc/01-basic-usage.md#installing-dependencies",
"This file is @generated automatically" "This file is @generated automatically"
], ],
"content-hash": "10ef46b0f5366da9ef3b006f3bfc2e8e", "content-hash": "3bd2c9527ab034f6d6c69b494c92178a",
"packages": [ "packages": [
{ {
"name": "mensbeam/intl", "name": "mensbeam/intl",
@ -216,75 +216,6 @@
"source": "https://github.com/bamarni/composer-bin-plugin/tree/master" "source": "https://github.com/bamarni/composer-bin-plugin/tree/master"
}, },
"time": "2020-05-03T08:27:20+00:00" "time": "2020-05-03T08:27:20+00:00"
},
{
"name": "masterminds/html5",
"version": "2.7.5",
"source": {
"type": "git",
"url": "https://github.com/Masterminds/html5-php.git",
"reference": "f640ac1bdddff06ea333a920c95bbad8872429ab"
},
"dist": {
"type": "zip",
"url": "https://api.github.com/repos/Masterminds/html5-php/zipball/f640ac1bdddff06ea333a920c95bbad8872429ab",
"reference": "f640ac1bdddff06ea333a920c95bbad8872429ab",
"shasum": ""
},
"require": {
"ext-ctype": "*",
"ext-dom": "*",
"ext-libxml": "*",
"php": ">=5.3.0"
},
"require-dev": {
"phpunit/phpunit": "^4.8.35 || ^5.7.21 || ^6 || ^7"
},
"type": "library",
"extra": {
"branch-alias": {
"dev-master": "2.7-dev"
}
},
"autoload": {
"psr-4": {
"Masterminds\\": "src"
}
},
"notification-url": "https://packagist.org/downloads/",
"license": [
"MIT"
],
"authors": [
{
"name": "Matt Butcher",
"email": "technosophos@gmail.com"
},
{
"name": "Matt Farina",
"email": "matt@mattfarina.com"
},
{
"name": "Asmir Mustafic",
"email": "goetas@gmail.com"
}
],
"description": "An HTML5 parser and serializer.",
"homepage": "http://masterminds.github.io/html5-php",
"keywords": [
"HTML5",
"dom",
"html",
"parser",
"querypath",
"serializer",
"xml"
],
"support": {
"issues": "https://github.com/Masterminds/html5-php/issues",
"source": "https://github.com/Masterminds/html5-php/tree/2.7.5"
},
"time": "2021-07-01T14:25:37+00:00"
} }
], ],
"aliases": [], "aliases": [],

5
lib/Parser.php

@ -6,7 +6,6 @@
declare(strict_types=1); declare(strict_types=1);
namespace MensBeam\HTML; namespace MensBeam\HTML;
use MensBeam\HTML\Parser\Charset;
use MensBeam\HTML\Parser\Data; use MensBeam\HTML\Parser\Data;
use MensBeam\HTML\Parser\ParseError; use MensBeam\HTML\Parser\ParseError;
use MensBeam\HTML\Parser\Config; use MensBeam\HTML\Parser\Config;
@ -17,8 +16,9 @@ use MensBeam\HTML\Parser\TemplateInsertionModesStack;
use MensBeam\HTML\Parser\Tokenizer; use MensBeam\HTML\Parser\Tokenizer;
use MensBeam\HTML\Parser\TreeConstructor; use MensBeam\HTML\Parser\TreeConstructor;
use MensBeam\HTML\Parser\Output; use MensBeam\HTML\Parser\Output;
use MensBeam\HTML\Parser\Serializer;
class Parser { class Parser extends Serializer {
public const NO_QUIRKS_MODE = 0; public const NO_QUIRKS_MODE = 0;
public const QUIRKS_MODE = 1; public const QUIRKS_MODE = 1;
public const LIMITED_QUIRKS_MODE = 2; public const LIMITED_QUIRKS_MODE = 2;
@ -47,7 +47,6 @@ class Parser {
* @param \MensBeam\HTML\Parser\Config|null $config The configuration parameters to use, if any * @param \MensBeam\HTML\Parser\Config|null $config The configuration parameters to use, if any
*/ */
public static function parse(string $data, ?string $encodingOrContentType = null, ?Config $config = null): Output { public static function parse(string $data, ?string $encodingOrContentType = null, ?Config $config = null): Output {
// parse the document
return static::parseDocumentOrFragment($data, $encodingOrContentType, null, null, $config ?? new Config); return static::parseDocumentOrFragment($data, $encodingOrContentType, null, null, $config ?? new Config);
} }

6
lib/Parser/Serializer.php

@ -18,7 +18,7 @@ abstract class Serializer {
* *
* @param \DOMDocument|\DOMElement|\DOMText|\DOMComment|\DOMProcessingInstruction|\DOMDocumentFragment|\DOMDocumentType $node The node to serialize * @param \DOMDocument|\DOMElement|\DOMText|\DOMComment|\DOMProcessingInstruction|\DOMDocumentFragment|\DOMDocumentType $node The node to serialize
*/ */
public static function serializeOuter(\DOMNode $node): string { public static function serialize(\DOMNode $node): string {
$s = ""; $s = "";
$stack = []; $stack = [];
$n = $node; $n = $node;
@ -220,9 +220,9 @@ abstract class Serializer {
} }
if ($node instanceof \DOMElement || $node instanceof \DOMDocument || $node instanceof \DOMDocumentFragment) { if ($node instanceof \DOMElement || $node instanceof \DOMDocument || $node instanceof \DOMDocumentFragment) {
# For each child node of the node, in tree order, run the following steps: # For each child node of the node, in tree order, run the following steps:
// NOTE: the steps in question are implemented in the "serializeOuter" routine // NOTE: the steps in question are implemented in the "serialize" routine
foreach ($node->childNodes as $n) { foreach ($node->childNodes as $n) {
$s .= self::serializeOuter($n); $s .= self::serialize($n);
} }
} else { } else {
throw new Exception(Exception::UNSUPPORTED_NODE_TYPE, [get_class($node)]); throw new Exception(Exception::UNSUPPORTED_NODE_TYPE, [get_class($node)]);

6
tests/cases/TestSerializer.php

@ -21,7 +21,7 @@ class TestSerializer extends \PHPUnit\Framework\TestCase {
/** @dataProvider provideStandardTreeTests */ /** @dataProvider provideStandardTreeTests */
public function testStandardTreeTests(array $data, bool $fragment, string $exp): void { public function testStandardTreeTests(array $data, bool $fragment, string $exp): void {
$node = $this->buildTree($data, $fragment); $node = $this->buildTree($data, $fragment);
$this->assertSame($exp, Serializer::serializeOuter($node)); $this->assertSame($exp, Serializer::serialize($node));
} }
public function provideStandardTreeTests(): iterable { public function provideStandardTreeTests(): iterable {
@ -53,7 +53,7 @@ class TestSerializer extends \PHPUnit\Framework\TestCase {
$exp1 = $exp; $exp1 = $exp;
$exp2 = "<template>$exp</template>"; $exp2 = "<template>$exp</template>";
$this->assertSame($exp1, Serializer::serializeInner($t)); $this->assertSame($exp1, Serializer::serializeInner($t));
$this->assertSame($exp2, Serializer::serializeOuter($t)); $this->assertSame($exp2, Serializer::serialize($t));
} }
public function provideTemplateTests(): iterable { public function provideTemplateTests(): iterable {
@ -140,7 +140,7 @@ class TestSerializer extends \PHPUnit\Framework\TestCase {
$d = new \DOMDocument; $d = new \DOMDocument;
$a = $d->createAttribute("oops"); $a = $d->createAttribute("oops");
$this->expectExceptionObject(new Exception(Exception::UNSUPPORTED_NODE_TYPE, [\DOMAttr::class])); $this->expectExceptionObject(new Exception(Exception::UNSUPPORTED_NODE_TYPE, [\DOMAttr::class]));
Serializer::serializeOuter($a); Serializer::serialize($a);
} }
public function testInnerSerializeAnInvalidNode(): void { public function testInnerSerializeAnInvalidNode(): void {

15
vendor-bin/phpunit/composer.lock

@ -355,16 +355,16 @@
}, },
{ {
"name": "phpdocumentor/reflection-docblock", "name": "phpdocumentor/reflection-docblock",
"version": "5.2.2", "version": "5.3.0",
"source": { "source": {
"type": "git", "type": "git",
"url": "https://github.com/phpDocumentor/ReflectionDocBlock.git", "url": "https://github.com/phpDocumentor/ReflectionDocBlock.git",
"reference": "069a785b2141f5bcf49f3e353548dc1cce6df556" "reference": "622548b623e81ca6d78b721c5e029f4ce664f170"
}, },
"dist": { "dist": {
"type": "zip", "type": "zip",
"url": "https://api.github.com/repos/phpDocumentor/ReflectionDocBlock/zipball/069a785b2141f5bcf49f3e353548dc1cce6df556", "url": "https://api.github.com/repos/phpDocumentor/ReflectionDocBlock/zipball/622548b623e81ca6d78b721c5e029f4ce664f170",
"reference": "069a785b2141f5bcf49f3e353548dc1cce6df556", "reference": "622548b623e81ca6d78b721c5e029f4ce664f170",
"shasum": "" "shasum": ""
}, },
"require": { "require": {
@ -375,7 +375,8 @@
"webmozart/assert": "^1.9.1" "webmozart/assert": "^1.9.1"
}, },
"require-dev": { "require-dev": {
"mockery/mockery": "~1.3.2" "mockery/mockery": "~1.3.2",
"psalm/phar": "^4.8"
}, },
"type": "library", "type": "library",
"extra": { "extra": {
@ -405,9 +406,9 @@
"description": "With this component, a library can provide support for annotations via DocBlocks or otherwise retrieve information that is embedded in a DocBlock.", "description": "With this component, a library can provide support for annotations via DocBlocks or otherwise retrieve information that is embedded in a DocBlock.",
"support": { "support": {
"issues": "https://github.com/phpDocumentor/ReflectionDocBlock/issues", "issues": "https://github.com/phpDocumentor/ReflectionDocBlock/issues",
"source": "https://github.com/phpDocumentor/ReflectionDocBlock/tree/master" "source": "https://github.com/phpDocumentor/ReflectionDocBlock/tree/5.3.0"
}, },
"time": "2020-09-03T19:13:55+00:00" "time": "2021-10-19T17:43:47+00:00"
}, },
{ {
"name": "phpdocumentor/type-resolver", "name": "phpdocumentor/type-resolver",

Loading…
Cancel
Save