Browse Source

Document serializer

serialize
J. King 2 years ago
parent
commit
2e2563cf15
  1. 44
      README.md
  2. 71
      composer.lock
  3. 5
      lib/Parser.php
  4. 6
      lib/Parser/Serializer.php
  5. 6
      tests/cases/TestSerializer.php
  6. 15
      vendor-bin/phpunit/composer.lock

44
README.md

@ -1,13 +1,13 @@
# HTML-Parser
A modern, accurate HTML parser for PHP.
A modern, accurate HTML parser and serializer for PHP.
## Usage
### Parsing documents
```php
public MensBeam\HTML\Parser::parse(
public static MensBeam\HTML\Parser::parse(
string $data,
?string $encodingOrContentType = null,
?MensBeam\HTML\Parser\Config $config = null
@ -26,7 +26,7 @@ Extra configuration parameters may be given to the parser by passing a `MensBeam
### Parsing fragments
```php
public MensBeam\HTML\Parser::parse(
public static MensBeam\HTML\Parser::parseFragment(
DOMElement $contextElement,
int $quirksMode,
string $data,
@ -41,7 +41,33 @@ If the "quirks mode" property of the document is not known, using `Parser::NO_QUI
Unlike the `parse()` method, the `parseFragment()` method returns a `DOMDocumentFragment` object belonging to `$contextElement`'s owner document.
### Examples
### Serializing nodes
```php
public static MensBeam\HTML\Parser::serialize(DOMNode $node): string
```
```php
public static MensBeam\HTML\Parser::serializeInner(DOMNode $node): string
```
The `MensBeam\HTML\Parser::serialize` method can be used to convert most `DOMNode` objects into strings, using the basic algorithm defined in the HTML specification. Nodes of the following types can be successfully serialized:
- `DOMDocument`
- `DOMElement`
- `DOMText`
- `DOMComment`
- `DOMDocumentFragment`
- `DOMDocumentType`
- `DOMProcessingInstruction`
Similarly, the `MensBeam\HTML\Parser::serializeInner` method can be used to convert the children of non-leaf `DOMNode` objects into strings, using the basic algorithm defined in the HTML specification. Children of nodes of the following types can be successfully serialized:
- `DOMDocument`
- `DOMElement`
- `DOMDocumentFragment`
## Examples
- Parsing a document with unknown encoding:
@ -110,6 +136,16 @@ Unlike the `parse()` method, the `parseFragment()` method returns a `DOMDocument
echo $mathFragment->firstChild->namespaceURI; // prints "http://www.w3.org/1998/Math/MathML"
```
- Serializing documents and elements:
```php
use MensBeam\HTML\Parser;
$document = Parser::parse("<!DOCTYPE html><a>Ook<p>Eek</a>");
echo Parser::serialize($document); // prints "<html><head></head><body><a>Ook</a><p><a>Eek</a></p></body></html>"
echo Parser::serializeInner($document->getElementsByTagName("body")[0]); // prints "<a>Ook</a><p><a>Eek</a></p>"
```
## Configuration
The `MensBeam\HTML\Parser\Config` class is used as a container for configuration parameters for the parser. We have tried to use rational defaults, but some parameters are nevertheless configurable:

71
composer.lock

@ -4,7 +4,7 @@
"Read more about it at https://getcomposer.org/doc/01-basic-usage.md#installing-dependencies",
"This file is @generated automatically"
],
"content-hash": "10ef46b0f5366da9ef3b006f3bfc2e8e",
"content-hash": "3bd2c9527ab034f6d6c69b494c92178a",
"packages": [
{
"name": "mensbeam/intl",
@ -216,75 +216,6 @@
"source": "https://github.com/bamarni/composer-bin-plugin/tree/master"
},
"time": "2020-05-03T08:27:20+00:00"
},
{
"name": "masterminds/html5",
"version": "2.7.5",
"source": {
"type": "git",
"url": "https://github.com/Masterminds/html5-php.git",
"reference": "f640ac1bdddff06ea333a920c95bbad8872429ab"
},
"dist": {
"type": "zip",
"url": "https://api.github.com/repos/Masterminds/html5-php/zipball/f640ac1bdddff06ea333a920c95bbad8872429ab",
"reference": "f640ac1bdddff06ea333a920c95bbad8872429ab",
"shasum": ""
},
"require": {
"ext-ctype": "*",
"ext-dom": "*",
"ext-libxml": "*",
"php": ">=5.3.0"
},
"require-dev": {
"phpunit/phpunit": "^4.8.35 || ^5.7.21 || ^6 || ^7"
},
"type": "library",
"extra": {
"branch-alias": {
"dev-master": "2.7-dev"
}
},
"autoload": {
"psr-4": {
"Masterminds\\": "src"
}
},
"notification-url": "https://packagist.org/downloads/",
"license": [
"MIT"
],
"authors": [
{
"name": "Matt Butcher",
"email": "technosophos@gmail.com"
},
{
"name": "Matt Farina",
"email": "matt@mattfarina.com"
},
{
"name": "Asmir Mustafic",
"email": "goetas@gmail.com"
}
],
"description": "An HTML5 parser and serializer.",
"homepage": "http://masterminds.github.io/html5-php",
"keywords": [
"HTML5",
"dom",
"html",
"parser",
"querypath",
"serializer",
"xml"
],
"support": {
"issues": "https://github.com/Masterminds/html5-php/issues",
"source": "https://github.com/Masterminds/html5-php/tree/2.7.5"
},
"time": "2021-07-01T14:25:37+00:00"
}
],
"aliases": [],

5
lib/Parser.php

@ -6,7 +6,6 @@
declare(strict_types=1);
namespace MensBeam\HTML;
use MensBeam\HTML\Parser\Charset;
use MensBeam\HTML\Parser\Data;
use MensBeam\HTML\Parser\ParseError;
use MensBeam\HTML\Parser\Config;
@ -17,8 +16,9 @@ use MensBeam\HTML\Parser\TemplateInsertionModesStack;
use MensBeam\HTML\Parser\Tokenizer;
use MensBeam\HTML\Parser\TreeConstructor;
use MensBeam\HTML\Parser\Output;
use MensBeam\HTML\Parser\Serializer;
class Parser {
class Parser extends Serializer {
public const NO_QUIRKS_MODE = 0;
public const QUIRKS_MODE = 1;
public const LIMITED_QUIRKS_MODE = 2;
@ -47,7 +47,6 @@ class Parser {
* @param \MensBeam\HTML\Parser\Config|null $config The configuration parameters to use, if any
*/
public static function parse(string $data, ?string $encodingOrContentType = null, ?Config $config = null): Output {
// parse the document
return static::parseDocumentOrFragment($data, $encodingOrContentType, null, null, $config ?? new Config);
}

6
lib/Parser/Serializer.php

@ -18,7 +18,7 @@ abstract class Serializer {
*
* @param \DOMDocument|\DOMElement|\DOMText|\DOMComment|\DOMProcessingInstruction|\DOMDocumentFragment|\DOMDocumentType $node The node to serialize
*/
public static function serializeOuter(\DOMNode $node): string {
public static function serialize(\DOMNode $node): string {
$s = "";
$stack = [];
$n = $node;
@ -220,9 +220,9 @@ abstract class Serializer {
}
if ($node instanceof \DOMElement || $node instanceof \DOMDocument || $node instanceof \DOMDocumentFragment) {
# For each child node of the node, in tree order, run the following steps:
// NOTE: the steps in question are implemented in the "serializeOuter" routine
// NOTE: the steps in question are implemented in the "serialize" routine
foreach ($node->childNodes as $n) {
$s .= self::serializeOuter($n);
$s .= self::serialize($n);
}
} else {
throw new Exception(Exception::UNSUPPORTED_NODE_TYPE, [get_class($node)]);

6
tests/cases/TestSerializer.php

@ -21,7 +21,7 @@ class TestSerializer extends \PHPUnit\Framework\TestCase {
/** @dataProvider provideStandardTreeTests */
public function testStandardTreeTests(array $data, bool $fragment, string $exp): void {
$node = $this->buildTree($data, $fragment);
$this->assertSame($exp, Serializer::serializeOuter($node));
$this->assertSame($exp, Serializer::serialize($node));
}
public function provideStandardTreeTests(): iterable {
@ -53,7 +53,7 @@ class TestSerializer extends \PHPUnit\Framework\TestCase {
$exp1 = $exp;
$exp2 = "<template>$exp</template>";
$this->assertSame($exp1, Serializer::serializeInner($t));
$this->assertSame($exp2, Serializer::serializeOuter($t));
$this->assertSame($exp2, Serializer::serialize($t));
}
public function provideTemplateTests(): iterable {
@ -140,7 +140,7 @@ class TestSerializer extends \PHPUnit\Framework\TestCase {
$d = new \DOMDocument;
$a = $d->createAttribute("oops");
$this->expectExceptionObject(new Exception(Exception::UNSUPPORTED_NODE_TYPE, [\DOMAttr::class]));
Serializer::serializeOuter($a);
Serializer::serialize($a);
}
public function testInnerSerializeAnInvalidNode(): void {

15
vendor-bin/phpunit/composer.lock

@ -355,16 +355,16 @@
},
{
"name": "phpdocumentor/reflection-docblock",
"version": "5.2.2",
"version": "5.3.0",
"source": {
"type": "git",
"url": "https://github.com/phpDocumentor/ReflectionDocBlock.git",
"reference": "069a785b2141f5bcf49f3e353548dc1cce6df556"
"reference": "622548b623e81ca6d78b721c5e029f4ce664f170"
},
"dist": {
"type": "zip",
"url": "https://api.github.com/repos/phpDocumentor/ReflectionDocBlock/zipball/069a785b2141f5bcf49f3e353548dc1cce6df556",
"reference": "069a785b2141f5bcf49f3e353548dc1cce6df556",
"url": "https://api.github.com/repos/phpDocumentor/ReflectionDocBlock/zipball/622548b623e81ca6d78b721c5e029f4ce664f170",
"reference": "622548b623e81ca6d78b721c5e029f4ce664f170",
"shasum": ""
},
"require": {
@ -375,7 +375,8 @@
"webmozart/assert": "^1.9.1"
},
"require-dev": {
"mockery/mockery": "~1.3.2"
"mockery/mockery": "~1.3.2",
"psalm/phar": "^4.8"
},
"type": "library",
"extra": {
@ -405,9 +406,9 @@
"description": "With this component, a library can provide support for annotations via DocBlocks or otherwise retrieve information that is embedded in a DocBlock.",
"support": {
"issues": "https://github.com/phpDocumentor/ReflectionDocBlock/issues",
"source": "https://github.com/phpDocumentor/ReflectionDocBlock/tree/master"
"source": "https://github.com/phpDocumentor/ReflectionDocBlock/tree/5.3.0"
},
"time": "2020-09-03T19:13:55+00:00"
"time": "2021-10-19T17:43:47+00:00"
},
{
"name": "phpdocumentor/type-resolver",

Loading…
Cancel
Save