Browse Source

Finish high-level property parsing

Still to come:
- Implied property parsing
- Value class pattern
- Peculiar backcompat properties
- URL normalization in general
- URL normalization in e- properties
master
J. King 1 year ago
parent
commit
33bd592645
  1. 3
      composer.json
  2. 234
      composer.lock
  3. 79
      lib/Parser.php
  4. 12
      vendor-bin/phpunit/composer.lock
  5. 12
      vendor-bin/robo/composer.lock

3
composer.json

@ -15,7 +15,8 @@
"require": {
"php": ">=7.1",
"ext-json": "*",
"ext-dom": "*"
"ext-dom": "*",
"mensbeam/html-parser": "1.*"
},
"require-dev": {
"bamarni/composer-bin-plugin": "*"

234
composer.lock

@ -4,8 +4,238 @@
"Read more about it at https://getcomposer.org/doc/01-basic-usage.md#installing-dependencies",
"This file is @generated automatically"
],
"content-hash": "b0a58ccd5873863e4784641715d18209",
"packages": [],
"content-hash": "aab10199e9e50dbce6d862a256a96dc7",
"packages": [
{
"name": "mensbeam/html-parser",
"version": "1.3.1",
"source": {
"type": "git",
"url": "https://github.com/mensbeam/HTML-Parser.git",
"reference": "2b8a31ce472190013faab710f6f7ad41a486a740"
},
"dist": {
"type": "zip",
"url": "https://api.github.com/repos/mensbeam/HTML-Parser/zipball/2b8a31ce472190013faab710f6f7ad41a486a740",
"reference": "2b8a31ce472190013faab710f6f7ad41a486a740",
"shasum": ""
},
"require": {
"ext-dom": "*",
"mensbeam/intl": ">=0.9.1",
"mensbeam/mimesniff": ">=0.2.0",
"php": ">=7.1"
},
"require-dev": {
"bamarni/composer-bin-plugin": "^1.3"
},
"suggest": {
"ext-ctype": "Improved performance"
},
"type": "library",
"autoload": {
"files": [
"lib/Parser/ctype.php"
],
"psr-4": {
"MensBeam\\HTML\\": [
"lib/"
]
},
"classmap": [
"lib/Parser/Token.php"
]
},
"notification-url": "https://packagist.org/downloads/",
"license": [
"MIT"
],
"authors": [
{
"name": "Dustin Wilson",
"email": "dustin@dustinwilson.com",
"homepage": "https://dustinwilson.com/"
},
{
"name": "J. King",
"email": "jking@jkingweb.ca",
"homepage": "https://jkingweb.ca/"
}
],
"description": "Parser and serializer for modern HTML documents",
"keywords": [
"HTML5",
"WHATWG",
"dom",
"html",
"parser",
"parsing"
],
"support": {
"source": "https://github.com/mensbeam/HTML-Parser/tree/1.3.1"
},
"time": "2023-04-20T19:55:47+00:00"
},
{
"name": "mensbeam/intl",
"version": "0.9.2",
"source": {
"type": "git",
"url": "https://github.com/mensbeam/intl.git",
"reference": "88dbf8398ab69e71164ac073f9ec011be2baa4ae"
},
"dist": {
"type": "zip",
"url": "https://api.github.com/repos/mensbeam/intl/zipball/88dbf8398ab69e71164ac073f9ec011be2baa4ae",
"reference": "88dbf8398ab69e71164ac073f9ec011be2baa4ae",
"shasum": ""
},
"require": {
"php": ">=7.1"
},
"require-dev": {
"bamarni/composer-bin-plugin": "*",
"ext-intl": "*"
},
"type": "library",
"autoload": {
"psr-4": {
"MensBeam\\Intl\\": "lib/"
}
},
"notification-url": "https://packagist.org/downloads/",
"license": [
"MIT"
],
"authors": [
{
"name": "J. King",
"email": "jking@jkingweb.ca",
"homepage": "https://jkingweb.ca/"
}
],
"description": "A set of dependency-free basic internationalization tools",
"keywords": [
"WHATWG",
"charset",
"encoding",
"internationalization",
"intl",
"unicode",
"utf-8",
"utf8"
],
"support": {
"issues": "https://github.com/mensbeam/intl/issues",
"source": "https://github.com/mensbeam/intl/tree/0.9.2"
},
"time": "2023-01-25T22:12:58+00:00"
},
{
"name": "mensbeam/mimesniff",
"version": "0.2.1",
"source": {
"type": "git",
"url": "https://github.com/mensbeam/mime.git",
"reference": "c19be2496ab1e27fbf9c3483c2a9faa2781796cd"
},
"dist": {
"type": "zip",
"url": "https://api.github.com/repos/mensbeam/mime/zipball/c19be2496ab1e27fbf9c3483c2a9faa2781796cd",
"reference": "c19be2496ab1e27fbf9c3483c2a9faa2781796cd",
"shasum": ""
},
"require": {
"php": ">=7.1",
"psr/http-message": "^1.0"
},
"require-dev": {
"bamarni/composer-bin-plugin": "^1.3",
"ext-intl": "*"
},
"type": "library",
"autoload": {
"psr-4": {
"MensBeam\\Mime\\": "lib/"
}
},
"notification-url": "https://packagist.org/downloads/",
"license": [
"MIT"
],
"authors": [
{
"name": "J. King",
"email": "jking@jkingweb.ca",
"homepage": "https://jkingweb.ca/"
}
],
"description": "An implementation of the WHATWG MIME Sniffing specification",
"keywords": [
"WHATWG",
"mime",
"mimesniff"
],
"support": {
"issues": "https://github.com/mensbeam/mime/issues",
"source": "https://github.com/mensbeam/mime/tree/0.2.1"
},
"time": "2021-03-07T03:58:00+00:00"
},
{
"name": "psr/http-message",
"version": "1.1",
"source": {
"type": "git",
"url": "https://github.com/php-fig/http-message.git",
"reference": "cb6ce4845ce34a8ad9e68117c10ee90a29919eba"
},
"dist": {
"type": "zip",
"url": "https://api.github.com/repos/php-fig/http-message/zipball/cb6ce4845ce34a8ad9e68117c10ee90a29919eba",
"reference": "cb6ce4845ce34a8ad9e68117c10ee90a29919eba",
"shasum": ""
},
"require": {
"php": "^7.2 || ^8.0"
},
"type": "library",
"extra": {
"branch-alias": {
"dev-master": "1.1.x-dev"
}
},
"autoload": {
"psr-4": {
"Psr\\Http\\Message\\": "src/"
}
},
"notification-url": "https://packagist.org/downloads/",
"license": [
"MIT"
],
"authors": [
{
"name": "PHP-FIG",
"homepage": "http://www.php-fig.org/"
}
],
"description": "Common interface for HTTP messages",
"homepage": "https://github.com/php-fig/http-message",
"keywords": [
"http",
"http-message",
"psr",
"psr-7",
"request",
"response"
],
"support": {
"source": "https://github.com/php-fig/http-message/tree/1.1"
},
"time": "2023-04-04T09:50:52+00:00"
}
],
"packages-dev": [
{
"name": "bamarni/composer-bin-plugin",

79
lib/Parser.php

@ -7,6 +7,8 @@ declare(strict_types=1);
namespace MensBeam\Microformats;
use MensBeam\HTML\Parser\Serializer;
class Parser {
protected const BACKCOMPAT_ROOTS = [
'adr' => "h-adr",
@ -275,49 +277,100 @@ class Parser {
# else if img.p-x[alt] or area.p-x[alt], then return the alt attribute
return $node->getAttribute("alt");
} else {
# else return the textContent of the element after
# else return the textContent of the element after [cleaning]
return $this->getCleanText($node, $prefix);
}
case "u":
# To parse an element for a u-x property value (whether explicit u-* or backcompat equivalent):
if (in_array($node->localName, ["a", "area", "link"]) && $node->hasAttribute("href")) {
# if a.u-x[href] or area.u-x[href] or link.u-x[href], then get the href attribute
return $this->normalizeUrl($node->getAttribute("href"));
$url = $node->getAttribute("href");
} elseif ($node->localName === "img" && $node->hasAttribute("src")) {
# else if img.u-x[src] return the result of "parse an img element for src and alt" (see Sec.1.5)
return $this->parseImg($node);
} elseif (in_array($node->localName, ["audio", "video", "source", "iframe"]) && $node->hasAttribute("src")) {
# else if audio.u-x[src] or video.u-x[src] or source.u-x[src] or iframe.u-x[src], then get the src attribute
return $this->normalizeUrl($node->getAttribute("src"));
$url = $node->getAttribute("src");
} elseif ($node->localName === "video" && $node->hasAttribute("poster")) {
# else if video.u-x[poster], then get the poster attribute
return $this->normalizeUrl($node->getAttribute("href"));
$url = $node->getAttribute("href");
} elseif ($node->localName === "object" && $node->hasAttribute("data")) {
# else if object.u-x[data], then get the data attribute
return $this->normalizeUrl($node->getAttribute("data"));
$url = $node->getAttribute("data");
} elseif ($url = $this->getValueClassPattern($node)) {
# else parse the element for the Value Class Pattern. If a value is found, get it
return $this->normalizeUrl($url);
// Nothing to do in this branch
} elseif ($node->localName === "abbr" && $node->hasAttribute("title")) {
# else if abbr.u-x[title], then get the title attribute
return $this->normalizeUrl($node->getAttribute("title"));
$url = $node->getAttribute("title");
} elseif (in_array($node->localName, ["data", "input"]) && $node->hasAttribute("value")) {
# else if data.u-x[value] or input.u-x[value], then get the value attribute
return $this->normalizeUrl($node->getAttribute("value"));
$url = $node->getAttribute("value");
} else {
return $this->getCleanText($node, $prefix);
# else get the textContent of the element after removing all leading/trailing spaces and nested <script> & <style> elements
$url = $this->getCleanText($node, $prefix);
}
# return the normalized absolute URL of the gotten value,
# following the containing document's language's rules for
# resolving relative URLs (e.g. in HTML, use the current URL
# context as determined by the page, and first <base>
# element, if any).
return $this->normalizeUrl($url);
case "dt":
// TODO
break;
# To parse an element for a dt-x property value (whether explicit dt-* or backcompat equivalent):
if ($date = $this->getValueClassPattern($node)) {
# parse the element for the Value Class Pattern, including the date and time parsing rules. If a value is found, then return it.
return $date;
} elseif (in_array($node->localName, ["time", "ins", "del"]) && $node->hasAttribute("datetime")) {
# if time.dt-x[datetime] or ins.dt-x[datetime] or del.dt-x[datetime], then return the datetime attribute
return $node->getAttribute("datetime");
} elseif ($node->localName === "abbr" && $node->hasAttribute("title")) {
# else if abbr.dt-x[title], then return the title attribute
return $node->getAttribute("title");
} elseif (in_array($node->localName, ["data", "input"]) && $node->hasAttribute("value")) {
# else if data.dt-x[value] or input.dt-x[value], then return the value attribute
return $node->getAttribute("value");
} else {
# else return the textContent of the element after removing all leading/trailing spaces and nested <script> & <style> elements.
return $this->getCleanText($node, $prefix);
}
case "e":
//TODO
break;
# To parse an element for a e-x property value (whether explicit "e-*" or backcompat equivalent):
# return a dictionary with two keys:
# html: the innerHTML of the element by using the HTML spec:
# Serializing HTML Fragments algorithm, with
# leading/trailing spaces removed. Proposed: and normalized
# absolute URLs in all URL attributes except those that are
# fragment-only, e.g. start with '#'.(issue 38)
# value: the textContent of the element after [cleaning]
$copy = $node->cloneNode(true);
// TODO: normalize URLs
return [
'html' => trim(Serializer::serializeInner($copy)),
'value' => $this->getCleanText($node, $prefix),
];
default:
throw new \Exception("Unimplemented prefix $prefix");
}
}
/**
 * Parses an element for its "src" and "alt" attributes.
 *
 * The "src" attribute is always passed through normalizeUrl(). When the
 * element is an <img> carrying an "alt" attribute, the result is a
 * two-key structure; otherwise only the processed "src" is returned.
 *
 * @param \DOMElement $node The element to examine
 * @return mixed Either ['value' => ..., 'alt' => string] or the bare result of normalizeUrl()
 */
protected function parseImg(\DOMElement $node) {
    # To parse an img element for src and alt attributes:
    // Both outcomes need the src attribute as a normalized absolute URL, so it is computed once up front
    $src = $this->normalizeUrl($node->getAttribute("src"));
    $isImgWithAlt = $node->localName === "img" && $node->hasAttribute("alt");
    if (!$isImgWithAlt) {
        # else return the element's src attribute as a normalized absolute URL
        return $src;
    }
    # if img[alt]
    #   return a new {} structure with
    #     value: the element's src attribute as a normalized absolute URL
    #     alt: the element's alt attribute
    // The alt text has surrounding whitespace trimmed before being returned
    $alt = trim($node->getAttribute("alt"));
    return [
        'value' => $src,
        'alt'   => $alt,
    ];
}
protected function getCleanText(\DOMElement $node, string $prefix): string {
$copy = $node->cloneNode(true);
foreach ($copy->getElementsByTagName("script") as $e) {

12
vendor-bin/phpunit/composer.lock

@ -623,16 +623,16 @@
},
{
"name": "phpunit/phpunit",
"version": "9.6.8",
"version": "9.6.9",
"source": {
"type": "git",
"url": "https://github.com/sebastianbergmann/phpunit.git",
"reference": "17d621b3aff84d0c8b62539e269e87d8d5baa76e"
"reference": "a9aceaf20a682aeacf28d582654a1670d8826778"
},
"dist": {
"type": "zip",
"url": "https://api.github.com/repos/sebastianbergmann/phpunit/zipball/17d621b3aff84d0c8b62539e269e87d8d5baa76e",
"reference": "17d621b3aff84d0c8b62539e269e87d8d5baa76e",
"url": "https://api.github.com/repos/sebastianbergmann/phpunit/zipball/a9aceaf20a682aeacf28d582654a1670d8826778",
"reference": "a9aceaf20a682aeacf28d582654a1670d8826778",
"shasum": ""
},
"require": {
@ -706,7 +706,7 @@
"support": {
"issues": "https://github.com/sebastianbergmann/phpunit/issues",
"security": "https://github.com/sebastianbergmann/phpunit/security/policy",
"source": "https://github.com/sebastianbergmann/phpunit/tree/9.6.8"
"source": "https://github.com/sebastianbergmann/phpunit/tree/9.6.9"
},
"funding": [
{
@ -722,7 +722,7 @@
"type": "tidelift"
}
],
"time": "2023-05-11T05:14:45+00:00"
"time": "2023-06-11T06:13:56+00:00"
},
{
"name": "sebastian/cli-parser",

12
vendor-bin/robo/composer.lock

@ -411,16 +411,16 @@
},
{
"name": "consolidation/self-update",
"version": "2.1.0",
"version": "2.2.0",
"source": {
"type": "git",
"url": "https://github.com/consolidation/self-update.git",
"reference": "714b09fdf0513f83292874bb12de0566066040c2"
"reference": "972a1016761c9b63314e040836a12795dff6953a"
},
"dist": {
"type": "zip",
"url": "https://api.github.com/repos/consolidation/self-update/zipball/714b09fdf0513f83292874bb12de0566066040c2",
"reference": "714b09fdf0513f83292874bb12de0566066040c2",
"url": "https://api.github.com/repos/consolidation/self-update/zipball/972a1016761c9b63314e040836a12795dff6953a",
"reference": "972a1016761c9b63314e040836a12795dff6953a",
"shasum": ""
},
"require": {
@ -460,9 +460,9 @@
"description": "Provides a self:update command for Symfony Console applications.",
"support": {
"issues": "https://github.com/consolidation/self-update/issues",
"source": "https://github.com/consolidation/self-update/tree/2.1.0"
"source": "https://github.com/consolidation/self-update/tree/2.2.0"
},
"time": "2023-02-21T19:33:55+00:00"
"time": "2023-03-18T01:37:41+00:00"
},
{
"name": "dflydev/dot-access-data",

Loading…
Cancel
Save