Microformats/lib/Parser.php

463 lines
No EOL
24 KiB
PHP

<?php
/** @license MIT
* Copyright 2023 J. King
* See LICENSE and AUTHORS files for details */
declare(strict_types=1);
namespace MensBeam\Microformats;
use MensBeam\HTML\Parser\Serializer;
class Parser {
    /** Maps backcompat (classic microformats) root class names to their microformats2 root types */
    protected const BACKCOMPAT_ROOTS = [
        'adr'               => "h-adr",
        'vcard'             => "h-card",
        'hfeed'             => "h-feed",
        'hentry'            => "h-entry",
        'vevent'            => "h-event",
        'geo'               => "h-geo",
        'hproduct'          => "h-product",
        'hrecipe'           => "h-recipe",
        'hresume'           => "h-resume",
        'hreview'           => "h-review",
        'hreview-aggregate' => "h-review-aggregate",
    ];
    /** Maps backcompat property class names to microformats2 properties
     *
     * The outer key is the backcompat class name; the inner key is the
     * microformats2 root type in whose context the mapping applies. Each
     * mapping is a list of:
     *
     * 0. the property prefix ("p", "u", "dt", or "e")
     * 1. the microformats2 property name (without prefix)
     * 2. (optional) backcompat root class names implied on the property element
     */
    protected const BACKCOMPAT_PROPERTIES = [
        'additional-name'   => ['h-card' => ["p", "additional-name"]],
        'adr'               => ['h-card' => ["p", "adr"]],
        'affiliation'       => ['h-resume' => ["p", "affiliation", ["vcard"]]],
        'author'            => ['h-entry' => ["p", "author", ["vcard"]], 'h-recipe' => ["p", "author", ["vcard"]]],
        'bday'              => ['h-card' => ["dt", "bday"]],
        'best'              => ['h-review' => ["p", "best"], 'h-review-aggregate' => ["p", "best"]],
        'brand'             => ['h-product' => ["p", "brand"]],
        'category'          => ['h-card' => ["p", "category"], 'h-entry' => ["p", "category"], 'h-event' => ["p", "category"], 'h-product' => ["p", "category"]],
        'contact'           => ['h-resume' => ["p", "contact", ["vcard"]]],
        'count'             => ['h-review-aggregate' => ["p", "count"]],
        'country-name'      => ['h-adr' => ["p", "country-name"], 'h-card' => ["p", "country-name"]],
        'description'       => ['h-event' => ["p", "description"], 'h-product' => ["p", "description"], 'h-review' => ["e", "description"]],
        'dtend'             => ['h-event' => ["dt", "end"]],
        'dtreviewed'        => ['h-review' => ["dt", "reviewed"]],
        'dtstart'           => ['h-event' => ["dt", "start"]],
        'duration'          => ['h-event' => ["dt", "duration"], 'h-recipe' => ["dt", "duration"]],
        'education'         => ['h-resume' => ["p", "education", ["vevent"]]],
        'email'             => ['h-card' => ["u", "email"]],
        'entry-content'     => ['h-entry' => ["e", "content"]],
        'entry-summary'     => ['h-entry' => ["p", "summary"]],
        'entry-title'       => ['h-entry' => ["p", "name"]],
        'experience'        => ['h-resume' => ["p", "experience", ["vevent"]]],
        'extended-address'  => ['h-adr' => ["p", "extended-address"], 'h-card' => ["p", "extended-address"]],
        'family-name'       => ['h-card' => ["p", "family-name"]],
        'fn'                => ['h-card' => ["p", "name"], 'h-product' => ["p", "name"], 'h-recipe' => ["p", "name"]],
        'geo'               => ['h-card' => ["p", "geo"], 'h-event' => ["p", "geo"]],
        'given-name'        => ['h-card' => ["p", "given-name"]],
        'honorific-prefix'  => ['h-card' => ["p", "honorific-prefix"]],
        'honorific-suffix'  => ['h-card' => ["p", "honorific-suffix"]],
        'identifier'        => ['h-product' => ["u", "identifier"]],
        'ingredient'        => ['h-recipe' => ["p", "ingredient"]],
        'instructions'      => ['h-recipe' => ["e", "instructions"]],
        'key'               => ['h-card' => ["u", "key"]],
        'label'             => ['h-card' => ["p", "label"]],
        'latitude'          => ['h-card' => ["p", "latitude"], 'h-event' => ["p", "latitude"], 'h-geo' => ["p", "latitude"]],
        'locality'          => ['h-adr' => ["p", "locality"], 'h-card' => ["p", "locality"]],
        'location'          => ['h-event' => ["p", "location", ["adr", "vcard"]]],
        'logo'              => ['h-card' => ["u", "logo"]],
        'longitude'         => ['h-card' => ["p", "longitude"], 'h-event' => ["p", "longitude"], 'h-geo' => ["p", "longitude"]],
        'nickname'          => ['h-card' => ["p", "nickname"]],
        'note'              => ['h-card' => ["p", "note"]],
        'nutrition'         => ['h-recipe' => ["p", "nutrition"]],
        'organization-name' => ['h-card' => ["p", "organization-name"]],
        'organization-unit' => ['h-card' => ["p", "organization-unit"]],
        'org'               => ['h-card' => ["p", "org"]],
        'photo'             => ['h-card' => ["u", "photo"], 'h-product' => ["u", "photo"], 'h-recipe' => ["u", "photo"]],
        'postal-code'       => ['h-adr' => ["p", "postal-code"], 'h-card' => ["p", "postal-code"]],
        'post-office-box'   => ['h-adr' => ["p", "post-office-box"], 'h-card' => ["p", "post-office-box"]],
        'price'             => ['h-product' => ["p", "price"]],
        'published'         => ['h-entry' => ["dt", "published"], 'h-recipe' => ["dt", "published"]],
        'rating'            => ['h-review' => ["p", "rating"], 'h-review-aggregate' => ["p", "rating"]],
        'region'            => ['h-adr' => ["p", "region"], 'h-card' => ["p", "region"]],
        'rev'               => ['h-card' => ["dt", "rev"]],
        'reviewer'          => ['h-review' => ["p", "reviewer"]],
        'review'            => ['h-product' => ["p", "review", ["hreview"]]],
        'role'              => ['h-card' => ["p", "role"]],
        'skill'             => ['h-resume' => ["p", "skill"]],
        'street-address'    => ['h-adr' => ["p", "street-address"], 'h-card' => ["p", "street-address"]],
        'summary'           => ['h-event' => ["p", "name"], 'h-recipe' => ["p", "summary"], 'h-resume' => ["p", "summary"], 'h-review' => ["p", "name"], 'h-review-aggregate' => ["p", "name"]],
        'tel'               => ['h-card' => ["p", "tel"]],
        'title'             => ['h-card' => ["p", "job-title"]],
        'tz'                => ['h-card' => ["p", "tz"]],
        'uid'               => ['h-card' => ["u", "uid"]],
        'updated'           => ['h-entry' => ["dt", "updated"]],
        // NOTE: this must be a single entry; a repeated array key silently discards the earlier mappings
        'url'               => ['h-card' => ["u", "url"], 'h-event' => ["u", "url"], 'h-product' => ["u", "url"]],
        'votes'             => ['h-review-aggregate' => ["p", "votes"]],
        'worst'             => ['h-review' => ["p", "worst"], 'h-review-aggregate' => ["p", "worst"]],
        'yield'             => ['h-recipe' => ["p", "yield"]],
    ];
    /** Lists of attributes containing URLs, keyed by element name; the empty key lists attributes not tied to one element
     *
     * NOTE(review): not referenced by the code in this class yet; presumably
     * intended for the pending URL normalization of e-* property HTML
     */
    protected const URL_ATTRS = [
        ''           => ["itemid", "itemprop", "itemtype"],
        'a'          => ["href", "ping"],
        'area'       => ["href", "ping"],
        'audio'      => ["src"],
        'base'       => ["href"],
        'blockquote' => ["cite"],
        'button'     => ["formaction"],
        'del'        => ["cite"],
        'embed'      => ["src"],
        'form'       => ["action"],
        'iframe'     => ["src"],
        'img'        => ["src"],
        'input'      => ["formaction", "src"],
        'ins'        => ["cite"],
        'link'       => ["href"],
        'object'     => ["data"],
        'q'          => ["cite"],
        'script'     => ["src"],
        'source'     => ["src"],
        'track'      => ["src"],
        'video'      => ["poster", "src"],
    ];

    /** @var string The base URL in effect for the parse currently underway */
    protected $baseUrl;

    /** Parses a DOMElement for microformats
     *
     * @param \DOMElement $node The DOMElement to parse
     * @param string $baseUrl The base URL against which to resolve relative URLs in the output
     * @return array An array with the keys "items", "rels", and "rel-urls"
     */
    public function parseNode(\DOMElement $node, string $baseUrl = ""): array {
        $root = $node;
        // Perform HTML base-URL resolution; the supplied URL is stored first
        //   so that it is already in place when the <base> href is normalized
        $this->baseUrl = $baseUrl;
        $this->baseUrl = $this->getBaseUrl($root, $baseUrl);
        # start with an empty JSON "items" array and "rels" & "rel-urls" hashes:
        $out = [
            'items'    => [],
            'rels'     => [],
            'rel-urls' => [],
        ];
        # parse the root element for class microformats, adding to the JSON items array accordingly
        while ($node) {
            # parse element class for root class name(s) "h-*" and if none, backcompat root classes
            # if found, start parsing a new microformat
            $classes = $this->parseClasses($node);
            if ($types = $this->matchRootsMf2($classes)) {
                $out['items'][] = $this->parseMicroformat($node, $types, false);
            } elseif ($types = $this->matchRootsBackcompat($classes)) {
                $out['items'][] = $this->parseMicroformat($node, $types, true);
            } else {
                # if none found, parse child elements for microformats (depth first, doc order)
                $node = $this->nextElement($node, $root, true);
                continue;
            }
            // continue to the next element, passing over children (they have already been examined)
            $node = $this->nextElement($node, $root, false);
        }
        // TODO: clean up instance properties
        return $out;
    }

    /** Splits an element's "class" attribute into a list of unique class names
     *
     * @param \DOMElement $node The element whose classes are to be listed
     * @return array The unique class names; empty if the attribute is missing or blank
     */
    protected function parseClasses(\DOMElement $node): array {
        $attr = trim($node->getAttribute("class"), " \r\n\t\f");
        if ($attr === "") {
            return [];
        }
        return array_unique(preg_split("/[ \r\n\t\f]+/sS", $attr));
    }

    /** Filters a list of class names down to those which are microformats2 root ("h-*") class names
     *
     * @param array $classes The class names to examine
     * @return array The matching class names, with the keys of the input preserved
     */
    protected function matchRootsMf2(array $classes): array {
        return array_filter($classes, static function($c) {
            # The "*" for root (and property) class names consists of an
            # optional vendor prefix (series of 1+ number or lowercase
            # a-z characters i.e. [0-9a-z]+, followed by '-'), then one
            # or more '-' separated lowercase a-z words.
            // exclude Tailwind classes https://tailwindcss.com/docs/height
            return preg_match('/^h(?:-[a-z0-9]+)?(?:-[a-z]+)+$/S', $c) && !preg_match('/^h-(?:px|auto|full|screen|min|max|fit)$/S', $c);
        });
    }

    /** Translates any backcompat root class names in a list of class names to microformats2 root types
     *
     * @param array $classes The class names to examine
     * @return array The microformats2 root types corresponding to the backcompat roots found
     */
    protected function matchRootsBackcompat(array $classes): array {
        $out = [];
        foreach ($classes as $c) {
            if ($compat = self::BACKCOMPAT_ROOTS[$c] ?? null) {
                $out[] = $compat;
            }
        }
        return $out;
    }

    /** Parses a microformat root element into a microformat structure
     *
     * @param \DOMElement $root The element bearing the root class name(s)
     * @param array $types The microformats2 root types of the element
     * @param bool $backcompat Whether the root types were derived from backcompat class names
     * @return array A structure with "type" and "properties" keys, plus "id" and "children" keys where applicable
     */
    protected function parseMicroformat(\DOMElement $root, array $types, bool $backcompat): array {
        # keep track of whether the root class name(s) was from backcompat
        // this is a parameter to this function
        # create a new { } structure
        $out = [
            # type: [array of unique microformat "h-*" type(s) on the element sorted alphabetically]
            // NOTE: sorting will be done below; uniqueness was already computed when classes were parsed
            'type' => $types,
            # properties: { } - to be filled in when that element itself is parsed for microformats properties
            'properties' => [],
            # if the element has a non-empty id attribute:
            # id: string value of element's id attribute
            // Added below
        ];
        sort($out['type']);
        if (strlen($id = $root->getAttribute("id"))) {
            $out['id'] = $id;
        }
        # parse child elements (document order) by:
        $node = $root;
        // Whether the children of the current element should be visited next;
        //   this is false right after a nested microformat, which has examined its own descendants
        $descend = true;
        while ($node = $this->nextElement($node, $root, $descend)) {
            $descend = true;
            $classes = $this->parseClasses($node);
            if ($backcompat) {
                # if parsing a backcompat root, parse child element class name(s) for backcompat properties
                $properties = $this->parsePropertiesBackcompat($node, $classes, $types);
            } else {
                # else parse a child element class for property class name(s) "p-*,u-*,dt-*,e-*"
                $properties = $this->parsePropertiesMf2($node, $classes);
            }
            # if such class(es) are found, it is a property element
            # add properties found to current microformat's properties: { } structure
            foreach ($properties as $k => $v) {
                if (!isset($out['properties'][$k])) {
                    $out['properties'][$k] = [];
                }
                array_push($out['properties'][$k], ...$v);
            }
            # parse a child element for microformats (recurse)
            // NOTE: a separate variable is used for the child's types so that
            //   the types of this microformat remain intact for later iterations
            $child = null;
            if ($childTypes = $this->matchRootsMf2($classes)) {
                $child = $this->parseMicroformat($node, $childTypes, false);
            } elseif ($childTypes = $this->matchRootsBackcompat($classes)) {
                $child = $this->parseMicroformat($node, $childTypes, true);
            }
            if ($child !== null) {
                $descend = false;
                if (!$properties) {
                    # else add it to a children array of the parent microformat
                    $out['children'][] = $child;
                }
                // TODO: when the child is also a property element its structure
                //   should become the value of the property instead
            }
        }
        return $out;
    }

    /** Parses an element's class names for microformats2 properties
     *
     * @param \DOMElement $node The element to parse property values from
     * @param array $classes The class names of the element
     * @return array Lists of parsed values keyed by property name (without prefix)
     */
    protected function parsePropertiesMf2(\DOMElement $node, array $classes): array {
        $out = [];
        foreach ($classes as $c) {
            # The "*" for root (and property) class names consists of an
            # optional vendor prefix (series of 1+ number or lowercase
            # a-z characters i.e. [0-9a-z]+, followed by '-'), then one
            # or more '-' separated lowercase a-z words.
            if (!preg_match('/^(p|u|dt|e)((?:-[a-z0-9]+)?(?:-[a-z]+)+)$/S', $c, $match)) {
                continue;
            }
            $prefix = $match[1];
            // drop the hyphen which separates the prefix from the name
            $key = substr($match[2], 1);
            if (!isset($out[$key])) {
                $out[$key] = [];
            }
            $out[$key][] = $this->parseProperty($node, $prefix);
        }
        return $out;
    }

    /** Parses an element's class names for backcompat properties
     *
     * @param \DOMElement $node The element to parse property values from
     * @param array $classes The class names of the element; any backcompat root class names implied by a matched property are appended to this list
     * @param array $types The microformats2 root types of the containing microformat
     * @return array Lists of parsed values keyed by microformats2 property name
     */
    protected function parsePropertiesBackcompat(\DOMElement $node, array &$classes, array $types): array {
        $out = [];
        foreach ($classes as $c) {
            foreach ($types as $t) {
                $map = static::BACKCOMPAT_PROPERTIES[$c][$t] ?? null;
                if (!$map) {
                    // TODO : handle special mapped properties
                    continue;
                }
                $prefix = $map[0];
                $key = $map[1];
                $roots = $map[2] ?? [];
                if (!isset($out[$key])) {
                    $out[$key] = [];
                }
                $out[$key][] = $this->parseProperty($node, $prefix);
                foreach ($roots as $r) {
                    if (!in_array($r, $classes, true)) {
                        $classes[] = $r;
                    }
                }
            }
        }
        // TODO: handle link relations
        return $out;
    }

    /** Parses the value of a property from an element
     *
     * @param \DOMElement $node The property element
     * @param string $prefix The property prefix: "p", "u", "dt", or "e"
     * @return string|array A string, or an array for e-* properties and for u-* properties on images with alternative text
     * @throws \Exception if the prefix is not one of the above
     */
    protected function parseProperty(\DOMElement $node, string $prefix) {
        switch ($prefix) {
            case "p":
                # To parse an element for a p-x property value (whether explicit p-* or backcompat equivalent):
                if ($text = $this->getValueClassPattern($node)) {
                    # Parse the element for the Value Class Pattern. If a value is found, return it.
                    return $text;
                } elseif (in_array($node->localName, ["abbr", "link"], true) && $node->hasAttribute("title")) {
                    # If abbr.p-x[title] or link.p-x[title], return the title attribute.
                    return $node->getAttribute("title");
                } elseif (in_array($node->localName, ["data", "input"], true) && $node->hasAttribute("value")) {
                    # else if data.p-x[value] or input.p-x[value], then return the value attribute
                    return $node->getAttribute("value");
                } elseif (in_array($node->localName, ["img", "area"], true) && $node->hasAttribute("alt")) {
                    # else if img.p-x[alt] or area.p-x[alt], then return the alt attribute
                    return $node->getAttribute("alt");
                } else {
                    # else return the textContent of the element after [cleaning]
                    return $this->getCleanText($node, $prefix);
                }
            case "u":
                # To parse an element for a u-x property value (whether explicit u-* or backcompat equivalent):
                if (in_array($node->localName, ["a", "area", "link"], true) && $node->hasAttribute("href")) {
                    # if a.u-x[href] or area.u-x[href] or link.u-x[href], then get the href attribute
                    $url = $node->getAttribute("href");
                } elseif ($node->localName === "img" && $node->hasAttribute("src")) {
                    # else if img.u-x[src] return the result of "parse an img element for src and alt" (see Sec.1.5)
                    return $this->parseImg($node);
                } elseif (in_array($node->localName, ["audio", "video", "source", "iframe"], true) && $node->hasAttribute("src")) {
                    # else if audio.u-x[src] or video.u-x[src] or source.u-x[src] or iframe.u-x[src], then get the src attribute
                    $url = $node->getAttribute("src");
                } elseif ($node->localName === "video" && $node->hasAttribute("poster")) {
                    # else if video.u-x[poster], then get the poster attribute
                    $url = $node->getAttribute("poster");
                } elseif ($node->localName === "object" && $node->hasAttribute("data")) {
                    # else if object.u-x[data], then get the data attribute
                    $url = $node->getAttribute("data");
                } elseif ($url = $this->getValueClassPattern($node)) {
                    # else parse the element for the Value Class Pattern. If a value is found, get it
                    // Nothing to do in this branch
                } elseif ($node->localName === "abbr" && $node->hasAttribute("title")) {
                    # else if abbr.u-x[title], then get the title attribute
                    $url = $node->getAttribute("title");
                } elseif (in_array($node->localName, ["data", "input"], true) && $node->hasAttribute("value")) {
                    # else if data.u-x[value] or input.u-x[value], then get the value attribute
                    $url = $node->getAttribute("value");
                } else {
                    # else get the textContent of the element after removing all leading/trailing spaces and nested <script> & <style> elements
                    $url = $this->getCleanText($node, $prefix);
                }
                # return the normalized absolute URL of the gotten value,
                # following the containing document's language's rules for
                # resolving relative URLs (e.g. in HTML, use the current URL
                # context as determined by the page, and first <base>
                # element, if any).
                return $this->normalizeUrl($url);
            case "dt":
                # To parse an element for a dt-x property value (whether explicit dt-* or backcompat equivalent):
                if ($date = $this->getValueClassPattern($node)) {
                    # parse the element for the Value Class Pattern, including the date and time parsing rules. If a value is found, then return it.
                    return $date;
                } elseif (in_array($node->localName, ["time", "ins", "del"], true) && $node->hasAttribute("datetime")) {
                    # if time.dt-x[datetime] or ins.dt-x[datetime] or del.dt-x[datetime], then return the datetime attribute
                    return $node->getAttribute("datetime");
                } elseif ($node->localName === "abbr" && $node->hasAttribute("title")) {
                    # else if abbr.dt-x[title], then return the title attribute
                    return $node->getAttribute("title");
                } elseif (in_array($node->localName, ["data", "input"], true) && $node->hasAttribute("value")) {
                    # else if data.dt-x[value] or input.dt-x[value], then return the value attribute
                    return $node->getAttribute("value");
                } else {
                    # else return the textContent of the element after removing all leading/trailing spaces and nested <script> & <style> elements.
                    return $this->getCleanText($node, $prefix);
                }
            case "e":
                # To parse an element for a e-x property value (whether explicit "e-*" or backcompat equivalent):
                # return a dictionary with two keys:
                # html: the innerHTML of the element by using the HTML spec:
                # Serializing HTML Fragments algorithm, with
                # leading/trailing spaces removed. Proposed: and normalized
                # absolute URLs in all URL attributes except those that are
                # fragment-only, e.g. start with '#'.(issue 38)
                # value: the textContent of the element after [cleaning]
                $copy = $node->cloneNode(true);
                // TODO: normalize URLs
                return [
                    'html'  => trim(Serializer::serializeInner($copy)),
                    'value' => $this->getCleanText($node, $prefix),
                ];
            default:
                throw new \Exception("Unimplemented prefix $prefix");
        }
    }

    /** Parses an element for the Value Class Pattern
     *
     * This is currently a stub which never finds a value.
     *
     * @param \DOMElement $node The property element to examine
     * @return string|null The value found, or null if there is none
     */
    protected function getValueClassPattern(\DOMElement $node) {
        // TODO: implement the Value Class Pattern
        return null;
    }

    /** Parses an image element for its source URL and alternative text
     *
     * @param \DOMElement $node The image element
     * @return string|array The normalized source URL alone, or an array with "value" and "alt" keys if the image has an alt attribute
     */
    protected function parseImg(\DOMElement $node) {
        # To parse an img element for src and alt attributes:
        $src = $this->normalizeUrl($node->getAttribute("src"));
        if ($node->localName === "img" && $node->hasAttribute("alt")) {
            # if img[alt]
            # return a new {} structure with
            return [
                # value: the element's src attribute as a normalized absolute URL
                'value' => $src,
                # alt: the element's alt attribute
                'alt' => trim($node->getAttribute("alt")),
            ];
        }
        # else return the element's src attribute as a normalized absolute URL
        return $src;
    }

    /** Normalizes a URL; this is currently a stub which returns its input unchanged
     *
     * @param string $url The URL to normalize
     */
    protected function normalizeUrl(string $url): string {
        // Stub
        return $url;
    }

    /** Retrieves the text content of an element, ignoring scripts and style sheets and substituting text for images
     *
     * @param \DOMElement $node The element whose text is to be retrieved; it is not modified
     * @param string $prefix The prefix of the property being parsed; for "u" images are replaced by their source URL, otherwise by their alternative text if non-empty and by their source URL failing that
     */
    protected function getCleanText(\DOMElement $node, string $prefix): string {
        $copy = $node->cloneNode(true);
        // NOTE: getElementsByTagName() returns a live list, and modifying the
        //   tree while iterating over it skips elements; a snapshot of each
        //   list is therefore taken before any changes are made
        foreach (["script", "style"] as $tag) {
            foreach (iterator_to_array($copy->getElementsByTagName($tag)) as $e) {
                $e->parentNode->removeChild($e);
            }
        }
        foreach (iterator_to_array($copy->getElementsByTagName("img")) as $e) {
            $alt = $e->getAttribute("alt");
            $src = $e->hasAttribute("src") ? $this->normalizeUrl($e->getAttribute("src")) : "";
            if ($prefix === "u") {
                $attr = $src;
            } else {
                $attr = strlen($alt) ? $alt : $src;
            }
            $e->parentNode->replaceChild($e->ownerDocument->createTextNode(" ".$attr." "), $e);
        }
        return trim($copy->textContent);
    }

    /** Determines the base URL of the document containing an element
     *
     * @param \DOMElement $root The element being parsed
     * @param string $base The fallback URL to use if the document does not specify a base URL
     * @return string The href of the first <base> element which has one, or the fallback URL
     */
    protected function getBaseUrl(\DOMElement $root, string $base): string {
        $doc = $root->ownerDocument;
        if ($doc === null) {
            return $base;
        }
        foreach ($doc->getElementsByTagName("base") as $el) {
            // a <base> element without an href attribute does not set the base URL
            if ($el->hasAttribute("href")) {
                return $this->normalizeUrl($el->getAttribute("href"));
            }
        }
        return $base;
    }

    /** Finds the next element in tree order after $node, if any
     *
     * @param \DOMElement $node The context node
     * @param \DOMElement $root The element to consider the contextual root of the tree; the search never leaves its subtree
     * @param bool $considerChildren Whether or not child nodes are valid next nodes; the children of <template> elements are never considered
     * @return \DOMElement|null The next element, or null if the subtree is exhausted
     */
    protected function nextElement(\DOMElement $node, \DOMElement $root, bool $considerChildren): ?\DOMElement {
        if ($considerChildren && $node->localName !== "template") {
            // find the first child which is an element, skipping over text, comments, etc
            for ($next = $node->firstChild; $next; $next = $next->nextSibling) {
                if ($next instanceof \DOMElement) {
                    return $next;
                }
            }
        }
        // Otherwise look for a following sibling element of the node or of
        //   its nearest ancestor which has one, stopping upon reaching the root
        $cur = $node;
        while ($cur && !$cur->isSameNode($root)) {
            for ($next = $cur->nextSibling; $next; $next = $next->nextSibling) {
                if ($next instanceof \DOMElement) {
                    return $next;
                }
            }
            $cur = $cur->parentNode;
        }
        return null;
    }
}