329 lines
No EOL
14 KiB
PHP
329 lines
No EOL
14 KiB
PHP
<?php
|
|
/** @license MIT
|
|
* Copyright 2023 J. King
|
|
* See LICENSE and AUTHORS files for details */
|
|
|
|
declare(strict_types=1);
|
|
|
|
namespace MensBeam\Microformats;
|
|
|
|
/** Parses microformats (microformats2 and classic "backcompat" formats) out of a DOM tree
 *
 * NOTE(review): this class is still incomplete. parseProperty() is called but
 * is not defined in this file, and the "rels" and "rel-urls" members of the
 * output are never populated.
 */
class Parser {
    /** @var array<string,string> Classic (backcompat) root class names mapped to their microformats2 root types */
    protected const BACKCOMPAT_ROOTS = [
        'adr'               => "h-adr",
        'vcard'             => "h-card",
        'hfeed'             => "h-feed",
        'hentry'            => "h-entry",
        'vevent'            => "h-event",
        'geo'               => "h-geo",
        'hproduct'          => "h-product",
        'hrecipe'           => "h-recipe",
        'hresume'           => "h-resume",
        'hreview'           => "h-review",
        'hreview-aggregate' => "h-review-aggregate",
    ];
    /** @var array<string,array<string,string[]>> For each microformats2 root type, the classic property
     * class names valid within it, each mapped to the list of class names which should be substituted
     * for it. A substitute list may include a classic root class name (e.g. "vcard") alongside the
     * property class name
     */
    protected const BACKCOMPAT_PROPERTIES = [
        'h-adr' => [
            'post-office-box'  => ["p-post-office-box"],
            'extended-address' => ["p-extended-address"],
            'street-address'   => ["p-street-address"],
            'locality'         => ["p-locality"],
            'region'           => ["p-region"],
            'postal-code'      => ["p-postal-code"],
            'country-name'     => ["p-country-name"],
        ],
        'h-card' => [
            'fn'                => ["p-name"],
            'honorific-prefix'  => ["p-honorific-prefix"],
            'given-name'        => ["p-given-name"],
            'additional-name'   => ["p-additional-name"],
            'family-name'       => ["p-family-name"],
            'honorific-suffix'  => ["p-honorific-suffix"],
            'nickname'          => ["p-nickname"],
            'email'             => ["u-email"],
            'logo'              => ["u-logo"],
            'photo'             => ["u-photo"],
            'url'               => ["u-url"],
            'uid'               => ["u-uid"],
            'category'          => ["p-category"],
            'adr'               => ["p-adr", "adr"],
            'extended-address'  => ["p-extended-address"],
            'street-address'    => ["p-street-address"],
            'locality'          => ["p-locality"],
            'region'            => ["p-region"],
            'postal-code'       => ["p-postal-code"],
            'country-name'      => ["p-country-name"],
            'label'             => ["p-label"],
            'geo'               => ["p-geo", "geo"],
            'latitude'          => ["p-latitude"],
            'longitude'         => ["p-longitude"],
            'tel'               => ["p-tel"],
            'note'              => ["p-note"],
            'bday'              => ["dt-bday"],
            'key'               => ["u-key"],
            'org'               => ["p-org"],
            'organization-name' => ["p-organization-name"],
            'organization-unit' => ["p-organization-unit"],
            'title'             => ["p-job-title"],
            'role'              => ["p-role"],
            'tz'                => ["p-tz"],
            'rev'               => ["dt-rev"],
        ],
        'h-feed' => [
            // NOTE: h-feed mapping requires special handling beyond what is listed here
        ],
        'h-entry' => [
            // NOTE: h-entry mapping requires special handling beyond what is listed here
            'entry-title'   => ["p-name"],
            'entry-summary' => ["p-summary"],
            'entry-content' => ["e-content"],
            'published'     => ["dt-published"],
            'updated'       => ["dt-updated"],
            'author'        => ["p-author", "vcard"],
            'category'      => ["p-category"],
        ],
        'h-event' => [
            'summary'     => ["p-name"],
            'dtstart'     => ["dt-start"],
            'dtend'       => ["dt-end"],
            'duration'    => ["dt-duration"],
            'description' => ["p-description"],
            'url'         => ["u-url"],
            'category'    => ["p-category"],
            'location'    => ["p-location", "vcard", "adr"],
            'geo'         => ["p-geo", "geo"],
            'latitude'    => ["p-latitude"],
            'longitude'   => ["p-longitude"],
        ],
        'h-geo' => [
            'latitude'  => ["p-latitude"],
            'longitude' => ["p-longitude"],
        ],
        'h-product' => [
            'fn'          => ["p-name"],
            'photo'       => ["u-photo"],
            'brand'       => ["p-brand"],
            'category'    => ["p-category"],
            'description' => ["p-description"],
            'identifier'  => ["u-identifier"],
            'url'         => ["u-url"],
            'review'      => ["p-review", "hreview"],
            'price'       => ["p-price"],
        ],
        'h-recipe' => [
            'fn'           => ["p-name"],
            'ingredient'   => ["p-ingredient"],
            'yield'        => ["p-yield"],
            'instructions' => ["e-instructions"],
            'duration'     => ["dt-duration"],
            'photo'        => ["u-photo"],
            'summary'      => ["p-summary"],
            'author'       => ["p-author", "vcard"],
            'nutrition'    => ["p-nutrition"],
            'published'    => ["dt-published"],
        ],
        'h-resume' => [
            'summary'     => ["p-summary"],
            'contact'     => ["p-contact", "vcard"],
            'education'   => ["p-education", "vevent"],
            'experience'  => ["p-experience", "vevent"],
            'skill'       => ["p-skill"],
            'affiliation' => ["p-affiliation", "vcard"],
        ],
        'h-review' => [
            // NOTE: h-review mapping requires special handling beyond what is listed here
            'summary'     => ["p-name"],
            //fn - parse as p-name of the item being reviewed (p-item h-item p-name)
            //photo - parse as u-photo of the item being reviewed (p-item h-item u-photo)
            //url - parse as u-url of the item being reviewed (p-item h-item u-url)
            'reviewer'    => ["p-reviewer", "vcard"],
            'dtreviewed'  => ["dt-reviewed"],
            'rating'      => ["p-rating"],
            'best'        => ["p-best"],
            'worst'       => ["p-worst"],
            'description' => ["e-description"],
        ],
        'h-review-aggregate' => [
            // NOTE: h-review-aggregate mapping requires special handling beyond what is listed here
            'summary' => ["p-name"],
            //fn - parse as p-name of the item being reviewed (p-item h-item p-name)
            //photo - parse as u-photo of the item being reviewed (p-item h-item u-photo)
            //url - parse as u-url of the item being reviewed (p-item h-item u-url)
            'rating'  => ["p-rating"],
            'best'    => ["p-best"],
            'worst'   => ["p-worst"],
            'count'   => ["p-count"],
            'votes'   => ["p-votes"],
        ],
    ];

    /** @var \DOMElement|null The element passed to the current (or most recent) parseNode() call */
    protected $rootNode;
    /** @var string|null The base URL passed to the current (or most recent) parseNode() call */
    protected $baseUrl;

    /** Parses a DOMElement for microformats
     *
     * The element itself and its descendants are examined in document order;
     * nothing outside the element is visited.
     *
     * @param \DOMElement $node The DOMElement to parse
     * @param string $baseUrl The base URL against which to resolve relative URLs in the output
     * @return array An array with the keys "items" (list of parsed microformat structures), "rels", and "rel-urls"
     */
    public function parseNode(\DOMElement $node, string $baseUrl = ""): array {
        $this->rootNode = $node;
        $this->baseUrl = $baseUrl;
        # start with an empty JSON "items" array and "rels" & "rel-urls" hashes:
        $out = [
            'items'    => [],
            'rels'     => [],
            'rel-urls' => [],
        ];
        # parse the root element for class microformats, adding to the JSON items array accordingly
        while ($node) {
            # parse element class for root class name(s) "h-*" and if none, backcompat root classes
            # if found, start parsing a new microformat
            $classes = $this->parseClasses($node);
            if ($types = $this->matchRootsMf2($classes)) {
                // top-level microformats belong in the "items" list, not beside it
                $out['items'][] = $this->parseMicroformat($node, $types, false);
            } elseif ($types = $this->matchRootsBackcompat($classes)) {
                $out['items'][] = $this->parseMicroformat($node, $types, true);
            } else {
                # if none found, parse child elements for microformats (depth first, doc order)
                $node = $this->nextElement($node, $this->rootNode, true);
                continue;
            }
            // continue to the next element, passing over children (they have already been examined)
            $node = $this->nextElement($node, $this->rootNode, false);
        }

        // TODO: clean up instance properties
        return $out;
    }

    /** Splits an element's "class" attribute into its unique class names
     *
     * Names are separated by runs of ASCII whitespace. Duplicates are removed
     * with array_unique(), so the keys of the result may have gaps.
     *
     * @param \DOMElement $node The element whose classes are to be listed
     * @return string[] The unique class names, in order of first appearance
     */
    protected function parseClasses(\DOMElement $node): array {
        $attr = trim($node->getAttribute("class"), " \r\n\t\f");
        if ($attr !== "") {
            return array_unique(preg_split("/[ \r\n\t\f]+/sS", $attr));
        } else {
            return [];
        }
    }

    /** Filters a list of class names down to those which are microformats2 root ("h-*") class names
     *
     * Keys of the input array are preserved in the output.
     *
     * @param string[] $classes The class names to examine
     * @return string[] The class names which are microformats2 roots
     */
    protected function matchRootsMf2(array $classes): array {
        return array_filter($classes, function($c) {
            # The "*" for root (and property) class names consists of an
            # optional vendor prefix (series of 1+ number or lowercase
            # a-z characters i.e. [0-9a-z]+, followed by '-'), then one
            # or more '-' separated lowercase a-z words.
            // exclude Tailwind classes https://tailwindcss.com/docs/height
            return preg_match('/^h(?:-[a-z0-9]+)?(?:-[a-z]+)+$/S', $c) && !preg_match('/^h-(?:px|auto|full|screen|min|max|fit)$/S', $c);
        });
    }

    /** Translates any classic (backcompat) root class names in a list of class names to microformats2 root types
     *
     * @param string[] $classes The class names to examine
     * @return string[] The microformats2 root types corresponding to the classic roots found, in input order
     */
    protected function matchRootsBackcompat(array $classes): array {
        $out = [];
        foreach ($classes as $c) {
            if ($compat = self::BACKCOMPAT_ROOTS[$c] ?? null) {
                $out[] = $compat;
            }
        }
        return $out;
    }

    /** Parses a single microformat rooted at a given element
     *
     * @param \DOMElement $root The element bearing the microformat root class name(s)
     * @param string[] $types The microformats2 root types of the element
     * @param bool $backcompat Whether the root types were derived from classic (backcompat) class names
     * @return array The microformat structure, with the keys "type" and "properties", as well as "id" and "children" where applicable
     */
    protected function parseMicroformat(\DOMElement $root, array $types, bool $backcompat): array {
        # keep track of whether the root class name(s) was from backcompat
        // this is a parameter to this function
        # create a new { } structure
        $out = [
            # type: [array of unique microformat "h-*" type(s) on the element sorted alphabetically]
            // NOTE: sorting will be done below; uniqueness was already computed when classes were parsed
            'type' => $types,
            # properties: { } - to be filled in when that element itself is parsed for microformats properties
            'properties' => [],
            # if the element has a non-empty id attribute:
            # id: string value of element's id attribute
            // Added below
        ];
        sort($out['type']);
        if (strlen($id = $root->getAttribute("id"))) {
            $out['id'] = $id;
        }
        # parse child elements (document order) by:
        $node = $root;
        // set after a nested microformat is parsed, so that its descendants are not examined a second time
        $skipChildren = false;
        while ($node = $this->nextElement($node, $root, !$skipChildren)) {
            $skipChildren = false;
            if ($backcompat) {
                # if parsing a backcompat root, parse child element class name(s) for backcompat properties
                // we do this by substituting the real class list for a mapped
                // one and later filling in special properties when needed
                $classes = $this->mapClassesBackcompat($node, $out['type']);
            } else {
                # else parse a child element class for property class name(s) "p-*,u-*,dt-*,e-*"
                $classes = $this->parseClasses($node);
            }
            $properties = $this->matchPropsMf2($classes);
            # if such class(es) are found, it is a property element
            # add properties found to current microformat's properties: { } structure
            foreach ($properties as [$pType, $pName]) {
                if (!isset($out['properties'][$pName])) {
                    $out['properties'][$pName] = [];
                }
                // NOTE(review): parseProperty() is not defined in this file
                $out['properties'][$pName][] = $this->parseProperty($node, $pType, $pName);
            }
            # parse a child element for microformats (recurse)
            $child = null;
            if ($childTypes = $this->matchRootsMf2($classes)) {
                $child = $this->parseMicroformat($node, $childTypes, false);
            } elseif ($childTypes = $this->matchRootsBackcompat($classes)) {
                $child = $this->parseMicroformat($node, $childTypes, true);
            }
            if ($child) {
                if (!$properties) {
                    # else add it to the children array of the parent microformat
                    $out['children'][] = $child;
                }
                // TODO: when the element is also a property element the nested
                // structure should become the property's value; this depends
                // on the contract of parseProperty()
                $skipChildren = true;
            }
        }
        return $out;
    }

    /** Extracts microformats2 property class names from a list of class names
     *
     * @param string[] $classes The class names to examine
     * @return array[] A list of two-member arrays: the property prefix ("p", "u", "dt", or "e") followed by the property name without its prefix
     */
    protected function matchPropsMf2(array $classes): array {
        $out = [];
        foreach ($classes as $c) {
            # The "*" for root (and property) class names consists of an
            # optional vendor prefix (series of 1+ number or lowercase
            # a-z characters i.e. [0-9a-z]+, followed by '-'), then one
            # or more '-' separated lowercase a-z words.
            if (preg_match('/^(p|u|dt|e)((?:-[a-z0-9]+)?(?:-[a-z]+)+)$/S', $c, $match)) {
                // drop the hyphen separating the prefix from the name
                $out[] = [$match[1], substr($match[2], 1)];
            }
        }
        return $out;
    }

    /** Produces a substitute class list for an element found within a classic (backcompat) microformat
     *
     * Each class name on the element which is a known classic property of any
     * of the supplied root types is replaced by its mapped class names from
     * BACKCOMPAT_PROPERTIES; all other class names are dropped.
     *
     * NOTE: the special handling mentioned in BACKCOMPAT_PROPERTIES is not performed here
     *
     * @param \DOMElement $node The element whose classes are to be mapped
     * @param string[] $types The microformats2 root types of the enclosing classic microformat
     * @return string[] The unique substitute class names, in order of first appearance
     */
    protected function mapClassesBackcompat(\DOMElement $node, array $types): array {
        $out = [];
        $classes = $this->parseClasses($node);
        foreach ($classes as $c) {
            foreach ($types as $t) {
                foreach (self::BACKCOMPAT_PROPERTIES[$t][$c] ?? [] as $mapped) {
                    $out[] = $mapped;
                }
            }
        }
        return array_values(array_unique($out));
    }

    /** Finds the next element in tree order after $node, if any
     *
     * The search never leaves $root: once the ascent reaches $root (or $node is
     * $root and no child qualifies) null is returned. Children of "template"
     * elements are never entered.
     *
     * @param \DOMElement $node The context node
     * @param \DOMElement $root The element to consider the contextual root of the tree
     * @param bool $considerChildren Whether or not child nodes are valid next nodes
     * @return \DOMElement|null The next element, or null if the tree is exhausted
     */
    protected function nextElement(\DOMElement $node, \DOMElement $root, bool $considerChildren): ?\DOMElement {
        if ($considerChildren && $node->localName !== "template") {
            // the first child may be a text or comment node, so look for the first child which is an element
            for ($child = $node->firstChild; $child; $child = $child->nextSibling) {
                if ($child instanceof \DOMElement) {
                    return $child;
                }
            }
        }
        // otherwise look among following siblings, then those of each ancestor in turn, stopping at the root
        $current = $node;
        while ($current && !$current->isSameNode($root)) {
            for ($next = $current->nextSibling; $next; $next = $next->nextSibling) {
                if ($next instanceof \DOMElement) {
                    return $next;
                }
            }
            $current = $current->parentNode;
        }
        return null;
    }
}