You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
331 lines
13 KiB
331 lines
13 KiB
<?php
|
|
/** @license MIT
|
|
* Copyright 2018 J. King et al.
|
|
* See LICENSE and AUTHORS files for details */
|
|
|
|
declare(strict_types=1);
|
|
namespace MensBeam\Lax\Parser\XML;
|
|
|
|
use MensBeam\Lax\Person\Person;
|
|
use MensBeam\Lax\Person\Collection as PersonCollection;
|
|
use MensBeam\Lax\Text;
|
|
use MensBeam\Lax\Date;
|
|
use MensBeam\Lax\Url;
|
|
|
|
abstract class Construct {
|
|
use \MensBeam\Lax\Parser\Construct;
|
|
|
|
protected const TEXT_LOOSE = "loose";
|
|
protected const TEXT_PLAIN = "plain";
|
|
protected const TEXT_HTML = "html";
|
|
|
|
protected const DATE_ANY = 0;
|
|
protected const DATE_LATEST = 1;
|
|
protected const DATE_EARLIEST = 2;
|
|
protected const DATE_ALL = 3;
|
|
|
|
/** @var \DOMDocument */
|
|
protected $document;
|
|
/** @var \DOMXPath */
|
|
protected $xpath;
|
|
/** @var \DOMElement */
|
|
protected $subject;
|
|
|
|
/** Retrieves the first element node matching an XPath query
 *
 * Returns null if the query matches nothing
 *
 * @param string $query The XPath query of the element to return
 * @param \DOMNode|null $context The context node for the XPath query; the construct's subject is used when omitted
 * @return \DOMElement|null
 * @throws \Exception if the XPath query could not be evaluated
 */
protected function fetchElement(string $query, ?\DOMNode $context = null): ?\DOMElement {
    // only the first match is of interest, so the query is wrapped to select it alone;
    // warnings are suppressed here because a failed query is reported via the exception below
    $nodes = @$this->xpath->query("(".$query.")[1]", $context ?? $this->subject);
    if ($nodes === false) {
        throw new \Exception("Invalid XPath query: $query"); // @codeCoverageIgnore
    }
    return ($nodes->length) ? $nodes->item(0) : null;
}
|
|
|
|
/** Fetches the text content of the nodes matched by an XPath query, passed through trimText() and optionally filtered by a pattern
 *
 * In single-result mode the first acceptable string is returned immediately; in multi-result mode every acceptable string is collected in document order. Null is returned in either mode when nothing acceptable is found
 *
 * @param string $query The XPath query selecting the candidate nodes
 * @param string|null $pattern An optional regular expression the whole text must match. It is supplied without delimiters or anchors and is applied case-insensitively
 * @param bool|null $multi True to return all acceptable strings as an array; false or null (the default) to return only the first as a string
 * @param \DOMNode|null $context The context node for the XPath query; the construct's subject is used when omitted
 * @return string|array|null
 */
protected function fetchString(string $query, ?string $pattern = null, ?bool $multi = null, ?\DOMNode $context = null) {
    // build the anchored, delimited expression only when a pattern was actually supplied
    $regex = "";
    if (strlen($pattern ?? "")) {
        $regex = "/^(?:".str_replace("/", "\\/", $pattern).")$/i";
    }
    $wantAll = $multi ?? false;
    $matches = [];
    foreach ($this->xpath->query($query, $context ?? $this->subject) as $candidate) {
        $text = $this->trimText($candidate->textContent);
        if ($regex && !preg_match($regex, $text)) {
            // the text does not satisfy the filter
            continue;
        }
        if (!$wantAll) {
            return $text;
        }
        $matches[] = $text;
    }
    return ($matches) ? $matches : null;
}
|
|
|
|
/** Retrieves and parses dates from one or more DOM elements based on an XPath query
 *
 * Returns null if no valid date was found, except in DATE_ALL mode, where an empty array is returned instead
 *
 * @param string $query The XPath query of the nodes to examine
 * @param int $mode Which result to return: the first valid date found (DATE_ANY, the default), the earliest chronologically (DATE_EARLIEST), the latest chronologically (DATE_LATEST), or all valid dates (DATE_ALL) as an array sorted earliest to latest with duplicate moments removed
 * @param \DOMNode|null $context The context node for the XPath query; the construct's subject is used when omitted
 * @return \MensBeam\Lax\Date|array|null
 */
protected function fetchDate(string $query, int $mode = self::DATE_ANY, ?\DOMNode $context = null) {
    assert(in_array($mode, [self::DATE_ANY, self::DATE_ALL, self::DATE_EARLIEST, self::DATE_LATEST], true));
    $out = [];
    $tz = new \DateTimeZone("UTC");
    foreach ((array) $this->fetchString($query, null, true, $context) as $d) {
        $d = $this->parseDate($d ?? "");
        if (!$d) {
            // not a parsable date
            continue;
        }
        if ($mode === self::DATE_ANY) {
            return $d;
        }
        // key each date by its UTC timestamp so that only the first occurrence of a given moment is kept
        $ts = $d->setTimezone($tz)->format("Y-m-d\TH:i:s.u\Z");
        if (!isset($out[$ts])) {
            $out[$ts] = $d;
        }
    }
    // sort the dates earliest to latest and produce an indexed array
    ksort($out);
    $out = array_values($out);
    // return based on requested mode
    switch ($mode) {
        case self::DATE_ALL:
            return $out;
        case self::DATE_EARLIEST:
            return $out ? $out[0] : null;
        case self::DATE_LATEST:
            return $out ? array_pop($out) : null;
        default:
            // DATE_ANY reaches this point only when no valid date was found
            return null;
    }
}
|
|
|
|
/** Returns the first valid URL matching an XPath query. Relative URLs are resolved when possible
 *
 * Each matched node's trimmed text content is tried in turn, with the node's base URI supplied for resolution; candidates which are empty or which the Url class rejects are skipped
 *
 * @param string $query The XPath query of the node to return
 * @param \DOMNode|null $context The context node for the XPath query; the construct's subject is used when omitted
 * @return \MensBeam\Lax\Url|null The first acceptable URL, or null if there is none
 */
protected function fetchUrl(string $query, ?\DOMNode $context = null): ?Url {
    foreach ($this->xpath->query($query, $context ?? $this->subject) as $node) {
        $url = trim($node->textContent);
        if (!strlen($url)) {
            continue;
        }
        try {
            return new Url($url, $node->baseURI);
        } catch (\InvalidArgumentException $e) {
            // don't return a result that doesn't evaluate to a valid URL of some sort; try the next node instead
        }
    }
    return null;
}
|
|
|
|
/** Returns a Text object populated from the first non-empty node matching an XPath query
 *
 * The format name doubles as the name of the Text property which receives the data. Plain text is additionally passed through trimText(); for the HTML and loose formats the node's base URI, if any, is recorded as the HTML base
 *
 * @param string $query The XPath query of the nodes to examine
 * @param string $format One of the TEXT_PLAIN, TEXT_HTML, or TEXT_LOOSE constants
 * @param \DOMNode|null $context The context node for the XPath query; the construct's subject is used when omitted
 * @return \MensBeam\Lax\Text|null The populated text, or null if no matched node had any content
 */
protected function fetchText(string $query, string $format, ?\DOMNode $context = null): ?Text {
    foreach ($this->xpath->query($query, $context ?? $this->subject) as $node) {
        $data = trim($node->textContent);
        if (!strlen($data)) {
            continue;
        }
        $out = new Text;
        if ($format === self::TEXT_PLAIN) {
            $data = $this->trimText($data);
        } elseif ($format === self::TEXT_HTML || $format === self::TEXT_LOOSE) {
            // the base URI is nullable, and strlen() rejects null under strict typing
            $base = $node->baseURI ?? "";
            $out->htmlBase = strlen($base) ? $base : null;
        }
        $out->$format = $data;
        return $out;
    }
    return null;
}
|
|
|
|
/** Returns an array of Atom link elements with the desired relation or equivalents.
 *
 * Links without an href attribute, or whose href is not accepted by the Url class, are excluded.
 *
 * The requested relation is normalized before comparison: an empty relation means "alternate"; the IANA registry prefix is stripped; simple relation names are compared case-insensitively; anything else is treated as a custom relation and compared as a normalized URL.
 *
 * @see https://tools.ietf.org/html/rfc4287#section-4.2.7.2
 *
 * @param string $rel The desired link relation; defaults to "alternate"
 * @param \DOMNode|null $context The context node for the XPath query; the construct's subject is used when omitted
 * @return \DOMElement[] The matching link elements, in document order
 * @throws \InvalidArgumentException presumably, if a custom relation is not something the Url class accepts; the constructor call for the requested relation is not guarded
 */
protected function fetchAtomRelations(string $rel = "", ?\DOMNode $context = null): array {
    $iana = "http://www.iana.org/assignments/relation/";
    // normalize the relation
    $custom = false;
    $url = null;
    $rel = trim($rel);
    if ($rel === "") {
        $rel = "alternate";
    } elseif (strpos(strtolower($rel), $iana) === 0) {
        // the remainder must be lower-cased as well, since link relations are lower-cased before comparison below
        $rel = strtolower(substr($rel, strlen($iana)));
    } elseif (preg_match("<^[a-z\.\-]+$>i", $rel)) {
        $rel = strtolower($rel);
    } else {
        $custom = true;
        $url = (string) new Url($rel);
    }
    // look at all the links that have a non-empty href attribute
    $out = [];
    foreach ($this->xpath->query("atom:link[normalize-space(@href)]", $context ?? $this->subject) as $l) {
        try {
            new Url($l->getAttribute("href"));
        } catch (\InvalidArgumentException $e) {
            // reject any links which do not have valid URLs
            continue;
        }
        $r = trim($l->getAttribute("rel"));
        if ($custom) {
            try {
                $match = ($url === (string) new Url($r));
            } catch (\InvalidArgumentException $e) {
                // a relation which is not a valid URL cannot be the custom relation sought; it must not abort the search
                $match = false;
            }
        } else {
            $r = trim(strtolower(rawurldecode($r)));
            if (strpos($r, $iana) === 0) {
                $r = substr($r, strlen($iana));
            }
            if (!strlen($r)) {
                $r = "alternate";
            }
            $match = ($r === $rel);
        }
        if ($match) {
            $out[] = $l;
        }
    }
    return $out;
}
|
|
|
|
/** Returns the first Atom link URL which matches the desired relation, with nearest desired media type, or no media type if none match
 *
 * When no media types are requested, the first link with the desired relation is used regardless of its type. Otherwise a link whose type is among those requested is preferred, with types listed earlier outranking those listed later; a link with no type at all is used only when no typed link qualifies
 *
 * @param string $rel The desired link relation; defaults to "alternate"
 * @param string[] $mediaTypes The acceptable media types, most preferred first
 * @param \DOMNode|null $context The context node for the XPath query; the construct's subject is used when omitted
 * @return \MensBeam\Lax\Url|null The URL of the best link, resolved against the link's base URI, or null if no link qualifies
 */
protected function fetchAtomRelation(string $rel = "", array $mediaTypes = [], ?\DOMNode $context = null): ?Url {
    // tidy the list of media types; this orders them worst (0) to best (highest index) and then creates a hashtable
    $mediaTypes = array_flip(array_reverse(array_values(array_unique(array_map(static function(string $t) {
        return strtolower(trim($t));
    }, $mediaTypes)))));
    $rels = $this->fetchAtomRelations($rel, $context);
    if ($rels && !$mediaTypes) {
        return new Url($rels[0]->getAttribute("href"), $rels[0]->baseURI);
    }
    // each candidate is tracked as a [link element, rank] pair
    $result = array_reduce($rels, function($best, $cur) use ($mediaTypes) {
        $t = trim($cur->getAttribute("type"));
        // absence of media type is acceptable if no better match yet exists
        if (!strlen($t) && !$best) {
            return [$cur, -1]; // any match will rank higher than -1
        }
        $t = $this->parseMediaType($t);
        if ($t) {
            $rank = $mediaTypes[$t] ?? null;
            if (!is_null($rank) && (!$best || $rank > $best[1])) {
                // if the media type is acceptable and there is currently no candidate or the candidate ranks lower, use the current link
                return [$cur, $rank];
            }
        }
        return $best;
    });
    return $result ? new Url($result[0]->getAttribute("href"), $result[0]->baseURI) : null;
}
|
|
|
|
/** Returns a Text object assembled from the Atom text constructs matching an XPath query
 *
 * Every matched element is examined and the first usable plain-text, HTML, and XHTML representations found are each recorded, so the result may carry up to three forms of the same text. Elements with a "src" attribute are ignored, as are types other than text, HTML, and XHTML
 *
 * @param string $query The XPath query of the elements to examine
 * @param \DOMNode|null $context The context node for the XPath query; the construct's subject is used when omitted
 * @return \MensBeam\Lax\Text|null The populated text, or null if nothing usable was found
 */
protected function fetchAtomText(string $query, ?\DOMNode $context = null): ?Text {
    $out = new Text;
    $populated = false;
    foreach ($this->xpath->query($query, $context ?? $this->subject) as $node) {
        if ($node->hasAttribute("src")) {
            // ignore any external content
            continue;
        }
        // get the content type; assume "text" if not provided
        $type = trim($node->getAttribute("type"));
        $type = $this->parseMediaType((!strlen($type)) ? "text" : $type);
        if ($type === "text" || $type === "text/plain") {
            if (is_null($out->plain)) {
                $plain = $this->trimText($node->textContent);
                if (strlen($plain)) {
                    $out->plain = $plain;
                    $populated = true;
                }
            }
        } elseif ($type === "html" || $type === "text/html") {
            if (is_null($out->html)) {
                $html = trim($node->textContent);
                if (strlen($html)) {
                    $out->html = $html;
                    // the base URI is nullable, and strlen() rejects null under strict typing
                    $base = $node->baseURI ?? "";
                    $out->htmlBase = strlen($base) ? $base : null;
                    $populated = true;
                }
            }
        } elseif ($type === "xhtml" || $type === "application/xhtml+xml") {
            // XHTML content is taken from the xhtml:div child and serialized as markup
            if (is_null($out->xhtml) && ($xhtml = $this->fetchElement("xhtml:div", $node))) {
                $out->xhtml = $xhtml->ownerDocument->saveXML($xhtml);
                $base = $xhtml->baseURI ?? "";
                $out->xhtmlBase = strlen($base) ? $base : null;
                $populated = true;
            }
        }
    }
    return $populated ? $out : null;
}
|
|
|
|
/** Primitive to fetch an Atom feed/entry identifier
 *
 * Yields the text of the first non-empty atom:id element, or null if there is none
 */
protected function getIdAtom(): ?string {
    $id = $this->fetchString("atom:id", ".+");
    return $id;
}
|
|
|
|
/** Primitive to fetch an RSS feed/entry identifier
 *
 * Yields the text of the first non-empty guid element, or null if there is none.
 * Using RSS' <guid> for feed identifiers is non-standard, but harmless
 */
protected function getIdRss2(): ?string {
    $id = $this->fetchString("guid", ".+");
    return $id;
}
|
|
|
|
/** Primitive to fetch a Dublin Core feed/entry identifier
 *
 * Yields the text of the first non-empty dc:identifier element, or null if there is none
 */
protected function getIdDC(): ?string {
    $id = $this->fetchString("dc:identifier", ".+");
    return $id;
}
|
|
|
|
/** Primitive to fetch the language declared by the nearest xml:lang attribute
 *
 * The search starts at the construct's subject and climbs through its ancestors until a non-empty xml:lang value is found or the top of the tree is passed, in which case null is returned
 */
protected function getLangXML(): ?string {
    $node = $this->subject;
    while (true) {
        $lang = $this->fetchString("@xml:lang", ".+", false, $node);
        // step to the parent before deciding whether to stop
        $node = $node->parentNode;
        if (!is_null($lang) || !$node) {
            return $lang;
        }
    }
}
|
|
|
|
/** Primitive to fetch the language from the first non-empty dc:language element, or null if there is none */
protected function getLangDC(): ?string {
    $lang = $this->fetchString("dc:language", ".+");
    return $lang;
}
|
|
|
|
/** Primitive to fetch the language from the first non-empty language element, or null if there is none */
protected function getLangRss2(): ?string {
    $lang = $this->fetchString("language", ".+");
    return $lang;
}
|
|
|
|
/** Primitive to fetch the URL of an Atom "alternate" link, asking for text/html first and application/xhtml+xml second */
protected function getLinkAtom(): ?Url {
    $relation = "alternate";
    $acceptable = ["text/html", "application/xhtml+xml"];
    return $this->fetchAtomRelation($relation, $acceptable);
}
|
|
|
|
/** Primitive to fetch an RSS feed/entry link
 *
 * The first link element holding a valid URL is preferred; failing that, the first guid element holding a valid URL is used, provided the guid is a permalink.
 *
 * The RSS 2.0 specification spells the attribute "isPermaLink" and treats its absence as true. XPath name tests are case-sensitive, so that spelling must be tested explicitly, otherwise a guid marked isPermaLink="false" would be mistaken for a permalink. The "isPermalink" spelling is also honoured for feeds which use it
 */
protected function getLinkRss2(): ?Url {
    return $this->fetchUrl("link")
        ?? $this->fetchUrl("guid[(not(@isPermaLink) or @isPermaLink='true') and (not(@isPermalink) or @isPermalink='true')]");
}
|
|
|
|
/** Primitive to fetch a link from the first rss1:link or rss0:link element holding a valid URL, or null if there is none */
protected function getLinkRss1(): ?Url {
    $link = $this->fetchUrl("rss1:link|rss0:link");
    return $link;
}
|
|
|
|
/** Primitive to fetch a title from atom:title elements, read as Atom text constructs */
protected function getTitleAtom(): ?Text {
    $title = $this->fetchAtomText("atom:title");
    return $title;
}
|
|
|
|
/** Primitive to fetch a title from the first non-empty rss1:title or rss0:title element, stored as "loose" text */
protected function getTitleRss1(): ?Text {
    $title = $this->fetchText("rss1:title|rss0:title", self::TEXT_LOOSE);
    return $title;
}
|
|
|
|
/** Primitive to fetch a title from the first non-empty title element, stored as "loose" text */
protected function getTitleRss2(): ?Text {
    $title = $this->fetchText("title", self::TEXT_LOOSE);
    return $title;
}
|
|
|
|
/** Primitive to fetch a title from the first non-empty dc:title element, stored as plain text */
protected function getTitleDC(): ?Text {
    $title = $this->fetchText("dc:title", self::TEXT_PLAIN);
    return $title;
}
|
|
|
|
/** Primitive to fetch a title from the first non-empty apple:title element, stored as plain text */
protected function getTitlePod(): ?Text {
    $title = $this->fetchText("apple:title", self::TEXT_PLAIN);
    return $title;
}
|
|
|
|
}
|
|
|