You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
325 lines
16 KiB
325 lines
16 KiB
<?php
|
|
/** @license MIT
|
|
* Copyright 2018 J. King et al.
|
|
* See LICENSE and AUTHORS files for details */
|
|
|
|
declare(strict_types=1);
|
|
namespace MensBeam\Lax\Parser\XML;
|
|
|
|
use MensBeam\Lax\Parser\XML\Entry as EntryParser;
|
|
use MensBeam\Lax\Parser\Exception;
|
|
use MensBeam\Lax\Person\Person;
|
|
use MensBeam\Lax\Person\Collection as PersonCollection;
|
|
use MensBeam\Lax\Category\Collection as CategoryCollection;
|
|
use MensBeam\Lax\Feed as FeedStruct;
|
|
use MensBeam\Lax\Date;
|
|
use MensBeam\Lax\MimeType;
|
|
use MensBeam\Lax\Schedule;
|
|
use MensBeam\Lax\Text;
|
|
use MensBeam\Lax\Url;
|
|
|
|
class Feed extends Construct implements \MensBeam\Lax\Parser\Feed {
|
|
use \MensBeam\Lax\Parser\AbstractFeed;
|
|
|
|
protected const LIBXML_OPTIONS = \LIBXML_BIGLINES | \LIBXML_COMPACT | \LIBXML_HTML_NODEFDTD | \LIBXML_NOCDATA | \LIBXML_NOENT | \LIBXML_NONET | \LIBXML_NOERROR | LIBXML_NOWARNING;
|
|
public const MIME_TYPES = [
|
|
"application/atom+xml", // Atom
|
|
"application/rss+xml", // RSS 2.0
|
|
"application/rdf+xml", // RSS 1.0 (possibly)
|
|
"application/xml", // generic XML
|
|
"text/xml", // generic XML (as text)
|
|
];
|
|
|
|
/** @var string */
|
|
protected $data;
|
|
/** @var string */
|
|
protected $contentType;
|
|
/** @var string */
|
|
protected $url;
|
|
/** @var \DOMElement */
|
|
protected $subject;
|
|
/** @var \DOMXpath */
|
|
protected $xpath;
|
|
|
|
/** Performs initialization of the instance */
|
|
protected function init(FeedStruct $feed): FeedStruct {
|
|
$type = MimeType::parse($this->contentType) ?? "";
|
|
if ($type && !in_array($type->essence, self::MIME_TYPES)) {
|
|
throw new Exception("notXMLType");
|
|
}
|
|
$this->document = new \DOMDocument();
|
|
if (!$this->document->loadXML($this->data, self::LIBXML_OPTIONS)) {
|
|
throw new Exception("notXML");
|
|
}
|
|
$this->document->documentURI = $this->url;
|
|
$this->xpath = new XPath($this->document);
|
|
$this->subject = $this->document->documentElement;
|
|
$ns = $this->subject->namespaceURI;
|
|
$name = $this->subject->localName;
|
|
if (is_null($ns) && $name === "rss") {
|
|
$this->subject = $this->fetchElement("channel") ?? $this->subject;
|
|
$feed->format = "rss";
|
|
$feed->version = $this->document->documentElement->hasAttribute("version") ? $this->document->documentElement->getAttribute("version") : null;
|
|
$this->xpath->rss2 = true;
|
|
} elseif ($ns === XPath::NS['rdf'] && $name === "RDF") {
|
|
$feed->format = "rdf";
|
|
$channel = $this->fetchElement("rss1:channel|rss0:channel");
|
|
if ($channel) {
|
|
$this->subject = $channel;
|
|
$feed->version = ($channel->namespaceURI === XPath::NS['rss1']) ? "1.0" : "0.90";
|
|
} else {
|
|
$element = $this->fetchElement("rss1:item|rss0:item|rss1:image|rss0:image");
|
|
if ($element) {
|
|
$feed->version = ($element->namespaceURI === XPath::NS['rss1']) ? "1.0" : "0.90";
|
|
} else {
|
|
throw new Exception("notXMLFeed");
|
|
}
|
|
}
|
|
} elseif ($ns === XPath::NS['atom'] && $name === "feed") {
|
|
$feed->format = "atom";
|
|
$feed->version = "1.0";
|
|
} else {
|
|
throw new Exception("notXMLFeed");
|
|
}
|
|
$feed->meta->url = $this->url;
|
|
return $feed;
|
|
}
|
|
|
|
public function getId(): ?string {
|
|
return $this->getIdAtom() // Atom ID
|
|
?? $this->getIdDC() // Dublin Core ID
|
|
?? $this->getIdRss2(); // RSS GUID
|
|
}
|
|
|
|
public function getSchedule(): Schedule {
|
|
$sched = new Schedule;
|
|
$sched->interval = $this->getSchedIntervalRss1() ?? $this->getSchedIntervalRss2();
|
|
$sched->skip = $this->getSchedSkipRss2();
|
|
$sched->expired = $this->getExpiredPod();
|
|
if (is_null($sched->expired) && (($sched->skip & Schedule::DAY_ALL) == Schedule::DAY_ALL || ($sched->skip & Schedule::HOUR_ALL) == Schedule::HOUR_ALL)) {
|
|
$sched->expired = true;
|
|
}
|
|
if ($sched->interval) {
|
|
$sched->base = $this->fetchDate("sched:updateBase", self::DATE_ANY);
|
|
}
|
|
return $sched;
|
|
}
|
|
|
|
public function getLang(): ?string {
|
|
return $this->getLangXML() // xml:lang attribute
|
|
?? $this->getLangDC() // Dublin Core language
|
|
?? $this->getLangRss2(); // RSS language
|
|
}
|
|
|
|
public function getUrl(): ?Url {
|
|
return $this->fetchAtomRelation("self", ["application/atom+xml"]) // Atom 'self' relation URL
|
|
?? $this->fetchUrl("self::rss1:channel/@rdf:about") // RDF-about URL from RSS 0.90 or RSS 1.0
|
|
?? $this->fetchUrl("apple:new-feed-url"); // iTunes podcast canonical URL
|
|
}
|
|
|
|
public function getLink(): ?Url {
|
|
return $this->getLinkAtom() // Atom link
|
|
?? $this->getLinkRss1() // RSS 0.90 or RSS 1.0 link
|
|
?? $this->getLinkRss2(); // RSS 2.0 link
|
|
}
|
|
|
|
public function getTitle(): ?Text {
|
|
return $this->getTitleAtom() // Atom title
|
|
?? $this->getTitleRss1() // RSS 0.90 or RSS 1.0 title
|
|
?? $this->getTitleRss2() // RSS 2.0 title
|
|
?? $this->getTitleDC() // Dublin Core title
|
|
?? $this->getTitlePod(); // iTunes podcast title
|
|
}
|
|
|
|
public function getSummary(): ?Text {
|
|
return $this->fetchAtomText("atom:summary") // Atom summary (non-standard)
|
|
?? $this->fetchAtomText("atom:subtitle") // Atom subtitle
|
|
?? $this->fetchText("dc:abstract|dct:abstract", self::TEXT_PLAIN) // Dublin Core abstract
|
|
?? $this->fetchText("dc:description|dct:description", self::TEXT_PLAIN) // Dublin Core description
|
|
?? $this->fetchText("rss1:description", self::TEXT_LOOSE) // RSS 1.0 description
|
|
?? $this->fetchText("rss0:description", self::TEXT_LOOSE) // RSS 0.90 description
|
|
?? $this->fetchText("rss2:description", self::TEXT_LOOSE) // RSS 2.0 description
|
|
?? $this->fetchText("gplay:description", self::TEXT_PLAIN) // Google Play podcast description
|
|
?? $this->fetchText("apple:summary", self::TEXT_PLAIN) // iTunes podcast summary
|
|
?? $this->fetchText("apple:subtitle", self::TEXT_PLAIN); // iTunes podcast subtitle
|
|
}
|
|
|
|
public function getDateModified(): ?Date {
|
|
/* fetching a date works differently from other data as only Atom has
|
|
well-defined semantics here. Thus the semantics of all the other
|
|
formats are equal, and we want the latest date, whatever it is.
|
|
*/
|
|
return $this->fetchDate("atom:updated", self::DATE_LATEST)
|
|
?? $this->fetchDate(self::QUERY_AMBIGUOUS_DATES, self::DATE_LATEST);
|
|
}
|
|
|
|
public function getIcon(): ?Url {
|
|
return $this->fetchUrl("atom:icon") // Atom icon URL
|
|
?? $this->fetchAtomRelation("icon") // Atom icon relation URL
|
|
?? $this->fetchAtomRelation("shortcut icon") // Atom icon relation URL (non-standard Internet Explorewr usage)
|
|
?? $this->fetchAtomRelation("icon shortcut"); // Atom icon relation URL (non-standard Internet Explorewr usage, reversed)
|
|
}
|
|
|
|
public function getImage(): ?Url {
|
|
return $this->fetchUrl("atom:logo") // Atom logo URL
|
|
?? $this->fetchUrl("rss1:image/@rdf:resource") // RSS 1.0 channel image RDF resource
|
|
?? $this->fetchUrl("rss1:image/rss1:url") // RSS 1.0 channel image
|
|
?? $this->fetchUrl("rss1:image/@rdf:about") // RSS 1.0 channel image about-URL
|
|
?? $this->fetchUrl("/rdf:RDF/rss1:image/@rdf:resource") // RSS 1.0 root image RDF resource
|
|
?? $this->fetchUrl("/rdf:RDF/rss1:image/rss1:url") // RSS 1.0 root image
|
|
?? $this->fetchUrl("/rdf:RDF/rss1:image/@rdf:about") // RSS 1.0 root image about-URL
|
|
?? $this->fetchUrl("rss0:image/rss0:url") // RSS 0.90 channel image
|
|
?? $this->fetchUrl("/rdf:RDF/rss0:image/rss0:url") // RSS 0.90 root image
|
|
?? $this->fetchUrl("rss2:image/rss2:url") // RSS 2.0 channel image
|
|
?? $this->fetchUrl("gplay:image/@href") // Google Play podcast image
|
|
?? $this->fetchUrl("apple:image/@href"); // iTunes podcast image
|
|
}
|
|
|
|
public function getCategories(): CategoryCollection {
|
|
return $this->getCategoriesFromNode($this->subject) ?? new CategoryCollection;
|
|
}
|
|
|
|
public function getPeople(): PersonCollection {
|
|
$authors =
|
|
$this->fetchAtomPeople("atom:author", "author") // Atom authors
|
|
?? $this->fetchPeople("dc:creator|dct:creator", "author") // Dublin Core creators
|
|
?? $this->fetchPeople("rss2:author", "author") // RSS 2.0 authors
|
|
?? $this->fetchPeople("gplay:author", "author") // Google Play authors
|
|
?? $this->fetchPeople("apple:author", "author") // iTunes authors
|
|
?? new PersonCollection;
|
|
$contributors =
|
|
$this->fetchAtomPeople("atom:contributor", "contributor") // Atom contributors
|
|
?? $this->fetchPeople("dc:contributor|dct:contributor", "contributor") // Dublin Core contributors
|
|
?? new PersonCollection;
|
|
$editors =
|
|
$this->fetchPeople("rss2:managingEditor", "editor") // RSS 2.0 editors
|
|
?? $this->fetchPeople("dc:publisher|dct:publisher", "editor") // Dublin Core publishers
|
|
?? new PersonCollection;
|
|
$webmasters =
|
|
$this->fetchPeople("rss2:webMaster", "webmaster") // RSS 2.0 authors
|
|
?? $this->getOwnersTunes() // iTunes webmaster
|
|
?? $this->fetchPeople("gplay:email", "webmaster") // Google Play webmaster
|
|
?? new PersonCollection;
|
|
return $authors->merge($contributors, $editors, $webmasters);
|
|
}
|
|
|
|
public function getEntries(FeedStruct $feed): array {
|
|
$out = [];
|
|
foreach ($this->xpath->query("atom:entry|rss2:item|rss0:item|rss1:item|/rdf:RDF/rss0:item|/rdf:RDF/rss1:item", $this->subject) as $node) {
|
|
$entry = (new EntryParser($node, $this->xpath, $feed))->parse();
|
|
if (!$this->empty($entry, ["lang"])) {
|
|
$out[] = $entry;
|
|
}
|
|
}
|
|
return $out;
|
|
}
|
|
|
|
/** Fetches the "complete" flag from an iTunes podcast */
|
|
protected function getExpiredPod(): ?bool {
|
|
return $this->fetchString("apple:complete", "(?-i:Yes)") ? true : null; // case-sensitive pattern
|
|
}
|
|
|
|
/** Fetches the "time-to-live" value (a number of minutes before the feed should be re-fetched) from an RSS 2.0 feed */
|
|
protected function getSchedIntervalRss2(): ?\DateInterval {
|
|
$ttl = (int) $this->fetchString("rss2:ttl", "\d+");
|
|
if ($ttl) {
|
|
return new \DateInterval("PT{$ttl}M");
|
|
}
|
|
return null;
|
|
}
|
|
|
|
/** Fetches the schedule interval from an RSS feed; this is necessarily approximate:
|
|
*
|
|
* The interval is defined in the syndication RSS extension as fractions of a period, but PHP only supports integer intervals, so we perform integer divison on the nearest subdivision of a period, returning at least one.
|
|
*
|
|
* For example, "four times monthly" first assumes a month is 30 days, and divides this by four to yield seven days.
|
|
*/
|
|
protected function getSchedIntervalRss1(): ?\DateInterval {
|
|
$period = $this->fetchString("sched:updatePeriod", "(?:year|month|week|dai|hour)ly");
|
|
if ($period) {
|
|
[$p, $n] = [
|
|
"hourly" => ["TM", 60], // 60 minutes
|
|
"daily" => ["TH", 24], // 24 hors
|
|
"weekly" => ["D", 7], // 7 days
|
|
"monthly" => ["D", 30], // 30 days
|
|
"yearly" => ["M", 12], // 12 months
|
|
][strtolower($period)];
|
|
$f = max(1, (int) $this->fetchString("sched:updateFrequency", "0*[1-9]\d*")); // a frequency of zero makes no sense
|
|
// divide the period by the frequency
|
|
// FIXME: we must have an integer result because PHP (incorrectly) rejects fractional intervals
|
|
// see https://bugs.php.net/bug.php?id=53831
|
|
$n = max(1, intdiv($n, $f)); // a frequency of zero still makes no sense, so we assume at least one subdivision
|
|
return new \DateInterval("P".(strlen($p) === 1 ? "" : $p[0]).$n.$p[-1]);
|
|
}
|
|
return null;
|
|
}
|
|
|
|
/** Computes the "skip-schedule" of an RSS feed, the set of days and hours during which a feed should not be fetched */
|
|
protected function getSchedSkipRss2(): ?int {
|
|
$out = 0;
|
|
foreach ($this->fetchString("rss2:skipHours/rss2:hour", "\d+", true) ?? [] as $h) {
|
|
$out |= [
|
|
Schedule::HOUR_0,
|
|
Schedule::HOUR_1,
|
|
Schedule::HOUR_2,
|
|
Schedule::HOUR_3,
|
|
Schedule::HOUR_4,
|
|
Schedule::HOUR_5,
|
|
Schedule::HOUR_6,
|
|
Schedule::HOUR_7,
|
|
Schedule::HOUR_8,
|
|
Schedule::HOUR_9,
|
|
Schedule::HOUR_10,
|
|
Schedule::HOUR_11,
|
|
Schedule::HOUR_12,
|
|
Schedule::HOUR_13,
|
|
Schedule::HOUR_14,
|
|
Schedule::HOUR_15,
|
|
Schedule::HOUR_16,
|
|
Schedule::HOUR_17,
|
|
Schedule::HOUR_18,
|
|
Schedule::HOUR_19,
|
|
Schedule::HOUR_20,
|
|
Schedule::HOUR_21,
|
|
Schedule::HOUR_22,
|
|
Schedule::HOUR_23,
|
|
Schedule::HOUR_0,
|
|
][(int) $h] ?? 0;
|
|
}
|
|
foreach ($this->fetchString("rss2:skipDays/rss2:day", null, true) ?? [] as $d) {
|
|
$out |= [
|
|
"monday" => Schedule::DAY_MON,
|
|
"tuesday" => Schedule::DAY_TUE,
|
|
"wednesday" => Schedule::DAY_WED,
|
|
"thursday" => Schedule::DAY_THU,
|
|
"friday" => Schedule::DAY_FRI,
|
|
"saturday" => Schedule::DAY_SAT,
|
|
"sunday" => Schedule::DAY_SUN,
|
|
"mon" => Schedule::DAY_MON,
|
|
"tue" => Schedule::DAY_TUE,
|
|
"wed" => Schedule::DAY_WED,
|
|
"thu" => Schedule::DAY_THU,
|
|
"fri" => Schedule::DAY_FRI,
|
|
"sat" => Schedule::DAY_SAT,
|
|
"sun" => Schedule::DAY_SUN,
|
|
][strtolower($d)] ?? 0;
|
|
}
|
|
return $out ?: null;
|
|
}
|
|
|
|
/** Returns at most a single person: podcasts implicitly have only one author or webmaster */
|
|
protected function getOwnersTunes(): ?PersonCollection {
|
|
$out = new PersonCollection;
|
|
foreach ($this->xpath->query("apple:owner", $this->subject) as $node) {
|
|
$p = new Person;
|
|
$mail = $this->fetchString("apple:email", null, null, $node) ?? "";
|
|
$p->mail = $this->validateMail($mail) ? $mail : null;
|
|
$p->name = $this->fetchString("apple:name", ".+", null, $node) ?? $mail;
|
|
$p->role = "webmaster";
|
|
if (strlen($p->name)) {
|
|
$out[] = $p;
|
|
}
|
|
}
|
|
return count($out) ? $out : null;
|
|
}
|
|
}
|
|
|