A lax Web news feed parser
You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
 
 

299 lines
12 KiB

<?php
/** @license MIT
* Copyright 2018 J. King et al.
* See LICENSE and AUTHORS files for details */
declare(strict_types=1);
namespace MensBeam\Lax\Parser\XML;
use MensBeam\Lax\Parser\Exception;
use MensBeam\Lax\Person\Collection as PersonCollection;
use MensBeam\Lax\Category\Collection as CategoryCollection;
use MensBeam\Lax\Feed as FeedStruct;
use MensBeam\Lax\Date;
use MensBeam\Lax\Schedule;
use MensBeam\Lax\Text;
use MensBeam\Lax\Url;
class Feed extends Construct implements \MensBeam\Lax\Parser\Feed {
protected const LIBXML_OPTIONS = \LIBXML_BIGLINES | \LIBXML_COMPACT | \LIBXML_HTML_NODEFDTD | \LIBXML_NOCDATA | \LIBXML_NOENT | \LIBXML_NONET | \LIBXML_NOERROR | LIBXML_NOWARNING;
/** @var string */
protected $data;
/** @var string */
protected $contentType;
/** @var string */
protected $url;
/** @var \DOMElement */
protected $subject;
/** @var \DOMXpath */
protected $xpath;
/** Constructs a parsed feed */
public function __construct(string $data, string $contentType = null, string $url = null) {
$this->data = $data;
$this->contentType = $contentType;
if (strlen($url ?? "")) {
$this->url = $url;
}
}
/** Performs initialization of the instance */
protected function init(FeedStruct $feed): FeedStruct {
$this->document = new \DOMDocument();
if (!$this->document->loadXML($this->data, self::LIBXML_OPTIONS)) {
throw new Exception("notXML");
}
$this->document->documentURI = $this->url;
$this->xpath = new XPath($this->document);
$this->subject = $this->document->documentElement;
$ns = $this->subject->namespaceURI;
$name = $this->subject->localName;
if (is_null($ns) && $name === "rss") {
$this->subject = $this->fetchElement("channel") ?? $this->subject;
$feed->format = "rss";
$feed->version = $this->document->documentElement->hasAttribute("version") ? $this->document->documentElement->getAttribute("version") : null;
} elseif ($ns === XPath::NS['rdf'] && $name === "RDF") {
$feed->format = "rdf";
$channel = $this->fetchElement("rss1:channel|rss0:channel");
if ($channel) {
$this->subject = $channel;
$feed->version = ($channel->namespaceURI === XPath::NS['rss1']) ? "1.0" : "0.90";
} else {
$element = $this->fetchElement("rss1:item|rss0:item|rss1:image|rss0:image");
if ($element) {
$feed->version = ($element->namespaceURI === XPath::NS['rss1']) ? "1.0" : "0.90";
} else {
throw new Exception("notXMLFeed");
}
}
} elseif ($ns === XPath::NS['atom'] && $name === "feed") {
$feed->format = "atom";
$feed->version = "1.0";
} else {
throw new Exception("notXMLFeed");
}
$feed->meta->url = $this->url;
return $feed;
}
/** Parses the feed to extract data */
public function parse(FeedStruct $feed = null): FeedStruct {
$feed = $this->init($feed ?? new FeedStruct);
$feed->meta->url = strlen($this->url ?? "") ? new Url($this->url) : null;
$feed->sched = $this->getSchedule();
$feed->id = $this->getId();
$feed->lang = $this->getLang();
$feed->url = $this->getUrl();
$feed->link = $this->getLink();
$feed->title = $this->getTitle();
$feed->summary = $this->getSummary();
$feed->dateModified = $this->getDateModified();
//$feed->icon = $this->getIcon();
//$feed->image = $this->getImage();
//$feed->people = $this->getPeople();
//$feed->categories = $this->getCategories();
//$feed->entries = $this->getEntries($feed);
return $feed;
}
public function getId(): ?string {
return $this->getIdAtom() ?? $this->getIdDC() ?? $this->getIdRss2();
}
public function getSchedule(): Schedule {
$sched = new Schedule;
$sched->interval = $this->getSchedIntervalRss1() ?? $this->getSchedIntervalRss2();
$sched->skip = $this->getSchedSkipRss2();
$sched->expired = $this->getExpiredPod();
if (is_null($sched->expired) && (($sched->skip & Schedule::DAY_ALL) == Schedule::DAY_ALL || ($sched->skip & Schedule::HOUR_ALL) == Schedule::HOUR_ALL)) {
$sched->expired = true;
}
if ($sched->interval) {
$sched->base = $this->getSchedBaseRss1();
}
return $sched;
}
public function getLang(): ?string {
return $this->getLangXML() ?? $this->getLangDC() ?? $this->getLangRss2();
}
public function getUrl(): ?Url {
return $this->getUrlAtom() ?? $this->getUrlRss1() ?? $this->getUrlPod();
}
public function getLink(): ?Url {
return $this->getLinkAtom() ?? $this->getLinkRss1() ?? $this->getLinkRss2();
}
public function getTitle(): ?Text {
return $this->getTitleAtom() ?? $this->getTitleRss1() ?? $this->getTitleRss2() ?? $this->getTitleDC() ?? $this->getTitlePod();
}
public function getSummary(): ?Text {
return $this->getSummaryAtom() ?? $this->getSummaryDC() ?? $this->getSummaryRss1() ?? $this->getSummaryRss2() ?? $this->getSummaryPod();
}
public function getDateModified(): ?Date {
/* fetching a date works differently from other data as only Atom has
well-defined semantics here. Thus the semantics of all the other
formats are equal, and we want the latest date, whatever it is.
*/
return $this->fetchDate("atom:updated", self::DATE_LATEST) ?? $this->fetchDate("dc:date|pubDate|lastBuildDate", self::DATE_LATEST);
}
public function getCategories(): CategoryCollection {
return $this->getCategoriesAtom() ?? $this->getCategoriesRss2() ?? $this->getCategoriesDC() ?? $this->getCategoriesPod() ?? new CategoryCollection;
}
public function getPeople(): PersonCollection {
$authors = $this->getAuthorsAtom() ?? $this->getAuthorsDC() ?? $this->getAuthorsPod() ?? $this->getAuthorsRss2() ?? new PersonCollection;
$contributors = $this->getContributorsAtom() ?? $this->getContributorsDC() ?? new PersonCollection;
$editors = $this->getEditorsRss2() ?? new PersonCollection;
$webmasters = $this->getWebmastersPod() ?? $this->getWebmastersRss2() ?? new PersonCollection;
return $authors->merge($contributors, $editors, $webmasters);
}
public function getEntries(FeedStruct $feed = null): array {
return $this->getEntriesAtom() ?? $this->getEntriesRss1() ?? $this->getEntriesRss2() ?? [];
}
public function getIcon(): ?Url {
return null;
}
public function getImage(): ?Url {
return null;
}
/** Fetches the "complete" flag from an iTunes podcast */
protected function getExpiredPod(): ?bool {
return $this->fetchString("apple:complete", "(?-i:Yes)") ? true : null; // case-sensitive pattern
}
/** Fetches the "time-to-live" value (a number of minutes before the feed should be re-fetched) from an RSS 2.0 feed */
protected function getSchedIntervalRss2(): ?\DateInterval {
$ttl = (int) $this->fetchString("ttl", "\d+");
if ($ttl) {
return new \DateInterval("PT{$ttl}M");
}
return null;
}
/** Fetches the schedule interval from an RSS feed; this is necessarily approximate:
*
* The interval is defined in the syndication RSS extension as fractions of a period, but PHP only supports integer intervals, so we perform integer divison on the nearest subdivision of a period, returning at least one.
*
* For example, "four times monthly" first assumes a month is 30 days, and divides this by four to yield seven days.
*/
protected function getSchedIntervalRss1(): ?\DateInterval {
$period = $this->fetchString("sched:updatePeriod", "(?:year|month|week|dai|hour)ly");
if ($period) {
[$p, $n] = [
"hourly" => ["TM", 60], // 60 minutes
"daily" => ["TH", 24], // 24 hors
"weekly" => ["D", 7], // 7 days
"monthly" => ["D", 30], // 30 days
"yearly" => ["M", 12], // 12 months
][strtolower($period)];
$f = max(1, (int) $this->fetchString("sched:updateFrequency", "0*[1-9]\d*")); // a frequency of zero makes no sense
// divide the period by the frequency
// FIXME: we must have an integer result because PHP (incorrectly) rejects fractional intervals
// see https://bugs.php.net/bug.php?id=53831
$n = max(1, intdiv($n, $f)); // a frequency of zero still makes no sense, so we assume at least one subdivision
return new \DateInterval("P".(strlen($p) === 1 ? "" : $p[0]).$n.$p[-1]);
}
return null;
}
protected function getSchedBaseRss1(): ?Date {
return $this->fetchDate("sched:updateBase");
}
/** Computes the "skip-schedule" of an RSS feed, the set of days and hours during which a feed should not be fetched */
protected function getSchedSkipRss2(): ?int {
$out = 0;
foreach($this->fetchString("skipHours/hour", "\d+", true) ?? [] as $h) {
$out |= [
Schedule::HOUR_0,
Schedule::HOUR_1,
Schedule::HOUR_2,
Schedule::HOUR_3,
Schedule::HOUR_4,
Schedule::HOUR_5,
Schedule::HOUR_6,
Schedule::HOUR_7,
Schedule::HOUR_8,
Schedule::HOUR_9,
Schedule::HOUR_10,
Schedule::HOUR_11,
Schedule::HOUR_12,
Schedule::HOUR_13,
Schedule::HOUR_14,
Schedule::HOUR_15,
Schedule::HOUR_16,
Schedule::HOUR_17,
Schedule::HOUR_18,
Schedule::HOUR_19,
Schedule::HOUR_20,
Schedule::HOUR_21,
Schedule::HOUR_22,
Schedule::HOUR_23,
Schedule::HOUR_0,
][(int) $h] ?? 0;
}
foreach($this->fetchString("skipDays/day", null, true) ?? [] as $d) {
$out |= [
"monday" => Schedule::DAY_MON,
"tuesday" => Schedule::DAY_TUE,
"wednesday" => Schedule::DAY_WED,
"thursday" => Schedule::DAY_THU,
"friday" => Schedule::DAY_FRI,
"saturday" => Schedule::DAY_SAT,
"sunday" => Schedule::DAY_SUN,
"mon" => Schedule::DAY_MON,
"tue" => Schedule::DAY_TUE,
"wed" => Schedule::DAY_WED,
"thu" => Schedule::DAY_THU,
"fri" => Schedule::DAY_FRI,
"sat" => Schedule::DAY_SAT,
"sun" => Schedule::DAY_SUN,
][strtolower($d)] ?? 0;
}
return $out ?: null;
}
protected function getUrlAtom(): ?Url {
return $this->fetchAtomRelation("self");
}
protected function getUrlRss1(): ?Url {
return $this->fetchUrl("(self::rss1:channel|self::rss0:channel)/@rdf:about");
}
protected function getUrlPod(): ?Url {
return $this->fetchUrl("apple:new-feed-url");
}
protected function getSummaryAtom(): ?Text {
return $this->fetchAtomText("atom:summary") ?? $this->fetchAtomText("atom:subtitle");
}
protected function getSummaryRss2(): ?Text {
return $this->fetchText("description", self::TEXT_LOOSE);
}
protected function getSummaryRss1(): ?Text {
return $this->fetchText("rss1:description|rss0:description", self::TEXT_LOOSE);
}
protected function getSummaryDC(): ?Text {
return $this->fetchText("dc:description", self::TEXT_PLAIN);
}
protected function getSummaryPod(): ?Text {
return $this->fetchText("apple:summary|gplay:description", self::TEXT_PLAIN) ?? $this->fetchText("apple:subtitle", self::TEXT_PLAIN);
}
}