Browse Source

Feed modification dates

master
J. King 4 years ago
parent
commit
1e46b5ac88
  1. 1
      lib/Parser/JSON/Feed.php
  2. 32
      lib/Parser/XML/Construct.php
  3. 21
      lib/Parser/XML/Feed.php
  4. 2
      tests/cases/AbstractParserTestCase.php
  5. 42
      tests/cases/XML/feed-atom.yaml
  6. 42
      tests/cases/XML/feed-rss1.yaml
  7. 40
      tests/cases/XML/feed-rss2.yaml

1
lib/Parser/JSON/Feed.php

@ -122,6 +122,7 @@ class Feed implements \MensBeam\Lax\Parser\Feed {
* JSON Feed does not have categories at the feed level, so this always returns an empty collection
*/
public function getCategories(): CategoryCollection {
    // TODO: the cast extension does add (iTunes) categories to feeds
    // until that is handled, a fresh collection with nothing in it is the only possible result
    $empty = new CategoryCollection();
    return $empty;
}

32
lib/Parser/XML/Construct.php

@ -19,6 +19,11 @@ abstract class Construct {
protected const TEXT_PLAIN = "plain";
protected const TEXT_HTML = "html";
// The DATE_* constants below are the modes accepted by fetchDate()
/** Return the first valid date encountered */
protected const DATE_ANY = 0;
/** Return the chronologically latest valid date */
protected const DATE_LATEST = 1;
/** Return the chronologically earliest valid date */
protected const DATE_EARLIEST = 2;
/** Return every distinct valid date as an array sorted earliest to latest */
protected const DATE_ALL = 3;
/** @var \DOMDocument */
protected $document;
/** @var \DOMXPath */
@ -68,22 +73,39 @@ abstract class Construct {
* Returns null if no suitable nodes were found
*
* @param string $query The XPath query of the nodes to return
* @param bool|null $multi Whether to return multiple results as an array (true) or one result as a date object (false, default)
* @param int $mode Whether to return the first valid date found (DATE_ANY), the earliest chronologically (DATE_EARLIEST), the latest chronologically (DATE_LATEST), or all valid dates (DATE_ALL) in a sorted array
* @param \DOMNode $context The context node for the XPath query
* @return \MensBeam\Lax\Date|array|null
*/
protected function fetchDate(string $query, ?bool $multi = null, \DOMNode $context = null) {
protected function fetchDate(string $query, int $mode = self::DATE_ANY, \DOMNode $context = null) {
$out = [];
$tz = new \DateTimeZone("UTC");
assert(in_array($mode, [self::DATE_ANY, self::DATE_ALL, self::DATE_EARLIEST, self::DATE_LATEST]));
foreach((array) $this->fetchString($query, null, true, $context) as $d) {
if ($d = $this->parseDate($d ?? "")) {
if (!$multi) {
if ($mode === self::DATE_ANY) {
return $d;
} else {
$out[] = $d;
// add the date to the output only if it is a unique moment in time so far
$ts = $d->setTimezone($tz)->format("Y-m-d\TH:i:s.u\Z");
if (!isset($out[$ts])) {
$out[$ts] = $d;
}
}
}
}
return $out ?: null;
// sort the dates earliest to latest and produce an indexed array
ksort($out);
$out = array_values($out);
// return based on requested mode
switch ($mode) {
case self::DATE_ALL:
return $out;
case self::DATE_EARLIEST:
return $out ? $out[0] : null;
case self::DATE_LATEST:
return $out ? array_pop($out) : null;
}
}
/** Returns the first valid URL matching an XPath query. Relative URLs are resolved when possible

21
lib/Parser/XML/Feed.php

@ -88,7 +88,7 @@ class Feed extends Construct implements \MensBeam\Lax\Parser\Feed {
$feed->link = $this->getLink();
$feed->title = $this->getTitle();
$feed->summary = $this->getSummary();
//$feed->dateModified = $this->getDateModified();
$feed->dateModified = $this->getDateModified();
//$feed->icon = $this->getIcon();
//$feed->image = $this->getImage();
//$feed->people = $this->getPeople();
@ -135,6 +135,14 @@ class Feed extends Construct implements \MensBeam\Lax\Parser\Feed {
return $this->getSummaryAtom() ?? $this->getSummaryDC() ?? $this->getSummaryRss1() ?? $this->getSummaryRss2() ?? $this->getSummaryPod();
}
/** Returns the latest valid modification date found for the feed, or null if there is none */
public function getDateModified(): ?Date {
    /* Only Atom has well-defined semantics for feed dates, so its
       "updated" element is consulted first. The other formats are all
       treated alike: whichever of their dates is the latest is used,
       regardless of the element which supplied it.
    */
    $atom = $this->fetchDate("atom:updated", self::DATE_LATEST);
    if ($atom !== null) {
        return $atom;
    }
    return $this->fetchDate("dc:date|pubDate|lastBuildDate", self::DATE_LATEST);
}
/** Returns the categories from the first of the Atom, RSS 2.0, Dublin Core, and "Pod" getters (tried in that order) to yield a non-null result; an empty collection is returned if none does */
public function getCategories(): CategoryCollection {
    $found = $this->getCategoriesAtom();
    if ($found === null) {
        $found = $this->getCategoriesRss2();
    }
    if ($found === null) {
        $found = $this->getCategoriesDC();
    }
    if ($found === null) {
        $found = $this->getCategoriesPod();
    }
    if ($found === null) {
        // nothing was supplied by any source
        $found = new CategoryCollection;
    }
    return $found;
}
@ -147,10 +155,6 @@ class Feed extends Construct implements \MensBeam\Lax\Parser\Feed {
return $authors->merge($contributors, $editors, $webmasters);
}
public function getDateModified(): ?Date {
return $this->getDateModifiedAtom() ?? $this->getDateModifiedDC() ?? $this->getDateModifiedRss2();
}
/** Returns the entries from the first of the Atom, RSS 1.0, and RSS 2.0 entry getters (tried in that order) to yield a non-null result, or an empty array if none does
 *
 * @param FeedStruct|null $feed An optional feed structure; it is not used by this method as written
 */
public function getEntries(?FeedStruct $feed = null): array {
    // the nullable type is spelled out because relying on "= null" to make a parameter nullable is deprecated as of PHP 8.4
    return $this->getEntriesAtom()
        ?? $this->getEntriesRss1()
        ?? $this->getEntriesRss2()
        ?? [];
}
@ -168,6 +172,7 @@ class Feed extends Construct implements \MensBeam\Lax\Parser\Feed {
return $this->fetchString("apple:complete", "(?-i:Yes)") ? true : null; // case-sensitive pattern
}
/** Fetches the "time-to-live" value (a number of minutes before the feed should be re-fetched) from an RSS 2.0 feed */
protected function getSchedIntervalRss2(): ?\DateInterval {
$ttl = (int) $this->fetchString("ttl", "\d+");
if ($ttl) {
@ -176,6 +181,12 @@ class Feed extends Construct implements \MensBeam\Lax\Parser\Feed {
return null;
}
/** Fetches the schedule interval from an RSS feed; this is necessarily approximate:
*
* The interval is defined in the syndication RSS extension as fractions of a period, but PHP only supports integer intervals, so we perform integer division on the nearest subdivision of a period, returning at least one.
*
* For example, "four times monthly" first assumes a month is 30 days, and divides this by four to yield seven days.
*/
protected function getSchedIntervalRss1(): ?\DateInterval {
$period = $this->fetchString("sched:updatePeriod", "(?:year|month|week|dai|hour)ly");
if ($period) {

2
tests/cases/AbstractParserTestCase.php

@ -76,6 +76,8 @@ class AbstractParserTestCase extends \PHPUnit\Framework\TestCase {
foreach ($output as $k => $v) {
if (in_array($k, ["title", "summary"])) {
$f->$k = $this->makeText($v);
} elseif ($k === "dateModified") {
$f->$k = new Date($v, new \DateTimeZone("UTC"));
} elseif ($k === "people") {
$c = new PersonCollection;
foreach ($v as $m) {

42
tests/cases/XML/feed-atom.yaml

@ -353,3 +353,45 @@ Ignored text constructs:
output:
format: atom
version: '1.0'
Feed date:
input: >
<feed xmlns="http://www.w3.org/2005/Atom">
<updated>2020-03-03T00:00:00Z</updated>
</feed>
output:
format: atom
version: '1.0'
dateModified: '2020-03-03T00:00:00Z'
Multiple feed dates:
input: >
<feed xmlns="http://www.w3.org/2005/Atom">
<updated>2020-03-03T00:00:00Z</updated>
<updated>2020-03-04T00:00:00Z</updated>
</feed>
output:
format: atom
version: '1.0'
dateModified: '2020-03-04T00:00:00Z'
Multiple feed date timezones:
input: >
<feed xmlns="http://www.w3.org/2005/Atom">
<updated>2020-03-03T00:00:00Z</updated>
<updated>2020-03-03T00:00:00-04:00</updated>
<updated>2020-03-03T01:00:00Z</updated>
</feed>
output:
format: atom
version: '1.0'
dateModified: '2020-03-03T00:00:00-04:00'
Bogus feed date:
input: >
<feed xmlns="http://www.w3.org/2005/Atom">
<created>2020-03-03T00:00:00Z</created>
</feed>
output:
format: atom
version: '1.0'

42
tests/cases/XML/feed-rss1.yaml

@ -194,9 +194,9 @@ Feed link:
input: >
<rdf:RDF xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#" xmlns="http://purl.org/rss/1.0/">
<channel>
<link/>
<link>http://[example.net]/</link>
<link>http://example.com/</link>
<link/>
<link>http://[example.net]/</link>
<link>http://example.com/</link>
</channel>
</rdf:RDF>
output:
@ -208,7 +208,7 @@ Feed title 1:
input: >
<rdf:RDF xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#" xmlns="http://purl.org/rss/1.0/">
<channel>
<title> Loose text</title>
<title> Loose text</title>
</channel>
</rdf:RDF>
output:
@ -221,7 +221,7 @@ Feed title 2:
input: >
<rdf:RDF xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#" xmlns="http://purl.org/rss/1.0/">
<channel>
<title xml:base="https://example.com/"> Loose text</title>
<title xml:base="https://example.com/"> Loose text</title>
</channel>
</rdf:RDF>
output:
@ -235,7 +235,7 @@ DC title:
input: >
<rdf:RDF xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#" xmlns="http://purl.org/rss/1.0/" xmlns:dc="http://purl.org/dc/elements/1.1/">
<channel>
<dc:title xml:base="https://example.com/"> Plain text</dc:title>
<dc:title xml:base="https://example.com/"> Plain text</dc:title>
</channel>
</rdf:RDF>
output:
@ -248,7 +248,7 @@ Feed summary:
input: >
<rdf:RDF xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#" xmlns="http://purl.org/rss/1.0/">
<channel>
<description>Loose text</description>
<description>Loose text</description>
</channel>
</rdf:RDF>
output:
@ -261,7 +261,7 @@ DC summary:
input: >
<rdf:RDF xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#" xmlns="http://purl.org/rss/1.0/" xmlns:dc="http://purl.org/dc/elements/1.1/">
<channel>
<dc:description>Plain text</dc:description>
<dc:description>Plain text</dc:description>
</channel>
</rdf:RDF>
output:
@ -269,3 +269,29 @@ DC summary:
version: '1.0'
summary:
plain: 'Plain text'
DC date:
input: >
<rdf:RDF xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#" xmlns="http://purl.org/rss/1.0/" xmlns:dc="http://purl.org/dc/elements/1.1/">
<channel>
<dc:date>2020-03-03T00:00:00Z</dc:date>
</channel>
</rdf:RDF>
output:
format: rdf
version: '1.0'
dateModified: '2020-03-03T00:00:00Z'
Multiple DC dates:
input: >
<rdf:RDF xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#" xmlns="http://purl.org/rss/1.0/" xmlns:dc="http://purl.org/dc/elements/1.1/">
<channel>
<dc:date>2020-03-03T00:00:00Z</dc:date>
<dc:date>2020-03-03T00:00:00-04:00</dc:date>
<dc:date>2020-03-03T01:00:00Z</dc:date>
</channel>
</rdf:RDF>
output:
format: rdf
version: '1.0'
dateModified: '2020-03-03T00:00:00-04:00'

40
tests/cases/XML/feed-rss2.yaml

@ -330,3 +330,43 @@ Feed summary:
format: rss
summary:
loose: 'Loose text'
Feed publication date:
input: >
<rss><channel>
<pubDate>2020-03-03T00:00:00Z</pubDate>
</channel></rss>
output:
format: rss
dateModified: '2020-03-03T00:00:00Z'
Feed build date:
input: >
<rss><channel>
<lastBuildDate>2020-03-03T00:00:00Z</lastBuildDate>
</channel></rss>
output:
format: rss
dateModified: '2020-03-03T00:00:00Z'
Multiple dates 1:
input: >
<rss><channel>
<pubDate>2020-03-03T00:00:00Z</pubDate>
<pubDate>2020-03-03T00:00:00-04:00</pubDate>
<lastBuildDate>2020-03-03T00:00:00Z</lastBuildDate>
</channel></rss>
output:
format: rss
dateModified: '2020-03-03T00:00:00-04:00'
Multiple dates 2:
input: >
<rss><channel>
<pubDate>2020-03-03T00:00:00Z</pubDate>
<lastBuildDate>2020-03-03T00:00:00-04:00</lastBuildDate>
<pubDate>2020-03-03T00:00:00Z</pubDate>
</channel></rss>
output:
format: rss
dateModified: '2020-03-03T00:00:00-04:00'

Loading…
Cancel
Save