Browse Source

Feed modification dates

master
J. King 4 years ago
parent
commit
1e46b5ac88
  1. 1
      lib/Parser/JSON/Feed.php
  2. 32
      lib/Parser/XML/Construct.php
  3. 21
      lib/Parser/XML/Feed.php
  4. 2
      tests/cases/AbstractParserTestCase.php
  5. 42
      tests/cases/XML/feed-atom.yaml
  6. 42
      tests/cases/XML/feed-rss1.yaml
  7. 40
      tests/cases/XML/feed-rss2.yaml

1
lib/Parser/JSON/Feed.php

@ -122,6 +122,7 @@ class Feed implements \MensBeam\Lax\Parser\Feed {
* JSON Feed does not have categories at the feed level, so this always returns an empty collection * JSON Feed does not have categories at the feed level, so this always returns an empty collection
*/ */
public function getCategories(): CategoryCollection { public function getCategories(): CategoryCollection {
// TODO: the cast extension does add (iTunes) categories to feeds
return new CategoryCollection; return new CategoryCollection;
} }

32
lib/Parser/XML/Construct.php

@ -19,6 +19,11 @@ abstract class Construct {
protected const TEXT_PLAIN = "plain"; protected const TEXT_PLAIN = "plain";
protected const TEXT_HTML = "html"; protected const TEXT_HTML = "html";
protected const DATE_ANY = 0;
protected const DATE_LATEST = 1;
protected const DATE_EARLIEST = 2;
protected const DATE_ALL = 3;
/** @var \DOMDocument */ /** @var \DOMDocument */
protected $document; protected $document;
/** @var \DOMXPath */ /** @var \DOMXPath */
@ -68,22 +73,39 @@ abstract class Construct {
* Returns null if no suitable nodes were found * Returns null if no suitable nodes were found
* *
* @param string $query The XPath query of the nodes to return * @param string $query The XPath query of the nodes to return
* @param bool|null $multi Whether to return multiple results as an array (true) or one result as a date object (false, default) * @param int $mode Whether to return the first valid date found (DATE_ANY), the earliest chronologically (DATE_EARLIEST), the latest chronologically (DATE_LATEST), or all valid dates (DATE_ALL) in a sorted array
* @param \DOMNode $context The context node for the XPath query * @param \DOMNode $context The context node for the XPath query
* @return \MensBeam\Lax\Date|array|null * @return \MensBeam\Lax\Date|array|null
*/ */
protected function fetchDate(string $query, ?bool $multi = null, \DOMNode $context = null) { protected function fetchDate(string $query, int $mode = self::DATE_ANY, \DOMNode $context = null) {
$out = []; $out = [];
$tz = new \DateTimeZone("UTC");
assert(in_array($mode, [self::DATE_ANY, self::DATE_ALL, self::DATE_EARLIEST, self::DATE_LATEST]));
foreach((array) $this->fetchString($query, null, true, $context) as $d) { foreach((array) $this->fetchString($query, null, true, $context) as $d) {
if ($d = $this->parseDate($d ?? "")) { if ($d = $this->parseDate($d ?? "")) {
if (!$multi) { if ($mode === self::DATE_ANY) {
return $d; return $d;
} else { } else {
$out[] = $d; // add the date to the output only if it is a unique moment in time so far
$ts = $d->setTimezone($tz)->format("Y-m-d\TH:i:s.u\Z");
if (!isset($out[$ts])) {
$out[$ts] = $d;
}
} }
} }
} }
return $out ?: null; // sort the dates earliest to latest and produce an indexed array
ksort($out);
$out = array_values($out);
// return based on requested mode
switch ($mode) {
case self::DATE_ALL:
return $out;
case self::DATE_EARLIEST:
return $out ? $out[0] : null;
case self::DATE_LATEST:
return $out ? array_pop($out) : null;
}
} }
/** Returns the first valid URL matching an XPath query. Relative URLs are resolved when possible /** Returns the first valid URL matching an XPath query. Relative URLs are resolved when possible

21
lib/Parser/XML/Feed.php

@ -88,7 +88,7 @@ class Feed extends Construct implements \MensBeam\Lax\Parser\Feed {
$feed->link = $this->getLink(); $feed->link = $this->getLink();
$feed->title = $this->getTitle(); $feed->title = $this->getTitle();
$feed->summary = $this->getSummary(); $feed->summary = $this->getSummary();
//$feed->dateModified = $this->getDateModified(); $feed->dateModified = $this->getDateModified();
//$feed->icon = $this->getIcon(); //$feed->icon = $this->getIcon();
//$feed->image = $this->getImage(); //$feed->image = $this->getImage();
//$feed->people = $this->getPeople(); //$feed->people = $this->getPeople();
@ -135,6 +135,14 @@ class Feed extends Construct implements \MensBeam\Lax\Parser\Feed {
return $this->getSummaryAtom() ?? $this->getSummaryDC() ?? $this->getSummaryRss1() ?? $this->getSummaryRss2() ?? $this->getSummaryPod(); return $this->getSummaryAtom() ?? $this->getSummaryDC() ?? $this->getSummaryRss1() ?? $this->getSummaryRss2() ?? $this->getSummaryPod();
} }
public function getDateModified(): ?Date {
/* fetching a date works differently from other data as only Atom has
well-defined semantics here. Thus the semantics of all the other
formats are equal, and we want the latest date, whatever it is.
*/
return $this->fetchDate("atom:updated", self::DATE_LATEST) ?? $this->fetchDate("dc:date|pubDate|lastBuildDate", self::DATE_LATEST);
}
public function getCategories(): CategoryCollection { public function getCategories(): CategoryCollection {
return $this->getCategoriesAtom() ?? $this->getCategoriesRss2() ?? $this->getCategoriesDC() ?? $this->getCategoriesPod() ?? new CategoryCollection; return $this->getCategoriesAtom() ?? $this->getCategoriesRss2() ?? $this->getCategoriesDC() ?? $this->getCategoriesPod() ?? new CategoryCollection;
} }
@ -147,10 +155,6 @@ class Feed extends Construct implements \MensBeam\Lax\Parser\Feed {
return $authors->merge($contributors, $editors, $webmasters); return $authors->merge($contributors, $editors, $webmasters);
} }
public function getDateModified(): ?Date {
return $this->getDateModifiedAtom() ?? $this->getDateModifiedDC() ?? $this->getDateModifiedRss2();
}
public function getEntries(FeedStruct $feed = null): array { public function getEntries(FeedStruct $feed = null): array {
return $this->getEntriesAtom() ?? $this->getEntriesRss1() ?? $this->getEntriesRss2() ?? []; return $this->getEntriesAtom() ?? $this->getEntriesRss1() ?? $this->getEntriesRss2() ?? [];
} }
@ -168,6 +172,7 @@ class Feed extends Construct implements \MensBeam\Lax\Parser\Feed {
return $this->fetchString("apple:complete", "(?-i:Yes)") ? true : null; // case-sensitive pattern return $this->fetchString("apple:complete", "(?-i:Yes)") ? true : null; // case-sensitive pattern
} }
/** Fetches the "time-to-live" value (a number of minutes before the feed should be re-fetched) from an RSS 2.0 feed */
protected function getSchedIntervalRss2(): ?\DateInterval { protected function getSchedIntervalRss2(): ?\DateInterval {
$ttl = (int) $this->fetchString("ttl", "\d+"); $ttl = (int) $this->fetchString("ttl", "\d+");
if ($ttl) { if ($ttl) {
@ -176,6 +181,12 @@ class Feed extends Construct implements \MensBeam\Lax\Parser\Feed {
return null; return null;
} }
/** Fetches the schedule interval from an RSS feed; this is necessarily approximate:
*
* The interval is defined in the syndication RSS extension as fractions of a period, but PHP only supports integer intervals, so we perform integer division on the nearest subdivision of a period, returning at least one.
*
* For example, "four times monthly" first assumes a month is 30 days, and divides this by four to yield seven days.
*/
protected function getSchedIntervalRss1(): ?\DateInterval { protected function getSchedIntervalRss1(): ?\DateInterval {
$period = $this->fetchString("sched:updatePeriod", "(?:year|month|week|dai|hour)ly"); $period = $this->fetchString("sched:updatePeriod", "(?:year|month|week|dai|hour)ly");
if ($period) { if ($period) {

2
tests/cases/AbstractParserTestCase.php

@ -76,6 +76,8 @@ class AbstractParserTestCase extends \PHPUnit\Framework\TestCase {
foreach ($output as $k => $v) { foreach ($output as $k => $v) {
if (in_array($k, ["title", "summary"])) { if (in_array($k, ["title", "summary"])) {
$f->$k = $this->makeText($v); $f->$k = $this->makeText($v);
} elseif ($k === "dateModified") {
$f->$k = new Date($v, new \DateTimeZone("UTC"));
} elseif ($k === "people") { } elseif ($k === "people") {
$c = new PersonCollection; $c = new PersonCollection;
foreach ($v as $m) { foreach ($v as $m) {

42
tests/cases/XML/feed-atom.yaml

@ -353,3 +353,45 @@ Ignored text constructs:
output: output:
format: atom format: atom
version: '1.0' version: '1.0'
Feed date:
input: >
<feed xmlns="http://www.w3.org/2005/Atom">
<updated>2020-03-03T00:00:00Z</updated>
</feed>
output:
format: atom
version: '1.0'
dateModified: '2020-03-03T00:00:00Z'
Multiple feed dates:
input: >
<feed xmlns="http://www.w3.org/2005/Atom">
<updated>2020-03-03T00:00:00Z</updated>
<updated>2020-03-04T00:00:00Z</updated>
</feed>
output:
format: atom
version: '1.0'
dateModified: '2020-03-04T00:00:00Z'
Multiple feed date timezones:
input: >
<feed xmlns="http://www.w3.org/2005/Atom">
<updated>2020-03-03T00:00:00Z</updated>
<updated>2020-03-03T00:00:00-04:00</updated>
<updated>2020-03-03T01:00:00Z</updated>
</feed>
output:
format: atom
version: '1.0'
dateModified: '2020-03-03T00:00:00-04:00'
Bogus feed date:
input: >
<feed xmlns="http://www.w3.org/2005/Atom">
<created>2020-03-03T00:00:00Z</created>
</feed>
output:
format: atom
version: '1.0'

42
tests/cases/XML/feed-rss1.yaml

@ -194,9 +194,9 @@ Feed link:
input: > input: >
<rdf:RDF xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#" xmlns="http://purl.org/rss/1.0/"> <rdf:RDF xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#" xmlns="http://purl.org/rss/1.0/">
<channel> <channel>
<link/> <link/>
<link>http://[example.net]/</link> <link>http://[example.net]/</link>
<link>http://example.com/</link> <link>http://example.com/</link>
</channel> </channel>
</rdf:RDF> </rdf:RDF>
output: output:
@ -208,7 +208,7 @@ Feed title 1:
input: > input: >
<rdf:RDF xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#" xmlns="http://purl.org/rss/1.0/"> <rdf:RDF xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#" xmlns="http://purl.org/rss/1.0/">
<channel> <channel>
<title> Loose text</title> <title> Loose text</title>
</channel> </channel>
</rdf:RDF> </rdf:RDF>
output: output:
@ -221,7 +221,7 @@ Feed title 2:
input: > input: >
<rdf:RDF xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#" xmlns="http://purl.org/rss/1.0/"> <rdf:RDF xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#" xmlns="http://purl.org/rss/1.0/">
<channel> <channel>
<title xml:base="https://example.com/"> Loose text</title> <title xml:base="https://example.com/"> Loose text</title>
</channel> </channel>
</rdf:RDF> </rdf:RDF>
output: output:
@ -235,7 +235,7 @@ DC title:
input: > input: >
<rdf:RDF xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#" xmlns="http://purl.org/rss/1.0/" xmlns:dc="http://purl.org/dc/elements/1.1/"> <rdf:RDF xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#" xmlns="http://purl.org/rss/1.0/" xmlns:dc="http://purl.org/dc/elements/1.1/">
<channel> <channel>
<dc:title xml:base="https://example.com/"> Plain text</dc:title> <dc:title xml:base="https://example.com/"> Plain text</dc:title>
</channel> </channel>
</rdf:RDF> </rdf:RDF>
output: output:
@ -248,7 +248,7 @@ Feed summary:
input: > input: >
<rdf:RDF xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#" xmlns="http://purl.org/rss/1.0/"> <rdf:RDF xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#" xmlns="http://purl.org/rss/1.0/">
<channel> <channel>
<description>Loose text</description> <description>Loose text</description>
</channel> </channel>
</rdf:RDF> </rdf:RDF>
output: output:
@ -261,7 +261,7 @@ DC summary:
input: > input: >
<rdf:RDF xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#" xmlns="http://purl.org/rss/1.0/" xmlns:dc="http://purl.org/dc/elements/1.1/"> <rdf:RDF xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#" xmlns="http://purl.org/rss/1.0/" xmlns:dc="http://purl.org/dc/elements/1.1/">
<channel> <channel>
<dc:description>Plain text</dc:description> <dc:description>Plain text</dc:description>
</channel> </channel>
</rdf:RDF> </rdf:RDF>
output: output:
@ -269,3 +269,29 @@ DC summary:
version: '1.0' version: '1.0'
summary: summary:
plain: 'Plain text' plain: 'Plain text'
DC date:
input: >
<rdf:RDF xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#" xmlns="http://purl.org/rss/1.0/" xmlns:dc="http://purl.org/dc/elements/1.1/">
<channel>
<dc:date>2020-03-03T00:00:00Z</dc:date>
</channel>
</rdf:RDF>
output:
format: rdf
version: '1.0'
dateModified: '2020-03-03T00:00:00Z'
Multiple DC dates:
input: >
<rdf:RDF xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#" xmlns="http://purl.org/rss/1.0/" xmlns:dc="http://purl.org/dc/elements/1.1/">
<channel>
<dc:date>2020-03-03T00:00:00Z</dc:date>
<dc:date>2020-03-03T00:00:00-04:00</dc:date>
<dc:date>2020-03-03T01:00:00Z</dc:date>
</channel>
</rdf:RDF>
output:
format: rdf
version: '1.0'
dateModified: '2020-03-03T00:00:00-04:00'

40
tests/cases/XML/feed-rss2.yaml

@ -330,3 +330,43 @@ Feed summary:
format: rss format: rss
summary: summary:
loose: 'Loose text' loose: 'Loose text'
Feed publication date:
input: >
<rss><channel>
<pubDate>2020-03-03T00:00:00Z</pubDate>
</channel></rss>
output:
format: rss
dateModified: '2020-03-03T00:00:00Z'
Feed build date:
input: >
<rss><channel>
<lastBuildDate>2020-03-03T00:00:00Z</lastBuildDate>
</channel></rss>
output:
format: rss
dateModified: '2020-03-03T00:00:00Z'
Multiple dates 1:
input: >
<rss><channel>
<pubDate>2020-03-03T00:00:00Z</pubDate>
<pubDate>2020-03-03T00:00:00-04:00</pubDate>
<lastBuildDate>2020-03-03T00:00:00Z</lastBuildDate>
</channel></rss>
output:
format: rss
dateModified: '2020-03-03T00:00:00-04:00'
Multiple dates 2:
input: >
<rss><channel>
<pubDate>2020-03-03T00:00:00Z</pubDate>
<lastBuildDate>2020-03-03T00:00:00-04:00</lastBuildDate>
<pubDate>2020-03-03T00:00:00Z</pubDate>
</channel></rss>
output:
format: rss
dateModified: '2020-03-03T00:00:00-04:00'

Loading…
Cancel
Save