diff --git a/lib/Parser/JSON/Feed.php b/lib/Parser/JSON/Feed.php
index b638321..b36f289 100644
--- a/lib/Parser/JSON/Feed.php
+++ b/lib/Parser/JSON/Feed.php
@@ -122,6 +122,7 @@ class Feed implements \MensBeam\Lax\Parser\Feed {
* JSON Feed does not have categories at the feed level, so this always returns and empty collection
*/
public function getCategories(): CategoryCollection {
+ // TODO: the cast extension does add (iTunes) categories to feeds
return new CategoryCollection;
}
diff --git a/lib/Parser/XML/Construct.php b/lib/Parser/XML/Construct.php
index 733ba3f..25cde4a 100644
--- a/lib/Parser/XML/Construct.php
+++ b/lib/Parser/XML/Construct.php
@@ -19,6 +19,11 @@ abstract class Construct {
protected const TEXT_PLAIN = "plain";
protected const TEXT_HTML = "html";
+ protected const DATE_ANY = 0;
+ protected const DATE_LATEST = 1;
+ protected const DATE_EARLIEST = 2;
+ protected const DATE_ALL = 3;
+
/** @var \DOMDocument */
protected $document;
/** @var \DOMXPath */
@@ -68,22 +73,39 @@ abstract class Construct {
* Returns null if no suitable nodes were found
*
* @param string $query The XPath query of the nodes to return
- * @param bool|null $multi Whether to return multiple results as an array (true) or one result as a date object (false, default)
+ * @param int $mode Whether to return the first valid date found (DATE_ANY), the earliest chronologically (DATE_EARLIEST), the latest chronologically (DATE_LATEST), or all valid dates (DATE_ALL) in a sorted array
* @param \DOMNode $context The context node for the XPath query
* @return \MensBeam\Lax\Date|array|null
*/
- protected function fetchDate(string $query, ?bool $multi = null, \DOMNode $context = null) {
+ protected function fetchDate(string $query, int $mode = self::DATE_ANY, \DOMNode $context = null) {
$out = [];
+ $tz = new \DateTimeZone("UTC");
+ assert(in_array($mode, [self::DATE_ANY, self::DATE_ALL, self::DATE_EARLIEST, self::DATE_LATEST], true));
foreach((array) $this->fetchString($query, null, true, $context) as $d) {
if ($d = $this->parseDate($d ?? "")) {
- if (!$multi) {
+ if ($mode === self::DATE_ANY) {
return $d;
} else {
- $out[] = $d;
+ // key each date by its UTC timestamp so that the same moment expressed in different timezones is kept only once (the first occurrence wins)
+ $ts = $d->setTimezone($tz)->format("Y-m-d\TH:i:s.u\Z");
+ if (!isset($out[$ts])) {
+ $out[$ts] = $d;
+ }
}
}
}
- return $out ?: null;
+ // the keys are fixed-format UTC timestamps, so sorting by key orders the dates earliest to latest (for four-digit years); then produce an indexed array
+ ksort($out);
+ $out = array_values($out);
+ // return based on requested mode; every mode yields null when no valid date was found (DATE_ANY only reaches this point in that case and matches no branch, so the function ends with an implicit null)
+ switch ($mode) {
+ case self::DATE_ALL:
+ return $out ?: null;
+ case self::DATE_EARLIEST:
+ return $out ? $out[0] : null;
+ case self::DATE_LATEST:
+ return $out ? array_pop($out) : null;
+ }
}
/** Returns the first valid URL matching an XPath query. Relative URLs are resolved when possible
diff --git a/lib/Parser/XML/Feed.php b/lib/Parser/XML/Feed.php
index 1c64594..0c283fc 100644
--- a/lib/Parser/XML/Feed.php
+++ b/lib/Parser/XML/Feed.php
@@ -88,7 +88,7 @@ class Feed extends Construct implements \MensBeam\Lax\Parser\Feed {
$feed->link = $this->getLink();
$feed->title = $this->getTitle();
$feed->summary = $this->getSummary();
- //$feed->dateModified = $this->getDateModified();
+ $feed->dateModified = $this->getDateModified();
//$feed->icon = $this->getIcon();
//$feed->image = $this->getImage();
//$feed->people = $this->getPeople();
@@ -135,6 +135,14 @@ class Feed extends Construct implements \MensBeam\Lax\Parser\Feed {
return $this->getSummaryAtom() ?? $this->getSummaryDC() ?? $this->getSummaryRss1() ?? $this->getSummaryRss2() ?? $this->getSummaryPod();
}
+ public function getDateModified(): ?Date {
+ /* Fetching a date works differently from other data: only Atom has
+ well-defined semantics here, so "atom:updated" is tried first. All other
+ date elements are treated alike; either way the latest valid date is used.
+ */
+ return $this->fetchDate("atom:updated", self::DATE_LATEST) ?? $this->fetchDate("dc:date|pubDate|lastBuildDate", self::DATE_LATEST);
+ }
+
public function getCategories(): CategoryCollection {
return $this->getCategoriesAtom() ?? $this->getCategoriesRss2() ?? $this->getCategoriesDC() ?? $this->getCategoriesPod() ?? new CategoryCollection;
}
@@ -147,10 +155,6 @@ class Feed extends Construct implements \MensBeam\Lax\Parser\Feed {
return $authors->merge($contributors, $editors, $webmasters);
}
- public function getDateModified(): ?Date {
- return $this->getDateModifiedAtom() ?? $this->getDateModifiedDC() ?? $this->getDateModifiedRss2();
- }
-
public function getEntries(FeedStruct $feed = null): array {
return $this->getEntriesAtom() ?? $this->getEntriesRss1() ?? $this->getEntriesRss2() ?? [];
}
@@ -168,6 +172,7 @@ class Feed extends Construct implements \MensBeam\Lax\Parser\Feed {
return $this->fetchString("apple:complete", "(?-i:Yes)") ? true : null; // case-sensitive pattern
}
+ /** Fetches the "time-to-live" value (a number of minutes before the feed should be re-fetched) from an RSS 2.0 feed */
protected function getSchedIntervalRss2(): ?\DateInterval {
$ttl = (int) $this->fetchString("ttl", "\d+");
if ($ttl) {
@@ -176,6 +181,12 @@ class Feed extends Construct implements \MensBeam\Lax\Parser\Feed {
return null;
}
+ /** Fetches the schedule interval from an RSS feed; this is necessarily approximate:
+ *
+ * The interval is defined in the syndication RSS extension as fractions of a period, but PHP only supports integer intervals, so we perform integer division on the nearest subdivision of a period, returning at least one.
+ *
+ * For example, "four times monthly" first assumes a month is 30 days, and divides this by four to yield seven days.
+ */
protected function getSchedIntervalRss1(): ?\DateInterval {
$period = $this->fetchString("sched:updatePeriod", "(?:year|month|week|dai|hour)ly");
if ($period) {
diff --git a/tests/cases/AbstractParserTestCase.php b/tests/cases/AbstractParserTestCase.php
index bb705bd..264182d 100644
--- a/tests/cases/AbstractParserTestCase.php
+++ b/tests/cases/AbstractParserTestCase.php
@@ -76,6 +76,8 @@ class AbstractParserTestCase extends \PHPUnit\Framework\TestCase {
foreach ($output as $k => $v) {
if (in_array($k, ["title", "summary"])) {
$f->$k = $this->makeText($v);
+ } elseif ($k === "dateModified") {
+ $f->$k = new Date($v, new \DateTimeZone("UTC"));
} elseif ($k === "people") {
$c = new PersonCollection;
foreach ($v as $m) {
diff --git a/tests/cases/XML/feed-atom.yaml b/tests/cases/XML/feed-atom.yaml
index cccc4db..0c375f4 100644
--- a/tests/cases/XML/feed-atom.yaml
+++ b/tests/cases/XML/feed-atom.yaml
@@ -353,3 +353,45 @@ Ignored text constructs:
output:
format: atom
version: '1.0'
+
+Feed date:
+ input: >
+
+ 2020-03-03T00:00:00Z
+
+ output:
+ format: atom
+ version: '1.0'
+ dateModified: '2020-03-03T00:00:00Z'
+
+Multiple feed dates:
+ input: >
+
+ 2020-03-03T00:00:00Z
+ 2020-03-04T00:00:00Z
+
+ output:
+ format: atom
+ version: '1.0'
+ dateModified: '2020-03-04T00:00:00Z'
+
+Multiple feed date timezones:
+ input: >
+
+ 2020-03-03T00:00:00Z
+ 2020-03-03T00:00:00-04:00
+ 2020-03-03T01:00:00Z
+
+ output:
+ format: atom
+ version: '1.0'
+ dateModified: '2020-03-03T00:00:00-04:00'
+
+Bogus feed date:
+ input: >
+
+ 2020-03-03T00:00:00Z
+
+ output:
+ format: atom
+ version: '1.0'
diff --git a/tests/cases/XML/feed-rss1.yaml b/tests/cases/XML/feed-rss1.yaml
index dafce94..c2c48fb 100644
--- a/tests/cases/XML/feed-rss1.yaml
+++ b/tests/cases/XML/feed-rss1.yaml
@@ -194,9 +194,9 @@ Feed link:
input: >
-
- http://[example.net]/
- http://example.com/
+
+ http://[example.net]/
+ http://example.com/
output:
@@ -208,7 +208,7 @@ Feed title 1:
input: >
- Loose text
+ Loose text
output:
@@ -221,7 +221,7 @@ Feed title 2:
input: >
- Loose text
+ Loose text
output:
@@ -235,7 +235,7 @@ DC title:
input: >
- Plain text
+ Plain text
output:
@@ -248,7 +248,7 @@ Feed summary:
input: >
- Loose text
+ Loose text
output:
@@ -261,7 +261,7 @@ DC summary:
input: >
- Plain text
+ Plain text
output:
@@ -269,3 +269,29 @@ DC summary:
version: '1.0'
summary:
plain: 'Plain text'
+
+DC date:
+ input: >
+
+
+ 2020-03-03T00:00:00Z
+
+
+ output:
+ format: rdf
+ version: '1.0'
+ dateModified: '2020-03-03T00:00:00Z'
+
+Multiple DC dates:
+ input: >
+
+
+ 2020-03-03T00:00:00Z
+ 2020-03-03T00:00:00-04:00
+ 2020-03-03T01:00:00Z
+
+
+ output:
+ format: rdf
+ version: '1.0'
+ dateModified: '2020-03-03T00:00:00-04:00'
diff --git a/tests/cases/XML/feed-rss2.yaml b/tests/cases/XML/feed-rss2.yaml
index 6939a3b..a676416 100644
--- a/tests/cases/XML/feed-rss2.yaml
+++ b/tests/cases/XML/feed-rss2.yaml
@@ -330,3 +330,43 @@ Feed summary:
format: rss
summary:
loose: 'Loose text'
+
+Feed publication date:
+ input: >
+
+ 2020-03-03T00:00:00Z
+
+ output:
+ format: rss
+ dateModified: '2020-03-03T00:00:00Z'
+
+Feed build date:
+ input: >
+
+ 2020-03-03T00:00:00Z
+
+ output:
+ format: rss
+ dateModified: '2020-03-03T00:00:00Z'
+
+Multiple dates 1:
+ input: >
+
+ 2020-03-03T00:00:00Z
+ 2020-03-03T00:00:00-04:00
+ 2020-03-03T00:00:00Z
+
+ output:
+ format: rss
+ dateModified: '2020-03-03T00:00:00-04:00'
+
+Multiple dates 2:
+ input: >
+
+ 2020-03-03T00:00:00Z
+ 2020-03-03T00:00:00-04:00
+ 2020-03-03T00:00:00Z
+
+ output:
+ format: rss
+ dateModified: '2020-03-03T00:00:00-04:00'