diff --git a/lib/Parser/JSON/Feed.php b/lib/Parser/JSON/Feed.php index b638321..b36f289 100644 --- a/lib/Parser/JSON/Feed.php +++ b/lib/Parser/JSON/Feed.php @@ -122,6 +122,7 @@ class Feed implements \MensBeam\Lax\Parser\Feed { * JSON Feed does not have categories at the feed level, so this always returns and empty collection */ public function getCategories(): CategoryCollection { + // TODO: the cast extension does add (iTunes) categories to feeds return new CategoryCollection; } diff --git a/lib/Parser/XML/Construct.php b/lib/Parser/XML/Construct.php index 733ba3f..25cde4a 100644 --- a/lib/Parser/XML/Construct.php +++ b/lib/Parser/XML/Construct.php @@ -19,6 +19,11 @@ abstract class Construct { protected const TEXT_PLAIN = "plain"; protected const TEXT_HTML = "html"; + protected const DATE_ANY = 0; + protected const DATE_LATEST = 1; + protected const DATE_EARLIEST = 2; + protected const DATE_ALL = 3; + /** @var \DOMDocument */ protected $document; /** @var \DOMXPath */ @@ -68,22 +73,39 @@ abstract class Construct { * Returns null if no suitable nodes were found * * @param string $query The XPath query of the nodes to return - * @param bool|null $multi Whether to return multiple results as an array (true) or one result as a date object (false, default) + * @param int $mode Which date(s) to return: the first valid date found (DATE_ANY, default), the earliest chronologically (DATE_EARLIEST), the latest chronologically (DATE_LATEST), or all valid dates (DATE_ALL) in a sorted array * @param \DOMNode $context The context node for the XPath query * @return \MensBeam\Lax\Date|array|null */ - protected function fetchDate(string $query, ?bool $multi = null, \DOMNode $context = null) { + protected function fetchDate(string $query, int $mode = self::DATE_ANY, \DOMNode $context = null) { $out = []; + $tz = new \DateTimeZone("UTC"); + assert(in_array($mode, [self::DATE_ANY, self::DATE_ALL, self::DATE_EARLIEST, self::DATE_LATEST])); foreach((array) $this->fetchString($query, null, true, 
$context) as $d) { if ($d = $this->parseDate($d ?? "")) { - if (!$multi) { + if ($mode === self::DATE_ANY) { return $d; } else { - $out[] = $d; + // add the date to the output only if it is a unique moment in time so far + $ts = $d->setTimezone($tz)->format("Y-m-d\TH:i:s.u\Z"); + if (!isset($out[$ts])) { + $out[$ts] = $d; + } } } } - return $out ?: null; + // sort the dates earliest to latest and produce an indexed array + ksort($out); + $out = array_values($out); + // return based on requested mode + switch ($mode) { + case self::DATE_ALL: + return $out; + case self::DATE_EARLIEST: + return $out ? $out[0] : null; + case self::DATE_LATEST: + return $out ? array_pop($out) : null; + } } /** Returns the first valid URL matching an XPath query. Relative URLs are resolved when possible diff --git a/lib/Parser/XML/Feed.php b/lib/Parser/XML/Feed.php index 1c64594..0c283fc 100644 --- a/lib/Parser/XML/Feed.php +++ b/lib/Parser/XML/Feed.php @@ -88,7 +88,7 @@ class Feed extends Construct implements \MensBeam\Lax\Parser\Feed { $feed->link = $this->getLink(); $feed->title = $this->getTitle(); $feed->summary = $this->getSummary(); - //$feed->dateModified = $this->getDateModified(); + $feed->dateModified = $this->getDateModified(); //$feed->icon = $this->getIcon(); //$feed->image = $this->getImage(); //$feed->people = $this->getPeople(); @@ -135,6 +135,14 @@ class Feed extends Construct implements \MensBeam\Lax\Parser\Feed { return $this->getSummaryAtom() ?? $this->getSummaryDC() ?? $this->getSummaryRss1() ?? $this->getSummaryRss2() ?? $this->getSummaryPod(); } + public function getDateModified(): ?Date { + /* fetching a date works differently from other data as only Atom has + well-defined semantics here. Thus the semantics of all the other + formats are equal, and we want the latest date, whatever it is. + */ + return $this->fetchDate("atom:updated", self::DATE_LATEST) ?? 
$this->fetchDate("dc:date|pubDate|lastBuildDate", self::DATE_LATEST); + } + public function getCategories(): CategoryCollection { return $this->getCategoriesAtom() ?? $this->getCategoriesRss2() ?? $this->getCategoriesDC() ?? $this->getCategoriesPod() ?? new CategoryCollection; } @@ -147,10 +155,6 @@ class Feed extends Construct implements \MensBeam\Lax\Parser\Feed { return $authors->merge($contributors, $editors, $webmasters); } - public function getDateModified(): ?Date { - return $this->getDateModifiedAtom() ?? $this->getDateModifiedDC() ?? $this->getDateModifiedRss2(); - } - public function getEntries(FeedStruct $feed = null): array { return $this->getEntriesAtom() ?? $this->getEntriesRss1() ?? $this->getEntriesRss2() ?? []; } @@ -168,6 +172,7 @@ class Feed extends Construct implements \MensBeam\Lax\Parser\Feed { return $this->fetchString("apple:complete", "(?-i:Yes)") ? true : null; // case-sensitive pattern } + /** Fetches the "time-to-live" value (a number of minutes before the feed should be re-fetched) from an RSS 2.0 feed */ protected function getSchedIntervalRss2(): ?\DateInterval { $ttl = (int) $this->fetchString("ttl", "\d+"); if ($ttl) { @@ -176,6 +181,12 @@ class Feed extends Construct implements \MensBeam\Lax\Parser\Feed { return null; } + /** Fetches the schedule interval from an RSS feed; this is necessarily approximate: + * + * The interval is defined in the syndication RSS extension as fractions of a period, but PHP only supports integer intervals, so we perform integer division on the nearest subdivision of a period, returning at least one. + * + * For example, "four times monthly" first assumes a month is 30 days, and divides this by four to yield seven days. 
+ */ protected function getSchedIntervalRss1(): ?\DateInterval { $period = $this->fetchString("sched:updatePeriod", "(?:year|month|week|dai|hour)ly"); if ($period) { diff --git a/tests/cases/AbstractParserTestCase.php b/tests/cases/AbstractParserTestCase.php index bb705bd..264182d 100644 --- a/tests/cases/AbstractParserTestCase.php +++ b/tests/cases/AbstractParserTestCase.php @@ -76,6 +76,8 @@ class AbstractParserTestCase extends \PHPUnit\Framework\TestCase { foreach ($output as $k => $v) { if (in_array($k, ["title", "summary"])) { $f->$k = $this->makeText($v); + } elseif ($k === "dateModified") { + $f->$k = new Date($v, new \DateTimeZone("UTC")); } elseif ($k === "people") { $c = new PersonCollection; foreach ($v as $m) { diff --git a/tests/cases/XML/feed-atom.yaml b/tests/cases/XML/feed-atom.yaml index cccc4db..0c375f4 100644 --- a/tests/cases/XML/feed-atom.yaml +++ b/tests/cases/XML/feed-atom.yaml @@ -353,3 +353,45 @@ Ignored text constructs: output: format: atom version: '1.0' + +Feed date: + input: > + + 2020-03-03T00:00:00Z + + output: + format: atom + version: '1.0' + dateModified: '2020-03-03T00:00:00Z' + +Multiple feed dates: + input: > + + 2020-03-03T00:00:00Z + 2020-03-04T00:00:00Z + + output: + format: atom + version: '1.0' + dateModified: '2020-03-04T00:00:00Z' + +Multiple feed date timezones: + input: > + + 2020-03-03T00:00:00Z + 2020-03-03T00:00:00-04:00 + 2020-03-03T01:00:00Z + + output: + format: atom + version: '1.0' + dateModified: '2020-03-03T00:00:00-04:00' + +Bogus feed date: + input: > + + 2020-03-03T00:00:00Z + + output: + format: atom + version: '1.0' diff --git a/tests/cases/XML/feed-rss1.yaml b/tests/cases/XML/feed-rss1.yaml index dafce94..c2c48fb 100644 --- a/tests/cases/XML/feed-rss1.yaml +++ b/tests/cases/XML/feed-rss1.yaml @@ -194,9 +194,9 @@ Feed link: input: > - - http://[example.net]/ - http://example.com/ + + http://[example.net]/ + http://example.com/ output: @@ -208,7 +208,7 @@ Feed title 1: input: > - Loose text + Loose text 
output: @@ -221,7 +221,7 @@ Feed title 2: input: > - Loose text + Loose text output: @@ -235,7 +235,7 @@ DC title: input: > - Plain text + Plain text output: @@ -248,7 +248,7 @@ Feed summary: input: > - Loose text + Loose text output: @@ -261,7 +261,7 @@ DC summary: input: > - Plain text + Plain text output: @@ -269,3 +269,29 @@ DC summary: version: '1.0' summary: plain: 'Plain text' + +DC date: + input: > + + + 2020-03-03T00:00:00Z + + + output: + format: rdf + version: '1.0' + dateModified: '2020-03-03T00:00:00Z' + +Multiple DC dates: + input: > + + + 2020-03-03T00:00:00Z + 2020-03-03T00:00:00-04:00 + 2020-03-03T01:00:00Z + + + output: + format: rdf + version: '1.0' + dateModified: '2020-03-03T00:00:00-04:00' diff --git a/tests/cases/XML/feed-rss2.yaml b/tests/cases/XML/feed-rss2.yaml index 6939a3b..a676416 100644 --- a/tests/cases/XML/feed-rss2.yaml +++ b/tests/cases/XML/feed-rss2.yaml @@ -330,3 +330,43 @@ Feed summary: format: rss summary: loose: 'Loose text' + +Feed publication date: + input: > + + 2020-03-03T00:00:00Z + + output: + format: rss + dateModified: '2020-03-03T00:00:00Z' + +Feed build date: + input: > + + 2020-03-03T00:00:00Z + + output: + format: rss + dateModified: '2020-03-03T00:00:00Z' + +Multiple dates 1: + input: > + + 2020-03-03T00:00:00Z + 2020-03-03T00:00:00-04:00 + 2020-03-03T00:00:00Z + + output: + format: rss + dateModified: '2020-03-03T00:00:00-04:00' + +Multiple dates 2: + input: > + + 2020-03-03T00:00:00Z + 2020-03-03T00:00:00-04:00 + 2020-03-03T00:00:00Z + + output: + format: rss + dateModified: '2020-03-03T00:00:00-04:00'