diff --git a/lib/Parser/XML/Construct.php b/lib/Parser/XML/Construct.php index fe609fc..5c3a41d 100644 --- a/lib/Parser/XML/Construct.php +++ b/lib/Parser/XML/Construct.php @@ -26,6 +26,9 @@ abstract class Construct { protected const DATE_EARLIEST = 2; protected const DATE_ALL = 3; + protected const QUERY_AMBIGUOUS_DATES = "rss2:pubDate|rss2:lastBuildDate|dc:date|dc:available|dc:dateAccepted|dc:dateCopyrighted|dc:dateSubmitted|dc:issued|dc:modified|dc:valid|dct:date|dct:available|dct:dateAccepted|dct:dateCopyrighted|dct:dateSubmitted|dct:issued|dct:modified|dct:valid"; + protected const QUERY_RSS_PERMALINK = "rss2:guid[not(@isPermaLink) or @isPermaLink='true']"; + /** @var \DOMDocument */ protected $document; /** @var \DOMXPath */ @@ -359,7 +362,7 @@ abstract class Construct { } protected function getLinkRss2(): ?Url { - return $this->fetchUrl("rss2:link") ?? $this->fetchUrl("rss2:guid[not(@isPermalink) or @isPermalink='true']"); + return $this->fetchUrl("rss2:link") ?? $this->fetchUrl(self::QUERY_RSS_PERMALINK); } protected function getLinkRss1(): ?Url { diff --git a/lib/Parser/XML/Entry.php b/lib/Parser/XML/Entry.php index 93867f5..13ce060 100644 --- a/lib/Parser/XML/Entry.php +++ b/lib/Parser/XML/Entry.php @@ -88,8 +88,8 @@ class Entry extends Construct implements \MensBeam\Lax\Parser\Entry { well-defined semantics here. Thus the semantics of all the other formats are equal, and we want the latest date, whatever it is. */ - return $this->fetchDate("atom:updated", self::DATE_LATEST) // Atom update date - ?? $this->fetchDate("dc:date|rss2:pubDate|rss2:lastBuildDate", self::DATE_LATEST); // Latest other datee + return $this->fetchDate("atom:updated", self::DATE_LATEST) // Atom update date + ?? 
$this->fetchDate(self::QUERY_AMBIGUOUS_DATES, self::DATE_LATEST); // Latest other date } public function getDateCreated(): ?Date { @@ -98,8 +98,9 @@ class Entry extends Construct implements \MensBeam\Lax\Parser\Entry { formats are equal, and we want the earliest date, but only if there are at least two */ - return $this->fetchDate("atom:created", self::DATE_EARLIEST) // Atom creation date - ?? $this->getAssumedDateCreated(); // Earliest other date + return $this->fetchDate("atom:created", self::DATE_EARLIEST) // Atom creation date + ?? $this->fetchDate("dct:created|dc:created", self::DATE_LATEST) // Dublin Core creation date + ?? $this->getAssumedDateCreated(); // Earliest other date } public function getContent(): ?Text { @@ -146,7 +147,7 @@ class Entry extends Construct implements \MensBeam\Lax\Parser\Entry { */ protected function getLinkAndRelatedRss2(): array { $link = $this->fetchUrl("rss2:link"); - $guid = $this->fetchUrl("rss2:guid[not(@isPermalink) or @isPermalink='true']"); + $guid = $this->fetchUrl(self::QUERY_RSS_PERMALINK); if ($link && $guid) { if ($link->getScheme() !== $guid->getScheme() || $link->getAuthority() !== $guid->getAuthority()) { return [$guid, $link]; @@ -156,7 +157,7 @@ class Entry extends Construct implements \MensBeam\Lax\Parser\Entry { } protected function getAssumedDateCreated(): ?Date { - $dates = $this->fetchDate("dc:date|rss2:pubDate|rss2:lastBuildDate", self::DATE_ALL); + $dates = $this->fetchDate(self::QUERY_AMBIGUOUS_DATES, self::DATE_ALL); if (sizeof($dates) > 1) { return $dates[0]; } diff --git a/lib/Parser/XML/Feed.php b/lib/Parser/XML/Feed.php index a0f460c..5f2cfeb 100644 --- a/lib/Parser/XML/Feed.php +++ b/lib/Parser/XML/Feed.php @@ -164,7 +164,7 @@ class Feed extends Construct implements \MensBeam\Lax\Parser\Feed { formats are equal, and we want the latest date, whatever it is. */ return $this->fetchDate("atom:updated", self::DATE_LATEST) - ?? 
$this->fetchDate("dc:date|rss2:pubDate|rss2:lastBuildDate", self::DATE_LATEST); + ?? $this->fetchDate(self::QUERY_AMBIGUOUS_DATES, self::DATE_LATEST); } public function getIcon(): ?Url { diff --git a/lib/Parser/XML/XPath.php b/lib/Parser/XML/XPath.php index 365c9c7..b096440 100644 --- a/lib/Parser/XML/XPath.php +++ b/lib/Parser/XML/XPath.php @@ -13,6 +13,7 @@ class XPath extends \DOMXpath { 'rss1' => "http://purl.org/rss/1.0/", // RDF site summary 1.0 http://purl.org/rss/1.0/spec 'rss0' => "http://channel.netscape.com/rdf/simple/0.9/", // RDF Site Summary 0.90 http://www.rssboard.org/rss-0-9-0 'dc' => "http://purl.org/dc/elements/1.1/", // Dublin Core metadata http://purl.org/rss/1.0/modules/dc/ + 'dct' => "http://purl.org/dc/terms/", // Dublin Core terms https://web.archive.org/web/20071222055924/http://web.resource.org/rss/1.0/modules/dcterms/ 'sched' => "http://purl.org/rss/1.0/modules/syndication/", // Syndication schedule extension http://purl.org/rss/1.0/modules/syndication/ 'enc' => "http://purl.org/rss/1.0/modules/content/", // Explicitly encoded content extension http://purl.org/rss/1.0/modules/content/ 'media' => "http://search.yahoo.com/mrss/", // Embedded media extension http://www.rssboard.org/media-rss diff --git a/tests/cases/XML/entry-rss2.yaml b/tests/cases/XML/entry-rss2.yaml index 90faa24..bf9bc8b 100644 --- a/tests/cases/XML/entry-rss2.yaml +++ b/tests/cases/XML/entry-rss2.yaml @@ -10,7 +10,7 @@ GUID: input: > - blah + blah output: @@ -22,7 +22,7 @@ Language: input: > - blah + blah fr @@ -36,7 +36,7 @@ Entry link: input: > - http://example.com/ + http://example.com/ http://example.com/ @@ -58,7 +58,7 @@ Related link: input: > - http://example.com/ + http://example.com/ http://example.net/ @@ -66,7 +66,7 @@ Related link: http://example.net/ - http://example.com/ + http://example.com/ http://example.net/ diff --git a/tests/cases/XML/feed-rss2.yaml b/tests/cases/XML/feed-rss2.yaml index 67bfa71..a60365d 100644 --- a/tests/cases/XML/feed-rss2.yaml 
+++ b/tests/cases/XML/feed-rss2.yaml @@ -9,7 +9,7 @@ Channel GUID: input: > - http://example.com/ + http://example.com/ output: @@ -20,7 +20,7 @@ Channel GUID with whitespace: input: > - + http://example.com/ @@ -33,7 +33,7 @@ Root GUID: # Any elements on the RSS2 root element should be ignored input: > - http://example.com/ + http://example.com/ output: format: rss @@ -43,7 +43,7 @@ Bogus GUID before good: - http://example.com/ + http://example.com/ output: @@ -255,7 +255,7 @@ Feed link via GUID 1: Feed link via GUID 2: input: > - http://example.com/ + http://example.com/ output: format: rss @@ -265,7 +265,7 @@ Feed link via GUID 2: GUID not a link: input: > - http://example.com/ + http://example.com/ output: format: rss