Browse Source

Fix capitalization of isPermaLink

Also add support for DC Terms dates; tests to come
master
J. King 4 years ago
parent
commit
915852bb2b
  1. 5
      lib/Parser/XML/Construct.php
  2. 13
      lib/Parser/XML/Entry.php
  3. 2
      lib/Parser/XML/Feed.php
  4. 1
      lib/Parser/XML/XPath.php
  5. 10
      tests/cases/XML/entry-rss2.yaml
  6. 12
      tests/cases/XML/feed-rss2.yaml

5
lib/Parser/XML/Construct.php

@ -26,6 +26,9 @@ abstract class Construct {
protected const DATE_EARLIEST = 2;
protected const DATE_ALL = 3;
protected const QUERY_AMBIGUOUS_DATES = "rss2:pubDate|rss2:lastBuildDate|dc:date|dc:available|dc:dateAccepted|dc:dateCopyrighted|dc:dateSubmitted|dc:issued|dc:modified|dc:valid|dct:date|dct:available|dct:dateAccepted|dct:dateCopyrighted|dct:dateSubmitted|dct:issued|dct:modified|dct:valid";
protected const QUERY_RSS_PERMALINK = "rss2:guid[not(@isPermaLink) or @isPermaLink='true']";
/** @var \DOMDocument */
protected $document;
/** @var \DOMXPath */
@ -359,7 +362,7 @@ abstract class Construct {
}
protected function getLinkRss2(): ?Url {
return $this->fetchUrl("rss2:link") ?? $this->fetchUrl("rss2:guid[not(@isPermalink) or @isPermalink='true']");
return $this->fetchUrl("rss2:link") ?? $this->fetchUrl(self::QUERY_RSS_PERMALINK);
}
protected function getLinkRss1(): ?Url {

13
lib/Parser/XML/Entry.php

@ -88,8 +88,8 @@ class Entry extends Construct implements \MensBeam\Lax\Parser\Entry {
well-defined semantics here. Thus the semantics of all the other
formats are equal, and we want the latest date, whatever it is.
*/
return $this->fetchDate("atom:updated", self::DATE_LATEST) // Atom update date
?? $this->fetchDate("dc:date|rss2:pubDate|rss2:lastBuildDate", self::DATE_LATEST); // Latest other date
return $this->fetchDate("atom:updated", self::DATE_LATEST) // Atom update date
?? $this->fetchDate(self::QUERY_AMBIGUOUS_DATES, self::DATE_LATEST); // Latest other date
}
public function getDateCreated(): ?Date {
@ -98,8 +98,9 @@ class Entry extends Construct implements \MensBeam\Lax\Parser\Entry {
formats are equal, and we want the earliest date, but only if
there are at least two
*/
return $this->fetchDate("atom:created", self::DATE_EARLIEST) // Atom creation date
?? $this->getAssumedDateCreated(); // Earliest other date
return $this->fetchDate("atom:created", self::DATE_EARLIEST) // Atom creation date
?? $this->fetchDate("dct:created|dc:created", self::DATE_LATEST) // Dublin Core creation date
?? $this->getAssumedDateCreated(); // Earliest other date
}
public function getContent(): ?Text {
@ -146,7 +147,7 @@ class Entry extends Construct implements \MensBeam\Lax\Parser\Entry {
*/
protected function getLinkAndRelatedRss2(): array {
$link = $this->fetchUrl("rss2:link");
$guid = $this->fetchUrl("rss2:guid[not(@isPermalink) or @isPermalink='true']");
$guid = $this->fetchUrl(self::QUERY_RSS_PERMALINK);
if ($link && $guid) {
if ($link->getScheme() !== $guid->getScheme() || $link->getAuthority() !== $guid->getAuthority()) {
return [$guid, $link];
@ -156,7 +157,7 @@ class Entry extends Construct implements \MensBeam\Lax\Parser\Entry {
}
protected function getAssumedDateCreated(): ?Date {
$dates = $this->fetchDate("dc:date|rss2:pubDate|rss2:lastBuildDate", self::DATE_ALL);
$dates = $this->fetchDate(self::QUERY_AMBIGUOUS_DATES, self::DATE_ALL);
if (sizeof($dates) > 1) {
return $dates[0];
}

2
lib/Parser/XML/Feed.php

@ -164,7 +164,7 @@ class Feed extends Construct implements \MensBeam\Lax\Parser\Feed {
formats are equal, and we want the latest date, whatever it is.
*/
return $this->fetchDate("atom:updated", self::DATE_LATEST)
?? $this->fetchDate("dc:date|rss2:pubDate|rss2:lastBuildDate", self::DATE_LATEST);
?? $this->fetchDate(self::QUERY_AMBIGUOUS_DATES, self::DATE_LATEST);
}
public function getIcon(): ?Url {

1
lib/Parser/XML/XPath.php

@ -13,6 +13,7 @@ class XPath extends \DOMXpath {
'rss1' => "http://purl.org/rss/1.0/", // RDF site summary 1.0 http://purl.org/rss/1.0/spec
'rss0' => "http://channel.netscape.com/rdf/simple/0.9/", // RDF Site Summary 0.90 http://www.rssboard.org/rss-0-9-0
'dc' => "http://purl.org/dc/elements/1.1/", // Dublin Core metadata http://purl.org/rss/1.0/modules/dc/
'dct' => "http://purl.org/dc/terms/", // Dublin Core terms https://web.archive.org/web/20071222055924/http://web.resource.org/rss/1.0/modules/dcterms/
'sched' => "http://purl.org/rss/1.0/modules/syndication/", // Syndication schedule extension http://purl.org/rss/1.0/modules/syndication/
'enc' => "http://purl.org/rss/1.0/modules/content/", // Explicitly encoded content extension http://purl.org/rss/1.0/modules/content/
'media' => "http://search.yahoo.com/mrss/", // Embedded media extension http://www.rssboard.org/media-rss

10
tests/cases/XML/entry-rss2.yaml

@ -10,7 +10,7 @@ GUID:
input: >
<rss><channel>
<item>
<guid isPermalink="false">blah</guid>
<guid isPermaLink="false">blah</guid>
</item>
</channel></rss>
output:
@ -22,7 +22,7 @@ Language:
input: >
<rss><channel>
<item>
<guid isPermalink="false">blah</guid>
<guid isPermaLink="false">blah</guid>
<language>fr</language>
</item>
</channel></rss>
@ -36,7 +36,7 @@ Entry link:
input: >
<rss><channel>
<item>
<guid isPermalink="true">http://example.com/</guid>
<guid isPermaLink="true">http://example.com/</guid>
</item>
<item>
<guid>http://example.com/</guid>
@ -58,7 +58,7 @@ Related link:
input: >
<rss><channel>
<item>
<guid isPermalink="true">http://example.com/</guid>
<guid isPermaLink="true">http://example.com/</guid>
<link>http://example.net/</link>
</item>
<item>
@ -66,7 +66,7 @@ Related link:
<link>http://example.net/</link>
</item>
<item>
<guid isPermalink="false">http://example.com/</guid>
<guid isPermaLink="false">http://example.com/</guid>
<link>http://example.net/</link>
</item>
<item>

12
tests/cases/XML/feed-rss2.yaml

@ -9,7 +9,7 @@ Channel GUID:
input: >
<rss>
<channel>
<guid isPermalink="false">http://example.com/</guid>
<guid isPermaLink="false">http://example.com/</guid>
</channel>
</rss>
output:
@ -20,7 +20,7 @@ Channel GUID with whitespace:
input: >
<rss>
<channel>
<guid isPermalink="false">
<guid isPermaLink="false">
http://example.com/
</guid>
</channel>
@ -33,7 +33,7 @@ Root GUID: # Any elements on the RSS2 root element should be ignored
input: >
<rss>
<channel/>
<guid isPermalink="false">http://example.com/</guid>
<guid isPermaLink="false">http://example.com/</guid>
</rss>
output:
format: rss
@ -43,7 +43,7 @@ Bogus GUID before good:
<rss>
<channel>
<guid/>
<guid isPermalink="false">http://example.com/</guid>
<guid isPermaLink="false">http://example.com/</guid>
</channel>
</rss>
output:
@ -255,7 +255,7 @@ Feed link via GUID 1:
Feed link via GUID 2:
input: >
<rss><channel>
<guid isPermalink='true'>http://example.com/</guid>
<guid isPermaLink='true'>http://example.com/</guid>
</channel></rss>
output:
format: rss
@ -265,7 +265,7 @@ Feed link via GUID 2:
GUID not a link:
input: >
<rss><channel>
<guid isPermalink='maybe not'>http://example.com/</guid>
<guid isPermaLink='maybe not'>http://example.com/</guid>
</channel></rss>
output:
format: rss

Loading…
Cancel
Save