Browse Source

Fill out entries some more; test entry IDs, lang

master
J. King 4 years ago
parent
commit
d90af87f96
  1. 58
      lib/Parser/XML/Entry.php
  2. 6
      lib/Parser/XML/Feed.php
  3. 4
      lib/Parser/XML/XPath.php
  4. 10
      tests/cases/XML/entry-rss0.yaml
  5. 54
      tests/cases/XML/entry-rss1.yaml
  6. 33
      tests/cases/XML/entry-rss2.yaml

58
lib/Parser/XML/Entry.php

@ -47,19 +47,21 @@ class Entry extends Construct implements \MensBeam\Lax\Parser\Entry {
}
public function getId(): ?string {
return $this->fetchString("atom:id", ".+") // Item identifier
?? $this->fetchString("dc:identifier", ".+") // Dublin Core identifier
?? $this->fetchString("rss2:guid", ".+"); // RSS 2.0 GUID
return $this->fetchString("atom:id", ".+") // Atom identifier
?? $this->fetchString("dc:identifier", ".+") // Dublin Core identifier
?? $this->fetchString("self::rss1:item/@rdf:about") // RSS 1.0 RDF identifier
?? $this->fetchString("rss2:guid", ".+"); // RSS 2.0 GUID, as string
}
public function getLink(): ?Url {
return $this->getLinkAtom() // Atom link
?? $this->getLinkRss1() // RSS 0.90 or RSS 1.0 link
?? $this->getLinkRss2(); // RSS 2.0 link
return $this->getLinkAtom() // Atom link
?? $this->getLinkRss1() // RSS 0.90 or RSS 1.0 link
?? $this->getLinkAndRelatedRss2()[0]; // RSS 2.0 GUID or link, as URL
}
public function getRelatedLink(): ?Url {
return $this->fetchAtomRelation("related", ["text/html", "application/xhtml+xml"]);
return $this->fetchAtomRelation("related", ["text/html", "application/xhtml+xml"]) // Atom related relation
?? $this->getLinkAndRelatedRss2()[1]; // RSS 2.0 link if different from GUID
}
public function getTitle(): ?Text {
@ -71,11 +73,22 @@ class Entry extends Construct implements \MensBeam\Lax\Parser\Entry {
}
public function getDateModified(): ?Date {
return null;
/* fetching a date works differently from other data as only Atom has
well-defined semantics here. Thus the semantics of all the other
formats are equal, and we want the latest date, whatever it is.
*/
return $this->fetchDate("atom:updated", self::DATE_LATEST) // Atom update date
?? $this->fetchDate("dc:date|rss2:pubDate|rss2:lastBuildDate", self::DATE_LATEST); // Latest other date
}
public function getDateCreated(): ?Date {
return null;
/* fetching a date works differently from other data as only Atom has
well-defined semantics here. Thus the semantics of all the other
formats are equal, and we want the earliest date, but only if
there are at least two
*/
return $this->fetchDate("atom:created", self::DATE_EARLIEST) // Atom creation date
?? $this->getAssumedDateCreated(); // Earliest other date
}
public function getContent(): ?Text {
@ -101,4 +114,31 @@ class Entry extends Construct implements \MensBeam\Lax\Parser\Entry {
public function getEnclosures(): EnclosureCollection {
return new EnclosureCollection;
}
/** Returns an indexed array containing the entry link (or null)
 * and the entry related link (or null)
 *
 * This follows the suggestion in RSS 2.0 that if the permalink-GUID
 * and link differ, then the latter is a related link. For our purposes
 * they are considered to differ if they have different schemes or
 * different authorities (as reported by getAuthority())
 *
 * A GUID is only treated as a permalink when its "isPermaLink"
 * attribute is absent or has the value "true"
 *
 * @return array Two-member list: [0] the entry link or null, [1] the related link or null
 */
protected function getLinkAndRelatedRss2(): array {
    $link = $this->fetchUrl("rss2:link");
    // RSS 2.0 spells the attribute "isPermaLink" and XPath names are
    // case-sensitive, so the specified spelling must be tested; the
    // all-lowercase-"l" variant is still honoured for lenient input
    $guid = $this->fetchUrl(
        "rss2:guid[not(@isPermaLink or @isPermalink)"
        ." or @isPermaLink='true' or @isPermalink='true']"
    );
    if ($link && $guid) {
        if ($link->getScheme() !== $guid->getScheme() || $link->getAuthority() !== $guid->getAuthority()) {
            // Different origin: the GUID is the entry's own link and the <link> is merely related
            return [$guid, $link];
        }
    }
    // Otherwise prefer the explicit link, falling back to the permalink GUID; there is no related link
    return [$link ?? $guid, null];
}
/** Guesses a creation date from the non-Atom date elements
 *
 * All dc:date, rss2:pubDate and rss2:lastBuildDate values are fetched;
 * the first member of the result is returned, but only when more than
 * one date was found. With a single date (or none) null is returned.
 *
 * NOTE(review): callers describe the result as the earliest date, which
 * presumes fetchDate() returns DATE_ALL results in ascending order - confirm
 */
protected function getAssumedDateCreated(): ?Date {
    $candidates = $this->fetchDate("dc:date|rss2:pubDate|rss2:lastBuildDate", self::DATE_ALL);
    return (count($candidates) > 1) ? $candidates[0] : null;
}
}

6
lib/Parser/XML/Feed.php

@ -127,9 +127,9 @@ class Feed extends Construct implements \MensBeam\Lax\Parser\Feed {
}
public function getUrl(): ?Url {
return $this->fetchAtomRelation("self") // Atom 'self' relation URL
?? $this->fetchUrl("self::rss1:channel/@rdf:about") // RDF-about URL from RSS 0.90 or RSS 1.0
?? $this->fetchUrl("apple:new-feed-url"); // iTunes podcast canonical URL
return $this->fetchAtomRelation("self", ["application/atom+xml"]) // Atom 'self' relation URL
?? $this->fetchUrl("self::rss1:channel/@rdf:about") // RDF-about URL from RSS 0.90 or RSS 1.0
?? $this->fetchUrl("apple:new-feed-url"); // iTunes podcast canonical URL
}
public function getLink(): ?Url {

4
lib/Parser/XML/XPath.php

@ -16,8 +16,8 @@ class XPath extends \DOMXpath {
'sched' => "http://purl.org/rss/1.0/modules/syndication/", // Syndication schedule extension http://purl.org/rss/1.0/modules/syndication/
'enc' => "http://purl.org/rss/1.0/modules/content/", // Explicitly encoded content extension http://purl.org/rss/1.0/modules/content/
'media' => "http://search.yahoo.com/mrss/", // Embedded media extension http://www.rssboard.org/media-rss
'rdf' => "http://www.w3.org/1999/02/22-rdf-syntax-ns#", // Resource Description Framework
'xhtml' => "http://www.w3.org/1999/xhtml", // XHTML
'rdf' => "http://www.w3.org/1999/02/22-rdf-syntax-ns#", // Resource Description Framework https://www.w3.org/TR/2014/REC-rdf11-concepts-20140225/
'xhtml' => "http://www.w3.org/1999/xhtml", // XHTML https://html.spec.whatwg.org/
'apple' => "http://www.itunes.com/dtds/podcast-1.0.dtd", // iTunes podcasts https://help.apple.com/itc/podcasts_connect/#/itcb54353390
'gplay' => "http://www.google.com/schemas/play-podcasts/1.0", // Google Play podcasts https://support.google.com/googleplay/podcasts/answer/6260341
];

10
tests/cases/XML/entry-rss0.yaml

@ -0,0 +1,10 @@
Empty entry:
input: >
<rdf:RDF xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#" xmlns="http://channel.netscape.com/rdf/simple/0.9/">
<channel>
<item/>
</channel>
</rdf:RDF>
output:
format: rdf
version: '0.90'

54
tests/cases/XML/entry-rss1.yaml

@ -0,0 +1,54 @@
# For the purposes of testing Dublin Core metadata is considered a part of RSS 1.0
Empty entry:
input: >
<rdf:RDF xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#" xmlns="http://purl.org/rss/1.0/">
<channel>
<item/>
</channel>
</rdf:RDF>
output:
format: rdf
version: '1.0'
RDF identifier:
input: >
<rdf:RDF xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#" xmlns="http://purl.org/rss/1.0/" xmlns:dc="http://purl.org/dc/elements/1.1/">
<channel>
<item rdf:about="blah"/>
</channel>
</rdf:RDF>
output:
format: rdf
version: '1.0'
entries:
- id: blah
Dublin Core identifier:
input: >
<rdf:RDF xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#" xmlns="http://purl.org/rss/1.0/" xmlns:dc="http://purl.org/dc/elements/1.1/">
<channel>
<item rdf:about="bloo">
<dc:identifier>blah</dc:identifier>
</item>
</channel>
</rdf:RDF>
output:
format: rdf
version: '1.0'
entries:
- id: blah
Dublin Core identifier and language:
input: >
<rdf:RDF xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#" xmlns="http://purl.org/rss/1.0/" xmlns:dc="http://purl.org/dc/elements/1.1/">
<item rdf:about="blah">
<dc:language>fr</dc:language>
</item>
</rdf:RDF>
output:
format: rdf
version: '1.0'
entries:
- id: blah
lang: fr

33
tests/cases/XML/entry-rss2.yaml

@ -0,0 +1,33 @@
Empty entry:
input: >
<rss><channel>
<item/>
</channel></rss>
output:
format: rss
GUID:
input: >
<rss><channel>
<item>
<guid isPermalink="false">blah</guid>
</item>
</channel></rss>
output:
format: rss
entries:
- id: blah
Language:
input: >
<rss><channel>
<item>
<guid isPermalink="false">blah</guid>
<language>fr</language>
</item>
</channel></rss>
output:
format: rss
entries:
- id: blah
lang: fr
Loading…
Cancel
Save