Browse Source

Only match RSS2 elements in RSS2 documents

master
J. King 4 years ago
parent
commit
e93234c3df
  1. 10
      lib/Parser/XML/Construct.php
  2. 13
      lib/Parser/XML/Feed.php
  3. 12
      lib/Parser/XML/XPath.php

10
lib/Parser/XML/Construct.php

@ -272,7 +272,7 @@ abstract class Construct {
* Using RSS' <guid> for feed identifiers is non-standard, but harmless
*/
protected function getIdRss2(): ?string {
// NOTE(review): this page renders the hunk without +/- markers; the two returns below look like the removed and the added form of a single statement -- confirm against the commit
// Unprefixed element name
return $this->fetchString("guid", ".+");
// Same lookup with the "rss2:" prefix; XPath::query() removes that prefix from the expression only when its $rss2 flag is set
return $this->fetchString("rss2:guid", ".+");
}
/** Primitive to fetch a Dublin Core feed/entry identifier */
@ -295,7 +295,7 @@ abstract class Construct {
}
/** Primitive to fetch the language from the RSS 2.0 <language> element */
protected function getLangRss2(): ?string {
// NOTE(review): this page renders the hunk without +/- markers; the two returns below look like the removed and the added form of a single statement -- confirm against the commit
// Unprefixed element name
return $this->fetchString("language", ".+");
// Same lookup with the "rss2:" prefix; XPath::query() removes that prefix from the expression only when its $rss2 flag is set
return $this->fetchString("rss2:language", ".+");
}
protected function getLinkAtom(): ?Url {
@ -303,7 +303,7 @@ abstract class Construct {
}
/** Primitive to fetch an RSS 2.0 link: the <link> element is tried first, then a <guid> element selected by the permalink predicate */
protected function getLinkRss2(): ?Url {
// NOTE(review): this page renders the hunk without +/- markers; the two returns below look like the removed and the added form of a single statement -- confirm against the commit
// NOTE(review): the RSS 2.0 specification spells the guid attribute "isPermaLink" (capital L) and XPath attribute tests are case-sensitive, so "@isPermalink" presumably never matches a conforming feed and not(@isPermalink) would then hold for every <guid> -- confirm and correct the spelling if so
// Unprefixed element names
return $this->fetchUrl("link") ?? $this->fetchUrl("guid[not(@isPermalink) or @isPermalink='true']");
// Same lookups with the "rss2:" prefix; XPath::query() removes that prefix from the expression only when its $rss2 flag is set
return $this->fetchUrl("rss2:link") ?? $this->fetchUrl("rss2:guid[not(@isPermalink) or @isPermalink='true']");
}
protected function getLinkRss1(): ?Url {
@ -319,7 +319,7 @@ abstract class Construct {
}
/** Primitive to fetch the title from the RSS 2.0 <title> element, read with the TEXT_LOOSE mode */
protected function getTitleRss2(): ?Text {
// NOTE(review): this page renders the hunk without +/- markers; the two returns below look like the removed and the added form of a single statement -- confirm against the commit
// Unprefixed element name
return $this->fetchText("title", self::TEXT_LOOSE);
// Same lookup with the "rss2:" prefix; XPath::query() removes that prefix from the expression only when its $rss2 flag is set
return $this->fetchText("rss2:title", self::TEXT_LOOSE);
}
protected function getTitleDC(): ?Text {
@ -346,7 +346,7 @@ abstract class Construct {
protected function getCategoriesRss2(): ?CategoryCollection {
$out = new CategoryCollection;
foreach ($this->xpath->query("category") as $node) {
foreach ($this->xpath->query("rss2:category") as $node) {
$c = new Category;
$c->domain = $this->trimText($node->getAttribute("domain"));
$c->name = $this->trimText($node->textContent);

13
lib/Parser/XML/Feed.php

@ -53,6 +53,7 @@ class Feed extends Construct implements \MensBeam\Lax\Parser\Feed {
$this->subject = $this->fetchElement("channel") ?? $this->subject;
$feed->format = "rss";
$feed->version = $this->document->documentElement->hasAttribute("version") ? $this->document->documentElement->getAttribute("version") : null;
$this->xpath->rss2 = true;
} elseif ($ns === XPath::NS['rdf'] && $name === "RDF") {
$feed->format = "rdf";
$channel = $this->fetchElement("rss1:channel|rss0:channel");
@ -150,7 +151,7 @@ class Feed extends Construct implements \MensBeam\Lax\Parser\Feed {
?? $this->fetchText("dc:description", self::TEXT_PLAIN) // Dublin Core description
?? $this->fetchText("rss1:description", self::TEXT_LOOSE) // RSS 1.0 description
?? $this->fetchText("rss0:description", self::TEXT_LOOSE) // RSS 0.90 description
?? $this->fetchText("description", self::TEXT_LOOSE) // RSS 2.0 description
?? $this->fetchText("rss2:description", self::TEXT_LOOSE) // RSS 2.0 description
?? $this->fetchText("gplay:description", self::TEXT_PLAIN) // Google Play podcast description
?? $this->fetchText("apple:summary", self::TEXT_PLAIN) // iTunes podcast summary
?? $this->fetchText("apple:subtitle", self::TEXT_PLAIN); // iTunes podcast subtitle
@ -162,7 +163,7 @@ class Feed extends Construct implements \MensBeam\Lax\Parser\Feed {
formats are equal, and we want the latest date, whatever it is.
*/
return $this->fetchDate("atom:updated", self::DATE_LATEST)
?? $this->fetchDate("dc:date|pubDate|lastBuildDate", self::DATE_LATEST);
?? $this->fetchDate("dc:date|rss2:pubDate|rss2:lastBuildDate", self::DATE_LATEST);
}
public function getIcon(): ?Url {
@ -178,7 +179,7 @@ class Feed extends Construct implements \MensBeam\Lax\Parser\Feed {
?? $this->fetchUrl("/rdf:RDF/rss1:image/rss1:url") // RSS 1.0 root image
?? $this->fetchUrl("rss0:image/rss0:url") // RSS 0.90 channel image
?? $this->fetchUrl("/rdf:RDF/rss0:image/rss0:url") // RSS 0.90 root image
?? $this->fetchUrl("image/url") // RSS 2.0 channel image
?? $this->fetchUrl("rss2:image/rss2:url") // RSS 2.0 channel image
?? $this->fetchUrl("gplay:image/@href") // Google Play podcast image
?? $this->fetchUrl("apple:image/@href"); // iTunes podcast image
}
@ -211,7 +212,7 @@ class Feed extends Construct implements \MensBeam\Lax\Parser\Feed {
/** Fetches the "time-to-live" value (a number of minutes before the feed should be re-fetched) from an RSS 2.0 feed */
protected function getSchedIntervalRss2(): ?\DateInterval {
$ttl = (int) $this->fetchString("ttl", "\d+");
$ttl = (int) $this->fetchString("rss2:ttl", "\d+");
if ($ttl) {
return new \DateInterval("PT{$ttl}M");
}
@ -247,7 +248,7 @@ class Feed extends Construct implements \MensBeam\Lax\Parser\Feed {
/** Computes the "skip-schedule" of an RSS feed, the set of days and hours during which a feed should not be fetched */
protected function getSchedSkipRss2(): ?int {
$out = 0;
foreach($this->fetchString("skipHours/hour", "\d+", true) ?? [] as $h) {
foreach($this->fetchString("rss2:skipHours/rss2:hour", "\d+", true) ?? [] as $h) {
$out |= [
Schedule::HOUR_0,
Schedule::HOUR_1,
@ -276,7 +277,7 @@ class Feed extends Construct implements \MensBeam\Lax\Parser\Feed {
Schedule::HOUR_0,
][(int) $h] ?? 0;
}
foreach($this->fetchString("skipDays/day", null, true) ?? [] as $d) {
foreach($this->fetchString("rss2:skipDays/rss2:day", null, true) ?? [] as $d) {
$out |= [
"monday" => Schedule::DAY_MON,
"tuesday" => Schedule::DAY_TUE,

12
lib/Parser/XML/XPath.php

@ -9,24 +9,32 @@ namespace MensBeam\Lax\Parser\XML;
/**
 * XPath processor which registers the namespace prefixes listed in NS and can strip the "rss2:" pseudo-prefix from expressions
 *
 * NOTE(review): this page renders the hunk without +/- markers, so removed and added lines appear side by side (the relocated RSS 2.0 comment in NS, the two foreach headers in the constructor) -- confirm against the commit before treating this text as the final source
 */
class XPath extends \DOMXpath {
/** Map of namespace prefix to namespace URI; every entry is registered by the constructor */
public const NS = [
'atom' => "http://www.w3.org/2005/Atom", // Atom syndication format https://tools.ietf.org/html/rfc4287
'rss2' => "", // RSS 2.0 does not have a namespace // Really Simple Syndication 2.0.11 http://www.rssboard.org/rss-specification
'rss1' => "http://purl.org/rss/1.0/", // RDF site summary 1.0 http://purl.org/rss/1.0/spec
'rss0' => "http://channel.netscape.com/rdf/simple/0.9/", // RDF Site Summary 0.90 http://www.rssboard.org/rss-0-9-0
'dc' => "http://purl.org/dc/elements/1.1/", // Dublin Core metadata http://purl.org/rss/1.0/modules/dc/
'sched' => "http://purl.org/rss/1.0/modules/syndication/", // Syndication schedule extension http://purl.org/rss/1.0/modules/syndication/
'enc' => "http://purl.org/rss/1.0/modules/content/", // Explicitly encoded content extension http://purl.org/rss/1.0/modules/content/
'media' => "http://search.yahoo.com/mrss/", // Embedded media extension http://www.rssboard.org/media-rss
// RSS 2.0 does not have a namespace // Really Simple Syndication 2.0.11 http://www.rssboard.org/rss-specification
'rdf' => "http://www.w3.org/1999/02/22-rdf-syntax-ns#", // Resource Description Framework
'xhtml' => "http://www.w3.org/1999/xhtml", // XHTML
'apple' => "http://www.itunes.com/dtds/podcast-1.0.dtd", // iTunes podcasts https://help.apple.com/itc/podcasts_connect/#/itcb54353390
'gplay' => "http://www.google.com/schemas/play-podcasts/1.0", // Google Play podcasts https://support.google.com/googleplay/podcasts/answer/6260341
];
/** When true, query() removes every "rss2:" substring from an expression before evaluating it; defaults to false */
public $rss2 = false;
/** Returns an XPath processor with various necessary namespace prefixes defined */
public function __construct(\DOMDocument $doc) {
parent::__construct($doc);
// The next two loop headers iterate the same constant, once named through the class and once through self
foreach (XPath::NS as $prefix => $url) {
foreach (self::NS as $prefix => $url) {
$this->registerNamespace($prefix, $url);
}
}
/** {@inheritDoc} */
public function query($expression, $contextnode = null, $registerNS = true) {
// str_replace() is a plain substring replacement, so any "rss2:" occurring inside a string literal of the expression is removed as well
// NOTE(review): when $rss2 is false the prefix is left in place and resolves to the empty namespace URI registered above; presumably such steps then match nothing -- confirm
$expression = $this->rss2 ? str_replace("rss2:", "", $expression) : $expression;
return parent::query($expression, $contextnode, $registerNS);
}
}

Loading…
Cancel
Save