Browse Source
Support RSS, RDF RSS (with various extensions), Atom, iTunes podcasts, and Dublin Core metadata; JSON Feed support is forthcoming. Currently feed-level titles, links, summaries, and categories are implemented. master
J. King
6 years ago
commit
92711159d0
8 changed files with 480 additions and 0 deletions
@ -0,0 +1,7 @@ |
|||
* text=auto encoding=utf-8 |
|||
|
|||
*.html diff=html |
|||
*.php diff=php |
|||
*.bat eol=crlf |
|||
*.cmd eol=crlf |
|||
.gitignore -eol |
@ -0,0 +1,2 @@ |
|||
vendor |
|||
samples |
@ -0,0 +1,26 @@ |
|||
{ |
|||
"name": "jkingweb/lax", |
|||
"type": "library", |
|||
"description": "A lax newsfeed parser", |
|||
"keywords": ["rss","atom","jsonfeed"], |
|||
"license": "MIT", |
|||
"authors": [ |
|||
{ |
|||
"name": "J. King", |
|||
"email": "jking@jkingweb.ca", |
|||
"homepage": "https://jkingweb.ca/" |
|||
} |
|||
|
|||
], |
|||
"require": { |
|||
"php": "^7.0", |
|||
"ext-json": "*", |
|||
"ext-dom": "*", |
|||
"sabre/uri": "^2.0" |
|||
}, |
|||
"autoload": { |
|||
"psr-4": { |
|||
"JKingWeb\\Lax\\": "lib/" |
|||
} |
|||
} |
|||
} |
@ -0,0 +1,73 @@ |
|||
{ |
|||
"_readme": [ |
|||
"This file locks the dependencies of your project to a known state", |
|||
"Read more about it at https://getcomposer.org/doc/01-basic-usage.md#composer-lock-the-lock-file", |
|||
"This file is @generated automatically" |
|||
], |
|||
"content-hash": "ddf62aa3f11d886da2b7ba796090469f", |
|||
"packages": [ |
|||
{ |
|||
"name": "sabre/uri", |
|||
"version": "2.1.1", |
|||
"source": { |
|||
"type": "git", |
|||
"url": "https://github.com/sabre-io/uri.git", |
|||
"reference": "a42126042c7dcb53e2978dadb6d22574d1359b4c" |
|||
}, |
|||
"dist": { |
|||
"type": "zip", |
|||
"url": "https://api.github.com/repos/sabre-io/uri/zipball/a42126042c7dcb53e2978dadb6d22574d1359b4c", |
|||
"reference": "a42126042c7dcb53e2978dadb6d22574d1359b4c", |
|||
"shasum": "" |
|||
}, |
|||
"require": { |
|||
"php": ">=7" |
|||
}, |
|||
"require-dev": { |
|||
"phpunit/phpunit": "^6.0", |
|||
"sabre/cs": "~1.0.0" |
|||
}, |
|||
"type": "library", |
|||
"autoload": { |
|||
"files": [ |
|||
"lib/functions.php" |
|||
], |
|||
"psr-4": { |
|||
"Sabre\\Uri\\": "lib/" |
|||
} |
|||
}, |
|||
"notification-url": "https://packagist.org/downloads/", |
|||
"license": [ |
|||
"BSD-3-Clause" |
|||
], |
|||
"authors": [ |
|||
{ |
|||
"name": "Evert Pot", |
|||
"email": "me@evertpot.com", |
|||
"homepage": "http://evertpot.com/", |
|||
"role": "Developer" |
|||
} |
|||
], |
|||
"description": "Functions for making sense out of URIs.", |
|||
"homepage": "http://sabre.io/uri/", |
|||
"keywords": [ |
|||
"rfc3986", |
|||
"uri", |
|||
"url" |
|||
], |
|||
"time": "2017-02-20T20:02:35+00:00" |
|||
} |
|||
], |
|||
"packages-dev": [], |
|||
"aliases": [], |
|||
"minimum-stability": "stable", |
|||
"stability-flags": [], |
|||
"prefer-stable": false, |
|||
"prefer-lowest": false, |
|||
"platform": { |
|||
"php": "^7.0", |
|||
"ext-json": "*", |
|||
"ext-dom": "*" |
|||
}, |
|||
"platform-dev": [] |
|||
} |
@ -0,0 +1,132 @@ |
|||
<?php |
|||
/** @license MIT |
|||
* Copyright 2018 J. King et al. |
|||
* See LICENSE and AUTHORS files for details */ |
|||
|
|||
declare(strict_types=1); |
|||
namespace JKingWeb\Lax; |
|||
|
|||
/** Shared DOM/XPath plumbing for XML newsfeed parsers
 *
 * Provides namespace-aware XPath helpers which evaluate queries relative to
 * the $subject node, plus text clean-up helpers.
 */
abstract class XMLCommon {
    /** @var \DOMDocument The document being examined */
    public $document;
    /** @var \DOMXPath The XPath processor used for every query issued by this class */
    protected $xpath;
    /** @var \DOMElement The context node against which XPath queries are evaluated */
    protected $subject;
    /** @var string Not referenced by any method of this class */
    protected $base = "";

    const NS = [
        'atom'  => "http://www.w3.org/2005/Atom",                  // Atom syndication format                https://tools.ietf.org/html/rfc4287
        'rss1'  => "http://purl.org/rss/1.0/",                     // RDF site summary 1.0                   http://purl.org/rss/1.0/spec
        'rss0'  => "http://channel.netscape.com/rdf/simple/0.9/",  // RDF Site Summary 0.90                  http://www.rssboard.org/rss-0-9-0
        'dc'    => "http://purl.org/dc/elements/1.1/",             // Dublin Core metadata                   http://purl.org/rss/1.0/modules/dc/
        'sched' => "http://purl.org/rss/1.0/modules/syndication/", // Syndication schedule extension         http://purl.org/rss/1.0/modules/syndication/
        'enc'   => "http://purl.org/rss/1.0/modules/content/",     // Explicitly encoded content extension   http://purl.org/rss/1.0/modules/content/
        'media' => "http://search.yahoo.com/mrss/",                // Embedded media extension               http://www.rssboard.org/media-rss
        // RSS 2.0 does not have a namespace                       // Really Simple Syndication 2.0.11       http://www.rssboard.org/rss-specification
        'rdf'   => "http://www.w3.org/1999/02/22-rdf-syntax-ns#",  // Resource Description Framework
        'xhtml' => "http://www.w3.org/1999/xhtml",                 // XHTML
        'apple' => "http://www.itunes.com/DTDs/Podcast-1.0.dtd"    // iTunes podcasts                        https://help.apple.com/itc/podcasts_connect/#/itcb54353390
    ];

    /** Returns an XPath processor for the given document with every prefix listed in self::NS registered */
    public static function getXPathProcessor(\DOMDocument $doc): \DOMXPath {
        $proc = new \DOMXPath($doc);
        foreach (self::NS as $prefix => $url) {
            $proc->registerNamespace($prefix, $url);
        }
        return $proc;
    }

    /** Trims plain text and collapses every run of whitespace (including a lone newline or tab) into a single space */
    protected function trimText(string $text): string {
        // preg_replace() yields null on a PCRE failure; fall back to the input rather than passing null to trim()
        return trim(preg_replace("<\s+>s", " ", $text) ?? $text);
    }

    /** Takes an HTML string as input and returns a sanitized version of that string
     *
     * The $outputHtml parameter, when false, outputs only the plain-text content of the sanitized HTML.
     *
     * No HTML sanitizer is implemented yet. Input which contains tags is therefore
     * reduced to its text content; when HTML output is requested that text is
     * escaped, which is lossy but never emits untrusted markup.
     */
    protected function sanitizeString(string $markup, bool $outputHtml = true): string {
        if (!preg_match("/<\S/", $markup)) {
            // if the string does not appear to actually contain markup besides entities, we can skip most of the sanitization
            return $outputHtml ? $markup : $this->trimText(html_entity_decode($markup, \ENT_QUOTES | \ENT_HTML5, "UTF-8"));
        }
        // strip_tags() is a crude tokenizer (a bare "<" followed by text is treated as a tag), but it is adequate for extracting text
        $text = $this->trimText(html_entity_decode(strip_tags($markup), \ENT_QUOTES | \ENT_HTML5, "UTF-8"));
        return $outputHtml ? htmlspecialchars($text, \ENT_QUOTES | \ENT_HTML5, "UTF-8") : $text;
    }

    /** Takes a DOM element as input and returns its sanitized content as a string
     *
     * The $outputHtml parameter, when false, outputs only the plain-text content of the element.
     *
     * As with sanitizeString(), no real sanitizer exists yet: HTML output is the
     * escaped text content of the element rather than its markup.
     */
    protected function sanitizeElement(\DOMElement $node, bool $outputHtml = true): string {
        $text = $this->trimText($node->textContent);
        return $outputHtml ? htmlspecialchars($text, \ENT_QUOTES | \ENT_HTML5, "UTF-8") : $text;
    }

    /** Retrieves the first element node matching an XPath query, or null if nothing matches */
    protected function fetchElement(string $query) {
        // the query is parenthesized so that [1] selects the first node of the whole result set, even for union expressions
        $node = $this->xpath->query("(".$query.")[1]", $this->subject);
        return ($node->length) ? $node->item(0) : null;
    }

    /** Retrieves multiple element nodes based on an XPath query */
    protected function fetchElements(string $query) {
        return $this->xpath->query($query, $this->subject);
    }

    /** Retrieves the trimmed text content of a DOM element based on an XPath query, or null if nothing matches */
    protected function fetchText(string $query) {
        $node = $this->fetchElement($query);
        return ($node) ? $this->trimText($node->textContent) : null;
    }

    /** Retrieves the trimmed text content of multiple DOM elements based on an XPath query
     *
     * Returns a list of strings in document order, or null if nothing matches
     */
    protected function fetchTextMulti(string $query) {
        $out = [];
        $nodes = $this->xpath->query($query, $this->subject);
        foreach ($nodes as $node) {
            // each iteration already yields a single node, not a node-list
            $out[] = $this->trimText($node->textContent);
        }
        return ($out) ? $out : null;
    }

    /** Retrieves the trimmed plain-text or HTML content of an Atom text construct based on an XPath query
     *
     * Returns null if nothing matches, if the construct has an unknown type, or
     * if an XHTML construct lacks its wrapping div element
     */
    protected function fetchTextAtom(string $query, bool $html = false) {
        $node = $this->fetchElement($query);
        if (!$node) {
            return null;
        }
        // an absent type attribute is treated the same as type="text"
        $type = $node->hasAttribute("type") ? $node->getAttribute("type") : "text";
        if ($type === "text") {
            $text = $this->trimText($node->textContent);
            return $html ? htmlspecialchars($text, \ENT_QUOTES | \ENT_HTML5) : $text;
        } elseif ($type === "xhtml") {
            $div = $node->getElementsByTagNameNS(self::NS['xhtml'], "div")->item(0);
            return $div ? $this->sanitizeElement($div, $html) : null;
        } elseif ($type === "html") {
            return $this->sanitizeString($node->textContent, $html);
        }
        return null;
    }

    /** Returns a node-list of Atom link elements with the desired relation or equivalents.
     *
     * Links without an href attribute are excluded.
     *
     * @see https://tools.ietf.org/html/rfc4287#section-4.2.7.2
     */
    protected function fetchAtomRelations(string $rel = ""): \DOMNodeList {
        $iana = "http://www.iana.org/assignments/relation/";
        if ($rel === "" || $rel === "alternate" || $rel === $iana."alternate") {
            $cond = "not(@rel) or @rel='' or @rel='alternate' or @rel='http://www.iana.org/assignments/relation/alternate'";
        } elseif (strpos($rel, ":") === false) {
            // FIXME: Checking only for a colon in a link relation is a hack that does not strictly follow IRI rules, but it's adequate for our needs
            $cond = "@rel=".self::quoteXPathString($rel)." or @rel=".self::quoteXPathString($iana.$rel);
        } elseif (strlen($rel) > strlen($iana) && strpos($rel, $iana) === 0) {
            // a registered relation given in its long form: match the short form as well
            $rel = substr($rel, strlen($iana));
            $cond = "@rel=".self::quoteXPathString($rel)." or @rel=".self::quoteXPathString($iana.$rel);
        } else {
            $cond = "@rel=".self::quoteXPathString($rel);
        }
        return $this->xpath->query("./atom:link[@href][$cond]", $this->subject);
    }

    /** Renders an arbitrary string as an XPath 1.0 string literal
     *
     * XPath 1.0 has no escape sequences, so a string containing both kinds of
     * quotation mark must be assembled with concat()
     */
    private static function quoteXPathString(string $value): string {
        if (strpos($value, "'") === false) {
            return "'".$value."'";
        }
        if (strpos($value, '"') === false) {
            return '"'.$value.'"';
        }
        // every apostrophe becomes its own double-quoted argument between single-quoted pieces
        return "concat('".str_replace("'", "',\"'\",'", $value)."')";
    }

    /** Resolves a relative URL against a base URL; a null base is treated as an empty string */
    protected function resolveUrl(string $url, string $base = null): string {
        $base = $base ?? "";
        return \Sabre\Uri\resolve($base, $url);
    }
}
@ -0,0 +1,121 @@ |
|||
<?php |
|||
/** @license MIT |
|||
* Copyright 2018 J. King et al. |
|||
* See LICENSE and AUTHORS files for details */ |
|||
|
|||
declare(strict_types=1); |
|||
namespace JKingWeb\Lax; |
|||
|
|||
/** Format-specific primitives shared by feeds and entries
 *
 * The host class must supply fetchText(), fetchTextMulti(), fetchTextAtom(),
 * fetchElements(), fetchAtomRelations(), trimText() and resolveUrl().
 *
 * Every primitive returns null when the feed/entry carries no such datum.
 */
trait XMLCommonPrimitives {

    /** Primitive to fetch an Atom feed/entry title
     *
     * This fetches the title in plain text rather than HTML, even if HTML is provided in the feed/entry
     */
    protected function getTitleAtom() {
        return $this->fetchTextAtom("./atom:title");
    }

    /** Primitive to fetch an RSS feed/entry title */
    protected function getTitleRss2() {
        return $this->fetchText("./title");
    }

    /** Primitive to fetch an RDF feed/entry title */
    protected function getTitleRss1() {
        return $this->fetchText("./rss1:title|./rss0:title");
    }

    /** Primitive to fetch a Dublin Core feed/entry title */
    protected function getTitleDC() {
        return $this->fetchText("./dc:title");
    }

    /** Primitive to fetch an Apple podcast/episode title */
    protected function getTitleApple() {
        return $this->fetchText("./apple:title");
    }

    /** Primitive to fetch an Atom feed/entry Web-representation URL
     *
     * The href of the first "alternate" link is resolved against that link's base URI
     */
    protected function getLinkAtom() {
        $links = $this->fetchAtomRelations();
        if (!$links->length) {
            return null;
        }
        $link = $links->item(0);
        return $this->resolveUrl($link->getAttribute("href"), $link->baseURI);
    }

    /** Primitive to fetch an RSS feed/entry Web-representation URL */
    protected function getLinkRss2() {
        return $this->fetchText("./link");
    }

    /** Primitive to fetch an RDF feed/entry Web-representation URL */
    protected function getLinkRss1() {
        return $this->fetchText("./rss1:link|./rss0:link");
    }

    /** Primitive to fetch Atom feed/entry categories
     *
     * If $grouped is true the result is an array of lists keyed by category scheme
     * (the empty string for categories without a scheme); otherwise it is a flat
     * list without duplicates, in document order.
     *
     * If $humanFriendly is true a category's label is preferred over its term.
     */
    protected function getCategoriesAtom(bool $grouped = false, bool $humanFriendly = true) {
        $nodes = $this->fetchElements("./atom:category[@term]");
        $groups = [];
        $flat = [];
        foreach ($nodes as $node) {
            $scheme = $node->getAttribute("scheme");
            $cat = ($humanFriendly && $node->hasAttribute("label")) ? $node->getAttribute("label") : $node->getAttribute("term");
            if (!isset($groups[$scheme])) {
                $groups[$scheme] = [];
            }
            if (!in_array($cat, $groups[$scheme], true)) {
                $groups[$scheme][] = $cat;
            }
            if (!in_array($cat, $flat, true)) {
                $flat[] = $cat;
            }
        }
        if (!$flat) {
            return null;
        }
        return $grouped ? $groups : $flat;
    }

    /** Primitive to fetch RSS feed/entry categories
     *
     * If $grouped is true the result is an array of lists keyed by category domain
     * (the empty string for categories without a domain); otherwise it is a flat
     * list without duplicates. $humanFriendly has no effect.
     */
    protected function getCategoriesRss2(bool $grouped = false, bool $humanFriendly = true) {
        if ($grouped) {
            $nodes = $this->fetchElements("./category");
            $out = [];
            foreach ($nodes as $node) {
                $domain = $node->getAttribute("domain");
                $cat = $this->trimText($node->textContent);
                if (!isset($out[$domain])) {
                    $out[$domain] = [];
                }
                if (!in_array($cat, $out[$domain], true)) {
                    $out[$domain][] = $cat;
                }
            }
            return $out ? $out : null;
        } else {
            $out = $this->fetchTextMulti("./category");
            // array_unique() keeps values as strings, whereas flipping the array would turn numeric categories into integers
            return $out ? array_values(array_unique($out)) : null;
        }
    }

    /** Primitive to fetch Dublin Core feed/entry categories
     *
     * Dublin Core doesn't have an obvious category type, so we use 'subject' as a nearest approximation.
     *
     * If $grouped is true the list is wrapped in an array under the empty-string key. $humanFriendly has no effect.
     */
    protected function getCategoriesDC(bool $grouped = false, bool $humanFriendly = true) {
        $out = $this->fetchTextMulti("./dc:subject");
        if ($out) {
            $out = array_values(array_unique($out));
            return $grouped ? ['' => $out] : $out;
        }
        return null;
    }

    /** Primitive to fetch Apple podcast/episode categories
     *
     * Categories are read from the "text" attribute; elements with an empty or missing attribute are skipped.
     *
     * If $grouped is true the list is wrapped in an array under the empty-string key. $humanFriendly has no effect.
     */
    protected function getCategoriesApple(bool $grouped = false, bool $humanFriendly = true) {
        $nodes = $this->fetchElements("./apple:category");
        $out = [];
        foreach ($nodes as $node) {
            $cat = $this->trimText($node->getAttribute("text"));
            if (strlen($cat)) {
                $out[] = $cat;
            }
        }
        if (!$out) {
            // report absence as null, like the other category primitives
            return null;
        }
        $out = array_values(array_unique($out));
        return $grouped ? ['' => $out] : $out;
    }
}
@ -0,0 +1,79 @@ |
|||
<?php |
|||
/** @license MIT |
|||
* Copyright 2018 J. King et al. |
|||
* See LICENSE and AUTHORS files for details */ |
|||
|
|||
declare(strict_types=1); |
|||
namespace JKingWeb\Lax; |
|||
|
|||
/** A newsfeed parsed from an RSS 2.0, RDF (RSS 0.90/1.0) or Atom document
 *
 * Constructing an instance parses the supplied XML and fills in the public
 * $url, $link, $title and $summary properties.
 */
class XMLFeed extends XMLCommon {
    use XMLCommonPrimitives;
    use XMLFeedPrimitives;

    /** @var string|null The URL passed to the constructor */
    public $url;
    /** @var string|null The feed's Web-representation URL, as returned by getLink() */
    public $link;
    /** @var string|null The feed's title, falling back to $link when the feed has no title */
    public $title;
    /** @var string|null The feed's description, as returned by getSummary() */
    public $summary;
    /** Not populated by parse(); call getCategories() to retrieve the feed's categories */
    public $categories;

    /** Returns a parsed feed
     *
     * @param string $data The XML source of the feed
     * @param string|null $contentType Accepted but not currently consulted
     * @param string|null $url The URL of the feed; it becomes the document URI of the parsed document
     * @throws \Exception if the data is not well-formed XML or is not a recognized feed format
     */
    public function __construct(string $data, string $contentType = null, string $url = null) {
        $this->init($data, $contentType, $url);
        $this->parse();
    }

    /** Performs initialization of the instance
     *
     * Parses the XML, sets up the XPath processor, and selects the element
     * which holds feed-level metadata: the channel element for RSS and RDF
     * (falling back to the root element if there is none), or the root
     * element itself for Atom.
     *
     * @throws \Exception if the data is not well-formed XML or is not a recognized feed format
     */
    protected function init(string $data, string $contentType = null, string $url = null) {
        $this->document = new \DOMDocument();
        // collect parser diagnostics instead of emitting PHP warnings, restoring the caller's setting afterwards
        $previous = libxml_use_internal_errors(true);
        try {
            // an empty string is rejected up front: loadXML() refuses it with a warning or an error depending on PHP version
            $loaded = $data !== "" && $this->document->loadXML($data, \LIBXML_BIGLINES | \LIBXML_COMPACT);
            $error = libxml_get_last_error();
        } finally {
            libxml_clear_errors();
            libxml_use_internal_errors($previous);
        }
        if (!$loaded || !$this->document->documentElement) {
            throw new \Exception("Feed could not be parsed as XML".($error ? ": ".trim($error->message) : ""));
        }
        $this->document->documentURI = $url;
        $this->xpath = self::getXPathProcessor($this->document);
        $this->subject = $this->document->documentElement;
        $ns = $this->subject->namespaceURI;
        $name = $this->subject->localName;
        if ($ns === null && $name === "rss") {
            $this->subject = $this->fetchElement("./channel[1]") ?? $this->subject;
        } elseif ($ns === self::NS['rdf'] && $name === "RDF") {
            $this->subject = $this->fetchElement("./rss1:channel|./rss0:channel") ?? $this->subject;
        } elseif ($ns === self::NS['atom'] && $name === "feed") {
            // nothing required for Atom
        } else {
            throw new \Exception("Document is not a recognized newsfeed format");
        }
        $this->url = $url;
    }

    /** Parses the feed to extract sundry metadata */
    protected function parse() {
        $this->link = $this->getLink();
        $this->title = $this->getTitle() ?? $this->link;
        $this->summary = $this->getSummary();
    }

    /** General function to fetch the feed title */
    public function getTitle() {
        return $this->getTitleAtom() ?? $this->getTitleRss1() ?? $this->getTitleRss2() ?? $this->getTitleDC() ?? $this->getTitleApple();
    }

    /** General function to fetch the feed's Web-representation URL */
    public function getLink() {
        return $this->getLinkAtom() ?? $this->getLinkRss1() ?? $this->getLinkRss2();
    }

    /** General function to fetch the description of a feed */
    public function getSummary() {
        // unlike most other data, Atom is not preferred, because Atom doesn't really have feed summaries
        // the Apple podcast summary is consulted last, as is done for titles
        return $this->getSummaryDC() ?? $this->getSummaryRss1() ?? $this->getSummaryRss2() ?? $this->getSummaryAtom() ?? $this->getSummaryApple();
    }

    /** General function to fetch the categories of a feed
     *
     * If the $grouped parameter is true, an array of arrays will be returned, keyed by taxonomy/scheme
     *
     * The $humanFriendly parameter only affects Atom categories
     */
    public function getCategories(bool $grouped = false, bool $humanFriendly = true) {
        return $this->getCategoriesAtom($grouped, $humanFriendly)
            ?? $this->getCategoriesRss2($grouped, $humanFriendly)
            ?? $this->getCategoriesDC($grouped, $humanFriendly)
            ?? $this->getCategoriesApple($grouped, $humanFriendly);
    }
}
@ -0,0 +1,40 @@ |
|||
<?php |
|||
/** @license MIT |
|||
* Copyright 2018 J. King et al. |
|||
* See LICENSE and AUTHORS files for details */ |
|||
|
|||
declare(strict_types=1); |
|||
namespace JKingWeb\Lax; |
|||
|
|||
/** Format-specific primitives for reading a feed-level summary
 *
 * The host class must supply fetchText() and fetchTextAtom()
 */
trait XMLFeedPrimitives {

    /** Primitive to fetch an Atom feed summary
     *
     * Atom has no counterpart to the 'description' element of the RSSes; its 'subtitle' element is the closest match and is used instead
     */
    protected function getSummaryAtom() {
        $query = "./atom:subtitle";
        return $this->fetchTextAtom($query);
    }

    /** Primitive to fetch an RSS feed summary from the 'description' element */
    protected function getSummaryRss2() {
        $query = "./description";
        return $this->fetchText($query);
    }

    /** Primitive to fetch an RDF feed summary from the RSS 1.0 or RSS 0.90 'description' element */
    protected function getSummaryRss1() {
        $query = "./rss1:description|./rss0:description";
        return $this->fetchText($query);
    }

    /** Primitive to fetch a Dublin Core feed summary from the 'description' element */
    protected function getSummaryDC() {
        $query = "./dc:description";
        return $this->fetchText($query);
    }

    /** Primitive to fetch an Apple podcast summary
     *
     * The 'summary' element is preferred; 'subtitle' is consulted only when there is no summary
     */
    protected function getSummaryApple() {
        $summary = $this->fetchText("./apple:summary");
        if ($summary !== null) {
            return $summary;
        }
        return $this->fetchText("./apple:subtitle");
    }
}
Loading…
Reference in new issue