Browse Source

Initial prototype of feed parser

Support RSS, RDF RSS (with various extensions), Atom, iTunes podcasts, and Dublin Core metadata; JSON Feed support is forthcoming

Currently feed-level titles, links, summaries, and categories are implemented
master
J. King 6 years ago
commit
92711159d0
  1. 7
      .gitattributes
  2. 2
      .gitignore
  3. 26
      composer.json
  4. 73
      composer.lock
  5. 132
      lib/XMLCommon.php
  6. 121
      lib/XMLCommonPrimitives.php
  7. 79
      lib/XMLFeed.php
  8. 40
      lib/XMLFeedPrimitives.php

7
.gitattributes

@ -0,0 +1,7 @@
* text=auto encoding=utf-8
*.html diff=html
*.php diff=php
*.bat eol=crlf
*.cmd eol=crlf
.gitignore -eol

2
.gitignore

@ -0,0 +1,2 @@
vendor
samples

26
composer.json

@ -0,0 +1,26 @@
{
"name": "jkingweb/lax",
"type": "library",
"description": "A lax newsfeed parser",
"keywords": ["rss","atom","jsonfeed"],
"license": "MIT",
"authors": [
{
"name": "J. King",
"email": "jking@jkingweb.ca",
"homepage": "https://jkingweb.ca/"
}
],
"require": {
"php": "^7.0",
"ext-json": "*",
"ext-dom": "*",
"sabre/uri": "^2.0"
},
"autoload": {
"psr-4": {
"JKingWeb\\Lax\\": "lib/"
}
}
}

73
composer.lock

@ -0,0 +1,73 @@
{
"_readme": [
"This file locks the dependencies of your project to a known state",
"Read more about it at https://getcomposer.org/doc/01-basic-usage.md#composer-lock-the-lock-file",
"This file is @generated automatically"
],
"content-hash": "ddf62aa3f11d886da2b7ba796090469f",
"packages": [
{
"name": "sabre/uri",
"version": "2.1.1",
"source": {
"type": "git",
"url": "https://github.com/sabre-io/uri.git",
"reference": "a42126042c7dcb53e2978dadb6d22574d1359b4c"
},
"dist": {
"type": "zip",
"url": "https://api.github.com/repos/sabre-io/uri/zipball/a42126042c7dcb53e2978dadb6d22574d1359b4c",
"reference": "a42126042c7dcb53e2978dadb6d22574d1359b4c",
"shasum": ""
},
"require": {
"php": ">=7"
},
"require-dev": {
"phpunit/phpunit": "^6.0",
"sabre/cs": "~1.0.0"
},
"type": "library",
"autoload": {
"files": [
"lib/functions.php"
],
"psr-4": {
"Sabre\\Uri\\": "lib/"
}
},
"notification-url": "https://packagist.org/downloads/",
"license": [
"BSD-3-Clause"
],
"authors": [
{
"name": "Evert Pot",
"email": "me@evertpot.com",
"homepage": "http://evertpot.com/",
"role": "Developer"
}
],
"description": "Functions for making sense out of URIs.",
"homepage": "http://sabre.io/uri/",
"keywords": [
"rfc3986",
"uri",
"url"
],
"time": "2017-02-20T20:02:35+00:00"
}
],
"packages-dev": [],
"aliases": [],
"minimum-stability": "stable",
"stability-flags": [],
"prefer-stable": false,
"prefer-lowest": false,
"platform": {
"php": "^7.0",
"ext-json": "*",
"ext-dom": "*"
},
"platform-dev": []
}

132
lib/XMLCommon.php

@ -0,0 +1,132 @@
<?php
/** @license MIT
* Copyright 2018 J. King et al.
* See LICENSE and AUTHORS files for details */
declare(strict_types=1);
namespace JKingWeb\Lax;
abstract class XMLCommon {
    /** @var \DOMDocument The XML document being examined */
    public $document;
    /** @var \DOMXPath The XPath processor used to evaluate every query in this class */
    protected $xpath;
    /** @var \DOMElement The context node against which relative XPath queries are evaluated */
    protected $subject;
    protected $base = "";

    const NS = [
        'atom'  => "http://www.w3.org/2005/Atom",                  // Atom syndication format                https://tools.ietf.org/html/rfc4287
        'rss1'  => "http://purl.org/rss/1.0/",                     // RDF site summary 1.0                   http://purl.org/rss/1.0/spec
        'rss0'  => "http://channel.netscape.com/rdf/simple/0.9/",  // RDF Site Summary 0.90                  http://www.rssboard.org/rss-0-9-0
        'dc'    => "http://purl.org/dc/elements/1.1/",             // Dublin Core metadata                   http://purl.org/rss/1.0/modules/dc/
        'sched' => "http://purl.org/rss/1.0/modules/syndication/", // Syndication schedule extension         http://purl.org/rss/1.0/modules/syndication/
        'enc'   => "http://purl.org/rss/1.0/modules/content/",     // Explicitly encoded content extension   http://purl.org/rss/1.0/modules/content/
        'media' => "http://search.yahoo.com/mrss/",                // Embedded media extension               http://www.rssboard.org/media-rss
        // RSS 2.0 does not have a namespace                       // Really Simple Syndication 2.0.11       http://www.rssboard.org/rss-specification
        'rdf'   => "http://www.w3.org/1999/02/22-rdf-syntax-ns#",  // Resource Description Framework
        'xhtml' => "http://www.w3.org/1999/xhtml",                 // XHTML
        'apple' => "http://www.itunes.com/DTDs/Podcast-1.0.dtd"    // iTunes podcasts                        https://help.apple.com/itc/podcasts_connect/#/itcb54353390
    ];

    /** Returns an XPath processor with every prefix listed in self::NS registered against the supplied document */
    public static function getXPathProcessor(\DOMDocument $doc): \DOMXPath {
        $proc = new \DOMXPath($doc);
        foreach (self::NS as $prefix => $url) {
            $proc->registerNamespace($prefix, $url);
        }
        return $proc;
    }

    /** Trims plain text and collapses every run of whitespace (including a lone newline or tab) into a single space */
    protected function trimText(string $text): string {
        // preg_replace() yields null on a PCRE failure; fall back to the input so trim() never receives null under strict types
        $collapsed = preg_replace('/\s+/', " ", $text) ?? $text;
        return trim($collapsed);
    }

    /** Takes an HTML string as input and returns a sanitized version of that string
     *
     * The $outputHtml parameter, when false, outputs only the plain-text content of the sanitized HTML
     *
     * NOTE: sanitization of strings which actually contain markup is not implemented yet; a placeholder string is returned for them
     */
    protected function sanitizeString(string $markup, bool $outputHtml = true): string {
        if (!preg_match("/<\S/", $markup)) {
            // if the string does not appear to actually contain markup besides entities, we can skip most of the sanitization
            return $outputHtml ? $markup : $this->trimText(html_entity_decode($markup, \ENT_QUOTES | \ENT_HTML5, "UTF-8"));
        } else {
            return "OOK!";
        }
    }

    /** Takes a DOM element as input and returns a sanitized version of its content as a string
     *
     * When $outputHtml is false only the trimmed text content of the element is returned.
     * Otherwise the element's children are serialized and handed to sanitizeString(), so
     * both entry points share a single sanitization routine.
     */
    protected function sanitizeElement(\DOMElement $elem, bool $outputHtml = true): string {
        if (!$outputHtml) {
            return $this->trimText($elem->textContent);
        }
        $markup = "";
        foreach ($elem->childNodes as $child) {
            $markup .= $elem->ownerDocument->saveXML($child);
        }
        return $this->sanitizeString($markup, true);
    }

    /** Retrieves the first node matched by an XPath query evaluated against the subject, or null if nothing matches */
    protected function fetchElement(string $query) {
        // wrapping the query in parentheses makes [1] select the first node of the whole result rather than of each location step
        $node = $this->xpath->query("(".$query.")[1]", $this->subject);
        return ($node->length) ? $node->item(0) : null;
    }

    /** Retrieves every node matched by an XPath query evaluated against the subject */
    protected function fetchElements(string $query) {
        return $this->xpath->query($query, $this->subject);
    }

    /** Retrieves the trimmed text content of the first node matched by an XPath query, or null if nothing matches */
    protected function fetchText(string $query) {
        $node = $this->fetchElement($query);
        return ($node) ? $this->trimText($node->textContent) : null;
    }

    /** Retrieves the trimmed text content of every node matched by an XPath query
     *
     * Returns a list of strings, or null if nothing matches
     */
    protected function fetchTextMulti(string $query) {
        $out = [];
        foreach ($this->xpath->query($query, $this->subject) as $node) {
            // each member of the node-list is itself a node, so its text is read directly
            $out[] = $this->trimText($node->textContent);
        }
        return ($out) ? $out : null;
    }

    /** Retrieves the trimmed plain-text or HTML content of an Atom text construct based on an XPath query
     *
     * Returns null if the query matches nothing, if the construct's type is not one of
     * "text", "html" or "xhtml", or if an "xhtml" construct lacks its XHTML div element
     */
    protected function fetchTextAtom(string $query, bool $html = false) {
        $node = $this->fetchElement($query);
        if (!$node) {
            return null;
        }
        // a missing type attribute is treated the same as type="text"
        $type = $node->hasAttribute("type") ? $node->getAttribute("type") : "text";
        if ($type === "text") {
            $text = $this->trimText($node->textContent);
            return $html ? htmlspecialchars($text, \ENT_QUOTES | \ENT_HTML5, "UTF-8") : $text;
        } elseif ($type === "xhtml") {
            $div = $node->getElementsByTagNameNS(self::NS['xhtml'], "div")->item(0);
            return $div ? $this->sanitizeElement($div, $html) : null;
        } elseif ($type === "html") {
            return $this->sanitizeString($node->textContent, $html);
        }
        return null;
    }

    /** Returns a node-list of Atom link elements with the desired relation or equivalents.
     *
     * Links without an href attribute are excluded.
     *
     * @see https://tools.ietf.org/html/rfc4287#section-4.2.7.2
     */
    protected function fetchAtomRelations(string $rel = ""): \DOMNodeList {
        $iana = "http://www.iana.org/assignments/relation/";
        if ($rel === "" || $rel === "alternate" || $rel === $iana."alternate") {
            $cond = "not(@rel) or @rel='' or @rel='alternate' or @rel='".$iana."alternate'";
        } else {
            $isIanaUrl = strlen($rel) > strlen($iana) && strpos($rel, $iana) === 0;
            if ($isIanaUrl) {
                // reduce the long form to the bare relation name so that both forms can be matched
                $rel = substr($rel, strlen($iana));
            }
            // FIXME: Checking only for a colon in a link relation is a hack that does not strictly follow IRI rules, but it's adequate for our needs
            if ($isIanaUrl || strpos($rel, ":") === false) {
                $cond = "@rel=".self::xpathLiteral($rel)." or @rel=".self::xpathLiteral($iana.$rel);
            } else {
                $cond = "@rel=".self::xpathLiteral($rel);
            }
        }
        return $this->xpath->query("./atom:link[@href][$cond]", $this->subject);
    }

    /** Renders an arbitrary string as an XPath 1.0 string literal
     *
     * XPath 1.0 has no escape sequences, so a value containing both kinds of
     * quotation mark is expressed as a concat() of separately quoted pieces
     */
    private static function xpathLiteral(string $value): string {
        if (strpos($value, "'") === false) {
            return "'".$value."'";
        }
        if (strpos($value, '"') === false) {
            return '"'.$value.'"';
        }
        $parts = [];
        foreach (explode("'", $value) as $index => $piece) {
            if ($index > 0) {
                // re-insert the apostrophe that explode() consumed, quoted with double quotes
                $parts[] = "\"'\"";
            }
            if ($piece !== "") {
                $parts[] = "'".$piece."'";
            }
        }
        return "concat(".implode(",", $parts).")";
    }

    /** Resolves a relative URL against a base URL; a null base is treated as an empty string */
    protected function resolveUrl(string $url, string $base = null): string {
        $base = $base ?? "";
        return \Sabre\Uri\resolve($base, $url);
    }
}

121
lib/XMLCommonPrimitives.php

@ -0,0 +1,121 @@
<?php
/** @license MIT
* Copyright 2018 J. King et al.
* See LICENSE and AUTHORS files for details */
declare(strict_types=1);
namespace JKingWeb\Lax;
/** Primitives shared by feeds and entries
 *
 * The using class must supply fetchText(), fetchTextMulti(), fetchTextAtom(),
 * fetchElements(), fetchAtomRelations(), trimText() and resolveUrl()
 */
trait XMLCommonPrimitives {
    /** Primitive to fetch an Atom feed/entry title
     *
     * This fetches the title in plain text rather than HTML, even if HTML is provided in the feed/entry
     */
    protected function getTitleAtom() {
        return $this->fetchTextAtom("./atom:title");
    }

    /** Primitive to fetch an RSS feed/entry title */
    protected function getTitleRss2() {
        return $this->fetchText("./title");
    }

    /** Primitive to fetch an RDF feed/entry title */
    protected function getTitleRss1() {
        return $this->fetchText("./rss1:title|./rss0:title");
    }

    /** Primitive to fetch a Dublin Core feed/entry title */
    protected function getTitleDC() {
        return $this->fetchText("./dc:title");
    }

    /** Primitive to fetch an Apple podcast/episode title */
    protected function getTitleApple() {
        return $this->fetchText("./apple:title");
    }

    /** Primitive to fetch an Atom feed/entry Web-representation URL
     *
     * The href of the first matching link is resolved against that link element's base URI
     */
    protected function getLinkAtom() {
        $links = $this->fetchAtomRelations();
        if (!$links->length) {
            return null;
        }
        $link = $links->item(0);
        return $this->resolveUrl($link->getAttribute("href"), $link->baseURI);
    }

    /** Primitive to fetch an RSS feed/entry Web-representation URL */
    protected function getLinkRss2() {
        return $this->fetchText("./link");
    }

    /** Primitive to fetch an RDF feed/entry Web-representation URL */
    protected function getLinkRss1() {
        return $this->fetchText("./rss1:link|./rss0:link");
    }

    /** Primitive to fetch Atom feed/entry categories
     *
     * When $grouped is true an array of lists keyed by category scheme is returned
     * (categories without a scheme are keyed by the empty string); otherwise a flat
     * list is returned. Duplicates are removed and null is returned if there are no categories.
     *
     * When $humanFriendly is true a category's label is preferred over its term
     */
    protected function getCategoriesAtom(bool $grouped = false, bool $humanFriendly = true) {
        $out = [];
        foreach ($this->fetchElements("./atom:category[@term]") as $node) {
            $cat = ($humanFriendly && $node->hasAttribute("label")) ? $node->getAttribute("label") : $node->getAttribute("term");
            if ($grouped) {
                $scheme = $node->getAttribute("scheme");
                if (!isset($out[$scheme])) {
                    $out[$scheme] = [];
                }
                // strict comparison keeps distinct numeric-looking strings such as "1" and "01" apart
                if (!in_array($cat, $out[$scheme], true)) {
                    $out[$scheme][] = $cat;
                }
            } elseif (!in_array($cat, $out, true)) {
                $out[] = $cat;
            }
        }
        return $out ? $out : null;
    }

    /** Primitive to fetch RSS feed/entry categories
     *
     * When $grouped is true an array of lists keyed by category domain is returned
     * (categories without a domain are keyed by the empty string); otherwise a flat
     * list is returned. Duplicates are removed and null is returned if there are no categories.
     *
     * The $humanFriendly parameter has no effect for RSS
     */
    protected function getCategoriesRss2(bool $grouped = false, bool $humanFriendly = true) {
        $out = [];
        foreach ($this->fetchElements("./category") as $node) {
            $cat = $this->trimText($node->textContent);
            if ($grouped) {
                $domain = $node->getAttribute("domain");
                if (!isset($out[$domain])) {
                    $out[$domain] = [];
                }
                if (!in_array($cat, $out[$domain], true)) {
                    $out[$domain][] = $cat;
                }
            } elseif (!in_array($cat, $out, true)) {
                $out[] = $cat;
            }
        }
        return $out ? $out : null;
    }

    /** Primitive to fetch Dublin Core feed/entry categories
     *
     * Dublin Core doesn't have an obvious category type, so we use 'subject' as a nearest approximation
     *
     * When $grouped is true the list is wrapped in an array under the empty-string key.
     * The $humanFriendly parameter has no effect for Dublin Core
     */
    protected function getCategoriesDC(bool $grouped = false, bool $humanFriendly = true) {
        $out = $this->fetchTextMulti("./dc:subject");
        if (!$out) {
            return null;
        }
        // array_unique() keeps values as strings, where flipping them into array keys would turn "2018" into the integer 2018
        $out = array_values(array_unique($out));
        return $grouped ? ['' => $out] : $out;
    }

    /** Primitive to fetch Apple podcast categories
     *
     * Categories are read from the "text" attribute of apple:category children; empty values are skipped.
     *
     * When $grouped is true the list is wrapped in an array under the empty-string key.
     * Null is returned if there are no categories. The $humanFriendly parameter has no effect here
     */
    protected function getCategoriesApple(bool $grouped = false, bool $humanFriendly = true) {
        $out = [];
        foreach ($this->fetchElements("./apple:category") as $node) {
            $cat = $this->trimText($node->getAttribute("text"));
            if ($cat !== "" && !in_array($cat, $out, true)) {
                $out[] = $cat;
            }
        }
        if (!$out) {
            // report absence the same way the other category primitives do
            return null;
        }
        return $grouped ? ['' => $out] : $out;
    }
}

79
lib/XMLFeed.php

@ -0,0 +1,79 @@
<?php
/** @license MIT
* Copyright 2018 J. King et al.
* See LICENSE and AUTHORS files for details */
declare(strict_types=1);
namespace JKingWeb\Lax;
/** A newsfeed parsed from an RSS 2.0, RDF (RSS 1.0/0.90) or Atom document */
class XMLFeed extends XMLCommon {
    use XMLCommonPrimitives;
    use XMLFeedPrimitives;

    /** @var string|null The URL supplied to the constructor */
    public $url;
    /** @var string|null The feed's Web-representation URL, as determined by getLink() */
    public $link;
    /** @var string|null The feed's title, falling back to its link when no title is found */
    public $title;
    /** @var string|null The feed's description, as determined by getSummary() */
    public $summary;
    /** Not assigned by parse(); categories are available through getCategories() */
    public $categories;

    /** Parses the supplied feed document and populates the instance
     *
     * @throws \Exception if the data is not well-formed XML or its root element is not a recognized feed
     */
    public function __construct(string $data, string $contentType = null, string $url = null) {
        $this->init($data, $contentType, $url);
        $this->parse();
    }

    /** Performs initialization of the instance
     *
     * Loads the document, prepares the XPath processor and selects the element
     * (the channel for RSS and RDF, the root for Atom) against which all queries run.
     * The $contentType parameter is currently unused.
     *
     * @throws \Exception if the data is not well-formed XML or its root element is not a recognized feed
     */
    protected function init(string $data, string $contentType = null, string $url = null) {
        $this->document = new \DOMDocument();
        // collect parser diagnostics internally rather than emitting PHP warnings for every malformed feed
        $previous = libxml_use_internal_errors(true);
        try {
            // LIBXML_NONET stops the parser from fetching network resources referenced by untrusted feed data
            $loaded = ($data !== "") && $this->document->loadXML($data, \LIBXML_BIGLINES | \LIBXML_COMPACT | \LIBXML_NONET);
        } finally {
            libxml_clear_errors();
            libxml_use_internal_errors($previous);
        }
        if (!$loaded || !$this->document->documentElement) {
            throw new \Exception("Provided data could not be parsed as XML");
        }
        $this->document->documentURI = $url;
        $this->xpath = self::getXPathProcessor($this->document);
        $this->subject = $this->document->documentElement;
        $ns = $this->subject->namespaceURI;
        $name = $this->subject->localName;
        if (is_null($ns) && $name === "rss") {
            // fall back to the root element if the channel is missing
            $this->subject = $this->fetchElement("./channel[1]") ?? $this->subject;
        } elseif ($ns === self::NS['rdf'] && $name === "RDF") {
            $this->subject = $this->fetchElement("./rss1:channel|./rss0:channel") ?? $this->subject;
        } elseif ($ns === self::NS['atom'] && $name === "feed") {
            // nothing required for Atom
        } else {
            throw new \Exception("Document is not a recognized RSS, RDF, or Atom feed");
        }
        $this->url = $url;
    }

    /** Parses the feed to extract sundry metadata */
    protected function parse() {
        $this->link = $this->getLink();
        $this->title = $this->getTitle() ?? $this->link;
        $this->summary = $this->getSummary();
    }

    /** General function to fetch the feed title */
    public function getTitle() {
        return $this->getTitleAtom()
            ?? $this->getTitleRss1()
            ?? $this->getTitleRss2()
            ?? $this->getTitleDC()
            ?? $this->getTitleApple();
    }

    /** General function to fetch the feed's Web-representation URL */
    public function getLink() {
        return $this->getLinkAtom()
            ?? $this->getLinkRss1()
            ?? $this->getLinkRss2();
    }

    /** General function to fetch the description of a feed */
    public function getSummary() {
        // unlike most other data, Atom is not preferred, because Atom doesn't really have feed summaries
        return $this->getSummaryDC()
            ?? $this->getSummaryRss1()
            ?? $this->getSummaryRss2()
            ?? $this->getSummaryAtom()
            ?? $this->getSummaryApple();
    }

    /** General function to fetch the categories of a feed
     *
     * If the $grouped parameter is true, an array of arrays will be returned, keyed by taxonomy/scheme
     *
     * The $humanFriendly parameter only affects Atom categories
     */
    public function getCategories(bool $grouped = false, bool $humanFriendly = true) {
        return $this->getCategoriesAtom($grouped, $humanFriendly)
            ?? $this->getCategoriesRss2($grouped, $humanFriendly)
            ?? $this->getCategoriesDC($grouped, $humanFriendly)
            ?? $this->getCategoriesApple($grouped, $humanFriendly);
    }
}

40
lib/XMLFeedPrimitives.php

@ -0,0 +1,40 @@
<?php
/** @license MIT
* Copyright 2018 J. King et al.
* See LICENSE and AUTHORS files for details */
declare(strict_types=1);
namespace JKingWeb\Lax;
/** Feed-level summary primitives, one per supported vocabulary
 *
 * The using class must supply fetchText() and fetchTextAtom()
 */
trait XMLFeedPrimitives {
    /** Primitive to fetch an Atom feed summary
     *
     * Atom has no 'description' element as the RSS family does; its 'subtitle' element serves roughly the same purpose and is used instead
     */
    protected function getSummaryAtom() {
        $query = "./atom:subtitle";
        return $this->fetchTextAtom($query);
    }

    /** Primitive to fetch an RSS feed summary from the 'description' element */
    protected function getSummaryRss2() {
        $query = "./description";
        return $this->fetchText($query);
    }

    /** Primitive to fetch an RDF feed summary from the RSS 1.0 or RSS 0.90 'description' element */
    protected function getSummaryRss1() {
        $query = "./rss1:description|./rss0:description";
        return $this->fetchText($query);
    }

    /** Primitive to fetch a Dublin Core feed summary from the 'description' element */
    protected function getSummaryDC() {
        $query = "./dc:description";
        return $this->fetchText($query);
    }

    /** Primitive to fetch an Apple podcast summary
     *
     * The 'summary' element is preferred; 'subtitle' is consulted only when no summary is found
     */
    protected function getSummaryApple() {
        $summary = $this->fetchText("./apple:summary");
        if ($summary !== null) {
            return $summary;
        }
        return $this->fetchText("./apple:subtitle");
    }
}
Loading…
Cancel
Save