From 92711159d0fa2a4fef35b741e36c42ed90366f62 Mon Sep 17 00:00:00 2001 From: "J. King" Date: Wed, 21 Feb 2018 23:51:50 -0500 Subject: [PATCH] Initial prototype of feed parser Support RSS, RDF RSS (with various extensions), Atom, iTunes podcasts, and Dublin Core metadata; JSON Feed support is forthcoming Currently feed-level titles, links, summaries, and categories are implemented --- .gitattributes | 7 ++ .gitignore | 2 + composer.json | 26 +++++++ composer.lock | 73 ++++++++++++++++++++ lib/XMLCommon.php | 132 ++++++++++++++++++++++++++++++++++++ lib/XMLCommonPrimitives.php | 121 +++++++++++++++++++++++++++++++++ lib/XMLFeed.php | 79 +++++++++++++++++++++ lib/XMLFeedPrimitives.php | 40 +++++++++++ 8 files changed, 480 insertions(+) create mode 100644 .gitattributes create mode 100644 .gitignore create mode 100644 composer.json create mode 100644 composer.lock create mode 100644 lib/XMLCommon.php create mode 100644 lib/XMLCommonPrimitives.php create mode 100644 lib/XMLFeed.php create mode 100644 lib/XMLFeedPrimitives.php diff --git a/.gitattributes b/.gitattributes new file mode 100644 index 0000000..aaa63b7 --- /dev/null +++ b/.gitattributes @@ -0,0 +1,7 @@ +* text=auto encoding=utf-8 + +*.html diff=html +*.php diff=php +*.bat eol=crlf +*.cmd eol=crlf +.gitignore -eol diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..ccd60ab --- /dev/null +++ b/.gitignore @@ -0,0 +1,2 @@ +vendor +samples diff --git a/composer.json b/composer.json new file mode 100644 index 0000000..c2d1c23 --- /dev/null +++ b/composer.json @@ -0,0 +1,26 @@ +{ + "name": "jkingweb/lax", + "type": "library", + "description": "A lax newsfeed parser", + "keywords": ["rss","atom","jsonfeed"], + "license": "MIT", + "authors": [ + { + "name": "J. 
King", + "email": "jking@jkingweb.ca", + "homepage": "https://jkingweb.ca/" + } + + ], + "require": { + "php": "^7.0", + "ext-json": "*", + "ext-dom": "*", + "sabre/uri": "^2.0" + }, + "autoload": { + "psr-4": { + "JKingWeb\\Lax\\": "lib/" + } + } +} diff --git a/composer.lock b/composer.lock new file mode 100644 index 0000000..09194c8 --- /dev/null +++ b/composer.lock @@ -0,0 +1,73 @@ +{ + "_readme": [ + "This file locks the dependencies of your project to a known state", + "Read more about it at https://getcomposer.org/doc/01-basic-usage.md#composer-lock-the-lock-file", + "This file is @generated automatically" + ], + "content-hash": "ddf62aa3f11d886da2b7ba796090469f", + "packages": [ + { + "name": "sabre/uri", + "version": "2.1.1", + "source": { + "type": "git", + "url": "https://github.com/sabre-io/uri.git", + "reference": "a42126042c7dcb53e2978dadb6d22574d1359b4c" + }, + "dist": { + "type": "zip", + "url": "https://api.github.com/repos/sabre-io/uri/zipball/a42126042c7dcb53e2978dadb6d22574d1359b4c", + "reference": "a42126042c7dcb53e2978dadb6d22574d1359b4c", + "shasum": "" + }, + "require": { + "php": ">=7" + }, + "require-dev": { + "phpunit/phpunit": "^6.0", + "sabre/cs": "~1.0.0" + }, + "type": "library", + "autoload": { + "files": [ + "lib/functions.php" + ], + "psr-4": { + "Sabre\\Uri\\": "lib/" + } + }, + "notification-url": "https://packagist.org/downloads/", + "license": [ + "BSD-3-Clause" + ], + "authors": [ + { + "name": "Evert Pot", + "email": "me@evertpot.com", + "homepage": "http://evertpot.com/", + "role": "Developer" + } + ], + "description": "Functions for making sense out of URIs.", + "homepage": "http://sabre.io/uri/", + "keywords": [ + "rfc3986", + "uri", + "url" + ], + "time": "2017-02-20T20:02:35+00:00" + } + ], + "packages-dev": [], + "aliases": [], + "minimum-stability": "stable", + "stability-flags": [], + "prefer-stable": false, + "prefer-lowest": false, + "platform": { + "php": "^7.0", + "ext-json": "*", + "ext-dom": "*" + }, + 
"platform-dev": [] +} diff --git a/lib/XMLCommon.php b/lib/XMLCommon.php new file mode 100644 index 0000000..6add95d --- /dev/null +++ b/lib/XMLCommon.php @@ -0,0 +1,132 @@ + "http://www.w3.org/2005/Atom", // Atom syndication format https://tools.ietf.org/html/rfc4287 + 'rss1' => "http://purl.org/rss/1.0/", // RDF site summary 1.0 http://purl.org/rss/1.0/spec + 'rss0' => "http://channel.netscape.com/rdf/simple/0.9/", // RDF Site Summary 0.90 http://www.rssboard.org/rss-0-9-0 + 'dc' => "http://purl.org/dc/elements/1.1/", // Dublin Core metadata http://purl.org/rss/1.0/modules/dc/ + 'sched' => "http://purl.org/rss/1.0/modules/syndication/", // Syndication schedule extension http://purl.org/rss/1.0/modules/syndication/ + 'enc' => "http://purl.org/rss/1.0/modules/content/", // Explicitly encoded content extension http://purl.org/rss/1.0/modules/content/ + 'media' => "http://search.yahoo.com/mrss/", // Embedded media extension http://www.rssboard.org/media-rss + // RSS 2.0 does not have a namespace // Really Simple Syndication 2.0.11 http://www.rssboard.org/rss-specification + 'rdf' => "http://www.w3.org/1999/02/22-rdf-syntax-ns#", // Resource Description Framework + 'xhtml' => "http://www.w3.org/1999/xhtml", // XHTML + 'apple' => "http://www.itunes.com/DTDs/Podcast-1.0.dtd" // iTunes podcasts https://help.apple.com/itc/podcasts_connect/#/itcb54353390 + ]; + + /** Returns a new XPath processor for the given document with every prefix in self::NS registered */ + public static function getXPathProcessor(\DOMDocument $doc): \DOMXPath { + $proc = new \DOMXPath($doc); + foreach (self::NS as $prefix => $url) { + $proc->registerNamespace($prefix, $url); + } + return $proc; + } + + /** Trims plain text and collapses each run of two or more whitespace characters into a single space; a lone whitespace character inside the text is left as-is */ + protected function trimText(string $text): string { + return trim(preg_replace("<\s{2,}>s", " ", $text)); + } + + /** Takes an HTML string as input and returns a sanitized version of that string + * + * The $outputHtml parameter, when false, outputs only the 
plain-text content of the sanitized HTML + */ + protected function sanitizeString(string $markup, bool $outputHtml = true): string { + if (!preg_match("/<\S/", $markup)) { + // if the string does not appear to actually contain markup besides entities, we can skip most of the sanitization + return $outputHtml ? $markup : $this->trimText(html_entity_decode($markup, \ENT_QUOTES | \ENT_HTML5, "UTF-8")); + } else { + return "OOK!"; + } + } + + /** Retrieves the first element node matching an XPath query (evaluated relative to the current subject), or null if there is no match */ + protected function fetchElement(string $query) { + $node = $this->xpath->query("(".$query.")[1]", $this->subject); + return ($node->length) ? $node->item(0) : null; + } + + /** Retrieves multiple element nodes based on an XPath query */ + protected function fetchElements(string $query) { + return $this->xpath->query($query, $this->subject); + } + + /** Retrieves the trimmed text content of a DOM element based on an XPath query, or null if there is no match */ + protected function fetchText(string $query) { + $node = $this->fetchElement($query); + return ($node) ? $this->trimText($node->textContent) : null; + } + + /** Retrieves the trimmed text content of multiple DOM elements based on an XPath query; returns null rather than an empty array when nothing matches */ + protected function fetchTextMulti(string $query) { + $out = []; + $nodes = $this->xpath->query($query, $this->subject); + foreach ($nodes as $node) { + $out[] = $this->trimText($node->textContent); + } + return ($out) ? $out : null; + } + + /** Retrieves the trimmed plain-text or HTML content of an Atom text construct based on an XPath query */ + protected function fetchTextAtom(string $query, bool $html = false) { + $node = $this->fetchElement($query); + if ($node) { + if (!$node->hasAttribute("type") || $node->getAttribute("type")=="text") { + return $html ? 
htmlspecialchars($this->trimText($node->textContent), \ENT_QUOTES | \ENT_HTML5) : $this->trimText($node->textContent); + } elseif ($node->getAttribute("type")=="xhtml") { + $node = $node->getElementsByTagNameNS(self::NS['xhtml'], "div")->item(0); + return $node ? $this->sanitizeElement($node, $html) : null; // NOTE(review): sanitizeElement() is not defined in the code visible here; confirm it exists + } elseif ($node->getAttribute("type")=="html") { + return $this->sanitizeString($node->textContent, $html); + } else { + return null; + } + } else { + return null; + } + } + + /** Returns a node-list of Atom link elements with the desired relation or equivalents. + * + * Links without an href attribute are excluded. + * + * @see https://tools.ietf.org/html/rfc4287#section-4.2.7.2 + */ + protected function fetchAtomRelations(string $rel = ""): \DOMNodeList { + // FIXME: The XPath evaluation will fail if the relation contains an apostrophe. This is a known and difficult-to-overcome limitation of XPath 1.0 which I consider not worth the effort to address at this time + if ($rel=="" || $rel=="alternate" || $rel=="http://www.iana.org/assignments/relation/alternate") { + $cond = "not(@rel) or @rel='' or @rel='alternate' or @rel='http://www.iana.org/assignments/relation/alternate'"; + } elseif (strpos($rel, ":")===false) { + // FIXME: Checking only for a colon in a link relation is a hack that does not strictly follow IRI rules, but it's adequate for our needs + $cond = "@rel='$rel' or @rel='http://www.iana.org/assignments/relation/$rel'"; + } elseif (strlen($rel) > 41 && strpos($rel, "http://www.iana.org/assignments/relation/")===0) { + $rel = substr($rel, 41); // 41 is the length of the IANA prefix matched above; what remains is the short relation name + $cond = "@rel='$rel' or @rel='http://www.iana.org/assignments/relation/$rel'"; + } else { + $cond = "@rel='$rel'"; + } + return $this->xpath->query("./atom:link[@href][$cond]", $this->subject); + } + + /** Resolves a relative URL against a base URL; a null base is treated as an empty string */ + protected function resolveUrl(string $url, string $base = null): string { + $base = $base ?? 
""; + return \Sabre\Uri\resolve($base, $url); + } +} diff --git a/lib/XMLCommonPrimitives.php b/lib/XMLCommonPrimitives.php new file mode 100644 index 0000000..76ae1d1 --- /dev/null +++ b/lib/XMLCommonPrimitives.php @@ -0,0 +1,121 @@ +fetchTextAtom("./atom:title"); + } + + /** Primitive to fetch an RSS feed/entry title */ + protected function getTitleRss2() { + return $this->fetchText("./title"); + } + + /** Primitive to fetch an RDF feed/entry title */ + protected function getTitleRss1() { + return $this->fetchText("./rss1:title|./rss0:title"); + } + + /** Primitive to fetch a Dublin Core feed/entry title */ + protected function getTitleDC() { + return $this->fetchText("./dc:title"); + } + + /** Primitive to fetch an Apple podcast/episode title */ + protected function getTitleApple() { + return $this->fetchText("./apple:title"); + } + + /** Primitive to fetch an Atom feed/entry Web-representation URL, resolved against the link element's base URI */ + protected function getLinkAtom() { + $node = $this->fetchAtomRelations(); + return $node->length ? $this->resolveUrl($node->item(0)->getAttribute("href"), $node->item(0)->baseURI) : null; + } + + /** Primitive to fetch an RSS feed/entry Web-representation URL */ + protected function getLinkRss2() { + return $this->fetchText("./link"); + } + + /** Primitive to fetch an RDF feed/entry Web-representation URL */ + protected function getLinkRss1() { + return $this->fetchText("./rss1:link|./rss0:link"); + } + + /** Primitive to fetch Atom feed/entry categories; NOTE: $grouped is not consulted here, so the result is always an array of arrays keyed by scheme */ + protected function getCategoriesAtom(bool $grouped = false, bool $humanFriendly = true) { + $nodes = $this->fetchElements("./atom:category[@term]"); + $out = []; + foreach ($nodes as $node) { + $scheme = $node->getAttribute("scheme"); + $cat = ($humanFriendly && $node->hasAttribute("label")) ? $node->getAttribute("label") : $node->getAttribute("term"); + if (!isset($out[$scheme])) { + $out[$scheme] = []; + } + if (!in_array($cat, $out[$scheme], true)) { + $out[$scheme][] = $cat; + } + } + return $out ? 
$out : null; + } + + /** Primitive to fetch RSS feed/entry categories; when $grouped is true the result is keyed by each category's "domain" attribute */ + protected function getCategoriesRss2(bool $grouped = false, bool $humanFriendly = true) { + if ($grouped) { + $nodes = $this->fetchElements("./category"); + $out = []; + foreach ($nodes as $node) { + $domain = $node->getAttribute("domain"); + $cat = $this->trimText($node->textContent); + if (!isset($out[$domain])) { + $out[$domain] = []; + } + if (!in_array($cat, $out[$domain], true)) { + $out[$domain][] = $cat; + } + } + return $out ? $out : null; + } else { + $out = $this->fetchTextMulti("./category"); + return $out ? array_values(array_unique($out)) : null; + } + } + + /** Primitive to fetch Dublin Core feed/entry categories + * + * Dublin Core doesn't have an obvious category type, so we use 'subject' as a nearest approximation + */ + protected function getCategoriesDC(bool $grouped = false, bool $humanFriendly = true) { + $out = $this->fetchTextMulti("./dc:subject"); + if ($out) { + $out = array_values(array_unique($out)); + return $grouped ? ['' => $out] : $out; + } + return null; + } + + /** Primitive to fetch Apple podcast categories, read from the "text" attribute of each apple:category element */ + protected function getCategoriesApple(bool $grouped = false, bool $humanFriendly = true) { + $nodes = $this->fetchElements("./apple:category"); + $out = []; + foreach ($nodes as $node) { + $cat = $this->trimText($node->getAttribute("text")); + if (strlen($cat)) { + $out[] = $cat; + } + } + $out = array_values(array_unique($out)); + return $grouped ? 
['' => $out] : $out; + + } +} diff --git a/lib/XMLFeed.php b/lib/XMLFeed.php new file mode 100644 index 0000000..eea5b5d --- /dev/null +++ b/lib/XMLFeed.php @@ -0,0 +1,79 @@ +init($data, $contentType, $url); + $this->parse(); + } + + /** Performs initialization of the instance: loads the XML, sets up XPath, and selects the element which feed-level queries are evaluated against */ + protected function init(string $data, string $contentType = null, string $url = null) { + $this->document = new \DOMDocument(); + $this->document->loadXML($data, \LIBXML_BIGLINES | \LIBXML_COMPACT); + $this->document->documentURI = $url; + $this->xpath = self::getXPathProcessor($this->document); + $this->subject = $this->document->documentElement; + $ns = $this->subject->namespaceURI; + $name = $this->subject->localName; + if (is_null($ns) && $name=="rss") { + $this->subject = $this->fetchElement("./channel[1]") ?? $this->subject; // fall back to the root element if there is no channel + } elseif ($ns==self::NS['rdf'] && $name=="RDF") { + $this->subject = $this->fetchElement("./rss1:channel|./rss0:channel") ?? $this->subject; // fall back to the root element if there is no channel + } elseif ($ns==self::NS['atom'] && $name=="feed") { + // nothing required for Atom + } else { + throw new \Exception; // the root element is none of rss, rdf:RDF, or atom:feed + } + $this->url = $url; + + } + + /** Parses the feed to extract sundry metadata */ + protected function parse() { + $this->link = $this->getLink(); + $this->title = $this->getTitle() ?? $this->link; // the link stands in for a missing title + $this->summary = $this->getSummary(); + } + + /** General function to fetch the feed title; the first non-null primitive result wins */ + public function getTitle() { + return $this->getTitleAtom() ?? $this->getTitleRss1() ?? $this->getTitleRss2() ?? $this->getTitleDC() ?? $this->getTitleApple(); + } + + /** General function to fetch the feed's Web-representation URL; the first non-null primitive result wins */ + public function getLink() { + return $this->getLinkAtom() ?? $this->getLinkRss1() ?? $this->getLinkRss2(); + } + + /** General function to fetch the description of a feed */ + public function getSummary() { + // unlike most other data, Atom is not preferred, because Atom doesn't really have feed summaries + return $this->getSummaryDC() ?? $this->getSummaryRss1() ?? 
$this->getSummaryRss2() ?? $this->getSummaryAtom() ?? $this->getSummaryApple(); + } + + /** General function to fetch the categories of a feed + * + * If the $grouped parameter is true, an array of arrays will be returned, keyed by taxonomy/scheme + * + * The $humanFriendly parameter only affects Atom categories + */ + public function getCategories(bool $grouped = false, bool $humanFriendly = true) { + return $this->getCategoriesAtom($grouped, $humanFriendly) ?? $this->getCategoriesRss2($grouped, $humanFriendly) ?? $this->getCategoriesDC($grouped, $humanFriendly) ?? $this->getCategoriesApple($grouped, $humanFriendly); + } +} diff --git a/lib/XMLFeedPrimitives.php b/lib/XMLFeedPrimitives.php new file mode 100644 index 0000000..bd93fe0 --- /dev/null +++ b/lib/XMLFeedPrimitives.php @@ -0,0 +1,40 @@ +fetchTextAtom("./atom:subtitle"); + } + + /** Primitive to fetch an RSS feed summary */ + protected function getSummaryRss2() { + return $this->fetchText("./description"); + } + + /** Primitive to fetch an RDF feed summary */ + protected function getSummaryRss1() { + return $this->fetchText("./rss1:description|./rss0:description"); + } + + /** Primitive to fetch a Dublin Core feed summary */ + protected function getSummaryDC() { + return $this->fetchText("./dc:description"); + } + + /** Primitive to fetch an Apple podcast summary, falling back to the podcast subtitle when there is no summary element */ + protected function getSummaryApple() { + return $this->fetchText("./apple:summary") ?? $this->fetchText("./apple:subtitle"); + } + + +}