Initial prototype of feed parser

Support RSS, RDF RSS (with various extensions), Atom, iTunes podcasts, and Dublin Core metadata; JSON Feed support is forthcoming. Currently, feed-level titles, links, summaries, and categories are implemented.
6 years ago · 92711159d0
8 changed files with 480 additions and 0 deletions
--- a/.gitattributes
+++ b/.gitattributes
@ -0,0 +1,7 @@
+* text=auto encoding=utf-8
+
+*.html     diff=html
+*.php      diff=php
+*.bat      eol=crlf
+*.cmd      eol=crlf
+.gitignore -eol
--- a/.gitignore
+++ b/.gitignore
@ -0,0 +1,2 @@
+vendor
+samples
--- a/composer.json
+++ b/composer.json
@ -0,0 +1,26 @@
+{
+    "name": "jkingweb/lax",
+    "type": "library",
+    "description": "A lax newsfeed parser",
+    "keywords": ["rss","atom","jsonfeed"],
+    "license": "MIT",
+    "authors": [
+        {
+            "name": "J. King",
+            "email": "jking@jkingweb.ca",
+            "homepage": "https://jkingweb.ca/"
+        }
+
+    ],
+    "require": {
+        "php": "^7.0",
+        "ext-json": "*",
+        "ext-dom": "*",
+        "sabre/uri": "^2.0"
+    },
+    "autoload": {
+        "psr-4": {
+            "JKingWeb\\Lax\\": "lib/"
+        }
+    }
+}
--- a/composer.lock
+++ b/composer.lock
@ -0,0 +1,73 @@
+{
+    "_readme": [
+        "This file locks the dependencies of your project to a known state",
+        "Read more about it at https://getcomposer.org/doc/01-basic-usage.md#composer-lock-the-lock-file",
+        "This file is @generated automatically"
+    ],
+    "content-hash": "ddf62aa3f11d886da2b7ba796090469f",
+    "packages": [
+        {
+            "name": "sabre/uri",
+            "version": "2.1.1",
+            "source": {
+                "type": "git",
+                "url": "https://github.com/sabre-io/uri.git",
+                "reference": "a42126042c7dcb53e2978dadb6d22574d1359b4c"
+            },
+            "dist": {
+                "type": "zip",
+                "url": "https://api.github.com/repos/sabre-io/uri/zipball/a42126042c7dcb53e2978dadb6d22574d1359b4c",
+                "reference": "a42126042c7dcb53e2978dadb6d22574d1359b4c",
+                "shasum": ""
+            },
+            "require": {
+                "php": ">=7"
+            },
+            "require-dev": {
+                "phpunit/phpunit": "^6.0",
+                "sabre/cs": "~1.0.0"
+            },
+            "type": "library",
+            "autoload": {
+                "files": [
+                    "lib/functions.php"
+                ],
+                "psr-4": {
+                    "Sabre\\Uri\\": "lib/"
+                }
+            },
+            "notification-url": "https://packagist.org/downloads/",
+            "license": [
+                "BSD-3-Clause"
+            ],
+            "authors": [
+                {
+                    "name": "Evert Pot",
+                    "email": "me@evertpot.com",
+                    "homepage": "http://evertpot.com/",
+                    "role": "Developer"
+                }
+            ],
+            "description": "Functions for making sense out of URIs.",
+            "homepage": "http://sabre.io/uri/",
+            "keywords": [
+                "rfc3986",
+                "uri",
+                "url"
+            ],
+            "time": "2017-02-20T20:02:35+00:00"
+        }
+    ],
+    "packages-dev": [],
+    "aliases": [],
+    "minimum-stability": "stable",
+    "stability-flags": [],
+    "prefer-stable": false,
+    "prefer-lowest": false,
+    "platform": {
+        "php": "^7.0",
+        "ext-json": "*",
+        "ext-dom": "*"
+    },
+    "platform-dev": []
+}
--- a/lib/XMLCommon.php
+++ b/lib/XMLCommon.php
@ -0,0 +1,132 @@
+<?php
+/** @license MIT
+ * Copyright 2018 J. King et al.
+ * See LICENSE and AUTHORS files for details */
+
+declare(strict_types=1);
+namespace JKingWeb\Lax;
+
+abstract class XMLCommon {
+    /** @var \DOMDocument The XML document being examined */
+    public    $document;
+    /** @var \DOMXPath The XPath processor against which all queries are evaluated */
+    protected $xpath;
+    /** @var \DOMElement The context node for relative XPath queries */
+    protected $subject;
+    protected $base = "";
+
+    /** Namespace prefixes available in XPath queries, mapped to their namespace URIs */
+    const NS = [
+        'atom'  => "http://www.w3.org/2005/Atom",                   // Atom syndication format                  https://tools.ietf.org/html/rfc4287
+        'rss1'  => "http://purl.org/rss/1.0/",                      // RDF site summary 1.0                     http://purl.org/rss/1.0/spec
+        'rss0'  => "http://channel.netscape.com/rdf/simple/0.9/",   // RDF Site Summary 0.90                    http://www.rssboard.org/rss-0-9-0
+        'dc'    => "http://purl.org/dc/elements/1.1/",              // Dublin Core metadata                     http://purl.org/rss/1.0/modules/dc/
+        'sched' => "http://purl.org/rss/1.0/modules/syndication/",  // Syndication schedule extension           http://purl.org/rss/1.0/modules/syndication/
+        'enc'   => "http://purl.org/rss/1.0/modules/content/",      // Explicitly encoded content extension     http://purl.org/rss/1.0/modules/content/
+        'media' => "http://search.yahoo.com/mrss/",                 // Embedded media extension                 http://www.rssboard.org/media-rss
+        // RSS 2.0 does not have a namespace                        // Really Simple Syndication 2.0.11         http://www.rssboard.org/rss-specification
+        'rdf'   => "http://www.w3.org/1999/02/22-rdf-syntax-ns#",   // Resource Description Framework
+        'xhtml' => "http://www.w3.org/1999/xhtml",                  // XHTML
+        'apple' => "http://www.itunes.com/DTDs/Podcast-1.0.dtd"     // iTunes podcasts                          https://help.apple.com/itc/podcasts_connect/#/itcb54353390
+    ];
+
+    /** Returns an XPath processor for a document, with every prefix listed in self::NS registered on it
+     *
+     * @param \DOMDocument $doc The document the processor will query
+     */
+    public static function getXPathProcessor(\DOMDocument $doc): \DOMXPath {
+        $proc = new \DOMXPath($doc);
+        foreach (self::NS as $prefix => $url) {
+            $proc->registerNamespace($prefix, $url);
+        }
+        return $proc;
+    }
+
+    /** Trims plain text and collapses every run of whitespace (including a lone tab or line break) into a single space */
+    protected function trimText(string $text): string {
+        // preg_replace yields null on failure; in that case fall back to merely trimming the input
+        return trim(preg_replace("<\s+>s", " ", $text) ?? $text);
+    }
+
+    /** Takes an HTML string as input and returns a sanitized version of that string
+     *
+     * The $outputHtml parameter, when false, outputs only the plain-text content of the sanitized HTML
+     *
+     * FIXME: Sanitization of strings which actually contain markup is not implemented yet; a placeholder string is returned for them
+     */
+    protected function sanitizeString(string $markup, bool $outputHtml = true): string {
+        if (preg_match("/<\S/", $markup)) {
+            // placeholder until real sanitization exists
+            return "OOK!";
+        }
+        // the string does not appear to contain markup besides entities, so most of the sanitization can be skipped
+        if ($outputHtml) {
+            return $markup;
+        }
+        return $this->trimText(html_entity_decode($markup, \ENT_QUOTES | \ENT_HTML5, "UTF-8"));
+    }
+
+    /** Takes a DOM element as input and returns a sanitized version of its content
+     *
+     * When $outputHtml is false the trimmed text content of the element is returned; otherwise
+     * the element's children are serialized and handed to sanitizeString(), and so are subject
+     * to the same limitations as that method
+     */
+    protected function sanitizeElement(\DOMElement $elem, bool $outputHtml = true): string {
+        if (!$outputHtml) {
+            // text content is already free of markup and has its entities decoded
+            return $this->trimText($elem->textContent);
+        }
+        $markup = "";
+        foreach ($elem->childNodes as $child) {
+            $markup .= $elem->ownerDocument->saveXML($child);
+        }
+        return $this->sanitizeString($markup, true);
+    }
+
+    /** Retrieves the first node matched by an XPath query, or null if nothing matches
+     *
+     * The query is evaluated relative to the current subject node
+     */
+    protected function fetchElement(string $query) {
+        $nodes = $this->xpath->query("(".$query.")[1]", $this->subject);
+        return ($nodes->length) ? $nodes->item(0) : null;
+    }
+
+    /** Retrieves all nodes matched by an XPath query, evaluated relative to the current subject node */
+    protected function fetchElements(string $query) {
+        return $this->xpath->query($query, $this->subject);
+    }
+
+    /** Retrieves the trimmed text content of a DOM element based on an XPath query
+     *
+     * Returns null if the query matches nothing
+     */
+    protected function fetchText(string $query) {
+        $node = $this->fetchElement($query);
+        return ($node) ? $this->trimText($node->textContent) : null;
+    }
+
+    /** Retrieves the trimmed text content of multiple DOM elements based on an XPath query
+     *
+     * Returns a list of strings in document order, or null if the query matches nothing
+     */
+    protected function fetchTextMulti(string $query) {
+        $out = [];
+        // iterating a node-list yields the nodes themselves, so their text is read directly
+        foreach ($this->xpath->query($query, $this->subject) as $node) {
+            $out[] = $this->trimText($node->textContent);
+        }
+        return ($out) ? $out : null;
+    }
+
+    /** Retrieves the trimmed plain-text or HTML content of an Atom text construct based on an XPath query
+     *
+     * Returns null if the query matches nothing, if the construct has an unknown type,
+     * or if an XHTML construct lacks its wrapping div element
+     */
+    protected function fetchTextAtom(string $query, bool $html = false) {
+        $node = $this->fetchElement($query);
+        if (!$node) {
+            return null;
+        }
+        // a missing type attribute is equivalent to type="text"
+        $type = $node->hasAttribute("type") ? $node->getAttribute("type") : "text";
+        if ($type==="text") {
+            $text = $this->trimText($node->textContent);
+            return $html ? htmlspecialchars($text, \ENT_QUOTES | \ENT_HTML5) : $text;
+        } elseif ($type==="xhtml") {
+            $div = $node->getElementsByTagNameNS(self::NS['xhtml'], "div")->item(0);
+            return $div ? $this->sanitizeElement($div, $html) : null;
+        } elseif ($type==="html") {
+            return $this->sanitizeString($node->textContent, $html);
+        }
+        return null;
+    }
+
+    /** Formats an arbitrary string as an XPath 1.0 string literal
+     *
+     * XPath 1.0 has no escape sequences, so a string containing both kinds of
+     * quotation mark must be assembled from pieces with concat()
+     */
+    private function xpathLiteral(string $value): string {
+        if (strpos($value, "'")===false) {
+            return "'".$value."'";
+        } elseif (strpos($value, '"')===false) {
+            return '"'.$value.'"';
+        }
+        return "concat('".str_replace("'", "',\"'\",'", $value)."')";
+    }
+
+    /** Returns a node-list of Atom link elements with the desired relation or equivalents.
+     *
+     * Links without an href attribute are excluded.
+     *
+     * @see https://tools.ietf.org/html/rfc4287#section-4.2.7.2
+     */
+    protected function fetchAtomRelations(string $rel = ""): \DOMNodeList {
+        $iana = "http://www.iana.org/assignments/relation/";
+        if ($rel==="" || $rel==="alternate" || $rel===$iana."alternate") {
+            // "alternate" is the default relation, so links without a relation qualify as well
+            $cond = "not(@rel) or @rel='' or @rel='alternate' or @rel='".$iana."alternate'";
+        } else {
+            $short = null;
+            if (strpos($rel, ":")===false) {
+                // FIXME: Checking only for a colon in a link relation is a hack that does not strictly follow IRI rules, but it's adequate for our needs
+                $short = $rel;
+            } elseif (strlen($rel) > strlen($iana) && strpos($rel, $iana)===0) {
+                $short = substr($rel, strlen($iana));
+            }
+            if (is_null($short)) {
+                // an extension relation matches only itself
+                $cond = "@rel=".$this->xpathLiteral($rel);
+            } else {
+                // a registered relation matches both its short name and its full IANA URL
+                $cond = "@rel=".$this->xpathLiteral($short)." or @rel=".$this->xpathLiteral($iana.$short);
+            }
+        }
+        return $this->xpath->query("./atom:link[@href][$cond]", $this->subject);
+    }
+
+    /** Resolves a relative URL against a base URL
+     *
+     * A null base is treated as an empty string
+     */
+    protected function resolveUrl(string $url, string $base = null): string {
+        $base = $base ?? "";
+        return \Sabre\Uri\resolve($base, $url);
+    }
+}
--- a/lib/XMLCommonPrimitives.php
+++ b/lib/XMLCommonPrimitives.php
@ -0,0 +1,121 @@
+<?php
+/** @license MIT
+ * Copyright 2018 J. King et al.
+ * See LICENSE and AUTHORS files for details */
+
+declare(strict_types=1);
+namespace JKingWeb\Lax;
+
+trait XMLCommonPrimitives {
+
+    /** Primitive to fetch an Atom feed/entry title
+     * 
+     * This fetches the title in plain text rather than HTML, even if HTML is provided in the feed/entry
+     */
+    protected function getTitleAtom() {
+        return $this->fetchTextAtom("./atom:title");
+    }
+
+    /** Primitive to fetch an RSS feed/entry title */
+    protected function getTitleRss2() {
+        return $this->fetchText("./title");
+    }
+
+    /** Primitive to fetch an RDF feed/entry title */
+    protected function getTitleRss1() {
+        return $this->fetchText("./rss1:title|./rss0:title");
+    }
+
+    /** Primitive to fetch a Dublin Core feed/entry title */
+    protected function getTitleDC() {
+        return $this->fetchText("./dc:title");
+    }
+
+    /** Primitive to fetch an Apple podcast/episode title */
+    protected function getTitleApple() {
+        return $this->fetchText("./apple:title");
+    }
+
+    /** Primitive to fetch an Atom feed/entry Web-representation URL
+     *
+     * The href of the first "alternate" link is resolved against that link's base URI; null is returned if there is no such link
+     */
+    protected function getLinkAtom() {
+        $links = $this->fetchAtomRelations();
+        if (!$links->length) {
+            return null;
+        }
+        $link = $links->item(0);
+        return $this->resolveUrl($link->getAttribute("href"), $link->baseURI);
+    }
+
+    /** Primitive to fetch an RSS feed/entry Web-representation URL */
+    protected function getLinkRss2() {
+        return $this->fetchText("./link");
+    }
+
+    /** Primitive to fetch an RDF feed/entry Web-representation URL */
+    protected function getLinkRss1() {
+        return $this->fetchText("./rss1:link|./rss0:link");
+    }
+
+    /** Primitive to fetch Atom feed/entry categories
+     *
+     * If $grouped is true the result is an array of lists keyed by category scheme (an empty
+     * string for categories without one); otherwise it is a flat list. Duplicates are dropped.
+     * If $humanFriendly is true a category's label is preferred over its term.
+     * Returns null if there are no categories.
+     */
+    protected function getCategoriesAtom(bool $grouped = false, bool $humanFriendly = true) {
+        $out = [];
+        foreach ($this->fetchElements("./atom:category[@term]") as $node) {
+            $cat = ($humanFriendly && $node->hasAttribute("label")) ? $node->getAttribute("label") : $node->getAttribute("term");
+            if ($grouped) {
+                $scheme = $node->getAttribute("scheme");
+                if (!isset($out[$scheme])) {
+                    $out[$scheme] = [];
+                }
+                if (!in_array($cat, $out[$scheme], true)) {
+                    $out[$scheme][] = $cat;
+                }
+            } elseif (!in_array($cat, $out, true)) {
+                $out[] = $cat;
+            }
+        }
+        return $out ? $out : null;
+    }
+
+    /** Primitive to fetch RSS feed/entry categories
+     *
+     * If $grouped is true the result is an array of lists keyed by category domain (an empty
+     * string for categories without one); otherwise it is a flat list. Duplicates are dropped.
+     * The $humanFriendly parameter has no effect here.
+     * Returns null if there are no categories.
+     */
+    protected function getCategoriesRss2(bool $grouped = false, bool $humanFriendly = true) {
+        $out = [];
+        foreach ($this->fetchElements("./category") as $node) {
+            $cat = $this->trimText($node->textContent);
+            if ($grouped) {
+                $domain = $node->getAttribute("domain");
+                if (!isset($out[$domain])) {
+                    $out[$domain] = [];
+                }
+                if (!in_array($cat, $out[$domain], true)) {
+                    $out[$domain][] = $cat;
+                }
+            } elseif (!in_array($cat, $out, true)) {
+                $out[] = $cat;
+            }
+        }
+        return $out ? $out : null;
+    }
+
+    /** Primitive to fetch Dublin Core feed/entry categories
+     * 
+     * Dublin Core doesn't have an obvious category type, so we use 'subject' as a nearest approximation
+     *
+     * Dublin Core subjects have no taxonomy, so a grouped result has a single group keyed by an empty string.
+     * Returns null if there are no subjects.
+    */
+    protected function getCategoriesDC(bool $grouped = false, bool $humanFriendly = true) {
+        $out = $this->fetchTextMulti("./dc:subject");
+        if ($out) {
+            // array_unique keeps the values as strings, where flipping the array would turn numeric categories into integers
+            $out = array_values(array_unique($out));
+            return $grouped ? ['' => $out] : $out;
+        }
+        return null;
+    }
+
+    /** Primitive to fetch Apple podcast categories
+     *
+     * Categories are read from the 'text' attribute of each category element; empty ones are skipped and duplicates dropped.
+     * A grouped result has a single group keyed by an empty string.
+     * Returns null if there are no categories.
+     */
+    protected function getCategoriesApple(bool $grouped = false, bool $humanFriendly = true) {
+        $out = [];
+        foreach ($this->fetchElements("./apple:category") as $node) {
+            $cat = $this->trimText($node->getAttribute("text"));
+            if (strlen($cat)) {
+                $out[] = $cat;
+            }
+        }
+        if (!$out) {
+            // report absence the same way the other category primitives do
+            return null;
+        }
+        $out = array_values(array_unique($out));
+        return $grouped ? ['' => $out] : $out;
+    }
+}
--- a/lib/XMLFeed.php
+++ b/lib/XMLFeed.php
@ -0,0 +1,79 @@
+<?php
+/** @license MIT
+ * Copyright 2018 J. King et al.
+ * See LICENSE and AUTHORS files for details */
+
+declare(strict_types=1);
+namespace JKingWeb\Lax;
+
+class XMLFeed extends XMLCommon {
+    use XMLCommonPrimitives;
+    use XMLFeedPrimitives;
+
+    /** @var string|null The URL of the feed itself, as supplied to the constructor */
+    public $url;
+    /** @var string|null The feed's Web-representation URL */
+    public $link;
+    /** @var string|null The feed's title, or its link if it has no title */
+    public $title;
+    /** @var string|null The feed's description */
+    public $summary;
+    /** Not filled in by parse(); categories are available from getCategories() */
+    public $categories;
+
+    /** Returns a parsed feed
+     *
+     * @param string $data The XML source of the feed
+     * @param string|null $contentType Currently unused
+     * @param string|null $url The URL of the feed, used as the document URI
+     * @throws \Exception if the data is not well-formed XML or is not a recognized feed format
+     */
+    public function __construct(string $data, string $contentType = null, string $url = null) {
+        $this->init($data, $contentType, $url);
+        $this->parse();
+    }
+
+    /** Performs initialization of the instance
+     *
+     * Loads the document, prepares the XPath processor, and selects the subject node:
+     * the channel element for RSS and RDF feeds, or the root element for Atom feeds
+     *
+     * @throws \Exception if the data is not well-formed XML or is not a recognized feed format
+     */
+    protected function init(string $data, string $contentType = null, string $url = null) {
+        if ($data==="") {
+            // an empty string can never be a feed; reject it before it reaches the XML parser
+            throw new \Exception("Feed document is empty");
+        }
+        $this->document = new \DOMDocument();
+        $loaded = $this->document->loadXML($data, \LIBXML_BIGLINES | \LIBXML_COMPACT);
+        if (!$loaded || !$this->document->documentElement) {
+            throw new \Exception("Feed document is not well-formed XML");
+        }
+        $this->document->documentURI = $url;
+        $this->xpath = self::getXPathProcessor($this->document);
+        $this->subject = $this->document->documentElement;
+        $ns = $this->subject->namespaceURI;
+        $name = $this->subject->localName;
+        if (is_null($ns) && $name=="rss") {
+            // fall back to the root element if the channel is missing
+            $this->subject = $this->fetchElement("./channel[1]") ?? $this->subject;
+        } elseif ($ns==self::NS['rdf'] && $name=="RDF") {
+            $this->subject = $this->fetchElement("./rss1:channel|./rss0:channel") ?? $this->subject;
+        } elseif ($ns==self::NS['atom'] && $name=="feed") {
+            // nothing required for Atom
+        } else {
+            throw new \Exception("Document is not an RSS, RDF, or Atom feed");
+        }
+        $this->url = $url;
+    }
+
+    /** Parses the feed to extract sundry metadata
+     *
+     * Fills in the link, title and summary properties; the title falls back to the link
+     */
+    protected function parse() {
+        $this->link = $this->getLink();
+        $this->title = $this->getTitle() ?? $this->link;
+        $this->summary = $this->getSummary();
+    }
+
+    /** General function to fetch the feed title */
+    public function getTitle() {
+        return $this->getTitleAtom() ?? $this->getTitleRss1() ?? $this->getTitleRss2() ?? $this->getTitleDC() ?? $this->getTitleApple();
+    }
+
+    /** General function to fetch the feed's Web-representation URL */
+    public function getLink() {
+        return $this->getLinkAtom() ?? $this->getLinkRss1() ?? $this->getLinkRss2();
+    }
+
+    /** General function to fetch the description of a feed */
+    public function getSummary() {
+        // unlike most other data, Atom is not preferred, because Atom doesn't really have feed summaries
+        // the Apple podcast summary is consulted last, as a final fallback
+        return $this->getSummaryDC() ?? $this->getSummaryRss1() ?? $this->getSummaryRss2() ?? $this->getSummaryAtom() ?? $this->getSummaryApple();
+    }
+
+    /** General function to fetch the categories of a feed 
+     * 
+     * If the $grouped parameter is true, an array of arrays will be returned, keyed by taxonomy/scheme
+     * 
+     * The $humanFriendly parameter only affects Atom categories
+    */
+    public function getCategories(bool $grouped = false, bool $humanFriendly = true) {
+        return $this->getCategoriesAtom($grouped, $humanFriendly) ?? $this->getCategoriesRss2($grouped, $humanFriendly) ?? $this->getCategoriesDC($grouped, $humanFriendly) ?? $this->getCategoriesApple($grouped, $humanFriendly);
+    }
+}
--- a/lib/XMLFeedPrimitives.php
+++ b/lib/XMLFeedPrimitives.php
@ -0,0 +1,40 @@
+<?php
+/** @license MIT
+ * Copyright 2018 J. King et al.
+ * See LICENSE and AUTHORS files for details */
+
+declare(strict_types=1);
+namespace JKingWeb\Lax;
+
+trait XMLFeedPrimitives {
+
+    /** Primitive to fetch an Atom feed summary
+     *
+     * Atom has no 'description' element as the RSS formats do; its 'subtitle' element serves roughly the same purpose and is used instead
+     */
+    protected function getSummaryAtom() {
+        $subtitle = $this->fetchTextAtom("./atom:subtitle");
+        return $subtitle;
+    }
+
+    /** Primitive to fetch an RSS feed summary from the 'description' element */
+    protected function getSummaryRss2() {
+        $description = $this->fetchText("./description");
+        return $description;
+    }
+
+    /** Primitive to fetch an RDF feed summary from the 'description' element of either RDF vocabulary */
+    protected function getSummaryRss1() {
+        $description = $this->fetchText("./rss1:description|./rss0:description");
+        return $description;
+    }
+
+    /** Primitive to fetch a Dublin Core feed summary from the 'description' element */
+    protected function getSummaryDC() {
+        $description = $this->fetchText("./dc:description");
+        return $description;
+    }
+
+    /** Primitive to fetch an Apple podcast summary
+     *
+     * The 'summary' element is preferred; the 'subtitle' element is used only when there is no summary
+     */
+    protected function getSummaryApple() {
+        $summary = $this->fetchText("./apple:summary");
+        if (is_null($summary)) {
+            $summary = $this->fetchText("./apple:subtitle");
+        }
+        return $summary;
+    }
+}