Browse Source

Initial cleanup of XML parser

master
J. King 4 years ago
parent
commit
6e780ec0a8
  1. 24
      lib/Parser/XML/Construct.php
  2. 93
      lib/Parser/XML/Feed.php
  3. 94
      lib/Parser/XML/Primitives/Construct.php
  4. 10
      lib/Parser/XML/Primitives/Feed.php
  5. 1
      tests/cases/JSON/JSONTest.php

24
lib/Parser/XML/Construct.php

@ -6,8 +6,10 @@
declare(strict_types=1);
namespace JKingWeb\Lax\Parser\XML;
use JKingWeb\Lax\Date;
use JKingWeb\Lax\Person\Person;
use JKingWeb\Lax\Person\Collection as PersonCollection;
use JKingWeb\Lax\Text;
trait Construct {
use \JKingWeb\Lax\Parser\Construct;
@ -23,7 +25,7 @@ trait Construct {
protected function fetchElement(string $query, \DOMNode $context = null) {
$node = @$this->xpath->query("(".$query.")[1]", $context ?? $this->subject);
if ($node===false) {
throw new \Exception("Invalid XPath query: $query");
throw new \Exception("Invalid XPath query: $query"); // @codeCoverageIgnore
}
return ($node->length) ? $node->item(0) : null;
}
@ -34,13 +36,13 @@ trait Construct {
}
/** Retrieves the trimmed text content of a DOM element based on an XPath query */
protected function fetchText(string $query, \DOMNode $context = null) {
protected function fetchString(string $query, \DOMNode $context = null): ?string {
$node = $this->fetchElement($query, $context);
return ($node) ? $this->trimText($node->textContent) : null;
}
/** Retrieves the trimmed text content of multiple DOM elements based on an XPath query */
protected function fetchTextMulti(string $query, \DOMNode $context = null) {
protected function fetchStringMulti(string $query, \DOMNode $context = null) {
$out = [];
$nodes = $this->xpath->query($query, $context ?? $this->subject);
foreach ($nodes as $node) {
@ -50,7 +52,7 @@ trait Construct {
}
/** Retrieves the trimmed plain-text or HTML content of an Atom text construct based on an XPath query */
protected function fetchTextAtom(string $query, bool $html = false) {
protected function fetchStringAtom(string $query, bool $html = false): ?Text {
$node = $this->fetchElement($query);
if ($node) {
if (!$node->hasAttribute("type") || $node->getAttribute("type")=="text") {
@ -69,8 +71,8 @@ trait Construct {
}
/** Retrieves and parses a date from the content of a DOM element based on an XPath query */
protected function fetchDate(string $query, \DOMNode $context = null) {
return $this->parseDate($this->fetchText($query, $context) ?? "");
protected function fetchDate(string $query, \DOMNode $context = null): ?Date {
return $this->parseDate($this->fetchString($query, $context) ?? "");
}
/** Returns a node-list of Atom link elements with the desired relation or equivalents.
@ -104,8 +106,8 @@ trait Construct {
* - user@example.com (Full Name)
* - Full Name <user@example.com>
*/
protected function fetchPeople(string $query, string $role) {
$people = $this->fetchTextMulti($query) ?? [];
protected function fetchPeople(string $query, string $role): ?PersonCollection {
$people = $this->fetchStringMulti($query) ?? [];
$out = new PersonCollection;
foreach ($people as $person) {
if (!strlen($person)) {
@ -139,13 +141,13 @@ trait Construct {
}
/** Finds and parses Atom person-constructs, and returns a collection of Person objects */
protected function fetchPeopleAtom(string $query, string $role) {
protected function fetchPeopleAtom(string $query, string $role): ?PersonCollection {
$nodes = $this->fetchElements($query);
$out = new PersonCollection;
foreach ($nodes as $node) {
$p = new Person;
$p->mail = $this->fetchText("atom:email", $node) ?? "";
$p->name = $this->fetchText("atom:name", $node) ?? $p->mail;
$p->mail = $this->fetchString("atom:email", $node) ?? "";
$p->name = $this->fetchString("atom:name", $node) ?? $p->mail;
$p->url = $this->fetchUrl("atom:uri", $node);
$p->role = $role;
if (strlen($p->name)) {

93
lib/Parser/XML/Feed.php

@ -18,101 +18,108 @@ class Feed implements \JKingWeb\Lax\Parser\Feed {
use Primitives\Construct;
use Primitives\Feed;
/** @var string */
protected $data;
/** @var string */
protected $contentType;
/** @var \JKingWeb\Lax\Url */
protected $url;
/** @var \DOMDocument */
protected $document;
/** @var \DOMElement */
protected $subject;
/** @var \DOMXpath */
protected $xpath;
/** Constructs a parsed feed */
public function __construct(string $data, string $contentType = "", string $url = "") {
$this->init($data, $contentType, $url);
$this->data = $data;
$this->contentType = $contentType;
if (strlen($url ?? "")) {
$this->url = new Url($url);
}
}
/** Performs initialization of the instance */
protected function init(string $data, string $contentType = "", string $url = "") {
$this->reqUrl = $url;
protected function init(FeedStruct $feed): FeedStruct {
$this->document = new \DOMDocument();
$this->document->loadXML($data, \LIBXML_BIGLINES | \LIBXML_COMPACT);
$this->document->documentURI = $url;
$this->document->loadXML($this->data, \LIBXML_BIGLINES | \LIBXML_COMPACT);
$this->document->documentURI = (string) $this->url;
$this->xpath = new XPath($this->document);
$this->subject = $this->document->documentElement;
$ns = $this->subject->namespaceURI;
$name = $this->subject->localName;
if (is_null($ns) && $name=="rss") {
$this->subject = $this->fetchElement("channel") ?? $this->subject;
$this->type = "rss";
$this->version = $this->document->documentElement->getAttribute("version");
$feed->format = "rss";
$feed->version = $this->document->documentElement->getAttribute("version");
} elseif ($ns==XPath::NS['rdf'] && $name=="RDF") {
$this->type = "rdf";
$feed->format = "rdf";
$channel = $this->fetchElement("rss1:channel|rss0:channel");
if ($channel) {
$this->subject = $channel;
$this->version = ($channel->namespaceURI==XPath::NS['rss1']) ? "1.0" : "0.90";
$feed->version = ($channel->namespaceURI==XPath::NS['rss1']) ? "1.0" : "0.90";
} else {
$element = $this->fetchElement("rss1:item|rss0:item|rss1:image|rss0:image");
if ($element) {
$this->version = ($element->namespaceURI==XPath::NS['rss1']) ? "1.0" : "0.90";
$feed->version = ($element->namespaceURI==XPath::NS['rss1']) ? "1.0" : "0.90";
}
}
} elseif ($ns==XPath::NS['atom'] && $name=="feed") {
$this->type = "atom";
$this->version = "1.0";
$feed->format = "atom";
$feed->version = "1.0";
} else {
throw new \Exception;
}
$this->url = $url;
$feed->meta->url = $this->url;
return $feed;
}
/** Parses the feed to extract sundry metadata */
/** Parses the feed to extract data */
public function parse(FeedStruct $feed = null): FeedStruct {
$feed = $feed ?? new FeedStruct;
$feed = $this->init($feed ?? new FeedStruct);
$feed->meta->url = $this->url;
//$feed->sched->expired = $this->getExpired();
$feed->id = $this->getId();
$feed->url = $this->getUrl();
$feed->link = $this->getLink();
$feed->title = $this->getTitle();
$feed->summary = $this->getSummary();
$feed->people = $this->getPeople();
$feed->author = $this->people->primary();
$feed->dateModified = $this->getDateModified();
$feed->entries = $this->getEntries($feed);
// do a second pass on missing data we'd rather fill in
$feed->link = strlen($this->link) ? $this->link : $this->url;
$feed->title = strlen($this->title) ? $this->title : $this->link;
// do extra stuff just to test it
$feed->categories = $this->getCategories();
//$feed->lang = $this->getLang();
//$feed->url = $this->getUrl();
//$feed->link = $this->getLink();
//$feed->title = $this->getTitle();
//$feed->summary = $this->getSummary();
//$feed->dateModified = $this->getDateModified();
//$feed->icon = $this->getIcon();
//$feed->image = $this->getImage();
//$feed->people = $this->getPeople();
//$feed->categories = $this->getCategories();
//$feed->entries = $this->getEntries($feed);
return $feed;
}
public function getId(): ?string {
return $this->getIdAtom() ?? $this->getIdDC() ?? $this->getIdRss2() ?? "";
}
/** General function to fetch the canonical feed URL
*
* If the feed does not include a canonical URL, the request URL is returned instead
*/
public function getUrl(): ?Url {
return $this->getUrlAtom() ?? $this->getUrlRss1() ?? $this->getUrlPod() ?? $this->reqUrl;
}
/** General function to fetch the feed title */
public function getTitle(): ?Text {
return $this->getTitleAtom() ?? $this->getTitleRss1() ?? $this->getTitleRss2() ?? $this->getTitleDC() ?? $this->getTitlePod() ?? "";
}
/** General function to fetch the feed's Web-representation URL */
public function getLink(): ?Url {
return $this->getLinkAtom() ?? $this->getLinkRss1() ?? $this->getLinkRss2() ?? "";
}
/** General function to fetch the description of a feed */
public function getSummary(): ?Text {
// unlike most other data, Atom is not preferred, because Atom doesn't really have feed summaries
return $this->getSummaryDC() ?? $this->getSummaryRss1() ?? $this->getSummaryRss2() ?? $this->getSummaryPod() ?? $this->getSummaryAtom() ?? "";
}
/** General function to fetch the categories of a feed */
public function getCategories(): CategoryCollection {
return $this->getCategoriesAtom() ?? $this->getCategoriesRss2() ?? $this->getCategoriesDC() ?? $this->getCategoriesPod() ?? new CategoryCollection;
}
/** General function to fetch the feed identifier */
public function getId(): ?string {
return $this->getIdAtom() ?? $this->getIdDC() ?? $this->getIdRss2() ?? "";
}
/** General function to fetch a collection of all people associated with a feed */
public function getPeople(): PersonCollection {
$authors = $this->getAuthorsAtom() ?? $this->getAuthorsDC() ?? $this->getAuthorsPod() ?? $this->getAuthorsRss2() ?? new PersonCollection;
$contributors = $this->getContributorsAtom() ?? $this->getContributorsDC() ?? new PersonCollection;
@ -121,12 +128,10 @@ class Feed implements \JKingWeb\Lax\Parser\Feed {
return $authors->merge($contributors, $editors, $webmasters);
}
/** General function to fetch the modification date of a feed */
public function getDateModified(): ?Date {
return $this->getDateModifiedAtom() ?? $this->getDateModifiedDC() ?? $this->getDateModifiedRss2();
}
/** General function to fetch the entries of a feed */
public function getEntries(FeedStruct $feed = null): array {
return $this->getEntriesAtom() ?? $this->getEntriesRss1() ?? $this->getEntriesRss2() ?? [];
}

94
lib/Parser/XML/Primitives/Construct.php

@ -10,56 +10,58 @@ use JKingWeb\Lax\Person\Person;
use JKingWeb\Lax\Person\Collection as PersonCollection;
use JKingWeb\Lax\Category\Category;
use JKingWeb\Lax\Category\Collection as CategoryCollection;
use JKingWeb\Lax\Date;
use JKingWeb\Lax\Parser\XML\Entry as FeedEntry;
use JKingWeb\Lax\Text;
use JKingWeb\Lax\Url;
trait Construct {
/** Primitive to fetch an Atom feed/entry title
*
* This fetches the title in plain text rather than HTML, even if HTML is provided in the feed/entry
*/
protected function getTitleAtom() {
return $this->fetchTextAtom("atom:title");
protected function getTitleAtom(): ?Text {
// FIXME: fetch rich text
return $this->fetchStringAtom("atom:title");
}
/** Primitive to fetch an RSS feed/entry title */
protected function getTitleRss2() {
return $this->fetchText("title");
protected function getTitleRss2(): ?Text {
return $this->fetchString("title");
}
/** Primitive to fetch an RDF feed/entry title */
protected function getTitleRss1() {
return $this->fetchText("rss1:title|rss0:title");
protected function getTitleRss1(): ?Text {
return $this->fetchString("rss1:title|rss0:title");
}
/** Primitive to fetch a Dublin Core feed/entry title */
protected function getTitleDC() {
return $this->fetchText("dc:title");
protected function getTitleDC(): ?Text {
return $this->fetchString("dc:title");
}
/** Primitive to fetch an Apple podcast/episode title */
protected function getTitlePod() {
return $this->fetchText("apple:title");
protected function getTitlePod(): ?Text {
return $this->fetchString("apple:title");
}
/** Primitive to fetch an Atom feed/entry Web-representation URL */
protected function getLinkAtom() {
protected function getLinkAtom(): ?Url {
// FIXME: Atom link fetching should ideally prefer links to text/html resources or the like over e.g. other-format newsfeeds, generic XML, images, etc
$node = $this->fetchAtomRelations();
return $node->length ? $this->resolveNodeUrl($node->item(0), "href") : null;
}
/** Primitive to fetch an RSS feed/entry Web-representation URL */
protected function getLinkRss2() {
protected function getLinkRss2(): ?Url {
    // Prefer an explicit <link>; otherwise fall back to a <guid> which is not flagged as a non-permalink.
    // The RSS 2.0 attribute is spelled "isPermaLink" and XPath name tests are case-sensitive: with the
    // former "isPermalink" spelling the predicate was always true, so a guid marked isPermaLink="false"
    // was still selected and handed to fetchUrl().
    return $this->fetchUrl("link") ?? $this->fetchUrl("guid[not(@isPermaLink='false')]");
}
/** Primitive to fetch an RDF feed/entry Web-representation URL */
protected function getLinkRss1() {
protected function getLinkRss1(): ?Url {
return $this->fetchUrl("rss1:link|rss0:link");
}
/** Primitive to fetch Atom feed/entry categories */
protected function getCategoriesAtom() {
protected function getCategoriesAtom(): ?CategoryCollection {
$out = new CategoryCollection;
foreach ($this->fetchElements("atom:category[@term]") ?? [] as $node) {
$c = new Category;
@ -74,7 +76,7 @@ trait Construct {
}
/** Primitive to fetch RSS feed/entry categories */
protected function getCategoriesRss2() {
protected function getCategoriesRss2(): ?CategoryCollection {
$out = new CategoryCollection;
foreach ($this->fetchElements("category") ?? [] as $node) {
$c = new Category;
@ -91,9 +93,9 @@ trait Construct {
*
* Dublin Core doesn't have an obvious category type, so we use 'subject' as a nearest approximation
*/
protected function getCategoriesDC() {
protected function getCategoriesDC(): ?CategoryCollection {
$out = new CategoryCollection;
foreach ($this->fetchTextMulti("dc:subject") ?? [] as $text) {
foreach ($this->fetchStringMulti("dc:subject") ?? [] as $text) {
if (strlen($text)) {
$c = new Category;
$c->name = $text;
@ -104,7 +106,7 @@ trait Construct {
}
/** Primitive to fetch podcast/episode categories */
protected function getCategoriesPod() {
protected function getCategoriesPod(): ?CategoryCollection {
$out = new CategoryCollection;
foreach ($this->fetchElements("apple:category|gplay:category") ?? [] as $node) {
$c = new Category;
@ -117,50 +119,50 @@ trait Construct {
}
/** Primitive to fetch an Atom feed/entry identifier */
protected function getIdAtom() {
return $this->fetchText("atom:id");
protected function getIdAtom(): ?string {
return $this->fetchString("atom:id");
}
/** Primitive to fetch an RSS feed/entry identifier
*
* Using RSS' <guid> for feed identifiers is non-standard, but harmless
*/
protected function getIdRss2() {
return $this->fetchText("guid");
protected function getIdRss2(): ?string {
return $this->fetchString("guid");
}
/** Primitive to fetch a Dublin Core feed/entry identifier */
protected function getIdDC() {
return $this->fetchText("dc:identifier");
protected function getIdDC(): ?string {
return $this->fetchString("dc:identifier");
}
/** Primitive to fetch a collection of authors associated with a feed/entry via Dublin Core */
protected function getAuthorsDC() {
protected function getAuthorsDC(): ?PersonCollection {
return $this->fetchPeople("dc:creator", "author");
}
/** Primitive to fetch a collection of contributors associated with a feed/entry via Dublin Core */
protected function getContributorsDC() {
protected function getContributorsDC(): ?PersonCollection {
    // The Dublin Core element is "contributor"; the former "dc:ccontributor" query was a typo which
    // does not name any Dublin Core element, so the query did not select dc:contributor elements.
    return $this->fetchPeople("dc:contributor", "contributor");
}
/** Primitive to fetch a collection of authors associated with an RSS feed/entry */
protected function getAuthorsRss2() {
protected function getAuthorsRss2(): ?PersonCollection {
return $this->fetchPeople("author", "author");
}
/** Primitive to fetch a collection of editors associated with an RSS feed/entry */
protected function getEditorsRss2() {
protected function getEditorsRss2(): ?PersonCollection {
return $this->fetchPeople("managingEditor", "editor");
}
/** Primitive to fetch a collection of webmasters associated with an RSS feed/entry */
protected function getWebmastersRss2() {
protected function getWebmastersRss2(): ?PersonCollection {
return $this->fetchPeople("webMaster", "webMaster");
}
/** Primitive to fetch a collection of contributors associated with an Atom feed */
protected function getContributorsAtom() {
protected function getContributorsAtom(): ?PersonCollection {
return $this->fetchPeopleAtom("atom:contributor", "contributor");
}
@ -168,11 +170,11 @@ trait Construct {
*
* The collection only ever contains the first author found: podcasts implicitly have only one author
*/
protected function getAuthorsPod() {
protected function getAuthorsPod(): ?PersonCollection {
$out = new PersonCollection;
$p = new Person;
$p->name = $this->fetchText("gplay:author|apple:author") ?? "";
$p->mail = $this->fetchText("gplay:email|apple:email") ?? "";
$p->name = $this->fetchString("gplay:author|apple:author") ?? "";
$p->mail = $this->fetchString("gplay:email|apple:email") ?? "";
$p->role = "author";
if (strlen($p->name)) {
$out[] = $p;
@ -184,13 +186,13 @@ trait Construct {
*
* The collection only ever contains the first webmaster found: podcasts implicitly have only one webmaster
*/
protected function getWebmastersPod() {
protected function getWebmastersPod(): ?PersonCollection {
$out = new PersonCollection;
$node = $this->fetchElement("gplay:owner|apple:owner");
if ($node) {
$p = new Person;
$p->name = $this->fetchText("gplay:author|apple:author", $node) ?? "";
$p->mail = $this->fetchText("gplay:email|apple:email", $node) ?? "";
$p->name = $this->fetchString("gplay:author|apple:author", $node) ?? "";
$p->mail = $this->fetchString("gplay:email|apple:email", $node) ?? "";
$p->role = "webmaster";
if (strlen($p->name)) {
$out[] = $p;
@ -200,28 +202,28 @@ trait Construct {
}
/** Primitive to fetch an Atom feed or entry's canonical URL */
protected function getUrlAtom() {
protected function getUrlAtom(): ?Url {
$node = $this->fetchAtomRelations("self");
return $node->length ? $this->resolveNodeUrl($node->item(0), "href") : null;
}
/** Primitive to fetch the modification date of an Atom feed/entry */
protected function getDateModifiedAtom() {
protected function getDateModifiedAtom(): ?Date {
return $this->fetchDate("atom:updated");
}
/** Primitive to fetch the modification date of a Dublin Core feed/entry */
protected function getDateModifiedDC() {
protected function getDateModifiedDC(): ?Date {
return $this->fetchDate("dc:date");
}
/** Primitive to fetch the creation date of an Atom entry */
protected function getDateCreatedAtom() {
protected function getDateCreatedAtom(): ?Date {
return $this->fetchDate("atom:published");
}
/** Primitive to fetch the list of entries in an Atom feed */
protected function getEntriesAtom() {
protected function getEntriesAtom(): ?array {
$out = [];
foreach ($this->fetchElements("atom:entry") ?? [] as $node) {
$out[] = new FeedEntry($node, $this, $this->xpath);
@ -230,7 +232,7 @@ trait Construct {
}
/** Primitive to fetch the list of entries in an RDF feed */
protected function getEntriesRss1() {
protected function getEntriesRss1(): ?array {
$out = [];
foreach ($this->fetchElements("rss1:item", $this->subject->ownerDocument->documentElement) ?? $this->fetchElements("rss1:item") ?? $this->fetchElements("rss0:item", $this->subject->ownerDocument->documentElement) ?? $this->fetchElements("rss0:item") ?? [] as $node) {
$out[] = new FeedEntry($node, $this, $this->xpath);
@ -239,7 +241,7 @@ trait Construct {
}
/** Primitive to fetch the list of entries in an RSS feed */
protected function getEntriesRss2() {
protected function getEntriesRss2(): ?array {
$out = [];
foreach ($this->fetchElements("item") ?? [] as $node) {
$out[] = new FeedEntry($node, $this, $this->xpath);
@ -248,7 +250,7 @@ trait Construct {
}
/** Primitive to fetch the URL of an article related to the entry */
protected function getRelatedLinkAtom() {
protected function getRelatedLinkAtom(): ?Url {
// FIXME: Atom link fetching should ideally prefer links to text/html resources or the like over e.g. other-format newsfeeds, generic XML, images, etc
$node = $this->fetchAtomRelations("related");
return $node->length ? $this->resolveNodeUrl($node->item(0), "href") : null;

10
lib/Parser/XML/Primitives/Feed.php

@ -15,27 +15,27 @@ trait Feed {
* Atom does not have a 'description' element like the RSSes, but it does have 'subtitle', which fills roughly the same function
*/
protected function getSummaryAtom() {
return $this->fetchTextAtom("atom:subtitle");
return $this->fetchStringAtom("atom:subtitle");
}
/** Primitive to fetch an RSS feed summary */
protected function getSummaryRss2() {
return $this->fetchText("description");
return $this->fetchString("description");
}
/** Primitive to fetch an RDF feed summary */
protected function getSummaryRss1() {
return $this->fetchText("rss1:description|rss0:description");
return $this->fetchString("rss1:description|rss0:description");
}
/** Primitive to fetch a Dublin Core feed summary */
protected function getSummaryDC() {
return $this->fetchText("dc:description");
return $this->fetchString("dc:description");
}
/** Primitive to fetch a podcast summary */
protected function getSummaryPod() {
return $this->fetchText("apple:summary|gplay:description") ?? $this->fetchText("apple:subtitle");
return $this->fetchString("apple:summary|gplay:description") ?? $this->fetchString("apple:subtitle");
}
/** Primitive to fetch a collection of authors associated with an Atom feed */

1
tests/cases/JSON/JSONTest.php

@ -51,6 +51,7 @@ use JKingWeb\Lax\Category\Collection as CategoryCollection;
use JKingWeb\Lax\Enclosure\Collection as EnclosureCollection;
/**
* @covers JKingWeb\Lax\Parser\Construct<extended>
* @covers JKingWeb\Lax\Parser\JSON\Feed<extended>
* @covers JKingWeb\Lax\Parser\JSON\Entry<extended>
*/

Loading…
Cancel
Save