Browse Source

Atom person test

master
J. King 4 years ago
parent
commit
32cf3bcf87
  1. 7
      lib/Parser/XML/Construct.php
  2. 10
      lib/Parser/XML/Feed.php
  3. 339
      lib/Parser/XML/OldConstruct.php
  4. 44
      tests/cases/XML/feed-atom.yaml

7
lib/Parser/XML/Construct.php

@ -327,11 +327,12 @@ abstract class Construct {
$out = new PersonCollection;
foreach ($nodes as $node) {
$p = new Person;
$p->mail = $this->fetchString("atom:email", $node);;
$p->name = $this->fetchString("atom:name", $node) ?? $p->mail;
$mail = $this->fetchString("atom:email", null, null, $node) ?? "";
$p->mail = $this->validateMail($mail) ? $mail : null;
$p->name = $this->fetchString("atom:name", ".+", null, $node);
$p->url = $this->fetchUrl("atom:uri", $node);
$p->role = $role;
if (strlen($p->name ?? "")) {
if (!is_null($p->name)) {
$out[] = $p;
}
}

10
lib/Parser/XML/Feed.php

@ -92,9 +92,9 @@ class Feed extends Construct implements \MensBeam\Lax\Parser\Feed {
$feed->dateModified = $this->getDateModified();
$feed->icon = $this->getIcon();
$feed->image = $this->getImage();
//$feed->people = $this->getPeople();
$feed->people = $this->getPeople();
$feed->categories = $this->getCategories();
//$feed->entries = $this->getEntries($feed);
$feed->entries = $this->getEntries($feed);
return $feed;
}
@ -211,14 +211,14 @@ class Feed extends Construct implements \MensBeam\Lax\Parser\Feed {
?? new PersonCollection;
$webmasters =
$this->fetchPeople("rss2:webMaster", "webmaster") // RSS 2.0 authors
?? $this->fetchPodPerson("gplay", "webmaster") // Google Play author
?? $this->fetchPodPerson("apple", "webmaster") // iTunes author
?? $this->fetchPodPerson("gplay", "webmaster") // Google Play webmaster
?? $this->fetchPodPerson("apple", "webmaster") // iTunes webmaster
?? new PersonCollection;
return $authors->merge($contributors, $editors, $webmasters);
}
public function getEntries(FeedStruct $feed = null): array {
return $this->getEntriesAtom() ?? $this->getEntriesRss1() ?? $this->getEntriesRss2() ?? [];
return [];
}
/** Fetches the "complete" flag from an iTunes podcast */

339
lib/Parser/XML/OldConstruct.php

@ -1,339 +0,0 @@
<?php
/** @license MIT
* Copyright 2018 J. King et al.
* See LICENSE and AUTHORS files for details */
declare(strict_types=1);
namespace MensBeam\Lax\Parser\XML\Primitives;
use MensBeam\Lax\Person\Person;
use MensBeam\Lax\Person\Collection as PersonCollection;
use MensBeam\Lax\Category\Category;
use MensBeam\Lax\Category\Collection as CategoryCollection;
use MensBeam\Lax\Date;
use MensBeam\Lax\Parser\XML\Entry as FeedEntry;
use MensBeam\Lax\Text;
use MensBeam\Lax\Url;
/** XPath-based primitives for extracting feed and entry data from Atom, RSS, RDF, Dublin Core and podcast markup
 *
 * The trait relies on members which it does not define itself and which the
 * consuming class must supply: the "xpath" and "subject" properties, and the
 * fetchString(), fetchStringAtom(), fetchElement(), fetchDate(), trimText(),
 * resolveUrl(), resolveNodeUrl() and validateMail() methods.
 */
trait Construct {
    /** Retrieves multiple element nodes based on an XPath query
     *
     * @param string $query The XPath query to evaluate
     * @param \DOMNode|null $context The node to evaluate the query against; the current subject is used when omitted
     */
    protected function fetchElements(string $query, ?\DOMNode $context = null): \DOMNodeList {
        return $this->xpath->query($query, $context ?? $this->subject);
    }

    /** Finds the first non-empty URL among the nodes matched by an XPath query and resolves it against the node's base URI
     *
     * @param string $query The XPath query selecting candidate elements
     * @param \DOMElement|null $context The node to evaluate the query against; the current subject is used when omitted
     * @param string $attr The local name of the attribute holding the URL; when empty the element's text content is used instead
     * @param string|null $ns The namespace URI of the attribute, if any
     * @return mixed The result of resolveUrl() for the first non-empty candidate, or null if there is none
     */
    protected function fetchUrl(string $query, ?\DOMElement $context = null, string $attr = "", ?string $ns = null) {
        foreach ($this->fetchElements($query, $context) as $node) {
            // the attribute is only consulted when one was asked for; previously its value was always overwritten by the text content
            $raw = strlen($attr) ? $node->getAttributeNS($ns, $attr) : $node->textContent;
            $url = $this->trimText($raw);
            if (strlen($url)) {
                return $this->resolveUrl($url, $node->baseURI);
            }
        }
        return null;
    }

    /** Returns a node-list of Atom link elements with the desired relation or equivalents.
     *
     * Links without an href attribute are excluded.
     *
     * @see https://tools.ietf.org/html/rfc4287#section-4.2.7.2
     *
     * @param string $rel The link relation to look for, either as a bare name or as a full IRI; an empty string is treated as "alternate"
     */
    protected function fetchAtomRelations(string $rel = ""): \DOMNodeList {
        // FIXME: The XPath evaluation will fail if the relation contains an apostrophe. This is a known and difficult-to-overcome limitation of XPath 1.0 which I consider not worth the effort to address at this time
        $iana = "http://www.iana.org/assignments/relation/";
        if ($rel === "" || $rel === "alternate" || $rel === $iana."alternate") {
            // a missing or empty relation is equivalent to "alternate"
            $cond = "not(@rel) or @rel='' or @rel='alternate' or @rel='http://www.iana.org/assignments/relation/alternate'";
        } elseif (strpos($rel, ":") === false) {
            // FIXME: Checking only for a colon in a link relation is a hack that does not strictly follow IRI rules, but it's adequate for our needs
            $cond = "@rel='$rel' or @rel='{$iana}{$rel}'";
        } elseif (strlen($rel) > strlen($iana) && strpos($rel, $iana) === 0) {
            // a fully-qualified registered relation also matches its bare name
            $rel = substr($rel, strlen($iana));
            $cond = "@rel='$rel' or @rel='{$iana}{$rel}'";
        } else {
            $cond = "@rel='$rel'";
        }
        return $this->xpath->query("atom:link[@href][$cond]", $this->subject);
    }

    /** Finds and parses RSS person-texts and returns a collection of person objects
     *
     * Each can have a name, e-mail address, or both
     *
     * The following forms will yield both a name and address:
     *
     * - user@example.com (Full Name)
     * - Full Name <user@example.com>
     *
     * When the address part of either form does not pass validateMail(), or
     * the text matches neither form, the whole text is used as the name.
     *
     * @param string $query The XPath query selecting the person-text elements
     * @param string $role The role to assign to every person found
     * @return PersonCollection|null The people found, or null if there are none
     */
    protected function fetchPeople(string $query, string $role): ?PersonCollection {
        $people = $this->fetchString($query, null, true) ?? [];
        $out = new PersonCollection;
        foreach ($people as $person) {
            if (!strlen($person)) {
                continue;
            }
            $p = new Person;
            // the complete text is the name unless one of the two-part forms below matches with a valid address
            $p->name = $person;
            if (preg_match("/^([^@\s]+@\S+) \((.+?)\)$/", $person, $match)) { // tests "user@example.com (Full Name)" form
                if ($this->validateMail($match[1])) {
                    $p->name = trim($match[2]);
                    $p->mail = $match[1];
                }
            } elseif (preg_match("/^((?:\S|\s(?!<))+) <([^>]+)>$/", $person, $match)) { // tests "Full Name <user@example.com>" form
                if ($this->validateMail($match[2])) {
                    $p->name = trim($match[1]);
                    $p->mail = $match[2];
                }
            } elseif ($this->validateMail($person)) {
                // a bare address serves as both name and address
                $p->mail = $person;
            }
            $p->role = $role;
            $out[] = $p;
        }
        return count($out) ? $out : null;
    }

    /** Finds and parses Atom person-constructs, and returns a collection of Person objects
     *
     * The e-mail address stands in for a missing name; persons with neither are skipped.
     *
     * @param string $query The XPath query selecting the person-construct elements
     * @param string $role The role to assign to every person found
     * @return PersonCollection|null The people found, or null if there are none
     */
    protected function fetchPeopleAtom(string $query, string $role): ?PersonCollection {
        $nodes = $this->fetchElements($query);
        $out = new PersonCollection;
        foreach ($nodes as $node) {
            $p = new Person;
            $p->mail = $this->fetchString("atom:email", $node) ?? "";
            $p->name = $this->fetchString("atom:name", $node) ?? $p->mail;
            $p->url = $this->fetchUrl("atom:uri", $node);
            $p->role = $role;
            if (strlen($p->name)) {
                $out[] = $p;
            }
        }
        return count($out) ? $out : null;
    }

    /** Primitive to fetch an Atom feed/entry title */
    protected function getTitleAtom(): ?Text {
        // FIXME: fetch rich text
        return $this->fetchStringAtom("atom:title");
    }

    /** Primitive to fetch an RSS feed/entry title */
    protected function getTitleRss2(): ?Text {
        return $this->fetchString("title");
    }

    /** Primitive to fetch an RDF feed/entry title */
    protected function getTitleRss1(): ?Text {
        return $this->fetchString("rss1:title|rss0:title");
    }

    /** Primitive to fetch a Dublin Core feed/entry title */
    protected function getTitleDC(): ?Text {
        return $this->fetchString("dc:title");
    }

    /** Primitive to fetch an Apple podcast/episode title */
    protected function getTitlePod(): ?Text {
        return $this->fetchString("apple:title");
    }

    /** Primitive to fetch an Atom feed/entry Web-representation URL */
    protected function getLinkAtom(): ?Url {
        // FIXME: Atom link fetching should ideally prefer links to text/html resources or the like over e.g. other-format newsfeeds, generic XML, images, etc
        $node = $this->fetchAtomRelations();
        return $node->length ? $this->resolveNodeUrl($node->item(0), "href") : null;
    }

    /** Primitive to fetch an RSS feed/entry Web-representation URL
     *
     * Falls back to the guid element unless it is explicitly marked as not being a permalink.
     */
    protected function getLinkRss2(): ?Url {
        // the attribute is spelled "isPermaLink"; XPath name tests are case-sensitive
        return $this->fetchUrl("link") ?? $this->fetchUrl("guid[not(@isPermaLink='false')]");
    }

    /** Primitive to fetch an RDF feed/entry Web-representation URL */
    protected function getLinkRss1(): ?Url {
        return $this->fetchUrl("rss1:link|rss0:link");
    }

    /** Primitive to fetch Atom feed/entry categories
     *
     * Categories whose term is empty after trimming are skipped.
     */
    protected function getCategoriesAtom(): ?CategoryCollection {
        $out = new CategoryCollection;
        foreach ($this->fetchElements("atom:category[@term]") as $node) {
            $c = new Category;
            $c->domain = $this->trimText($node->getAttribute("scheme"));
            $c->label = $this->trimText($node->getAttribute("label"));
            $c->name = $this->trimText($node->getAttribute("term"));
            if (strlen($c->name)) {
                $out[] = $c;
            }
        }
        return count($out) ? $out : null;
    }

    /** Primitive to fetch RSS feed/entry categories
     *
     * Categories whose text is empty after trimming are skipped.
     */
    protected function getCategoriesRss2(): ?CategoryCollection {
        $out = new CategoryCollection;
        foreach ($this->fetchElements("category") as $node) {
            $c = new Category;
            $c->domain = $this->trimText($node->getAttribute("domain"));
            $c->name = $this->trimText($node->textContent);
            if (strlen($c->name)) {
                $out[] = $c;
            }
        }
        return count($out) ? $out : null;
    }

    /** Primitive to fetch Dublin Core feed/entry categories
     *
     * Dublin Core doesn't have an obvious category type, so we use 'subject' as a nearest approximation
     */
    protected function getCategoriesDC(): ?CategoryCollection {
        $out = new CategoryCollection;
        foreach ($this->fetchString("dc:subject", null, true) ?? [] as $text) {
            if (strlen($text)) {
                $c = new Category;
                $c->name = $text;
                $out[] = $c;
            }
        }
        return count($out) ? $out : null;
    }

    /** Primitive to fetch podcast/episode categories
     *
     * Categories whose "text" attribute is empty after trimming are skipped.
     */
    protected function getCategoriesPod(): ?CategoryCollection {
        $out = new CategoryCollection;
        foreach ($this->fetchElements("apple:category|gplay:category") as $node) {
            $c = new Category;
            $c->name = $this->trimText($node->getAttribute("text"));
            if (strlen($c->name)) {
                $out[] = $c;
            }
        }
        return count($out) ? $out : null;
    }

    /** Primitive to fetch a collection of authors associated with a feed/entry via Dublin Core */
    protected function getAuthorsDC(): ?PersonCollection {
        return $this->fetchPeople("dc:creator", "author");
    }

    /** Primitive to fetch a collection of contributors associated with a feed/entry via Dublin Core */
    protected function getContributorsDC(): ?PersonCollection {
        return $this->fetchPeople("dc:contributor", "contributor");
    }

    /** Primitive to fetch a collection of authors associated with an RSS feed/entry */
    protected function getAuthorsRss2(): ?PersonCollection {
        return $this->fetchPeople("author", "author");
    }

    /** Primitive to fetch a collection of editors associated with an RSS feed/entry */
    protected function getEditorsRss2(): ?PersonCollection {
        return $this->fetchPeople("managingEditor", "editor");
    }

    /** Primitive to fetch a collection of webmasters associated with an RSS feed/entry */
    protected function getWebmastersRss2(): ?PersonCollection {
        // the role is spelled in lower case, matching the role assigned to podcast webmasters
        return $this->fetchPeople("webMaster", "webmaster");
    }

    /** Primitive to fetch a collection of contributors associated with an Atom feed */
    protected function getContributorsAtom(): ?PersonCollection {
        return $this->fetchPeopleAtom("atom:contributor", "contributor");
    }

    /** Primitive to fetch a collection of authors associated with a podcast/episode
     *
     * The collection only ever contains the first author found: podcasts implicitly have only one author
     */
    protected function getAuthorsPod(): ?PersonCollection {
        $out = new PersonCollection;
        $p = new Person;
        $p->name = $this->fetchString("gplay:author|apple:author") ?? "";
        $p->mail = $this->fetchString("gplay:email|apple:email") ?? "";
        $p->role = "author";
        if (strlen($p->name)) {
            $out[] = $p;
        }
        return count($out) ? $out : null;
    }

    /** Primitive to fetch a collection of webmasters associated with a podcast
     *
     * The collection only ever contains the first webmaster found: podcasts implicitly have only one webmaster
     */
    protected function getWebmastersPod(): ?PersonCollection {
        $out = new PersonCollection;
        $node = $this->fetchElement("gplay:owner|apple:owner");
        if ($node) {
            $p = new Person;
            // NOTE(review): this looks for an "author" child inside the owner element; the owner's name may instead be carried by a "name" child -- confirm against the podcast namespace documentation
            $p->name = $this->fetchString("gplay:author|apple:author", $node) ?? "";
            $p->mail = $this->fetchString("gplay:email|apple:email", $node) ?? "";
            $p->role = "webmaster";
            if (strlen($p->name)) {
                $out[] = $p;
            }
        }
        return count($out) ? $out : null;
    }

    /** Primitive to fetch an Atom feed or entry's canonical URL */
    protected function getUrlAtom(): ?Url {
        $node = $this->fetchAtomRelations("self");
        return $node->length ? $this->resolveNodeUrl($node->item(0), "href") : null;
    }

    /** Primitive to fetch the modification date of an Atom feed/entry */
    protected function getDateModifiedAtom(): ?Date {
        return $this->fetchDate("atom:updated");
    }

    /** Primitive to fetch the modification date of a feed/entry via Dublin Core */
    protected function getDateModifiedDC(): ?Date {
        return $this->fetchDate("dc:date");
    }

    /** Primitive to fetch the creation date of an Atom entry */
    protected function getDateCreatedAtom(): ?Date {
        return $this->fetchDate("atom:published");
    }

    /** Primitive to fetch the list of entries in an Atom feed */
    protected function getEntriesAtom(): ?array {
        $out = [];
        foreach ($this->fetchElements("atom:entry") as $node) {
            $out[] = new FeedEntry($node, $this, $this->xpath);
        }
        return count($out) ? $out : null;
    }

    /** Primitive to fetch the list of entries in an RDF feed
     *
     * Items are looked for under the document element and then under the
     * current subject, first in the "rss1" namespace and then in the "rss0"
     * namespace; the first location which yields any items is used.
     */
    protected function getEntriesRss1(): ?array {
        $root = $this->subject->ownerDocument->documentElement;
        // fetchElements() always yields a node-list, so emptiness rather than null decides whether to try the next candidate
        $candidates = [
            ["rss1:item", $root],
            ["rss1:item", null],
            ["rss0:item", $root],
            ["rss0:item", null],
        ];
        $out = [];
        foreach ($candidates as [$query, $context]) {
            $nodes = $this->fetchElements($query, $context);
            if (!$nodes->length) {
                continue;
            }
            foreach ($nodes as $node) {
                $out[] = new FeedEntry($node, $this, $this->xpath);
            }
            break;
        }
        return count($out) ? $out : null;
    }

    /** Primitive to fetch the list of entries in an RSS feed */
    protected function getEntriesRss2(): ?array {
        $out = [];
        foreach ($this->fetchElements("item") as $node) {
            $out[] = new FeedEntry($node, $this, $this->xpath);
        }
        return count($out) ? $out : null;
    }

    /** Primitive to fetch the URL of an article related to the entry */
    protected function getRelatedLinkAtom(): ?Url {
        // FIXME: Atom link fetching should ideally prefer links to text/html resources or the like over e.g. other-format newsfeeds, generic XML, images, etc
        $node = $this->fetchAtomRelations("related");
        return $node->length ? $this->resolveNodeUrl($node->item(0), "href") : null;
    }
}

44
tests/cases/XML/feed-atom.yaml

@ -463,3 +463,47 @@ Categories:
- name: ack
label: 'Ack!'
domain: '4:3' # Not treated as a URI
Feed authors and contributors:
input: >
<feed xmlns="http://www.w3.org/2005/Atom">
<author>
<name>John Doe</name>
<email>john.doe@example.com</email>
<uri>http://example.com/</uri>
</author>
<author>
<name>Jane Doe</name>
<email>jane.doe@example.com</email>
<uri>http://example.net/</uri>
</author>
<contributor>
<name>Larry</name>
<email>@example.com</email>
<uri>http://example.]/</uri>
</contributor>
<contributor>
<name>Curly</name>
</contributor>
<contributor>
<name/>
<email>moe@example.com</email>
<uri>http://example.org/</uri>
</contributor>
</feed>
output:
format: atom
version: '1.0'
people:
- name: 'John Doe'
mail: 'john.doe@example.com'
url: 'http://example.com/'
role: author
- name: 'Jane Doe'
mail: 'jane.doe@example.com'
url: 'http://example.net/'
role: author
- name: Larry
role: contributor
- name: Curly
role: contributor

Loading…
Cancel
Save