From 32cf3bcf8771bd8a687c86817c9bf9c0042cd8a3 Mon Sep 17 00:00:00 2001 From: "J. King" Date: Fri, 20 Mar 2020 15:47:08 -0400 Subject: [PATCH] Atom person test --- lib/Parser/XML/Construct.php | 7 +- lib/Parser/XML/Feed.php | 10 +- lib/Parser/XML/OldConstruct.php | 339 -------------------------------- tests/cases/XML/feed-atom.yaml | 44 +++++ 4 files changed, 53 insertions(+), 347 deletions(-) delete mode 100644 lib/Parser/XML/OldConstruct.php diff --git a/lib/Parser/XML/Construct.php b/lib/Parser/XML/Construct.php index 7224b05..1938878 100644 --- a/lib/Parser/XML/Construct.php +++ b/lib/Parser/XML/Construct.php @@ -327,11 +327,12 @@ abstract class Construct { $out = new PersonCollection; foreach ($nodes as $node) { $p = new Person; - $p->mail = $this->fetchString("atom:email", $node);; - $p->name = $this->fetchString("atom:name", $node) ?? $p->mail; + $mail = $this->fetchString("atom:email", null, null, $node) ?? ""; + $p->mail = $this->validateMail($mail) ? $mail : null; + $p->name = $this->fetchString("atom:name", ".+", null, $node); $p->url = $this->fetchUrl("atom:uri", $node); $p->role = $role; - if (strlen($p->name ?? "")) { + if (!is_null($p->name)) { $out[] = $p; } } diff --git a/lib/Parser/XML/Feed.php b/lib/Parser/XML/Feed.php index 1c6d6b7..421326e 100644 --- a/lib/Parser/XML/Feed.php +++ b/lib/Parser/XML/Feed.php @@ -92,9 +92,9 @@ class Feed extends Construct implements \MensBeam\Lax\Parser\Feed { $feed->dateModified = $this->getDateModified(); $feed->icon = $this->getIcon(); $feed->image = $this->getImage(); - //$feed->people = $this->getPeople(); + $feed->people = $this->getPeople(); $feed->categories = $this->getCategories(); - //$feed->entries = $this->getEntries($feed); + $feed->entries = $this->getEntries($feed); return $feed; } @@ -211,14 +211,14 @@ class Feed extends Construct implements \MensBeam\Lax\Parser\Feed { ?? new PersonCollection; $webmasters = $this->fetchPeople("rss2:webMaster", "webmaster") // RSS 2.0 authors - ?? $this->fetchPodPerson("gplay", "webmaster") // Google Play author - ?? $this->fetchPodPerson("apple", "webmaster") // iTunes author + ?? $this->fetchPodPerson("gplay", "webmaster") // Google Play webmaster + ?? $this->fetchPodPerson("apple", "webmaster") // iTunes webmaster ?? new PersonCollection; return $authors->merge($contributors, $editors, $webmasters); } public function getEntries(FeedStruct $feed = null): array { - return $this->getEntriesAtom() ?? $this->getEntriesRss1() ?? $this->getEntriesRss2() ?? []; + return []; } /** Fetches the "complete" flag from an iTunes podcast */ diff --git a/lib/Parser/XML/OldConstruct.php b/lib/Parser/XML/OldConstruct.php deleted file mode 100644 index c7a1415..0000000 --- a/lib/Parser/XML/OldConstruct.php +++ /dev/null @@ -1,339 +0,0 @@ -xpath->query($query, $context ?? $this->subject); - } - - protected function fetchUrl(string $query, \DOMElement $context = null, string $attr = "", string $ns = null) { - $nodes = $this->fetchElements($query, $context); - foreach ($nodes as $node) { - $url = strlen($attr) ? $node->getAttributeNS($ns, $attr) : $this->trimText($node->textContent); - $url = $this->trimText($node->textContent); - if (strlen($url)) { - return $this->resolveUrl($url, $node->baseURI); - } - } - return null; - } - - /** Returns a node-list of Atom link elements with the desired relation or equivalents. - * - * Links without an href attribute are excluded. - * - * @see https://tools.ietf.org/html/rfc4287#section-4.2.7.2 - */ - protected function fetchAtomRelations(string $rel = ""): \DOMNodeList { - // FIXME: The XPath evaluation will fail if the relation contains an apostrophe. This is a known and difficult-to-overcome limitation of XPath 1.0 which I consider not worth the effort to address at this time - if ($rel == "" || $rel == "alternate" || $rel == "http://www.iana.org/assignments/relation/alternate") { - $cond = "not(@rel) or @rel='' or @rel='alternate' or @rel='http://www.iana.org/assignments/relation/alternate'"; - } elseif (strpos($rel, ":") === false) { - // FIXME: Checking only for a colon in a link relation is a hack that does not strictly follow IRI rules, but it's adequate for our needs - $cond = "@rel='$rel' or @rel='http://www.iana.org/assignments/relation/$rel'"; - } elseif (strlen($rel) > 41 && strpos($rel, "http://www.iana.org/assignments/relation/") === 0) { - $rel = substr($rel, 41); - $cond = "@rel='$rel' or @rel='http://www.iana.org/assignments/relation/$rel'"; - } else { - $cond = "@rel='$rel'"; - } - return $this->xpath->query("atom:link[@href][$cond]", $this->subject); - } - - /** Finds and parses RSS person-texts and returns a collection of person objects - * - * Each can have a name, e-mail address, or both - * - * The following forms will yield both a name and address: - * - * - user@example.com (Full Name) - * - Full Name - */ - protected function fetchPeople(string $query, string $role): ?PersonCollection { - $people = $this->fetchString($query, null, true) ?? []; - $out = new PersonCollection; - foreach ($people as $person) { - if (!strlen($person)) { - continue; - } - $p = new Person; - if (preg_match("/^([^@\s]+@\S+) \((.+?)\)$/", $person, $match)) { // tests "user@example.com (Full Name)" form - if ($this->validateMail($match[1])) { - $p->name = trim($match[2]); - $p->mail = $match[1]; - } else { - $p->name = $person; - } - } elseif (preg_match("/^((?:\S|\s(?!<))+) <([^>]+)>$/", $person, $match)) { // tests "Full Name " form - if ($this->validateMail($match[2])) { - $p->name = trim($match[1]); - $p->mail = $match[2]; - } else { - $p->name = $person; - } - } elseif ($this->validateMail($person)) { - $p->name = $person; - $p->mail = $person; - } else { - $p->name = $person; - } - $p->role = $role; - $out[] = $p; - } - return count($out) ? $out : null; - } - - /** Finds and parses Atom person-constructs, and returns a collection of Person objects */ - protected function fetchPeopleAtom(string $query, string $role): ?PersonCollection { - $nodes = $this->fetchElements($query); - $out = new PersonCollection; - foreach ($nodes as $node) { - $p = new Person; - $p->mail = $this->fetchString("atom:email", $node) ?? ""; - $p->name = $this->fetchString("atom:name", $node) ?? $p->mail; - $p->url = $this->fetchUrl("atom:uri", $node); - $p->role = $role; - if (strlen($p->name)) { - $out[] = $p; - } - } - return count($out) ? $out : null; - } - - /** Primitive to fetch an Atom feed/entry title - */ - protected function getTitleAtom(): ?Text { - // FIXME: fetch rich text - return $this->fetchStringAtom("atom:title"); - } - - /** Primitive to fetch an RSS feed/entry title */ - protected function getTitleRss2(): ?Text { - return $this->fetchString("title"); - } - - /** Primitive to fetch an RDF feed/entry title */ - protected function getTitleRss1(): ?Text { - return $this->fetchString("rss1:title|rss0:title"); - } - - /** Primitive to fetch a Dublin Core feed/entry title */ - protected function getTitleDC(): ?Text { - return $this->fetchString("dc:title"); - } - - /** Primitive to fetch an Apple podcast/episdoe title */ - protected function getTitlePod(): ?Text { - return $this->fetchString("apple:title"); - } - - /** Primitive to fetch an Atom feed/entry Web-representation URL */ - protected function getLinkAtom(): ?Url { - // FIXME: Atom link fetching should ideally prefer links to text/html resources or the like over e.g. other-format newsfeeds, generic XML, images, etc - $node = $this->fetchAtomRelations(); - return $node->length ? $this->resolveNodeUrl($node->item(0), "href") : null; - } - - /** Primitive to fetch an RSS feed/entry Web-representation URL */ - protected function getLinkRss2(): ?Url { - return $this->fetchUrl("link") ?? $this->fetchUrl("guid[not(@isPermalink='false')]"); - } - - /** Primitive to fetch an RDF feed/entry Web-representation URL */ - protected function getLinkRss1(): ?Url { - return $this->fetchUrl("rss1:link|rss0:link"); - } - - /** Primitive to fetch Atom feed/entry categories */ - protected function getCategoriesAtom(): ?CategoryCollection { - $out = new CategoryCollection; - foreach ($this->fetchElements("atom:category[@term]") ?? [] as $node) { - $c = new Category; - $c->domain = $this->trimText($node->getAttribute("scheme")); - $c->label = $this->trimText($node->getAttribute("label")); - $c->name = $this->trimText($node->getAttribute("term")); - if (strlen($c->name)) { - $out[] = $c; - } - } - return count($out) ? $out : null; - } - - /** Primitive to fetch RSS feed/entry categories */ - protected function getCategoriesRss2(): ?CategoryCollection { - $out = new CategoryCollection; - foreach ($this->fetchElements("category") ?? [] as $node) { - $c = new Category; - $c->domain = $this->trimText($node->getAttribute("domain")); - $c->name = $this->trimText($node->textContent); - if (strlen($c->name)) { - $out[] = $c; - } - } - return count($out) ? $out : null; - } - - /** Primitive to fetch Dublin Core feed/entry categories - * - * Dublin Core doesn't have an obvious category type, so we use 'subject' as a nearest approximation - */ - protected function getCategoriesDC(): ?CategoryCollection { - $out = new CategoryCollection; - foreach ($this->fetchString("dc:subject", null, true) ?? [] as $text) { - if (strlen($text)) { - $c = new Category; - $c->name = $text; - $out[] = $c; - } - } - return count($out) ? $out : null; - } - - /** Primitive to fetch podcast/episode categories */ - protected function getCategoriesPod(): ?CategoryCollection { - $out = new CategoryCollection; - foreach ($this->fetchElements("apple:category|gplay:category") ?? [] as $node) { - $c = new Category; - $c->name = $this->trimText($node->getAttribute("text")); - if (strlen($c->name)) { - $out[] = $c; - } - } - return count($out) ? $out : null; - } - - /** Primitive to fetch a collection of authors associated with a feed/entry via Dublin Core */ - protected function getAuthorsDC(): ?PersonCollection { - return $this->fetchPeople("dc:creator", "author"); - } - - /** Primitive to fetch a collection of contributors associated with a feed/entry via Dublin Core */ - protected function getContributorsDC(): ?PersonCollection { - return $this->fetchPeople("dc:ccontributor", "contributor"); - } - - /** Primitive to fetch a collection of authors associated with an RSS feed/entry */ - protected function getAuthorsRss2(): ?PersonCollection { - return $this->fetchPeople("author", "author"); - } - - /** Primitive to fetch a collection of editors associated with an RSS feed/entry */ - protected function getEditorsRss2(): ?PersonCollection { - return $this->fetchPeople("managingEditor", "editor"); - } - - /** Primitive to fetch a collection of authors associated with an RSS feed/entry */ - protected function getWebmastersRss2(): ?PersonCollection { - return $this->fetchPeople("webMaster", "webMaster"); - } - - /** Primitive to fetch a collection of contributors associated with an Atom feed */ - protected function getContributorsAtom(): ?PersonCollection { - return $this->fetchPeopleAtom("atom:contributor", "contributor"); - } - - /** Primitive to fetch a collection of authors associated with a podcast/episode - * - * The collection only ever contains the first author found: podcasts implicitly have only one author - */ - protected function getAuthorsPod(): ?PersonCollection { - $out = new PersonCollection; - $p = new Person; - $p->name = $this->fetchString("gplay:author|apple:author") ?? ""; - $p->mail = $this->fetchString("gplay:email|apple:email") ?? ""; - $p->role = "author"; - if (strlen($p->name)) { - $out[] = $p; - } - return count($out) ? $out : null; - } - - /** Primitive to fetch a collection of webmasters associated with a podcast - * - * The collection only ever contains the first webmaster found: podcasts implicitly have only one webmaster - */ - protected function getWebmastersPod(): ?PersonCollection { - $out = new PersonCollection; - $node = $this->fetchElement("gplay:owner|apple:owner"); - if ($node) { - $p = new Person; - $p->name = $this->fetchString("gplay:author|apple:author", $node) ?? ""; - $p->mail = $this->fetchString("gplay:email|apple:email", $node) ?? ""; - $p->role = "webmaster"; - if (strlen($p->name)) { - $out[] = $p; - } - } - return count($out) ? $out : null; - } - - /** Primitive to fetch an Atom feed or entry's canonical URL */ - protected function getUrlAtom(): ?Url { - $node = $this->fetchAtomRelations("self"); - return $node->length ? $this->resolveNodeUrl($node->item(0), "href") : null; - } - - /** Primitive to fetch the modification date of an Atom feed/entry */ - protected function getDateModifiedAtom(): ?Date { - return $this->fetchDate("atom:updated"); - } - - /** Primitive to fetch the modification date of an Atom feed/entry */ - protected function getDateModifiedDC(): ?Date { - return $this->fetchDate("dc:date"); - } - - /** Primitive to fetch the modification date of an Atom entry */ - protected function getDateCreatedAtom(): ?Date { - return $this->fetchDate("atom:published"); - } - - /** Primitive to fetch the list of entries in an Atom feed */ - protected function getEntriesAtom(): ?array { - $out = []; - foreach ($this->fetchElements("atom:entry") ?? [] as $node) { - $out[] = new FeedEntry($node, $this, $this->xpath); - } - return count($out) ? $out : null; - } - - /** Primitive to fetch the list of entries in an RDF feed */ - protected function getEntriesRss1(): ?array { - $out = []; - foreach ($this->fetchElements("rss1:item", $this->subject->ownerDocument->documentElement) ?? $this->fetchElements("rss1:item") ?? $this->fetchElements("rss0:item", $this->subject->ownerDocument->documentElement) ?? $this->fetchElements("rss0:item") ?? [] as $node) { - $out[] = new FeedEntry($node, $this, $this->xpath); - } - return count($out) ? $out : null; - } - - /** Primitive to fetch the list of entries in an RSS feed */ - protected function getEntriesRss2(): ?array { - $out = []; - foreach ($this->fetchElements("item") ?? [] as $node) { - $out[] = new FeedEntry($node, $this, $this->xpath); - } - return count($out) ? $out : null; - } - - /** Primitive to fetch the URL of a article related to the entry */ - protected function getRelatedLinkAtom(): ?Url { - // FIXME: Atom link fetching should ideally prefer links to text/html resources or the like over e.g. other-format newsfeeds, generic XML, images, etc - $node = $this->fetchAtomRelations("related"); - return $node->length ? $this->resolveNodeUrl($node->item(0), "href") : null; - } -} diff --git a/tests/cases/XML/feed-atom.yaml b/tests/cases/XML/feed-atom.yaml index 7834784..b5c9a13 100644 --- a/tests/cases/XML/feed-atom.yaml +++ b/tests/cases/XML/feed-atom.yaml @@ -463,3 +463,47 @@ Categories: - name: ack label: 'Ack!' domain: '4:3' # Not treated as a URI + +Feed authors and contributors: + input: > + + + John Doe + john.doe@example.com + http://example.com/ + + + Jane Doe + jane.doe@example.com + http://example.net/ + + + Larry + @example.com + http://example.]/ + + + Curly + + + + moe@example.com + http://example.org/ + + + output: + format: atom + version: '1.0' + people: + - name: 'John Doe' + mail: 'john.doe@example.com' + url: 'http://example.com/' + role: author + - name: 'Jane Doe' + mail: 'jane.doe@example.com' + url: 'http://example.net/' + role: author + - name: Larry + role: contributor + - name: Curly + role: contributor