diff --git a/lib/Parser/XML/Construct.php b/lib/Parser/XML/Construct.php index 892f5ee..4f728a0 100644 --- a/lib/Parser/XML/Construct.php +++ b/lib/Parser/XML/Construct.php @@ -6,8 +6,10 @@ declare(strict_types=1); namespace JKingWeb\Lax\Parser\XML; +use JKingWeb\Lax\Date; use JKingWeb\Lax\Person\Person; use JKingWeb\Lax\Person\Collection as PersonCollection; +use JKingWeb\Lax\Text; trait Construct { use \JKingWeb\Lax\Parser\Construct; @@ -23,7 +25,7 @@ trait Construct { protected function fetchElement(string $query, \DOMNode $context = null) { $node = @$this->xpath->query("(".$query.")[1]", $context ?? $this->subject); if ($node===false) { - throw new \Exception("Invalid XPath query: $query"); + throw new \Exception("Invalid XPath query: $query"); // @codeCoverageIgnore } return ($node->length) ? $node->item(0) : null; } @@ -34,13 +36,13 @@ trait Construct { } /** Retrieves the trimmed text content of a DOM element based on an XPath query */ - protected function fetchText(string $query, \DOMNode $context = null) { + protected function fetchString(string $query, \DOMNode $context = null): ?string { $node = $this->fetchElement($query, $context); return ($node) ? $this->trimText($node->textContent) : null; } /** Retrieves the trimmed text content of multiple DOM elements based on an XPath query */ - protected function fetchTextMulti(string $query, \DOMNode $context = null) { + protected function fetchStringMulti(string $query, \DOMNode $context = null) { $out = []; $nodes = $this->xpath->query($query, $context ?? $this->subject); foreach ($nodes as $node) { @@ -50,7 +52,7 @@ trait Construct { } /** Retrieves the trimmed plain-text or HTML content of an Atom text construct based on an XPath query */ - protected function fetchTextAtom(string $query, bool $html = false) { + protected function fetchStringAtom(string $query, bool $html = false): ?Text { $node = $this->fetchElement($query); if ($node) { if (!$node->hasAttribute("type") || $node->getAttribute("type")=="text") { @@ -69,8 +71,8 @@ trait Construct { } /** Retrieves and parses a date from the content of a DOM element based on an XPath query */ - protected function fetchDate(string $query, \DOMNode $context = null) { - return $this->parseDate($this->fetchText($query, $context) ?? ""); + protected function fetchDate(string $query, \DOMNode $context = null): ?Date { + return $this->parseDate($this->fetchString($query, $context) ?? ""); } /** Returns a node-list of Atom link elements with the desired relation or equivalents. @@ -104,8 +106,8 @@ trait Construct { * - user@example.com (Full Name) * - Full Name */ - protected function fetchPeople(string $query, string $role) { - $people = $this->fetchTextMulti($query) ?? []; + protected function fetchPeople(string $query, string $role): ?PersonCollection { + $people = $this->fetchStringMulti($query) ?? []; $out = new PersonCollection; foreach ($people as $person) { if (!strlen($person)) { @@ -139,13 +141,13 @@ trait Construct { } /** Finds and parses Atom person-constructs, and returns a collection of Person objects */ - protected function fetchPeopleAtom(string $query, string $role) { + protected function fetchPeopleAtom(string $query, string $role): ?PersonCollection { $nodes = $this->fetchElements($query); $out = new PersonCollection; foreach ($nodes as $node) { $p = new Person; - $p->mail = $this->fetchText("atom:email", $node) ?? ""; - $p->name = $this->fetchText("atom:name", $node) ?? $p->mail; + $p->mail = $this->fetchString("atom:email", $node) ?? ""; + $p->name = $this->fetchString("atom:name", $node) ?? $p->mail; $p->url = $this->fetchUrl("atom:uri", $node); $p->role = $role; if (strlen($p->name)) { diff --git a/lib/Parser/XML/Feed.php b/lib/Parser/XML/Feed.php index a3d4d29..4d45037 100644 --- a/lib/Parser/XML/Feed.php +++ b/lib/Parser/XML/Feed.php @@ -18,101 +18,108 @@ class Feed implements \JKingWeb\Lax\Parser\Feed { use Primitives\Construct; use Primitives\Feed; + /** @var string */ + protected $data; + /** @var string */ + protected $contentType; + /** @var \JKingWeb\Lax\Url */ + protected $url; + /** @var \DOMDocument */ + protected $document; + /** @var \DOMElement */ + protected $subject; + /** @var \DOMXpath */ + protected $xpath; + /** Constructs a parsed feed */ public function __construct(string $data, string $contentType = "", string $url = "") { - $this->init($data, $contentType, $url); + $this->data = $data; + $this->contentType = $contentType; + if (strlen($url ?? "")) { + $this->url = new Url($url); + } } /** Performs initialization of the instance */ - protected function init(string $data, string $contentType = "", string $url = "") { - $this->reqUrl = $url; + protected function init(FeedStruct $feed): FeedStruct { $this->document = new \DOMDocument(); - $this->document->loadXML($data, \LIBXML_BIGLINES | \LIBXML_COMPACT); - $this->document->documentURI = $url; + $this->document->loadXML($this->data, \LIBXML_BIGLINES | \LIBXML_COMPACT); + $this->document->documentURI = (string) $this->url; $this->xpath = new XPath($this->document); $this->subject = $this->document->documentElement; $ns = $this->subject->namespaceURI; $name = $this->subject->localName; if (is_null($ns) && $name=="rss") { $this->subject = $this->fetchElement("channel") ?? $this->subject; - $this->type = "rss"; - $this->version = $this->document->documentElement->getAttribute("version"); + $feed->format = "rss"; + $feed->version = $this->document->documentElement->getAttribute("version"); } elseif ($ns==XPath::NS['rdf'] && $name=="RDF") { - $this->type = "rdf"; + $feed->format = "rdf"; $channel = $this->fetchElement("rss1:channel|rss0:channel"); if ($channel) { $this->subject = $channel; - $this->version = ($channel->namespaceURI==XPath::NS['rss1']) ? "1.0" : "0.90"; + $feed->version = ($channel->namespaceURI==XPath::NS['rss1']) ? "1.0" : "0.90"; } else { $element = $this->fetchElement("rss1:item|rss0:item|rss1:image|rss0:image"); if ($element) { - $this->version = ($element->namespaceURI==XPath::NS['rss1']) ? "1.0" : "0.90"; + $feed->version = ($element->namespaceURI==XPath::NS['rss1']) ? "1.0" : "0.90"; } } } elseif ($ns==XPath::NS['atom'] && $name=="feed") { - $this->type = "atom"; - $this->version = "1.0"; + $feed->format = "atom"; + $feed->version = "1.0"; } else { throw new \Exception; } - $this->url = $url; + $feed->meta->url = $this->url; + return $feed; } - /** Parses the feed to extract sundry metadata */ + /** Parses the feed to extract data */ public function parse(FeedStruct $feed = null): FeedStruct { - $feed = $feed ?? new FeedStruct; + $feed = $this->init($feed ?? new FeedStruct); + $feed->meta->url = $this->url; + //$feed->sched->expired = $this->getExpired(); $feed->id = $this->getId(); - $feed->url = $this->getUrl(); - $feed->link = $this->getLink(); - $feed->title = $this->getTitle(); - $feed->summary = $this->getSummary(); - $feed->people = $this->getPeople(); - $feed->author = $this->people->primary(); - $feed->dateModified = $this->getDateModified(); - $feed->entries = $this->getEntries($feed); - // do a second pass on missing data we'd rather fill in - $feed->link = strlen($this->link) ? $this->link : $this->url; - $feed->title = strlen($this->title) ? $this->title : $this->link; - // do extra stuff just to test it - $feed->categories = $this->getCategories(); + //$feed->lang = $this->getLang(); + //$feed->url = $this->getUrl(); + //$feed->link = $this->getLink(); + //$feed->title = $this->getTitle(); + //$feed->summary = $this->getSummary(); + //$feed->dateModified = $this->getDateModified(); + //$feed->icon = $this->getIcon(); + //$feed->image = $this->getImage(); + //$feed->people = $this->getPeople(); + //$feed->categories = $this->getCategories(); + //$feed->entries = $this->getEntries($feed); return $feed; } + + public function getId(): ?string { + return $this->getIdAtom() ?? $this->getIdDC() ?? $this->getIdRss2() ?? ""; + } - /** General function to fetch the canonical feed URL - * - * If the feed does not include a canonical URL, the request URL is returned instead - */ public function getUrl(): ?Url { return $this->getUrlAtom() ?? $this->getUrlRss1() ?? $this->getUrlPod() ?? $this->reqUrl; } - /** General function to fetch the feed title */ public function getTitle(): ?Text { return $this->getTitleAtom() ?? $this->getTitleRss1() ?? $this->getTitleRss2() ?? $this->getTitleDC() ?? $this->getTitlePod() ?? ""; } - /** General function to fetch the feed's Web-representation URL */ public function getLink(): ?Url { return $this->getLinkAtom() ?? $this->getLinkRss1() ?? $this->getLinkRss2() ?? ""; } - /** General function to fetch the description of a feed */ public function getSummary(): ?Text { // unlike most other data, Atom is not preferred, because Atom doesn't really have feed summaries return $this->getSummaryDC() ?? $this->getSummaryRss1() ?? $this->getSummaryRss2() ?? $this->getSummaryPod() ?? $this->getSummaryAtom() ?? ""; } - /** General function to fetch the categories of a feed */ public function getCategories(): CategoryCollection { return $this->getCategoriesAtom() ?? $this->getCategoriesRss2() ?? $this->getCategoriesDC() ?? $this->getCategoriesPod() ?? new CategoryCollection; } - /** General function to fetch the feed identifier */ - public function getId(): ?string { - return $this->getIdAtom() ?? $this->getIdDC() ?? $this->getIdRss2() ?? ""; - } - - /** General function to fetch a collection of all people associated with a feed */ public function getPeople(): PersonCollection { $authors = $this->getAuthorsAtom() ?? $this->getAuthorsDC() ?? $this->getAuthorsPod() ?? $this->getAuthorsRss2() ?? new PersonCollection; $contributors = $this->getContributorsAtom() ?? $this->getContributorsDC() ?? new PersonCollection; @@ -121,12 +128,10 @@ class Feed implements \JKingWeb\Lax\Parser\Feed { return $authors->merge($contributors, $editors, $webmasters); } - /** General function to fetch the modification date of a feed */ public function getDateModified(): ?Date { return $this->getDateModifiedAtom() ?? $this->getDateModifiedDC() ?? $this->getDateModifiedRss2(); } - /** General function to fetch the entries of a feed */ public function getEntries(FeedStruct $feed = null): array { return $this->getEntriesAtom() ?? $this->getEntriesRss1() ?? $this->getEntriesRss2() ?? []; } diff --git a/lib/Parser/XML/Primitives/Construct.php b/lib/Parser/XML/Primitives/Construct.php index 7567965..a8be996 100644 --- a/lib/Parser/XML/Primitives/Construct.php +++ b/lib/Parser/XML/Primitives/Construct.php @@ -10,56 +10,58 @@ use JKingWeb\Lax\Person\Person; use JKingWeb\Lax\Person\Collection as PersonCollection; use JKingWeb\Lax\Category\Category; use JKingWeb\Lax\Category\Collection as CategoryCollection; +use JKingWeb\Lax\Date; use JKingWeb\Lax\Parser\XML\Entry as FeedEntry; +use JKingWeb\Lax\Text; +use JKingWeb\Lax\Url; trait Construct { /** Primitive to fetch an Atom feed/entry title - * - * This fetches the title in plain text rather than HTML, even if HTML is provided in the feed/entry */ - protected function getTitleAtom() { - return $this->fetchTextAtom("atom:title"); + protected function getTitleAtom(): ?Text { + // FIXME: fetch rich text + return $this->fetchStringAtom("atom:title"); } /** Primitive to fetch an RSS feed/entry title */ - protected function getTitleRss2() { - return $this->fetchText("title"); + protected function getTitleRss2(): ?Text { + return $this->fetchString("title"); } /** Primitive to fetch an RDF feed/entry title */ - protected function getTitleRss1() { - return $this->fetchText("rss1:title|rss0:title"); + protected function getTitleRss1(): ?Text { + return $this->fetchString("rss1:title|rss0:title"); } /** Primitive to fetch a Dublin Core feed/entry title */ - protected function getTitleDC() { - return $this->fetchText("dc:title"); + protected function getTitleDC(): ?Text { + return $this->fetchString("dc:title"); } /** Primitive to fetch an Apple podcast/episdoe title */ - protected function getTitlePod() { - return $this->fetchText("apple:title"); + protected function getTitlePod(): ?Text { + return $this->fetchString("apple:title"); } /** Primitive to fetch an Atom feed/entry Web-representation URL */ - protected function getLinkAtom() { + protected function getLinkAtom(): ?Url { // FIXME: Atom link fetching should ideally prefer links to text/html resources or the like over e.g. other-format newsfeeds, generic XML, images, etc $node = $this->fetchAtomRelations(); return $node->length ? $this->resolveNodeUrl($node->item(0), "href") : null; } /** Primitive to fetch an RSS feed/entry Web-representation URL */ - protected function getLinkRss2() { + protected function getLinkRss2(): ?Url { return $this->fetchUrl("link") ?? $this->fetchUrl("guid[not(@isPermalink='false')]"); } /** Primitive to fetch an RDF feed/entry Web-representation URL */ - protected function getLinkRss1() { + protected function getLinkRss1(): ?Url { return $this->fetchUrl("rss1:link|rss0:link"); } /** Primitive to fetch Atom feed/entry categories */ - protected function getCategoriesAtom() { + protected function getCategoriesAtom(): ?CategoryCollection { $out = new CategoryCollection; foreach ($this->fetchElements("atom:category[@term]") ?? [] as $node) { $c = new Category; @@ -74,7 +76,7 @@ trait Construct { } /** Primitive to fetch RSS feed/entry categories */ - protected function getCategoriesRss2() { + protected function getCategoriesRss2(): ?CategoryCollection { $out = new CategoryCollection; foreach ($this->fetchElements("category") ?? [] as $node) { $c = new Category; @@ -91,9 +93,9 @@ trait Construct { * * Dublin Core doesn't have an obvious category type, so we use 'subject' as a nearest approximation */ - protected function getCategoriesDC() { + protected function getCategoriesDC(): ?CategoryCollection { $out = new CategoryCollection; - foreach ($this->fetchTextMulti("dc:subject") ?? [] as $text) { + foreach ($this->fetchStringMulti("dc:subject") ?? [] as $text) { if (strlen($text)) { $c = new Category; $c->name = $text; @@ -104,7 +106,7 @@ trait Construct { } /** Primitive to fetch podcast/episode categories */ - protected function getCategoriesPod() { + protected function getCategoriesPod(): ?CategoryCollection { $out = new CategoryCollection; foreach ($this->fetchElements("apple:category|gplay:category") ?? [] as $node) { $c = new Category; @@ -117,50 +119,50 @@ trait Construct { } /** Primitive to fetch an Atom feed/entry identifier */ - protected function getIdAtom() { - return $this->fetchText("atom:id"); + protected function getIdAtom(): ?string { + return $this->fetchString("atom:id"); } /** Primitive to fetch an RSS feed/entry identifier * * Using RSS' for feed identifiers is non-standard, but harmless */ - protected function getIdRss2() { - return $this->fetchText("guid"); + protected function getIdRss2(): ?string { + return $this->fetchString("guid"); } /** Primitive to fetch a Dublin Core feed/entry identifier */ - protected function getIdDC() { - return $this->fetchText("dc:identifier"); + protected function getIdDC(): ?string { + return $this->fetchString("dc:identifier"); } /** Primitive to fetch a collection of authors associated with a feed/entry via Dublin Core */ - protected function getAuthorsDC() { + protected function getAuthorsDC(): ?PersonCollection { return $this->fetchPeople("dc:creator", "author"); } /** Primitive to fetch a collection of contributors associated with a feed/entry via Dublin Core */ - protected function getContributorsDC() { + protected function getContributorsDC(): ?PersonCollection { return $this->fetchPeople("dc:ccontributor", "contributor"); } /** Primitive to fetch a collection of authors associated with an RSS feed/entry */ - protected function getAuthorsRss2() { + protected function getAuthorsRss2(): ?PersonCollection { return $this->fetchPeople("author", "author"); } /** Primitive to fetch a collection of editors associated with an RSS feed/entry */ - protected function getEditorsRss2() { + protected function getEditorsRss2(): ?PersonCollection { return $this->fetchPeople("managingEditor", "editor"); } /** Primitive to fetch a collection of authors associated with an RSS feed/entry */ - protected function getWebmastersRss2() { + protected function getWebmastersRss2(): ?PersonCollection { return $this->fetchPeople("webMaster", "webMaster"); } /** Primitive to fetch a collection of contributors associated with an Atom feed */ - protected function getContributorsAtom() { + protected function getContributorsAtom(): ?PersonCollection { return $this->fetchPeopleAtom("atom:contributor", "contributor"); } @@ -168,11 +170,11 @@ trait Construct { * * The collection only ever contains the first author found: podcasts implicitly have only one author */ - protected function getAuthorsPod() { + protected function getAuthorsPod(): ?PersonCollection { $out = new PersonCollection; $p = new Person; - $p->name = $this->fetchText("gplay:author|apple:author") ?? ""; - $p->mail = $this->fetchText("gplay:email|apple:email") ?? ""; + $p->name = $this->fetchString("gplay:author|apple:author") ?? ""; + $p->mail = $this->fetchString("gplay:email|apple:email") ?? ""; $p->role = "author"; if (strlen($p->name)) { $out[] = $p; @@ -184,13 +186,13 @@ trait Construct { * * The collection only ever contains the first webmaster found: podcasts implicitly have only one webmaster */ - protected function getWebmastersPod() { + protected function getWebmastersPod(): ?PersonCollection { $out = new PersonCollection; $node = $this->fetchElement("gplay:owner|apple:owner"); if ($node) { $p = new Person; - $p->name = $this->fetchText("gplay:author|apple:author", $node) ?? ""; - $p->mail = $this->fetchText("gplay:email|apple:email", $node) ?? ""; + $p->name = $this->fetchString("gplay:author|apple:author", $node) ?? ""; + $p->mail = $this->fetchString("gplay:email|apple:email", $node) ?? ""; $p->role = "webmaster"; if (strlen($p->name)) { $out[] = $p; @@ -200,28 +202,28 @@ trait Construct { } /** Primitive to fetch an Atom feed or entry's canonical URL */ - protected function getUrlAtom() { + protected function getUrlAtom(): ?Url { $node = $this->fetchAtomRelations("self"); return $node->length ? $this->resolveNodeUrl($node->item(0), "href") : null; } /** Primitive to fetch the modification date of an Atom feed/entry */ - protected function getDateModifiedAtom() { + protected function getDateModifiedAtom(): ?Date { return $this->fetchDate("atom:updated"); } /** Primitive to fetch the modification date of an Atom feed/entry */ - protected function getDateModifiedDC() { + protected function getDateModifiedDC(): ?Date { return $this->fetchDate("dc:date"); } /** Primitive to fetch the modification date of an Atom entry */ - protected function getDateCreatedAtom() { + protected function getDateCreatedAtom(): ?Date { return $this->fetchDate("atom:published"); } /** Primitive to fetch the list of entries in an Atom feed */ - protected function getEntriesAtom() { + protected function getEntriesAtom(): ?array { $out = []; foreach ($this->fetchElements("atom:entry") ?? [] as $node) { $out[] = new FeedEntry($node, $this, $this->xpath); @@ -230,7 +232,7 @@ trait Construct { } /** Primitive to fetch the list of entries in an RDF feed */ - protected function getEntriesRss1() { + protected function getEntriesRss1(): ?array { $out = []; foreach ($this->fetchElements("rss1:item", $this->subject->ownerDocument->documentElement) ?? $this->fetchElements("rss1:item") ?? $this->fetchElements("rss0:item", $this->subject->ownerDocument->documentElement) ?? $this->fetchElements("rss0:item") ?? [] as $node) { $out[] = new FeedEntry($node, $this, $this->xpath); @@ -239,7 +241,7 @@ trait Construct { } /** Primitive to fetch the list of entries in an RSS feed */ - protected function getEntriesRss2() { + protected function getEntriesRss2(): ?array { $out = []; foreach ($this->fetchElements("item") ?? [] as $node) { $out[] = new FeedEntry($node, $this, $this->xpath); @@ -248,7 +250,7 @@ trait Construct { } /** Primitive to fetch the URL of a article related to the entry */ - protected function getRelatedLinkAtom() { + protected function getRelatedLinkAtom(): ?Url { // FIXME: Atom link fetching should ideally prefer links to text/html resources or the like over e.g. other-format newsfeeds, generic XML, images, etc $node = $this->fetchAtomRelations("related"); return $node->length ? $this->resolveNodeUrl($node->item(0), "href") : null; diff --git a/lib/Parser/XML/Primitives/Feed.php b/lib/Parser/XML/Primitives/Feed.php index c3690d2..9814ab4 100644 --- a/lib/Parser/XML/Primitives/Feed.php +++ b/lib/Parser/XML/Primitives/Feed.php @@ -15,27 +15,27 @@ trait Feed { * Atom does not have a 'description' element like the RSSes, but it does have 'subtitle', which fills roughly the same function */ protected function getSummaryAtom() { - return $this->fetchTextAtom("atom:subtitle"); + return $this->fetchStringAtom("atom:subtitle"); } /** Primitive to fetch an RSS feed summary */ protected function getSummaryRss2() { - return $this->fetchText("description"); + return $this->fetchString("description"); } /** Primitive to fetch an RDF feed summary */ protected function getSummaryRss1() { - return $this->fetchText("rss1:description|rss0:description"); + return $this->fetchString("rss1:description|rss0:description"); } /** Primitive to fetch a Dublin Core feed summary */ protected function getSummaryDC() { - return $this->fetchText("dc:description"); + return $this->fetchString("dc:description"); } /** Primitive to fetch a podcast summary */ protected function getSummaryPod() { - return $this->fetchText("apple:summary|gplay:description") ?? $this->fetchText("apple:subtitle"); + return $this->fetchString("apple:summary|gplay:description") ?? $this->fetchString("apple:subtitle"); } /** Primitive to fetch a collection of authors associated with an Atom feed */ diff --git a/tests/cases/JSON/JSONTest.php b/tests/cases/JSON/JSONTest.php index eda5713..cf86d9b 100644 --- a/tests/cases/JSON/JSONTest.php +++ b/tests/cases/JSON/JSONTest.php @@ -51,6 +51,7 @@ use JKingWeb\Lax\Category\Collection as CategoryCollection; use JKingWeb\Lax\Enclosure\Collection as EnclosureCollection; /** + * @covers JKingWeb\Lax\Parser\Construct * @covers JKingWeb\Lax\Parser\JSON\Feed * @covers JKingWeb\Lax\Parser\JSON\Entry */