lax/lib/Parser/XML/Construct.php

<?php
/** @license MIT
 * Copyright 2018 J. King et al.
 * See LICENSE and AUTHORS files for details */

declare(strict_types=1);
namespace MensBeam\Lax\Parser\XML;

use MensBeam\Lax\Person\Person;
use MensBeam\Lax\Person\Collection as PersonCollection;
use MensBeam\Lax\Text;
use MensBeam\Lax\Date;
use MensBeam\Lax\Url;

/** Base class for XML feed constructs: XPath-based fetch helpers plus format-specific primitives
 *
 * The helpers trimText(), parseDate() and parseMediaType() used below are not defined in this class;
 * they are presumably supplied by the imported Construct trait
 */
abstract class Construct {
    use \MensBeam\Lax\Parser\Construct;

    /** @var \DOMDocument */
    protected $document;
    /** @var \DOMXPath */
    protected $xpath;
    /** @var \DOMElement */
    protected $subject;

    /** Retrieves the first element node matching an XPath query
     *
     * @param string $query The XPath query to evaluate
     * @param \DOMNode|null $context The context node for the XPath query; the subject element is used if omitted
     * @return \DOMElement|null The first matching node, or null if nothing matched
     * @throws \Exception If the query could not be evaluated
     */
    protected function fetchElement(string $query, ?\DOMNode $context = null): ?\DOMElement {
        // warnings are suppressed so that a failed query is reported via the exception below instead
        $node = @$this->xpath->query("(".$query.")[1]", $context ?? $this->subject);
        if ($node === false) {
            throw new \Exception("Invalid XPath query: $query"); // @codeCoverageIgnore
        }
        return ($node->length) ? $node->item(0) : null;
    }

    /** Retrieves the trimmed text content of one or more DOM elements based on an XPath query, optionally matching a pattern
     *
     * Returns null if no suitable nodes were found
     *
     * @param string $query The XPath query of the nodes to return
     * @param string|null $pattern The pattern to optionally filter matches with. The pattern should not include delimiters or anchors and is always case-insensitive
     * @param bool|null $multi Whether to return multiple results as an array (true) or one result as a string (false, default)
     * @param \DOMNode|null $context The context node for the XPath query; the subject element is used if omitted
     * @return string|array|null
     */
    protected function fetchString(string $query, ?string $pattern = null, ?bool $multi = null, ?\DOMNode $context = null) {
        $out = [];
        // wrap the bare pattern in delimiters and anchors, escaping any slashes which would otherwise end the pattern early
        $pattern = strlen($pattern ?? "") ? "/^(?:".str_replace("/", "\\/", $pattern).")$/i" : "";
        $multi = $multi ?? false;
        $nodes = $this->xpath->query($query, $context ?? $this->subject);
        foreach ($nodes as $node) {
            $t = $this->trimText($node->textContent);
            if (!$pattern || preg_match($pattern, $t)) {
                if (!$multi) {
                    // only the first acceptable value is wanted
                    return $t;
                }
                $out[] = $t;
            }
        }
        return ($out) ? $out : null;
    }

    /** Retrieves and parses a date from one or more DOM elements based on an XPath query
     *
     * Returns null if no suitable nodes were found
     *
     * @param string $query The XPath query of the nodes to return
     * @param bool|null $multi Whether to return multiple results as an array (true) or one result as a date object (false, default)
     * @param \DOMNode|null $context The context node for the XPath query; the subject element is used if omitted
     * @return \MensBeam\Lax\Date|array|null
     */
    protected function fetchDate(string $query, ?bool $multi = null, ?\DOMNode $context = null) {
        $out = [];
        // always fetch every candidate string: the first node may not hold a parseable date
        foreach ((array) $this->fetchString($query, null, true, $context) as $d) {
            if ($d = $this->parseDate($d ?? "")) {
                if (!$multi) {
                    return $d;
                }
                $out[] = $d;
            }
        }
        return $out ?: null;
    }

    /** Returns the first valid URL matching an XPath query. Relative URLs are resolved when possible
     *
     * @param string $query The XPath query of the node to return
     * @param \DOMNode|null $context The context node for the XPath query; the subject element is used if omitted
     */
    protected function fetchUrl(string $query, ?\DOMNode $context = null): ?Url {
        foreach ($this->xpath->query($query, $context ?? $this->subject) as $node) {
            $url = trim($node->textContent);
            if (strlen($url)) {
                try {
                    return new Url($url, $node->baseURI);
                } catch (\InvalidArgumentException $e) {
                    // don't return a result that doesn't evaluate to a valid URL of some sort
                }
            }
        }
        return null;
    }

    /** Returns an array of Atom link elements with the desired relation or equivalents.
     *
     * Links without an href attribute, or whose href does not evaluate to a valid URL, are excluded.
     *
     * @see https://tools.ietf.org/html/rfc4287#section-4.2.7.2
     *
     * @param string $rel The desired relation: a short name, its IANA URL form, or a custom URL. An empty value means "alternate"
     * @param \DOMNode|null $context The context node for the XPath query; the subject element is used if omitted
     * @return \DOMElement[] The matching link elements, in document order
     */
    protected function fetchAtomRelations(string $rel = "", ?\DOMNode $context = null): array {
        $iana = "http://www.iana.org/assignments/relation/";
        // normalize the relation
        $custom = false;
        $url = null;
        $rel = trim($rel);
        if ($rel === "") {
            $rel = "alternate";
        } elseif (strpos(strtolower($rel), $iana) === 0) {
            // the short name must be lowercased too, as candidate relations are lowercased before comparison
            $rel = strtolower(substr($rel, strlen($iana)));
            // a bare prefix is handled the same way as for candidate links below
            $rel = strlen($rel) ? $rel : "alternate";
        } elseif (preg_match("<^[a-z\.\-]+$>i", $rel)) {
            $rel = strtolower($rel);
        } else {
            $custom = true;
            $url = (string) new Url($rel);
        }
        // look at all the links that have a non-empty href attribute
        $out = [];
        foreach ($this->xpath->query("atom:link[normalize-space(@href)]", $context ?? $this->subject) as $l) {
            try {
                // validate against the link's base URI, since that is how the href is later resolved
                new Url($l->getAttribute("href"), $l->baseURI);
            } catch (\InvalidArgumentException $e) {
                // reject any links which do not have valid URLs
                continue;
            }
            $r = trim($l->getAttribute("rel"));
            if ($custom) {
                try {
                    $match = ($url === (string) new Url($r));
                } catch (\InvalidArgumentException $e) {
                    // a relation which is not a valid URL cannot match a custom relation; it must not abort the search
                    $match = false;
                }
            } else {
                $r = trim(strtolower(rawurldecode($r)));
                $r = (strpos($r, $iana) === 0) ? substr($r, strlen($iana)) : $r;
                $r = !strlen($r) ? "alternate" : $r;
                $match = ($r === $rel);
            }
            if ($match) {
                $out[] = $l;
            }
        }
        return $out;
    }

    /** Returns the first Atom link URL which matches the desired relation, with nearest desired media type, or no media type if none match
     *
     * Relative hrefs are resolved against the base URI of the link element
     *
     * @param string $rel The desired relation; see fetchAtomRelations()
     * @param string[] $mediaTypes Acceptable media types, listed from most to least preferred; if empty, the first matching link is used
     * @param \DOMNode|null $context The context node for the XPath query; the subject element is used if omitted
     */
    protected function fetchAtomRelation(string $rel = "", array $mediaTypes = [], ?\DOMNode $context = null): ?Url {
        // tidy the list of media types; this orders them worst (0) to best (highest index) and then creates a hashtable
        $mediaTypes = array_flip(array_reverse(array_values(array_unique(array_map(function(string $t) {
            return strtolower(trim($t));
        }, $mediaTypes)))));
        $rels = $this->fetchAtomRelations($rel, $context);
        if ($rels && !$mediaTypes) {
            // no preference was expressed, so the first link is as good as any
            return new Url($rels[0]->getAttribute("href"), $rels[0]->baseURI);
        }
        // each step yields either null (no candidate yet) or a [link element, rank] pair
        $result = array_reduce($rels, function($best, $cur) use ($mediaTypes) {
            $t = trim($cur->getAttribute("type"));
            // absence of media type is acceptable if no better match yet exists
            if (!strlen($t)) {
                if (!$best) {
                    return [$cur, -1]; // any match will rank higher than -1
                }
            }
            $t = $this->parseMediaType($t);
            if ($t) {
                $rank = $mediaTypes[$t] ?? null;
                if (!is_null($rank) && (!$best || $rank > $best[1])) {
                    // if the media type is acceptable and there is currently no candidate or the candidate ranks lower, use the current link
                    return [$cur, $rank];
                }
            }
            return $best;
        });
        return $result ? new Url($result[0]->getAttribute("href"), $result[0]->baseURI) : null;
    }

    /** Primitive to fetch an Atom feed/entry identifier */
    protected function getIdAtom(): ?string {
        return $this->fetchString("atom:id", ".+");
    }

    /** Primitive to fetch an RSS feed/entry identifier
     *
     * Using RSS' <guid> for feed identifiers is non-standard, but harmless
     */
    protected function getIdRss2(): ?string {
        return $this->fetchString("guid", ".+");
    }

    /** Primitive to fetch a Dublin Core feed/entry identifier */
    protected function getIdDC(): ?string {
        return $this->fetchString("dc:identifier", ".+");
    }

    /** Primitive to fetch the language declared by the nearest non-empty xml:lang attribute */
    protected function getLangXML(): ?string {
        // walk up the tree looking for the nearest language tag
        $el = $this->subject;
        do {
            $out = $this->fetchString("@xml:lang", ".+", false, $el);
            $el = $el->parentNode;
        } while (is_null($out) && $el);
        return $out;
    }

    /** Primitive to fetch a Dublin Core feed/entry language */
    protected function getLangDC(): ?string {
        return $this->fetchString("dc:language", ".+");
    }

    /** Primitive to fetch an RSS feed language */
    protected function getLangRss2(): ?string {
        return $this->fetchString("language", ".+");
    }

    /** Primitive to fetch an Atom "alternate" link, preferring HTML over XHTML over links with no media type */
    protected function getLinkAtom(): ?Url {
        return $this->fetchAtomRelation("alternate", ["text/html", "application/xhtml+xml"]);
    }

    /** Primitive to fetch an RSS link, falling back to a <guid> which is a permalink
     *
     * The attribute is named "isPermaLink" in RSS 2.0 and XML names are case-sensitive;
     * the miscased "isPermalink" spelling is also honoured for leniency
     */
    protected function getLinkRss2(): ?Url {
        return $this->fetchUrl("link") ?? $this->fetchUrl("guid[(not(@isPermaLink) or @isPermaLink='true') and (not(@isPermalink) or @isPermalink='true')]");
    }

    /** Primitive to fetch an RSS 1.0 or RSS 0.90 link */
    protected function getLinkRss1(): ?Url {
        return $this->fetchUrl("rss1:link|rss0:link");
    }
}
Initial prototype of feed parser Support RSS, RDF RSS (with various extensions), Atom, iTunes podcasts, and Dublin Core metadata; JSON Feed support is forthcoming Currently feed-level titles, links, summaries, and categories are implemented 6 years ago			`<?php`
			`/** @license MIT`
			`* Copyright 2018 J. King et al.`
			`* See LICENSE and AUTHORS files for details */`

			`declare(strict_types=1);`
Change namespace to MensBeam 4 years ago			`namespace MensBeam\Lax\Parser\XML;`
Initial prototype of feed parser Support RSS, RDF RSS (with various extensions), Atom, iTunes podcasts, and Dublin Core metadata; JSON Feed support is forthcoming Currently feed-level titles, links, summaries, and categories are implemented 6 years ago
Change namespace to MensBeam 4 years ago			`use MensBeam\Lax\Person\Person;`
			`use MensBeam\Lax\Person\Collection as PersonCollection;`
			`use MensBeam\Lax\Text;`
Move more old code 4 years ago			`use MensBeam\Lax\Date;`
			`use MensBeam\Lax\Url;`
Re-organize namespace 6 years ago
Reorganize code 4 years ago			`abstract class Construct {`
Change namespace to MensBeam 4 years ago			`use \MensBeam\Lax\Parser\Construct;`
Implement JSON Feed 6 years ago
Initial prototype of feed parser Support RSS, RDF RSS (with various extensions), Atom, iTunes podcasts, and Dublin Core metadata; JSON Feed support is forthcoming Currently feed-level titles, links, summaries, and categories are implemented 6 years ago			`/** @var \DOMDocument */`
Apply house style 4 years ago			`protected $document;`
Initial prototype of feed parser Support RSS, RDF RSS (with various extensions), Atom, iTunes podcasts, and Dublin Core metadata; JSON Feed support is forthcoming Currently feed-level titles, links, summaries, and categories are implemented 6 years ago			`/** @var \DOMXPath */`
			`protected $xpath;`
			`/** @var \DOMElement */`
			`protected $subject;`

			`/** Retrieves an element node based on an XPath query */`
Refactor of string fetching; RSS update intervals There are bugs in the refactor; these will be fixed in next commit 4 years ago			`protected function fetchElement(string $query, \DOMNode $context = null): ?\DOMElement {`
Add canonical feed URLs 6 years ago			`$node = @$this->xpath->query("(".$query.")[1]", $context ?? $this->subject);`
Apply house style 4 years ago			`if ($node === false) {`
Initial cleanup of XML parser 4 years ago			`throw new \Exception("Invalid XPath query: $query"); // @codeCoverageIgnore`
Add canonical feed URLs 6 years ago			`}`
Initial prototype of feed parser Support RSS, RDF RSS (with various extensions), Atom, iTunes podcasts, and Dublin Core metadata; JSON Feed support is forthcoming Currently feed-level titles, links, summaries, and categories are implemented 6 years ago			`return ($node->length) ? $node->item(0) : null;`
			`}`

Refactor of string fetching; RSS update intervals There are bugs in the refactor; these will be fixed in next commit 4 years ago			`/** Retrieves the trimmed text content of one or more DOM elements based on an XPath query, optionally matching a pattern`
			`*`
			`* Returns null if no suitable nodes were found`
			`*`
			`* @param string $query The XPath query of the nodes to return`
			`* @param string\|null $pattern The pattern to optionally filter matches with. The pattern should not include delimiters or anchors and is always case-insensitive`
			`* @param bool\|null $multi Whether to return multiple results as an array (true) or one result as a string (false, default)`
			`* @param \DOMNode $context The context node for the XPath query`
			`* @return string\|array\|null`
			`*/`
			`protected function fetchString(string $query, ?string $pattern = null, ?bool $multi = null, ?\DOMNode $context = null) {`
Initial prototype of feed parser Support RSS, RDF RSS (with various extensions), Atom, iTunes podcasts, and Dublin Core metadata; JSON Feed support is forthcoming Currently feed-level titles, links, summaries, and categories are implemented 6 years ago			`$out = [];`
Refactor of string fetching; RSS update intervals There are bugs in the refactor; these will be fixed in next commit 4 years ago			`$pattern = strlen($pattern ?? "") ? "/^(?:".str_replace("/", "\\/", $pattern).")$/i" : "";`
			`$multi = $multi ?? false;`
Add Atom and podcast people primitives; fix iTunes namespace The general getPeople() function jumps through many hoops to get at least one author while also including any contributors, and including RSS' unique people-types. Atom logic for entries will be slightly different than for feeds; this still needs to be implemented. 6 years ago			`$nodes = $this->xpath->query($query, $context ?? $this->subject);`
Initial prototype of feed parser Support RSS, RDF RSS (with various extensions), Atom, iTunes podcasts, and Dublin Core metadata; JSON Feed support is forthcoming Currently feed-level titles, links, summaries, and categories are implemented 6 years ago			`foreach ($nodes as $node) {`
Refactor of string fetching; RSS update intervals There are bugs in the refactor; these will be fixed in next commit 4 years ago			`$t = $this->trimText($node->textContent);`
Fix string fetching 4 years ago			`if (!$pattern \|\| preg_match($pattern, $t)) {`
Refactor of string fetching; RSS update intervals There are bugs in the refactor; these will be fixed in next commit 4 years ago			`if (!$multi) {`
			`return $t;`
			`} else {`
			`$out[] = $t;`
			`}`
Initial prototype of feed parser Support RSS, RDF RSS (with various extensions), Atom, iTunes podcasts, and Dublin Core metadata; JSON Feed support is forthcoming Currently feed-level titles, links, summaries, and categories are implemented 6 years ago			`}`
			`}`
Refactor of string fetching; RSS update intervals There are bugs in the refactor; these will be fixed in next commit 4 years ago			`return ($out) ? $out : null;`
Initial prototype of feed parser Support RSS, RDF RSS (with various extensions), Atom, iTunes podcasts, and Dublin Core metadata; JSON Feed support is forthcoming Currently feed-level titles, links, summaries, and categories are implemented 6 years ago			`}`

Reorganize code 4 years ago			`/** Retrieves and parses a date from one or more DOM elements based on an XPath query`
			`*`
			`* Returns null if no suitable nodes were found`
			`*`
			`* @param string $query The XPath query of the nodes to return`
			`* @param bool\|null $multi Whether to return multiple results as an array (true) or one result as a date object (false, default)`
			`* @param \DOMNode $context The context node for the XPath query`
			`* @return \MensBeam\Lax\Date\|array\|null`
			`*/`
			`protected function fetchDate(string $query, ?bool $multi = null, \DOMNode $context = null) {`
			`$out = [];`
Add RSS1 schedule base 4 years ago			`foreach((array) $this->fetchString($query, null, true, $context) as $d) {`
Reorganize code 4 years ago			`if ($d = $this->parseDate($d ?? "")) {`
			`if (!$multi) {`
			`return $d;`
			`} else {`
			`$out[] = $d;`
			`}`
			`}`
			`}`
			`return $out ?: null;`
Add feed modification dates 6 years ago			`}`

Make base URL to string; feed URL fetching 4 years ago			`/** Returns the first valid URL matching an XPath query. Relative URLs are resolved when possible`
			`*`
			`* @param string $query The XPath query of the node to return`
			`* @param \DOMNode $context The context node for the XPath query`
			`*/`
			`protected function fetchUrl(string $query, \DOMNode $context = null): ?Url {`
			`foreach ($this->xpath->query($query, $context ?? $this->subject) as $node) {`
			`$url = trim($node->textContent);`
			`if (strlen($url)) {`
			`try {`
			`return new Url($url, $node->baseURI);`
			`} catch (\InvalidArgumentException $e) {`
			`// don't return a result that doesn't evaluate to a valid URL of some sort`
			`}`
			`}`
			`}`
			`return null;`
			`}`

			`/** Returns a node-list of Atom link elements with the desired relation or equivalents.`
			`*`
			`* Links without an href attribute are excluded.`
			`*`
			`* @see https://tools.ietf.org/html/rfc4287#section-4.2.7.2`
			`*/`
			`protected function fetchAtomRelations(string $rel = "", \DOMNode $context = null): array {`
			`// normalize the relation`
			`$custom = false;`
			`$rel = trim($rel);`
			`if ($rel === "") {`
			`$rel = "alternate";`
			`} elseif (strpos(strtolower($rel), "http://www.iana.org/assignments/relation/") === 0) {`
			`$rel = substr($rel, 41);`
			`} elseif (preg_match("<^[a-z\.\-]+$>i", $rel)) {`
			`$rel = strtolower($rel);`
			`} else {`
			`$custom = true;`
			`$url = (string) new Url($rel);`
			`}`
			`// look at all the links that have a non-empty href attribute`
			`$out = [];`
			`foreach ($this->xpath->query("atom:link[normalize-space(@href)]", $context ?? $this->subject) as $l) {`
			`try {`
			`new Url($l->getAttribute("href"));`
			`} catch (\InvalidArgumentException $e) {`
			`// reject any links which do not have valid URLs`
			`continue;`
			`}`
			`$r = trim($l->getAttribute("rel"));`
			`if ($custom) {`
			`if ($url === (string) new Url($r)) {`
			`$out[] = $l;`
			`}`
			`} else {`
			`$r = trim(strtolower(rawurldecode($r)));`
			`$r = (strpos($r, "http://www.iana.org/assignments/relation/") === 0) ? substr($r, 41) : $r;`
			`$r = !strlen($r) ? "alternate" : $r;`
			`if ($r === $rel) {`
			`$out[] = $l;`
			`}`
			`}`
			`}`
			`return $out;`
			`}`

			`/** Returns the first Atom link URL which matches the desired relation, with nearest desired media type, or no media type if none match */`
			`protected function fetchAtomRelation(string $rel = "", array $mediaTypes = [], \DOMNode $context = null): ?Url {`
			`// tidy ther list of media types; this orders them worst (0)to best (highest index) and then creates a hashtable`
			`$mediaTypes = array_flip(array_reverse(array_values(array_unique(array_map(function(string $t) {`
			`return strtolower(trim($t));`
			`}, $mediaTypes)))));`
			`$rels = $this->fetchAtomRelations($rel, $context);`
			`if ($rels && !$mediaTypes) {`
			`return new Url($rels[0]->getAttribute("href"), );`
			`}`
			`$result = array_reduce($rels, function($best, $cur) use ($mediaTypes) {`
			`$t = trim($cur->getAttribute("type"));`
			`// absence of media type is acceptable if no better match yet exists`
			`if (!strlen($t)) {`
			`if (!$best) {`
			`return [$cur, -1]; // any match will rank higher than -1`
			`}`
			`}`
			`$t = $this->parseMediaType($t);`
			`if ($t) {`
			`$rank = $mediaTypes[$t] ?? null;`
			`if (!is_null($rank) && (!$best \|\| $rank > $best[1])) {`
			`// if the media type is acceptable and there is currently no candidate or the candidate ranks lower, use the current link`
			`return [$cur, $rank];`
			`}`
			`}`
			`return $best;`
			`});`
			`return $result ? new Url($result[0]->getAttribute("href")) : null;`
			`}`

Reorganize code 4 years ago			`/** Primitive to fetch an Atom feed/entry identifier */`
			`protected function getIdAtom(): ?string {`
			`return $this->fetchString("atom:id", ".+");`
			`}`

			`/** Primitive to fetch an RSS feed/entry identifier`
			`*`
			`* Using RSS' <guid> for feed identifiers is non-standard, but harmless`
			`*/`
			`protected function getIdRss2(): ?string {`
			`return $this->fetchString("guid", ".+");`
			`}`

			`/** Primitive to fetch a Dublin Core feed/entry identifier */`
			`protected function getIdDC(): ?string {`
			`return $this->fetchString("dc:identifier", ".+");`
			`}`
Implement feed language 4 years ago
			`protected function getLangXML(): ?string {`
			`// walk up the tree looking for the nearest language tag`
			`$el = $this->subject;`
			`do {`
			`$out = $this->fetchString("@xml:lang", ".+", false, $el);`
			`$el = $el->parentNode;`
			`} while (is_null($out) && $el);`
			`return $out;`
			`}`

			`protected function getLangDC(): ?string {`
			`return $this->fetchString("dc:language", ".+");`
			`}`

			`protected function getLangRss2(): ?string {`
			`return $this->fetchString("language", ".+");`
			`}`
Feed links 4 years ago
			`protected function getLinkAtom(): ?Url {`
			`return $this->fetchAtomRelation("alternate", ["text/html", "application/xhtml+xml"]);`
			`}`

			`protected function getLinkRss2(): ?Url {`
			`return $this->fetchUrl("link") ?? $this->fetchUrl("guid[not(@isPermalink) or @isPermalink='true']");`
			`}`

			`protected function getLinkRss1(): ?Url {`
			`return $this->fetchUrl("rss1:link\|rss0:link");`
			`}`
Initial prototype of feed parser Support RSS, RDF RSS (with various extensions), Atom, iTunes podcasts, and Dublin Core metadata; JSON Feed support is forthcoming Currently feed-level titles, links, summaries, and categories are implemented 6 years ago			`}`