diff --git a/lib/Parser/Construct.php b/lib/Parser/Construct.php index 2b99d43..1f4887f 100644 --- a/lib/Parser/Construct.php +++ b/lib/Parser/Construct.php @@ -49,6 +49,15 @@ trait Construct { return false; } + /** Converts each supplied string to an integer if it consists solely of decimal digits, or to null otherwise; a single argument yields a single value and multiple arguments yield an array */ + protected function parseInt(string ...$v) { + for ($a = 0; $a < sizeof($v); $a++) { + $v[$a] = (preg_match("<^\d+$>", $v[$a])) ? (int) $v[$a] : null; + } + return sizeof($v) === 1 ? $v[0] : $v; + } + + protected function parseDate(string $date): ?Date { $out = null; $date = $this->trimText($date); diff --git a/lib/Parser/XML/Construct.php b/lib/Parser/XML/Construct.php index f318f1d..5c77756 100644 --- a/lib/Parser/XML/Construct.php +++ b/lib/Parser/XML/Construct.php @@ -22,7 +22,7 @@ trait Construct { protected $subject; /** Retrieves an element node based on an XPath query */ - protected function fetchElement(string $query, \DOMNode $context = null) { + protected function fetchElement(string $query, \DOMNode $context = null): ?\DOMElement { $node = @$this->xpath->query("(".$query.")[1]", $context ?? $this->subject); if ($node === false) { throw new \Exception("Invalid XPath query: $query"); // @codeCoverageIgnore @@ -35,39 +35,32 @@ trait Construct { return $this->xpath->query($query, $context ?? $this->subject); } - /** Retrieves the trimmed text content of a DOM element based on an XPath query */ - protected function fetchString(string $query, \DOMNode $context = null): ?string { - $node = $this->fetchElement($query, $context); - return ($node) ? 
$this->trimText($node->textContent) : null; - } - - /** Retrieves the trimmed text content of multiple DOM elements based on an XPath query */ - protected function fetchStringMulti(string $query, \DOMNode $context = null) { + /** Retrieves the trimmed text content of one or more DOM elements based on an XPath query, optionally matching a pattern + * + * Returns null if no suitable nodes were found + * + * @param string $query The XPath query of the nodes to return + * @param string|null $pattern The pattern to optionally filter matches with; when null or empty, every node is accepted. The pattern should not include delimiters or anchors and is always case-insensitive + * @param bool|null $multi Whether to return multiple results as an array (true) or one result as a string (false, default) + * @param \DOMNode|null $context The context node for the XPath query; defaults to the current subject node + * @return string|array|null + */ + protected function fetchString(string $query, ?string $pattern = null, ?bool $multi = null, ?\DOMNode $context = null) { $out = []; + $pattern = strlen($pattern ?? "") ? "/^(?:".str_replace("/", "\\/", $pattern).")$/i" : ""; + $multi = $multi ?? false; $nodes = $this->xpath->query($query, $context ?? $this->subject); foreach ($nodes as $node) { - $out[] = $this->trimText($node->textContent); - } - return ($out) ? $out : null; - } - - /** Retrieves the trimmed plain-text or HTML content of an Atom text construct based on an XPath query */ - protected function fetchStringAtom(string $query, bool $html = false): ?Text { - $node = $this->fetchElement($query); - if ($node) { - if (!$node->hasAttribute("type") || $node->getAttribute("type") == "text") { - return $html ? htmlspecialchars($this->trimText($node->textContent), \ENT_QUOTES | \ENT_HTML5) : $this->trimText($node->textContent); - } elseif ($node->getAttribute("type") == "xhtml") { - $node = $node->getElementsByTagNameNS(self::NS['xhtml'], "div")->item(0); - return $node ? 
$this->sanitizeElement($node, $html) : null; - } elseif ($node->getAttribute("type") == "html") { - return $this->sanitizeString($node->textContent, $html); - } else { - return null; + $t = $this->trimText($node->textContent); + if (!$pattern || preg_match($pattern, $t)) { // no pattern means every node is accepted + if (!$multi) { + return $t; + } else { + $out[] = $t; + } } - } else { - return null; } + return ($out) ? $out : null; } /** Retrieves and parses a date from the content of a DOM element based on an XPath query */ @@ -107,7 +100,7 @@ trait Construct { * - Full Name */ protected function fetchPeople(string $query, string $role): ?PersonCollection { - $people = $this->fetchStringMulti($query) ?? []; + $people = $this->fetchString($query, null, true) ?? []; $out = new PersonCollection; foreach ($people as $person) { if (!strlen($person)) { diff --git a/lib/Parser/XML/Feed.php b/lib/Parser/XML/Feed.php index 58853ec..efd2cdd 100644 --- a/lib/Parser/XML/Feed.php +++ b/lib/Parser/XML/Feed.php @@ -144,6 +144,7 @@ class Feed implements \MensBeam\Lax\Parser\Feed { public function getSchedule(): Schedule { $out = new Schedule; + $out->interval = $this->getSchedIntervalRss2(); $out->skip = $this->getSchedSkipRss2(); $out->expired = $this->getExpiredPod(); if (is_null($out->expired) && (($out->skip & Schedule::DAY_ALL) == Schedule::DAY_ALL || ($out->skip & Schedule::HOUR_ALL) == Schedule::HOUR_ALL)) { diff --git a/lib/Parser/XML/Primitives/Construct.php b/lib/Parser/XML/Primitives/Construct.php index 66dde8b..3aaaedb 100644 --- a/lib/Parser/XML/Primitives/Construct.php +++ b/lib/Parser/XML/Primitives/Construct.php @@ -95,7 +95,7 @@ trait Construct { */ protected function getCategoriesDC(): ?CategoryCollection { $out = new CategoryCollection; - foreach ($this->fetchStringMulti("dc:subject") ?? [] as $text) { + foreach ($this->fetchString("dc:subject", null, true) ?? 
[] as $text) { if (strlen($text)) { $c = new Category; $c->name = $text; diff --git a/lib/Parser/XML/Primitives/Feed.php b/lib/Parser/XML/Primitives/Feed.php index 7e5575a..f21886e 100644 --- a/lib/Parser/XML/Primitives/Feed.php +++ b/lib/Parser/XML/Primitives/Feed.php @@ -68,6 +68,7 @@ trait Feed { return $this->fetchDate("lastBuildDate") ?? $this->fetchDate("pubDate"); } + /** Fetches the "complete" flag from an iTunes podcast */ protected function getExpiredPod(): ?bool { $complete = $this->fetchString("apple:complete"); if ($complete === "Yes") { @@ -76,49 +77,72 @@ trait Feed { return null; } + /** Computes the suggested fetch interval of an RSS 2.0 feed from its "ttl" element, which is interpreted as a number of minutes; returns null if there is no usable value */ + protected function getSchedIntervalRss2(): ?\DateInterval { + $ttl = (int) $this->fetchString("ttl", "\d+"); + if ($ttl) { + return new \DateInterval("PT{$ttl}M"); + } + return null; + } + + /** Computes the suggested fetch interval of an RSS 1.0 feed from its "sched:updatePeriod" and "sched:updateFrequency" elements; returns null if no valid period is present */ + protected function getSchedIntervalRss1(): ?\DateInterval { + $period = $this->fetchString("sched:updatePeriod", "(?:year|month|week|dai|hour)ly"); + if ($period) { + [$p, $n] = [ + "hourly" => ["TM", 60], // 60 minutes + "daily" => ["TH", 24], // 24 hours + "weekly" => ["D", 7], // 7 days + "monthly" => ["D", 30], // 30 days + "yearly" => ["M", 12], // 12 months + ][strtolower($period)]; + $f = max(1, (int) $this->fetchString("sched:updateFrequency", "0*[1-9]\d*")); // a frequency of zero makes no sense + // divide the period by the frequency + // FIXME: we must have an integer result because PHP (incorrectly) rejects fractional intervals + // see https://bugs.php.net/bug.php?id=53831 + $n = max(1, intdiv($n, $f)); // a frequency of zero still makes no sense, so we assume at least one subdivision + return new \DateInterval("P".(strlen($p) === 1 ? "" : $p[0]).$n.$p[-1]); + } + return null; + } + + + + /** Computes the "skip-schedule" of an RSS feed, the set of days and hours during which a feed should not be fetched */ protected function getSchedSkipRss2(): ?int { $out = 0; - $hours = $this->fetchStringMulti("skipHours/hour") ?? 
[]; + $hours = $this->fetchString("skipHours/hour", "\d+", true) ?? []; foreach($hours as $h) { $out |= [ - "0" => Schedule::HOUR_0, - "1" => Schedule::HOUR_1, - "2" => Schedule::HOUR_2, - "3" => Schedule::HOUR_3, - "4" => Schedule::HOUR_4, - "5" => Schedule::HOUR_5, - "6" => Schedule::HOUR_6, - "7" => Schedule::HOUR_7, - "8" => Schedule::HOUR_8, - "9" => Schedule::HOUR_9, - "00" => Schedule::HOUR_0, - "01" => Schedule::HOUR_1, - "02" => Schedule::HOUR_2, - "03" => Schedule::HOUR_3, - "04" => Schedule::HOUR_4, - "05" => Schedule::HOUR_5, - "06" => Schedule::HOUR_6, - "07" => Schedule::HOUR_7, - "08" => Schedule::HOUR_8, - "09" => Schedule::HOUR_9, - "10" => Schedule::HOUR_10, - "11" => Schedule::HOUR_11, - "12" => Schedule::HOUR_12, - "13" => Schedule::HOUR_13, - "14" => Schedule::HOUR_14, - "15" => Schedule::HOUR_15, - "16" => Schedule::HOUR_16, - "17" => Schedule::HOUR_17, - "18" => Schedule::HOUR_18, - "19" => Schedule::HOUR_19, - "20" => Schedule::HOUR_20, - "21" => Schedule::HOUR_21, - "22" => Schedule::HOUR_22, - "23" => Schedule::HOUR_23, - "24" => Schedule::HOUR_0, - ][$h] ?? 0; + Schedule::HOUR_0, + Schedule::HOUR_1, + Schedule::HOUR_2, + Schedule::HOUR_3, + Schedule::HOUR_4, + Schedule::HOUR_5, + Schedule::HOUR_6, + Schedule::HOUR_7, + Schedule::HOUR_8, + Schedule::HOUR_9, + Schedule::HOUR_10, + Schedule::HOUR_11, + Schedule::HOUR_12, + Schedule::HOUR_13, + Schedule::HOUR_14, + Schedule::HOUR_15, + Schedule::HOUR_16, + Schedule::HOUR_17, + Schedule::HOUR_18, + Schedule::HOUR_19, + Schedule::HOUR_20, + Schedule::HOUR_21, + Schedule::HOUR_22, + Schedule::HOUR_23, + Schedule::HOUR_0, + ][(int) $h] ?? 0; } - $days = $this->fetchStringMulti("skipDays/day") ?? []; + $days = $this->fetchString("skipDays/day", null, true) ?? 
[]; foreach($days as $d) { $out |= [ "monday" => Schedule::DAY_MON, diff --git a/lib/Schedule.php b/lib/Schedule.php index aaaae90..31dc856 100644 --- a/lib/Schedule.php +++ b/lib/Schedule.php @@ -54,6 +54,6 @@ class Schedule { * UTC on other days. */ public $skip; - /** @var int $interval The suggested interval before the feed should be fetched again, in seconds */ + /** @var \DateInterval $interval The suggested interval before the feed should be fetched again */ public $interval; } diff --git a/tests/cases/XML/feed-rss2.yaml b/tests/cases/XML/feed-rss2.yaml index ef8bece..4b9860e 100644 --- a/tests/cases/XML/feed-rss2.yaml +++ b/tests/cases/XML/feed-rss2.yaml @@ -44,7 +44,7 @@ Skip days: MONDAY sunday - TuE + TuE bogus @@ -78,7 +78,7 @@ Skip hours: 24 04 - 9 + 9 bogus 25