Browse Source

Refactor of string fetching; RSS update intervals

There are known bugs in this refactor; these will be fixed in the next commit
master
J. King 4 years ago
parent
commit
e69f04e08c
  1. 8
      lib/Parser/Construct.php
  2. 53
      lib/Parser/XML/Construct.php
  3. 1
      lib/Parser/XML/Feed.php
  4. 2
      lib/Parser/XML/Primitives/Construct.php
  5. 98
      lib/Parser/XML/Primitives/Feed.php
  6. 2
      lib/Schedule.php
  7. 4
      tests/cases/XML/feed-rss2.yaml

8
lib/Parser/Construct.php

@ -49,6 +49,14 @@ trait Construct {
return false;
}
/** Converts one or more strings of decimal digits to integers
 *
 * Any input which does not consist solely of digits yields null instead.
 * When exactly one string is supplied the bare result is returned;
 * otherwise an array of results, in argument order, is returned
 */
protected function parseInt(string ...$v) {
    $parsed = [];
    foreach ($v as $candidate) {
        $parsed[] = preg_match("<^\d+$>", $candidate) ? (int) $candidate : null;
    }
    if (count($parsed) === 1) {
        return $parsed[0];
    }
    return $parsed;
}
protected function parseDate(string $date): ?Date {
$out = null;
$date = $this->trimText($date);

53
lib/Parser/XML/Construct.php

@ -22,7 +22,7 @@ trait Construct {
protected $subject;
/** Retrieves an element node based on an XPath query */
protected function fetchElement(string $query, \DOMNode $context = null) {
protected function fetchElement(string $query, \DOMNode $context = null): ?\DOMElement {
$node = @$this->xpath->query("(".$query.")[1]", $context ?? $this->subject);
if ($node === false) {
throw new \Exception("Invalid XPath query: $query"); // @codeCoverageIgnore
@ -35,39 +35,32 @@ trait Construct {
return $this->xpath->query($query, $context ?? $this->subject);
}
/** Retrieves the trimmed text content of a DOM element based on an XPath query */
protected function fetchString(string $query, \DOMNode $context = null): ?string {
$node = $this->fetchElement($query, $context);
return ($node) ? $this->trimText($node->textContent) : null;
}
/** Retrieves the trimmed text content of multiple DOM elements based on an XPath query */
protected function fetchStringMulti(string $query, \DOMNode $context = null) {
/** Retrieves the trimmed text content of one or more DOM elements based on an XPath query, optionally matching a pattern
*
* Returns null if no suitable nodes were found
*
* @param string $query The XPath query of the nodes to return
* @param string|null $pattern The pattern to optionally filter matches with. The pattern should not include delimiters or anchors and is always case-insensitive
* @param bool|null $multi Whether to return multiple results as an array (true) or one result as a string (false, default)
* @param \DOMNode $context The context node for the XPath query
* @return string|array|null
*/
protected function fetchString(string $query, ?string $pattern = null, ?bool $multi = null, ?\DOMNode $context = null) {
$out = [];
$pattern = strlen($pattern ?? "") ? "/^(?:".str_replace("/", "\\/", $pattern).")$/i" : "";
$multi = $multi ?? false;
$nodes = $this->xpath->query($query, $context ?? $this->subject);
foreach ($nodes as $node) {
$out[] = $this->trimText($node->textContent);
}
return ($out) ? $out : null;
}
/** Retrieves the trimmed plain-text or HTML content of an Atom text construct based on an XPath query */
protected function fetchStringAtom(string $query, bool $html = false): ?Text {
$node = $this->fetchElement($query);
if ($node) {
if (!$node->hasAttribute("type") || $node->getAttribute("type") == "text") {
return $html ? htmlspecialchars($this->trimText($node->textContent), \ENT_QUOTES | \ENT_HTML5) : $this->trimText($node->textContent);
} elseif ($node->getAttribute("type") == "xhtml") {
$node = $node->getElementsByTagNameNS(self::NS['xhtml'], "div")->item(0);
return $node ? $this->sanitizeElement($node, $html) : null;
} elseif ($node->getAttribute("type") == "html") {
return $this->sanitizeString($node->textContent, $html);
} else {
return null;
$t = $this->trimText($node->textContent);
if ($pattern && preg_match($pattern, $t)) {
if (!$multi) {
return $t;
} else {
$out[] = $t;
}
}
} else {
return null;
}
return ($out) ? $out : null;
}
/** Retrieves and parses a date from the content of a DOM element based on an XPath query */
@ -107,7 +100,7 @@ trait Construct {
* - Full Name <user@example.com>
*/
protected function fetchPeople(string $query, string $role): ?PersonCollection {
$people = $this->fetchStringMulti($query) ?? [];
$people = $this->fetchString($query, null, true) ?? [];
$out = new PersonCollection;
foreach ($people as $person) {
if (!strlen($person)) {

1
lib/Parser/XML/Feed.php

@ -144,6 +144,7 @@ class Feed implements \MensBeam\Lax\Parser\Feed {
public function getSchedule(): Schedule {
$out = new Schedule;
$out->interval = $this->getSchedIntervalRss2();
$out->skip = $this->getSchedSkipRss2();
$out->expired = $this->getExpiredPod();
if (is_null($out->expired) && (($out->skip & Schedule::DAY_ALL) == Schedule::DAY_ALL || ($out->skip & Schedule::HOUR_ALL) == Schedule::HOUR_ALL)) {

2
lib/Parser/XML/Primitives/Construct.php

@ -95,7 +95,7 @@ trait Construct {
*/
protected function getCategoriesDC(): ?CategoryCollection {
$out = new CategoryCollection;
foreach ($this->fetchStringMulti("dc:subject") ?? [] as $text) {
foreach ($this->fetchString("dc:subject", null, true) ?? [] as $text) {
if (strlen($text)) {
$c = new Category;
$c->name = $text;

98
lib/Parser/XML/Primitives/Feed.php

@ -68,6 +68,7 @@ trait Feed {
return $this->fetchDate("lastBuildDate") ?? $this->fetchDate("pubDate");
}
/** Fetches the "complete" flag from an iTunes podcast */
protected function getExpiredPod(): ?bool {
$complete = $this->fetchString("apple:complete");
if ($complete === "Yes") {
@ -76,49 +77,70 @@ trait Feed {
return null;
}
/** Computes the suggested fetch interval from the "ttl" element, interpreted as a number of minutes
 *
 * Returns null if the element is absent, non-numeric, or evaluates to zero
 */
protected function getSchedIntervalRss2(): ?\DateInterval {
    $minutes = (int) $this->fetchString("ttl", "\d+");
    // a zero-length interval is meaningless, so it is treated the same as no value
    return $minutes ? new \DateInterval("PT{$minutes}M") : null;
}
/** Computes the suggested fetch interval from the "sched:updatePeriod" and "sched:updateFrequency" elements
 *
 * The update period (hourly, daily, weekly, monthly, or yearly) is divided
 * by the update frequency to yield the interval. A missing or unusable
 * frequency is treated as 1, and the result is never shorter than one unit
 * of the period's sub-division (e.g. one minute for an hourly period)
 *
 * Returns null if no recognized update period is found
 */
protected function getSchedIntervalRss1(): ?\DateInterval {
    $period = $this->fetchString("sched:updatePeriod", "(?:year|month|week|dai|hour)ly");
    if ($period) {
        // each entry is [designator, count]; a two-character designator carries the "T" time prefix of an interval specification
        [$p, $n] = [
            "hourly"  => ["TM", 60], // 60 minutes
            "daily"   => ["TH", 24], // 24 hours
            "weekly"  => ["D", 7],   // 7 days
            "monthly" => ["D", 30],  // 30 days
            "yearly"  => ["M", 12],  // 12 months
        ][strtolower($period)];
        // a frequency of zero makes no sense, so a missing or zero value is clamped up to 1 (this also guards the division below)
        $f = max(1, (int) $this->fetchString("sched:updateFrequency", "0*[1-9]\d*"));
        // divide the period by the frequency
        // FIXME: we must have an integer result because PHP (incorrectly) rejects fractional intervals
        // see https://bugs.php.net/bug.php?id=53831
        // a frequency larger than the period's sub-division count would yield zero, so we assume at least one sub-division
        $n = max(1, intdiv($n, $f));
        return new \DateInterval("P".(strlen($p) === 1 ? "" : $p[0]).$n.$p[-1]);
    }
    return null;
}
/** Computes the "skip-schedule" of an RSS feed, the set of days and hours during which a feed should not be fetched */
protected function getSchedSkipRss2(): ?int {
$out = 0;
$hours = $this->fetchStringMulti("skipHours/hour") ?? [];
$hours = $this->fetchString("skipHours/hour", "\d+", true) ?? [];
foreach($hours as $h) {
$out |= [
"0" => Schedule::HOUR_0,
"1" => Schedule::HOUR_1,
"2" => Schedule::HOUR_2,
"3" => Schedule::HOUR_3,
"4" => Schedule::HOUR_4,
"5" => Schedule::HOUR_5,
"6" => Schedule::HOUR_6,
"7" => Schedule::HOUR_7,
"8" => Schedule::HOUR_8,
"9" => Schedule::HOUR_9,
"00" => Schedule::HOUR_0,
"01" => Schedule::HOUR_1,
"02" => Schedule::HOUR_2,
"03" => Schedule::HOUR_3,
"04" => Schedule::HOUR_4,
"05" => Schedule::HOUR_5,
"06" => Schedule::HOUR_6,
"07" => Schedule::HOUR_7,
"08" => Schedule::HOUR_8,
"09" => Schedule::HOUR_9,
"10" => Schedule::HOUR_10,
"11" => Schedule::HOUR_11,
"12" => Schedule::HOUR_12,
"13" => Schedule::HOUR_13,
"14" => Schedule::HOUR_14,
"15" => Schedule::HOUR_15,
"16" => Schedule::HOUR_16,
"17" => Schedule::HOUR_17,
"18" => Schedule::HOUR_18,
"19" => Schedule::HOUR_19,
"20" => Schedule::HOUR_20,
"21" => Schedule::HOUR_21,
"22" => Schedule::HOUR_22,
"23" => Schedule::HOUR_23,
"24" => Schedule::HOUR_0,
][$h] ?? 0;
Schedule::HOUR_0,
Schedule::HOUR_1,
Schedule::HOUR_2,
Schedule::HOUR_3,
Schedule::HOUR_4,
Schedule::HOUR_5,
Schedule::HOUR_6,
Schedule::HOUR_7,
Schedule::HOUR_8,
Schedule::HOUR_9,
Schedule::HOUR_10,
Schedule::HOUR_11,
Schedule::HOUR_12,
Schedule::HOUR_13,
Schedule::HOUR_14,
Schedule::HOUR_15,
Schedule::HOUR_16,
Schedule::HOUR_17,
Schedule::HOUR_18,
Schedule::HOUR_19,
Schedule::HOUR_20,
Schedule::HOUR_21,
Schedule::HOUR_22,
Schedule::HOUR_23,
Schedule::HOUR_0,
][(int) $h] ?? 0;
}
$days = $this->fetchStringMulti("skipDays/day") ?? [];
$days = $this->fetchString("skipDays/day", null, true) ?? [];
foreach($days as $d) {
$out |= [
"monday" => Schedule::DAY_MON,

2
lib/Schedule.php

@ -54,6 +54,6 @@ class Schedule {
* UTC on other days.
*/
public $skip;
/** @var int $interval The suggested interval before the feed should be fetched again, in seconds */
/** @var \DateInterval $interval The suggested interval before the feed should be fetched again */
public $interval;
}

4
tests/cases/XML/feed-rss2.yaml

@ -44,7 +44,7 @@ Skip days:
<skipDays>
<day>MONDAY</day>
<day>sunday</day>
<day>TuE</day>
<day>TuE </day>
<day>bogus</day>
</skipDays>
</channel></rss>
@ -78,7 +78,7 @@ Skip hours:
<skipHours>
<hour>24</hour>
<hour>04</hour>
<hour>9</hour>
<hour> 9</hour>
<hour>bogus</hour>
<hour>25</hour>
</skipHours>

Loading…
Cancel
Save