Browse Source

Avoid dynamic property creation with PicoFeed

This only leaves the Laminas XML deprecated behaviour to handle
arch
J. King 1 year ago
parent
commit
fe06ffc176
  1. 8
      lib/Database.php
  2. 107
      lib/Feed.php
  3. 24
      lib/Feed/Item.php
  4. 48
      tests/cases/Feed/TestFeed.php

8
lib/Database.php

@ -1335,12 +1335,12 @@ class Database {
"UPDATE arsse_feeds SET title = ?, source = ?, updated = CURRENT_TIMESTAMP, modified = ?, etag = ?, err_count = 0, err_msg = '', next_fetch = ?, size = ?, icon = ? WHERE id = ?",
["str", "str", "datetime", "strict str", "datetime", "int", "int", "int"]
)->run(
$feed->data->title,
$feed->data->siteUrl,
$feed->title,
$feed->siteUrl,
$feed->lastModified,
$feed->resource->getEtag(),
$feed->etag,
$feed->nextFetch,
sizeof($feed->data->items),
sizeof($feed->items),
$icon,
$feedID
);

107
lib/Feed.php

@ -6,6 +6,7 @@
declare(strict_types=1);
namespace JKingWeb\Arsse;
use JKingWeb\Arsse\Feed\Item;
use JKingWeb\Arsse\Misc\Date;
use JKingWeb\Arsse\Rule\Rule;
use PicoFeed\PicoFeedException;
@ -15,63 +16,63 @@ use PicoFeed\Reader\Reader;
use PicoFeed\Reader\Favicon;
use PicoFeed\Scraper\Scraper;
class Feed {
public $data = null;
class Feed {
public $title;
public $siteUrl;
public $iconUrl;
public $iconType;
public $iconData;
public $resource;
public $modified = false;
public $lastModified;
public $etag;
public $nextFetch;
public $items = [];
public $newItems = [];
public $changedItems = [];
public $filteredItems = [];
public static function discover(string $url, string $username = '', string $password = ''): string {
// fetch the candidate feed
$f = self::download($url, "", "", $username, $password);
if ($f->reader->detectFormat($f->getContent())) {
[$client, $reader] = self::download($url, "", "", $username, $password);
if ($reader->detectFormat($client->getContent())) {
// if the prospective URL is a feed, use it
$out = $url;
} else {
$links = $f->reader->find($f->getUrl(), $f->getContent());
$links = $reader->find($client->getUrl(), $client->getContent());
if (!$links) {
// work around a PicoFeed memory leak
libxml_use_internal_errors(false);
throw new Feed\Exception("", ['url' => $url], new \PicoFeed\Reader\SubscriptionNotFoundException('Unable to find a subscription'));
} else {
$out = $links[0];
}
}
// work around a PicoFeed memory leak
libxml_use_internal_errors(false);
return $out;
}
public static function discoverAll(string $url, string $username = '', string $password = ''): array {
// fetch the candidate feed
$f = self::download($url, "", "", $username, $password);
if ($f->reader->detectFormat($f->getContent())) {
[$client, $reader] = self::download($url, "", "", $username, $password);
if ($reader->detectFormat($client->getContent())) {
// if the prospective URL is a feed, use it
return [$url];
} else {
return $f->reader->find($f->getUrl(), $f->getContent());
return $reader->find($client->getUrl(), $client->getContent());
}
}
public function __construct(int $feedID = null, string $url, string $lastModified = '', string $etag = '', string $username = '', string $password = '', bool $scrape = false) {
// fetch the feed
$this->resource = self::download($url, $lastModified, $etag, $username, $password);
[$client, $reader] = self::download($url, $lastModified, $etag, $username, $password);
// format the HTTP Last-Modified date returned
$lastMod = $this->resource->getLastModified();
$lastMod = $client->getLastModified();
if (strlen($lastMod ?? "")) {
$this->lastModified = Date::normalize($lastMod, "http");
}
$this->modified = $this->resource->isModified();
//parse the feed, if it has been modified
$this->modified = $client->isModified();
// get the ETag
$this->etag = $client->getEtag();
// parse the feed, if it has been modified
if ($this->modified) {
$this->parse();
$this->parse($client, $reader);
// ascertain whether there are any articles not in the database
$this->matchToDatabase($feedID);
// if caching header fields are not sent by the server, try to ascertain a last-modified date from the feed contents
@ -112,12 +113,11 @@ class Feed {
return $config;
}
protected static function download(string $url, string $lastModified, string $etag, string $username, string $password): Client {
protected static function download(string $url, string $lastModified, string $etag, string $username, string $password): array {
try {
$reader = new Reader(self::configure());
$client = $reader->download($url, $lastModified, $etag, $username, $password);
$client->reader = $reader;
return $client;
return [$client, $reader];
} catch (PicoFeedException $e) {
throw new Feed\Exception("", ['url' => $url], $e); // @codeCoverageIgnore
} catch (\GuzzleHttp\Exception\GuzzleException $e) {
@ -125,17 +125,17 @@ class Feed {
}
}
protected function parse(): void {
protected function parse(Client $client, Reader $reader): void {
try {
$feed = $this->resource->reader->getParser(
$this->resource->getUrl(),
$this->resource->getContent(),
$this->resource->getEncoding()
$feed = $reader->getParser(
$client->getUrl(),
$client->getContent(),
$client->getEncoding()
)->execute();
} catch (PicoFeedException $e) {
throw new Feed\Exception("", ['url' => $this->resource->getUrl()], $e);
throw new Feed\Exception("", ['url' => $client->getUrl()], $e);
} catch (\GuzzleHttp\Exception\GuzzleException $e) { // @codeCoverageIgnore
throw new Feed\Exception("", ['url' => $this->resource->getUrl()], $e); // @codeCoverageIgnore
throw new Feed\Exception("", ['url' => $client->getUrl()], $e); // @codeCoverageIgnore
}
// Grab the favicon for the feed, or null if no valid icon is found
@ -150,6 +150,10 @@ class Feed {
$this->iconUrl = $this->iconData = null;
}
// Next gather all other feed-level information we want out of the feed
$this->siteUrl = $feed->siteUrl;
$this->title = $feed->title;
// PicoFeed does not provide valid ids when there is no id element. Its solution
// of hashing the url, title, and content together for the id if there is no id
// element is stupid. Many feeds are frankenstein mixtures of Atom and RSS, but
@ -158,29 +162,38 @@ class Feed {
// only be reserved for severely broken feeds.
foreach ($feed->items as $f) {
// Hashes used for comparison to check for updates and also to identify when an
// copy the basic information of an article
$i = new Item;
$i->url = $f->url;
$i->title = $f->title;
$i->content = $f->content;
$i->author = $f->author;
$i->publishedDate = $f->publishedDate;
$i->updatedDate = $f->updatedDate;
$i->enclosureType = $f->enclosureType;
$i->enclosureUrl = $f->enclosureUrl;
// add hashes used for comparison to check for updates and also to identify when an
// id doesn't exist.
$content = $f->content.$f->enclosureUrl.$f->enclosureType;
// if the item link URL and item title are both equal to the feed link URL, then the item has neither a link URL nor a title
if ($f->url === $feed->siteUrl && $f->title === $feed->siteUrl) {
$f->urlTitleHash = "";
$i->urlTitleHash = "";
} else {
$f->urlTitleHash = hash('sha256', $f->url.$f->title);
$i->urlTitleHash = hash('sha256', $f->url.$f->title);
}
// if the item link URL is equal to the feed link URL, it has no link URL; if there is additionally no content, these should not be hashed
if (!strlen($content) && $f->url === $feed->siteUrl) {
$f->urlContentHash = "";
$i->urlContentHash = "";
} else {
$f->urlContentHash = hash('sha256', $f->url.$content);
$i->urlContentHash = hash('sha256', $f->url.$content);
}
// if the item's title is the same as its link URL, it has no title; if there is additionally no content, these should not be hashed
if (!strlen($content) && $f->title === $f->url) {
$f->titleContentHash = "";
$i->titleContentHash = "";
} else {
$f->titleContentHash = hash('sha256', $f->title.$content);
$i->titleContentHash = hash('sha256', $f->title.$content);
}
$f->id = null;
// prefer an Atom ID as the item's ID
// next add an id; prefer an Atom ID as the item's ID
$id = (string) $f->xml->children('http://www.w3.org/2005/Atom')->id;
// otherwise use the RSS2 guid element
if (!strlen($id)) {
@ -192,11 +205,10 @@ class Feed {
}
// otherwise there is no ID; if there is one, hash it
if (strlen($id)) {
$f->id = hash('sha256', $id);
$i->id = hash('sha256', $id);
}
// PicoFeed also doesn't gather up categories, so we do this as well
$f->categories = [];
// first add Atom categories
foreach ($f->xml->children('http://www.w3.org/2005/Atom')->category as $c) {
// if the category has a label, use that
@ -207,27 +219,28 @@ class Feed {
}
// ... assuming it has that much
if (strlen($name)) {
$f->categories[] = $name;
$i->categories[] = $name;
}
}
// next add RSS2 categories
foreach ($f->xml->children()->category as $c) {
$name = (string) $c;
if (strlen($name)) {
$f->categories[] = $name;
$i->categories[] = $name;
}
}
// and finally try Dublin Core subjects
foreach ($f->xml->children('http://purl.org/dc/elements/1.1/')->subject as $c) {
$name = (string) $c;
if (strlen($name)) {
$f->categories[] = $name;
$i->categories[] = $name;
}
}
//sort the results
sort($f->categories);
sort($i->categories);
// add the item to the feed's list of items
$this->items[] = $i;
}
$this->data = $feed;
}
protected function deduplicateItems(array $items): array {
@ -251,7 +264,7 @@ class Feed {
($item->urlContentHash && $item->urlContentHash === $check->urlContentHash) ||
($item->titleContentHash && $item->titleContentHash === $check->titleContentHash)
) {
if (// because newsfeeds are usually order newest-first, the later item should only be used if...
if (// because newsfeeds are usually ordered newest-first, the later item should only be used if...
// the later item has an update date and the existing item does not
($item->updatedDate && !$check->updatedDate) ||
// the later item has an update date newer than the existing item's
@ -276,7 +289,7 @@ class Feed {
protected function matchToDatabase(int $feedID = null): void {
// first perform deduplication on items
$items = $this->deduplicateItems($this->data->items);
$items = $this->deduplicateItems($this->items);
// if we haven't been given a database feed ID to check against, all items are new
if (is_null($feedID)) {
$this->newItems = $items;
@ -429,7 +442,7 @@ class Feed {
protected function gatherDates(): array {
$dates = [];
foreach ($this->data->items as $item) {
foreach ($this->items as $item) {
if ($item->updatedDate) {
$dates[] = $item->updatedDate->getTimestamp();
}

24
lib/Feed/Item.php

@ -0,0 +1,24 @@
<?php
/** @license MIT
* Copyright 2017 J. King, Dustin Wilson et al.
* See LICENSE and AUTHORS files for details */
declare(strict_types=1);
namespace JKingWeb\Arsse\Feed;
/**
 * Plain data container describing a single newsfeed item.
 *
 * Instances are created and populated by \JKingWeb\Arsse\Feed while parsing a
 * feed: the basic fields are copied from the item object produced by the feed
 * parser, and the hashes, identifier and categories are computed afterwards.
 * Declaring every field here avoids creating dynamic properties on the
 * parser's own item objects.
 */
class Item {
/** SHA-256 hash of the item's identifier, or null when no identifier was found in the feed */
public $id;
/** Link URL of the item, copied from the parsed item */
public $url;
/** Title of the item, copied from the parsed item */
public $title;
/** Author of the item, copied from the parsed item */
public $author;
/** Publication date, copied from the parsed item (presumably a date object or null; confirm against the parser) */
public $publishedDate;
/** Last-updated date, copied from the parsed item; when set, consumers call getTimestamp() on it */
public $updatedDate;
/** SHA-256 hash of URL + content + enclosure URL + enclosure type; an empty string when the item has neither its own URL nor any content */
public $urlContentHash;
/** SHA-256 hash of URL + title; an empty string when the item has neither its own URL nor its own title */
public $urlTitleHash;
/** SHA-256 hash of title + content + enclosure URL + enclosure type; an empty string when the item has neither its own title nor any content */
public $titleContentHash;
/** Content of the item, copied from the parsed item */
public $content;
/** NOTE(review): not assigned by the parsing code visible here; presumably holds content fetched by the scraper when scraping is enabled (confirm against Feed) */
public $scrapedContent;
/** Enclosure URL, copied from the parsed item */
public $enclosureUrl;
/** Enclosure media type, copied from the parsed item */
public $enclosureType;
/** Sorted list of category names gathered from Atom categories, RSS2 categories and Dublin Core subjects; empty when the item has none */
public $categories = [];
}

48
tests/cases/Feed/TestFeed.php

@ -113,26 +113,26 @@ class TestFeed extends \JKingWeb\Arsse\Test\AbstractTest {
$h0 = "0a4f0e3768c8a5e9d8d9a16545ae4ff5b097f6dac3ad49555a94a7cace68ba73"; // hash of Atom ID
$h1 = "a135beced0236b723d12f845ff20ec22d4fc3afe1130012618f027170d57cb4e"; // hash of RSS2 GUID
$h2 = "205e986f4f8b3acfa281227beadb14f5e8c32c8dae4737f888c94c0df49c56f8"; // hash of Dublin Core identifier
$this->assertSame($h0, $f->data->items[0]->id);
$this->assertSame($h1, $f->data->items[1]->id);
$this->assertSame($h2, $f->data->items[2]->id);
$this->assertSame($h0, $f->items[0]->id);
$this->assertSame($h1, $f->items[1]->id);
$this->assertSame($h2, $f->items[2]->id);
// check null hashes
$h3 = "6287ba30f534e404e68356237e809683e311285d8b9f47d046ac58784eece052"; // URL hash
$h4 = "6cbb5d2dcb11610a99eb3f633dc246690c0acf33327bf7534f95542caa8f27c4"; // title hash
$h5 = "2b7c57ffa9adde92ccd1884fa1153a5bcd3211e48d99e27be5414cb078e6891c"; // content/enclosure hash
$this->assertNotEquals("", $f->data->items[3]->urlTitleHash);
$this->assertSame($h3, $f->data->items[3]->urlContentHash);
$this->assertSame("", $f->data->items[3]->titleContentHash);
$this->assertNotEquals("", $f->data->items[4]->urlTitleHash);
$this->assertSame("", $f->data->items[4]->urlContentHash);
$this->assertSame($h4, $f->data->items[4]->titleContentHash);
$this->assertSame("", $f->data->items[5]->urlTitleHash);
$this->assertNotEquals("", $f->data->items[5]->urlContentHash);
$this->assertNotEquals("", $f->data->items[5]->titleContentHash);
$this->assertNotEquals("", $f->items[3]->urlTitleHash);
$this->assertSame($h3, $f->items[3]->urlContentHash);
$this->assertSame("", $f->items[3]->titleContentHash);
$this->assertNotEquals("", $f->items[4]->urlTitleHash);
$this->assertSame("", $f->items[4]->urlContentHash);
$this->assertSame($h4, $f->items[4]->titleContentHash);
$this->assertSame("", $f->items[5]->urlTitleHash);
$this->assertNotEquals("", $f->items[5]->urlContentHash);
$this->assertNotEquals("", $f->items[5]->titleContentHash);
// check null IDs
$this->assertSame(null, $f->data->items[3]->id);
$this->assertSame(null, $f->data->items[4]->id);
$this->assertSame(null, $f->data->items[5]->id);
$this->assertSame(null, $f->items[3]->id);
$this->assertSame(null, $f->items[4]->id);
$this->assertSame(null, $f->items[5]->id);
// check categories
$categories = [
"Aniki!",
@ -140,11 +140,11 @@ class TestFeed extends \JKingWeb\Arsse\Test\AbstractTest {
"Bodybuilders",
"Men",
];
$this->assertSame([], $f->data->items[0]->categories);
$this->assertSame([], $f->data->items[1]->categories);
$this->assertSame([], $f->data->items[3]->categories);
$this->assertSame([], $f->data->items[4]->categories);
$this->assertSame($categories, $f->data->items[5]->categories);
$this->assertSame([], $f->items[0]->categories);
$this->assertSame([], $f->items[1]->categories);
$this->assertSame([], $f->items[3]->categories);
$this->assertSame([], $f->items[4]->categories);
$this->assertSame($categories, $f->items[5]->categories);
}
public function testDiscoverAFeedSuccessfully(): void {
@ -232,7 +232,7 @@ class TestFeed extends \JKingWeb\Arsse\Test\AbstractTest {
$e = "78567a";
$f = new Feed(null, $this->base.$url."?t=$t&e=$e", Date::transform($t, "http"), $e);
$this->assertTime($t, $f->lastModified);
$this->assertSame($e, $f->resource->getETag());
$this->assertSame($e, $f->etag);
}
public function provide304ResponseURLs() {
@ -250,15 +250,15 @@ class TestFeed extends \JKingWeb\Arsse\Test\AbstractTest {
$t = time() - 2000;
$f = new Feed(null, $this->base."Caching/200Past");
$this->assertTime($t, $f->lastModified);
$this->assertNotEmpty($f->resource->getETag());
$this->assertNotEmpty($f->etag);
$t = time() - 2000;
$f = new Feed(null, $this->base."Caching/200Past", Date::transform(time(), "http"));
$this->assertTime($t, $f->lastModified);
$this->assertNotEmpty($f->resource->getETag());
$this->assertNotEmpty($f->etag);
$t = time() + 2000;
$f = new Feed(null, $this->base."Caching/200Future");
$this->assertTime($t, $f->lastModified);
$this->assertNotEmpty($f->resource->getETag());
$this->assertNotEmpty($f->etag);
// these tests have no HTTP headers and rely on article dates
$t = strtotime("2002-05-19T15:21:36Z");
$f = new Feed(null, $this->base."Caching/200PubDateOnly");

Loading…
Cancel
Save