diff --git a/lib/Database.php b/lib/Database.php index 107c6df..7910a4a 100644 --- a/lib/Database.php +++ b/lib/Database.php @@ -1335,12 +1335,12 @@ class Database { "UPDATE arsse_feeds SET title = ?, source = ?, updated = CURRENT_TIMESTAMP, modified = ?, etag = ?, err_count = 0, err_msg = '', next_fetch = ?, size = ?, icon = ? WHERE id = ?", ["str", "str", "datetime", "strict str", "datetime", "int", "int", "int"] )->run( - $feed->data->title, - $feed->data->siteUrl, + $feed->title, + $feed->siteUrl, $feed->lastModified, - $feed->resource->getEtag(), + $feed->etag, $feed->nextFetch, - sizeof($feed->data->items), + sizeof($feed->items), $icon, $feedID ); diff --git a/lib/Feed.php b/lib/Feed.php index ce4ab4c..69f9464 100644 --- a/lib/Feed.php +++ b/lib/Feed.php @@ -6,6 +6,7 @@ declare(strict_types=1); namespace JKingWeb\Arsse; +use JKingWeb\Arsse\Feed\Item; use JKingWeb\Arsse\Misc\Date; use JKingWeb\Arsse\Rule\Rule; use PicoFeed\PicoFeedException; @@ -15,63 +16,63 @@ use PicoFeed\Reader\Reader; use PicoFeed\Reader\Favicon; use PicoFeed\Scraper\Scraper; -class Feed { - public $data = null; +class Feed { + public $title; + public $siteUrl; public $iconUrl; public $iconType; public $iconData; - public $resource; public $modified = false; public $lastModified; + public $etag; public $nextFetch; + public $items = []; public $newItems = []; public $changedItems = []; public $filteredItems = []; public static function discover(string $url, string $username = '', string $password = ''): string { // fetch the candidate feed - $f = self::download($url, "", "", $username, $password); - if ($f->reader->detectFormat($f->getContent())) { + [$client, $reader] = self::download($url, "", "", $username, $password); + if ($reader->detectFormat($client->getContent())) { // if the prospective URL is a feed, use it $out = $url; } else { - $links = $f->reader->find($f->getUrl(), $f->getContent()); + $links = $reader->find($client->getUrl(), $client->getContent()); if (!$links) { - // work around a PicoFeed memory leak - libxml_use_internal_errors(false); throw new Feed\Exception("", ['url' => $url], new \PicoFeed\Reader\SubscriptionNotFoundException('Unable to find a subscription')); } else { $out = $links[0]; } } - // work around a PicoFeed memory leak - libxml_use_internal_errors(false); return $out; } public static function discoverAll(string $url, string $username = '', string $password = ''): array { // fetch the candidate feed - $f = self::download($url, "", "", $username, $password); - if ($f->reader->detectFormat($f->getContent())) { + [$client, $reader] = self::download($url, "", "", $username, $password); + if ($reader->detectFormat($client->getContent())) { // if the prospective URL is a feed, use it return [$url]; } else { - return $f->reader->find($f->getUrl(), $f->getContent()); + return $reader->find($client->getUrl(), $client->getContent()); } } public function __construct(int $feedID = null, string $url, string $lastModified = '', string $etag = '', string $username = '', string $password = '', bool $scrape = false) { // fetch the feed - $this->resource = self::download($url, $lastModified, $etag, $username, $password); + [$client, $reader] = self::download($url, $lastModified, $etag, $username, $password); // format the HTTP Last-Modified date returned - $lastMod = $this->resource->getLastModified(); + $lastMod = $client->getLastModified(); if (strlen($lastMod ?? "")) { $this->lastModified = Date::normalize($lastMod, "http"); } - $this->modified = $this->resource->isModified(); - //parse the feed, if it has been modified + $this->modified = $client->isModified(); + // get the ETag + $this->etag = $client->getEtag(); + // parse the feed, if it has been modified if ($this->modified) { - $this->parse(); + $this->parse($client, $reader); // ascertain whether there are any articles not in the database $this->matchToDatabase($feedID); // if caching header fields are not sent by the server, try to ascertain a last-modified date from the feed contents @@ -112,12 +113,11 @@ class Feed { return $config; } - protected static function download(string $url, string $lastModified, string $etag, string $username, string $password): Client { + protected static function download(string $url, string $lastModified, string $etag, string $username, string $password): array { try { $reader = new Reader(self::configure()); $client = $reader->download($url, $lastModified, $etag, $username, $password); - $client->reader = $reader; - return $client; + return [$client, $reader]; } catch (PicoFeedException $e) { throw new Feed\Exception("", ['url' => $url], $e); // @codeCoverageIgnore } catch (\GuzzleHttp\Exception\GuzzleException $e) { @@ -125,17 +125,17 @@ class Feed { } } - protected function parse(): void { + protected function parse(Client $client, Reader $reader): void { try { - $feed = $this->resource->reader->getParser( - $this->resource->getUrl(), - $this->resource->getContent(), - $this->resource->getEncoding() + $feed = $reader->getParser( + $client->getUrl(), + $client->getContent(), + $client->getEncoding() )->execute(); } catch (PicoFeedException $e) { - throw new Feed\Exception("", ['url' => $this->resource->getUrl()], $e); + throw new Feed\Exception("", ['url' => $client->getUrl()], $e); } catch (\GuzzleHttp\Exception\GuzzleException $e) { // @codeCoverageIgnore - throw new Feed\Exception("", ['url' => $this->resource->getUrl()], $e); // @codeCoverageIgnore + throw new Feed\Exception("", ['url' => $client->getUrl()], $e); // @codeCoverageIgnore } // Grab the favicon for the feed, or null if no valid icon is found @@ -150,6 +150,10 @@ class Feed { $this->iconUrl = $this->iconData = null; } + // Next gather all other feed-level information we want out of the feed + $this->siteUrl = $feed->siteUrl; + $this->title = $feed->title; + // PicoFeed does not provide valid ids when there is no id element. Its solution // of hashing the url, title, and content together for the id if there is no id // element is stupid. Many feeds are frankenstein mixtures of Atom and RSS, but @@ -158,29 +162,38 @@ class Feed { // only be reserved for severely broken feeds. foreach ($feed->items as $f) { - // Hashes used for comparison to check for updates and also to identify when an + // copy the basic information of an article + $i = new Item; + $i->url = $f->url; + $i->title = $f->title; + $i->content = $f->content; + $i->author = $f->author; + $i->publishedDate = $f->publishedDate; + $i->updatedDate = $f->updatedDate; + $i->enclosureType = $f->enclosureType; + $i->enclosureUrl = $f->enclosureUrl; + // add hashes used for comparison to check for updates and also to identify when an // id doesn't exist. $content = $f->content.$f->enclosureUrl.$f->enclosureType; // if the item link URL and item title are both equal to the feed link URL, then the item has neither a link URL nor a title if ($f->url === $feed->siteUrl && $f->title === $feed->siteUrl) { - $f->urlTitleHash = ""; + $i->urlTitleHash = ""; } else { - $f->urlTitleHash = hash('sha256', $f->url.$f->title); + $i->urlTitleHash = hash('sha256', $f->url.$f->title); } // if the item link URL is equal to the feed link URL, it has no link URL; if there is additionally no content, these should not be hashed if (!strlen($content) && $f->url === $feed->siteUrl) { - $f->urlContentHash = ""; + $i->urlContentHash = ""; } else { - $f->urlContentHash = hash('sha256', $f->url.$content); + $i->urlContentHash = hash('sha256', $f->url.$content); } // if the item's title is the same as its link URL, it has no title; if there is additionally no content, these should not be hashed if (!strlen($content) && $f->title === $f->url) { - $f->titleContentHash = ""; + $i->titleContentHash = ""; } else { - $f->titleContentHash = hash('sha256', $f->title.$content); + $i->titleContentHash = hash('sha256', $f->title.$content); } - $f->id = null; - // prefer an Atom ID as the item's ID + // next add an id; prefer an Atom ID as the item's ID $id = (string) $f->xml->children('http://www.w3.org/2005/Atom')->id; // otherwise use the RSS2 guid element if (!strlen($id)) { @@ -192,11 +205,10 @@ class Feed { } // otherwise there is no ID; if there is one, hash it if (strlen($id)) { - $f->id = hash('sha256', $id); + $i->id = hash('sha256', $id); } // PicoFeed also doesn't gather up categories, so we do this as well - $f->categories = []; // first add Atom categories foreach ($f->xml->children('http://www.w3.org/2005/Atom')->category as $c) { // if the category has a label, use that @@ -207,27 +219,28 @@ class Feed { } // ... assuming it has that much if (strlen($name)) { - $f->categories[] = $name; + $i->categories[] = $name; } } // next add RSS2 categories foreach ($f->xml->children()->category as $c) { $name = (string) $c; if (strlen($name)) { - $f->categories[] = $name; + $i->categories[] = $name; } } // and finally try Dublin Core subjects foreach ($f->xml->children('http://purl.org/dc/elements/1.1/')->subject as $c) { $name = (string) $c; if (strlen($name)) { - $f->categories[] = $name; + $i->categories[] = $name; } } //sort the results - sort($f->categories); + sort($i->categories); + // add the item to the feed's list of items + $this->items[] = $i; } - $this->data = $feed; } protected function deduplicateItems(array $items): array { @@ -251,7 +264,7 @@ class Feed { ($item->urlContentHash && $item->urlContentHash === $check->urlContentHash) || ($item->titleContentHash && $item->titleContentHash === $check->titleContentHash) ) { - if (// because newsfeeds are usually order newest-first, the later item should only be used if... + if (// because newsfeeds are usually ordered newest-first, the later item should only be used if... // the later item has an update date and the existing item does not ($item->updatedDate && !$check->updatedDate) || // the later item has an update date newer than the existing item's @@ -276,7 +289,7 @@ class Feed { protected function matchToDatabase(int $feedID = null): void { // first perform deduplication on items - $items = $this->deduplicateItems($this->data->items); + $items = $this->deduplicateItems($this->items); // if we haven't been given a database feed ID to check against, all items are new if (is_null($feedID)) { $this->newItems = $items; @@ -429,7 +442,7 @@ class Feed { protected function gatherDates(): array { $dates = []; - foreach ($this->data->items as $item) { + foreach ($this->items as $item) { if ($item->updatedDate) { $dates[] = $item->updatedDate->getTimestamp(); } diff --git a/lib/Feed/Item.php b/lib/Feed/Item.php new file mode 100644 index 0000000..fd53046 --- /dev/null +++ b/lib/Feed/Item.php @@ -0,0 +1,24 @@ +assertSame($h0, $f->data->items[0]->id); - $this->assertSame($h1, $f->data->items[1]->id); - $this->assertSame($h2, $f->data->items[2]->id); + $this->assertSame($h0, $f->items[0]->id); + $this->assertSame($h1, $f->items[1]->id); + $this->assertSame($h2, $f->items[2]->id); // check null hashes $h3 = "6287ba30f534e404e68356237e809683e311285d8b9f47d046ac58784eece052"; // URL hash $h4 = "6cbb5d2dcb11610a99eb3f633dc246690c0acf33327bf7534f95542caa8f27c4"; // title hash $h5 = "2b7c57ffa9adde92ccd1884fa1153a5bcd3211e48d99e27be5414cb078e6891c"; // content/enclosure hash - $this->assertNotEquals("", $f->data->items[3]->urlTitleHash); - $this->assertSame($h3, $f->data->items[3]->urlContentHash); - $this->assertSame("", $f->data->items[3]->titleContentHash); - $this->assertNotEquals("", $f->data->items[4]->urlTitleHash); - $this->assertSame("", $f->data->items[4]->urlContentHash); - $this->assertSame($h4, $f->data->items[4]->titleContentHash); - $this->assertSame("", $f->data->items[5]->urlTitleHash); - $this->assertNotEquals("", $f->data->items[5]->urlContentHash); - $this->assertNotEquals("", $f->data->items[5]->titleContentHash); + $this->assertNotEquals("", $f->items[3]->urlTitleHash); + $this->assertSame($h3, $f->items[3]->urlContentHash); + $this->assertSame("", $f->items[3]->titleContentHash); + $this->assertNotEquals("", $f->items[4]->urlTitleHash); + $this->assertSame("", $f->items[4]->urlContentHash); + $this->assertSame($h4, $f->items[4]->titleContentHash); + $this->assertSame("", $f->items[5]->urlTitleHash); + $this->assertNotEquals("", $f->items[5]->urlContentHash); + $this->assertNotEquals("", $f->items[5]->titleContentHash); // check null IDs - $this->assertSame(null, $f->data->items[3]->id); - $this->assertSame(null, $f->data->items[4]->id); - $this->assertSame(null, $f->data->items[5]->id); + $this->assertSame(null, $f->items[3]->id); + $this->assertSame(null, $f->items[4]->id); + $this->assertSame(null, $f->items[5]->id); // check categories $categories = [ "Aniki!", @@ -140,11 +140,11 @@ class TestFeed extends \JKingWeb\Arsse\Test\AbstractTest { "Bodybuilders", "Men", ]; - $this->assertSame([], $f->data->items[0]->categories); - $this->assertSame([], $f->data->items[1]->categories); - $this->assertSame([], $f->data->items[3]->categories); - $this->assertSame([], $f->data->items[4]->categories); - $this->assertSame($categories, $f->data->items[5]->categories); + $this->assertSame([], $f->items[0]->categories); + $this->assertSame([], $f->items[1]->categories); + $this->assertSame([], $f->items[3]->categories); + $this->assertSame([], $f->items[4]->categories); + $this->assertSame($categories, $f->items[5]->categories); } public function testDiscoverAFeedSuccessfully(): void { @@ -232,7 +232,7 @@ class TestFeed extends \JKingWeb\Arsse\Test\AbstractTest { $e = "78567a"; $f = new Feed(null, $this->base.$url."?t=$t&e=$e", Date::transform($t, "http"), $e); $this->assertTime($t, $f->lastModified); - $this->assertSame($e, $f->resource->getETag()); + $this->assertSame($e, $f->etag); } public function provide304ResponseURLs() { @@ -250,15 +250,15 @@ class TestFeed extends \JKingWeb\Arsse\Test\AbstractTest { $t = time() - 2000; $f = new Feed(null, $this->base."Caching/200Past"); $this->assertTime($t, $f->lastModified); - $this->assertNotEmpty($f->resource->getETag()); + $this->assertNotEmpty($f->etag); $t = time() - 2000; $f = new Feed(null, $this->base."Caching/200Past", Date::transform(time(), "http")); $this->assertTime($t, $f->lastModified); - $this->assertNotEmpty($f->resource->getETag()); + $this->assertNotEmpty($f->etag); $t = time() + 2000; $f = new Feed(null, $this->base."Caching/200Future"); $this->assertTime($t, $f->lastModified); - $this->assertNotEmpty($f->resource->getETag()); + $this->assertNotEmpty($f->etag); // these tests have no HTTP headers and rely on article dates $t = strtotime("2002-05-19T15:21:36Z"); $f = new Feed(null, $this->base."Caching/200PubDateOnly");