From a80e283abc092b44c9c29c57f0413dc7ea596439 Mon Sep 17 00:00:00 2001 From: "J. King" Date: Mon, 2 Oct 2017 11:53:52 -0400 Subject: [PATCH] Perform feed discovery correctly; fixes #118 --- lib/Database.php | 16 +++-- lib/Feed.php | 75 +++++++++++++++-------- tests/Feed/TestFeed.php | 5 +- tests/lib/Database/SeriesSubscription.php | 26 ++++++-- 4 files changed, 84 insertions(+), 38 deletions(-) diff --git a/lib/Database.php b/lib/Database.php index e952404..593d5e0 100644 --- a/lib/Database.php +++ b/lib/Database.php @@ -411,13 +411,19 @@ class Database { throw new User\ExceptionAuthz("notAuthorized", ["action" => __FUNCTION__, "user" => $user]); } // check to see if the feed exists - $feedID = $this->db->prepare("SELECT id from arsse_feeds where url is ? and username is ? and password is ?", "str", "str", "str")->run($url, $fetchUser, $fetchPassword)->getValue(); + $check = $this->db->prepare("SELECT id from arsse_feeds where url is ? and username is ? and password is ?", "str", "str", "str"); + $feedID = $check->run($url, $fetchUser, $fetchPassword)->getValue(); + if ($discover && is_null($feedID)) { + // if the feed doesn't exist, first perform discovery if requested and check for the existence of that URL + $url = Feed::discover($url, $fetchUser, $fetchPassword); + $feedID = $check->run($url, $fetchUser, $fetchPassword)->getValue(); + } if (is_null($feedID)) { - // if the feed doesn't exist add it to the database; we do this unconditionally so as to lock SQLite databases for as little time as possible + // if the feed still doesn't exist in the database, add it to the database; we do this unconditionally so as to lock SQLite databases for as little time as possible $feedID = $this->db->prepare('INSERT INTO arsse_feeds(url,username,password) values(?,?,?)', 'str', 'str', 'str')->run($url, $fetchUser, $fetchPassword)->lastId(); try { // perform an initial update on the newly added feed - $this->feedUpdate($feedID, true, $discover); + $this->feedUpdate($feedID, true); } catch (\Throwable $e) { // if the update fails, delete the feed we just added $this->db->prepare('DELETE from arsse_feeds where id is ?', 'int')->run($feedID); @@ -548,7 +554,7 @@ class Database { return array_column($feeds, 'id'); } - public function feedUpdate($feedID, bool $throwError = false, bool $discover = false): bool { + public function feedUpdate($feedID, bool $throwError = false): bool { $tr = $this->db->begin(); // check to make sure the feed exists if (!ValueInfo::id($feedID)) { @@ -564,7 +570,7 @@ class Database { // here. When an exception is thrown it should update the database with the // error instead of failing; if other exceptions are thrown, we should simply roll back try { - $feed = new Feed((int) $feedID, $f['url'], (string) Date::transform($f['modified'], "http", "sql"), $f['etag'], $f['username'], $f['password'], $scrape, $discover); + $feed = new Feed((int) $feedID, $f['url'], (string) Date::transform($f['modified'], "http", "sql"), $f['etag'], $f['username'], $f['password'], $scrape); if (!$feed->modified) { // if the feed hasn't changed, just compute the next fetch time and record it $this->db->prepare("UPDATE arsse_feeds SET updated = CURRENT_TIMESTAMP, next_fetch = ? WHERE id is ?", 'datetime', 'int')->run($feed->nextFetch, $feedID); diff --git a/lib/Feed.php b/lib/Feed.php index f256016..a390659 100644 --- a/lib/Feed.php +++ b/lib/Feed.php @@ -5,6 +5,7 @@ namespace JKingWeb\Arsse; use JKingWeb\Arsse\Misc\Date; use PicoFeed\PicoFeedException; use PicoFeed\Config\Config; +use PicoFeed\Client\Client; use PicoFeed\Reader\Reader; use PicoFeed\Reader\Favicon; use PicoFeed\Scraper\Scraper; @@ -12,31 +13,37 @@ use PicoFeed\Scraper\Scraper; class Feed { public $data = null; public $favicon; - public $parser; - public $reader; public $resource; public $modified = false; public $lastModified; public $nextFetch; public $newItems = []; public $changedItems = []; + + public static function discover(string $url, string $username = '', string $password = ''): string { + // fetch the candidate feed + $f = self::download($url, "", "", $username, $password); + if ($f->reader->detectFormat($f->getContent())) { + // if the prospective URL is a feed, use it + $out = $url; + } else { + $links = $f->reader->find($f->getUrl(), $f->getContent()); + if (!$links) { + // work around a PicoFeed memory leak FIXME: remove this hack (or not) once PicoFeed stops leaking memory + libxml_use_internal_errors(false); + throw new Feed\Exception($url, new \PicoFeed\Reader\SubscriptionNotFoundException('Unable to find a subscription')); + } else { + $out = $links[0]; + } + } + // work around a PicoFeed memory leak FIXME: remove this hack (or not) once PicoFeed stops leaking memory + libxml_use_internal_errors(false); + return $out; + } - public function __construct(int $feedID = null, string $url, string $lastModified = '', string $etag = '', string $username = '', string $password = '', bool $scrape = false, bool $discover = false) { - // set the configuration - $userAgent = Arsse::$conf->fetchUserAgentString ?? sprintf('Arsse/%s (%s %s; %s; https://code.jkingweb.ca/jking/arsse) PicoFeed (https://github.com/fguillot/picoFeed)', - Arsse::VERSION, // Arsse version - php_uname('s'), // OS - php_uname('r'), // OS version - php_uname('m') // platform architecture - ); - $this->config = new Config; - $this->config->setMaxBodySize(Arsse::$conf->fetchSizeLimit); - $this->config->setClientTimeout(Arsse::$conf->fetchTimeout); - $this->config->setGrabberTimeout(Arsse::$conf->fetchTimeout); - $this->config->setClientUserAgent($userAgent); - $this->config->setGrabberUserAgent($userAgent); + public function __construct(int $feedID = null, string $url, string $lastModified = '', string $etag = '', string $username = '', string $password = '', bool $scrape = false) { // fetch the feed - $this->download($url, $lastModified, $etag, $username, $password, $discover); + $this->resource = self::download($url, $lastModified, $etag, $username, $password); // format the HTTP Last-Modified date returned $lastMod = $this->resource->getLastModified(); if (strlen($lastMod)) { @@ -65,26 +72,40 @@ class Feed { $this->nextFetch = $this->computeNextFetch(); } - protected function download(string $url, string $lastModified, string $etag, string $username, string $password, bool $discover): bool { - $action = $discover ? "discover" : "download"; + protected static function configure(): Config { + $userAgent = Arsse::$conf->fetchUserAgentString ?? sprintf('Arsse/%s (%s %s; %s; https://thearsse.com/) PicoFeed (https://github.com/miniflux/picoFeed)', + Arsse::VERSION, // Arsse version + php_uname('s'), // OS + php_uname('r'), // OS version + php_uname('m') // platform architecture + ); + $config = new Config; + $config->setMaxBodySize(Arsse::$conf->fetchSizeLimit); + $config->setClientTimeout(Arsse::$conf->fetchTimeout); + $config->setGrabberTimeout(Arsse::$conf->fetchTimeout); + $config->setClientUserAgent($userAgent); + $config->setGrabberUserAgent($userAgent); + return $config; + } + + protected static function download(string $url, string $lastModified, string $etag, string $username, string $password): Client { try { - $this->reader = new Reader($this->config); - $this->resource = $this->reader->$action($url, $lastModified, $etag, $username, $password); + $reader = new Reader(self::configure()); + $client = $reader->download($url, $lastModified, $etag, $username, $password); + $client->reader = $reader; + return $client; } catch (PicoFeedException $e) { throw new Feed\Exception($url, $e); } - return true; } protected function parse(): bool { try { - $this->parser = $this->reader->getParser( + $feed = $this->resource->reader->getParser( $this->resource->getUrl(), $this->resource->getContent(), $this->resource->getEncoding() - ); - $feed = $this->parser->execute(); - + )->execute(); // Grab the favicon for the feed; returns an empty string if it cannot find one. // Some feeds might use a different domain (eg: feedburner), so the site url is // used instead of the feed's url. @@ -388,7 +409,7 @@ class Feed { } protected function scrape(): bool { - $scraper = new Scraper($this->config); + $scraper = new Scraper(self::configure()); foreach (array_merge($this->newItems, $this->changedItems) as $item) { $scraper->setUrl($item->url); $scraper->execute(); diff --git a/tests/Feed/TestFeed.php b/tests/Feed/TestFeed.php index 8957ba8..0ef1eb4 100644 --- a/tests/Feed/TestFeed.php +++ b/tests/Feed/TestFeed.php @@ -134,12 +134,13 @@ class TestFeed extends Test\AbstractTest { } public function testDiscoverAFeedSuccessfully() { - $this->assertInstanceOf(Feed::class, new Feed(null, $this->base."Discovery/Valid", "", "", "", "", false, true)); + $this->assertSame($this->base."Discovery/Feed", Feed::discover($this->base."Discovery/Valid")); + $this->assertSame($this->base."Discovery/Feed", Feed::discover($this->base."Discovery/Feed")); } public function testDiscoverAFeedUnsuccessfully() { $this->assertException("subscriptionNotFound", "Feed"); - new Feed(null, $this->base."Discovery/Invalid", "", "", "", "", false, true); + Feed::discover($this->base."Discovery/Invalid"); } public function testParseEntityExpansionAttack() { diff --git a/tests/lib/Database/SeriesSubscription.php b/tests/lib/Database/SeriesSubscription.php index fc528c3..a40e509 100644 --- a/tests/lib/Database/SeriesSubscription.php +++ b/tests/lib/Database/SeriesSubscription.php @@ -133,9 +133,9 @@ trait SeriesSubscription { $feedID = $this->nextID("arsse_feeds"); $subID = $this->nextID("arsse_subscriptions"); Phake::when(Arsse::$db)->feedUpdate->thenReturn(true); - $this->assertSame($subID, Arsse::$db->subscriptionAdd($this->user, $url)); + $this->assertSame($subID, Arsse::$db->subscriptionAdd($this->user, $url, "", "", false)); Phake::verify(Arsse::$user)->authorize($this->user, "subscriptionAdd"); - Phake::verify(Arsse::$db)->feedUpdate($feedID, true, true); + Phake::verify(Arsse::$db)->feedUpdate($feedID, true); $state = $this->primeExpectations($this->data, [ 'arsse_feeds' => ['id','url','username','password'], 'arsse_subscriptions' => ['id','owner','feed'], @@ -145,15 +145,33 @@ trait SeriesSubscription { $this->compareExpectations($state); } + public function testAddASubscriptionToANewFeedViaDiscovery() { + $url = "http://localhost:8000/Feed/Discovery/Valid"; + $discovered = "http://localhost:8000/Feed/Discovery/Feed"; + $feedID = $this->nextID("arsse_feeds"); + $subID = $this->nextID("arsse_subscriptions"); + Phake::when(Arsse::$db)->feedUpdate->thenReturn(true); + $this->assertSame($subID, Arsse::$db->subscriptionAdd($this->user, $url, "", "", true)); + Phake::verify(Arsse::$user)->authorize($this->user, "subscriptionAdd"); + Phake::verify(Arsse::$db)->feedUpdate($feedID, true); + $state = $this->primeExpectations($this->data, [ + 'arsse_feeds' => ['id','url','username','password'], + 'arsse_subscriptions' => ['id','owner','feed'], + ]); + $state['arsse_feeds']['rows'][] = [$feedID,$discovered,"",""]; + $state['arsse_subscriptions']['rows'][] = [$subID,$this->user,$feedID]; + $this->compareExpectations($state); + } + public function testAddASubscriptionToAnInvalidFeed() { $url = "http://example.org/feed1"; $feedID = $this->nextID("arsse_feeds"); Phake::when(Arsse::$db)->feedUpdate->thenThrow(new FeedException($url, new \PicoFeed\Client\InvalidUrlException())); try { - Arsse::$db->subscriptionAdd($this->user, $url); + Arsse::$db->subscriptionAdd($this->user, $url, "", "", false); } catch (FeedException $e) { Phake::verify(Arsse::$user)->authorize($this->user, "subscriptionAdd"); - Phake::verify(Arsse::$db)->feedUpdate($feedID, true, true); + Phake::verify(Arsse::$db)->feedUpdate($feedID, true); $state = $this->primeExpectations($this->data, [ 'arsse_feeds' => ['id','url','username','password'], 'arsse_subscriptions' => ['id','owner','feed'],