From ce0584e7f88127c0284905a966f2bd7ede594b31 Mon Sep 17 00:00:00 2001
From: Dustin Wilson
Date: Sun, 26 Mar 2017 15:16:15 -0500
Subject: [PATCH] Added Feed Updating
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

• Started implementing feed updating (Database->updateFeeds())
• Moved hashing to the Feed object, now done when parsing
• Moved adding of articles to the database to its own method (Database->articleAdd())
---
 lib/Database.php  | 170 +++++++++++++++++++++++++++++++++++++++-------
 lib/Feed.php      |  22 ++++--
 sql/SQLite3/0.sql |   4 +-
 3 files changed, 160 insertions(+), 36 deletions(-)

diff --git a/lib/Database.php b/lib/Database.php
index 9994328..f9b5fca 100644
--- a/lib/Database.php
+++ b/lib/Database.php
@@ -288,31 +288,7 @@ class Database {
 
         // Add each of the articles to the database.
         foreach ($feed->data->items as $i) {
-            $articleID = $this->db->prepare('INSERT INTO newssync_articles(feed,url,title,author,published,edited,guid,content,url_title_hash,url_content_hash,title_content_hash)
-                values(?,?,?,?,?,?,?,?,?,?,?)',
-                'int', 'str', 'str', 'str', 'datetime', 'datetime', 'str', 'str', 'str', 'str', 'str')->run(
-                $feedID,
-                $i->url,
-                $i->title,
-                $i->author,
-                $i->publishedDate,
-                $i->updatedDate,
-                $i->id,
-                $i->content,
-                // Since feeds cannot be trusted to have valid ids additional hashes are used for identifiers.
-                // These hashes are made regardless to check against for changes.
-                hash('sha256', $i->url.$i->title),
-                hash('sha256', $i->url.$i->content.$i->enclosureUrl.$i->enclosureType),
-                hash('sha256', $i->title.$i->content.$i->enclosureUrl.$i->enclosureType)
-            )->lastId();
-
-            // If the article has categories add them into the categories database.
-            $categories = $i->getTag('category');
-            if (count($categories) > 0) {
-                foreach ($categories as $c) {
-                    $this->db->prepare('INSERT INTO newssync_tags(article,name) values(?,?)', 'int', 'str')->run($articleID, $c);
-                }
-            }
+            $this->articleAdd($i, $feedID);
         }
     }
 
@@ -389,4 +365,148 @@ class Database {
                 "str", "int")->run($user, $parent);
         }
     }
+
+    /**
+     * Adds a parsed article and its category tags to the database.
+     *
+     * @param \PicoFeed\Parser\Item $article Parsed item; its urlTitleHash, urlContentHash and
+     *        titleContentHash properties must already have been set by the Feed object.
+     * @param int $feedID ID of the newssync_feeds row the article belongs to.
+     * @return int Always 1.
+     */
+    public function articleAdd(\PicoFeed\Parser\Item $article, int $feedID): int {
+        // NOTE(review): updateFeeds() calls this while its own transaction is open; confirm
+        // that the driver's begin()/commit() nest safely.
+        $this->db->begin();
+
+        $articleId = $this->db->prepare('INSERT INTO newssync_articles(feed,url,title,author,published,edited,guid,content,url_title_hash,url_content_hash,title_content_hash)
+                values(?,?,?,?,?,?,?,?,?,?,?)',
+            'int', 'str', 'str', 'str', 'datetime', 'datetime', 'str', 'str', 'str', 'str', 'str')->run(
+            $feedID,
+            $article->url,
+            $article->title,
+            $article->author,
+            $article->publishedDate,
+            $article->updatedDate,
+            $article->id,
+            $article->content,
+            $article->urlTitleHash,
+            $article->urlContentHash,
+            $article->titleContentHash
+        )->lastId();
+
+        // If the article has categories add them into the categories database.
+        $categories = $article->getTag('category');
+        if (count($categories) > 0) {
+            foreach ($categories as $c) {
+                $this->db->prepare('INSERT INTO newssync_tags(article,name) values(?,?)', 'int', 'str')->run($articleId, $c);
+            }
+        }
+
+        $this->db->commit();
+        return 1;
+    }
+
+    /**
+     * Checks every stored feed for changes and synchronises its articles.
+     *
+     * For each feed whose resource reports a modification the feed is parsed, unmatched
+     * items are inserted with articleAdd(), changed items are updated in place and the
+     * feed row is refreshed, all inside one begin()/commit() pair per feed.
+     *
+     * @return int Always 1.
+     */
+    public function updateFeeds(): int {
+        $feeds = $this->db->query('SELECT id, url, username, password, DATEFORMAT("http", modified) AS lastmodified, etag FROM newssync_feeds')->getAll();
+        foreach ($feeds as $f) {
+            $feed = new Feed($f['url'], $f['lastmodified'], $f['etag'], $f['username'], $f['password']);
+            // FIXME: What to do if fails? It currently throws an exception which isn't ideal here.
+
+            // If the feed has been updated then
+            if ($feed->resource->isModified()) {
+                $feed->parse();
+
+                $this->db->begin();
+                $articles = $this->db->prepare('SELECT id, url, title, author, DATEFORMAT("http", edited) AS edited_date, guid, content, url_title_hash, url_content_hash, title_content_hash FROM newssync_articles WHERE feed is ? ORDER BY id', 'int')->run($f['id'])->getAll();
+
+                foreach ($feed->data->items as $i) {
+                    // Iterate through the articles in the database to determine a match for the one
+                    // in the just-parsed feed.
+                    $match = null;
+                    foreach ($articles as $a) {
+                        // If the id exists and is equal to one in the database then this is the post.
+                        if ($i->id) {
+                            if ($i->id === $a['guid']) {
+                                $match = $a;
+                            }
+                        }
+
+                        // Otherwise if the id doesn't exist and any of the hashes match then this is
+                        // the post.
+                        elseif ($i->urlTitleHash === $a['url_title_hash'] || $i->urlContentHash === $a['url_content_hash'] || $i->titleContentHash === $a['title_content_hash']) {
+                            $match = $a;
+                        }
+                    }
+
+                    // If there is no match then this is a new post and must be added to the
+                    // database.
+                    if (!$match) {
+                        $this->articleAdd($i, (int)$f['id']);
+                        continue;
+                    }
+
+                    // With that out of the way determine if the post has been updated.
+                    // If there is an updated date, and it doesn't match the database's then update
+                    // the post.
+                    $update = false;
+                    if ($i->updatedDate) {
+                        // NOTE(review): this strict comparison only holds if updatedDate is a string in the
+                        // format DATEFORMAT("http", ...) returns; a DateTime object never matches - confirm.
+                        if ($i->updatedDate !== $match['edited_date']) {
+                            $update = true;
+                        }
+                    }
+                    // Otherwise if there isn't an updated date and any of the hashes don't match
+                    // then update the post.
+                    elseif ($i->urlTitleHash !== $match['url_title_hash'] || $i->urlContentHash !== $match['url_content_hash'] || $i->titleContentHash !== $match['title_content_hash']) {
+                        $update = true;
+                    }
+
+                    if ($update) {
+                        $this->db->prepare('UPDATE newssync_articles SET url = ?, title = ?, author = ?, published = ?, edited = ?, modified = ?, guid = ?, content = ?, url_title_hash = ?, url_content_hash = ?, title_content_hash = ? WHERE id is ?', 'str', 'str', 'str', 'datetime', 'datetime', 'datetime', 'str', 'str', 'str', 'str', 'str', 'int')->run(
+                            $i->url,
+                            $i->title,
+                            $i->author,
+                            $i->publishedDate,
+                            $i->updatedDate,
+                            time(),
+                            $i->id,
+                            $i->content,
+                            $i->urlTitleHash,
+                            $i->urlContentHash,
+                            $i->titleContentHash,
+                            $match['id']
+                        );
+
+                        // TODO: Update categories
+                    }
+                }
+
+                // Lastly update the feed database itself with updated information.
+                $this->db->prepare('UPDATE newssync_feeds SET url = ?, title = ?, favicon = ?, source = ?, updated = ?, modified = ?, etag = ? WHERE id is ?', 'str', 'str', 'str', 'str', 'datetime', 'datetime', 'str', 'int')->run(
+                    $feed->feedUrl,
+                    $feed->title,
+                    $feed->favicon,
+                    $feed->siteUrl,
+                    $feed->date,
+                    $feed->resource->getLastModified(),
+                    $feed->resource->getEtag(),
+                    $f['id']
+                );
+                $this->db->commit();
+            }
+        }
+
+        return 1;
+    }
 }
\ No newline at end of file
diff --git a/lib/Feed.php b/lib/Feed.php
index 795cc92..06631ce 100644
--- a/lib/Feed.php
+++ b/lib/Feed.php
@@ -5,18 +5,18 @@ use PicoFeed\PicoFeedException;
 use PicoFeed\Reader\Favicon;
 
 class Feed {
+    public $data = null;
+    public $favicon;
+    public $parser;
     public $reader;
     public $resource;
-    public $parser;
-    public $data;
-    public $favicon;
 
-    public function __construct(string $url, string $lastModified = '', string $etag = '') {
+    public function __construct(string $url, string $lastModified = '', string $etag = '', string $username = '', string $password = '') {
         try {
             $this->reader = new Reader;
-            $this->resource = $reader->download($url, $lastModified, $etag);
+            $this->resource = $this->reader->download($url, $lastModified, $etag, $username, $password);
             // Grab the favicon for the feed; returns an empty string if it cannot find one.
-            $this->favicon = new Favicon->find($url);
+            $this->favicon = (new Favicon)->find($url);
         } catch (PicoFeedException $e) {
             throw new Feed\Exception($url, $e);
         }
@@ -43,6 +43,12 @@ class Feed {
         // only be reserved for severely broken feeds.
 
         foreach ($feed->items as &$f) {
+            // Hashes used for comparison to check for updates and also to identify when an
+            // id doesn't exist.
+            $f->urlTitleHash = hash('sha256', $f->url.$f->title);
+            $f->urlContentHash = hash('sha256', $f->url.$f->content.$f->enclosureUrl.$f->enclosureType);
+            $f->titleContentHash = hash('sha256', $f->title.$f->content.$f->enclosureUrl.$f->enclosureType);
+
             // If there is an id element then continue. The id is used already.
             $id = (string)$f->xml->id;
             if ($id !== '') {
@@ -63,9 +69,7 @@ class Feed {
                 continue;
             }
 
-            // If there aren't any of those there is no id. Hashes are created when adding
-            // the feed to the database which will serve to identify the post in this
-            // situation.
+            // If there aren't any of those there is no id.
             $f->id = '';
         }
 
diff --git a/sql/SQLite3/0.sql b/sql/SQLite3/0.sql
index c02da7b..ee9f282 100644
--- a/sql/SQLite3/0.sql
+++ b/sql/SQLite3/0.sql
@@ -69,12 +69,12 @@ create table newssync_articles(
     author TEXT,                                            -- author's name
     published datetime,                                     -- time of original publication
     edited datetime,                                        -- time of last edit
+    modified datetime not null default CURRENT_TIMESTAMP,   -- date when article properties were last modified
     guid TEXT,                                              -- GUID
     content TEXT,                                           -- content, as (X)HTML
-    modified datetime not null default CURRENT_TIMESTAMP,   -- date when article properties were last modified
     url_title_hash varchar(64),                             -- hash of URL + title; used when checking for updates and for identification if there is no guid.
     url_content_hash varchar(64),                           -- hash of URL + content, enclosure URL, & content type; used when checking for updates and for identification if there is no guid.
-    title_content_hash varchar(64)                         -- hash of title + content, enclosure URL, & content type; used when checking for updates and for identification if there is no guid.
+    title_content_hash varchar(64)                          -- hash of title + content, enclosure URL, & content type; used when checking for updates and for identification if there is no guid.
 );
 
 -- enclosures associated with articles