From 4cb23dd1980e75a8d78da7f6da9194b343e15992 Mon Sep 17 00:00:00 2001
From: "J. King"
Date: Sat, 16 Jan 2021 14:24:01 -0500
Subject: [PATCH] Partial implementation of proper content scraping

---
 lib/Database.php     | 27 ++++++++++++++++++---------
 lib/Feed.php         |  2 +-
 sql/MySQL/6.sql      |  4 ++++
 sql/PostgreSQL/6.sql |  4 ++++
 sql/SQLite3/6.sql    | 28 ++++++++++++++++++++++++++--
 5 files changed, 53 insertions(+), 12 deletions(-)

diff --git a/lib/Database.php b/lib/Database.php
index 6e72cf6..a78ba37 100644
--- a/lib/Database.php
+++ b/lib/Database.php
@@ -1126,12 +1126,19 @@ class Database {
         if (!V::id($feedID)) {
             throw new Db\ExceptionInput("typeViolation", ["action" => __FUNCTION__, "field" => "feed", 'id' => $feedID, 'type' => "int > 0"]);
         }
-        $f = $this->db->prepare("SELECT url, username, password, modified, etag, err_count, scrape FROM arsse_feeds where id = ?", "int")->run($feedID)->getRow();
+        $f = $this->db->prepareArray(
+            "SELECT
+                url, username, password, modified, etag, err_count, scrapers
+            FROM arsse_feeds as f
+            left join (select feed, count(*) as scrapers from arsse_subscriptions where scrape = 1 group by feed) as s on f.id = s.feed
+            where id = ?",
+            ["int"]
+        )->run($feedID)->getRow();
         if (!$f) {
             throw new Db\ExceptionInput("subjectMissing", ["action" => __FUNCTION__, "field" => "feed", 'id' => $feedID]);
         }
         // determine whether the feed's items should be scraped for full content from the source Web site
-        $scrape = (Arsse::$conf->fetchEnableScraping && $f['scrape']);
+        $scrape = (Arsse::$conf->fetchEnableScraping && $f['scrapers']);
         // the Feed object throws an exception when there are problems, but that isn't ideal
         // here. When an exception is thrown it should update the database with the
         // error instead of failing; if other exceptions are thrown, we should simply roll back
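
Note: the left join introduced above replaces the old per-feed "scrape" flag
with a count of subscriptions that have opted into scraping. As a rough
illustration (not part of the patch), the subquery on its own behaves like
this:

    -- one row per feed with at least one scraping subscription; feeds with
    -- none produce no row, so the outer join yields a null "scrapers" count
    -- and scraping stays disabled for them
    select feed, count(*) as scrapers
    from arsse_subscriptions
    where scrape = 1
    group by feed;

A feed is therefore scraped once at fetch time whenever at least one
subscriber wants scraped content; which subscribers actually see that content
is decided later, at query time (see the 'content' expression further down).
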
@@ -1161,8 +1168,8 @@
         }
         if (sizeof($feed->newItems)) {
             $qInsertArticle = $this->db->prepareArray(
-                "INSERT INTO arsse_articles(url,title,author,published,edited,guid,content,url_title_hash,url_content_hash,title_content_hash,feed) values(?,?,?,?,?,?,?,?,?,?,?)",
-                ['str', 'str', 'str', 'datetime', 'datetime', 'str', 'str', 'str', 'str', 'str', 'int']
+                "INSERT INTO arsse_articles(url,title,author,published,edited,guid,content,url_title_hash,url_content_hash,title_content_hash,feed,content_scraped) values(?,?,?,?,?,?,?,?,?,?,?,?)",
+                ["str", "str", "str", "datetime", "datetime", "str", "str", "str", "str", "str", "int", "str"]
             );
         }
         if (sizeof($feed->changedItems)) {
@@ -1170,8 +1177,8 @@
             $qDeleteCategories = $this->db->prepare("DELETE FROM arsse_categories WHERE article = ?", 'int');
             $qClearReadMarks = $this->db->prepare("UPDATE arsse_marks SET \"read\" = 0, modified = CURRENT_TIMESTAMP WHERE article = ? and \"read\" = 1", 'int');
             $qUpdateArticle = $this->db->prepareArray(
-                "UPDATE arsse_articles SET url = ?, title = ?, author = ?, published = ?, edited = ?, modified = CURRENT_TIMESTAMP, guid = ?, content = ?, url_title_hash = ?, url_content_hash = ?, title_content_hash = ? WHERE id = ?",
-                ['str', 'str', 'str', 'datetime', 'datetime', 'str', 'str', 'str', 'str', 'str', 'int']
+                "UPDATE arsse_articles SET url = ?, title = ?, author = ?, published = ?, edited = ?, modified = CURRENT_TIMESTAMP, guid = ?, content = ?, url_title_hash = ?, url_content_hash = ?, title_content_hash = ?, content_scraped = ? WHERE id = ?",
+                ["str", "str", "str", "datetime", "datetime", "str", "str", "str", "str", "str", "str", "int"]
             );
         }
         // determine if the feed icon needs to be updated, and update it if appropriate
@@ -1204,7 +1211,8 @@
                     $article->urlTitleHash,
                     $article->urlContentHash,
                     $article->titleContentHash,
-                    $feedID
+                    $feedID,
+                    $article->scrapedContent ?? null
                 )->lastId();
                 // note the new ID for later use
                 $articleMap[$k] = $articleID;
@@ -1232,6 +1240,7 @@
                     $article->urlTitleHash,
                     $article->urlContentHash,
                     $article->titleContentHash,
+                    $article->scrapedContent ?? null,
                     $articleID
                 );
                 // delete all enclosures and categories and re-insert them
@@ -1273,7 +1282,7 @@
         // lastly update the feed database itself with updated information.
         $this->db->prepareArray(
             "UPDATE arsse_feeds SET title = ?, source = ?, updated = CURRENT_TIMESTAMP, modified = ?, etag = ?, err_count = 0, err_msg = '', next_fetch = ?, size = ?, icon = ? WHERE id = ?",
-            ['str', 'str', 'datetime', 'strict str', 'datetime', 'int', 'int', 'int']
+            ["str", "str", "datetime", "strict str", "datetime", "int", "int", "int"]
         )->run(
             $feed->data->title,
             $feed->data->siteUrl,
@@ -1429,7 +1438,7 @@
             'url'                 => "arsse_articles.url",
             'title'               => "arsse_articles.title",
             'author'              => "arsse_articles.author",
-            'content'             => "arsse_articles.content",
+            'content'             => "coalesce(case when arsse_subscriptions.scrape = 1 then arsse_articles.content_scraped end, arsse_articles.content)",
             'guid'                => "arsse_articles.guid",
             'fingerprint'         => "arsse_articles.url_title_hash || ':' || arsse_articles.url_content_hash || ':' || arsse_articles.title_content_hash",
             'folder'              => "coalesce(arsse_subscriptions.folder,0)",
diff --git a/lib/Feed.php b/lib/Feed.php
index e96d064..af43f22 100644
--- a/lib/Feed.php
+++ b/lib/Feed.php
@@ -448,7 +448,7 @@ class Feed {
             $scraper->setUrl($item->url);
             $scraper->execute();
             if ($scraper->hasRelevantContent()) {
-                $item->content = $scraper->getFilteredContent();
+                $item->scrapedContent = $scraper->getFilteredContent();
             }
         }
     }
diff --git a/sql/MySQL/6.sql b/sql/MySQL/6.sql
index c2f8b53..7d9eb12 100644
--- a/sql/MySQL/6.sql
+++ b/sql/MySQL/6.sql
@@ -32,6 +32,10 @@ create table arsse_user_meta(
     primary key(owner,"key")
 ) character set utf8mb4 collate utf8mb4_unicode_ci;
 
+alter table arsse_subscriptions add column scrape boolean not null default 0;
+alter table arsse_feeds drop column scrape;
+alter table arsse_articles add column content_scraped longtext;
+
 create table arsse_icons(
     id serial primary key,
     url varchar(767) unique not null,
diff --git a/sql/PostgreSQL/6.sql b/sql/PostgreSQL/6.sql
index a27b87a..825f67d 100644
--- a/sql/PostgreSQL/6.sql
+++ b/sql/PostgreSQL/6.sql
@@ -32,6 +32,10 @@ create table arsse_user_meta(
     primary key(owner,key)
 );
 
+alter table arsse_subscriptions add column scrape smallint not null default 0;
+alter table arsse_feeds drop column scrape;
+alter table arsse_articles add column content_scraped text;
+
 create table arsse_icons(
     id bigserial primary key,
     url text unique not null,
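
Note: the three migrations above make the same change on each supported
backend: the "scrape" flag moves from arsse_feeds to arsse_subscriptions, and
arsse_articles gains a nullable content_scraped column. Below is a minimal
sketch (not part of the patch) of how the new 'content' expression in
lib/Database.php resolves once those columns exist; the explicit join
condition is an assumption here, since the library's query builder normally
supplies it:

    -- scraped content is served only to subscriptions that opted in, and only
    -- when a scraped rendition actually exists; everyone else gets the
    -- content supplied by the feed itself
    select coalesce(
        case when arsse_subscriptions.scrape = 1 then arsse_articles.content_scraped end,
        arsse_articles.content
    ) as content
    from arsse_articles
    join arsse_subscriptions on arsse_subscriptions.feed = arsse_articles.feed;

Because content_scraped is null until an article is actually scraped, the
fallback covers both subscriptions that never opted in and articles fetched
before scraping was enabled.
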
diff --git a/sql/SQLite3/6.sql b/sql/SQLite3/6.sql
index 3c5f358..e43c4ea 100644
--- a/sql/SQLite3/6.sql
+++ b/sql/SQLite3/6.sql
@@ -44,8 +44,11 @@ create table arsse_user_meta(
     primary key(owner,key)
 ) without rowid;
 
+-- Add a "scrape" column for subscriptions
+alter table arsse_subscriptions add column scrape boolean not null default 0;
 -- Add a separate table for feed icons and replace their URLs in the feeds table with their IDs
+-- Also remove the "scrape" column of the feeds table, which was never an advertised feature
 create table arsse_icons(
 -- Icons associated with feeds
 -- At a minimum the URL of the icon must be known, but its content may be missing
@@ -76,16 +79,37 @@ create table arsse_feeds_new(
     username text not null default '',                                  -- HTTP authentication username
     password text not null default '',                                  -- HTTP authentication password (this is stored in plain text)
     size integer not null default 0,                                    -- number of articles in the feed at last fetch
-    scrape boolean not null default 0,                                  -- whether to use picoFeed's content scraper with this feed
     icon integer references arsse_icons(id) on delete set null,        -- numeric identifier of any associated icon
     unique(url,username,password)                                       -- a URL with particular credentials should only appear once
 );
 insert into arsse_feeds_new
-    select f.id, f.url, title, source, updated, f.modified, f.next_fetch, f.orphaned, f.etag, err_count, err_msg, username, password, size, scrape, i.id
+    select f.id, f.url, title, source, updated, f.modified, f.next_fetch, f.orphaned, f.etag, err_count, err_msg, username, password, size, i.id
     from arsse_feeds as f left join arsse_icons as i on f.favicon = i.url;
 drop table arsse_feeds;
 alter table arsse_feeds_new rename to arsse_feeds;
 
+-- Add a column for scraped article content, and re-order some columns
+create table arsse_articles_new(
+-- entries in newsfeeds
+    id integer primary key,                                             -- sequence number
+    feed integer not null references arsse_feeds(id) on delete cascade, -- feed for the subscription
+    url text,                                                           -- URL of article
+    title text collate nocase,                                          -- article title
+    author text collate nocase,                                         -- author's name
+    published text,                                                     -- time of original publication
+    edited text,                                                        -- time of last edit by author
+    modified text not null default CURRENT_TIMESTAMP,                   -- time when article was last modified in database
+    guid text,                                                          -- GUID
+    url_title_hash text not null,                                       -- hash of URL + title; used when checking for updates and for identification if there is no guid.
+    url_content_hash text not null,                                     -- hash of URL + content, enclosure URL, & content type; used when checking for updates and for identification if there is no guid.
+    title_content_hash text not null,                                   -- hash of title + content, enclosure URL, & content type; used when checking for updates and for identification if there is no guid.
+    content_scraped text,                                               -- scraped content, as HTML
+    content text                                                        -- content, as HTML
+);
+insert into arsse_articles_new select id, feed, url, title, author, published, edited, modified, guid, url_title_hash, url_content_hash, title_content_hash, null, content from arsse_articles;
+drop table arsse_articles;
+alter table arsse_articles_new rename to arsse_articles;
+
 -- set version marker
 pragma user_version = 7;
 update arsse_meta set value = '7' where "key" = 'schema_version';
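
Note: SQLite before version 3.35 has no "alter table ... drop column", hence
the rebuild above for the feeds and articles tables: create a replacement
table, copy the rows across, drop the original, and rename. Two illustrative
sanity checks one might run after migrating (not part of the patch):

    -- no pre-existing article has a scraped rendition yet, so expect zero
    select count(*) from arsse_articles where content_scraped is not null;
    -- the schema version marker should now read 7
    pragma user_version;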