From 86897af0b3e085f3e3e7dd7895a487e34aa898ab Mon Sep 17 00:00:00 2001 From: "J. King" Date: Sat, 16 Jan 2021 19:06:20 -0500 Subject: [PATCH] Add ability to enable scraper Also transfer any existing scraper booleans on database upgrade. It was previously possible to enable scraping manually by editing the database, and these settings will be honoured. --- lib/Database.php | 2 + sql/MySQL/6.sql | 1 + sql/PostgreSQL/6.sql | 1 + sql/SQLite3/6.sql | 47 +++++++++++---------- tests/cases/Database/SeriesSubscription.php | 18 ++++---- tests/cases/Db/BaseUpdate.php | 31 ++++++++++---- 6 files changed, 61 insertions(+), 39 deletions(-) diff --git a/lib/Database.php b/lib/Database.php index ea70d95..a69d246 100644 --- a/lib/Database.php +++ b/lib/Database.php @@ -898,6 +898,7 @@ class Database { * - "title": The title of the subscription * - "folder": The numeric identifier (or null) of the subscription's folder * - "pinned": Whether the subscription is pinned + * - "scrape": Whether to scrape full article contents from the HTML article * - "order_type": Whether articles should be sorted in reverse cronological order (2), chronological order (1), or the default (0) * - "keep_rule": The subscription's "keep" filter rule; articles which do not match this are hidden * - "block_rule": The subscription's "block" filter rule; articles which match this are hidden @@ -948,6 +949,7 @@ class Database { 'pinned' => "strict bool", 'keep_rule' => "str", 'block_rule' => "str", + 'scrape' => "bool", ]; [$setClause, $setTypes, $setValues] = $this->generateSet($data, $valid); if (!$setClause) { diff --git a/sql/MySQL/6.sql b/sql/MySQL/6.sql index 7d9eb12..789900e 100644 --- a/sql/MySQL/6.sql +++ b/sql/MySQL/6.sql @@ -33,6 +33,7 @@ create table arsse_user_meta( ) character set utf8mb4 collate utf8mb4_unicode_ci; alter table arsse_subscriptions add column scrape boolean not null default 0; +update arsse_subscriptions set scrape = 1 where feed in (select id from arsse_feeds where scrape = 1); 
alter table arsse_feeds drop column scrape; alter table arsse_articles add column content_scraped longtext; diff --git a/sql/PostgreSQL/6.sql b/sql/PostgreSQL/6.sql index 825f67d..0f559a8 100644 --- a/sql/PostgreSQL/6.sql +++ b/sql/PostgreSQL/6.sql @@ -33,6 +33,7 @@ create table arsse_user_meta( ); alter table arsse_subscriptions add column scrape smallint not null default 0; +update arsse_subscriptions set scrape = 1 where feed in (select id from arsse_feeds where scrape = 1); alter table arsse_feeds drop column scrape; alter table arsse_articles add column content_scraped text; diff --git a/sql/SQLite3/6.sql b/sql/SQLite3/6.sql index e43c4ea..2be4fed 100644 --- a/sql/SQLite3/6.sql +++ b/sql/SQLite3/6.sql @@ -44,8 +44,31 @@ create table arsse_user_meta( primary key(owner,key) ) without rowid; --- Add a "scrape" column for subscriptions +-- Add a "scrape" column for subscriptions and copy any existing scraping settings alter table arsse_subscriptions add column scrape boolean not null default 0; +update arsse_subscriptions set scrape = 1 where feed in (select id from arsse_feeds where scrape = 1); + +-- Add a column for scraped article content, and re-order some columns +create table arsse_articles_new( +-- entries in newsfeeds + id integer primary key, -- sequence number + feed integer not null references arsse_feeds(id) on delete cascade, -- feed for the subscription + url text, -- URL of article + title text collate nocase, -- article title + author text collate nocase, -- author's name + published text, -- time of original publication + edited text, -- time of last edit by author + modified text not null default CURRENT_TIMESTAMP, -- time when article was last modified in database + guid text, -- GUID + url_title_hash text not null, -- hash of URL + title; used when checking for updates and for identification if there is no guid. 
+ url_content_hash text not null, -- hash of URL + content, enclosure URL, & content type; used when checking for updates and for identification if there is no guid. + title_content_hash text not null, -- hash of title + content, enclosure URL, & content type; used when checking for updates and for identification if there is no guid. + content_scraped text, -- scraped content, as HTML + content text -- content, as HTML +); +insert into arsse_articles_new select id, feed, url, title, author, published, edited, modified, guid, url_title_hash, url_content_hash, title_content_hash, null, content from arsse_articles; +drop table arsse_articles; +alter table arsse_articles_new rename to arsse_articles; -- Add a separate table for feed icons and replace their URLs in the feeds table with their IDs -- Also remove the "scrape" column of the feeds table, which was never an advertised feature @@ -88,28 +111,6 @@ insert into arsse_feeds_new drop table arsse_feeds; alter table arsse_feeds_new rename to arsse_feeds; --- Add a column for scraped article content, and re-order some column -create table arsse_articles_new( --- entries in newsfeeds - id integer primary key, -- sequence number - feed integer not null references arsse_feeds(id) on delete cascade, -- feed for the subscription - url text, -- URL of article - title text collate nocase, -- article title - author text collate nocase, -- author's name - published text, -- time of original publication - edited text, -- time of last edit by author - modified text not null default CURRENT_TIMESTAMP, -- time when article was last modified in database - guid text, -- GUID - url_title_hash text not null, -- hash of URL + title; used when checking for updates and for identification if there is no guid. - url_content_hash text not null, -- hash of URL + content, enclosure URL, & content type; used when checking for updates and for identification if there is no guid. 
- title_content_hash text not null, -- hash of title + content, enclosure URL, & content type; used when checking for updates and for identification if there is no guid. - content_scraped text, -- scraped content, as HTML - content text -- content, as HTML -); -insert into arsse_articles_new select id, feed, url, title, author, published, edited, modified, guid, url_title_hash, url_content_hash, title_content_hash, null, content from arsse_articles; -drop table arsse_articles; -alter table arsse_articles_new rename to arsse_articles; - -- set version marker pragma user_version = 7; update arsse_meta set value = '7' where "key" = 'schema_version'; diff --git a/tests/cases/Database/SeriesSubscription.php b/tests/cases/Database/SeriesSubscription.php index abbdab3..389495d 100644 --- a/tests/cases/Database/SeriesSubscription.php +++ b/tests/cases/Database/SeriesSubscription.php @@ -80,13 +80,14 @@ trait SeriesSubscription { 'order_type' => "int", 'keep_rule' => "str", 'block_rule' => "str", + 'scrape' => "bool", ], 'rows' => [ - [1,"john.doe@example.com",2,null,null,1,2,null,null], - [2,"jane.doe@example.com",2,null,null,0,0,null,null], - [3,"john.doe@example.com",3,"Ook",2,0,1,null,null], - [4,"jill.doe@example.com",2,null,null,0,0,null,null], - [5,"jack.doe@example.com",2,null,null,1,2,"","3|E"], + [1,"john.doe@example.com",2,null,null,1,2,null,null,0], + [2,"jane.doe@example.com",2,null,null,0,0,null,null,0], + [3,"john.doe@example.com",3,"Ook",2,0,1,null,null,0], + [4,"jill.doe@example.com",2,null,null,0,0,null,null,0], + [5,"jack.doe@example.com",2,null,null,1,2,"","3|E",0], ], ], 'arsse_tags' => [ @@ -409,22 +410,23 @@ trait SeriesSubscription { 'title' => "Ook Ook", 'folder' => 3, 'pinned' => false, + 'scrape' => true, 'order_type' => 0, 'keep_rule' => "ook", 'block_rule' => "eek", ]); $state = $this->primeExpectations($this->data, [ 'arsse_feeds' => ['id','url','username','password','title'], - 'arsse_subscriptions' => 
['id','owner','feed','title','folder','pinned','order_type','keep_rule','block_rule'], + 'arsse_subscriptions' => ['id','owner','feed','title','folder','pinned','order_type','keep_rule','block_rule','scrape'], ]); - $state['arsse_subscriptions']['rows'][0] = [1,"john.doe@example.com",2,"Ook Ook",3,0,0,"ook","eek"]; + $state['arsse_subscriptions']['rows'][0] = [1,"john.doe@example.com",2,"Ook Ook",3,0,0,"ook","eek",1]; $this->compareExpectations(static::$drv, $state); Arsse::$db->subscriptionPropertiesSet($this->user, 1, [ 'title' => null, 'keep_rule' => null, 'block_rule' => null, ]); - $state['arsse_subscriptions']['rows'][0] = [1,"john.doe@example.com",2,null,3,0,0,null,null]; + $state['arsse_subscriptions']['rows'][0] = [1,"john.doe@example.com",2,null,3,0,0,null,null,1]; $this->compareExpectations(static::$drv, $state); // making no changes is a valid result Arsse::$db->subscriptionPropertiesSet($this->user, 1, ['unhinged' => true]); diff --git a/tests/cases/Db/BaseUpdate.php b/tests/cases/Db/BaseUpdate.php index bce4dbc..4e1ed79 100644 --- a/tests/cases/Db/BaseUpdate.php +++ b/tests/cases/Db/BaseUpdate.php @@ -139,14 +139,22 @@ class BaseUpdate extends \JKingWeb\Arsse\Test\AbstractTest { $this->drv->schemaUpdate(6); $this->drv->exec( <<drv->schemaUpdate(7); @@ -168,9 +176,16 @@ QUERY_TEXT ['url' => 'https://example.com/', 'icon' => 1], ['url' => 'http://example.net/', 'icon' => null], ]; + $subs = [ + ['id' => 1, 'scrape' => 1], + ['id' => 2, 'scrape' => 1], + ['id' => 3, 'scrape' => 0], + ['id' => 4, 'scrape' => 0], + ]; $this->assertEquals($users, $this->drv->query("SELECT id, password, num from arsse_users order by id")->getAll()); $this->assertEquals($folders, $this->drv->query("SELECT owner, name from arsse_folders order by owner")->getAll()); $this->assertEquals($icons, $this->drv->query("SELECT id, url from arsse_icons order by id")->getAll()); $this->assertEquals($feeds, $this->drv->query("SELECT url, icon from arsse_feeds order by id")->getAll()); + 
$this->assertEquals($subs, $this->drv->query("SELECT id, scrape from arsse_subscriptions order by id")->getAll()); } }