From 4cb23dd1980e75a8d78da7f6da9194b343e15992 Mon Sep 17 00:00:00 2001
From: "J. King"
Date: Sat, 16 Jan 2021 14:24:01 -0500
Subject: [PATCH] Partial implementation of proper content scraping

---
 lib/Database.php     | 27 ++++++++++++++++++---------
 lib/Feed.php         |  2 +-
 sql/MySQL/6.sql      |  4 ++++
 sql/PostgreSQL/6.sql |  4 ++++
 sql/SQLite3/6.sql    | 28 ++++++++++++++++++++++++++--
 5 files changed, 53 insertions(+), 12 deletions(-)

diff --git a/lib/Database.php b/lib/Database.php
index 6e72cf6..a78ba37 100644
--- a/lib/Database.php
+++ b/lib/Database.php
@@ -1126,12 +1126,19 @@ class Database {
         if (!V::id($feedID)) {
             throw new Db\ExceptionInput("typeViolation", ["action" => __FUNCTION__, "field" => "feed", 'id' => $feedID, 'type' => "int > 0"]);
         }
-        $f = $this->db->prepare("SELECT url, username, password, modified, etag, err_count, scrape FROM arsse_feeds where id = ?", "int")->run($feedID)->getRow();
+        $f = $this->db->prepareArray(
+            "SELECT
+                url, username, password, modified, etag, err_count, scrapers
+            FROM arsse_feeds as f
+            left join (select feed, count(*) as scrapers from arsse_subscriptions where scrape = 1 group by feed) as s on f.id = s.feed
+            where id = ?",
+            ["int"]
+        )->run($feedID)->getRow();
         if (!$f) {
             throw new Db\ExceptionInput("subjectMissing", ["action" => __FUNCTION__, "field" => "feed", 'id' => $feedID]);
         }
         // determine whether the feed's items should be scraped for full content from the source Web site
-        $scrape = (Arsse::$conf->fetchEnableScraping && $f['scrape']);
+        $scrape = (Arsse::$conf->fetchEnableScraping && $f['scrapers']);
         // the Feed object throws an exception when there are problems, but that isn't ideal
         // here. When an exception is thrown it should update the database with the
         // error instead of failing; if other exceptions are thrown, we should simply roll back
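
Note: the left join introduced above replaces the old per-feed "scrape" flag
with a count of subscriptions that have opted into scraping. As a rough
illustration (not part of the patch), the subquery on its own behaves like
this:

    -- one row per feed with at least one scraping subscription; feeds with
    -- none produce no row, so the outer join yields a null "scrapers" count
    -- and scraping stays disabled for them
    select feed, count(*) as scrapers
    from arsse_subscriptions
    where scrape = 1
    group by feed;

A feed is therefore scraped once at fetch time whenever at least one
subscriber wants scraped content; which subscribers actually see that content
is decided later, at query time (see the 'content' expression further down).
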
@@ -1161,8 +1168,8 @@
         }
         if (sizeof($feed->newItems)) {
             $qInsertArticle = $this->db->prepareArray(
-                "INSERT INTO arsse_articles(url,title,author,published,edited,guid,content,url_title_hash,url_content_hash,title_content_hash,feed) values(?,?,?,?,?,?,?,?,?,?,?)",
-                ['str', 'str', 'str', 'datetime', 'datetime', 'str', 'str', 'str', 'str', 'str', 'int']
+                "INSERT INTO arsse_articles(url,title,author,published,edited,guid,content,url_title_hash,url_content_hash,title_content_hash,feed,content_scraped) values(?,?,?,?,?,?,?,?,?,?,?,?)",
+                ["str", "str", "str", "datetime", "datetime", "str", "str", "str", "str", "str", "int", "str"]
             );
         }
         if (sizeof($feed->changedItems)) {
@@ -1170,8 +1177,8 @@
             $qDeleteCategories = $this->db->prepare("DELETE FROM arsse_categories WHERE article = ?", 'int');
             $qClearReadMarks = $this->db->prepare("UPDATE arsse_marks SET \"read\" = 0, modified = CURRENT_TIMESTAMP WHERE article = ? and \"read\" = 1", 'int');
             $qUpdateArticle = $this->db->prepareArray(
-                "UPDATE arsse_articles SET url = ?, title = ?, author = ?, published = ?, edited = ?, modified = CURRENT_TIMESTAMP, guid = ?, content = ?, url_title_hash = ?, url_content_hash = ?, title_content_hash = ? WHERE id = ?",
-                ['str', 'str', 'str', 'datetime', 'datetime', 'str', 'str', 'str', 'str', 'str', 'int']
+                "UPDATE arsse_articles SET url = ?, title = ?, author = ?, published = ?, edited = ?, modified = CURRENT_TIMESTAMP, guid = ?, content = ?, url_title_hash = ?, url_content_hash = ?, title_content_hash = ?, content_scraped = ? WHERE id = ?",
+                ["str", "str", "str", "datetime", "datetime", "str", "str", "str", "str", "str", "str", "int"]
             );
         }
         // determine if the feed icon needs to be updated, and update it if appropriate
@@ -1204,7 +1211,8 @@
                     $article->urlTitleHash,
                     $article->urlContentHash,
                     $article->titleContentHash,
-                    $feedID
+                    $feedID,
+                    $article->scrapedContent ?? null
                 )->lastId();
                 // note the new ID for later use
                 $articleMap[$k] = $articleID;
@@ -1232,6 +1240,7 @@
                     $article->urlTitleHash,
                     $article->urlContentHash,
                     $article->titleContentHash,
+                    $article->scrapedContent ?? null,
                     $articleID
                 );
                 // delete all enclosures and categories and re-insert them
@@ -1273,7 +1282,7 @@
         // lastly update the feed database itself with updated information.
         $this->db->prepareArray(
             "UPDATE arsse_feeds SET title = ?, source = ?, updated = CURRENT_TIMESTAMP, modified = ?, etag = ?, err_count = 0, err_msg = '', next_fetch = ?, size = ?, icon = ? WHERE id = ?",
-            ['str', 'str', 'datetime', 'strict str', 'datetime', 'int', 'int', 'int']
+            ["str", "str", "datetime", "strict str", "datetime", "int", "int", "int"]
         )->run(
             $feed->data->title,
             $feed->data->siteUrl,
@@ -1429,7 +1438,7 @@
             'url'                 => "arsse_articles.url",
             'title'               => "arsse_articles.title",
             'author'              => "arsse_articles.author",
-            'content'             => "arsse_articles.content",
+            'content'             => "coalesce(case when arsse_subscriptions.scrape = 1 then arsse_articles.content_scraped end, arsse_articles.content)",
             'guid'                => "arsse_articles.guid",
             'fingerprint'         => "arsse_articles.url_title_hash || ':' || arsse_articles.url_content_hash || ':' || arsse_articles.title_content_hash",
             'folder'              => "coalesce(arsse_subscriptions.folder,0)",
diff --git a/lib/Feed.php b/lib/Feed.php
index e96d064..af43f22 100644
--- a/lib/Feed.php
+++ b/lib/Feed.php
@@ -448,7 +448,7 @@ class Feed {
             $scraper->setUrl($item->url);
             $scraper->execute();
             if ($scraper->hasRelevantContent()) {
-                $item->content = $scraper->getFilteredContent();
+                $item->scrapedContent = $scraper->getFilteredContent();
             }
         }
     }
diff --git a/sql/MySQL/6.sql b/sql/MySQL/6.sql
index c2f8b53..7d9eb12 100644
--- a/sql/MySQL/6.sql
+++ b/sql/MySQL/6.sql
@@ -32,6 +32,10 @@ create table arsse_user_meta(
     primary key(owner,"key")
 ) character set utf8mb4 collate utf8mb4_unicode_ci;
 
+alter table arsse_subscriptions add column scrape boolean not null default 0;
+alter table arsse_feeds drop column scrape;
+alter table arsse_articles add column content_scraped longtext;
+
 create table arsse_icons(
     id serial primary key,
     url varchar(767) unique not null,
diff --git a/sql/PostgreSQL/6.sql b/sql/PostgreSQL/6.sql
index a27b87a..825f67d 100644
--- a/sql/PostgreSQL/6.sql
+++ b/sql/PostgreSQL/6.sql
@@ -32,6 +32,10 @@ create table arsse_user_meta(
     primary key(owner,key)
 );
 
+alter table arsse_subscriptions add column scrape smallint not null default 0;
+alter table arsse_feeds drop column scrape;
+alter table arsse_articles add column content_scraped text;
+
 create table arsse_icons(
     id bigserial primary key,
     url text unique not null,
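
Note: the three migrations above make the same change on each supported
backend: the "scrape" flag moves from arsse_feeds to arsse_subscriptions, and
arsse_articles gains a nullable content_scraped column. Below is a minimal
sketch (not part of the patch) of how the new 'content' expression in
lib/Database.php resolves once those columns exist; the explicit join
condition is an assumption here, since the library's query builder normally
supplies it:

    -- scraped content is served only to subscriptions that opted in, and only
    -- when a scraped rendition actually exists; everyone else gets the
    -- content supplied by the feed itself
    select coalesce(
        case when arsse_subscriptions.scrape = 1 then arsse_articles.content_scraped end,
        arsse_articles.content
    ) as content
    from arsse_articles
    join arsse_subscriptions on arsse_subscriptions.feed = arsse_articles.feed;

Because content_scraped is null until an article is actually scraped, the
fallback covers both subscriptions that never opted in and articles fetched
before scraping was enabled.
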
diff --git a/sql/SQLite3/6.sql b/sql/SQLite3/6.sql
index 3c5f358..e43c4ea 100644
--- a/sql/SQLite3/6.sql
+++ b/sql/SQLite3/6.sql
@@ -44,8 +44,11 @@ create table arsse_user_meta(
     primary key(owner,key)
 ) without rowid;
 
+-- Add a "scrape" column for subscriptions
+alter table arsse_subscriptions add column scrape boolean not null default 0;
 -- Add a separate table for feed icons and replace their URLs in the feeds table with their IDs
+-- Also remove the "scrape" column of the feeds table, which was never an advertised feature
 create table arsse_icons(
 -- Icons associated with feeds
 -- At a minimum the URL of the icon must be known, but its content may be missing
@@ -76,16 +79,37 @@ create table arsse_feeds_new(
     username text not null default '',                                  -- HTTP authentication username
     password text not null default '',                                  -- HTTP authentication password (this is stored in plain text)
     size integer not null default 0,                                    -- number of articles in the feed at last fetch
-    scrape boolean not null default 0,                                  -- whether to use picoFeed's content scraper with this feed
     icon integer references arsse_icons(id) on delete set null,        -- numeric identifier of any associated icon
     unique(url,username,password)                                       -- a URL with particular credentials should only appear once
 );
 insert into arsse_feeds_new
-    select f.id, f.url, title, source, updated, f.modified, f.next_fetch, f.orphaned, f.etag, err_count, err_msg, username, password, size, scrape, i.id
+    select f.id, f.url, title, source, updated, f.modified, f.next_fetch, f.orphaned, f.etag, err_count, err_msg, username, password, size, i.id
     from arsse_feeds as f left join arsse_icons as i on f.favicon = i.url;
 drop table arsse_feeds;
 alter table arsse_feeds_new rename to arsse_feeds;
 
+-- Add a column for scraped article content, and re-order some columns
+create table arsse_articles_new(
+-- entries in newsfeeds
+    id integer primary key,                                             -- sequence number
+    feed integer not null references arsse_feeds(id) on delete cascade, -- feed for the subscription
+    url text,                                                           -- URL of article
+    title text collate nocase,                                          -- article title
+    author text collate nocase,                                         -- author's name
+    published text,                                                     -- time of original publication
+    edited text,                                                        -- time of last edit by author
+    modified text not null default CURRENT_TIMESTAMP,                   -- time when article was last modified in database
+    guid text,                                                          -- GUID
+    url_title_hash text not null,                                       -- hash of URL + title; used when checking for updates and for identification if there is no guid.
+    url_content_hash text not null,                                     -- hash of URL + content, enclosure URL, & content type; used when checking for updates and for identification if there is no guid.
+    title_content_hash text not null,                                   -- hash of title + content, enclosure URL, & content type; used when checking for updates and for identification if there is no guid.
+    content_scraped text,                                               -- scraped content, as HTML
+    content text                                                        -- content, as HTML
+);
+insert into arsse_articles_new select id, feed, url, title, author, published, edited, modified, guid, url_title_hash, url_content_hash, title_content_hash, null, content from arsse_articles;
+drop table arsse_articles;
+alter table arsse_articles_new rename to arsse_articles;
+
 -- set version marker
 pragma user_version = 7;
 update arsse_meta set value = '7' where "key" = 'schema_version';
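
Note: SQLite before version 3.35 has no "alter table ... drop column", hence
the rebuild above for the feeds and articles tables: create a replacement
table, copy the rows across, drop the original, and rename. Two illustrative
sanity checks one might run after migrating (not part of the patch):

    -- no pre-existing article has a scraped rendition yet, so expect zero
    select count(*) from arsse_articles where content_scraped is not null;
    -- the schema version marker should now read 7
    pragma user_version;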