/**
 * Removes superseded versions of items from a list of feed items.
 *
 * Rationale: some newsfeeds (notably Planet) include multiple versions of
 * an item if it is updated. As we only care about the latest, we try to
 * remove any "old" versions of an item that might also be present within
 * the feed.
 *
 * Two items are treated as versions of the same item when they share a
 * non-empty ID or when any one of their three hashes is identical.
 *
 * @param array $items Feed items exposing id, urlTitleHash, urlContentHash,
 *                     titleContentHash, updatedDate and publishedDate; the
 *                     dates are either falsy or objects with getTimestamp()
 * @return array The items with older duplicates removed, re-indexed from zero
 */
protected function deduplicateItems(array $items): array {
    $out = [];
    foreach($items as $item) {
        foreach($out as $index => $check) {
            // IDs are compared strictly so that numeric-looking GUIDs (e.g. "1e3" and "1000") are not conflated
            $sameItem =
                ($item->id && $check->id && $item->id === $check->id) ||
                $item->urlTitleHash === $check->urlTitleHash ||
                $item->urlContentHash === $check->urlContentHash ||
                $item->titleContentHash === $check->titleContentHash;
            if(!$sameItem) {
                continue;
            }
            // because newsfeeds are usually ordered newest-first, the later item should only be used if...
            $laterIsNewer =
                // ... the later item has an update date and the existing item does not
                ($item->updatedDate && !$check->updatedDate) ||
                // ... the later item has an update date newer than the existing item's
                ($item->updatedDate && $check->updatedDate && $item->updatedDate->getTimestamp() > $check->updatedDate->getTimestamp()) ||
                // ... neither item has update dates, both have publish dates, and the later item has a newer publish date
                (!$item->updatedDate && !$check->updatedDate && $item->publishedDate && $check->publishedDate && $item->publishedDate->getTimestamp() > $check->publishedDate->getTimestamp());
            if($laterIsNewer) {
                $out[$index] = $item;
            }
            // whether it replaced the existing entry or was discarded, this item has been dealt with
            continue 2;
        }
        // if there was no match, add the item
        $out[] = $item;
    }
    return $out;
}

/**
 * Sorts the fetched feed's items into new and changed items by comparing
 * them against the articles already stored for the feed.
 *
 * Two passes are made: the first against as many of the latest stored
 * articles as there are items in the feed, the second (only for items the
 * first pass could not match) against articles looked up by the IDs and
 * hashes of those items. Items matching nothing in either pass are appended
 * to $this->newItems; items matching an article but differing from it are
 * stored in $this->changedItems, keyed by the article's database ID.
 *
 * @param int $feedID The database ID of the feed being updated
 * @return bool Always true
 */
protected function matchToDatabase(int $feedID): bool {
    // first perform deduplication on items
    $items = $this->deduplicateItems($this->data->items);
    // first pass: compare every item against as many of the latest articles in the database as there are in the feed
    $articles = Data::$db->articleMatchLatest($feedID, sizeof($items));
    $result = $this->matchItemsToArticles($items, array_keys($items), $articles);
    $edited = $result['edited'];
    $tentative = $result['unmatched'];
    $new = [];
    if(sizeof($tentative)) {
        // second pass: look in the database specifically for the IDs and hashes of the tentatively new items
        $ids = $hashesUT = $hashesUC = $hashesTC = [];
        foreach($tentative as $index) {
            $i = $items[$index];
            if($i->id) {
                $ids[] = $i->id;
            }
            $hashesUT[] = $i->urlTitleHash;
            $hashesUC[] = $i->urlContentHash;
            $hashesTC[] = $i->titleContentHash;
        }
        $articles = Data::$db->articleMatchIds($feedID, $ids, $hashesUT, $hashesUC, $hashesTC);
        $result = $this->matchItemsToArticles($items, $tentative, $articles);
        // the second pass only examines items the first pass left unmatched, so the keys cannot collide
        $edited += $result['edited'];
        // whatever is still unmatched is definitely new
        $new = $result['unmatched'];
    }
    // FIXME: fetch full content when appropriate
    foreach($new as $index) {
        $this->newItems[] = $items[$index];
    }
    foreach($edited as $index => $id) {
        $this->changedItems[$id] = $items[$index];
    }
    return true;
}

/**
 * Compares a selection of feed items against a set of database article rows.
 *
 * Each item is classified exactly once, after all rows have been examined
 * (or a match has been found), so an item can neither be listed twice nor
 * be lost when the set of rows is empty.
 *
 * @param array $items The deduplicated feed items
 * @param array $indices The keys of $items which are to be examined
 * @param array|\Traversable $articles Rows with the keys id, guid, edited_date, url_title_hash, url_content_hash and title_content_hash
 * @return array An array with two members: 'edited', mapping item index to
 *               the database ID of the article it modifies, and
 *               'unmatched', listing the indices of items matching no row
 */
private function matchItemsToArticles(array $items, array $indices, $articles): array {
    $edited = [];
    $unmatched = [];
    foreach($indices as $index) {
        $i = $items[$index];
        $found = false;
        foreach($articles as $a) {
            $isMatch =
                // the item matches if the GUID matches...
                ($i->id && $i->id === $a['guid']) ||
                // ... or if any one of the hashes match
                $i->urlTitleHash === $a['url_title_hash'] ||
                $i->urlContentHash === $a['url_content_hash'] ||
                $i->titleContentHash === $a['title_content_hash'];
            if(!$isMatch) {
                continue;
            }
            $found = true;
            // NOTE(review): assumes edited_date is returned as an integer Unix timestamp; if the
            // database layer returns a string this strict comparison always reports an edit - confirm
            $dateDiffers = $i->updatedDate && $i->updatedDate->getTimestamp() !== $a['edited_date'];
            $hashDiffers =
                $i->urlTitleHash !== $a['url_title_hash'] ||
                $i->urlContentHash !== $a['url_content_hash'] ||
                $i->titleContentHash !== $a['title_content_hash'];
            if($dateDiffers || $hashDiffers) {
                // the article has been edited: record the item index and database record ID as a key/value pair
                $edited[$index] = $a['id'];
            }
            // otherwise the item is unchanged and can be ignored; either way the first matching row settles it
            break;
        }
        if(!$found) {
            $unmatched[] = $index;
        }
    }
    return ['edited' => $edited, 'unmatched' => $unmatched];
}