Browse Source

Deduplicate feed items within the feed itself

microsub
J. King 7 years ago
parent
commit
f842439b01
  1. 54
      lib/Feed.php

54
lib/Feed.php

@ -82,11 +82,57 @@ class Feed {
// If there aren't any of those there is no id.
$f->id = '';
}
$this->data = $feed;
// if a feedID is supplied, determine which items are already in the database, which are not, and which might have been edited
if(!is_null($feedID)) {
// FIXME: first perform deduplication on items
// array of items in the fetched feed
$items = $feed->items;
$this->matchToDatabase($feedID);
}
return true;
}
/**
 * Removes superseded versions of the same item from a list of feed items.
 *
 * Each item is expected to be an object exposing the properties read below:
 * id, urlTitleHash, urlContentHash, titleContentHash, updatedDate and
 * publishedDate (the two dates being falsy or objects with getTimestamp()).
 *
 * @param array $items The items as they appear in the fetched feed
 * @return array The items with duplicates collapsed; a surviving item keeps
 *               the position of the first version of it that was seen
 */
protected function deduplicateItems(array $items): array {
    /* Rationale:
        Some newsfeeds (notably Planet) include multiple versions of an
        item if it is updated. As we only care about the latest, we
        try to remove any "old" versions of an item that might also be
        present within the feed.
    */
    $out = [];
    foreach ($items as $item) {
        foreach ($out as $index => $check) {
            // if the two items have the same ID or any one hash matches, they are two versions of the same item;
            // an empty ID or hash means "not available" and must never count as a match, otherwise unrelated
            // items which merely lack the same piece of information would be collapsed into one;
            // strict comparison is required because loose comparison equates numeric-looking strings (e.g. "0e12" and "0e34")
            $sameItem =
                ($item->id && $check->id && $item->id === $check->id) ||
                ($item->urlTitleHash && $item->urlTitleHash === $check->urlTitleHash) ||
                ($item->urlContentHash && $item->urlContentHash === $check->urlContentHash) ||
                ($item->titleContentHash && $item->titleContentHash === $check->titleContentHash);
            if (!$sameItem) {
                continue;
            }
            // because newsfeeds are usually ordered newest-first, the later item should only be used if...
            $useLater =
                // the later item has an update date and the existing item does not
                ($item->updatedDate && !$check->updatedDate) ||
                // the later item has an update date newer than the existing item's
                ($item->updatedDate && $check->updatedDate && $item->updatedDate->getTimestamp() > $check->updatedDate->getTimestamp()) ||
                // neither item has update dates, both have publish dates, and the later item has a newer publish date
                (!$item->updatedDate && !$check->updatedDate && $item->publishedDate && $check->publishedDate && $item->publishedDate->getTimestamp() > $check->publishedDate->getTimestamp());
            if ($useLater) {
                // replace the existing version in place so that the feed's ordering is preserved
                $out[$index] = $item;
            }
            // either way the item has been dealt with: move on to the next item in the feed
            continue 2;
        }
        // if there was no match, add the item
        $out[] = $item;
    }
    return $out;
}
protected function matchToDatabase(int $feedID): bool {
// first perform deduplication on items
$items = $this->deduplicateItems($this->data->items);
// get as many of the latest articles in the database as there are in the feed
$articles = Data::$db->articleMatchLatest($feedID, sizeof($items));
// arrays holding new, edited, and tentatively new items; items may be tentatively new because we perform two passes
@ -170,8 +216,6 @@ class Feed {
foreach($edited as $index => $id) {
$this->changedItems[$id] = $items[$index];
}
}
$this->data = $feed;
return true;
}
}
Loading…
Cancel
Save