|
|
@ -82,17 +82,104 @@ class Feed { |
|
|
|
// If there aren't any of those there is no id. |
|
|
|
$f->id = ''; |
|
|
|
} |
|
|
|
$this->data = $feed; |
|
|
|
// if a feedID is supplied, determine which items are already in the database, which are not, and which might have been edited |
|
|
|
if(!is_null($feedID)) { |
|
|
|
// FIXME: first perform deduplication on items |
|
|
|
// array if items in the fetched feed |
|
|
|
$items = $feed->items; |
|
|
|
// get as many of the latest articles in the database as there are in the feed |
|
|
|
$articles = Data::$db->articleMatchLatest($feedID, sizeof($items)); |
|
|
|
// arrays holding new, edited, and tentatively new items; items may be tentatively new because we perform two passes |
|
|
|
$new = $tentative = $edited = []; |
|
|
|
// iterate through the articles and for each determine whether it is existing, edited, or entirely new |
|
|
|
foreach($items as $index => $i) { |
|
|
|
$this->matchToDatabase($feedID); |
|
|
|
} |
|
|
|
return true; |
|
|
|
} |
|
|
|
|
|
|
|
/**
 * Removes superseded versions of items which appear more than once in a feed.
 *
 * Rationale: some newsfeeds (notably Planet) include multiple versions of an
 * item if it is updated. As we only care about the latest, we try to remove
 * any "old" versions of an item that might also be present within the feed.
 *
 * Two items are treated as versions of the same item when they share a
 * non-empty ID, or when any one of their three hashes is present and
 * identical. Comparisons are strict: loose comparison would evaluate two
 * numeric-looking strings (e.g. "0e1234" and "0e5678") as numbers and report
 * unrelated items as duplicates.
 *
 * @param array $items Feed items in feed order; each must expose the
 *                     properties id, urlTitleHash, urlContentHash,
 *                     titleContentHash, updatedDate and publishedDate
 *                     (dates being objects with getTimestamp(), or falsy)
 * @return array The surviving items, re-indexed from zero; a retained item
 *               keeps the position of the first version encountered
 */
protected function deduplicateItems(array $items): array {
    $out = [];
    foreach($items as $item) {
        foreach($out as $index => $check) {
            // an absent ID or hash is not evidence that two items are the same,
            // so every criterion requires a value before comparing
            $sameItem =
                ($item->id && $check->id && $item->id === $check->id) ||
                ($item->urlTitleHash     && $item->urlTitleHash     === $check->urlTitleHash) ||
                ($item->urlContentHash   && $item->urlContentHash   === $check->urlContentHash) ||
                ($item->titleContentHash && $item->titleContentHash === $check->titleContentHash);
            if(!$sameItem) {
                continue;
            }
            // because newsfeeds are usually ordered newest-first, the later item should only be used if...
            $useLater =
                // the later item has an update date and the existing item does not
                ($item->updatedDate && !$check->updatedDate) ||
                // the later item has an update date newer than the existing item's
                ($item->updatedDate && $check->updatedDate && $item->updatedDate->getTimestamp() > $check->updatedDate->getTimestamp()) ||
                // neither item has update dates, both have publish dates, and the later item has a newer publish date
                (!$item->updatedDate && !$check->updatedDate && $item->publishedDate && $check->publishedDate && $item->publishedDate->getTimestamp() > $check->publishedDate->getTimestamp());
            if($useLater) {
                // the later item supersedes the one already kept
                $out[$index] = $item;
            }
            // either way the item has been accounted for: move on to the next one
            continue 2;
        }
        // if there was no match, add the item
        $out[] = $item;
    }
    return $out;
}
|
|
|
|
|
|
|
protected function matchToDatabase(int $feedID): bool { |
|
|
|
// first perform deduplication on items |
|
|
|
$items = $this->deduplicateItems($this->data->items); |
|
|
|
// get as many of the latest articles in the database as there are in the feed |
|
|
|
$articles = Data::$db->articleMatchLatest($feedID, sizeof($items)); |
|
|
|
// arrays holding new, edited, and tentatively new items; items may be tentatively new because we perform two passes |
|
|
|
$new = $tentative = $edited = []; |
|
|
|
// iterate through the articles and for each determine whether it is existing, edited, or entirely new |
|
|
|
foreach($items as $index => $i) { |
|
|
|
foreach($articles as $a) { |
|
|
|
if( |
|
|
|
// the item matches if the GUID matches... |
|
|
|
($i->id && $i->id === $a['guid']) || |
|
|
|
// ... or if any one of the hashes match |
|
|
|
$i->urlTitleHash === $a['url_title_hash'] || |
|
|
|
$i->urlContentHash === $a['url_content_hash'] || |
|
|
|
$i->titleContentHash === $a['title_content_hash'] |
|
|
|
) { |
|
|
|
if($i->updatedDate && $i->updatedDate->getTimestamp() !== $match['edited_date']) { |
|
|
|
// if the item has an edit timestamp and it doesn't match that of the article in the database, the the article has been edited |
|
|
|
// we store the item index and database record ID as a key/value pair |
|
|
|
$edited[$index] = $a['id']; |
|
|
|
break; |
|
|
|
} else if($i->urlTitleHash !== $a['url_title_hash'] || $i->urlContentHash !== $a['url_content_hash'] || $i->titleContentHash !== $a['title_content_hash']) { |
|
|
|
// if any of the hashes do not match, then the article has been edited |
|
|
|
$edited[$index] = $a['id']; |
|
|
|
break; |
|
|
|
} else { |
|
|
|
// otherwise the item is unchanged and we can ignore it |
|
|
|
break; |
|
|
|
} |
|
|
|
} else { |
|
|
|
// if we don't have a match, add the item to the tentatively new list |
|
|
|
$tentative[] = $index; |
|
|
|
} |
|
|
|
} |
|
|
|
} |
|
|
|
if(sizeof($tentative)) { |
|
|
|
// if we need to, perform a second pass on the database looking specifically for IDs and hashes of the new items |
|
|
|
$ids = $hashesUT = $hashesUC = $hashesTC = []; |
|
|
|
foreach($tentative as $index) { |
|
|
|
$i = $items[$index]; |
|
|
|
if($i->id) $ids[] = $id->id; |
|
|
|
$hashesUT[] = $i->urlTitleHash; |
|
|
|
$hashesUC[] = $i->urlContentHash; |
|
|
|
$hashesTC[] = $i->titleContentHash; |
|
|
|
} |
|
|
|
$articles = Data::$db->articleMatchIds($feedID, $ids, $hashesUT, $hashesUC, $hashesTC); |
|
|
|
foreach($tentative as $index) { |
|
|
|
$i = $items[$index]; |
|
|
|
foreach($articles as $a) { |
|
|
|
if( |
|
|
|
// the item matches if the GUID matches... |
|
|
@ -116,62 +203,19 @@ class Feed { |
|
|
|
break; |
|
|
|
} |
|
|
|
} else { |
|
|
|
// if we don't have a match, add the item to the tentatively new list |
|
|
|
$tentative[] = $index; |
|
|
|
// if we don't have a match, add the item to the definite new list |
|
|
|
$new[] = $index; |
|
|
|
} |
|
|
|
} |
|
|
|
} |
|
|
|
if(sizeof($tentative)) { |
|
|
|
// if we need to, perform a second pass on the database looking specifically for IDs and hashes of the new items |
|
|
|
$ids = $hashesUT = $hashesUC = $hashesTC = []; |
|
|
|
foreach($tentative as $index) { |
|
|
|
$i = $items[$index]; |
|
|
|
if($i->id) $ids[] = $id->id; |
|
|
|
$hashesUT[] = $i->urlTitleHash; |
|
|
|
$hashesUC[] = $i->urlContentHash; |
|
|
|
$hashesTC[] = $i->titleContentHash; |
|
|
|
} |
|
|
|
$articles = Data::$db->articleMatchIds($feedID, $ids, $hashesUT, $hashesUC, $hashesTC); |
|
|
|
foreach($tentative as $index) { |
|
|
|
$i = $items[$index]; |
|
|
|
foreach($articles as $a) { |
|
|
|
if( |
|
|
|
// the item matches if the GUID matches... |
|
|
|
($i->id && $i->id === $a['guid']) || |
|
|
|
// ... or if any one of the hashes match |
|
|
|
$i->urlTitleHash === $a['url_title_hash'] || |
|
|
|
$i->urlContentHash === $a['url_content_hash'] || |
|
|
|
$i->titleContentHash === $a['title_content_hash'] |
|
|
|
) { |
|
|
|
if($i->updatedDate && $i->updatedDate->getTimestamp() !== $match['edited_date']) { |
|
|
|
// if the item has an edit timestamp and it doesn't match that of the article in the database, the the article has been edited |
|
|
|
// we store the item index and database record ID as a key/value pair |
|
|
|
$edited[$index] = $a['id']; |
|
|
|
break; |
|
|
|
} else if($i->urlTitleHash !== $a['url_title_hash'] || $i->urlContentHash !== $a['url_content_hash'] || $i->titleContentHash !== $a['title_content_hash']) { |
|
|
|
// if any of the hashes do not match, then the article has been edited |
|
|
|
$edited[$index] = $a['id']; |
|
|
|
break; |
|
|
|
} else { |
|
|
|
// otherwise the item is unchanged and we can ignore it |
|
|
|
break; |
|
|
|
} |
|
|
|
} else { |
|
|
|
// if we don't have a match, add the item to the definite new list |
|
|
|
$new[] = $index; |
|
|
|
} |
|
|
|
} |
|
|
|
} |
|
|
|
} |
|
|
|
// FIXME: fetch full content when appropriate |
|
|
|
foreach($new as $index) { |
|
|
|
$this->newItems[] = $items[$index]; |
|
|
|
} |
|
|
|
foreach($edited as $index => $id) { |
|
|
|
$this->changedItems[$id] = $items[$index]; |
|
|
|
} |
|
|
|
} |
|
|
|
$this->data = $feed; |
|
|
|
// FIXME: fetch full content when appropriate |
|
|
|
foreach($new as $index) { |
|
|
|
$this->newItems[] = $items[$index]; |
|
|
|
} |
|
|
|
foreach($edited as $index => $id) { |
|
|
|
$this->changedItems[$id] = $items[$index]; |
|
|
|
} |
|
|
|
return true; |
|
|
|
} |
|
|
|
} |