From 6771e8916a8858b3f2ee921d7fff730db44f86b2 Mon Sep 17 00:00:00 2001 From: "J. King" Date: Tue, 23 May 2017 22:15:57 -0400 Subject: [PATCH] Fixed more Feed bugs - Two items with differing IDs but identical hashes were matching in the deduplicator; they shouldn't - They would also match in the database matcher, and shouldn't - The second-pass database matcher was overaggressively finding items as new --- lib/Feed.php | 9 +++-- tests/Feed/TestFeed.php | 3 ++ .../Feed/Deduplication/IdenticalHashes.php | 37 +++++++++++++++++++ 3 files changed, 46 insertions(+), 3 deletions(-) create mode 100644 tests/docroot/Feed/Deduplication/IdenticalHashes.php diff --git a/lib/Feed.php b/lib/Feed.php index 7c89dcf..3c48f54 100644 --- a/lib/Feed.php +++ b/lib/Feed.php @@ -139,6 +139,8 @@ class Feed { $out = []; foreach($items as $item) { foreach($out as $index => $check) { + // if the two items both have IDs and they differ, they do not match, regardless of hashes + if($item->id && $check->id && $item->id != $check->id) continue; // if the two items have the same ID or any one hash matches, they are two versions of the same item if( ($item->id && $check->id && $item->id == $check->id) || @@ -185,6 +187,8 @@ class Feed { foreach($items as $index => $i) { $found = false; foreach($articles as $a) { + // if the item has an ID and it doesn't match the article ID, the two don't match, regardless of hashes + if($i->id && $i->id !== $a['guid']) continue; if( // the item matches if the GUID matches... ($i->id && $i->id === $a['guid']) || @@ -228,6 +232,8 @@ class Feed { $i = $items[$index]; $found = false; foreach($articles as $a) { + // if the item has an ID and it doesn't match the article ID, the two don't match, regardless of hashes + if($i->id && $i->id !== $a['guid']) continue; if( // the item matches if the GUID matches... ($i->id && $i->id === $a['guid']) || @@ -252,9 +258,6 @@ class Feed { $found = true; break; } - } else { - // if we don't have a match, add the item to the definite new list - $new[] = $index; } } if(!$found) $new[] = $index; diff --git a/tests/Feed/TestFeed.php b/tests/Feed/TestFeed.php index dd294a2..0bd9bef 100644 --- a/tests/Feed/TestFeed.php +++ b/tests/Feed/TestFeed.php @@ -27,6 +27,9 @@ class TestFeed extends \PHPUnit\Framework\TestCase { $f = new Feed(null, $this->base."Deduplication/ID-Dates"); $this->assertCount(2, $f->newItems); $this->assertTime($t, $f->newItems[0]->updatedDate); + $f = new Feed(null, $this->base."Deduplication/IdenticalHashes"); + $this->assertCount(2, $f->newItems); + $this->assertTime($t, $f->newItems[0]->updatedDate); } function testHandleCacheHeadersOn304() { diff --git a/tests/docroot/Feed/Deduplication/IdenticalHashes.php b/tests/docroot/Feed/Deduplication/IdenticalHashes.php new file mode 100644 index 0000000..a512631 --- /dev/null +++ b/tests/docroot/Feed/Deduplication/IdenticalHashes.php @@ -0,0 +1,37 @@ + "application/rss+xml", + 'content' => << + + Test feed + http://example.com/ + A basic feed for testing + + + 1 + Sample article 2 + Sun, 19 May 2002 15:21:36 GMT + 2002-04-19T15:21:36Z + + + 1 + Sample article 2 + Sun, 19 May 2002 15:21:36 GMT + 2002-04-19T15:21:36Z + + + 1 + Sample article 2 + Sun, 19 May 2002 15:21:36 GMT + 2002-04-19T15:21:36Z + + + 2 + Sample article 2 + Sun, 19 May 2002 15:21:36 GMT + 2002-04-19T15:21:36Z + + + +MESSAGE_BODY +]; \ No newline at end of file