Skip to content

Commit

Permalink
Do not detect clone entries as duplicated content.
Browse files Browse the repository at this point in the history
If two entries point to the same tuple (cluster_id, blob_id), we must
not report them as duplicated content.
  • Loading branch information
mgautierfr committed Dec 18, 2023
1 parent 6875d84 commit a4c4d2b
Showing 1 changed file with 18 additions and 5 deletions.
23 changes: 18 additions & 5 deletions src/zimcheck/checks.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@
#include <mutex>
#include <thread>
#include <queue>
#include <optional>
#include <zim/archive.h>
#include <zim/item.h>

Expand Down Expand Up @@ -113,6 +114,11 @@ SortedMsgParams sortedMsgParams(const MsgParams& msgParams)
return SortedMsgParams(msgParams.begin(), msgParams.end());
}

// Returns true when both items report the same cluster index and the same
// blob index; such items are treated as aliases of one another rather than
// as two distinct copies of the same content.
bool areAliases(const zim::Item& i1, const zim::Item& i2)
{
  // Different clusters: no need to look at the blob indices at all.
  if ( i1.getClusterIndex() != i2.getClusterIndex() ) {
    return false;
  }
  return i1.getBlobIndex() == i2.getBlobIndex();
}

} // unnamed namespace

namespace JSON
Expand Down Expand Up @@ -487,15 +493,22 @@ void ArticleChecker::detect_redundant_articles()
progress.report();
auto l = it.second;
while ( !l.empty() ) {
const auto e1 = archive.getEntryByPath(l.front());
// The way we have constructed `l`, e1 MUST BE an item
const auto e1 = archive.getEntryByPath(l.front()).getItem();
l.pop_front();
if ( !l.empty() ) {
// The way we have constructed `l`, e1 MUST BE an item
const std::string s1 = e1.getItem().getData();
std::optional<std::string> s1;
decltype(l) articlesDifferentFromE1;
for(auto other : l) {
auto e2 = archive.getEntryByPath(other);
std::string s2 = e2.getItem().getData();
// The way we have constructed `l`, e2 MUST BE an item
const auto e2 = archive.getEntryByPath(other).getItem();
if (areAliases(e1, e2)) {
continue;
}
if (!s1) {
s1 = e1.getData();
}
std::string s2 = e2.getData();
if (s1 != s2 ) {
articlesDifferentFromE1.push_back(other);
continue;
Expand Down

0 comments on commit a4c4d2b

Please sign in to comment.