From 3054b3ff15817d3dc918cb656982d874e6e54b28 Mon Sep 17 00:00:00 2001 From: Matthieu Gautier Date: Wed, 25 Oct 2023 16:41:45 +0200 Subject: [PATCH 1/3] Do not detect clone entry as duplicated content. If two entries point to the same tuple (cluster_id, blob_id), we must not report them as duplicated content. --- src/zimcheck/checks.cpp | 23 ++++++++++++++++++----- 1 file changed, 18 insertions(+), 5 deletions(-) diff --git a/src/zimcheck/checks.cpp b/src/zimcheck/checks.cpp index 38e51931..ccafc663 100644 --- a/src/zimcheck/checks.cpp +++ b/src/zimcheck/checks.cpp @@ -14,6 +14,7 @@ #include #include #include +#include #include #include @@ -113,6 +114,11 @@ SortedMsgParams sortedMsgParams(const MsgParams& msgParams) return SortedMsgParams(msgParams.begin(), msgParams.end()); } +bool areAliases(const zim::Item& i1, const zim::Item& i2) +{ + return i1.getClusterIndex() == i2.getClusterIndex() && i1.getBlobIndex() == i2.getBlobIndex(); +} + } // unnamed namespace namespace JSON @@ -487,15 +493,22 @@ void ArticleChecker::detect_redundant_articles() progress.report(); auto l = it.second; while ( !l.empty() ) { - const auto e1 = archive.getEntryByPath(l.front()); + // The way we have constructed `l`, e1 MUST BE an item + const auto e1 = archive.getEntryByPath(l.front()).getItem(); l.pop_front(); if ( !l.empty() ) { - // The way we have constructed `l`, e1 MUST BEĀ an item - const std::string s1 = e1.getItem().getData(); + std::optional s1; decltype(l) articlesDifferentFromE1; for(auto other : l) { - auto e2 = archive.getEntryByPath(other); - std::string s2 = e2.getItem().getData(); + // The way we have constructed `l`, e2 MUST BE an item + const auto e2 = archive.getEntryByPath(other).getItem(); + if (areAliases(e1, e2)) { + continue; + } + if (!s1) { + s1 = e1.getData(); + } + std::string s2 = e2.getData(); if (s1 != s2 ) { articlesDifferentFromE1.push_back(other); continue; From f3d38b31ed2343fb4ac03fbd9793b3306e830833 Mon Sep 17 00:00:00 2001 From: Matthieu Gautier 
Date: Mon, 18 Dec 2023 14:12:35 +0100 Subject: [PATCH 2/3] Remove unnecessary specialization of std::hash. --- src/zimcheck/checks.cpp | 21 --------------------- 1 file changed, 21 deletions(-) diff --git a/src/zimcheck/checks.cpp b/src/zimcheck/checks.cpp index ccafc663..446544b0 100644 --- a/src/zimcheck/checks.cpp +++ b/src/zimcheck/checks.cpp @@ -18,27 +18,6 @@ #include #include -// Specialization of std::hash needed for our unordered_map. Can be removed in c++14 -namespace std { - template <> struct hash { - size_t operator() (const LogTag &t) const { return size_t(t); } - }; -} - -// Specialization of std::hash needed for our unordered_map. Can be removed in c++14 -namespace std { - template <> struct hash { - size_t operator() (const TestType &t) const { return size_t(t); } - }; -} - -// Specialization of std::hash needed for our unordered_map. Can be removed in c++14 -namespace std { - template <> struct hash { - size_t operator() (const MsgId &msgid) const { return size_t(msgid); } - }; -} - namespace { From 48616c5be7ed6207c2239bbae8e00c372f3f1af7 Mon Sep 17 00:00:00 2001 From: Matthieu Gautier Date: Mon, 18 Dec 2023 14:32:36 +0100 Subject: [PATCH 3/3] We need libzim version 9.1.0 --- meson.build | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/meson.build b/meson.build index 1f4b5954..2cdfbd51 100644 --- a/meson.build +++ b/meson.build @@ -19,7 +19,7 @@ if static_linkage endif endif -libzim_dep = dependency('libzim', version : '>=8.0.0', static:static_linkage) +libzim_dep = dependency('libzim', version : '>=9.1.0', static:static_linkage) with_xapian_support = compiler.has_header_symbol('zim/zim.h', 'LIBZIM_WITH_XAPIAN') find_library_in_compiler = meson.version().version_compare('>=0.31.0')