From ddb1cf3525b38a6c3a2b08c08fb2c36817186c4b Mon Sep 17 00:00:00 2001 From: Veloman Yunkan Date: Wed, 27 Jul 2022 15:53:33 +0400 Subject: [PATCH] zimcheck: better and faster redirect loop check With this implementation deep chains of redirections are not mis-reported as loops. Besides it is faster compared to the old implementation for the following reasons: - Redirection info is read from every entry/dirent exactly once; all subsequent processing is with minimal in-memory data required for the task. - When a standalone loop redirection is performed (for example, `zimcheck -L` with no other option) the auxiliary effient-order-to-by-path-order conversion table is not computed. In this case, in addition to shorter runtime, the memory usage is lower, too. --- src/zimcheck/checks.cpp | 104 ++++++++++++++++++++++++++++++++++++---- 1 file changed, 94 insertions(+), 10 deletions(-) diff --git a/src/zimcheck/checks.cpp b/src/zimcheck/checks.cpp index 98a72e57..ab48330c 100644 --- a/src/zimcheck/checks.cpp +++ b/src/zimcheck/checks.cpp @@ -702,22 +702,106 @@ void test_articles(const zim::Archive& archive, ErrorLogger& reporter, ProgressB } } -void test_redirect_loop(const zim::Archive& archive, ErrorLogger& reporter) { - reporter.infoMsg("[INFO] Checking for redirect loops..."); +namespace +{ + +class RedirectionTable +{ +private: // types + enum LoopStatus : uint8_t + { + UNKNOWN, + LOOP, + NONLOOP + }; + +public: // functions + explicit RedirectionTable(size_t entryCount) + { + loopStatus.reserve(entryCount); + redirTable.reserve(entryCount); + } + + void addRedirectionEntry(zim::entry_index_type targetEntryIndex) + { + redirTable.push_back(targetEntryIndex); + loopStatus.push_back(LoopStatus::UNKNOWN); + } + + void addItem() + { + redirTable.push_back(redirTable.size()); + loopStatus.push_back(LoopStatus::NONLOOP); + } + + size_t size() const { return redirTable.size(); } + + bool isInRedirectionLoop(zim::entry_index_type i) + { + if ( loopStatus[i] == UNKNOWN ) + { + resolveLoopStatus(i); + } - int chained_redirection_limit = 50; + return loopStatus[i] == LOOP; + } + +private: // functions + LoopStatus detectLoopStatus(zim::entry_index_type i) const + { + auto i1 = i; + auto i2 = i; + // Follow redirections until an entry with known loop status + // is found. + // i2 moves through redirections at twice the speed of i1 + // if i2 runs over i1 then they are both inside a redirection loop + for (bool moveI1 = false ; ; moveI1 = !moveI1) + { + if ( loopStatus[i2] != LoopStatus::UNKNOWN ) + return loopStatus[i2]; + + i2 = redirTable[i2]; - for(auto& entry: archive.iterEfficient()) + if ( i2 == i1 ) + return LoopStatus::LOOP; + + if ( moveI1 ) + i1 = redirTable[i1]; + } + } + + void resolveLoopStatus(zim::entry_index_type i) { - auto current_entry = entry; - int redirections_done = 0; - while(current_entry.isRedirect() && redirections_done < chained_redirection_limit) + const LoopStatus s = detectLoopStatus(i); + for ( ; loopStatus[i] == LoopStatus::UNKNOWN; i = redirTable[i] ) { - current_entry = current_entry.getRedirectEntry(); - redirections_done++; + loopStatus[i] = s; } + } - if(current_entry.isRedirect()){ +private: // data + std::vector redirTable; + std::vector loopStatus; +}; + +} // unnamed namespace + +void test_redirect_loop(const zim::Archive& archive, ErrorLogger& reporter) { + reporter.infoMsg("[INFO] Checking for redirect loops..."); + + RedirectionTable redirTable(archive.getAllEntryCount()); + for(const auto& entry: archive.iterByPath()) + { + if ( entry.isRedirect() ) + redirTable.addRedirectionEntry(entry.getRedirectEntryIndex()); + else + redirTable.addItem(); + } + + for(zim::entry_index_type i = 0; i < redirTable.size(); ++i ) + { + if(redirTable.isInRedirectionLoop(i)){ + const auto entry = archive.getEntryByPath(i); reporter.addMsg(MsgId::REDIRECT_LOOP, {{"entry_path", entry.getPath()}}); } }