Skip to content

Commit

Permalink
zimcheck: better and faster redirect loop check
Browse files Browse the repository at this point in the history
With this implementation, deep chains of redirections are no longer
mis-reported as loops. It is also faster than the old
implementation, for the following reasons:

- Redirection info is read from every entry/dirent exactly once;
  all subsequent processing uses only the minimal in-memory data
  required for the task.

- When a standalone redirect loop check is performed (for example,
  `zimcheck -L` with no other option) the auxiliary
  efficient-order-to-by-path-order conversion table is not computed.
  In this case, in addition to the shorter runtime, the memory usage
  is lower, too.
  • Loading branch information
veloman-yunkan committed Aug 12, 2022
1 parent e7853d1 commit ddb1cf3
Showing 1 changed file with 94 additions and 10 deletions.
104 changes: 94 additions & 10 deletions src/zimcheck/checks.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -702,22 +702,106 @@ void test_articles(const zim::Archive& archive, ErrorLogger& reporter, ProgressB
}
}

void test_redirect_loop(const zim::Archive& archive, ErrorLogger& reporter) {
reporter.infoMsg("[INFO] Checking for redirect loops...");
namespace
{

class RedirectionTable
{
private: // types
enum LoopStatus : uint8_t
{
UNKNOWN,
LOOP,
NONLOOP
};

public: // functions
explicit RedirectionTable(size_t entryCount)
{
loopStatus.reserve(entryCount);
redirTable.reserve(entryCount);
}

void addRedirectionEntry(zim::entry_index_type targetEntryIndex)
{
redirTable.push_back(targetEntryIndex);
loopStatus.push_back(LoopStatus::UNKNOWN);
}

void addItem()
{
redirTable.push_back(redirTable.size());
loopStatus.push_back(LoopStatus::NONLOOP);
}

size_t size() const { return redirTable.size(); }

bool isInRedirectionLoop(zim::entry_index_type i)
{
if ( loopStatus[i] == UNKNOWN )
{
resolveLoopStatus(i);
}

int chained_redirection_limit = 50;
return loopStatus[i] == LOOP;
}

private: // functions
LoopStatus detectLoopStatus(zim::entry_index_type i) const
{
auto i1 = i;
auto i2 = i;
// Follow redirections until an entry with known loop status
// is found.
// i2 moves through redirections at twice the speed of i1
// if i2 runs over i1 then they are both inside a redirection loop
for (bool moveI1 = false ; ; moveI1 = !moveI1)
{
if ( loopStatus[i2] != LoopStatus::UNKNOWN )
return loopStatus[i2];

i2 = redirTable[i2];

for(auto& entry: archive.iterEfficient())
if ( i2 == i1 )
return LoopStatus::LOOP;

if ( moveI1 )
i1 = redirTable[i1];
}
}

void resolveLoopStatus(zim::entry_index_type i)
{
auto current_entry = entry;
int redirections_done = 0;
while(current_entry.isRedirect() && redirections_done < chained_redirection_limit)
const LoopStatus s = detectLoopStatus(i);
for ( ; loopStatus[i] == LoopStatus::UNKNOWN; i = redirTable[i] )
{
current_entry = current_entry.getRedirectEntry();
redirections_done++;
loopStatus[i] = s;
}
}

if(current_entry.isRedirect()){
private: // data
std::vector<zim::entry_index_type> redirTable;
std::vector<LoopStatus> loopStatus;
};

} // unnamed namespace

void test_redirect_loop(const zim::Archive& archive, ErrorLogger& reporter) {
reporter.infoMsg("[INFO] Checking for redirect loops...");

RedirectionTable redirTable(archive.getAllEntryCount());
for(const auto& entry: archive.iterByPath())
{
if ( entry.isRedirect() )
redirTable.addRedirectionEntry(entry.getRedirectEntryIndex());
else
redirTable.addItem();
}

for(zim::entry_index_type i = 0; i < redirTable.size(); ++i )
{
if(redirTable.isInRedirectionLoop(i)){
const auto entry = archive.getEntryByPath(i);
reporter.addMsg(MsgId::REDIRECT_LOOP, {{"entry_path", entry.getPath()}});
}
}
Expand Down

0 comments on commit ddb1cf3

Please sign in to comment.