Skip to content

Commit

Permalink
zimcheck: better and faster redirect loop check
Browse files Browse the repository at this point in the history
With this implementation deep chains of redirections are not
mis-reported as loops. Besides it is faster compared to the old
implementation for the following reasons:

- Redirection info is read from every entry/dirent exactly once;
  all subsequent processing is with minimal in-memory data required
  for the task.

- When a standalone loop redirection is performed (for example,
  `zimcheck -L` with no other option) the auxiliary
  effient-order-to-by-path-order conversion table is not computed.
  In this case, in addition to shorter runtime, the memory usage is
  lower, too.
  • Loading branch information
veloman-yunkan committed Jul 27, 2022
1 parent 1e0e9fa commit d87dd35
Showing 1 changed file with 90 additions and 10 deletions.
100 changes: 90 additions & 10 deletions src/zimcheck/checks.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -702,22 +702,102 @@ void test_articles(const zim::Archive& archive, ErrorLogger& reporter, ProgressB
}
}

void test_redirect_loop(const zim::Archive& archive, ErrorLogger& reporter) {
reporter.infoMsg("[INFO] Checking for redirect loops...");
namespace
{

class RedirectionTable
{
private: // types
enum LoopStatus : uint8_t
{
UNKNOWN,
LOOP,
NONLOOP
};

public: // functions

int chained_redirection_limit = 50;
void addRedirectionEntry(zim::entry_index_type targetEntryIndex)
{
redirTable.push_back(targetEntryIndex);
loopStatus.push_back(LoopStatus::UNKNOWN);
}

for(auto& entry: archive.iterEfficient())
void addItem()
{
auto current_entry = entry;
int redirections_done = 0;
while(current_entry.isRedirect() && redirections_done < chained_redirection_limit)
redirTable.push_back(redirTable.size());
loopStatus.push_back(LoopStatus::NONLOOP);
}

size_t size() const { return redirTable.size(); }

bool isInRedirectionLoop(zim::entry_index_type i)
{
if ( loopStatus[i] == UNKNOWN )
{
current_entry = current_entry.getRedirectEntry();
redirections_done++;
resolveLoopStatus(i);
}

if(current_entry.isRedirect()){
return loopStatus[i] == LOOP;
}

private: // functions

LoopStatus detectLoopStatus(zim::entry_index_type i) const
{
auto i1 = i;
auto i2 = i;
// Follow redirections until an entry with known loop status
// is found.
// i2 moves through redirections at twice the speed of i1
// if i2 runs over i1 then they are both inside a redirection loop
for (bool moveI1 = false ; ; moveI1 = !moveI1)
{
if ( loopStatus[i2] != LoopStatus::UNKNOWN )
return loopStatus[i2];

i2 = redirTable[i2];

if ( i2 == i1 )
return LoopStatus::LOOP;

if ( moveI1 )
i1 = redirTable[i1];
}
}

void resolveLoopStatus(zim::entry_index_type i)
{
const LoopStatus s = detectLoopStatus(i);
for ( ; loopStatus[i] == LoopStatus::UNKNOWN; i = redirTable[i] )
{
loopStatus[i] = s;
}
}

private: // data
std::deque<zim::entry_index_type> redirTable;
std::deque<LoopStatus> loopStatus;
};

} // unnamed namespace

void test_redirect_loop(const zim::Archive& archive, ErrorLogger& reporter) {
reporter.infoMsg("[INFO] Checking for redirect loops...");

RedirectionTable redirTable;
for(const auto& entry: archive.iterByPath())
{
if ( entry.isRedirect() )
redirTable.addRedirectionEntry(entry.getRedirectEntryIndex());
else
redirTable.addItem();
}

for(zim::entry_index_type i = 0; i < redirTable.size(); ++i )
{
if(redirTable.isInRedirectionLoop(i)){
const auto entry = archive.getEntryByPath(i);
reporter.addMsg(MsgId::REDIRECT_LOOP, {{"entry_path", entry.getPath()}});
}
}
Expand Down

0 comments on commit d87dd35

Please sign in to comment.