Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

zimcheck: better and faster redirect loop check #312

Merged
merged 1 commit into from
Aug 12, 2022
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
104 changes: 94 additions & 10 deletions src/zimcheck/checks.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -702,22 +702,106 @@ void test_articles(const zim::Archive& archive, ErrorLogger& reporter, ProgressB
}
}

void test_redirect_loop(const zim::Archive& archive, ErrorLogger& reporter) {
reporter.infoMsg("[INFO] Checking for redirect loops...");
namespace
{

class RedirectionTable
{
private: // types
enum LoopStatus : uint8_t
{
UNKNOWN,
LOOP,
NONLOOP
};

public: // functions
explicit RedirectionTable(size_t entryCount)
{
loopStatus.reserve(entryCount);
redirTable.reserve(entryCount);
}

void addRedirectionEntry(zim::entry_index_type targetEntryIndex)
{
redirTable.push_back(targetEntryIndex);
loopStatus.push_back(LoopStatus::UNKNOWN);
}

void addItem()
{
redirTable.push_back(redirTable.size());
loopStatus.push_back(LoopStatus::NONLOOP);
}

size_t size() const { return redirTable.size(); }

bool isInRedirectionLoop(zim::entry_index_type i)
{
if ( loopStatus[i] == UNKNOWN )
{
resolveLoopStatus(i);
}

int chained_redirection_limit = 50;
return loopStatus[i] == LOOP;
}

private: // functions
LoopStatus detectLoopStatus(zim::entry_index_type i) const
{
auto i1 = i;
auto i2 = i;
// Follow redirections until an entry with known loop status
// is found.
// i2 moves through redirections at twice the speed of i1
// if i2 runs over i1 then they are both inside a redirection loop
for (bool moveI1 = false ; ; moveI1 = !moveI1)
{
if ( loopStatus[i2] != LoopStatus::UNKNOWN )
return loopStatus[i2];

i2 = redirTable[i2];

for(auto& entry: archive.iterEfficient())
if ( i2 == i1 )
return LoopStatus::LOOP;

if ( moveI1 )
i1 = redirTable[i1];
}
}

void resolveLoopStatus(zim::entry_index_type i)
{
auto current_entry = entry;
int redirections_done = 0;
while(current_entry.isRedirect() && redirections_done < chained_redirection_limit)
const LoopStatus s = detectLoopStatus(i);
for ( ; loopStatus[i] == LoopStatus::UNKNOWN; i = redirTable[i] )
{
current_entry = current_entry.getRedirectEntry();
redirections_done++;
loopStatus[i] = s;
}
}

if(current_entry.isRedirect()){
private: // data
std::vector<zim::entry_index_type> redirTable;
std::vector<LoopStatus> loopStatus;
};

} // unnamed namespace

void test_redirect_loop(const zim::Archive& archive, ErrorLogger& reporter) {
reporter.infoMsg("[INFO] Checking for redirect loops...");

RedirectionTable redirTable(archive.getAllEntryCount());
for(const auto& entry: archive.iterByPath())
{
if ( entry.isRedirect() )
redirTable.addRedirectionEntry(entry.getRedirectEntryIndex());
else
redirTable.addItem();
}

for(zim::entry_index_type i = 0; i < redirTable.size(); ++i )
{
if(redirTable.isInRedirectionLoop(i)){
const auto entry = archive.getEntryByPath(i);
reporter.addMsg(MsgId::REDIRECT_LOOP, {{"entry_path", entry.getPath()}});
}
}
Expand Down