Skip to content

Commit

Permalink
Merge pull request #869 from openzim/url_to_path
Browse files Browse the repository at this point in the history
  • Loading branch information
mgautierfr committed Apr 2, 2024
2 parents 96afb38 + 54a3e6c commit 48da5b8
Show file tree
Hide file tree
Showing 22 changed files with 112 additions and 108 deletions.
2 changes: 1 addition & 1 deletion include/zim/zim.h
Original file line number Diff line number Diff line change
Expand Up @@ -95,7 +95,7 @@ namespace zim
CHECKSUM,

/**
* Checks that offsets in UrlPtrList are valid.
* Checks that offsets in PathPtrList are valid.
*/
DIRENT_PTRS,

Expand Down
16 changes: 8 additions & 8 deletions src/_dirent.h
Original file line number Diff line number Diff line change
Expand Up @@ -46,7 +46,7 @@ namespace zim

char ns;
std::string title;
std::string url;
std::string path;
std::string parameter;

public:
Expand Down Expand Up @@ -79,15 +79,15 @@ namespace zim
entry_index_t getRedirectIndex() const { return isRedirect() ? redirectIndex : entry_index_t(0); }

char getNamespace() const { return ns; }
const std::string& getTitle() const { return title.empty() ? url : title; }
const std::string& getUrl() const { return url; }
std::string getLongUrl() const;
const std::string &getTitle() const { return title.empty() ? path : title; }
const std::string &getPath() const { return path; }
std::string getLongPath() const;
const std::string& getParameter() const { return parameter; }

size_t getDirentSize() const
{
size_t ret = (isRedirect() ? 12 : 16) + url.size() + parameter.size() + 2;
if (title != url)
size_t ret = (isRedirect() ? 12 : 16) + path.size() + parameter.size() + 2;
if (title != path)
ret += title.size();
return ret;
}
Expand All @@ -97,10 +97,10 @@ namespace zim
title = title_;
}

void setUrl(char ns_, const std::string& url_)
void setPath(char ns_, const std::string &path_)
{
ns = ns_;
url = url_;
path = path_;
}

void setParameter(const std::string& parameter_)
Expand Down
2 changes: 1 addition & 1 deletion src/archive.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -136,7 +136,7 @@ namespace zim
auto end = m_impl->getNamespaceEndOffset('M');
for (auto idx=start; idx!=end; idx++) {
auto dirent = m_impl->getDirent(idx);
ret.push_back(dirent->getUrl());
ret.push_back(dirent->getPath());
}
return ret;
}
Expand Down
24 changes: 12 additions & 12 deletions src/dirent.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -76,21 +76,21 @@ namespace zim
dirent.setItem(mimeType, cluster_index_t(clusterNumber), blob_index_t(blobNumber));
}

std::string url;
std::string path;
std::string title;
std::string parameter;

log_debug("read url, title and parameters");
log_debug("read path, title and parameters");

size_type url_size = strnlen(
size_type path_size = strnlen(
reader.current(),
reader.left().v - extraLen
);
if (url_size >= reader.left().v) {
if (path_size >= reader.left().v) {
return false;
}
url = std::string(reader.current(), url_size);
reader.skip(zsize_t(url_size+1));
path = std::string(reader.current(), path_size);
reader.skip(zsize_t(path_size + 1));

size_type title_size = strnlen(
reader.current(),
Expand All @@ -106,7 +106,7 @@ namespace zim
return false;
}
parameter = std::string(reader.current(), extraLen);
dirent.setUrl(ns, url);
dirent.setPath(ns, path);
dirent.setTitle(title);
dirent.setParameter(parameter);
return true;
Expand All @@ -120,12 +120,12 @@ namespace zim
}

// We don't know the size of the dirent because it depends on the size of
// the title, url and extra parameters.
// the title, path and extra parameters.
// This is a pity but we have no choice.
// We cannot take a buffer the size of the file; it would be really
// inefficient. Let's try, catch and retry while choosing a smart value
// for the buffer size. Most dirents will be "Article" entries (header's size
// == 16) without extra parameters. Let's hope that url + title size will
// == 16) without extra parameters. Let's hope that path + title size will
// be < 256 and if not try again with a bigger size.

size_t bufferSize(std::min(size_type(256), mp_zimReader->size().v-offset.v));
Expand All @@ -139,12 +139,12 @@ namespace zim
}
}

std::string Dirent::getLongUrl() const
std::string Dirent::getLongPath() const
{
log_trace("Dirent::getLongUrl()");
log_trace("Dirent::getLongPath()");
log_debug("namespace=" << getNamespace() << " title=" << getTitle());

return std::string(1, getNamespace()) + '/' + getUrl();
return std::string(1, getNamespace()) + '/' + getPath();
}

}
9 changes: 6 additions & 3 deletions src/dirent_accessor.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -29,9 +29,12 @@

using namespace zim;

DirectDirentAccessor::DirectDirentAccessor(std::shared_ptr<DirentReader> direntReader, std::unique_ptr<const Reader> urlPtrReader, entry_index_t direntCount)
DirectDirentAccessor::DirectDirentAccessor(
std::shared_ptr<DirentReader> direntReader,
std::unique_ptr<const Reader> pathPtrReader,
entry_index_t direntCount)
: mp_direntReader(direntReader),
mp_urlPtrReader(std::move(urlPtrReader)),
mp_pathPtrReader(std::move(pathPtrReader)),
m_direntCount(direntCount),
m_direntCache(envValue("ZIM_DIRENTCACHE", DIRENT_CACHE_SIZE)),
m_bufferDirentZone(256)
Expand Down Expand Up @@ -60,7 +63,7 @@ offset_t DirectDirentAccessor::getOffset(entry_index_t idx) const
if (idx >= m_direntCount) {
throw std::out_of_range("entry index out of range");
}
offset_t offset(mp_urlPtrReader->read_uint<offset_type>(offset_t(sizeof(offset_type)*idx.v)));
offset_t offset(mp_pathPtrReader->read_uint<offset_type>(offset_t(sizeof(offset_type)*idx.v)));
return offset;
}

Expand Down
6 changes: 4 additions & 2 deletions src/dirent_accessor.h
Original file line number Diff line number Diff line change
Expand Up @@ -45,7 +45,9 @@ class DirentReader;
class DirectDirentAccessor
{
public: // functions
DirectDirentAccessor(std::shared_ptr<DirentReader> direntReader, std::unique_ptr<const Reader> urlPtrReader, entry_index_t direntCount);
DirectDirentAccessor(std::shared_ptr<DirentReader> direntReader,
std::unique_ptr<const Reader> pathPtrReader,
entry_index_t direntCount);

offset_t getOffset(entry_index_t idx) const;
std::shared_ptr<const Dirent> getDirent(entry_index_t idx) const;
Expand All @@ -56,7 +58,7 @@ class DirectDirentAccessor

private: // data
std::shared_ptr<DirentReader> mp_direntReader;
std::unique_ptr<const Reader> mp_urlPtrReader;
std::unique_ptr<const Reader> mp_pathPtrReader;
entry_index_t m_direntCount;

mutable lru_cache<entry_index_type, std::shared_ptr<const Dirent>> m_direntCache;
Expand Down
4 changes: 2 additions & 2 deletions src/entry.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -43,9 +43,9 @@ std::string Entry::getTitle() const
std::string Entry::getPath() const
{
if (m_file->hasNewNamespaceScheme()) {
return m_dirent->getUrl();
return m_dirent->getPath();
} else {
return m_dirent->getLongUrl();
return m_dirent->getLongPath();
}
}

Expand Down
8 changes: 4 additions & 4 deletions src/fileheader.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -53,7 +53,7 @@ namespace zim
std::copy(getUuid().data, getUuid().data + sizeof(Uuid), header + 8);
toLittleEndian(getArticleCount(), header + 24);
toLittleEndian(getClusterCount(), header + 28);
toLittleEndian(getUrlPtrPos(), header + 32);
toLittleEndian(getPathPtrPos(), header + 32);
toLittleEndian(getTitleIdxPos(), header + 40);
toLittleEndian(getClusterPtrPos(), header + 48);
toLittleEndian(getMimeListPos(), header + 56);
Expand Down Expand Up @@ -100,7 +100,7 @@ namespace zim

setArticleCount(seqReader.read<uint32_t>());
setClusterCount(seqReader.read<uint32_t>());
setUrlPtrPos(seqReader.read<uint64_t>());
setPathPtrPos(seqReader.read<uint64_t>());
setTitleIdxPos(seqReader.read<uint64_t>());
setClusterPtrPos(seqReader.read<uint64_t>());
setMimeListPos(seqReader.read<uint64_t>());
Expand All @@ -120,8 +120,8 @@ namespace zim
throw ZimFileFormatError("mimelistPos must be 80.");
}

if (urlPtrPos < mimeListPos) {
throw ZimFileFormatError("urlPtrPos must be > mimelistPos.");
if (pathPtrPos < mimeListPos) {
throw ZimFileFormatError("pathPtrPos must be > mimelistPos.");
}
if (titleIdxPos < mimeListPos) {
throw ZimFileFormatError("titleIdxPos must be > mimelistPos.");
Expand Down
8 changes: 4 additions & 4 deletions src/fileheader.h
Original file line number Diff line number Diff line change
Expand Up @@ -48,7 +48,7 @@ namespace zim
Uuid uuid;
entry_index_type articleCount;
offset_type titleIdxPos;
offset_type urlPtrPos;
offset_type pathPtrPos;
offset_type mimeListPos;
cluster_index_type clusterCount;
offset_type clusterPtrPos;
Expand All @@ -62,7 +62,7 @@ namespace zim
minorVersion(zimMinorVersion),
articleCount(0),
titleIdxPos(0),
urlPtrPos(0),
pathPtrPos(0),
clusterCount(0),
clusterPtrPos(0),
mainPage(std::numeric_limits<entry_index_type>::max()),
Expand Down Expand Up @@ -92,8 +92,8 @@ namespace zim
offset_type getTitleIdxPos() const { return titleIdxPos; }
void setTitleIdxPos(offset_type p) { titleIdxPos = p; }

offset_type getUrlPtrPos() const { return urlPtrPos; }
void setUrlPtrPos(offset_type p) { urlPtrPos = p; }
offset_type getPathPtrPos() const { return pathPtrPos; }
void setPathPtrPos(offset_type p) { pathPtrPos = p; }

offset_type getMimeListPos() const { return mimeListPos; }
void setMimeListPos(offset_type p) { mimeListPos = p; }
Expand Down
Loading

0 comments on commit 48da5b8

Please sign in to comment.