Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Ignore resource WARC records for now #198

Merged
merged 1 commit into from
Mar 4, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 3 additions & 2 deletions src/warc2zim/converter.py
Original file line number Diff line number Diff line change
Expand Up @@ -485,7 +485,7 @@ def add_items_for_warc_record(self, record):
logger.debug(f"Skipping url {url}, outside included domains")
return

if record.rec_type != "revisit":
if record.rec_type == "response":
if self.is_self_redirect(record, url):
logger.debug("Skipping self-redirect: " + url)
return
Expand All @@ -510,7 +510,8 @@ def add_items_for_warc_record(self, record):
self.indexed_urls.add(normalized_url)

elif (
record.rec_headers["WARC-Refers-To-Target-URI"] != url
record.rec_type == "revisit"
and record.rec_headers["WARC-Refers-To-Target-URI"] != url
and normalized_url not in self.revisits
): # pragma: no branch
self.revisits[normalized_url] = normalize(
Expand Down
5 changes: 4 additions & 1 deletion tests/test_warc_to_zim.py
Original file line number Diff line number Diff line change
Expand Up @@ -159,7 +159,7 @@ def verify_warc_and_zim(self, warcfile, zimfile, verify_scraper_suffix):
# Ideally we would compare against the content of the targeted record,
# but that is difficult to test here because we don't have it.
assert payload
else:
elif record.rec_type == "response":
# We must have a payload
assert payload
payload_content = payload.content.tobytes()
Expand All @@ -168,6 +168,9 @@ def verify_warc_and_zim(self, warcfile, zimfile, verify_scraper_suffix):
# have exact match
if payload.mimetype.startswith("text/html"):
assert head_insert in payload_content
elif record.rec_type == "resource":
# We do not want to embed resource records "as-is", so no payload is expected.
assert not payload
mgautierfr marked this conversation as resolved.
Show resolved Hide resolved

warc_urls.add(url)

Expand Down
Loading