Skip to content

Commit

Permalink
Really do not consider 'resource' WARC record for all operations
Browse files Browse the repository at this point in the history
  • Loading branch information
benoit74 committed Mar 18, 2024
1 parent 60d174b commit 4068d85
Show file tree
Hide file tree
Showing 2 changed files with 10 additions and 0 deletions.
10 changes: 10 additions & 0 deletions src/warc2zim/converter.py
Original file line number Diff line number Diff line change
Expand Up @@ -292,6 +292,12 @@ def iter_all_warc_records(self):
def gather_information_from_warc(self):
main_page_found = False
for record in iter_warc_records(self.inputs):

# only response and revisit records can be considered as main_path and as
# existing ZIM path
if record.rec_type not in ("response", "revisit"):
continue

url = get_record_url(record)
normalized_url = normalize(url)

Expand Down Expand Up @@ -500,6 +506,10 @@ def is_self_redirect(self, record, url):
return normalize(url) == normalize(location)

def add_items_for_warc_record(self, record):

if record.rec_type not in ("response", "revisit"):
return

url = get_record_url(record)
normalized_url = normalize(url)
if not url:
Expand Down
Binary file removed tests/data/example-resource.warc.gz
Binary file not shown.

0 comments on commit 4068d85

Please sign in to comment.