Skip to content

Commit

Permalink
Merge pull request #370 from openzim/add_warc_tar
Browse files Browse the repository at this point in the history
Add support for tar files in --warcs
  • Loading branch information
benoit74 committed Aug 12, 2024
2 parents be1e2d6 + af48be8 commit d0d0c6e
Show file tree
Hide file tree
Showing 2 changed files with 8 additions and 4 deletions.
4 changes: 4 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,10 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0

## [Unreleased]

### Added

- Add support for uncompressed tar archives in --warcs (#369)

## [2.1.0] - 2024-08-09

### Added
Expand Down
8 changes: 4 additions & 4 deletions src/zimit/zimit.py
Original file line number Diff line number Diff line change
Expand Up @@ -368,7 +368,7 @@ def run(raw_args):
"--warcs",
help="Directly convert WARC archives to ZIM, by-passing the crawling phase. "
"This argument must contain the path or HTTP(S) URL to either warc.gz files or"
"to a tar.gz containing the warc.gz files. Single value with individual "
"to a tar or tar.gz containing the warc.gz files. Single value with individual "
"path/URLs separated by comma",
)

Expand Down Expand Up @@ -517,7 +517,7 @@ def cleanup():
warc_location.strip() for warc_location in zimit_args.warcs.split(",")
]:
suffix = "".join(Path(urllib.parse.urlparse(warc_location).path).suffixes)
if suffix not in {".tar.gz", ".warc", ".warc.gz"}:
if suffix not in {".tar", ".tar.gz", ".warc", ".warc.gz"}:
raise Exception(f"Unsupported file at {warc_location}")

filename = tempfile.NamedTemporaryFile(
Expand All @@ -542,7 +542,7 @@ def cleanup():
logger.info(
f"Extracting WARC(s) from {warc_location} to {extract_path}"
)
with tarfile.open(warc_location, "r:gz") as fh:
with tarfile.open(warc_location, "r") as fh:
# Extract all the contents to the specified directory
fh.extractall(path=extract_path, filter="data")
warc_files.append(Path(extract_path))
Expand All @@ -564,7 +564,7 @@ def cleanup():
# otherwise extract the tar or tar.gz archive and delete it afterwards
extract_path = temp_root_dir / f"{filename.name}_files"
logger.info(f"Extracting WARC(s) from {warc_file} to {extract_path}")
with tarfile.open(warc_file, "r:gz") as fh:
with tarfile.open(warc_file, "r") as fh:
# Extract all the contents to the specified directory
fh.extractall(path=extract_path, filter="data")
logger.info(f"Deleting archive at {warc_file}")
Expand Down

0 comments on commit d0d0c6e

Please sign in to comment.