From ddbf74e81b3b16eca8ada90326af187759be2b6c Mon Sep 17 00:00:00 2001 From: benoit74 Date: Fri, 2 Aug 2024 09:55:55 +0000 Subject: [PATCH] Replace os.path with pathlib --- src/warc2zim/converter.py | 3 +-- tests/test_warc_to_zim.py | 54 +++++++++++++++++---------------------- 2 files changed, 25 insertions(+), 32 deletions(-) diff --git a/src/warc2zim/converter.py b/src/warc2zim/converter.py index b5977d1..fa0e739 100644 --- a/src/warc2zim/converter.py +++ b/src/warc2zim/converter.py @@ -18,7 +18,6 @@ import json import logging import mimetypes -import os import pathlib import re import sys @@ -147,7 +146,7 @@ def __init__(self, args): self.full_filename = self.output / self.zim_file # ensure output file exists - if not os.path.isdir(self.output): + if not self.output.is_dir(): logger.error( f"Output directory {self.output} does not exist. Exiting with error " "code 1" diff --git a/tests/test_warc_to_zim.py b/tests/test_warc_to_zim.py index fbe124d..9936588 100644 --- a/tests/test_warc_to_zim.py +++ b/tests/test_warc_to_zim.py @@ -3,7 +3,6 @@ import io import json -import os import pathlib import re import time @@ -20,12 +19,10 @@ from warc2zim.url_rewriting import HttpUrl, ZimPath, normalize from warc2zim.utils import get_record_url -TEST_DATA_DIR = os.path.join(os.path.dirname(os.path.realpath(__file__)), "data") +TEST_DATA_DIR = pathlib.Path(__file__).parent / "data" # special data dir for WARC files which are not supposed to be ran in the # `test_all_warcs_root_dir` test -TEST_DATA_SPECIAL_DIR = os.path.join( - os.path.dirname(os.path.realpath(__file__)), "data-special" -) +TEST_DATA_SPECIAL_DIR = pathlib.Path(__file__).parent / "data-special" SCRAPER_SUFFIX = " + zimit x.y.z-devw" @@ -125,8 +122,8 @@ def assert_item_does_not_exist(self, zimfile, path): assert payload is None def verify_warc_and_zim(self, warcfile, zimfile, verify_scraper_suffix): - assert os.path.isfile(warcfile) - assert os.path.isfile(zimfile) + assert pathlib.Path(warcfile).is_file() + assert pathlib.Path(zimfile).is_file() # [TOFIX] head_insert = b"" @@ -337,7 +334,7 @@ def test_warc_to_zim_specify_params_and_metadata(self, tmp_path): main( [ "-v", - os.path.join(TEST_DATA_DIR, "example-response.warc"), + str(TEST_DATA_DIR / "example-response.warc"), "--name", "example-response", "--output", @@ -355,7 +352,7 @@ def test_warc_to_zim_specify_params_and_metadata(self, tmp_path): zim_output = tmp_path / zim_output - assert os.path.isfile(zim_output) + assert pathlib.Path(zim_output).is_file() all_articles = { article.path: article.title for article in self.list_articles(zim_output) @@ -408,8 +405,8 @@ def test_warc_to_zim_main(self, cmdline, tmp_path): filename = cmdline[0] # set intput filename (first arg) to absolute path from test dir - warcfile = os.path.join(TEST_DATA_DIR, filename) - cmdline[0] = warcfile + warcfile = TEST_DATA_DIR / filename + cmdline[0] = str(warcfile) cmdline.extend(["--output", str(tmp_path), "--name", filename]) @@ -434,7 +431,7 @@ def test_same_domain_only(self, tmp_path): zim_output = "same-domain.zim" main( [ - os.path.join(TEST_DATA_DIR, "example-revisit.warc.gz"), + str(TEST_DATA_DIR / "example-revisit.warc.gz"), "--favicon", "http://example.com/favicon.ico", "--include-domains", @@ -462,7 +459,7 @@ def test_skip_self_redirect(self, tmp_path): zim_output = "self-redir.zim" main( [ - os.path.join(TEST_DATA_DIR, "self-redirect.warc"), + str(TEST_DATA_DIR / "self-redirect.warc"), "--output", str(tmp_path), "--zim-file", @@ -478,7 +475,7 @@ def test_include_domains_favicon_and_language(self, tmp_path): zim_output = "spt.zim" main( [ - os.path.join(TEST_DATA_DIR, "single-page-test.warc"), + str(TEST_DATA_DIR / "single-page-test.warc"), "-i", "reseau-canope.fr", "--output", @@ -526,7 +523,7 @@ def test_website_with_redirect(self, tmp_path): zim_output = "kiwix.zim" main( [ - os.path.join(TEST_DATA_DIR, "kiwix-with-redirects.warc.gz"), + str(TEST_DATA_DIR / "kiwix-with-redirects.warc.gz"), "-u", "http://www.kiwix.org", "--output", @@ -562,7 +559,7 @@ def test_all_warcs_root_dir(self, tmp_path): zim_output = "test-all.zim" main( [ - os.path.join(TEST_DATA_DIR), + str(TEST_DATA_DIR), "--output", str(tmp_path), "--zim-file", @@ -595,7 +592,7 @@ def test_fuzzy_urls(self, tmp_path, fuzzycheck): zim_output = fuzzycheck["filename"] + ".zim" main( [ - os.path.join(TEST_DATA_DIR, fuzzycheck["filename"]), + str(TEST_DATA_DIR / fuzzycheck["filename"]), "--output", str(tmp_path), "--zim-file", @@ -617,7 +614,7 @@ def test_error_bad_main_page(self, tmp_path): main( [ "-v", - os.path.join(TEST_DATA_DIR, "example-response.warc"), + str(TEST_DATA_DIR / "example-response.warc"), "-u", "https://no-such-url.example.com", "--output", @@ -637,7 +634,7 @@ def test_error_main_page_unprocessable(self, tmp_path): main( [ "-v", - os.path.join(TEST_DATA_DIR, "main-entry-403.warc.gz"), + str(TEST_DATA_DIR / "main-entry-403.warc.gz"), "-u", "https://wikizilla.org/wiki/Doug", "--output", @@ -681,7 +678,7 @@ def test_custom_css(self, tmp_path): main( [ - os.path.join(TEST_DATA_DIR, "example-response.warc"), + str(TEST_DATA_DIR / "example-response.warc"), "--output", str(tmp_path), "--zim-file", @@ -709,7 +706,7 @@ def test_custom_css_remote(self, tmp_path): main( [ - os.path.join(TEST_DATA_DIR, "example-response.warc"), + str(TEST_DATA_DIR / "example-response.warc"), "--output", str(tmp_path), "--zim-file", @@ -734,7 +731,7 @@ def test_http_return_codes(self, tmp_path): main( [ - os.path.join(TEST_DATA_DIR, "http-return-codes.warc.gz"), + str(TEST_DATA_DIR / "http-return-codes.warc.gz"), "--output", str(tmp_path), "--zim-file", @@ -798,7 +795,7 @@ def test_redirection_loops(self, tmp_path): main( [ - os.path.join(TEST_DATA_DIR, "redir-loops.warc.gz"), + str(TEST_DATA_DIR / "redir-loops.warc.gz"), "--output", str(tmp_path), "--zim-file", @@ -831,7 +828,7 @@ def test_content_resource_types(self, tmp_path): main( [ - os.path.join(TEST_DATA_DIR, "content-resource-types.warc.gz"), + str(TEST_DATA_DIR / "content-resource-types.warc.gz"), "--output", str(tmp_path), "--zim-file", @@ -859,12 +856,9 @@ def test_content_encoding_aliases(self, tmp_path): main( [ - os.path.join( - TEST_DATA_DIR, - "..", - "data-special", - "qsl.net-encoding-alias.warc.gz", - ), + # cannot be processed like other TEST_DATA_DIR warcs since it needs + # special encoding aliases to be used in --encoding-aliases + str(TEST_DATA_SPECIAL_DIR / "qsl.net-encoding-alias.warc.gz"), "--output", str(tmp_path), "--zim-file",