From 9244f2e69c9adfb81228be1c2185caab8fa40cbd Mon Sep 17 00:00:00 2001 From: benoit74 Date: Wed, 31 Jan 2024 14:56:09 +0100 Subject: [PATCH 1/2] Set zimit and browsertrix crawler versions in final ZIM 'Scraper' metadata --- CHANGELOG.md | 1 + src/zimit/__about__.py | 2 +- src/zimit/zimit.py | 16 ++++++++++++++++ tests-integration/integration.py | 21 ++++++++++++++------- 4 files changed, 32 insertions(+), 8 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 5871a4a..d5a432e 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -11,6 +11,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - New `--version` flag to display Zimit version - New `--logging` flag to adjust Browsertrix Crawler logging (#273) +- Use new `--scraper-suffix` flag of warc2zim to enhance ZIM "Scraper" metadata (#275) ### Changed diff --git a/src/zimit/__about__.py b/src/zimit/__about__.py index 494af57..11fb1b2 100644 --- a/src/zimit/__about__.py +++ b/src/zimit/__about__.py @@ -1 +1 @@ -__version__ = "2.0.0-dev0" +__version__ = "2.0.0-dev1" diff --git a/src/zimit/zimit.py b/src/zimit/zimit.py index 74c19fa..d023c68 100755 --- a/src/zimit/zimit.py +++ b/src/zimit/zimit.py @@ -360,6 +360,22 @@ def run(raw_args): zimit_args, warc2zim_args = parser.parse_known_args(raw_args) + logger.info("Checking browsertrix-crawler version") + crawl_version_cmd = ["crawl", "--version"] + crawl = subprocess.run(crawl_version_cmd, check=False, capture_output=True) + if crawl.returncode: + raise subprocess.CalledProcessError(crawl.returncode, crawl_version_cmd) + else: + crawler_version = crawl.stdout.decode("utf-8").strip() + logger.info(f"Browsertrix crawler: version {crawler_version}") + + # pass a scraper suffix to warc2zim so that zimit, warc2zim and crawler + # versions are associated with the ZIM + warc2zim_args.append("--scraper-suffix") + warc2zim_args.append( + f" + zimit {__version__} + Browsertrix crawler {crawler_version}" + ) + # pass url and output to 
warc2zim also if zimit_args.output: warc2zim_args.append("--output") diff --git a/tests-integration/integration.py b/tests-integration/integration.py index 17bfe9f..0463fad 100644 --- a/tests-integration/integration.py +++ b/tests-integration/integration.py @@ -2,13 +2,9 @@ import json import os -import libzim.reader +from libzim.reader import Archive as LibzimArchive from warcio import ArchiveIterator - - -def get_zim_main_entry(zimfile): - zim_fh = libzim.reader.Archive(zimfile) - return zim_fh.main_entry +from zimscraperlib.zim import Archive as ScraperLibArchive def test_is_file(): @@ -20,11 +16,22 @@ def test_zim_main_page(): """Main page specified, http://isago.rskg.org/, was a redirect to https Ensure main page is the redirected page""" - main_entry = get_zim_main_entry("/output/isago.zim") + main_entry = LibzimArchive("/output/isago.zim").main_entry assert main_entry.is_redirect assert main_entry.get_redirect_entry().path == "isago.rskg.org/" +def test_zim_scraper(): + """Scraper metadata is expected to list all tools involved in ZIM creation + Ensure zimit, warc2zim and Browsertrix crawler are all mentioned""" + + zim_fh = ScraperLibArchive("/output/isago.zim") + scraper = zim_fh.get_text_metadata("Scraper") + assert "zimit " in scraper + assert "warc2zim " in scraper + assert "Browsertrix crawler " in scraper + + def test_user_agent(): """Test that mobile user agent was used From 49da57c5b686f272db7e286909734963daabd89a Mon Sep 17 00:00:00 2001 From: benoit74 Date: Mon, 5 Feb 2024 14:33:38 +0100 Subject: [PATCH 2/2] fixup! 
Set zimit and browsertrix crawler versions in final ZIM 'Scraper' metadata --- src/zimit/zimit.py | 23 ++++++++++++++--------- tests-integration/integration.py | 7 +++---- 2 files changed, 17 insertions(+), 13 deletions(-) diff --git a/src/zimit/zimit.py b/src/zimit/zimit.py index d023c68..5d91607 100755 --- a/src/zimit/zimit.py +++ b/src/zimit/zimit.py @@ -362,12 +362,15 @@ def run(raw_args): logger.info("Checking browsertrix-crawler version") crawl_version_cmd = ["crawl", "--version"] - crawl = subprocess.run(crawl_version_cmd, check=False, capture_output=True) - if crawl.returncode: - raise subprocess.CalledProcessError(crawl.returncode, crawl_version_cmd) - else: - crawler_version = crawl.stdout.decode("utf-8").strip() - logger.info(f"Browsertrix crawler: version {crawler_version}") + try: + crawl = subprocess.run( + crawl_version_cmd, check=True, capture_output=True, text=True + ) + except Exception: + logger.error("Failed to get Browsertrix crawler version") + raise + crawler_version = crawl.stdout.strip() + logger.info(f"Browsertrix crawler: version {crawler_version}") # pass a scraper suffix to warc2zim so that zimit, warc2zim and crawler # versions are associated with the ZIM @@ -538,9 +541,11 @@ def check_url(url: str, user_agent: str, scope: str | None = None): "your homepage might be out-of-scope. 
Please check!".format( parsed_url.geturl(), actual_url.geturl(), - "is" - if get_fld(parsed_url.geturl()) == get_fld(actual_url.geturl()) - else "is not", + ( + "is" + if get_fld(parsed_url.geturl()) == get_fld(actual_url.geturl()) + else "is not" + ), scope, ) ) diff --git a/tests-integration/integration.py b/tests-integration/integration.py index 0463fad..14c7ad8 100644 --- a/tests-integration/integration.py +++ b/tests-integration/integration.py @@ -2,9 +2,8 @@ import json import os -from libzim.reader import Archive as LibzimArchive from warcio import ArchiveIterator -from zimscraperlib.zim import Archive as ScraperLibArchive +from zimscraperlib.zim import Archive def test_is_file(): @@ -16,7 +15,7 @@ def test_zim_main_page(): """Main page specified, http://isago.rskg.org/, was a redirect to https Ensure main page is the redirected page""" - main_entry = LibzimArchive("/output/isago.zim").main_entry + main_entry = Archive("/output/isago.zim").main_entry assert main_entry.is_redirect assert main_entry.get_redirect_entry().path == "isago.rskg.org/" @@ -25,7 +24,7 @@ def test_zim_scraper(): """Scraper metadata is expected to list all tools involved in ZIM creation Ensure zimit, warc2zim and Browsertrix crawler are all mentioned""" - zim_fh = ScraperLibArchive("/output/isago.zim") + zim_fh = Archive("/output/isago.zim") scraper = zim_fh.get_text_metadata("Scraper") assert "zimit " in scraper assert "warc2zim " in scraper