diff --git a/CHANGELOG.md b/CHANGELOG.md index 5871a4a..d5a432e 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -11,6 +11,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - New `--version` flag to display Zimit version - New `--logging` flag to adjust Browsertrix Crawler logging (#273) +- Use new `--scraper-suffix` flag of warc2zim to enhance ZIM "Scraper" metadata (#275) ### Changed diff --git a/src/zimit/__about__.py b/src/zimit/__about__.py index 494af57..11fb1b2 100644 --- a/src/zimit/__about__.py +++ b/src/zimit/__about__.py @@ -1 +1 @@ -__version__ = "2.0.0-dev0" +__version__ = "2.0.0-dev1" diff --git a/src/zimit/zimit.py b/src/zimit/zimit.py index 74c19fa..5d91607 100755 --- a/src/zimit/zimit.py +++ b/src/zimit/zimit.py @@ -360,6 +360,25 @@ def run(raw_args): zimit_args, warc2zim_args = parser.parse_known_args(raw_args) + logger.info("Checking browsertrix-crawler version") + crawl_version_cmd = ["crawl", "--version"] + try: + crawl = subprocess.run( + crawl_version_cmd, check=True, capture_output=True, text=True + ) + except Exception: + logger.error("Failed to get Browsertrix crawler version") + raise + crawler_version = crawl.stdout + logger.info(f"Browsertrix crawler: version {crawler_version}") + + # pass a scraper suffix to warc2zim so that zimit, warc2zim and crawler + # versions are all associated with the ZIM + warc2zim_args.append("--scraper-suffix") + warc2zim_args.append( + f" + zimit {__version__} + Browsertrix crawler {crawler_version}" + ) + # pass url and output to warc2zim also if zimit_args.output: warc2zim_args.append("--output") @@ -522,9 +541,11 @@ def check_url(url: str, user_agent: str, scope: str | None = None): "your homepage might be out-of-scope. 
Please check!".format( parsed_url.geturl(), actual_url.geturl(), - "is" - if get_fld(parsed_url.geturl()) == get_fld(actual_url.geturl()) - else "is not", + ( + "is" + if get_fld(parsed_url.geturl()) == get_fld(actual_url.geturl()) + else "is not" + ), scope, ) ) diff --git a/tests-integration/integration.py b/tests-integration/integration.py index 17bfe9f..14c7ad8 100644 --- a/tests-integration/integration.py +++ b/tests-integration/integration.py @@ -2,13 +2,8 @@ import json import os -import libzim.reader from warcio import ArchiveIterator - - -def get_zim_main_entry(zimfile): - zim_fh = libzim.reader.Archive(zimfile) - return zim_fh.main_entry +from zimscraperlib.zim import Archive def test_is_file(): @@ -20,11 +15,22 @@ def test_zim_main_page(): """Main page specified, http://isago.rskg.org/, was a redirect to https Ensure main page is the redirected page""" - main_entry = get_zim_main_entry("/output/isago.zim") + main_entry = Archive("/output/isago.zim").main_entry assert main_entry.is_redirect assert main_entry.get_redirect_entry().path == "isago.rskg.org/" +def test_zim_scraper(): + """Check the "Scraper" text metadata of the generated ZIM + Ensure it mentions zimit, warc2zim and Browsertrix crawler""" + + zim_fh = Archive("/output/isago.zim") + scraper = zim_fh.get_text_metadata("Scraper") + assert "zimit " in scraper + assert "warc2zim " in scraper + assert "Browsertrix crawler " in scraper + + def test_user_agent(): """Test that mobile user agent was used