Skip to content

Commit

Permalink
Set zimit and browsertrix crawler versions in final ZIM 'Scraper' metadata
Browse files Browse the repository at this point in the history
  • Loading branch information
benoit74 committed Jan 31, 2024
1 parent ef462b5 commit de14a3a
Show file tree
Hide file tree
Showing 4 changed files with 32 additions and 8 deletions.
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0

- New `--version` flag to display Zimit version
- New `--logging` flag to adjust Browsertrix Crawler logging (#273)
- Use new `--scraper-suffix` flag of warc2zim to enhance ZIM "Scraper" metadata (#)

### Changed

Expand Down
2 changes: 1 addition & 1 deletion src/zimit/__about__.py
Original file line number Diff line number Diff line change
@@ -1 +1 @@
__version__ = "2.0.0-dev0"
__version__ = "2.0.0-dev1"
16 changes: 16 additions & 0 deletions src/zimit/zimit.py
Original file line number Diff line number Diff line change
Expand Up @@ -360,6 +360,22 @@ def run(raw_args):

zimit_args, warc2zim_args = parser.parse_known_args(raw_args)

logger.info("Checking browsertrix-crawler version")
crawl_version_cmd = ["crawl", "--version"]
crawl = subprocess.run(crawl_version_cmd, check=False, capture_output=True)

Check warning on line 365 in src/zimit/zimit.py

View check run for this annotation

Codecov / codecov/patch

src/zimit/zimit.py#L363-L365

Added lines #L363 - L365 were not covered by tests
if crawl.returncode:
raise subprocess.CalledProcessError(crawl.returncode, crawl_version_cmd)

Check warning on line 367 in src/zimit/zimit.py

View check run for this annotation

Codecov / codecov/patch

src/zimit/zimit.py#L367

Added line #L367 was not covered by tests
else:
crawler_version = crawl.stdout.decode("utf-8").strip()
logger.info(f"Browsertrix crawler: version {crawler_version}")

Check warning on line 370 in src/zimit/zimit.py

View check run for this annotation

Codecov / codecov/patch

src/zimit/zimit.py#L369-L370

Added lines #L369 - L370 were not covered by tests

# pass a scraper suffix to warc2zim so that the zimit, warc2zim and crawler
# versions are all associated with the ZIM
warc2zim_args.append("--scraper-suffix")
warc2zim_args.append(

Check warning on line 375 in src/zimit/zimit.py

View check run for this annotation

Codecov / codecov/patch

src/zimit/zimit.py#L374-L375

Added lines #L374 - L375 were not covered by tests
f" + zimit {__version__} + Browsertrix crawler {crawler_version}"
)

# pass url and output to warc2zim also
if zimit_args.output:
warc2zim_args.append("--output")
Expand Down
21 changes: 14 additions & 7 deletions tests-integration/integration.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,13 +2,9 @@
import json
import os

import libzim.reader
from libzim.reader import Archive as LibzimArchive
from warcio import ArchiveIterator


def get_zim_main_entry(zimfile):
zim_fh = libzim.reader.Archive(zimfile)
return zim_fh.main_entry
from zimscraperlib.zim import Archive as ScraperLibArchive


def test_is_file():
Expand All @@ -20,11 +16,22 @@ def test_zim_main_page():
"""Main page specified, http://isago.rskg.org/, was a redirect to https
Ensure main page is the redirected page"""

main_entry = get_zim_main_entry("/output/isago.zim")
main_entry = LibzimArchive("/output/isago.zim").main_entry
assert main_entry.is_redirect
assert main_entry.get_redirect_entry().path == "isago.rskg.org/"


def test_zim_scraper():
    """Ensure the ZIM "Scraper" metadata names every tool used to build it.

    zimit passes a scraper suffix to warc2zim so that the zimit, warc2zim and
    Browsertrix crawler versions are all recorded in the final ZIM. Only the
    presence of each tool name (followed by a space, i.e. ahead of its
    version) is checked here, not the version values themselves.
    """

    zim_fh = ScraperLibArchive("/output/isago.zim")
    scraper = zim_fh.get_text_metadata("Scraper")
    assert "zimit " in scraper
    assert "warc2zim " in scraper
    assert "Browsertrix crawler " in scraper


def test_user_agent():
"""Test that mobile user agent was used
Expand Down

0 comments on commit de14a3a

Please sign in to comment.