Skip to content

Commit

Permalink
Merge pull request #277 from openzim/scraper_suffix
Browse files Browse the repository at this point in the history
Pass scraper suffix to warc2zim
  • Loading branch information
rgaudin authored Feb 5, 2024
2 parents ef462b5 + 49da57c commit 7caa355
Show file tree
Hide file tree
Showing 4 changed files with 39 additions and 11 deletions.
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0

- New `--version` flag to display Zimit version
- New `--logging` flag to adjust Browsertrix Crawler logging (#273)
- Use new `--scraper-suffix` flag of warc2zim to enhance ZIM "Scraper" metadata (#275)

### Changed

Expand Down
2 changes: 1 addition & 1 deletion src/zimit/__about__.py
Original file line number Diff line number Diff line change
@@ -1 +1 @@
__version__ = "2.0.0-dev0"
__version__ = "2.0.0-dev1"
27 changes: 24 additions & 3 deletions src/zimit/zimit.py
Original file line number Diff line number Diff line change
Expand Up @@ -360,6 +360,25 @@ def run(raw_args):

zimit_args, warc2zim_args = parser.parse_known_args(raw_args)

logger.info("Checking browsertrix-crawler version")
crawl_version_cmd = ["crawl", "--version"]
try:
crawl = subprocess.run(
crawl_version_cmd, check=True, capture_output=True, text=True
)
except Exception:
logger.error("Failed to get Browsertrix crawler version")
raise
crawler_version = crawl.stdout
logger.info(f"Browsertrix crawler: version {crawler_version}")

# pass a scraper suffix to warc2zim so that the zimit, warc2zim and crawler
# versions are all associated with the ZIM
warc2zim_args.append("--scraper-suffix")
warc2zim_args.append(
f" + zimit {__version__} + Browsertrix crawler {crawler_version}"
)

# pass url and output to warc2zim also
if zimit_args.output:
warc2zim_args.append("--output")
Expand Down Expand Up @@ -522,9 +541,11 @@ def check_url(url: str, user_agent: str, scope: str | None = None):
"your homepage might be out-of-scope. Please check!".format(
parsed_url.geturl(),
actual_url.geturl(),
"is"
if get_fld(parsed_url.geturl()) == get_fld(actual_url.geturl())
else "is not",
(
"is"
if get_fld(parsed_url.geturl()) == get_fld(actual_url.geturl())
else "is not"
),
scope,
)
)
Expand Down
20 changes: 13 additions & 7 deletions tests-integration/integration.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,13 +2,8 @@
import json
import os

import libzim.reader
from warcio import ArchiveIterator


def get_zim_main_entry(zimfile):
    """Open the ZIM archive at ``zimfile`` and return its main entry."""
    return libzim.reader.Archive(zimfile).main_entry
from zimscraperlib.zim import Archive


def test_is_file():
Expand All @@ -20,11 +15,22 @@ def test_zim_main_page():
"""Main page specified, http://isago.rskg.org/, was a redirect to https
Ensure main page is the redirected page"""

main_entry = get_zim_main_entry("/output/isago.zim")
main_entry = Archive("/output/isago.zim").main_entry
assert main_entry.is_redirect
assert main_entry.get_redirect_entry().path == "isago.rskg.org/"


def test_zim_scraper():
    """Ensure the ZIM "Scraper" metadata names every tool of the pipeline.

    The "Scraper" text metadata of the generated ZIM must mention zimit,
    warc2zim and Browsertrix crawler, each followed by a space.
    """

    zim_fh = Archive("/output/isago.zim")
    scraper = zim_fh.get_text_metadata("Scraper")
    # trailing space in each needle: the tool name must be followed by more text
    assert "zimit " in scraper
    assert "warc2zim " in scraper
    assert "Browsertrix crawler " in scraper


def test_user_agent():
"""Test that mobile user agent was used
Expand Down

0 comments on commit 7caa355

Please sign in to comment.