Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Upgrade to browsertrix crawler 1.4.2, fix integration tests and fix docker label #450

Merged
Merged 3 commits on Jan 9, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .github/workflows/Tests.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -63,7 +63,7 @@ jobs:
run: docker run -v $PWD/output:/output zimit zimit --help

- name: run crawl
run: docker run -v $PWD/output:/output zimit zimit --url http://isago.rskg.org/ --name isago --zim-file isago.zim --adminEmail test@example.com --mobileDevice "Pixel 5" --statsFilename /output/stats.json --keep
run: docker run -v $PWD/output:/output zimit zimit --url http://website.test.openzim.org/http-return-codes.html --name tests_en_onepage --zim-file tests_en_onepage.zim --adminEmail test@example.com --mobileDevice "Pixel 5" --statsFilename /output/stats.json --keep

- name: run integration test suite
run: docker run -v $PWD/tests-integration/integration.py:/app/integration.py -v $PWD/output:/output zimit bash -c "/app/zimit/bin/pip install pytest; /app/zimit/bin/pytest -v /app/integration.py"
2 changes: 1 addition & 1 deletion CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0

### Changed

- Upgrade to browsertrix crawler 1.4.0-beta.0 (#434)
- Upgrade to browsertrix crawler 1.4.2 (#450)

## [2.1.6] - 2024-11-07

Expand Down
4 changes: 2 additions & 2 deletions Dockerfile
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
FROM webrecorder/browsertrix-crawler:1.4.0-beta.0
LABEL org.opencontainers.image.source https://github.com/openzim/zimit
FROM webrecorder/browsertrix-crawler:1.4.2
LABEL org.opencontainers.image.source=https://github.com/openzim/zimit

RUN apt-get update \
&& apt-get install -qqy --no-install-recommends \
Expand Down
83 changes: 48 additions & 35 deletions tests-integration/integration.py
Original file line number Diff line number Diff line change
@@ -1,30 +1,34 @@
import glob
import json
import os
from pathlib import Path

from warcio import ArchiveIterator
from zimscraperlib.zim import Archive


def test_is_file():
"""Ensure ZIM file exists"""
assert os.path.isfile("/output/isago.zim")
assert os.path.isfile("/output/tests_en_onepage.zim")


def test_zim_main_page():
"""Main page specified, http://isago.rskg.org/, was a redirect to https
"""Main page specified, http://website.test.openzim.org/http-return-codes.html,
was a redirect to https.
Ensure the main page is the redirected page."""

main_entry = Archive("/output/isago.zim").main_entry
main_entry = Archive("/output/tests_en_onepage.zim").main_entry
assert main_entry.is_redirect
assert main_entry.get_redirect_entry().path == "isago.rskg.org/"
assert (
main_entry.get_redirect_entry().path
== "website.test.openzim.org/http-return-codes.html"
)


def test_zim_scraper():
"""Main page specified, http://isago.rskg.org/, was a redirect to https
Ensure main page is the redirected page"""
"""Check content of scraper metadata"""

zim_fh = Archive("/output/isago.zim")
zim_fh = Archive("/output/tests_en_onepage.zim")
scraper = zim_fh.get_text_metadata("Scraper")
assert "zimit " in scraper
assert "warc2zim " in scraper
Expand All @@ -33,18 +37,28 @@ def test_zim_scraper():

def test_files_list():
"""Check that expected files are present in the ZIM at proper path"""
zim_fh = Archive("/output/isago.zim")
zim_fh = Archive("/output/tests_en_onepage.zim")
for expected_entry in [
"_zim_static/__wb_module_decl.js",
"_zim_static/wombat.js",
"_zim_static/wombatSetup.js",
"isago.rskg.org/",
"isago.rskg.org/a-propos",
"isago.rskg.org/conseils",
"isago.rskg.org/faq",
"isago.rskg.org/static/favicon256.png",
"isago.rskg.org/static/tarifs-isago.pdf",
"maxcdn.bootstrapcdn.com/bootstrap/4.0.0/css/bootstrap.min.css",
"website.test.openzim.org/http-return-codes.html",
"website.test.openzim.org/200-response",
"website.test.openzim.org/201-response",
"website.test.openzim.org/202-response",
"website.test.openzim.org/301-external-redirect-ok",
"website.test.openzim.org/301-internal-redirect-ok",
"website.test.openzim.org/302-external-redirect-ok",
"website.test.openzim.org/302-internal-redirect-ok",
"website.test.openzim.org/307-external-redirect-ok",
"website.test.openzim.org/307-internal-redirect-ok",
"website.test.openzim.org/308-external-redirect-ok",
"website.test.openzim.org/308-internal-redirect-ok",
"website.test.openzim.org/http-return-codes.html",
"website.test.openzim.org/icons/favicon.ico",
"website.test.openzim.org/icons/site.webmanifest",
"website.test.openzim.org/internal_redirect_target.html",
"www.example.com/",
]:
assert zim_fh.get_content(expected_entry)

Expand Down Expand Up @@ -72,23 +86,22 @@ def test_user_agent():


def test_stats_output():
with open("/output/crawl.json") as fh:
assert json.loads(fh.read()) == {
"crawled": 5,
"pending": 0,
"pendingPages": [],
"total": 5,
"failed": 0,
"limit": {"max": 0, "hit": False},
}
with open("/output/warc2zim.json") as fh:
assert json.loads(fh.read()) == {
"written": 7,
"total": 7,
}
with open("/output/stats.json") as fh:
assert json.loads(fh.read()) == {
"done": 7,
"total": 7,
"limit": {"max": 0, "hit": False},
}
assert json.loads(Path("/output/crawl.json").read_bytes()) == {
"crawled": 35,
"pending": 0,
"pendingPages": [],
"total": 35,
"failed": 18,
"limit": {"max": 0, "hit": False},
}

assert json.loads(Path("/output/warc2zim.json").read_bytes()) == {
"written": 8,
"total": 8,
}

assert json.loads(Path("/output/stats.json").read_bytes()) == {
"done": 8,
"total": 8,
"limit": {"max": 0, "hit": False},
}