From 00d2433383c32e539a84ac36a19746d6aaf9c6a0 Mon Sep 17 00:00:00 2001 From: benoit74 Date: Thu, 9 Jan 2025 09:06:08 +0000 Subject: [PATCH 1/3] Upgrade to browsertrix crawler 1.4.2 --- CHANGELOG.md | 2 +- Dockerfile | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 20dce9f..0711aea 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -9,7 +9,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ### Changed -- Upgrade to browsertrix crawler 1.4.0-beta.0 (#434) +- Upgrade to browsertrix crawler 1.4.2 (#450) ## [2.1.6] - 2024-11-07 diff --git a/Dockerfile b/Dockerfile index 67e0e18..9b304d0 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,4 +1,4 @@ -FROM webrecorder/browsertrix-crawler:1.4.0-beta.0 +FROM webrecorder/browsertrix-crawler:1.4.2 LABEL org.opencontainers.image.source https://github.com/openzim/zimit RUN apt-get update \ From 8d42a8dd93ba12aeb75d6294ff9d55ed51e6de5a Mon Sep 17 00:00:00 2001 From: benoit74 Date: Thu, 9 Jan 2025 10:41:05 +0000 Subject: [PATCH 2/3] Move integration tests to test website --- .github/workflows/Tests.yaml | 2 +- tests-integration/integration.py | 83 ++++++++++++++++++-------------- 2 files changed, 49 insertions(+), 36 deletions(-) diff --git a/.github/workflows/Tests.yaml b/.github/workflows/Tests.yaml index 9e21fa7..592a5aa 100644 --- a/.github/workflows/Tests.yaml +++ b/.github/workflows/Tests.yaml @@ -63,7 +63,7 @@ jobs: run: docker run -v $PWD/output:/output zimit zimit --help - name: run crawl - run: docker run -v $PWD/output:/output zimit zimit --url http://isago.rskg.org/ --name isago --zim-file isago.zim --adminEmail test@example.com --mobileDevice "Pixel 5" --statsFilename /output/stats.json --keep + run: docker run -v $PWD/output:/output zimit zimit --url http://website.test.openzim.org/http-return-codes.html --name tests_en_onepage --zim-file tests_en_onepage.zim --adminEmail test@example.com --mobileDevice "Pixel 5" --statsFilename /output/stats.json --keep - name: run integration test suite run: docker run -v $PWD/tests-integration/integration.py:/app/integration.py -v $PWD/output:/output zimit bash -c "/app/zimit/bin/pip install pytest; /app/zimit/bin/pytest -v /app/integration.py" diff --git a/tests-integration/integration.py b/tests-integration/integration.py index 16ab337..9d37b0f 100644 --- a/tests-integration/integration.py +++ b/tests-integration/integration.py @@ -1,6 +1,7 @@ import glob import json import os +from pathlib import Path from warcio import ArchiveIterator from zimscraperlib.zim import Archive @@ -8,23 +9,26 @@ def test_is_file(): """Ensure ZIM file exists""" - assert os.path.isfile("/output/isago.zim") + assert os.path.isfile("/output/tests_en_onepage.zim") def test_zim_main_page(): - """Main page specified, http://isago.rskg.org/, was a redirect to https + """Main page specified, http://website.test.openzim.org/http-return-codes.html, + was a redirect to https Ensure main page is the redirected page""" - main_entry = Archive("/output/isago.zim").main_entry + main_entry = Archive("/output/tests_en_onepage.zim").main_entry assert main_entry.is_redirect - assert main_entry.get_redirect_entry().path == "isago.rskg.org/" + assert ( + main_entry.get_redirect_entry().path + == "website.test.openzim.org/http-return-codes.html" + ) def test_zim_scraper(): - """Main page specified, http://isago.rskg.org/, was a redirect to https - Ensure main page is the redirected page""" + """Check content of scraper metadata""" - zim_fh = Archive("/output/isago.zim") + zim_fh = Archive("/output/tests_en_onepage.zim") scraper = zim_fh.get_text_metadata("Scraper") assert "zimit " in scraper assert "warc2zim " in scraper @@ -33,18 +37,28 @@ def test_zim_scraper(): def test_files_list(): """Check that expected files are present in the ZIM at proper path""" - zim_fh = Archive("/output/isago.zim") + zim_fh = Archive("/output/tests_en_onepage.zim") for expected_entry in [ "_zim_static/__wb_module_decl.js", "_zim_static/wombat.js", "_zim_static/wombatSetup.js", - "isago.rskg.org/", - "isago.rskg.org/a-propos", - "isago.rskg.org/conseils", - "isago.rskg.org/faq", - "isago.rskg.org/static/favicon256.png", - "isago.rskg.org/static/tarifs-isago.pdf", - "maxcdn.bootstrapcdn.com/bootstrap/4.0.0/css/bootstrap.min.css", + "website.test.openzim.org/http-return-codes.html", + "website.test.openzim.org/200-response", + "website.test.openzim.org/201-response", + "website.test.openzim.org/202-response", + "website.test.openzim.org/301-external-redirect-ok", + "website.test.openzim.org/301-internal-redirect-ok", + "website.test.openzim.org/302-external-redirect-ok", + "website.test.openzim.org/302-internal-redirect-ok", + "website.test.openzim.org/307-external-redirect-ok", + "website.test.openzim.org/307-internal-redirect-ok", + "website.test.openzim.org/308-external-redirect-ok", + "website.test.openzim.org/308-internal-redirect-ok", + "website.test.openzim.org/http-return-codes.html", + "website.test.openzim.org/icons/favicon.ico", + "website.test.openzim.org/icons/site.webmanifest", + "website.test.openzim.org/internal_redirect_target.html", + "www.example.com/", ]: assert zim_fh.get_content(expected_entry) @@ -72,23 +86,22 @@ def test_user_agent(): def test_stats_output(): - with open("/output/crawl.json") as fh: - assert json.loads(fh.read()) == { - "crawled": 5, - "pending": 0, - "pendingPages": [], - "total": 5, - "failed": 0, - "limit": {"max": 0, "hit": False}, - } - with open("/output/warc2zim.json") as fh: - assert json.loads(fh.read()) == { - "written": 7, - "total": 7, - } - with open("/output/stats.json") as fh: - assert json.loads(fh.read()) == { - "done": 7, - "total": 7, - "limit": {"max": 0, "hit": False}, - } + assert json.loads(Path("/output/crawl.json").read_bytes()) == { + "crawled": 35, + "pending": 0, + "pendingPages": [], + "total": 35, + "failed": 18, + "limit": {"max": 0, "hit": False}, + } + + assert json.loads(Path("/output/warc2zim.json").read_bytes()) == { + "written": 8, + "total": 8, + } + + assert json.loads(Path("/output/stats.json").read_bytes()) == { + "done": 8, + "total": 8, + "limit": {"max": 0, "hit": False}, + } From 97ea6dfd7b8d74026e859a49c1681af88e1cbcb8 Mon Sep 17 00:00:00 2001 From: benoit74 Date: Thu, 9 Jan 2025 10:41:22 +0000 Subject: [PATCH 3/3] Fix Docker label to follow new convention --- Dockerfile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Dockerfile b/Dockerfile index 9b304d0..9d88f45 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,5 +1,5 @@ FROM webrecorder/browsertrix-crawler:1.4.2 -LABEL org.opencontainers.image.source https://github.com/openzim/zimit +LABEL org.opencontainers.image.source=https://github.com/openzim/zimit RUN apt-get update \ && apt-get install -qqy --no-install-recommends \