From 44ed3bc9673b547423c6c0dd52064bc6a7839b41 Mon Sep 17 00:00:00 2001
From: benoit74
Date: Tue, 21 May 2024 11:28:24 +0000
Subject: [PATCH] Validate ZIM title, description, long_description and tags early

Run the zimscraperlib metadata validators at the very beginning of
Converter.run(), before any input is processed, so that an invalid
Title, Description, LongDescription or Tags value makes the run fail
immediately instead of after the whole conversion work has been done.

Title, Description and LongDescription are only validated when a value
has been provided; Tags are always validated. The illustration is
deliberately not validated at this stage (see the comment in run()).

A new --disable-metadata-checks command-line flag skips these checks.

---
 src/warc2zim/converter.py         | 22 ++++++++
 src/warc2zim/main.py              |  8 +++
 tests/test_metadata_validation.py | 89 +++++++++++++++++++++++++++++++
 3 files changed, 119 insertions(+)
 create mode 100644 tests/test_metadata_validation.py

diff --git a/src/warc2zim/converter.py b/src/warc2zim/converter.py
index 75082de..0219f05 100644
--- a/src/warc2zim/converter.py
+++ b/src/warc2zim/converter.py
@@ -44,6 +44,12 @@ from zimscraperlib.image.transformation import resize_image
 from zimscraperlib.types import FALLBACK_MIME
 from zimscraperlib.zim.creator import Creator
+from zimscraperlib.zim.metadata import (
+    validate_description,
+    validate_longdescription,
+    validate_tags,
+    validate_title,
+)
 
 from warc2zim.constants import logger
 from warc2zim.items import StaticArticle, StaticFile, WARCPayloadItem
@@ -156,6 +162,7 @@ def __init__(self, args):
         self.scraper_suffix = args.scraper_suffix
 
         self.continue_on_error = bool(args.continue_on_error)
+        self.disable_metadata_checks = bool(args.disable_metadata_checks)
 
     def update_stats(self):
         """write progress as JSON to self.stats_filename if requested"""
@@ -182,6 +189,21 @@ def add_custom_css_item(self):
         )
 
     def run(self):
+
+        if not self.disable_metadata_checks:
+            # Validate ZIM metadata early so that we do not waste time doing operations
+            # for a scraper which will fail anyway in the end
+            validate_tags("Tags", self.tags)
+            if self.title:
+                validate_title("Title", self.title)
+            if self.description:
+                validate_description("Description", self.description)
+            if self.long_description:
+                validate_longdescription("LongDescription", self.long_description)
+            # Note: we do not validate illustration since logic in the scraper is made
+            # to always provide a valid image, at least a fallback transparent PNG and
+            # final illustration is most probably not yet known at this stage
+
         if not self.inputs:
             logger.info(
                 "Arguments valid, no inputs to process. Exiting with return code 100"
diff --git a/src/warc2zim/main.py b/src/warc2zim/main.py
index d50516f..3dedb4d 100644
--- a/src/warc2zim/main.py
+++ b/src/warc2zim/main.py
@@ -99,6 +99,14 @@ def main(raw_args=None):
         default="fails",
     )
 
+    parser.add_argument(
+        "--disable-metadata-checks",
+        help="Disable validity checks of metadata according to openZIM conventions",
+        action="store_true",
+        default=False,
+        dest="disable_metadata_checks",
+    )
+
     args = parser.parse_args(args=raw_args)
     converter = Converter(args)
     return converter.run()
diff --git a/tests/test_metadata_validation.py b/tests/test_metadata_validation.py
new file mode 100644
index 0000000..2e07164
--- /dev/null
+++ b/tests/test_metadata_validation.py
@@ -0,0 +1,89 @@
+from itertools import chain
+
+import pytest
+
+from warc2zim.main import main
+
+
+@pytest.mark.parametrize(
+    "title, is_valid",
+    [
+        pytest.param("A title", True, id="a_valid_title"),
+        pytest.param("A very very very very long title", False, id="an_invalid_title"),
+    ],
+)
+def test_title_validation(title, is_valid):
+    args = ["--name", "test", "--title", title, "--output", "./"]
+    if is_valid:
+        assert main(args) == 100
+    else:
+        with pytest.raises(ValueError, match="Title is too long"):
+            main(args)
+
+
+@pytest.mark.parametrize(
+    "description, is_valid",
+    [
+        pytest.param("A description", True, id="a_valid_description"),
+        pytest.param(
+            "A " + "very " * 20 + "long description",
+            False,
+            id="an_invalid_description",
+        ),
+    ],
+)
+def test_description_validation(description, is_valid):
+    args = ["--name", "test", "--description", description, "--output", "./"]
+    if is_valid:
+        assert main(args) == 100
+    else:
+        with pytest.raises(ValueError, match="Description is too long"):
+            main(args)
+
+
+@pytest.mark.parametrize(
+    "long_description, is_valid",
+    [
+        pytest.param("A long description", True, id="a_valid_long_description"),
+        pytest.param(
+            "A " + "very " * 800 + "long description",
+            False,
+            id="an_invalid_long_description",
+        ),
+    ],
+)
+def test_long_description_validation(long_description, is_valid):
+    args = [
+        "--name",
+        "test",
+        "--long-description",
+        long_description,
+        "--output",
+        "./",
+    ]
+    if is_valid:
+        assert main(args) == 100
+    else:
+        with pytest.raises(ValueError, match="LongDescription is too long"):
+            main(args)
+
+
+@pytest.mark.parametrize(
+    "tags, is_valid",
+    [
+        pytest.param(["tag1", "tag2"], True, id="valid_tags"),
+        # NOTE: there are no tests for invalid tags, since it is not currently possible
+    ],
+)
+def test_tags_validation(tags, is_valid):
+    args = list(
+        chain(
+            *(
+                ["--name", "test"],
+                chain(*(["--tags", tag] for tag in tags)),
+                ["--output", "./"],
+            )
+        )
+    )
+    if is_valid:
+        assert main(args) == 100