Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Validate ZIM metadata early #266

Merged
merged 1 commit into from
May 24, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
22 changes: 22 additions & 0 deletions src/warc2zim/converter.py
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,12 @@
from zimscraperlib.image.transformation import resize_image
from zimscraperlib.types import FALLBACK_MIME
from zimscraperlib.zim.creator import Creator
from zimscraperlib.zim.metadata import (
validate_description,
validate_longdescription,
validate_tags,
validate_title,
)

from warc2zim.constants import logger
from warc2zim.items import StaticArticle, StaticFile, WARCPayloadItem
Expand Down Expand Up @@ -156,6 +162,7 @@ def __init__(self, args):
self.scraper_suffix = args.scraper_suffix

self.continue_on_error = bool(args.continue_on_error)
self.disable_metadata_checks = bool(args.disable_metadata_checks)

def update_stats(self):
"""write progress as JSON to self.stats_filename if requested"""
Expand All @@ -182,6 +189,21 @@ def add_custom_css_item(self):
)

def run(self):

if not self.disable_metadata_checks:
# Validate ZIM metadata early so that we do not waste time doing operations
# for a scraper which will fail anyway in the end
validate_tags("Tags", self.tags)
if self.title:
validate_title("Title", self.title)
if self.description:
validate_description("Description", self.description)
if self.long_description:
validate_longdescription("LongDescription", self.long_description)
# Note: we do not validate the illustration, since the scraper logic always
# provides a valid image (at least a fallback transparent PNG), and the final
# illustration is most probably not yet known at this stage

if not self.inputs:
logger.info(
"Arguments valid, no inputs to process. Exiting with return code 100"
Expand Down
8 changes: 8 additions & 0 deletions src/warc2zim/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -99,6 +99,14 @@ def main(raw_args=None):
default="fails",
)

parser.add_argument(
"--disable-metadata-checks",
help="Disable validity checks of metadata according to openZIM conventions",
action="store_true",
default=False,
dest="disable_metadata_checks",
)

args = parser.parse_args(args=raw_args)
converter = Converter(args)
return converter.run()
Expand Down
89 changes: 89 additions & 0 deletions tests/test_metadata_validation.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,89 @@
from itertools import chain

import pytest

from warc2zim.main import main


@pytest.mark.parametrize(
    "title, is_valid",
    [
        pytest.param("A title", True, id="a_valid_title"),
        pytest.param("A very very very very long title", False, id="an_invalid_title"),
    ],
)
def test_title_validation(title, is_valid):
    """Run the CLI with only a ``--title`` and check how it is validated.

    An accepted title lets ``main`` return 100; a rejected one must raise a
    ``ValueError`` whose message matches "Title is too long".
    """
    cli_args = ["--name", "test", "--title", title, "--output", "./"]

    # Rejected value: the call itself is expected to blow up.
    if not is_valid:
        with pytest.raises(ValueError, match="Title is too long"):
            main(cli_args)
        return

    # No inputs are passed on the command line, hence the 100 return code.
    assert main(cli_args) == 100


@pytest.mark.parametrize(
    "description, is_valid",
    [
        pytest.param("A description", True, id="a_valid_description"),
        pytest.param(
            # "very " repeated 20 times, framed by "A " and "long description"
            "A " + "very " * 20 + "long description",
            False,
            id="an_invalid_description",
        ),
    ],
)
def test_description_validation(description, is_valid):
    """Run the CLI with only a ``--description`` and check how it is validated.

    An accepted description lets ``main`` return 100; a rejected one must raise
    a ``ValueError`` whose message matches "Description is too long".
    """
    cli_args = ["--name", "test", "--description", description, "--output", "./"]

    # Rejected value: the call itself is expected to blow up.
    if not is_valid:
        with pytest.raises(ValueError, match="Description is too long"):
            main(cli_args)
        return

    # No inputs are passed on the command line, hence the 100 return code.
    assert main(cli_args) == 100


@pytest.mark.parametrize(
    "long_description, is_valid",
    [
        pytest.param("A long description", True, id="a_valid_long_description"),
        pytest.param(
            # "very " repeated 800 times, framed by "A " and "long description"
            "A " + "very " * 800 + "long description",
            False,
            id="an_invalid_long_description",
        ),
    ],
)
def test_long_description_validation(long_description, is_valid):
    """Run the CLI with only a ``--long-description`` and check its validation.

    An accepted value lets ``main`` return 100; a rejected one must raise a
    ``ValueError`` whose message matches "Description is too long".
    """
    cli_args = ["--name", "test"]
    cli_args += ["--long-description", long_description]
    cli_args += ["--output", "./"]

    # Rejected value: the call itself is expected to blow up.
    if not is_valid:
        # NOTE(review): ``match`` is applied as a regex search, so this pattern
        # presumably hits a "LongDescription is too long" message as a
        # substring -- confirm against the validator's actual wording.
        with pytest.raises(ValueError, match="Description is too long"):
            main(cli_args)
        return

    # No inputs are passed on the command line, hence the 100 return code.
    assert main(cli_args) == 100


@pytest.mark.parametrize(
    "tags, is_valid",
    [
        pytest.param(["tag1", "tag2"], True, id="valid_tags"),
        # NOTE: there are no tests for invalid tags, since producing some is not
        # currently possible
    ],
)
def test_tags_validation(tags, is_valid):
    """Run the CLI with one ``--tags`` option per tag and check the outcome.

    Only the valid case is asserted: ``main`` must return 100. When
    ``is_valid`` is false the test does nothing.
    """
    # Each tag becomes its own "--tags <tag>" pair on the command line.
    tag_options = chain.from_iterable(["--tags", tag] for tag in tags)
    cli_args = ["--name", "test", *tag_options, "--output", "./"]

    if not is_valid:
        return

    # No inputs are passed on the command line, hence the 100 return code.
    assert main(cli_args) == 100
Loading