Skip to content

Commit

Permalink
Metadata length validation: count graphemes instead of unicode code p…
Browse files Browse the repository at this point in the history
…oints
  • Loading branch information
benoit74 committed Jul 4, 2024
1 parent 7ab3fcd commit 91337d1
Show file tree
Hide file tree
Showing 3 changed files with 20 additions and 6 deletions.
4 changes: 4 additions & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,10 @@ dependencies = [
"beautifulsoup4>=4.9.3,<5.0",
"lxml>=4.6.3,<6.0",
"optimize-images>=1.3.6,<2.0",
# regex has no upper-bound due to "date-based" release numbers, no semver, so their
# promise is that they will never (or always) break the API, and the API is very
# limited and we use only a very small subset of it.
"regex>=2020.7.14",
# youtube-dl should be updated as frequently as possible
"yt-dlp"
]
Expand Down
16 changes: 13 additions & 3 deletions src/zimscraperlib/zim/metadata.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,8 @@
from collections.abc import Iterable
from typing import Any

import regex

from zimscraperlib.constants import (
ILLUSTRATIONS_METADATA_RE,
MANDATORY_ZIM_METADATA_KEYS,
Expand All @@ -16,6 +18,11 @@
from zimscraperlib.image.probing import is_valid_image


def nb_grapheme_for(value: str) -> int:
    """Count the graphemes (visually perceived characters) of a string

    A grapheme may span several unicode code points (e.g. a base letter followed
    by combining marks), hence this count can be lower than `len(value)`.
    """
    # each `\X` match is one grapheme cluster, so counting the matches is enough
    grapheme_clusters = regex.findall(r"\X", value)
    return len(grapheme_clusters)


def validate_required_values(name: str, value: Any):
"""ensures required ones have a value (spec doesnt requires it but makes sense)"""
if name in MANDATORY_ZIM_METADATA_KEYS and not value:
Expand Down Expand Up @@ -43,7 +50,7 @@ def validate_standard_str_types(name: str, value: str):

def validate_title(name: str, value: str):
"""ensures Title metadata is within recommended length"""
if name == "Title" and len(value) > RECOMMENDED_MAX_TITLE_LENGTH:
if name == "Title" and nb_grapheme_for(value) > RECOMMENDED_MAX_TITLE_LENGTH:
raise ValueError(f"{name} is too long.")


Expand Down Expand Up @@ -83,15 +90,18 @@ def validate_counter(name: str, value: str): # noqa: ARG001

def validate_description(name: str, value: str):
"""ensures Description metadata is with required length"""
if name == "Description" and len(value) > MAXIMUM_DESCRIPTION_METADATA_LENGTH:
if (
name == "Description"
and nb_grapheme_for(value) > MAXIMUM_DESCRIPTION_METADATA_LENGTH
):
raise ValueError(f"{name} is too long.")


def validate_longdescription(name: str, value: str):
"""ensures LongDescription metadata is with required length"""
if (
name == "LongDescription"
and len(value) > MAXIMUM_LONG_DESCRIPTION_METADATA_LENGTH
and nb_grapheme_for(value) > MAXIMUM_LONG_DESCRIPTION_METADATA_LENGTH
):
raise ValueError(f"{name} is too long.")

Expand Down
6 changes: 3 additions & 3 deletions tests/zim/test_zim_creator.py
Original file line number Diff line number Diff line change
Expand Up @@ -739,7 +739,7 @@ def test_config_metadata(tmp_path, png_image, tags):
("Flavour", 4, False),
("Source", 4, False),
("Scraper", 4, False),
("Title", "X" * 30, True),
("Title", "में" * 30, True),
("Title", "X" * 31, False),
("Date", 4, False),
("Date", datetime.datetime.now(), True), # noqa: DTZ005
Expand All @@ -762,9 +762,9 @@ def test_config_metadata(tmp_path, png_image, tags):
("Language", "eng,", False),
("Language", "eng, fra", False),
("Counter", "1", False),
("Description", "X" * 80, True),
("Description", "में" * 80, True),
("Description", "X" * 81, False),
("LongDescription", "X" * 4000, True),
("LongDescription", "में" * 4000, True),
("LongDescription", "X" * 4001, False),
("Tags", 4, False),
("Tags", ["wikipedia", 4, "football"], False),
Expand Down

0 comments on commit 91337d1

Please sign in to comment.