From 91337d1a0997c6acd3ff3e7c290ec84a01bf74fe Mon Sep 17 00:00:00 2001 From: benoit74 Date: Tue, 2 Jul 2024 06:26:03 +0000 Subject: [PATCH] Metadata length validation: count graphemes instead of unicode code points --- pyproject.toml | 4 ++++ src/zimscraperlib/zim/metadata.py | 16 +++++++++++++--- tests/zim/test_zim_creator.py | 6 +++--- 3 files changed, 20 insertions(+), 6 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 2efbab7c..f06851e1 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -18,6 +18,10 @@ dependencies = [ "beautifulsoup4>=4.9.3,<5.0", "lxml>=4.6.3,<6.0", "optimize-images>=1.3.6,<2.0", + # regex has no upper-bound due to "date-based" release numbers, no semver, so their + # promise is that they will never (or always) break the API, and the API is very + # limited and we use only a very small subset of it. + "regex>=2020.7.14", # youtube-dl should be updated as frequently as possible "yt-dlp" ] diff --git a/src/zimscraperlib/zim/metadata.py b/src/zimscraperlib/zim/metadata.py index a364fc6b..6d5ec7fc 100644 --- a/src/zimscraperlib/zim/metadata.py +++ b/src/zimscraperlib/zim/metadata.py @@ -5,6 +5,8 @@ from collections.abc import Iterable from typing import Any +import regex + from zimscraperlib.constants import ( ILLUSTRATIONS_METADATA_RE, MANDATORY_ZIM_METADATA_KEYS, @@ -16,6 +18,11 @@ from zimscraperlib.image.probing import is_valid_image +def nb_grapheme_for(value: str) -> int: + """Number of graphemes (visually perceived characters) in a given string""" + return len(regex.findall(r"\X", value)) + + def validate_required_values(name: str, value: Any): """ensures required ones have a value (spec doesnt requires it but makes sense)""" if name in MANDATORY_ZIM_METADATA_KEYS and not value: @@ -43,7 +50,7 @@ def validate_standard_str_types(name: str, value: str): def validate_title(name: str, value: str): """ensures Title metadata is within recommended length""" - if name == "Title" and len(value) > RECOMMENDED_MAX_TITLE_LENGTH: + if 
name == "Title" and nb_grapheme_for(value) > RECOMMENDED_MAX_TITLE_LENGTH: raise ValueError(f"{name} is too long.") @@ -83,7 +90,10 @@ def validate_counter(name: str, value: str): # noqa: ARG001 def validate_description(name: str, value: str): """ensures Description metadata is with required length""" - if name == "Description" and len(value) > MAXIMUM_DESCRIPTION_METADATA_LENGTH: + if ( + name == "Description" + and nb_grapheme_for(value) > MAXIMUM_DESCRIPTION_METADATA_LENGTH + ): raise ValueError(f"{name} is too long.") @@ -91,7 +101,7 @@ def validate_longdescription(name: str, value: str): """ensures LongDescription metadata is with required length""" if ( name == "LongDescription" - and len(value) > MAXIMUM_LONG_DESCRIPTION_METADATA_LENGTH + and nb_grapheme_for(value) > MAXIMUM_LONG_DESCRIPTION_METADATA_LENGTH ): raise ValueError(f"{name} is too long.") diff --git a/tests/zim/test_zim_creator.py b/tests/zim/test_zim_creator.py index 6c957a3e..e2a5a685 100644 --- a/tests/zim/test_zim_creator.py +++ b/tests/zim/test_zim_creator.py @@ -739,7 +739,7 @@ def test_config_metadata(tmp_path, png_image, tags): ("Flavour", 4, False), ("Source", 4, False), ("Scraper", 4, False), - ("Title", "X" * 30, True), + ("Title", "में" * 30, True), ("Title", "X" * 31, False), ("Date", 4, False), ("Date", datetime.datetime.now(), True), # noqa: DTZ005 @@ -762,9 +762,9 @@ def test_config_metadata(tmp_path, png_image, tags): ("Language", "eng,", False), ("Language", "eng, fra", False), ("Counter", "1", False), - ("Description", "X" * 80, True), + ("Description", "में" * 80, True), ("Description", "X" * 81, False), - ("LongDescription", "X" * 4000, True), + ("LongDescription", "में" * 4000, True), ("LongDescription", "X" * 4001, False), ("Tags", 4, False), ("Tags", ["wikipedia", 4, "football"], False),