Skip to content

Metadata length validation: count graphemes #176

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 2 commits into from
Jul 4, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,10 @@ dependencies = [
"beautifulsoup4>=4.9.3,<5.0",
"lxml>=4.6.3,<6.0",
"optimize-images>=1.3.6,<2.0",
# regex has no upper bound because it uses "date-based" release numbers rather than
# semver, so their promise is that they will never (or always) break the API; the API
# is very limited and we use only a very small subset of it.
"regex>=2020.7.14",
# youtube-dl should be updated as frequently as possible
"yt-dlp"
]
Expand Down
5 changes: 3 additions & 2 deletions src/zimscraperlib/image/probing.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
import io
import pathlib
import re
from typing import IO

import colorthief
import PIL.Image
Expand Down Expand Up @@ -52,7 +53,7 @@ def is_hex_color(text: str) -> bool:


def format_for(
src: pathlib.Path | io.BytesIO,
src: pathlib.Path | IO[bytes],
from_suffix: bool = True, # noqa: FBT001, FBT002
) -> str:
"""Pillow format of a given filename, either Pillow-detected or from suffix"""
Expand All @@ -70,7 +71,7 @@ def format_for(


def is_valid_image(
image: pathlib.Path | io.IOBase | bytes,
image: pathlib.Path | IO[bytes] | bytes,
imformat: str,
size: tuple[int, int] | None = None,
) -> bool:
Expand Down
16 changes: 13 additions & 3 deletions src/zimscraperlib/zim/metadata.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,8 @@
from collections.abc import Iterable
from typing import Any

import regex

from zimscraperlib.constants import (
ILLUSTRATIONS_METADATA_RE,
MANDATORY_ZIM_METADATA_KEYS,
Expand All @@ -16,6 +18,11 @@
from zimscraperlib.image.probing import is_valid_image


def nb_grapheme_for(value: str) -> int:
    """Count the graphemes (visually perceived characters) contained in `value`"""
    # every match of the \X pattern is counted as one grapheme
    grapheme_matches = regex.finditer(r"\X", value)
    return sum(1 for _ in grapheme_matches)


def validate_required_values(name: str, value: Any):
"""ensures required ones have a value (spec doesnt requires it but makes sense)"""
if name in MANDATORY_ZIM_METADATA_KEYS and not value:
Expand Down Expand Up @@ -43,7 +50,7 @@ def validate_standard_str_types(name: str, value: str):

def validate_title(name: str, value: str):
"""ensures Title metadata is within recommended length"""
if name == "Title" and len(value) > RECOMMENDED_MAX_TITLE_LENGTH:
if name == "Title" and nb_grapheme_for(value) > RECOMMENDED_MAX_TITLE_LENGTH:
raise ValueError(f"{name} is too long.")


Expand Down Expand Up @@ -83,15 +90,18 @@ def validate_counter(name: str, value: str): # noqa: ARG001

def validate_description(name: str, value: str):
"""ensures Description metadata is with required length"""
if name == "Description" and len(value) > MAXIMUM_DESCRIPTION_METADATA_LENGTH:
if (
name == "Description"
and nb_grapheme_for(value) > MAXIMUM_DESCRIPTION_METADATA_LENGTH
):
raise ValueError(f"{name} is too long.")


def validate_longdescription(name: str, value: str):
"""ensures LongDescription metadata is with required length"""
if (
name == "LongDescription"
and len(value) > MAXIMUM_LONG_DESCRIPTION_METADATA_LENGTH
and nb_grapheme_for(value) > MAXIMUM_LONG_DESCRIPTION_METADATA_LENGTH
):
raise ValueError(f"{name} is too long.")

Expand Down
6 changes: 3 additions & 3 deletions tests/zim/test_zim_creator.py
Original file line number Diff line number Diff line change
Expand Up @@ -739,7 +739,7 @@ def test_config_metadata(tmp_path, png_image, tags):
("Flavour", 4, False),
("Source", 4, False),
("Scraper", 4, False),
("Title", "X" * 30, True),
("Title", "में" * 30, True),
("Title", "X" * 31, False),
("Date", 4, False),
("Date", datetime.datetime.now(), True), # noqa: DTZ005
Expand All @@ -762,9 +762,9 @@ def test_config_metadata(tmp_path, png_image, tags):
("Language", "eng,", False),
("Language", "eng, fra", False),
("Counter", "1", False),
("Description", "X" * 80, True),
("Description", "में" * 80, True),
("Description", "X" * 81, False),
("LongDescription", "X" * 4000, True),
("LongDescription", "में" * 4000, True),
("LongDescription", "X" * 4001, False),
("Tags", 4, False),
("Tags", ["wikipedia", 4, "football"], False),
Expand Down
Loading