Skip to content

Commit

Permalink
Merge pull request #176 from openzim/graphemes
Browse files Browse the repository at this point in the history
Metadata length validation: count graphemes
  • Loading branch information
benoit74 authored Jul 4, 2024
2 parents 7ab3fcd + a2c1892 commit 6aa458b
Show file tree
Hide file tree
Showing 4 changed files with 23 additions and 8 deletions.
4 changes: 4 additions & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,10 @@ dependencies = [
"beautifulsoup4>=4.9.3,<5.0",
"lxml>=4.6.3,<6.0",
"optimize-images>=1.3.6,<2.0",
# regex has no upper-bound due to "date-based" release numbers, no semver, so their
# promise is that they will never (or always) break the API, and the API is very
# limited and we use only a very small subset of it.
"regex>=2020.7.14",
# youtube-dl should be updated as frequently as possible
"yt-dlp"
]
Expand Down
5 changes: 3 additions & 2 deletions src/zimscraperlib/image/probing.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
import io
import pathlib
import re
from typing import IO

import colorthief
import PIL.Image
Expand Down Expand Up @@ -52,7 +53,7 @@ def is_hex_color(text: str) -> bool:


def format_for(
src: pathlib.Path | io.BytesIO,
src: pathlib.Path | IO[bytes],
from_suffix: bool = True, # noqa: FBT001, FBT002
) -> str:
"""Pillow format of a given filename, either Pillow-detected or from suffix"""
Expand All @@ -70,7 +71,7 @@ def format_for(


def is_valid_image(
image: pathlib.Path | io.IOBase | bytes,
image: pathlib.Path | IO[bytes] | bytes,
imformat: str,
size: tuple[int, int] | None = None,
) -> bool:
Expand Down
16 changes: 13 additions & 3 deletions src/zimscraperlib/zim/metadata.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,8 @@
from collections.abc import Iterable
from typing import Any

import regex

from zimscraperlib.constants import (
ILLUSTRATIONS_METADATA_RE,
MANDATORY_ZIM_METADATA_KEYS,
Expand All @@ -16,6 +18,11 @@
from zimscraperlib.image.probing import is_valid_image


def nb_grapheme_for(value: str) -> int:
    """Count the graphemes (visually perceived characters) of a string

    Uses the `\\X` pattern of the `regex` module, so a sequence of several
    code points rendered as one character is counted once.
    """
    grapheme_clusters = regex.findall(r"\X", value)
    return len(grapheme_clusters)


def validate_required_values(name: str, value: Any):
"""ensures required ones have a value (spec doesnt requires it but makes sense)"""
if name in MANDATORY_ZIM_METADATA_KEYS and not value:
Expand Down Expand Up @@ -43,7 +50,7 @@ def validate_standard_str_types(name: str, value: str):

def validate_title(name: str, value: str):
"""ensures Title metadata is within recommended length"""
if name == "Title" and len(value) > RECOMMENDED_MAX_TITLE_LENGTH:
if name == "Title" and nb_grapheme_for(value) > RECOMMENDED_MAX_TITLE_LENGTH:
raise ValueError(f"{name} is too long.")


Expand Down Expand Up @@ -83,15 +90,18 @@ def validate_counter(name: str, value: str): # noqa: ARG001

def validate_description(name: str, value: str):
"""ensures Description metadata is with required length"""
if name == "Description" and len(value) > MAXIMUM_DESCRIPTION_METADATA_LENGTH:
if (
name == "Description"
and nb_grapheme_for(value) > MAXIMUM_DESCRIPTION_METADATA_LENGTH
):
raise ValueError(f"{name} is too long.")


def validate_longdescription(name: str, value: str):
"""ensures LongDescription metadata is with required length"""
if (
name == "LongDescription"
and len(value) > MAXIMUM_LONG_DESCRIPTION_METADATA_LENGTH
and nb_grapheme_for(value) > MAXIMUM_LONG_DESCRIPTION_METADATA_LENGTH
):
raise ValueError(f"{name} is too long.")

Expand Down
6 changes: 3 additions & 3 deletions tests/zim/test_zim_creator.py
Original file line number Diff line number Diff line change
Expand Up @@ -739,7 +739,7 @@ def test_config_metadata(tmp_path, png_image, tags):
("Flavour", 4, False),
("Source", 4, False),
("Scraper", 4, False),
("Title", "X" * 30, True),
("Title", "में" * 30, True),
("Title", "X" * 31, False),
("Date", 4, False),
("Date", datetime.datetime.now(), True), # noqa: DTZ005
Expand All @@ -762,9 +762,9 @@ def test_config_metadata(tmp_path, png_image, tags):
("Language", "eng,", False),
("Language", "eng, fra", False),
("Counter", "1", False),
("Description", "X" * 80, True),
("Description", "में" * 80, True),
("Description", "X" * 81, False),
("LongDescription", "X" * 4000, True),
("LongDescription", "में" * 4000, True),
("LongDescription", "X" * 4001, False),
("Tags", 4, False),
("Tags", ["wikipedia", 4, "football"], False),
Expand Down

0 comments on commit 6aa458b

Please sign in to comment.