From cb8b7e827f2751fb648aceb3cdc0bde8b6fb2005 Mon Sep 17 00:00:00 2001 From: benoit74 Date: Fri, 22 Nov 2024 07:46:45 +0000 Subject: [PATCH 01/13] Significantly enhance the safety of metadata manipulation --- CHANGELOG.md | 18 + src/zimscraperlib/constants.py | 29 -- src/zimscraperlib/inputs.py | 5 + src/zimscraperlib/zim/creator.py | 324 +++++++--------- src/zimscraperlib/zim/filesystem.py | 47 +-- src/zimscraperlib/zim/metadata.py | 558 ++++++++++++++++++++++------ tests/inputs/test_inputs.py | 12 +- tests/zim/conftest.py | 9 + tests/zim/test_metadata.py | 449 ++++++++++++++++++++-- tests/zim/test_zim_creator.py | 386 ++++++++++--------- 10 files changed, 1279 insertions(+), 558 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index e287837e..7aefd01c 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -11,6 +11,24 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - Renamed `filesystem.validate_zimfile_creatable` to `filesystem.file_creatable` to reflect general applicability to check file creation beyond ZIM files #200 - Remove any "ZIM" reference in exceptions while working with files #200 +- Significantly enhance the safety of metadata manipulation (#205) + - add types for all metadata, one type per metadata name plus some generic ones for non-standard metadata + - all types are responsible to validate metadata value at initialization time + - validation checks for adherence to the ZIM specification and conventions are automated + - cleanup of unwanted control characters and stripping white characters are **automated in all text metadata** + - whenever possible, try to **automatically clean a "reasonably" bad metadata** (e.g. automaticall accept and remove duplicate tags - harmless - but not duplicate language codes - codes are supposed to be ordered, so it is a weird situation) ; this is an alignment of paradigm, because for some metadata the lib was permissive, while for other it was quite restrictive ; this PR tries to align this and **make the lib as permissive as possible**, avoiding to fail a scraper for something which could be automatically fixed + - it is now possible to disable ZIM conventions checks with `zim.metadata.check_metadata_conventions` + - simplify `zim.creator.Creator.config_metadata` by using these types and been more strict: + - add new `StandardMetadata` class for standard metadata, including list of mandatory one + - by default, all non-standard metadata must start with `X-` prefix + - this not yet an openZIM convention / specification, so it is possible to disable this check with `fail_on_missing_prefix` argument + - simplify `add_metadata`, use same metadata types + - simplify `zim.creator.Creator.start` with new types, and drop all metadata from memory after being passed to the libzim + - drop `zim.creator.convert_and_check_metadata` (not usefull anymore, simply use proper metadata type) + - move `MANDATORY_ZIM_METADATA_KEYS` and `DEFAULT_DEV_ZIM_METADATA` from `constants` to `zim.metadata` to avoid circular dependencies + - new `inputs.unique_values` utility function to compute the list of uniques values from a given list, but preserving initial list order + - in `__init__` of `zim.creator.Creator`, rename `disable_metadata_checks` to `check_metadata_conventions` for clarity and brevity + - beware that this manipulate the global `zim.metadata.check_metadata_conventions`, so if you have many creator running in parallel, they can't have different settings, last one initialized will "win" ### Added diff --git a/src/zimscraperlib/constants.py b/src/zimscraperlib/constants.py index 4e601121..c2a89ebd 100644 --- a/src/zimscraperlib/constants.py +++ b/src/zimscraperlib/constants.py @@ -1,7 +1,6 @@ #!/usr/bin/env python3 # vim: ai ts=4 sts=4 et sw=4 nu -import base64 import pathlib import re @@ -22,34 +21,6 @@ # list of mimetypes we consider articles using it should default to FRONT_ARTICLE FRONT_ARTICLE_MIMETYPES = ["text/html"] -# list of mandatory meta tags of the zim file. -MANDATORY_ZIM_METADATA_KEYS = [ - "Name", - "Title", - "Creator", - "Publisher", - "Date", - "Description", - "Language", - "Illustration_48x48@1", -] - -DEFAULT_DEV_ZIM_METADATA = { - "Name": "Test Name", - "Title": "Test Title", - "Creator": "Test Creator", - "Publisher": "Test Publisher", - "Date": "2023-01-01", - "Description": "Test Description", - "Language": "fra", - # blank 48x48 transparent PNG - "Illustration_48x48_at_1": base64.b64decode( - "iVBORw0KGgoAAAANSUhEUgAAADAAAAAwAQMAAABtzGvEAAAAGXRFWHRTb2Z0d2FyZQBB" - "ZG9iZSBJbWFnZVJlYWR5ccllPAAAAANQTFRFR3BMgvrS0gAAAAF0Uk5TAEDm2GYAAAAN" - "SURBVBjTY2AYBdQEAAFQAAGn4toWAAAAAElFTkSuQmCC" - ), -} - RECOMMENDED_MAX_TITLE_LENGTH = 30 MAXIMUM_DESCRIPTION_METADATA_LENGTH = 80 MAXIMUM_LONG_DESCRIPTION_METADATA_LENGTH = 4000 diff --git a/src/zimscraperlib/inputs.py b/src/zimscraperlib/inputs.py index 438a9fc0..23fee6b8 100644 --- a/src/zimscraperlib/inputs.py +++ b/src/zimscraperlib/inputs.py @@ -136,3 +136,8 @@ def compute_tags( return { tag.strip() for tag in list(default_tags) + (user_tags or "").split(";") if tag } + + +def unique_values(items: list) -> list: + """Return unique values in input list while preserving list order""" + return list(dict.fromkeys(items)) diff --git a/src/zimscraperlib/zim/creator.py b/src/zimscraperlib/zim/creator.py index a4558b8d..e0046c87 100644 --- a/src/zimscraperlib/zim/creator.py +++ b/src/zimscraperlib/zim/creator.py @@ -19,25 +19,20 @@ from __future__ import annotations -import datetime import io import logging import pathlib import re import weakref -from collections.abc import Callable, Iterable +from collections.abc import Callable from typing import Any import libzim.writer # pyright: ignore import PIL.Image -import regex # pyright: ignore [reportMissingModuleSource] +import zimscraperlib.zim.metadata from zimscraperlib import logger -from zimscraperlib.constants import ( - DEFAULT_DEV_ZIM_METADATA, - FRONT_ARTICLE_MIMETYPES, - MANDATORY_ZIM_METADATA_KEYS, -) +from zimscraperlib.constants import FRONT_ARTICLE_MIMETYPES from zimscraperlib.filesystem import ( delete_callback, get_content_mimetype, @@ -48,16 +43,12 @@ from zimscraperlib.zim.indexing import IndexData from zimscraperlib.zim.items import StaticItem from zimscraperlib.zim.metadata import ( - validate_counter, - validate_date, - validate_description, - validate_illustrations, - validate_language, - validate_longdescription, - validate_required_values, - validate_standard_str_types, - validate_tags, - validate_title, + DEFAULT_DEV_ZIM_METADATA, + MANDATORY_ZIM_METADATA_KEYS, + IllustrationMetadata, + LanguageMetadata, + Metadata, + StandardMetadataList, ) DUPLICATE_EXC_STR = re.compile( @@ -67,9 +58,6 @@ re.MULTILINE | re.DOTALL, ) -# All control characters are disallowed in str metadata except \n, \r and \t -UNWANTED_CONTROL_CHARACTERS_REGEX = regex.compile(r"(?![\n\t\r])\p{C}") - def mimetype_for( path: str, @@ -112,8 +100,8 @@ class Creator(libzim.writer.Creator): Meaning you should exit right after an exception in your code (during zim creation) Use workaround_nocancel=False to disable the workaround. - By default, all metadata are validated for compliance with openZIM guidelines and - conventions. Set disable_metadata_checks=True to disable this validation (you can + By default, all metadata are validated for compliance with openZIM specification and + conventions. Set check_metadata_conventions=True to disable this validation (you can still do checks manually with the validation methods or your own logic). """ @@ -125,10 +113,10 @@ def __init__( *, workaround_nocancel: bool | None = True, ignore_duplicates: bool | None = False, - disable_metadata_checks: bool = False, + check_metadata_conventions: bool = True, ): super().__init__(filename=filename) - self._metadata = {} + self._metadata: dict[str, Metadata] = {} self.__indexing_configured = False self.can_finish = True @@ -143,7 +131,9 @@ def __init__( self.workaround_nocancel = workaround_nocancel self.ignore_duplicates = ignore_duplicates - self.disable_metadata_checks = disable_metadata_checks + zimscraperlib.zim.metadata.check_metadata_conventions = ( + check_metadata_conventions + ) def config_indexing( self, indexing: bool, language: str | None = None # noqa: FBT001 @@ -151,7 +141,7 @@ def config_indexing( """Toggle full-text and title indexing of entries Uses Language metadata's value (or "") if not set""" - language = language or self._metadata.get("Language", "") + language = language or self._get_first_language_metadata_value() or "" if indexing and not is_valid_iso_639_3(language): raise ValueError("Not a valid ISO-639-3 language code") super().config_indexing(indexing, language) @@ -159,217 +149,185 @@ def config_indexing( return self def _log_metadata(self): - """Log (DEBUG) all metadata set on (_metadata ~ config_metadata()) + """Log in DEBUG level all metadata key and value""" + for name, metadata in sorted(self._metadata.items()): + + if metadata is None: + logger.debug(f"Metadata: {name} is None") + continue + + if not hasattr(metadata, "value"): + logger.debug( + f"Metadata: {name} is improper metadata type: " + f"{metadata.__class__.__qualname__}: {metadata!s}" + ) + continue - Does not log metadata set post-start (via add_metadata())""" - for name, value in sorted(self._metadata.items()): # illustration mandates an Image if re.match(r"^Illustration_(\d+)x(\d+)@(\d+)$", name): try: - with PIL.Image.open(io.BytesIO(value)) as img: + with PIL.Image.open(io.BytesIO(metadata.libzim_value)) as img: logger.debug( - f"Metadata: {name} is a {len(value)} bytes " + f"Metadata: {name} is a {len(metadata.libzim_value)} bytes " f"{img.size[0]}x{img.size[1]}px {img.format} Image" ) - except Exception: - logger.debug( - f"Metadata: {name} is a {len(value)} bytes " - f"{get_content_mimetype(value[:64])} blob " - "not recognized as an Image" - ) - continue + continue + except Exception: # noqa: S110 # nosec B110 + pass # bytes are either encoded string or arbitrary data - if isinstance(value, bytes): - mimetype = get_content_mimetype(value[:64]) + if isinstance(metadata.value, bytes | io.BytesIO): + raw_value = ( + metadata.value + if isinstance(metadata.value, bytes) + else metadata.value.getvalue() + ) + mimetype = get_content_mimetype(raw_value[:64]) if not mimetype.startswith("text/"): logger.debug( - f"Metadata: {name} is a {len(value)} bytes {mimetype} blob" + f"Metadata: {name} is a {len(raw_value)} bytes {mimetype} blob" ) continue try: - logger.debug(f"Metadata: {name} = {value.decode('UTF-8')}") + logger.debug(f"Metadata: {name} = {raw_value.decode('UTF-8')}") except Exception: logger.debug( - f"Metadata: {name} is a {len(value)} bytes {mimetype} blob " + f"Metadata: {name} is a {len(raw_value)} bytes {mimetype} blob " "not decodable as an UTF-8 string" ) continue # rest is either printable or unexpected try: - logger.debug(f"Metadata: {name} = {value!s}") + logger.debug(f"Metadata: {name} = {metadata.value!s}") except Exception: logger.debug( - f"Metadata: {name} is unexpected data type: {type(value).__name__}" + f"Metadata: {name} is unexpected data type: " + f"{type(metadata.value).__qualname__}" ) + def _get_first_language_metadata_value(self) -> str | None: + """Private methods to get most popular lang code from Language metadata""" + for metadata in self._metadata.values(): + if isinstance(metadata, LanguageMetadata): + return metadata.libzim_value.decode().split(",")[0] + return None + def start(self): + """Start creator operation at libzim level + + Includes creating metadata, including validating mandatory ones are set, + and configuring indexing. + """ if logger.isEnabledFor(logging.DEBUG): # pragma: no cover self._log_metadata() if not all(self._metadata.get(key) for key in MANDATORY_ZIM_METADATA_KEYS): - raise ValueError("Mandatory metadata are not all set.") - - if not self.disable_metadata_checks: - for name, value in self._metadata.items(): - if value: - self.validate_metadata(name, value) + missing_keys = [ + key + for key in MANDATORY_ZIM_METADATA_KEYS + if not self._metadata.get(key) + ] + raise ValueError( + "Mandatory metadata are not all set. Missing metadata: " + f'{",".join(missing_keys)}. You should prefer to use ' + "StandardMetadataList if possible." + ) - language = self._metadata.get("Language", "").split(",") - if language[0] and not self.__indexing_configured: - self.config_indexing(True, language[0]) + if ( + language := self._get_first_language_metadata_value() + ) and not self.__indexing_configured: + self.config_indexing(True, language) super().__enter__() - self.add_illustration(48, self._metadata["Illustration_48x48@1"]) - del self._metadata["Illustration_48x48@1"] - for name, value in self._metadata.items(): - if value: - self.add_metadata(name, self.convert_and_check_metadata(name, value)) + for metadata in self._metadata.values(): + if isinstance(metadata, IllustrationMetadata): + self.add_illustration(metadata.size, metadata.libzim_value) + else: + self.add_metadata(metadata) + self._metadata.clear() return self - def validate_metadata( + def add_metadata( self, - name: str, - value: bytes | str, + value: Metadata, + mimetype: str = "text/plain;charset=UTF-8", ): - """Ensures metadata value for name is conform with the openZIM spec on Metadata - - Also enforces recommendations - See https://wiki.openzim.org/wiki/Metadata""" + """Really add the metadata to the ZIM, after ZIM creation has started. - validate_required_values(name, value) - validate_standard_str_types(name, value) + You would probably prefer to use config_metadata methods to check metadata + before starting the ZIM, ensure all mandatory metadata are set, and avoid + duplicate metadata name. + """ - validate_title(name, value) # pyright: ignore - validate_date(name, value) # pyright: ignore - validate_language(name, value) # pyright: ignore - validate_counter(name, value) # pyright: ignore - validate_description(name, value) # pyright: ignore - validate_longdescription(name, value) # pyright: ignore - validate_tags(name, value) # pyright: ignore - validate_illustrations(name, value) # pyright: ignore + super().add_metadata(value.name, value.libzim_value, mimetype) - def convert_and_check_metadata( + def config_metadata( self, - name: str, - value: str | bytes | datetime.date | datetime.datetime | Iterable[str], - ) -> str | bytes: - """Convert metadata to appropriate type for few known usecase and check type + std_metadata: StandardMetadataList | list[Metadata], + extra_metadata: list[Metadata] | None = None, + *, + fail_on_missing_prefix_in_extras: bool = True, + ): + """Checks and prepare list of ZIM metadata - Date: converts date and datetime to string YYYY-MM-DD - Tags: converts iterable to string with semi-colon separator + Checks ensure that metadata value can be converted to bytes, including all + requirements of the ZIM specifications and optionally openZIM conventions. - Also checks that final type is appropriate for libzim (str or bytes) - """ - if name == "Date" and isinstance(value, datetime.date | datetime.datetime): - value = value.strftime("%Y-%m-%d") - if ( - name == "Tags" - and not isinstance(value, str) - and not isinstance(value, bytes) - and isinstance(value, Iterable) - ): - value = ";".join(value) - - if not isinstance(value, str) and not isinstance(value, bytes): - raise ValueError(f"Invalid type for {name}: {type(value)}") + Metadata are only kept in memory at this stage, not yet passed to libzim. - return value + They will be passed to libzim / writen to the ZIM on creator.start(). - def add_metadata( - self, - name: str, - value: str | bytes, - mimetype: str = "text/plain;charset=UTF-8", - ): - # drop control characters before passing them to libzim - if isinstance(value, str): - value = UNWANTED_CONTROL_CHARACTERS_REGEX.sub("", value).strip(" \r\n\t") - if not self.disable_metadata_checks: - self.validate_metadata(name, value) + Arguments: + std_metadata: standard metadata defined in the ZIM specifications. + Prefer to use StandardMetadataList which ensure mandatory metadata are + all set. + extra_metadata: a list of extra metadata (not standard). + fail_on_missing_prefix_in_extras: disable the default check which force the + X- prefix on extra metadata name which is a convention to distinguish + these extra metadata - super().add_metadata(name, value, mimetype) + """ + for fail_on_missing_prefix, metadata in [ + (False, metadata) + for metadata in ( + std_metadata.values() + if isinstance(std_metadata, StandardMetadataList) + else std_metadata + ) + ] + [ + (fail_on_missing_prefix_in_extras, metadata) + for metadata in extra_metadata or [] + ]: + if fail_on_missing_prefix and not metadata.name.startswith("X-"): + raise ValueError( + f"Metadata key {metadata.name} does not starts with X- as expected" + ) + # if metadata.name in self._metadata: + # raise ValueError(f"{metadata.name} cannot be defined twice") + self._metadata[metadata.name] = metadata - # there are many N803 problems, but they are intentional to match real tag name - def config_metadata( - self, - *, - Name: str, # noqa: N803 - Language: str, # noqa: N803 - Title: str, # noqa: N803 - Description: str, # noqa: N803 - LongDescription: str | None = None, # noqa: N803 - Creator: str, # noqa: N803 - Publisher: str, # noqa: N803 - Date: datetime.datetime | datetime.date | str, # noqa: N803 - Illustration_48x48_at_1: bytes, # noqa: N803 - Tags: Iterable[str] | str | None = None, # noqa: N803 - Scraper: str | None = None, # noqa: N803 - Flavour: str | None = None, # noqa: N803 - Source: str | None = None, # noqa: N803 - License: str | None = None, # noqa: N803 - Relation: str | None = None, # noqa: N803 - **extras: ( - None - | float - | int - | bytes - | str - | datetime.datetime - | datetime.date - | Iterable[str] - ), - ): - """Sets all mandatory Metadata as well as standard and any other text ones""" - self._metadata.update( - { - "Name": Name, - "Title": Title, - "Creator": Creator, - "Publisher": Publisher, - "Date": Date, - "Description": Description, - "Language": Language, - "License": License, - "LongDescription": LongDescription, - "Tags": Tags, - "Relation": Relation, - "Flavour": Flavour, - "Source": Source, - "Scraper": Scraper, - "Illustration_48x48@1": Illustration_48x48_at_1, - } - ) - self._metadata.update(extras) - for metadata_key, metadata_value in self._metadata.items(): - # drop control characters so that proper value is stored in memory and - # logged in DEBUG mode ; also strip blank characters - if isinstance(metadata_value, str): - self._metadata[metadata_key] = UNWANTED_CONTROL_CHARACTERS_REGEX.sub( - "", metadata_value - ).strip(" \r\n\t") return self def config_dev_metadata( self, - **extras: ( - None - | int - | float - | bytes - | str - | datetime.datetime - | datetime.date - | Iterable[str] - ), + extra_metadata: Metadata | list[Metadata] | None = None, ): - """Calls config_metadata with default (yet overridable) values for dev""" - devel_default_metadata = DEFAULT_DEV_ZIM_METADATA.copy() - devel_default_metadata.update(extras) - return self.config_metadata(**devel_default_metadata) + """Calls minimal set of mandatory metadata with default values for dev + + Extra metadata can be passed, and they are not checked for proper key prefix + """ + return self.config_metadata( + std_metadata=DEFAULT_DEV_ZIM_METADATA, + extra_metadata=( + [extra_metadata] + if isinstance(extra_metadata, Metadata) + else extra_metadata + ), + fail_on_missing_prefix_in_extras=False, + ) def add_item_for( self, diff --git a/src/zimscraperlib/zim/filesystem.py b/src/zimscraperlib/zim/filesystem.py index bdf0125a..6cbc73e3 100644 --- a/src/zimscraperlib/zim/filesystem.py +++ b/src/zimscraperlib/zim/filesystem.py @@ -38,6 +38,7 @@ from zimscraperlib.filesystem import get_file_mimetype from zimscraperlib.html import find_title_in_file from zimscraperlib.types import get_mime_for_name +from zimscraperlib.zim import metadata from zimscraperlib.zim.creator import Creator from zimscraperlib.zim.items import StaticItem @@ -169,29 +170,31 @@ def make_zim_file( filename=fpath, main_path=main_page, ignore_duplicates=ignore_duplicates, - disable_metadata_checks=disable_metadata_checks, + check_metadata_conventions=disable_metadata_checks, ).config_metadata( - **{ - k: v - for k, v in { - # (somewhat) mandatory - "Name": name, - "Title": title, - "Description": description, - "Date": date or datetime.date.today(), # noqa: DTZ011 - "Language": language, - "Creator": creator, - "Publisher": publisher, - # optional - "Tags": ";".join(tags) if tags else None, - "Source": source, - "Flavour": flavour, - "Scraper": scraper, - "LongDescription": long_description, - "Illustration_48x48_at_1": illustration_data, - }.items() - if v is not None - } + metadata.StandardMetadataList( + # mandatory + Name=metadata.NameMetadata(name), + Title=metadata.TitleMetadata(title), + Description=metadata.DescriptionMetadata(description), + Date=metadata.DateMetadata(date or datetime.date.today()), # noqa: DTZ011 + Language=metadata.LanguageMetadata(language), + Creator=metadata.CreatorMetadata(creator), + Publisher=metadata.PublisherMetadata(publisher), + Illustration_48x48_at_1=metadata.IllustrationMetadata( + "Illustration_48x48@1", illustration_data + ), + # optional + Tags=metadata.TagsMetadata(list(tags)) if tags else None, + Source=metadata.SourceMetadata(source) if source else None, + Flavour=metadata.FlavourMetadata(flavour) if flavour else None, + Scraper=metadata.ScraperMetadata(scraper) if scraper else None, + LongDescription=( + metadata.LongDescriptionMetadata(long_description) + if long_description + else None + ), + ) ) zim_file.start() diff --git a/src/zimscraperlib/zim/metadata.py b/src/zimscraperlib/zim/metadata.py index 411e42f6..0fe4f508 100644 --- a/src/zimscraperlib/zim/metadata.py +++ b/src/zimscraperlib/zim/metadata.py @@ -1,21 +1,34 @@ from __future__ import annotations +import base64 import datetime +import io import re -from collections.abc import Iterable -from typing import Any +from dataclasses import dataclass import regex from zimscraperlib.constants import ( ILLUSTRATIONS_METADATA_RE, - MANDATORY_ZIM_METADATA_KEYS, MAXIMUM_DESCRIPTION_METADATA_LENGTH, MAXIMUM_LONG_DESCRIPTION_METADATA_LENGTH, RECOMMENDED_MAX_TITLE_LENGTH, ) from zimscraperlib.i18n import is_valid_iso_639_3 from zimscraperlib.image.probing import is_valid_image +from zimscraperlib.inputs import unique_values + +# All control characters are disallowed in str metadata except \n, \r and \t +UNWANTED_CONTROL_CHARACTERS_REGEX = regex.compile(r"(?![\n\t\r])\p{C}") + +# flag to enable / disable check conventions (e.g. title shorter than 30 chars, +# description shorter than 80 chars, ...) +check_metadata_conventions: bool = True + + +def clean_str(value: str) -> str: + """Clean a string value for unwanted control characters and strip white chars""" + return UNWANTED_CONTROL_CHARACTERS_REGEX.sub("", value).strip(" \r\n\t") def nb_grapheme_for(value: str) -> int: @@ -23,123 +36,436 @@ def nb_grapheme_for(value: str) -> int: return len(regex.findall(r"\X", value)) -def validate_required_values(name: str, value: Any): - """ensures required ones have a value (spec doesnt requires it but makes sense)""" - if name in MANDATORY_ZIM_METADATA_KEYS and not value: - raise ValueError(f"Missing value for {name}") - - -def validate_standard_str_types( - name: str, - value: str | bytes, -): - """ensures standard string metadata are indeed str""" - if name in ( - "Name", - "Title", - "Creator", - "Publisher", - "Description", - "LongDescription", - "License", - "Relation", - "Relation", - "Flavour", - "Source", - "Scraper", - ) and not isinstance(value, str): - raise ValueError(f"Invalid type for {name}: {type(value)}") - - -def validate_title(name: str, value: str): - """ensures Title metadata is within recommended length""" - if name == "Title" and nb_grapheme_for(value) > RECOMMENDED_MAX_TITLE_LENGTH: - raise ValueError(f"{name} is too long.") - - -def validate_date(name: str, value: datetime.datetime | datetime.date | str): - """ensures Date metadata can be casted to an ISO 8601 string""" - if name == "Date": - if not isinstance(value, datetime.datetime | datetime.date | str): - raise ValueError(f"Invalid type for {name}: {type(value)}") - elif isinstance(value, str): - match = re.match(r"(?P\d{4})-(?P\d{2})-(?P\d{2})", value) - if not match: - raise ValueError(f"Invalid {name} format, not matching regex") - try: - datetime.date(**{k: int(v) for k, v in match.groupdict().items()}) - except Exception as exc: - raise ValueError(f"Invalid {name} format: {exc}") from None - - -def validate_language(name: str, value: Iterable[str] | str): - """ensures Language metadata is a single or list of ISO-639-3 codes""" - if name == "Language": - if isinstance(value, str): - value = value.split(",") - for code in value: - if not is_valid_iso_639_3(code): - raise ValueError(f"{code} is not ISO-639-3.") +class Metadata: + """A very basic metadata, with a name and a bytes or io.BytesIO value + + Compliance with ZIM specification is done at initialisation, value passed at + initialization is kept in memory in `value` attribute, target name is stored in + `name` value and conversion to libzim value is available with `libzim_value` + property. + """ + + def __init__(self, name: str, value: bytes | io.BytesIO) -> None: + if not isinstance(value, bytes | io.BytesIO): + raise ValueError( + f"Unexpected type passed to {self.__class__.__qualname__}: " + f"{value.__class__.__qualname__}" + ) + if name == "Counter": + raise ValueError("Counter cannot be set. libzim sets it.") + self.name = name + self.value = value + libzim_value = self.libzim_value # to check for errors + if libzim_value is None or len(libzim_value) == 0: + raise ValueError("Cannot set empty metadata") + + @property + def libzim_value(self) -> bytes: + """The value to pass to the libzim for creating the metadata""" + if isinstance(self.value, io.BytesIO): + return self.value.getvalue() + return self.value -def validate_counter(name: str, _: str): - """ensures Counter metadata is not manually set""" - if name == "Counter": - raise ValueError(f"{name} cannot be set. libzim sets it.") - - -def validate_description(name: str, value: str): - """ensures Description metadata is with required length""" - if ( - name == "Description" - and nb_grapheme_for(value) > MAXIMUM_DESCRIPTION_METADATA_LENGTH - ): - raise ValueError(f"{name} is too long.") - - -def validate_longdescription(name: str, value: str): - """ensures LongDescription metadata is with required length""" - if ( - name == "LongDescription" - and nb_grapheme_for(value) > MAXIMUM_LONG_DESCRIPTION_METADATA_LENGTH - ): - raise ValueError(f"{name} is too long.") - - -def validate_tags(name: str, value: Iterable[str] | str): - """ensures Tags metadata is either one or a list of strings""" - if name == "Tags" and ( - not isinstance(value, Iterable) - or not all(isinstance(tag, str) for tag in value) - ): - raise ValueError(f"Invalid type(s) for {name}") - if ( - name == "Tags" - and not isinstance(value, str) - and isinstance(value, Iterable) - and len(set(value)) != len(list(value)) - ): - raise ValueError(f"Duplicate tags are not valid: {value}") - if name == "Tags" and isinstance(value, str): - values = value.split(";") - if len(set(values)) != len(list(values)): - raise ValueError(f"Duplicate tags are not valid: {value}") - - -def validate_illustrations(name: str, value: bytes): - """ensures illustrations are PNG images or the advertised size""" +class _TextMetadata(Metadata): + """A basic metadata whose value is expected to be some text""" + + def __init__(self, name: str, value: bytes | io.BytesIO | str) -> None: + if not isinstance(value, bytes | io.BytesIO | str): + raise ValueError( + f"Unexpected type passed to {self.__class__.__qualname__}: " + f"{value.__class__.__qualname__}" + ) + super().__init__( + name=name, + value=( + value.encode() + if isinstance(value, str) + else value.getvalue() if isinstance(value, io.BytesIO) else value + ), + ) + self.value = value + _ = self.libzim_value # to check for errors + + @property + def libzim_value(self) -> bytes: + """The value to pass to the libzim for creating the metadata""" + # decode and reencode byte types to validate it's proper UTF-8 text + if isinstance(self.value, io.BytesIO): + str_value = self.value.getvalue().decode() + elif isinstance(self.value, bytes): + str_value = self.value.decode() + else: + str_value = self.value + return clean_str(str_value).encode() + + +def _check_for_allowed_custom_metadata_name(name: str, class_name: str): + """Check that metadata name is not among the standard ones for which a type exist""" + if name in DEFAULT_DEV_ZIM_METADATA.__dict__.keys(): + # this list contains the 'bad' illustration keys, but we don't care, they should + # still not be used + raise ValueError( + f"It is not allowed to use {class_name} for standard {name} " + f"metadata. Please use {name}Metadata type for proper checks" + ) if name.startswith("Illustration_"): - match = ILLUSTRATIONS_METADATA_RE.match(name) - if match and not is_valid_image( - image=value, - imformat="PNG", - size=( - int(match.groupdict()["width"]), - int(match.groupdict()["height"]), + raise ValueError( + f"It is not allowed to use {class_name} for standard Illustration" + "metadata. Please use IllustrationMetadata type for proper checks" + ) + + +class CustomTextMetadata(_TextMetadata): + """A text metadata for which we do little checks""" + + def __init__(self, name: str, value: bytes | io.BytesIO | str) -> None: + _check_for_allowed_custom_metadata_name(name, self.__class__.__qualname__) + super().__init__(name=name, value=value) + + +class CustomMetadata(Metadata): + """A bytes metadata for which we do little checks""" + + def __init__(self, name: str, value: bytes | io.BytesIO) -> None: + _check_for_allowed_custom_metadata_name(name, self.__class__.__qualname__) + super().__init__(name=name, value=value) + + +class _MandatoryTextMetadata(_TextMetadata): + """A mandatory (value must be set) text metadata""" + + @property + def libzim_value(self) -> bytes: + """The value to pass to the libzim for creating the metadata""" + value = super().libzim_value + if len(value) == 0: + raise ValueError("Missing value for mandatory metadata") + return value + + +class _MandatoryMetadata(Metadata): + """A mandatory (value must be set) bytes metadata""" + + @property + def libzim_value(self) -> bytes: + """The value to pass to the libzim for creating the metadata""" + value = super().libzim_value + if len(value) == 0: + raise ValueError("Missing value for mandatory metadata") + return value + + +class TitleMetadata(_MandatoryTextMetadata): + """The Title metadata""" + + def __init__(self, value: bytes | io.BytesIO | str) -> None: + super().__init__(name="Title", value=value) + + @property + def libzim_value(self) -> bytes: + """The value to pass to the libzim for creating the metadata""" + value = super().libzim_value + if check_metadata_conventions: + if nb_grapheme_for(value.decode()) > RECOMMENDED_MAX_TITLE_LENGTH: + raise ValueError("Title is too long.") + return value + + +class DescriptionMetadata(_MandatoryTextMetadata): + """The Description metadata""" + + def __init__(self, value: bytes | io.BytesIO | str) -> None: + super().__init__(name="Description", value=value) + + @property + def libzim_value(self) -> bytes: + """The value to pass to the libzim for creating the metadata""" + value = super().libzim_value + if check_metadata_conventions: + if nb_grapheme_for(value.decode()) > MAXIMUM_DESCRIPTION_METADATA_LENGTH: + raise ValueError("Description is too long.") + return value + + +class LongDescriptionMetadata(_MandatoryTextMetadata): + """The LongDescription metadata""" + + def __init__(self, value: bytes | io.BytesIO | str) -> None: + super().__init__(name="LongDescription", value=value) + + @property + def libzim_value(self) -> bytes: + """The value to pass to the libzim for creating the metadata""" + value = super().libzim_value + if check_metadata_conventions: + if ( + nb_grapheme_for(value.decode()) + > MAXIMUM_LONG_DESCRIPTION_METADATA_LENGTH + ): + raise ValueError("LongDescription is too long.") + return value + + +class DateMetadata(_TextMetadata): + """The Date metadata""" + + def __init__(self, value: bytes | str | datetime.date | datetime.datetime) -> None: + if not isinstance(value, bytes | str | datetime.date | datetime.datetime): + raise ValueError( + f"Unexpected type passed to {self.__class__.__qualname__}: " + f"{value.__class__.__qualname__}" + ) + super().__init__( + name="Date", + value=( + value + if isinstance(value, bytes) + else ( + value.encode() + if isinstance(value, str) + else value.strftime("%Y-%m-%d").encode() + ) ), + ) + self.value = value + _ = self.libzim_value # to check for errors + + @property + def libzim_value(self) -> bytes: + """The value to pass to the libzim for creating the metadata""" + if isinstance(self.value, datetime.date | datetime.datetime): + return self.value.strftime("%Y-%m-%d").encode() + match = re.match( + r"(?P\d{4})-(?P\d{2})-(?P\d{2})", + self.value.decode() if isinstance(self.value, bytes) else self.value, + ) + if not match: + raise ValueError("Invalid date format, not matching regex yyyy-mm-dd") + try: + datetime.date(**{k: int(v) for k, v in match.groupdict().items()}) + except Exception as exc: + raise ValueError(f"Invalid date format: {exc}") from None + return self.value if isinstance(self.value, bytes) else self.value.encode() + + +class IllustrationMetadata(_MandatoryMetadata): + """Any Illustration_**x**@* metadata""" + + def __init__(self, name: str, value: bytes | io.BytesIO) -> None: + super().__init__(name=name, value=value) + _ = self.libzim_value # to check for errors + + @property + def libzim_value(self) -> bytes: + """The value to pass to the libzim for creating the metadata""" + match = ILLUSTRATIONS_METADATA_RE.match(self.name) + if not match: + raise ValueError("Illustration metadata has improper name") + self.size = int(match.groupdict()["height"]) + if int(match.groupdict()["width"]) != self.size: + raise ValueError("Illustration is not squared") + if not is_valid_image( + image=self.value, + imformat="PNG", + size=(self.size, self.size), ): raise ValueError( - f"{name} is not a " - f"{match.groupdict()['width']}x{match.groupdict()['height']} " - "PNG Image" + f"{self.name} is not a valid {self.size}x{self.size} PNG Image" ) + if isinstance(self.value, io.BytesIO): + return self.value.getvalue() + else: + return self.value + + +class LanguageMetadata(_MandatoryTextMetadata): + """The Language metadata""" + + def __init__(self, value: bytes | io.BytesIO | str | list[str] | set[str]) -> None: + if not isinstance(value, bytes | io.BytesIO | str | list | set): + raise ValueError( + f"Unexpected type passed to {self.__class__.__qualname__}: " + f"{value.__class__.__qualname__}" + ) + if isinstance(value, list | set) and not all( + isinstance(item, str) for item in value + ): + bad_types = {item.__class__.__qualname__ for item in value} - {"str"} + raise ValueError( + f"Invalid type(s) found in Iterable: {",".join(bad_types)}" + ) + super().__init__( + name="Language", + value=",".join(value) if isinstance(value, list | set) else value, + ) + self.value = value + _ = self.libzim_value # to check for errors + + @property + def libzim_value(self) -> bytes: + """The value to pass to the libzim for creating the metadata""" + if isinstance(self.value, bytes | io.BytesIO | str): + codes = [ + clean_str(code) for code in super().libzim_value.decode().split(",") + ] + else: + codes = [clean_str(code) for code in self.value] + codes = [code for code in codes if code] # remove empty values + if len(codes) == 0: + raise ValueError("Missing value for mandatory Language metadata") + if len(set(codes)) != len(codes): + raise ValueError("Duplicate codes not allowed in Language metadata") + for code in codes: + if not is_valid_iso_639_3(code): + raise ValueError(f"{code} is not ISO-639-3.") + return ",".join(codes).encode() + + +class TagsMetadata(_TextMetadata): + """The Tags metadata""" + + def __init__(self, value: bytes | io.BytesIO | str | list[str] | set[str]) -> None: + if not isinstance(value, bytes | io.BytesIO | str | list | set): + raise ValueError( + f"Unexpected type passed to {self.__class__.__qualname__}: " + f"{value.__class__.__qualname__}" + ) + if isinstance(value, list | set) and not all( + isinstance(item, str) for item in value + ): + bad_types = {item.__class__.__qualname__ for item in value} - {"str"} + raise ValueError( + f"Invalid type(s) found in Iterable: {",".join(bad_types)}" + ) + super().__init__( + name="Tags", + value=";".join(value) if isinstance(value, list | set) else value, + ) + self.value = value + _ = self.libzim_value # to check for errors + + @property + def libzim_value(self) -> bytes: + """The value to pass to the libzim for creating the metadata""" + if isinstance(self.value, bytes | io.BytesIO | str): + tags = unique_values( + [clean_str(tag) for tag in super().libzim_value.decode().split(";")] + ) + else: + tags = unique_values([clean_str(tag) for tag in self.value]) + tags = [tag for tag in tags if tag] # remove empty values + return ";".join(tags).encode() + + +class NameMetadata(_MandatoryTextMetadata): + """The Name metadata""" + + def __init__(self, value: bytes | io.BytesIO | str) -> None: + super().__init__("Name", value) + + +class CreatorMetadata(_MandatoryTextMetadata): + """The Creator metadata""" + + def __init__(self, value: bytes | io.BytesIO | str) -> None: + super().__init__("Creator", value) + + +class PublisherMetadata(_MandatoryTextMetadata): + """The Publisher metadata""" + + def __init__(self, value: bytes | io.BytesIO | str) -> None: + super().__init__("Publisher", value) + + +class ScraperMetadata(_TextMetadata): + """The Scraper metadata""" + + def __init__(self, value: bytes | io.BytesIO | str) -> None: + super().__init__("Scraper", value) + + +class FlavourMetadata(_TextMetadata): + """The Flavour metadata""" + + def __init__(self, value: bytes | io.BytesIO | str) -> None: + super().__init__("Flavour", value) + + +class SourceMetadata(_TextMetadata): + """The Source metadata""" + + def __init__(self, value: bytes | io.BytesIO | str) -> None: + super().__init__("Source", value) + + +class LicenseMetadata(_TextMetadata): + """The License metadata""" + + def __init__(self, value: bytes | io.BytesIO | str) -> None: + super().__init__("License", value) + + +class RelationMetadata(_TextMetadata): + """The Relation metadata""" + + def __init__(self, value: bytes | io.BytesIO | str) -> None: + super().__init__("Relation", value) + + +@dataclass +class StandardMetadataList: + """A class holding all openZIM standard metadata + + Useful to ensure that all mandatory metadata are set, no typo occurs in metadata + name and the specification is respected (forbidden duplicate values, ...). + """ + + Name: NameMetadata + Language: LanguageMetadata + Title: TitleMetadata + Creator: CreatorMetadata + Publisher: PublisherMetadata + Date: DateMetadata + Illustration_48x48_at_1: IllustrationMetadata + Description: DescriptionMetadata + LongDescription: LongDescriptionMetadata | None = None + Tags: TagsMetadata | None = None + Scraper: ScraperMetadata | None = None + Flavour: FlavourMetadata | None = None + Source: SourceMetadata | None = None + License: LicenseMetadata | None = None + Relation: RelationMetadata | None = None + + def values(self) -> list[Metadata]: + return [v for v in self.__dict__.values() if v] + + +# A sample standard metadata list, to be used for dev purposes when one does not care +# at all about metadata values ; this ensures all metadata are compliant with the spec +DEFAULT_DEV_ZIM_METADATA = StandardMetadataList( + Name=NameMetadata("Test Name"), + Title=TitleMetadata("Test Title"), + Creator=CreatorMetadata("Test Creator"), + Publisher=PublisherMetadata("Test Publisher"), + Date=DateMetadata("2023-01-01"), + Description=DescriptionMetadata("Test Description"), + Language=LanguageMetadata("fra"), + # blank 48x48 transparent PNG + Illustration_48x48_at_1=IllustrationMetadata( + "Illustration_48x48@1", + base64.b64decode( + "iVBORw0KGgoAAAANSUhEUgAAADAAAAAwAQMAAABtzGvEAAAAGXRFWHRTb2Z0d2FyZQBB" + "ZG9iZSBJbWFnZVJlYWR5ccllPAAAAANQTFRFR3BMgvrS0gAAAAF0Uk5TAEDm2GYAAAAN" + "SURBVBjTY2AYBdQEAAFQAAGn4toWAAAAAElFTkSuQmCC" + ), + ), +) + +# list of mandatory metadata of the zim file, automatically computed +MANDATORY_ZIM_METADATA_KEYS = [ + metadata.name + for metadata in DEFAULT_DEV_ZIM_METADATA.__dict__.values() + if isinstance(metadata, _MandatoryTextMetadata | _MandatoryMetadata) +] diff --git a/tests/inputs/test_inputs.py b/tests/inputs/test_inputs.py index fd8a2cda..016b3371 100644 --- a/tests/inputs/test_inputs.py +++ b/tests/inputs/test_inputs.py @@ -9,18 +9,18 @@ import zimscraperlib from zimscraperlib.constants import CONTACT -from zimscraperlib.constants import ( - MAXIMUM_DESCRIPTION_METADATA_LENGTH as MAX_DESC_LENGTH, -) -from zimscraperlib.constants import ( - MAXIMUM_LONG_DESCRIPTION_METADATA_LENGTH as MAX_LONG_DESC_LENGTH, -) from zimscraperlib.constants import NAME as PROJECT_NAME from zimscraperlib.inputs import ( compute_descriptions, compute_tags, handle_user_provided_file, ) +from zimscraperlib.zim.metadata import ( + MAXIMUM_DESCRIPTION_METADATA_LENGTH as MAX_DESC_LENGTH, +) +from zimscraperlib.zim.metadata import ( + MAXIMUM_LONG_DESCRIPTION_METADATA_LENGTH as MAX_LONG_DESC_LENGTH, +) def test_with_none(): diff --git a/tests/zim/conftest.py b/tests/zim/conftest.py index 1efb5129..eeaf4544 100644 --- a/tests/zim/conftest.py +++ b/tests/zim/conftest.py @@ -3,6 +3,8 @@ import pytest +import zimscraperlib.zim.metadata + @pytest.fixture(scope="function") def html_str(): @@ -101,3 +103,10 @@ def counters(): "application/xml": 1, "image/svg+xml": 5, } + + +@pytest.fixture +def ignore_metadata_conventions(): + zimscraperlib.zim.metadata.check_metadata_conventions = False + yield + zimscraperlib.zim.metadata.check_metadata_conventions = True diff --git a/tests/zim/test_metadata.py b/tests/zim/test_metadata.py index fce742ea..24cdc8e3 100644 --- a/tests/zim/test_metadata.py +++ b/tests/zim/test_metadata.py @@ -1,7 +1,12 @@ from __future__ import annotations +import base64 +import dataclasses +import datetime +import io +import pathlib import re -from collections.abc import Iterable +from typing import NamedTuple import pytest @@ -9,55 +14,437 @@ @pytest.mark.parametrize( - "name, value", + "value", [ - ("Language", "fra"), - ("Language", "fra,eng"), - ("Language", ["fra", "eng"]), - ("Other", "not_an_iso_639_3_code"), + ("fra"), + ("eng"), + ("bam"), + ("fra,eng"), + ("eng, fra"), + ("fra,eng,bam"), + (["fra", "eng"]), + ("fra, eng"), # codes are automatically cleaned-up + ("eng, \r"), # codes are automatically cleaned-up + (["fra", " "]), # codes are automatically cleaned-up ], ) -def test_validate_language_valid(name: str, value: Iterable[str] | str): - metadata.validate_language(name, value) +def test_validate_language_valid(value: list[str] | str): + assert metadata.LanguageMetadata(value) @pytest.mark.parametrize( - "name, value", + "value,error", [ - ("Language", "fr"), - ("Language", "fra;eng"), - ("Language", "fra, eng"), + ("", "Missing value for mandatory metadata"), + ("fr", "is not ISO-639-3"), + ("en", "is not ISO-639-3"), + ("xxx", "is not ISO-639-3"), + ("rmr", "is not ISO-639-3"), + ("fra,en,bam", "is not ISO-639-3"), + ("fra;eng", "is not ISO-639-3"), + ("fra,fra", "Duplicate codes not allowed in Language metadata"), + (["fra", "fra"], "Duplicate codes not allowed in Language metadata"), + ([], "Missing value for mandatory metadata"), + ([" ", "\t"], "Missing value for mandatory Language metadata"), + (" , ", "Missing value for mandatory Language metadata"), + (["fra", 1], "Invalid type(s) found in Iterable:"), + (["fra", b"1"], "Invalid type(s) found in Iterable:"), + (["fra", 1, b"1"], "Invalid type(s) found in Iterable:"), ], ) -def test_validate_language_invalid(name: str, value: Iterable[str] | str): - with pytest.raises(ValueError, match=re.escape("is not ISO-639-3")): - metadata.validate_language(name, value) +def test_validate_language_invalid(value: list[str] | str, error: str): + with pytest.raises(ValueError, match=re.escape(error)): + assert metadata.LanguageMetadata(value) @pytest.mark.parametrize( "tags, is_valid", [ - pytest.param("", True, id="empty_string"), - pytest.param("tag1", True, id="empty_string"), + pytest.param("wikipedia", True, id="simple_tag"), pytest.param("taaaag1", True, id="many_letters"), - pytest.param("tag1;tag2", True, id="semi_colon_distinct_1"), - pytest.param("tag2;tag2", False, id="semi_colon_identical"), + pytest.param("wikipedia; ", True, id="remove_empty_values"), + pytest.param("wikipedia ; football", True, id="semi_colon_distinct_1"), + pytest.param("football;football", True, id="semi_colon_identical"), pytest.param("tag,1;tug,1", True, id="semi_colon_distinct_2"), pytest.param( - "tag1,tag2", True, id="comma" + "wikipedia,football", True, id="comma" ), # we cannot say that this ought to be a tags separator - pytest.param({"tag1"}, True, id="one_tag_in_set"), - pytest.param({"tag1", "tag2"}, True, id="two_tags_in_set"), - pytest.param(1, False, id="one_int"), - pytest.param(None, False, id="none_value"), - pytest.param(["tag1", "tag2"], True, id="two_distinct"), - pytest.param(["tag1", "tag1"], False, id="two_identical"), - pytest.param(["tag1", 1], False, id="int_in_list"), + pytest.param({"wikipedia"}, True, id="one_tag_in_set"), + pytest.param({"wikipedia", "football"}, True, id="two_tags_in_set"), + pytest.param(["wikipedia", "football"], True, id="two_distinct"), + pytest.param(["wikipedia", "wikipedia"], True, id="two_identical"), ], ) -def test_validate_tags(tags, is_valid): +def test_validate_tags_valid(tags, is_valid): if is_valid: - metadata.validate_tags("Tags", tags) + metadata.TagsMetadata(tags) else: - with pytest.raises(ValueError): - metadata.validate_tags("Tags", tags) + with pytest.raises((ValueError, TypeError)): + metadata.TagsMetadata(tags) + + +@pytest.mark.parametrize( + "value, error", + [ + pytest.param("", "Cannot set empty metadata", id="empty_string"), + pytest.param(1, "Unexpected type passed to TagsMetadata: int", id="one_int"), + pytest.param( + None, "Unexpected type passed to TagsMetadata: NoneType", id="none_value" + ), + pytest.param( + ["wikipedia", 1], "Invalid type(s) found in Iterable: int", id="int_in_list" + ), + ], +) +def test_validate_tags_invalid(value: list[str] | str, error: str): + with pytest.raises(ValueError, match=re.escape(error)): + metadata.TagsMetadata(value) + + +def test_validate_dedup_tags(): + assert ( + metadata.TagsMetadata(" wikipedia \t ; wikipedia").libzim_value + == b"wikipedia" + ) + + +def test_validate_short_title_check_enabled(): + assert metadata.TitleMetadata("T" * 30) + + +def test_validate_short_grapheme_title_check_enabled(): + assert metadata.TitleMetadata("में" * 30) + + +def test_validate_too_long_title_check_enabled(): + with pytest.raises(ValueError, match="Title is too long"): + assert metadata.TitleMetadata("T" * 31) + + +def test_validate_too_long_title_check_disabled( + ignore_metadata_conventions, # noqa: ARG001 +): + assert metadata.TitleMetadata("T" * 31) + + +def test_validate_short_description_check_enabled(): + assert metadata.DescriptionMetadata("T" * 80) + + +def test_validate_short_grapheme_description_check_enabled(): + assert metadata.DescriptionMetadata("में" * 80) + + +def test_validate_too_long_description_check_enabled(): + with pytest.raises(ValueError, match="Description is too long"): + assert metadata.DescriptionMetadata("T" * 81) + + +def test_validate_too_long_description_check_disabled( + ignore_metadata_conventions, # noqa: ARG001 +): + assert metadata.DescriptionMetadata("T" * 81) + + +def test_validate_short_longdescription_check_enabled(): + assert metadata.LongDescriptionMetadata("T" * 4000) + + +def test_validate_short_grapheme_longdescription_check_enabled(): + assert metadata.LongDescriptionMetadata("में" * 4000) + + +def test_validate_too_long_longdescription_check_enabled(): + with pytest.raises(ValueError, match="Description is too long"): + assert metadata.LongDescriptionMetadata("T" * 4001) + + +def test_validate_too_long_longdescription_check_disabled( + ignore_metadata_conventions, # noqa: ARG001 +): + assert metadata.LongDescriptionMetadata("T" * 4001) + + +def test_validate_date_datetime_date(): + assert metadata.DateMetadata(datetime.date(2024, 12, 11)) + + +def test_validate_date_datetime_datetime(): + assert metadata.DateMetadata( + datetime.datetime(2024, 12, 11, 0, 0, 0, tzinfo=datetime.UTC) + ) + + +@pytest.mark.parametrize("value", [("9999-99-99"), ("2023/02/29"), ("1969-13-31")]) +def test_validate_date_invalid_date(value): + with pytest.raises(ValueError, match="Invalid date format"): + metadata.DateMetadata(value) + + +def test_validate_illustration_invalid_name(): + with pytest.raises(ValueError, match="Illustration metadata has improper name"): + metadata.IllustrationMetadata("Illustration_48x48_at_1", b"") + + +def test_validate_illustration_not_squared(): + with pytest.raises(ValueError, match="Illustration is not squared"): + metadata.IllustrationMetadata("Illustration_48x96@1", b"") + + +def test_validate_illustration_wrong_sizes(png_image2): + with open(png_image2, "rb") as fh: + png_data = fh.read() + with pytest.raises( + ValueError, match="Illustration_48x48@1 is not a valid 48x48 PNG Image" + ): + metadata.IllustrationMetadata("Illustration_48x48@1", png_data) + + +def test_blank_metadata(): + with pytest.raises(ValueError, match="Cannot set empty metadata"): + metadata.Metadata("Blank", b"") + + +class MetadataInitConfig(NamedTuple): + a_type: type + nb_args: int + + +@pytest.fixture( + params=[ + MetadataInitConfig(metadata.Metadata, 2), + MetadataInitConfig(metadata._TextMetadata, 2), + MetadataInitConfig(metadata.CustomTextMetadata, 2), + MetadataInitConfig(metadata.CustomMetadata, 2), + MetadataInitConfig(metadata._MandatoryTextMetadata, 2), + MetadataInitConfig(metadata._MandatoryMetadata, 2), + MetadataInitConfig(metadata.TitleMetadata, 1), + MetadataInitConfig(metadata.DescriptionMetadata, 1), + MetadataInitConfig(metadata.LongDescriptionMetadata, 1), + MetadataInitConfig(metadata.DateMetadata, 1), + MetadataInitConfig(metadata.IllustrationMetadata, 2), + MetadataInitConfig(metadata.LanguageMetadata, 1), + MetadataInitConfig(metadata.TagsMetadata, 1), + MetadataInitConfig(metadata.NameMetadata, 1), + MetadataInitConfig(metadata.CreatorMetadata, 1), + MetadataInitConfig(metadata.PublisherMetadata, 1), + MetadataInitConfig(metadata.ScraperMetadata, 1), + MetadataInitConfig(metadata.FlavourMetadata, 1), + MetadataInitConfig(metadata.SourceMetadata, 1), + MetadataInitConfig(metadata.LicenseMetadata, 1), + MetadataInitConfig(metadata.RelationMetadata, 1), + ] +) +def metadata_init(request: pytest.FixtureRequest) -> MetadataInitConfig: + return request.param + + +def test_none_metadata_two_args(metadata_init: MetadataInitConfig): + with pytest.raises( + ValueError, + match=f"Unexpected type passed to {metadata_init.a_type.__qualname__}: " + "NoneType", + ): + if metadata_init.nb_args == 2: + metadata_init.a_type( + "Foo", + None, # pyright: ignore[reportArgumentType] + ) + elif metadata_init.nb_args == 1: + metadata_init.a_type(None) + + +@pytest.mark.parametrize( + "a_type", + [ + (metadata.Metadata), + (metadata.CustomTextMetadata), + (metadata._TextMetadata), + (metadata._MandatoryMetadata), + ], +) +def test_reserved_counter_metadata(a_type: type): + with pytest.raises(ValueError, match="Counter cannot be set. libzim sets it."): + a_type("Counter", b"Foo") + + +@pytest.mark.parametrize( + "metadata_name", + [ + ("Name"), + ("Language"), + ("Title"), + ("Creator"), + ("Publisher"), + ("Date"), + ("Illustration_48x48@1"), + ("Illustration_96x96@1"), + ("Description"), + ("LongDescription"), + ("Tags"), + ("Scraper"), + ("Flavour"), + ("Source"), + ("License"), + ("Relation"), + ], +) +@pytest.mark.parametrize( + "metadata_type", + [ + (metadata.CustomMetadata), + (metadata.CustomTextMetadata), + ], +) +def test_reserved_names(metadata_name: str, metadata_type: type): + with pytest.raises(ValueError, match="It is not allowed to use"): + metadata_type(metadata_name, b"a value") + + +def test_mandatory_value(metadata_init: MetadataInitConfig): + + with pytest.raises( + ValueError, + match=( + "is not a valid 48x48 PNG Image" + if metadata_init.a_type == metadata.IllustrationMetadata + else ( + "Invalid date format, not matching regex" + if metadata_init.a_type == metadata.DateMetadata + else "Cannot set empty metadata|Missing value for mandatory metadata" + ) + ), + ): + if metadata_init.nb_args == 2: + metadata_init.a_type( + ( + "Foo" + if metadata_init.a_type != metadata.IllustrationMetadata + else "Illustration_48x48@1" + ), + b"", + ) + elif metadata_init.nb_args == 1: + metadata_init.a_type(b"") + + +def test_clean_value(metadata_init: MetadataInitConfig): + raw_value = b"\t\n\r\n \tA val \t\awith \bcontrol chars\v\n" + clean_value = b"A val \twith control chars" + if metadata_init.a_type in ( + # some types do not support the raw value + metadata.IllustrationMetadata, + metadata.LanguageMetadata, + metadata.DateMetadata, + # and binary types are not cleaned-up + metadata.Metadata, + metadata.CustomMetadata, + metadata._MandatoryMetadata, + ): + pytest.skip("unsupported configuration") # ignore these test cases + if metadata_init.nb_args == 2: + assert metadata_init.a_type(("Foo"), raw_value).libzim_value == clean_value + elif metadata_init.nb_args == 1: + assert metadata_init.a_type(raw_value).libzim_value == clean_value + + +# @pytest.mark.parametrize( +# "metadata, expected_value", +# [ +# (metadata.Metadata("Foo", b"a value"), b"a value"), +# (metadata.Metadata("Foo", io.BytesIO(b"a value")), b"a value"), +# (metadata.DescriptionMetadata(io.BytesIO(b"a value")), b"a value"), +# (metadata.DescriptionMetadata(b"a value"), b"a value"), +# (metadata.DescriptionMetadata("a value"), b"a value"), +# (metadata.IllustrationMetadata(png), b"a value"), +# ], +# ) +def test_libzim_bytes_value(metadata_init: MetadataInitConfig, png_image: pathlib.Path): + if metadata_init.nb_args == 2: + if metadata_init.a_type == metadata.IllustrationMetadata: + with open(png_image, "rb") as fh: + png_data = fh.read() + assert ( + metadata.IllustrationMetadata( + "Illustration_48x48@1", png_data + ).libzim_value + == png_data + ) + else: + assert metadata_init.a_type("Foo", b"a value").libzim_value == b"a value" + elif metadata_init.nb_args == 1: + if metadata_init.a_type == metadata.DateMetadata: + assert metadata_init.a_type(b"2023-12-11").libzim_value == b"2023-12-11" + elif metadata_init.a_type == metadata.LanguageMetadata: + assert metadata_init.a_type(b"fra,eng").libzim_value == b"fra,eng" + else: + assert metadata_init.a_type(b"a value").libzim_value == b"a value" + + +def test_libzim_io_bytesio_value( + metadata_init: MetadataInitConfig, png_image: pathlib.Path +): + if metadata_init.nb_args == 2: + if metadata_init.a_type == metadata.IllustrationMetadata: + with open(png_image, "rb") as fh: + png_data = fh.read() + assert ( + metadata.IllustrationMetadata( + "Illustration_48x48@1", io.BytesIO(png_data) + ).libzim_value + == png_data + ) + else: + assert ( + metadata_init.a_type("Foo", io.BytesIO(b"a value")).libzim_value + == b"a value" + ) + elif metadata_init.nb_args == 1: + if metadata_init.a_type == metadata.DateMetadata: + pass # Not supported + elif metadata_init.a_type == metadata.LanguageMetadata: + assert ( + metadata_init.a_type(io.BytesIO(b"fra,eng")).libzim_value == b"fra,eng" + ) + else: + assert ( + metadata_init.a_type(io.BytesIO(b"a value")).libzim_value == b"a value" + ) + + +def test_std_metadata_values(): + test_value = dataclasses.replace(metadata.DEFAULT_DEV_ZIM_METADATA) + test_value.License = metadata.LicenseMetadata("Creative Commons CC0") + values = test_value.values() + assert len(values) == 9 + + expected_values: list[metadata.Metadata] = [ + metadata.LicenseMetadata("Creative Commons CC0"), + metadata.NameMetadata("Test Name"), + metadata.TitleMetadata("Test Title"), + metadata.CreatorMetadata("Test Creator"), + metadata.PublisherMetadata("Test Publisher"), + metadata.DateMetadata("2023-01-01"), + metadata.DescriptionMetadata("Test Description"), + metadata.LanguageMetadata("fra"), + # blank 48x48 transparent PNG + metadata.IllustrationMetadata( + "Illustration_48x48@1", + base64.b64decode( + "iVBORw0KGgoAAAANSUhEUgAAADAAAAAwAQMAAABtzGvEAAAAGXRFWHRTb2Z0d2FyZQBB" + "ZG9iZSBJbWFnZVJlYWR5ccllPAAAAANQTFRFR3BMgvrS0gAAAAF0Uk5TAEDm2GYAAAAN" + "SURBVBjTY2AYBdQEAAFQAAGn4toWAAAAAElFTkSuQmCC" + ), + ), + ] + for value in values: + found = False + for expected_value in expected_values: + if value.__class__ == expected_value.__class__: + assert value.value == expected_value.value + found = True + break + if not found: + pytest.fail( + f"Did not found matching expected values for {value.name} metadata" + ) diff --git a/tests/zim/test_zim_creator.py b/tests/zim/test_zim_creator.py index 17425f02..2fd7f3f2 100644 --- a/tests/zim/test_zim_creator.py +++ b/tests/zim/test_zim_creator.py @@ -3,8 +3,6 @@ from __future__ import annotations -import base64 -import datetime import io import logging import pathlib @@ -14,19 +12,37 @@ import sys import tempfile import time +from typing import NamedTuple from unittest.mock import call, patch import pytest from libzim.writer import Compression # pyright: ignore -from zimscraperlib.constants import ( - DEFAULT_DEV_ZIM_METADATA, - MANDATORY_ZIM_METADATA_KEYS, - UTF8, -) +from zimscraperlib.constants import UTF8 from zimscraperlib.download import save_large_file, stream_file from zimscraperlib.filesystem import delete_callback from zimscraperlib.zim import Archive, Creator, StaticItem, URLItem +from zimscraperlib.zim.metadata import ( + DEFAULT_DEV_ZIM_METADATA, + CreatorMetadata, + CustomMetadata, + CustomTextMetadata, + DateMetadata, + DescriptionMetadata, + FlavourMetadata, + IllustrationMetadata, + LanguageMetadata, + LicenseMetadata, + LongDescriptionMetadata, + Metadata, + NameMetadata, + PublisherMetadata, + ScraperMetadata, + SourceMetadata, + StandardMetadataList, + TagsMetadata, + TitleMetadata, +) from zimscraperlib.zim.providers import FileLikeProvider, URLProvider @@ -60,7 +76,7 @@ def test_zim_creator(tmp_path, png_image, html_file, html_str: str, html_str_cn: with open(png_image, "rb") as fh: png_data = fh.read() with Creator(fpath, main_path).config_dev_metadata( - Tags=tags, Illustration_48x48_at_1=png_data + [TagsMetadata(tags), IllustrationMetadata("Illustration_48x48@1", png_data)] ) as creator: # verbatim HTML from string creator.add_item_for("welcome", "wel", content=html_str, is_front=True) @@ -92,8 +108,10 @@ def test_zim_creator(tmp_path, png_image, html_file, html_str: str, html_str_cn: assert fpath.exists() reader = Archive(fpath) - assert reader.get_text_metadata("Title") == DEFAULT_DEV_ZIM_METADATA["Title"] - assert reader.get_text_metadata("Language") == DEFAULT_DEV_ZIM_METADATA["Language"] + assert reader.get_text_metadata("Title") == DEFAULT_DEV_ZIM_METADATA.Title.value + assert ( + reader.get_text_metadata("Language") == DEFAULT_DEV_ZIM_METADATA.Language.value + ) assert reader.get_text_metadata("Tags") == tags assert reader.main_entry.get_item().path == f"{main_path}" # make sure we have our image @@ -134,7 +152,7 @@ def test_create_without_workaround(tmp_path): def test_noindexlanguage(tmp_path): fpath = tmp_path / "test.zim" - creator = Creator(fpath, "welcome").config_dev_metadata(Language="bam") + creator = Creator(fpath, "welcome").config_dev_metadata(LanguageMetadata("bam")) creator.config_indexing(False) with creator as creator: creator.add_item(StaticItem(path="welcome", content="hello")) @@ -536,20 +554,6 @@ def test_without_metadata(tmp_path): Creator(tmp_path, "").start() -def test_check_metadata(tmp_path): - with pytest.raises(ValueError, match="Counter cannot be set"): - Creator(tmp_path, "").config_dev_metadata(Counter=1).start() - - with pytest.raises(ValueError, match="Invalid type for Foo"): - Creator(tmp_path, "").config_dev_metadata(Foo=1).start() - - with pytest.raises(ValueError, match="Description is too long."): - Creator(tmp_path, "").config_dev_metadata(Description="T" * 90).start() - - with pytest.raises(ValueError, match="LongDescription is too long."): - Creator(tmp_path, "").config_dev_metadata(LongDescription="T" * 5000).start() - - @pytest.mark.parametrize( "tags", [ @@ -575,42 +579,62 @@ def test_start_logs_metadata_log_contents(mocked_logger, png_image, tags, tmp_pa fpath = tmp_path / "test_config.zim" with open(png_image, "rb") as fh: png_data = fh.read() - creator = Creator(fpath, "", disable_metadata_checks=True).config_metadata( - Name="wikipedia_fr_football", - Title="English Wikipedia", - Creator="English speaking Wikipedia contributors", - Publisher="Wikipedia user Foobar", - Date="2009-11-21", - Description="All articles (without images) from the english Wikipedia", - LongDescription="This ZIM file contains all articles (without images)" - " from the english Wikipedia by 2009-11-10. The topics are...", - Language="eng", - License="CC-BY", - Tags=tags, - Flavour="nopic", - Source="https://en.wikipedia.org/", - Scraper="mwoffliner 1.2.3", - Illustration_48x48_at_1=png_data, - TestMetadata="Test Metadata", + creator = Creator(fpath, "", check_metadata_conventions=False).config_metadata( + StandardMetadataList( + Name=NameMetadata("wikipedia_fr_football"), + Title=TitleMetadata("English Wikipedia"), + Creator=CreatorMetadata("English speaking Wikipedia contributors"), + Publisher=PublisherMetadata("Wikipedia user Foobar"), + Date=DateMetadata("2009-11-21"), + Description=DescriptionMetadata( + "All articles (without images) from the english Wikipedia" + ), + LongDescription=LongDescriptionMetadata( + "This ZIM file contains all articles (without images)" + " from the english Wikipedia by 2009-11-10. The topics are..." + ), + Language=LanguageMetadata("eng"), + License=LicenseMetadata("CC-BY"), + Tags=TagsMetadata(tags), + Flavour=FlavourMetadata("nopic"), + Source=SourceMetadata("https://en.wikipedia.org/"), + Scraper=ScraperMetadata("mwoffliner 1.2.3"), + Illustration_48x48_at_1=IllustrationMetadata( + "Illustration_48x48@1", png_data + ), + ), + [CustomTextMetadata("TestMetadata", "Test Metadata")], + fail_on_missing_prefix_in_extras=False, ) - class NotPrintable: + class NotPrintable(str): def __str__(self): raise ValueError("Not printable I said") creator._metadata.update( { - "Illustration_96x96@1": b"%PDF-1.5\n%\xe2\xe3\xcf\xd3", - "Chars": b"\xc5\xa1\xc9\x94\xc9\x9b", - "Chars-32": b"\xff\xfe\x00\x00a\x01\x00\x00T\x02\x00\x00[\x02\x00\x00", - "Video": b"\x00\x00\x00 ftypisom\x00\x00\x02\x00isomiso2avc1mp41\x00", - "Toupie": NotPrintable(), - } + "Illustration_96x96@1": Metadata( + "Illustration_96x96@1", b"%PDF-1.5\n%\xe2\xe3\xcf\xd3" + ), + "Chars": Metadata("Chars", b"\xc5\xa1\xc9\x94\xc9\x9b"), + "Chars-32": Metadata( + "Chars-32", b"\xff\xfe\x00\x00a\x01\x00\x00T\x02\x00\x00[\x02\x00\x00" + ), + "Video": Metadata( + "Video", b"\x00\x00\x00 ftypisom\x00\x00\x02\x00isomiso2avc1mp41\x00" + ), + "Toupie": CustomTextMetadata("Toupie", NotPrintable("value")), + } # intentionaly bad, to handle case where user does bad things ) + # intentionaly bad, to handle case where user does bad things + creator._metadata["Relation"] = None # pyright: ignore[reportArgumentType] + creator._metadata["BadRawValue"] = "Value" # pyright: ignore[reportArgumentType] + creator._log_metadata() # /!\ this must be alpha sorted mocked_logger.debug.assert_has_calls( [ + call("Metadata: BadRawValue is improper metadata type: str: Value"), call("Metadata: Chars = šɔɛ"), call( "Metadata: Chars-32 is a 16 bytes text/plain blob " @@ -624,10 +648,7 @@ def __str__(self): ), call("Metadata: Flavour = nopic"), call("Metadata: Illustration_48x48@1 is a 3274 bytes 48x48px PNG Image"), - call( - "Metadata: Illustration_96x96@1 is a 14 bytes " - "application/pdf blob not recognized as an Image" - ), + call("Metadata: Illustration_96x96@1 is a 14 bytes application/pdf blob"), call("Metadata: Language = eng"), call("Metadata: License = CC-BY"), call( @@ -637,21 +658,27 @@ def __str__(self): ), call("Metadata: Name = wikipedia_fr_football"), call("Metadata: Publisher = Wikipedia user Foobar"), - call("Metadata: Relation = None"), + call("Metadata: Relation is None"), call("Metadata: Scraper = mwoffliner 1.2.3"), call("Metadata: Source = https://en.wikipedia.org/"), call(f"Metadata: Tags = {tags}"), call("Metadata: TestMetadata = Test Metadata"), call("Metadata: Title = English Wikipedia"), - call("Metadata: Toupie is unexpected data type: NotPrintable"), + call( + "Metadata: Toupie is unexpected data type: " + "test_start_logs_metadata_log_contents..NotPrintable" + ), call("Metadata: Video is a 33 bytes video/mp4 blob"), ] ) -def test_relax_metadata(tmp_path): - Creator(tmp_path, "", disable_metadata_checks=True).config_dev_metadata( - Description="T" * 90 +def test_relax_metadata( + tmp_path, + ignore_metadata_conventions, # noqa: ARG001 +): + Creator(tmp_path, "", check_metadata_conventions=False).config_dev_metadata( + DescriptionMetadata("T" * 90) ).start() @@ -679,22 +706,30 @@ def test_config_metadata(tmp_path, png_image, tags): with open(png_image, "rb") as fh: png_data = fh.read() creator = Creator(fpath, "").config_metadata( - Name="wikipedia_fr_football", - Title="English Wikipedia", - Creator="English speaking Wikipedia contributors", - Publisher="Wikipedia user Foobar", - Date="2009-11-21", - Description="All articles (without images) from the english Wikipedia", - LongDescription="This ZIM file contains all articles (without images)" - " from the english Wikipedia by 2009-11-10. The topics are...", - Language="eng", - License="CC-BY", - Tags=tags, - Flavour="nopic", - Source="https://en.wikipedia.org/", - Scraper="mwoffliner 1.2.3", - Illustration_48x48_at_1=png_data, - TestMetadata="Test Metadata", + StandardMetadataList( + Name=NameMetadata("wikipedia_fr_football"), + Title=TitleMetadata("English Wikipedia"), + Creator=CreatorMetadata("English speaking Wikipedia contributors"), + Publisher=PublisherMetadata("Wikipedia user Foobar"), + Date=DateMetadata("2009-11-21"), + Description=DescriptionMetadata( + "All articles (without images) from the english Wikipedia" + ), + LongDescription=LongDescriptionMetadata( + "This ZIM file contains all articles (without images)" + " from the english Wikipedia by 2009-11-10. The topics are..." + ), + Language=LanguageMetadata("eng"), + License=LicenseMetadata("CC-BY"), + Tags=TagsMetadata(tags), + Flavour=FlavourMetadata("nopic"), + Source=SourceMetadata("https://en.wikipedia.org/"), + Scraper=ScraperMetadata("mwoffliner 1.2.3"), + Illustration_48x48_at_1=IllustrationMetadata( + "Illustration_48x48@1", png_data + ), + ), + [CustomTextMetadata("X-TestMetadata", "Test Metadata")], ) with creator: pass @@ -720,48 +755,46 @@ def test_config_metadata(tmp_path, png_image, tags): ) assert reader.get_text_metadata("Language") == "eng" assert reader.get_text_metadata("License") == "CC-BY" - assert ( - reader.get_text_metadata("Tags") - == "wikipedia;_category:wikipedia;_pictures:no;_videos:no;" - "_details:yes;_ftindex:yes" - ) + assert set(reader.get_text_metadata("Tags").split(";")) == set( + "wikipedia;_category:wikipedia;_pictures:no;_videos:no;" + "_details:yes;_ftindex:yes".split(";") + ) # order of tags is not guaranteed and does not matter + assert len(reader.get_text_metadata("Tags").split(";")) == 6 assert reader.get_text_metadata("Flavour") == "nopic" assert reader.get_text_metadata("Source") == "https://en.wikipedia.org/" assert reader.get_text_metadata("Scraper") == "mwoffliner 1.2.3" assert reader.get_metadata("Illustration_48x48@1") == png_data - assert reader.get_text_metadata("TestMetadata") == "Test Metadata" + assert reader.get_text_metadata("X-TestMetadata") == "Test Metadata" def test_config_metadata_control_characters(tmp_path): fpath = tmp_path / "test_config.zim" creator = Creator(fpath, "").config_dev_metadata( - Description="\t\n\r\n \tA description \awith \bcontrol characters\v", - LongDescription="A description \rwith \a\ncontrol characters\tsss\t\n\r\n \t", - Creator=" A creator ", - ) - assert creator._metadata["Description"] == "A description with control characters" - assert ( - creator._metadata["LongDescription"] - == "A description \rwith \ncontrol characters\tsss" + [ + DescriptionMetadata( + "\t\n\r\n \tA description \awith \bcontrol characters\v" + ), + LongDescriptionMetadata( + "A description \rwith \a\ncontrol characters\tsss\t\n\r\n \t" + ), + CreatorMetadata(" A creator "), + ] ) - assert creator._metadata["Creator"] == "A creator" with creator: creator.add_metadata( - "Description_1", - "\t\n\r\n \tA description \awith \bcontrol characters\v", - ) - creator.add_metadata( - "LongDescription_1", - "A description \rwith \a\ncontrol characters\tsss\t\n\r\n \t", - ) - creator.add_metadata( - "Creator_1", - " A creator ", + CustomTextMetadata( + "Description_1", + "\t\n\r\n \tA description \awith \bcontrol characters\v", + ) ) creator.add_metadata( - "Binary1", - bytes.fromhex("01FA"), + CustomTextMetadata( + "LongDescription_1", + "A description \rwith \a\ncontrol characters\tsss\t\n\r\n \t", + ) ) + creator.add_metadata(CustomTextMetadata("Creator_1", " A creator ")) + creator.add_metadata(CustomMetadata("Binary1", bytes.fromhex("01FA"))) pass assert fpath.exists() @@ -788,83 +821,94 @@ def test_config_metadata_control_characters(tmp_path): assert bytes.hex(reader.get_metadata("Binary1")) == "01fa" -@pytest.mark.parametrize( - "name,value,valid", - [ - ("Name", 4, False), - ("Title", 4, False), - ("Creator", 4, False), - ("Publisher", 4, False), - ("Description", 4, False), - ("LongDescription", 4, False), - ("License", 4, False), - ("Relation", 4, False), - ("Relation", 4, False), - ("Flavour", 4, False), - ("Source", 4, False), - ("Scraper", 4, False), - ("Title", "में" * 30, True), - ("Title", "X" * 31, False), - ("Date", 4, False), - ("Date", datetime.datetime.now(), True), # noqa: DTZ005 - ("Date", datetime.datetime(1969, 12, 31, 23, 59), True), # noqa: DTZ001 - ("Date", datetime.date(1969, 12, 31), True), - ("Date", datetime.date.today(), True), # noqa: DTZ011 - ("Date", "1969-12-31", True), - ("Date", "1969-13-31", False), - ("Date", "2023/02/29", False), - ("Date", "2023-55-99", False), - ("Language", "xxx", False), - ("Language", "rmr", False), - ("Language", "eng", True), - ("Language", "fra", True), - ("Language", "bam", True), - ("Language", "fr", False), - ("Language", "en", False), - ("Language", "fra,eng", True), - ("Language", "fra,eng,bam", True), - ("Language", "fra,en,bam", False), - ("Language", "eng,", False), - ("Language", "eng, fra", False), - ("Counter", "1", False), - ("Description", "में" * 80, True), - ("Description", "X" * 81, False), - ("LongDescription", "में" * 4000, True), - ("LongDescription", "X" * 4001, False), - ("Tags", 4, False), - ("Tags", ["wikipedia", 4, "football"], False), - ("Tags", ("wikipedia", "football"), True), - ("Tags", ["wikipedia", "football"], True), - ("Tags", "wikipedia;football", True), - # 1x1 PNG image - ( - "Illustration_48x48@1", - base64.b64decode( - "iVBORw0KGgoAAAANSUhEUgAAAAEAAAABCAIAAACQd1PeAAAAGXRFWHRTb2Z0d2FyZQBB" - "ZG9iZSBJbWFnZVJlYWR5ccllPAAAAA9JREFUeNpi+P//P0CAAQAF/gL+Lc6J7gAAAABJ" - "RU5ErkJggg==" - ), +class ExtraMetadataCase(NamedTuple): + extras: list[Metadata] + fail_on_missing_prefix: bool + id: str + + +def __get_extra_metadata_case_id(case: ExtraMetadataCase) -> str: + return case.id + + +@pytest.fixture( + params=[ + ExtraMetadataCase( + [CustomTextMetadata("X-TestMetadata", "Test Metadata")], + True, + id="good_prefix", + ), + ExtraMetadataCase( + [CustomTextMetadata("TestMetadata", "Test Metadata")], False, + id="bad_prefix", ), - ( - "Illustration_48x48@1", - DEFAULT_DEV_ZIM_METADATA["Illustration_48x48_at_1"], + ExtraMetadataCase( + [ + CustomTextMetadata("X-TestMetadata", "Test Metadata"), + CustomTextMetadata("X-TestMetadata2", "Test Metadata"), + ], True, + id="list_of_two_good_prefix", ), - ( - "Illustration_96x96@1", - DEFAULT_DEV_ZIM_METADATA["Illustration_48x48_at_1"], + ExtraMetadataCase( + [ + CustomTextMetadata("X-TestMetadata", "Test Metadata"), + CustomTextMetadata("TestMetadata2", "Test Metadata"), + ], False, + id="list_with_one_bad_prefix", ), - ] - + [(name, "", False) for name in MANDATORY_ZIM_METADATA_KEYS], + ], + ids=__get_extra_metadata_case_id, ) -def test_validate_metadata(tmp_path, name, value, valid): - if valid: - Creator(tmp_path / "_.zim", "").validate_metadata(name, value) - else: - with pytest.raises(ValueError): - Creator(tmp_path / "_.zim", "").validate_metadata(name, value) +def metadata_extras(request: pytest.FixtureRequest): + yield request.param + + +def test_metadata_extras(tmp_path, metadata_extras: ExtraMetadataCase): + Creator(tmp_path / "_.zim", "").config_metadata( + DEFAULT_DEV_ZIM_METADATA, + metadata_extras.extras, + fail_on_missing_prefix_in_extras=metadata_extras.fail_on_missing_prefix, + ) + + +def test_metadata_extras_dev(tmp_path, metadata_extras: ExtraMetadataCase): + Creator(tmp_path / "_.zim", "").config_dev_metadata(metadata_extras.extras) + + +def test_metadata_extras_missing_prefix(tmp_path): + with pytest.raises(ValueError, match="does not starts with X- as expected"): + Creator(tmp_path / "_.zim", "").config_metadata( + DEFAULT_DEV_ZIM_METADATA, + [CustomTextMetadata("TestMetadata", "Test Metadata")], + ) + + +@pytest.mark.parametrize( + "name,metadata,expected_value", + [ + pytest.param( + "X-Test", + CustomTextMetadata( + "X-Test", DEFAULT_DEV_ZIM_METADATA.Title.libzim_value.decode() + "Foo" + ), + DEFAULT_DEV_ZIM_METADATA.Title.libzim_value.decode() + "Foo", + id="simple_str", + ), + pytest.param("Tags", TagsMetadata(["tag1", "tag2"]), "tag1;tag2", id="tags"), + ], +) +def test_add_metadata( + tmp_path: pathlib.Path, name: str, metadata: Metadata, expected_value: str +): + fpath = tmp_path / "test_blank.zim" + with Creator(fpath, "").config_dev_metadata() as creator: + creator.add_metadata(metadata) + assert fpath.exists() + reader = Archive(fpath) + assert reader.get_text_metadata(name) == expected_value def test_config_indexing(tmp_path): From 968a62c6d97c623113ac406d511260e581525957 Mon Sep 17 00:00:00 2001 From: rgaudin Date: Tue, 10 Dec 2024 16:34:24 +0000 Subject: [PATCH 02/13] Added beartype to `zim` submodule --- pyproject.toml | 3 +++ src/zimscraperlib/zim/__init__.py | 3 +++ 2 files changed, 6 insertions(+) diff --git a/pyproject.toml b/pyproject.toml index 26ace799..a5776507 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -26,6 +26,7 @@ dependencies = [ "regex>=2020.7.14", "pymupdf>=1.24.0,<2.0", "CairoSVG>=2.2.0,<3.0", + "beartype==0.19.0", # youtube-dl should be updated as frequently as possible "yt-dlp" ] @@ -252,6 +253,8 @@ ban-relative-imports = "all" "tests/**/*" = ["PLR2004", "S101", "TID252"] # _libkiwix mimics libkiwix C++ code, names obey C++ conventions "src/zimscraperlib/zim/_libkiwix.py" = ["N802", "N803", "N806"] +# beartype must be first +"src/zimscraperlib/zim/__init__.py" = ["E402"] [tool.pytest.ini_options] minversion = "7.3" diff --git a/src/zimscraperlib/zim/__init__.py b/src/zimscraperlib/zim/__init__.py index df282f3a..6b0b1714 100644 --- a/src/zimscraperlib/zim/__init__.py +++ b/src/zimscraperlib/zim/__init__.py @@ -9,8 +9,11 @@ zim.items: item to add to creator zim.archive: read ZIM files, accessing or searching its content""" +from beartype.claw import beartype_this_package from libzim.writer import Blob # pyright: ignore +beartype_this_package() + from zimscraperlib.zim.archive import Archive from zimscraperlib.zim.creator import Creator from zimscraperlib.zim.filesystem import make_zim_file From f633e7a4cab6f8dc788b5caea9f12142b281ac01 Mon Sep 17 00:00:00 2001 From: rgaudin Date: Thu, 12 Dec 2024 09:56:08 +0000 Subject: [PATCH 03/13] Fixing CounterMap type --- src/zimscraperlib/zim/_libkiwix.py | 13 ++++++++----- src/zimscraperlib/zim/archive.py | 4 ++-- 2 files changed, 10 insertions(+), 7 deletions(-) diff --git a/src/zimscraperlib/zim/_libkiwix.py b/src/zimscraperlib/zim/_libkiwix.py index 02cae889..5e24e159 100644 --- a/src/zimscraperlib/zim/_libkiwix.py +++ b/src/zimscraperlib/zim/_libkiwix.py @@ -15,12 +15,15 @@ from __future__ import annotations import io -from collections import namedtuple +from typing import NamedTuple -MimetypeAndCounter = namedtuple("MimetypeAndCounter", ["mimetype", "value"]) -CounterMap = dict[ - type(MimetypeAndCounter.mimetype), type(MimetypeAndCounter.value) # pyright: ignore -] + +class MimetypeAndCounter(NamedTuple): + mimetype: str + value: int + + +type CounterMap = dict[str, int] def getline(src: io.StringIO, delim: str | None = None) -> tuple[bool, str]: diff --git a/src/zimscraperlib/zim/archive.py b/src/zimscraperlib/zim/archive.py index 19314687..891fb976 100644 --- a/src/zimscraperlib/zim/archive.py +++ b/src/zimscraperlib/zim/archive.py @@ -17,7 +17,7 @@ import libzim.search # Query, Searcher # pyright: ignore import libzim.suggestion # SuggestionSearcher # pyright: ignore -from zimscraperlib.zim._libkiwix import convertTags, parseMimetypeCounter +from zimscraperlib.zim._libkiwix import CounterMap, convertTags, parseMimetypeCounter class Archive(libzim.reader.Archive): @@ -101,7 +101,7 @@ def get_search_results_count(self, query: str) -> int: return search.getEstimatedMatches() @property - def counters(self) -> dict[str, int]: + def counters(self) -> CounterMap: try: return parseMimetypeCounter(self.get_text_metadata("Counter")) except RuntimeError: # pragma: no cover (no ZIM avail to test itl) From 1a5db4b83e007d32a68b7cb9325ef28d7381f527 Mon Sep 17 00:00:00 2001 From: rgaudin Date: Thu, 12 Dec 2024 09:56:52 +0000 Subject: [PATCH 04/13] Fixed call to make_zim_file in test (type) --- tests/zim/test_fs.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/zim/test_fs.py b/tests/zim/test_fs.py index 3eec120c..2ba5cfc1 100644 --- a/tests/zim/test_fs.py +++ b/tests/zim/test_fs.py @@ -142,7 +142,7 @@ def test_make_zim_file_no_file_on_error(tmp_path, png_image, build_data): illustration="{png_image.name}", title="Test ZIM", description="A test ZIM", - redirects_file="{build_data["redirects_file"]}") + redirects_file=pathlib.Path("{build_data["redirects_file"]}")) except Exception as exc: print(exc) finally: From a5d221297d0f80b48ab1800417e661ece58f7076 Mon Sep 17 00:00:00 2001 From: rgaudin Date: Thu, 12 Dec 2024 09:58:53 +0000 Subject: [PATCH 05/13] Introducing Callback via new typing module - Explicit callback definition - simplified delete_callback to be a dumb callback (not chaining) --- src/zimscraperlib/filesystem.py | 14 ++---------- src/zimscraperlib/typing.py | 26 ++++++++++++++++++++++ src/zimscraperlib/zim/creator.py | 34 +++++++++++++++++------------ tests/filesystem/test_filesystem.py | 20 +---------------- 4 files changed, 49 insertions(+), 45 deletions(-) create mode 100644 src/zimscraperlib/typing.py diff --git a/src/zimscraperlib/filesystem.py b/src/zimscraperlib/filesystem.py index 65e0a2c7..6ffbe6a1 100644 --- a/src/zimscraperlib/filesystem.py +++ b/src/zimscraperlib/filesystem.py @@ -9,8 +9,6 @@ import os import pathlib -from collections.abc import Callable -from typing import Any import magic @@ -44,15 +42,7 @@ def get_content_mimetype(content: bytes | str) -> str: return MIME_OVERRIDES.get(detected_mime, detected_mime) -def delete_callback( - fpath: str | pathlib.Path, - callback: Callable | None = None, - *callback_args: Any, -): - """helper deleting passed filepath, optionnaly calling an additional callback""" +def delete_callback(fpath: str | pathlib.Path): + """helper deleting passed filepath""" os.unlink(fpath) - - # call the callback if requested - if callback and callable(callback): - callback.__call__(*callback_args) diff --git a/src/zimscraperlib/typing.py b/src/zimscraperlib/typing.py new file mode 100644 index 00000000..237f1ae7 --- /dev/null +++ b/src/zimscraperlib/typing.py @@ -0,0 +1,26 @@ +from __future__ import annotations + +from collections.abc import Callable +from typing import Any, NamedTuple + + +class Callback(NamedTuple): + func: Callable + args: tuple[Any, ...] | None = None + kwargs: dict[str, Any] | None = None + + @property + def callable(self) -> bool: + return callable(self.func) + + def get_args(self) -> tuple[Any, ...]: + return self.args or () + + def get_kwargs(self) -> dict[str, Any]: + return self.kwargs or {} + + def call_with(self, *args, **kwargs): + self.func.__call__(*args, **kwargs) + + def call(self): + self.call_with(*self.get_args(), **self.get_kwargs()) diff --git a/src/zimscraperlib/zim/creator.py b/src/zimscraperlib/zim/creator.py index e0046c87..781259b3 100644 --- a/src/zimscraperlib/zim/creator.py +++ b/src/zimscraperlib/zim/creator.py @@ -40,6 +40,7 @@ ) from zimscraperlib.i18n import is_valid_iso_639_3 from zimscraperlib.types import get_mime_for_name +from zimscraperlib.typing import Callback from zimscraperlib.zim.indexing import IndexData from zimscraperlib.zim.items import StaticItem from zimscraperlib.zim.metadata import ( @@ -341,7 +342,7 @@ def add_item_for( should_compress: bool | None = None, delete_fpath: bool | None = False, duplicate_ok: bool | None = None, - callback: Callable | tuple[Callable, Any] | None = None, + callbacks: list[Callback] | Callback | None = None, index_data: IndexData | None = None, auto_index: bool = True, ): @@ -365,6 +366,11 @@ def add_item_for( if fpath is None and content is None: raise ValueError("One of fpath or content is required") + if isinstance(callbacks, Callback): + callbacks = [callbacks] + elif callbacks is None: + callbacks = [] + mimetype = mimetype_for( path=path, content=content, fpath=fpath, mimetype=mimetype ) @@ -377,12 +383,7 @@ def add_item_for( hints[libzim.writer.Hint.COMPRESS] = should_compress if delete_fpath and fpath: - cb = [delete_callback, fpath] - if callback and callable(callback): - cb.append(callback) - elif callback: - cb += list(callback) - callback = tuple(cb) + callbacks.append(Callback(func=delete_callback, args=(fpath,))) self.add_item( StaticItem( @@ -395,7 +396,7 @@ def add_item_for( index_data=index_data, auto_index=auto_index, ), - callback=callback, + callbacks=callbacks, duplicate_ok=duplicate_ok, ) return path @@ -404,18 +405,23 @@ def add_item( self, item: libzim.writer.Item, duplicate_ok: bool | None = None, - callback: Callable | tuple[Callable, Any] | None = None, + callbacks: list[Callback] | Callback | None = None, ): """Add a libzim.writer.Item callback: either a single callable or a tuple containing the callable as first element then the arguments to pass to the callable. Note: you must __not__ include the item itself in those arguments.""" - if callback: - if callable(callback): - weakref.finalize(item, callback) - else: - weakref.finalize(item, *callback) + if isinstance(callbacks, Callback): + callbacks = [callbacks] + elif callbacks is None: + callbacks = [] + + for callback in callbacks: + if callback.callable: + weakref.finalize( + item, callback.func, *callback.get_args(), **callback.get_kwargs() + ) duplicate_ok = duplicate_ok or self.ignore_duplicates try: diff --git a/tests/filesystem/test_filesystem.py b/tests/filesystem/test_filesystem.py index 08579c16..daf2bec8 100644 --- a/tests/filesystem/test_filesystem.py +++ b/tests/filesystem/test_filesystem.py @@ -43,25 +43,7 @@ def test_mime_overrides(svg_image): assert get_content_mimetype(fh.read(64)) == expected_mime -def test_delete_callback_with_cb(tmp_path): - class Store: - called = 0 - - def cb(*args): # noqa: ARG001 - Store.called += 1 - - fpath = tmp_path.joinpath("my-file") - with open(fpath, "w") as fh: - fh.write("content") - - delete_callback(fpath, cb, fpath.name) - - assert not fpath.exists() - assert Store.called - assert Store.called == 1 - - -def test_delete_callback_without_cb(tmp_path): +def test_delete_callback(tmp_path): fpath = tmp_path.joinpath("my-file") with open(fpath, "w") as fh: fh.write("content") From d44fa1e891bf71c85624e6905303ca62b925fc56 Mon Sep 17 00:00:00 2001 From: rgaudin Date: Thu, 12 Dec 2024 10:00:39 +0000 Subject: [PATCH 06/13] Revamped metadata module around single base Metadata --- src/zimscraperlib/zim/creator.py | 23 +- src/zimscraperlib/zim/filesystem.py | 8 +- src/zimscraperlib/zim/metadata.py | 719 +++++++++++++++------------- tests/zim/conftest.py | 4 +- tests/zim/test_metadata.py | 362 +++++++------- tests/zim/test_zim_creator.py | 106 ++-- 6 files changed, 647 insertions(+), 575 deletions(-) diff --git a/src/zimscraperlib/zim/creator.py b/src/zimscraperlib/zim/creator.py index 781259b3..318abeb9 100644 --- a/src/zimscraperlib/zim/creator.py +++ b/src/zimscraperlib/zim/creator.py @@ -24,13 +24,10 @@ import pathlib import re import weakref -from collections.abc import Callable -from typing import Any import libzim.writer # pyright: ignore import PIL.Image -import zimscraperlib.zim.metadata from zimscraperlib import logger from zimscraperlib.constants import FRONT_ARTICLE_MIMETYPES from zimscraperlib.filesystem import ( @@ -102,8 +99,8 @@ class Creator(libzim.writer.Creator): Use workaround_nocancel=False to disable the workaround. By default, all metadata are validated for compliance with openZIM specification and - conventions. Set check_metadata_conventions=True to disable this validation (you can - still do checks manually with the validation methods or your own logic). + conventions. Set metdata.APPLY_RECOMMENDATIONS to False to disable this validation + # (you canstill do checks manually with the validation methods or your own logic). """ def __init__( @@ -114,7 +111,6 @@ def __init__( *, workaround_nocancel: bool | None = True, ignore_duplicates: bool | None = False, - check_metadata_conventions: bool = True, ): super().__init__(filename=filename) self._metadata: dict[str, Metadata] = {} @@ -132,9 +128,6 @@ def __init__( self.workaround_nocancel = workaround_nocancel self.ignore_duplicates = ignore_duplicates - zimscraperlib.zim.metadata.check_metadata_conventions = ( - check_metadata_conventions - ) def config_indexing( self, indexing: bool, language: str | None = None # noqa: FBT001 @@ -211,7 +204,7 @@ def _get_first_language_metadata_value(self) -> str | None: """Private methods to get most popular lang code from Language metadata""" for metadata in self._metadata.values(): if isinstance(metadata, LanguageMetadata): - return metadata.libzim_value.decode().split(",")[0] + return metadata.value[0] return None def start(self): @@ -244,18 +237,14 @@ def start(self): for metadata in self._metadata.values(): if isinstance(metadata, IllustrationMetadata): - self.add_illustration(metadata.size, metadata.libzim_value) + self.add_illustration(metadata.illustration_size, metadata.libzim_value) else: self.add_metadata(metadata) self._metadata.clear() return self - def add_metadata( - self, - value: Metadata, - mimetype: str = "text/plain;charset=UTF-8", - ): + def add_metadata(self, value: Metadata): """Really add the metadata to the ZIM, after ZIM creation has started. You would probably prefer to use config_metadata methods to check metadata @@ -263,7 +252,7 @@ def add_metadata( duplicate metadata name. """ - super().add_metadata(value.name, value.libzim_value, mimetype) + super().add_metadata(value.name, value.libzim_value, value.mimetype) def config_metadata( self, diff --git a/src/zimscraperlib/zim/filesystem.py b/src/zimscraperlib/zim/filesystem.py index 6cbc73e3..baa39c90 100644 --- a/src/zimscraperlib/zim/filesystem.py +++ b/src/zimscraperlib/zim/filesystem.py @@ -166,11 +166,13 @@ def make_zim_file( with open(illustration_path, "rb") as fh: illustration_data = fh.read() + # disable recommendations if requested + metadata.APPLY_RECOMMENDATIONS = not disable_metadata_checks + zim_file = Creator( filename=fpath, main_path=main_page, ignore_duplicates=ignore_duplicates, - check_metadata_conventions=disable_metadata_checks, ).config_metadata( metadata.StandardMetadataList( # mandatory @@ -181,8 +183,8 @@ def make_zim_file( Language=metadata.LanguageMetadata(language), Creator=metadata.CreatorMetadata(creator), Publisher=metadata.PublisherMetadata(publisher), - Illustration_48x48_at_1=metadata.IllustrationMetadata( - "Illustration_48x48@1", illustration_data + Illustration_48x48_at_1=metadata.DefaultIllustrationMetadata( + illustration_data ), # optional Tags=metadata.TagsMetadata(list(tags)) if tags else None, diff --git a/src/zimscraperlib/zim/metadata.py b/src/zimscraperlib/zim/metadata.py index 0fe4f508..6fc5c439 100644 --- a/src/zimscraperlib/zim/metadata.py +++ b/src/zimscraperlib/zim/metadata.py @@ -3,8 +3,10 @@ import base64 import datetime import io -import re -from dataclasses import dataclass +from collections.abc import Iterable +from dataclasses import asdict, dataclass, fields +from itertools import filterfalse +from typing import Any, BinaryIO import regex @@ -21,9 +23,8 @@ # All control characters are disallowed in str metadata except \n, \r and \t UNWANTED_CONTROL_CHARACTERS_REGEX = regex.compile(r"(?![\n\t\r])\p{C}") -# flag to enable / disable check conventions (e.g. title shorter than 30 chars, -# description shorter than 80 chars, ...) -check_metadata_conventions: bool = True +# whether to apply openZIM recommendations (see https://wiki.openzim.org/wiki/Metadata) +APPLY_RECOMMENDATIONS: bool = True def clean_str(value: str) -> str: @@ -36,391 +37,419 @@ def nb_grapheme_for(value: str) -> int: return len(regex.findall(r"\X", value)) -class Metadata: - """A very basic metadata, with a name and a bytes or io.BytesIO value +def mandatory(cls): + """Marks a Metadata mandatory: must be set to please Creator and cannot be empty""" + cls.is_required = True + cls.empty_allowed = False + return cls - Compliance with ZIM specification is done at initialisation, value passed at - initialization is kept in memory in `value` attribute, target name is stored in - `name` value and conversion to libzim value is available with `libzim_value` - property. - """ - def __init__(self, name: str, value: bytes | io.BytesIO) -> None: - if not isinstance(value, bytes | io.BytesIO): - raise ValueError( - f"Unexpected type passed to {self.__class__.__qualname__}: " - f"{value.__class__.__qualname__}" - ) - if name == "Counter": - raise ValueError("Counter cannot be set. libzim sets it.") - self.name = name - self.value = value - libzim_value = self.libzim_value # to check for errors - if libzim_value is None or len(libzim_value) == 0: - raise ValueError("Cannot set empty metadata") +def expecting_text(cls): + """Expects a Text (str) input. Will be cleaned-up and UTF-8 encoded""" + cls.expecting = "text" + cls.require_text_cleanup = True + cls.require_utf8_encoding = True - @property - def libzim_value(self) -> bytes: - """The value to pass to the libzim for creating the metadata""" - if isinstance(self.value, io.BytesIO): - return self.value.getvalue() - return self.value + # native type is str + def get_cleaned_value(self, value: str) -> str: + if self.require_text_cleanup: + value = clean_str(value) + if not self.empty_allowed and not value.strip(): + raise ValueError("Missing value (empty not allowed)") -class _TextMetadata(Metadata): - """A basic metadata whose value is expected to be some text""" + # max-length is openZIM recommendation + if getattr(self, "oz_max_length", 0) and APPLY_RECOMMENDATIONS: + if nb_grapheme_for(value) > self.oz_max_length: + raise ValueError( + f"{self.name} value is too long ({self.oz_max_length})" + ) + return value - def __init__(self, name: str, value: bytes | io.BytesIO | str) -> None: - if not isinstance(value, bytes | io.BytesIO | str): - raise ValueError( - f"Unexpected type passed to {self.__class__.__qualname__}: " - f"{value.__class__.__qualname__}" - ) - super().__init__( - name=name, - value=( - value.encode() - if isinstance(value, str) - else value.getvalue() if isinstance(value, io.BytesIO) else value - ), - ) - self.value = value - _ = self.libzim_value # to check for errors + def get_libzim_value(self) -> bytes: + return self.get_encoded(self.value) - @property - def libzim_value(self) -> bytes: - """The value to pass to the libzim for creating the metadata""" - # decode and reencode byte types to validate it's proper UTF-8 text - if isinstance(self.value, io.BytesIO): - str_value = self.value.getvalue().decode() - elif isinstance(self.value, bytes): - str_value = self.value.decode() - else: - str_value = self.value - return clean_str(str_value).encode() - - -def _check_for_allowed_custom_metadata_name(name: str, class_name: str): - """Check that metadata name is not among the standard ones for which a type exist""" - if name in DEFAULT_DEV_ZIM_METADATA.__dict__.keys(): - # this list contains the 'bad' illustration keys, but we don't care, they should - # still not be used - raise ValueError( - f"It is not allowed to use {class_name} for standard {name} " - f"metadata. Please use {name}Metadata type for proper checks" - ) - if name.startswith("Illustration_"): - raise ValueError( - f"It is not allowed to use {class_name} for standard Illustration" - "metadata. Please use IllustrationMetadata type for proper checks" - ) - - -class CustomTextMetadata(_TextMetadata): - """A text metadata for which we do little checks""" - - def __init__(self, name: str, value: bytes | io.BytesIO | str) -> None: - _check_for_allowed_custom_metadata_name(name, self.__class__.__qualname__) - super().__init__(name=name, value=value) + cls.get_cleaned_value = get_cleaned_value + cls.get_libzim_value = get_libzim_value + return cls -class CustomMetadata(Metadata): - """A bytes metadata for which we do little checks""" +def expecting_textlist(cls): + """Expects a Text List (list[str]) input. Each item will be cleaned-up. - def __init__(self, name: str, value: bytes | io.BytesIO) -> None: - _check_for_allowed_custom_metadata_name(name, self.__class__.__qualname__) - super().__init__(name=name, value=value) + List will be joined (see `join_list_with`) and UTF-8 encoded""" + cls.expecting = "textlist" + cls.require_textlist_cleanup = True + cls.require_utf8_encoding = True + # native type is list[str] + def get_cleaned_value(self, value: Iterable[str] | str) -> list[str]: + if isinstance(value, str): + value = [value] + else: + value = list(value) + if self.require_textlist_cleanup: + value = [clean_str(item) for item in value] + if not self.empty_allowed and (not value or not all(value)): + raise ValueError("Missing value (empty not allowed)") + if not self.duplicates_allowed and len(set(value)) != len(value): + raise ValueError("Duplicate entries not allowed") + elif self.require_deduplication: + value = unique_values(value) + if self.oz_only_iso636_3_allowed and APPLY_RECOMMENDATIONS: + invalid_codes = list(filterfalse(is_valid_iso_639_3, value)) + if invalid_codes: + raise ValueError( + f"Following code(s) are not ISO-639-3: {','.join(invalid_codes)}" + ) + return value -class _MandatoryTextMetadata(_TextMetadata): - """A mandatory (value must be set) text metadata""" + def get_libzim_value(self) -> bytes: + return self.get_encoded(self.join_list_with.join(self.value)) - @property - def libzim_value(self) -> bytes: - """The value to pass to the libzim for creating the metadata""" - value = super().libzim_value - if len(value) == 0: - raise ValueError("Missing value for mandatory metadata") - return value + cls.get_cleaned_value = get_cleaned_value + cls.get_libzim_value = get_libzim_value + return cls -class _MandatoryMetadata(Metadata): - """A mandatory (value must be set) bytes metadata""" +def expecting_date(cls): + """Expects a Date (date | datetime) input. Will be UTF-8 encoded as YYYY-MM-DD""" + cls.expecting = "date" + cls.require_utf8_encoding = True - @property - def libzim_value(self) -> bytes: - """The value to pass to the libzim for creating the metadata""" - value = super().libzim_value - if len(value) == 0: - raise ValueError("Missing value for mandatory metadata") + # native type is date + def get_cleaned_value( + self, value: datetime.date | datetime.datetime # noqa: ARG001 + ) -> datetime.date: + if isinstance(value, datetime.datetime): + value = value.date() return value + def get_libzim_value(self) -> bytes: + return self.get_encoded(self.value.strftime("%Y-%m-%d")) -class TitleMetadata(_MandatoryTextMetadata): - """The Title metadata""" + cls.get_cleaned_value = get_cleaned_value + cls.get_libzim_value = get_libzim_value + return cls - def __init__(self, value: bytes | io.BytesIO | str) -> None: - super().__init__(name="Title", value=value) - @property - def libzim_value(self) -> bytes: - """The value to pass to the libzim for creating the metadata""" - value = super().libzim_value - if check_metadata_conventions: - if nb_grapheme_for(value.decode()) > RECOMMENDED_MAX_TITLE_LENGTH: - raise ValueError("Title is too long.") +def expecting_illustration(cls): + """Expects a Square PNG Illustration (bytes-like) input. + + PNG format and squareness will be checked""" + cls.expecting = "illustration" + cls.meta_mimetype = "image/png" + + # native type is PNG image buffer + def get_cleaned_value(self, value: bytes | BinaryIO | io.BytesIO) -> bytes: + if isinstance(value, BinaryIO): + payload = value.read() + value.seek(0) + value = payload + elif isinstance(value, io.BytesIO): + value = value.getvalue() + if not self.empty_allowed and not value: + raise ValueError("Missing value (empty not allowed)") + + if not is_valid_image( + image=value, + imformat="PNG", + size=(self.illustration_size, self.illustration_size), + ): + raise ValueError( + f"{self.name} is not a valid " + f"{self.illustration_size}x{self.illustration_size} PNG Image" + ) return value + def get_libzim_value(self) -> bytes: + return self.value -class DescriptionMetadata(_MandatoryTextMetadata): - """The Description metadata""" + cls.get_cleaned_value = get_cleaned_value + cls.get_libzim_value = get_libzim_value + return cls - def __init__(self, value: bytes | io.BytesIO | str) -> None: - super().__init__(name="Description", value=value) - @property - def libzim_value(self) -> bytes: - """The value to pass to the libzim for creating the metadata""" - value = super().libzim_value - if check_metadata_conventions: - if nb_grapheme_for(value.decode()) > MAXIMUM_DESCRIPTION_METADATA_LENGTH: - raise ValueError("Description is too long.") - return value +def allow_empty(cls): + """Whether input can be blank""" + cls.empty_allowed = True + return cls -class LongDescriptionMetadata(_MandatoryTextMetadata): - """The LongDescription metadata""" +def allow_duplicates(cls): + """Whether list input can accept duplicate values""" + cls.duplicates_allowed = True + return cls - def __init__(self, value: bytes | io.BytesIO | str) -> None: - super().__init__(name="LongDescription", value=value) - @property - def libzim_value(self) -> bytes: - """The value to pass to the libzim for creating the metadata""" - value = super().libzim_value - if check_metadata_conventions: - if ( - nb_grapheme_for(value.decode()) - > MAXIMUM_LONG_DESCRIPTION_METADATA_LENGTH - ): - raise ValueError("LongDescription is too long.") - return value +def deduplicate(cls): + """Whether duplicates in list inputs should be reduced""" + cls.duplicates_allowed = True + cls.require_deduplication = True + return cls -class DateMetadata(_TextMetadata): - """The Date metadata""" +def only_lang_codes(cls): + """Whether list input should be checked to only accept ISO-639-1 codes""" + cls.oz_only_iso636_3_allowed = True + return cls - def __init__(self, value: bytes | str | datetime.date | datetime.datetime) -> None: - if not isinstance(value, bytes | str | datetime.date | datetime.datetime): - raise ValueError( - f"Unexpected type passed to {self.__class__.__qualname__}: " - f"{value.__class__.__qualname__}" - ) - super().__init__( - name="Date", - value=( - value - if isinstance(value, bytes) - else ( - value.encode() - if isinstance(value, str) - else value.strftime("%Y-%m-%d").encode() - ) - ), - ) - self.value = value - _ = self.libzim_value # to check for errors - @property - def libzim_value(self) -> bytes: - """The value to pass to the libzim for creating the metadata""" - if isinstance(self.value, datetime.date | datetime.datetime): - return self.value.strftime("%Y-%m-%d").encode() - match = re.match( - r"(?P\d{4})-(?P\d{2})-(?P\d{2})", - self.value.decode() if isinstance(self.value, bytes) else self.value, - ) - if not match: - raise ValueError("Invalid date format, not matching regex yyyy-mm-dd") - try: - datetime.date(**{k: int(v) for k, v in match.groupdict().items()}) - except Exception as exc: - raise ValueError(f"Invalid date format: {exc}") from None - return self.value if isinstance(self.value, bytes) else self.value.encode() - - -class IllustrationMetadata(_MandatoryMetadata): - """Any Illustration_**x**@* metadata""" - - def __init__(self, name: str, value: bytes | io.BytesIO) -> None: - super().__init__(name=name, value=value) - _ = self.libzim_value # to check for errors +def x_protected(cls): + """Whether metadata name should be checked for collision with reserved names - @property - def libzim_value(self) -> bytes: - """The value to pass to the libzim for creating the metadata""" - match = ILLUSTRATIONS_METADATA_RE.match(self.name) - if not match: - raise ValueError("Illustration metadata has improper name") - self.size = int(match.groupdict()["height"]) - if int(match.groupdict()["width"]) != self.size: - raise ValueError("Illustration is not squared") - if not is_valid_image( - image=self.value, - imformat="PNG", - size=(self.size, self.size), - ): - raise ValueError( - f"{self.name} is not a valid {self.size}x{self.size} PNG Image" - ) - if isinstance(self.value, io.BytesIO): - return self.value.getvalue() - else: - return self.value + when applying recommendations""" + cls.oz_x_protected = True + return cls -class LanguageMetadata(_MandatoryTextMetadata): - """The Language metadata""" +def x_prefixed(cls): + """Whether metadata names should be automatically X-Prefixed""" + cls.oz_x_protected = False + cls.oz_x_prefixed = True + return cls - def __init__(self, value: bytes | io.BytesIO | str | list[str] | set[str]) -> None: - if not isinstance(value, bytes | io.BytesIO | str | list | set): - raise ValueError( - f"Unexpected type passed to {self.__class__.__qualname__}: " - f"{value.__class__.__qualname__}" - ) - if isinstance(value, list | set) and not all( - isinstance(item, str) for item in value - ): - bad_types = {item.__class__.__qualname__ for item in value} - {"str"} - raise ValueError( - f"Invalid type(s) found in Iterable: {",".join(bad_types)}" - ) - super().__init__( - name="Language", - value=",".join(value) if isinstance(value, list | set) else value, - ) - self.value = value - _ = self.libzim_value # to check for errors + +class Metadata: + # name of the metadata (not its value) + meta_name: str + value: Any + + # MIME type of the value + meta_mimetype: str + + # whether metadata is required or not + is_required: bool = False + + # what kind of input data is expected + expecting: str = "bytes" # text | textlist | date | illustration + + # str: text value must be cleaned-up + require_text_cleanup: bool = False + + # whether an empty value is allowed or not + empty_allowed: bool = False + + # str: whether text value must be under a certain length + oz_max_length: int + + # list[str]: text values in list must be cleaned-up + require_textlist_cleanup: bool = False + + # list[str]: whether duplicate values are allowed in a list of str + duplicates_allowed: bool = False + # list[str]: whether duplicate values are automatically removed + require_deduplication: bool = False + + # list[str]: whether values in list must all be ISO-636-3 codes + oz_only_iso636_3_allowed: bool = False + + join_list_with: str = " " # , | ; + + illustration_size: int + illustration_scale: int = 1 + + # str, list[str]: whether text value must be encoded to UTF-8 + require_utf8_encoding: bool = False + + # wether Name is automatically X-prefixed + oz_x_prefixed: bool = False + # wether Name is prevented to clash with reserved names + oz_x_protected: bool = False + + def __init__( + self, value: Any, name: str | None = None, mimetype: str | None = None + ) -> None: + if name: + self.meta_name = name + if not getattr(self, "meta_name", ""): + raise OSError("Metadata name missing") + if mimetype: + self.meta_mimetype = mimetype + self.value = self.get_cleaned_value(value) + self.validate() + + def get_mimetype(self) -> str: + # explicitly set first + return getattr(self, "meta_mimetype", "text/plain;charset=UTF-8") @property - def libzim_value(self) -> bytes: - """The value to pass to the libzim for creating the metadata""" - if isinstance(self.value, bytes | io.BytesIO | str): - codes = [ - clean_str(code) for code in super().libzim_value.decode().split(",") - ] - else: - codes = [clean_str(code) for code in self.value] - codes = [code for code in codes if code] # remove empty values - if len(codes) == 0: - raise ValueError("Missing value for mandatory Language metadata") - if len(set(codes)) != len(codes): - raise ValueError("Duplicate codes not allowed in Language metadata") - for code in codes: - if not is_valid_iso_639_3(code): - raise ValueError(f"{code} is not ISO-639-3.") - return ",".join(codes).encode() - - -class TagsMetadata(_TextMetadata): - """The Tags metadata""" - - def __init__(self, value: bytes | io.BytesIO | str | list[str] | set[str]) -> None: - if not isinstance(value, bytes | io.BytesIO | str | list | set): - raise ValueError( - f"Unexpected type passed to {self.__class__.__qualname__}: " - f"{value.__class__.__qualname__}" + def mimetype(self) -> str: + return self.get_mimetype() + + @staticmethod + def matches_reserved_name(name: str) -> bool: + if name in [field.name for field in fields(StandardMetadataList)]: + return True + # set by libzim + if name == "Counter": + return True + return bool(ILLUSTRATIONS_METADATA_RE.match(name)) + + def get_name(self) -> str: + if self.expecting == "illustration": + return ( + f"Illustration_{self.illustration_size}x{self.illustration_size}" + f"@{self.illustration_scale}" ) - if isinstance(value, list | set) and not all( - isinstance(item, str) for item in value + + # X- prefix is recommendation for custom + if ( + self.oz_x_protected + and APPLY_RECOMMENDATIONS + and not self.meta_name.startswith("X-") + and self.matches_reserved_name(self.meta_name) ): - bad_types = {item.__class__.__qualname__ for item in value} - {"str"} - raise ValueError( - f"Invalid type(s) found in Iterable: {",".join(bad_types)}" - ) - super().__init__( - name="Tags", - value=";".join(value) if isinstance(value, list | set) else value, - ) - self.value = value - _ = self.libzim_value # to check for errors + raise ValueError("Custom metdata name must be X- prefixed") + if ( + self.oz_x_prefixed + and APPLY_RECOMMENDATIONS + and not self.meta_name.startswith("X-") + ): + return f"X-{self.meta_name}" + return self.meta_name + + @property + def name(self) -> str: + return self.get_name() + + @staticmethod + def get_encoded(value: str) -> bytes: + return value.encode() @property def libzim_value(self) -> bytes: - """The value to pass to the libzim for creating the metadata""" - if isinstance(self.value, bytes | io.BytesIO | str): - tags = unique_values( - [clean_str(tag) for tag in super().libzim_value.decode().split(";")] - ) - else: - tags = unique_values([clean_str(tag) for tag in self.value]) - tags = [tag for tag in tags if tag] # remove empty values - return ";".join(tags).encode() + return self.get_libzim_value() + + # native type is bytes + def get_cleaned_value(self, value: bytes | BinaryIO | io.BytesIO) -> bytes: + if isinstance(value, BinaryIO): + payload = value.read() + value.seek(0) + value = payload + elif isinstance(value, io.BytesIO): + value = value.getvalue() + if not self.empty_allowed and not value: + raise ValueError("Missing value (empty not allowed)") + return value + + def get_libzim_value(self) -> bytes: + return self.value + + def validate(self) -> None: + _ = self.name + _ = self.libzim_value + + +@mandatory +@expecting_text +class NameMetadata(Metadata): + meta_name: str = "Name" + +@mandatory +@expecting_textlist +@only_lang_codes +class LanguageMetadata(Metadata): + meta_name: str = "Language" + join_list_with: str = "," -class NameMetadata(_MandatoryTextMetadata): - """The Name metadata""" - def __init__(self, value: bytes | io.BytesIO | str) -> None: - super().__init__("Name", value) +@mandatory +@expecting_text +class TitleMetadata(Metadata): + meta_name: str = "Title" + oz_max_length: int = RECOMMENDED_MAX_TITLE_LENGTH -class CreatorMetadata(_MandatoryTextMetadata): - """The Creator metadata""" +@mandatory +@expecting_text +class CreatorMetadata(Metadata): + meta_name: str = "Creator" - def __init__(self, value: bytes | io.BytesIO | str) -> None: - super().__init__("Creator", value) +@mandatory +@expecting_text +class PublisherMetadata(Metadata): + meta_name: str = "Publisher" -class PublisherMetadata(_MandatoryTextMetadata): - """The Publisher metadata""" - def __init__(self, value: bytes | io.BytesIO | str) -> None: - super().__init__("Publisher", value) +# TODO +@mandatory +@expecting_date +class DateMetadata(Metadata): + meta_name: str = "Date" -class ScraperMetadata(_TextMetadata): - """The Scraper metadata""" +@expecting_illustration +class IllustrationMetadata(Metadata): + meta_name = "Illustration_{size}x{size}@{scale}" + illustration_size: int + illustration_scale: int = 1 - def __init__(self, value: bytes | io.BytesIO | str) -> None: - super().__init__("Scraper", value) + def __init__( + self, value: bytes | BinaryIO | io.BytesIO, size: int, scale: int = 1 + ) -> None: + self.illustration_scale = scale + self.illustration_size = size + super().__init__(value=value) -class FlavourMetadata(_TextMetadata): - """The Flavour metadata""" +@mandatory +@expecting_illustration +class DefaultIllustrationMetadata(Metadata): + meta_name = "Illustration_48x48@1" + illustration_size: int = 48 + illustration_scale: int = 1 - def __init__(self, value: bytes | io.BytesIO | str) -> None: - super().__init__("Flavour", value) +@mandatory +@expecting_text +class DescriptionMetadata(Metadata): + meta_name: str = "Description" + oz_max_length: int = MAXIMUM_DESCRIPTION_METADATA_LENGTH -class SourceMetadata(_TextMetadata): - """The Source metadata""" - def __init__(self, value: bytes | io.BytesIO | str) -> None: - super().__init__("Source", value) +@expecting_text +class LongDescriptionMetadata(Metadata): + meta_name: str = "LongDescription" + oz_max_length: int = MAXIMUM_LONG_DESCRIPTION_METADATA_LENGTH -class LicenseMetadata(_TextMetadata): - """The License metadata""" +@deduplicate +@expecting_textlist +class TagsMetadata(Metadata): + meta_name: str = "Tags" + join_list_with: str = ";" - def __init__(self, value: bytes | io.BytesIO | str) -> None: - super().__init__("License", value) +@expecting_text +class ScraperMetadata(Metadata): + meta_name: str = "Scraper" -class RelationMetadata(_TextMetadata): - """The Relation metadata""" - def __init__(self, value: bytes | io.BytesIO | str) -> None: - super().__init__("Relation", value) +@expecting_text +class FlavourMetadata(Metadata): + meta_name: str = "Flavour" + + +@expecting_text +class SourceMetadata(Metadata): + meta_name: str = "Source" + + +@expecting_text +class LicenseMetadata(Metadata): + meta_name: str = "License" + + +@expecting_text +class RelationMetadata(Metadata): + meta_name: str = "Relation" @dataclass class StandardMetadataList: - """A class holding all openZIM standard metadata - - Useful to ensure that all mandatory metadata are set, no typo occurs in metadata - name and the specification is respected (forbidden duplicate values, ...). - """ Name: NameMetadata Language: LanguageMetadata @@ -428,7 +457,7 @@ class StandardMetadataList: Creator: CreatorMetadata Publisher: PublisherMetadata Date: DateMetadata - Illustration_48x48_at_1: IllustrationMetadata + Illustration_48x48_at_1: DefaultIllustrationMetadata Description: DescriptionMetadata LongDescription: LongDescriptionMetadata | None = None Tags: TagsMetadata | None = None @@ -439,22 +468,57 @@ class StandardMetadataList: Relation: RelationMetadata | None = None def values(self) -> list[Metadata]: - return [v for v in self.__dict__.values() if v] + return list(filter(bool, asdict(self).values())) + + @classmethod + def get_reserved_names(cls) -> list[str]: + """list of mandatory metadata as per the spec. + + computed from metadata using @mandatory decorator""" + names = [] + for field in fields(cls): + meta_type = globals().get(str(field.type)) + if getattr(meta_type, "is_required", False): + names.append(getattr(meta_type, "meta_name", "")) + return names + + +@x_protected +class CustomMetadata(Metadata): + def __init__(self, name: str, value: bytes | BinaryIO | io.BytesIO) -> None: + self.meta_name = name + super().__init__(value=value) + + +@x_protected +@expecting_text +class CustomTextMetadata(Metadata): + def __init__(self, name: str, value: str) -> None: + self.meta_name = name + super().__init__(name=name, value=value) + + +@x_prefixed +class XCustomMetadata(CustomMetadata): ... + + +@x_prefixed +class XCustomTextMetadata(CustomTextMetadata): ... + + +MANDATORY_ZIM_METADATA_KEYS = StandardMetadataList.get_reserved_names() -# A sample standard metadata list, to be used for dev purposes when one does not care -# at all about metadata values ; this ensures all metadata are compliant with the spec DEFAULT_DEV_ZIM_METADATA = StandardMetadataList( Name=NameMetadata("Test Name"), Title=TitleMetadata("Test Title"), Creator=CreatorMetadata("Test Creator"), Publisher=PublisherMetadata("Test Publisher"), - Date=DateMetadata("2023-01-01"), + Date=DateMetadata(datetime.date(2023, 1, 1)), Description=DescriptionMetadata("Test Description"), Language=LanguageMetadata("fra"), # blank 48x48 transparent PNG - Illustration_48x48_at_1=IllustrationMetadata( - "Illustration_48x48@1", + Illustration_48x48_at_1=DefaultIllustrationMetadata( base64.b64decode( "iVBORw0KGgoAAAANSUhEUgAAADAAAAAwAQMAAABtzGvEAAAAGXRFWHRTb2Z0d2FyZQBB" "ZG9iZSBJbWFnZVJlYWR5ccllPAAAAANQTFRFR3BMgvrS0gAAAAF0Uk5TAEDm2GYAAAAN" @@ -462,10 +526,3 @@ def values(self) -> list[Metadata]: ), ), ) - -# list of mandatory metadata of the zim file, automatically computed -MANDATORY_ZIM_METADATA_KEYS = [ - metadata.name - for metadata in DEFAULT_DEV_ZIM_METADATA.__dict__.values() - if isinstance(metadata, _MandatoryTextMetadata | _MandatoryMetadata) -] diff --git a/tests/zim/conftest.py b/tests/zim/conftest.py index eeaf4544..c2ff66ae 100644 --- a/tests/zim/conftest.py +++ b/tests/zim/conftest.py @@ -107,6 +107,6 @@ def counters(): @pytest.fixture def ignore_metadata_conventions(): - zimscraperlib.zim.metadata.check_metadata_conventions = False + zimscraperlib.zim.metadata.APPLY_RECOMMENDATIONS = False yield - zimscraperlib.zim.metadata.check_metadata_conventions = True + zimscraperlib.zim.metadata.APPLY_RECOMMENDATIONS = True diff --git a/tests/zim/test_metadata.py b/tests/zim/test_metadata.py index 24cdc8e3..d202e303 100644 --- a/tests/zim/test_metadata.py +++ b/tests/zim/test_metadata.py @@ -9,6 +9,7 @@ from typing import NamedTuple import pytest +from beartype.roar import BeartypeCallHintParamViolation as InvalidType from zimscraperlib.zim import metadata @@ -19,13 +20,13 @@ ("fra"), ("eng"), ("bam"), - ("fra,eng"), - ("eng, fra"), - ("fra,eng,bam"), (["fra", "eng"]), - ("fra, eng"), # codes are automatically cleaned-up - ("eng, \r"), # codes are automatically cleaned-up - (["fra", " "]), # codes are automatically cleaned-up + ("eng", "fra"), + ("fra", "eng", "bam"), + (["fra", "eng"]), + ("fra", "eng"), # codes are automatically cleaned-up + ("eng\r"), # codes are automatically cleaned-up + (["fra "]), # codes are automatically cleaned-up ], ) def test_validate_language_valid(value: list[str] | str): @@ -33,27 +34,28 @@ def test_validate_language_valid(value: list[str] | str): @pytest.mark.parametrize( - "value,error", + "value,exception,error", [ - ("", "Missing value for mandatory metadata"), - ("fr", "is not ISO-639-3"), - ("en", "is not ISO-639-3"), - ("xxx", "is not ISO-639-3"), - ("rmr", "is not ISO-639-3"), - ("fra,en,bam", "is not ISO-639-3"), - ("fra;eng", "is not ISO-639-3"), - ("fra,fra", "Duplicate codes not allowed in Language metadata"), - (["fra", "fra"], "Duplicate codes not allowed in Language metadata"), - ([], "Missing value for mandatory metadata"), - ([" ", "\t"], "Missing value for mandatory Language metadata"), - (" , ", "Missing value for mandatory Language metadata"), - (["fra", 1], "Invalid type(s) found in Iterable:"), - (["fra", b"1"], "Invalid type(s) found in Iterable:"), - (["fra", 1, b"1"], "Invalid type(s) found in Iterable:"), + ("", ValueError, "Missing value (empty not allowed)"), + ("fr", ValueError, "are not ISO-639-3"), + ("en", ValueError, "are not ISO-639-3"), + ("xxx", ValueError, "are not ISO-639-3"), + ("rmr", ValueError, "are not ISO-639-3"), + ("fra,en,bam", ValueError, "are not ISO-639-3"), + (("fra", "en", "bam"), ValueError, "are not ISO-639-3"), + ("fra;eng", ValueError, "are not ISO-639-3"), + (("fra", "fra"), ValueError, "Duplicate entries not allowed"), + (["fra", "fra"], ValueError, "Duplicate entries not allowed"), + ([], ValueError, "Missing value (empty not allowed)"), + ([" ", "\t"], ValueError, "Missing value (empty not allowed)"), + ((" ", " "), ValueError, "Missing value (empty not allowed)"), + (["fra", 1], InvalidType, " violates type"), + (["fra", b"1"], InvalidType, " violates type"), + (["fra", 1, b"1"], InvalidType, " violates type"), ], ) -def test_validate_language_invalid(value: list[str] | str, error: str): - with pytest.raises(ValueError, match=re.escape(error)): +def test_validate_language_invalid(value: list[str] | str, exception: type, error: str): + with pytest.raises(exception, match=re.escape(error)): assert metadata.LanguageMetadata(value) @@ -84,26 +86,31 @@ def test_validate_tags_valid(tags, is_valid): @pytest.mark.parametrize( - "value, error", + "value, exception, error", [ - pytest.param("", "Cannot set empty metadata", id="empty_string"), - pytest.param(1, "Unexpected type passed to TagsMetadata: int", id="one_int"), pytest.param( - None, "Unexpected type passed to TagsMetadata: NoneType", id="none_value" + "", ValueError, "Missing value (empty not allowed)", id="empty_string" ), + pytest.param(1, InvalidType, " violates type", id="one_int"), pytest.param( - ["wikipedia", 1], "Invalid type(s) found in Iterable: int", id="int_in_list" + None, + InvalidType, + " violates type", + id="none_value", ), + pytest.param(["wikipedia", 1], InvalidType, " violates type", id="int_in_list"), ], ) -def test_validate_tags_invalid(value: list[str] | str, error: str): - with pytest.raises(ValueError, match=re.escape(error)): +def test_validate_tags_invalid( + value: list[str] | str | int, exception: type, error: str +): + with pytest.raises(exception, match=re.escape(error)): metadata.TagsMetadata(value) def test_validate_dedup_tags(): assert ( - metadata.TagsMetadata(" wikipedia \t ; wikipedia").libzim_value + metadata.TagsMetadata([" wikipedia \t ", " wikipedia"]).libzim_value == b"wikipedia" ) @@ -117,7 +124,7 @@ def test_validate_short_grapheme_title_check_enabled(): def test_validate_too_long_title_check_enabled(): - with pytest.raises(ValueError, match="Title is too long"): + with pytest.raises(ValueError, match="Title value is too long"): assert metadata.TitleMetadata("T" * 31) @@ -136,7 +143,7 @@ def test_validate_short_grapheme_description_check_enabled(): def test_validate_too_long_description_check_enabled(): - with pytest.raises(ValueError, match="Description is too long"): + with pytest.raises(ValueError, match="Description value is too long"): assert metadata.DescriptionMetadata("T" * 81) @@ -155,7 +162,7 @@ def test_validate_short_grapheme_longdescription_check_enabled(): def test_validate_too_long_longdescription_check_enabled(): - with pytest.raises(ValueError, match="Description is too long"): + with pytest.raises(ValueError, match="LongDescription value is too long"): assert metadata.LongDescriptionMetadata("T" * 4001) @@ -175,20 +182,23 @@ def test_validate_date_datetime_datetime(): ) -@pytest.mark.parametrize("value", [("9999-99-99"), ("2023/02/29"), ("1969-13-31")]) -def test_validate_date_invalid_date(value): - with pytest.raises(ValueError, match="Invalid date format"): - metadata.DateMetadata(value) +def test_validate_date_invalid_datee(): + assert metadata.DateMetadata( + datetime.datetime(9, 12, 11, 0, 0, 0, tzinfo=datetime.UTC) + ) -def test_validate_illustration_invalid_name(): - with pytest.raises(ValueError, match="Illustration metadata has improper name"): - metadata.IllustrationMetadata("Illustration_48x48_at_1", b"") +@pytest.mark.parametrize("value", [("9999-99-99"), ("2023/02/29"), ("1969-13-31")]) +def test_validate_date_invalid_datetype(value): + with pytest.raises(InvalidType, match="violates type hint"): + metadata.DateMetadata(value) -def test_validate_illustration_not_squared(): - with pytest.raises(ValueError, match="Illustration is not squared"): - metadata.IllustrationMetadata("Illustration_48x96@1", b"") +def test_validate_illustration_invalid_image(): + with pytest.raises( + ValueError, match="Illustration_48x48@1 is not a valid 48x48 PNG Image" + ): + metadata.IllustrationMetadata(b"PN", size=48) def test_validate_illustration_wrong_sizes(png_image2): @@ -197,12 +207,12 @@ def test_validate_illustration_wrong_sizes(png_image2): with pytest.raises( ValueError, match="Illustration_48x48@1 is not a valid 48x48 PNG Image" ): - metadata.IllustrationMetadata("Illustration_48x48@1", png_data) + metadata.IllustrationMetadata(png_data, size=48) def test_blank_metadata(): - with pytest.raises(ValueError, match="Cannot set empty metadata"): - metadata.Metadata("Blank", b"") + with pytest.raises(ValueError, match=r"Missing value \(empty not allowed\)"): + metadata.Metadata(name="Blank", value=b"") class MetadataInitConfig(NamedTuple): @@ -213,15 +223,15 @@ class MetadataInitConfig(NamedTuple): @pytest.fixture( params=[ MetadataInitConfig(metadata.Metadata, 2), - MetadataInitConfig(metadata._TextMetadata, 2), MetadataInitConfig(metadata.CustomTextMetadata, 2), MetadataInitConfig(metadata.CustomMetadata, 2), - MetadataInitConfig(metadata._MandatoryTextMetadata, 2), - MetadataInitConfig(metadata._MandatoryMetadata, 2), + MetadataInitConfig(metadata.XCustomMetadata, 2), + MetadataInitConfig(metadata.XCustomTextMetadata, 2), MetadataInitConfig(metadata.TitleMetadata, 1), MetadataInitConfig(metadata.DescriptionMetadata, 1), MetadataInitConfig(metadata.LongDescriptionMetadata, 1), MetadataInitConfig(metadata.DateMetadata, 1), + MetadataInitConfig(metadata.DefaultIllustrationMetadata, 1), MetadataInitConfig(metadata.IllustrationMetadata, 2), MetadataInitConfig(metadata.LanguageMetadata, 1), MetadataInitConfig(metadata.TagsMetadata, 1), @@ -239,35 +249,6 @@ def metadata_init(request: pytest.FixtureRequest) -> MetadataInitConfig: return request.param -def test_none_metadata_two_args(metadata_init: MetadataInitConfig): - with pytest.raises( - ValueError, - match=f"Unexpected type passed to {metadata_init.a_type.__qualname__}: " - "NoneType", - ): - if metadata_init.nb_args == 2: - metadata_init.a_type( - "Foo", - None, # pyright: ignore[reportArgumentType] - ) - elif metadata_init.nb_args == 1: - metadata_init.a_type(None) - - -@pytest.mark.parametrize( - "a_type", - [ - (metadata.Metadata), - (metadata.CustomTextMetadata), - (metadata._TextMetadata), - (metadata._MandatoryMetadata), - ], -) -def test_reserved_counter_metadata(a_type: type): - with pytest.raises(ValueError, match="Counter cannot be set. libzim sets it."): - a_type("Counter", b"Foo") - - @pytest.mark.parametrize( "metadata_name", [ @@ -287,6 +268,7 @@ def test_reserved_counter_metadata(a_type: type): ("Source"), ("License"), ("Relation"), + ("Counter"), ], ) @pytest.mark.parametrize( @@ -297,119 +279,168 @@ def test_reserved_counter_metadata(a_type: type): ], ) def test_reserved_names(metadata_name: str, metadata_type: type): - with pytest.raises(ValueError, match="It is not allowed to use"): - metadata_type(metadata_name, b"a value") + with pytest.raises(ValueError, match="Custom metdata name must be X- prefixed"): + value = "x" if metadata_type == metadata.CustomTextMetadata else b"x" + metadata_type(name=metadata_name, value=value) -def test_mandatory_value(metadata_init: MetadataInitConfig): +@pytest.mark.parametrize( + "metadata_name", + [ + ("Nameo"), + ("Languageo"), + ("Titleo"), + ("Creatoro"), + ("Publishero"), + ("Dateo"), + ("Illustration_48x48@1o"), + ("Illustration_96x96@1o"), + ("Descriptiono"), + ("LongDescriptiono"), + ("Tagso"), + ("Scrapero"), + ("Flavouro"), + ("Sourceo"), + ("Licenseo"), + ("Relationo"), + ("Countero"), + ], +) +@pytest.mark.parametrize( + "metadata_type", + [ + (metadata.CustomMetadata), + (metadata.CustomTextMetadata), + ], +) +def test_nonreserved_custom(metadata_name: str, metadata_type: type): + value = "x" if metadata_type == metadata.CustomTextMetadata else b"x" + assert metadata_type(name=metadata_name, value=value) - with pytest.raises( - ValueError, - match=( - "is not a valid 48x48 PNG Image" - if metadata_init.a_type == metadata.IllustrationMetadata - else ( - "Invalid date format, not matching regex" - if metadata_init.a_type == metadata.DateMetadata - else "Cannot set empty metadata|Missing value for mandatory metadata" - ) - ), - ): - if metadata_init.nb_args == 2: - metadata_init.a_type( - ( - "Foo" - if metadata_init.a_type != metadata.IllustrationMetadata - else "Illustration_48x48@1" - ), - b"", - ) - elif metadata_init.nb_args == 1: - metadata_init.a_type(b"") + +def test_mandatory_value(metadata_init: MetadataInitConfig): + if not metadata_init.a_type.is_required: + pytest.skip("Only testing mandatory ones") + with pytest.raises(ValueError, match="Missing"): + if metadata_init.a_type.expecting == "textlist": + metadata_init.a_type([]) + metadata_init.a_type(["", " "]) + elif metadata_init.a_type.expecting == "text": + metadata_init.a_type("") + metadata_init.a_type(" ") + elif metadata_init.a_type.expecting == "date": + pytest.skip("Cannot set an empty Date") + elif metadata_init.a_type.expecting in ("illustration", "bytes"): + if metadata_init.nb_args == 1: + metadata_init.a_type(b"") + metadata_init.a_type(b" ") + else: + metadata_init.a_type(b"", name="Foo") + metadata_init.a_type(b" ", name="Foo") def test_clean_value(metadata_init: MetadataInitConfig): - raw_value = b"\t\n\r\n \tA val \t\awith \bcontrol chars\v\n" + raw_value = "\t\n\r\n \tA val \t\awith \bcontrol chars\v\n" clean_value = b"A val \twith control chars" - if metadata_init.a_type in ( - # some types do not support the raw value - metadata.IllustrationMetadata, - metadata.LanguageMetadata, - metadata.DateMetadata, - # and binary types are not cleaned-up - metadata.Metadata, - metadata.CustomMetadata, - metadata._MandatoryMetadata, - ): - pytest.skip("unsupported configuration") # ignore these test cases - if metadata_init.nb_args == 2: - assert metadata_init.a_type(("Foo"), raw_value).libzim_value == clean_value - elif metadata_init.nb_args == 1: + raw_lang_value = "\t\n\r\n \tfra \t\a\b\v\n" + clean_lang_value = b"fra" + if metadata_init.a_type == metadata.LanguageMetadata: + assert metadata_init.a_type([raw_lang_value]).libzim_value == clean_lang_value + elif metadata_init.a_type.expecting == "textlist": assert metadata_init.a_type(raw_value).libzim_value == clean_value + assert metadata_init.a_type( + value=[raw_value, raw_value] + ).libzim_value == metadata_init.a_type.join_list_with.join( + [clean_value.decode("UTF-8")] + ).encode( + "UTF-8" + ) + elif metadata_init.a_type.expecting == "text": + if metadata_init.nb_args == 1: + assert metadata_init.a_type(value=raw_value).libzim_value == clean_value + else: + assert ( + metadata_init.a_type(value=raw_value, name="Foo").libzim_value + == clean_value + ) -# @pytest.mark.parametrize( -# "metadata, expected_value", -# [ -# (metadata.Metadata("Foo", b"a value"), b"a value"), -# (metadata.Metadata("Foo", io.BytesIO(b"a value")), b"a value"), -# (metadata.DescriptionMetadata(io.BytesIO(b"a value")), b"a value"), -# (metadata.DescriptionMetadata(b"a value"), b"a value"), -# (metadata.DescriptionMetadata("a value"), b"a value"), -# (metadata.IllustrationMetadata(png), b"a value"), -# ], -# ) def test_libzim_bytes_value(metadata_init: MetadataInitConfig, png_image: pathlib.Path): if metadata_init.nb_args == 2: if metadata_init.a_type == metadata.IllustrationMetadata: with open(png_image, "rb") as fh: png_data = fh.read() + assert metadata_init.a_type(png_data, 48).libzim_value == png_data + elif metadata_init.a_type in ( + metadata.CustomTextMetadata, + metadata.XCustomTextMetadata, + ): assert ( - metadata.IllustrationMetadata( - "Illustration_48x48@1", png_data - ).libzim_value - == png_data + metadata_init.a_type(value="a value", name="Foo").libzim_value + == b"a value" + ) + elif metadata_init.a_type in ( + metadata.CustomMetadata, + metadata.XCustomMetadata, + ): + assert ( + metadata_init.a_type(value=b"a value", name="Foo").libzim_value + == b"a value" ) else: - assert metadata_init.a_type("Foo", b"a value").libzim_value == b"a value" + assert ( + metadata_init.a_type(value=b"a value", name="Foo").libzim_value + == b"a value" + ) elif metadata_init.nb_args == 1: - if metadata_init.a_type == metadata.DateMetadata: - assert metadata_init.a_type(b"2023-12-11").libzim_value == b"2023-12-11" + if metadata_init.a_type == metadata.DefaultIllustrationMetadata: + with open(png_image, "rb") as fh: + png_data = fh.read() + assert metadata_init.a_type(png_data).libzim_value == png_data + elif metadata_init.a_type == metadata.DateMetadata: + assert ( + metadata_init.a_type(datetime.date(2023, 12, 11)).libzim_value + == b"2023-12-11" + ) elif metadata_init.a_type == metadata.LanguageMetadata: - assert metadata_init.a_type(b"fra,eng").libzim_value == b"fra,eng" + assert metadata_init.a_type(["fra", "eng"]).libzim_value == b"fra,eng" else: - assert metadata_init.a_type(b"a value").libzim_value == b"a value" + assert metadata_init.a_type("a value").libzim_value == b"a value" def test_libzim_io_bytesio_value( metadata_init: MetadataInitConfig, png_image: pathlib.Path ): - if metadata_init.nb_args == 2: - if metadata_init.a_type == metadata.IllustrationMetadata: - with open(png_image, "rb") as fh: - png_data = fh.read() - assert ( - metadata.IllustrationMetadata( - "Illustration_48x48@1", io.BytesIO(png_data) - ).libzim_value - == png_data - ) - else: + if metadata_init.a_type.expecting == "illustration": + with open(png_image, "rb") as fh: + png_data = fh.read() + if metadata_init.nb_args == 1: + assert ( + metadata_init.a_type(value=io.BytesIO(png_data)).libzim_value + == png_data + ) + else: + assert ( + metadata_init.a_type( + value=io.BytesIO(png_data), size=48 + ).libzim_value + == png_data + ) + if metadata_init.a_type.expecting == "bytes": + if metadata_init.nb_args == 1: assert ( - metadata_init.a_type("Foo", io.BytesIO(b"a value")).libzim_value + metadata_init.a_type(value=io.BytesIO(b"a value")).libzim_value == b"a value" ) - elif metadata_init.nb_args == 1: - if metadata_init.a_type == metadata.DateMetadata: - pass # Not supported - elif metadata_init.a_type == metadata.LanguageMetadata: - assert ( - metadata_init.a_type(io.BytesIO(b"fra,eng")).libzim_value == b"fra,eng" - ) else: assert ( - metadata_init.a_type(io.BytesIO(b"a value")).libzim_value == b"a value" + metadata_init.a_type( + value=io.BytesIO(b"a value"), name="Foo" + ).libzim_value + == b"a value" ) + else: + pytest.skip("Type doesnt support bytes-like input") def test_std_metadata_values(): @@ -424,12 +455,11 @@ def test_std_metadata_values(): metadata.TitleMetadata("Test Title"), metadata.CreatorMetadata("Test Creator"), metadata.PublisherMetadata("Test Publisher"), - metadata.DateMetadata("2023-01-01"), + metadata.DateMetadata(datetime.date(2023, 1, 1)), metadata.DescriptionMetadata("Test Description"), metadata.LanguageMetadata("fra"), # blank 48x48 transparent PNG - metadata.IllustrationMetadata( - "Illustration_48x48@1", + metadata.DefaultIllustrationMetadata( base64.b64decode( "iVBORw0KGgoAAAANSUhEUgAAADAAAAAwAQMAAABtzGvEAAAAGXRFWHRTb2Z0d2FyZQBB" "ZG9iZSBJbWFnZVJlYWR5ccllPAAAAANQTFRFR3BMgvrS0gAAAAF0Uk5TAEDm2GYAAAAN" @@ -448,3 +478,7 @@ def test_std_metadata_values(): pytest.fail( f"Did not found matching expected values for {value.name} metadata" ) + + +def test_raw_metadata(): + assert metadata.Metadata(name="Name", value=b"hello") diff --git a/tests/zim/test_zim_creator.py b/tests/zim/test_zim_creator.py index 2fd7f3f2..d3a0a40c 100644 --- a/tests/zim/test_zim_creator.py +++ b/tests/zim/test_zim_creator.py @@ -3,6 +3,7 @@ from __future__ import annotations +import datetime import io import logging import pathlib @@ -21,6 +22,7 @@ from zimscraperlib.constants import UTF8 from zimscraperlib.download import save_large_file, stream_file from zimscraperlib.filesystem import delete_callback +from zimscraperlib.typing import Callback from zimscraperlib.zim import Archive, Creator, StaticItem, URLItem from zimscraperlib.zim.metadata import ( DEFAULT_DEV_ZIM_METADATA, @@ -28,9 +30,9 @@ CustomMetadata, CustomTextMetadata, DateMetadata, + DefaultIllustrationMetadata, DescriptionMetadata, FlavourMetadata, - IllustrationMetadata, LanguageMetadata, LicenseMetadata, LongDescriptionMetadata, @@ -76,7 +78,7 @@ def test_zim_creator(tmp_path, png_image, html_file, html_str: str, html_str_cn: with open(png_image, "rb") as fh: png_data = fh.read() with Creator(fpath, main_path).config_dev_metadata( - [TagsMetadata(tags), IllustrationMetadata("Illustration_48x48@1", png_data)] + [TagsMetadata(tags), DefaultIllustrationMetadata(png_data)] ) as creator: # verbatim HTML from string creator.add_item_for("welcome", "wel", content=html_str, is_front=True) @@ -108,10 +110,12 @@ def test_zim_creator(tmp_path, png_image, html_file, html_str: str, html_str_cn: assert fpath.exists() reader = Archive(fpath) - assert reader.get_text_metadata("Title") == DEFAULT_DEV_ZIM_METADATA.Title.value - assert ( - reader.get_text_metadata("Language") == DEFAULT_DEV_ZIM_METADATA.Language.value - ) + assert reader.get_text_metadata( + "Title" + ) == DEFAULT_DEV_ZIM_METADATA.Title.libzim_value.decode("UTF-8") + assert reader.get_text_metadata( + "Language" + ) == DEFAULT_DEV_ZIM_METADATA.Language.libzim_value.decode("UTF-8") assert reader.get_text_metadata("Tags") == tags assert reader.main_entry.get_item().path == f"{main_path}" # make sure we have our image @@ -143,11 +147,10 @@ def test_zim_creator(tmp_path, png_image, html_file, html_str: str, html_str_cn: def test_create_without_workaround(tmp_path): fpath = tmp_path / "test.zim" - with Creator( - fpath, "welcome", workaround_nocancel=False - ).config_dev_metadata() as creator: - with pytest.raises(RuntimeError, match="AttributeError"): - creator.add_item("hello") # pyright: ignore [reportArgumentType] + with Creator(fpath, "welcome", workaround_nocancel=False).config_dev_metadata(): + with pytest.raises(RuntimeError): + raise RuntimeError("erroring") + assert fpath.exists() def test_noindexlanguage(tmp_path): @@ -207,7 +210,7 @@ def test_add_item_for_delete_fail(tmp_path, png_image): filepath=local_path, path="index", ), - callback=(delete_callback, local_path), + callbacks=Callback(func=delete_callback, kwargs={"fpath": local_path}), ) assert not local_path.exists() @@ -226,21 +229,6 @@ def test_add_item_empty_content(tmp_path): ) -@pytest.mark.parametrize("auto_index", [False, True]) -def test_add_item_for_unsupported_content_type(auto_index, tmp_path): - fpath = tmp_path / "test.zim" - # test with incorrect content type - with Creator(fpath, "welcome").config_dev_metadata() as creator: - with pytest.raises(RuntimeError): - creator.add_item_for( - path="welcome", - title="hello", - mimetype="text/plain", - content=123, # pyright: ignore[reportArgumentType] - auto_index=auto_index, - ) - - def test_compression(tmp_path): fpath = tmp_path / "test.zim" with Creator( @@ -249,7 +237,7 @@ def test_compression(tmp_path): creator.add_item(StaticItem(path="welcome", content="hello")) with Creator( - fpath, "welcome", compression=Compression.zstd # pyright: ignore + fpath, "welcome", compression=Compression.zstd.name ).config_dev_metadata() as creator: creator.add_item(StaticItem(path="welcome", content="hello")) @@ -297,7 +285,7 @@ def test_sourcefile_removal_std(tmp_path, html_file): path=paths[-1].name, mimetype="text/html", ), - callback=(delete_callback, paths[-1]), + callbacks=Callback(func=delete_callback, kwargs={"fpath": paths[-1]}), ) for path in paths: assert not path.exists() @@ -485,7 +473,8 @@ def cb(): with Creator(fpath, "").config_dev_metadata() as creator: creator.add_item( - StaticItem(path=html_file.name, filepath=html_file), callback=cb + StaticItem(path=html_file.name, filepath=html_file), + callbacks=Callback(func=cb), ) assert Store.called is True @@ -514,13 +503,16 @@ def cb(*args): # noqa: ARG001 with Creator(tmp_path / "test.zim", "").config_dev_metadata() as creator: creator.add_item_for( - path=html_file.name, fpath=html_file, delete_fpath=True, callback=cb + path=html_file.name, + fpath=html_file, + delete_fpath=True, + callbacks=Callback(func=cb), ) creator.add_item_for( path=html_file2.name, fpath=html_file2, delete_fpath=True, - callback=(cb, html_file.name), + callbacks=Callback(func=cb, args=(html_file.name,)), ) assert not html_file.exists() @@ -557,10 +549,6 @@ def test_without_metadata(tmp_path): @pytest.mark.parametrize( "tags", [ - ( - "wikipedia;_category:wikipedia;_pictures:no;_videos:no;_details:yes;" - "_ftindex:yes" - ), ( [ "wikipedia", @@ -569,23 +557,31 @@ def test_without_metadata(tmp_path): "_videos:no", "_details:yes", "_ftindex:yes", + "wikipedia;_category:wikipedia;_pictures:no;_videos:no;_details:yes;" + "_ftindex:yes", ] ), ], ) @patch("zimscraperlib.zim.creator.logger", autospec=True) -def test_start_logs_metadata_log_contents(mocked_logger, png_image, tags, tmp_path): +def test_start_logs_metadata_log_contents( + mocked_logger, + png_image, + tags, + tmp_path, + ignore_metadata_conventions, # noqa: ARG001 +): mocked_logger.isEnabledFor.side_effect = lambda level: level == logging.DEBUG fpath = tmp_path / "test_config.zim" with open(png_image, "rb") as fh: png_data = fh.read() - creator = Creator(fpath, "", check_metadata_conventions=False).config_metadata( + creator = Creator(fpath, "").config_metadata( StandardMetadataList( Name=NameMetadata("wikipedia_fr_football"), Title=TitleMetadata("English Wikipedia"), Creator=CreatorMetadata("English speaking Wikipedia contributors"), Publisher=PublisherMetadata("Wikipedia user Foobar"), - Date=DateMetadata("2009-11-21"), + Date=DateMetadata(datetime.date(2009, 11, 21)), Description=DescriptionMetadata( "All articles (without images) from the english Wikipedia" ), @@ -599,9 +595,7 @@ def test_start_logs_metadata_log_contents(mocked_logger, png_image, tags, tmp_pa Flavour=FlavourMetadata("nopic"), Source=SourceMetadata("https://en.wikipedia.org/"), Scraper=ScraperMetadata("mwoffliner 1.2.3"), - Illustration_48x48_at_1=IllustrationMetadata( - "Illustration_48x48@1", png_data - ), + Illustration_48x48_at_1=DefaultIllustrationMetadata(png_data), ), [CustomTextMetadata("TestMetadata", "Test Metadata")], fail_on_missing_prefix_in_extras=False, @@ -614,14 +608,16 @@ def __str__(self): creator._metadata.update( { "Illustration_96x96@1": Metadata( - "Illustration_96x96@1", b"%PDF-1.5\n%\xe2\xe3\xcf\xd3" + value=b"%PDF-1.5\n%\xe2\xe3\xcf\xd3", name="Illustration_96x96@1" ), - "Chars": Metadata("Chars", b"\xc5\xa1\xc9\x94\xc9\x9b"), + "Chars": Metadata(name="Chars", value=b"\xc5\xa1\xc9\x94\xc9\x9b"), "Chars-32": Metadata( - "Chars-32", b"\xff\xfe\x00\x00a\x01\x00\x00T\x02\x00\x00[\x02\x00\x00" + name="Chars-32", + value=b"\xff\xfe\x00\x00a\x01\x00\x00T\x02\x00\x00[\x02\x00\x00", ), "Video": Metadata( - "Video", b"\x00\x00\x00 ftypisom\x00\x00\x02\x00isomiso2avc1mp41\x00" + name="Video", + value=b"\x00\x00\x00 ftypisom\x00\x00\x02\x00isomiso2avc1mp41\x00", ), "Toupie": CustomTextMetadata("Toupie", NotPrintable("value")), } # intentionaly bad, to handle case where user does bad things @@ -649,7 +645,7 @@ def __str__(self): call("Metadata: Flavour = nopic"), call("Metadata: Illustration_48x48@1 is a 3274 bytes 48x48px PNG Image"), call("Metadata: Illustration_96x96@1 is a 14 bytes application/pdf blob"), - call("Metadata: Language = eng"), + call("Metadata: Language = ['eng']"), call("Metadata: License = CC-BY"), call( "Metadata: LongDescription = This ZIM file contains all articles " @@ -664,10 +660,8 @@ def __str__(self): call(f"Metadata: Tags = {tags}"), call("Metadata: TestMetadata = Test Metadata"), call("Metadata: Title = English Wikipedia"), - call( - "Metadata: Toupie is unexpected data type: " - "test_start_logs_metadata_log_contents..NotPrintable" - ), + # cleaned-up anyway + call("Metadata: Toupie = value"), call("Metadata: Video is a 33 bytes video/mp4 blob"), ] ) @@ -677,9 +671,7 @@ def test_relax_metadata( tmp_path, ignore_metadata_conventions, # noqa: ARG001 ): - Creator(tmp_path, "", check_metadata_conventions=False).config_dev_metadata( - DescriptionMetadata("T" * 90) - ).start() + Creator(tmp_path, "").config_dev_metadata(DescriptionMetadata("T" * 90)).start() @pytest.mark.parametrize( @@ -711,7 +703,7 @@ def test_config_metadata(tmp_path, png_image, tags): Title=TitleMetadata("English Wikipedia"), Creator=CreatorMetadata("English speaking Wikipedia contributors"), Publisher=PublisherMetadata("Wikipedia user Foobar"), - Date=DateMetadata("2009-11-21"), + Date=DateMetadata(datetime.date(2009, 11, 21)), Description=DescriptionMetadata( "All articles (without images) from the english Wikipedia" ), @@ -725,9 +717,7 @@ def test_config_metadata(tmp_path, png_image, tags): Flavour=FlavourMetadata("nopic"), Source=SourceMetadata("https://en.wikipedia.org/"), Scraper=ScraperMetadata("mwoffliner 1.2.3"), - Illustration_48x48_at_1=IllustrationMetadata( - "Illustration_48x48@1", png_data - ), + Illustration_48x48_at_1=DefaultIllustrationMetadata(png_data), ), [CustomTextMetadata("X-TestMetadata", "Test Metadata")], ) From c95a58f7375095e3019435c08adcc8574fe379bf Mon Sep 17 00:00:00 2001 From: rgaudin Date: Thu, 12 Dec 2024 10:07:21 +0000 Subject: [PATCH 07/13] Upgrading dev dependencies Reasoning: coverage reported a lot of missing lines on zim/metadata.py with previous version Also includes auto linting where new ruff complained --- pyproject.toml | 12 ++++++------ src/zimscraperlib/image/conversion.py | 2 +- src/zimscraperlib/zim/__init__.py | 10 +++++----- 3 files changed, 12 insertions(+), 12 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index a5776507..13e05bca 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -53,19 +53,19 @@ scripts = [ ] lint = [ "black==24.10.0", - "ruff==0.7.0", + "ruff==0.8.2", ] check = [ - "pyright==1.1.385", - "pytest==8.3.3", + "pyright==1.1.390", + "pytest==8.3.4", ] test = [ - "pytest==8.3.3", + "pytest==8.3.4", "pytest-mock==3.14.0", - "coverage==7.5.3", + "coverage==7.6.9", ] dev = [ - "ipython==8.25.0", + "ipython==8.30.0", "pre-commit==4.0.1", "zimscraperlib[scripts]", "zimscraperlib[lint]", diff --git a/src/zimscraperlib/image/conversion.py b/src/zimscraperlib/image/conversion.py index ed4522d0..de53bea9 100644 --- a/src/zimscraperlib/image/conversion.py +++ b/src/zimscraperlib/image/conversion.py @@ -37,7 +37,7 @@ def convert_image( if not fmt: raise ValueError("Impossible to guess destination image format") with pilopen(src) as image: - if image.mode == "RGBA" and fmt in ALPHA_NOT_SUPPORTED or colorspace: + if (image.mode == "RGBA" and fmt in ALPHA_NOT_SUPPORTED) or colorspace: image = image.convert(colorspace or "RGB") # noqa: PLW2901 save_image(image, dst, fmt, **params) diff --git a/src/zimscraperlib/zim/__init__.py b/src/zimscraperlib/zim/__init__.py index 6b0b1714..da74ab69 100644 --- a/src/zimscraperlib/zim/__init__.py +++ b/src/zimscraperlib/zim/__init__.py @@ -27,14 +27,14 @@ __all__ = [ "Archive", + "Blob", "Creator", - "make_zim_file", + "FileLikeProvider", + "FileProvider", "Item", "StaticItem", - "URLItem", - "FileProvider", "StringProvider", - "FileLikeProvider", + "URLItem", "URLProvider", - "Blob", + "make_zim_file", ] From fca42e432ef84e1e0190b01ba4273b06a1465b38 Mon Sep 17 00:00:00 2001 From: rgaudin Date: Thu, 12 Dec 2024 15:37:39 +0000 Subject: [PATCH 08/13] Using per-type base classes for Metadata In order to properly expose input type in __init__ (for pyright and user assit), use one base class (subclassing Metadata) per input type. Cant get rid of the `Any` on `Metadata` init (otherwise would me re-implement the init everywhere). Used the opportunity to remove the `expecting` classvar and modified tests accordingly - Also fixed a minor issue in bytes reading by seeking back to previous position and not zero. - Also shared binary reading logic inside main base class (was already there) so it can be reused in illustration - Now explicitly says the type of stored data (can be different to inputs in somewhat flexible ones) --- src/zimscraperlib/zim/metadata.py | 317 ++++++++++++++---------------- tests/zim/test_metadata.py | 54 +++-- 2 files changed, 188 insertions(+), 183 deletions(-) diff --git a/src/zimscraperlib/zim/metadata.py b/src/zimscraperlib/zim/metadata.py index 6fc5c439..63f40bd6 100644 --- a/src/zimscraperlib/zim/metadata.py +++ b/src/zimscraperlib/zim/metadata.py @@ -44,132 +44,6 @@ def mandatory(cls): return cls -def expecting_text(cls): - """Expects a Text (str) input. Will be cleaned-up and UTF-8 encoded""" - cls.expecting = "text" - cls.require_text_cleanup = True - cls.require_utf8_encoding = True - - # native type is str - def get_cleaned_value(self, value: str) -> str: - if self.require_text_cleanup: - value = clean_str(value) - - if not self.empty_allowed and not value.strip(): - raise ValueError("Missing value (empty not allowed)") - - # max-length is openZIM recommendation - if getattr(self, "oz_max_length", 0) and APPLY_RECOMMENDATIONS: - if nb_grapheme_for(value) > self.oz_max_length: - raise ValueError( - f"{self.name} value is too long ({self.oz_max_length})" - ) - return value - - def get_libzim_value(self) -> bytes: - return self.get_encoded(self.value) - - cls.get_cleaned_value = get_cleaned_value - cls.get_libzim_value = get_libzim_value - return cls - - -def expecting_textlist(cls): - """Expects a Text List (list[str]) input. Each item will be cleaned-up. - - List will be joined (see `join_list_with`) and UTF-8 encoded""" - cls.expecting = "textlist" - cls.require_textlist_cleanup = True - cls.require_utf8_encoding = True - - # native type is list[str] - def get_cleaned_value(self, value: Iterable[str] | str) -> list[str]: - if isinstance(value, str): - value = [value] - else: - value = list(value) - if self.require_textlist_cleanup: - value = [clean_str(item) for item in value] - if not self.empty_allowed and (not value or not all(value)): - raise ValueError("Missing value (empty not allowed)") - if not self.duplicates_allowed and len(set(value)) != len(value): - raise ValueError("Duplicate entries not allowed") - elif self.require_deduplication: - value = unique_values(value) - if self.oz_only_iso636_3_allowed and APPLY_RECOMMENDATIONS: - invalid_codes = list(filterfalse(is_valid_iso_639_3, value)) - if invalid_codes: - raise ValueError( - f"Following code(s) are not ISO-639-3: {','.join(invalid_codes)}" - ) - return value - - def get_libzim_value(self) -> bytes: - return self.get_encoded(self.join_list_with.join(self.value)) - - cls.get_cleaned_value = get_cleaned_value - cls.get_libzim_value = get_libzim_value - return cls - - -def expecting_date(cls): - """Expects a Date (date | datetime) input. Will be UTF-8 encoded as YYYY-MM-DD""" - cls.expecting = "date" - cls.require_utf8_encoding = True - - # native type is date - def get_cleaned_value( - self, value: datetime.date | datetime.datetime # noqa: ARG001 - ) -> datetime.date: - if isinstance(value, datetime.datetime): - value = value.date() - return value - - def get_libzim_value(self) -> bytes: - return self.get_encoded(self.value.strftime("%Y-%m-%d")) - - cls.get_cleaned_value = get_cleaned_value - cls.get_libzim_value = get_libzim_value - return cls - - -def expecting_illustration(cls): - """Expects a Square PNG Illustration (bytes-like) input. - - PNG format and squareness will be checked""" - cls.expecting = "illustration" - cls.meta_mimetype = "image/png" - - # native type is PNG image buffer - def get_cleaned_value(self, value: bytes | BinaryIO | io.BytesIO) -> bytes: - if isinstance(value, BinaryIO): - payload = value.read() - value.seek(0) - value = payload - elif isinstance(value, io.BytesIO): - value = value.getvalue() - if not self.empty_allowed and not value: - raise ValueError("Missing value (empty not allowed)") - - if not is_valid_image( - image=value, - imformat="PNG", - size=(self.illustration_size, self.illustration_size), - ): - raise ValueError( - f"{self.name} is not a valid " - f"{self.illustration_size}x{self.illustration_size} PNG Image" - ) - return value - - def get_libzim_value(self) -> bytes: - return self.value - - cls.get_cleaned_value = get_cleaned_value - cls.get_libzim_value = get_libzim_value - return cls - - def allow_empty(cls): """Whether input can be blank""" cls.empty_allowed = True @@ -213,7 +87,7 @@ def x_prefixed(cls): class Metadata: # name of the metadata (not its value) meta_name: str - value: Any + value: bytes # MIME type of the value meta_mimetype: str @@ -221,9 +95,6 @@ class Metadata: # whether metadata is required or not is_required: bool = False - # what kind of input data is expected - expecting: str = "bytes" # text | textlist | date | illustration - # str: text value must be cleaned-up require_text_cleanup: bool = False @@ -287,7 +158,9 @@ def matches_reserved_name(name: str) -> bool: return bool(ILLUSTRATIONS_METADATA_RE.match(name)) def get_name(self) -> str: - if self.expecting == "illustration": + if getattr(self, "illustration_size", 0) and getattr( + self, "illustration_scale", 0 + ): return ( f"Illustration_{self.illustration_size}x{self.illustration_size}" f"@{self.illustration_scale}" @@ -321,11 +194,11 @@ def get_encoded(value: str) -> bytes: def libzim_value(self) -> bytes: return self.get_libzim_value() - # native type is bytes - def get_cleaned_value(self, value: bytes | BinaryIO | io.BytesIO) -> bytes: + def get_binary_from(self, value: bytes | BinaryIO | io.BytesIO) -> bytes: if isinstance(value, BinaryIO): + last_pos = value.tell() payload = value.read() - value.seek(0) + value.seek(last_pos) value = payload elif isinstance(value, io.BytesIO): value = value.getvalue() @@ -333,6 +206,10 @@ def get_cleaned_value(self, value: bytes | BinaryIO | io.BytesIO) -> bytes: raise ValueError("Missing value (empty not allowed)") return value + # native type is bytes + def get_cleaned_value(self, value: bytes | BinaryIO | io.BytesIO) -> bytes: + return self.get_binary_from(value) + def get_libzim_value(self) -> bytes: return self.value @@ -341,48 +218,160 @@ def validate(self) -> None: _ = self.libzim_value +class TextBasedMetadata(Metadata): + """Expects a Text (str) input. Will be cleaned-up and UTF-8 encoded""" + + value: str + require_text_cleanup = True + require_utf8_encoding = True + + def __init__(self, value: str, name: str | None = None) -> None: + super().__init__(value=value, name=name) + + # native type is str + def get_cleaned_value(self, value: str) -> str: + if self.require_text_cleanup: + value = clean_str(value) + + if not self.empty_allowed and not value.strip(): + raise ValueError("Missing value (empty not allowed)") + + # max-length is openZIM recommendation + if getattr(self, "oz_max_length", 0) and APPLY_RECOMMENDATIONS: + if nb_grapheme_for(value) > self.oz_max_length: + raise ValueError( + f"{self.name} value is too long ({self.oz_max_length})" + ) + return value + + def get_libzim_value(self) -> bytes: + return self.get_encoded(self.value) + + +class TextListBasedMetadata(Metadata): + """Expects a Text List (list[str]) input. Each item will be cleaned-up. + + List will be joined (see `join_list_with`) and UTF-8 encoded""" + + value: list[str] # we accept single str input but store as list[str] + require_textlist_cleanup = True + require_utf8_encoding = True + + def __init__(self, value: Iterable[str] | str, name: str | None = None) -> None: + super().__init__(value=value, name=name) + + # native type is list[str] + def get_cleaned_value(self, value: Iterable[str] | str) -> list[str]: + if isinstance(value, str): + value = [value] + else: + value = list(value) + if self.require_textlist_cleanup: + value = [clean_str(item) for item in value] + if not self.empty_allowed and (not value or not all(value)): + raise ValueError("Missing value (empty not allowed)") + if not self.duplicates_allowed and len(set(value)) != len(value): + raise ValueError("Duplicate entries not allowed") + elif self.require_deduplication: + value = unique_values(value) + if self.oz_only_iso636_3_allowed and APPLY_RECOMMENDATIONS: + invalid_codes = list(filterfalse(is_valid_iso_639_3, value)) + if invalid_codes: + raise ValueError( + f"Following code(s) are not ISO-639-3: {','.join(invalid_codes)}" + ) + return value + + def get_libzim_value(self) -> bytes: + return self.get_encoded(self.join_list_with.join(self.value)) + + +class DateBasedMetadata(Metadata): + """Expects a Date (date | datetime) input. Will be UTF-8 encoded as YYYY-MM-DD""" + + value: datetime.date + require_utf8_encoding = True + + def __init__( + self, value: datetime.date | datetime.datetime, name: str | None = None + ) -> None: + super().__init__(value=value, name=name) + + # native type is date + def get_cleaned_value(self, value: datetime.date) -> datetime.date: + if isinstance(value, datetime.datetime): + value = value.date() + return value + + def get_libzim_value(self) -> bytes: + return self.get_encoded(self.value.strftime("%Y-%m-%d")) + + +class IllustrationBasedMetadata(Metadata): + """Expects a Square PNG Illustration (bytes-like) input. + + PNG format and squareness will be checked""" + + value: bytes + meta_mimetype = "image/png" + + def __init__( + self, value: bytes | BinaryIO | io.BytesIO, name: str | None = None + ) -> None: + super().__init__(value=value, name=name) + + # native type is PNG image buffer + def get_cleaned_value(self, value: bytes | BinaryIO | io.BytesIO) -> bytes: + value = self.get_binary_from(value) + if not is_valid_image( + image=value, + imformat="PNG", + size=(self.illustration_size, self.illustration_size), + ): + raise ValueError( + f"{self.name} is not a valid " + f"{self.illustration_size}x{self.illustration_size} PNG Image" + ) + return value + + def get_libzim_value(self) -> bytes: + return self.value + + @mandatory -@expecting_text -class NameMetadata(Metadata): +class NameMetadata(TextBasedMetadata): meta_name: str = "Name" @mandatory -@expecting_textlist @only_lang_codes -class LanguageMetadata(Metadata): +class LanguageMetadata(TextListBasedMetadata): meta_name: str = "Language" join_list_with: str = "," @mandatory -@expecting_text -class TitleMetadata(Metadata): +class TitleMetadata(TextBasedMetadata): meta_name: str = "Title" oz_max_length: int = RECOMMENDED_MAX_TITLE_LENGTH @mandatory -@expecting_text -class CreatorMetadata(Metadata): +class CreatorMetadata(TextBasedMetadata): meta_name: str = "Creator" @mandatory -@expecting_text -class PublisherMetadata(Metadata): +class PublisherMetadata(TextBasedMetadata): meta_name: str = "Publisher" -# TODO @mandatory -@expecting_date -class DateMetadata(Metadata): +class DateMetadata(DateBasedMetadata): meta_name: str = "Date" -@expecting_illustration -class IllustrationMetadata(Metadata): +class IllustrationMetadata(IllustrationBasedMetadata): meta_name = "Illustration_{size}x{size}@{scale}" illustration_size: int illustration_scale: int = 1 @@ -396,55 +385,46 @@ def __init__( @mandatory -@expecting_illustration -class DefaultIllustrationMetadata(Metadata): +class DefaultIllustrationMetadata(IllustrationBasedMetadata): meta_name = "Illustration_48x48@1" illustration_size: int = 48 illustration_scale: int = 1 @mandatory -@expecting_text -class DescriptionMetadata(Metadata): +class DescriptionMetadata(TextBasedMetadata): meta_name: str = "Description" oz_max_length: int = MAXIMUM_DESCRIPTION_METADATA_LENGTH -@expecting_text -class LongDescriptionMetadata(Metadata): +class LongDescriptionMetadata(TextBasedMetadata): meta_name: str = "LongDescription" oz_max_length: int = MAXIMUM_LONG_DESCRIPTION_METADATA_LENGTH @deduplicate -@expecting_textlist -class TagsMetadata(Metadata): +class TagsMetadata(TextListBasedMetadata): meta_name: str = "Tags" join_list_with: str = ";" -@expecting_text -class ScraperMetadata(Metadata): +class ScraperMetadata(TextBasedMetadata): meta_name: str = "Scraper" -@expecting_text -class FlavourMetadata(Metadata): +class FlavourMetadata(TextBasedMetadata): meta_name: str = "Flavour" -@expecting_text -class SourceMetadata(Metadata): +class SourceMetadata(TextBasedMetadata): meta_name: str = "Source" -@expecting_text -class LicenseMetadata(Metadata): +class LicenseMetadata(TextBasedMetadata): meta_name: str = "License" -@expecting_text -class RelationMetadata(Metadata): +class RelationMetadata(TextBasedMetadata): meta_name: str = "Relation" @@ -491,8 +471,7 @@ def __init__(self, name: str, value: bytes | BinaryIO | io.BytesIO) -> None: @x_protected -@expecting_text -class CustomTextMetadata(Metadata): +class CustomTextMetadata(TextBasedMetadata): def __init__(self, name: str, value: str) -> None: self.meta_name = name super().__init__(name=name, value=value) diff --git a/tests/zim/test_metadata.py b/tests/zim/test_metadata.py index d202e303..8bd3560f 100644 --- a/tests/zim/test_metadata.py +++ b/tests/zim/test_metadata.py @@ -6,7 +6,8 @@ import io import pathlib import re -from typing import NamedTuple +from types import NoneType +from typing import BinaryIO, NamedTuple import pytest from beartype.roar import BeartypeCallHintParamViolation as InvalidType @@ -14,6 +15,18 @@ from zimscraperlib.zim import metadata +def get_classvar_value_type(cls: type) -> type: + """expected type of value classvar for a Metadata type + + Found in class annotations of type or any of its parent class""" + target = cls + while target: + if value_type := getattr(target, "__annotations__", {}).get("value", False): + return value_type + target = getattr(target, "__base__", None) + return NoneType + + @pytest.mark.parametrize( "value", [ @@ -105,7 +118,7 @@ def test_validate_tags_invalid( value: list[str] | str | int, exception: type, error: str ): with pytest.raises(exception, match=re.escape(error)): - metadata.TagsMetadata(value) + metadata.TagsMetadata(value) # pyright: ignore [reportArgumentType] def test_validate_dedup_tags(): @@ -322,21 +335,30 @@ def test_mandatory_value(metadata_init: MetadataInitConfig): if not metadata_init.a_type.is_required: pytest.skip("Only testing mandatory ones") with pytest.raises(ValueError, match="Missing"): - if metadata_init.a_type.expecting == "textlist": + if issubclass(metadata_init.a_type, metadata.TextListBasedMetadata): metadata_init.a_type([]) metadata_init.a_type(["", " "]) - elif metadata_init.a_type.expecting == "text": + elif issubclass(metadata_init.a_type, metadata.TextBasedMetadata): metadata_init.a_type("") metadata_init.a_type(" ") - elif metadata_init.a_type.expecting == "date": + elif issubclass(metadata_init.a_type, metadata.DateBasedMetadata): pytest.skip("Cannot set an empty Date") - elif metadata_init.a_type.expecting in ("illustration", "bytes"): + elif issubclass(metadata_init.a_type, metadata.DefaultIllustrationMetadata): + metadata_init.a_type(b"") + metadata_init.a_type(b" ") + elif get_classvar_value_type(metadata_init.a_type) is bytes: if metadata_init.nb_args == 1: - metadata_init.a_type(b"") - metadata_init.a_type(b" ") + metadata_init.a_type(b"") # pyright: ignore[reportCallIssue] + metadata_init.a_type(b" ") # pyright: ignore[reportCallIssue] else: - metadata_init.a_type(b"", name="Foo") - metadata_init.a_type(b" ", name="Foo") + metadata_init.a_type( + b"", name="Foo" # pyright: ignore[reportCallIssue] + ) + metadata_init.a_type( + b" ", name="Foo" # pyright: ignore[reportCallIssue] + ) + else: + raise OSError("WTF") def test_clean_value(metadata_init: MetadataInitConfig): @@ -346,7 +368,7 @@ def test_clean_value(metadata_init: MetadataInitConfig): clean_lang_value = b"fra" if metadata_init.a_type == metadata.LanguageMetadata: assert metadata_init.a_type([raw_lang_value]).libzim_value == clean_lang_value - elif metadata_init.a_type.expecting == "textlist": + elif isinstance(metadata_init.a_type, metadata.TextListBasedMetadata): assert metadata_init.a_type(raw_value).libzim_value == clean_value assert metadata_init.a_type( value=[raw_value, raw_value] @@ -355,7 +377,7 @@ def test_clean_value(metadata_init: MetadataInitConfig): ).encode( "UTF-8" ) - elif metadata_init.a_type.expecting == "text": + elif isinstance(metadata_init.a_type, metadata.TextBasedMetadata): if metadata_init.nb_args == 1: assert metadata_init.a_type(value=raw_value).libzim_value == clean_value else: @@ -411,7 +433,7 @@ def test_libzim_bytes_value(metadata_init: MetadataInitConfig, png_image: pathli def test_libzim_io_bytesio_value( metadata_init: MetadataInitConfig, png_image: pathlib.Path ): - if metadata_init.a_type.expecting == "illustration": + if isinstance(metadata_init.a_type, metadata.IllustrationBasedMetadata): with open(png_image, "rb") as fh: png_data = fh.read() if metadata_init.nb_args == 1: @@ -426,7 +448,11 @@ def test_libzim_io_bytesio_value( ).libzim_value == png_data ) - if metadata_init.a_type.expecting == "bytes": + if get_classvar_value_type(metadata_init.a_type) in ( + bytes, + BinaryIO, + io.BytesIO, + ): if metadata_init.nb_args == 1: assert ( metadata_init.a_type(value=io.BytesIO(b"a value")).libzim_value From 68844ab3467dfeb5dc3ff7a1087842bc3e44c343 Mon Sep 17 00:00:00 2001 From: rgaudin Date: Fri, 13 Dec 2024 12:09:07 +0000 Subject: [PATCH 09/13] Removed unreachable code in metadata log --- src/zimscraperlib/zim/creator.py | 9 +-------- 1 file changed, 1 insertion(+), 8 deletions(-) diff --git a/src/zimscraperlib/zim/creator.py b/src/zimscraperlib/zim/creator.py index 318abeb9..af81f298 100644 --- a/src/zimscraperlib/zim/creator.py +++ b/src/zimscraperlib/zim/creator.py @@ -191,14 +191,7 @@ def _log_metadata(self): ) continue - # rest is either printable or unexpected - try: - logger.debug(f"Metadata: {name} = {metadata.value!s}") - except Exception: - logger.debug( - f"Metadata: {name} is unexpected data type: " - f"{type(metadata.value).__qualname__}" - ) + logger.debug(f"Metadata: {name} = {metadata.value!s}") def _get_first_language_metadata_value(self) -> str | None: """Private methods to get most popular lang code from Language metadata""" From 6a2ea9ba655588a57122b5712e3f1236a77a1c37 Mon Sep 17 00:00:00 2001 From: rgaudin Date: Fri, 13 Dec 2024 12:19:17 +0000 Subject: [PATCH 10/13] Use illustration API for all illustration based metadata --- src/zimscraperlib/zim/creator.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/zimscraperlib/zim/creator.py b/src/zimscraperlib/zim/creator.py index af81f298..e18e9e85 100644 --- a/src/zimscraperlib/zim/creator.py +++ b/src/zimscraperlib/zim/creator.py @@ -43,7 +43,7 @@ from zimscraperlib.zim.metadata import ( DEFAULT_DEV_ZIM_METADATA, MANDATORY_ZIM_METADATA_KEYS, - IllustrationMetadata, + IllustrationBasedMetadata, LanguageMetadata, Metadata, StandardMetadataList, @@ -229,7 +229,7 @@ def start(self): super().__enter__() for metadata in self._metadata.values(): - if isinstance(metadata, IllustrationMetadata): + if isinstance(metadata, IllustrationBasedMetadata): self.add_illustration(metadata.illustration_size, metadata.libzim_value) else: self.add_metadata(metadata) From 48ba93b4a1da52feb25b30ea1b8a62ea1fb98818 Mon Sep 17 00:00:00 2001 From: rgaudin Date: Fri, 13 Dec 2024 12:20:11 +0000 Subject: [PATCH 11/13] Fixed IO/bytes based input --- src/zimscraperlib/zim/metadata.py | 35 ++++++++++++++++++------------- 1 file changed, 20 insertions(+), 15 deletions(-) diff --git a/src/zimscraperlib/zim/metadata.py b/src/zimscraperlib/zim/metadata.py index 63f40bd6..b9a8b69d 100644 --- a/src/zimscraperlib/zim/metadata.py +++ b/src/zimscraperlib/zim/metadata.py @@ -6,7 +6,7 @@ from collections.abc import Iterable from dataclasses import asdict, dataclass, fields from itertools import filterfalse -from typing import Any, BinaryIO +from typing import Any import regex @@ -194,20 +194,25 @@ def get_encoded(value: str) -> bytes: def libzim_value(self) -> bytes: return self.get_libzim_value() - def get_binary_from(self, value: bytes | BinaryIO | io.BytesIO) -> bytes: - if isinstance(value, BinaryIO): - last_pos = value.tell() - payload = value.read() - value.seek(last_pos) - value = payload - elif isinstance(value, io.BytesIO): - value = value.getvalue() + def get_binary_from(self, value: bytes | io.IOBase | io.BytesIO) -> bytes: + bvalue: bytes = b"" + if isinstance(value, io.BytesIO): + bvalue = value.getvalue() + elif isinstance(value, bytes): + bvalue = value + else: + last_pos: int + if value.seekable(): + last_pos = value.tell() + bvalue = value.read() + if value.seekable(): + value.seek(last_pos) if not self.empty_allowed and not value: raise ValueError("Missing value (empty not allowed)") - return value + return bvalue # native type is bytes - def get_cleaned_value(self, value: bytes | BinaryIO | io.BytesIO) -> bytes: + def get_cleaned_value(self, value: bytes | io.IOBase | io.BytesIO) -> bytes: return self.get_binary_from(value) def get_libzim_value(self) -> bytes: @@ -316,12 +321,12 @@ class IllustrationBasedMetadata(Metadata): meta_mimetype = "image/png" def __init__( - self, value: bytes | BinaryIO | io.BytesIO, name: str | None = None + self, value: bytes | io.IOBase | io.BytesIO, name: str | None = None ) -> None: super().__init__(value=value, name=name) # native type is PNG image buffer - def get_cleaned_value(self, value: bytes | BinaryIO | io.BytesIO) -> bytes: + def get_cleaned_value(self, value: bytes | io.IOBase | io.BytesIO) -> bytes: value = self.get_binary_from(value) if not is_valid_image( image=value, @@ -377,7 +382,7 @@ class IllustrationMetadata(IllustrationBasedMetadata): illustration_scale: int = 1 def __init__( - self, value: bytes | BinaryIO | io.BytesIO, size: int, scale: int = 1 + self, value: bytes | io.IOBase | io.BytesIO, size: int, scale: int = 1 ) -> None: self.illustration_scale = scale self.illustration_size = size @@ -465,7 +470,7 @@ def get_reserved_names(cls) -> list[str]: @x_protected class CustomMetadata(Metadata): - def __init__(self, name: str, value: bytes | BinaryIO | io.BytesIO) -> None: + def __init__(self, name: str, value: bytes | io.IOBase | io.BytesIO) -> None: self.meta_name = name super().__init__(value=value) From e669d24252c3a433838e566b4a54a149e2dc4fc6 Mon Sep 17 00:00:00 2001 From: rgaudin Date: Fri, 13 Dec 2024 12:22:19 +0000 Subject: [PATCH 12/13] Added useless init for CustomMetadata to please coverage --- src/zimscraperlib/zim/metadata.py | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/src/zimscraperlib/zim/metadata.py b/src/zimscraperlib/zim/metadata.py index b9a8b69d..b7e8b012 100644 --- a/src/zimscraperlib/zim/metadata.py +++ b/src/zimscraperlib/zim/metadata.py @@ -483,14 +483,20 @@ def __init__(self, name: str, value: str) -> None: @x_prefixed -class XCustomMetadata(CustomMetadata): ... +class XCustomMetadata(CustomMetadata): + # reimpl just to please coverage + def __init__(self, name: str, value: bytes | io.IOBase | io.BytesIO) -> None: + super().__init__(name=name, value=value) @x_prefixed -class XCustomTextMetadata(CustomTextMetadata): ... +class XCustomTextMetadata(CustomTextMetadata): + # reimpl just to please coverage + def __init__(self, name: str, value: str) -> None: + super().__init__(name=name, value=value) -MANDATORY_ZIM_METADATA_KEYS = StandardMetadataList.get_reserved_names() +MANDATORY_ZIM_METADATA_KEYS: list[str] = StandardMetadataList.get_reserved_names() DEFAULT_DEV_ZIM_METADATA = StandardMetadataList( From 7e2efa19141d8062a780fffe8db81abf774533e8 Mon Sep 17 00:00:00 2001 From: rgaudin Date: Fri, 13 Dec 2024 12:23:33 +0000 Subject: [PATCH 13/13] updated tests for metadata/illus/typing updates --- tests/zim/test_metadata.py | 99 +++++++++++++++++++++++++++++++++++ tests/zim/test_typing.py | 40 ++++++++++++++ tests/zim/test_zim_creator.py | 53 +++++++++++++++++++ 3 files changed, 192 insertions(+) create mode 100644 tests/zim/test_typing.py diff --git a/tests/zim/test_metadata.py b/tests/zim/test_metadata.py index 8bd3560f..434d5b11 100644 --- a/tests/zim/test_metadata.py +++ b/tests/zim/test_metadata.py @@ -508,3 +508,102 @@ def test_std_metadata_values(): def test_raw_metadata(): assert metadata.Metadata(name="Name", value=b"hello") + + +def test_decorators(): + @metadata.allow_empty + class MyMetadata(metadata.XCustomTextMetadata): ... + + assert MyMetadata("Test", "value").libzim_value == b"value" + + @metadata.allow_duplicates + class MyMetadata2(metadata.TextListBasedMetadata): + join_list_with = "|" + + assert ( + MyMetadata2(name="Test", value=["value", "hello", "value"]).libzim_value + == b"value|hello|value" + ) + + raw_value = "\t\n\r\n \tA val \t\awith \bcontrol chars\v\n" + + class MyMetadata3(metadata.TextBasedMetadata): + require_text_cleanup = False + + assert MyMetadata3(name="Test", value=raw_value).libzim_value == raw_value.encode( + "UTF-8" + ) + + @metadata.allow_duplicates + class MyMetadata4(metadata.TextListBasedMetadata): + require_textlist_cleanup = False + join_list_with = "|" + + assert MyMetadata4( + name="Test", value=(raw_value, raw_value) + ).libzim_value == "|".join([raw_value, raw_value]).encode("UTF-8") + + +def test_custom_ones(): + textval = "value" + value = textval.encode("UTF-8") + assert metadata.CustomMetadata("Test", value).libzim_value == value + assert metadata.XCustomMetadata("Name", value).libzim_value == value + assert metadata.XCustomMetadata("Name", value).name == "X-Name" + assert metadata.CustomTextMetadata("Test", textval).libzim_value == value + assert metadata.XCustomTextMetadata("Name", textval).libzim_value == value + assert metadata.XCustomTextMetadata("Name", textval).name == "X-Name" + with pytest.raises(ValueError, match="must be X- prefixed"): + metadata.CustomMetadata("Name", value) + with pytest.raises(ValueError, match="must be X- prefixed"): + metadata.CustomTextMetadata("Name", textval) + + +def test_mandatory_zim_metadata_keys(): + # as per the spec on 2024-12-13 + assert len(metadata.MANDATORY_ZIM_METADATA_KEYS) >= 8 + assert "Illustration_48x48@1" in metadata.MANDATORY_ZIM_METADATA_KEYS + + +def test_default_dev_zim_metadata(): + assert isinstance(metadata.DEFAULT_DEV_ZIM_METADATA, metadata.StandardMetadataList) + # as per the spec on 2024-12-13 + assert len(metadata.DEFAULT_DEV_ZIM_METADATA.values()) == 8 + + +def test_get_binary_from(png_image): + with open(png_image, "rb") as fh: + png_data = fh.read() + # bytes input + assert metadata.Metadata(value=png_data, name="Test").libzim_value == png_data + # io.BytesIO input + assert ( + metadata.Metadata(value=io.BytesIO(png_data), name="Test").libzim_value + == png_data + ) + # BaseIO input + with open(png_image, "rb") as fh: + assert metadata.Metadata(value=fh, name="Test").libzim_value == png_data + + # unseekbale BaseIO + def notseekable(): + return False + + with open(png_image, "rb") as fh: + fh.seekable = notseekable + assert metadata.Metadata(value=fh, name="Test").libzim_value == png_data + + +def test_ensure_missingname_raises(): + with pytest.raises(OSError, match="name missing"): + metadata.Metadata(b"yello") + + +def test_mimetype_usage(): + mimetype = "video/webm" + assert metadata.Metadata(b"hello", "Test", mimetype=mimetype).mimetype == mimetype + assert metadata.Metadata(b"hello", "Test").mimetype == "text/plain;charset=UTF-8" + assert ( + metadata.DEFAULT_DEV_ZIM_METADATA.Illustration_48x48_at_1.mimetype + == "image/png" + ) diff --git a/tests/zim/test_typing.py b/tests/zim/test_typing.py new file mode 100644 index 00000000..a262d8f2 --- /dev/null +++ b/tests/zim/test_typing.py @@ -0,0 +1,40 @@ +from zimscraperlib.typing import Callback + + +class GlobalCounter: + count: int = 0 + + +def update_counter() -> int: + GlobalCounter.count += 1 + return GlobalCounter.count + + +def update_counter_args(bump: int) -> int: + GlobalCounter.count += bump + return GlobalCounter.count + + +def update_counter_kwargs(*, bump: int) -> int: + GlobalCounter.count += bump + return GlobalCounter.count + + +def update_counter_args_and_kwargs(bump: int, *, factor: int) -> int: + GlobalCounter.count += bump * factor + return GlobalCounter.count + + +def test_callback_init(): + assert Callback(update_counter) + assert Callback(update_counter).callable + GlobalCounter.count = 0 + assert GlobalCounter.count == 0 + Callback(update_counter_args, args=(1,)).call() + assert GlobalCounter.count == 1 + Callback(update_counter_args, kwargs={"bump": 1}).call() + assert GlobalCounter.count == 2 + Callback(update_counter_kwargs, kwargs={"bump": 1}).call() + assert GlobalCounter.count == 3 + Callback(update_counter_args_and_kwargs, args=(1,), kwargs={"factor": 2}).call() + assert GlobalCounter.count == 5 diff --git a/tests/zim/test_zim_creator.py b/tests/zim/test_zim_creator.py index d3a0a40c..78c324ea 100644 --- a/tests/zim/test_zim_creator.py +++ b/tests/zim/test_zim_creator.py @@ -181,6 +181,14 @@ def test_add_item_for(tmp_path): creator.add_item_for(path="welcome", title="hello") +def test_additem_bad_content(tmp_path): + with Creator(tmp_path / "test.zim", "welcome").config_dev_metadata() as creator: + with pytest.raises(RuntimeError, match="Unexpected type for content"): + si = StaticItem(path="welcome", content="hello") + si.content = 1 # pyright: ignore[reportAttributeAccessIssue] + creator.add_item(si) + + def test_add_item_for_delete(tmp_path, html_file): fpath = tmp_path / "test.zim" local_path = pathlib.Path(tmp_path / "somefile.html") @@ -480,6 +488,51 @@ def cb(): assert Store.called is True +def test_item_callbacks(tmp_path): + fpath = tmp_path / "test.zim" + + class Store: + called = 0 + + def cb(): + Store.called += 1 + + with Creator(fpath, "").config_dev_metadata() as creator: + creator.add_item_for( + path="hello", + content="hello", + callbacks=[Callback(func=cb), Callback(func=cb)], + ) + + assert Store.called == 2 + + class UnCallableCallback(Callback): + @property + def callable(self) -> bool: + return False + + with Creator(fpath, "").config_dev_metadata() as creator: + # +2 + creator.add_item( + StaticItem(path="hello", content="hello"), + callbacks=[Callback(func=cb), Callback(func=cb)], + ) + # + 0 + creator.add_item(StaticItem(path="hello2", content="hello")) + # +1 + creator.add_item( + StaticItem(path="hello3", content="hello"), + callbacks=Callback(func=cb), + ) + # + 0 + creator.add_item( + StaticItem(path="hello4", content="hello"), + callbacks=UnCallableCallback(func=cb), + ) + + assert Store.called == 5 + + def test_compess_hints(tmp_path, html_file): with Creator(tmp_path / "test.zim", "").config_dev_metadata() as creator: creator.add_item_for(