Skip to content

Issue 155: log metadata prior to verification (PR #160 fixes) #172

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 1 commit into from
Jun 21, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -7,11 +7,16 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0

## [Unreleased]

### Added

- `zim.creator.Creator._log_metadata()` to log (DEBUG) all metadata set on `_metadata` (prior to start())

### Changed

- Migrate the **VideoWebmLow** and **VideoWebmHigh** presets to VP9 for smaller file size #79
- New preset versions are v3 and v2 respectively
- Simplify type annotations by replacing Union and Optional with pipe character ("|") for improved readability and clarity
- Call `Creator._log_metadata()` from `Creator.start()` when the logger is enabled for DEBUG

### Fixed
- Add back the `--runinstalled` flag for test execution to allow smooth testing on other build chains (#139)
Expand Down
53 changes: 53 additions & 0 deletions src/zimscraperlib/zim/creator.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,14 +20,18 @@
from __future__ import annotations

import datetime
import io
import logging
import pathlib
import re
import weakref
from collections.abc import Callable, Iterable
from typing import Any

import libzim.writer # pyright: ignore
import PIL.Image

from zimscraperlib import logger
from zimscraperlib.constants import (
DEFAULT_DEV_ZIM_METADATA,
FRONT_ARTICLE_MIMETYPES,
Expand Down Expand Up @@ -146,7 +150,56 @@ def config_indexing(
self.__indexing_configured = True
return self

def _log_metadata(self):
    """Log (DEBUG) all metadata set on (_metadata ~ config_metadata())

    Entries of ``self._metadata`` are logged one per ``logger.debug()`` call,
    sorted by metadata name. Each value is described according to what it is:

    - ``Illustration_<w>x<h>@<scale>`` names: size, dimensions and format of
      the image, or size and detected mimetype if it cannot be opened as one
    - ``bytes`` detected as text: the UTF-8 decoded string (or a note that it
      could not be decoded)
    - other ``bytes``: size and detected mimetype
    - anything else: its ``str()`` form, or its type name if that fails

    This is a best-effort diagnostic helper: an unexpected value is reported
    in the log rather than raised.

    Does not log metadata set post-start (via add_metadata())"""
    for name, value in sorted(self._metadata.items()):
        # illustration mandates an Image
        if re.match(r"^Illustration_(\d+)x(\d+)@(\d+)$", name):
            try:
                with PIL.Image.open(io.BytesIO(value)) as img:
                    logger.debug(
                        f"Metadata: {name} is a {len(value)} bytes "
                        f"{img.size[0]}x{img.size[1]}px {img.format} Image"
                    )
            except Exception:
                # Describing the blob needs len() and slicing, which fail
                # themselves when the value is not bytes-like (e.g. None):
                # the very reason BytesIO() may have raised. Guard it so that
                # logging never raises from within this handler.
                try:
                    description = (
                        f"a {len(value)} bytes "
                        f"{get_content_mimetype(value[:64])} blob "
                        "not recognized as an Image"
                    )
                except Exception:
                    description = f"unexpected data type: {type(value).__name__}"
                logger.debug(f"Metadata: {name} is {description}")
            continue

        # bytes are either encoded string or arbitrary data
        if isinstance(value, bytes):
            # only the head of the content is used for mimetype detection
            mimetype = get_content_mimetype(value[:64])
            if not mimetype.startswith("text/"):
                logger.debug(
                    f"Metadata: {name} is a {len(value)} bytes {mimetype} blob"
                )
                continue
            # detected as text, yet not necessarily UTF-8 (could be UTF-32…)
            try:
                text = value.decode("UTF-8")
            except UnicodeDecodeError:
                logger.debug(
                    f"Metadata: {name} is a {len(value)} bytes {mimetype} blob "
                    "not decodable as an UTF-8 string"
                )
            else:
                logger.debug(f"Metadata: {name} = {text}")
            continue

        # rest is either printable or unexpected
        try:
            logger.debug(f"Metadata: {name} = {value!s}")
        except Exception:
            # __str__ of an arbitrary object may raise: report its type instead
            logger.debug(
                f"Metadata: {name} is unexpected data type: {type(value).__name__}"
            )

def start(self):
if logger.isEnabledFor(logging.DEBUG): # pragma: no cover
self._log_metadata()

if not all(self._metadata.get(key) for key in MANDATORY_ZIM_METADATA_KEYS):
raise ValueError("Mandatory metadata are not all set.")

Expand Down
101 changes: 101 additions & 0 deletions tests/zim/test_zim_creator.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,13 +4,15 @@
import base64
import datetime
import io
import logging
import pathlib
import random
import shutil
import subprocess
import sys
import tempfile
import time
from unittest.mock import call, patch

import pytest
from libzim.writer import Compression # pyright: ignore
Expand Down Expand Up @@ -540,6 +542,105 @@ def test_check_metadata(tmp_path):
Creator(tmp_path, "").config_dev_metadata(LongDescription="T" * 5000).start()


@pytest.mark.parametrize(
    "tags",
    [
        "wikipedia;_category:wikipedia;_pictures:no;_videos:no;_details:yes;"
        "_ftindex:yes",
        [
            "wikipedia",
            "_category:wikipedia",
            "_pictures:no",
            "_videos:no",
            "_details:yes",
            "_ftindex:yes",
        ],
    ],
)
@patch("zimscraperlib.zim.creator.logger", autospec=True)
def test_start_logs_metadata_log_contents(mocked_logger, png_image, tags, tmp_path):
    """Check the debug messages emitted by ``Creator._log_metadata()``

    Metadata is configured through ``config_metadata()`` and then extended
    directly on ``_metadata`` with values exercising each logging branch:
    a non-image illustration, UTF-8 bytes, text bytes that are not UTF-8,
    a binary blob and an object whose ``__str__`` raises.
    The module logger is patched so calls to ``debug()`` can be inspected."""

    class NotPrintable:
        def __str__(self):
            raise ValueError("Not printable I said")

    # patched logger reports DEBUG (and only DEBUG) as enabled
    mocked_logger.isEnabledFor.side_effect = lambda level: level == logging.DEBUG

    with open(png_image, "rb") as stream:
        illustration = stream.read()

    configured_metadata = {
        "Name": "wikipedia_fr_football",
        "Title": "English Wikipedia",
        "Creator": "English speaking Wikipedia contributors",
        "Publisher": "Wikipedia user Foobar",
        "Date": "2009-11-21",
        "Description": "All articles (without images) from the english Wikipedia",
        "LongDescription": "This ZIM file contains all articles (without images)"
        " from the english Wikipedia by 2009-11-10. The topics are...",
        "Language": "eng",
        "License": "CC-BY",
        "Tags": tags,
        "Flavour": "nopic",
        "Source": "https://en.wikipedia.org/",
        "Scraper": "mwoffliner 1.2.3",
        "Illustration_48x48_at_1": illustration,
        "TestMetadata": "Test Metadata",
    }
    # values set behind config_metadata()'s back to reach the unusual branches
    raw_metadata = {
        "Illustration_96x96@1": b"%PDF-1.5\n%\xe2\xe3\xcf\xd3",
        "Chars": b"\xc5\xa1\xc9\x94\xc9\x9b",
        "Chars-32": b"\xff\xfe\x00\x00a\x01\x00\x00T\x02\x00\x00[\x02\x00\x00",
        "Video": b"\x00\x00\x00 ftypisom\x00\x00\x02\x00isomiso2avc1mp41\x00",
        "Toupie": NotPrintable(),
    }

    zim_path = tmp_path / "test_config.zim"
    creator = Creator(zim_path, "", disable_metadata_checks=True)
    creator = creator.config_metadata(**configured_metadata)
    creator._metadata.update(raw_metadata)

    creator._log_metadata()

    # /!\ this must be alpha sorted
    expected_calls = [
        call("Metadata: Chars = šɔɛ"),
        call(
            "Metadata: Chars-32 is a 16 bytes text/plain blob "
            "not decodable as an UTF-8 string"
        ),
        call("Metadata: Creator = English speaking Wikipedia contributors"),
        call("Metadata: Date = 2009-11-21"),
        call(
            "Metadata: Description = All articles (without images) from the "
            "english Wikipedia"
        ),
        call("Metadata: Flavour = nopic"),
        call("Metadata: Illustration_48x48@1 is a 3274 bytes 48x48px PNG Image"),
        call(
            "Metadata: Illustration_96x96@1 is a 14 bytes "
            "application/pdf blob not recognized as an Image"
        ),
        call("Metadata: Language = eng"),
        call("Metadata: License = CC-BY"),
        call(
            "Metadata: LongDescription = This ZIM file contains all articles "
            "(without images) from the english Wikipedia by 2009-11-10. "
            "The topics are..."
        ),
        call("Metadata: Name = wikipedia_fr_football"),
        call("Metadata: Publisher = Wikipedia user Foobar"),
        call("Metadata: Relation = None"),
        call("Metadata: Scraper = mwoffliner 1.2.3"),
        call("Metadata: Source = https://en.wikipedia.org/"),
        call(f"Metadata: Tags = {tags}"),
        call("Metadata: TestMetadata = Test Metadata"),
        call("Metadata: Title = English Wikipedia"),
        call("Metadata: Toupie is unexpected data type: NotPrintable"),
        call("Metadata: Video is a 33 bytes video/mp4 blob"),
    ]
    mocked_logger.debug.assert_has_calls(expected_calls)


def test_relax_metadata(tmp_path):
Creator(tmp_path, "", disable_metadata_checks=True).config_dev_metadata(
Description="T" * 90
Expand Down
Loading