Skip to content

Commit def563a

Browse files
richterdavidbenoit74
authored and committed
Automated logging of creator metadata
1 parent ada7f73 commit def563a

File tree

3 files changed

+159
-0
lines changed

3 files changed

+159
-0
lines changed

CHANGELOG.md

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,11 +7,16 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
77

88
## [Unreleased]
99

10+
### Added
11+
12+
- `zim.creator.Creator._log_metadata()` to log (at DEBUG level) all metadata set on `_metadata` (i.e. prior to `start()`)
13+
1014
### Changed
1115

1216
- Migrate the **VideoWebmLow** and **VideoWebmHigh** presets to VP9 for smaller file size #79
1317
- New preset versions are v3 and v2 respectively
1418
- Simplify type annotations by replacing Union and Optional with pipe character ("|") for improved readability and clarity
19+
- `Creator.start()` now calls `Creator._log_metadata()` when the logger is enabled for DEBUG
1520

1621
### Fixed
1722
- Add back the `--runinstalled` flag for test execution to allow smooth testing on other build chains (#139)

src/zimscraperlib/zim/creator.py

Lines changed: 53 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -20,14 +20,18 @@
2020
from __future__ import annotations
2121

2222
import datetime
23+
import io
24+
import logging
2325
import pathlib
2426
import re
2527
import weakref
2628
from collections.abc import Callable, Iterable
2729
from typing import Any
2830

2931
import libzim.writer # pyright: ignore
32+
import PIL.Image
3033

34+
from zimscraperlib import logger
3135
from zimscraperlib.constants import (
3236
DEFAULT_DEV_ZIM_METADATA,
3337
FRONT_ARTICLE_MIMETYPES,
@@ -146,7 +150,56 @@ def config_indexing(
146150
self.__indexing_configured = True
147151
return self
148152

153+
def _log_metadata(self):
    """Log (DEBUG) every metadata entry currently set on `_metadata`.

    Entries are processed sorted by name and each one produces a single DEBUG
    record:

    - `Illustration_<w>x<h>@<scale>` entries holding bytes are opened with PIL
      and described by byte length, pixel size and image format. When PIL
      cannot open them, the mimetype returned by `get_content_mimetype()` for
      their first 64 bytes is reported instead.
    - other `bytes` values are printed decoded as UTF-8 when that mimetype
      starts with `text/`, and described by length and mimetype otherwise (or
      when decoding fails).
    - any other value is printed through `str()`, or reported by type name
      when `str()` raises.

    This helper never raises because of an unexpected value type: it is meant
    to be called from `start()` for diagnostics only.

    Does not log metadata set post-start (via add_metadata())
    """
    for name, value in sorted(self._metadata.items()):
        # illustration mandates an Image. Values that are not bytes-like can be
        # neither measured nor sniffed, so they fall through to the generic
        # branch at the bottom instead of raising from the fallback below.
        if re.match(r"^Illustration_(\d+)x(\d+)@(\d+)$", name) and isinstance(
            value, (bytes, bytearray)
        ):
            try:
                with PIL.Image.open(io.BytesIO(value)) as img:
                    details = f"{img.size[0]}x{img.size[1]}px {img.format} Image"
            except Exception:
                # PIL raises various exception types on unreadable data
                details = (
                    f"{get_content_mimetype(value[:64])} blob "
                    "not recognized as an Image"
                )
            logger.debug(f"Metadata: {name} is a {len(value)} bytes {details}")
            continue

        # bytes are either encoded string or arbitrary data
        if isinstance(value, bytes):
            mimetype = get_content_mimetype(value[:64])
            if not mimetype.startswith("text/"):
                logger.debug(
                    f"Metadata: {name} is a {len(value)} bytes {mimetype} blob"
                )
                continue
            try:
                text = value.decode("UTF-8")
            except UnicodeDecodeError:
                logger.debug(
                    f"Metadata: {name} is a {len(value)} bytes {mimetype} blob "
                    "not decodable as an UTF-8 string"
                )
            else:
                logger.debug(f"Metadata: {name} = {text}")
            continue

        # rest is either printable or unexpected
        try:
            printable = f"{value!s}"
        except Exception:
            # __str__ of an arbitrary object may raise anything
            logger.debug(
                f"Metadata: {name} is unexpected data type: {type(value).__name__}"
            )
        else:
            logger.debug(f"Metadata: {name} = {printable}")
198+
149199
def start(self):
200+
if logger.isEnabledFor(logging.DEBUG): # pragma: no cover
201+
self._log_metadata()
202+
150203
if not all(self._metadata.get(key) for key in MANDATORY_ZIM_METADATA_KEYS):
151204
raise ValueError("Mandatory metadata are not all set.")
152205

tests/zim/test_zim_creator.py

Lines changed: 101 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,13 +4,15 @@
44
import base64
55
import datetime
66
import io
7+
import logging
78
import pathlib
89
import random
910
import shutil
1011
import subprocess
1112
import sys
1213
import tempfile
1314
import time
15+
from unittest.mock import call, patch
1416

1517
import pytest
1618
from libzim.writer import Compression # pyright: ignore
@@ -540,6 +542,105 @@ def test_check_metadata(tmp_path):
540542
Creator(tmp_path, "").config_dev_metadata(LongDescription="T" * 5000).start()
541543

542544

545+
@pytest.mark.parametrize(
    "tags",
    [
        (
            "wikipedia;_category:wikipedia;_pictures:no;_videos:no;_details:yes;"
            "_ftindex:yes"
        ),
        (
            [
                "wikipedia",
                "_category:wikipedia",
                "_pictures:no",
                "_videos:no",
                "_details:yes",
                "_ftindex:yes",
            ]
        ),
    ],
)
@patch("zimscraperlib.zim.creator.logger", autospec=True)
def test_start_logs_metadata_log_contents(mocked_logger, png_image, tags, tmp_path):
    """Check the exact DEBUG records emitted by Creator._log_metadata().

    The creator module's logger is replaced by a mock; regular metadata is set
    through config_metadata() and a few odd values (non-image illustration,
    UTF-8 and non UTF-8 text, binary blob, unprintable object) are injected
    straight into `_metadata`.
    """

    class NotPrintable:
        def __str__(self):
            raise ValueError("Not printable I said")

    def only_debug_enabled(level):
        return level == logging.DEBUG

    mocked_logger.isEnabledFor.side_effect = only_debug_enabled

    zim_path = tmp_path / "test_config.zim"
    illustration = pathlib.Path(png_image).read_bytes()

    creator = Creator(zim_path, "", disable_metadata_checks=True).config_metadata(
        Name="wikipedia_fr_football",
        Title="English Wikipedia",
        Creator="English speaking Wikipedia contributors",
        Publisher="Wikipedia user Foobar",
        Date="2009-11-21",
        Description="All articles (without images) from the english Wikipedia",
        LongDescription="This ZIM file contains all articles (without images)"
        " from the english Wikipedia by 2009-11-10. The topics are...",
        Language="eng",
        License="CC-BY",
        Tags=tags,
        Flavour="nopic",
        Source="https://en.wikipedia.org/",
        Scraper="mwoffliner 1.2.3",
        Illustration_48x48_at_1=illustration,
        TestMetadata="Test Metadata",
    )

    # values injected directly, bypassing config_metadata()
    odd_metadata = {
        "Illustration_96x96@1": b"%PDF-1.5\n%\xe2\xe3\xcf\xd3",
        "Chars": b"\xc5\xa1\xc9\x94\xc9\x9b",
        "Chars-32": b"\xff\xfe\x00\x00a\x01\x00\x00T\x02\x00\x00[\x02\x00\x00",
        "Video": b"\x00\x00\x00 ftypisom\x00\x00\x02\x00isomiso2avc1mp41\x00",
        "Toupie": NotPrintable(),
    }
    creator._metadata.update(odd_metadata)

    creator._log_metadata()

    # /!\ keep this list alphabetically sorted by metadata name
    expected_calls = [
        call("Metadata: Chars = šɔɛ"),
        call(
            "Metadata: Chars-32 is a 16 bytes text/plain blob "
            "not decodable as an UTF-8 string"
        ),
        call("Metadata: Creator = English speaking Wikipedia contributors"),
        call("Metadata: Date = 2009-11-21"),
        call(
            "Metadata: Description = All articles (without images) from the "
            "english Wikipedia"
        ),
        call("Metadata: Flavour = nopic"),
        call("Metadata: Illustration_48x48@1 is a 3274 bytes 48x48px PNG Image"),
        call(
            "Metadata: Illustration_96x96@1 is a 14 bytes "
            "application/pdf blob not recognized as an Image"
        ),
        call("Metadata: Language = eng"),
        call("Metadata: License = CC-BY"),
        call(
            "Metadata: LongDescription = This ZIM file contains all articles "
            "(without images) from the english Wikipedia by 2009-11-10. "
            "The topics are..."
        ),
        call("Metadata: Name = wikipedia_fr_football"),
        call("Metadata: Publisher = Wikipedia user Foobar"),
        call("Metadata: Relation = None"),
        call("Metadata: Scraper = mwoffliner 1.2.3"),
        call("Metadata: Source = https://en.wikipedia.org/"),
        call(f"Metadata: Tags = {tags}"),
        call("Metadata: TestMetadata = Test Metadata"),
        call("Metadata: Title = English Wikipedia"),
        call("Metadata: Toupie is unexpected data type: NotPrintable"),
        call("Metadata: Video is a 33 bytes video/mp4 blob"),
    ]
    mocked_logger.debug.assert_has_calls(expected_calls)
642+
643+
543644
def test_relax_metadata(tmp_path):
544645
Creator(tmp_path, "", disable_metadata_checks=True).config_dev_metadata(
545646
Description="T" * 90

0 commit comments

Comments
 (0)