Skip to content

Commit 3d749ab

Browse files
committed
Simplified _log_metatada logic
- Single method (called on start but can be called manually) - Image format and dimension logged for illustration only - all printable meta logged - fallback for other bytes and unexpected types - Tests for all use cases - Changed test to only test log_format directly
1 parent cdb71aa commit 3d749ab

File tree

3 files changed

+74
-28
lines changed

3 files changed

+74
-28
lines changed

CHANGELOG.md

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -7,12 +7,16 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
77

88
## [Unreleased]
99

10+
### Added
11+
12+
- `zim.creator.Creator._log_metadata()` to log (DEBUG) all metadata set on `_metadata` (prior to start())
13+
1014
### Changed
1115

1216
- Migrate the **VideoWebmLow** and **VideoWebmHigh** presets to VP9 for smaller file size #79
1317
- New preset versions are v3 and v2 respectively
1418
- Simplify type annotations by replacing Union and Optional with pipe character ("|") for improved readability and clarity
15-
- Add debug logging of metadata fields
19+
- Calling `Creator._log_metadata()` on `Creator.start()` if running in DEBUG
1620

1721
### Fixed
1822
- Add back the `--runinstalled` flag for test execution to allow smooth testing on other build chains (#139)

src/zimscraperlib/zim/creator.py

Lines changed: 43 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -150,33 +150,53 @@ def config_indexing(
150150
self.__indexing_configured = True
151151
return self
152152

153-
def _is_illustration_metadata_name(self, name: str) -> bool:
154-
"""Return True if name is a valid illustration metadata name"""
155-
return name.startswith("Illustration_")
156-
157-
def _get_illustration_metadata_details(
158-
self,
159-
value: bytes,
160-
) -> str | None:
161-
"""Return image format for debug logging of illustration metadata"""
162-
try:
163-
with PIL.Image.open(io.BytesIO(value)) as img:
164-
if img is not None:
165-
return f"{img.format} {img.size[0]}x{img.size[1]}"
166-
except PIL.UnidentifiedImageError as e:
167-
return f"Image format issue: {e}"
168-
return f"Unknown image format, {len(value)} bytes"
169-
170153
def _log_metadata(self):
154+
"""Log (DEBUG) all metadata set on (_metadata ~ config_metadata())
155+
156+
Does not log metadata set post-start (via add_metadata())"""
171157
for name, value in sorted(self._metadata.items()):
172-
if self._is_illustration_metadata_name(name) and isinstance(value, bytes):
173-
illus_md = self._get_illustration_metadata_details(value)
174-
else:
175-
illus_md = None
176-
logger.debug(f"Metadata: {name} = {(illus_md if illus_md else value)}")
158+
# illustration mandates an Image
159+
if re.match(r"^Illustration_(\d+)x(\d+)@(\d+)$", name):
160+
try:
161+
with PIL.Image.open(io.BytesIO(value)) as img:
162+
logger.debug(
163+
f"Metadata: {name} is a {len(value)} bytes "
164+
f"{img.size[0]}x{img.size[1]}px {img.format} Image"
165+
)
166+
except Exception:
167+
logger.debug(
168+
f"Metadata: {name} is a {len(value)} bytes "
169+
f"{get_content_mimetype(value[:64])} blob "
170+
"not recognized as an Image"
171+
)
172+
continue
173+
174+
# bytes are either encoded string or arbitrary data
175+
if isinstance(value, bytes):
176+
mimetype = get_content_mimetype(value[:64])
177+
if not mimetype.startswith("text/"):
178+
logger.debug(
179+
f"Metadata: {name} is a {len(value)} bytes {mimetype} blob"
180+
)
181+
continue
182+
try:
183+
logger.debug(f"Metadata: {name} = {value.decode('UTF-8')}")
184+
except Exception:
185+
logger.debug(
186+
f"Metadata: {name} is a {len(value)} bytes {mimetype} blob"
187+
)
188+
continue
189+
190+
# rest is either printable or unexpected
191+
try:
192+
logger.debug(f"Metadata: {name} = {value!s}")
193+
except Exception:
194+
logger.debug(
195+
f"Metadata: {name} is unexpected data type: {type(value).__name__}"
196+
)
177197

178198
def start(self):
179-
if logger.isEnabledFor(logging.DEBUG):
199+
if logger.isEnabledFor(logging.DEBUG): # pragma: no cover
180200
self._log_metadata()
181201

182202
if not all(self._metadata.get(key) for key in MANDATORY_ZIM_METADATA_KEYS):

tests/zim/test_zim_creator.py

Lines changed: 26 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -567,7 +567,7 @@ def test_start_logs_metadata_log_contents(mocked_logger, png_image, tags, tmp_pa
567567
fpath = tmp_path / "test_config.zim"
568568
with open(png_image, "rb") as fh:
569569
png_data = fh.read()
570-
creator = Creator(fpath, "").config_metadata(
570+
creator = Creator(fpath, "", disable_metadata_checks=True).config_metadata(
571571
Name="wikipedia_fr_football",
572572
Title="English Wikipedia",
573573
Creator="English speaking Wikipedia contributors",
@@ -585,18 +585,38 @@ def test_start_logs_metadata_log_contents(mocked_logger, png_image, tags, tmp_pa
585585
Illustration_48x48_at_1=png_data,
586586
TestMetadata="Test Metadata",
587587
)
588-
creator.start()
589-
creator.finish()
588+
589+
class NotPrintable:
590+
def __str__(self):
591+
raise ValueError("Not printable I said")
592+
593+
creator._metadata.update(
594+
{
595+
"Illustration_96x96@1": b"%PDF-1.5\n%\xe2\xe3\xcf\xd3",
596+
"Chars": b"\xc5\xa1\xc9\x94\xc9\x9b",
597+
"Chars-32": b"\xff\xfe\x00\x00a\x01\x00\x00T\x02\x00\x00[\x02\x00\x00",
598+
"Video": b"\x00\x00\x00 ftypisom\x00\x00\x02\x00isomiso2avc1mp41\x00",
599+
"Toupie": NotPrintable(),
600+
}
601+
)
602+
creator._log_metadata()
603+
# /!\ this must be alpha sorted
590604
mocked_logger.debug.assert_has_calls(
591605
[
606+
call("Metadata: Chars = šɔɛ"),
607+
call("Metadata: Chars-32 is a 16 bytes text/plain blob"),
592608
call("Metadata: Creator = English speaking Wikipedia contributors"),
593609
call("Metadata: Date = 2009-11-21"),
594610
call(
595611
"Metadata: Description = All articles (without images) from the "
596612
"english Wikipedia"
597613
),
598614
call("Metadata: Flavour = nopic"),
599-
call("Metadata: Illustration_48x48@1 = PNG 48x48"),
615+
call("Metadata: Illustration_48x48@1 is a 3274 bytes 48x48px PNG Image"),
616+
call(
617+
"Metadata: Illustration_96x96@1 is a 14 bytes "
618+
"application/pdf blob not recognized as an Image"
619+
),
600620
call("Metadata: Language = eng"),
601621
call("Metadata: License = CC-BY"),
602622
call(
@@ -612,6 +632,8 @@ def test_start_logs_metadata_log_contents(mocked_logger, png_image, tags, tmp_pa
612632
call(f"Metadata: Tags = {tags}"),
613633
call("Metadata: TestMetadata = Test Metadata"),
614634
call("Metadata: Title = English Wikipedia"),
635+
call("Metadata: Toupie is unexpected data type: NotPrintable"),
636+
call("Metadata: Video is a 33 bytes video/mp4 blob"),
615637
]
616638
)
617639

0 commit comments

Comments
 (0)