From dbf7718aee8d98ac59577ff54e5b6ebf13eefaba Mon Sep 17 00:00:00 2001 From: benoit74 Date: Mon, 29 Jul 2024 14:00:18 +0000 Subject: [PATCH] fixup! Drop disallowed control characters and strip blank characters --- CHANGELOG.md | 1 + src/zimscraperlib/zim/creator.py | 22 +++++++++------------- 2 files changed, 10 insertions(+), 13 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index e770241d..182f53c4 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -10,6 +10,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ### Added - Add utility function to compute ZIM Tags #164, including deduplication #156 +- Metadata now automatically drops control characters #159 ### Fixed diff --git a/src/zimscraperlib/zim/creator.py b/src/zimscraperlib/zim/creator.py index 97478a88..9198980a 100644 --- a/src/zimscraperlib/zim/creator.py +++ b/src/zimscraperlib/zim/creator.py @@ -67,7 +67,7 @@ ) # All control characters are disallowed in str metadata except \n, \r and \t -CONTROL_CHARACTERS_REGEX = regex.compile(r"(?![\n\t\r])\p{C}") +UNWANTED_CONTROL_CHARACTERS_REGEX = regex.compile(r"(?![\n\t\r])\p{C}") def mimetype_for( @@ -256,10 +256,9 @@ def add_metadata( ): # drop control characters before passing them to libzim if isinstance(content, str): - if CONTROL_CHARACTERS_REGEX.search(content): - content = CONTROL_CHARACTERS_REGEX.sub("", content).strip(" \r\n\t") - else: - content = content.strip(" \r\n\t") + content = UNWANTED_CONTROL_CHARACTERS_REGEX.sub("", content).strip( + " \r\n\t" + ) if not self.disable_metadata_checks: self.validate_metadata(name, content) if name == "Date" and isinstance(content, (datetime.date, datetime.datetime)): @@ -314,16 +313,13 @@ def config_metadata( } ) self._metadata.update(extras) - for k, v in self._metadata.items(): + for metadata_key, metadata_value in self._metadata.items(): # drop control characters so that proper value is stored in memory and # logged in DEBUG mode ; also strip blank characters - if 
isinstance(v, str): - if CONTROL_CHARACTERS_REGEX.search(v): - self._metadata[k] = CONTROL_CHARACTERS_REGEX.sub("", v).strip( - " \r\n\t" - ) - else: - self._metadata[k] = v.strip(" \r\n\t") + if isinstance(metadata_value, str): + self._metadata[metadata_key] = UNWANTED_CONTROL_CHARACTERS_REGEX.sub( + "", metadata_value + ).strip(" \r\n\t") return self def config_dev_metadata(self, **extras: str):