Skip to content

Commit

Permalink
fixup! Drop disallowed control characters and strip blank characters
Browse files Browse the repository at this point in the history
  • Loading branch information
benoit74 committed Jul 29, 2024
1 parent 6ffd72c commit dbf7718
Show file tree
Hide file tree
Showing 2 changed files with 10 additions and 13 deletions.
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
### Added

- Add utility function to compute ZIM Tags #164, including deduplication #156
- Metadata now automatically drops control characters #159

### Fixed

Expand Down
22 changes: 9 additions & 13 deletions src/zimscraperlib/zim/creator.py
Original file line number Diff line number Diff line change
Expand Up @@ -67,7 +67,7 @@
)

# All control characters are disallowed in str metadata except \n, \r and \t
CONTROL_CHARACTERS_REGEX = regex.compile(r"(?![\n\t\r])\p{C}")
UNWANTED_CONTROL_CHARACTERS_REGEX = regex.compile(r"(?![\n\t\r])\p{C}")


def mimetype_for(
Expand Down Expand Up @@ -256,10 +256,9 @@ def add_metadata(
):
# drop control characters before passing them to libzim
if isinstance(content, str):
if CONTROL_CHARACTERS_REGEX.search(content):
content = CONTROL_CHARACTERS_REGEX.sub("", content).strip(" \r\n\t")
else:
content = content.strip(" \r\n\t")
content = UNWANTED_CONTROL_CHARACTERS_REGEX.sub("", content).strip(
" \r\n\t"
)
if not self.disable_metadata_checks:
self.validate_metadata(name, content)
if name == "Date" and isinstance(content, (datetime.date, datetime.datetime)):
Expand Down Expand Up @@ -314,16 +313,13 @@ def config_metadata(
}
)
self._metadata.update(extras)
for k, v in self._metadata.items():
for metadata_key, metadata_value in self._metadata.items():
# drop control characters so that proper value is stored in memory and
# logged in DEBUG mode ; also strip blank characters
if isinstance(v, str):
if CONTROL_CHARACTERS_REGEX.search(v):
self._metadata[k] = CONTROL_CHARACTERS_REGEX.sub("", v).strip(
" \r\n\t"
)
else:
self._metadata[k] = v.strip(" \r\n\t")
if isinstance(metadata_value, str):
self._metadata[metadata_key] = UNWANTED_CONTROL_CHARACTERS_REGEX.sub(
"", metadata_value
).strip(" \r\n\t")
return self

def config_dev_metadata(self, **extras: str):
Expand Down

0 comments on commit dbf7718

Please sign in to comment.