Skip to content

Commit

Permalink
Drop disallowed control characters and strip blank characters
Browse files Browse the repository at this point in the history
  • Loading branch information
benoit74 committed Jul 10, 2024
1 parent 1eddabc commit 6ffd72c
Show file tree
Hide file tree
Showing 2 changed files with 71 additions and 0 deletions.
20 changes: 20 additions & 0 deletions src/zimscraperlib/zim/creator.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@

import libzim.writer # pyright: ignore
import PIL.Image
import regex

from zimscraperlib import logger
from zimscraperlib.constants import (
Expand Down Expand Up @@ -65,6 +66,9 @@
re.MULTILINE | re.DOTALL,
)

# All control characters are disallowed in str metadata except \n, \r and \t.
# The negative lookahead `(?![\n\t\r])` exempts those three characters; any
# other single character matching `\p{C}` is matched so callers can remove it.
# NOTE(review): `\p{C}` is the Unicode "Other" general category, which is
# broader than control characters alone (it presumably also covers format
# characters such as zero-width joiners, plus private-use and unassigned
# code points) -- confirm that dropping those is intended.
CONTROL_CHARACTERS_REGEX = regex.compile(r"(?![\n\t\r])\p{C}")


def mimetype_for(
path: str,
Expand Down Expand Up @@ -250,6 +254,12 @@ def add_metadata(
content: str | bytes | datetime.date | datetime.datetime | Iterable[str],
mimetype: str = "text/plain;charset=UTF-8",
):
# drop control characters before passing them to libzim
if isinstance(content, str):
if CONTROL_CHARACTERS_REGEX.search(content):
content = CONTROL_CHARACTERS_REGEX.sub("", content).strip(" \r\n\t")
else:
content = content.strip(" \r\n\t")
if not self.disable_metadata_checks:
self.validate_metadata(name, content)
if name == "Date" and isinstance(content, (datetime.date, datetime.datetime)):
Expand Down Expand Up @@ -304,6 +314,16 @@ def config_metadata(
}
)
self._metadata.update(extras)
for k, v in self._metadata.items():
# drop control characters so that proper value is stored in memory and
# logged in DEBUG mode ; also strip blank characters
if isinstance(v, str):
if CONTROL_CHARACTERS_REGEX.search(v):
self._metadata[k] = CONTROL_CHARACTERS_REGEX.sub("", v).strip(
" \r\n\t"
)
else:
self._metadata[k] = v.strip(" \r\n\t")
return self

def config_dev_metadata(self, **extras: str):
Expand Down
51 changes: 51 additions & 0 deletions tests/zim/test_zim_creator.py
Original file line number Diff line number Diff line change
Expand Up @@ -724,6 +724,57 @@ def test_config_metadata(tmp_path, png_image, tags):
assert reader.get_text_metadata("TestMetadata") == "Test Metadata"


def test_config_metadata_control_characters(tmp_path):
    """Check that string metadata is cleaned in memory and in the ZIM file.

    Each raw value is submitted twice: once through ``config_dev_metadata``
    (under its own name) and once through ``add_metadata`` (under the name
    suffixed with ``_1``). In both cases the stored value must have the
    disallowed control characters removed and surrounding blanks stripped,
    while inner ``\\r``, ``\\n`` and ``\\t`` are kept.
    """
    fpath = tmp_path / "test_config.zim"

    # metadata name -> (raw value submitted, cleaned value expected back)
    cases = {
        "Description": (
            "\t\n\r\n \tA description \awith \bcontrol characters\v",
            "A description with control characters",
        ),
        "LongDescription": (
            "A description \rwith \a\ncontrol characters\tsss\t\n\r\n \t",
            "A description \rwith \ncontrol characters\tsss",
        ),
        "Creator": (
            " A creator ",
            "A creator",
        ),
    }

    raw_values = {name: raw for name, (raw, _) in cases.items()}
    creator = Creator(fpath, "").config_dev_metadata(**raw_values)

    # values kept in memory must already be cleaned
    for name, (_, expected) in cases.items():
        assert creator._metadata[name] == expected

    with creator:
        for name, raw in raw_values.items():
            creator.add_metadata(f"{name}_1", raw)

    assert fpath.exists()

    # values read back from the ZIM must be cleaned for both code paths
    reader = Archive(fpath)
    for name, (_, expected) in cases.items():
        assert reader.get_text_metadata(name) == expected
    for name, (_, expected) in cases.items():
        assert reader.get_text_metadata(f"{name}_1") == expected


@pytest.mark.parametrize(
"name,value,valid",
[
Expand Down

0 comments on commit 6ffd72c

Please sign in to comment.