Skip to content

Commit

Permalink
BUG: Lookup index in _xobj_to_image can be ByteStringObject (#1366)
Browse files Browse the repository at this point in the history
DEV: Adjusted the File class's __str__ and __repr__ to ease debugging
  • Loading branch information
MartinThoma authored Sep 25, 2022
1 parent 26bdc6b commit eca1a84
Show file tree
Hide file tree
Showing 4 changed files with 47 additions and 1 deletion.
17 changes: 17 additions & 0 deletions PyPDF2/_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -416,7 +416,24 @@ def rename_kwargs( # type: ignore
)


def _human_readable_bytes(bytes: int) -> str:
if bytes < 10**3:
return f"{bytes} Byte"
elif bytes < 10**6:
return f"{bytes / 10**3:.1f} kB"
elif bytes < 10**9:
return f"{bytes / 10**6:.1f} MB"
else:
return f"{bytes / 10**9:.1f} GB"


@dataclass
class File:
    """A named chunk of binary data with compact, debug-friendly text forms.

    Both ``str()`` and ``repr()`` report the size of ``data`` instead of its
    content, so printing an instance never dumps the raw bytes.
    """

    name: str  # label, interpolated verbatim into str()/repr()
    data: bytes  # raw payload; only its length (and, in repr, its hash) is shown

    def __str__(self) -> str:
        """Return ``File(name=<name>, data: <human-readable size>)``."""
        size = _human_readable_bytes(len(self.data))
        return f"File(name={self.name}, data: {size})"

    def __repr__(self) -> str:
        """Return the ``str()`` fields followed by ``hash(self.data)``.

        NOTE(review): Python salts ``bytes`` hashes per process unless
        ``PYTHONHASHSEED`` is fixed, so the hash is presumably only useful
        for comparing payloads within a single run.
        """
        size = _human_readable_bytes(len(self.data))
        digest = hash(self.data)
        return f"File(name={self.name}, data: {size}, hash: {digest})"
7 changes: 6 additions & 1 deletion PyPDF2/filters.py
Original file line number Diff line number Diff line change
Expand Up @@ -593,7 +593,12 @@ def _xobj_to_image(x_object_obj: Dict[str, Any]) -> Tuple[Optional[str], bytes]:

img = Image.frombytes(mode, size, data)
if color_space == "/Indexed":
img.putpalette(lookup.get_data())
from .generic import ByteStringObject

if isinstance(lookup, ByteStringObject):
img.putpalette(lookup)
else:
img.putpalette(lookup.get_data())
img = img.convert("RGB")
if G.S_MASK in x_object_obj: # add alpha channel
alpha = Image.frombytes("L", size, x_object_obj[G.S_MASK].get_data())
Expand Down
23 changes: 23 additions & 0 deletions tests/test_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,9 @@
import PyPDF2._utils
from PyPDF2 import PdfReader
from PyPDF2._utils import (
File,
_get_max_pdf_version_header,
_human_readable_bytes,
deprecate_bookmark,
mark_location,
matrix_multiply,
Expand Down Expand Up @@ -256,3 +258,24 @@ def test_escapedcode_followed_by_int():
reader = PdfReader(io.BytesIO(get_pdf_from_url(url, name=name)))
for page in reader.pages:
page.extract_text()


@pytest.mark.parametrize(
    "input_int,expected_output",
    [
        # below 1000: plain integer count
        (123, "123 Byte"),
        # scaled units, one decimal place
        (1234, "1.2 kB"),
        (123_456, "123.5 kB"),
        (1_234_567, "1.2 MB"),
        (1_234_567_890, "1.2 GB"),
        # GB is the largest unit, so the number keeps growing
        (1_234_567_890_000, "1234.6 GB"),
    ],
)
def test_human_readable_bytes(input_int, expected_output):
    """Check that each byte count is rendered with the expected unit and precision."""
    rendered = _human_readable_bytes(input_int)
    assert rendered == expected_output


def test_file():
    """Check the ``str()`` and ``repr()`` output of a ``File`` with an empty payload."""
    empty_file = File(name="image.png", data=b"")

    as_str = str(empty_file)
    as_repr = repr(empty_file)

    assert as_str == "File(name=image.png, data: 0 Byte)"
    # The expected repr relies on hash(b"") being 0.
    assert as_repr == "File(name=image.png, data: 0 Byte, hash: 0)"
1 change: 1 addition & 0 deletions tests/test_workflows.py
Original file line number Diff line number Diff line change
Expand Up @@ -636,6 +636,7 @@ def test_merge_output(caplog):
"https://corpora.tika.apache.org/base/docs/govdocs1/969/969502.pdf",
"tika-969502.pdf",
),
("https://arxiv.org/pdf/2201.00214.pdf", "arxiv-2201.00214.pdf"),
],
)
def test_image_extraction(url, name):
Expand Down

0 comments on commit eca1a84

Please sign in to comment.