Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

ROB: Cope with some issues in pillow #2595

Merged
merged 8 commits into from
Apr 16, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
18 changes: 9 additions & 9 deletions pypdf/_xobj_image_helpers.py
Original file line number Diff line number Diff line change
Expand Up @@ -73,9 +73,7 @@ def _get_imagemode(
color_components = cast(int, icc_profile["/N"])
color_space = icc_profile.get("/Alternate", "")
elif color_space[0] == "/Indexed":
color_space = color_space[1]
if isinstance(color_space, IndirectObject):
color_space = color_space.get_object()
color_space = color_space[1].get_object()
mode2, invert_color = _get_imagemode(
color_space, color_components, prev_mode, depth + 1
)
Expand Down Expand Up @@ -291,13 +289,15 @@ def _handle_jpx(
if img1.mode == "RGBA" and mode == "RGB":
mode = "RGBA"
# we need to convert to the good mode
try:
if img1.mode != mode:
img = Image.frombytes(mode, img1.size, img1.tobytes())
else:
img = img1
except OSError:
if img1.mode == mode or {img1.mode, mode} == {"L", "P"}: # compare (unordered) sets
# L,P are indexed modes which should not be changed.
img = img1
elif {img1.mode, mode} == {"RGBA", "CMYK"}:
# RGBA / CMYK are 4bytes encoding where
# the encoding should be corrected
img = Image.frombytes(mode, img1.size, img1.tobytes())
else: # pragma: no cover
img = img1.convert(mode)
# for CMYK conversion :
# https://stackoverflow.com/questions/38855022/conversion-from-cmyk-to-rgb-with-pillow-is-different-from-that-of-photoshop
# not implemented for the moment as I need to get properly the ICC
Expand Down
9 changes: 6 additions & 3 deletions pypdf/filters.py
Original file line number Diff line number Diff line change
Expand Up @@ -894,10 +894,13 @@ def _xobj_to_image(x_object_obj: Dict[str, Any]) -> Tuple[Optional[str], bytes,
img_byte_arr = BytesIO()
try:
img.save(img_byte_arr, format=image_format)
except OSError: # pragma: no cover
# odd error
except OSError: # pragma: no cover # covered with pillow 10.3
# in case of error, we convert to RGBA and then save as PNG
img1 = img.convert("RGBA")
image_format = "PNG"
extension = ".png"
img_byte_arr = BytesIO()
img.save(img_byte_arr, format=image_format)
img1.save(img_byte_arr, format=image_format)
data = img_byte_arr.getvalue()

try: # temporary try/except until other fixes of images
Expand Down
63 changes: 63 additions & 0 deletions tests/test_images.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
from io import BytesIO
from pathlib import Path
from typing import Union
from zipfile import ZipFile

import pytest
from PIL import Image, ImageChops, ImageDraw
Expand Down Expand Up @@ -283,3 +284,65 @@ def test_data_with_lf():
name = "iss2343b0.png"
img = Image.open(BytesIO(get_data_from_url(url, name=name)))
assert image_similarity(reader.pages[8].images[9].image, img) == 1.0


@pytest.mark.enable_socket()
def test_oserror():
    """
    Cf #2265

    Smoke test: fetching the second image of the third page must complete
    without raising.
    """
    pdf_bytes = get_data_from_url(
        "https://github.com/py-pdf/pypdf/files/13127130/Binance.discovery.responses.2.gov.uscourts.dcd.256060.140.1.pdf",
        name="iss2265.pdf",
    )
    reader = PdfReader(BytesIO(pdf_bytes))
    # No assertion on the pixel content: due to errors in translation in
    # pillow we may not get the correct image, so `image_similarity` cannot
    # be used here. Reaching the end of the test without an exception is
    # the success criterion.
    reader.pages[2].images[1]


@pytest.mark.parametrize(
    ("pdf", "pdf_name", "images", "images_name", "filtr"),
    [
        (
            "https://github.com/py-pdf/pypdf/files/13127197/FTX.Claim.SC30.01072023101624File595287144.pdf",
            "iss2266a.pdf",
            "https://github.com/py-pdf/pypdf/files/14967061/iss2266a_images.zip",
            "iss2266a_images.zip",
            ((0, 0), (1, 0), (4, 0), (9, 0)),  # random pick-up to speed up test
        ),
        (
            "https://github.com/py-pdf/pypdf/files/13127242/FTX.Claim.Skybridge.Capital.30062023113350File971325116.pdf",
            "iss2266b.pdf",
            "https://github.com/py-pdf/pypdf/files/14967099/iss2266b_images.zip",
            "iss2266b_images.zip",
            ((0, 0), (1, 0), (4, 0), (9, 0)),  # random pick-up to speed up test
        ),
    ],
)
@pytest.mark.enable_socket()
def test_corrupted_jpeg_iss2266(pdf, pdf_name, images, images_name, filtr):
    """
    Compare images extracted from a PDF against reference images in a zip.

    Each archive member is named ``image_<page>_<index>_<name>``; the page
    and index fields select the image in the PDF to compare with. When
    ``filtr`` is not None, only the ``(page, index)`` pairs it contains are
    checked.

    Code to create zipfile:
        import zipfile
        from io import BytesIO

        import pypdf

        with pypdf.PdfReader("____inputfile___") as r:
            with zipfile.ZipFile("__outputzip___", "w") as z:
                for p in r.pages:
                    for ii, i in enumerate(p.images):
                        print(i.name)
                        b = BytesIO()
                        i.image.save(b, "JPEG")
                        z.writestr(f"image_{p.page_number}_{ii}_{i.name}", b.getbuffer())
    """
    reader = PdfReader(BytesIO(get_data_from_url(pdf, name=pdf_name)))
    print(pdf_name, images_name)  # noqa: T201
    with ZipFile(BytesIO(get_data_from_url(images, name=images_name)), "r") as zf:
        for member in zf.namelist():
            # Fields 1 and 2 of the member name are the page and image indexes.
            fields = member.split("_")
            page_idx, image_idx = int(fields[1]), int(fields[2])
            if filtr is None or (page_idx, image_idx) in filtr:
                print(member)  # noqa: T201
                expected = Image.open(BytesIO(zf.read(member)))
                extracted = reader.pages[page_idx].images[image_idx].image
                assert image_similarity(extracted, expected) >= 0.99
Loading