From d1676565207af3afa3cae51ad96a48f2b463bcb3 Mon Sep 17 00:00:00 2001 From: pubpub-zz <4083478+pubpub-zz@users.noreply.github.com> Date: Wed, 10 Apr 2024 23:14:57 +0200 Subject: [PATCH 1/7] ROB: Cope with some issues in pillow closes #2265 --- pypdf/_xobj_image_helpers.py | 15 ++++++++++----- pypdf/filters.py | 9 ++++++--- tests/test_images.py | 11 +++++++++++ 3 files changed, 27 insertions(+), 8 deletions(-) diff --git a/pypdf/_xobj_image_helpers.py b/pypdf/_xobj_image_helpers.py index 89341a460..697825b44 100644 --- a/pypdf/_xobj_image_helpers.py +++ b/pypdf/_xobj_image_helpers.py @@ -73,9 +73,7 @@ def _get_imagemode( color_components = cast(int, icc_profile["/N"]) color_space = icc_profile.get("/Alternate", "") elif color_space[0] == "/Indexed": - color_space = color_space[1] - if isinstance(color_space, IndirectObject): - color_space = color_space.get_object() + color_space = color_space[1].get_object() mode2, invert_color = _get_imagemode( color_space, color_components, prev_mode, depth + 1 ) @@ -292,10 +290,17 @@ def _handle_jpx( mode = "RGBA" # we need to convert to the good mode try: - if img1.mode != mode: + if (img1.mode == mode) or (img1.mode in ("L", "P") and mode in ("L", "P")): + img = img1 + elif ( + img1.mode == "RGBA" + and mode == "CMYK" + or img1.mode == "CMYK" + and mode == "RGBA" + ): img = Image.frombytes(mode, img1.size, img1.tobytes()) else: - img = img1 + img = img1.convert(mode) except OSError: img = Image.frombytes(mode, img1.size, img1.tobytes()) # for CMYK conversion : diff --git a/pypdf/filters.py b/pypdf/filters.py index 9e2158b21..b2991a4b4 100644 --- a/pypdf/filters.py +++ b/pypdf/filters.py @@ -894,10 +894,13 @@ def _xobj_to_image(x_object_obj: Dict[str, Any]) -> Tuple[Optional[str], bytes, img_byte_arr = BytesIO() try: img.save(img_byte_arr, format=image_format) - except OSError: # pragma: no cover - # odd error + except OSError: + # in case of we convert to RGBA and then to PNG + img1 = img.convert("RGBA") + image_format = "PNG" 
+ extension = ".png" img_byte_arr = BytesIO() - img.save(img_byte_arr, format=image_format) + img1.save(img_byte_arr, format=image_format) data = img_byte_arr.getvalue() try: # temporary try/except until other fixes of images diff --git a/tests/test_images.py b/tests/test_images.py index 7a690f7d7..036c6c4a6 100644 --- a/tests/test_images.py +++ b/tests/test_images.py @@ -283,3 +283,14 @@ def test_data_with_lf(): name = "iss2343b0.png" img = Image.open(BytesIO(get_data_from_url(url, name=name))) assert image_similarity(reader.pages[8].images[9].image, img) == 1.0 + + +@pytest.mark.enable_socket() +def test_oserror(): + """Cf #2265""" + url = "https://github.com/py-pdf/pypdf/files/13127130/Binance.discovery.responses.2.gov.uscourts.dcd.256060.140.1.pdf" + name = "iss2265.pdf" + reader = PdfReader(BytesIO(get_data_from_url(url, name=name))) + reader.pages[2].images[1] + # due to errors in translation in pillow we may not be have to get + # the correct image therefore we cannot use image_similarity From 58c30ddaf5c144823573ab63715bda2b8707e320 Mon Sep 17 00:00:00 2001 From: pubpub-zz <4083478+pubpub-zz@users.noreply.github.com> Date: Thu, 11 Apr 2024 21:03:25 +0200 Subject: [PATCH 2/7] Update tests/test_images.py Co-authored-by: Stefan <96178532+stefan6419846@users.noreply.github.com> --- tests/test_images.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_images.py b/tests/test_images.py index 036c6c4a6..2752ab18a 100644 --- a/tests/test_images.py +++ b/tests/test_images.py @@ -293,4 +293,4 @@ def test_oserror(): reader = PdfReader(BytesIO(get_data_from_url(url, name=name))) reader.pages[2].images[1] # due to errors in translation in pillow we may not be have to get - # the correct image therefore we cannot use image_similarity + # the correct image. Therefore we cannot use `image_similarity`. 
From 290b97f6db426cfed7b6dc3355f53a3b1fd43b14 Mon Sep 17 00:00:00 2001 From: pubpub-zz <4083478+pubpub-zz@users.noreply.github.com> Date: Thu, 11 Apr 2024 21:03:34 +0200 Subject: [PATCH 3/7] Update tests/test_images.py Co-authored-by: Stefan <96178532+stefan6419846@users.noreply.github.com> --- tests/test_images.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_images.py b/tests/test_images.py index 2752ab18a..f15909ae2 100644 --- a/tests/test_images.py +++ b/tests/test_images.py @@ -292,5 +292,5 @@ def test_oserror(): name = "iss2265.pdf" reader = PdfReader(BytesIO(get_data_from_url(url, name=name))) reader.pages[2].images[1] - # due to errors in translation in pillow we may not be have to get + # Due to errors in translation in pillow we may not get # the correct image. Therefore we cannot use `image_similarity`. From 530a166bb7b96ce436c85ec08e1e22d5a33edeef Mon Sep 17 00:00:00 2001 From: pubpub-zz <4083478+pubpub-zz@users.noreply.github.com> Date: Sat, 13 Apr 2024 10:55:21 +0200 Subject: [PATCH 4/7] refactored --- pypdf/_xobj_image_helpers.py | 21 ++++++++------------- pypdf/filters.py | 2 +- 2 files changed, 9 insertions(+), 14 deletions(-) diff --git a/pypdf/_xobj_image_helpers.py b/pypdf/_xobj_image_helpers.py index 697825b44..baef9a078 100644 --- a/pypdf/_xobj_image_helpers.py +++ b/pypdf/_xobj_image_helpers.py @@ -289,20 +289,15 @@ def _handle_jpx( if img1.mode == "RGBA" and mode == "RGB": mode = "RGBA" # we need to convert to the good mode - try: - if (img1.mode == mode) or (img1.mode in ("L", "P") and mode in ("L", "P")): - img = img1 - elif ( - img1.mode == "RGBA" - and mode == "CMYK" - or img1.mode == "CMYK" - and mode == "RGBA" - ): - img = Image.frombytes(mode, img1.size, img1.tobytes()) - else: - img = img1.convert(mode) - except OSError: + if img1.mode == mode or {img1.mode, mode} == {"L", "P"}: # compare (unorder) sets + # L,P are indexed mode, where there should not be changed + img = img1 + elif {img1.mode, mode} == 
{"RGBA", "CMYK"}: + # RGBA / CMYK are 4bytes encoding where + # the encoding should be corrected img = Image.frombytes(mode, img1.size, img1.tobytes()) + else: + img = img1.convert(mode) # for CMYK conversion : # https://stackoverflow.com/questions/38855022/conversion-from-cmyk-to-rgb-with-pillow-is-different-from-that-of-photoshop # not implemented for the moment as I need to get properly the ICC diff --git a/pypdf/filters.py b/pypdf/filters.py index b2991a4b4..e49a23af2 100644 --- a/pypdf/filters.py +++ b/pypdf/filters.py @@ -894,7 +894,7 @@ def _xobj_to_image(x_object_obj: Dict[str, Any]) -> Tuple[Optional[str], bytes, img_byte_arr = BytesIO() try: img.save(img_byte_arr, format=image_format) - except OSError: + except OSError: # pragma: no cover # coverred with pillow version(10.3) # in case of we convert to RGBA and then to PNG img1 = img.convert("RGBA") image_format = "PNG" From 4c131f479aaac70b623cb8c51c624721230ef0b9 Mon Sep 17 00:00:00 2001 From: Stefan <96178532+stefan6419846@users.noreply.github.com> Date: Sat, 13 Apr 2024 11:00:04 +0200 Subject: [PATCH 5/7] improve wording --- pypdf/_xobj_image_helpers.py | 4 ++-- pypdf/filters.py | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/pypdf/_xobj_image_helpers.py b/pypdf/_xobj_image_helpers.py index baef9a078..8a23678d6 100644 --- a/pypdf/_xobj_image_helpers.py +++ b/pypdf/_xobj_image_helpers.py @@ -289,8 +289,8 @@ def _handle_jpx( if img1.mode == "RGBA" and mode == "RGB": mode = "RGBA" # we need to convert to the good mode - if img1.mode == mode or {img1.mode, mode} == {"L", "P"}: # compare (unorder) sets - # L,P are indexed mode, where there should not be changed + if img1.mode == mode or {img1.mode, mode} == {"L", "P"}: # compare (unordered) sets + # L,P are indexed modes which should not be changed. 
img = img1 elif {img1.mode, mode} == {"RGBA", "CMYK"}: # RGBA / CMYK are 4bytes encoding where diff --git a/pypdf/filters.py b/pypdf/filters.py index e49a23af2..d62cf7842 100644 --- a/pypdf/filters.py +++ b/pypdf/filters.py @@ -894,7 +894,7 @@ def _xobj_to_image(x_object_obj: Dict[str, Any]) -> Tuple[Optional[str], bytes, img_byte_arr = BytesIO() try: img.save(img_byte_arr, format=image_format) - except OSError: # pragma: no cover # coverred with pillow version(10.3) + except OSError: # pragma: no cover # covered with pillow 10.3 # in case of we convert to RGBA and then to PNG img1 = img.convert("RGBA") image_format = "PNG" From 41d18b9472e7969ecbfbc66feda7cc9a46e6b042 Mon Sep 17 00:00:00 2001 From: pubpub-zz <4083478+pubpub-zz@users.noreply.github.com> Date: Sat, 13 Apr 2024 13:10:12 +0200 Subject: [PATCH 6/7] add test for #2266 to cover #2266 --- tests/test_images.py | 52 ++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 52 insertions(+) diff --git a/tests/test_images.py b/tests/test_images.py index f15909ae2..ad694d669 100644 --- a/tests/test_images.py +++ b/tests/test_images.py @@ -8,6 +8,7 @@ from io import BytesIO from pathlib import Path from typing import Union +from zipfile import ZipFile import pytest from PIL import Image, ImageChops, ImageDraw @@ -294,3 +295,54 @@ def test_oserror(): reader.pages[2].images[1] # Due to errors in translation in pillow we may not get # the correct image. Therefore we cannot use `image_similarity`. 
+ + +@pytest.mark.parametrize( + ("pdf", "pdf_name", "images", "images_name", "filtr"), + [ + ( + "https://github.com/py-pdf/pypdf/files/13127197/FTX.Claim.SC30.01072023101624File595287144.pdf", + "iss2266a.pdf", + "https://github.com/py-pdf/pypdf/files/14967061/iss2266a_images.zip", + "iss2266a_images.zip", + ((0, 0), (1, 0), (4, 0), (9, 0)), # random pick-up to speed up test + ), + ( + "https://github.com/py-pdf/pypdf/files/13127242/FTX.Claim.Skybridge.Capital.30062023113350File971325116.pdf", + "iss2266b.pdf", + "https://github.com/py-pdf/pypdf/files/14967099/iss2266b_images.zip", + "iss2266b_images.zip", + ((0, 0), (1, 0), (4, 0), (9, 0)), # random pick-up to speed up test + ), + ], +) +@pytest.mark.enable_socket() +def test_corrupted_jpeg_iss2266(pdf, pdf_name, images, images_name, filtr): + """ + Code to create zipfile: + import pypdf;zipfile + + with pypdf.PdfReader("____inputfile___") as r: + with zipfile.ZipFile("__outputzip___","w") as z: + for p in r.pages: + for ii,i in enumerate(p.images): + print(i.name) + b=BytesIO() + i.image.save(b,"JPEG") + z.writestr(f"image_{p.page_number}_{ii}_{i.name}",b.getbuffer()) + """ + url = pdf + name = pdf_name + reader = PdfReader(BytesIO(get_data_from_url(url, name=name))) + url = images + name = images_name + print(pdf_name, images_name) # noqa: T201 + with ZipFile(BytesIO(get_data_from_url(url, name=name)), "r") as zf: + for fn in zf.namelist(): + sp = fn.split("_") + p, i = int(sp[1]), int(sp[2]) + if filtr is not None and (p, i) not in filtr: + continue + print(fn) # noqa: T201 + img = Image.open(BytesIO(zf.read(fn))) + assert image_similarity(reader.pages[p].images[i].image, img) >= 0.99 From b04e886c177d23b442af307ee835bc53b8bc927b Mon Sep 17 00:00:00 2001 From: pubpub-zz <4083478+pubpub-zz@users.noreply.github.com> Date: Sat, 13 Apr 2024 13:13:16 +0200 Subject: [PATCH 7/7] coverage (no cover) --- pypdf/_xobj_image_helpers.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git 
a/pypdf/_xobj_image_helpers.py b/pypdf/_xobj_image_helpers.py index 8a23678d6..cc0123ff2 100644 --- a/pypdf/_xobj_image_helpers.py +++ b/pypdf/_xobj_image_helpers.py @@ -296,7 +296,7 @@ def _handle_jpx( # RGBA / CMYK are 4bytes encoding where # the encoding should be corrected img = Image.frombytes(mode, img1.size, img1.tobytes()) - else: + else: # pragma: no cover img = img1.convert(mode) # for CMYK conversion : # https://stackoverflow.com/questions/38855022/conversion-from-cmyk-to-rgb-with-pillow-is-different-from-that-of-photoshop