From d1676565207af3afa3cae51ad96a48f2b463bcb3 Mon Sep 17 00:00:00 2001
From: pubpub-zz <4083478+pubpub-zz@users.noreply.github.com>
Date: Wed, 10 Apr 2024 23:14:57 +0200
Subject: [PATCH 1/7] ROB: Cope with some issues in pillow

closes #2265
---
 pypdf/_xobj_image_helpers.py | 15 ++++++++++-----
 pypdf/filters.py             |  9 ++++++---
 tests/test_images.py         | 11 +++++++++++
 3 files changed, 27 insertions(+), 8 deletions(-)

diff --git a/pypdf/_xobj_image_helpers.py b/pypdf/_xobj_image_helpers.py
index 89341a460..697825b44 100644
--- a/pypdf/_xobj_image_helpers.py
+++ b/pypdf/_xobj_image_helpers.py
@@ -73,9 +73,7 @@ def _get_imagemode(
         color_components = cast(int, icc_profile["/N"])
         color_space = icc_profile.get("/Alternate", "")
     elif color_space[0] == "/Indexed":
-        color_space = color_space[1]
-        if isinstance(color_space, IndirectObject):
-            color_space = color_space.get_object()
+        color_space = color_space[1].get_object()
         mode2, invert_color = _get_imagemode(
             color_space, color_components, prev_mode, depth + 1
         )
@@ -292,10 +290,17 @@ def _handle_jpx(
         mode = "RGBA"
     # we need to convert to the good mode
     try:
-        if img1.mode != mode:
+        if (img1.mode == mode) or (img1.mode in ("L", "P") and mode in ("L", "P")):
+            img = img1
+        elif (
+            img1.mode == "RGBA"
+            and mode == "CMYK"
+            or img1.mode == "CMYK"
+            and mode == "RGBA"
+        ):
             img = Image.frombytes(mode, img1.size, img1.tobytes())
         else:
-            img = img1
+            img = img1.convert(mode)
     except OSError:
         img = Image.frombytes(mode, img1.size, img1.tobytes())
     # for CMYK conversion :
diff --git a/pypdf/filters.py b/pypdf/filters.py
index 9e2158b21..b2991a4b4 100644
--- a/pypdf/filters.py
+++ b/pypdf/filters.py
@@ -894,10 +894,13 @@ def _xobj_to_image(x_object_obj: Dict[str, Any]) -> Tuple[Optional[str], bytes,
     img_byte_arr = BytesIO()
     try:
         img.save(img_byte_arr, format=image_format)
-    except OSError:  # pragma: no cover
-        # odd error
+    except OSError:
+        # in case of we convert to RGBA and then to PNG
+        img1 = img.convert("RGBA")
+        image_format = "PNG"
+        extension = ".png"
         img_byte_arr = BytesIO()
-        img.save(img_byte_arr, format=image_format)
+        img1.save(img_byte_arr, format=image_format)
     data = img_byte_arr.getvalue()
 
     try:  # temporary try/except until other fixes of images
diff --git a/tests/test_images.py b/tests/test_images.py
index 7a690f7d7..036c6c4a6 100644
--- a/tests/test_images.py
+++ b/tests/test_images.py
@@ -283,3 +283,14 @@ def test_data_with_lf():
     name = "iss2343b0.png"
     img = Image.open(BytesIO(get_data_from_url(url, name=name)))
     assert image_similarity(reader.pages[8].images[9].image, img) == 1.0
+
+
+@pytest.mark.enable_socket()
+def test_oserror():
+    """Cf #2265"""
+    url = "https://github.com/py-pdf/pypdf/files/13127130/Binance.discovery.responses.2.gov.uscourts.dcd.256060.140.1.pdf"
+    name = "iss2265.pdf"
+    reader = PdfReader(BytesIO(get_data_from_url(url, name=name)))
+    reader.pages[2].images[1]
+    # due to errors in translation in pillow we may not be have to get
+    # the correct image therefore we cannot use image_similarity

From 58c30ddaf5c144823573ab63715bda2b8707e320 Mon Sep 17 00:00:00 2001
From: pubpub-zz <4083478+pubpub-zz@users.noreply.github.com>
Date: Thu, 11 Apr 2024 21:03:25 +0200
Subject: [PATCH 2/7] Update tests/test_images.py

Co-authored-by: Stefan <96178532+stefan6419846@users.noreply.github.com>
---
 tests/test_images.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/test_images.py b/tests/test_images.py
index 036c6c4a6..2752ab18a 100644
--- a/tests/test_images.py
+++ b/tests/test_images.py
@@ -293,4 +293,4 @@ def test_oserror():
     reader = PdfReader(BytesIO(get_data_from_url(url, name=name)))
     reader.pages[2].images[1]
     # due to errors in translation in pillow we may not be have to get
-    # the correct image therefore we cannot use image_similarity
+    # the correct image. Therefore we cannot use `image_similarity`.

From 290b97f6db426cfed7b6dc3355f53a3b1fd43b14 Mon Sep 17 00:00:00 2001
From: pubpub-zz <4083478+pubpub-zz@users.noreply.github.com>
Date: Thu, 11 Apr 2024 21:03:34 +0200
Subject: [PATCH 3/7] Update tests/test_images.py

Co-authored-by: Stefan <96178532+stefan6419846@users.noreply.github.com>
---
 tests/test_images.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/test_images.py b/tests/test_images.py
index 2752ab18a..f15909ae2 100644
--- a/tests/test_images.py
+++ b/tests/test_images.py
@@ -292,5 +292,5 @@ def test_oserror():
     name = "iss2265.pdf"
     reader = PdfReader(BytesIO(get_data_from_url(url, name=name)))
     reader.pages[2].images[1]
-    # due to errors in translation in pillow we may not be have to get
+    # Due to errors in translation in pillow we may not get
     # the correct image. Therefore we cannot use `image_similarity`.

From 530a166bb7b96ce436c85ec08e1e22d5a33edeef Mon Sep 17 00:00:00 2001
From: pubpub-zz <4083478+pubpub-zz@users.noreply.github.com>
Date: Sat, 13 Apr 2024 10:55:21 +0200
Subject: [PATCH 4/7] refactored

---
 pypdf/_xobj_image_helpers.py | 21 ++++++++-------------
 pypdf/filters.py             |  2 +-
 2 files changed, 9 insertions(+), 14 deletions(-)

diff --git a/pypdf/_xobj_image_helpers.py b/pypdf/_xobj_image_helpers.py
index 697825b44..baef9a078 100644
--- a/pypdf/_xobj_image_helpers.py
+++ b/pypdf/_xobj_image_helpers.py
@@ -289,20 +289,15 @@ def _handle_jpx(
     if img1.mode == "RGBA" and mode == "RGB":
         mode = "RGBA"
     # we need to convert to the good mode
-    try:
-        if (img1.mode == mode) or (img1.mode in ("L", "P") and mode in ("L", "P")):
-            img = img1
-        elif (
-            img1.mode == "RGBA"
-            and mode == "CMYK"
-            or img1.mode == "CMYK"
-            and mode == "RGBA"
-        ):
-            img = Image.frombytes(mode, img1.size, img1.tobytes())
-        else:
-            img = img1.convert(mode)
-    except OSError:
+    if img1.mode == mode or {img1.mode, mode} == {"L", "P"}:  # compare (unorder) sets
+        # L,P are indexed mode, where there should not be changed
+        img = img1
+    elif {img1.mode, mode} == {"RGBA", "CMYK"}:
+        # RGBA / CMYK are 4bytes encoding where
+        # the encoding should be corrected
         img = Image.frombytes(mode, img1.size, img1.tobytes())
+    else:
+        img = img1.convert(mode)
     # for CMYK conversion :
     # https://stackoverflow.com/questions/38855022/conversion-from-cmyk-to-rgb-with-pillow-is-different-from-that-of-photoshop
     # not implemented for the moment as I need to get properly the ICC
diff --git a/pypdf/filters.py b/pypdf/filters.py
index b2991a4b4..e49a23af2 100644
--- a/pypdf/filters.py
+++ b/pypdf/filters.py
@@ -894,7 +894,7 @@ def _xobj_to_image(x_object_obj: Dict[str, Any]) -> Tuple[Optional[str], bytes,
     img_byte_arr = BytesIO()
     try:
         img.save(img_byte_arr, format=image_format)
-    except OSError:
+    except OSError:  # pragma: no cover  # coverred with pillow version(10.3)
         # in case of we convert to RGBA and then to PNG
         img1 = img.convert("RGBA")
         image_format = "PNG"

From 4c131f479aaac70b623cb8c51c624721230ef0b9 Mon Sep 17 00:00:00 2001
From: Stefan <96178532+stefan6419846@users.noreply.github.com>
Date: Sat, 13 Apr 2024 11:00:04 +0200
Subject: [PATCH 5/7] improve wording

---
 pypdf/_xobj_image_helpers.py | 4 ++--
 pypdf/filters.py             | 2 +-
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/pypdf/_xobj_image_helpers.py b/pypdf/_xobj_image_helpers.py
index baef9a078..8a23678d6 100644
--- a/pypdf/_xobj_image_helpers.py
+++ b/pypdf/_xobj_image_helpers.py
@@ -289,8 +289,8 @@ def _handle_jpx(
     if img1.mode == "RGBA" and mode == "RGB":
         mode = "RGBA"
     # we need to convert to the good mode
-    if img1.mode == mode or {img1.mode, mode} == {"L", "P"}:  # compare (unorder) sets
-        # L,P are indexed mode, where there should not be changed
+    if img1.mode == mode or {img1.mode, mode} == {"L", "P"}:  # compare (unordered) sets
+        # L,P are indexed modes which should not be changed.
         img = img1
     elif {img1.mode, mode} == {"RGBA", "CMYK"}:
         # RGBA / CMYK are 4bytes encoding where
diff --git a/pypdf/filters.py b/pypdf/filters.py
index e49a23af2..d62cf7842 100644
--- a/pypdf/filters.py
+++ b/pypdf/filters.py
@@ -894,7 +894,7 @@ def _xobj_to_image(x_object_obj: Dict[str, Any]) -> Tuple[Optional[str], bytes,
     img_byte_arr = BytesIO()
     try:
         img.save(img_byte_arr, format=image_format)
-    except OSError:  # pragma: no cover  # coverred with pillow version(10.3)
+    except OSError:  # pragma: no cover  # covered with pillow 10.3
         # in case of we convert to RGBA and then to PNG
         img1 = img.convert("RGBA")
         image_format = "PNG"

From 41d18b9472e7969ecbfbc66feda7cc9a46e6b042 Mon Sep 17 00:00:00 2001
From: pubpub-zz <4083478+pubpub-zz@users.noreply.github.com>
Date: Sat, 13 Apr 2024 13:10:12 +0200
Subject: [PATCH 6/7] add test for #2266

to cover #2266
---
 tests/test_images.py | 52 ++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 52 insertions(+)

diff --git a/tests/test_images.py b/tests/test_images.py
index f15909ae2..ad694d669 100644
--- a/tests/test_images.py
+++ b/tests/test_images.py
@@ -8,6 +8,7 @@
 from io import BytesIO
 from pathlib import Path
 from typing import Union
+from zipfile import ZipFile
 
 import pytest
 from PIL import Image, ImageChops, ImageDraw
@@ -294,3 +295,54 @@ def test_oserror():
     reader.pages[2].images[1]
     # Due to errors in translation in pillow we may not get
     # the correct image. Therefore we cannot use `image_similarity`.
+
+
+@pytest.mark.parametrize(
+    ("pdf", "pdf_name", "images", "images_name", "filtr"),
+    [
+        (
+            "https://github.com/py-pdf/pypdf/files/13127197/FTX.Claim.SC30.01072023101624File595287144.pdf",
+            "iss2266a.pdf",
+            "https://github.com/py-pdf/pypdf/files/14967061/iss2266a_images.zip",
+            "iss2266a_images.zip",
+            ((0, 0), (1, 0), (4, 0), (9, 0)),  # random pick-up to speed up test
+        ),
+        (
+            "https://github.com/py-pdf/pypdf/files/13127242/FTX.Claim.Skybridge.Capital.30062023113350File971325116.pdf",
+            "iss2266b.pdf",
+            "https://github.com/py-pdf/pypdf/files/14967099/iss2266b_images.zip",
+            "iss2266b_images.zip",
+            ((0, 0), (1, 0), (4, 0), (9, 0)),  # random pick-up to speed up test
+        ),
+    ],
+)
+@pytest.mark.enable_socket()
+def test_corrupted_jpeg_iss2266(pdf, pdf_name, images, images_name, filtr):
+    """
+    Code to create zipfile:
+    import pypdf, zipfile
+
+    with pypdf.PdfReader("____inputfile___") as r:
+     with zipfile.ZipFile("__outputzip___","w") as z:
+      for p in r.pages:
+       for ii,i in enumerate(p.images):
+        print(i.name)
+        b=BytesIO()
+        i.image.save(b,"JPEG")
+        z.writestr(f"image_{p.page_number}_{ii}_{i.name}",b.getbuffer())
+    """
+    url = pdf
+    name = pdf_name
+    reader = PdfReader(BytesIO(get_data_from_url(url, name=name)))
+    url = images
+    name = images_name
+    print(pdf_name, images_name)  # noqa: T201
+    with ZipFile(BytesIO(get_data_from_url(url, name=name)), "r") as zf:
+        for fn in zf.namelist():
+            sp = fn.split("_")
+            p, i = int(sp[1]), int(sp[2])
+            if filtr is not None and (p, i) not in filtr:
+                continue
+            print(fn)  # noqa: T201
+            img = Image.open(BytesIO(zf.read(fn)))
+            assert image_similarity(reader.pages[p].images[i].image, img) >= 0.99

From b04e886c177d23b442af307ee835bc53b8bc927b Mon Sep 17 00:00:00 2001
From: pubpub-zz <4083478+pubpub-zz@users.noreply.github.com>
Date: Sat, 13 Apr 2024 13:13:16 +0200
Subject: [PATCH 7/7] coverage (no cover)

---
 pypdf/_xobj_image_helpers.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pypdf/_xobj_image_helpers.py b/pypdf/_xobj_image_helpers.py
index 8a23678d6..cc0123ff2 100644
--- a/pypdf/_xobj_image_helpers.py
+++ b/pypdf/_xobj_image_helpers.py
@@ -296,7 +296,7 @@ def _handle_jpx(
         # RGBA / CMYK are 4bytes encoding where
         # the encoding should be corrected
         img = Image.frombytes(mode, img1.size, img1.tobytes())
-    else:
+    else:  # pragma: no cover
         img = img1.convert(mode)
     # for CMYK conversion :
     # https://stackoverflow.com/questions/38855022/conversion-from-cmyk-to-rgb-with-pillow-is-different-from-that-of-photoshop