From 6dad92a647c0465e8f1b8fbe4701e70d700479aa Mon Sep 17 00:00:00 2001
From: Stefan <96178532+stefan6419846@users.noreply.github.com>
Date: Sun, 10 Dec 2023 12:20:30 +0100
Subject: [PATCH] ROB: Relax flate decoding for too many lookup values (#2331)

When handling flate objects with a lookup table and the image mode `1`, we would previously raise a generic `AssertionError` if the number of lookup values did not match.

Cases where too many values are specified are now considered a warning only.
Additionally, this PR adds a more meaningful error message.
---
 pypdf/_xobj_image_helpers.py | 10 ++++++++--
 tests/test_filters.py        |  9 +++++++++
 2 files changed, 17 insertions(+), 2 deletions(-)

diff --git a/pypdf/_xobj_image_helpers.py b/pypdf/_xobj_image_helpers.py
index 515c01ebe..a390357dd 100644
--- a/pypdf/_xobj_image_helpers.py
+++ b/pypdf/_xobj_image_helpers.py
@@ -4,7 +4,7 @@
 from io import BytesIO
 from typing import Any, List, Tuple, Union, cast
 
-from ._utils import logger_warning
+from ._utils import WHITESPACES, logger_warning
 from .constants import ColorSpaces
 from .errors import PdfReadError
 from .generic import (
@@ -195,7 +195,13 @@ def bits2byte(data: bytes, size: Tuple[int, int], bits: int) -> bytes:
         else:
             if img.mode == "1":
                 # Two values ("high" and "low").
-                assert len(lookup) == 2 * nb, len(lookup)
+                expected_count = 2 * nb
+                if len(lookup) != expected_count:
+                    if len(lookup) < expected_count:
+                        raise PdfReadError(f"Not enough lookup values: Expected {expected_count}, got {len(lookup)}.")
+                    lookup = lookup[:expected_count]
+                    if not all(_value in WHITESPACES for _value in lookup[expected_count:]):
+                        raise PdfReadError(f"Too many lookup values: Expected {expected_count}, got {len(lookup)}.")
                 colors_arr = [lookup[:nb], lookup[nb:]]
                 arr = b"".join(
                     [
diff --git a/tests/test_filters.py b/tests/test_filters.py
index e38280244..873e7a957 100644
--- a/tests/test_filters.py
+++ b/tests/test_filters.py
@@ -589,3 +589,12 @@ def test_flate_decode_with_image_mode_1():
     reader = PdfReader(BytesIO(get_data_from_url(url, name=name)))
     for image in reader.pages[7].images:
         _ = image
+
+
+@pytest.mark.enable_socket()
+def test_flate_decode_with_image_mode_1__whitespace_at_end_of_lookup():
+    """From #2331"""
+    url = "https://github.com/py-pdf/pypdf/files/13611048/out1.pdf"
+    name = "issue2331.pdf"
+    reader = PdfReader(BytesIO(get_data_from_url(url, name=name)))
+    reader.pages[0].images[0]