From 6dad92a647c0465e8f1b8fbe4701e70d700479aa Mon Sep 17 00:00:00 2001 From: Stefan <96178532+stefan6419846@users.noreply.github.com> Date: Sun, 10 Dec 2023 12:20:30 +0100 Subject: [PATCH] ROB: Relax flate decoding for too many lookup values (#2331) When handling flate objects with a lookup table and the image mode `1`, we would previously raise a generic `AssertionError` if the number of lookup values did not match. Cases where too many values are specified are now considered a warning only. Additionally, this PR adds a more meaningful error message. --- pypdf/_xobj_image_helpers.py | 10 ++++++++-- tests/test_filters.py | 9 +++++++++ 2 files changed, 17 insertions(+), 2 deletions(-) diff --git a/pypdf/_xobj_image_helpers.py b/pypdf/_xobj_image_helpers.py index 515c01ebe..a390357dd 100644 --- a/pypdf/_xobj_image_helpers.py +++ b/pypdf/_xobj_image_helpers.py @@ -4,7 +4,7 @@ from io import BytesIO from typing import Any, List, Tuple, Union, cast -from ._utils import logger_warning +from ._utils import WHITESPACES, logger_warning from .constants import ColorSpaces from .errors import PdfReadError from .generic import ( @@ -195,7 +195,13 @@ def bits2byte(data: bytes, size: Tuple[int, int], bits: int) -> bytes: else: if img.mode == "1": # Two values ("high" and "low"). 
- assert len(lookup) == 2 * nb, len(lookup) + expected_count = 2 * nb + if len(lookup) != expected_count: + if len(lookup) < expected_count: + raise PdfReadError(f"Not enough lookup values: Expected {expected_count}, got {len(lookup)}.") + if not all(_value in WHITESPACES for _value in lookup[expected_count:]): + logger_warning(f"Too many lookup values: Expected {expected_count}, got {len(lookup)}.", __name__) + lookup = lookup[:expected_count] colors_arr = [lookup[:nb], lookup[nb:]] arr = b"".join( [ diff --git a/tests/test_filters.py b/tests/test_filters.py index e38280244..873e7a957 100644 --- a/tests/test_filters.py +++ b/tests/test_filters.py @@ -589,3 +589,12 @@ def test_flate_decode_with_image_mode_1(): reader = PdfReader(BytesIO(get_data_from_url(url, name=name))) for image in reader.pages[7].images: _ = image + + +@pytest.mark.enable_socket() +def test_flate_decode_with_image_mode_1__whitespace_at_end_of_lookup(): + """From #2331""" + url = "https://github.com/py-pdf/pypdf/files/13611048/out1.pdf" + name = "issue2331.pdf" + reader = PdfReader(BytesIO(get_data_from_url(url, name=name))) + reader.pages[0].images[0]