diff --git a/pypdf/generic/_image_inline.py b/pypdf/generic/_image_inline.py index 81791c4c6..255707d7e 100644 --- a/pypdf/generic/_image_inline.py +++ b/pypdf/generic/_image_inline.py @@ -148,18 +148,26 @@ def extract_inline_DCT(stream: StreamType) -> bytes: Extract DCT (JPEG) stream from inline image. The stream will be moved onto the EI. """ + def read(length: int) -> bytes: + # If 0 bytes are returned, and *size* was not 0, this indicates end of file. + # If the object is in non-blocking mode and no bytes are available, `None` is returned. + _result = stream.read(length) + if _result is None or len(_result) != length: + raise PdfReadError("Unexpected end of stream") + return _result + data_out: bytes = b"" # Read Blocks of data (ID/Size/data) up to ID=FF/D9 # https://www.digicamsoft.com/itu/itu-t81-36.html - notfirst = False + not_first = False while True: - c = stream.read(1) - if notfirst or (c == b"\xff"): + c = read(1) + if not_first or (c == b"\xff"): data_out += c if c != b"\xff": continue - notfirst = True - c = stream.read(1) + not_first = True + c = read(1) data_out += c if c == b"\xff": stream.seek(-1, 1) # pragma: no cover @@ -172,10 +180,10 @@ def extract_inline_DCT(stream: StreamType) -> bytes: b"\xda\xdb\xdc\xdd\xde\xdf" b"\xe0\xe1\xe2\xe3\xe4\xe5\xe6\xe7\xe8\xe9\xea\xeb\xec\xed\xee\xef\xfe" ): - c = stream.read(2) + c = read(2) data_out += c sz = c[0] * 256 + c[1] - data_out += stream.read(sz - 2) + data_out += read(sz - 2) ei_tok = read_non_whitespace(stream) ei_tok += stream.read(2) diff --git a/tests/generic/test_image_inline.py b/tests/generic/test_image_inline.py index e2439baee..c1f5b7dbd 100644 --- a/tests/generic/test_image_inline.py +++ b/tests/generic/test_image_inline.py @@ -1,7 +1,12 @@ """Test the pypdf.generic._image_inline module.""" from io import BytesIO +import pytest + +from pypdf import PdfReader +from pypdf.errors import PdfReadError from pypdf.generic._image_inline import is_followed_by_binary_data +from tests import get_data_from_url def test_is_followed_by_binary_data(): @@ -59,3 +64,14 @@ def test_is_followed_by_binary_data(): stream = BytesIO(b"1234.56 42 13 37 10 20 c\n") assert not is_followed_by_binary_data(stream) + + +@pytest.mark.enable_socket +def test_extract_inline_dct__early_end_of_file(): + url = "https://github.com/user-attachments/files/23056988/inline_dct__early_eof.pdf" + name = "inline_dct__early_eof.pdf" + reader = PdfReader(BytesIO(get_data_from_url(url, name=name))) + page = reader.pages[0] + + with pytest.raises(expected_exception=PdfReadError, match=r"^Unexpected end of stream$"): + page.images[0].image.load()