From 89debe60cf9ed24ba1f52a9e34464cc2a7fcf3d9 Mon Sep 17 00:00:00 2001
From: stefan6419846 <96178532+stefan6419846@users.noreply.github.com>
Date: Thu, 28 Sep 2023 08:46:46 +0200
Subject: [PATCH] BUG: Avoid isolating the graphics state multiple times (fixes #2219)

---
 pypdf/generic/_data_structures.py | 37 +++++++++++++++++
 tests/test_page.py                | 71 +++++++++++++++++++++++++++++++
 2 files changed, 108 insertions(+)

diff --git a/pypdf/generic/_data_structures.py b/pypdf/generic/_data_structures.py
index 88a17d85a..ed1d06336 100644
--- a/pypdf/generic/_data_structures.py
+++ b/pypdf/generic/_data_structures.py
@@ -1023,6 +1023,9 @@ def __init__(
                 super().set_data(b_(stream_data))
             self.forced_encoding = forced_encoding
 
+        # Cache for `has_isolated_graphics_state`; `None` means not determined yet.
+        self._has_isolated_graphics_state: Optional[bool] = None
+
     def clone(
         self,
         pdf_dest: Any,
@@ -1229,12 +1232,46 @@ def operations(self, operations: List[Tuple[Any, Any]]) -> None:
         self._operations = operations
         self._data = b""
+        # The cached isolation flag described the previous operations.
+        self._has_isolated_graphics_state = None
 
+    @property
+    def has_isolated_graphics_state(self) -> bool:
+        """
+        Whether the stream is already wrapped in a ``q``/``Q`` pair.
+
+        The result is computed on first access and cached afterwards.
+        """
+        if self._has_isolated_graphics_state is None:
+            if self._operations:
+                # Each operation is an ``(operands, operator)`` tuple, so the
+                # operator has to be extracted before comparing it.
+                # Parsed operators may be bytes instead of str (TODO confirm),
+                # so both spellings are accepted.
+                self._has_isolated_graphics_state = (
+                    self._operations[0][1] in ("q", b"q")
+                    and self._operations[-1][1] in ("Q", b"Q")
+                )
+            elif self._data:
+                # Check for the character with the linebreak as inserted by `isolate_graphics_state`.
+                self._has_isolated_graphics_state = self._data[:2] == b"q\n" and self._data[-2:] == b"Q\n"
+            else:
+                # Empty stream.
+                self._has_isolated_graphics_state = True
+
+        return self._has_isolated_graphics_state
+
     def isolate_graphics_state(self) -> None:
+        """Wrap the stream in ``q``/``Q`` unless it is already isolated."""
+        if self.has_isolated_graphics_state:
+            # No need to isolate again.
+            return
+
         if self._operations:
             self._operations.insert(0, ([], "q"))
             self._operations.append(([], "Q"))
         elif self._data:
             self._data = b"q\n" + b_(self._data) + b"Q\n"
+        self._has_isolated_graphics_state = True
 
     # This overrides the parent method:
     def write_to_stream(
diff --git a/tests/test_page.py b/tests/test_page.py
index 1d6c49443..530821e37 100644
--- a/tests/test_page.py
+++ b/tests/test_page.py
@@ -1288,3 +1288,74 @@ def test_get_contents_from_nullobject():
     p = writer.add_blank_page(100, 100)
     p[NameObject("/Contents")] = writer._add_object(NullObject())
     p.get_contents()
+
+
+@pytest.mark.enable_socket()
+def test_has_isolated_graphics_state():
+    # Real example.
+    url = "https://github.com/py-pdf/pypdf/files/12428859/out1.pdf"
+    name = "isolate-graphics-state.pdf"
+    page = PdfReader(BytesIO(get_data_from_url(url, name=name))).pages[0]
+    content_stream = page.get_contents()
+    assert content_stream is not None
+
+    assert content_stream.has_isolated_graphics_state is False
+    content_stream.isolate_graphics_state()
+    assert content_stream.has_isolated_graphics_state is True
+
+    # Empty stream handling.
+    content_stream = ContentStream(stream=None, pdf="dummy.pdf")
+    assert content_stream.has_isolated_graphics_state is True
+
+    # Handling of string-based checks.
+    content_stream = ContentStream(stream=None, pdf="dummy.pdf")
+    content_stream.set_data(b"q\n 841.680 0 0 595.200 0.000 0.000 cm\n/Im0 Do\nQ\n\n \n")
+    assert content_stream.has_isolated_graphics_state is False
+
+    content_stream = ContentStream(stream=None, pdf="dummy.pdf")
+    content_stream.set_data(b"q\n 841.680 0 0 595.200 0.000 0.000 cm\n/Im0 Do\nQ\n")
+    assert content_stream.has_isolated_graphics_state is True
+
+    # Handling of operation-based checks.
+    content_stream = ContentStream(stream=None, pdf="dummy.pdf")
+    content_stream._operations = [([], "q"), ([1, 0, 0, 1, 0, 0], "cm"), ([], "Q")]
+    assert content_stream.has_isolated_graphics_state is True
+
+    content_stream = ContentStream(stream=None, pdf="dummy.pdf")
+    content_stream._operations = [([1, 0, 0, 1, 0, 0], "cm")]
+    assert content_stream.has_isolated_graphics_state is False
+
+    # Dummy example to test caching.
+    content_stream = ContentStream(stream=None, pdf="dummy.pdf")
+    assert content_stream._has_isolated_graphics_state is None
+    content_stream._has_isolated_graphics_state = True
+    assert content_stream.has_isolated_graphics_state is True
+    content_stream._has_isolated_graphics_state = False
+    assert content_stream.has_isolated_graphics_state is False
+
+
+@pytest.mark.enable_socket()
+def test_isolate_graphics_state():
+    url = "https://github.com/py-pdf/pypdf/files/12428859/out1.pdf"
+    name = "isolate-graphics-state.pdf"
+    page = PdfReader(BytesIO(get_data_from_url(url, name=name))).pages[0]
+    content_stream = page.get_contents()
+    assert content_stream is not None
+
+    # This page is not considered isolated at the beginning due to the final characters.
+    assert content_stream._data == b"q\n 841.680 0 0 595.200 0.000 0.000 cm\n/Im0 Do\nQ\n\n \n"
+    content_stream.isolate_graphics_state()
+    assert content_stream._data == b"q\nq\n 841.680 0 0 595.200 0.000 0.000 cm\n/Im0 Do\nQ\n\n \nQ\n"
+    content_stream.isolate_graphics_state()
+    assert content_stream._data == b"q\nq\n 841.680 0 0 595.200 0.000 0.000 cm\n/Im0 Do\nQ\n\n \nQ\n"
+
+
+def test_isolate_graphics_state_with_operations():
+    content_stream = ContentStream(stream=None, pdf="dummy.pdf")
+    content_stream._operations = [([1, 0, 0, 1, 0, 0], "cm")]
+    expected = [([], "q"), ([1, 0, 0, 1, 0, 0], "cm"), ([], "Q")]
+    content_stream.isolate_graphics_state()
+    assert content_stream._operations == expected
+    # A second call must not add another `q`/`Q` pair.
+    content_stream.isolate_graphics_state()
+    assert content_stream._operations == expected