From 6f05a46e53538c8f0966d32d270e789be052c5d8 Mon Sep 17 00:00:00 2001 From: pubpub-zz <4083478+pubpub-zz@users.noreply.github.com> Date: Wed, 5 Apr 2023 23:00:26 +0200 Subject: [PATCH 1/3] ROB : Prevent loop in Cloning fixes #1767 The issue is due to object 589/0: this object corresponds to 2 objects, the file trailer but also an XObject corresponding to the filled text ("test"). Because of that, the object tries to be duplicated, inducing the loop. This loop is not normal and I've fixed it as a robustness improvement. If you run your code, the text "test" will be missing as it is hidden by the trailer object. --- pypdf/generic/_base.py | 6 +++--- pypdf/generic/_data_structures.py | 4 +++- tests/test_writer.py | 11 +++++++++++ 3 files changed, 17 insertions(+), 4 deletions(-) diff --git a/pypdf/generic/_base.py b/pypdf/generic/_base.py index f75e66dd6..cc0354add 100644 --- a/pypdf/generic/_base.py +++ b/pypdf/generic/_base.py @@ -277,9 +277,9 @@ def clone( obj = NullObject() assert isinstance(self, (IndirectObject,)) obj.indirect_reference = self - dup = obj.clone(pdf_dest, force_duplicate, ignore_fields) - assert dup is not None - assert dup.indirect_reference is not None + dup = pdf_dest._add_object( + obj.clone(pdf_dest, force_duplicate, ignore_fields) + ) return dup.indirect_reference @property diff --git a/pypdf/generic/_data_structures.py b/pypdf/generic/_data_structures.py index 1fd196027..8d37970ad 100644 --- a/pypdf/generic/_data_structures.py +++ b/pypdf/generic/_data_structures.py @@ -750,7 +750,9 @@ def _clone( if decoded_self is None: self.decoded_self = None else: - self.decoded_self = decoded_self.clone(pdf_dest, True, ignore_fields) # type: ignore[assignment] + self.decoded_self = decoded_self.clone( + pdf_dest, force_duplicate, ignore_fields + ) # type: ignore[assignment] except Exception: pass super()._clone(src, pdf_dest, force_duplicate, ignore_fields) diff --git a/tests/test_writer.py b/tests/test_writer.py index 10943c509..5066eecb6 100644 ---
a/tests/test_writer.py +++ b/tests/test_writer.py @@ -1292,3 +1292,14 @@ def test_iss1723(): in_pdf = PdfReader(BytesIO(get_pdf_from_url(url, name=name))) out_pdf = PdfWriter() out_pdf.append(in_pdf, (3, 5)) + + +@pytest.mark.enable_socket() +def test_iss1767(): + # test with a pdf which is buggy because the object 389,0 exists 3 times: + # twice to define catalog and one as an XObject inducing a loop when + # cloning + url = "https://github.com/py-pdf/pypdf/files/11138472/test.pdf" + name = "iss1723.pdf" + in_pdf = PdfReader(BytesIO(get_pdf_from_url(url, name=name))) + PdfWriter(clone_from=in_pdf) From 7d862ae23a00de415c181101f36612ade5c58613 Mon Sep 17 00:00:00 2001 From: pubpub-zz <4083478+pubpub-zz@users.noreply.github.com> Date: Thu, 6 Apr 2023 19:04:46 +0200 Subject: [PATCH 2/3] fix mypy --- pypdf/_protocols.py | 3 +++ pypdf/generic/_base.py | 3 +++ 2 files changed, 6 insertions(+) diff --git a/pypdf/_protocols.py b/pypdf/_protocols.py index 85e9e0a56..ba6cd8a3c 100644 --- a/pypdf/_protocols.py +++ b/pypdf/_protocols.py @@ -73,6 +73,9 @@ def get_object(self, indirect_reference: Any) -> Optional[PdfObjectProtocol]: def write(self, stream: Union[Path, StrByteType]) -> Tuple[bool, IO]: ... + def _add_object(self, obj: Any) -> Any: + ... + @property def pages(self) -> List[Any]: ... 
diff --git a/pypdf/generic/_base.py b/pypdf/generic/_base.py index cc0354add..be3d71c45 100644 --- a/pypdf/generic/_base.py +++ b/pypdf/generic/_base.py @@ -280,6 +280,9 @@ def clone( dup = pdf_dest._add_object( obj.clone(pdf_dest, force_duplicate, ignore_fields) ) + # asserts added to prevent errors in mypy + assert dup is not None + assert dup.indirect_reference is not None return dup.indirect_reference @property From 52df465bda9f630e1eb19982ad7fbee66d0e2fab Mon Sep 17 00:00:00 2001 From: pubpub-zz <4083478+pubpub-zz@users.noreply.github.com> Date: Thu, 6 Apr 2023 19:15:58 +0200 Subject: [PATCH 3/3] mypy 2 --- pypdf/generic/_data_structures.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/pypdf/generic/_data_structures.py b/pypdf/generic/_data_structures.py index 8d37970ad..b8aaf12d4 100644 --- a/pypdf/generic/_data_structures.py +++ b/pypdf/generic/_data_structures.py @@ -750,9 +750,10 @@ def _clone( if decoded_self is None: self.decoded_self = None else: - self.decoded_self = decoded_self.clone( - pdf_dest, force_duplicate, ignore_fields - ) # type: ignore[assignment] + self.decoded_self = cast( + "DecodedStreamObject", + decoded_self.clone(pdf_dest, force_duplicate, ignore_fields), + ) except Exception: pass super()._clone(src, pdf_dest, force_duplicate, ignore_fields)