From d0b8f685aaabe724a8d0d6c4710b696d0c55c549 Mon Sep 17 00:00:00 2001 From: Noah Houghton Date: Tue, 28 Oct 2025 10:34:21 -0400 Subject: [PATCH 01/11] tests to reproduce failure --- tests/example_files.yaml | 2 ++ tests/test_merger.py | 9 +++++++++ 2 files changed, 11 insertions(+) diff --git a/tests/example_files.yaml b/tests/example_files.yaml index be02965d7..873729f1e 100644 --- a/tests/example_files.yaml +++ b/tests/example_files.yaml @@ -120,3 +120,5 @@ url: https://github.com/user-attachments/files/21578875/layout-parser-paper-with-empty-pages.pdf - local_filename: issue-3429.pdf url: https://github.com/user-attachments/files/21711469/bomb.pdf +- local_filename: issue-3508.pdf + url: https://webasto-russia.ru/repair-manual-thermo-230-300-350-2012-en.pdf diff --git a/tests/test_merger.py b/tests/test_merger.py index f1eb2e78e..5a813f5ea 100644 --- a/tests/test_merger.py +++ b/tests/test_merger.py @@ -398,6 +398,15 @@ def test_articles_with_writer(caplog): assert len(r.threads) == 4 assert r.threads[0].get_object()["/F"]["/P"] == r.pages[0] +@pytest.mark.enable_socket +def test_null_articles_with_writer(): + data = get_data_from_url(name="issue-3508.pdf") + file = BytesIO(data) + merger = PdfWriter() + merger.append(file) + temp = BytesIO() + merger.write(temp) + # if this completes without error, the test was successful def test_get_reference(): writer = PdfWriter(RESOURCE_ROOT / "crazyones.pdf") From ff7e4cce33ad9715f6c464230e754167a506f2e3 Mon Sep 17 00:00:00 2001 From: Noah Houghton Date: Tue, 28 Oct 2025 10:42:58 -0400 Subject: [PATCH 02/11] fix --- CONTRIBUTORS.md | 1 + pypdf/_writer.py | 5 ++++- 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/CONTRIBUTORS.md b/CONTRIBUTORS.md index cc2849766..87fe2261e 100644 --- a/CONTRIBUTORS.md +++ b/CONTRIBUTORS.md @@ -39,6 +39,7 @@ history and [GitHub's 'Contributors' feature](https://github.com/py-pdf/pypdf/gr * [Mérino, Antoine](https://github.com/Merinorus) * [Murphy, Kevin](https://github.com/kmurphy4) * [nalin-udhaar](https://github.com/nalin-udhaar) +* [Noah-Houghton](https://github.com/Noah-Houghton) | [LinkedIn](https://www.linkedin.com/in/noah-h-554992a0/) * [Paramonov, Alexey](https://github.com/alexey-v-paramonov) * [Paternault, Louis](https://framagit.org/spalax) * [Perrensen, Olsen](https://github.com/olsonperrensen) diff --git a/pypdf/_writer.py b/pypdf/_writer.py index c4ebea929..b322f34ea 100644 --- a/pypdf/_writer.py +++ b/pypdf/_writer.py @@ -3003,7 +3003,10 @@ def add_filtered_articles( for p in pages.values(): pp = p.original_page for a in pp.get("/B", ()): - thr = a.get_object().get("/T") + a_obj = a.get_object() + if isinstance(a_obj, NullObject): + continue + thr = a_obj.get("/T") if thr is None: continue thr = thr.get_object() From bf9bc85df84804e3749c90bd52a45f70e69fbeb9 Mon Sep 17 00:00:00 2001 From: Noah Houghton Date: Tue, 28 Oct 2025 11:01:47 -0400 Subject: [PATCH 03/11] remove unnecessary write call --- tests/test_merger.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/test_merger.py b/tests/test_merger.py index 5a813f5ea..dd862cfbf 100644 --- a/tests/test_merger.py +++ b/tests/test_merger.py @@ -398,16 +398,16 @@ def test_articles_with_writer(caplog): assert len(r.threads) == 4 assert r.threads[0].get_object()["/F"]["/P"] == r.pages[0] + @pytest.mark.enable_socket def test_null_articles_with_writer(): data = get_data_from_url(name="issue-3508.pdf") file = BytesIO(data) merger = PdfWriter() merger.append(file) - temp = BytesIO() - merger.write(temp) # if this completes without error, the test was successful + def test_get_reference(): writer = PdfWriter(RESOURCE_ROOT / "crazyones.pdf") assert writer.get_reference(writer.pages[0]) == writer.pages[0].indirect_reference From 745229a6c8cc0274b9535a8daf50af0273c0a99f Mon Sep 17 00:00:00 2001 From: Noah Houghton Date: Tue, 28 Oct 2025 14:42:03 -0400 Subject: [PATCH 04/11] see if wrapping in pdfreader fixes encryption error --- tests/test_merger.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_merger.py b/tests/test_merger.py index dd862cfbf..047a542fb 100644 --- a/tests/test_merger.py +++ b/tests/test_merger.py @@ -402,7 +402,7 @@ def test_articles_with_writer(caplog): @pytest.mark.enable_socket def test_null_articles_with_writer(): data = get_data_from_url(name="issue-3508.pdf") - file = BytesIO(data) + file = PdfReader(BytesIO(data)) merger = PdfWriter() merger.append(file) # if this completes without error, the test was successful From ef9ab229c3827a839941bc648bb20363364a2d85 Mon Sep 17 00:00:00 2001 From: Noah Houghton Date: Tue, 28 Oct 2025 14:55:35 -0400 Subject: [PATCH 05/11] see if starting with a source file will address encryption error --- tests/test_merger.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/tests/test_merger.py b/tests/test_merger.py index 047a542fb..75a4f2e78 100644 --- a/tests/test_merger.py +++ b/tests/test_merger.py @@ -402,10 +402,9 @@ def test_articles_with_writer(caplog): @pytest.mark.enable_socket def test_null_articles_with_writer(): data = get_data_from_url(name="issue-3508.pdf") - file = PdfReader(BytesIO(data)) - merger = PdfWriter() - merger.append(file) - # if this completes without error, the test was successful + file = BytesIO(data) + merger = PdfWriter(file) + merger.append(file) # if this doesn't crash, the test has been successful def test_get_reference(): From 3998cb462099912137a6906e61fa3490a465121a Mon Sep 17 00:00:00 2001 From: Noah Houghton Date: Tue, 28 Oct 2025 16:29:10 -0400 Subject: [PATCH 06/11] minimal code --- tests/test_merger.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/tests/test_merger.py b/tests/test_merger.py index 75a4f2e78..41d0424e2 100644 --- a/tests/test_merger.py +++ b/tests/test_merger.py @@ -402,9 +402,8 @@ def test_articles_with_writer(caplog): @pytest.mark.enable_socket def test_null_articles_with_writer(): data = get_data_from_url(name="issue-3508.pdf") - file = BytesIO(data) - merger = PdfWriter(file) - merger.append(file) # if this doesn't crash, the test has been successful + m = PdfWriter() + m.append(PdfReader(BytesIO(data))) def test_get_reference(): From a3f65eab24ba9db475bbc8f5fa1a51a53a6645f9 Mon Sep 17 00:00:00 2001 From: Noah Houghton Date: Tue, 28 Oct 2025 16:46:10 -0400 Subject: [PATCH 07/11] remove unneeded pdfreader --- tests/test_merger.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_merger.py b/tests/test_merger.py index 41d0424e2..b5b729e5c 100644 --- a/tests/test_merger.py +++ b/tests/test_merger.py @@ -403,7 +403,7 @@ def test_articles_with_writer(caplog): def test_null_articles_with_writer(): data = get_data_from_url(name="issue-3508.pdf") m = PdfWriter() - m.append(PdfReader(BytesIO(data))) + m.append(BytesIO(data)) def test_get_reference(): From 42ded99d3a1a7c041e0e98dd4bd66620451ab39f Mon Sep 17 00:00:00 2001 From: Noah Houghton Date: Wed, 29 Oct 2025 09:41:07 -0400 Subject: [PATCH 08/11] skip test if there is no aes --- tests/test_merger.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/tests/test_merger.py b/tests/test_merger.py index b5b729e5c..6e8913794 100644 --- a/tests/test_merger.py +++ b/tests/test_merger.py @@ -15,6 +15,11 @@ PROJECT_ROOT = TESTS_ROOT.parent RESOURCE_ROOT = PROJECT_ROOT / "resources" +from pypdf._crypt_providers import crypt_provider +USE_CRYPTOGRAPHY = crypt_provider[0] == "cryptography" +USE_PYCRYPTODOME = crypt_provider[0] == "pycryptodome" +HAS_AES = USE_CRYPTOGRAPHY or USE_PYCRYPTODOME + sys.path.append(str(PROJECT_ROOT)) @@ -399,6 +404,7 @@ def test_articles_with_writer(caplog): assert r.threads[0].get_object()["/F"]["/P"] == r.pages[0] +@pytest.mark.skipif(not HAS_AES, reason="No AES implementation") @pytest.mark.enable_socket def test_null_articles_with_writer(): data = get_data_from_url(name="issue-3508.pdf") From b9ee46dc1fa867261964cd6ac9ff0eed541e637f Mon Sep 17 00:00:00 2001 From: Noah Houghton Date: Wed, 29 Oct 2025 10:08:33 -0400 Subject: [PATCH 09/11] move file to github for permanence; organize imports --- tests/example_files.yaml | 2 +- tests/test_merger.py | 3 +-- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/tests/example_files.yaml b/tests/example_files.yaml index 873729f1e..726af6356 100644 --- a/tests/example_files.yaml +++ b/tests/example_files.yaml @@ -121,4 +121,4 @@ - local_filename: issue-3429.pdf url: https://github.com/user-attachments/files/21711469/bomb.pdf - local_filename: issue-3508.pdf - url: https://webasto-russia.ru/repair-manual-thermo-230-300-350-2012-en.pdf + url: https://github.com/user-attachments/files/23211824/repair-manual-thermo-230-300-350-2012-en.pdf diff --git a/tests/test_merger.py b/tests/test_merger.py index 6e8913794..3964c45e5 100644 --- a/tests/test_merger.py +++ b/tests/test_merger.py @@ -7,15 +7,14 @@ import pypdf from pypdf import PdfReader, PdfWriter +from pypdf._crypt_providers import crypt_provider from pypdf.generic import Destination, Fit - from . import get_data_from_url TESTS_ROOT = Path(__file__).parent.resolve() PROJECT_ROOT = TESTS_ROOT.parent RESOURCE_ROOT = PROJECT_ROOT / "resources" -from pypdf._crypt_providers import crypt_provider USE_CRYPTOGRAPHY = crypt_provider[0] == "cryptography" USE_PYCRYPTODOME = crypt_provider[0] == "pycryptodome" HAS_AES = USE_CRYPTOGRAPHY or USE_PYCRYPTODOME From 6a3a732cf10c727a93fefc63d2b77d0f95d46b7e Mon Sep 17 00:00:00 2001 From: Noah Houghton Date: Wed, 29 Oct 2025 10:18:35 -0400 Subject: [PATCH 10/11] fix style --- tests/test_merger.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/test_merger.py b/tests/test_merger.py index 3964c45e5..d4d7cb409 100644 --- a/tests/test_merger.py +++ b/tests/test_merger.py @@ -9,6 +9,7 @@ from pypdf import PdfReader, PdfWriter from pypdf._crypt_providers import crypt_provider from pypdf.generic import Destination, Fit + from . import get_data_from_url TESTS_ROOT = Path(__file__).parent.resolve() From c78ca123fd1c4e0e4958a529d5ce5db2eee9c965 Mon Sep 17 00:00:00 2001 From: Noah Houghton Date: Wed, 29 Oct 2025 14:52:31 -0400 Subject: [PATCH 11/11] code review --- pypdf/_writer.py | 2 +- tests/test_merger.py | 11 ++++------- 2 files changed, 5 insertions(+), 8 deletions(-) diff --git a/pypdf/_writer.py b/pypdf/_writer.py index b322f34ea..f4e199986 100644 --- a/pypdf/_writer.py +++ b/pypdf/_writer.py @@ -3004,7 +3004,7 @@ def add_filtered_articles( pp = p.original_page for a in pp.get("/B", ()): a_obj = a.get_object() - if isinstance(a_obj, NullObject): + if is_null_or_none(a_obj): continue thr = a_obj.get("/T") if thr is None: diff --git a/tests/test_merger.py b/tests/test_merger.py index d4d7cb409..183766df9 100644 --- a/tests/test_merger.py +++ b/tests/test_merger.py @@ -7,19 +7,15 @@ import pypdf from pypdf import PdfReader, PdfWriter -from pypdf._crypt_providers import crypt_provider from pypdf.generic import Destination, Fit from . import get_data_from_url +from .test_encryption import HAS_AES TESTS_ROOT = Path(__file__).parent.resolve() PROJECT_ROOT = TESTS_ROOT.parent RESOURCE_ROOT = PROJECT_ROOT / "resources" -USE_CRYPTOGRAPHY = crypt_provider[0] == "cryptography" -USE_PYCRYPTODOME = crypt_provider[0] == "pycryptodome" -HAS_AES = USE_CRYPTOGRAPHY or USE_PYCRYPTODOME - sys.path.append(str(PROJECT_ROOT)) @@ -408,8 +404,9 @@ def test_articles_with_writer(caplog): @pytest.mark.enable_socket def test_null_articles_with_writer(): data = get_data_from_url(name="issue-3508.pdf") - m = PdfWriter() - m.append(BytesIO(data)) + merger = PdfWriter() + merger.append(BytesIO(data)) + assert len(merger.pages) == 98 def test_get_reference():