From f3d20fcf35f142f361bd2fe7b1e4d6ce05ecd9f3 Mon Sep 17 00:00:00 2001
From: David Huggins-Daines
Date: Mon, 5 Feb 2024 08:52:51 -0500
Subject: [PATCH 1/2] fix: prevent pypdfium2 file leak (fixes: #1089)

---
 pdfplumber/display.py |  9 +++++----
 tests/test_issues.py  | 24 ++++++++++++++++++++++++
 2 files changed, 29 insertions(+), 4 deletions(-)

diff --git a/pdfplumber/display.py b/pdfplumber/display.py
index 44723d70..7ee772f9 100644
--- a/pdfplumber/display.py
+++ b/pdfplumber/display.py
@@ -53,10 +53,8 @@ def get_page_image(
         stream.seek(0)
         src = stream
 
-    pdfium_page = pypdfium2.PdfDocument(
-        src,
-        password=password,
-    ).get_page(page_ix)
+    pdfium_doc = pypdfium2.PdfDocument(src, password=password)
+    pdfium_page = pdfium_doc.get_page(page_ix)
 
     img: PIL.Image.Image = pdfium_page.render(
         # Modifiable arguments
@@ -67,6 +65,9 @@ def get_page_image(
         # Non-modifiable arguments
         prefer_bgrx=True,
     ).to_pil()
+    # In theory `autoclose` when creating it should make it close...
+    # automatically. In practice this does not seem to be the case.
+    pdfium_doc.close()
 
     return img.convert("RGB")
 
diff --git a/tests/test_issues.py b/tests/test_issues.py
index 614b4c22..89eadb50 100644
--- a/tests/test_issues.py
+++ b/tests/test_issues.py
@@ -2,6 +2,11 @@
 import logging
 import os
 import re
+
+try:
+    import resource
+except ModuleNotFoundError:
+    resource = None
 import unittest
 
 import pdfplumber
@@ -275,3 +280,22 @@ def test_issue_982(self):
             text = re.sub(r"\s+", " ", page.extract_text(use_text_flow=True))
             words = " ".join(w["text"] for w in page.extract_words(use_text_flow=True))
             assert text[0:100] == words[0:100]
+
+    def test_issue_1089(self):
+        """
+        Page.to_image() leaks file descriptors
+
+        This is because PyPdfium2 leaks file descriptors. Explicitly
+        close the `PdfDocument` to prevent this.
+        """
+        # Skip test on platforms without getrlimit
+        if resource is None:
+            return
+        # Any PDF will do
+        path = os.path.join(HERE, "pdfs/test-punkt.pdf")
+        soft, hard = resource.getrlimit(resource.RLIMIT_NOFILE)
+        with pdfplumber.open(path) as pdf:
+            for idx in range(soft):
+                _ = pdf.pages[0].to_image()
+            # We're still alive
+            assert True

From 7322077e0f745837598952b76f0c0d91c252e709 Mon Sep 17 00:00:00 2001
From: David Huggins-Daines
Date: Mon, 5 Feb 2024 08:53:03 -0500
Subject: [PATCH 2/2] fix: placate flake8

---
 tests/test_utils.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/test_utils.py b/tests/test_utils.py
index 83433b7b..d1a21adf 100644
--- a/tests/test_utils.py
+++ b/tests/test_utils.py
@@ -55,7 +55,7 @@ def test_resolve(self):
 
     def test_resolve_all(self):
         info = self.pdf.doc.xrefs[0].trailer["Info"]
-        assert type(info) == PDFObjRef
+        assert type(info) is PDFObjRef
         a = [{"info": info}]
         a_res = utils.resolve_all(a)
         assert a_res[0]["info"]["Producer"] == self.pdf.doc.info[0]["Producer"]