Merge pull request #1090 from dhdaines/issue-1089

Explicitly close `pypdfium2.PdfDocument` in `get_page_image`
jsvine · Feb 10, 2024 · 7b9101a · 7b9101a
2 parents 07d9997 + 7322077
commit 7b9101a
Show file tree

Hide file tree

Showing 3 changed files with 30 additions and 5 deletions.
diff --git a/pdfplumber/display.py b/pdfplumber/display.py
@@ -53,10 +53,8 @@ def get_page_image(
         stream.seek(0)
         src = stream
 
-    pdfium_page = pypdfium2.PdfDocument(
-        src,
-        password=password,
-    ).get_page(page_ix)
+    pdfium_doc = pypdfium2.PdfDocument(src, password=password)
+    pdfium_page = pdfium_doc.get_page(page_ix)
 
     img: PIL.Image.Image = pdfium_page.render(
         # Modifiable arguments
@@ -67,6 +65,9 @@ def get_page_image(
         # Non-modifiable arguments
         prefer_bgrx=True,
     ).to_pil()
+    # In theory, `autoclose` at creation time should make the document
+    # close automatically.  In practice this does not seem to be the case.
+    pdfium_doc.close()
 
     return img.convert("RGB")
 

diff --git a/tests/test_issues.py b/tests/test_issues.py
@@ -2,6 +2,11 @@
 import logging
 import os
 import re
+
+try:
+    import resource
+except ModuleNotFoundError:
+    resource = None
 import unittest
 
 import pdfplumber
@@ -275,3 +280,22 @@ def test_issue_982(self):
             text = re.sub(r"\s+", " ", page.extract_text(use_text_flow=True))
             words = " ".join(w["text"] for w in page.extract_words(use_text_flow=True))
             assert text[0:100] == words[0:100]
+
+    def test_issue_1089(self):
+        """
+        Regression test: Page.to_image() must not leak file descriptors.
+
+        Renders one page as many times as the soft RLIMIT_NOFILE limit, so
+        that a descriptor leaked on each call would exhaust the limit.
+        """
+        # Report a skip (not a pass) on platforms without `resource`.
+        if resource is None:
+            raise unittest.SkipTest("resource module is not available")
+        # Any PDF will do
+        path = os.path.join(HERE, "pdfs/test-punkt.pdf")
+        soft_limit, _ = resource.getrlimit(resource.RLIMIT_NOFILE)
+        with pdfplumber.open(path) as pdf:
+            for _ in range(soft_limit):
+                pdf.pages[0].to_image()
+        # Getting here means the descriptor limit was not exhausted.
+        assert True
diff --git a/tests/test_utils.py b/tests/test_utils.py
@@ -55,7 +55,7 @@ def test_resolve(self):
 
     def test_resolve_all(self):
         info = self.pdf.doc.xrefs[0].trailer["Info"]
-        assert type(info) == PDFObjRef
+        assert type(info) is PDFObjRef
         a = [{"info": info}]
         a_res = utils.resolve_all(a)
         assert a_res[0]["info"]["Producer"] == self.pdf.doc.info[0]["Producer"]