Skip to content

Commit

Permalink
Merge pull request #1090 from dhdaines/issue-1089
Browse files Browse the repository at this point in the history
Explicitly close `pypdfium2.PdfDocument` in `get_page_image`
  • Loading branch information
jsvine authored Feb 10, 2024
2 parents 07d9997 + 7322077 commit 7b9101a
Show file tree
Hide file tree
Showing 3 changed files with 30 additions and 5 deletions.
9 changes: 5 additions & 4 deletions pdfplumber/display.py
Original file line number Diff line number Diff line change
Expand Up @@ -53,10 +53,8 @@ def get_page_image(
stream.seek(0)
src = stream

pdfium_page = pypdfium2.PdfDocument(
src,
password=password,
).get_page(page_ix)
pdfium_doc = pypdfium2.PdfDocument(src, password=password)
pdfium_page = pdfium_doc.get_page(page_ix)

img: PIL.Image.Image = pdfium_page.render(
# Modifiable arguments
Expand All @@ -67,6 +65,9 @@ def get_page_image(
# Non-modifiable arguments
prefer_bgrx=True,
).to_pil()
# In theory, `autoclose` when creating it should make it close
# automatically. In practice this does not seem to be the case.
pdfium_doc.close()

return img.convert("RGB")

Expand Down
24 changes: 24 additions & 0 deletions tests/test_issues.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,11 @@
import logging
import os
import re

try:
import resource
except ModuleNotFoundError:
resource = None
import unittest

import pdfplumber
Expand Down Expand Up @@ -275,3 +280,22 @@ def test_issue_982(self):
text = re.sub(r"\s+", " ", page.extract_text(use_text_flow=True))
words = " ".join(w["text"] for w in page.extract_words(use_text_flow=True))
assert text[0:100] == words[0:100]

def test_issue_1089(self):
    """
    Page.to_image() leaks file descriptors.

    This is because PyPdfium2 leaks file descriptors. Explicitly
    close the `PdfDocument` to prevent this.

    The test renders a page as many times as the process's soft limit
    on open file descriptors. If each call leaked a descriptor, the
    limit would be exhausted and an exception would propagate out of
    the loop, failing the test.
    """
    # `resource` is set to None at import time on platforms that lack it;
    # report the test as skipped there rather than silently passing.
    if resource is None:
        raise unittest.SkipTest("resource module (getrlimit) is not available")
    soft, _ = resource.getrlimit(resource.RLIMIT_NOFILE)
    # With an unlimited soft limit there is nothing to exhaust, and
    # `range(soft)` would be either empty or effectively endless.
    if soft == resource.RLIM_INFINITY:
        raise unittest.SkipTest("no soft limit on open file descriptors")
    # Any PDF will do
    path = os.path.join(HERE, "pdfs/test-punkt.pdf")
    with pdfplumber.open(path) as pdf:
        page = pdf.pages[0]
        for _ in range(soft):
            # Reaching the end of the loop without an exception is the
            # success condition; the rendered image itself is not needed.
            page.to_image()
2 changes: 1 addition & 1 deletion tests/test_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -55,7 +55,7 @@ def test_resolve(self):

def test_resolve_all(self):
info = self.pdf.doc.xrefs[0].trailer["Info"]
assert type(info) == PDFObjRef
assert type(info) is PDFObjRef
a = [{"info": info}]
a_res = utils.resolve_all(a)
assert a_res[0]["info"]["Producer"] == self.pdf.doc.info[0]["Producer"]
Expand Down

0 comments on commit 7b9101a

Please sign in to comment.