Replace Wand with pypdfium2 for page.to_image(...)

This commit swaps out Wand (and its non-Python dependencies ImageMagick and Ghostscript) for pypdfium2 for PageImage rendering. This has some advantages: - Less finicky: Wand often caused users problems, due to "MagickWand shared library not found" and "PolicyError: not authorized `PDF'" issues. By contrast, pypdfium2 seems (at least at first) to be more self-contained and not to require any system-tweaking. - Faster: pypdfium2 appears to render images more quickly than Wand (see @cmdlineuser's tests in #899). - More flexible: pypdfium2 appears to generate images with greater color depth; by default, pdfplumber quantizes those images so that they save/display compactly (in fact, with smaller file sizes than the previous code), but this commit also adds parameters to retain all/more of the original, more detailed colors. Thanks to @cmdlineuser in #899 for the suggestion.
jsvine · Jul 16, 2023 · b049373 · mara004 · Jul 23, 2023 · b049373
1 parent ea5e275
commit b049373
Show file tree

Hide file tree

Showing 5 changed files with 87 additions and 75 deletions.
diff --git a/README.md b/README.md
@@ -250,17 +250,12 @@ If you pass the `pdfminer.six`-handling `laparams` parameter to `pdfplumber.open
 
 `pdfplumber`'s visual debugging tools can be helpful in understanding the structure of a PDF and the objects that have been extracted from it.
 
-__Note:__ To use this feature, you'll also need to have two additional pieces of software installed on your computer:
-
-- [`ImageMagick`](https://www.imagemagick.org/). [Installation instructions here](http://docs.wand-py.org/en/latest/guide/install.html#install-imagemagick-debian).
-- [`ghostscript`](https://www.ghostscript.com). [Installation instructions here](https://ghostscript.readthedocs.io/en/latest/Install.html), or simply `apt install ghostscript` (Ubuntu) / `brew install ghostscript` (Mac).
-
 
 ### Creating a `PageImage` with `.to_image()`
 
 To turn any page (including cropped pages) into a `PageImage` object, call `my_page.to_image()`. You can optionally pass *one* of the following keyword arguments:
 
-- `resolution`: The desired number pixels per inch. Defaults to 72. See note below.
+- `resolution`: The desired number of pixels per inch. Defaults to 72.
 - `width`: The desired image width in pixels.
 - `height`: The desired image height in pixels.
 
@@ -270,12 +265,10 @@ For instance:
 im = my_pdf.pages[0].to_image(resolution=150)
 ```
 
-From a script or REPL, `im.show()` will open the image in your local image viewer. But `PageImage` objects also play nicely with IPython/Jupyter notebooks; they automatically render as cell outputs. For example:
+From a script or REPL, `im.show()` will open the image in your local image viewer. But `PageImage` objects also play nicely with Jupyter notebooks; they automatically render as cell outputs. For example:
 
 ![Visual debugging in Jupyter](examples/screenshots/visual-debugging-in-jupyter.png "Visual debugging in Jupyter")
 
-*Note*: `pdfplumber` passes the `resolution` parameter to [Wand](https://docs.wand-py.org/en/latest/wand/image.html#wand.image.Image), the Python library we use for image conversion. Wand will create the image with the desired number of total pixels of height/width, but does not fully respect the `resolution` in the strict sense of that word: Although PNGs are capable of storing an image's resolution density as metadata, Wand's PNGs do not.
-
 *Note*: `.to_image(...)` works as expected with `Page.crop(...)`/`CroppedPage` instances, but is unable to incorporate changes made via `Page.filter(...)`/`FilteredPage` instances.
 
 
@@ -286,7 +279,7 @@ From a script or REPL, `im.show()` will open the image in your local image viewe
 |`im.reset()`| Clears anything you've drawn so far.|
 |`im.copy()`| Copies the image to a new `PageImage` object.|
 |`im.show()`| Opens the image in your local image viewer.|
-|`im.save(path_or_fileobject, format="PNG")`| Saves the annotated image.|
+|`im.save(path_or_fileobject, format="PNG", quantize=True, colors=256, bits=8)`| Saves the annotated image as a PNG file. The default arguments quantize the image to a palette of 256 colors, saving the PNG with 8-bit color depth. You can disable quantization by passing `quantize=False` or adjust the size of the color palette by passing `colors=N`.|
 
 ### Drawing methods
 

diff --git a/pdfplumber/display.py b/pdfplumber/display.py
@@ -1,10 +1,10 @@
 from io import BufferedReader, BytesIO
-from typing import TYPE_CHECKING, List, Optional, Tuple, Union
+from pathlib import Path
+from typing import TYPE_CHECKING, Any, List, Optional, Tuple, Union
 
 import PIL.Image
 import PIL.ImageDraw
-from wand.image import Color as WandColor  # type: ignore
-from wand.image import Image as WandImage
+import pypdfium2  # type: ignore
 
 from . import utils
 from ._typing import T_bbox, T_num, T_obj, T_obj_list, T_point, T_seq
@@ -34,66 +34,53 @@ class COLORS:
 
 
 def get_page_image(
-    stream: Union[BufferedReader, BytesIO], page_no: int, resolution: Union[int, float]
-) -> WandImage:
+    stream: Union[BufferedReader, BytesIO],
+    page_ix: int,
+    resolution: Union[int, float],
+    password: Optional[str],
+) -> PIL.Image.Image:
     # If we are working with a file object saved to disk
     if hasattr(stream, "name"):
-        filename = f"{stream.name}[{page_no}]"
-        file = None
-
-        def postprocess(img: WandImage) -> WandImage:
-            return img
+        src = stream.name
 
     # If we instead are working with a BytesIO stream
     else:
         stream.seek(0)
-        filename = None
-        file = stream
-
-        def postprocess(img: WandImage) -> WandImage:
-            return WandImage(image=img.sequence[page_no])
-
-    with WandImage(
-        resolution=resolution,
-        filename=filename,
-        file=file,
-        colorspace="rgb",
-        format="pdf",
-    ) as img_init:
-        img = postprocess(img_init)
-        with WandImage(
-            width=img.width,
-            height=img.height,
-            background=WandColor("white"),
-            colorspace="rgb",
-        ) as bg:
-            bg.composite(img, 0, 0)
-            try:
-                im = PIL.Image.open(BytesIO(bg.make_blob("png")))
-            except PIL.Image.DecompressionBombError:
-                raise PIL.Image.DecompressionBombError(
-                    "Image conversion raised a DecompressionBombError. "
-                    "PIL.Image.MAX_IMAGE_PIXELS is currently set to "
-                    f"{PIL.Image.MAX_IMAGE_PIXELS}. "
-                    "If you trust this PDF, you can try setting "
-                    "PIL.Image.MAX_IMAGE_PIXELS to a higher value. "
-                    "See https://github.com/jsvine/pdfplumber/issues/413"
-                    "#issuecomment-1190650404 for more information."
-                )
-        return im.convert("RGB")
+        src = stream
+
+    img: PIL.Image.Image = pypdfium2.PdfDocument._process_page(
+        # Modifiable arguments
+        page_ix,
+        input_data=src,
+        password=password,
+        scale=resolution / 72,
+        no_smoothtext=True,
+        # Non-modifiable arguments
+        renderer=pypdfium2._helpers.page.PdfPage.render,
+        converter=pypdfium2.PdfBitmap.to_pil,
+        prefer_bgrx=True,
+        pass_info=False,
+        need_formenv=False,
+        mk_formconfig=None,
+    )
+
+    return img.convert("RGB")
 
 
 class PageImage:
     def __init__(
         self,
         page: "Page",
-        original: Optional[WandImage] = None,
+        original: Optional[PIL.Image.Image] = None,
         resolution: Union[int, float] = DEFAULT_RESOLUTION,
     ):
         self.page = page
         if original is None:
             self.original = get_page_image(
-                page.pdf.stream, page.page_number - 1, resolution
+                stream=page.pdf.stream,
+                page_ix=page.page_number - 1,
+                resolution=resolution,
+                password=page.pdf.password,
             )
         else:
             self.original = original
@@ -104,15 +91,18 @@ def __init__(
         else:
             self.root = page.root_page
             cropped = page.root_page.bbox != page.bbox
+
+        self.resolution = resolution
         self.scale = self.original.size[0] / self.root.width
+
         if cropped:
             cropbox = (
-                (page.bbox[0] - page.root_page.bbox[0]) * self.scale,
-                (page.bbox[1] - page.root_page.bbox[1]) * self.scale,
-                (page.bbox[2] - page.root_page.bbox[0]) * self.scale,
-                (page.bbox[3] - page.root_page.bbox[1]) * self.scale,
+                int((page.bbox[0] - page.root_page.bbox[0]) * self.scale),
+                int((page.bbox[1] - page.root_page.bbox[1]) * self.scale),
+                int((page.bbox[2] - page.root_page.bbox[0]) * self.scale),
+                int((page.bbox[3] - page.root_page.bbox[1]) * self.scale),
             )
-            self.original = self.original.crop(tuple(map(int, cropbox)))
+            self.original = self.original.crop(cropbox)
         self.reset()
 
     def _reproject_bbox(self, bbox: T_bbox) -> T_bbox:
@@ -134,12 +124,35 @@ def _reproject(self, coord: T_point) -> T_point:
         return (_x0, _top)
 
     def reset(self) -> "PageImage":
-        self.annotated = PIL.Image.new(self.original.mode, self.original.size)
+        self.annotated = PIL.Image.new("RGB", self.original.size)
         self.annotated.paste(self.original)
         self.draw = PIL.ImageDraw.Draw(self.annotated, "RGBA")
-        self.save = self.annotated.save
         return self
 
+    def save(
+        self,
+        dest: Union[str, Path, BytesIO],
+        format: str = "PNG",
+        quantize: bool = True,
+        colors: int = 256,
+        bits: int = 8,
+        **kwargs: Any,
+    ) -> None:
+        if quantize:
+            out = self.annotated.quantize(colors, method=PIL.Image.FASTOCTREE).convert(
+                "P"
+            )
+        else:
+            out = self.annotated
+
+        out.save(
+            dest,
+            format=format,
+            bits=bits,
+            dpi=(self.resolution, self.resolution),
+            **kwargs,
+        )
+
     def copy(self) -> "PageImage":
         return self.__class__(self.page, self.original)
 
@@ -358,7 +371,7 @@ def outline_chars(
 
     def _repr_png_(self) -> bytes:
         b = BytesIO()
-        self.annotated.save(b, "PNG")
+        self.save(b, "PNG")
         return b.getvalue()
 
     def show(self) -> None:  # pragma: no cover

diff --git a/pdfplumber/pdf.py b/pdfplumber/pdf.py
@@ -29,15 +29,16 @@ def __init__(
         stream_is_external: bool = False,
         pages: Optional[Union[List[int], Tuple[int]]] = None,
         laparams: Optional[Dict[str, Any]] = None,
-        password: str = "",
+        password: Optional[str] = None,
         strict_metadata: bool = False,
     ):
         self.stream = stream
         self.stream_is_external = stream_is_external
         self.pages_to_parse = pages
         self.laparams = None if laparams is None else LAParams(**laparams)
+        self.password = password
 
-        self.doc = PDFDocument(PDFParser(stream), password=password)
+        self.doc = PDFDocument(PDFParser(stream), password=password or "")
         self.rsrcmgr = PDFResourceManager()
         self.metadata = {}
 

diff --git a/requirements.txt b/requirements.txt
@@ -1,3 +1,3 @@
 pdfminer.six==20221105
 Pillow>=9.1
-Wand>=0.6.10
+pypdfium2>=4.18.0
diff --git a/tests/test_display.py b/tests/test_display.py
@@ -90,16 +90,21 @@ def test_outline_chars(self):
     def test__repr_png_(self):
         png = self.im._repr_png_()
         assert isinstance(png, bytes)
-        assert len(png) in (
-            61247,
-            71939,
-            71983,
-            72168,
-        )  # PNG encoder seems to work differently on different setups
+        assert 40000 < len(png) < 80000
+
+    def test_no_quantize(self):
+        b = io.BytesIO()
+        self.im.save(b, "PNG", quantize=False)
+        assert len(b.getvalue()) > 100000
 
     def test_decompression_bomb(self):
         original_max = PIL.Image.MAX_IMAGE_PIXELS
         PIL.Image.MAX_IMAGE_PIXELS = 10
-        with pytest.raises(PIL.Image.DecompressionBombError):
-            self.pdf.pages[0].to_image()
+        # Previously, this raised PIL.Image.DecompressionBombError
+        self.pdf.pages[0].to_image()
         PIL.Image.MAX_IMAGE_PIXELS = original_max
+
+    def test_password(self):
+        path = os.path.join(HERE, "pdfs/password-example.pdf")
+        with pdfplumber.open(path, password="test") as pdf:
+            pdf.pages[0].to_image()