Unstructured-IO · Coniferish · Oct 15, 2024 · Oct 24, 2024 · Dec 9, 2024 · Jan 18, 2025
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -3,6 +3,7 @@
 ### Enhancements
 
 ### Features
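+- **Add a `password` argument to `partition_pdf` and `partition_image`** so password-protected PDFs can be loaded with every partitioning strategy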
 
 ### Fixes
 - **Fix an issue with multiple values for `infer_table_structure`** when paritioning email with image attachements the kwarg calls into `partition` to partition the image already contains `infer_table_structure`. Now `partition` function checks if the `kwarg` has `infer_table_structure` already

diff --git a/example-docs/pdf/password.pdf b/example-docs/pdf/password.pdf
diff --git a/test_unstructured/chunking/test_basic.py b/test_unstructured/chunking/test_basic.py
@@ -164,5 +164,5 @@ def it_supports_the_include_orig_elements_option(
     # -- fixtures --------------------------------------------------------------------------------
 
     @pytest.fixture()
-    def _chunk_elements_(self, request: FixtureRequest):
+    def _chunk_elements_(self, request: FixtureRequest):  # noqa: PT005
         return function_mock(request, "unstructured.chunking.basic._chunk_elements")
diff --git a/test_unstructured/chunking/test_title.py b/test_unstructured/chunking/test_title.py
@@ -468,7 +468,7 @@ def it_supports_the_include_orig_elements_option(
     # -- fixtures --------------------------------------------------------------------------------
 
     @pytest.fixture()
-    def _chunk_by_title_(self, request: FixtureRequest):
+    def _chunk_by_title_(self, request: FixtureRequest):  # noqa: PT005
         return function_mock(request, "unstructured.chunking.title._chunk_by_title")
 
 

diff --git a/test_unstructured/partition/pdf_image/test_pdf.py b/test_unstructured/partition/pdf_image/test_pdf.py
@@ -1517,3 +1517,43 @@ def test_document_to_element_list_sets_category_depth_titles():
     assert elements[1].metadata.category_depth == 2
     assert elements[2].metadata.category_depth is None
     assert elements[3].metadata.category_depth == 0
+
+
+@pytest.mark.parametrize("file_mode", ["filename", "rb", "spool"])
+@pytest.mark.parametrize(
+    "strategy",
+    # the password has to be honoured by every partitioning strategy; the same
+    # single element is expected no matter which one is used
+    [
+        PartitionStrategy.FAST,
+        PartitionStrategy.HI_RES,
+        PartitionStrategy.OCR_ONLY,
+    ],
+)
+def test_partition_pdf_with_password(
+    file_mode: str,
+    strategy: str,
+    filename: str = example_doc_path("pdf/password.pdf"),
+):
+    # shared assertions, applied to the result of each input mode below
+    def _test(result: list[Element]):
+        # the protected document yields exactly one element with the expected text
+        assert len(result) == 1
+        assert result[0].text == "File with password"
+
+    if file_mode == "filename":
+        result = pdf.partition_pdf(filename=filename, strategy=strategy, password="password")
+        _test(result)
+    elif file_mode == "rb":
+        with open(filename, "rb") as f:
+            result = pdf.partition_pdf(file=f, strategy=strategy, password="password")
+            _test(result)
+    else:
+        with open(filename, "rb") as test_file:
+            with SpooledTemporaryFile() as spooled_temp_file:
+                spooled_temp_file.write(test_file.read())
+                spooled_temp_file.seek(0)
+                result = pdf.partition_pdf(
+                    file=spooled_temp_file, strategy=strategy, password="password"
+                )
+                _test(result)
diff --git a/test_unstructured/partition/test_msg.py b/test_unstructured/partition/test_msg.py
@@ -443,7 +443,7 @@ def it_provides_access_to_pass_through_kwargs_collected_by_the_partitioner_funct
     # -- fixtures --------------------------------------------------------------------------------
 
     @pytest.fixture
-    def _last_modified_prop_(self, request: FixtureRequest):
+    def _last_modified_prop_(self, request: FixtureRequest):  # noqa: PT005
         return property_mock(request, MsgPartitionerOptions, "_last_modified")
 
     @pytest.fixture

diff --git a/test_unstructured/partition/utils/ocr_models/test_ocr_interface.py b/test_unstructured/partition/utils/ocr_models/test_ocr_interface.py
@@ -99,7 +99,7 @@ def get_instance_(self, request: FixtureRequest):
         return method_mock(request, OCRAgent, "get_instance")
 
     @pytest.fixture()
-    def _get_ocr_agent_cls_qname_(self, request: FixtureRequest):
+    def _get_ocr_agent_cls_qname_(self, request: FixtureRequest):  # noqa: PT005
         return method_mock(request, OCRAgent, "_get_ocr_agent_cls_qname")
 
     @pytest.fixture()

diff --git a/unstructured/partition/image.py b/unstructured/partition/image.py
@@ -32,6 +32,7 @@ def partition_image(
     starting_page_number: int = 1,
     extract_forms: bool = False,
     form_extraction_skip_tables: bool = True,
+    password: Optional[str] = None,
     **kwargs: Any,
 ) -> list[Element]:
     """Parses an image into a list of interpreted elements.
@@ -91,6 +92,8 @@ def partition_image(
         (results in adding FormKeysValues elements to output).
     form_extraction_skip_tables
         Whether the form extraction logic should ignore regions designated as Tables.
+    password
+        The password to decrypt the PDF file.
     """
     exactly_one(filename=filename, file=file)
 
@@ -113,5 +116,6 @@ def partition_image(
         starting_page_number=starting_page_number,
         extract_forms=extract_forms,
         form_extraction_skip_tables=form_extraction_skip_tables,
+        password=password,
         **kwargs,
     )
diff --git a/unstructured/partition/pdf.py b/unstructured/partition/pdf.py
@@ -142,6 +142,7 @@ def partition_pdf(
     starting_page_number: int = 1,
     extract_forms: bool = False,
     form_extraction_skip_tables: bool = True,
+    password: Optional[str] = None,
     **kwargs: Any,
 ) -> list[Element]:
     """Parses a pdf document into a list of interpreted elements.
@@ -222,6 +223,7 @@ def partition_pdf(
         starting_page_number=starting_page_number,
         extract_forms=extract_forms,
         form_extraction_skip_tables=form_extraction_skip_tables,
+        password=password,
         **kwargs,
     )
 
@@ -243,6 +245,7 @@ def partition_pdf_or_image(
     starting_page_number: int = 1,
     extract_forms: bool = False,
     form_extraction_skip_tables: bool = True,
+    password: Optional[str] = None,
     **kwargs: Any,
 ) -> list[Element]:
     """Parses a pdf or image document into a list of interpreted elements."""
@@ -270,6 +273,7 @@ def partition_pdf_or_image(
                 file=spooled_to_bytes_io_if_needed(file),
                 languages=languages,
                 metadata_last_modified=metadata_last_modified or last_modified,
+                password=password,
                 starting_page_number=starting_page_number,
                 **kwargs,
             )
@@ -320,6 +324,7 @@ def partition_pdf_or_image(
                 starting_page_number=starting_page_number,
                 extract_forms=extract_forms,
                 form_extraction_skip_tables=form_extraction_skip_tables,
+                password=password,
                 **kwargs,
             )
             out_elements = _process_uncategorized_text_elements(elements)
@@ -344,6 +349,7 @@ def partition_pdf_or_image(
                 ocr_languages=ocr_languages,
                 is_image=is_image,
                 metadata_last_modified=metadata_last_modified or last_modified,
+                password=password,
                 starting_page_number=starting_page_number,
                 **kwargs,
             )
@@ -358,6 +364,7 @@ def extractable_elements(
     languages: Optional[list[str]] = None,
     metadata_last_modified: Optional[str] = None,
     starting_page_number: int = 1,
+    password: Optional[str] = None,
     **kwargs: Any,
 ) -> list[list[Element]]:
     if isinstance(file, bytes):
@@ -367,6 +374,7 @@ def extractable_elements(
         file=file,
         languages=languages,
         metadata_last_modified=metadata_last_modified,
+        password=password,
         starting_page_number=starting_page_number,
         **kwargs,
     )
@@ -378,6 +386,7 @@ def _partition_pdf_with_pdfminer(
     languages: list[str],
     metadata_last_modified: Optional[str],
     starting_page_number: int = 1,
+    password: Optional[str] = None,
     **kwargs: Any,
 ) -> list[list[Element]]:
     """Partitions a PDF using PDFMiner instead of using a layoutmodel. Used for faster
@@ -401,6 +410,7 @@ def _partition_pdf_with_pdfminer(
                 languages=languages,
                 metadata_last_modified=metadata_last_modified,
                 starting_page_number=starting_page_number,
+                password=password,
                 **kwargs,
             )
 
@@ -410,6 +420,7 @@ def _partition_pdf_with_pdfminer(
             filename=filename,
             languages=languages,
             metadata_last_modified=metadata_last_modified,
+            password=password,
             starting_page_number=starting_page_number,
             **kwargs,
         )
@@ -425,14 +436,16 @@ def _process_pdfminer_pages(
     metadata_last_modified: Optional[str],
     annotation_threshold: Optional[float] = env_config.PDF_ANNOTATION_THRESHOLD,
     starting_page_number: int = 1,
+    password: Optional[str] = None,
     **kwargs,
 ) -> list[list[Element]]:
     """Uses PDFMiner to split a document into pages and process them."""
 
     elements = []
 
     for page_number, (page, page_layout) in enumerate(
-        open_pdfminer_pages_generator(fp), start=starting_page_number
+        open_pdfminer_pages_generator(fp, password=password),
+        start=starting_page_number,
     ):
         width, height = page_layout.width, page_layout.height
 
@@ -554,6 +567,7 @@ def _partition_pdf_or_image_local(
     extract_forms: bool = False,
     form_extraction_skip_tables: bool = True,
     pdf_hi_res_max_pages: Optional[int] = None,
+    password: Optional[str] = None,
     **kwargs: Any,
 ) -> list[Element]:
     """Partition using package installed locally"""
@@ -589,11 +603,12 @@ def _partition_pdf_or_image_local(
             filename,
             is_image=is_image,
             model_name=hi_res_model_name,
+            password=password,
             pdf_image_dpi=pdf_image_dpi,
         )
 
         extracted_layout, layouts_links = (
-            process_file_with_pdfminer(filename=filename, dpi=pdf_image_dpi)
+            process_file_with_pdfminer(filename=filename, dpi=pdf_image_dpi, password=password)
             if pdf_text_extractable
             else ([], [])
         )
@@ -631,6 +646,7 @@ def _partition_pdf_or_image_local(
             infer_table_structure=infer_table_structure,
             ocr_languages=ocr_languages,
             ocr_mode=ocr_mode,
+            password=password,
             pdf_image_dpi=pdf_image_dpi,
             ocr_layout_dumper=ocr_layout_dumper,
         )
@@ -639,14 +655,15 @@ def _partition_pdf_or_image_local(
             file,
             is_image=is_image,
             model_name=hi_res_model_name,
+            password=password,
             pdf_image_dpi=pdf_image_dpi,
         )
 
         if hasattr(file, "seek"):
             file.seek(0)
 
         extracted_layout, layouts_links = (
-            process_data_with_pdfminer(file=file, dpi=pdf_image_dpi)
+            process_data_with_pdfminer(file=file, dpi=pdf_image_dpi, password=password)
             if pdf_text_extractable
             else ([], [])
         )
@@ -686,6 +703,7 @@ def _partition_pdf_or_image_local(
             infer_table_structure=infer_table_structure,
             ocr_languages=ocr_languages,
             ocr_mode=ocr_mode,
+            password=password,
             pdf_image_dpi=pdf_image_dpi,
             ocr_layout_dumper=ocr_layout_dumper,
         )
@@ -725,6 +743,7 @@ def _partition_pdf_or_image_local(
             pdf_image_dpi=pdf_image_dpi,
             extract_image_block_to_payload=extract_image_block_to_payload,
             output_dir_path=extract_image_block_output_dir,
+            password=password,
         )
 
     for el_type in extract_image_block_types:
@@ -796,6 +815,7 @@ def _partition_pdf_or_image_local(
             draw_caption=env_config.ANALYSIS_BBOX_DRAW_CAPTION,
             resize=env_config.ANALYSIS_BBOX_RESIZE,
             format=env_config.ANALYSIS_BBOX_FORMAT,
+            password=password,
         )
 
     return out_elements
@@ -834,6 +854,7 @@ def _partition_pdf_or_image_with_ocr(
     is_image: bool = False,
     metadata_last_modified: Optional[str] = None,
     starting_page_number: int = 1,
+    password: Optional[str] = None,
     **kwargs: Any,
 ):
     """Partitions an image or PDF using OCR. For PDFs, each page is converted
@@ -858,7 +879,7 @@ def _partition_pdf_or_image_with_ocr(
             elements.extend(page_elements)
     else:
         for page_number, image in enumerate(
-            convert_pdf_to_images(filename, file), start=starting_page_number
+            convert_pdf_to_images(filename, file, password=password), start=starting_page_number
         ):
             page_elements = _partition_pdf_or_image_with_ocr_from_image(
                 image=image,

diff --git a/unstructured/partition/pdf_image/analysis/bbox_visualisation.py b/unstructured/partition/pdf_image/analysis/bbox_visualisation.py
@@ -546,6 +546,7 @@ def __init__(
         draw_grid: bool = False,
         resize: Optional[float] = None,
         format: str = "png",
+        password: Optional[str] = None,
     ):
         self.draw_caption = draw_caption
         self.draw_grid = draw_grid
@@ -554,6 +555,7 @@ def __init__(
         self.format = format
         self.drawers = []
         self.file = file
+        self.password = password
 
         super().__init__(filename, save_dir)
 
@@ -678,6 +680,7 @@ def load_source_image(self) -> Generator[Image.Image, None, None]:
                         file=self.file,
                         output_folder=temp_dir,
                         path_only=True,
+                        password=self.password,
                     )
                 except Exception as ex:  # noqa: E722
                     print(

diff --git a/unstructured/partition/pdf_image/analysis/tools.py b/unstructured/partition/pdf_image/analysis/tools.py
@@ -66,6 +66,7 @@ def save_analysis_artifiacts(
     draw_caption: bool = True,
     resize: Optional[float] = None,
     format: str = "png",
+    password: Optional[str] = None,
 ):
     """Save the analysis artifacts for a given file. Loads some settings from
     the environment configuration.
@@ -82,6 +83,7 @@ def save_analysis_artifiacts(
         draw_caption: Flag for drawing the caption above the analyzed page (for e.g. layout source)
         resize: Output image resize value. If not provided, the image will not be resized.
         format: The format for analyzed pages with bboxes drawn on them. Default is 'png'.
+        password (optional): The password to decrypt the PDF file.
     """
     if not filename:
         filename = _generate_filename(is_image)
@@ -109,6 +111,7 @@ def save_analysis_artifiacts(
             draw_caption=draw_caption,
             resize=resize,
             format=format,
+            password=password,
         )
 
         for layout_dumper in layout_dumpers:
@@ -125,6 +128,7 @@ def render_bboxes_for_file(
     draw_caption: bool = True,
     resize: Optional[float] = None,
     format: str = "png",
+    password: Optional[str] = None,
 ):
     """Render the bounding boxes for a given layout dimp file.
     To be used for analysis after the partition is performed for
@@ -144,6 +148,7 @@ def render_bboxes_for_file(
         draw_caption: Flag for drawing the caption above the analyzed page (for e.g. layout source)
         resize: Output image resize value. If not provided, the image will not be resized.
         format: The format for analyzed pages with bboxes drawn on them. Default is 'png'.
+        password (optional): The password to decrypt the PDF file.
     """
     filename_stem = Path(filename).stem
     is_image = not Path(filename).suffix.endswith("pdf")
@@ -183,6 +188,7 @@ def render_bboxes_for_file(
             draw_caption=draw_caption,
             resize=resize,
             format=format,
+            password=password,
         )
 
         for drawer in layout_drawers:

diff --git a/unstructured/partition/pdf_image/ocr.py b/unstructured/partition/pdf_image/ocr.py
@@ -37,6 +37,7 @@ def process_data_with_ocr(
     ocr_mode: str = OCRMode.FULL_PAGE.value,
     pdf_image_dpi: int = 200,
     ocr_layout_dumper: Optional[OCRLayoutDumper] = None,
+    password: Optional[str] = None,
 ) -> "DocumentLayout":
     """
     Process OCR data from a given data and supplement the output DocumentLayout
@@ -64,6 +65,8 @@ def process_data_with_ocr(
 
     - ocr_layout_dumper (OCRLayoutDumper, optional): The OCR layout dumper to save the OCR layout.
 
+    - password (optional): The password to decrypt the PDF file.
+
     Returns:
         DocumentLayout: The merged layout information obtained after OCR processing.
     """
@@ -84,6 +87,7 @@ def process_data_with_ocr(
             ocr_mode=ocr_mode,
             pdf_image_dpi=pdf_image_dpi,
             ocr_layout_dumper=ocr_layout_dumper,
+            password=password,
         )
 
     return merged_layouts
@@ -100,6 +104,7 @@ def process_file_with_ocr(
     ocr_mode: str = OCRMode.FULL_PAGE.value,
     pdf_image_dpi: int = 200,
     ocr_layout_dumper: Optional[OCRLayoutDumper] = None,
+    password: Optional[str] = None,
 ) -> "DocumentLayout":
     """
     Process OCR data from a given file and supplement the output DocumentLayout
@@ -124,6 +129,8 @@ def process_file_with_ocr(
 
     - pdf_image_dpi (int, optional): DPI (dots per inch) for processing PDF images. Defaults to 200.
 
+    - password (optional): The password to decrypt the PDF file.
+
     Returns:
         DocumentLayout: The merged layout information obtained after OCR processing.
     """
@@ -157,6 +164,7 @@ def process_file_with_ocr(
                     dpi=pdf_image_dpi,
                     output_folder=temp_dir,
                     paths_only=True,
+                    userpw=password,
                 )
                 image_paths = cast(List[str], _image_paths)
                 for i, image_path in enumerate(image_paths):

diff --git a/unstructured/partition/pdf_image/pdf_image_utils.py b/unstructured/partition/pdf_image/pdf_image_utils.py
@@ -58,6 +58,7 @@ def convert_pdf_to_image(
     dpi: int = 200,
     output_folder: Optional[Union[str, PurePath]] = None,
     path_only: bool = False,
+    password: Optional[str] = None,
 ) -> Union[List[Image.Image], List[str]]:
     """Get the image renderings of the pdf pages using pdf2image"""
 
@@ -71,6 +72,7 @@ def convert_pdf_to_image(
             dpi=dpi,
             output_folder=output_folder,
             paths_only=path_only,
+            userpw=password,
         )
     else:
         images = pdf2image.convert_from_path(
@@ -125,6 +127,7 @@ def save_elements(
     is_image: bool = False,
     extract_image_block_to_payload: bool = False,
     output_dir_path: str | None = None,
+    password: Optional[str] = None,
 ):
     """
     Saves specific elements from a PDF as images either to a directory or embeds them in the
@@ -167,6 +170,7 @@ def save_elements(
                 pdf_image_dpi,
                 output_folder=temp_dir,
                 path_only=True,
+                password=password,
             )
             image_paths = cast(List[str], _image_paths)
 
@@ -389,15 +393,16 @@ def convert_pdf_to_images(
     filename: str = "",
     file: Optional[bytes | IO[bytes]] = None,
     chunk_size: int = 10,
+    password: Optional[str] = None,
 ) -> Iterator[Image.Image]:
     # Convert a PDF in small chunks of pages at a time (e.g. 1-10, 11-20... and so on)
     exactly_one(filename=filename, file=file)
     if file is not None:
         f_bytes = convert_to_bytes(file)
-        info = pdf2image.pdfinfo_from_bytes(f_bytes)
+        info = pdf2image.pdfinfo_from_bytes(f_bytes, userpw=password)
     else:
         f_bytes = None
-        info = pdf2image.pdfinfo_from_path(filename)
+        info = pdf2image.pdfinfo_from_path(filename, userpw=password)
 
     total_pages = info["Pages"]
     for start_page in range(1, total_pages + 1, chunk_size):
@@ -407,12 +412,14 @@ def convert_pdf_to_images(
                 f_bytes,
                 first_page=start_page,
                 last_page=end_page,
+                userpw=password,
             )
         else:
             chunk_images = pdf2image.convert_from_path(
                 filename,
                 first_page=start_page,
                 last_page=end_page,
+                userpw=password,
             )
 
         for image in chunk_images:

diff --git a/unstructured/partition/pdf_image/pdfminer_processing.py b/unstructured/partition/pdf_image/pdfminer_processing.py
@@ -35,12 +35,14 @@
 def process_file_with_pdfminer(
     filename: str = "",
     dpi: int = 200,
+    password: Optional[str] = None,
 ) -> tuple[List[List["TextRegion"]], List[List]]:
     with open_filename(filename, "rb") as fp:
         fp = cast(BinaryIO, fp)
         extracted_layout, layouts_links = process_data_with_pdfminer(
             file=fp,
             dpi=dpi,
+            password=password,
         )
         return extracted_layout, layouts_links
 
@@ -49,6 +51,7 @@ def process_file_with_pdfminer(
 def process_data_with_pdfminer(
     file: Optional[Union[bytes, BinaryIO]] = None,
     dpi: int = 200,
+    password: Optional[str] = None,
 ) -> tuple[List[List["TextRegion"]], List[List]]:
     """Loads the image and word objects from a pdf using pdfplumber and the image renderings of the
     pdf pages using pdf2image"""
@@ -62,7 +65,9 @@ def process_data_with_pdfminer(
     layouts_links = []
     # Coefficient to rescale bounding box to be compatible with images
     coef = dpi / 72
-    for page_number, (page, page_layout) in enumerate(open_pdfminer_pages_generator(file)):
+    for page_number, (page, page_layout) in enumerate(
+        open_pdfminer_pages_generator(file, password=password)
+    ):
         width, height = page_layout.width, page_layout.height
 
         text_layout = []

diff --git a/unstructured/partition/pdf_image/pdfminer_utils.py b/unstructured/partition/pdf_image/pdfminer_utils.py
@@ -73,6 +73,7 @@ def rect_to_bbox(
 @requires_dependencies(["pikepdf", "pypdf"])
 def open_pdfminer_pages_generator(
     fp: BinaryIO,
+    password: str = "",
 ):
     """Open PDF pages using PDFMiner, handling and repairing invalid dictionary constructs."""
 
@@ -84,7 +85,7 @@ def open_pdfminer_pages_generator(
     with tempfile.TemporaryDirectory() as tmp_dir_path:
         tmp_file_path = os.path.join(tmp_dir_path, "tmp_file")
         try:
-            pages = PDFPage.get_pages(fp)
+            pages = PDFPage.get_pages(fp, password=password)
             # Detect invalid dictionary construct for entire PDF
             for i, page in enumerate(pages):
                 try: