diff --git a/sample-docs/password.pdf b/sample-docs/password.pdf
new file mode 100644
index 00000000..21bd55d5
Binary files /dev/null and b/sample-docs/password.pdf differ
diff --git a/test_unstructured_inference/inference/test_layout.py b/test_unstructured_inference/inference/test_layout.py
index d7fc278c..6f844433 100644
--- a/test_unstructured_inference/inference/test_layout.py
+++ b/test_unstructured_inference/inference/test_layout.py
@@ -302,6 +302,23 @@ def mock_get_elements(self, *args, **kwargs):
     assert page.image is None
 
 
+@pytest.mark.slow()
+def test_from_file_with_password(monkeypatch, mock_final_layout):
+    """Pass `password` through both the path-based and the file-object entry points."""
+    # Path-based entry point; this call runs before any mocking below is installed.
+    doc = layout.DocumentLayout.from_file("sample-docs/password.pdf", password="password")
+    assert doc
+
+    # File-object entry point, with get_model and the detection model class mocked.
+    monkeypatch.setattr(layout, "get_model", lambda x: MockLayoutModel(mock_final_layout))
+    with patch(
+        "unstructured_inference.inference.layout.UnstructuredObjectDetectionModel",
+        MockLayoutModel,
+    ), open("sample-docs/password.pdf", mode="rb") as fp:
+        doc = layout.process_data_with_model(fp, model_name="fake", password="password")
+        assert doc
+
+
 def test_from_image_file_raises_with_empty_fn():
     with pytest.raises(FileNotFoundError):
         layout.DocumentLayout.from_image_file("")
diff --git a/unstructured_inference/inference/layout.py b/unstructured_inference/inference/layout.py
index 57fc742a..473dda07 100644
--- a/unstructured_inference/inference/layout.py
+++ b/unstructured_inference/inference/layout.py
@@ -51,6 +51,7 @@ def from_file(
         filename: str,
         fixed_layouts: Optional[List[Optional[List[TextRegion]]]] = None,
         pdf_image_dpi: int = 200,
+        password: Optional[str] = None,
         **kwargs,
     ) -> DocumentLayout:
         """Creates a DocumentLayout from a pdf file."""
@@ -62,6 +63,7 @@ def from_file(
                 pdf_image_dpi,
                 output_folder=temp_dir,
                 path_only=True,
+                password=password,
             )
             image_paths = cast(List[str], _image_paths)
             number_of_pages = len(image_paths)
@@ -133,6 +135,7 @@ def __init__(
         document_filename: Optional[Union[str, PurePath]] = None,
         detection_model: Optional[UnstructuredObjectDetectionModel] = None,
         element_extraction_model: Optional[UnstructuredElementExtractionModel] = None,
+        password: Optional[str] = None,
     ):
         if detection_model is not None and element_extraction_model is not None:
             raise ValueError("Only one of detection_model and extraction_model should be passed.")
@@ -148,6 +151,7 @@ def __init__(
         self.element_extraction_model = element_extraction_model
         self.elements: Collection[LayoutElement] = []
         self.elements_array: LayoutElements | None = None
+        self.password = password
         # NOTE(alan): Dropped LocationlessLayoutElement that was created for chipper - chipper has
         # locations now and if we need to support LayoutElements without bounding boxes we can make
         # the bbox property optional
@@ -325,6 +329,7 @@ def from_image(
 def process_data_with_model(
     data: BinaryIO,
     model_name: Optional[str],
+    password: Optional[str] = None,
     **kwargs: Any,
 ) -> DocumentLayout:
     """Process PDF as file-like object `data` into a `DocumentLayout`.
@@ -339,6 +344,7 @@ def process_data_with_model(
         layout = process_file_with_model(
             file_path,
             model_name,
+            password=password,
             **kwargs,
         )
 
@@ -351,6 +357,7 @@ def process_file_with_model(
     is_image: bool = False,
     fixed_layouts: Optional[List[Optional[List[TextRegion]]]] = None,
     pdf_image_dpi: int = 200,
+    password: Optional[str] = None,
     **kwargs: Any,
 ) -> DocumentLayout:
     """Processes pdf file with name filename into a DocumentLayout by using a model identified by
@@ -370,6 +377,7 @@ def process_file_with_model(
             filename,
             detection_model=detection_model,
             element_extraction_model=element_extraction_model,
+            password=password,
             **kwargs,
         )
         if is_image
@@ -379,6 +387,7 @@ def process_file_with_model(
             element_extraction_model=element_extraction_model,
             fixed_layouts=fixed_layouts,
             pdf_image_dpi=pdf_image_dpi,
+            password=password,
             **kwargs,
         )
     )
@@ -390,6 +399,7 @@ def convert_pdf_to_image(
     dpi: int = 200,
     output_folder: Optional[Union[str, PurePath]] = None,
     path_only: bool = False,
+    password: Optional[str] = None,
 ) -> Union[List[Image.Image], List[str]]:
-    """Get the image renderings of the pdf pages using pdf2image"""
+    """Get the image renderings of the pdf pages using pdf2image; `password` is its `userpw`."""
 
@@ -402,12 +412,14 @@ def convert_pdf_to_image(
             dpi=dpi,
             output_folder=output_folder,
             paths_only=path_only,
+            userpw=password,
         )
     else:
         images = pdf2image.convert_from_path(
             filename,
             dpi=dpi,
             paths_only=path_only,
+            userpw=password,
         )
 
     return images