diff --git a/CHANGELOG.md b/CHANGELOG.md index 90c8b0f4..2f2ef842 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,3 +1,7 @@ +## 0.8.7 + +* fix: add `password` for PDF + ## 0.8.6 * feat: add back `source` to `TextRegions` and `LayoutElements` for backward compatibility diff --git a/sample-docs/password.pdf b/sample-docs/password.pdf new file mode 100644 index 00000000..21bd55d5 Binary files /dev/null and b/sample-docs/password.pdf differ diff --git a/test_unstructured_inference/inference/test_layout.py b/test_unstructured_inference/inference/test_layout.py index d7fc278c..be0e8769 100644 --- a/test_unstructured_inference/inference/test_layout.py +++ b/test_unstructured_inference/inference/test_layout.py @@ -302,6 +302,21 @@ def mock_get_elements(self, *args, **kwargs): assert page.image is None +@pytest.mark.slow() +def test_from_file_with_password(monkeypatch, mock_final_layout): + + doc = layout.DocumentLayout.from_file("sample-docs/password.pdf", password="password") + assert doc + + monkeypatch.setattr(layout, "get_model", lambda x: MockLayoutModel(mock_final_layout)) + with patch( + "unstructured_inference.inference.layout.UnstructuredObjectDetectionModel", + MockLayoutModel, + ), open("sample-docs/password.pdf", mode="rb") as fp: + doc = layout.process_data_with_model(fp, model_name="fake", password="password") + assert doc + + def test_from_image_file_raises_with_empty_fn(): with pytest.raises(FileNotFoundError): layout.DocumentLayout.from_image_file("") @@ -544,6 +559,7 @@ def test_process_file_with_model_routing(monkeypatch, model_type, is_detection_m detection_model=detection_model, element_extraction_model=element_extraction_model, fixed_layouts=None, + password=None, pdf_image_dpi=200, ) diff --git a/test_unstructured_inference/models/test_tables.py b/test_unstructured_inference/models/test_tables.py index 15c467cd..3feb9ed5 100644 --- a/test_unstructured_inference/models/test_tables.py +++ b/test_unstructured_inference/models/test_tables.py @@ -11,7 +11,10 @@ import unstructured_inference.models.table_postprocess as postprocess from unstructured_inference.models import tables -from unstructured_inference.models.tables import apply_thresholds_on_objects, structure_to_cells +from unstructured_inference.models.tables import ( + apply_thresholds_on_objects, + structure_to_cells, +) skip_outside_ci = os.getenv("CI", "").lower() in {"", "false", "f", "0"} diff --git a/unstructured_inference/__version__.py b/unstructured_inference/__version__.py index 3dd2389c..ba167b50 100644 --- a/unstructured_inference/__version__.py +++ b/unstructured_inference/__version__.py @@ -1 +1 @@ -__version__ = "0.8.6" # pragma: no cover +__version__ = "0.8.7" # pragma: no cover diff --git a/unstructured_inference/inference/layout.py b/unstructured_inference/inference/layout.py index 57fc742a..9b6897d3 100644 --- a/unstructured_inference/inference/layout.py +++ b/unstructured_inference/inference/layout.py @@ -51,6 +51,7 @@ def from_file( filename: str, fixed_layouts: Optional[List[Optional[List[TextRegion]]]] = None, pdf_image_dpi: int = 200, + password: Optional[str] = None, **kwargs, ) -> DocumentLayout: """Creates a DocumentLayout from a pdf file.""" @@ -62,6 +63,7 @@ def from_file( pdf_image_dpi, output_folder=temp_dir, path_only=True, + password=password, ) image_paths = cast(List[str], _image_paths) number_of_pages = len(image_paths) @@ -133,6 +135,7 @@ def __init__( document_filename: Optional[Union[str, PurePath]] = None, detection_model: Optional[UnstructuredObjectDetectionModel] = None, element_extraction_model: Optional[UnstructuredElementExtractionModel] = None, + password: Optional[str] = None, ): if detection_model is not None and element_extraction_model is not None: raise ValueError("Only one of detection_model and extraction_model should be passed.") @@ -148,6 +151,7 @@ def __init__( self.element_extraction_model = element_extraction_model self.elements: Collection[LayoutElement] = [] self.elements_array: LayoutElements | None = None + self.password = password # NOTE(alan): Dropped LocationlessLayoutElement that was created for chipper - chipper has # locations now and if we need to support LayoutElements without bounding boxes we can make # the bbox property optional @@ -325,6 +329,7 @@ def from_image( def process_data_with_model( data: BinaryIO, model_name: Optional[str], + password: Optional[str] = None, **kwargs: Any, ) -> DocumentLayout: """Process PDF as file-like object `data` into a `DocumentLayout`. @@ -339,6 +344,7 @@ def process_data_with_model( layout = process_file_with_model( file_path, model_name, + password=password, **kwargs, ) @@ -351,6 +357,7 @@ def process_file_with_model( is_image: bool = False, fixed_layouts: Optional[List[Optional[List[TextRegion]]]] = None, pdf_image_dpi: int = 200, + password: Optional[str] = None, **kwargs: Any, ) -> DocumentLayout: """Processes pdf file with name filename into a DocumentLayout by using a model identified by @@ -379,6 +386,7 @@ def process_file_with_model( element_extraction_model=element_extraction_model, fixed_layouts=fixed_layouts, pdf_image_dpi=pdf_image_dpi, + password=password, **kwargs, ) ) @@ -390,6 +398,7 @@ def convert_pdf_to_image( dpi: int = 200, output_folder: Optional[Union[str, PurePath]] = None, path_only: bool = False, + password: Optional[str] = None, ) -> Union[List[Image.Image], List[str]]: """Get the image renderings of the pdf pages using pdf2image""" @@ -402,12 +411,14 @@ def convert_pdf_to_image( dpi=dpi, output_folder=output_folder, paths_only=path_only, + userpw=password or "", ) else: images = pdf2image.convert_from_path( filename, dpi=dpi, paths_only=path_only, + userpw=password or "", ) return images