Skip to content

Add password #3876

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Draft
wants to merge 8 commits into
base: main
Choose a base branch
from
Draft
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -3,6 +3,7 @@
### Enhancements

### Features
- **Add `password` parameter** to load password-protected PDFs with all modes

### Fixes
- **Fix an issue with multiple values for `infer_table_structure`** when partitioning an email with image attachments, the kwargs passed into `partition` to partition the image already contain `infer_table_structure`. Now the `partition` function checks whether `kwargs` already has `infer_table_structure`
Binary file added example-docs/pdf/password.pdf
Binary file not shown.
2 changes: 1 addition & 1 deletion test_unstructured/chunking/test_basic.py
Original file line number Diff line number Diff line change
@@ -164,5 +164,5 @@ def it_supports_the_include_orig_elements_option(
# -- fixtures --------------------------------------------------------------------------------

@pytest.fixture()
def _chunk_elements_(self, request: FixtureRequest):
def _chunk_elements_(self, request: FixtureRequest): # noqa: PT005
return function_mock(request, "unstructured.chunking.basic._chunk_elements")
2 changes: 1 addition & 1 deletion test_unstructured/chunking/test_title.py
Original file line number Diff line number Diff line change
@@ -468,7 +468,7 @@ def it_supports_the_include_orig_elements_option(
# -- fixtures --------------------------------------------------------------------------------

@pytest.fixture()
def _chunk_by_title_(self, request: FixtureRequest):
def _chunk_by_title_(self, request: FixtureRequest): # noqa: PT005
return function_mock(request, "unstructured.chunking.title._chunk_by_title")


40 changes: 40 additions & 0 deletions test_unstructured/partition/pdf_image/test_pdf.py
Original file line number Diff line number Diff line change
@@ -1517,3 +1517,43 @@ def test_document_to_element_list_sets_category_depth_titles():
assert elements[1].metadata.category_depth == 2
assert elements[2].metadata.category_depth is None
assert elements[3].metadata.category_depth == 0


@pytest.mark.parametrize("file_mode", ["filename", "rb", "spool"])
@pytest.mark.parametrize(
    "strategy",
    [
        PartitionStrategy.FAST,
        PartitionStrategy.HI_RES,
        PartitionStrategy.OCR_ONLY,
    ],
)
def test_partition_pdf_with_password(
    file_mode: str,
    strategy: str,
    filename: str = example_doc_path("pdf/password.pdf"),
):
    """Partition a password-protected PDF with each strategy and each input mode.

    The document is supplied by path ("filename"), as an open binary file ("rb"),
    or through a `SpooledTemporaryFile` ("spool"). In every case the password is
    passed to `partition_pdf` and the output must be exactly one element whose
    text is "File with password".
    """

    def _assert_decrypted(elements: list[Element]):
        # -- the fixture document is expected to yield a single text element --
        assert len(elements) == 1
        assert elements[0].text == "File with password"

    # -- path-based input needs no file handle, so handle it first and bail out --
    if file_mode == "filename":
        _assert_decrypted(
            pdf.partition_pdf(filename=filename, strategy=strategy, password="password")
        )
        return

    with open(filename, "rb") as pdf_file:
        # -- plain binary file object --
        if file_mode == "rb":
            _assert_decrypted(
                pdf.partition_pdf(file=pdf_file, strategy=strategy, password="password")
            )
            return

        # -- remaining mode: copy the bytes into a spooled temp file and rewind it --
        with SpooledTemporaryFile() as spooled:
            spooled.write(pdf_file.read())
            spooled.seek(0)
            _assert_decrypted(
                pdf.partition_pdf(file=spooled, strategy=strategy, password="password")
            )
2 changes: 1 addition & 1 deletion test_unstructured/partition/test_msg.py
Original file line number Diff line number Diff line change
@@ -443,7 +443,7 @@ def it_provides_access_to_pass_through_kwargs_collected_by_the_partitioner_funct
# -- fixtures --------------------------------------------------------------------------------

@pytest.fixture
def _last_modified_prop_(self, request: FixtureRequest):
def _last_modified_prop_(self, request: FixtureRequest): # noqa: PT005
return property_mock(request, MsgPartitionerOptions, "_last_modified")

@pytest.fixture
Original file line number Diff line number Diff line change
@@ -99,7 +99,7 @@ def get_instance_(self, request: FixtureRequest):
return method_mock(request, OCRAgent, "get_instance")

@pytest.fixture()
def _get_ocr_agent_cls_qname_(self, request: FixtureRequest):
def _get_ocr_agent_cls_qname_(self, request: FixtureRequest): # noqa: PT005
return method_mock(request, OCRAgent, "_get_ocr_agent_cls_qname")

@pytest.fixture()
4 changes: 4 additions & 0 deletions unstructured/partition/image.py
Original file line number Diff line number Diff line change
@@ -32,6 +32,7 @@ def partition_image(
starting_page_number: int = 1,
extract_forms: bool = False,
form_extraction_skip_tables: bool = True,
password: Optional[str] = None,
**kwargs: Any,
) -> list[Element]:
"""Parses an image into a list of interpreted elements.
@@ -91,6 +92,8 @@ def partition_image(
(results in adding FormKeysValues elements to output).
form_extraction_skip_tables
Whether the form extraction logic should ignore regions designated as Tables.
password
The password to decrypt the PDF file.
"""
exactly_one(filename=filename, file=file)

@@ -113,5 +116,6 @@ def partition_image(
starting_page_number=starting_page_number,
extract_forms=extract_forms,
form_extraction_skip_tables=form_extraction_skip_tables,
password=password,
**kwargs,
)
29 changes: 25 additions & 4 deletions unstructured/partition/pdf.py
Original file line number Diff line number Diff line change
@@ -142,6 +142,7 @@ def partition_pdf(
starting_page_number: int = 1,
extract_forms: bool = False,
form_extraction_skip_tables: bool = True,
password: Optional[str] = None,
**kwargs: Any,
) -> list[Element]:
"""Parses a pdf document into a list of interpreted elements.
@@ -222,6 +223,7 @@ def partition_pdf(
starting_page_number=starting_page_number,
extract_forms=extract_forms,
form_extraction_skip_tables=form_extraction_skip_tables,
password=password,
**kwargs,
)

@@ -243,6 +245,7 @@ def partition_pdf_or_image(
starting_page_number: int = 1,
extract_forms: bool = False,
form_extraction_skip_tables: bool = True,
password: Optional[str] = None,
**kwargs: Any,
) -> list[Element]:
"""Parses a pdf or image document into a list of interpreted elements."""
@@ -270,6 +273,7 @@ def partition_pdf_or_image(
file=spooled_to_bytes_io_if_needed(file),
languages=languages,
metadata_last_modified=metadata_last_modified or last_modified,
password=password,
starting_page_number=starting_page_number,
**kwargs,
)
@@ -320,6 +324,7 @@ def partition_pdf_or_image(
starting_page_number=starting_page_number,
extract_forms=extract_forms,
form_extraction_skip_tables=form_extraction_skip_tables,
password=password,
**kwargs,
)
out_elements = _process_uncategorized_text_elements(elements)
@@ -344,6 +349,7 @@ def partition_pdf_or_image(
ocr_languages=ocr_languages,
is_image=is_image,
metadata_last_modified=metadata_last_modified or last_modified,
password=password,
starting_page_number=starting_page_number,
**kwargs,
)
@@ -358,6 +364,7 @@ def extractable_elements(
languages: Optional[list[str]] = None,
metadata_last_modified: Optional[str] = None,
starting_page_number: int = 1,
password: Optional[str] = None,
**kwargs: Any,
) -> list[list[Element]]:
if isinstance(file, bytes):
@@ -367,6 +374,7 @@ def extractable_elements(
file=file,
languages=languages,
metadata_last_modified=metadata_last_modified,
password=password,
starting_page_number=starting_page_number,
**kwargs,
)
@@ -378,6 +386,7 @@ def _partition_pdf_with_pdfminer(
languages: list[str],
metadata_last_modified: Optional[str],
starting_page_number: int = 1,
password: Optional[str] = None,
**kwargs: Any,
) -> list[list[Element]]:
"""Partitions a PDF using PDFMiner instead of using a layoutmodel. Used for faster
@@ -401,6 +410,7 @@ def _partition_pdf_with_pdfminer(
languages=languages,
metadata_last_modified=metadata_last_modified,
starting_page_number=starting_page_number,
password=password,
**kwargs,
)

@@ -410,6 +420,7 @@ def _partition_pdf_with_pdfminer(
filename=filename,
languages=languages,
metadata_last_modified=metadata_last_modified,
password=password,
starting_page_number=starting_page_number,
**kwargs,
)
@@ -425,14 +436,16 @@ def _process_pdfminer_pages(
metadata_last_modified: Optional[str],
annotation_threshold: Optional[float] = env_config.PDF_ANNOTATION_THRESHOLD,
starting_page_number: int = 1,
password: Optional[str] = None,
**kwargs,
) -> list[list[Element]]:
"""Uses PDFMiner to split a document into pages and process them."""

elements = []

for page_number, (page, page_layout) in enumerate(
open_pdfminer_pages_generator(fp), start=starting_page_number
open_pdfminer_pages_generator(fp, password=password),
start=starting_page_number,
):
width, height = page_layout.width, page_layout.height

@@ -554,6 +567,7 @@ def _partition_pdf_or_image_local(
extract_forms: bool = False,
form_extraction_skip_tables: bool = True,
pdf_hi_res_max_pages: Optional[int] = None,
password: Optional[str] = None,
**kwargs: Any,
) -> list[Element]:
"""Partition using package installed locally"""
@@ -589,11 +603,12 @@ def _partition_pdf_or_image_local(
filename,
is_image=is_image,
model_name=hi_res_model_name,
password=password,
pdf_image_dpi=pdf_image_dpi,
)

extracted_layout, layouts_links = (
process_file_with_pdfminer(filename=filename, dpi=pdf_image_dpi)
process_file_with_pdfminer(filename=filename, dpi=pdf_image_dpi, password=password)
if pdf_text_extractable
else ([], [])
)
@@ -631,6 +646,7 @@ def _partition_pdf_or_image_local(
infer_table_structure=infer_table_structure,
ocr_languages=ocr_languages,
ocr_mode=ocr_mode,
password=password,
pdf_image_dpi=pdf_image_dpi,
ocr_layout_dumper=ocr_layout_dumper,
)
@@ -639,14 +655,15 @@ def _partition_pdf_or_image_local(
file,
is_image=is_image,
model_name=hi_res_model_name,
password=password,
pdf_image_dpi=pdf_image_dpi,
)

if hasattr(file, "seek"):
file.seek(0)

extracted_layout, layouts_links = (
process_data_with_pdfminer(file=file, dpi=pdf_image_dpi)
process_data_with_pdfminer(file=file, dpi=pdf_image_dpi, password=password)
if pdf_text_extractable
else ([], [])
)
@@ -686,6 +703,7 @@ def _partition_pdf_or_image_local(
infer_table_structure=infer_table_structure,
ocr_languages=ocr_languages,
ocr_mode=ocr_mode,
password=password,
pdf_image_dpi=pdf_image_dpi,
ocr_layout_dumper=ocr_layout_dumper,
)
@@ -725,6 +743,7 @@ def _partition_pdf_or_image_local(
pdf_image_dpi=pdf_image_dpi,
extract_image_block_to_payload=extract_image_block_to_payload,
output_dir_path=extract_image_block_output_dir,
password=password,
)

for el_type in extract_image_block_types:
@@ -796,6 +815,7 @@ def _partition_pdf_or_image_local(
draw_caption=env_config.ANALYSIS_BBOX_DRAW_CAPTION,
resize=env_config.ANALYSIS_BBOX_RESIZE,
format=env_config.ANALYSIS_BBOX_FORMAT,
password=password,
)

return out_elements
@@ -834,6 +854,7 @@ def _partition_pdf_or_image_with_ocr(
is_image: bool = False,
metadata_last_modified: Optional[str] = None,
starting_page_number: int = 1,
password: Optional[str] = None,
**kwargs: Any,
):
"""Partitions an image or PDF using OCR. For PDFs, each page is converted
@@ -858,7 +879,7 @@ def _partition_pdf_or_image_with_ocr(
elements.extend(page_elements)
else:
for page_number, image in enumerate(
convert_pdf_to_images(filename, file), start=starting_page_number
convert_pdf_to_images(filename, file, password=password), start=starting_page_number
):
page_elements = _partition_pdf_or_image_with_ocr_from_image(
image=image,
Original file line number Diff line number Diff line change
@@ -546,6 +546,7 @@ def __init__(
draw_grid: bool = False,
resize: Optional[float] = None,
format: str = "png",
password: Optional[str] = None,
):
self.draw_caption = draw_caption
self.draw_grid = draw_grid
@@ -554,6 +555,7 @@ def __init__(
self.format = format
self.drawers = []
self.file = file
self.password = password

super().__init__(filename, save_dir)

@@ -678,6 +680,7 @@ def load_source_image(self) -> Generator[Image.Image, None, None]:
file=self.file,
output_folder=temp_dir,
path_only=True,
password=self.password,
)
except Exception as ex: # noqa: E722
print(
6 changes: 6 additions & 0 deletions unstructured/partition/pdf_image/analysis/tools.py
Original file line number Diff line number Diff line change
@@ -66,6 +66,7 @@ def save_analysis_artifiacts(
draw_caption: bool = True,
resize: Optional[float] = None,
format: str = "png",
password: Optional[str] = None,
):
"""Save the analysis artifacts for a given file. Loads some settings from
the environment configuration.
@@ -82,6 +83,7 @@ def save_analysis_artifiacts(
draw_caption: Flag for drawing the caption above the analyzed page (for e.g. layout source)
resize: Output image resize value. If not provided, the image will not be resized.
format: The format for analyzed pages with bboxes drawn on them. Default is 'png'.
password (optional): The password to decrypt the PDF file.
"""
if not filename:
filename = _generate_filename(is_image)
@@ -109,6 +111,7 @@ def save_analysis_artifiacts(
draw_caption=draw_caption,
resize=resize,
format=format,
password=password,
)

for layout_dumper in layout_dumpers:
@@ -125,6 +128,7 @@ def render_bboxes_for_file(
draw_caption: bool = True,
resize: Optional[float] = None,
format: str = "png",
password: Optional[str] = None,
):
"""Render the bounding boxes for a given layout dump file.
To be used for analysis after the partition is performed for
@@ -144,6 +148,7 @@ def render_bboxes_for_file(
draw_caption: Flag for drawing the caption above the analyzed page (for e.g. layout source)
resize: Output image resize value. If not provided, the image will not be resized.
format: The format for analyzed pages with bboxes drawn on them. Default is 'png'.
password (optional): The password to decrypt the PDF file.
"""
filename_stem = Path(filename).stem
is_image = not Path(filename).suffix.endswith("pdf")
@@ -183,6 +188,7 @@ def render_bboxes_for_file(
draw_caption=draw_caption,
resize=resize,
format=format,
password=password,
)

for drawer in layout_drawers:
8 changes: 8 additions & 0 deletions unstructured/partition/pdf_image/ocr.py
Original file line number Diff line number Diff line change
@@ -37,6 +37,7 @@ def process_data_with_ocr(
ocr_mode: str = OCRMode.FULL_PAGE.value,
pdf_image_dpi: int = 200,
ocr_layout_dumper: Optional[OCRLayoutDumper] = None,
password: Optional[str] = None,
) -> "DocumentLayout":
"""
Process OCR data from a given data and supplement the output DocumentLayout
@@ -64,6 +65,8 @@ def process_data_with_ocr(
- ocr_layout_dumper (OCRLayoutDumper, optional): The OCR layout dumper to save the OCR layout.
- password (optional): The password to decrypt the PDF file.
Returns:
DocumentLayout: The merged layout information obtained after OCR processing.
"""
@@ -84,6 +87,7 @@ def process_data_with_ocr(
ocr_mode=ocr_mode,
pdf_image_dpi=pdf_image_dpi,
ocr_layout_dumper=ocr_layout_dumper,
password=password,
)

return merged_layouts
@@ -100,6 +104,7 @@ def process_file_with_ocr(
ocr_mode: str = OCRMode.FULL_PAGE.value,
pdf_image_dpi: int = 200,
ocr_layout_dumper: Optional[OCRLayoutDumper] = None,
password: Optional[str] = None,
) -> "DocumentLayout":
"""
Process OCR data from a given file and supplement the output DocumentLayout
@@ -124,6 +129,8 @@ def process_file_with_ocr(
- pdf_image_dpi (int, optional): DPI (dots per inch) for processing PDF images. Defaults to 200.
- password (optional): The password to decrypt the PDF file.
Returns:
DocumentLayout: The merged layout information obtained after OCR processing.
"""
@@ -157,6 +164,7 @@ def process_file_with_ocr(
dpi=pdf_image_dpi,
output_folder=temp_dir,
paths_only=True,
userpw=password,
)
image_paths = cast(List[str], _image_paths)
for i, image_path in enumerate(image_paths):
11 changes: 9 additions & 2 deletions unstructured/partition/pdf_image/pdf_image_utils.py
Original file line number Diff line number Diff line change
@@ -58,6 +58,7 @@ def convert_pdf_to_image(
dpi: int = 200,
output_folder: Optional[Union[str, PurePath]] = None,
path_only: bool = False,
password: Optional[str] = None,
) -> Union[List[Image.Image], List[str]]:
"""Get the image renderings of the pdf pages using pdf2image"""

@@ -71,6 +72,7 @@ def convert_pdf_to_image(
dpi=dpi,
output_folder=output_folder,
paths_only=path_only,
userpw=password,
)
else:
images = pdf2image.convert_from_path(
@@ -125,6 +127,7 @@ def save_elements(
is_image: bool = False,
extract_image_block_to_payload: bool = False,
output_dir_path: str | None = None,
password: Optional[str] = None,
):
"""
Saves specific elements from a PDF as images either to a directory or embeds them in the
@@ -167,6 +170,7 @@ def save_elements(
pdf_image_dpi,
output_folder=temp_dir,
path_only=True,
password=password,
)
image_paths = cast(List[str], _image_paths)

@@ -389,15 +393,16 @@ def convert_pdf_to_images(
filename: str = "",
file: Optional[bytes | IO[bytes]] = None,
chunk_size: int = 10,
password: Optional[str] = None,
) -> Iterator[Image.Image]:
# Convert a PDF in small chunks of pages at a time (e.g. 1-10, 11-20... and so on)
exactly_one(filename=filename, file=file)
if file is not None:
f_bytes = convert_to_bytes(file)
info = pdf2image.pdfinfo_from_bytes(f_bytes)
info = pdf2image.pdfinfo_from_bytes(f_bytes, userpw=password)
else:
f_bytes = None
info = pdf2image.pdfinfo_from_path(filename)
info = pdf2image.pdfinfo_from_path(filename, userpw=password)

total_pages = info["Pages"]
for start_page in range(1, total_pages + 1, chunk_size):
@@ -407,12 +412,14 @@ def convert_pdf_to_images(
f_bytes,
first_page=start_page,
last_page=end_page,
userpw=password,
)
else:
chunk_images = pdf2image.convert_from_path(
filename,
first_page=start_page,
last_page=end_page,
userpw=password,
)

for image in chunk_images:
7 changes: 6 additions & 1 deletion unstructured/partition/pdf_image/pdfminer_processing.py
Original file line number Diff line number Diff line change
@@ -35,12 +35,14 @@
def process_file_with_pdfminer(
filename: str = "",
dpi: int = 200,
password: Optional[str] = None,
) -> tuple[List[List["TextRegion"]], List[List]]:
with open_filename(filename, "rb") as fp:
fp = cast(BinaryIO, fp)
extracted_layout, layouts_links = process_data_with_pdfminer(
file=fp,
dpi=dpi,
password=password,
)
return extracted_layout, layouts_links

@@ -49,6 +51,7 @@ def process_file_with_pdfminer(
def process_data_with_pdfminer(
file: Optional[Union[bytes, BinaryIO]] = None,
dpi: int = 200,
password: Optional[str] = None,
) -> tuple[List[List["TextRegion"]], List[List]]:
"""Loads the image and word objects from a pdf using pdfplumber and the image renderings of the
pdf pages using pdf2image"""
@@ -62,7 +65,9 @@ def process_data_with_pdfminer(
layouts_links = []
# Coefficient to rescale bounding box to be compatible with images
coef = dpi / 72
for page_number, (page, page_layout) in enumerate(open_pdfminer_pages_generator(file)):
for page_number, (page, page_layout) in enumerate(
open_pdfminer_pages_generator(file, password=password)
):
width, height = page_layout.width, page_layout.height

text_layout = []
3 changes: 2 additions & 1 deletion unstructured/partition/pdf_image/pdfminer_utils.py
Original file line number Diff line number Diff line change
@@ -73,6 +73,7 @@ def rect_to_bbox(
@requires_dependencies(["pikepdf", "pypdf"])
def open_pdfminer_pages_generator(
fp: BinaryIO,
password: str = "",
):
"""Open PDF pages using PDFMiner, handling and repairing invalid dictionary constructs."""

@@ -84,7 +85,7 @@ def open_pdfminer_pages_generator(
with tempfile.TemporaryDirectory() as tmp_dir_path:
tmp_file_path = os.path.join(tmp_dir_path, "tmp_file")
try:
pages = PDFPage.get_pages(fp)
pages = PDFPage.get_pages(fp, password=password)
# Detect invalid dictionary construct for entire PDF
for i, page in enumerate(pages):
try: