Skip to content

Commit

Permalink
refactor: Updates to Document AI Python Samples (#323)
Browse files Browse the repository at this point in the history
* Updated OCR Quickstart Sample

Added Types to Request Creation
Added ClientOptions object for type safety
Simplified output code to print full text instead of paragraphs
Updated Link to Document Object v1 specification
Added mime_type as variable

* Updates to process_document_sample

- Same Updates as Quickstart Sample
- Moved Imports to top of quickstart file

* Updated Batch Process Example

    - Added typing
    - Use BatchProcessMetadata instead of Operation ID to get output files from GCS
    - Added MimeType specification
    - Added Alternatives for Directory Processing & Callbacks
    - Minor Changes to process_document/quickstart for unified style with batch

* Updates to OCR Response Handling Sample

- Separated Online Processing Request into function
- Added explicit typing for documentai objects
- Converted `.format()` to f-string
- Simplified `layout_to_text()`

* Updated Form Processing Sample

    - Updated to `v1` API
    - Separated processing request into function
    - Added explicit typing for Document AI Types
    - Separated `print_table_rows()` into function for modularity
    - Fixed Spelling error "Collumns"

* Updated Specialized Processor Sample

    - Added Extraction of Properties (Nested Entities) and Normalized Values

* Updates to Splitter/Classifier Sample

- Updated to `v1` API
- Changed Page Number Printout
  - (Splitter Classifiers now output all page numbers within a subdocument, instead of just the first and last)

* Updated Test for process_document_sample
 - Added mime_type

* Updated Document Quality Processor Sample
- Updated to `v1` API
- Moved API Call to separate function
- Updated `.format()` to f-strings
- Added Handling for Multiple Page Numbers per entity
- Reused `page_refs_to_string()` from splitter/classifier example
- Added `mime_type` as parameter

* Updated Batch Processing Directory sample variable from CR comments

* Added Sample Input PDF Files & Output JSON Files

* Fixed Spelling Error in Invoice Parser Output filenames

* Addressed Code Review Comments

- Changed Copyright Year back to 2020
- Changed "property" variable to "prop" to avoid naming conflicts

* Updated Client Library Requirements versions

* Addressed Unit Test Failures

* Re-added google-api-core to requirements.txt

* Update samples/snippets/process_document_form_sample.py

Co-authored-by: Anthonios Partheniou <partheniou@google.com>

* Update samples/snippets/requirements.txt

Co-authored-by: Anthonios Partheniou <partheniou@google.com>

* Fixed "entirity" spelling error

Co-authored-by: Gal Zahavi <38544478+galz10@users.noreply.github.com>
Co-authored-by: Anthonios Partheniou <partheniou@google.com>
  • Loading branch information
3 people authored Jul 28, 2022
1 parent 35b59e6 commit bfe4ffc
Show file tree
Hide file tree
Showing 58 changed files with 462,989 additions and 392 deletions.
168 changes: 93 additions & 75 deletions document_ai/snippets/batch_process_documents_sample.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,118 +16,136 @@
# [START documentai_batch_process_document]
import re

from google.api_core.client_options import ClientOptions
from google.cloud import documentai_v1 as documentai
from google.cloud import storage

# TODO(developer): Uncomment these variables before running the sample.
# project_id= 'YOUR_PROJECT_ID'
# location = 'YOUR_PROJECT_LOCATION' # Format is 'us' or 'eu'
# project_id = 'YOUR_PROJECT_ID'
# location = 'YOUR_PROCESSOR_LOCATION' # Format is 'us' or 'eu'
# processor_id = 'YOUR_PROCESSOR_ID' # Create processor in Cloud Console
# gcs_input_uri = "YOUR_INPUT_URI"
# gcs_output_uri = "YOUR_OUTPUT_BUCKET_URI"
# gcs_output_uri_prefix = "YOUR_OUTPUT_URI_PREFIX"
# gcs_input_uri = "YOUR_INPUT_URI" # Format: gs://bucket/directory/file.pdf
# input_mime_type = "application/pdf"
# gcs_output_bucket = "YOUR_OUTPUT_BUCKET_NAME" # Format: gs://bucket
# gcs_output_uri_prefix = "YOUR_OUTPUT_URI_PREFIX" # Format: directory/subdirectory/


def batch_process_documents(
project_id,
location,
processor_id,
gcs_input_uri,
gcs_output_uri,
gcs_output_uri_prefix,
project_id: str,
location: str,
processor_id: str,
gcs_input_uri: str,
input_mime_type: str,
gcs_output_bucket: str,
gcs_output_uri_prefix: str,
timeout: int = 300,
):

# You must set the api_endpoint if you use a location other than 'us', e.g.:
opts = {}
if location == "eu":
opts = {"api_endpoint": "eu-documentai.googleapis.com"}
opts = ClientOptions(api_endpoint=f"{location}-documentai.googleapis.com")

client = documentai.DocumentProcessorServiceClient(client_options=opts)

destination_uri = f"{gcs_output_uri}/{gcs_output_uri_prefix}/"

gcs_documents = documentai.GcsDocuments(
documents=[{"gcs_uri": gcs_input_uri, "mime_type": "application/pdf"}]
gcs_document = documentai.GcsDocument(
gcs_uri=gcs_input_uri, mime_type=input_mime_type
)

# 'mime_type' can be 'application/pdf', 'image/tiff',
# and 'image/gif', or 'application/json'
# Load GCS Input URI into a List of document files
gcs_documents = documentai.GcsDocuments(documents=[gcs_document])
input_config = documentai.BatchDocumentsInputConfig(gcs_documents=gcs_documents)

# Where to write results
output_config = documentai.DocumentOutputConfig(
gcs_output_config={"gcs_uri": destination_uri}
# NOTE: Alternatively, specify a GCS URI Prefix to process an entire directory
#
# gcs_input_uri = "gs://bucket/directory/"
# gcs_prefix = documentai.GcsPrefix(gcs_uri_prefix=gcs_input_uri)
# input_config = documentai.BatchDocumentsInputConfig(gcs_prefix=gcs_prefix)
#

# Cloud Storage URI for the Output Directory
destination_uri = f"{gcs_output_bucket}/{gcs_output_uri_prefix}/"

gcs_output_config = documentai.DocumentOutputConfig.GcsOutputConfig(
gcs_uri=destination_uri
)

# Location can be 'us' or 'eu'
name = f"projects/{project_id}/locations/{location}/processors/{processor_id}"
request = documentai.types.document_processor_service.BatchProcessRequest(
# Where to write results
output_config = documentai.DocumentOutputConfig(gcs_output_config=gcs_output_config)

# The full resource name of the processor, e.g.:
# projects/project_id/locations/location/processors/processor_id
# You must create new processors in the Cloud Console first
name = client.processor_path(project_id, location, processor_id)

request = documentai.BatchProcessRequest(
name=name,
input_documents=input_config,
document_output_config=output_config,
)

# BatchProcess returns a Long Running Operation (LRO)
operation = client.batch_process_documents(request)

# Wait for the operation to finish
# Continually polls the operation until it is complete.
# This could take some time for larger files
# Format: projects/PROJECT_NUMBER/locations/LOCATION/operations/OPERATION_ID
print(f"Waiting for operation {operation.operation.name} to complete...")
operation.result(timeout=timeout)

# Results are written to GCS. Use a regex to find
# output files
match = re.match(r"gs://([^/]+)/(.+)", destination_uri)
output_bucket = match.group(1)
prefix = match.group(2)
# NOTE: Can also use callbacks for asynchronous processing
#
# def my_callback(future):
# result = future.result()
#
# operation.add_done_callback(my_callback)

storage_client = storage.Client()
bucket = storage_client.get_bucket(output_bucket)
blob_list = list(bucket.list_blobs(prefix=prefix))
print("Output files:")
# Once the operation is complete,
# get output document information from operation metadata
metadata = documentai.BatchProcessMetadata(operation.metadata)

for i, blob in enumerate(blob_list):
# If JSON file, download the contents of this blob as a bytes object.
if ".json" in blob.name:
blob_as_bytes = blob.download_as_bytes()
if metadata.state != documentai.BatchProcessMetadata.State.SUCCEEDED:
raise ValueError(f"Batch Process Failed: {metadata.state_message}")

document = documentai.types.Document.from_json(blob_as_bytes)
print(f"Fetched file {i + 1}")
storage_client = storage.Client()

print("Output files:")
# One process per Input Document
for process in metadata.individual_process_statuses:
# output_gcs_destination format: gs://BUCKET/PREFIX/OPERATION_NUMBER/INPUT_FILE_NUMBER/
# The Cloud Storage API requires the bucket name and URI prefix separately
matches = re.match(r"gs://(.*?)/(.*)", process.output_gcs_destination)
if not matches:
print(
"Could not parse output GCS destination:",
process.output_gcs_destination,
)
continue

output_bucket, output_prefix = matches.groups()

# Get List of Document Objects from the Output Bucket
output_blobs = storage_client.list_blobs(output_bucket, prefix=output_prefix)

# Document AI may output multiple JSON files per source file
for blob in output_blobs:
# Document AI should only output JSON files to GCS
if ".json" not in blob.name:
print(
f"Skipping non-supported file: {blob.name} - Mimetype: {blob.content_type}"
)
continue

# Download JSON File as bytes object and convert to Document Object
print(f"Fetching {blob.name}")
document = documentai.Document.from_json(
blob.download_as_bytes(), ignore_unknown_fields=True
)

# For a full list of Document object attributes, please reference this page:
# https://cloud.google.com/document-ai/docs/reference/rpc/google.cloud.documentai.v1beta3#document
# https://cloud.google.com/python/docs/reference/documentai/latest/google.cloud.documentai_v1.types.Document

# Read the text recognition output from the processor
for page in document.pages:
for form_field in page.form_fields:
field_name = get_text(form_field.field_name, document)
field_value = get_text(form_field.field_value, document)
print("Extracted key value pair:")
print(f"\t{field_name}, {field_value}")
for paragraph in page.paragraphs:
paragraph_text = get_text(paragraph.layout, document)
print(f"Paragraph text:\n{paragraph_text}")
else:
print(f"Skipping non-supported file type {blob.name}")


# Extract shards from the text field
def get_text(doc_element: dict, document: dict):
"""
Document AI identifies form fields by their offsets
in document text. This function converts offsets
to text snippets.
"""
response = ""
# If a text segment spans several lines, it will
# be stored in different text segments.
for segment in doc_element.text_anchor.text_segments:
start_index = (
int(segment.start_index)
if segment in doc_element.text_anchor.text_segments
else 0
)
end_index = int(segment.end_index)
response += document.text[start_index:end_index]
return response
print("The document contains the following text:")
print(document.text)


# [END documentai_batch_process_document]
Original file line number Diff line number Diff line change
Expand Up @@ -22,8 +22,10 @@
project_id = os.getenv("GOOGLE_CLOUD_PROJECT")
processor_id = "90484cfdedb024f6"
gcs_input_uri = "gs://cloud-samples-data/documentai/invoice.pdf"
input_mime_type = "application/pdf"
# following bucket contains .csv file which will cause the sample to fail.
gcs_output_full_uri_with_wrong_type = "gs://documentai-beta-samples"
gcs_output_uri_prefix = "test"
BUCKET_NAME = f"document-ai-python-{uuid4()}"


Expand All @@ -34,8 +36,9 @@ def test_batch_process_documents_with_bad_input(capsys):
location=location,
processor_id=processor_id,
gcs_input_uri=gcs_input_uri,
gcs_output_uri=gcs_output_full_uri_with_wrong_type,
gcs_output_uri_prefix="test",
input_mime_type=input_mime_type,
gcs_output_bucket=gcs_output_full_uri_with_wrong_type,
gcs_output_uri_prefix=gcs_output_uri_prefix,
timeout=450,
)
out, _ = capsys.readouterr()
Expand Down
10 changes: 6 additions & 4 deletions document_ai/snippets/batch_process_documents_sample_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@
project_id = os.environ["GOOGLE_CLOUD_PROJECT"]
processor_id = "90484cfdedb024f6"
gcs_input_uri = "gs://cloud-samples-data/documentai/invoice.pdf"
input_mime_type = "application/pdf"
gcs_output_uri_prefix = uuid4()
BUCKET_NAME = f"document-ai-python-{uuid4()}"

Expand All @@ -50,11 +51,12 @@ def test_batch_process_documents(capsys, test_bucket):
location=location,
processor_id=processor_id,
gcs_input_uri=gcs_input_uri,
gcs_output_uri=f"gs://{test_bucket}",
input_mime_type=input_mime_type,
gcs_output_bucket=f"gs://{test_bucket}",
gcs_output_uri_prefix=gcs_output_uri_prefix,
)
out, _ = capsys.readouterr()

assert "Extracted" in out
assert "Paragraph" in out
assert "Invoice" in out
assert "operation" in out
assert "Fetching" in out
assert "text:" in out
Loading

0 comments on commit bfe4ffc

Please sign in to comment.