docs(samples): new Doc AI samples for v1beta3 (#44)

* batch_process_sample. changing from async to synchronous * add quick start and process_document samples and tests * add test and sample for batch_process * add test and sample for batch_process * resolve formatting * use os.environ * remove os.path.join * move tests * descriptive variable * specific Exception, formatting * parse all pages in process_document * add more helpful comments * remove unused imports * better exception handling * rename test files * ran linter, removed nested function in batch predict * refactor tests * format imports * format imports * format imports * serialize as Document object * extract get_text helper function * fix file path * delete test bucket * Update samples/snippets/batch_process_documents_sample_v1beta3_test.py Co-authored-by: Leah E. Cole <6719667+leahecole@users.noreply.github.com> * Update samples/snippets/batch_process_documents_sample_v1beta3_test.py Co-authored-by: Leah E. Cole <6719667+leahecole@users.noreply.github.com> * add more specific assertion in batch_process * add more specific assertion in process_document and quickstart * fix output_uri name * Apply suggestions from code review to resolve exception Co-authored-by: Leah E. Cole <6719667+leahecole@users.noreply.github.com> * resolve exception * lint Co-authored-by: Leah E. Cole <6719667+leahecole@users.noreply.github.com>
GoogleCloudPlatform · Oct 21, 2020 · d8068bc · d8068bc
1 parent 5d1ef63
commit d8068bc
Show file tree

Hide file tree

Showing 9 changed files with 439 additions and 15 deletions.
diff --git a/documentai/snippets/__init__.py b/documentai/snippets/__init__.py
diff --git a/documentai/snippets/batch_process_documents_sample_v1beta3.py b/documentai/snippets/batch_process_documents_sample_v1beta3.py
@@ -0,0 +1,121 @@
+# Copyright 2020 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+# [START documentai_batch_process_document]
+import re
+
+from google.cloud import documentai_v1beta3 as documentai
+from google.cloud import storage
+
+# TODO(developer): Uncomment these variables before running the sample.
+# project_id = 'YOUR_PROJECT_ID'
+# location = 'YOUR_PROJECT_LOCATION' # Format is 'us' or 'eu'
+# processor_id = 'YOUR_PROCESSOR_ID' # Create processor in Cloud Console
+# gcs_input_uri = "YOUR_INPUT_URI"
+# gcs_output_uri = "YOUR_OUTPUT_BUCKET_URI"
+# gcs_output_uri_prefix = "YOUR_OUTPUT_URI_PREFIX"
+
+
+def batch_process_documents(
+    project_id,
+    location,
+    processor_id,
+    gcs_input_uri,
+    gcs_output_uri,
+    gcs_output_uri_prefix,
+):
+    """Run a Document AI batch job on one GCS file and print what was extracted.
+
+    The processor writes its results to
+    ``{gcs_output_uri}/{gcs_output_uri_prefix}/``. Once the operation has
+    finished, every blob under that prefix is downloaded, parsed as a
+    Document, and its form fields and paragraphs are printed.
+
+    Args:
+        project_id: Google Cloud project used in the processor resource name.
+        location: Processor location, 'us' or 'eu'.
+        processor_id: ID of a processor already created in the Cloud Console.
+        gcs_input_uri: gs:// URI of the input file (sent as application/pdf).
+        gcs_output_uri: gs:// URI of the output bucket, e.g. "gs://my-bucket".
+        gcs_output_uri_prefix: Path prefix inside the output bucket under
+            which the results are written.
+
+    Raises:
+        ValueError: If the combined output URI is not of the form
+            gs://<bucket>/<prefix>.
+    """
+    destination_uri = f"{gcs_output_uri}/{gcs_output_uri_prefix}/"
+
+    # Split the destination into bucket and prefix up front, so that a
+    # malformed URI fails before the batch operation is started.
+    match = re.match(r"gs://([^/]+)/(.+)", destination_uri)
+    if match is None:
+        raise ValueError(
+            f"Output URI must look like gs://<bucket>/<prefix>, got {destination_uri!r}"
+        )
+    output_bucket, prefix = match.groups()
+
+    client = documentai.DocumentProcessorServiceClient()
+
+    # 'mime_type' can be 'application/pdf', 'image/tiff',
+    # 'image/gif', or 'application/json'
+    input_config = documentai.types.document_processor_service.BatchProcessRequest.BatchInputConfig(
+        gcs_source=gcs_input_uri, mime_type="application/pdf"
+    )
+
+    # Where to write results
+    output_config = documentai.types.document_processor_service.BatchProcessRequest.BatchOutputConfig(
+        gcs_destination=destination_uri
+    )
+
+    # Location can be 'us' or 'eu'
+    name = f"projects/{project_id}/locations/{location}/processors/{processor_id}"
+    request = documentai.types.document_processor_service.BatchProcessRequest(
+        name=name,
+        input_configs=[input_config],
+        output_config=output_config,
+    )
+
+    operation = client.batch_process_documents(request)
+
+    # Wait for the operation to finish
+    operation.result()
+
+    # Results are written to GCS; list everything under the output prefix.
+    storage_client = storage.Client()
+    bucket = storage_client.get_bucket(output_bucket)
+    blob_list = list(bucket.list_blobs(prefix=prefix))
+    print("Output files:")
+
+    for i, blob in enumerate(blob_list):
+        # Download the contents of this blob as a bytes object.
+        blob_as_bytes = blob.download_as_bytes()
+        document = documentai.types.Document.from_json(blob_as_bytes)
+
+        print(f"Fetched file {i + 1}")
+
+        # For a full list of Document object attributes, please reference this page: https://googleapis.dev/python/documentai/latest/_modules/google/cloud/documentai_v1beta3/types/document.html#Document
+
+        # Read the text recognition output from the processor
+        for page in document.pages:
+            for form_field in page.form_fields:
+                field_name = get_text(form_field.field_name, document)
+                field_value = get_text(form_field.field_value, document)
+                print("Extracted key value pair:")
+                print(f"\t{field_name}, {field_value}")
+            # Paragraphs belong to the current page, not to the document.
+            for paragraph in page.paragraphs:
+                paragraph_text = get_text(paragraph.layout, document)
+                print(f"Paragraph text:\n{paragraph_text}")
+
+
+# Extract shards from the text field
+def get_text(doc_element: dict, document: dict) -> str:
+    """
+    Document AI identifies form fields by their offsets
+    in document text. This function converts offsets
+    to text snippets.
+
+    Args:
+        doc_element: Element exposing ``text_anchor.text_segments``; each
+            segment carries ``start_index`` and ``end_index`` offsets.
+        document: Document whose ``text`` attribute the offsets refer to.
+
+    Returns:
+        The concatenation of ``document.text[start_index:end_index]`` for
+        every segment, in order; an empty string if there are no segments.
+    """
+    # If a text segment spans several lines, it will
+    # be stored in different text segments.
+    # An unset start_index reads as 0 (proto3 scalar default), so the
+    # segment's own offset can be used directly without a presence check.
+    return "".join(
+        document.text[int(segment.start_index) : int(segment.end_index)]
+        for segment in doc_element.text_anchor.text_segments
+    )
+
+
+# [END documentai_batch_process_document]
diff --git a/documentai/snippets/batch_process_documents_sample_v1beta3_test.py b/documentai/snippets/batch_process_documents_sample_v1beta3_test.py
@@ -0,0 +1,62 @@
+# Copyright 2020 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+import os
+from uuid import uuid4
+
+from google.cloud import storage
+from google.cloud.exceptions import NotFound
+
+import pytest
+
+from samples.snippets import batch_process_documents_sample_v1beta3
+
+# Shared inputs for the batch-processing sample test.
+location = "us"
+# Raises KeyError at import time if GOOGLE_CLOUD_PROJECT is not set.
+project_id = os.environ["GOOGLE_CLOUD_PROJECT"]
+# NOTE(review): presumably a processor pre-created in the test project - confirm.
+processor_id = "90484cfdedb024f6"
+gcs_input_uri = "gs://cloud-samples-data/documentai/invoice.pdf"
+# Random prefix and bucket name, generated once per import of this module.
+# The prefix is a UUID object, not a str; the sample formats it into an f-string.
+gcs_output_uri_prefix = uuid4()
+BUCKET_NAME = f"document-ai-python-{uuid4()}"
+
+
+@pytest.fixture(scope="module")
+def test_bucket():
+    """Create the output bucket, yield its name, then empty and delete it."""
+    output_bucket = storage.Client().create_bucket(BUCKET_NAME)
+    yield output_bucket.name
+
+    # Teardown: remove every blob, then the bucket itself. A bucket that is
+    # already gone is reported rather than treated as a failure.
+    try:
+        for leftover in list(output_bucket.list_blobs()):
+            leftover.delete()
+        output_bucket.delete()
+    except NotFound:
+        print("Bucket already deleted.")
+
+
+def test_batch_process_documents(capsys, test_bucket):
+    """Run the batch sample against the test bucket and check its stdout."""
+    sample_kwargs = {
+        "project_id": project_id,
+        "location": location,
+        "processor_id": processor_id,
+        "gcs_input_uri": gcs_input_uri,
+        "gcs_output_uri": f"gs://{test_bucket}",
+        "gcs_output_uri_prefix": gcs_output_uri_prefix,
+    }
+    batch_process_documents_sample_v1beta3.batch_process_documents(**sample_kwargs)
+
+    printed = capsys.readouterr().out
+
+    # The sample must have printed form fields, paragraphs, and invoice text.
+    for expected in ("Extracted", "Paragraph", "Invoice"):
+        assert expected in printed
diff --git a/documentai/snippets/noxfile.py b/documentai/snippets/noxfile.py
@@ -37,24 +37,22 @@
 
 TEST_CONFIG = {
     # You can opt out from the test for specific Python versions.
-    'ignored_versions': ["2.7"],
-
+    "ignored_versions": ["2.7"],
     # An envvar key for determining the project id to use. Change it
     # to 'BUILD_SPECIFIC_GCLOUD_PROJECT' if you want to opt in using a
     # build specific Cloud project. You can also use your own string
     # to use your own Cloud project.
-    'gcloud_project_env': 'GOOGLE_CLOUD_PROJECT',
+    "gcloud_project_env": "GOOGLE_CLOUD_PROJECT",
     # 'gcloud_project_env': 'BUILD_SPECIFIC_GCLOUD_PROJECT',
-
     # A dictionary you want to inject into your test. Don't put any
     # secrets here. These values will override predefined values.
-    'envs': {},
+    "envs": {},
 }
 
 
 try:
     # Ensure we can import noxfile_config in the project's directory.
-    sys.path.append('.')
+    sys.path.append(".")
     from noxfile_config import TEST_CONFIG_OVERRIDE
 except ImportError as e:
     print("No user noxfile_config found: detail: {}".format(e))
@@ -69,13 +67,13 @@ def get_pytest_env_vars():
     ret = {}
 
     # Override the GCLOUD_PROJECT and the alias.
-    env_key = TEST_CONFIG['gcloud_project_env']
+    env_key = TEST_CONFIG["gcloud_project_env"]
     # This should error out if not set.
-    ret['GOOGLE_CLOUD_PROJECT'] = os.environ[env_key]
-    ret['GCLOUD_PROJECT'] = os.environ[env_key]  # deprecated
+    ret["GOOGLE_CLOUD_PROJECT"] = os.environ[env_key]
+    ret["GCLOUD_PROJECT"] = os.environ[env_key]  # deprecated
 
     # Apply user supplied envs.
-    ret.update(TEST_CONFIG['envs'])
+    ret.update(TEST_CONFIG["envs"])
     return ret
 
 
@@ -84,7 +82,7 @@ def get_pytest_env_vars():
 ALL_VERSIONS = ["2.7", "3.6", "3.7", "3.8"]
 
 # Any default versions that should be ignored.
-IGNORED_VERSIONS = TEST_CONFIG['ignored_versions']
+IGNORED_VERSIONS = TEST_CONFIG["ignored_versions"]
 
 TESTED_VERSIONS = sorted([v for v in ALL_VERSIONS if v not in IGNORED_VERSIONS])
 
@@ -138,7 +136,7 @@ def lint(session):
     args = FLAKE8_COMMON_ARGS + [
         "--application-import-names",
         ",".join(local_names),
-        "."
+        ".",
     ]
     session.run("flake8", *args)
 
@@ -147,6 +145,7 @@ def lint(session):
 # Black
 #
 
+
 @nox.session
 def blacken(session):
     session.install("black")
@@ -194,9 +193,9 @@ def py(session):
     if session.python in TESTED_VERSIONS:
         _session_tests(session)
     else:
-        session.skip("SKIPPED: {} tests are disabled for this sample.".format(
-            session.python
-        ))
+        session.skip(
+            "SKIPPED: {} tests are disabled for this sample.".format(session.python)
+        )
 
 
 #

diff --git a/documentai/snippets/process_document_sample_v1beta3.py b/documentai/snippets/process_document_sample_v1beta3.py
@@ -0,0 +1,88 @@
+# Copyright 2020 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+from google.cloud import documentai_v1beta3 as documentai
+
+# [START documentai_process_document]
+
+# TODO(developer): Uncomment these variables before running the sample.
+# project_id = 'YOUR_PROJECT_ID'
+# location = 'YOUR_PROJECT_LOCATION'  # Format is 'us' or 'eu'
+# processor_id = 'YOUR_PROCESSOR_ID'  # Create processor in Cloud Console
+# file_path = '/path/to/local/pdf'
+
+
+def process_document_sample(
+    project_id: str, location: str, processor_id: str, file_path: str
+):
+    """Send a local PDF to a Document AI processor and print its paragraphs.
+
+    Args:
+        project_id: Google Cloud project used in the processor resource name.
+        location: Processor location, 'us' or 'eu'.
+        processor_id: ID of a processor already created in the Cloud Console.
+        file_path: Path to the local file; it is sent as application/pdf.
+    """
+    # Instantiates a client
+    client = documentai.DocumentProcessorServiceClient()
+
+    # The full resource name of the processor, e.g.:
+    # projects/project-id/locations/location/processors/processor-id
+    # You must create new processors in the Cloud Console first
+    processor_name = (
+        f"projects/{project_id}/locations/{location}/processors/{processor_id}"
+    )
+
+    # Read the file into memory
+    with open(file_path, "rb") as pdf_file:
+        raw_bytes = pdf_file.read()
+
+    # Configure the process request and recognize text entities in the PDF
+    response = client.process_document(
+        request={
+            "name": processor_name,
+            "document": {"content": raw_bytes, "mime_type": "application/pdf"},
+        }
+    )
+    processed = response.document
+
+    print("Document processing complete.")
+
+    # For a full list of Document object attributes, please reference this page: https://googleapis.dev/python/documentai/latest/_modules/google/cloud/documentai_v1beta3/types/document.html#Document
+
+    # Read the text recognition output from the processor
+    print("The document contains the following paragraphs:")
+    for page in processed.pages:
+        for paragraph in page.paragraphs:
+            paragraph_text = get_text(paragraph.layout, processed)
+            print(f"Paragraph text: {paragraph_text}")
+
+
+# Extract shards from the text field
+def get_text(doc_element: dict, document: dict) -> str:
+    """
+    Document AI identifies form fields by their offsets
+    in document text. This function converts offsets
+    to text snippets.
+
+    Args:
+        doc_element: Element exposing ``text_anchor.text_segments``; each
+            segment carries ``start_index`` and ``end_index`` offsets.
+        document: Document whose ``text`` attribute the offsets refer to.
+
+    Returns:
+        The concatenation of ``document.text[start_index:end_index]`` for
+        every segment, in order; an empty string if there are no segments.
+    """
+    # If a text segment spans several lines, it will
+    # be stored in different text segments.
+    # An unset start_index reads as 0 (proto3 scalar default), so the
+    # segment's own offset can be used directly without a presence check.
+    return "".join(
+        document.text[int(segment.start_index) : int(segment.end_index)]
+        for segment in doc_element.text_anchor.text_segments
+    )
+
+
+# [END documentai_process_document]
diff --git a/documentai/snippets/process_document_sample_v1beta3_test.py b/documentai/snippets/process_document_sample_v1beta3_test.py
@@ -0,0 +1,37 @@
+# Copyright 2020 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+import os
+
+from samples.snippets import process_document_sample_v1beta3
+
+
+# Shared inputs for the online-processing sample test.
+location = "us"
+# Raises KeyError at import time if GOOGLE_CLOUD_PROJECT is not set.
+project_id = os.environ["GOOGLE_CLOUD_PROJECT"]
+# NOTE(review): presumably a processor pre-created in the test project - confirm.
+processor_id = "90484cfdedb024f6"
+# Relative path, so it resolves against the directory the tests are run from.
+file_path = "resources/invoice.pdf"
+
+
+def test_process_documents(capsys):
+    """Run the online-processing sample on the local invoice and check stdout."""
+    sample_kwargs = {
+        "project_id": project_id,
+        "location": location,
+        "processor_id": processor_id,
+        "file_path": file_path,
+    }
+    process_document_sample_v1beta3.process_document_sample(**sample_kwargs)
+
+    printed = capsys.readouterr().out
+
+    # The sample must have printed paragraph headers and invoice text.
+    for expected in ("Paragraph", "Invoice"):
+        assert expected in printed