#!/usr/bin/env python

# Copyright 2018 Google Inc. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.


"""OCR with PDF/TIFF as source files on GCS

Example:
    python detect_pdf.py \
        --gcs-source-uri gs://python-docs-samples-tests/HodgeConj.pdf \
        --gcs-destination-uri gs://BUCKET_NAME/PREFIX/
"""

import argparse
import re


# [START vision_async_detect_document_ocr]
def async_detect_document(gcs_source_uri, gcs_destination_uri,
                          mime_type='application/pdf', batch_size=2,
                          timeout=90):
    """Run asynchronous document OCR on a GCS file and print page one's text.

    Submits an async file-annotation request, waits for it to finish, lists
    the JSON output files written under the destination prefix and prints
    the full text detected on the first page of the first output file.

    Args:
        gcs_source_uri: ``gs://`` URI of the input PDF or TIFF file.
        gcs_destination_uri: ``gs://BUCKET_NAME/PREFIX/`` URI under which the
            JSON output files are written. A non-empty prefix is required.
        mime_type: Mime type of the input file. Supported mime_types are
            'application/pdf' and 'image/tiff'.
        batch_size: How many pages should be grouped into each JSON output
            file, e.g. with batch_size=2 a 5-page file is written as three
            JSON files (pages 1-2, 3-4 and 5).
        timeout: Seconds to wait for the operation to finish.

    Raises:
        ValueError: If ``gcs_destination_uri`` is not of the form
            ``gs://BUCKET_NAME/PREFIX/``.
        RuntimeError: If no output file is found under the destination
            prefix once the operation has finished.
    """
    # Split the destination into bucket and prefix *before* starting the
    # long-running operation, so a malformed URI fails immediately instead
    # of after the wait.
    match = re.match(r'gs://([^/]+)/(.+)', gcs_destination_uri)
    if match is None:
        raise ValueError(
            'gcs_destination_uri must look like gs://BUCKET_NAME/PREFIX/, '
            'got: {!r}'.format(gcs_destination_uri))
    bucket_name, prefix = match.groups()

    # Imported at function scope so the snippet between the region tags is
    # self-contained, and only once the arguments are known to be usable.
    from google.cloud import storage
    from google.cloud import vision_v1p2beta1 as vision
    from google.protobuf import json_format

    client = vision.ImageAnnotatorClient()

    feature = vision.types.Feature(
        type=vision.enums.Feature.Type.DOCUMENT_TEXT_DETECTION)

    gcs_source = vision.types.GcsSource(uri=gcs_source_uri)
    input_config = vision.types.InputConfig(
        gcs_source=gcs_source, mime_type=mime_type)

    gcs_destination = vision.types.GcsDestination(uri=gcs_destination_uri)
    output_config = vision.types.OutputConfig(
        gcs_destination=gcs_destination, batch_size=batch_size)

    async_request = vision.types.AsyncAnnotateFileRequest(
        features=[feature], input_config=input_config,
        output_config=output_config)

    operation = client.async_batch_annotate_files(
        requests=[async_request])

    print('Waiting for the operation to finish.')
    operation.result(timeout=timeout)

    # Once the request has completed and the output has been
    # written to GCS, we can list all the output files.
    storage_client = storage.Client()

    # Passed positionally: the parameter name differs between
    # google-cloud-storage releases.
    bucket = storage_client.get_bucket(bucket_name)

    # List objects with the given prefix, skipping zero-byte "folder"
    # placeholder objects (names ending in '/'), which are not JSON output.
    blob_list = [blob for blob in bucket.list_blobs(prefix=prefix)
                 if not blob.name.endswith('/')]
    print('Output files:')
    for blob in blob_list:
        print(blob.name)

    if not blob_list:
        raise RuntimeError(
            'No output files found under {}'.format(gcs_destination_uri))

    # Process the first output file from GCS.
    # It contains the responses for the first `batch_size` pages
    # of the input file.
    output = blob_list[0]

    json_string = output.download_as_string()
    response = json_format.Parse(
        json_string, vision.types.AnnotateFileResponse())

    # The actual response for the first page of the input file.
    first_page_response = response.responses[0]
    annotation = first_page_response.full_text_annotation

    # Here we print the full text from the first page.
    # The response contains more information:
    # annotation/pages/blocks/paragraphs/words/symbols
    # including confidence scores and bounding boxes
    print(u'Full text:\n{}'.format(
        annotation.text))
# [END vision_async_detect_document_ocr]


if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('--gcs-source-uri', required=True)
    parser.add_argument('--gcs-destination-uri', required=True)

    args = parser.parse_args()
    async_detect_document(args.gcs_source_uri, args.gcs_destination_uri)
# NOTE: this test needs the packages pinned in requirements.txt:
#   google-cloud-vision==0.30.1
#   google-cloud-storage==1.6.0

import os

from google.cloud import storage

from detect_pdf import async_detect_document

BUCKET = os.environ['CLOUD_STORAGE_BUCKET']
OUTPUT_PREFIX = 'OCR_PDF_TEST_OUTPUT'
GCS_SOURCE_URI = 'gs://{}/HodgeConj.pdf'.format(BUCKET)
GCS_DESTINATION_URI = 'gs://{}/{}/'.format(BUCKET, OUTPUT_PREFIX)


def _delete_output_blobs():
    """Delete every blob in BUCKET whose name starts with OUTPUT_PREFIX."""
    storage_client = storage.Client()
    bucket = storage_client.get_bucket(BUCKET)
    for blob in bucket.list_blobs(prefix=OUTPUT_PREFIX):
        blob.delete()


def test_async_detect_document(capsys):
    """OCR the sample PDF and check the printed text of its first page."""
    # Remove leftovers from an earlier aborted run so stale output cannot
    # be picked up as this run's first output file.
    _delete_output_blobs()

    try:
        async_detect_document(
            gcs_source_uri=GCS_SOURCE_URI,
            gcs_destination_uri=GCS_DESTINATION_URI)
        out, _ = capsys.readouterr()

        assert 'Hodge conjecture' in out
    finally:
        # Clean up even when the call or the assertion fails, so a failed
        # run does not leave output files behind in the bucket.
        _delete_output_blobs()