Skip to content

Commit

Permalink
[DO NOT MERGE] Vision API OCR PDF/TIFF sample (#1420)
Browse files Browse the repository at this point in the history
* add docpdf sample

* import order

* list blobs

* filename change

* add the renamed files

* parse json string to AnnotateFileResponse message

* show more of the response

* simplify response processing to better focus on how to make the request

* fix typo

* linter

* linter

* linter
  • Loading branch information
dizcology authored and chenyumic committed Apr 4, 2018
1 parent f427368 commit 7405c00
Show file tree
Hide file tree
Showing 3 changed files with 149 additions and 0 deletions.
110 changes: 110 additions & 0 deletions vision/cloud-client/detect/detect_pdf.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,110 @@
#!/usr/bin/env python

# Copyright 2018 Google Inc. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.


"""OCR with PDF/TIFF as source files on GCS
Example:
python detect_pdf.py \
--gcs-source-uri gs://python-docs-samples-tests/HodgeConj.pdf \
--gcs-destination-uri gs://BUCKET_NAME/PREFIX/
"""

import argparse
import re

from google.cloud import storage
from google.cloud import vision_v1p2beta1 as vision
from google.protobuf import json_format


# [START vision_async_detect_document_ocr]
def async_detect_document(gcs_source_uri, gcs_destination_uri):
    """OCR a PDF/TIFF file stored on GCS and print the first page's text.

    Sends an asynchronous DOCUMENT_TEXT_DETECTION request for the source
    file, waits for it to finish, lists the JSON output files written under
    the destination prefix, then parses the first output file and prints
    the full text of its first page.

    Args:
        gcs_source_uri: GCS URI of the input file,
            e.g. gs://BUCKET_NAME/file.pdf
        gcs_destination_uri: GCS URI prefix under which the JSON output is
            written, of the form gs://BUCKET_NAME/PREFIX/

    Raises:
        ValueError: if gcs_destination_uri is not of the form
            gs://BUCKET_NAME/PREFIX
        IndexError: if no output file is found under the destination prefix.
    """
    # Validate the destination before starting the (slow) OCR operation, so
    # that a malformed URI fails immediately with a clear message instead of
    # crashing after the work has already been done.
    match = re.match(r'gs://([^/]+)/(.+)', gcs_destination_uri)
    if match is None:
        raise ValueError(
            'gcs_destination_uri must look like gs://BUCKET_NAME/PREFIX/, '
            'got: {!r}'.format(gcs_destination_uri))
    bucket_name = match.group(1)
    prefix = match.group(2)

    # Supported mime_types are: 'application/pdf' and 'image/tiff'
    mime_type = 'application/pdf'

    # How many pages should be grouped into each json output file.
    # With a file of 5 pages and batch_size = 2, the pages are grouped
    # as 2 + 2 + 1, i.e. three output files.
    batch_size = 2

    client = vision.ImageAnnotatorClient()

    feature = vision.types.Feature(
        type=vision.enums.Feature.Type.DOCUMENT_TEXT_DETECTION)

    gcs_source = vision.types.GcsSource(uri=gcs_source_uri)
    input_config = vision.types.InputConfig(
        gcs_source=gcs_source, mime_type=mime_type)

    gcs_destination = vision.types.GcsDestination(uri=gcs_destination_uri)
    output_config = vision.types.OutputConfig(
        gcs_destination=gcs_destination, batch_size=batch_size)

    async_request = vision.types.AsyncAnnotateFileRequest(
        features=[feature], input_config=input_config,
        output_config=output_config)

    operation = client.async_batch_annotate_files(
        requests=[async_request])

    # Block until the operation completes, waiting at most 90 seconds.
    print('Waiting for the operation to finish.')
    operation.result(timeout=90)

    # Once the request has completed and the output has been
    # written to GCS, we can list all the output files.
    storage_client = storage.Client()

    # The bucket name is passed positionally so the call does not depend on
    # the name of the parameter in the installed google-cloud-storage.
    bucket = storage_client.get_bucket(bucket_name)

    # List objects with the given prefix.
    blob_list = list(bucket.list_blobs(prefix=prefix))
    print('Output files:')
    for blob in blob_list:
        print(blob.name)

    if not blob_list:
        raise IndexError(
            'No output files found under {}'.format(gcs_destination_uri))

    # Process the first output file from GCS.
    # Since we specified batch_size=2, the first response contains
    # the first two pages of the input file.
    output = blob_list[0]

    json_string = output.download_as_string()
    response = json_format.Parse(
        json_string, vision.types.AnnotateFileResponse())

    # The actual response for the first page of the input file.
    first_page_response = response.responses[0]
    annotation = first_page_response.full_text_annotation

    # Here we print the full text from the first page.
    # The response contains more information:
    # annotation/pages/blocks/paragraphs/words/symbols
    # including confidence scores and bounding boxes
    print(u'Full text:\n{}'.format(
        annotation.text))
# [END vision_async_detect_document_ocr]


if __name__ == '__main__':
    # Both URIs are mandatory command-line flags; see the module docstring
    # for an example invocation.
    cli = argparse.ArgumentParser()
    for flag in ('--gcs-source-uri', '--gcs-destination-uri'):
        cli.add_argument(flag, required=True)

    options = cli.parse_args()
    async_detect_document(
        options.gcs_source_uri,
        options.gcs_destination_uri)
38 changes: 38 additions & 0 deletions vision/cloud-client/detect/detect_pdf_test.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@
# Copyright 2018 Google Inc. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import os

from google.cloud import storage

from detect_pdf import async_detect_document

# Name of the GCS bucket used by the test, read from the
# CLOUD_STORAGE_BUCKET environment variable (importing this module fails
# with KeyError when the variable is not set).
BUCKET = os.environ['CLOUD_STORAGE_BUCKET']
# Object-name prefix under which the OCR output is written; the test also
# uses it to find and delete that output afterwards.
OUTPUT_PREFIX = 'OCR_PDF_TEST_OUTPUT'
# Input document for the OCR request.
# NOTE(review): presumably HodgeConj.pdf must already exist in BUCKET.
GCS_SOURCE_URI = 'gs://{}/HodgeConj.pdf'.format(BUCKET)
# Destination prefix handed to the sample: gs://BUCKET/OUTPUT_PREFIX/
GCS_DESTINATION_URI = 'gs://{}/{}/'.format(BUCKET, OUTPUT_PREFIX)


def test_async_detect_document(capsys):
    """Run the OCR sample end to end and check the printed text.

    The sample writes its JSON output under OUTPUT_PREFIX in BUCKET. That
    output is deleted in a ``finally`` clause so that a failed assertion (or
    an error raised by the sample) does not leave stale files behind for the
    next run to pick up.
    """
    try:
        async_detect_document(
            gcs_source_uri=GCS_SOURCE_URI,
            gcs_destination_uri=GCS_DESTINATION_URI)
        out, _ = capsys.readouterr()

        assert 'Hodge conjecture' in out
    finally:
        # Always remove whatever the sample wrote to the output prefix.
        storage_client = storage.Client()
        bucket = storage_client.get_bucket(BUCKET)
        for blob in bucket.list_blobs(prefix=OUTPUT_PREFIX):
            blob.delete()
1 change: 1 addition & 0 deletions vision/cloud-client/detect/requirements.txt
Original file line number Diff line number Diff line change
@@ -1 +1,2 @@
google-cloud-vision==0.30.1
google-cloud-storage==1.6.0

0 comments on commit 7405c00

Please sign in to comment.