Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Vision API OCR PDF/TIFF sample #1420

Merged
merged 12 commits into from
Apr 4, 2018
Merged
Show file tree
Hide file tree
Changes from 8 commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
110 changes: 110 additions & 0 deletions vision/cloud-client/detect/detect_pdf.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,110 @@
#!/usr/bin/env python

# Copyright 2018 Google Inc. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.


"""OCR with PDF/TIFF as source files on GCS

Example:
python detect_pdf.py \
--gcs-source-uri gs://python-docs-samples-tests/HodgeConj.pdf \
--gcs-destination-uri gs://BUCKET_NAME/PREFIX/
"""

import argparse
import re

from google.cloud import storage
from google.cloud import vision_v1p2beta1 as vision
from google.protobuf import json_format


# [START vision_async_detect_document_ocr]
def async_detect_document(gcs_source_uri, gcs_destination_uri,
                          mime_type='application/pdf', batch_size=2,
                          timeout=90):
    """Run asynchronous document OCR on a PDF/TIFF file stored on GCS.

    Submits an asynchronous DOCUMENT_TEXT_DETECTION request, waits for it
    to finish, lists the JSON output files written under the destination
    prefix and prints the full text recognised on the first page.

    Args:
        gcs_source_uri: ``gs://`` URI of the input file.
        gcs_destination_uri: URI of the form ``gs://BUCKET_NAME/PREFIX``
            under which the JSON output files are written.
        mime_type: Type of the input file. Supported mime_types are
            'application/pdf' and 'image/tiff'.
        batch_size: How many pages should be grouped into each JSON output
            file. For example, with a file of 5 pages and batch_size=2,
            three output files are written (pages 1-2, 3-4 and 5).
        timeout: Seconds to wait for the operation to finish.

    Raises:
        ValueError: If ``gcs_destination_uri`` is not of the form
            ``gs://BUCKET_NAME/PREFIX``.
        RuntimeError: If no output file is found under the destination
            prefix once the operation has finished.
    """
    # Validate the destination before submitting the request, so that a
    # malformed URI fails fast instead of after the OCR work has been done.
    match = re.match(r'gs://([^/]+)/(.+)', gcs_destination_uri)
    if match is None:
        raise ValueError(
            'gcs_destination_uri must look like gs://BUCKET_NAME/PREFIX, '
            'got: {!r}'.format(gcs_destination_uri))
    bucket_name, prefix = match.groups()

    client = vision.ImageAnnotatorClient()

    feature = vision.types.Feature(
        type=vision.enums.Feature.Type.DOCUMENT_TEXT_DETECTION)

    gcs_source = vision.types.GcsSource(uri=gcs_source_uri)
    input_config = vision.types.InputConfig(
        gcs_source=gcs_source, mime_type=mime_type)

    gcs_destination = vision.types.GcsDestination(uri=gcs_destination_uri)
    output_config = vision.types.OutputConfig(
        gcs_destination=gcs_destination, batch_size=batch_size)

    async_request = vision.types.AsyncAnnotateFileRequest(
        features=[feature], input_config=input_config,
        output_config=output_config)

    operation = client.async_batch_annotate_files(
        requests=[async_request])

    print('Waiting for the operation to finish.')
    operation.result(timeout=timeout)

    # Once the request has completed and the output has been
    # written to GCS, we can list all the output files.
    storage_client = storage.Client()

    # Passed positionally so the call does not depend on the parameter's
    # name, which is `bucket_or_name` in newer google-cloud-storage releases.
    bucket = storage_client.get_bucket(bucket_name)

    # List objects with the given prefix, skipping "directory" placeholder
    # objects (names ending in '/'), which are not OCR output.
    blob_list = [blob for blob in bucket.list_blobs(prefix=prefix)
                 if not blob.name.endswith('/')]
    print('Output files:')
    for blob in blob_list:
        print(blob.name)

    if not blob_list:
        raise RuntimeError(
            'No output files found under {}'.format(gcs_destination_uri))

    # Process the first output file from GCS.
    # It contains the responses for the first `batch_size` pages
    # of the input file.
    output = blob_list[0]

    json_string = output.download_as_string()
    response = json_format.Parse(
        json_string, vision.types.AnnotateFileResponse())

    # The actual response for the first page of the input file.
    first_page_response = response.responses[0]
    annotation = first_page_response.full_text_annotation

    # Here we print the full text from the first page.
    # The response contains more information:
    # annotation/pages/blocks/paragraphs/words/symbols
    # including confidence score and bounding boxes
    print(u'Full text:\n{}'.format(
        annotation.text))
# [END vision_async_detect_document_ocr]


if __name__ == '__main__':
    # Both URIs are mandatory command-line flags.
    cli = argparse.ArgumentParser()
    for flag in ('--gcs-source-uri', '--gcs-destination-uri'):
        cli.add_argument(flag, required=True)

    options = cli.parse_args()
    async_detect_document(
        options.gcs_source_uri, options.gcs_destination_uri)
36 changes: 36 additions & 0 deletions vision/cloud-client/detect/detect_pdf_test.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
# Copyright 2018 Google Inc. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import os

from google.cloud import storage

from detect_pdf import async_detect_document

# Bucket used for both the input file and the OCR output; it must be
# provided through the CLOUD_STORAGE_BUCKET environment variable.
BUCKET = os.environ['CLOUD_STORAGE_BUCKET']
# Object-name prefix under which the sample writes its output files.
OUTPUT_PREFIX = 'OCR_PDF_TEST_OUTPUT'
# Input document and output location handed to async_detect_document.
GCS_SOURCE_URI = 'gs://{}/HodgeConj.pdf'.format(BUCKET)
GCS_DESTINATION_URI = 'gs://{}/{}/'.format(BUCKET, OUTPUT_PREFIX)


def test_async_detect_document(capsys):
    """Run the OCR sample end to end and check the printed text.

    The output objects written under OUTPUT_PREFIX are deleted even when
    the sample raises or the assertion fails, so a failed run does not
    leave stale output behind for later runs to pick up.
    """
    storage_client = storage.Client()
    bucket = storage_client.get_bucket(BUCKET)

    try:
        async_detect_document(
            gcs_source_uri=GCS_SOURCE_URI,
            gcs_destination_uri=GCS_DESTINATION_URI)
        out, _ = capsys.readouterr()

        assert 'Hodge conjecture' in out
    finally:
        # Clean up everything the sample wrote to the bucket.
        for blob in bucket.list_blobs(prefix=OUTPUT_PREFIX):
            blob.delete()
1 change: 1 addition & 0 deletions vision/cloud-client/detect/requirements.txt
Original file line number Diff line number Diff line change
@@ -1 +1,2 @@
google-cloud-vision==0.30.1
google-cloud-storage==1.6.0