From 81249dba88f95219e47eb463ab59a3448b498a6a Mon Sep 17 00:00:00 2001 From: Yu-Han Liu Date: Wed, 21 Mar 2018 13:48:32 -0700 Subject: [PATCH 01/12] add docpdf sample --- vision/cloud-client/detect/docpdf.py | 89 +++++++++++++++++++++ vision/cloud-client/detect/docpdf_test.py | 36 +++++++++ vision/cloud-client/detect/requirements.txt | 1 + 3 files changed, 126 insertions(+) create mode 100644 vision/cloud-client/detect/docpdf.py create mode 100644 vision/cloud-client/detect/docpdf_test.py diff --git a/vision/cloud-client/detect/docpdf.py b/vision/cloud-client/detect/docpdf.py new file mode 100644 index 000000000000..69ec7e158e89 --- /dev/null +++ b/vision/cloud-client/detect/docpdf.py @@ -0,0 +1,89 @@ +#!/usr/bin/env python + +# Copyright 2018 Google Inc. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +"""OCR with PDF/TIFF as source files on GCS + +Example: + python docpdf.py --gcs-source-uri gs://python-docs-samples-tests/HodgeConj.pdf \ + --gcs-destination-uri gs://BUCKET_NAME/OCR/ +""" + +import argparse +import json +import re + +from google.cloud import vision_v1p2beta1 as vision +from google.cloud import storage +from google.protobuf import json_format + + +def async_detect_document(gcs_source_uri, gcs_destination_uri): + # Supported mime_types are: 'application/pdf' and 'image/tiff' + mime_type = 'application/pdf' + + # How many pages should be grouped into each json output file. + batch_size = 2 + + client = vision.ImageAnnotatorClient() + + feature = vision.types.Feature( + type=vision.enums.Feature.Type.DOCUMENT_TEXT_DETECTION) + + gcs_source = vision.types.GcsSource(uri=gcs_source_uri) + input_config = vision.types.InputConfig( + gcs_source=gcs_source, mime_type=mime_type) + + gcs_destination = vision.types.GcsDestination(uri=gcs_destination_uri) + output_config = vision.types.OutputConfig(gcs_destination=gcs_destination, batch_size=batch_size) + + async_request = vision.types.AsyncAnnotateFileRequest( + features=[feature], input_config=input_config, output_config=output_config) + + operation = client.async_batch_annotate_files( + requests=[async_request]) + + print('Waiting for the operation to finish.') + result = operation.result(90) + + # Retrieve the first output file from GCS + storage_client = storage.Client() + + match = re.match(r'gs://([^/]+)/(.+)', gcs_destination_uri) + bucket_name = match.group(1) + object_name = match.group(2) + 'output-1-to-2.json' + + bucket = storage_client.get_bucket(bucket_name=bucket_name) + blob = bucket.blob(blob_name=object_name) + + # Print the full text from the first page. + # The response additionally includes individual detected symbol's + # confidence and bounding box. + json_string = blob.download_as_string() + response = json.loads(json_string) + + first_page = response['responses'][0] + print(first_page['fullTextAnnotation']['text']) + + +if __name__ == '__main__': + parser = argparse.ArgumentParser() + parser.add_argument('--gcs-source-uri', required=True) + parser.add_argument('--gcs-destination-uri', required=True) + + args = parser.parse_args() + async_detect_document(args.gcs_source_uri, args.gcs_destination_uri) + diff --git a/vision/cloud-client/detect/docpdf_test.py b/vision/cloud-client/detect/docpdf_test.py new file mode 100644 index 000000000000..82d976054189 --- /dev/null +++ b/vision/cloud-client/detect/docpdf_test.py @@ -0,0 +1,36 @@ +# Copyright 2018 Google Inc. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os + +from google.cloud import storage + +from docpdf import async_detect_document + +BUCKET = os.environ['CLOUD_STORAGE_BUCKET'] +OUTPUT_PREFIX = 'OCR_PDF_TEST_OUTPUT' +GCS_SOURCE_URI = 'gs://{}/HodgeConj.pdf'.format(BUCKET) +GCS_DESTINATION_URI = 'gs://{}/{}/'.format(BUCKET, OUTPUT_PREFIX) + + +def test_async_detect_document(capsys): + async_detect_document(gcs_source_uri=GCS_SOURCE_URI, gcs_destination_uri=GCS_DESTINATION_URI) + out, _ = capsys.readouterr() + + assert 'Hodge conjecture' in out + + storage_client = storage.Client() + bucket = storage_client.get_bucket(BUCKET) + for blob in bucket.list_blobs(prefix=OUTPUT_PREFIX): + blob.delete() diff --git a/vision/cloud-client/detect/requirements.txt b/vision/cloud-client/detect/requirements.txt index 80c8a11ca3c2..ca9d0e400b84 100644 --- a/vision/cloud-client/detect/requirements.txt +++ b/vision/cloud-client/detect/requirements.txt @@ -1 +1,2 @@ google-cloud-vision==0.30.1 +google-cloud-storage==1.6.0 From c375496350dd5a2c3298edc5fac134523bdcfceb Mon Sep 17 00:00:00 2001 From: Yu-Han Liu Date: Mon, 26 Mar 2018 16:48:55 -0700 Subject: [PATCH 02/12] import order --- vision/cloud-client/detect/docpdf.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vision/cloud-client/detect/docpdf.py b/vision/cloud-client/detect/docpdf.py index 69ec7e158e89..8dedf46c8f27 100644 --- a/vision/cloud-client/detect/docpdf.py +++ b/vision/cloud-client/detect/docpdf.py @@ -26,8 +26,8 @@ import json import re -from google.cloud import vision_v1p2beta1 as vision from google.cloud import storage +from google.cloud import vision_v1p2beta1 as vision from google.protobuf import json_format From 1e3c9411c11973648f97347e8fc5ff71caac3a01 Mon Sep 17 00:00:00 2001 From: Yu-Han Liu Date: Tue, 27 Mar 2018 11:02:08 -0700 Subject: [PATCH 03/12] list blobs --- vision/cloud-client/detect/docpdf.py | 18 +++++++++++++----- 1 file changed, 13 insertions(+), 5 deletions(-) diff --git a/vision/cloud-client/detect/docpdf.py b/vision/cloud-client/detect/docpdf.py index 8dedf46c8f27..e03005856a4e 100644 --- a/vision/cloud-client/detect/docpdf.py +++ b/vision/cloud-client/detect/docpdf.py @@ -36,6 +36,7 @@ def async_detect_document(gcs_source_uri, gcs_destination_uri): mime_type = 'application/pdf' # How many pages should be grouped into each json output file. + # With a file of 5 pages batch_size = 2 client = vision.ImageAnnotatorClient() @@ -57,22 +58,29 @@ def async_detect_document(gcs_source_uri, gcs_destination_uri): requests=[async_request]) print('Waiting for the operation to finish.') - result = operation.result(90) + result = operation.result(timeout=90) - # Retrieve the first output file from GCS + # Once the request has completed and the output has been + # written to GCS, we can list all the output files. storage_client = storage.Client() match = re.match(r'gs://([^/]+)/(.+)', gcs_destination_uri) bucket_name = match.group(1) - object_name = match.group(2) + 'output-1-to-2.json' + prefix = match.group(2) bucket = storage_client.get_bucket(bucket_name=bucket_name) - blob = bucket.blob(blob_name=object_name) + + # List objects with the given prefix. + blob_list = list(bucket.list_blobs(prefix=prefix)) + print(blob_list) + + #Retrieve the first output file from GCS. + first_output = blob_list[0] # Print the full text from the first page. # The response additionally includes individual detected symbol's # confidence and bounding box. - json_string = blob.download_as_string() + json_string = first_output.download_as_string() response = json.loads(json_string) first_page = response['responses'][0] From a2cd808eb9cc544d0e1e9d834bb259348d2870fc Mon Sep 17 00:00:00 2001 From: Yu-Han Liu Date: Tue, 27 Mar 2018 11:05:35 -0700 Subject: [PATCH 04/12] filename change --- vision/cloud-client/detect/docpdf.py | 97 ----------------------- vision/cloud-client/detect/docpdf_test.py | 36 --------- 2 files changed, 133 deletions(-) delete mode 100644 vision/cloud-client/detect/docpdf.py delete mode 100644 vision/cloud-client/detect/docpdf_test.py diff --git a/vision/cloud-client/detect/docpdf.py b/vision/cloud-client/detect/docpdf.py deleted file mode 100644 index e03005856a4e..000000000000 --- a/vision/cloud-client/detect/docpdf.py +++ /dev/null @@ -1,97 +0,0 @@ -#!/usr/bin/env python - -# Copyright 2018 Google Inc. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - - -"""OCR with PDF/TIFF as source files on GCS - -Example: - python docpdf.py --gcs-source-uri gs://python-docs-samples-tests/HodgeConj.pdf \ - --gcs-destination-uri gs://BUCKET_NAME/OCR/ -""" - -import argparse -import json -import re - -from google.cloud import storage -from google.cloud import vision_v1p2beta1 as vision -from google.protobuf import json_format - - -def async_detect_document(gcs_source_uri, gcs_destination_uri): - # Supported mime_types are: 'application/pdf' and 'image/tiff' - mime_type = 'application/pdf' - - # How many pages should be grouped into each json output file. - # With a file of 5 pages - batch_size = 2 - - client = vision.ImageAnnotatorClient() - - feature = vision.types.Feature( - type=vision.enums.Feature.Type.DOCUMENT_TEXT_DETECTION) - - gcs_source = vision.types.GcsSource(uri=gcs_source_uri) - input_config = vision.types.InputConfig( - gcs_source=gcs_source, mime_type=mime_type) - - gcs_destination = vision.types.GcsDestination(uri=gcs_destination_uri) - output_config = vision.types.OutputConfig(gcs_destination=gcs_destination, batch_size=batch_size) - - async_request = vision.types.AsyncAnnotateFileRequest( - features=[feature], input_config=input_config, output_config=output_config) - - operation = client.async_batch_annotate_files( - requests=[async_request]) - - print('Waiting for the operation to finish.') - result = operation.result(timeout=90) - - # Once the request has completed and the output has been - # written to GCS, we can list all the output files. - storage_client = storage.Client() - - match = re.match(r'gs://([^/]+)/(.+)', gcs_destination_uri) - bucket_name = match.group(1) - prefix = match.group(2) - - bucket = storage_client.get_bucket(bucket_name=bucket_name) - - # List objects with the given prefix. - blob_list = list(bucket.list_blobs(prefix=prefix)) - print(blob_list) - - #Retrieve the first output file from GCS. - first_output = blob_list[0] - - # Print the full text from the first page. - # The response additionally includes individual detected symbol's - # confidence and bounding box. - json_string = first_output.download_as_string() - response = json.loads(json_string) - - first_page = response['responses'][0] - print(first_page['fullTextAnnotation']['text']) - - -if __name__ == '__main__': - parser = argparse.ArgumentParser() - parser.add_argument('--gcs-source-uri', required=True) - parser.add_argument('--gcs-destination-uri', required=True) - - args = parser.parse_args() - async_detect_document(args.gcs_source_uri, args.gcs_destination_uri) - diff --git a/vision/cloud-client/detect/docpdf_test.py b/vision/cloud-client/detect/docpdf_test.py deleted file mode 100644 index 82d976054189..000000000000 --- a/vision/cloud-client/detect/docpdf_test.py +++ /dev/null @@ -1,36 +0,0 @@ -# Copyright 2018 Google Inc. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import os - -from google.cloud import storage - -from docpdf import async_detect_document - -BUCKET = os.environ['CLOUD_STORAGE_BUCKET'] -OUTPUT_PREFIX = 'OCR_PDF_TEST_OUTPUT' -GCS_SOURCE_URI = 'gs://{}/HodgeConj.pdf'.format(BUCKET) -GCS_DESTINATION_URI = 'gs://{}/{}/'.format(BUCKET, OUTPUT_PREFIX) - - -def test_async_detect_document(capsys): - async_detect_document(gcs_source_uri=GCS_SOURCE_URI, gcs_destination_uri=GCS_DESTINATION_URI) - out, _ = capsys.readouterr() - - assert 'Hodge conjecture' in out - - storage_client = storage.Client() - bucket = storage_client.get_bucket(BUCKET) - for blob in bucket.list_blobs(prefix=OUTPUT_PREFIX): - blob.delete() From c3fa66448e0f26865e5ffd07077397124874cb8a Mon Sep 17 00:00:00 2001 From: Yu-Han Liu Date: Tue, 27 Mar 2018 11:22:07 -0700 Subject: [PATCH 05/12] add the renamed files --- vision/cloud-client/detect/detect_pdf.py | 97 +++++++++++++++++++ vision/cloud-client/detect/detect_pdf_test.py | 36 +++++++ 2 files changed, 133 insertions(+) create mode 100644 vision/cloud-client/detect/detect_pdf.py create mode 100644 vision/cloud-client/detect/detect_pdf_test.py diff --git a/vision/cloud-client/detect/detect_pdf.py b/vision/cloud-client/detect/detect_pdf.py new file mode 100644 index 000000000000..e03005856a4e --- /dev/null +++ b/vision/cloud-client/detect/detect_pdf.py @@ -0,0 +1,97 @@ +#!/usr/bin/env python + +# Copyright 2018 Google Inc. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +"""OCR with PDF/TIFF as source files on GCS + +Example: + python docpdf.py --gcs-source-uri gs://python-docs-samples-tests/HodgeConj.pdf \ + --gcs-destination-uri gs://BUCKET_NAME/OCR/ +""" + +import argparse +import json +import re + +from google.cloud import storage +from google.cloud import vision_v1p2beta1 as vision +from google.protobuf import json_format + + +def async_detect_document(gcs_source_uri, gcs_destination_uri): + # Supported mime_types are: 'application/pdf' and 'image/tiff' + mime_type = 'application/pdf' + + # How many pages should be grouped into each json output file. + # With a file of 5 pages + batch_size = 2 + + client = vision.ImageAnnotatorClient() + + feature = vision.types.Feature( + type=vision.enums.Feature.Type.DOCUMENT_TEXT_DETECTION) + + gcs_source = vision.types.GcsSource(uri=gcs_source_uri) + input_config = vision.types.InputConfig( + gcs_source=gcs_source, mime_type=mime_type) + + gcs_destination = vision.types.GcsDestination(uri=gcs_destination_uri) + output_config = vision.types.OutputConfig(gcs_destination=gcs_destination, batch_size=batch_size) + + async_request = vision.types.AsyncAnnotateFileRequest( + features=[feature], input_config=input_config, output_config=output_config) + + operation = client.async_batch_annotate_files( + requests=[async_request]) + + print('Waiting for the operation to finish.') + result = operation.result(timeout=90) + + # Once the request has completed and the output has been + # written to GCS, we can list all the output files. + storage_client = storage.Client() + + match = re.match(r'gs://([^/]+)/(.+)', gcs_destination_uri) + bucket_name = match.group(1) + prefix = match.group(2) + + bucket = storage_client.get_bucket(bucket_name=bucket_name) + + # List objects with the given prefix. + blob_list = list(bucket.list_blobs(prefix=prefix)) + print(blob_list) + + #Retrieve the first output file from GCS. + first_output = blob_list[0] + + # Print the full text from the first page. + # The response additionally includes individual detected symbol's + # confidence and bounding box. + json_string = first_output.download_as_string() + response = json.loads(json_string) + + first_page = response['responses'][0] + print(first_page['fullTextAnnotation']['text']) + + +if __name__ == '__main__': + parser = argparse.ArgumentParser() + parser.add_argument('--gcs-source-uri', required=True) + parser.add_argument('--gcs-destination-uri', required=True) + + args = parser.parse_args() + async_detect_document(args.gcs_source_uri, args.gcs_destination_uri) + diff --git a/vision/cloud-client/detect/detect_pdf_test.py b/vision/cloud-client/detect/detect_pdf_test.py new file mode 100644 index 000000000000..1ba079283b6b --- /dev/null +++ b/vision/cloud-client/detect/detect_pdf_test.py @@ -0,0 +1,36 @@ +# Copyright 2018 Google Inc. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os + +from google.cloud import storage + +from detect_pdf import async_detect_document + +BUCKET = os.environ['CLOUD_STORAGE_BUCKET'] +OUTPUT_PREFIX = 'OCR_PDF_TEST_OUTPUT' +GCS_SOURCE_URI = 'gs://{}/HodgeConj.pdf'.format(BUCKET) +GCS_DESTINATION_URI = 'gs://{}/{}/'.format(BUCKET, OUTPUT_PREFIX) + + +def test_async_detect_document(capsys): + async_detect_document(gcs_source_uri=GCS_SOURCE_URI, gcs_destination_uri=GCS_DESTINATION_URI) + out, _ = capsys.readouterr() + + assert 'Hodge conjecture' in out + + storage_client = storage.Client() + bucket = storage_client.get_bucket(BUCKET) + for blob in bucket.list_blobs(prefix=OUTPUT_PREFIX): + blob.delete() From 8d4274cf2d66dff182659ed2471f470223ba3160 Mon Sep 17 00:00:00 2001 From: Yu-Han Liu Date: Thu, 29 Mar 2018 10:38:30 -0700 Subject: [PATCH 06/12] parse json string to AnnotateFileResponse message --- vision/cloud-client/detect/detect_pdf.py | 38 ++++++++++++++++-------- 1 file changed, 25 insertions(+), 13 deletions(-) diff --git a/vision/cloud-client/detect/detect_pdf.py b/vision/cloud-client/detect/detect_pdf.py index e03005856a4e..fa0bc793d7b2 100644 --- a/vision/cloud-client/detect/detect_pdf.py +++ b/vision/cloud-client/detect/detect_pdf.py @@ -18,12 +18,12 @@ """OCR with PDF/TIFF as source files on GCS Example: - python docpdf.py --gcs-source-uri gs://python-docs-samples-tests/HodgeConj.pdf \ + python detect_pdf.py \ + --gcs-source-uri gs://python-docs-samples-tests/HodgeConj.pdf \ --gcs-destination-uri gs://BUCKET_NAME/OCR/ """ import argparse -import json import re from google.cloud import storage @@ -31,6 +31,7 @@ from google.protobuf import json_format +# [START vision_async_detect_document_ocr] def async_detect_document(gcs_source_uri, gcs_destination_uri): # Supported mime_types are: 'application/pdf' and 'image/tiff' mime_type = 'application/pdf' @@ -49,16 +50,18 @@ def async_detect_document(gcs_source_uri, gcs_destination_uri): gcs_source=gcs_source, mime_type=mime_type) gcs_destination = vision.types.GcsDestination(uri=gcs_destination_uri) - output_config = vision.types.OutputConfig(gcs_destination=gcs_destination, batch_size=batch_size) + output_config = vision.types.OutputConfig( + gcs_destination=gcs_destination, batch_size=batch_size) async_request = vision.types.AsyncAnnotateFileRequest( - features=[feature], input_config=input_config, output_config=output_config) + features=[feature], input_config=input_config, + output_config=output_config) operation = client.async_batch_annotate_files( requests=[async_request]) print('Waiting for the operation to finish.') - result = operation.result(timeout=90) + operation.result(timeout=90) # Once the request has completed and the output has been # written to GCS, we can list all the output files. @@ -72,19 +75,29 @@ def async_detect_document(gcs_source_uri, gcs_destination_uri): # List objects with the given prefix. blob_list = list(bucket.list_blobs(prefix=prefix)) - print(blob_list) + print('Output files:') + for blob in blob_list: + print(blob.name) - #Retrieve the first output file from GCS. - first_output = blob_list[0] + # Process the first output file from GCS. + # Since we specified batch_size=2, the first response contains + # the first two pages of the input file. + output = blob_list[0] + + json_string = output.download_as_string() + response = json_format.Parse( + json_string, vision.types.AnnotateFileResponse()) + + # The actual response for the first page of the input file. + first_page_response = response.responses[0] # Print the full text from the first page. # The response additionally includes individual detected symbol's # confidence and bounding box. - json_string = first_output.download_as_string() - response = json.loads(json_string) + print(u'Full text:\n{}'.format( + first_page_response.full_text_annotation.text)) - first_page = response['responses'][0] - print(first_page['fullTextAnnotation']['text']) +# [END vision_async_detect_document_ocr] if __name__ == '__main__': @@ -94,4 +107,3 @@ def async_detect_document(gcs_source_uri, gcs_destination_uri): args = parser.parse_args() async_detect_document(args.gcs_source_uri, args.gcs_destination_uri) - From e6803e21bae170b372451c8815be6133a2fc8edd Mon Sep 17 00:00:00 2001 From: Yu-Han Liu Date: Thu, 29 Mar 2018 11:26:28 -0700 Subject: [PATCH 07/12] show more of the response --- vision/cloud-client/detect/detect_pdf.py | 22 ++++++++++++++++++++-- 1 file changed, 20 insertions(+), 2 deletions(-) diff --git a/vision/cloud-client/detect/detect_pdf.py b/vision/cloud-client/detect/detect_pdf.py index fa0bc793d7b2..344194dacc33 100644 --- a/vision/cloud-client/detect/detect_pdf.py +++ b/vision/cloud-client/detect/detect_pdf.py @@ -92,11 +92,29 @@ def async_detect_document(gcs_source_uri, gcs_destination_uri): first_page_response = response.responses[0] # Print the full text from the first page. - # The response additionally includes individual detected symbol's - # confidence and bounding box. print(u'Full text:\n{}'.format( first_page_response.full_text_annotation.text)) + # The response additionally includes individual detected symbol's + # confidence and bounding box. + for page in first_page_response.full_text_annotation.pages: + for block in page.blocks: + print('\nBlock confidence: {}\n'.format(block.confidence)) + + for paragraph in block.paragraphs: + print('Paragraph confidence: {}'.format( + paragraph.confidence)) + + for word in paragraph.words: + word_text = ''.join([ + symbol.text for symbol in word.symbols + ]) + print(u'Word text: {} (confidence: {})'.format( + word_text, word.confidence)) + + for symbol in word.symbols: + print(u'\tSymbol: {} (confidence: {})'.format( + symbol.text, symbol.confidence)) # [END vision_async_detect_document_ocr] From da70df0d4c8f32a2c24e0359afa09c0423ec3005 Mon Sep 17 00:00:00 2001 From: Yu-Han Liu Date: Fri, 30 Mar 2018 23:25:15 -0700 Subject: [PATCH 08/12] simplify response processing to better focus on how to make the request --- vision/cloud-client/detect/detect_pdf.py | 31 ++++++------------------ 1 file changed, 7 insertions(+), 24 deletions(-) diff --git a/vision/cloud-client/detect/detect_pdf.py b/vision/cloud-client/detect/detect_pdf.py index 344194dacc33..e48c15cdd64b 100644 --- a/vision/cloud-client/detect/detect_pdf.py +++ b/vision/cloud-client/detect/detect_pdf.py @@ -20,7 +20,7 @@ Example: python detect_pdf.py \ --gcs-source-uri gs://python-docs-samples-tests/HodgeConj.pdf \ - --gcs-destination-uri gs://BUCKET_NAME/OCR/ + --gcs-destination-uri gs://BUCKET_NAME/PREFIX/ """ import argparse @@ -90,31 +90,14 @@ def async_detect_document(gcs_source_uri, gcs_destination_uri): # The actual response for the first page of the input file. first_page_response = response.responses[0] + annotation = first_page_response.full_text_annotation - # Print the full text from the first page. + # Here we print the full text from the first page. + # The response contains more information: + # annotation/pages/blocks/paragraphs/words/symbols + # including conficence score and bounding boxes print(u'Full text:\n{}'.format( - first_page_response.full_text_annotation.text)) - - # The response additionally includes individual detected symbol's - # confidence and bounding box. - for page in first_page_response.full_text_annotation.pages: - for block in page.blocks: - print('\nBlock confidence: {}\n'.format(block.confidence)) - - for paragraph in block.paragraphs: - print('Paragraph confidence: {}'.format( - paragraph.confidence)) - - for word in paragraph.words: - word_text = ''.join([ - symbol.text for symbol in word.symbols - ]) - print(u'Word text: {} (confidence: {})'.format( - word_text, word.confidence)) - - for symbol in word.symbols: - print(u'\tSymbol: {} (confidence: {})'.format( - symbol.text, symbol.confidence)) + annotation.text)) # [END vision_async_detect_document_ocr] From 18276361dab88ba0174c166c3b2a7ae79ace162e Mon Sep 17 00:00:00 2001 From: Yu-Han Liu Date: Tue, 3 Apr 2018 09:26:02 -0700 Subject: [PATCH 09/12] fix typo --- vision/cloud-client/detect/detect_pdf.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vision/cloud-client/detect/detect_pdf.py b/vision/cloud-client/detect/detect_pdf.py index e48c15cdd64b..a728d0522a58 100644 --- a/vision/cloud-client/detect/detect_pdf.py +++ b/vision/cloud-client/detect/detect_pdf.py @@ -95,7 +95,7 @@ def async_detect_document(gcs_source_uri, gcs_destination_uri): # Here we print the full text from the first page. # The response contains more information: # annotation/pages/blocks/paragraphs/words/symbols - # including conficence score and bounding boxes + # including confidence scores and bounding boxes print(u'Full text:\n{}'.format( annotation.text)) # [END vision_async_detect_document_ocr] From fb688ce2cdd9b14e5844ee379a75208e9b10720a Mon Sep 17 00:00:00 2001 From: Yu-Han Liu Date: Tue, 3 Apr 2018 13:09:13 -0700 Subject: [PATCH 10/12] linter --- vision/cloud-client/detect/detect_pdf_test.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/vision/cloud-client/detect/detect_pdf_test.py b/vision/cloud-client/detect/detect_pdf_test.py index 1ba079283b6b..0401f0bd42d1 100644 --- a/vision/cloud-client/detect/detect_pdf_test.py +++ b/vision/cloud-client/detect/detect_pdf_test.py @@ -25,7 +25,8 @@ def test_async_detect_document(capsys): - async_detect_document(gcs_source_uri=GCS_SOURCE_URI, gcs_destination_uri=GCS_DESTINATION_URI) + async_detect_document(gcs_source_uri=GCS_SOURCE_URI, + gcs_destination_uri=GCS_DESTINATION_URI) out, _ = capsys.readouterr() assert 'Hodge conjecture' in out From 2a1c5b3432c1f48e31a98a907a3cb07bef1395ea Mon Sep 17 00:00:00 2001 From: Yu-Han Liu Date: Tue, 3 Apr 2018 14:38:38 -0700 Subject: [PATCH 11/12] linter --- vision/cloud-client/detect/detect_pdf_test.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/vision/cloud-client/detect/detect_pdf_test.py b/vision/cloud-client/detect/detect_pdf_test.py index 0401f0bd42d1..66663ab4da7d 100644 --- a/vision/cloud-client/detect/detect_pdf_test.py +++ b/vision/cloud-client/detect/detect_pdf_test.py @@ -26,7 +26,7 @@ def test_async_detect_document(capsys): async_detect_document(gcs_source_uri=GCS_SOURCE_URI, - gcs_destination_uri=GCS_DESTINATION_URI) + gcs_destination_uri=GCS_DESTINATION_URI) out, _ = capsys.readouterr() assert 'Hodge conjecture' in out @@ -34,4 +34,4 @@ def test_async_detect_document(capsys): storage_client = storage.Client() bucket = storage_client.get_bucket(BUCKET) for blob in bucket.list_blobs(prefix=OUTPUT_PREFIX): - blob.delete() + blob.delete() From 39b19d120e007bdb9b23cfe1393939ae5ce3845f Mon Sep 17 00:00:00 2001 From: Yu-Han Liu Date: Tue, 3 Apr 2018 16:35:03 -0700 Subject: [PATCH 12/12] linter --- vision/cloud-client/detect/detect_pdf_test.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/vision/cloud-client/detect/detect_pdf_test.py b/vision/cloud-client/detect/detect_pdf_test.py index 66663ab4da7d..f0f0b5f7126d 100644 --- a/vision/cloud-client/detect/detect_pdf_test.py +++ b/vision/cloud-client/detect/detect_pdf_test.py @@ -25,7 +25,8 @@ def test_async_detect_document(capsys): - async_detect_document(gcs_source_uri=GCS_SOURCE_URI, + async_detect_document( + gcs_source_uri=GCS_SOURCE_URI, gcs_destination_uri=GCS_DESTINATION_URI) out, _ = capsys.readouterr()