Skip to content

Commit

Permalink
Added two samples for "OCR with PDF/TIFF as source files" [(#2034)](G…
Browse files Browse the repository at this point in the history
…oogleCloudPlatform/python-docs-samples#2034)

* Added two samples for "OCR with PDF/TIFF as source files"

* Moved the code to beta_snippets.py

* Fixed the sub-parser names.

* Shortened the line that was too long.

* Added newline at the end of the file

* Using the builtin open function instead

* Renamed a variable

* Fixed the wrong arg parameter

* Added extra comment lines

* Regenerated README.rst

* Added specific strings to be unit-tested
  • Loading branch information
happyhuman authored Mar 12, 2019
1 parent 8b3ee74 commit 1bc0992
Show file tree
Hide file tree
Showing 4 changed files with 144 additions and 4 deletions.
14 changes: 12 additions & 2 deletions samples/snippets/detect/README.rst
Original file line number Diff line number Diff line change
Expand Up @@ -165,7 +165,7 @@ To run this sample:
$ python beta_snippets.py
usage: beta_snippets.py [-h]
{object-localization,object-localization-uri,handwritten-ocr,handwritten-ocr-uri}
{object-localization,object-localization-uri,handwritten-ocr,handwritten-ocr-uri,doc-features,doc-features-uri}
...
Google Cloud Vision API Python Beta Snippets
Expand All @@ -176,12 +176,14 @@ To run this sample:
python beta_snippets.py object-localization-uri gs://...
python beta_snippets.py handwritten-ocr INPUT_IMAGE
python beta_snippets.py handwritten-ocr-uri gs://...
python beta_snippets.py doc-features INPUT_PDF
python beta_snippets.py doc-features-uri gs://...
For more information, the documentation at
https://cloud.google.com/vision/docs.
positional arguments:
{object-localization,object-localization-uri,handwritten-ocr,handwritten-ocr-uri}
{object-localization,object-localization-uri,handwritten-ocr,handwritten-ocr-uri,doc-features,doc-features-uri}
object-localization
Localize objects in the local image. Args: path: The
path to the local file.
Expand All @@ -195,6 +197,14 @@ To run this sample:
Detects handwritten characters in the file located in
Google Cloud Storage. Args: uri: The path to the file
in Google Cloud Storage (gs://...)
doc-features Detects document features in a PDF/TIFF/GIF file.
While your PDF file may have several pages, this API
can process up to 5 pages only. Args: path: The path
to the local file.
doc-features-uri Detects document features in a PDF/TIFF/GIF file.
While your PDF file may have several pages, this API
can process up to 5 pages only. Args: uri: The path to
the file in Google Cloud Storage (gs://...)
optional arguments:
-h, --help show this help message and exit
Expand Down
113 changes: 113 additions & 0 deletions samples/snippets/detect/beta_snippets.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,8 @@
python beta_snippets.py object-localization-uri gs://...
python beta_snippets.py handwritten-ocr INPUT_IMAGE
python beta_snippets.py handwritten-ocr-uri gs://...
python beta_snippets.py doc-features INPUT_PDF
python beta_snippets.py doc-features-uri gs://...
For more information, the documentation at
Expand Down Expand Up @@ -174,6 +176,105 @@ def detect_handwritten_ocr_uri(uri):
# [END vision_handwritten_ocr_gcs_beta]


# [START vision_fulltext_detection_pdf_beta]
def detect_document_features(path, mime_type='application/pdf'):
    """Detects document features in a local PDF/TIFF/GIF file.

    While your PDF file may have several pages,
    this API can process up to 5 pages only.

    Args:
        path: The path to the local file.
        mime_type: MIME type of the file. One of 'application/pdf',
            'image/tiff' or 'image/gif'. Defaults to 'application/pdf'.
    """
    from google.cloud import vision_v1p4beta1 as vision
    client = vision.ImageAnnotatorClient()

    with open(path, 'rb') as pdf_file:
        content = pdf_file.read()

    input_config = vision.types.InputConfig(
        content=content, mime_type=mime_type)

    feature = vision.types.Feature(
        type=vision.enums.Feature.Type.DOCUMENT_TEXT_DETECTION)
    # Annotate the first two pages and the last one (max 5 pages)
    # First page starts at 1, and not 0. Last page is -1.
    pages = [1, 2, -1]

    request = vision.types.AnnotateFileRequest(
        input_config=input_config,
        features=[feature],
        pages=pages)

    response = client.batch_annotate_files(requests=[request])

    # One AnnotateImageResponse per requested page.
    for image_response in response.responses[0].responses:
        for page in image_response.full_text_annotation.pages:
            for block in page.blocks:
                print('\nBlock confidence: {}\n'.format(block.confidence))
                for par in block.paragraphs:
                    print('\tParagraph confidence: {}'.format(par.confidence))
                    for word in par.words:
                        symbol_texts = [symbol.text for symbol in word.symbols]
                        word_text = ''.join(symbol_texts)
                        print('\t\tWord text: {} (confidence: {})'.format(
                            word_text, word.confidence))
                        for symbol in word.symbols:
                            print('\t\t\tSymbol: {} (confidence: {})'.format(
                                symbol.text, symbol.confidence))
# [END vision_fulltext_detection_pdf_beta]


# [START vision_fulltext_detection_pdf_gcs_beta]
def detect_document_features_uri(gcs_uri, mime_type='application/pdf'):
    """Detects document features in a PDF/TIFF/GIF file on Cloud Storage.

    While your PDF file may have several pages,
    this API can process up to 5 pages only.

    Args:
        gcs_uri: The path to the file in Google Cloud Storage (gs://...)
        mime_type: MIME type of the file. One of 'application/pdf',
            'image/tiff' or 'image/gif'. Defaults to 'application/pdf'.
    """
    from google.cloud import vision_v1p4beta1 as vision
    client = vision.ImageAnnotatorClient()

    input_config = vision.types.InputConfig(
        gcs_source=vision.types.GcsSource(uri=gcs_uri), mime_type=mime_type)

    feature = vision.types.Feature(
        type=vision.enums.Feature.Type.DOCUMENT_TEXT_DETECTION)
    # Annotate the first two pages and the last one (max 5 pages)
    # First page starts at 1, and not 0. Last page is -1.
    pages = [1, 2, -1]

    request = vision.types.AnnotateFileRequest(
        input_config=input_config,
        features=[feature],
        pages=pages)

    response = client.batch_annotate_files(requests=[request])

    # One AnnotateImageResponse per requested page.
    for image_response in response.responses[0].responses:
        for page in image_response.full_text_annotation.pages:
            for block in page.blocks:
                print('\nBlock confidence: {}\n'.format(block.confidence))
                for par in block.paragraphs:
                    print('\tParagraph confidence: {}'.format(par.confidence))
                    for word in par.words:
                        symbol_texts = [symbol.text for symbol in word.symbols]
                        word_text = ''.join(symbol_texts)
                        print('\t\tWord text: {} (confidence: {})'.format(
                            word_text, word.confidence))
                        for symbol in word.symbols:
                            print('\t\t\tSymbol: {} (confidence: {})'.format(
                                symbol.text, symbol.confidence))
# [END vision_fulltext_detection_pdf_gcs_beta]


if __name__ == '__main__':
parser = argparse.ArgumentParser(
description=__doc__,
Expand All @@ -196,15 +297,27 @@ def detect_handwritten_ocr_uri(uri):
'handwritten-ocr-uri', help=detect_handwritten_ocr_uri.__doc__)
handwritten_uri_parser.add_argument('uri')

doc_features_parser = subparsers.add_parser(
    'doc-features', help=detect_document_features.__doc__)
doc_features_parser.add_argument('path')

doc_features_uri_parser = subparsers.add_parser(
    'doc-features-uri', help=detect_document_features_uri.__doc__)
doc_features_uri_parser.add_argument('uri')

args = parser.parse_args()

if 'uri' in args.command:
    if 'object-localization-uri' in args.command:
        localize_objects_uri(args.uri)
    elif 'handwritten-ocr-uri' in args.command:
        detect_handwritten_ocr_uri(args.uri)
    # Note: substring match — 'doc-features' matches 'doc-features-uri' here.
    elif 'doc-features' in args.command:
        # Fixed: previously dispatched to detect_handwritten_ocr_uri by mistake.
        detect_document_features_uri(args.uri)
else:
    if 'object-localization' in args.command:
        localize_objects(args.path)
    elif 'handwritten-ocr' in args.command:
        detect_handwritten_ocr(args.path)
    elif 'doc-features' in args.command:
        # Fixed: previously dispatched to detect_handwritten_ocr by mistake.
        detect_document_features(args.path)
21 changes: 19 additions & 2 deletions samples/snippets/detect/beta_snippets_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@
import beta_snippets

RESOURCES = os.path.join(os.path.dirname(__file__), 'resources')
GCS_ROOT = 'gs://cloud-samples-data/vision/'


def test_localize_objects(capsys):
Expand All @@ -28,7 +29,7 @@ def test_localize_objects(capsys):


def test_localize_objects_uri(capsys):
uri = 'gs://cloud-samples-data/vision/puppies.jpg'
uri = GCS_ROOT + 'puppies.jpg'

beta_snippets.localize_objects_uri(uri)

Expand All @@ -46,9 +47,25 @@ def test_handwritten_ocr(capsys):


def test_handwritten_ocr_uri(capsys):
uri = 'gs://cloud-samples-data/vision/handwritten.jpg'
uri = GCS_ROOT + 'handwritten.jpg'

beta_snippets.detect_handwritten_ocr_uri(uri)

out, _ = capsys.readouterr()
assert 'Cloud Vision API' in out


def test_detect_pdf_document(capsys):
    """OCR the bundled PDF fixture and spot-check printed words/symbols."""
    pdf_path = os.path.join(RESOURCES, 'kafka.pdf')
    beta_snippets.detect_document_features(pdf_path)
    output, _ = capsys.readouterr()
    assert 'Symbol: a' in output
    assert 'Word text: evenings' in output


def test_detect_pdf_document_from_gcs(capsys):
    """OCR a PDF hosted on GCS and check that some text was reported."""
    uri = GCS_ROOT + 'document_understanding/kafka.pdf'
    beta_snippets.detect_document_features_uri(uri)
    output, _ = capsys.readouterr()
    assert 'Symbol' in output
    assert 'Word text' in output
Binary file added samples/snippets/detect/resources/kafka.pdf
Binary file not shown.

0 comments on commit 1bc0992

Please sign in to comment.