Skip to content

Commit

Permalink
[DI] Fix layout extracting issues (#35913)
Browse files Browse the repository at this point in the history
YalinLi0312 authored Jun 26, 2024
1 parent 48590d5 commit 15c50b7
Showing 14 changed files with 268 additions and 80 deletions.
51 changes: 37 additions & 14 deletions sdk/documentintelligence/azure-ai-documentintelligence/README.md
Original file line number Diff line number Diff line change
@@ -237,30 +237,47 @@ for page in result.pages:
words = get_words(page, line)
print(
f"...Line # {line_idx} has word count {len(words)} and text '{line.content}' "
f"within bounding polygon '{line.polygon}'"
f"within bounding polygon '{format_polygon(line.polygon)}'"
)

for word in words:
print(f"......Word '{word.content}' has a confidence of {word.confidence}")
if page.words:
for word in page.words:
print(f"......Word '{word.content}' has a confidence of {word.confidence}")

if page.selection_marks:
for selection_mark in page.selection_marks:
print(
f"Selection mark is '{selection_mark.state}' within bounding polygon "
f"'{selection_mark.polygon}' and has a confidence of {selection_mark.confidence}"
f"'{format_polygon(selection_mark.polygon)}' and has a confidence of {selection_mark.confidence}"
)

if result.paragraphs:
print(f"----Detected #{len(result.paragraphs)} paragraphs in the document----")
# Sort all paragraphs by span's offset to read in the right order.
result.paragraphs.sort(key=lambda p: (p.spans.sort(key=lambda s: s.offset), p.spans[0].offset))
print("-----Print sorted paragraphs-----")
for paragraph in result.paragraphs:
print(
f"Found paragraph with role: '{paragraph.role}' within {format_bounding_region(paragraph.bounding_regions)} bounding region"
)
print(f"...with content: '{paragraph.content}'")
print(f"...with offset: {paragraph.spans[0].offset} and length: {paragraph.spans[0].length}")

if result.tables:
for table_idx, table in enumerate(result.tables):
print(f"Table # {table_idx} has {table.row_count} rows and " f"{table.column_count} columns")
if table.bounding_regions:
for region in table.bounding_regions:
print(f"Table # {table_idx} location on page: {region.page_number} is {region.polygon}")
print(
f"Table # {table_idx} location on page: {region.page_number} is {format_polygon(region.polygon)}"
)
for cell in table.cells:
print(f"...Cell[{cell.row_index}][{cell.column_index}] has text '{cell.content}'")
if cell.bounding_regions:
for region in cell.bounding_regions:
print(f"...content on page {region.page_number} is within bounding polygon '{region.polygon}'")
print(
f"...content on page {region.page_number} is within bounding polygon '{format_polygon(region.polygon)}'"
)

print("----------------------------------------")
```
@@ -302,11 +319,14 @@ print("----Key-value pairs found in document----")
if result.key_value_pairs:
for kv_pair in result.key_value_pairs:
if kv_pair.key:
print(f"Key '{kv_pair.key.content}' found within " f"'{kv_pair.key.bounding_regions}' bounding regions")
print(
f"Key '{kv_pair.key.content}' found within "
f"'{format_bounding_region(kv_pair.key.bounding_regions)}' bounding regions"
)
if kv_pair.value:
print(
f"Value '{kv_pair.value.content}' found within "
f"'{kv_pair.value.bounding_regions}' bounding regions\n"
f"'{format_bounding_region(kv_pair.value.bounding_regions)}' bounding regions\n"
)

for page in result.pages:
@@ -318,17 +338,18 @@ for page in result.pages:
words = get_words(page.words, line)
print(
f"...Line #{line_idx} has {len(words)} words and text '{line.content}' within "
f"bounding polygon '{line.polygon}'"
f"bounding polygon '{format_polygon(line.polygon)}'"
)

for word in words:
print(f"......Word '{word.content}' has a confidence of {word.confidence}")
if page.words:
for word in page.words:
print(f"......Word '{word.content}' has a confidence of {word.confidence}")

if page.selection_marks:
for selection_mark in page.selection_marks:
print(
f"Selection mark is '{selection_mark.state}' within bounding polygon "
f"'{selection_mark.polygon}' and has a confidence of "
f"'{format_polygon(selection_mark.polygon)}' and has a confidence of "
f"{selection_mark.confidence}"
)

@@ -337,13 +358,15 @@ if result.tables:
print(f"Table # {table_idx} has {table.row_count} rows and {table.column_count} columns")
if table.bounding_regions:
for region in table.bounding_regions:
print(f"Table # {table_idx} location on page: {region.page_number} is {region.polygon}")
print(
f"Table # {table_idx} location on page: {region.page_number} is {format_polygon(region.polygon)}"
)
for cell in table.cells:
print(f"...Cell[{cell.row_index}][{cell.column_index}] has text '{cell.content}'")
if cell.bounding_regions:
for region in cell.bounding_regions:
print(
f"...content on page {region.page_number} is within bounding polygon '{region.polygon}'\n"
f"...content on page {region.page_number} is within bounding polygon '{format_polygon(region.polygon)}'\n"
)
print("----------------------------------------")
```
Original file line number Diff line number Diff line change
@@ -43,6 +43,12 @@
import os


def format_polygon(polygon):
    """Render a flat list of coordinates as a string of bracketed pairs.

    Elements are paired by position (0 with 1, 2 with 3, ...); each pair is
    written as "[a, b]" and the pairs are joined with ", ". A falsy polygon
    (``None`` or empty) produces the placeholder "N/A".
    """
    if polygon:
        points = []
        for start in range(0, len(polygon), 2):
            points.append(f"[{polygon[start]}, {polygon[start + 1]}]")
        return ", ".join(points)
    return "N/A"


async def analyze_barcodes():
path_to_sample_documents = os.path.abspath(
os.path.join(
@@ -82,7 +88,7 @@ async def analyze_barcodes():
print(f"- Barcode #{barcode_idx}: {barcode.value}")
print(f" Kind: {barcode.kind}")
print(f" Confidence: {barcode.confidence}")
print(f" Bounding regions: {barcode.polygon}")
print(f" Bounding regions: {format_polygon(barcode.polygon)}")

print("----------------------------------------")
# [END analyze_barcodes]
Original file line number Diff line number Diff line change
@@ -43,6 +43,12 @@
import os


def format_polygon(polygon):
    """Return the polygon's values grouped two at a time, e.g. "[1, 2], [3, 4]".

    Returns "N/A" when ``polygon`` is ``None`` or empty.
    """
    if not polygon:
        return "N/A"
    chunks = []
    pos = 0
    total = len(polygon)
    # Walk the flat list two entries at a time.
    while pos < total:
        chunks.append(f"[{polygon[pos]}, {polygon[pos + 1]}]")
        pos += 2
    return ", ".join(chunks)


async def analyze_formulas():
path_to_sample_documents = os.path.abspath(
os.path.join(
@@ -85,13 +91,13 @@ async def analyze_formulas():
for formula_idx, formula in enumerate(inline_formulas):
print(f"- Inline #{formula_idx}: {formula.value}")
print(f" Confidence: {formula.confidence}")
print(f" Bounding regions: {formula.polygon}")
print(f" Bounding regions: {format_polygon(formula.polygon)}")

print(f"\nDetected {len(display_formulas)} display formulas.")
for formula_idx, formula in enumerate(display_formulas):
print(f"- Display #{formula_idx}: {formula.value}")
print(f" Confidence: {formula.confidence}")
print(f" Bounding regions: {formula.polygon}")
print(f" Bounding regions: {format_polygon(formula.polygon)}")

print("----------------------------------------")
# [END analyze_formulas]
Original file line number Diff line number Diff line change
@@ -58,6 +58,12 @@ def _in_span(word, spans):
return False


def format_polygon(polygon):
    """Format a flat coordinate sequence as comma-separated "[a, b]" pairs.

    A missing or empty polygon is reported as "N/A".
    """

    def pair_at(idx):
        # One bracketed pair built from two neighbouring entries.
        return f"[{polygon[idx]}, {polygon[idx + 1]}]"

    return ", ".join([pair_at(idx) for idx in range(0, len(polygon), 2)]) if polygon else "N/A"


async def analyze_with_highres():
path_to_sample_documents = os.path.abspath(
os.path.join(
@@ -102,30 +108,35 @@ async def analyze_with_highres():
words = get_words(page, line)
print(
f"...Line # {line_idx} has word count {len(words)} and text '{line.content}' "
f"within bounding polygon '{line.polygon}'"
f"within bounding polygon '{format_polygon(line.polygon)}'"
)

for word in words:
print(f"......Word '{word.content}' has a confidence of {word.confidence}")
if page.words:
for word in page.words:
print(f"......Word '{word.content}' has a confidence of {word.confidence}")

if page.selection_marks:
for selection_mark in page.selection_marks:
print(
f"Selection mark is '{selection_mark.state}' within bounding polygon "
f"'{selection_mark.polygon}' and has a confidence of {selection_mark.confidence}"
f"'{format_polygon(selection_mark.polygon)}' and has a confidence of {selection_mark.confidence}"
)

if result.tables:
for table_idx, table in enumerate(result.tables):
print(f"Table # {table_idx} has {table.row_count} rows and " f"{table.column_count} columns")
if table.bounding_regions:
for region in table.bounding_regions:
print(f"Table # {table_idx} location on page: {region.page_number} is {region.polygon}")
print(
f"Table # {table_idx} location on page: {region.page_number} is {format_polygon(region.polygon)}"
)
for cell in table.cells:
print(f"...Cell[{cell.row_index}][{cell.column_index}] has text '{cell.content}'")
if cell.bounding_regions:
for region in cell.bounding_regions:
print(f"...content on page {region.page_number} is within bounding polygon '{region.polygon}'")
print(
f"...content on page {region.page_number} is within bounding polygon '{format_polygon(region.polygon)}'"
)

print("----------------------------------------")
# [END analyze_with_highres]
Original file line number Diff line number Diff line change
@@ -40,6 +40,18 @@ def _in_span(word, spans):
return False


def format_bounding_region(bounding_regions):
    """Describe every bounding region as "Page #<page_number>: <polygon>".

    The per-region descriptions are joined with ", "; the polygon part comes
    from ``format_polygon``. A falsy argument (``None`` or empty) gives "N/A".
    """
    if bounding_regions:
        described = []
        for item in bounding_regions:
            described.append(f"Page #{item.page_number}: {format_polygon(item.polygon)}")
        return ", ".join(described)
    return "N/A"


def format_polygon(polygon):
    """Turn a flat list such as [1, 2, 3, 4] into the text "[1, 2], [3, 4]".

    When there is no polygon (``None`` or an empty sequence) "N/A" is returned.
    """
    if not polygon:
        return "N/A"
    pair_starts = range(0, len(polygon), 2)
    return ", ".join(map(lambda n: f"[{polygon[n]}, {polygon[n + 1]}]", pair_starts))


async def analyze_general_documents():
path_to_sample_documents = os.path.abspath(
os.path.join(
@@ -79,11 +91,14 @@ async def analyze_general_documents():
if result.key_value_pairs:
for kv_pair in result.key_value_pairs:
if kv_pair.key:
print(f"Key '{kv_pair.key.content}' found within " f"'{kv_pair.key.bounding_regions}' bounding regions")
print(
f"Key '{kv_pair.key.content}' found within "
f"'{format_bounding_region(kv_pair.key.bounding_regions)}' bounding regions"
)
if kv_pair.value:
print(
f"Value '{kv_pair.value.content}' found within "
f"'{kv_pair.value.bounding_regions}' bounding regions\n"
f"'{format_bounding_region(kv_pair.value.bounding_regions)}' bounding regions\n"
)

for page in result.pages:
@@ -95,17 +110,18 @@ async def analyze_general_documents():
words = get_words(page.words, line)
print(
f"...Line #{line_idx} has {len(words)} words and text '{line.content}' within "
f"bounding polygon '{line.polygon}'"
f"bounding polygon '{format_polygon(line.polygon)}'"
)

for word in words:
print(f"......Word '{word.content}' has a confidence of {word.confidence}")
if page.words:
for word in page.words:
print(f"......Word '{word.content}' has a confidence of {word.confidence}")

if page.selection_marks:
for selection_mark in page.selection_marks:
print(
f"Selection mark is '{selection_mark.state}' within bounding polygon "
f"'{selection_mark.polygon}' and has a confidence of "
f"'{format_polygon(selection_mark.polygon)}' and has a confidence of "
f"{selection_mark.confidence}"
)

@@ -114,13 +130,15 @@ async def analyze_general_documents():
print(f"Table # {table_idx} has {table.row_count} rows and {table.column_count} columns")
if table.bounding_regions:
for region in table.bounding_regions:
print(f"Table # {table_idx} location on page: {region.page_number} is {region.polygon}")
print(
f"Table # {table_idx} location on page: {region.page_number} is {format_polygon(region.polygon)}"
)
for cell in table.cells:
print(f"...Cell[{cell.row_index}][{cell.column_index}] has text '{cell.content}'")
if cell.bounding_regions:
for region in cell.bounding_regions:
print(
f"...content on page {region.page_number} is within bounding polygon '{region.polygon}'\n"
f"...content on page {region.page_number} is within bounding polygon '{format_polygon(region.polygon)}'\n"
)
print("----------------------------------------")
# [END analyze_general_documents]
Original file line number Diff line number Diff line change
@@ -44,13 +44,25 @@ def _in_span(word, spans):
return False


def format_bounding_region(bounding_regions):
    """Summarise bounding regions as a single printable string.

    Each region becomes "Page #<page_number>: <format_polygon(polygon)>" and
    the entries are separated by ", ". Returns "N/A" for ``None`` or an empty
    collection.
    """

    def describe(region):
        return f"Page #{region.page_number}: {format_polygon(region.polygon)}"

    return ", ".join([describe(region) for region in bounding_regions]) if bounding_regions else "N/A"


def format_polygon(polygon):
    """Build a readable "[a, b], [c, d], ..." string from a flat value list.

    Values are consumed two at a time in order. "N/A" is returned for a
    polygon that is ``None`` or empty.
    """
    if not polygon:
        return "N/A"
    even_indices = range(0, len(polygon), 2)
    pairs = [f"[{polygon[k]}, {polygon[k + 1]}]" for k in even_indices]
    separator = ", "
    return separator.join(pairs)


async def analyze_layout():
path_to_sample_documents = os.path.abspath(
os.path.join(
os.path.abspath(__file__),
"..",
"..",
"./sample_forms/forms/form_selection_mark.png",
"./sample_forms/forms/tabular_and_general_data.docx",
)
)

@@ -84,30 +96,47 @@ async def analyze_layout():
words = get_words(page, line)
print(
f"...Line # {line_idx} has word count {len(words)} and text '{line.content}' "
f"within bounding polygon '{line.polygon}'"
f"within bounding polygon '{format_polygon(line.polygon)}'"
)

for word in words:
print(f"......Word '{word.content}' has a confidence of {word.confidence}")
if page.words:
for word in page.words:
print(f"......Word '{word.content}' has a confidence of {word.confidence}")

if page.selection_marks:
for selection_mark in page.selection_marks:
print(
f"Selection mark is '{selection_mark.state}' within bounding polygon "
f"'{selection_mark.polygon}' and has a confidence of {selection_mark.confidence}"
f"'{format_polygon(selection_mark.polygon)}' and has a confidence of {selection_mark.confidence}"
)

if result.paragraphs:
print(f"----Detected #{len(result.paragraphs)} paragraphs in the document----")
# Sort all paragraphs by span's offset to read in the right order.
result.paragraphs.sort(key=lambda p: (p.spans.sort(key=lambda s: s.offset), p.spans[0].offset))
print("-----Print sorted paragraphs-----")
for paragraph in result.paragraphs:
print(
f"Found paragraph with role: '{paragraph.role}' within {format_bounding_region(paragraph.bounding_regions)} bounding region"
)
print(f"...with content: '{paragraph.content}'")
print(f"...with offset: {paragraph.spans[0].offset} and length: {paragraph.spans[0].length}")

if result.tables:
for table_idx, table in enumerate(result.tables):
print(f"Table # {table_idx} has {table.row_count} rows and " f"{table.column_count} columns")
if table.bounding_regions:
for region in table.bounding_regions:
print(f"Table # {table_idx} location on page: {region.page_number} is {region.polygon}")
print(
f"Table # {table_idx} location on page: {region.page_number} is {format_polygon(region.polygon)}"
)
for cell in table.cells:
print(f"...Cell[{cell.row_index}][{cell.column_index}] has text '{cell.content}'")
if cell.bounding_regions:
for region in cell.bounding_regions:
print(f"...content on page {region.page_number} is within bounding polygon '{region.polygon}'")
print(
f"...content on page {region.page_number} is within bounding polygon '{format_polygon(region.polygon)}'"
)

print("----------------------------------------")
# [END extract_layout]
Original file line number Diff line number Diff line change
@@ -40,6 +40,18 @@ def _in_span(word, spans):
return False


def format_bounding_region(bounding_regions):
    """Return "Page #N: <polygon>" for each region, joined by ", ".

    The polygon text is produced by ``format_polygon``. If there are no
    regions (``None`` or empty) the result is "N/A".
    """
    if not bounding_regions:
        return "N/A"
    return ", ".join(
        map(lambda entry: f"Page #{entry.page_number}: {format_polygon(entry.polygon)}", bounding_regions)
    )


def format_polygon(polygon):
    """Express a flat list of numbers as bracketed pairs separated by ", ".

    Returns the literal "N/A" if ``polygon`` is falsy (``None`` or empty).
    """
    if polygon:
        rendered = []
        for first in range(0, len(polygon), 2):
            second = first + 1
            rendered.append(f"[{polygon[first]}, {polygon[second]}]")
        return ", ".join(rendered)
    return "N/A"


async def analyze_read():
path_to_sample_documents = os.path.abspath(
os.path.join(
@@ -91,7 +103,7 @@ async def analyze_read():
for line_idx, line in enumerate(page.lines):
words = get_words(page, line)
print(
f"...Line # {line_idx} has {len(words)} words and text '{line.content}' within bounding polygon '{line.polygon}'"
f"...Line # {line_idx} has {len(words)} words and text '{line.content}' within bounding polygon '{format_polygon(line.polygon)}'"
)

for word in words:
@@ -101,21 +113,20 @@ async def analyze_read():
for selection_mark in page.selection_marks:
print(
f"...Selection mark is '{selection_mark.state}' within bounding polygon "
f"'{selection_mark.polygon}' and has a confidence of {selection_mark.confidence}"
f"'{format_polygon(selection_mark.polygon)}' and has a confidence of {selection_mark.confidence}"
)

if result.paragraphs:
print(f"----Detected #{len(result.paragraphs)} paragraphs in the document----")
for paragraph in result.paragraphs:
print(f"Found paragraph with role: '{paragraph.role}' within {paragraph.bounding_regions} bounding region")
print(f"...with content: '{paragraph.content}'")

# Sort all paragraphs by span's offset to read in the right order.
result.paragraphs.sort(key=lambda p: (p.spans.sort(key=lambda s: s.offset), p.spans[0].offset))
print("-----Print sorted paragraphs-----")
for idx, paragraph in enumerate(result.paragraphs):
for paragraph in result.paragraphs:
print(
f"...paragraph:{idx} with offset: {paragraph.spans[0].offset} and length: {paragraph.spans[0].length}"
f"Found paragraph with role: '{paragraph.role}' within {format_bounding_region(paragraph.bounding_regions)} bounding region"
)
print(f"...with content: '{paragraph.content}'")
print(f"...with offset: {paragraph.spans[0].offset} and length: {paragraph.spans[0].length}")

print("----------------------------------------")

Original file line number Diff line number Diff line change
@@ -42,6 +42,12 @@
import os


def format_polygon(polygon):
    """Format polygon values pairwise, producing e.g. "[1, 2], [3, 4]".

    An absent or empty polygon is shown as "N/A".
    """
    if not polygon:
        return "N/A"
    cursor, limit, pieces = 0, len(polygon), []
    while cursor < limit:
        # Each piece covers the entry at the cursor and the one after it.
        pieces.append(f"[{polygon[cursor]}, {polygon[cursor + 1]}]")
        cursor += 2
    return ", ".join(pieces)


def analyze_barcodes():
path_to_sample_documents = os.path.abspath(
os.path.join(
@@ -79,7 +85,7 @@ def analyze_barcodes():
print(f"- Barcode #{barcode_idx}: {barcode.value}")
print(f" Kind: {barcode.kind}")
print(f" Confidence: {barcode.confidence}")
print(f" Bounding regions: {barcode.polygon}")
print(f" Bounding regions: {format_polygon(barcode.polygon)}")

print("----------------------------------------")
# [END analyze_barcodes]
Original file line number Diff line number Diff line change
@@ -42,6 +42,12 @@
import os


def format_polygon(polygon):
    """Convert a flat sequence of values into "[a, b]" pairs joined by ", ".

    Falls back to "N/A" when the polygon is ``None`` or has no entries.
    """

    def render(offset):
        return f"[{polygon[offset]}, {polygon[offset + 1]}]"

    if not polygon:
        return "N/A"
    return ", ".join([render(offset) for offset in range(0, len(polygon), 2)])


def analyze_formulas():
path_to_sample_documents = os.path.abspath(
os.path.join(
@@ -82,13 +88,13 @@ def analyze_formulas():
for formula_idx, formula in enumerate(inline_formulas):
print(f"- Inline #{formula_idx}: {formula.value}")
print(f" Confidence: {formula.confidence}")
print(f" Bounding regions: {formula.polygon}")
print(f" Bounding regions: {format_polygon(formula.polygon)}")

print(f"\nDetected {len(display_formulas)} display formulas.")
for formula_idx, formula in enumerate(display_formulas):
print(f"- Display #{formula_idx}: {formula.value}")
print(f" Confidence: {formula.confidence}")
print(f" Bounding regions: {formula.polygon}")
print(f" Bounding regions: {format_polygon(formula.polygon)}")

print("----------------------------------------")
# [END analyze_formulas]
Original file line number Diff line number Diff line change
@@ -57,6 +57,12 @@ def _in_span(word, spans):
return False


def format_polygon(polygon):
    """Produce a printable list of value pairs from a flat polygon list.

    Adjacent entries are grouped as "[a, b]" and the groups are separated by
    ", ". A falsy polygon results in "N/A".
    """
    if not polygon:
        return "N/A"
    return ", ".join(map(lambda at: f"[{polygon[at]}, {polygon[at + 1]}]", range(0, len(polygon), 2)))


def analyze_with_highres():
path_to_sample_documents = os.path.abspath(
os.path.join(
@@ -99,30 +105,35 @@ def analyze_with_highres():
words = get_words(page, line)
print(
f"...Line # {line_idx} has word count {len(words)} and text '{line.content}' "
f"within bounding polygon '{line.polygon}'"
f"within bounding polygon '{format_polygon(line.polygon)}'"
)

for word in words:
print(f"......Word '{word.content}' has a confidence of {word.confidence}")
if page.words:
for word in page.words:
print(f"......Word '{word.content}' has a confidence of {word.confidence}")

if page.selection_marks:
for selection_mark in page.selection_marks:
print(
f"Selection mark is '{selection_mark.state}' within bounding polygon "
f"'{selection_mark.polygon}' and has a confidence of {selection_mark.confidence}"
f"'{format_polygon(selection_mark.polygon)}' and has a confidence of {selection_mark.confidence}"
)

if result.tables:
for table_idx, table in enumerate(result.tables):
print(f"Table # {table_idx} has {table.row_count} rows and " f"{table.column_count} columns")
if table.bounding_regions:
for region in table.bounding_regions:
print(f"Table # {table_idx} location on page: {region.page_number} is {region.polygon}")
print(
f"Table # {table_idx} location on page: {region.page_number} is {format_polygon(region.polygon)}"
)
for cell in table.cells:
print(f"...Cell[{cell.row_index}][{cell.column_index}] has text '{cell.content}'")
if cell.bounding_regions:
for region in cell.bounding_regions:
print(f"...content on page {region.page_number} is within bounding polygon '{region.polygon}'")
print(
f"...content on page {region.page_number} is within bounding polygon '{format_polygon(region.polygon)}'"
)

print("----------------------------------------")
# [END analyze_with_highres]
Original file line number Diff line number Diff line change
@@ -39,6 +39,18 @@ def _in_span(word, spans):
return False


def format_bounding_region(bounding_regions):
    """Format bounding regions for display.

    Every region is rendered as "Page #<page_number>: <polygon text>", where
    the polygon text is whatever ``format_polygon`` returns, and the pieces
    are joined with ", ". ``None`` or an empty collection yields "N/A".
    """
    if not bounding_regions:
        return "N/A"
    parts = [f"Page #{area.page_number}: {format_polygon(area.polygon)}" for area in bounding_regions]
    return ", ".join(parts)


def format_polygon(polygon):
    """Return the polygon as text, two values per bracketed group.

    For example [1, 2, 3, 4] becomes "[1, 2], [3, 4]". ``None`` and empty
    polygons are rendered as "N/A".
    """
    if polygon:
        groups = []
        for left in range(0, len(polygon), 2):
            groups.append(f"[{polygon[left]}, {polygon[left + 1]}]")
        return ", ".join(groups)
    return "N/A"


def analyze_general_documents():
path_to_sample_documents = os.path.abspath(
os.path.join(
@@ -76,11 +88,14 @@ def analyze_general_documents():
if result.key_value_pairs:
for kv_pair in result.key_value_pairs:
if kv_pair.key:
print(f"Key '{kv_pair.key.content}' found within " f"'{kv_pair.key.bounding_regions}' bounding regions")
print(
f"Key '{kv_pair.key.content}' found within "
f"'{format_bounding_region(kv_pair.key.bounding_regions)}' bounding regions"
)
if kv_pair.value:
print(
f"Value '{kv_pair.value.content}' found within "
f"'{kv_pair.value.bounding_regions}' bounding regions\n"
f"'{format_bounding_region(kv_pair.value.bounding_regions)}' bounding regions\n"
)

for page in result.pages:
@@ -92,17 +107,18 @@ def analyze_general_documents():
words = get_words(page.words, line)
print(
f"...Line #{line_idx} has {len(words)} words and text '{line.content}' within "
f"bounding polygon '{line.polygon}'"
f"bounding polygon '{format_polygon(line.polygon)}'"
)

for word in words:
print(f"......Word '{word.content}' has a confidence of {word.confidence}")
if page.words:
for word in page.words:
print(f"......Word '{word.content}' has a confidence of {word.confidence}")

if page.selection_marks:
for selection_mark in page.selection_marks:
print(
f"Selection mark is '{selection_mark.state}' within bounding polygon "
f"'{selection_mark.polygon}' and has a confidence of "
f"'{format_polygon(selection_mark.polygon)}' and has a confidence of "
f"{selection_mark.confidence}"
)

@@ -111,13 +127,15 @@ def analyze_general_documents():
print(f"Table # {table_idx} has {table.row_count} rows and {table.column_count} columns")
if table.bounding_regions:
for region in table.bounding_regions:
print(f"Table # {table_idx} location on page: {region.page_number} is {region.polygon}")
print(
f"Table # {table_idx} location on page: {region.page_number} is {format_polygon(region.polygon)}"
)
for cell in table.cells:
print(f"...Cell[{cell.row_index}][{cell.column_index}] has text '{cell.content}'")
if cell.bounding_regions:
for region in cell.bounding_regions:
print(
f"...content on page {region.page_number} is within bounding polygon '{region.polygon}'\n"
f"...content on page {region.page_number} is within bounding polygon '{format_polygon(region.polygon)}'\n"
)
print("----------------------------------------")
# [END analyze_general_documents]
Original file line number Diff line number Diff line change
@@ -43,12 +43,24 @@ def _in_span(word, spans):
return False


def format_bounding_region(bounding_regions):
    """Join a "Page #<page_number>: <polygon>" entry for each bounding region.

    Polygons are formatted by ``format_polygon``; entries are separated by
    ", ". Returns "N/A" when ``bounding_regions`` is ``None`` or empty.
    """
    if bounding_regions:
        entries = []
        for spot in bounding_regions:
            entries.append(f"Page #{spot.page_number}: {format_polygon(spot.polygon)}")
        return ", ".join(entries)
    return "N/A"


def format_polygon(polygon):
    """Pair up consecutive polygon values and return them as one string.

    Output looks like "[a, b], [c, d]". If ``polygon`` is ``None`` or empty
    the string "N/A" is returned instead.
    """
    if not polygon:
        return "N/A"
    fragments = []
    index = 0
    size = len(polygon)
    while index < size:
        fragments.append(f"[{polygon[index]}, {polygon[index + 1]}]")
        index += 2
    return ", ".join(fragments)


def analyze_layout():
path_to_sample_documents = os.path.abspath(
os.path.join(
os.path.abspath(__file__),
"..",
"./sample_forms/forms/form_selection_mark.png",
"./sample_forms/forms/tabular_and_general_data.docx",
)
)

@@ -81,30 +93,47 @@ def analyze_layout():
words = get_words(page, line)
print(
f"...Line # {line_idx} has word count {len(words)} and text '{line.content}' "
f"within bounding polygon '{line.polygon}'"
f"within bounding polygon '{format_polygon(line.polygon)}'"
)

for word in words:
print(f"......Word '{word.content}' has a confidence of {word.confidence}")
if page.words:
for word in page.words:
print(f"......Word '{word.content}' has a confidence of {word.confidence}")

if page.selection_marks:
for selection_mark in page.selection_marks:
print(
f"Selection mark is '{selection_mark.state}' within bounding polygon "
f"'{selection_mark.polygon}' and has a confidence of {selection_mark.confidence}"
f"'{format_polygon(selection_mark.polygon)}' and has a confidence of {selection_mark.confidence}"
)

if result.paragraphs:
print(f"----Detected #{len(result.paragraphs)} paragraphs in the document----")
# Sort all paragraphs by span's offset to read in the right order.
result.paragraphs.sort(key=lambda p: (p.spans.sort(key=lambda s: s.offset), p.spans[0].offset))
print("-----Print sorted paragraphs-----")
for paragraph in result.paragraphs:
print(
f"Found paragraph with role: '{paragraph.role}' within {format_bounding_region(paragraph.bounding_regions)} bounding region"
)
print(f"...with content: '{paragraph.content}'")
print(f"...with offset: {paragraph.spans[0].offset} and length: {paragraph.spans[0].length}")

if result.tables:
for table_idx, table in enumerate(result.tables):
print(f"Table # {table_idx} has {table.row_count} rows and " f"{table.column_count} columns")
if table.bounding_regions:
for region in table.bounding_regions:
print(f"Table # {table_idx} location on page: {region.page_number} is {region.polygon}")
print(
f"Table # {table_idx} location on page: {region.page_number} is {format_polygon(region.polygon)}"
)
for cell in table.cells:
print(f"...Cell[{cell.row_index}][{cell.column_index}] has text '{cell.content}'")
if cell.bounding_regions:
for region in cell.bounding_regions:
print(f"...content on page {region.page_number} is within bounding polygon '{region.polygon}'")
print(
f"...content on page {region.page_number} is within bounding polygon '{format_polygon(region.polygon)}'"
)

print("----------------------------------------")
# [END extract_layout]
Original file line number Diff line number Diff line change
@@ -39,6 +39,18 @@ def _in_span(word, spans):
return False


def format_bounding_region(bounding_regions):
    """Create a one-line description of the given bounding regions.

    The result lists "Page #<page_number>: <polygon>" for each region (polygon
    text via ``format_polygon``), separated by ", ". A falsy argument returns
    "N/A".
    """

    def label(region):
        return f"Page #{region.page_number}: {format_polygon(region.polygon)}"

    if not bounding_regions:
        return "N/A"
    return ", ".join(map(label, bounding_regions))


def format_polygon(polygon):
    """Render polygon values as ", "-separated "[a, b]" pairs.

    The flat input is read two entries at a time. "N/A" stands in for a
    polygon that is ``None`` or empty.
    """

    def bracket(i):
        return f"[{polygon[i]}, {polygon[i + 1]}]"

    return ", ".join([bracket(i) for i in range(0, len(polygon), 2)]) if polygon else "N/A"


def analyze_read():
path_to_sample_documents = os.path.abspath(
os.path.join(
@@ -88,31 +100,33 @@ def analyze_read():
for line_idx, line in enumerate(page.lines):
words = get_words(page, line)
print(
f"...Line # {line_idx} has {len(words)} words and text '{line.content}' within bounding polygon '{line.polygon}'"
f"...Line # {line_idx} has {len(words)} words and text '{line.content}' within "
f"bounding polygon '{format_polygon(line.polygon)}'"
)

for word in words:
print(f"......Word '{word.content}' has a confidence of {word.confidence}")
if page.words:
for word in page.words:
print(f"......Word '{word.content}' has a confidence of {word.confidence}")

if page.selection_marks:
for selection_mark in page.selection_marks:
print(
f"...Selection mark is '{selection_mark.state}' within bounding polygon "
f"'{selection_mark.polygon}' and has a confidence of {selection_mark.confidence}"
f"'{format_polygon(selection_mark.polygon)}' and has a confidence of {selection_mark.confidence}"
)

if result.paragraphs:
print(f"----Detected #{len(result.paragraphs)} paragraphs in the document----")
for paragraph in result.paragraphs:
print(f"Found paragraph with role: '{paragraph.role}' within {paragraph.bounding_regions} bounding region")
print(f"...with content: '{paragraph.content}'")

# Sort all paragraphs by span's offset to read in the right order.
result.paragraphs.sort(key=lambda p: (p.spans.sort(key=lambda s: s.offset), p.spans[0].offset))
print("-----Print sorted paragraphs-----")
for idx, paragraph in enumerate(result.paragraphs):
for paragraph in result.paragraphs:
print(
f"...paragraph:{idx} with offset: {paragraph.spans[0].offset} and length: {paragraph.spans[0].length}"
f"Found paragraph with role: '{paragraph.role}' within "
f"{format_bounding_region(paragraph.bounding_regions)} bounding regions"
)
print(f"...with content: '{paragraph.content}'")
print(f"...with offset: {paragraph.spans[0].offset} and length: {paragraph.spans[0].length}")

print("----------------------------------------")

Binary file not shown.

0 comments on commit 15c50b7

Please sign in to comment.