Skip to content

Commit

Permalink
[ai-form-recognizer] Lazy iterator for words of a line (#18444)
Browse files Browse the repository at this point in the history
* [ai-form-recognizer] Lazy iterator for words of a line

* Use method instead of property

* Regenerate API

* Polished, wrote changelog, added some more tests, samples

* Updated API MD

* Improved docs

* Apply changes from review
  • Loading branch information
witemple-msft authored Nov 5, 2021
1 parent c28fe80 commit 9fa6c41
Show file tree
Hide file tree
Showing 10 changed files with 396 additions and 13 deletions.
6 changes: 2 additions & 4 deletions sdk/formrecognizer/ai-form-recognizer/CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,15 +1,13 @@
# Release History

## 4.0.0-beta.2 (Unreleased)
## 4.0.0-beta.2 (2021-11-09)

### Features Added

### Breaking Changes
- Added a `words` method to `DocumentLine`. This method produces an `IterableIterator` that will yield all of the `DocumentWord`s that are contained by the line's `spans`. This allows accessing the words that are related to the line from the line itself.

### Bugs Fixed

### Other Changes

## 4.0.0-beta.1 (2021-10-07)

This new major version beta introduces a full redesign of the Azure Form Recognizer client library. To leverage features of the newest Form Recognizer service API (version "2021-09-30-preview" and newer), the new SDK is required, and application code must be changed to use the new clients. Please see the [Migration Guide](https://github.com/azure/azure-sdk-for-js/blob/main/sdk/formrecognizer/ai-form-recognizer/MIGRATION-v3_v4.md) for detailed instructions on how to update application code from version 3.x of the Form Recognizer SDK to the new version (4.x). The following sections contain an outline of the changes.
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -337,6 +337,7 @@ export interface DocumentLine {
boundingBox?: number[];
content: string;
spans: DocumentSpan[];
words: () => IterableIterator<DocumentWord>;
}

// @public
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,9 @@

import { AzureKeyCredential, DocumentModelAdministrationClient } from "@azure/ai-form-recognizer";

import * as dotenv from "dotenv";
dotenv.config();

async function main() {
const endpoint = process.env.FORM_RECOGNIZER_ENDPOINT ?? "<endpoint>";
const credential = new AzureKeyCredential(process.env.FORM_RECOGNIZER_API_KEY ?? "<api key>");
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,6 @@

import { DocumentModelAdministrationClient, AzureKeyCredential } from "@azure/ai-form-recognizer";

// Load the .env file if it exists
import * as dotenv from "dotenv";
dotenv.config();

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,9 @@

import { AzureKeyCredential, DocumentModelAdministrationClient } from "@azure/ai-form-recognizer";

import * as dotenv from "dotenv";
dotenv.config();

async function main() {
const endpoint = process.env.FORM_RECOGNIZER_ENDPOINT ?? "<endpoint>";
const credential = new AzureKeyCredential(process.env.FORM_RECOGNIZER_API_KEY ?? "<api key>");
Expand Down
14 changes: 14 additions & 0 deletions sdk/formrecognizer/ai-form-recognizer/samples-dev/extractLayout.ts
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,20 @@ async function main() {
console.log("- Page", page.pageNumber, `(unit: ${page.unit})`);
console.log(` ${page.width}x${page.height}, angle: ${page.angle}`);
console.log(` ${page.lines.length} lines, ${page.words.length} words`);

if (page.lines.length > 0) {
console.log(" Lines:");

for (const line of page.lines) {
console.log(` - "${line.content}"`);

// The words of the line can also be iterated independently. The words are computed based on their
// corresponding spans.
for (const word of line.words()) {
console.log(` - "${word.content}"`);
}
}
}
}
}

Expand Down
4 changes: 2 additions & 2 deletions sdk/formrecognizer/ai-form-recognizer/src/index.ts
Original file line number Diff line number Diff line change
Expand Up @@ -22,8 +22,6 @@ export {
DocumentFieldType,
DocumentKeyValueElement,
DocumentKeyValuePair,
DocumentLine,
DocumentPage,
DocumentSelectionMark,
DocumentSignatureType,
DocumentSpan,
Expand All @@ -49,6 +47,8 @@ export {
export {
AnalysisPoller,
AnalyzeResult,
DocumentPage,
DocumentLine,
DocumentAnalysisPollOperationState,
AnalyzedDocument,
FormRecognizerRequestBody,
Expand Down
239 changes: 236 additions & 3 deletions sdk/formrecognizer/ai-form-recognizer/src/lro/analyze.ts
Original file line number Diff line number Diff line change
Expand Up @@ -11,10 +11,14 @@ import {
Document as GeneratedDocument,
DocumentEntity,
DocumentKeyValuePair,
DocumentPage,
DocumentPage as GeneratedDocumentPage,
DocumentLine as GeneratedDocumentLine,
DocumentSelectionMark,
DocumentSpan,
DocumentStyle,
DocumentTable,
DocumentWord,
LengthUnit,
} from "../generated";
import { DocumentField, toAnalyzedDocumentFieldsFromGenerated } from "../models/fields";
import { FormRecognizerApiVersion, PollerOptions } from "../options";
Expand Down Expand Up @@ -67,7 +71,6 @@ export interface AnalyzedDocument {
* Transform a REST-level Document response object into the more strongly-typed AnalyzedDocument.
*
* @internal
*
* @param document - a REST-level document response object
* @returns an AnalyzedDocument (which has had its fields mapped to stronger DocumentField types)
*/
Expand Down Expand Up @@ -132,6 +135,236 @@ export interface AnalyzeResult<Document = AnalyzedDocument> {
documents: Document[];
}

/**
* A page within an analysis result.
*
* This is the convenience-layer form of a page. `toDocumentPageFromGenerated` produces it from a REST-level page by
* replacing every entry of `lines` with a `DocumentLine`, which additionally carries a `words()` method.
*/
export interface DocumentPage {
/**
* 1-based page number in the input document.
*/
pageNumber: number;

/**
* The general orientation of the content in clockwise direction, measured in degrees between (-180, 180].
*/
angle: number;

/**
* The width of the image/PDF in pixels/inches, respectively.
*/
width: number;

/**
* The height of the image/PDF in pixels/inches, respectively.
*/
height: number;

/**
* The unit used by the width, height, and boundingBox properties. For images, the unit is "pixel". For PDF, the unit is "inch".
*/
unit: LengthUnit;

/**
* Location of the page in the reading order concatenated content.
*/
spans: DocumentSpan[];

/**
* Extracted words from the page.
*
* This array is also the source that `DocumentLine.words()` searches when it computes the words of a line.
*/
words: DocumentWord[];

/**
* Extracted selection marks from the page.
*/
selectionMarks?: DocumentSelectionMark[];

/**
* Extracted lines from the page, potentially containing both textual and visual elements.
*
* Each line exposes a `words()` method that yields the entries of this page's `words` that fall within the line's
* spans.
*/
lines: DocumentLine[];
}

/**
 * Upgrade a REST-level page to the convenience-layer `DocumentPage`.
 *
 * The conversion happens in place: the page's `lines` array is replaced with a new array whose entries have each been
 * passed through `toDocumentLineFromGenerated`, and the very same page object is handed back.
 *
 * @internal
 * @param generated - the REST-level page to convert (mutated by this call)
 * @returns the same object as `generated`, typed as a convenience-layer `DocumentPage`
 */
export function toDocumentPageFromGenerated(generated: GeneratedDocumentPage): DocumentPage {
  const convertedLines = generated.lines.map((restLine) =>
    toDocumentLineFromGenerated(restLine, generated)
  );

  // Reuse the incoming object rather than copying it; `lines` is the only property that has to change.
  generated.lines = convertedLines;

  return generated as DocumentPage;
}

/**
* A line of adjacent content elements on a page.
*/
export interface DocumentLine {
/**
* Concatenated content of the contained elements in reading order.
*/
content: string;

/**
* Bounding box of the line.
*/
boundingBox?: number[];

/**
* Location of the line in the reading order concatenated content.
*/
spans: DocumentSpan[];

/**
* Compute the `DocumentWord`s that are related to this line.
*
* This function produces a lazy iterator that will yield one word before computing the next. Every call starts a
* new iteration. The words are taken from the `words` array of the page the line appeared on, and a word is yielded
* when its span is contained in one of this line's `spans`.
*
* On lines produced by `toDocumentLineFromGenerated`, this property is defined as non-enumerable, so it does not
* show up when enumerating the line's own keys.
*/
words: () => IterableIterator<DocumentWord>;
}

/**
 * Decide whether the `inner` span lies entirely within the `outer` span.
 *
 * Two conditions must both hold: `outer` begins at or before the character where `inner` begins, and the end position
 * (`offset + length`) of `outer` is at or after the end position of `inner`. Identical spans contain each other.
 *
 * @internal
 * @param outer - the span that may enclose `inner`
 * @param inner - the span being tested for enclosure
 * @returns true when `inner` is contained inside of `outer`, false otherwise
 */
export function contains(outer: DocumentSpan, inner: DocumentSpan): boolean {
  const outerEnd = outer.offset + outer.length;
  const innerEnd = inner.offset + inner.length;

  const startsWithin = outer.offset <= inner.offset;
  const endsWithin = outerEnd >= innerEnd;

  return startsWithin && endsWithin;
}

/**
 * Make an empty generator. This might seem silly, but it's useful for satisfying invariants: it lets a function whose
 * return type is an iterator report "no results" without a special-case type.
 *
 * @returns a generator that completes immediately without yielding anything
 */
function* empty(): Generator<never> {
  /* intentionally empty */
}

/**
 * Produces an iterator of the given items starting from the given index.
 *
 * The array is read lazily, one element per step, and iteration ends once the index reaches `items.length`. An index
 * at or beyond the end of the array therefore produces an iterator that yields nothing.
 *
 * @param items - the items to iterate over
 * @param idx - the index of the first item to begin iterating from
 * @returns a generator over `items[idx]`, `items[idx + 1]`, ... up to the last element
 */
function* iterFrom<T>(items: T[], idx: number): Generator<T> {
  let i = idx;

  while (i < items.length) {
    yield items[i++];
  }
}

/**
 * Binary search through an array of items to find the first item that could possibly be contained by the given span,
 * then return an iterator beginning from that item.
 *
 * The "first item that could be contained" is the first item whose span offset is greater than or equal to the offset
 * of `span`. This allows a program to quickly find the first relevant item in the array for consideration when testing
 * for span inclusion.
 *
 * INVARIANT: `items` must be sorted in increasing order of span offset, otherwise the search result is meaningless.
 *
 * @internal
 * @param span - the span to use when testing each individual item
 * @param items - an array of items to binary search through
 * @returns an iterator beginning from the item identified by the search, or an iterator that yields nothing if
 *          `items` is empty or every item starts before `span`
 */
export function iteratorFromFirstMatchBinarySearch<Spanned extends { span: DocumentSpan }>(
  span: DocumentSpan,
  items: Spanned[]
): IterableIterator<Spanned> {
  // With no items there is no midpoint to probe: `items[0]` would be undefined and reading its span would throw.
  if (items.length === 0) {
    return empty();
  }

  let idx = Math.floor(items.length / 2);
  let prevIdx = idx;
  let min = 0;
  let max = items.length;

  const found = (): boolean =>
    // The item is found if it starts at or after the current span and the item before it does not. That means it is
    // the first item in the array that could be a child if the spans are sorted. When there is no previous item
    // (idx === 0), -1 stands in for its offset so that the second condition holds for any non-negative offset.
    items[idx].span.offset >= span.offset && (items[idx - 1]?.span?.offset ?? -1) < span.offset;

  // Binary search to find the first element that could be a child
  do {
    if (found()) {
      return iterFrom(items, idx);
    } else if (span.offset > items[idx].span.offset) {
      // The probed item starts too early: the answer (if any) lies to the right.
      min = prevIdx = idx;
      idx = Math.floor(idx + (max - idx) / 2);
    } else {
      // The probed item is a candidate but not the first one: the answer lies to the left.
      max = prevIdx = idx;
      idx = Math.floor(idx - (idx - min) / 2);
    }
    // Once the probe index stops moving the range is exhausted and no item qualifies.
  } while (idx !== prevIdx);

  // This might seem weird, but it's a simple way to make the types a little more elegant.
  return empty();
}

/**
 * This fast algorithm tests the elements of `childrenArray` for inclusion in any of the given `spans`, assuming that
 * both the spans and child items are sorted.
 *
 * The first span is used to binary search `childrenArray` for a starting position. From there, the spans and the
 * children are walked forward together, and each child that is contained by the current span is yielded. Results are
 * produced lazily: nothing past the most recently yielded child has been examined.
 *
 * INVARIANT: the items in both the `spans` iterator and `childrenArray` MUST BE SORTED INCREASING by span _offset_.
 *
 * @internal
 * @param spans - the spans that contain the child elements
 * @param childrenArray - an array of child items (items that have spans) to test for inclusion in the spans
 * @returns - an IterableIterator of child items that are included in any span in the `spans` iterator
 */
export function* fastGetChildren<Spanned extends { span: DocumentSpan }>(
  spans: Iterator<DocumentSpan>,
  childrenArray: Spanned[]
): Generator<Spanned> {
  let curSpan = spans.next();

  // Need to exit early if there are no spans.
  if (curSpan.done) {
    return;
  }

  const children = iteratorFromFirstMatchBinarySearch(curSpan.value as DocumentSpan, childrenArray);
  let curChild = children.next();

  while (!(curChild.done || curSpan.done)) {
    if (contains(curSpan.value, curChild.value.span)) {
      // The span is contained, so yield the current child and advance it.
      yield curChild.value;
      curChild = children.next();
    } else if (curSpan.value.offset + curSpan.value.length <= curChild.value.span.offset) {
      // The current span ends at or before the position where the next potential child starts, so no later child can
      // belong to it: advance the span. The comparison must be inclusive. A child that starts exactly where this span
      // ends may be the first child of an adjacent following span, and discarding it here would lose it.
      curSpan = spans.next();
    } else {
      // The current child starts inside (or before) the current span but is not contained by it, so it cannot be
      // contained by any later span either: advance to the next child.
      curChild = children.next();
    }
  }
}

/**
 * Upgrade a REST-level line to the convenience-layer `DocumentLine`.
 *
 * The line object is extended in place with a `words` method and then returned. Calling `words()` starts a new lazy
 * search of the page's `words` array for the words contained by the line's spans. The property is marked
 * non-enumerable so that it stays out of key enumeration of the line.
 *
 * @param generated - a REST-level DocumentLine (mutated by this call)
 * @param page - the page where the DocumentLine appeared; its `words` array is read each time `words()` is called
 * @returns the same object as `generated`, typed as a convenience layer DocumentLine
 */
function toDocumentLineFromGenerated(
  generated: GeneratedDocumentLine,
  page: GeneratedDocumentPage
): DocumentLine {
  const line = generated as DocumentLine;

  // `page.words` and `line.spans` are read at call time, not captured now, so every call sees their current contents.
  line.words = () => fastGetChildren(iterFrom(line.spans, 0), page.words);

  // Plain assignment creates an enumerable property; hide it from enumeration afterwards.
  Object.defineProperty(line, "words", {
    enumerable: false,
  });

  return line;
}

/**
* The state of an analysis operation, which will eventually produce the result type that corresponds to the model.
*/
Expand Down Expand Up @@ -192,7 +425,7 @@ export function toAnalyzeResultFromGenerated<
apiVersion: result.apiVersion as FormRecognizerApiVersion,
modelId: result.modelId,
content: result.content,
pages: result.pages,
pages: result.pages.map((page) => toDocumentPageFromGenerated(page)),
tables: result.tables ?? [],
keyValuePairs: result.keyValuePairs ?? [],
entities: result.entities ?? [],
Expand Down
Original file line number Diff line number Diff line change
@@ -1,8 +1,8 @@
// Copyright (c) Microsoft Corporation.
// Licensed under the MIT license.

import { DocumentPage, DocumentStyle, DocumentTable } from "../generated";
import { AnalyzeResult } from "../lro/analyze";
import { DocumentStyle, DocumentTable } from "../generated";
import { AnalyzeResult, DocumentPage, toDocumentPageFromGenerated } from "../lro/analyze";

/**
* Extract from an AnalyzeResult the fields that are produced from layout analysis.
Expand All @@ -12,7 +12,7 @@ export function toLayoutResult(analyzeResult: AnalyzeResult<unknown>): LayoutRes
const { pages, tables, styles } = analyzeResult;

return {
pages,
pages: pages.map(toDocumentPageFromGenerated),
tables,
styles,
};
Expand Down
Loading

0 comments on commit 9fa6c41

Please sign in to comment.