From 9fa6c41b71c58e4918de243151b2173abc8cb305 Mon Sep 17 00:00:00 2001 From: Will Temple Date: Fri, 5 Nov 2021 19:17:54 -0400 Subject: [PATCH] [ai-form-recognizer] Lazy iterator for words of a line (#18444) * [ai-form-recognizer] Lazy iterator for words of a line * Use method instead of property * Regenerate API * Polished, wrote changelog, added some more tests, samples * Updated API MD * Improved docs * Apply changes from review --- .../ai-form-recognizer/CHANGELOG.md | 6 +- .../review/ai-form-recognizer.api.md | 1 + .../samples-dev/buildModel.ts | 3 + .../samples-dev/composeModel.ts | 1 - .../samples-dev/copyModel.ts | 3 + .../samples-dev/extractLayout.ts | 14 + .../ai-form-recognizer/src/index.ts | 4 +- .../ai-form-recognizer/src/lro/analyze.ts | 239 +++++++++++++++++- .../src/models/LayoutResult.ts | 6 +- .../test/private/getChildren.spec.ts | 132 ++++++++++ 10 files changed, 396 insertions(+), 13 deletions(-) create mode 100644 sdk/formrecognizer/ai-form-recognizer/test/private/getChildren.spec.ts diff --git a/sdk/formrecognizer/ai-form-recognizer/CHANGELOG.md b/sdk/formrecognizer/ai-form-recognizer/CHANGELOG.md index 520ad487b026..ef9b82f1bcb4 100644 --- a/sdk/formrecognizer/ai-form-recognizer/CHANGELOG.md +++ b/sdk/formrecognizer/ai-form-recognizer/CHANGELOG.md @@ -1,15 +1,13 @@ # Release History -## 4.0.0-beta.2 (Unreleased) +## 4.0.0-beta.2 (2021-11-09) ### Features Added -### Breaking Changes +- Added a `words` method to `DocumentLine`. This method produces an `IterableIterator` that will yield all of the `DocumentWord`s that are contained by the line's `spans`. This allows accessing the words that are related to the line from the line itself. ### Bugs Fixed -### Other Changes - ## 4.0.0-beta.1 (2021-10-07) This new major version beta introduces a full redesign of the Azure Form Recognizer client library. 
To leverage features of the newest Form Recognizer service API (version "2021-09-30-preview" and newer), the new SDK is required, and application code must be changed to use the new clients. Please see the [Migration Guide](https://github.com/azure/azure-sdk-for-js/blob/main/sdk/formrecognizer/ai-form-recognizer/MIGRATION-v3_v4.md) for detailed instructions on how to update application code from version 3.x of the Form Recognizer SDK to the new version (4.x). The following sections contain an outline of the changes. diff --git a/sdk/formrecognizer/ai-form-recognizer/review/ai-form-recognizer.api.md b/sdk/formrecognizer/ai-form-recognizer/review/ai-form-recognizer.api.md index abea07dd7b8c..8a88a0499986 100644 --- a/sdk/formrecognizer/ai-form-recognizer/review/ai-form-recognizer.api.md +++ b/sdk/formrecognizer/ai-form-recognizer/review/ai-form-recognizer.api.md @@ -337,6 +337,7 @@ export interface DocumentLine { boundingBox?: number[]; content: string; spans: DocumentSpan[]; + words: () => IterableIterator<DocumentWord>; } // @public diff --git a/sdk/formrecognizer/ai-form-recognizer/samples-dev/buildModel.ts b/sdk/formrecognizer/ai-form-recognizer/samples-dev/buildModel.ts index c216cd1b8953..563de8d6236c 100644 --- a/sdk/formrecognizer/ai-form-recognizer/samples-dev/buildModel.ts +++ b/sdk/formrecognizer/ai-form-recognizer/samples-dev/buildModel.ts @@ -15,6 +15,9 @@ import { AzureKeyCredential, DocumentModelAdministrationClient } from "@azure/ai-form-recognizer"; +import * as dotenv from "dotenv"; +dotenv.config(); + async function main() { const endpoint = process.env.FORM_RECOGNIZER_ENDPOINT ?? ""; const credential = new AzureKeyCredential(process.env.FORM_RECOGNIZER_API_KEY ?? 
""); diff --git a/sdk/formrecognizer/ai-form-recognizer/samples-dev/composeModel.ts b/sdk/formrecognizer/ai-form-recognizer/samples-dev/composeModel.ts index cb6b7170822e..9a265b7a00d8 100644 --- a/sdk/formrecognizer/ai-form-recognizer/samples-dev/composeModel.ts +++ b/sdk/formrecognizer/ai-form-recognizer/samples-dev/composeModel.ts @@ -14,7 +14,6 @@ import { DocumentModelAdministrationClient, AzureKeyCredential } from "@azure/ai-form-recognizer"; -// Load the .env file if it exists import * as dotenv from "dotenv"; dotenv.config(); diff --git a/sdk/formrecognizer/ai-form-recognizer/samples-dev/copyModel.ts b/sdk/formrecognizer/ai-form-recognizer/samples-dev/copyModel.ts index 3cfb0a947a9d..f68ff0e04eeb 100644 --- a/sdk/formrecognizer/ai-form-recognizer/samples-dev/copyModel.ts +++ b/sdk/formrecognizer/ai-form-recognizer/samples-dev/copyModel.ts @@ -10,6 +10,9 @@ import { AzureKeyCredential, DocumentModelAdministrationClient } from "@azure/ai-form-recognizer"; +import * as dotenv from "dotenv"; +dotenv.config(); + async function main() { const endpoint = process.env.FORM_RECOGNIZER_ENDPOINT ?? ""; const credential = new AzureKeyCredential(process.env.FORM_RECOGNIZER_API_KEY ?? ""); diff --git a/sdk/formrecognizer/ai-form-recognizer/samples-dev/extractLayout.ts b/sdk/formrecognizer/ai-form-recognizer/samples-dev/extractLayout.ts index 41eb92383b1c..565547d9fb26 100644 --- a/sdk/formrecognizer/ai-form-recognizer/samples-dev/extractLayout.ts +++ b/sdk/formrecognizer/ai-form-recognizer/samples-dev/extractLayout.ts @@ -37,6 +37,20 @@ async function main() { console.log("- Page", page.pageNumber, `(unit: ${page.unit})`); console.log(` ${page.width}x${page.height}, angle: ${page.angle}`); console.log(` ${page.lines.length} lines, ${page.words.length} words`); + + if (page.lines.length > 0) { + console.log(" Lines:"); + + for (const line of page.lines) { + console.log(` - "${line.content}"`); + + // The words of the line can also be iterated independently. 
The words are computed based on their + // corresponding spans. + for (const word of line.words()) { + console.log(` - "${word.content}"`); + } + } + } } } diff --git a/sdk/formrecognizer/ai-form-recognizer/src/index.ts b/sdk/formrecognizer/ai-form-recognizer/src/index.ts index 71d2af0db2e3..b2ec865d8cea 100644 --- a/sdk/formrecognizer/ai-form-recognizer/src/index.ts +++ b/sdk/formrecognizer/ai-form-recognizer/src/index.ts @@ -22,8 +22,6 @@ export { DocumentFieldType, DocumentKeyValueElement, DocumentKeyValuePair, - DocumentLine, - DocumentPage, DocumentSelectionMark, DocumentSignatureType, DocumentSpan, @@ -49,6 +47,8 @@ export { export { AnalysisPoller, AnalyzeResult, + DocumentPage, + DocumentLine, DocumentAnalysisPollOperationState, AnalyzedDocument, FormRecognizerRequestBody, diff --git a/sdk/formrecognizer/ai-form-recognizer/src/lro/analyze.ts b/sdk/formrecognizer/ai-form-recognizer/src/lro/analyze.ts index 5fac1d5a2f02..8befee9cec8b 100644 --- a/sdk/formrecognizer/ai-form-recognizer/src/lro/analyze.ts +++ b/sdk/formrecognizer/ai-form-recognizer/src/lro/analyze.ts @@ -11,10 +11,14 @@ import { Document as GeneratedDocument, DocumentEntity, DocumentKeyValuePair, - DocumentPage, + DocumentPage as GeneratedDocumentPage, + DocumentLine as GeneratedDocumentLine, + DocumentSelectionMark, DocumentSpan, DocumentStyle, DocumentTable, + DocumentWord, + LengthUnit, } from "../generated"; import { DocumentField, toAnalyzedDocumentFieldsFromGenerated } from "../models/fields"; import { FormRecognizerApiVersion, PollerOptions } from "../options"; @@ -67,7 +71,6 @@ export interface AnalyzedDocument { * Transform a REST-level Document response object into the more strongly-typed AnalyzedDocument. 
* * @internal - * * @param document - a REST-level document response object * @returns an AnalyzedDocument (which has had its fields mapped to stronger DocumentField types) */ @@ -132,6 +135,236 @@ export interface AnalyzeResult { documents: Document[]; } +/** + * A page within an analysis result. + */ +export interface DocumentPage { + /** + * 1-based page number in the input document. + */ + pageNumber: number; + + /** + * The general orientation of the content in clockwise direction, measured in degrees between (-180, 180]. + */ + angle: number; + + /** + * The width of the image/PDF in pixels/inches, respectively. + */ + width: number; + + /** + * The height of the image/PDF in pixels/inches, respectively. + */ + height: number; + + /** + * The unit used by the width, height, and boundingBox properties. For images, the unit is "pixel". For PDF, the unit is "inch". + */ + unit: LengthUnit; + + /** + * Location of the page in the reading order concatenated content. + */ + spans: DocumentSpan[]; + + /** + * Extracted words from the page. + */ + words: DocumentWord[]; + + /** + * Extracted selection marks from the page. + */ + selectionMarks?: DocumentSelectionMark[]; + + /** + * Extracted lines from the page, potentially containing both textual and visual elements. + */ + lines: DocumentLine[]; +} + +/** + * Convert a REST-level DocumentPage into a convenience layer version. + * + * @internal + * @param generated - a REST-level DocumentPage. + * @returns + */ +export function toDocumentPageFromGenerated(generated: GeneratedDocumentPage): DocumentPage { + // We will just overwrite the `lines` property with the transformed one rather than create a new object. + generated.lines = generated.lines.map((line) => toDocumentLineFromGenerated(line, generated)); + + return generated as DocumentPage; +} + +/** + * A line of adjacent content elements on a page. + */ +export interface DocumentLine { + /** + * Concatenated content of the contained elements in reading order. 
+ */ + content: string; + + /** + * Bounding box of the line. + */ + boundingBox?: number[]; + + /** + * Location of the line in the reading order concatenated content. + */ + spans: DocumentSpan[]; + + /** + * Compute the `DocumentWord`s that are related to this line. + * + * This function produces a lazy iterator that will yield one word before computing the next. + */ + words: () => IterableIterator<DocumentWord>; +} + +/** + * Tests if one span contains another, by testing that the outer span starts before or at the same character as the + * inner span, and that the end position of the outer span is greater than or equal to the end position of the inner + * span. + * + * @internal + * @param outer - the outer (potentially containing) span + * @param inner - the span to test if `outer` contains + * @returns true if `inner` is contained inside of `outer`. + */ +export function contains(outer: DocumentSpan, inner: DocumentSpan): boolean { + return outer.offset <= inner.offset && outer.offset + outer.length >= inner.offset + inner.length; +} + +/** + * Make an empty generator. This might seem silly, but it's useful for satisfying invariants. + */ +function* empty(): Generator<never> { + /* intentionally empty */ +} + +/** + * Produces an iterator of the given items starting from the given index. + * + * @param items - the items to iterate over + * @param idx - the index of the first item to begin iterating from + */ +function* iterFrom<T>(items: T[], idx: number): Generator<T> { + let i = idx; + + while (i < items.length) { + yield items[i++]; + } +} + +/** + * Binary search through an array of items to find the first item that could possibly be contained by the given span, + * then return an iterator beginning from that item. + * + * This allows a program to quickly find the first relevant item in the array for consideration when testing for span + * inclusion. 
+ * + * @internal + * @param span - the span to use when testing each individual item + * @param items - an array of items to binary search through + * @returns an iterator beginning from the item identified by the search + */ +export function iteratorFromFirstMatchBinarySearch<Spanned extends { span: DocumentSpan }>( + span: DocumentSpan, + items: Spanned[] +): IterableIterator<Spanned> { + let idx = Math.floor(items.length / 2); + let prevIdx = idx; + let min = 0; + let max = items.length; + + const found = (): boolean => + // The item is found if it starts after the current span and the item before it does not. That means it is the first + // item in the array that could be a child if the spans are sorted. + items[idx].span.offset >= span.offset && (items[idx - 1]?.span?.offset ?? -1) < span.offset; + + // Binary search to find the first element that could be a child + do { + if (found()) { + return iterFrom(items, idx); + } else if (span.offset > items[idx].span.offset) { + min = prevIdx = idx; + idx = Math.floor(idx + (max - idx) / 2); + } else { + max = prevIdx = idx; + idx = Math.floor(idx - (idx - min) / 2); + } + } while (idx !== prevIdx); + + // This might seem weird, but it's a simple way to make the types a little more elegant. + return empty(); +} + +/** + * This fast algorithm tests the elements of `childArray` for inclusion in any of the given `spans`, assuming that both + * the spans and child items are sorted. + * + * INVARIANT: the items in both the `spans` iterator and `childrenArray` MUST BE SORTED INCREASING by span _offset_. + * + * @internal + * @param spans - the spans that contain the child elements + * @param childrenArray - an array of child items (items that have spans) to test for inclusion in the spans + * @returns - an IterableIterator of child items that are included in any span in the `spans` iterator + */ +export function* fastGetChildren<Spanned extends { span: DocumentSpan }>( + spans: Iterator<DocumentSpan>, + childrenArray: Spanned[] +): Generator<Spanned> { + let curSpan = spans.next(); + + // Need to exit early if there are no spans. 
+ if (curSpan.done) { + return; + } + + const children = iteratorFromFirstMatchBinarySearch(curSpan.value as DocumentSpan, childrenArray); + let curChild = children.next(); + + while (!(curChild.done || curSpan.done)) { + if (contains(curSpan.value, curChild.value.span)) { + // The span is contained, so yield the current child and advance it. + yield curChild.value; + curChild = children.next(); + } else if (curSpan.value.offset + curSpan.value.length < curChild.value.span.offset) { + // The current span ends before the next potential child starts, so advance the span + curSpan = spans.next(); + } else { + // The current child was not contained in the current span, so advance to the next child. + curChild = children.next(); + } + } +} + +/** + * Transforms a REST-level document line into a convenience layer version. + * + * @param generated - a REST-level DocumentLine + * @param page - the page where the DocumentLine appeared + * @returns a convenience layer DocumentLine + */ +function toDocumentLineFromGenerated( + generated: GeneratedDocumentLine, + page: GeneratedDocumentPage +): DocumentLine { + (generated as DocumentLine).words = () => + fastGetChildren(iterFrom(generated.spans, 0), page.words); + + Object.defineProperty(generated, "words", { + enumerable: false, + }); + + return generated as DocumentLine; +} + /** * The state of an analysis operation, which will eventually produce the result type that corresponds to the model. */ @@ -192,7 +425,7 @@ export function toAnalyzeResultFromGenerated< apiVersion: result.apiVersion as FormRecognizerApiVersion, modelId: result.modelId, content: result.content, - pages: result.pages, + pages: result.pages.map((page) => toDocumentPageFromGenerated(page)), tables: result.tables ?? [], keyValuePairs: result.keyValuePairs ?? [], entities: result.entities ?? 
[], diff --git a/sdk/formrecognizer/ai-form-recognizer/src/models/LayoutResult.ts b/sdk/formrecognizer/ai-form-recognizer/src/models/LayoutResult.ts index 421153c0652d..77e8eb8834f7 100644 --- a/sdk/formrecognizer/ai-form-recognizer/src/models/LayoutResult.ts +++ b/sdk/formrecognizer/ai-form-recognizer/src/models/LayoutResult.ts @@ -1,8 +1,8 @@ // Copyright (c) Microsoft Corporation. // Licensed under the MIT license. -import { DocumentPage, DocumentStyle, DocumentTable } from "../generated"; -import { AnalyzeResult } from "../lro/analyze"; +import { DocumentStyle, DocumentTable } from "../generated"; +import { AnalyzeResult, DocumentPage, toDocumentPageFromGenerated } from "../lro/analyze"; /** * Extract from an AnalyzeResult the fields that are produced from layout analysis. @@ -12,7 +12,7 @@ export function toLayoutResult(analyzeResult: AnalyzeResult): LayoutRes const { pages, tables, styles } = analyzeResult; return { - pages, + pages: pages.map(toDocumentPageFromGenerated), tables, styles, }; diff --git a/sdk/formrecognizer/ai-form-recognizer/test/private/getChildren.spec.ts b/sdk/formrecognizer/ai-form-recognizer/test/private/getChildren.spec.ts new file mode 100644 index 000000000000..cd5a34f214df --- /dev/null +++ b/sdk/formrecognizer/ai-form-recognizer/test/private/getChildren.spec.ts @@ -0,0 +1,132 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT license. 
+ +import { DocumentSpan } from "../../src"; +import { + contains, + fastGetChildren, + iteratorFromFirstMatchBinarySearch, +} from "../../src/lro/analyze"; + +import { assert } from "chai"; + +interface TestData { + id: number; + span: DocumentSpan; +} + +let currentId = 0; + +// eslint-disable-next-line @typescript-eslint/no-redeclare +function TestData(offset: number, length: number): TestData { + return { + id: currentId++, + span: { + offset, + length, + }, + }; +} + +const TEST_DATA: TestData[] = [ + TestData(0, 0), + TestData(0, 1), + TestData(2, 1), + TestData(3, 6), + TestData(9, 2), + TestData(9, 2), + TestData(11, 5), +]; + +/** + * A utility function to coerce a value or array of values into an iterator. + * @param values - a value or array of values + * @returns - an Iterator over all of `values` + */ +function intoIter<T>(values: T | T[]): IterableIterator<T> { + if (Array.isArray(values)) { + return values[Symbol.iterator](); + } else { + return [values][Symbol.iterator](); + } +} + +function naiveGetChildren<T extends { span: DocumentSpan }>( + spans: DocumentSpan[], + items: T[] +): T[] { + const arr = [] as T[]; + + for (const span of spans) { + for (const item of items) { + if (contains(span, item.span)) { + arr.push(item); + } + } + } + + return arr; +} + +function naiveFindFirst<T extends { span: DocumentSpan }>( + span: DocumentSpan, + items: T[] +): T | undefined { + for (const item of items) { + if (item.span.offset >= span.offset) { + return item; + } + } + + return undefined; +} + +describe("get children", function () { + it("simple inclusion", () => { + const testSpan = { offset: 1, length: 3 }; + const result = [...fastGetChildren(intoIter(testSpan), TEST_DATA)].map(({ id }) => id); + + assert.deepStrictEqual(result, [2]); + assert.deepStrictEqual( + result, + naiveGetChildren([testSpan], TEST_DATA).map(({ id }) => id) + ); + }); + + it("all span identities", () => { + for (const { id: dataId, span } of TEST_DATA) { + const result = [...fastGetChildren(intoIter(span), TEST_DATA)].map(({ id }) => id); + + 
assert.include(result, dataId); + + assert.deepStrictEqual( + result, + naiveGetChildren([span], TEST_DATA).map(({ id }) => id) + ); + } + }); + + it("zero size inclusion", () => { + const testSpan = { offset: 0, length: 1 }; + const result = [...fastGetChildren(intoIter(testSpan), TEST_DATA)].map(({ id }) => id); + + assert.deepStrictEqual(result, [0, 1]); + + assert.deepStrictEqual( + result, + naiveGetChildren([testSpan], TEST_DATA).map(({ id }) => id) + ); + }); + + describe("binary search", function () { + it("search finds correct index", () => { + for (const datum of TEST_DATA) { + const testSpan = { offset: datum.span.offset, length: 1 }; + assert.strictEqual( + (iteratorFromFirstMatchBinarySearch(testSpan, TEST_DATA).next().value as TestData)?.id, + naiveFindFirst(testSpan, TEST_DATA)?.id + ); + } + }); + }); +});