-
Notifications
You must be signed in to change notification settings - Fork 481
/
Copy pathLayout.py
25 lines (22 loc) · 887 Bytes
/
Layout.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
from typing import List
from .DocumentLoadingBase import DocumentLoadingBase
from ..helpers.AzureFormRecognizerHelper import AzureFormRecognizerClient
from ..common.SourceDocument import SourceDocument
class LayoutDocumentLoading(DocumentLoadingBase):
    """Document loader that analyzes a URL with the layout option enabled.

    Each page returned by ``AzureFormRecognizerClient`` is wrapped in a
    ``SourceDocument``.
    """

    def __init__(self) -> None:
        """Initialize the loader by delegating to the base class."""
        super().__init__()

    def load(self, document_url: str) -> List[SourceDocument]:
        """Analyze the document at ``document_url`` and return one entry per page.

        Args:
            document_url: Location of the document; passed to the recognizer
                client and recorded as ``source`` on every result.

        Returns:
            A list of ``SourceDocument`` objects, in the order the recognizer
            client yields its pages.
        """
        recognizer = AzureFormRecognizerClient()
        analyzed_pages = recognizer.begin_analyze_document_from_url(
            document_url, use_layout=True
        )

        # Each page is indexed by "page_text", "offset" and "page_number";
        # presumably a dict per page -- confirm against the helper.
        results: List[SourceDocument] = []
        for entry in analyzed_pages:
            results.append(
                SourceDocument(
                    content=entry["page_text"],
                    source=document_url,
                    offset=entry["offset"],
                    page_number=entry["page_number"],
                )
            )
        return results