Skip to content

Commit

Permalink
community: bytes as a source to AzureAIDocumentIntelligenceLoader (l…
Browse files Browse the repository at this point in the history
…angchain-ai#26618)

- **Description:** This PR adds the ability to pass in-memory bytes
as a source to `AzureAIDocumentIntelligenceLoader`.
- **Issue:** I needed the functionality, so I added it.
- **Dependencies:** NA
- **Twitter handle:** @akseljoonas if this is a big enough change :)

---------

Co-authored-by: Aksel Joonas Reedi <aksel@klippa.com>
Co-authored-by: Erick Friis <erick@langchain.dev>
  • Loading branch information
3 people authored and yanomaly committed Nov 8, 2024
1 parent f7c9748 commit 5838d0f
Show file tree
Hide file tree
Showing 2 changed files with 33 additions and 6 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@ def __init__(
api_key: str,
file_path: Optional[str] = None,
url_path: Optional[str] = None,
bytes_source: Optional[bytes] = None,
api_version: Optional[str] = None,
api_model: str = "prebuilt-layout",
mode: str = "markdown",
Expand All @@ -41,10 +42,13 @@ def __init__(
The API key to use for DocumentIntelligenceClient construction.
file_path : Optional[str]
The path to the file that needs to be loaded.
Either file_path or url_path must be specified.
Either file_path, url_path or bytes_source must be specified.
url_path : Optional[str]
The URL to the file that needs to be loaded.
Either file_path or url_path must be specified.
Either file_path, url_path or bytes_source must be specified.
bytes_source : Optional[bytes]
The bytes array of the file that needs to be loaded.
Either file_path, url_path or bytes_source must be specified.
api_version: Optional[str]
The API version for DocumentIntelligenceClient. Setting None to use
the default value from `azure-ai-documentintelligence` package.
Expand Down Expand Up @@ -73,10 +77,11 @@ def __init__(
"""

assert (
file_path is not None or url_path is not None
), "file_path or url_path must be provided"
file_path is not None or url_path is not None or bytes_source is not None
), "file_path, url_path or bytes_source must be provided"
self.file_path = file_path
self.url_path = url_path
self.bytes_source = bytes_source

self.parser = AzureAIDocumentIntelligenceParser( # type: ignore[misc]
api_endpoint=api_endpoint,
Expand All @@ -90,9 +95,13 @@ def __init__(
def lazy_load(
    self,
) -> Iterator[Document]:
    """Lazy load the document as pages.

    The configured sources are checked in a fixed order: ``file_path``,
    then ``url_path``, then ``bytes_source``. The first one that is not
    ``None`` is handed to the parser and its documents are yielded.

    Raises
    ------
    ValueError
        If none of the three sources is set.
    """
    if self.file_path is not None:
        # Local files are wrapped in a Blob before being handed to the parser.
        blob = Blob.from_path(self.file_path)  # type: ignore[attr-defined]
        yield from self.parser.parse(blob)
    elif self.url_path is not None:
        yield from self.parser.parse_url(self.url_path)  # type: ignore[arg-type]
    elif self.bytes_source is not None:
        yield from self.parser.parse_bytes(self.bytes_source)
    else:
        raise ValueError("No data source provided.")
Original file line number Diff line number Diff line change
Expand Up @@ -109,3 +109,21 @@ def parse_url(self, url: str) -> Iterator[Document]:
yield from self._generate_docs_page(result)
else:
raise ValueError(f"Invalid mode: {self.mode}")

def parse_bytes(self, bytes_source: bytes) -> Iterator[Document]:
    """Analyze an in-memory document and yield the resulting documents.

    Parameters
    ----------
    bytes_source : bytes
        Raw content of the file to analyze. It is passed to the service
        wrapped in an ``AnalyzeDocumentRequest``.

    Yields
    ------
    Document
        Documents built from the analysis result by
        ``_generate_docs_single`` (modes ``"single"`` and ``"markdown"``)
        or ``_generate_docs_page`` (mode ``"page"``).

    Raises
    ------
    ValueError
        If ``self.mode`` is not one of ``"single"``, ``"markdown"`` or
        ``"page"``.
    """
    # Pick the result handler first so an unsupported mode fails before
    # any request is sent to the service.
    if self.mode in ("single", "markdown"):
        generate_docs = self._generate_docs_single
    elif self.mode == "page":
        generate_docs = self._generate_docs_page
    else:
        raise ValueError(f"Invalid mode: {self.mode}")

    # Imported lazily so the Azure SDK is only required when parsing runs.
    from azure.ai.documentintelligence.models import AnalyzeDocumentRequest

    poller = self.client.begin_analyze_document(
        self.api_model,
        analyze_request=AnalyzeDocumentRequest(bytes_source=bytes_source),
        # Markdown output is requested only in "markdown" mode; every other
        # mode asks for plain text.
        output_content_format="markdown" if self.mode == "markdown" else "text",
    )
    # poller.result() blocks until the analysis has finished.
    yield from generate_docs(poller.result())

0 comments on commit 5838d0f

Please sign in to comment.