community: bytes as a source to AzureAIDocumentIntelligenceLoader (langchain-ai#26618)

- **Description:** This PR adds the ability to pass in-memory bytes as a source to `AzureAIDocumentIntelligenceLoader`.
- **Issue:** I needed the functionality, so I added it.
- **Dependencies:** NA
- **Twitter handle:** @akseljoonas if this is a big enough change :)

---------

Co-authored-by: Aksel Joonas Reedi <aksel@klippa.com>
Co-authored-by: Erick Friis <erick@langchain.dev>
yanomaly · Nov 8, 2024 · 5838d0f · 5838d0f
1 parent f7c9748
commit 5838d0f
Show file tree

Hide file tree

Showing 2 changed files with 33 additions and 6 deletions.
diff --git a/libs/community/langchain_community/document_loaders/doc_intelligence.py b/libs/community/langchain_community/document_loaders/doc_intelligence.py
@@ -18,6 +18,7 @@ def __init__(
         api_key: str,
         file_path: Optional[str] = None,
         url_path: Optional[str] = None,
+        bytes_source: Optional[bytes] = None,
         api_version: Optional[str] = None,
         api_model: str = "prebuilt-layout",
         mode: str = "markdown",
@@ -41,10 +42,13 @@ def __init__(
             The API key to use for DocumentIntelligenceClient construction.
         file_path : Optional[str]
             The path to the file that needs to be loaded.
-            Either file_path or url_path must be specified.
+            Either file_path, url_path or bytes_source must be specified.
         url_path : Optional[str]
             The URL to the file that needs to be loaded.
-            Either file_path or url_path must be specified.
+            Either file_path, url_path or bytes_source must be specified.
+        bytes_source : Optional[bytes]
+            The bytes array of the file that needs to be loaded.
+            Either file_path, url_path or bytes_source must be specified.
         api_version: Optional[str]
             The API version for DocumentIntelligenceClient. Setting None to use
             the default value from `azure-ai-documentintelligence` package.
@@ -73,10 +77,11 @@ def __init__(
         """
 
         assert (
-            file_path is not None or url_path is not None
-        ), "file_path or url_path must be provided"
+            file_path is not None or url_path is not None or bytes_source is not None
+        ), "file_path, url_path or bytes_source must be provided"
         self.file_path = file_path
         self.url_path = url_path
+        self.bytes_source = bytes_source
 
         self.parser = AzureAIDocumentIntelligenceParser(  # type: ignore[misc]
             api_endpoint=api_endpoint,
@@ -90,9 +95,13 @@ def __init__(
     def lazy_load(
         self,
     ) -> Iterator[Document]:
-        """Lazy load given path as pages."""
+        """Lazy load the document as pages."""
         if self.file_path is not None:
             blob = Blob.from_path(self.file_path)  # type: ignore[attr-defined]
             yield from self.parser.parse(blob)
-        else:
+        elif self.url_path is not None:
             yield from self.parser.parse_url(self.url_path)  # type: ignore[arg-type]
+        elif self.bytes_source is not None:
+            yield from self.parser.parse_bytes(self.bytes_source)
+        else:
+            raise ValueError("No data source provided.")
diff --git a/libs/community/langchain_community/document_loaders/parsers/doc_intelligence.py b/libs/community/langchain_community/document_loaders/parsers/doc_intelligence.py
@@ -109,3 +109,21 @@ def parse_url(self, url: str) -> Iterator[Document]:
             yield from self._generate_docs_page(result)
         else:
             raise ValueError(f"Invalid mode: {self.mode}")
+
+    def parse_bytes(self, bytes_source: bytes) -> Iterator[Document]:
+        from azure.ai.documentintelligence.models import AnalyzeDocumentRequest
+
+        poller = self.client.begin_analyze_document(
+            self.api_model,
+            analyze_request=AnalyzeDocumentRequest(bytes_source=bytes_source),
+            # content_type="application/octet-stream",
+            output_content_format="markdown" if self.mode == "markdown" else "text",
+        )
+        result = poller.result()
+
+        if self.mode in ["single", "markdown"]:
+            yield from self._generate_docs_single(result)
+        elif self.mode in ["page"]:
+            yield from self._generate_docs_page(result)
+        else:
+            raise ValueError(f"Invalid mode: {self.mode}")