Merge branch '145-bumping-version' of https://github.com/enoch3712/ExtractThinker into 145-bumping-version
enoch3712 · Dec 23, 2024 · a6dafad · a6dafad
2 parents 89f069f + d2ac713
commit a6dafad
Showing 3 changed files with 137 additions and 1 deletion.
diff --git a/extract_thinker/image_splitter.py b/extract_thinker/image_splitter.py
@@ -193,6 +193,10 @@ def split_eager_doc_group(self, document: List[dict], classifications: List[Clas
                     "url": f"data:image/jpeg;base64,{base64_image}"
                 }
             })
+
+        # # Debug aid: serialize `messages` to a JSON string
+        # import json
+        # content_json = json.dumps(messages, indent=4)
 
         try:
             response = self.client.chat.completions.create(

diff --git a/tests/test_document_loader_aws_textract.py b/tests/test_document_loader_aws_textract.py
@@ -44,4 +44,4 @@ def test_vision_mode(self, loader, test_file_path):
             assert "content" in page
             if loader.can_handle_vision(test_file_path):
                 assert "image" in page
-                assert isinstance(page["image"], bytes)
+                assert isinstance(page["image"], bytes)
diff --git a/tests/test_google_stack.py b/tests/test_google_stack.py
@@ -0,0 +1,132 @@
+from io import BytesIO
+import os
+from typing import List, Optional
+from dotenv import load_dotenv
+from extract_thinker import (
+    Classification, 
+    Extractor, 
+    ImageSplitter, 
+    Process, 
+    SplittingStrategy,
+    Contract,
+    DocumentLoaderDocumentAI
+)
+from pydantic import BaseModel, field_validator
+
+from extract_thinker.document_loader.document_loader_aws_textract import DocumentLoaderAWSTextract
+from extract_thinker.document_loader.document_loader_azure_document_intelligence import DocumentLoaderAzureForm
+from extract_thinker.document_loader.document_loader_pypdf import DocumentLoaderPyPdf
+from extract_thinker.text_splitter import TextSplitter
+
+# from extract_thinker.document_loader.document_loader_pypdf import DocumentLoaderPyPdf
+
+# Load variables from a local .env file (if present) into the environment.
+load_dotenv()
+
+# Define test constants
+# Paths are anchored at this module's directory so they do not depend on the
+# current working directory.
+CURRENT_DIR = os.path.dirname(os.path.abspath(__file__))
+# PDF fixture that the splitting test loads.
+BULK_DOC_PATH = os.path.join(CURRENT_DIR, "files", "bulk.pdf")
+# Location under tests/credentials/ for a Google credentials JSON file
+# (presumably a service-account key -- confirm).
+GOOGLE_CREDENTIALS_PATH = os.path.join(CURRENT_DIR, "credentials", "google_credentials.json")
+
+class InvoiceLine(BaseModel):
+    """A single invoice line item.
+
+    NOTE(review): not referenced by the test code visible in this file.
+    """
+
+    description: str
+    quantity: int
+    unit_price: float
+    amount: float
+
+    @field_validator('quantity', mode='before')
+    @classmethod
+    def convert_quantity_to_int(cls, v):
+        """Coerce a float ``quantity`` to ``int`` before field validation.
+
+        Runs in ``mode='before'``, i.e. on the raw input value. A float is
+        converted with ``int()`` (truncation toward zero); any other value is
+        returned unchanged and left to the regular ``int`` validation.
+        """
+        if isinstance(v, float):
+            return int(v)
+        return v
+
+class VehicleRegistration(Contract):
+    """Extraction contract for the "Vehicle Registration" classification."""
+
+    name_primary: str
+    # Explicit default: without it the field is required (though nullable),
+    # so a document with no separate last name would fail validation.
+    name_last: Optional[str] = None
+    address: str
+    vehicle_type: str
+    vehicle_color: str
+
+class DriverLicense(Contract):
+    """Extraction contract for the "Driver License" classification."""
+    name: str
+    age: int
+    license_number: str
+
+def setup_process_with_document_ai():
+    """Helper function to set up process with Google Document AI.
+
+    Returns:
+        tuple: ``(process, classifications)`` where ``process`` is a
+        ``Process`` configured with the Document AI loader and an
+        ``ImageSplitter``, and ``classifications`` is the list of
+        ``Classification`` objects (vehicle registration, driver license)
+        sharing one ``Extractor``.
+    """
+    # Environment for the "vertex_ai/..." models used below.
+    os.environ["VERTEXAI_PROJECT"] = "extractthinker"
+    os.environ["VERTEXAI_LOCATION"] = "us-central1"
+    # Keep credentials that are already configured (shell or .env file);
+    # otherwise fall back to the repository-relative path rather than a path
+    # that only exists on one developer's machine.
+    os.environ.setdefault("GOOGLE_APPLICATION_CREDENTIALS", GOOGLE_CREDENTIALS_PATH)
+
+    # Initialize document loader.
+    # Other loaders (e.g. DocumentLoaderPyPdf, DocumentLoaderAzureForm) can be
+    # substituted here when debugging without Document AI.
+    document_loader = DocumentLoaderDocumentAI(
+        project_id="496372363784",
+        location="eu",
+        processor_id="9203063ba6e697ce",
+        credentials=os.environ["GOOGLE_APPLICATION_CREDENTIALS"]
+    )
+
+    # Run the fixture through the loader once as an in-memory stream; the
+    # returned content is not used further. The module-level absolute path is
+    # used so this works from any working directory.
+    with open(BULK_DOC_PATH, "rb") as file:
+        document_loader.load(BytesIO(file.read()))
+
+    # Initialize extractor
+    extractor = Extractor()
+    extractor.load_document_loader(document_loader)
+    extractor.load_llm("vertex_ai/gemini-2.0-flash-exp")
+
+    # Create classifications
+    classifications = [
+        Classification(
+            name="Vehicle Registration",
+            description="This is a vehicle registration document",
+            contract=VehicleRegistration,
+            extractor=extractor
+        ),
+        Classification(
+            name="Driver License",
+            description="This is a driver license document",
+            contract=DriverLicense,
+            extractor=extractor
+        )
+    ]
+
+    # Initialize process
+    process = Process()
+    process.load_document_loader(document_loader)
+    process.load_splitter(ImageSplitter("vertex_ai/gemini-2.0-flash-exp"))
+
+    return process, classifications
+
+def test_document_ai_eager_splitting():
+    """Test eager splitting strategy with Document AI loader.
+
+    Loads the bulk PDF fixture, splits it eagerly into the configured
+    classifications, extracts with vision enabled, and checks a few known
+    field values of the first two extracted contracts.
+    """
+    # Arrange
+    process, classifications = setup_process_with_document_ai()
+
+    # Act
+    result = (
+        process.load_file(BULK_DOC_PATH)
+        .split(classifications, strategy=SplittingStrategy.EAGER)
+        .extract(vision=True)
+    )
+
+    # Assert
+    assert result is not None
+    # Fail with a readable message instead of an IndexError further down.
+    assert len(result) >= 2, f"expected at least 2 extracted documents, got {len(result)}"
+    for item in result:
+        assert isinstance(item, (VehicleRegistration, DriverLicense))
+
+    # The checks below rely on this ordering: registration first, license second.
+    vehicle_registration, driver_license = result[0], result[1]
+    assert isinstance(vehicle_registration, VehicleRegistration), \
+        f"first document is {type(vehicle_registration).__name__}"
+    assert isinstance(driver_license, DriverLicense), \
+        f"second document is {type(driver_license).__name__}"
+
+    # Verify vehicle registration data
+    assert vehicle_registration.name_primary == "Motorist, Michael M"
+
+    # Verify driver license data
+    assert driver_license.age == 65
+    # Spaces are stripped, and the number is accepted with or without its
+    # leading zero.
+    assert driver_license.license_number.replace(" ", "") in ["0123456789", "123456789"]
+
+# Allow running this test directly as a script, without a test runner.
+if __name__ == "__main__":
+    test_document_ai_eager_splitting()