Merge pull request #30 from enoch3712/29-fix-tests-100-success-rate-and-run-on-the-deployment-pipeline

29 fix tests 100 success rate and run on the deployment pipeline
enoch3712 · Sep 25, 2024 · 1718465 · 1718465
2 parents f1a611a + 3e6ee35
commit 1718465
Show file tree

Hide file tree

Showing 12 changed files with 37 additions and 20 deletions.
diff --git a/examples/invoice.pdf b/examples/invoice.pdf
diff --git a/extract_thinker/extractor.py b/extract_thinker/extractor.py
@@ -276,8 +276,8 @@ def _extract(self,
 
         if content is not None:
             if isinstance(content, dict):
-                if content["is_spreadsheet"]:
-                    content = json_to_formatted_string(content["data"])
+                if content.get("is_spreadsheet", False):
+                    content = json_to_formatted_string(content.get("data", {}))
                 content = yaml.dump(content)
             messages.append({"role": "user", "content": "##Content\n\n" + content})
 

diff --git a/pyproject.toml b/pyproject.toml
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "extract_thinker"
-version = "0.0.12"
+version = "0.0.13"
 description = "Library to extract data from files and documents agnositicaly using LLMs"
 authors = ["Júlio Almeida <enoch3712@gmail.com>"]
 readme = "README.md"

diff --git a/tests/files/invoice.pdf b/tests/files/invoice.pdf
diff --git a/tests/classify.py → tests/test_classify.py b/tests/classify.py → tests/test_classify.py
@@ -17,8 +17,8 @@
 load_dotenv()
 tesseract_path = os.getenv("TESSERACT_PATH")
 CURRENT_WORKING_DIRECTORY = os.getcwd()
-INVOICE_FILE_PATH = os.path.join(CURRENT_WORKING_DIRECTORY, "test_images", "invoice.png")
-DRIVER_LICENSE_FILE_PATH = os.path.join(CURRENT_WORKING_DIRECTORY, "test_images", "driver_license.png")
+INVOICE_FILE_PATH = os.path.join(CURRENT_WORKING_DIRECTORY, "tests", "test_images", "invoice.png")
+DRIVER_LICENSE_FILE_PATH = os.path.join(CURRENT_WORKING_DIRECTORY, "tests", "test_images", "driver_license.png")
 
 # Common classifications setup
 COMMON_CLASSIFICATIONS = [
@@ -230,5 +230,6 @@ def test_with_tree():
     assert result is not None
     assert result.name == "Invoice"
 
+
 if __name__ == "__main__":
-    test_with_tree()
+    test_classify_feature()
diff --git a/tests/document_loader_aws_textract.py → tests/test_document_loader_aws_textract.py b/tests/document_loader_aws_textract.py → tests/test_document_loader_aws_textract.py
@@ -26,7 +26,4 @@ def test_load_content_from_pdf():
     assert "tables" in result
     assert "forms" in result
     assert "layout" in result
-    assert len(result["pages"]) > 0
-
-if __name__ == "__main__":
-    test_load_content_from_pdf()
+    assert len(result["pages"]) > 0
diff --git a/...ent_loader_azure_document_intelligence.py → ...ent_loader_azure_document_intelligence.py b/...ent_loader_azure_document_intelligence.py → ...ent_loader_azure_document_intelligence.py
@@ -10,7 +10,7 @@
 subscription_key = os.getenv("AZURE_SUBSCRIPTION_KEY")
 endpoint = os.getenv("AZURE_ENDPOINT")
 loader = DocumentLoaderAzureForm(subscription_key, endpoint)
-test_file_path = os.path.join(cwd, "test_images", "invoice.png")
+test_file_path = os.path.join(cwd, "tests", "test_images", "invoice.png")
 
 
 def test_load_content_from_file():
@@ -22,4 +22,4 @@ def test_load_content_from_file():
     # Assert
     assert firstPage is not None
     assert firstPage["paragraphs"][0] == "Invoice 0000001"
-    assert len(firstPage["tables"][0]) == 4
+    assert len(firstPage["tables"][0]) == 4
diff --git a/tests/document_loader_google_document_ai.py → ...est_document_loader_google_document_ai.py b/tests/document_loader_google_document_ai.py → ...est_document_loader_google_document_ai.py
diff --git a/tests/document_loader_pypdf.py → tests/test_document_loader_pypdf.py b/tests/document_loader_pypdf.py → tests/test_document_loader_pypdf.py
@@ -7,7 +7,7 @@
 
 # Arrange
 loader = DocumentLoaderPyPdf()
-test_file_path = os.path.join(cwd, "files", "CV_Candidate.pdf")
+test_file_path = os.path.join(cwd, "tests", "files", "CV_Candidate.pdf")
 
 
 def test_load_content_from_file():

diff --git a/tests/document_loader_tesseract.py → tests/test_document_loader_tesseract.py b/tests/document_loader_tesseract.py → tests/test_document_loader_tesseract.py
@@ -10,7 +10,7 @@
 # Arrange
 tesseract_path = os.getenv("TESSERACT_PATH")
 loader = DocumentLoaderTesseract(tesseract_path)
-test_file_path = os.path.join(cwd, "test_images", "invoice.png")
+test_file_path = os.path.join(cwd, "tests", "test_images", "invoice.png")
 
 
 def test_load_content_from_file():

diff --git a/tests/extractor.py → tests/test_extractor.py b/tests/extractor.py → tests/test_extractor.py
@@ -3,24 +3,25 @@
 
 from extract_thinker.extractor import Extractor
 from extract_thinker.document_loader.document_loader_tesseract import DocumentLoaderTesseract
+from extract_thinker.document_loader.document_loader_pypdf import DocumentLoaderPyPdf
 from tests.models.invoice import InvoiceContract
 from extract_thinker.document_loader.document_loader_azure_document_intelligence import DocumentLoaderAzureForm
 
 load_dotenv()
 cwd = os.getcwd()
 
 
-def test_extract_with_tessaract_and_claude():
+def test_extract_with_tessaract_and_gpt4o_mini():
 
     # Arrange
     tesseract_path = os.getenv("TESSERACT_PATH")
-    test_file_path = os.path.join(cwd, "test_images", "invoice.png")
+    test_file_path = os.path.join(cwd, "tests", "test_images", "invoice.png")
 
     extractor = Extractor()
     extractor.load_document_loader(
         DocumentLoaderTesseract(tesseract_path)
     )
-    extractor.load_llm("claude-3-haiku-20240307")
+    extractor.load_llm("gpt-4o-mini")
 
     # Act
     result = extractor.extract(test_file_path, InvoiceContract)
@@ -31,16 +32,16 @@ def test_extract_with_tessaract_and_claude():
     assert result.invoice_date == "2014-05-07"
 
 
-def test_extract_with_azure_di_and_claude():
+def test_extract_with_azure_di_and_gpt4o_mini():
     subscription_key = os.getenv("AZURE_SUBSCRIPTION_KEY")
     endpoint = os.getenv("AZURE_ENDPOINT")
-    test_file_path = os.path.join(cwd, "test_images", "invoice.png")
+    test_file_path = os.path.join(cwd, "tests", "test_images", "invoice.png")
 
     extractor = Extractor()
     extractor.load_document_loader(
         DocumentLoaderAzureForm(subscription_key, endpoint)
     )
-    extractor.load_llm("claude-3-haiku-20240307")
+    extractor.load_llm("gpt-4o-mini")
     # Act
     result = extractor.extract(test_file_path, InvoiceContract)
 
@@ -50,3 +51,21 @@ def test_extract_with_azure_di_and_claude():
     assert result.lines[0].quantity == 1
     assert result.lines[0].unit_price == 2500
     assert result.lines[0].amount == 2500
+
+def test_extract_with_pypdf_and_gpt4o_mini():
+    test_file_path = os.path.join(cwd, "tests", "files", "invoice.pdf")
+
+    extractor = Extractor()
+    extractor.load_document_loader(
+        DocumentLoaderPyPdf()
+    )
+    extractor.load_llm("gpt-4o-mini")
+    # Act
+    result = extractor.extract(test_file_path, InvoiceContract)
+
+    # Assert
+    assert result is not None
+    assert result.lines[0].description == "Consultation services"
+    assert result.lines[0].quantity == 3
+    assert result.lines[0].unit_price == 375
+    assert result.lines[0].amount == 1125
diff --git a/tests/ollama.py → tests/test_ollama.py b/tests/ollama.py → tests/test_ollama.py