Skip to content

Commit

Permalink
Merge pull request #30 from enoch3712/29-fix-tests-100-success-rate-and-run-on-the-deployment-pipeline
Browse files Browse the repository at this point in the history

29 fix tests 100 success rate and run on the deployment pipeline
  • Loading branch information
enoch3712 authored Sep 25, 2024
2 parents f1a611a + 3e6ee35 commit 1718465
Show file tree
Hide file tree
Showing 12 changed files with 37 additions and 20 deletions.
Binary file added examples/invoice.pdf
Binary file not shown.
4 changes: 2 additions & 2 deletions extract_thinker/extractor.py
Original file line number Diff line number Diff line change
Expand Up @@ -276,8 +276,8 @@ def _extract(self,

if content is not None:
if isinstance(content, dict):
if content["is_spreadsheet"]:
content = json_to_formatted_string(content["data"])
if content.get("is_spreadsheet", False):
content = json_to_formatted_string(content.get("data", {}))
content = yaml.dump(content)
messages.append({"role": "user", "content": "##Content\n\n" + content})

Expand Down
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[tool.poetry]
name = "extract_thinker"
version = "0.0.12"
version = "0.0.13"
description = "Library to extract data from files and documents agnositicaly using LLMs"
authors = ["Júlio Almeida <enoch3712@gmail.com>"]
readme = "README.md"
Expand Down
Binary file modified tests/files/invoice.pdf
Binary file not shown.
7 changes: 4 additions & 3 deletions tests/classify.py → tests/test_classify.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,8 +17,8 @@
load_dotenv()
tesseract_path = os.getenv("TESSERACT_PATH")
CURRENT_WORKING_DIRECTORY = os.getcwd()
INVOICE_FILE_PATH = os.path.join(CURRENT_WORKING_DIRECTORY, "test_images", "invoice.png")
DRIVER_LICENSE_FILE_PATH = os.path.join(CURRENT_WORKING_DIRECTORY, "test_images", "driver_license.png")
INVOICE_FILE_PATH = os.path.join(CURRENT_WORKING_DIRECTORY, "tests", "test_images", "invoice.png")
DRIVER_LICENSE_FILE_PATH = os.path.join(CURRENT_WORKING_DIRECTORY, "tests", "test_images", "driver_license.png")

# Common classifications setup
COMMON_CLASSIFICATIONS = [
Expand Down Expand Up @@ -230,5 +230,6 @@ def test_with_tree():
assert result is not None
assert result.name == "Invoice"


if __name__ == "__main__":
test_with_tree()
test_classify_feature()
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,4 @@ def test_load_content_from_pdf():
assert "tables" in result
assert "forms" in result
assert "layout" in result
assert len(result["pages"]) > 0

if __name__ == "__main__":
test_load_content_from_pdf()
assert len(result["pages"]) > 0
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@
subscription_key = os.getenv("AZURE_SUBSCRIPTION_KEY")
endpoint = os.getenv("AZURE_ENDPOINT")
loader = DocumentLoaderAzureForm(subscription_key, endpoint)
test_file_path = os.path.join(cwd, "test_images", "invoice.png")
test_file_path = os.path.join(cwd, "tests", "test_images", "invoice.png")


def test_load_content_from_file():
Expand All @@ -22,4 +22,4 @@ def test_load_content_from_file():
# Assert
assert firstPage is not None
assert firstPage["paragraphs"][0] == "Invoice 0000001"
assert len(firstPage["tables"][0]) == 4
assert len(firstPage["tables"][0]) == 4
File renamed without changes.
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@

# Arrange
loader = DocumentLoaderPyPdf()
test_file_path = os.path.join(cwd, "files", "CV_Candidate.pdf")
test_file_path = os.path.join(cwd, "tests", "files", "CV_Candidate.pdf")


def test_load_content_from_file():
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@
# Arrange
tesseract_path = os.getenv("TESSERACT_PATH")
loader = DocumentLoaderTesseract(tesseract_path)
test_file_path = os.path.join(cwd, "test_images", "invoice.png")
test_file_path = os.path.join(cwd, "tests", "test_images", "invoice.png")


def test_load_content_from_file():
Expand Down
31 changes: 25 additions & 6 deletions tests/extractor.py → tests/test_extractor.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,24 +3,25 @@

from extract_thinker.extractor import Extractor
from extract_thinker.document_loader.document_loader_tesseract import DocumentLoaderTesseract
from extract_thinker.document_loader.document_loader_pypdf import DocumentLoaderPyPdf
from tests.models.invoice import InvoiceContract
from extract_thinker.document_loader.document_loader_azure_document_intelligence import DocumentLoaderAzureForm

load_dotenv()
cwd = os.getcwd()


def test_extract_with_tessaract_and_claude():
def test_extract_with_tessaract_and_gpt4o_mini():

# Arrange
tesseract_path = os.getenv("TESSERACT_PATH")
test_file_path = os.path.join(cwd, "test_images", "invoice.png")
test_file_path = os.path.join(cwd, "tests", "test_images", "invoice.png")

extractor = Extractor()
extractor.load_document_loader(
DocumentLoaderTesseract(tesseract_path)
)
extractor.load_llm("claude-3-haiku-20240307")
extractor.load_llm("gpt-4o-mini")

# Act
result = extractor.extract(test_file_path, InvoiceContract)
Expand All @@ -31,16 +32,16 @@ def test_extract_with_tessaract_and_claude():
assert result.invoice_date == "2014-05-07"


def test_extract_with_azure_di_and_claude():
def test_extract_with_azure_di_and_gpt4o_mini():
subscription_key = os.getenv("AZURE_SUBSCRIPTION_KEY")
endpoint = os.getenv("AZURE_ENDPOINT")
test_file_path = os.path.join(cwd, "test_images", "invoice.png")
test_file_path = os.path.join(cwd, "tests", "test_images", "invoice.png")

extractor = Extractor()
extractor.load_document_loader(
DocumentLoaderAzureForm(subscription_key, endpoint)
)
extractor.load_llm("claude-3-haiku-20240307")
extractor.load_llm("gpt-4o-mini")
# Act
result = extractor.extract(test_file_path, InvoiceContract)

Expand All @@ -50,3 +51,21 @@ def test_extract_with_azure_di_and_claude():
assert result.lines[0].quantity == 1
assert result.lines[0].unit_price == 2500
assert result.lines[0].amount == 2500

def test_extract_with_pypdf_and_gpt4o_mini():
    """Extract the sample PDF invoice with the PyPDF loader and gpt-4o-mini.

    Loads ``tests/files/invoice.pdf`` (resolved against ``cwd``), runs the
    extraction against ``InvoiceContract`` and checks the fields of the
    first extracted invoice line.
    """
    # Arrange
    pdf_invoice_path = os.path.join(cwd, "tests", "files", "invoice.pdf")

    extractor = Extractor()
    pdf_loader = DocumentLoaderPyPdf()
    extractor.load_document_loader(pdf_loader)
    extractor.load_llm("gpt-4o-mini")

    # Act
    result = extractor.extract(pdf_invoice_path, InvoiceContract)

    # Assert
    assert result is not None

    # Only the first line item of the invoice is verified.
    first_line = result.lines[0]
    assert first_line.description == "Consultation services"
    assert first_line.quantity == 3
    assert first_line.unit_price == 375
    assert first_line.amount == 1125
File renamed without changes.

0 comments on commit 1718465

Please sign in to comment.