# Reviewed final form of ``gptpdf_to_string`` as introduced for
# docetl/parsing_tools.py by the patch series "gpt_pdf added" /
# "Update parsing_tools.py".
# NOTE(review): the accompanying test in tests/test_parsing_tools.py is still a
# disabled TODO in that series (placeholder API key, needs network access).


def gptpdf_to_string(
    input_path: str,
    output_path: str,
    doc_per_page: bool,
    gpt_model: str,
    api_key: str,
    base_url: str,
    verbose: bool,
) -> str:
    """
    Parse a PDF with GPT (via gptpdf) into markdown and write it to a file.

    **Note: pip install gptpdf required**

    Args:
        input_path (str): Path to the input PDF file.
        output_path (str): Path where the extracted markdown will be written.
        doc_per_page (bool): Accepted for signature compatibility. gptpdf
            returns one markdown document for the whole PDF and no per-page
            text, so this flag does not change the result.
        gpt_model (str): GPT model to be used for parsing.
        api_key (str): API key for the GPT service.
        base_url (str): Base URL for the GPT service.
        verbose (bool): If True, print additional information during parsing.

    Returns:
        str: The extracted markdown content (the same text that is written
        to ``output_path``).

    Raises:
        ImportError: If the optional ``gptpdf`` package is not installed.
    """
    # Import lazily: gptpdf is an optional extra, so a module-level import
    # would make the whole parsing module unimportable without it.
    try:
        from gptpdf import parse_pdf
    except ImportError as exc:
        raise ImportError(
            "gptpdf is required for gptpdf_to_string; "
            "install it with `pip install gptpdf`."
        ) from exc

    # Per the gptpdf README, parse_pdf returns ``(content, image_paths)``.
    # The second element lists extracted image files, NOT per-page text, so
    # it must never be joined into the document content.
    # NOTE(review): gptpdf also writes its own artifacts into ``output_dir``
    # (the current working directory here) — confirm that is acceptable.
    parsed_content, image_paths = parse_pdf(
        pdf_path=input_path,
        output_dir="./",
        api_key=api_key,
        base_url=base_url,
        model=gpt_model,
        verbose=verbose,
    )

    if verbose:
        print(f"Parsed {input_path} ({len(image_paths)} extracted images)")

    with open(output_path, "w", encoding="utf-8") as output_file:
        output_file.write(parsed_content)

    print(f"Extracted content has been written to {output_path}")

    return parsed_content