From 85d3d2f43b9c8a68075a67b008eeb7d6500af778 Mon Sep 17 00:00:00 2001
From: staru09 <arusharmazxx000@gmail.com>
Date: Sat, 5 Oct 2024 12:18:39 +0530
Subject: [PATCH 1/2] gpt_pdf added

---
 docetl/parsing_tools.py     | 53 +++++++++++++++++++++++++++++++++++++
 tests/test_parsing_tools.py | 30 +++++++++++++++++++++
 2 files changed, 83 insertions(+)

diff --git a/docetl/parsing_tools.py b/docetl/parsing_tools.py
index 72cfdbf7..21ca4f92 100644
--- a/docetl/parsing_tools.py
+++ b/docetl/parsing_tools.py
@@ -1,6 +1,7 @@
 import importlib
 import io
 import os
+from gptpdf import parse_pdf
 from typing import List, Optional
 
 from litellm import transcription
@@ -378,6 +379,58 @@ def paddleocr_pdf_to_string(
     return pdf_content
 
 
+def gptpdf_to_string(
+    input_path: str,
+    output_path: str,
+    doc_per_page: bool,
+    gpt_model: str,
+    api_key: str,
+    base_url: str,
+    verbose: bool,
+) -> str:
+    """
+    Convert a PDF to markdown with gptpdf and write the result to a file.
+
+    **Note: pip install gptpdf required**
+
+    Args:
+        input_path (str): Path to the input PDF file.
+        output_path (str): Path where the extracted text will be written.
+        doc_per_page (bool): No effect; gptpdf returns one merged markdown document, not per-page text.
+        gpt_model (str): GPT model to be used for parsing.
+        api_key (str): API key for GPT service.
+        base_url (str): Base URL for the GPT service.
+        verbose (bool): If True, will print additional information during parsing.
+
+    Returns:
+        str: Extracted content as a string.
+    """
+    from gptpdf import parse_pdf
+
+    parsed_content, parsed_pages = parse_pdf(
+        pdf_path=input_path,
+        output_dir="./",
+        api_key=api_key,
+        base_url=base_url,
+        model=gpt_model,
+        verbose=verbose,
+    )
+    # gptpdf returns (markdown, image_paths); the second item is not page text,
+    # so the markdown is used whether or not doc_per_page is set.
+    content = parsed_content
+
+    if verbose:
+        print(f"Extracted {len(parsed_pages)} image region(s) from {input_path}")
+
+    # Create the destination folder so a fresh output path does not raise.
+    os.makedirs(os.path.dirname(output_path) or ".", exist_ok=True)
+    with open(output_path, "w", encoding="utf-8") as output_file:
+        output_file.write(content)
+
+    print(f"Extracted content has been written to {output_path}")
+
+    return content
+
 # Define a dictionary mapping function names to their corresponding functions
 
 
diff --git a/tests/test_parsing_tools.py b/tests/test_parsing_tools.py
index 56f10b3f..89274ad7 100644
--- a/tests/test_parsing_tools.py
+++ b/tests/test_parsing_tools.py
@@ -213,3 +213,33 @@ def test_paddleocr_pdf_to_string():
     assert len(result) == 1
 
     assert "have received the new bottles, please discard" in result[0]
+
+
+# TODO: enable test_gptpdf_to_string below; it is currently disabled by the surrounding string literal.
+
+'''def test_gptpdf_to_string():
+    input_pdf = "tests/data/sample_test.pdf"
+    output_txt = "tests/output/sample_output.txt"
+
+    result = parsing_tools.gptpdf_to_string(
+        input_path=input_pdf,
+        output_path=output_txt,
+        doc_per_page=True,
+        gpt_model="gpt-4o",
+        api_key="your_openai_api_key",
+        base_url="https://api.openai.com/v1",
+        verbose=True
+    )
+
+    assert isinstance(result, str), "The result should be a string."
+    assert os.path.exists(output_txt), "The output file should be created."
+
+    with open(output_txt, "r", encoding="utf-8") as f:
+        file_content = f.read()
+
+    assert file_content == result, "The content in the output file should match the returned string."
+    assert len(result) > 0, "The extracted content should not be empty."
+
+    print("All assertions passed!")
+
+test_gptpdf_to_string()'''

From 6f46a8483498d433c62aaf66dd7aee60ff2166cb Mon Sep 17 00:00:00 2001
From: Aru Sharma <70081536+staru09@users.noreply.github.com>
Date: Sat, 5 Oct 2024 12:33:42 +0530
Subject: [PATCH 2/2] Update parsing_tools.py

---
 docetl/parsing_tools.py | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/docetl/parsing_tools.py b/docetl/parsing_tools.py
index 21ca4f92..5c3c0330 100644
--- a/docetl/parsing_tools.py
+++ b/docetl/parsing_tools.py
@@ -405,8 +405,7 @@ def gptpdf_to_string(
     Returns:
         str: Extracted content as a string.
     """
-    from gptpdf import parse_pdf
-
+    
     parsed_content, parsed_pages = parse_pdf(
         pdf_path=input_path,
         output_dir="./",