Merge pull request #67 from staru09/main

feat: add pdfgpt to parse PDFs
ucbepic · Oct 5, 2024 · 197ca50 · 197ca50
2 parents da282aa + 190c675
commit 197ca50
Show file tree

Hide file tree

Showing 2 changed files with 82 additions and 0 deletions.
diff --git a/docetl/parsing_tools.py b/docetl/parsing_tools.py
@@ -1,6 +1,7 @@
 import importlib
 import io
 import os
+from gptpdf import parse_pdf
 from typing import List, Optional
 
 from litellm import transcription
@@ -397,6 +398,57 @@ def paddleocr_pdf_to_string(
     return pdf_content
 
 
+def gptpdf_to_string(
+    input_path: str,
+    output_path: str,
+    doc_per_page: bool,
+    gpt_model: str,
+    api_key: str,
+    base_url: str,
+    verbose: bool,
+) -> str:
+    """
+    Parse PDF using GPT to convert the content of a PDF to a markdown format and write it to an output file.
+
+    **Note: pip install gptpdf required**
+
+    Args:
+        input_path (str): Path to the input PDF file.
+        output_path (str): Path where the extracted text will be written.
+        doc_per_page (bool): If True, return a list of strings, one per page. If False, return a single string.
+        gpt_model (str): GPT model to be used for parsing.
+        api_key (str): API key for GPT service.
+        base_url (str): Base URL for the GPT service.
+        verbose (bool): If True, will print additional information during parsing.
+    
+    Returns:
+        str: Extracted content as a string.
+    """
+
+    parsed_content, parsed_pages = parse_pdf(
+        pdf_path=input_path,
+        output_dir="./",
+        api_key=api_key,
+        base_url=base_url,
+        model=gpt_model,
+        verbose=verbose
+    )
+
+    if doc_per_page:
+        content = "\n\n".join(parsed_pages)  
+    else:
+        content = parsed_content  
+
+    if verbose:
+        print(f"Parsed {len(parsed_pages)} pages from {input_path}")
+
+    with open(output_path, "w", encoding="utf-8") as output_file:
+        output_file.write(content)
+
+    print(f"Extracted content has been written to {output_path}")
+
+    return content 
+
 # Define a dictionary mapping function names to their corresponding functions
 
 

diff --git a/tests/test_parsing_tools.py b/tests/test_parsing_tools.py
@@ -213,3 +213,33 @@ def test_paddleocr_pdf_to_string():
     assert len(result) == 1
 
     assert "have received the new bottles, please discard" in result[0]
+
+
+# TODO: enable test_gptpdf_to_string.
+# The test below is wrapped in a string literal, so it is never defined or
+# collected. It passes a placeholder API key ("your_openai_api_key"), so it
+# presumably needs real credentials and network access before it can pass.
+# When enabling it, drop the trailing module-level call to
+# test_gptpdf_to_string(); otherwise the test would also run at import time.
+
+'''def test_gptpdf_to_string():
+    input_pdf = "tests/data/sample_test.pdf"
+    output_txt = "tests/output/sample_output.txt"
+
+    result = parsing_tools.gptpdf_to_string(
+        input_path=input_pdf,
+        output_path=output_txt,
+        doc_per_page=True,
+        gpt_model="gpt-4o",
+        api_key="your_openai_api_key",
+        base_url="https://api.openai.com/v1",
+        verbose=True
+    )
+
+    assert isinstance(result, str), "The result should be a string."
+    assert os.path.exists(output_txt), "The output file should be created."
+
+    with open(output_txt, "r", encoding="utf-8") as f:
+        file_content = f.read()
+
+    assert file_content == result, "The content in the output file should match the returned string."
+    assert len(result) > 0, "The extracted content should not be empty."
+
+    print("All assertions passed!")
+
+test_gptpdf_to_string()'''