Skip to content

Commit

Permalink
Merge pull request #67 from staru09/main
Browse files Browse the repository at this point in the history
feat: add pdfgpt to parse PDFs
  • Loading branch information
shreyashankar authored Oct 5, 2024
2 parents da282aa + 190c675 commit 197ca50
Show file tree
Hide file tree
Showing 2 changed files with 82 additions and 0 deletions.
52 changes: 52 additions & 0 deletions docetl/parsing_tools.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
import importlib
import io
import os
from gptpdf import parse_pdf
from typing import List, Optional

from litellm import transcription
Expand Down Expand Up @@ -397,6 +398,57 @@ def paddleocr_pdf_to_string(
return pdf_content


def gptpdf_to_string(
    input_path: str,
    output_path: str,
    doc_per_page: bool,
    gpt_model: str,
    api_key: str,
    base_url: str,
    verbose: bool,
) -> str:
    """
    Convert a PDF to markdown with gptpdf and write the result to a file.

    **Note: pip install gptpdf required**

    Args:
        input_path (str): Path to the input PDF file.
        output_path (str): Path where the extracted text will be written.
            Missing parent directories are created.
        doc_per_page (bool): Kept for backward compatibility. gptpdf returns
            a single merged markdown document and no per-page text, so this
            flag does not change the result.
        gpt_model (str): GPT model to be used for parsing.
        api_key (str): API key for GPT service.
        base_url (str): Base URL for the GPT service.
        verbose (bool): If True, will print additional information during parsing.

    Returns:
        str: The extracted markdown content (the same text that is written
        to ``output_path``).
    """
    # parse_pdf returns (markdown_content, image_paths). The second item is
    # the list of image files gptpdf cropped out of the PDF -- it is NOT
    # per-page text, so it must never be joined into the returned content.
    # NOTE(review): gptpdf drops its working files into output_dir, i.e. the
    # current working directory here -- confirm that is intended.
    parsed_content, image_paths = parse_pdf(
        pdf_path=input_path,
        output_dir="./",
        api_key=api_key,
        base_url=base_url,
        model=gpt_model,
        verbose=verbose,
    )

    # Both doc_per_page modes yield the merged markdown; see the docstring.
    content = parsed_content

    if verbose:
        print(f"Extracted {len(image_paths)} images from {input_path}")

    # Make sure the destination directory exists before opening the file.
    parent_dir = os.path.dirname(output_path)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)

    with open(output_path, "w", encoding="utf-8") as output_file:
        output_file.write(content)

    print(f"Extracted content has been written to {output_path}")

    return content

# Define a dictionary mapping function names to their corresponding functions


Expand Down
30 changes: 30 additions & 0 deletions tests/test_parsing_tools.py
Original file line number Diff line number Diff line change
Expand Up @@ -213,3 +213,33 @@ def test_paddleocr_pdf_to_string():
assert len(result) == 1

assert "have received the new bottles, please discard" in result[0]


# TODO: enable this test. It is wrapped in a string literal so it never runs;
# as written it appears to need a real API key (the value below is a placeholder).

'''def test_gptpdf_to_string():
input_pdf = "tests/data/sample_test.pdf"
output_txt = "tests/output/sample_output.txt"
result = parsing_tools.gptpdf_to_string(
input_path=input_pdf,
output_path=output_txt,
doc_per_page=True,
gpt_model="gpt-4o",
api_key="your_openai_api_key",
base_url="https://api.openai.com/v1",
verbose=True
)
assert isinstance(result, str), "The result should be a string."
assert os.path.exists(output_txt), "The output file should be created."
with open(output_txt, "r", encoding="utf-8") as f:
file_content = f.read()
assert file_content == result, "The content in the output file should match the returned string."
assert len(result) > 0, "The extracted content should not be empty."
print("All assertions passed!")
test_gptpdf_to_string()'''

0 comments on commit 197ca50

Please sign in to comment.