forked from e-p-armstrong/augmentoolkit
-
Notifications
You must be signed in to change notification settings - Fork 0
/
convert_pdf_to_text.py
55 lines (41 loc) · 1.71 KB
/
convert_pdf_to_text.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
# Written by GPT-4
import fitz # PyMuPDF
import pytesseract
from PIL import Image
import io
# Ensure the path to the tesseract executable is set if it's not in your PATH
# pytesseract.pytesseract.tesseract_cmd = r'<path_to_your_tesseract_executable>'
def convert_pdf_to_text(pdf_path, output_txt_path):
# Open the PDF file
document = fitz.open(pdf_path)
text = "" # Initialize a text string to hold all text from the PDF
for page_num in range(len(document)):
# Get the page
page = document.load_page(page_num)
# First, try to extract text using PyMuPDF
text_content = page.get_text()
if text_content.strip(): # If text is found, append it.
text += text_content
# else:
# # If no text is found, it might be an image-based PDF
# # Extract the image from the page
# for img_index, img in enumerate(page.get_images(full=True)):
# xref = img[0]
# base_image = document.extract_image(xref)
# image_bytes = base_image["image"]
# # Load it to PIL
# image = Image.open(io.BytesIO(image_bytes))
# # Use pytesseract to do OCR on the image
# text += pytesseract.image_to_string(image)
# Close the document
document.close()
# Write the text to a .txt file
with open(output_txt_path, "w", encoding="utf-8") as text_file:
text_file.write(text)
return text
# Usage
pdf_path = "./Introduction to Logic and Critical Thinking, by Matthew Van Cleave.pdf"
output_txt_path = (
"Introduction to Logic and Critical Thinking, by Matthew Van Cleave.txt"
)
convert_pdf_to_text(pdf_path, output_txt_path)