This repository was archived by the owner on Jun 29, 2024. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 73
/
Copy pathTASK-4.py
93 lines (80 loc) · 3.93 KB
/
TASK-4.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
import PyPDF2
from PIL import Image
import os
def convert_pdf_to_text(pdf_path, text_output_path):
    """Extract the text of every page of a PDF and save it as a UTF-8 file.

    Pages are written in document order and each page's text is terminated
    by a newline, so the last line of one page never runs into the first
    line of the next. All pages are extracted before the output file is
    opened, so a PDF that fails part-way through does not leave a truncated
    text file behind.

    Failures are not raised to the caller: any exception is reported on
    stdout and the function returns normally.

    Args:
        pdf_path (str): Path to the input PDF file.
        text_output_path (str): Path to save the converted text file.
    """
    try:
        with open(pdf_path, 'rb') as pdf_file:
            pdf_reader = PyPDF2.PdfReader(pdf_file)
            # Extraction has to happen while the PDF is still open.
            # `or ""` guards against a page yielding no text result, which
            # would otherwise make write() fail with a TypeError.
            page_texts = [page.extract_text() or "" for page in pdf_reader.pages]

        with open(text_output_path, 'w', encoding='utf-8') as text_file:
            for page_text in page_texts:
                text_file.write(page_text)
                # Keep page boundaries visible in the output.
                if not page_text.endswith("\n"):
                    text_file.write("\n")

        print(f"PDF converted to text successfully. Text file saved at {text_output_path}")
    except Exception as e:
        # Deliberate best-effort boundary for this interactive script:
        # report the problem instead of crashing with a traceback.
        print(f"An error occurred: {e}")
def _save_pdf_image(image, output_stem):
    """Write a single PDF image XObject to disk, choosing the format by filter.

    Args:
        image: The image stream object taken from a page's ``/XObject``
            dictionary.
        output_stem (str): Destination path without extension; ``.png``,
            ``.jpg`` or ``.jp2`` is appended depending on the stream filter.
            Images with any other (or no) filter are skipped.
    """
    stream_filter = image['/Filter'] if '/Filter' in image else None
    # A one-element filter array such as [/FlateDecode] is handled like the
    # bare name; longer filter chains are not supported and are skipped.
    if isinstance(stream_filter, list):
        stream_filter = stream_filter[0] if len(stream_filter) == 1 else None

    if stream_filter == '/FlateDecode':
        size = (image['/Width'], image['/Height'])
        color_space = image['/ColorSpace'] if '/ColorSpace' in image else None
        mode = "RGB" if color_space == '/DeviceRGB' else "P"
        # get_data() returns the decoded pixel bytes; the raw `_data` of a
        # FlateDecode stream is still compressed and cannot be fed to Pillow.
        img = Image.frombytes(mode, size, image.get_data())
        img.save(output_stem + ".png")
    elif stream_filter == '/DCTDecode':
        # The stored bytes already are a complete JPEG file.
        with open(output_stem + ".jpg", "wb") as jpeg_file:
            jpeg_file.write(image._data)
    elif stream_filter == '/JPXDecode':
        # The stored bytes already are a complete JPEG 2000 file.
        with open(output_stem + ".jp2", "wb") as jp2_file:
            jp2_file.write(image._data)


def extract_images_from_pdf(pdf_path, image_output_folder):
    """Extract the embedded images of every page of a PDF into a folder.

    Files are named ``page<N>_<XObject name>`` with ``N`` starting at 1 and
    an extension derived from the image's stream filter. Pages that carry no
    ``/XObject`` resources are skipped instead of aborting the extraction.

    Failures are not raised to the caller: any exception is reported on
    stdout and the function returns normally.

    Args:
        pdf_path (str): Path to the input PDF file.
        image_output_folder (str): Folder to save the extracted images.
            The folder must already exist.
    """
    try:
        with open(pdf_path, 'rb') as pdf_file:
            pdf_reader = PyPDF2.PdfReader(pdf_file)
            for page_num, page in enumerate(pdf_reader.pages, start=1):
                if '/Resources' not in page:
                    continue
                resources = page['/Resources']
                if '/XObject' not in resources:
                    # Text-only page: nothing to extract here.
                    continue
                # get_object() is the snake_case API that goes with
                # PdfReader; the legacy getObject() spelling is deprecated.
                x_objects = resources['/XObject'].get_object()
                for name in x_objects:
                    candidate = x_objects[name]
                    if '/Subtype' not in candidate or candidate['/Subtype'] != '/Image':
                        continue
                    # name looks like "/Im0"; drop the leading slash.
                    output_stem = os.path.join(
                        image_output_folder, f"page{page_num}_{name[1:]}"
                    )
                    _save_pdf_image(candidate, output_stem)
        print(f"Images extracted successfully. Saved in {image_output_folder}")
    except Exception as e:
        # Deliberate best-effort boundary for this interactive script:
        # report the problem instead of crashing with a traceback.
        print(f"An error occurred: {e}")
def main():
    """Interactive entry point: prompt for paths and run the chosen action.

    Asks for the PDF path, an output folder and a menu choice, then either
    converts the PDF to ``converted_text.txt`` inside the output folder
    (choice ``1``) or extracts its images into an ``extracted_images``
    sub-folder (choice ``2``). Any other choice prints a hint and returns.
    """
    # Get input paths and output folder from user
    pdf_path = input("Enter the path to the PDF file: ")
    output_folder = input("Enter the output folder path: ")

    # An empty answer means "current directory": os.makedirs("") would
    # raise, while os.path.join("", name) below already resolves relative
    # to the working directory. exist_ok avoids the check-then-create race.
    if output_folder:
        os.makedirs(output_folder, exist_ok=True)

    # Choose conversion option (surrounding whitespace is ignored)
    choice = input("Choose an option:\n1. Convert PDF to text\n2. Extract images from PDF\nEnter your choice: ").strip()

    if choice == '1':
        # Convert PDF to text
        text_output_path = os.path.join(output_folder, "converted_text.txt")
        convert_pdf_to_text(pdf_path, text_output_path)
    elif choice == '2':
        # Extract images from PDF
        image_output_folder = os.path.join(output_folder, "extracted_images")
        os.makedirs(image_output_folder, exist_ok=True)
        extract_images_from_pdf(pdf_path, image_output_folder)
    else:
        print("Invalid choice. Please choose 1 or 2.")


# Run the interactive tool only when executed as a script, not on import.
if __name__ == "__main__":
    main()