forked from shashwatanand1801/Pdf-Search-Engine
-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathpdfReader.py
36 lines (28 loc) · 820 Bytes
/
pdfReader.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
import PyPDF2
import json
# Extract the text and document metadata of AU127-1.pdf, then save the text to
# AU127-1.txt and the text plus metadata to text.json.

# Everything that touches the PDF happens inside the `with` block so the file
# handle is closed even if parsing or text extraction raises.
with open('AU127-1.pdf', 'rb') as pdffileobj:
    # PdfReader / .pages / .metadata are the PyPDF2 names that belong with
    # extract_text(); the older PdfFileReader / numPages / getPage /
    # getDocumentInfo spellings are deprecated and rejected by PyPDF2 3.0.
    pdfreader = PyPDF2.PdfReader(pdffileobj)

    # Every page's text is preceded by a newline, so a non-empty document
    # yields a string that starts with "\n".
    text = "".join("\n" + page.extract_text() for page in pdfreader.pages)

    # The reader reports no metadata object at all when the PDF lacks a
    # document information dictionary; fall back to 'Unknown' for every field.
    metadata = pdfreader.metadata
    if metadata is None:
        pdf_metadata = {
            "Author": 'Unknown',
            "Title": 'Unknown',
            "CreationDate": 'Unknown',
        }
    else:
        pdf_metadata = {
            # Missing or empty values are reported as 'Unknown'.
            "Author": metadata.author or 'Unknown',
            "Title": metadata.title or 'Unknown',
            # The raw '/CreationDate' entry is stored as-is (not parsed).
            "CreationDate": (
                metadata['/CreationDate']
                if '/CreationDate' in metadata
                else 'Unknown'
            ),
        }

# Plain-text dump of the extracted content. An explicit UTF-8 encoding keeps
# the write from failing on characters the platform default cannot encode.
with open("AU127-1.txt", "w", encoding="utf-8") as file1:
    file1.write(text)

# Combined record: metadata plus the full extracted text.
data = {
    "Metadata": pdf_metadata,
    "Content": text,
}

with open("text.json", 'w') as f:
    json.dump(data, f, indent=4)

# Echo the metadata so the user can see what was found.
print("PDF Metadata:")
print(json.dumps(pdf_metadata, indent=4))