-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathparse-pdf.py
54 lines (43 loc) · 1.68 KB
/
parse-pdf.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
import PyPDF2
import nltk
# nltk.download('punkt')
from nltk.tokenize import word_tokenize, sent_tokenize
import time
# Location of the slide deck whose per-page titles are collected.
PDF_PATH = "C:\\Users\\ASUS\\OneDrive\\BOOK\\AWS-SAA\\AWS Certified Solutions Architect Slides v13 Parse.pdf"

# Watermark text removed from the first sentence before the title is taken.
WATERMARK = "© Stephane MaarekNOT FOR DISTRIBUTION © Stephane Maarek www.datacumulus.com"


def extract_title(text):
    """Return the title portion of one page's extracted text.

    Only the first line of ``text`` is considered. That line is split into
    sentences, the watermark is removed from the first sentence, and
    everything before the first bullet character ("•") is returned.

    Args:
        text: Text extracted from a single PDF page. May be empty or None
            when the page has no extractable text.

    Returns:
        The title string, or "" when the page yields no usable first line.
    """
    # Pages without extractable text would otherwise fail on the
    # ``sentences[0]`` lookup below and abort the whole run.
    if not text:
        return ""
    first_line = text.split('\n')[0]
    if not first_line:
        return ""
    # Tokenize the first line into sentences
    sentences = sent_tokenize(first_line)
    if not sentences:
        return ""
    first_sentence = sentences[0].replace(WATERMARK, "")
    return first_sentence.split("•")[0]


def main():
    """Collect the title of every page in PDF_PATH and write them to a file.

    Prints progress per page, prints the assembled summary, then appends it
    to ``../output/summary_<epoch>.txt`` (path relative to the current
    working directory).
    """
    titles = []
    # The reader pulls page data from the file on demand, so the file has to
    # stay open for the whole loop; ``with`` guarantees it is closed after.
    with open(PDF_PATH, 'rb') as pdf_file:
        pdf_reader = PyPDF2.PdfReader(pdf_file)
        # Loop through each page in the PDF file
        for page_num, page in enumerate(pdf_reader.pages):
            text = page.extract_text()
            print("Working on page_num: ", page_num)
            # A page with no title still contributes an (empty) line, so the
            # n-th output line always corresponds to the n-th page.
            titles.append(extract_title(text))

    summary = "".join(title + '\n' for title in titles)

    # Get the current time in epoch format
    epoch_time = int(time.time())
    # Print the summary
    print(summary)
    print("Parsing completed! Writing output to the file")
    OUT_PUT_FILE = f"../output/summary_{epoch_time}.txt"
    with open(OUT_PUT_FILE, 'a', encoding="utf-8") as file:
        file.write(summary)


if __name__ == "__main__":
    main()