Created Python Script to Extract text from a PDF

devikabhapkar · web-flow · commit f419be2d9ddc · 2022-10-13T20:32:21.000+05:30
diff --git a/extract_text_from_pdf/extract_text_from_pdf.py b/extract_text_from_pdf/extract_text_from_pdf.py
@@ -0,0 +1,19 @@
+# import module PyPDF2
+import PyPDF2
+# put 'example.pdf' in working directory
+# and open it in read binary mode
+pdfFileObj = open('example.pdf', 'rb')
+# call and store PdfFileReader
+# object in pdfReader
+pdfReader = PyPDF2.PdfFileReader(pdfFileObj)
+# to print the total number of pages in pdf
+# print(pdfReader.numPages)
+# get specific page of pdf by passing
+# number since it stores pages in list
+# to access first page pass 0
+pageObj = pdfReader.getPage(0)
+# extract the page object
+# by extractText() function
+texts = pageObj.extractText()
+# print the extracted texts
+print(texts)