-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathpdf.py
58 lines (45 loc) · 1.92 KB
/
pdf.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
# -*- coding: utf-8 -*-
from __future__ import unicode_literals
import io
import pdfminer
import pdfminer.pdfinterp
import pdfminer.converter
import pdfminer.layout
import pdfminer.pdfpage
def generate_pdf_page_text(filename):
    """Yield the text content of a PDF file, one page at a time.

    Args:
        filename: Path of the PDF file to read; it is opened in binary mode.

    Yields:
        The text extracted from each page, in the order the pages are
        produced by `pdfminer.pdfpage.PDFPage.get_pages`.

    The PDF file is closed and the converter device is released once the
    generator is exhausted, closed, or aborted by an exception.
    """
    resource_manager = pdfminer.pdfinterp.PDFResourceManager()

    # in-memory text buffer that the `TextConverter` (see below) writes the
    # decoded text content of each processed page into
    page_buffer = io.StringIO()

    # the `TextConverter` decodes the page content; the
    # `PDFPageInterpreter` drives the processing through this device
    device = pdfminer.converter.TextConverter(
        resource_manager,
        page_buffer,
        laparams=pdfminer.layout.LAParams(),
    )
    interpreter = pdfminer.pdfinterp.PDFPageInterpreter(
        resource_manager, device)

    try:
        # the context manager guarantees the file handle is closed even if
        # the consumer stops iterating early
        with open(filename, "rb") as fid:
            for page in pdfminer.pdfpage.PDFPage.get_pages(fid):
                # extract the text content of the current page into the buffer
                interpreter.process_page(page)
                text = page_buffer.getvalue()

                # Reset the buffer so the next iteration only yields its own
                # page. `truncate()` alone does not move the stream position,
                # so without the `seek(0)` the next write would start at the
                # old offset and the following page would be prefixed with
                # NUL padding.
                page_buffer.seek(0)
                page_buffer.truncate()

                yield text
    finally:
        device.close()