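"""parse_pdf.py

Extract rate tables from the Mutual of Omaha disability policy rate filing PDF.

For each page in the filing, the script reads the table with tabula, cleans up
columns that tabula merged or misread, relabels the columns with the policy's
elimination periods, names the table from text pulled off the page with
pdfplumber, and appends the rows to a per-table CSV file under ./Data.
"""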
import os
import sys
from io import StringIO

import pandas as pd
import pdfplumber
import tabula


class Pdf_Table_Extraction:
    def __init__(self, pdf) -> None:
        self.target_pdf = pdf
        # Elimination periods (in days) used as rate table column headers
        self.elimination_periods = ['30', '60', '90', '180', '365', '730']

    def _split_drop_columns(self, table):
        """Split merged columns, capture the raw header row, and drop non-numeric columns."""
        self.raw_header = ['Issue Age']
        drop_list = []
        col_index = 1
        while col_index < len(table.columns):
            # A space in the data row means tabula merged several columns: split and expand them
            if " " in str(table.iloc[1, col_index]):
                table = pd.concat([
                    table[table.columns[0:col_index]],
                    table.iloc[:, col_index].str.split(expand=True),
                    table[table.columns[col_index + 1:len(table.columns)]],
                ], axis=1)
            # Collect any header text present in the first row
            if not pd.isna(table.iloc[0, col_index]):
                self.raw_header.append(table.iloc[0, col_index])
            # Mark columns whose data row is empty or non-numeric for removal
            if pd.isna(table.iloc[1, col_index]):
                drop_list.append(col_index)
            else:
                try:
                    float(table.iloc[1, col_index])
                except (ValueError, TypeError):
                    drop_list.append(col_index)
            col_index += 1
        table = table.drop(table.columns[drop_list], axis=1)
        return table

    def _fix_column_header(self, table):
        """Rebuild the header: Issue Age, elimination-period columns, then any trailing raw headers."""
        new_header = ['Issue Age']
        col_width = len(table.columns)
        # Label the data columns with elimination periods
        for index in range(1, max(col_width - 2, 4)):
            new_header.append(self.elimination_periods[index - 1])
        # Fill any remaining columns from the end of the raw header captured earlier
        counter = len(new_header) - col_width
        while counter < 0:
            new_header.append(self.raw_header[counter])
            counter += 1
        table.columns = new_header
        table = table.drop([0], axis=0)  # drop the original header row
        return table

    def extract_table(self, target_page):
        """Read the table on the given page with tabula and return a cleaned DataFrame."""
        # Silence tabula warnings while the page is processed, then restore stderr
        orig_err = sys.stderr
        sys.stderr = StringIO()
        try:
            table = tabula.read_pdf(self.target_pdf, pages=target_page)[0]
            table = self._split_drop_columns(table)
            table = self._fix_column_header(table)
        finally:
            sys.stderr = orig_err
        return table.reset_index(drop=True)

    def extract_meta_data(self, target_page):
        """Build a table name from fixed lines of the page text (used as the CSV file name)."""
        with pdfplumber.open(self.target_pdf) as pdf:
            lines = pdf.pages[target_page - 1].extract_text().splitlines()
        # Lines 4 and 8 of the extracted page text identify the table
        return f'{lines[3]}~{lines[7]}'


if __name__ == '__main__':
    pte = Pdf_Table_Extraction('./Mutual of Omaha Filed Disability Policy - Rate Filing_20191029.pdf')
    os.makedirs('./Data', exist_ok=True)  # make sure the output directory exists
    for p in range(7, 72):
        table_name = pte.extract_meta_data(p)
        clean_table = pte.extract_table(p)
        print(f'Page {p}: {table_name}')
        file_path = f'./Data/{table_name}.csv'
        if not os.path.isfile(file_path):
            clean_table.to_csv(file_path)
        else:
            # The table already has a file: append rows without repeating the header
            clean_table.to_csv(file_path, mode='a', header=False)