-
Notifications
You must be signed in to change notification settings - Fork 115
/
legistarparse.py
executable file
·76 lines (55 loc) · 2.04 KB
/
legistarparse.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
#!/usr/bin/env python3
# Parse a legistar PDF calendar page using pdfminer
# https://stackoverflow.com/a/59423919
from pdfminer.layout import LAParams
from pdfminer.converter import PDFPageAggregator
from pdfminer.pdfinterp import PDFResourceManager
from pdfminer.pdfinterp import PDFPageInterpreter
from pdfminer.pdfpage import PDFPage
from pdfminer.layout import LTTextBoxHorizontal
# For keeping a list sorted:
import bisect
# Known column start positions (x coordinates), shared module-level state.
# determine_column() adds entries as int(x0) (truncated, not rounded) using
# bisect.insort, so the list stays sorted in ascending order.
column_exes = []
def parse_pdf_file(filename):
    """Print every horizontal text box in a PDF with its column and y0.

    Each page is laid out with a default-constructed LAParams. For every
    LTTextBoxHorizontal in the layout, the box's text, the column value
    returned by determine_column(), and the box's y0 are printed. After
    the last page, the module-level column_exes list is printed.

    Args:
        filename: Path of the PDF file to read.
    """
    rsrcmgr = PDFResourceManager()
    laparams = LAParams()
    device = PDFPageAggregator(rsrcmgr, laparams=laparams)
    interpreter = PDFPageInterpreter(rsrcmgr, device)

    # The page loop must stay inside the 'with': pages are pulled from the
    # open stream as the loop runs. The context manager guarantees the
    # file is closed even if parsing raises.
    with open(filename, 'rb') as document:
        for page in PDFPage.get_pages(document):
            interpreter.process_page(page)
            layout = device.get_result()

            for element in layout:
                # Skip anything that is not a horizontal text box.
                if not isinstance(element, LTTextBoxHorizontal):
                    continue

                # An LTTextBoxHorizontal has get_text() plus the geometry
                # attributes x0, y0, x1, y1, width, height
                # and bbox: (x0, y0, x1, y1).
                col = determine_column(element)
                print(element.get_text(), col, element.y0)

    print("Columns:", column_exes)
# Column-matching tolerance: determine_column() treats a text box as
# belonging to an existing column when abs(x0 - column_x) is strictly
# less than this value. Same units as the text box's x0.
COLSLOP = 1.5
def determine_column(textbox: LTTextBoxHorizontal) -> int:
    """Return the column position that this textbox belongs to.

    The textbox's x0 is compared against each entry of column_exes in
    order; the first entry closer than COLSLOP is returned. Every entry
    that is checked and rejected is reported on stdout. If no entry
    matches, int(x0) is inserted into column_exes (keeping it sorted)
    and returned.
    """
    left_edge = textbox.x0

    for known_x in column_exes:
        distance = abs(left_edge - known_x)
        if distance < COLSLOP:
            return known_x
        print(left_edge, "too far from", known_x)

    # No existing column is close enough: register a new one.
    # NOTE(review): int() truncates rather than rounds -- confirm intended.
    new_column = int(left_edge)
    bisect.insort(column_exes, new_column)
    return new_column
if __name__ == '__main__':
    import sys

    # Exit with a usage message (on stderr, non-zero status) instead of an
    # IndexError traceback when no PDF path is given.
    if len(sys.argv) < 2:
        sys.exit(f"Usage: {sys.argv[0]} file.pdf")

    parse_pdf_file(sys.argv[1])