-
Notifications
You must be signed in to change notification settings - Fork 1
/
pdftoics.py
146 lines (97 loc) · 3.37 KB
/
pdftoics.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import re
from operator import attrgetter
from constantes import DAYS
from BeautifulSoup import BeautifulSoup
class Course(object):
    """A positioned piece of text, as filled in from one ``<text>`` element.

    Attributes:
        content: Text of the fragment (a single space by default).
        top: Vertical coordinate of the fragment.
        left: Horizontal coordinate of the fragment.
        width: Width of the fragment.
        height: Height of the fragment.
        font: Numeric font identifier of the fragment.

    All fields can still be assigned after construction; the keyword
    arguments only save the caller from doing so one by one.
    """

    def __init__(self, content=" ", top=0, left=0, width=0, height=0,
                 font=0):
        self.content = content
        self.top = top
        self.left = left
        self.width = width
        self.height = height
        self.font = font

    @property
    def h_center(self):
        """Horizontal midpoint: ``left`` plus half of ``width``."""
        # NOTE: with integer operands "/" floors under Python 2.
        return self.left + self.width / 2

    @property
    def v_center(self):
        """Vertical midpoint: ``top`` plus half of ``height``."""
        # NOTE: with integer operands "/" floors under Python 2.
        return self.top + self.height / 2

    def __repr__(self):
        # Show the text so a list of blocks is readable when debugging.
        return "<Course %r>" % (self.content,)
def striphtml(data):
    """Return ``data`` with every ``<...>`` span removed.

    The match is non-greedy, so each span ends at the first ``>`` that
    follows its ``<``. The dot does not match a newline, so a ``<`` and
    ``>`` on different lines are left alone.
    """
    return re.sub(r'<.*?>', '', data)
def xml_to_blocks(xml):
    """Turn lines of XML into a list of ``Course`` blocks.

    Args:
        xml: Iterable of strings, examined one at a time (for example an
            open file). Presumably the output of a PDF-to-XML converter;
            verify against the caller.

    Returns:
        A list with one ``Course`` per line that holds a ``<text>``
        element, in input order. ``content`` is the element's text;
        ``top``, ``left``, ``width``, ``height`` and ``font`` are the
        same-named attributes converted with ``int``.

    Raises:
        ValueError: If one of those attributes is not an integer. The
            error BeautifulSoup raises for a missing attribute also
            propagates unchanged.
    """
    blocks = []
    for line in xml:
        # Cheap substring pre-filter so most lines are never parsed.
        if "text" not in line:
            continue
        node = BeautifulSoup(line).find("text")
        if node is None:
            # The word "text" appeared, but not as a <text> element.
            continue
        c = Course()
        c.content = node.text
        c.top = int(node['top'])
        c.left = int(node['left'])
        c.width = int(node['width'])
        c.height = int(node['height'])
        c.font = int(node['font'])
        blocks.append(c)
    return blocks
def _closest_index(anchors, center, target):
    """Return the index of the anchor whose ``center`` attribute is nearest ``target``."""
    read = attrgetter(center)
    # min() keeps the first of several equally near anchors.
    return min(range(len(anchors)),
               key=lambda i: abs(read(anchors[i]) - target))


def blocks_to_matrix(blocks):
    """Place the course blocks of one table into a day-by-slot grid.

    ``blocks`` is read positionally:

    * the first 5 blocks are the day headers (ordered here by ``left``),
    * the last block supplies the table name,
    * the 9 blocks before it are the time labels (ordered by ``top``),
    * everything in between is a course.

    Each course goes into the cell of the day whose ``h_center`` and the
    time label whose ``v_center`` are nearest to its own. If two courses
    land on the same cell, the later one replaces the earlier one.

    Args:
        blocks: Sequence of objects exposing ``content``, ``left``,
            ``top``, ``h_center`` and ``v_center``.

    Returns:
        A ``(name, matrix)`` tuple where ``matrix`` is a list of 5 rows
        (days) of 9 cells (time slots), each cell being the course text
        or ``None``.

    Raises:
        IndexError: If ``blocks`` is empty.
    """
    n_days, n_slots = 5, 9
    matrix = [[None] * n_slots for _ in range(n_days)]

    days = sorted(blocks[:n_days], key=attrgetter('left'))
    # The very last block is the name, so the labels end one before it.
    times = sorted(blocks[-(n_slots + 1):-1], key=attrgetter('top'))
    courses = blocks[n_days:-(n_slots + 1)]
    name = blocks[-1].content

    for course in courses:
        # No distance cap: the nearest header is used however far it is.
        day = _closest_index(days, 'h_center', course.h_center)
        slot = _closest_index(times, 'v_center', course.v_center)
        matrix[day][slot] = course.content
    return name, matrix
def blocks_to_matrix_dict(blocks):
    """Split a flat list of blocks into named timetable matrices.

    Every block first has its ``content`` rewritten in place: it is
    passed through ``striphtml`` and then stripped of surrounding
    whitespace. Blocks whose ``font`` is not positive, or whose content
    is then empty, are dropped.

    A block with ``font == 3`` closes a table: the run of kept blocks up
    to and including it is handed to ``blocks_to_matrix``, and the
    returned matrix is stored under the returned name. Blocks after the
    last such block are ignored, and a repeated name overwrites the
    earlier entry.

    Args:
        blocks: Iterable of objects with ``content`` and ``font``.

    Returns:
        A dict mapping table name to matrix.
    """
    # Clean every block before filtering, so dropped blocks are
    # rewritten as well.
    cleaned = []
    for block in blocks:
        block.content = striphtml(block.content).strip()
        cleaned.append(block)

    usable = [b for b in cleaned if b.font > 0 and b.content != '']

    tables = {}
    segment_start = 0
    for index, block in enumerate(usable):
        if block.font != 3:
            continue
        name, matrix = blocks_to_matrix(usable[segment_start:index + 1])
        tables[name] = matrix
        segment_start = index + 1
    return tables
def pp_group(matrix_dict, group):
    """Print the timetable stored under ``group``, one day after another.

    Each day starts with a ``=== <day> ===`` header taken from ``DAYS``,
    followed by one line per slot: the slot's value, or ``----`` when
    the value is falsy.

    Args:
        matrix_dict: Mapping from group name to a matrix (a sequence of
            per-day sequences).
        group: Key to look up in ``matrix_dict``.

    Raises:
        ValueError: If ``group`` is not a key of ``matrix_dict``.
    """
    if group not in matrix_dict:
        raise ValueError('Not in matrix_dict, sorry')

    # A single parenthesised argument prints the same text under the
    # Python 2 print statement used by the rest of this file.
    for day, slots in enumerate(matrix_dict[group]):
        print('=== {} ==='.format(DAYS[day]))
        for entry in slots:
            print(entry if entry else '----')
if __name__ == '__main__':
    # Script entry point: parse the XML and build the per-group matrices.
    # The input is read from the file named by the first command-line
    # argument, or from standard input when no argument is given.
    # NOTE(review): the original referenced an undefined name `xml`
    # here; the input source is an assumption - confirm.
    import sys

    if len(sys.argv) > 1:
        with open(sys.argv[1]) as xml:
            blocks = xml_to_blocks(xml)
    else:
        blocks = xml_to_blocks(sys.stdin)
    matrix_dict = blocks_to_matrix_dict(blocks)
    #matrix_to_ics(matrix_dict, begin_date, end_date)
    #print matrix_dict
    #pp_group(matrix_dict, '1GIR-132')