-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathparser.py
444 lines (366 loc) · 15.7 KB
/
parser.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
import re
import printer
"""
This class parses a file into constraint expressions and numerical values that
can then be passed into a solver for formulation
"""
class Parser:
# get the categories, applicants, goals, and requirements
# make sure that it is formatted correctly
def parse_file(self, fname):
printer.print_start_process("Parsing File")
# get cleaned up list of lines in file with line numbers
printer.print_update("Cleaning File")
contents = self.strip_contents(open(fname).readlines())
# get the set of lines where each header occurs
printer.print_update("Finding Headers")
header_locs = self.find_headers(contents)
# get the set of categories
printer.print_update("Getting Variables")
cats = self.parse_cats(
self.get_slice(contents, header_locs.get(self.CAT_H)))
# useful later on
self.setup_vars(cats)
# get the input values
printer.print_update("Getting Values")
vals = self.parse_vals(cats,
self.get_slice(contents, header_locs.get(self.VAL_H)))
# get the requirement and goal data
printer.print_update("Parsing Requirements")
reqs = self.parse_constraints(cats, vals,
self.get_slice(contents, header_locs.get(self.REQ_H)))
reqs += self.get_implicit_constraints(cats, vals)
printer.print_update("Parsing Goals")
goals = self.parse_constraints(cats, vals,
self.get_slice(contents, header_locs.get(self.GOAL_H)))
printer.print_start_process("Parsing File")
return (reqs, goals)
# useful for debugging
def print_data(self, data):
print("--------------------------")
for (new_num, (orig_num, line)) in enumerate(data):
print(str(orig_num) + ":\t" + str(new_num) + "\t" + line)
print("--------------------------")
# parse the categories into a list of (name, list)
def parse_cats(self, data):
cats = []
for el in data:
# split on the divider character (should be ":")
(cat, members_str) = self.divide(el)
# this happens if there was a syntax error
if cat is None:
continue
# turn the string into a list
members = members_str.split(self.SEP_CHAR)
cats.append((cat, members))
return cats
# parse the input values for the categories into a dict pointing to ints
def parse_vals(self, cats, data):
vals = {}
for el in data:
# split on te divide character (should be ":")
(lhs, rhs) = self.divide(el)
# this happens if there was a syntax error
if lhs is None:
continue
# parse the pattern
pattern = self.get_pattern(lhs)
# make sure this is a valid thing (i.e. not in[(zzx, 1)])
if not self.check_pattern(cats, pattern, req_all=True):
self.error(el[0], "Invalid membership: \"" + lhs + "\"")
pattern_str = self.pattern_to_string(cats, pattern)
if pattern_str in vals:
self.error(el[0], "Duplicate definition: \"" + lhs + "\"")
continue
try:
vals[pattern_str] = int(rhs)
except:
self.error(orig_num, "RHS must be a number: \"" + rhs + "\"")
return vals
# parses a set of constraints that can be either numeric or percentage
def parse_constraints(self, cats, vals, data):
consts = []
for el in data:
# split on the divider character (should be ":")
(kind, ineq) = self.divide(el)
# this happens if there was a syntax error
if kind is None:
continue
if kind == self.NUMERIC:
consts.append(self.parse_ineq_numeric(cats, vals, ineq))
elif kind == self.PERCENTAGE:
consts.append(self.parse_ineq_ratio(cats, vals, ineq))
else:
self.error(el[0],
"Invalid constraint type: \"" + kind + "\"\n\tAvailable "
"constraint types: " + self.CONSTRAINTS)
return consts
# require that all categories are upper bounded by applicants
def get_implicit_constraints(self, cats, vals):
imp_reqs = []
for pat in self.all_pats:
pat_str = "out"+self.pattern_to_string(cats, pat)
lhs = self.parse_lhs(cats, vals, pat_str)
rhs = self.get_val(cats, vals, pat)
imp_reqs.append((self.LESS, lhs, rhs))
return imp_reqs
# parses a ratio inequality
def parse_ineq_ratio(self, cats, vals, ineq):
# get what it would be without the division
(op, lhs, rhs) = self.parse_ineq_numeric(cats, vals, ineq)
# denominator is all values with weight 1
lhs_den = [1]*self.var_num
# cross multiply
rhs = self.weights_mult(lhs_den, -rhs)
# subtract weights (add because negated in previous line)
lhs_new = self.weights_add(lhs,rhs)
# will always have zero on rhs
return (op, lhs_new, 0)
# parses a numeric inequality
def parse_ineq_numeric(self, cats, vals, ineq):
(op, lhs, rhs) = self.split_on_operation(ineq)
lhs = self.parse_lhs(cats,vals,lhs)
rhs = self.parse_rhs(cats,vals,rhs)
return (op, lhs, rhs)
# weight all elements by a constant
def weights_mult(self, weights, a):
return [a*x for x in weights]
# add two weight sets together
def weights_add(self, w1, w2):
return [a+b for a, b in zip(w1,w2)]
# figures out the operation and returns the split on it
def split_on_operation(self, ineq):
m_op = None
for op in self.OPS:
if op in ineq:
m_op = op
sp = ineq.split(m_op)
(lhs, rhs) = self.divide((-1,ineq),div=m_op)
if lhs is None:
self.error(-1, "Invalid expression: \"" + ineq + "\"")
if m_op is None:
self.error(-1, "Invalid operation: \"" + ineq + "\"")
return (m_op, lhs, rhs)
# return a list of all decision variables associated with index
def setup_vars(self, cats):
self.var_inds = {}
self.all_pats = self.get_patterns_matching(cats, [])
for (i, pat) in enumerate(self.all_pats):
self.var_inds[self.pattern_to_string(cats, pat)] = i
self.var_num = len(self.all_pats)
# get the index of a variable based on its pattern
def get_var_index(self, cats, pat):
return self.var_inds[self.pattern_to_string(cats, pat)]
# look for all the output variables and their coefficients
# return a list of coefficients of every variable
def parse_lhs(self, cats, vals, s):
# regex magic
p = re.compile("(?:\d*\.\d+\*)?out\[(?:\w+=\w+,*)*\]")
# parse individual expressions
exprs = [self.parse_expr(x) for x in p.findall(s)]
# expand out expressions
weights = [0]*self.var_num
for expr in exprs:
if not self.check_pattern(cats, expr[1]): # make sure valid
self.error(-1, "Invalid membership: \"" + s + "\"")
matches = self.get_patterns_matching(cats, expr[1])
for match in matches:
weights[self.get_var_index(cats, match)] += expr[0]
return weights
# look for all the input variables and their coefficients
# return the numerical sum
# (the regex works, even if it's hard to read)
def parse_rhs(self, cats, vals, s):
# regex magic
p = re.compile("(?:\d*\.\d+\*)?in\[(?:\w+=\w+,*)*\]|(?:\d*\.?\d+)")
# parse individual expressions
exprs = [self.parse_expr(x) for x in p.findall(s)]
# get numeric value for rhs
for expr in exprs:
if expr[1] is not None and not self.check_pattern(cats,expr[1]):
self.error(-1, "Invalid membership: \"" + s + "\"")
# multiply and add
val = 0
for expr in exprs:
if expr[1] is None: # plain number value
val += expr[0]
else: # variable number value
val += expr[0]*self.get_val(cats,vals,expr[1])
return val
# parses an expression in the form of .75*in[...] into (.75, ...)
def parse_expr(self, s):
sp = s.split('*')
if len(sp) == 1: # either just a pattern or just a number
try:
f = float(sp[0])
return (f, None)
except:
return (1.0, self.get_pattern(sp[0]))
elif len(sp) == 2:
return (float(sp[0]), self.get_pattern(sp[1]))
else:
self.error(-1, "Illegal expression: \"" + s + "\"")
# will return the sum of all values matching the pattern
def get_val(self, cats, vals, pattern):
all_matches = self.get_patterns_matching(cats, pattern)
queries = [self.pattern_to_string(cats, x) for x in all_matches]
vals = [vals.get(x,0) for x in queries]
return sum(vals)
# just ensures that a string pattern has the proper format
def format_pattern(self, cats, s):
return self.pattern_to_string(cats, self.get_pattern(s))
# turn a string into a pattern (it works, but it doesn't detect errors
# too well - usually they will be caught later though)
def get_pattern(self, s):
s = s[s.find(self.OPEN_CHAR)+1:s.find(self.CLOSE_CHAR)].strip(self.SEP_CHAR)
if len(s) == 0: # check for empty sey of braces
return []
pattern = [x.split(self.EQ_CHAR) for x in s.split(self.SEP_CHAR)]
return pattern
# turn a pattern into an ordered string used for hashing
def pattern_to_string(self, cats, pattern):
s = self.OPEN_CHAR
for (cat_name, _) in cats:
for (cat, mem) in pattern:
if cat_name == cat:
s += cat + self.EQ_CHAR + mem + self.SEP_CHAR
return s.rstrip(self.SEP_CHAR) + self.CLOSE_CHAR
# return if this is a valid pattern based on the possible categories and
# member values. optionally require all categories to be represented
def check_pattern(self, cats, pattern, req_all=False):
# make sure the pattern has enough categories
if req_all and (len(pattern) != len(cats)):
return False
cat_names = [x[0] for x in cats]
mem_names = [x[1] for x in cats]
cs = [] # categories that have been seen so far
for cat, mem in pattern:
if cat in cs: # duplicate cat name
return False
if cat not in cat_names:
return False # invalid cat name
cat_idx = cat_names.index(cat)
if mem not in mem_names[cat_idx]: # invalid member name
return False
cs.append(cat)
return True
# takes pattern of the form "[(gender,male),(class_year,4)]", returns set
# of things that match this pattern
def get_patterns_matching(self, cats, pattern):
cat_names_pat = [x[0] for x in pattern]
mem_names_pat = [x[1] for x in pattern]
cat_names_rst = [x[0] for x in cats]
mem_names_rst = [x[1] for x in cats]
# remove categories that have already been accounted for in the pattern
for cat_name in cat_names_pat:
cat_idx = cat_names_rst.index(cat_name)
cat_names_rst.pop(cat_idx)
mem_names_rst.pop(cat_idx)
# at each step, add on all members to all matches
matches = [pattern]
for (i, cat_name) in enumerate(cat_names_rst):
new_matches = []
for match in matches:
for mem in mem_names_rst[i]:
new_matches.append(match + [(cat_name,mem)])
matches = new_matches
return matches
# prints out errors with references to original line numbers
# will quit if necessary
def error(self, line_num, text="Undefined Error", force_quit=True):
print("Error on line " + str(line_num+1) + ": " + text)
if force_quit:
print("Error was fatal, quitting...")
exit(0)
# divide a string into exactly two parts
def divide(self, el, div=":"):
(orig_num, line) = el
sp = line.split(div)
if len(sp) != 2:
self.error(orig_num, "Syntax Error")
return (None, None)
return sp
# gets a slice of the annotated lines array
def get_slice(self, contents, loc):
return contents[loc[0]:loc[1]]
# remove all comments, whitespace, and empty lines,
# annotate with line numbers
def strip_contents(self, lines):
ret = [] # new list of lines
for (num, line) in enumerate(lines):
# remove everything after the comment symbol
if self.COMMENT_CHAR in line:
line = line[:line.index(self.COMMENT_CHAR)]
# remove whitespace
line = "".join(line.split())
# don't add in empty lines
if line != "":
ret.append((num, line))
return ret
# get the start and end locations of each header, return as a dict
# does error checking along the way
def find_headers(self, contents):
# find all locations of headers in the file
header_locs = {}
# make sure first line is a header
if not contents[0][1].startswith(self.HEADER_CHAR):
self.error(contents[0][0], "Must start with header")
cur_header = None
cur_range = [None, None]
for (new_num, (orig_num, line)) in enumerate(contents):
if line.startswith(self.HEADER_CHAR):
h = line[1:] # get the header
if h not in self.HEADERS:
self.error(orig_num, "Illegal header: \"" + h + "\"")
else:
# start of range is 1 past the header
if cur_range[0] is None:
cur_header = h
cur_range[0] = new_num + 1
# end of the range is the next header
elif cur_range[1] is None:
cur_range[1] = new_num
header_locs[cur_header] = cur_range
cur_header = h
if h in header_locs:
self.error(orig_num, "Duplicate header: \"" + h + "\"")
cur_range = [cur_range[1] + 1, None]
# should not happen, but if it does, exit
else:
self.error(orig_num, force_quit=True)
header_locs[cur_header] = [cur_range[0], len(contents)]
return header_locs
# character that will begin a comment
COMMENT_CHAR = '#'
# character that will begin a header
HEADER_CHAR = '+'
# character that divides labels from data
DIV_CHAR = ':'
# character that separates list elements
SEP_CHAR = ','
# you get the idea
OPEN_CHAR = "["
CLOSE_CHAR = "]"
EQ_CHAR = "="
# header string for requirements
REQ_H = 'requirements'
# header string for goals
GOAL_H = 'goals'
# header string for categories
CAT_H = 'categories'
# header string for applicants
VAL_H = 'applicants'
# list of all valid headers
HEADERS = [REQ_H, GOAL_H, CAT_H, VAL_H]
# constraint types
NUMERIC = 'numeric'
PERCENTAGE = 'percentage'
CONSTRAINTS = ''
for c in [NUMERIC, PERCENTAGE]:
CONSTRAINTS += '\"' + c + '\" '
CONSTRAINTS = CONSTRAINTS.strip()
# list of all valid operations for constraints
LESS = "<="
GREATER = ">="
EQUAL = "=="
OPS = [LESS,GREATER,EQUAL]