-
Notifications
You must be signed in to change notification settings - Fork 4
/
march_madness_models.py
307 lines (248 loc) · 12.1 KB
/
march_madness_models.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
import pandas as pd
import numpy as np
import march_madness_games as mmg
import random as rand
################# MODELS FOR OUR TOURNAMENTS #####################
# predictor high seed
class BasicPredictor(object):
    """Baseline predictor that always advances the first team it is given.

    NOTE(review): the file's comment calls this the "high seed" predictor,
    which presumably relies on callers passing the higher seed as ``team_1``;
    nothing in this class checks seeds.
    """

    def __init__(self):
        # Stateless: there is nothing to configure.
        pass

    def predict(self, team_1, team_2):
        """Return ``team_1`` unchanged; ``team_2`` is ignored."""
        winner = team_1
        return winner
# actual tournament results
class ActualTournament(object):
    """Predictor that replays recorded game results instead of modelling them.

    Parameters
    ----------
    data : table of played games, filtered with boolean masks on its
        "Wteam" and "Lteam" columns; "Wscore" and "Lscore" are read only
        when ``include_scoring_dif`` is True.
    include_scoring_dif : bool
        When True, ``predict`` returns ``(winner, margin)`` instead of just
        the winner.
    """

    def __init__(self, data, include_scoring_dif=False):
        self.include_scoring_dif = include_scoring_dif
        self.tourney = data

    def predict(self, team_1, team_2):
        """Look up the recorded game between the two teams.

        Returns the winning team exactly as it was passed in (``team_1`` or
        ``team_2``, not the int-cast id), or ``(winner, Wscore - Lscore)``
        when ``include_scoring_dif`` is set.  If there is not exactly one
        matching game in either direction, prints "Error" and returns -1.
        """
        # Cast once; the table is compared against integer team ids.
        first_id = int(team_1)
        second_id = int(team_2)
        games = self.tourney

        team_1_wins = games[(games["Wteam"] == first_id) & (games["Lteam"] == second_id)]
        team_2_wins = games[(games["Lteam"] == first_id) & (games["Wteam"] == second_id)]

        # Exactly one matching row is required to call a winner.
        if team_1_wins.shape[0] == 1:
            winning_team, game = team_1, team_1_wins
        elif team_2_wins.shape[0] == 1:
            winning_team, game = team_2, team_2_wins
        else:
            # Call form of print is valid on both Python 2 and Python 3.
            print("Error")
            return -1

        if self.include_scoring_dif:
            # Only touch the score columns when the margin is requested.
            scoring_dif = game["Wscore"] - game["Lscore"]
            return (winning_team, scoring_dif.values[0])
        return winning_team
# predictor using markov chain stationary distribution
class MarkovPredictor(object):
    """Predictor that compares two teams by their "pi_i" value.

    ``data`` is a lookup table with a "Team" column (integer ids) and a
    "pi_i" column; per the file's comment these are Markov-chain stationary
    distribution values.
    """

    def __init__(self, data):
        self.data = data

    def predict(self, team_1, team_2):
        """Return the int id of the team with the strictly larger "pi_i".

        Ties go to ``team_2``.  A team missing from the table raises
        IndexError from the ``.values[0]`` lookup.
        """
        first_id, second_id = int(team_1), int(team_2)
        table = self.data

        # Look up team_1 first, then team_2, as a (first, second) pair.
        pi_values = [
            table.loc[table["Team"] == team_id, "pi_i"].values[0]
            for team_id in (first_id, second_id)
        ]

        return first_id if pi_values[0] > pi_values[1] else second_id
# MODEL PREDICTOR ------------------------------------------------------------------------
# however, you are able to do some biasing of the predictions
# higher_seed_bias=False ----> if True, will predict higher seed (upset) with probability p + higher_seed_bias_delta
# higher_seed_bias_delta=.075 ----> tuned to how much bias we want towards upsets/top seed winning
# we are also able to do "cooling" of our model ----> cooling corresponds to changing the bias depending on the round
# pass in a dict of the form {1:r1, 2:r2, 3:r3, 4:r4, 5:r5, 6:r6}
# when we bias the probability, we use p + higher_seed_bias_delta * r_i, where r_i is the factor for the current round
# we are also able to pass in other brackets and induce bias based on the similarity
# predictor using some model for predicting head to head games
class ModelPredictor(object):
    """Head-to-head predictor driven by a fitted probabilistic model.

    For each game the two team ids are ordered (lower id first), a feature
    row is built with ``mmg.get_predictors_dif``, scaled, and passed to
    ``model.predict_proba``.  Column 1 of the result is used as ``p_hat``,
    the probability that the lower-id team wins.  ``p_hat`` can then be
    nudged by two optional biases before the final pick is made.

    Parameters
    ----------
    model : object exposing ``predict_proba``.
    scaler : object exposing ``transform``.
    dfs_arr, year : forwarded unchanged to ``mmg.get_predictors_dif``.
    seeds_df : table with "Team" and "Seed" columns; characters 1-2 of the
        seed string are parsed as the numeric seed.
    simulation : if True, pick the lower-id team with probability ``p_hat``
        (biased coin flip); otherwise pick it only when ``p_hat > .5``.
    higher_seed_bias : if True, shift ``p_hat`` by the bias delta towards
        the team with the numerically larger seed (the underdog).
    higher_seed_bias_delta : size of that shift.
    cooling : optional mapping ``{round: factor}``; when given, the shift is
        ``higher_seed_bias_delta * cooling.get(round)``.  A round missing
        from the mapping yields ``None`` and the multiplication fails.
    other_bracket_arr : optional sequence of other brackets, each indexed
        positionally by game and exposing a "Prediction" entry.  ``None``
        (the default) means no other brackets.
    other_bracket_bias_delta : shift applied away from the team most of the
        other brackets picked.
    """

    # Number of games in rounds 1..6 of a 64-team bracket.
    _GAMES_PER_ROUND = (32, 16, 8, 4, 2, 1)

    def __init__(self,
                 model,
                 scaler,
                 dfs_arr,
                 year,
                 seeds_df,
                 simulation=False,
                 higher_seed_bias=False,
                 higher_seed_bias_delta=.075,
                 cooling=None,
                 other_bracket_arr=None,
                 other_bracket_bias_delta=.1):
        self.model = model
        self.dfs_arr = dfs_arr
        self.year = year
        self.simulation = simulation
        self.scaler = scaler
        self.seeds_df = seeds_df
        self.higher_seed_bias = higher_seed_bias
        self.higher_seed_bias_delta = higher_seed_bias_delta
        self.cooling = cooling
        # A fresh list per instance: a literal [] default would be one
        # object shared by every predictor built without this argument.
        self.other_bracket_arr = [] if other_bracket_arr is None else other_bracket_arr
        self.other_bracket_bias_delta = other_bracket_bias_delta
        # Non-play-in games predicted so far; used to infer the round.
        self.game_count = 0

    def predict(self, team_1, team_2):
        """Predict the winner of one game and return its int team id.

        Has the side effect of advancing ``game_count`` for every game that
        is not a play-in game.
        """
        team_1 = int(team_1)
        team_2 = int(team_2)

        # The model is always queried as (lower id, higher id).
        min_index_team = min(team_1, team_2)
        max_index_team = max(team_1, team_2)

        row = mmg.get_predictors_dif(min_index_team, max_index_team, self.year, self.dfs_arr)

        # Probability that the lower-id team wins under the model.
        p_hat = self.model.predict_proba(self.scaler.transform(row.reshape(1, -1)))[0, 1]

        team_seeds = self.__get_seeds(min_index_team, max_index_team)
        min_index_seed, min_index_seed_str, max_index_seed, max_index_seed_str = team_seeds

        # The round must be read before the counter moves past this game.
        cur_round = self.__get_cur_round()
        self.__update_game_count(min_index_seed_str, max_index_seed_str)

        if self.higher_seed_bias:
            if self.cooling is None:
                bias_delta = self.higher_seed_bias_delta
            else:
                # Scale the bias by the per-round cooling factor.
                cooling_factor = self.cooling.get(cur_round)
                bias_delta = self.higher_seed_bias_delta * cooling_factor
            p_hat = self.__bias_p_hat_upset(p_hat, bias_delta, min_index_seed, max_index_seed)

        if len(self.other_bracket_arr) != 0:
            p_hat = self.__bias_p_hat_dif(p_hat, min_index_team, max_index_team)

        # Deterministic pick, or biased coin flip when simulating.
        return self.__make_prediction(p_hat, self.simulation, min_index_team, max_index_team)

    def __get_seeds(self, team_1, team_2):
        """Return ``(seed_1, seed_1_str, seed_2, seed_2_str)`` for two teams."""
        seeds = self.seeds_df
        team_1_seed_str = seeds.loc[seeds["Team"] == team_1, "Seed"].values[0]
        team_2_seed_str = seeds.loc[seeds["Team"] == team_2, "Seed"].values[0]

        # Characters 1-2 hold the numeric seed used for comparison.
        team_1_seed = int(team_1_seed_str[1:3])
        team_2_seed = int(team_2_seed_str[1:3])

        return team_1_seed, team_1_seed_str, team_2_seed, team_2_seed_str

    def __check_playin_game(self, team_1_seed_str, team_2_seed_str):
        """Return True when both seed strings are 4 characters long.

        That length is what this class treats as marking a play-in game.
        """
        return len(team_1_seed_str) == 4 and len(team_2_seed_str) == 4

    def __get_cur_round(self):
        """Return the round (1-6) implied by ``game_count``.

        Once 63 games have been counted, prints a diagnostic and returns
        None.
        """
        games_through_round = 0
        for round_number, games_in_round in enumerate(self._GAMES_PER_ROUND, 1):
            games_through_round += games_in_round
            if self.game_count < games_through_round:
                return round_number

        # Call form of print is valid on both Python 2 and Python 3.
        print(self.game_count)
        print("issue with game count")
        return None

    def __update_game_count(self, min_index_seed_str, max_index_seed_str):
        """Advance ``game_count`` unless this is a play-in game."""
        if not self.__check_playin_game(min_index_seed_str, max_index_seed_str):
            self.game_count = self.game_count + 1

    def __bias_p_hat_upset(self, p_hat, bias, min_index_seed, max_index_seed):
        """Shift ``p_hat`` by ``bias`` towards the team with the larger seed number.

        Equal seeds leave ``p_hat`` unchanged.
        """
        if min_index_seed < max_index_seed:
            # Lower-id team is the favourite: pick the higher-id team more often.
            return p_hat - bias
        if max_index_seed < min_index_seed:
            # Higher-id team is the favourite: pick the lower-id team more often.
            return p_hat + bias
        return p_hat

    def __bias_p_hat_dif(self, p_hat, min_index_team, max_index_team):
        """Shift ``p_hat`` away from the team most other brackets picked.

        Each bracket's pick is read at position ``game_count - 1``.
        NOTE(review): ``game_count`` is not advanced for play-in games, so
        for those this reads the previous position (or the last row when
        the count is still 0) — confirm that is intended.
        """
        if len(self.other_bracket_arr) == 0:
            return p_hat

        # How many other brackets picked each of the two teams.
        min_index_picks = 0
        max_index_picks = 0
        for other_bracket in self.other_bracket_arr:
            prediction = int(other_bracket.iloc[self.game_count - 1]["Prediction"])
            if prediction == min_index_team:
                min_index_picks += 1
            elif prediction == max_index_team:
                max_index_picks += 1

        # No bracket picked either team: leave the model's estimate alone.
        if min_index_picks + max_index_picks == 0:
            return p_hat

        # Comparing counts is equivalent to comparing their shares of the total.
        if max_index_picks < min_index_picks:
            # Most brackets took the lower-id team: lean towards the higher-id team.
            return p_hat - self.other_bracket_bias_delta
        # Otherwise (including a tie) lean towards the lower-id team.
        return p_hat + self.other_bracket_bias_delta

    def __make_prediction(self, p_hat, simulation, min_index_team, max_index_team):
        """Turn ``p_hat`` into a pick between the two teams."""
        if simulation:
            # Pick the lower-id team with probability p_hat.
            random_unif = rand.uniform(0, 1)
            return min_index_team if random_unif <= p_hat else max_index_team

        # Deterministic: the lower-id team needs strictly more than 50%.
        return min_index_team if p_hat > .5 else max_index_team
# EXPECTED POINTS PREDICTOR ---------------------------------------------------------------------------------------------------
# predict based on expected points from the simulation
# looks up the expected number of points 2 teams will score,
# predicts arg_max(E[points_1], E[points_2])
class ExpectedPointsPredictor(object):
    """Predictor that picks whichever team has more expected points.

    ``points_df`` is a table indexed by integer team id with a
    "pred_points" column (expected points per team, per the file's comment
    taken from a simulation).
    """

    def __init__(self, points_df):
        self.points_df = points_df

    def predict(self, team_1, team_2):
        """Return the team with the strictly larger "pred_points".

        The winner is returned exactly as passed in (not int-cast); ties go
        to ``team_2``.  A team missing from the index raises IndexError.
        """
        frame = self.points_df

        # Resolve team_1 completely before touching team_2.
        first_rows = frame.index == int(team_1)
        first_points = frame.loc[first_rows, "pred_points"].values[0]

        second_rows = frame.index == int(team_2)
        second_points = frame.loc[second_rows, "pred_points"].values[0]

        if not first_points > second_points:
            return team_2
        return team_1