-
Notifications
You must be signed in to change notification settings - Fork 0
/
predict.py
190 lines (125 loc) · 4.92 KB
/
predict.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
"""Predicting on the test set with the model"""
import pickle
import datetime as dt
import pandas as pd
import numpy as np
from create_data import build_train_period, build_dataset
from data_config import get_prev, get_day_number, get_column_name
from initial_data import (
sales,
get_days,
)
from linear_model import metrics, evaluate
def custom_round(float_value, threshold=0.5):
    """Round a value to a non-negative integer using a custom cut-off.

    Non-positive inputs always give 0. For positive inputs the value is
    truncated, and it is bumped up by one only when the fractional part
    exceeds ``threshold``.
    """
    if float_value <= 0:
        return 0
    whole = int(float_value)
    # Distance of the value above its truncated integer part.
    fraction = float_value - whole
    if fraction <= threshold:
        return whole
    return whole + 1
def find_thres(preds, real_values, threshold_range, metric_to_follow="mse"):
    """Brute-force search for the rounding threshold that minimizes a metric.

    Each candidate in ``threshold_range`` is used to round ``preds`` with
    ``custom_round``; the rounded predictions are scored against
    ``real_values`` via ``evaluate`` and the candidate with the lowest
    ``metric_to_follow`` value is kept. Ties go to the later candidate.

    Returns a ``(best_threshold, best_metrics)`` tuple. When
    ``threshold_range`` is empty the result is ``(0, {})``.
    """
    best_score = np.inf  # use -np.inf (and flip the comparison) for a maximizing task
    best_t = 0
    best_metrics = {}
    for t in threshold_range:
        rounded = [custom_round(x, t) for x in preds]
        current_metrics = evaluate(rounded, real_values, metrics)
        score = current_metrics[metric_to_follow]
        if score <= best_score:
            # Record the winning score; without this every candidate would
            # compare against infinity and the last threshold would always win.
            best_score = score
            best_metrics = current_metrics
            best_t = t
    return best_t, best_metrics
def predict_step_by_step(
    model, target_list, target_exists=True, threshold=False, save=False
):
    """Predict each target in ``target_list`` in order, one target at a time.

    For every target a test dataset is built with ``build_dataset``, the
    model predicts on the columns named by ``model.feature_name()``, and the
    predictions are merged (on index) into the returned frame as a new column.

    When ``threshold`` is truthy the raw predictions are rounded with
    ``custom_round`` using it as the cut-off. When ``target_exists`` is
    falsy the predictions are also merged into the module-level ``sales``
    frame. ``save`` is accepted but not used here.
    """
    global sales

    day_columns = set(get_days(sales))
    base_columns = sorted(set(sales.columns) - day_columns)

    # Accumulates one prediction column per target.
    preds_df = pd.DataFrame(index=sales.index)

    for target in target_list:
        target_col = get_column_name(target)
        df_test = build_dataset(
            base_columns=base_columns,
            target_list=[target],
            period=28,
            datasets_num=1,
            test=True,
        )
        features = df_test[model.feature_name()]
        preds = model.predict(features)
        if threshold:
            preds = [custom_round(value, threshold) for value in preds]

        target_preds = pd.DataFrame({target_col: preds}, index=df_test.index)
        preds_df = preds_df.merge(target_preds, left_index=True, right_index=True)

        if not target_exists:
            # No ground truth for this target: store the prediction in sales.
            # NOTE(review): presumably so later targets can use it as input;
            # confirm that build_dataset sees this rebound frame.
            sales = sales.merge(target_preds, left_index=True, right_index=True)

    return preds_df
def validate(preds_df):
    """Score each prediction column against the same-named ``sales`` column.

    Can be run before or after postprocessing. Collects the per-column
    results of ``evaluate`` for every metric in ``metrics``, prints the raw
    lists together with their mean and median, and returns the raw lists
    keyed by metric name.
    """
    all_metrics = {name: [] for name in metrics}

    for col in preds_df:
        column_scores = evaluate(preds_df[col], sales[col], metrics)
        for name, score in column_scores.items():
            all_metrics[name].append(score)

    mean_metrics = {}
    median_metrics = {}
    for name, scores in all_metrics.items():
        mean_metrics[name] = np.mean(scores)
        median_metrics[name] = np.median(scores)

    print(f"All metrics: {all_metrics}")
    print(f"Mean metrics: {mean_metrics}")
    print(f"Median metrics: {median_metrics}")
    return all_metrics
def postprocess_predictions(preds_df, choose_threshold=False):
    """Optionally round all predictions with a data-driven threshold.

    With ``choose_threshold`` falsy the frame is returned untouched.
    Otherwise ``find_thres`` is run per column against the matching column
    of the module-level ``sales`` frame, the median of the per-column
    thresholds is taken, and every cell is rounded with ``custom_round``
    using that median.
    """
    if not choose_threshold:
        return preds_df

    candidates = np.arange(0.1, 1, 0.05)
    thresholds = [
        find_thres(preds_df[col], sales[col], candidates)[0] for col in preds_df
    ]
    threshold_found = np.median(thresholds)
    print("Thresholds", thresholds)
    print(f"Applying threshold {threshold_found}")

    def _round_cell(value):
        return custom_round(value, threshold=threshold_found)

    return preds_df.applymap(_round_cell)
import lightgbm as lgb
from lgbm_regr import get_dt_str
from transfer_df import upload_pickled, usecols
if __name__ == "__main__":
    # ----- Validation workflow (currently disabled) -----
    # model = pickle.load(open("models/elastic_model_2020-05-04_20:25:51", "rb"))
    # model = lgb.Booster(model_file=r"models/booster_2020-05-24_10:48:36.txt")
    # val_list = list(range(1912, 1914))
    # preds_df = predict_step_by_step(
    #     model, target_list=val_list, target_exists=True, threshold=False
    # )
    # validate(preds_df)
    # validate(postprocess_predictions(preds_df, choose_threshold=True))
    # # val_list or target_list
    # preds_df = preds_df.rename(
    #     {f"d_{x}": f"F{num}" for num, x in enumerate(val_list, start=1)}, axis=1
    # )
    # preds_df.to_csv("preds.csv", sep=",", header=True, index=True)

    # ----- Submission workflow -----
    # The booster is loaded here, but the prediction steps that would use it
    # are commented out below.
    booster_file = r"models/lgbm_2020-06-02_21:39:29/booster_2020-06-03_06:01:27.txt"
    model = lgb.Booster(model_file=booster_file)

    # val_list = list(range(1914, 1914 + 28 + 1))
    # preds_df = predict_step_by_step(
    #     model, target_list=val_list, target_exists=False, threshold=False
    # )
    # # val_list or target_list
    # preds_df = preds_df.rename(
    #     {f"d_{x}": f"F{num}" for num, x in enumerate(val_list, start=1)}, axis=1
    # )
    # preds_df.to_csv(f"submission_{get_dt_str()}.csv", sep=",", header=True, index=True)

    # The only active step besides loading the model.
    upload_pickled()