-
Notifications
You must be signed in to change notification settings - Fork 2
/
01.cohort_feat_xgb.py
92 lines (86 loc) · 2.81 KB
/
01.cohort_feat_xgb.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
#%% imports and definitions
import itertools as itt
import os
import warnings
warnings.simplefilter(action="ignore", category=FutureWarning)
import numpy as np
import pandas as pd
import plotly.express as px
from sklearn.model_selection import cross_validate
from tqdm.auto import tqdm
from xgboost import XGBClassifier
from routine.data_generation import generate_data
# Keyword arguments forwarded unchanged to `generate_data` on every run; the
# cohort variance is supplied separately for each run.
PARAM_DATA = {
    "num_users": 1000,
    "num_campaigns": 100,
    "samples_per_campaign": 10000,
    "num_cohort": 10,
    "fh_cohort": True,
    "even_cohort": True,
    "response_sig_a": 10,
    "cross_weight": None,
    "magnify_hf": 1,
}

# Keyword arguments forwarded unchanged to `XGBClassifier`.
PARAM_XGB = {
    "max_depth": 5,
    "learning_rate": 1,
    "objective": "binary:logistic",
    "eval_metric": "logloss",
    "use_label_encoder": False,
}

# Passed to `XGBClassifier` as `n_estimators`.
PARAM_NROUND = 30

# Cohort variances swept by the experiment: 12 evenly spaced values covering
# 0.05 through 0.6 (both endpoints included).
PARAM_VAR = np.linspace(start=0.05, stop=0.6, num=12)

# Names of the model-input variants that are compared against each other.
PARAM_COHORT = [
    "cohort id",
    "numerical features",
    "cohort id + numerical features",
]

# Number of repeated runs for every (variance, input variant) combination.
PARAM_NTRAIN = 10

# Font sizes applied to the figure layout.
PARAM_FONT_SZ = {
    "font_size": 16,
    "title_font_size": 24,
    "legend_title_font_size": 24,
}

# Output locations for the score table and the figure; both directories are
# created up front so later writes cannot fail on a missing folder.
OUT_RESULT_PATH = "./intermediate/cohort_feat_xgb"
FIG_PATH = "./figs/cohort_feat_xgb"
for _out_dir in (OUT_RESULT_PATH, FIG_PATH):
    os.makedirs(_out_dir, exist_ok=True)
#%% training
# For each cohort specification: the columns of the generated data that are fed
# to the model, and whether the "cohort" column is one-hot encoded first.
_FEATURE_SETS = {
    "cohort id": (["cohort", "camp_f0", "camp_f1"], True),
    "numerical features": (["user_f0", "user_f1", "camp_f0", "camp_f1"], False),
    "cohort id + numerical features": (
        ["cohort", "user_f0", "user_f1", "camp_f0", "camp_f1"],
        True,
    ),
}


def _build_features(data, cs):
    """Return the model input table for cohort specification ``cs``.

    Args:
        data: DataFrame holding at least the columns listed for ``cs`` in
            ``_FEATURE_SETS``.
        cs: Name of the cohort specification (a key of ``_FEATURE_SETS``).

    Returns:
        The selected columns of ``data``; when the specification includes the
        cohort id, the ``"cohort"`` column is replaced by dummy columns.

    Raises:
        ValueError: If ``cs`` is not a known specification. Without this check
            an unknown name would silently reuse the features built for the
            previous run.
    """
    try:
        feat_cols, one_hot = _FEATURE_SETS[cs]
    except KeyError:
        raise ValueError(
            f"unknown cohort specification {cs!r}; "
            f"expected one of {sorted(_FEATURE_SETS)}"
        ) from None
    features = data[feat_cols]
    if one_hot:
        features = pd.get_dummies(features, columns=["cohort"])
    return features


result_ls = []
# The grid is materialized as a list so the progress bar knows its total.
for cvar, cs, itrain in tqdm(
    list(itt.product(PARAM_VAR, PARAM_COHORT, range(PARAM_NTRAIN)))
):
    # A new dataset is generated for every run; only the sample table is used,
    # the two other returned tables are not needed here.
    data, _, _ = generate_data(cohort_variances=cvar, **PARAM_DATA)
    data_modified = _build_features(data, cs)
    model = XGBClassifier(n_estimators=PARAM_NROUND, **PARAM_XGB)
    # cross_validate is called with its default splitter and scorer.
    cv_scores = cross_validate(model, data_modified, data["response"])["test_score"]
    # One row per cross-validation split, tagged with the run's settings.
    result_ls.append(
        pd.DataFrame(
            {
                "cohort_variance": cvar,
                "cs": cs,
                "itrain": itrain,
                "cv": np.arange(len(cv_scores)),
                "score": cv_scores,
            }
        )
    )
result = pd.concat(result_ls, ignore_index=True)
result.to_csv(os.path.join(OUT_RESULT_PATH, "result.csv"), index=False)
#%% plot result
# Read the scores back from the CSV written by the training cell instead of
# relying on the in-memory `result`.
result = pd.read_csv(os.path.join(OUT_RESULT_PATH, "result.csv"))
fig = px.box(
    result,
    x="cohort_variance",
    y="score",
    color="cs",
    # Take the ordering from PARAM_COHORT so the legend cannot drift from the
    # specifications that were actually trained.
    category_orders={"cs": list(PARAM_COHORT)},
)
fig.update_layout(
    legend_title="Input to the model",
    xaxis_title="Cohort Variance",
    yaxis_title="CV Score",
    **PARAM_FONT_SZ,
)
fig.write_html(os.path.join(FIG_PATH, "scores.html"))