-
Notifications
You must be signed in to change notification settings - Fork 3
/
Copy pathtrain.py
64 lines (50 loc) · 1.73 KB
/
train.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
import argparse
import os
import pandas as pd
import pickle
import time
from sklearn.linear_model import Ridge, LogisticRegression
from sklearn.preprocessing import StandardScaler
import lightgbm as lgb
from sklearn import model_selection
from utils import transform_datetime_features
from sdsj_feat import load_data
# Time budget for the run, read from the TIME_LIMIT environment variable and
# falling back to 5 * 60 when it is unset (presumably seconds -- confirm with
# the runner). Intended for stopping the algorithm before the limit is
# exceeded; nothing in the visible part of this script reads it yet.
_DEFAULT_TIME_LIMIT = 5 * 60
TIME_LIMIT = int(os.getenv('TIME_LIMIT', _DEFAULT_TIME_LIMIT))

# Presumably the cardinality cut-off for one-hot encoding a column; it is not
# referenced in the visible part of this script -- verify against the feature code.
ONEHOT_MAX_UNIQUE_VALUES = 20
if __name__ == '__main__':
    # Script entry point: fit a LightGBM model on the CSV given by --train-csv
    # and pickle it, together with the config returned by load_data(), to
    # <model-dir>/model_config.pkl.
    #
    # Arguments (all required):
    #   --train-csv  path handed to load_data()
    #   --model-dir  directory that receives model_config.pkl
    #   --mode       'classification' or 'regression'; selects the objective
    parser = argparse.ArgumentParser()
    parser.add_argument('--train-csv', required=True)
    parser.add_argument('--model-dir', required=True)
    parser.add_argument('--mode', choices=['classification', 'regression'], required=True)
    args = parser.parse_args()

    start_time = time.time()

    # Create the output directory up front. Without this, a missing directory
    # is only discovered by open() after the whole training run has finished,
    # and the trained model is lost.
    os.makedirs(args.model_dir, exist_ok=True)

    # load_data() returns a 4-tuple; the last element is not needed here.
    df_X, df_y, model_config, _ = load_data(args.train_csv)
    model_config['mode'] = args.mode

    params = {
        'task': 'train',
        'boosting_type': 'gbdt',
        'objective': 'regression' if args.mode == 'regression' else 'binary',
        # NOTE(review): 'rmse' is also set for the binary objective. No
        # validation set is passed to lgb.train() below, so the metric should
        # not influence the fitted model -- revisit if evaluation is added.
        'metric': 'rmse',
        'learning_rate': 0.01,
        'num_leaves': 200,
        'feature_fraction': 0.70,
        'bagging_fraction': 0.70,
        'bagging_freq': 4,
        'max_depth': -1,
        'verbosity': -1,
        'reg_alpha': 0.3,
        'reg_lambda': 0.1,
        'min_child_weight': 10,
        'zero_as_missing': True,
        'num_threads': 4,
        # Fixed seed so repeated runs use the same random state.
        'seed': 1,
    }

    train_set = lgb.Dataset(df_X, label=df_y)
    model = lgb.train(params, train_set, num_boost_round=600)

    # Persist the booster and the exact parameters next to the config from
    # load_data() so everything needed later lives in one pickle.
    model_config['model'] = model
    model_config['params'] = params

    model_config_filename = os.path.join(args.model_dir, 'model_config.pkl')
    with open(model_config_filename, 'wb') as fout:
        pickle.dump(model_config, fout, protocol=pickle.HIGHEST_PROTOCOL)

    print('Train time: {}'.format(time.time() - start_time))