# c_xgb_test.pyx
# distutils: language = c++
# cython: language_level=2

# required extern class constructors and functions from the C++ implementation
cdef extern from "c_xgb/c_xgb.cpp":
    cdef cppclass CXgboost:
        CXgboost()
        CXgboost(int depth, int n_features, int n_trees_, int objective_, float base_score_)
        float predict(float *features, int ntree_limit)
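# Note (inferred from the tests below, not confirmed by this file): the
# five-argument constructor takes the tree depth, feature count, number of
# trees, an objective code (0 = 'reg:linear', 1 = 'binary:logistic') and the
# base score; the C++ side is assumed to load the dumped trees from
# trees/tree_<i>.json.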

import os
import time

import numpy as np
import xgboost as xgb
from sklearn.datasets import make_regression, make_classification

def test_xgb_regression(n_samples = 10000, n_features = 20, n_estimators = 3, depth = 11):
    '''
    Correctness and performance test of C-xgb against XGBoost for regression.
    '''
    cdef float base_score = 0.5
    # create a random regression dataset
    x, y = make_regression(n_samples = n_samples, n_features = n_features,
                           n_informative = n_features - 1, random_state = 1)

    # train an XGBoost regressor on the random dataset
    def create_xgb():
        model = xgb.XGBRegressor(max_depth = depth, learning_rate = 0.1, n_estimators = n_estimators,
                                 silent = True, objective = 'reg:linear', n_jobs = 8, min_child_weight = 1,
                                 subsample = 0.8, colsample_bytree = 0.8, random_state = 5,
                                 missing = np.nan, base_score = base_score)
        model.fit(x, y)
        return model

    model = create_xgb()
    booster = model.get_booster()

    # dump the booster to JSON files, one file per tree
    tree_data = booster.get_dump(dump_format='json')
    if not os.path.exists("trees"):
        os.makedirs("trees")
    cdef int i
    for i in xrange(len(tree_data)):
        with open("trees/tree_%d.json" % i, 'w') as f:
            f.write(tree_data[i])

    # create an instance of the C++ CXgboost class;
    # objective code 0 means 'reg:linear'
    cdef CXgboost model_c = CXgboost(depth, n_features, n_estimators, 0, base_score)

    # x_cython is stack-allocated with a fixed size, so n_features may not exceed 20
    assert n_features <= 20
    cdef float x_cython[20]
    cdef float time_c_xgb = 0.0, time_xgb = 0.0
    cdef int j, q, N = 10

    # correctness and timing over all samples
    for i in xrange(n_samples):
        for j in xrange(n_features):
            x_cython[j] = x[i][j]
        reshaped_sample = x[i].reshape(1, n_features)
        preds_xgb = model.predict(reshaped_sample, ntree_limit = n_estimators)[0]  # XGBoost prediction
        preds_c_xgb = model_c.predict(x_cython, n_estimators)                      # C-xgb prediction
        # the two predictions must agree to within 1e-3
        assert abs(preds_xgb - preds_c_xgb) < 1e-3
        # time measurement for C-xgb
        start = time.time()
        for q in xrange(N):
            model_c.predict(x_cython, n_estimators)
        time_c_xgb += (time.time() - start)
        # time measurement for XGBoost
        start = time.time()
        for q in xrange(N):
            model.predict(reshaped_sample, ntree_limit = n_estimators)
        time_xgb += (time.time() - start)

    print 'n_samples = %d | n_estimators = %d | max_depth = %d | objective = %s' % (n_samples, n_estimators, depth, 'reg:linear')
    print "XGBoost total time: %f s" % time_xgb
    print "C-xgb total time: %f s" % time_c_xgb
    print "Speedup: %f times\n" % (time_xgb / time_c_xgb)

def test_xgb_logistic_regression(n_samples = 10000, n_features = 20, n_estimators = 3, depth = 11):
    '''
    Correctness and performance test of C-xgb against XGBoost for binary logistic regression.
    '''
    cdef float base_score = 0.5
    # create a random binary classification dataset
    x, y = make_classification(n_samples = n_samples, n_features = n_features,
                               n_informative = n_features - 1, n_redundant = 0,
                               n_classes = 2, random_state = 1)

    # train an XGBoost classifier on the dataset
    def create_xgb():
        model = xgb.XGBClassifier(max_depth = depth, learning_rate = 0.1, n_estimators = n_estimators,
                                  silent = True, objective = 'binary:logistic', n_jobs = 8, min_child_weight = 1,
                                  subsample = 0.8, colsample_bytree = 0.8, random_state = 5,
                                  missing = np.nan, base_score = base_score)
        model.fit(x, y)
        return model

    model = create_xgb()
    booster = model.get_booster()

    # dump the booster to JSON files, one file per tree
    tree_data = booster.get_dump(dump_format='json')
    if not os.path.exists("trees"):
        os.makedirs("trees")
    cdef int i
    for i in xrange(len(tree_data)):
        with open("trees/tree_%d.json" % i, 'w') as f:
            f.write(tree_data[i])

    # create an instance of the C++ CXgboost class;
    # objective code 1 means 'binary:logistic'
    cdef CXgboost model_c = CXgboost(depth, n_features, n_estimators, 1, base_score)

    # x_cython is stack-allocated with a fixed size, so n_features may not exceed 20
    assert n_features <= 20
    cdef float x_cython[20]
    cdef float time_c_xgb = 0.0, time_xgb = 0.0
    cdef int j, q, N = 10

    # correctness and timing over all samples
    for i in xrange(n_samples):
        for j in xrange(n_features):
            x_cython[j] = x[i][j]
        reshaped_sample = x[i].reshape(1, n_features)
        preds_xgb = model.predict_proba(reshaped_sample)[0][1]  # XGBoost probability of class 1
        preds_c_xgb = model_c.predict(x_cython, n_estimators)   # C-xgb prediction
        # the two predictions must agree to within 1e-3
        assert abs(preds_xgb - preds_c_xgb) < 1e-3
        # time measurement for C-xgb
        start = time.time()
        for q in xrange(N):
            model_c.predict(x_cython, n_estimators)
        time_c_xgb += (time.time() - start)
        # time measurement for XGBoost (predict_proba, to match the correctness check above)
        start = time.time()
        for q in xrange(N):
            model.predict_proba(reshaped_sample)
        time_xgb += (time.time() - start)

    print 'n_samples = %d | n_estimators = %d | max_depth = %d | objective = %s' % (n_samples, n_estimators, depth, 'binary:logistic')
    print "XGBoost total time: %f s" % time_xgb
    print "C-xgb total time: %f s" % time_c_xgb
    print "Speedup: %f times\n" % (time_xgb / time_c_xgb)