GBDT.py
import numpy as np
import pandas as pd
import progressbar  # required for the ProgressBar used in GBDT.__init__
import sys
sys.path.append("D:/Github/Machine-Learning-Basic-Codes")
sys.path.append("D:/Github/Machine-Learning-Basic-Codes/07Decision_Trees")

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

# The wildcard imports are assumed to provide SquareLoss, SotfMaxLoss,
# to_categorical, bar_widgets, print_confusion_matrix and visualization_clf.
from utils.visualize import *
from utils.tool_func import *
from Decision_Trees_Reg import Skylark_DecisionTreeRegressor

class GBDT(object):
    """Super class of GradientBoostingClassifier and GradientBoostingRegressor.
    Uses a collection of regression trees that are trained to predict the
    gradient of the loss function.

    Parameters:
    -----------
    n_estimators: int
        The number of regression trees that are used.
    learning_rate: float
        The step length taken when following the negative gradient during
        training.
    min_samples_split: int
        The minimum number of samples needed to make a split when building a tree.
    min_impurity: float
        The minimum impurity required to split the tree further.
    max_depth: int
        The maximum depth of a tree.
    regression: boolean
        True or False depending on whether we're doing regression or classification.
    """

    def __init__(self, n_estimators, learning_rate, min_samples_split,
                 min_impurity, max_depth, regression):
        self.n_estimators = n_estimators
        self.learning_rate = learning_rate
        self.min_samples_split = min_samples_split
        self.min_impurity = min_impurity
        self.max_depth = max_depth
        self.regression = regression
        # Progress bar shown during training
        self.bar = progressbar.ProgressBar(widgets=bar_widgets)

        self.loss = SquareLoss()
        if not self.regression:
            # Classification also uses regression trees: the residuals are
            # used to learn class probabilities.
            # ('SotfMaxLoss' is kept as spelled by the imported utils.)
            self.loss = SotfMaxLoss()

        self.trees = []
        for i in range(self.n_estimators):
            self.trees.append(Skylark_DecisionTreeRegressor(min_samples_split=self.min_samples_split,
                                                            min_impurity=self.min_impurity,
                                                            max_depth=self.max_depth))

    def fit(self, X, y):
        # Fit the first tree directly to the raw targets
        self.trees[0].fit(X, y)
        y_pred = self.trees[0].predict(X)
        for i in self.bar(range(1, self.n_estimators)):
            # Each later tree fits the gradient of the loss at the current
            # prediction; the prediction then steps along the negative gradient.
            gradient = self.loss.gradient(y, y_pred)
            self.trees[i].fit(X, gradient)
            y_pred -= np.multiply(self.learning_rate, self.trees[i].predict(X))
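
    # Worked example of one boosting step under square loss: with y = 3.0,
    # y_pred = 2.0 and learning_rate = 0.5, the gradient -(y - y_pred) = -1.0
    # becomes the fitting target for the next tree; a perfect tree predicts
    # -1.0, and y_pred -= 0.5 * (-1.0) moves the prediction to 2.5, halving
    # the residual. (Illustrative numbers only; assumes SquareLoss.gradient
    # returns -(y - y_pred), consistent with the update in fit/predict.)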

    def predict(self, X):
        y_pred = self.trees[0].predict(X)
        for i in range(1, self.n_estimators):
            y_pred -= np.multiply(self.learning_rate, self.trees[i].predict(X))

        if not self.regression:
            # Turn the raw scores into a probability distribution (softmax)
            y_pred = np.exp(y_pred) / np.expand_dims(np.sum(np.exp(y_pred), axis=1), axis=1)
            # Set the label to the class that maximizes probability
            y_pred = np.argmax(y_pred, axis=1)
        return y_pred
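
# Readout example for the classification branch above: raw scores of
# [2.0, 0.0] for a sample become softmax probabilities of roughly
# [0.88, 0.12], and np.argmax then assigns class 0.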

class Skylark_GBDT_Clf(GBDT):
    def __init__(self, n_estimators=200, learning_rate=.5, min_samples_split=2,
                 min_info_gain=1e-7, max_depth=2, debug=False):
        super(Skylark_GBDT_Clf, self).__init__(n_estimators=n_estimators,
                                               learning_rate=learning_rate,
                                               min_samples_split=min_samples_split,
                                               min_impurity=min_info_gain,
                                               max_depth=max_depth,
                                               regression=False)

    def fit(self, X, y):
        # One-hot encode the labels so each class gets its own residual channel
        y = to_categorical(y)
        super(Skylark_GBDT_Clf, self).fit(X, y)
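
# The class docstring also names a GradientBoostingRegressor counterpart.
# A minimal sketch of that subclass (hypothetical here; this file only
# defines the classifier) would simply pass regression=True:
class Skylark_GBDT_Reg(GBDT):
    def __init__(self, n_estimators=200, learning_rate=.5, min_samples_split=2,
                 min_impurity=1e-7, max_depth=4):
        super(Skylark_GBDT_Reg, self).__init__(n_estimators=n_estimators,
                                               learning_rate=learning_rate,
                                               min_samples_split=min_samples_split,
                                               min_impurity=min_impurity,
                                               max_depth=max_depth,
                                               regression=True)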

if __name__ == '__main__':
    use_api = False

    # Data Preprocessing
    dataset = pd.read_csv('./dataset/Social_Network_Ads.csv')
    X = dataset.iloc[:, [2, 3]].values
    Y = dataset.iloc[:, 4].values

    # Making the Dataset
    X_train, X_test, Y_train, Y_test = train_test_split(
        X, Y, test_size=0.25, random_state=0)

    # Feature Scaling
    sc = StandardScaler()
    X_train = sc.fit_transform(X_train.astype(np.float64))
    X_test = sc.transform(X_test.astype(np.float64))
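    # Note: tree-based models split on feature thresholds, so they are
    # insensitive to monotonic scaling; standardization here mainly keeps
    # the decision-boundary plots on a consistent scale.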

    if use_api:
        from sklearn.ensemble import GradientBoostingClassifier
        classifier = GradientBoostingClassifier(n_estimators=140, max_depth=2,
                                                min_samples_split=2, learning_rate=0.1)
        classifier.fit(X_train, Y_train)
    else:
        classifier = Skylark_GBDT_Clf()
        classifier.fit(X_train, Y_train)

    Y_pred = classifier.predict(X_test)

    # Making the Confusion Matrix
    print_confusion_matrix(
        Y_test, Y_pred, clf_name='Gradient Boost Decision Tree Classification')

    # Visualising the Training set results
    visualization_clf(X_train, Y_train, classifier,
                      clf_name='Gradient Boost Decision Tree Classification', set_name='Training')

    # Visualising the Test set results
    visualization_clf(X_test, Y_test, classifier,
                      clf_name='Gradient Boost Decision Tree Classification', set_name='Test')