-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathgradientBoosting.py
101 lines (88 loc) · 2.85 KB
/
gradientBoosting.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
#!/usr/bin/python
# -*- coding: UTF-8 -*-
'手工实现梯度提升'
import numpy
import matplotlib.pyplot as plot
from sklearn import tree
from sklearn.tree import DecisionTreeRegressor # 决策树回归
from math import floor
import random
# 生成数据集
nPoints = 1000
xPlot = [(float(i) / float(nPoints) - 0.5) for i in range(nPoints + 1)]
x = [[s] for s in xPlot]
numpy.random.seed(1)
y = [s + numpy.random.normal(scale=0.1) for s in xPlot]
# 展示数据
plot.figure()
plot.plot(x, y)
plot.xlabel('x')
plot.ylabel('y')
plot.show()
# 采样
nSample = int(nPoints * 0.30) # 采样率为 30%
idxTest = random.sample(range(nPoints), nSample)
idxTest.sort()
idxTran = [idx for idx in range(nPoints) if not (idx in idxTest)]
# 测试集与训练集
xTrain = [x[r] for r in idxTran]
yTrain = [y[r] for r in idxTran]
xTest = [x[r] for r in idxTest]
yTest = [y[r] for r in idxTest]
# 参数设定
numTreeMax = 30; treeDepth = 5
# 模型数组
modelList = []
# 预测值
preList = []
# 步长控制
eps = 0.3
# 初始化残差
residuals = list(yTrain)
for iTree in range(numTreeMax):
modelList.append(DecisionTreeRegressor(max_depth=treeDepth))
# 使用训练集属性与残差对模型进行训练
modelList[-1].fit(xTrain, residuals)
# 使用本轮训练的模型对训练集进行预测
latestInSamplePrediction = modelList[-1].predict(xTrain)
# 更新残差,旧残差减去一定比例的本轮预测值
residuals = [residuals[i] - eps * latestInSamplePrediction[i] \
for i in range(len(residuals))]
# 样本外预测值
latestOutSamplePrediction = modelList[-1].predict(xTest)
preList.append(latestOutSamplePrediction)
# 均方误差
mse = []
allPredictions = []
for iModels in range(len(modelList)):
# 计算提升树模型的预测值须要将多轮迭代模型的结果进行求和
pred = []
for iPred in range(len(xTest)):
# 这里求和的原理可以看一下等比数列求和
# 不过仅当末项相对首项可忽略的情况下成立
pred.append(sum(
[preList[i][iPred] for i in range(iModels + 1)]
) * eps)
allPredictions.append(pred)
errors = [(yTest[i] - pred[i]) for i in range(len(yTest))]
mse.append(sum([e*e for e in errors]) / len(yTest))
nModels = [i + 1 for i in range(len(modelList))]
plot.plot(nModels, mse)
plot.axis('tight')
plot.xlabel(u'迭代次数')
plot.ylabel(u'均方误差')
plot.ylim((0.0, max(mse)))
plot.show()
plotList = [0, 7, 14]
lineType = [':', '-.', '--']
plot.figure()
for i in range(len(plotList)):
iPlot = plotList[i]
textLegend = str(iPlot) + u'轮迭代的预测值'
plot.plot(xTest, allPredictions[iPlot], label=textLegend, linestyle=lineType[i])
plot.plot(xTest, yTest, label=u'实际值', alpha=0.25)
plot.legend(bbox_to_anchor=(1,0.3))
plot.axis('tight')
plot.xlabel(u'x 值')
plot.xlabel(u'预测值')
plot.show()