Random Forest.R
#install.packages("adabag")
#randomForest
#install.packages("randomForest")
library(randomForest)
library(caret)    #createDataPartition() used below comes from caret
data<-read.table("/Users/chenchuandong/Desktop/german.csv",
                 header = TRUE, sep = ",", dec=".")
#View(data)
attach(data)
summary(data)
colnames(data)
nrow(data)
head(data)
prop.table(table(data$target))
#recode the 0/1 target as a two-level factor so randomForest() fits a classification forest
target<-factor(ifelse(data[,"target"]=="0","bad","good"),levels=c("bad","good"))
tmp<-data[,-21]
data<-cbind(tmp,target)
#attach(data)
#View(data)
#Simple Splitting Based on the Outcome
#Details can be seen:http://topepo.github.io/caret/splitting.html
set.seed(4543)  #fix the random seed so the partition is reproducible
splitIndex<-createDataPartition(
  data$target,  ## the outcome data are needed
  p=0.75,       ## the percentage of data in the training set
  list=FALSE    ## the format of the results
)
## The output is a set of integers for the rows of data
## that belong in the training set.
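## createDataPartition samples within each level of the outcome, so the class
## proportions of target should be roughly preserved in both training and testing.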
training<-data[splitIndex,]
testing<-data[-splitIndex,]
nrow(training);nrow(testing)
prop.table(table(training$target))
prop.table(table(testing$target))
str(training);str(testing)
#build the model
rf<-randomForest(target ~ ., data = training, importance = TRUE)
print(rf)
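#Optional sketch (not part of the original script): tune mtry against the OOB error
#with randomForest::tuneRF(). The ntreeTry, stepFactor and improve values below are
#illustrative assumptions, not settings taken from the original analysis.
set.seed(4543)
tuneRF(training[, setdiff(names(training), "target")], training$target,
       ntreeTry = 500, stepFactor = 1.5, improve = 0.01, trace = TRUE)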
#variable importance
varImpPlot(rf,main="Random forest variable importance")
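#Added: the numeric importance scores behind the plot (MeanDecreaseAccuracy and
#MeanDecreaseGini are reported because the model was fit with importance = TRUE)
importance(rf)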
#prediction
rf.pr<-predict(rf, testing)
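#Added note: for a classification forest, predict() returns factor class labels by
#default; class probabilities (used for the ROC curve further below) come from type="prob".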
#recode predicted classes as 1 ("good") / 0 ("bad")
predType<-ifelse(rf.pr=="good",1,0)
#recode the true classes of the test set the same way
trueType<-ifelse(testing$target=="good",1,0)
# Model evaluation based on the confusion matrix
#trueType: true class labels coded as 0/1
#predType: predicted class labels coded as 0/1
#returns a list of evaluation measures
ppe<-function(trueType,predType)
{
  #build confusion matrix (rows = truth, columns = prediction)
  confusionMatrix<-as.matrix(table(trueType,predType))
  #get TP, FN, FP, TN
  TP<-confusionMatrix[2,2] #true positive
  FN<-confusionMatrix[2,1] #false negative
  FP<-confusionMatrix[1,2] #false positive
  TN<-confusionMatrix[1,1] #true negative
  #1.Precision (positive predictive value)
  e.A<-TP/(TP+FP)
  #2.Negative predictive value
  e.NA<-TN/(TN+FN)
  #3.Overall accuracy
  e.TA<-(TP+TN)/(TP+FN+FP+TN)
  #4.False discovery rate (1 - precision)
  e.ER<-FP/(TP+FP)
  #5.False omission rate
  e.NER<-FN/(FN+TN)
  #6.Overall error rate
  e.TER<-1-e.TA
  #7.Recall (sensitivity, true positive rate)
  e.CR<-TP/(TP+FN)
  #8.Specificity (true negative rate)
  e.NCR<-TN/(FP+TN)
  #9.False positive rate
  e.FPR<-FP/(FP+TN)
  #10.False negative rate
  e.FNR<-FN/(TP+FN)
  #11.F1 score (harmonic mean of precision and recall)
  e.F<-2*e.A*e.CR/(e.A+e.CR)
  #12.Lift (precision relative to the positive base rate)
  e.LV<-e.A/((TP+FN)/(TP+FN+FP+TN))
  #13.Matthews (phi) correlation coefficient
  e.phi<-(TP*TN-FP*FN)/sqrt((TP+FN)*(TN+FP)*(TP+FP)*(TN+FN))
  #14.Cohen's kappa
  pe<-((TP+FN)*(TP+FP)+(FP+TN)*(FN+TN))/(TP+FN+FP+TN)^2
  e.Kappa<-(e.TA-pe)/(1-pe)
  return(list(e.A=e.A,e.NA=e.NA,e.TA=e.TA,e.ER=e.ER,e.NER=e.NER,e.TER=e.TER,
              e.CR=e.CR,e.NCR=e.NCR,e.FPR=e.FPR,e.FNR=e.FNR,e.F=e.F,e.LV=e.LV,
              e.phi=e.phi,e.Kappa=e.Kappa))
}
ppe(trueType, predType)
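#Optional cross-check (not in the original script): caret's confusionMatrix() reports
#accuracy, sensitivity, specificity and Kappa directly; treating "good" as the positive
#class here is an assumption consistent with the 1 = "good" coding used above.
confusionMatrix(rf.pr, testing$target, positive = "good")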
#plot the ROC curve
#install.packages("ROCR")
library(ROCR)
#score the test set with class probabilities: an ROC curve needs scores, not hard labels,
#and ROCR's prediction() expects the predictions first and the true labels second
rf.prob<-predict(rf, testing, type="prob")[,"good"]
pred<-prediction(rf.prob, trueType)
perf <- performance(pred,"tpr","fpr")
plot(perf)
#calculate auc
auc.obj <- performance(pred,"auc")
auc <- auc.obj@y.values[[1]]
auc
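#Optional sketch (not in the original): overlay the chance-level diagonal on the ROC
#plot and report the AUC in the title; rounding to 3 digits is an arbitrary choice.
abline(a = 0, b = 1, lty = 2, col = "grey")
title(main = paste0("ROC curve (AUC = ", round(auc, 3), ")"))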