#Predicts the minimal ranking to consider a question
#as bug covering.
#Uses bayesGLM, GLM, KNN, and other classifiers from the caret package.
#Which model provides the best predictions?
#Naive Bayes, KNN, Random Forest, SVM, or GLM?
#libraries
library(caret)
####################
#Import data
source("C://Users//chris//OneDrive//Documentos//GitHub//ML_VotingAggregation//aggregateAnswerOptionsPerQuestion.R");
summaryTable <- runMain();
#summaryTable <- data.frame(summaryTable);
#I need to guarantee that some examples (i.e., failing methods)
#do not dominate the training or testing sets. To do that, I need
#close-to-equal proportions of examples in both sets.
#Scramble the dataset before extracting the training set.
set.seed(8850);
g <- runif(nrow(summaryTable)); #uniform random numbers used to scramble the rows
summaryTable <- summaryTable[order(g),];
##################################################
# Create trainControl to be reused by all models #
#convert columns to numeric
summaryTable<- data.frame(summaryTable, stringsAsFactors = FALSE)
summaryTable[,"rankingVote"] <- as.numeric(unlist(summaryTable[,"rankingVote"]));
summaryTable$bugCoveringLabels <- as.character(summaryTable$bugCovering);
summaryTable$bugCoveringLabels<- replace(summaryTable$bugCoveringLabels,summaryTable$bugCoveringLabels=="FALSE", "F");
summaryTable$bugCoveringLabels<- replace(summaryTable$bugCoveringLabels,summaryTable$bugCoveringLabels=="TRUE", "T");
summaryTable$bugCoveringLabels<- as.factor(summaryTable$bugCoveringLabels);
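#Quick sanity check (sketch): the comment above asks for close-to-equal class
#proportions, which can be inspected from the freshly created labels.
print(prop.table(table(summaryTable$bugCoveringLabels)));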
# Create custom indices: myFolds
#Guarantees that we are going to use the exact
#same datasets for all models
myFolds <- createFolds(summaryTable[,"rankingVote"], k = 10, returnTrain = TRUE); #returnTrain = TRUE so each element holds the training rows (9 folds)
# Create reusable trainControl object: myControl
kFoldControl <- trainControl(
  index = myFolds, #train with 9 folds and validate with one
  classProbs = TRUE, #needed for ROC-based summaries
  verboseIter = TRUE, #print progress for each resample
  savePredictions = TRUE, #keep the hold-out predictions
  summaryFunction = twoClassSummary
);
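#Optional check (sketch): the class proportion inside each resample's training rows
#should stay close to the overall proportion printed above, since myFolds fixes the
#same partitions for every model.
lapply(myFolds, function(trainIdx) prop.table(table(summaryTable$bugCoveringLabels[trainIdx])));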
#######################
# Generate each model #
##############
# Naive Bayes
nb<- train(bugCoveringLabels ~ rankingVote,summaryTable, method="nb", trControl=kFoldControl);
#nb
# usekernel ROC Sens Spec
# FALSE 0.7546970 0.9031409 0.5664596
# TRUE 0.7534538 0.9270484 0.5095109
######
# KNN
knn <- train(bugCoveringLabels ~ rankingVote,summaryTable, method="knn", trControl=kFoldControl);
#knn
# k ROC Sens Spec
# 5 0.8290137 0.9778947 0.1340909
################
# Random Forest
rf<- train(bugCoveringLabels ~ rankingVote,summaryTable, method="rf", trControl=kFoldControl);
#rf
# ROC Sens Spec
# 0.8124545 0.8762876 0.5132246
######
# GLM
glm<- train(bugCoveringLabels ~ rankingVote,summaryTable, method="glm", trControl=kFoldControl);
#glmnet<- train(bugCoveringLabels ~ rankingVote,summaryTable, method="glmnet", trControl=kFoldControl);
#glmBoost<- train(bugCoveringLabels ~ rankingVote,summaryTable, method="glmBoost", trControl=kFoldControl);
bayesglm<- train(bugCoveringLabels ~ rankingVote,summaryTable, method="bayesglm", trControl=kFoldControl);
#glm
#ROC Sens Spec
#0.8747113 0.9237826 0.4507378
#glmnet
#ROC Sens Spec
#0.8747113 0.9237826 0.4507378
#glmBoost
#mstop ROC Sens Spec
#150 0.8898239 0.9259331 0.4174045
# bayesglm
# ROC Sens Spec
# 0.8898239 0.9322932 0.4371014
#The glmnet model is a more sophisticated solution that uses penalty terms to reduce the magnitude
#of the two GLM coefficients. The goal of glmnet is to explain as much variance as possible
#while keeping the coefficients small. The trade-off is that glmnet accepts more bias
#in exchange for lower variance (less risk of overfitting).
#In any case, both glmnet and glm produce the exact same results for my data, therefore I favored
#the simplest model.
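#If glmnet were re-enabled above, the shrinkage could be inspected directly
#(sketch; assumes the commented-out glmnet train() call has been run):
#coef(glm$finalModel);                                #unpenalized coefficients
#coef(glmnet$finalModel, s = glmnet$bestTune$lambda); #penalized coefficients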
######
# SVM
svmLinear <- train(bugCoveringLabels ~ rankingVote,summaryTable, method="svmLinear", trControl=kFoldControl);
svmLinear2 <- train(bugCoveringLabels ~ rankingVote,summaryTable, method="svmLinear2", trControl=kFoldControl);
svmLinearWeights <- train(bugCoveringLabels ~ rankingVote,summaryTable, method="svmLinearWeights", trControl=kFoldControl);
#svmLinear
# ROC Sens Spec
#0.6798618 0.9643897 0.2357955
#svmLinear2
#cost ROC Sens Spec
#0.50 0.7713566 0.9757671 0.1613636
#svmLinearWeights
# cost weight ROC Sens Spec
# 0.50 3 0.8102679 0.8439114 0.5645059
#Next steps
#Why does bayesGLM seem better?
#Discover the minimal ranking value that would have predicted the same bug-covering questions
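#The question above ("Why does bayesGLM seem better?") can be examined by pooling
#the resampling results of the fitted models (sketch; assumes all train() calls
#above completed without error):
modelComparison <- resamples(list(nb = nb, knn = knn, rf = rf, glm = glm,
                                  bayesglm = bayesglm, svmLinear = svmLinear,
                                  svmLinear2 = svmLinear2, svmLinearWeights = svmLinearWeights));
summary(modelComparison); #ROC, Sens, and Spec for all models on the same folds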
# ```{r bayesglm.metric, echo=FALSE}
predictionList <- predict(nb);
predictedBugCoveringTable<-summaryTable[predictionList=="T",];
predictedList <- as.numeric(unlist(predictedBugCoveringTable[,9]));
predictedList
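#Sketch for the "minimal ranking value" question above: for a classifier that is
#monotone in rankingVote, the smallest ranking among questions predicted as bug
#covering is the threshold that would reproduce these predictions (this assumes
#column 9 of summaryTable is rankingVote, as extracted into predictedList above).
minimalRanking <- min(predictedList);
minimalRanking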
#####All predictions are the same. This makes all methods equivalent.
#
# > table(predict(nb))
#
# F T
# 108 21
# > predict(nb)
# [1] F F T F F T F F F F F F F F F F F F F F F F F F F F F F F T F F F F F F T F F F T F F F T T F F F F F F
# [53] F F T F F F F F F T F F F F F F T F F F F F F T F T F F F F F F F F F T F F F F T F F F T F F F F T F F
# [105] F F T F F F F F F F F F T F T T F F F F F F F T F
# Levels: F T
# > predict(bayesglm)
# [1] F F T F F T F F F F F F F F F F F F F F F F F F F F F F F T F F F F F F T F F F T F F F T T F F F F F F
# [53] F F T F F F F F F T F F F F F F T F F F F F F T F T F F F F F F F F F T F F F F T F F F T F F F F T F F
# [105] F F T F F F F F F F F F T F T T F F F F F F F T F
# Levels: F T
#```
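#The equivalence claimed above can be verified directly (sketch):
identical(predict(nb), predict(bayesglm));
identical(predict(nb), predict(glm));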
#Get the measures
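#One way to get the measures (sketch): a confusion matrix of the training-set
#predictions, which is optimistic compared to the hold-out folds kept by
#savePredictions = TRUE.
confusionMatrix(predict(bayesglm), summaryTable$bugCoveringLabels, positive = "T");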