library(data.table)
library(Matrix)
library(caret)
library(xgboost)
dt <- fread("case4_train_numeric.csv",header= TRUE)
set.seed(123)
# Work on a random subsample of 200,000 rows to keep memory manageable
dt <- dt[sample(nrow(dt), 200000), ]
Y <- dt$Response
dt[, Response := NULL]
# Shift all features by +2 so that real values stay distinguishable from
# the missing ones, which are then encoded as 0
for(col in names(dt)) set(dt, j = col, value = dt[[col]] + 2)
for(col in names(dt)) set(dt, which(is.na(dt[[col]])), col, 0)
# A sparse Matrix reduces the storage needed
dt[1:5, 1:5]
X <- Matrix(as.matrix(dt), sparse = TRUE)
rm(dt)
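# Optional check: report how much memory the sparse matrix actually uses
# (object.size() is base R; units = "Mb" keeps the output readable)
print(object.size(X), units = "Mb")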
# 6-fold split stratified on the response; Fold1 serves as validation
folds <- createFolds(as.factor(Y), k = 6)
valid <- folds$Fold1
model <- seq_along(Y)[-valid]
# XGBoost parameters: learning rate 0.01; base_score lowered from its
# default of 0.5 to 0.005 because failures are rare in this dataset
param <- list(objective = "binary:logistic",
              eval_metric = "auc",
              eta = 0.01,
              base_score = 0.005,
              colsample_bytree = 0.5)
# Build xgb.DMatrix objects for the training and validation folds
dmodel <- xgb.DMatrix(X[model,], label = Y[model])
dvalid <- xgb.DMatrix(X[valid,], label = Y[valid])
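# Optional sketch: cross-validate the screening fit to sanity-check the
# number of rounds (xgb.cv ships with xgboost; nfold = 5 is an arbitrary choice)
#cv <- xgb.cv(params = param, data = dmodel, nrounds = 20, nfold = 5)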
m1 <- xgb.train(data = dmodel, param, nrounds = 20,
watchlist = list(mod = dmodel, val = dvalid), verbose = 1)
imp <- xgb.importance(model = m1, feature_names = colnames(X))
cols <- imp$Feature
imp[1:10]
length(cols)
head(cols, 10)
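# Optional: visualize the strongest features; xgb.plot.importance() ships
# with the xgboost package and accepts the importance table computed above
xgb.plot.importance(imp[1:10])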
# Remove all objects except cols
rm(list = setdiff(ls(), "cols"))
### Apply XGBoost on the selected features
# Read only the detected important columns (plus Response) from the full dataset
dt <- fread("case4_train_numeric.csv",
select = c(cols, "Response"),
showProgress = T)
Y <- dt$Response
dt[, Response := NULL]
# Same +2 shift / NA-to-0 encoding as above
for(col in names(dt)) set(dt, j = col, value = dt[[col]] + 2)
for(col in names(dt)) set(dt, which(is.na(dt[[col]])), col, 0)
X <- Matrix(as.matrix(dt), sparse = TRUE)
rm(dt)
# Apply XGBoost
set.seed(7579)
folds <- createFolds(as.factor(Y), k = 6)
valid <- folds$Fold3
model <- seq_along(Y)[-valid]
param <- list(objective = "binary:logistic",
eval_metric = "auc",
eta = 0.01,
max_depth = 2,
colsample_bytree = 0.5,
base_score = 0.005)
dmodel <- xgb.DMatrix(X[model,], label = Y[model])
dvalid <- xgb.DMatrix(X[valid,], label = Y[valid])
m1 <- xgb.train(data = dmodel, param, nrounds = 30,
watchlist = list(mod = dmodel, val = dvalid))
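# The per-round training/validation AUC is stored on the model because a
# watchlist was supplied; useful to check for overfitting
tail(m1$evaluation_log)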
pred <- predict(m1, dvalid)
summary(pred)
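# With failures this rare, most predicted probabilities should sit near
# zero; a histogram makes the skew visible
hist(pred, breaks = 100, main = "Validation predictions", xlab = "Predicted probability")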
imp <- xgb.importance(model = m1, feature_names = colnames(X))
head(imp, 30)
## Select the decision threshold that maximizes Matthews correlation coefficient (MCC)
# Matthews correlation coefficient for binary 0/1 vectors
mc <- function(actual, predicted) {
  tp <- as.numeric(sum(actual == 1 & predicted == 1))
  tn <- as.numeric(sum(actual == 0 & predicted == 0))
  fp <- as.numeric(sum(actual == 0 & predicted == 1))
  fn <- as.numeric(sum(actual == 1 & predicted == 0))
  numer <- (tp * tn) - (fp * fn)
  denom <- ((tp + fp) * (tp + fn) * (tn + fp) * (tn + fn)) ^ 0.5
  numer / denom  # NaN when any confusion-matrix margin is zero
}
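# Quick sanity check on toy vectors: perfect agreement gives MCC = 1,
# total disagreement gives -1
mc(c(1, 0, 1, 0), c(1, 0, 1, 0))  # 1
mc(c(1, 0, 1, 0), c(0, 1, 0, 1))  # -1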
matt <- data.table(thresh = seq(0.0, 0.998, by = 0.001))
matt$scores <- sapply(matt$thresh,
                      function(x) mc(Y[valid], as.integer(pred > x)))
opt <- matt[which.max(matt$scores), ]
print(opt)
pred_bin <- ifelse((pred > opt$thresh), 1, 0)
table(Y[valid], pred_bin)
mc(Y[valid], pred_bin)
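# Optional: plot MCC across the candidate thresholds to see how peaked
# the optimum is (base R graphics)
plot(matt$thresh, matt$scores, type = "l", xlab = "Threshold", ylab = "MCC")
abline(v = opt$thresh, lty = 2)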
### Apply a random forest for comparison
# Rebuild the data from the selected columns; NAs replaced with 0 (randomForest cannot handle them)
data <- fread("case4_train_numeric.csv", select = c(cols, "Response"))
for(col in names(data)) set(data, which(is.na(data[[col]])), col, 0)
sum(data$Response)
trainSize <- round(nrow(data) * 0.7)
# Random selection of training and test rows
set.seed(123)
training_indices <- sample(seq_len(nrow(data)), size = trainSize)
trainData <- data[training_indices, ]
testData <- data[-training_indices, ]
sum(testData$Response)
# Keep the test response separately and drop it from the test features
Response_test <- testData$Response
testData$Response <- NULL
# randomForest needs a factor response for classification
trainData$Response <- as.factor(trainData$Response)
library(randomForest)
model <- randomForest(Response ~ ., data = trainData, ntree = 400,
                      importance = TRUE, do.trace = TRUE)
#model <- rpart(formula = Response~., data = trainData, method = "class")
varImp(model)
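# The out-of-bag confusion matrix on the randomForest object gives an error
# estimate without touching the test split
model$confusion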
# Make predictions and calculate MCC
predictions <- predict(model, newdata = testData)
# Factor levels "0"/"1" convert to integers 1/2, so go via character
predictions <- as.integer(as.character(predictions))
table(predictions, Response_test)
mc(Response_test, predictions)