predict test set with bst
zhugejunwei committed Jan 27, 2016
1 parent 6c86961 commit e584f6b
Showing 2 changed files with 27 additions and 23 deletions.
13 changes: 0 additions & 13 deletions R project/Airbnb.Rproj

This file was deleted.

37 changes: 27 additions & 10 deletions R project/Advanced.R → R project/Boosted.R
@@ -7,8 +7,9 @@ library(pROC)
library(Ckmeans.1d.dp)

# set wd to be kaggle folder
-train <- read.csv('/Users/zhugejunwei/Documents/Semester 2/DM/Final Project/data/train_users_2.csv', stringsAsFactors = FALSE)
-test <- read.csv('/Users/zhugejunwei/Documents/Semester 2/DM/Final Project/data/test_users.csv', stringsAsFactors = FALSE)
+setwd('/Users/zhugejunwei/Documents/Semester 2/DM/Airbnb Project/R project')
+train <- read.csv('/Users/zhugejunwei/Documents/Semester 2/DM/Airbnb Project/data/train_users_2.csv', stringsAsFactors = FALSE)
+test <- read.csv('/Users/zhugejunwei/Documents/Semester 2/DM/Airbnb Project/data/test_users.csv', stringsAsFactors = FALSE)
# clean the training data
labels <- train$country_destination
labelInd <- which(colnames(train) == "country_destination")
@@ -55,17 +56,17 @@ dummies <- dummyVars(~ gender + signup_method + signup_flow + language +
featuresInDummies <- as.data.frame(predict(dummies, newdata = dataAll))
dataAll <- cbind(dataAll[,-c(which(colnames(dataAll) %in% featuresToDummies))], featuresInDummies)
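For readers unfamiliar with caret's dummyVars(): it builds a one-hot encoder for the listed categorical columns, and predict() on that encoder returns the 0/1 indicator columns that cbind() appends above. A tiny standalone illustration (the toy data frame is made up, not from the commit):

library(caret)
toy <- data.frame(gender = factor(c('MALE', 'FEMALE', '-unknown-')))
enc <- dummyVars(~ gender, data = toy)
predict(enc, newdata = toy)  # one 0/1 column per gender level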

-dataTrain <- dataAll[which(dataAll$id %in% train$id), ] # get the train set
+dataTrain <- dataAll[which(dataAll$id %in% train$id), ] # separate the train set
dataTest <- dataAll[-which(dataAll$id %in% train$id), ] # separate the test set
set.seed(1)
modelTrainInd <- runif(dim(dataTrain)[1]) # generates random deviates
-modelTrain <- dataAll[which(modelTrainInd < 0.7), ]
-modelTest <- dataAll[which(modelTrainInd >= 0.7), ]
+modelTrain <- dataTrain[which(modelTrainInd < 0.7), ] # training set
+modelvali <- dataTrain[which(modelTrainInd >= 0.7), ] # validation set

lableTrain <- recode(labels, "'NDF'=0; 'US'=1; 'other'=2; 'FR'=3; 'CA'=4;
'GB'=5; 'ES'=6; 'IT'=7; 'PT'=8; 'NL'=9; 'DE'=10;
-'AU'=11")
-modelTrainLabel <- lableTrain[which(modelTrainInd < 0.7)]
+'AU'=11") # train set label
+modelTrainLabel <- lableTrain[which(modelTrainInd < 0.7)] # Training set label

# train xgboost (multi class)
xgb<- xgboost(data = data.matrix(modelTrain[,-c(1,2)]),
@@ -78,8 +79,24 @@ xgb<- xgboost(data = data.matrix(modelTrain[,-c(1,2)]),
nthread = 3
)
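The eval_metric = ndcg5 passed to the second booster below refers to a custom metric that is presumably defined earlier in the script, outside the lines shown in this diff. A minimal sketch of what such a function could look like, assuming the usual xgboost R feval signature and the 12-class multi:softprob output (an illustration, not the commit's actual definition):

ndcg5 <- function(preds, dtrain) {
  labels <- getinfo(dtrain, "label")            # true classes coded 0-11
  num_class <- 12
  pred_mat <- matrix(preds, nrow = num_class)   # one column of probabilities per data row
  top5 <- apply(pred_mat, 2, function(p) order(p, decreasing = TRUE)[1:5] - 1)
  ranks <- sapply(seq_along(labels), function(i) match(labels[i], top5[, i]))
  gains <- ifelse(is.na(ranks), 0, 1 / log2(ranks + 1))  # binary relevance, so IDCG = 1
  list(metric = "ndcg5", value = mean(gains))
}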

-# predict values in test set
-predXgb <- predict(xgb, data.matrix(modelTest[,-c(1,2)]))
+# predict values in validation set
+predXgb <- predict(xgb, data.matrix(modelvali[,-c(1,2)]))
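predXgb comes back as a flat vector of 12 class probabilities per validation row. A quick, assumed sketch (not part of the commit) for turning it into hard predictions and checking accuracy against the held-out labels, reusing the lableTrain and modelTrainInd objects created above:

predMat <- matrix(predXgb, nrow = 12)                       # columns = validation rows
predClass <- apply(predMat, 2, which.max) - 1               # back to the 0-11 coding
modelValiLabel <- lableTrain[which(modelTrainInd >= 0.7)]   # labels for the validation split
mean(predClass == modelValiLabel)                           # plain accuracy as a sanity check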

importance_matrix <- xgb.importance(colnames(modelTrain[,-c(1,2)]), model = xgb)
xgb.plot.importance(importance_matrix[1:10,])
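The hard-coded column indices used for xgb2 below (3, 15, 18, 4, 5, 6, 10, 46, 71, 82) were presumably read off this importance plot. If one wanted to derive them programmatically instead, a sketch along these lines should work (column order does not matter for subsetting):

topFeatures <- importance_matrix$Feature[1:10]              # xgb.importance() sorts by Gain
topCols <- which(colnames(dataTrain) %in% topFeatures)      # positions of those features in dataTrain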

+# use the top 10 important variables to train the bst model
+xgb2 <- xgboost(data = data.matrix(dataTrain[,c(3,15,18,4,5,6,10,46,71,82)]),
+label = lableTrain,
+max_depth = 8,
+nround=20,
+eval_metric = ndcg5,
+objective = "multi:softprob",
+num_class = 12,
+nthread = 3
+)
+# predict values in test set
+predXgb2 <- predict(xgb2, data.matrix(dataTest[,c(3,15,18,4,5,6,10,46,71,82)]))

+# write into submit file - "my_solution"
+my_solution <- data.frame(PassengerId = test$PassengerId, Survived = my_prediction)
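The my_solution line above still uses the PassengerId/Survived columns of a Titanic template and references an undefined my_prediction; the Airbnb competition instead expects up to five id/country rows per test user, ordered by confidence. A minimal sketch of building that from predXgb2 (the country ordering mirrors the recode() mapping above; the output file name is an assumption):

countryLevels <- c('NDF', 'US', 'other', 'FR', 'CA', 'GB', 'ES', 'IT', 'PT', 'NL', 'DE', 'AU')
predMat2 <- matrix(predXgb2, nrow = 12)                     # columns = test users
top5 <- apply(predMat2, 2, function(p) order(p, decreasing = TRUE)[1:5])
my_solution <- data.frame(id = rep(dataTest$id, each = 5),  # five guesses per test user
                          country = countryLevels[as.vector(top5)])
write.csv(my_solution, 'submission.csv', quote = FALSE, row.names = FALSE)  # assumed file name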
