-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathmusic_rating.R
515 lines (457 loc) · 18.4 KB
/
music_rating.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
require(MASS)
require(class)
require(boot)
require(leaps)
require(glmnet)
require(splines)
require(gam)
require(tree)
require(randomForest)
require(gbm)
require(e1071)
require(ROCR)
# Read data file
setwd("F:/sem4/ml/project")
mydat1=read.csv('training_data.csv',header = T)
mydat=log10(mydat1+1)
mytest1=read.csv('testing_data.csv',header = T)
mytest=log10(mytest1+1)
attach(mydat)
names(mydat)
summary(mydat)
#Simple linear regression
plot(song_hotttnesss~artist_familiarity,mydat,main = "Artist familiarity vs Song hotness")
fit0=lm(song_hotttnesss~artist_familiarity,data=mydat)
fit0
summary(fit0)
abline(fit0,col='red')
confint(fit0)
predict(fit0,data.frame(artist_familiarity=c(0.6,0.8)),interval="confidence")
#Multiple linear regression
fit1=lm(song_hotttnesss~.,data=mydat)
fit1
summary(fit1)
confint(fit1)
par(mfrow=c(2,2))
plot(fit1,main = "Linear regression")
fit2=update(fit1,~.-tempo-key-time_signature-mode-end_of_fade_in)
fit2
summary(fit2)
confint(fit2)
plot(fit2,main = "Linear regression with smaller model")
#Non linear terms and interactions
fit3=lm(song_hotttnesss~poly(artist_familiarity,2))
summary(fit3)
par(mfrow=c(1,1))
plot(song_hotttnesss~artist_familiarity,main = "Artist familiarity vs Song hotness")
points(artist_familiarity,fitted(fit3),col="red")
fit4=lm(song_hotttnesss~poly(artist_familiarity,3))
summary(fit4)
points(artist_familiarity,fitted(fit4),col="blue")
#Simple linear regression using functions
regplot=function(x,y){
fit=lm(y~x)
plot(x,y,main = "Artist familiarity vs Song hotness using functions")
abline(fit,col="red")
}
regplot(artist_familiarity,song_hotttnesss)
#Logistic regression
glm.fit=glm(song_hotttnesss~.,data = mydat,family = binomial)
summary(glm.fit)
confint(glm.fit)
par(mfrow=c(2,2))
plot(glm.fit,main = "Logistic regression")
#Finding prediction accuracy using probability
glm.probs=predict(glm.fit,type = "response")
glm.probs[1:5]
threshold=0.25
glm.pred=ifelse(glm.probs>threshold,"Hit","Fail")
song_hotttnesss_check=ifelse(song_hotttnesss>threshold,"Hit","Fail")
tab=table(glm.pred,song_hotttnesss_check)
tab
accuracy=sum(diag(tab))/sum(tab)
error=1-accuracy
accuracy
error
#Testing accuracy
glm.probs_test=predict(glm.fit,newdata = mytest,type="response")
glm.probs_test[1:5]
threshold_test=0.22
glm.pred_test=ifelse(glm.probs_test>threshold_test,"Hit","Fail")
song_hotttnesss_check_test=ifelse(mytest$song_hotttnesss>threshold_test,"Hit","Fail")
tab_test=table(glm.pred_test,song_hotttnesss_check_test)
tab_test
accuracy_test=sum(diag(tab_test))/sum(tab_test)
error_test=1-accuracy_test
accuracy_test
error_test
#Fitting smaller model with impactful parameters for better accuracy
glm.fit1=glm(song_hotttnesss~.-tempo-key-time_signature-mode-end_of_fade_in
-duration-start_of_fade_out,data = mydat,family = binomial)
summary(glm.fit1)
confint(glm.fit1)
par(mfrow=c(2,2))
plot(glm.fit1)
glm.probs1=predict(glm.fit1,type = "response")
glm.probs1[1:5]
glm.pred1=ifelse(glm.probs1>threshold,"Hit","Fail")
tab1=table(glm.pred1,song_hotttnesss_check)
tab1
accuracy1=sum(diag(tab1))/sum(tab1)
error1=1-accuracy1
accuracy1
error1
#Testing accuracy for the same
glm.probs1_test=predict(glm.fit1,newdata=mytest,type = "response")
glm.probs1_test[1:5]
glm.pred1_test=ifelse(glm.probs1_test>threshold_test,"Hit","Fail")
tab1_test=table(glm.pred1_test,song_hotttnesss_check_test)
tab1_test
accuracy1_test=sum(diag(tab1_test))/sum(tab1_test)
error1_test=1-accuracy1_test
accuracy1_test
error1_test
#Thus we find that there is no change in the accuracy as we reached the saturation in both training
#and testing data
#Linear discriminant analysis(LDA)
lda.fit=lda(song_hotttnesss_check~artist_familiarity+artist_hotttnesss+loudness,data=mydat)
lda.fit
plot(lda.fit)
lda.pred=predict(lda.fit,mydat)
data.frame(lda.pred)[1:5,]
tab2=table(lda.pred$class,song_hotttnesss_check)
tab2
accuracy2=sum(diag(tab2))/sum(tab2)
error2=1-accuracy2
accuracy2
error2
#Testing accuracy
lda.pred_test=predict(lda.fit,mytest)
data.frame(lda.pred_test)[1:5,]
tab2_test=table(lda.pred_test$class,song_hotttnesss_check_test)
tab2_test
accuracy2_test=sum(diag(tab2_test))/sum(tab2_test)
error2_test=1-accuracy2_test
accuracy2_test
error2_test
#Here we get a low accuracy compared to logistic regression model
#But at the same time we notice that the accuracy of the test data is slightly higher than the
#training data. This was opposite in logistic regression where the training had higher accuracy
#than the test
#Quadratic discriminant analysis(QDA)
qda.fit=qda(song_hotttnesss_check~artist_familiarity+artist_hotttnesss+loudness,data = mydat)
qda.fit
qda.pred=predict(qda.fit,mydat)
data.frame(qda.pred)[1:5,]
tab3=table(qda.pred$class,song_hotttnesss_check)
tab3
accuracy3=sum(diag(tab3))/sum(tab3)
error3=1-accuracy3
accuracy3
error3
#Analysing with the test data
qda.pred_test=predict(qda.fit,mytest)
data.frame(qda.pred_test)[1:5,]
tab3_test=table(qda.pred_test$class,song_hotttnesss_check_test)
tab3_test
accuracy3_test=sum(diag(tab3_test))/sum(tab3_test)
error3_test=1-accuracy3_test
accuracy3_test
error3_test
#Thus QDA gives the highest accuracy with the testing data compared with Logistic regression and LDA
#K-Nearest Neighbors
Xlag=cbind(artist_familiarity,artist_hotttnesss,loudness)
Xlag_test=cbind(mytest$artist_familiarity,mytest$artist_hotttnesss,mytest$loudness)
Xlag_test[1:5,]
set.seed(1)
pts=c()
par(mfrow=c(1,1))
for(i in 1:20){
knn.prob=knn(Xlag,Xlag_test,song_hotttnesss,k=i)
knn.prob=as.numeric(as.character(knn.prob))
knn.pred=ifelse(knn.prob>threshold_test,"Hit","Fail")
tab4_test=table(knn.pred,song_hotttnesss_check_test)
tab4_test
accuracy4_test=sum(diag(tab4_test))/sum(tab4_test)
error4_test=1-accuracy4_test
pts=append(pts,accuracy4_test)
error4_test
}
plot(c(1:20),pts,main = "KNN",xlab = "k-values",ylab = "accuracies",col="red")
points(which.max(pts),max(pts),pch=20,col="red")
#This gives better results than logistic regression but LDA and QDA
#still performs better than KNN
#For storing all training accuracies
all_accuracy_train=c(accuracy,accuracy1,accuracy2,accuracy3,0.86)
#For storing all testing accuracies
all_accuracy_test=c(accuracy_test,accuracy1_test,accuracy2_test,accuracy3_test,max(pts))
types=c(1,2,3,4,5)
plot(types,all_accuracy_train,type = 'b',col='red',main = "Training vs Testing accuracies",xlab = "Various models",ylab = "Accuracies")
lines(factor(types),all_accuracy_test,type = 'b',col='blue')
legend("topright", legend=c("Training", "Testing"),col=c("red","blue"), lty=1:1)
legend(1,0.96, legend = c("1-Logistic","2-Logistic with smaller model",
"3-LDA","4-QDA","5-KNN"))
points(which.max(all_accuracy_train),max(all_accuracy_train),pch=20,col="red")
points(which.max(all_accuracy_test),max(all_accuracy_test),pch=20,col="blue")
#From the graph by looking at the accuracies we see that all the models
#are consistent apart from knn
#A glimpse of all our accuracies till now
#Logistic regression:
#Train:0.9671554
#Test:0.9061288
#For smaller model
#Train:0.9671554
#Test:0.9061288
#LDA
#Train:0.9671554
#Test:0.9022498
#QDA
#Train:0.9627566
#Test:0.9045722
#KNN for k=18(Max accuracy)
#Test:0.8758728
#This CV is used to find which degree of polynomial helps in best fitting the model
#Leave one out validation(LOOCV) using function
glm.fit_cv=glm(song_hotttnesss~artist_familiarity,data = mydat)
cv.glm(mydat,glm.fit_cv)$delta
#Using formula
loocv=function(fit){
h=lm.influence(fit)$h
mean((residuals(fit)/(1-h))^2)
}
loocv(glm.fit_cv)
cv.error=rep(0,5)
degree=1:5
for(d in degree){
glm.fit_cv=glm(song_hotttnesss~poly(artist_familiarity,d),data = mydat)
cv.error[d]=loocv(glm.fit_cv)
}
par(mfrow=c(1,1))
plot(degree,cv.error,type='b',main = "Leave one out cross validation")
#10-fold CV
cv.error10=rep(0,5)
for(d in degree){
glm.fit_cv10=glm(song_hotttnesss~poly(artist_familiarity,d),data = mydat)
cv.error10[d]=cv.glm(mydat,glm.fit_cv10,K=10)$delta[1]
}
plot(degree,cv.error,type='b',main = "Leave one out cross validation vs 10-fold CV")
lines(degree,cv.error10,type = 'b',col='red')
legend("topright", legend=c("LOOCV", "10-fold CV"),col=c("black", "red"), lty=1:1)
#Thus we can clearly see that both LOOCV and 10-fold CV performs almost exactly same way
#But it is preffered that we use 10-fold here because of its high computational speed compared to LOOCV
#Best subset regression
regfit.full=regsubsets(song_hotttnesss~.,data = mydat)
summary(regfit.full)
#Here by default only 8 variable subset is printed
#But we have 10 variables so lets do for that
regfit.full_var=regsubsets(song_hotttnesss~.,data = mydat, nvmax = 10)
reg.summary=summary(regfit.full_var)
reg.summary
names(reg.summary)
coef(regfit.full_var,3)
#Forward stepwise selection
regfit.fwd=regsubsets(song_hotttnesss~.,data = mydat,nvmax = 10,method = "forward")
reg.summary_fwd=summary(regfit.fwd)
reg.summary_fwd
coef(regfit.fwd,3)
#Backward stepwise selection
regfit.bwd=regsubsets(song_hotttnesss~.,data = mydat,nvmax = 10,method = "backward")
reg.summary_bwd=summary(regfit.bwd)
reg.summary_bwd
coef(regfit.bwd,3)
par(mfrow=c(1,2))
#Plot to see bic statistic for both the models
plot(reg.summary$bic,main = "Best Subset",xlab = "Number of variables", ylab = "bic")
which.min(reg.summary$bic)
points(3,reg.summary$bic[3],pch=20,col="red")
plot(reg.summary_fwd$bic,main = "Forward Stepwise",xlab = "Number of variables", ylab = "bic")
which.min(reg.summary_fwd$bic)
points(3,reg.summary_fwd$bic[3],pch=20,col="red")
plot(reg.summary_bwd$bic,main = "Backward Stepwise",xlab = "Number of variables", ylab = "bic")
which.min(reg.summary_bwd$bic)
points(3,reg.summary_bwd$bic[3],pch=20,col="red")
#Plot to see the difference in both the models
plot(regfit.full_var,scale = "bic",main = "Best Subset")
plot(regfit.fwd,scale="bic",main = "Forward Stepwise")
plot(regfit.bwd,scale="bic",main = "Backward Stepwise")
par(mfrow=c(1,1))
plot(sqrt(regfit.fwd$rss[-1]/180),col="blue",main = "Validation",xlab = "Predictor count",ylab = "RSS error",pch=10,type = 'b')
#The Root mean RSS error shows that error is monotonically decreasing in forward stepwise. This is true
#because everytime when a new variable is included it improves the fit and hence error decreases
#This CV is used in model selection
#Here we use 10 fold cross validation to find model selection
predict.regsubsets=function(object,newdata,id,...){
form=as.formula(object$call[[2]])
mat=model.matrix(form,newdata)
coefi=coef(object,id=id)
mat[,names(coefi)]%*%coefi
}
set.seed(11)
folds=sample(rep(1:10,length=nrow(mydat)))
table(folds)
cv.error_model=matrix(NA,10,10)
for(k in 1:10){
best.fit=regsubsets(song_hotttnesss~.,data = mydat[folds!=k,],nvmax = 10,method = "forward")
for(i in 1:10){
pred=predict(best.fit,mydat[folds==k,],id=i)
cv.error_model[k,i]=mean((song_hotttnesss[folds==k]-pred)^2)
}
}
rmse.cv=sqrt(apply(cv.error_model,2,mean))
plot(rmse.cv,pch=10,col="red",main = "10-fold Cross Validation",xlab = "Predictor count",ylab = "Root mean squared error",type = 'b')
#This favours model of size 6 as it gets lowest RMSE
#Ridge regression
x=model.matrix(song_hotttnesss~.-1,data = mydat)
y=song_hotttnesss
fit.ridge=glmnet(x,y,alpha = 0)
plot(fit.ridge,xvar="lambda",label=TRUE)
#Here coefficients are the beta terms
#Higher the lambda lower the beta and becomes 0 at one point
#When lambda is 0, then model behaves as normal RSS
#Unlike forward or best subset, here no predictor is dropped, it only shrinks the parameter
#10-fold cross validation for the same
cv.ridge=cv.glmnet(x,y,alpha=0)
plot(cv.ridge)
#Here the two vertical lines in the graph indicates one is the minimum and
#the other is within 1 standard error of the minimum
#This is not restricted model as the 1 standard error doesnt have a considerable width between
#Lasso model
fit.lasso=glmnet(x,y)#by default alpha is taken as 1
plot(fit.lasso,xvar="lambda",label=TRUE)
plot(fit.lasso,xvar = "dev",label = TRUE)
#This deviance graph and x-axis is r-squared
#And there is sudden jump of coefficients which shows that model is not overfit
coef(cv.ridge)
#We can see that apart from artist familiarity, artist hotttnesss and loudness
#The weight of other coefficients are very very small and doesnt make any considerable impact to the fit
cv.lasso=cv.glmnet(x,y)
plot(cv.lasso)
#The top of the plot shows how many coefficients are present at all instances
#It is doing shrinkage and variable selection
#From the plot we infer that minimum cross validation is when model selection is 6
#And it is pretty flat untill we have 1 standard error of the minimum which happenes at 3
coef(cv.lasso)
#Here the coefficients of the best model will be printed
#Clearly we can see that model with 3predictors gives best fit as we got in other methods also
#And also in both the ridge and lasso
#We notice that only 3 coefficients seems to be significant beacause
#others tend to zero which means model selection gives 3 as the best fit
#Non linear models
#Polynomial logistic regression
glm.poly=predict(glm.fit,newdata = mydat,type="response",se=T)
se.bands=cbind(glm.poly$fit+2*glm.poly$se,glm.poly$fit-2*glm.poly$se)
plot(artist_familiarity,song_hotttnesss)
points(artist_familiarity,fitted(fit4),col='blue')
matpoints(artist_familiarity,se.bands,col='red',cex=0.7)
legend("topleft", legend=c("1-Hit","2-Fail"),col='red')
#Here we can see that the polynomial logstic regression does a good work in classifying the data
#Splines
#Fix knot regression splines
fita=lm(song_hotttnesss~bs(artist_familiarity,knots = c(0.10,0.15,0.25)),data=mydat)
plot(artist_familiarity,song_hotttnesss)
points(artist_familiarity,fitted(fit4),col='darkgreen')
abline(v=c(0.1,0.15,0.25),lty=2,col='darkgreen')
summary(fita)
#Here is df is taken as 6
#Smooth splines using degrees of freedom
plot(artist_familiarity,song_hotttnesss)
fitb=smooth.spline(artist_familiarity,song_hotttnesss,df=16)
lines(fitb,col='blue',lwd=2)
#We can also determine the smoothing parameter using CV
#Here we use LOOCV
plot(artist_familiarity,song_hotttnesss)
fitc=smooth.spline(artist_familiarity,song_hotttnesss,cv=TRUE)
lines(fitc,col='purple',lwd=2)
fitc
#Here we find out that df is taken as 4.64
#Generalised additive models(GAM)
#For linear regression
gam1=gam(song_hotttnesss~s(artist_familiarity,df=4)+s(artist_hotttnesss,df=4)+s(loudness,df=2),data = mydat)
par(mfrow=c(1,3))
plot(gam1,se=T)
#We can see clearly that as artist familiarity and artist hotttnesss increases
#Song hotttnesss also increases accordingly
#But as the loudness increases we see that song hotttnesss decreases
#Thus it has a negative slope and that is why coefficient of loudness is
#also negative
#For logistic regression
gam2=gam(I(song_hotttnesss>threshold)~s(artist_familiarity,df=4)+s(artist_hotttnesss,df=4)+s(loudness,df=2),data = mydat,family = binomial)
plot(gam2,se=T)
#The point of these graphs is that the show the contribution to the y-predictor
#by looking at the probability of each x as a seperate function
#Non linear terms that are required
gam2a=gam(I(song_hotttnesss>threshold)~s(artist_familiarity,df=4)+artist_hotttnesss+s(loudness,df=2),data = mydat,family = binomial)
anova(gam2a,gam2,test='Chisq')
#We infer that we highly need non linear term in the variables after analysing the chisq test
#Decision trees
hit=ifelse(song_hotttnesss<=threshold,"Fail","Hit")
mydat_tree=data.frame(mydat,hit)
trees.song=tree(hit~.-song_hotttnesss-tempo-duration,data = mydat_tree)
trees.song
summary(trees.song)
par(mfrow=c(1,1))
plot(trees.song)
text(trees.song,pretty = 5)
#Testing the same
hit_test=ifelse(mytest$song_hotttnesss<=threshold_test,"Fail","Hit")
mytest_tree=data.frame(mytest,hit_test)
tree.pred=predict(trees.song,mytest_tree,type = 'class')
tab5_test=table(tree.pred,hit_test)
tab5_test
accuracy5_test=sum(diag(tab5_test))/sum(tab5_test)
error5_test=1-accuracy5_test
accuracy5_test
error5_test
#Pruning
cv.mydat=cv.tree(trees.song,FUN=prune.misclass)
cv.mydat
plot(cv.mydat)
prune.song=prune.misclass(trees.song,best = 5)
plot(prune.song)
text(prune.song,pretty = 5)
#Now that we pruned using cross validation we are able to see the tree more
#clearly and can interpret conditions when the song is a hit or a fail
tree.pred_prune=predict(prune.song,mytest_tree,type = 'class')
tab6_test=table(tree.pred_prune,hit_test)
tab6_test
#We didnt get much from pruning seeing the same accuracy
#But we can interpret the data more clearly now
#Random forests
set.seed(175)
rf.song=randomForest(song_hotttnesss~.,data = mydat)
rf.song
oob.err=double(10)
test.err=double(10)
for(mtry in 1:10){
fit_random=randomForest(song_hotttnesss~.,mytest,mtry=mtry,ntree=400)
oob.err[mtry]=fit_random$mse[400]
pred_random=predict(fit_random,mytest)
test.err[mtry]=mean((mytest$song_hotttnesss)^2)
cat(mtry," ")
}
matplot(1:mtry,oob.err,pch = 19,col='red', type = 'b',
ylab = 'Mean squared error',main='OOB error rate')
matplot(1:mtry,test.err,pch = 19, col='blue',type = 'b',
ylab = 'Mean squared error',main='Test error rate')
#We observe that for both oob error and test error, for mtry =2, we have least MSE
#Boosting
boost.song=gbm(song_hotttnesss~.,data=mydat,distribution = 'gaussian',
n.trees = 10000,shrinkage = 0.01,interaction.depth = 4)
summary(boost.song)
#Clearly we can see that which variables are important by looking at the bar plot
plot(boost.song,i='artist_hotttnesss')
plot(boost.song,i='artist_familiarity')
plot(boost.song,i='loudness')
#We can see clearly that as artist familiarity and artist hotttnesss increases
#Song hotttnesss also increases accordingly
#But as the loudness increases we see that song hotttnesss decreases
#Thus it has a negative slope and that is why coefficient of loudness is
#also negative
n.trees=seq(from = 100, to = 10000, by=100)
predmat=predict(boost.song,mytest,n.trees = n.trees)
dim(predmat)
boost_err=apply((predmat-mytest$song_hotttnesss)^2,2,mean)
plot(n.trees,boost_err,pch=19,ylab = 'Mean squared error',xlab = 'Number of trees',main = 'Boosting test error')
plot(as.factor(c("Boosting","Random forest")),c(min(boost_err),min(test.err)),main = "Random forest vs Boosting for testing",ylab = "Error rate")
#We notice there is drastic drop in the error rate which shows that boosting is ideal here
1-min(boost_err)