Bating_Analysis.R
# ---------------------- This script covers preprocessing before model fitting: checking assumptions such as
# ---------------------- linearity and normality, and addressing problems in the dataset (step-wise regression,
# ---------------------- Principal Component Analysis, etc.) in order to fit an effective model
# Installing Package ggplot2
install.packages("ggplot2")
library(ggplot2)
# Before model fitting, we need to test our assumptions
install.packages("lmtest")
library(lmtest)
# First of all, we need to fit a model
## Importing Data
library(readxl)
Bating_100 = read_excel("Bating_100(Req).xlsx")
# Model using all columns
model2 = lm(Runs~.,data = Bating_100)
summary(model2)
# Perform the Durbin-Watson test (checking for autocorrelation)
# Expected value is about 2; values between 1.5 and 2.5 suggest no problematic autocorrelation
dw_result <- dwtest(model2)
dw_result
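# A minimal sketch (not part of the original analysis), applying the 1.5-2.5
# rule of thumb noted above to the DW statistic returned by dwtest()
if (dw_result$statistic < 1.5 || dw_result$statistic > 2.5) {
  message("DW statistic outside (1.5, 2.5): residuals may be autocorrelated")
} else {
  message("DW statistic close to 2: no strong evidence of autocorrelation")
}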
# AIC score (the lower the value, the better the model)
# Example: checking homoscedasticity
# Diagnostic plots include residuals vs. fitted values
plot(model2)
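# Optional formal check (an addition, not in the original script): the Breusch-Pagan
# test from lmtest complements the plots; a small p-value points to heteroscedasticity
bptest(model2)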
# Define a model with all predictors
all <- lm(Runs~.,data = Bating_100)
# Initialize a model with all predictors
backward_model <- lm(Runs~.,data = Bating_100)
# Backward stepwise regression
backward_model <- step(backward_model, direction = "backward")
# The Akaike information criterion (AIC) is an estimator of prediction error and thereby of the relative
# quality of statistical models for a given set of data. Given a collection of models for the data, AIC
# estimates the quality of each model relative to each of the other models.
# Thus, AIC provides a means for model selection.
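# Illustration (added as a sketch): compare the AIC of the full model and the
# backward-selected model directly; the model with the lower AIC is preferred
AIC(model2, backward_model)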
# Initialize a model with all predictors
both_model <- lm(Runs~.,data = Bating_100)
# Both-direction stepwise regression
both_model <- step(both_model, direction = "both")
plot(lm(Runs~.,data = Bating_100))
# Separation of the dependent and independent variables
ind_var = Bating_100[,c('NO', 'Avg','BF','SR','100','50','4s','6s')]
dep_var = Bating_100[,c("Runs")]
## Principal Component Analysis
install.packages("factoextra")
library(factoextra)
# PCA on the raw (unscaled) independent variables
Dim = prcomp(ind_var)
Dim
summary(Dim)
## Scree Plot showing explained variation
fviz_eig(Dim)
## Hence it is clearly visible that taking the first three principal components will suffice (quick check below)
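# Quick numeric check (added): cumulative proportion of variance explained per component
cumsum(Dim$sdev^2) / sum(Dim$sdev^2)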
Bating_100$PC1 = ((-0.004390014)*Bating_100$NO)+((-0.206135952)*Bating_100$Avg)+((-0.954977131)*Bating_100$BF)+((-0.143009283)*Bating_100$SR)+((-0.001822171)*Bating_100$`100`)+((-0.010512970)*Bating_100$`50`)+((-0.140155797)*Bating_100$`4s`)+((-0.072788951)*Bating_100$`6s`)
Bating_100$PC2 = ((0.0053420356)*Bating_100$NO)+((0.1197771531)*Bating_100$Avg)+((-0.1772873319)*Bating_100$BF)+((0.9750834830)*Bating_100$SR)+((0.0002296697)*Bating_100$`100`)+((0.0030177066)*Bating_100$`50`)+((0.0063965098)*Bating_100$`4s`)+((0.0579329335)*Bating_100$`6s`)
Bating_100$PC3 = ((-0.0457504257)*Bating_100$NO)+((-0.9682368592)*Bating_100$Avg)+((0.1763452449)*Bating_100$BF)+((0.1516984763)*Bating_100$SR)+((-0.0009854498)*Bating_100$`100`)+((-0.0042542952)*Bating_100$`50`)+((0.0777294403)*Bating_100$`4s`)+((-0.0159145808)*Bating_100$`6s`)
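# Equivalent construction (a sketch, assuming the hard-coded coefficients above were
# copied from Dim$rotation): apply the loading matrix to the raw variables directly,
# which should reproduce PC1-PC3 up to rounding of the typed coefficients
pc_scores <- as.matrix(ind_var) %*% Dim$rotation[, 1:3]
head(pc_scores)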
# Three-component model
new_model1 = lm(Runs~PC1+PC2+PC3, data = Bating_100)
summary(new_model1)
plot(lm(Runs~PC1+PC2+PC3,data = Bating_100))
# Two-component model
new_model2 = lm(Runs~PC1+PC2, data = Bating_100)
summary(new_model2)
plot(lm(Runs~PC1+PC2,data = Bating_100))
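# Nested-model comparison (added as a sketch) to support the conclusion below:
# a non-significant F-test means dropping PC3 does not worsen the fit
anova(new_model2, new_model1)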
# So we can say that PC3 is not a significant component; henceforth we
# shall consider only PC1 and PC2 when building the model in Python
install.packages("openxlsx")
library(openxlsx)
file_path <- "Bating_100(new).xlsx"
write.xlsx(Bating_100, file_path)