-
Notifications
You must be signed in to change notification settings - Fork 0
/
.Rhistory
92 lines (92 loc) · 3.08 KB
/
.Rhistory
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
# Early interactive attempts at preparing df_ip. The same inspection and
# factor-conversion steps are repeated further down, after the CSV has been
# read with fread().
# NOTE(review): nothing visible above this point creates df_ip - confirm it
# already existed in the session when these lines were run.
#Data Conversion into Factors
df_ip$city<-as.factor(df_ip$city)
# Run the complete modelling script from its absolute path.
source("C:/Users/lol51/Downloads/Driving_range-main/Driving Range Prediction Model.r")
#Data Conversion into Factors
df_ip$city<-as.factor(df_ip$city)
#Checking the structure of Data
head(df_ip)
dim(df_ip)
str(df_ip)
# Count the NA values in every column.
# (Author's note from the session: there are no missing values in the data.)
colSums(is.na(df_ip))
# Convert the categorical columns to factors, one column at a time.
df_ip$city<-as.factor(df_ip$city)
df_ip$motor_way<-as.factor(df_ip$motor_way)
df_ip$country_roads<-as.factor(df_ip$country_roads)
# `A/C` needs backticks because "/" is not valid in a bare R name.
df_ip$`A/C` <-as.factor(df_ip$`A/C`)
# Bare literal: evaluates to 8 and does not touch df_ip.
# NOTE(review): presumably a stray keystroke - confirm it can be dropped.
8
df_ip$park_heating<-as.factor(df_ip$park_heating)
# Load the trip data ----
# fread() comes from data.table, so the package has to be attached before the
# first read. Install it only when it is missing, so re-running this section
# does not reinstall the package every time.
if (!requireNamespace("data.table", quietly = TRUE)) {
  install.packages("data.table")
}
library(data.table)
# Forward slashes are used on purpose: a single-backslash Windows path such as
# "C:\Users\..." is not a valid R string literal ("\U" starts a Unicode
# escape), which makes the whole file fail to parse.
setwd("C:/Users/lol51/Downloads/Driving_range-main")
# Read the CSV once, relative to the project directory set above.
df_ip <- fread("Electric_cars.csv")
# Inspect the freshly loaded table: first rows, dimensions and column types.
head(df_ip)
dim(df_ip)
str(df_ip)
# Count the NA values in every column.
# (Author's note from the session: there are no missing values in the data.)
colSums(is.na(df_ip))
# Convert the categorical columns to factors, one column at a time.
df_ip$city<-as.factor(df_ip$city)
df_ip$motor_way<-as.factor(df_ip$motor_way)
df_ip$country_roads<-as.factor(df_ip$country_roads)
# `A/C` needs backticks because "/" is not valid in a bare R name.
df_ip$`A/C` <-as.factor(df_ip$`A/C`)
df_ip$park_heating<-as.factor(df_ip$park_heating)
# Coerce to a plain data.frame before the base-R style row filtering below.
df_ip<-as.data.frame(df_ip)
# Re-check the column types after conversion and print per-column summaries.
str(df_ip)
summary(df_ip)
# Work on df1 from here on; df_ip is not modified by the later filters.
df1<-df_ip
# EXPLORATORY DATA ANALYSIS
# DEPENDENT VARIABLE
# Box plot and histogram of the target variable, trip_distance(km).
boxplot(df1$`trip_distance(km)`)
hist(df1$`trip_distance(km)`)
# The plots show many outliers in the target variable that have to be removed.
# The filter below keeps only rows with trip_distance(km) <= 100.
# NOTE(review): the original note said values > 150 could be removed, which
# does not match the 100 km cutoff in the code - confirm the intended threshold.
df1<-df1[df1$`trip_distance(km)`<=100,]
# The quantity(kWh) field also appears to contain outliers. For this model only
# rows with quantity(kWh) <= 15 are kept, i.e. anything greater than 15 is
# removed.
df1<-df1[df1$`quantity(kWh)` <=15,]
# UNI-VARIATE ANALYSIS
# Desc() from DescTools is applied to the whole filtered data frame.
library(DescTools)
Desc(df1)
# BI-VARIATE ANALYSIS
library(dplyr)
library(DT)
# Average driving range by city.
# Author's note from the session: the summaries suggest that city has high
# predictive power; class imbalance across cities still has to be checked.
# Range_City holds one row per city with its mean trip distance, sorted from
# the highest average to the lowest.
Range_City= df1 %>%
group_by(city) %>%
summarise(avg_range = mean(`trip_distance(km)`)) %>%
arrange(desc(avg_range))
# Show the summary as a DT::datatable widget.
DT::datatable(Range_City)
# Bar chart of the same per-city averages. The plot object is only assigned to
# plt_city here; it is not printed anywhere in the visible lines.
# NOTE(review): ggplot2 is not attached anywhere in the visible history -
# confirm it is loaded before this statement runs.
plt_city = df1 %>% group_by(city) %>% summarise(avg_range = mean(`trip_distance(km)`)) %>%
ggplot(aes(x=city,y=avg_range)) + geom_bar(stat="identity") +
labs(title = "Average Driving Range by City")
# Print the rpart predictions and their summary.
# NOTE(review): predict_rpart, train and test are created outside the visible
# history - confirm they exist before running this section.
predict_rpart
summary(predict_rpart)
#Adding RPart Predicted values to the test dataframe
test$Rpart_Pred<-predict_rpart
# RANDOM FOREST MODEL
# NOTE(review): randomForest() is called without a visible library() call -
# confirm the randomForest package is attached.
set.seed(120) # Setting seed
# Fit on every column of train except the first, with trip_distance(km) as the
# response and 500 trees.
# NOTE(review): presumably the first column of train is trip_distance(km);
# verify, otherwise the response is also passed in as a predictor.
classifier_RF = randomForest(x = train[-1],
y = train$`trip_distance(km)`,
ntree = 500)
library(dplyr)
# Predicting the Test set results
# NOTE(review): test already gained the Rpart_Pred column above, so test[-1]
# includes it - confirm predict() ignores the extra column as intended.
y_pred = predict(classifier_RF, newdata = test[-1])
#Adding RF Predicted values to the test dataframe
test$RandomForest_Pred<-y_pred
# Plotting model
plot(classifier_RF)
# Importance measures for each predictor (printed, not plotted)
importance(classifier_RF)
# Variable importance plot
varImpPlot(classifier_RF)
# Export the test data frame, including both prediction columns, to an Excel
# file in the current working directory.
library("writexl")
write_xlsx(test,"prediction_output.xlsx")