# Step 1 Exploratory data analysis and binning.R
# Forked from frankhlchi/R-scorecard.
library(caret)
library(smbinning)
library(ggplot2)
# Load the data ----
german_credit <- read.csv("~/german_credit.csv")

# Everything below keys off the Creditability column, so fail early with a
# clear message if the file does not contain it.
if (!"Creditability" %in% names(german_credit)) {
  stop("Column 'Creditability' not found in german_credit.csv", call. = FALSE)
}

# Fix the RNG seed so the train/test split (and every result derived from
# it) is reproducible between runs.
set.seed(1234)

# createDataPartition() samples within outcome classes only when `y` is a
# factor; a numeric `y` is grouped by percentiles instead, so the outcome is
# converted explicitly. list = FALSE returns the selected row indices as a
# one-column matrix rather than a list.
train <- createDataPartition(
  y = factor(german_credit$Creditability),
  p = 0.75,
  list = FALSE
)
train2 <- german_credit[train, ]  # rows selected for training (p = 0.75)
test2 <- german_credit[-train, ]  # all remaining rows
# Explore data distributions ----

# Build the histogram layer shared by the numeric variables; only the bin
# width differs between plots. The y axis shows counts, which is the
# histogram default, so it is not mapped explicitly.
# NOTE(review): `size` sets the bar outline width here; ggplot2 >= 3.4.0
# prefers `linewidth` -- switch once the minimum ggplot2 version is confirmed.
eda_histogram <- function(bin_width) {
  geom_histogram(
    fill = "blue", colour = "grey60", size = 0.2, alpha = 0.2,
    binwidth = bin_width
  )
}

# print() is explicit so the plots are also drawn when this script is run
# through source(), which does not auto-print top-level values by default.
print(ggplot(german_credit, aes(x = Duration)) + eda_histogram(5))
print(ggplot(german_credit, aes(x = CreditAmount)) + eda_histogram(1000))
print(ggplot(german_credit, aes(x = Age)) + eda_histogram(5))

# Creditability is drawn as one bar per distinct value. geom_bar() counts
# rows per value directly, whereas geom_histogram(stat = "count") warns about
# ignored binning parameters.
print(
  ggplot(german_credit, aes(x = Creditability)) +
    geom_bar(fill = "blue", colour = "grey60", alpha = 0.2)
)
# Optimal binning ----
# Bin each numeric predictor against Creditability on the training partition.
# p = 0.05 is forwarded to smbinning() unchanged (per the smbinning docs, the
# minimum share of records per bin -- TODO confirm).
Durationresult <- smbinning(
  df = train2, y = "Creditability", x = "Duration", p = 0.05
)
CreditAmountresult <- smbinning(
  df = train2, y = "Creditability", x = "CreditAmount", p = 0.05
)
Ageresult <- smbinning(
  df = train2, y = "Creditability", x = "Age", p = 0.05
)

# Draw the Weight-of-Evidence plot for one binning result.
# smbinning() returns a list on success but a plain character message (for
# example when no significant split is found), and smbinning.plot() cannot
# handle the latter. In that case warn and skip instead of aborting the script.
plot_woe <- function(binning, label) {
  if (!is.list(binning)) {
    warning(
      "No WoE plot for ", label, ": ",
      paste(as.character(binning), collapse = " "),
      call. = FALSE
    )
    return(invisible(NULL))
  }
  smbinning.plot(binning, option = "WoE", sub = label)
}

plot_woe(CreditAmountresult, "CreditAmount")
plot_woe(Durationresult, "Duration")
plot_woe(Ageresult, "Age")