-
Notifications
You must be signed in to change notification settings - Fork 0
/
version3.r
58 lines (48 loc) · 2.83 KB
/
version3.r
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
library(dplyr)
library(tidyr)
library(caTools)
setwd("~/Documents/Statistik/kaggle")
hb_read <- read.csv(file="housing_builds.csv", stringsAsFactors = TRUE, header = TRUE)
#select income and ownership columns
hb_mod<-hb_read%>%select(Counted.Homeownership.Units,Counted.Rental.Units,Extremely.Low.Income.Units,Low.Income.Units,Very.Low.Income.Units,Moderate.Income.Units,Middle.Income.Units,Other.Income.Units)
#setting low and high income counts
hb_mod$low<-hb_mod$Extremely.Low.Income.Units+hb_mod$Low.Income.Units+hb_mod$Very.Low.Income.Units
hb_mod$high<-hb_mod$Moderate.Income.Units+hb_mod$Middle.Income.Units+hb_mod$Other.Income.Units
hb_mod<-hb_mod%>%select(high,low,Counted.Homeownership.Units,Counted.Rental.Units)
#setting new variable All.Units
hb_mod$All.Units <- hb_mod$Counted.Homeownership.Units+hb_mod$Counted.Rental.Units
#Percent low
hb_mod$Percent.of.low.income.Units <- (hb_mod$low/hb_mod$All.Units)*100
#Percent high
hb_mod$Percent.of.high.income.Units <- (hb_mod$high/hb_mod$All.Units)*100
#Percent of rental
hb_mod$Percent.of.rental.Units <- (hb_mod$Counted.Rental.Units/hb_mod$All.Units)*100
#Percent of homeownership
hb_mod$Percent.of.homeownership.Units <- (hb_mod$Counted.Homeownership.Units/hb_mod$All.Units)*100
hb_mod
#Next thing is regression. We want to predict our Dependent Variable 'Counted Homeownership Units' per House.
#We have 'low', 'high', 'TotalNumberOfHousingUnits' and our percentage of ____ per House as Independent Variables.
#I would say we take 100 cases and delete the Values of 'Counted Homeownership Units'.
#Then we make a Regression Analysis with the rest and predict the 100 cases. We want to reach 95 right cases.
require(caTools)
set.seed(101)
sample = sample.split(hb_mod$All.Units, SplitRatio = .75)
train = subset(hb_mod, sample == TRUE)
test = subset(hb_mod, sample == FALSE)
#Linear Regression
#linearMod <- lm(train$Counted.Homeownership.Units ~ train$high + train$low + train$All.Units, data=hb_mod) # build linear regression model on full data
#print(linearMod)
#Linear Regression
linearMod <- lm(Counted.Homeownership.Units ~ high +low + +Counted.Rental.Units, +All.Units, data=train) # build linear regression model on train data
#All.Units was not producing a good fit the predict function said it was a rank-deficient model, dunno why
print(linearMod)
### NOT SIGNIFICANT: Percent.of.high.income.Units +Percent.of.homeownership.Units +Percent.of.low.income.Units +Percent.of.rental.Units
#simple prediciton
preds <- predict(linearMod, test)
test$predicted.units<-preds
as.data.frame(test$predicted.units,test$Counted.Homeownership.Units)
summary(linearMod)
#Here we test if our predicted values are relatively equal to the the actual values
test$TRUFALS <- near(test$Counted.Homeownership.Units, test$predicted.units, tol = test$Counted.Homeownership.Units * 0.05)
gezaehlt <- count(test, vars = TRUFALS)
gezaehlt