-
Notifications
You must be signed in to change notification settings - Fork 1
/
8 Evaluate Out of Sample.R
72 lines (51 loc) · 2.39 KB
/
8 Evaluate Out of Sample.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
#Project: Zillow Tracker
#Code: 8 Evaluate Out of Sample
#Author: Scott Onestak
#Last Executed: 4/18/2022
#Last model update: 4/6/2022
#Packages
library(tidyverse)
library(stringr)
library(h2o)
library(dplyr)
#Data
#Load the cleaned sales records and the Redfin median-price series.
#For each table, join_date is the first seven characters of the raw date
#string (presumably "YYYY-MM" - confirm against the CSVs); it is taken before
#the date column itself is converted from character to Date.
theData = read.csv(
  "Data/cleanedFinalDataset.csv",
  header = TRUE,
  stringsAsFactors = FALSE
) %>%
  mutate(
    join_date = substr(soldDate, 1, 7),
    soldDate = as.Date(soldDate),
    listDate = as.Date(listDate)
  )
redfin = read.csv(
  "Data/redfin_data_for_asset_appreciation.csv",
  header = TRUE,
  stringsAsFactors = FALSE
) %>%
  mutate(
    join_date = substr(Time, 1, 7),
    Time = as.Date(Time)
  )
#Filter data to >= 2020
#The year (first four characters of join_date) is compared numerically
#instead of being matched against a fixed list of years, so sales from 2023
#onward are no longer silently dropped. Rows whose year cannot be parsed
#become NA in the comparison and are removed by filter().
theData = theData %>%
  filter(as.integer(substr(join_date, 1, 4)) >= 2020L)
#Mutate the join for months too new for data
#Every sale month that has no row in redfin is re-pointed at the most recent
#redfin month. The most recent month is taken as the maximum join_date
#(assumes the strings are "YYYY-MM" and therefore sort chronologically -
#TODO confirm) rather than whichever row happens to be last in the file.
theMissing = setdiff(unique(theData$join_date), unique(redfin$join_date))
changeto = max(redfin$join_date, na.rm = TRUE)
theData$join_date = ifelse(
  theData$join_date %in% theMissing,
  changeto,
  theData$join_date
)
#Calculate Asset Appreciation to Now
#"Now" is the redfin row with the latest Time, located explicitly so the
#result does not depend on the row order of the redfin file.
price_now = redfin$Median_Price[which.max(as.numeric(redfin$Time))]
#Growth factor from each month's median price to the latest median price
#(algebraically price_now / Median_Price).
redfin$appreciation =
  ((price_now - redfin$Median_Price) / redfin$Median_Price) + 1
#Join Appreciation and Adjust Sold Price
#Attach the appreciation factor of each sale's join_date month, then scale
#the recorded sold price by that factor.
theData = left_join(
  theData,
  redfin[, c("join_date", "appreciation")],
  by = "join_date"
)
theData = mutate(theData, soldPriceAdj = soldPrice * appreciation)
#Filter out suburbs with little volume
#Count the sales per suburb and keep only suburbs with more than 20 of them.
theSuburbs = theData %>% count(suburb, name = "count")
#pull() returns the suburb names as a plain vector; unlist() on a one-column
#table would produce an artificially named vector instead.
suburbsKeep = theSuburbs %>%
  filter(count > 20) %>%
  pull(suburb)
#Drop the low-volume suburbs, then store suburb as a factor.
theData = theData %>%
  filter(suburb %in% suburbsKeep) %>%
  mutate(suburb = as.factor(suburb))
#Get the test data points
#Keep rows with a known living area that sold on or after 2022-04-06 (the
#cutoff equals the "Last model update" date in the file header).
theTestData = filter(
  theData,
  !is.na(livingArea),
  soldDate >= as.Date("2022-04-06")
)
#read in gbm model and execute
h2o.init(max_mem_size = "16g")
#Score inside tryCatch so the H2O cluster is always shut down, even when the
#upload, the model load, or the prediction fails. Any error is still raised
#after the shutdown has run.
tryCatch(
  {
    H2O_theTestData = as.h2o(theTestData)
    gbm = h2o.loadModel("Models/gbm")
    #Store the model's predictions next to the actual adjusted prices.
    theTestData$soldPriceAdjPred = as.vector(
      h2o.predict(gbm, H2O_theTestData)
    )
  },
  finally = h2o.shutdown(prompt = FALSE)
)
#Look at performance
#Signed prediction error on the adjusted sold price: absolute, and relative
#to the actual adjusted price (positive means the prediction is too high).
theTestData$soldPriceAdjDiff =
  theTestData$soldPriceAdjPred - theTestData$soldPriceAdj
theTestData$soldPriceAdjDiffPrct =
  theTestData$soldPriceAdjDiff / theTestData$soldPriceAdj
plot(theTestData$soldPriceAdj, theTestData$soldPriceAdjDiffPrct)
#print() explicitly: a bare summary() call is only auto-printed at top level
#and its output would be lost when the script is run through source().
print(summary(theTestData$soldPriceAdjDiffPrct))