-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathr_data_prep.R
141 lines (109 loc) · 7.37 KB
/
r_data_prep.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
#######################################################################################################################
## Author: Brianna Noland, brianna.noland@gmmail.com
## Date: 6/4/2016
#######################################################################################################################
library(data.table)
options(warn=-1)
# Read in data downloaded from https://www.whitehouse.gov/briefing-room/disclosures/visitor-records
whitehouse_visitors <- read.csv("TestData2016Messey.csv", stringsAsFactors = FALSE, na.strings=c("","NA"))
#######################################################################################################################
# Look at the names of the columns
colnames(whitehouse_visitors)
# There are a mix of upper and lower case headings. These should be consistent
colnames(whitehouse_visitors) <- tolower(colnames(whitehouse_visitors))
# double check to make sure it looks right
colnames(whitehouse_visitors)
#######################################################################################################################
# Take look at the first rows of the dataframe to get a feel for the data
head(whitehouse_visitors, 10)
# Notice that namelast, namefirst, vistee_namelast, visitee_namefirst, caller_name_last, and caller_name_first are a
# mix of upper and lowercase. This should be consistent
cols_to_lower <- c('namelast', 'namefirst', 'visitee_namelast', 'visitee_namefirst', 'caller_name_last',
'caller_name_first')
for(column in cols_to_lower) {
whitehouse_visitors[,column] <- iconv(whitehouse_visitors[,column], from = 'UTF-8', to = 'latin1')
tryCatch(whitehouse_visitors[,column] <- tolower(whitehouse_visitors[,column]),
error=function(e) {
print(paste('Error in column : ', column))
})
}
# Notice that I changed the encoding of the name columns from utf-8 to latin1. I did this to account for special
# characters that may be present in the names
#######################################################################################################################
# check the data types of the columns
sapply(whitehouse_visitors, class)
# A few things jump out at me: 1) based on the data description, bdgnbr, access_type, poa, pod, post, terminal_suffix,
# meeting_loc, meeting_room, and caller_room should be factors because they represent data that would have a limited
# number of unique entries (there may be more, but we don't know yet), 2) appt_made_date, toa, tod, appt_start_date,
# appt_end_date, appt_cancel_date, lastentrydate, release_date, and requestsubmitted should be date objects
to_factor <- c('bdgnbr', 'access_type', 'poa', 'pod', 'post', 'terminal_suffix',
'meeting_loc', 'meeting_room', 'caller_room')
for(column in to_factor) {
whitehouse_visitors[,column] <- as.factor(whitehouse_visitors[,column])
}
# double check the results
sapply(whitehouse_visitors, class)
# Notice that release_date doesn't contain seconds while the other date columns do - because this format is different,
# we will handle the correction in a separate line of code, similarly, requestsubmitted is in a different format
to_date <- c('appt_made_date', 'appt_start_date', 'appt_end_date', 'appt_cancel_date',
'lastentrydate', 'toa', 'tod')
for(column in to_date) {
whitehouse_visitors[,column] <- as.POSIXct(whitehouse_visitors[,column], format = '%m/%d/%Y %H:%M')
}
whitehouse_visitors$release_date <- as.POSIXct(whitehouse_visitors$release_date, format = '%m/%d/%Y')
whitehouse_visitors$requestsubmitted <- as.POSIXct(whitehouse_visitors$requestsubmitted, format = '%Y-%m-%d')
#######################################################################################################################
# review a summary of the data
summary(whitehouse_visitors)
# Notice here that caller_room is an empty column, so we can drop it safely
whitehouse_visitors <- whitehouse_visitors[,-which(colnames(whitehouse_visitors) == 'caller_room')]
colnames(whitehouse_visitors)
#######################################################################################################################
# Lets take a look at guest_phone and guest_zip, as they should have standard formats. We can check to see if they
# conform to the appropriate lengths (zip = 5, and phone = 9)
# Zip code
# Get the number of times each length of zip code appears (note that the zip codes are strings)
zip_lengths <- as.data.table(nchar(whitehouse_visitors$guest.zip))
zip_length_counts <- zip_lengths[,.N, by = V1]
zip_length_counts
# We can see that most zip codes are 5 or 10 characters. These are most likely valid codes, but since the majority are
# the first five digits, keep only the first five digits of those that are 9 or 10 i.e remove -xxxx or xxxx)
# Some of the zip codes contain spaces. Remove the spaces.
whitehouse_visitors$guest.zip <- gsub(' ', '', whitehouse_visitors$guest.zip)
whitehouse_visitors$guest.zip[(!is.na(whitehouse_visitors$guest.zip)) & (nchar(whitehouse_visitors$guest.zip) == 9 | nchar(whitehouse_visitors$guest.zip) == 10)] <- substr(whitehouse_visitors$guest.zip, 1, 5)
whitehouse_visitors$guest.zip[!(nchar(whitehouse_visitors$guest.zip) == 9 | nchar(whitehouse_visitors$guest.zip) == 10 | nchar(whitehouse_visitors$guest.zip) == 5)] <- NA
# Can repeat a similar excercise for phone number
#######################################################################################################################
# Examine car length and car weight
# These are numbers that can reasonably ordered and relative magnitude makes sense (whereas phone # and zip code do not)
# so lets convert these to integers
# Car weight
# First lets look at the first few values
head(whitehouse_visitors$car.weight, 20)
# We can see that there are a variety of formats. Lets remove whitespace, 'lbs', and commas
char_to_rem <- c(' ', 'lbs', ' lbs', ',')
whitehouse_visitors$car.weight <- as.integer(trimws(gsub('lbs|lb|,', '', whitehouse_visitors$car.weight)))
# Plot car weights to look for outliers
plot(whitehouse_visitors$car.weight)
# It looks like we've got 5 values that appear to be outliers. We can set those to NA
whitehouse_visitors$car.weight[whitehouse_visitors$car.weight > 60000] <- NA
plot(whitehouse_visitors$car.weight)
# Car length
# First lets look at the first few values
head(whitehouse_visitors$car.length.in.ft, 20)
# It looks like there are a number of nonnumeric values. Coerce the column to an integer and the nonnumeric values
# will convert to NA
whitehouse_visitors$car.length.in.ft <- as.integer(whitehouse_visitors$car.length.in.ft)
# Plot for outliers
plot(whitehouse_visitors$car.length.in.ft)
#######################################################################################################################
# Replace empty cells and Nulls with NAs
sapply(whitehouse_visitors, function(x), gsub(''))
# Determine how to deal with NAs
# First, calculate the percentage of each column that NA
sapply(whitehouse_visitors, function(x) round(sum(is.na(x))/length(x)*100, 4))
# How many NAs you can allow, and how you handle them are dependent on the application.
# In this case, we will drop all rows with 20% or more NAs
whitehouse_visitors <- whitehouse_visitors[sapply(whitehouse_visitors, function(x) round(sum(is.na(x))/length(x)*100, 4)) < 20]
#######################################################################################################################
summary(whitehouse_visitors)