-
Notifications
You must be signed in to change notification settings - Fork 257
/
Copy pathload_trips.R
58 lines (45 loc) · 1.98 KB
/
load_trips.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
library(dplyr)
library(readr)
# Turn strings (or factors) into POSIXct datetimes.
#
# Arguments:
#   s      - vector of timestamps; coerced with as.character() first, so
#            factor columns are parsed by label rather than by integer code
#   format - strptime-style format string describing `s`
#   tz     - time zone used to interpret `s`; the default "" means the
#            current session time zone (the behaviour before `tz` existed).
#            Pass tz = "UTC" to line up with datetimes parsed by read_csv,
#            which uses UTC unless told otherwise.
#
# Returns a POSIXct vector the same length as `s`; entries that do not match
# `format` become NA.
#
# NOTE: this definition masks readr::parse_datetime for the rest of the
# script; call readr::parse_datetime() explicitly if that one is needed.
parse_datetime <- function(s, format = "%Y-%m-%d %H:%M:%S", tz = "") {
  as.POSIXct(as.character(s), format = format, tz = tz)
}
########################################
# load and clean trip data
########################################

# Parse the "month/day/year hour:minute[:second]" timestamps that read_csv
# leaves as character (the date format changed in 2014-09).
#
# Timestamps are interpreted as UTC so they line up with the months that
# read_csv parses automatically (readr's default locale uses UTC); parsing
# them in the session time zone would shift these months relative to the
# others and could push late-evening trips onto the wrong `ymd`.
#
# strptime ignores trailing characters, so the minute-only format would
# silently drop any seconds present. Try the format with seconds first and
# fall back to the minute-only format for values that do not match it.
# NOTE(review): whether the raw files actually carry seconds is not visible
# from this script -- confirm against the data.
parse_trip_datetime <- function(x) {
  x <- as.character(x)
  parsed <- as.POSIXct(x, format = "%m/%d/%Y %H:%M:%S", tz = "UTC")
  unparsed <- is.na(parsed)
  parsed[unparsed] <- as.POSIXct(x[unparsed], format = "%m/%d/%Y %H:%M",
                                 tz = "UTC")
  parsed
}

# Read one month of trip data, converting the start/stop times to datetimes
# when read_csv could not recognize them.
read_trip_csv <- function(csv) {
  print(csv)
  tmp <- read_csv(csv, na = "\\N")

  if (is.character(tmp$starttime)) {
    tmp <- mutate(tmp,
                  starttime = parse_trip_datetime(starttime),
                  stoptime = parse_trip_datetime(stoptime))
  }

  tmp
}

# load each month of the trip data into one big data frame
csvs <- Sys.glob("*-tripdata.csv")
if (length(csvs) == 0) {
  stop("no '*-tripdata.csv' files found in ", getwd(), call. = FALSE)
}

# read every file first and bind once, rather than growing `trips` with
# rbind() on each iteration (which re-copies all previous rows every time)
trips <- bind_rows(lapply(csvs, read_trip_csv))

# replace spaces in column names with underscores
names(trips) <- gsub(" ", "_", names(trips), fixed = TRUE)

# add a column for year/month/day (without time of day)
trips <- mutate(trips, ymd = as.Date(starttime, tz = "UTC"))

# recode gender as a factor 0->"Unknown", 1->"Male", 2->"Female"
trips <- mutate(trips,
                gender = factor(gender,
                                levels = c(0, 1, 2),
                                labels = c("Unknown", "Male", "Female")))
########################################
# load and clean weather data
########################################

# load weather data from belvedere tower in central park
# https://www.ncei.noaa.gov/orders/cdo/2992179.csv
# ordered from
# http://www.ncdc.noaa.gov/cdo-web/datasets/GHCND/stations/GHCND:USW00094728/detail
weather <- read.table("weather.csv", header = TRUE, sep = ",")

# extract just a few columns and lowercase the column names
weather <- select(weather, DATE, PRCP, SNWD, SNOW, TMAX, TMIN)
names(weather) <- tolower(names(weather))

# parse the date column straight to a Date. Going through a POSIXct in the
# session time zone and then calling as.Date() can move the result to the
# neighbouring day, because as.Date() on a POSIXct converts to UTC by default.
# as.character() keeps this working if read.table returned a factor.
weather <- mutate(weather,
                  ymd = as.Date(as.character(date), format = "%Y-%m-%d"))

# convert to a tibble (as_tibble() replaces the deprecated tbl_df())
weather <- as_tibble(weather)

# save data frames for easy loading in the future
save(trips, weather, file = "trips.RData")