# meansdhist.R
## ----global_options, include = FALSE-------------------------------------------------------------------------------------------------------------------
# Source the .Rprofile one directory up. Wrapped in try() so that a failure
# there (e.g. the file is missing) is reported but does not stop the script.
try(source("../.Rprofile"))
## ------------------------------------------------------------------------------------------------------------------------------------------------------
# Load in Data Tools
# For Reading/Loading Data
library(tidyverse)
# Load in Data
# The path is relative to the current working directory. Later chunks read the
# columns month, state, city and temp.f from this table.
df_temp <- read_csv('data/TempCitiesUSA.csv')
## ------------------------------------------------------------------------------------------------------------------------------------------------------
# Tabulate the distinct values of one categorical column in a compact grid.
#
# Arguments:
#   df               data frame (or tibble) that holds the column
#   cate.var.str     name of the categorical column, given as a string
#   lvl_str_max_len  each "value (n=count)" label is cut to this many characters
#   max_col_count    upper bound on the number of grid columns (default 8)
#
# Returns a list with two elements:
#   title   one-line description: the expression passed as df, the number of
#           distinct levels, and the column name
#   levels  character matrix of "value (n=count)" labels, filled column by
#           column and padded with "" so that the grid is rectangular; a 0 x 0
#           matrix when df has no rows
show.unique.values <- function(df, cate.var.str, lvl_str_max_len = 15,
                               max_col_count = 8) {
  # Text of the expression the caller passed as df. deparse() can split a long
  # expression over several strings, so collapse to keep the title length one.
  df.label <- paste(deparse(substitute(df)), collapse = " ")

  # One row per distinct level, with its frequency
  level.freq <- df %>%
    group_by(!!sym(cate.var.str)) %>%
    summarise(freq = n())
  unique.count <- nrow(level.freq)

  if (unique.count == 0) {
    # No levels at all: sqrt(0) columns would lead to a 0/0 row count, so
    # return an empty grid explicitly
    levels.grid <- matrix(character(0), nrow = 0, ncol = 0)
  } else {
    # Roughly square layout, capped at max_col_count columns
    col.count <- min(ceiling(sqrt(unique.count)), max_col_count)
    row.count <- ceiling(unique.count / col.count)

    # "level (n=count)" labels, shortened to lvl_str_max_len characters
    level.labels <- substring(
      paste0(level.freq[[cate.var.str]], " (n=", level.freq$freq, ")"),
      first = 1, last = lvl_str_max_len
    )

    # Pad with empty strings up to a full grid, then shape column by column
    pad.count <- row.count * col.count - unique.count
    levels.grid <- matrix(
      c(level.labels, rep("", pad.count)),
      nrow = row.count, ncol = col.count
    )
  }

  title <- sprintf("From Dataset: %s, %d unique Levels for: %s",
                   df.label, unique.count, cate.var.str)
  list(title = title, levels = levels.grid)
}
## ------------------------------------------------------------------------------------------------------------------------------------------------------
# Categorical columns whose levels we want to tabulate
cate.vars.list <- c('month', 'state', 'city')
# Build the title and the level grid for each of those columns of df_temp
lapply(
  cate.vars.list,
  function(cate.var) {
    show.unique.values(
      df = df_temp,
      cate.var.str = cate.var,
      lvl_str_max_len = 30
    )
  }
)
## ------------------------------------------------------------------------------------------------------------------------------------------------------
# Control Graph Size
options(repr.plot.width = 5, repr.plot.height = 5)
# One x-axis tick per distinct month. Passing the raw month column instead
# would hand ggplot one (duplicated) break and label per row of the data.
month.breaks <- sort(unique(df_temp$month))
# Draw Scatter Plot
# 1. specify x and y
# 2. jitter the points (width 0.15) to reduce overplotting within a month
# 3. label the axes and put a tick on every month
scatter <- ggplot(df_temp, aes(x = month, y = temp.f)) +
  geom_jitter(size = 0.1, width = 0.15) +
  labs(title = 'Distribution of Temperature Across Cities in USA',
       x = 'Months',
       y = 'Temperature in Fahrenheit',
       caption = 'Temperature data 2017') +
  scale_x_continuous(labels = as.character(month.breaks),
                     breaks = month.breaks) +
  theme_bw()
print(scatter)
## ------------------------------------------------------------------------------------------------------------------------------------------------------
# Control Graph Size
options(repr.plot.width = 5, repr.plot.height = 5)
# First Filter Data
df_temp_txflak <- df_temp %>% filter(state %in% c('AK', 'TX', 'FL'))
# One x-axis tick per distinct month of the full dataset. Passing the raw
# month column instead would hand ggplot one (duplicated) break per data row.
month.breaks <- sort(unique(df_temp$month))
# Draw Scatter Plot
# 1. specify x and y
# 2. colour each point by its state
# 3. jitter the points (width 0.15) to reduce overplotting within a month
scatter <- ggplot(df_temp_txflak, aes(x = month, y = temp.f,
                                      colour = state)) +
  geom_jitter(size = 1, width = 0.15) +
  labs(title = 'Distribution of Temperature Across Cities\nin Florida (FL), Texas (TX) and Alaska (AK)',
       x = 'Months',
       y = 'Temperature in Fahrenheit',
       caption = 'Temperature data 2017') +
  scale_x_continuous(labels = as.character(month.breaks),
                     breaks = month.breaks) +
  theme_bw()
print(scatter)
## ------------------------------------------------------------------------------------------------------------------------------------------------------
# Month-level table: mean and standard deviation of temp.f, one row per month.
# The table is stored for the plots below; it is not printed here.
df_temp_mth_summ <- summarise(
  group_by(df_temp, month),
  mean_temp = mean(temp.f),
  sd_temp = sd(temp.f)
)
## ------------------------------------------------------------------------------------------------------------------------------------------------------
# Control Graph Size
options(repr.plot.width = 5, repr.plot.height = 4)
# Show mean and standard deviation in graphical form
# Reshape from wide to long first (one row per month and statistic), so that
# a single colour/linetype aesthetic separates the mean line from the sd line
lineplot <- df_temp_mth_summ %>%
  pivot_longer(cols = -month,
               names_to = "variable",
               values_to = "value") %>%
  ggplot(aes(x = month, y = value, colour = variable, linetype = variable)) +
  geom_line() +
  geom_point() +
  labs(title = 'Mean and SD of Temperature Across US Cities',
       x = 'Months',
       y = 'Temperature in Fahrenheit',
       caption = 'Temperature data 2017') +
  scale_x_continuous(labels = as.character(df_temp_mth_summ$month),
                     breaks = df_temp_mth_summ$month)
print(lineplot)
## ------------------------------------------------------------------------------------------------------------------------------------------------------
# Control Graph Size
options(repr.plot.width = 6, repr.plot.height = 6)
# x-axis ticks: every distinct month in the full dataset, computed here so
# this chunk does not rely on a summary table built in another chunk
month.breaks <- sort(unique(df_temp$month))
# Show mean and standard deviation in graphical form
# We start from the dataset:
# 1. select a subset of states we want
# 2. group by state and month to generate mean and sd, then drop the grouping
# 3. reshape to long form (one row per state, month and statistic)
# 4. generate line plots, one panel per state
lineplot <- df_temp %>%
  filter(state %in% c('AK', 'CA', 'FL', 'TX')) %>%
  group_by(state, month) %>%
  summarise(mean_temp = mean(temp.f), sd_temp = sd(temp.f)) %>%
  ungroup() %>%
  pivot_longer(cols = c(-month, -state),
               names_to = "variable",
               values_to = "value") %>%
  ggplot(aes(x = month, y = value,
             colour = variable, linetype = variable, shape = variable)) +
  facet_wrap( ~ state) +
  geom_line() +
  geom_point() +
  labs(title = 'Mean and SD of Temperature Across US Cities',
       x = 'Months',
       y = 'Temperature in Fahrenheit',
       caption = 'Temperature data 2017') +
  scale_x_continuous(labels = as.character(month.breaks),
                     breaks = month.breaks)
print(lineplot)
## ------------------------------------------------------------------------------------------------------------------------------------------------------
# Control Graph Size
options(repr.plot.width = 6, repr.plot.height = 4)
# x-axis ticks: every distinct month in the full dataset, computed here so
# this chunk does not rely on a summary table built in another chunk
month.breaks <- sort(unique(df_temp$month))
# Show mean and standard deviation in graphical form
# We start from the dataset:
# 1. select a subset of states we want
# 2. group by state and month to generate mean and sd, then drop the grouping
# 3. reshape to long form (one row per state, month and statistic)
# 4. generate line plots: one panel per statistic (each with its own y-axis),
#    one line per state
lineplot <- df_temp %>%
  filter(state %in% c('AK', 'CA', 'FL', 'TX')) %>%
  group_by(state, month) %>%
  summarise(mean_temp = mean(temp.f), sd_temp = sd(temp.f)) %>%
  ungroup() %>%
  pivot_longer(cols = c(-month, -state),
               names_to = "variable",
               values_to = "value") %>%
  ggplot(aes(x = month, y = value,
             colour = state, linetype = state, shape = state)) +
  facet_wrap( ~ variable, scales = "free_y") +
  geom_line() +
  geom_point() +
  labs(title = 'Mean and SD of Temperature Across US Cities',
       x = 'Months',
       y = 'Temperature in Fahrenheit',
       caption = 'Temperature data 2017') +
  scale_x_continuous(labels = as.character(month.breaks),
                     breaks = month.breaks)
print(lineplot)