-
Notifications
You must be signed in to change notification settings - Fork 3
/
05-tidying-data.R
231 lines (181 loc) · 7.51 KB
/
05-tidying-data.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
### -----------------------------
## advanced R & recent advances in R
## simon munzert
### -----------------------------
## peparations -------------------
source("00-course-setup.r")
wd <- getwd()
## basic data tidying and examination with janitor ----------
# clean variable names
# only lowercase letters, with _ as separator
# handles special characters and spaces
# appends numbers to duplicated names
foo_df <- as.data.frame(matrix(ncol = 6))
names(foo_df) <- c("hIgHlo", "REPEAT VALUE", "REPEAT VALUE", "% successful (2009)", "abc@!*", "")
foo_df
janitor::clean_names(foo_df)
make.names(names(foo_df)) # base R solution - not very convincing
# convert multiple values to NA
convert_to_NA(letters[1:5], c("b", "d"))
convert_to_NA(sample(c(1:5, 98, 99), 20, replace = TRUE), c(98,99))
# clean frequency tables
head(mtcars)
table(mtcars$cyl)
janitor::tabyl(mtcars$cyl, show_na = TRUE, sort = TRUE)
janitor::tabyl(mtcars$cyl, show_na = TRUE, sort = TRUE) %>% add_totals_row()
# clean cross tabulations
mtcars %$% table(cyl, gear)
mtcars %>% janitor::crosstab(cyl, gear)
mtcars %>% janitor::crosstab(cyl, gear) %>% adorn_crosstab(denom = "row", show_totals = TRUE)
# use first valid value of multiple variables to get rid of NAs
set.seed(123)
x <- sample(c(1:10, rep(NA, 5)))
y <- sample(c(1:10, rep(NA, 5)))
z <- sample(c(1:10, rep(NA, 5)))
foo_df <- data.frame(x, y, z)
foo_df
foo_df %$% ifelse(!is.na(x), x, ifelse(!is.na(y), y, ifelse(!is.na(z), z, NA)))
foo_df %$% use_first_valid_of(x, y, z)
# more functionality available; see
browseURL("https://cran.r-project.org/web/packages/janitor/vignettes/introduction.html")
browseURL("https://github.com/sfirke/janitor") # worth a look if you have to deal with messy Excel/spreadsheet data
## tidying data frames with tidyr ----------------
# Hadley Wickham's paper on tidy data:
browseURL("https://www.jstatsoft.org/article/view/v059i10")
# Wickham is proponent of "one column, one variable" paradigm
# corresponds to "long" format for panel data sets
# fit nicely with dplyr, ggplot and similar packages that operate on data frames
# tidyr essentially provides yet another set of functions to deal with reshaping data frames to fit these rules:
# each variable must have its own column
# each observation must have its own row
# each value must have its own cell
# related to reshape2 and reshape, although tidyr is not designed for general reshaping
# gather() ~ melt()
# --> takes multiple columns and gathers them into key-value pairs
# --> wide to long
# spread() ~ cast()
# --> takes two columns (key and value) and spreads in to multiple columns
# --> long to wide
# additional functions:
# separate()
# --> pull apart a column that represents multiple variables
# unite()
# complement to separate()
# example: gather()
# demo("so-17481212")
race <- read.table(header = TRUE, check.names = FALSE, text = "
Name 50 100 150 200 250 300 350
Carla 1.2 1.8 2.2 2.3 3.0 2.5 1.8
Mace 1.5 1.1 1.9 2.0 3.6 3.0 2.5
Lea 1.7 1.6 2.3 2.7 2.6 2.2 2.6
Karen 1.3 1.7 1.9 2.2 3.2 1.5 1.9
")
race
race_long <- gather(race, key = Time, value = Score, -Name, convert = TRUE)
race_long
race_long %>% arrange(Name, Time)
# example: spread()
# demo("so-16032858")
results <- data.frame(
Ind = paste0("Ind", 1:10),
Treatment = rep(c("Treat", "Cont"), each = 10),
value = 1:20
)
results
spread(results, key = Treatment, value = value)
# example: separate()
df <- data.frame(x = c(NA, "a.b", "a.d", "b.c"))
df %>% separate(x, c("A", "B"))
df %>% separate(x, c("A", "B"), sep = -2)
df %>% separate(x, c("A", "B"), sep = "\\.")
# example 2: separate() - every row doesn't split into the same number of pieces
df <- data.frame(x = c("a", "a b", "a b c", NA))
df %>% separate(x, c("a", "b"))
# the same behaviour but no warnings, fill with missing values on specified side
df %>% separate(x, c("a", "b"), extra = "drop", fill = "right")
df %>% separate(x, c("a", "b"), extra = "drop", fill = "left")
# do not drop extra pieces, only splits at most length(into) times
df %>% separate(x, c("a", "b"), extra = "merge", fill = "right")
df %>% separate(x, c("a", "b", "c"), extra = "merge", fill = "right")
# example: separate_rows()
df <- data.frame(
x = 1:3,
y = c("a", "d,e,f", "g,h"),
z = c("1", "2,3,4", "5,6"),
stringsAsFactors = FALSE
)
df
separate_rows(df, y, z, convert = TRUE)
# example: unite()
df <- data.frame(
country = rep(c("Afghan", "Brazil", "China"), each = 2),
century = rep(c("19", "20"), 3),
year = rep(c("99", "00"), 3),
stringsAsFactors = FALSE
)
df
unite(df, century, year, col = "year", sep = "")
# example: gather() + separate() + spread()
# demo("dadmom")
dadmom <- read_dta("http://www.ats.ucla.edu/stat/stata/modules/dadmomw.dta")
dadmom # 3+1 variables in 5 columns
dadmom %>% gather(key, value, named:incm)
dadmom %>% gather(key, value, named:incm) %>%
separate(key, c("variable", "type"), -2)
dadmom %>% gather(key, value, named:incm) %>%
separate(key, c("variable", "type"), -2) %>%
spread(variable, value, convert = TRUE)
## handling missing values
df <- data.frame(y = LETTERS[1:6], x = c(1, NA, NA, 3, NA, 4))
df
drop_na(df)
fill(df, x, .direction = "up")
replace_na(df, list(x = 2))
fill(df)
## tidying model output with broom ----------
# overview at
browseURL("ftp://cran.r-project.org/pub/R/web/packages/broom/vignettes/broom.html")
browseURL("ftp://cran.r-project.org/pub/R/web/packages/broom/vignettes/broom_and_dplyr.html")
## motivation
# model inputs usually have to be tidy
# model outputs less so...
# this makes dealing with model results (e.g., visualizing coefficients, comparing results across models, etc.) sometimes difficult
## example: linear model output
model_out <- lm(mpg ~ wt, mtcars) # linear relationship between miles/gallon and weight (in 1000 lbs)
model_out
summary(model_out)
# examine model object
str(model_out)
coef(summary(model_out)) # matrix of coefficients with variable terms in row names
broom::tidy(model_out)
?tidy.lm
# add fitted values and residuals to original data
broom::augment(model_out) %>% head
?augment.lm
# inspect summary statistics
broom::glance(model_out)
?glance.lm
# many supported models; see
?tidy # ... and click on "index"
# the true power of broom unfolds in settings where you want to combine results from multiple analyses (using subgroups of data, different models, bootstrap replicates of the original data frame, permutations, imputations, ...)
data(Orange)
Orange
# inspect relationship between age and circumference
cor(Orange$age, Orange$circumference)
ggplot(Orange, aes(age, circumference, color = Tree)) + geom_line()
# using broom and dplyr together works like a charm
Orange %>% group_by(Tree) %>% summarize(correlation = cor(age, circumference))
cor.test(Orange$age, Orange$circumference)
Orange %>% group_by(Tree) %>% do(tidy(cor.test(.$age, .$circumference)))
# also works for regressions
Orange %>% group_by(Tree) %>% do(tidy(lm(age ~ circumference, data=.)))
# if you want not just the tidy output, but the augment and glance outputs as well, while still performing each regression only once, you do:
regressions <- mtcars %>% group_by(cyl) %>%
do(fit = lm(wt ~ mpg + qsec + gear, .))
regressions
regressions %>% tidy(fit)
regressions %>% augment(fit)
regressions %>% glance(fit)
# other examples online
browseURL("ftp://cran.r-project.org/pub/R/web/packages/broom/vignettes/kmeans.html") # k-means clustering
browseURL("ftp://cran.r-project.org/pub/R/web/packages/broom/vignettes/bootstrapping.html") # bootstrapping