-
Notifications
You must be signed in to change notification settings - Fork 43
/
Copy pathex08_nesting-is-good.R
130 lines (111 loc) · 3.63 KB
/
ex08_nesting-is-good.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
#' ---
#' title: "Why nesting is worth the awkwardness"
#' author: "Jenny Bryan"
#' date: "`r format(Sys.Date())`"
#' output: github_document
#' ---
#+ setup, include = FALSE, cache = FALSE
knitr::opts_chunk$set(
collapse = TRUE,
comment = "#>",
error = TRUE
)
options(tidyverse.quiet = TRUE)
#+ body
# ----
library(gapminder)
library(tidyverse)
# ----
#' gapminder data for Asia only
gap <- gapminder %>%
filter(continent == "Asia") %>%
mutate(yr1952 = year - 1952)
#+ alpha-order
ggplot(gap, aes(x = lifeExp, y = country)) +
geom_point()
#' Countries are in alphabetical order.
#'
#' Set factor levels with intent. Example: order based on life expectancy in
#' 2007, the last year in this dataset. Imagine you want this to persist across
#' an entire analysis.
gap <- gap %>%
mutate(country = fct_reorder2(country, .x = year, .y = lifeExp))
#+ principled-order
ggplot(gap, aes(x = lifeExp, y = country)) +
geom_point()
#' Much better!
#'
#' Now imagine we want to fit a model to each country and look at dot plots of
#' slope and intercept.
#'
#' `dplyr::group_by()` + `tidyr::nest()` created a *nested data frame* and is an
#' alternative to splitting into country-specific data frames. Those data frames
#' end up, instead, in a list-column. The `country` variable remains as a normal
#' factor.
gap_nested <- gap %>%
group_by(country) %>%
nest()
gap_nested
gap_nested$data[[1]]
gap_fitted <- gap_nested %>%
mutate(fit = map(data, ~ lm(lifeExp ~ yr1952, data = .x)))
gap_fitted
gap_fitted$fit[[1]]
gap_fitted <- gap_fitted %>%
mutate(
intercept = map_dbl(fit, ~ coef(.x)[["(Intercept)"]]),
slope = map_dbl(fit, ~ coef(.x)[["yr1952"]])
)
gap_fitted
#+ principled-order-coef-ests
ggplot(gap_fitted, aes(x = intercept, y = country)) +
geom_point()
ggplot(gap_fitted, aes(x = slope, y = country)) +
geom_point()
#' The `split()` + `lapply()` + `do.call(rbind, ...)` approach.
#'
#' Split gap into many data frames, one per country.
gap_split <- split(gap, gap$country)
#' Fit a model to each country.
gap_split_fits <- lapply(
gap_split,
function(df) {
lm(lifeExp ~ yr1952, data = df)
}
)
#' Oops ... the unused levels of country are a problem (empty data frames in our
#' list).
#'
#' Drop unused levels in country and split.
gap_split <- split(droplevels(gap), droplevels(gap)$country)
head(gap_split, 2)
#' Fit model to each country and get `coefs()`.
gap_split_coefs <- lapply(
gap_split,
function(df) {
coef(lm(lifeExp ~ yr1952, data = df))
}
)
head(gap_split_coefs, 2)
#' Now we need to put everything back togethers. Row bind the list of coefs.
#' Coerce from matrix back to data frame.
gap_split_coefs <- as.data.frame(do.call(rbind, gap_split_coefs))
#' Restore `country` variable from row names.
gap_split_coefs$country <- rownames(gap_split_coefs)
str(gap_split_coefs)
#+ revert-to-alphabetical
ggplot(gap_split_coefs, aes(x = `(Intercept)`, y = country)) +
geom_point()
#' Uh-oh, we lost the order of the `country` factor, due to coercion from factor
#' to character (list and then row names).
#'
#' The `nest()` approach allows you to keep data as data vs. in attributes, such
#' as list or row names. Preserves factors and their levels or integer
#' variables. Designs away various opportunities for different pieces of the
#' dataset to get "out of sync" with each other, by leaving them in a data frame
#' at all times.
#'
#' First in an interesting series of blog posts exploring these patterns and
#' asking whether the tidyverse still needs a way to include the nesting
#' variable in the nested data:
#' <https://coolbutuseless.bitbucket.io/2018/03/03/split-apply-combine-my-search-for-a-replacement-for-group_by---do/>