Implement nest(.by = ) and revive .key (#1461)

* Implement `nest(.by = )` * NEWS bullet * Update revdep checks * Tweak handling of `.key = deprecated()` * Update revdep checks * Revise the examples one more time
tidyverse · Jan 13, 2023 · c5b189e · c5b189e
1 parent 5c3460b
commit c5b189e
Show file tree

Hide file tree

Showing 10 changed files with 720 additions and 198 deletions.
diff --git a/NEWS.md b/NEWS.md
@@ -1,5 +1,10 @@
 # tidyr (development version)
 
+* `nest()` has gained a new argument, `.by`, which allows you to specify the
+  columns to nest by (rather than the columns to nest, i.e. through `...`).
+  Additionally, the `.key` argument is no longer deprecated, and is used
+  whenever `...` isn't specified (#1458).
+
 * All built in datasets are now standard tibbles (#1459).
 
 * `unnest()`, `unchop()`, `unnest_longer()`, and `unnest_wider()` better handle

diff --git a/R/nest.R b/R/nest.R
@@ -9,10 +9,14 @@
 #'
 #' Learn more in `vignette("nest")`.
 #'
+#' @details
+#' If neither `...` nor `.by` are supplied, `nest()` will nest all variables,
+#' and will use the column name supplied through `.key`.
+#'
 #' @section New syntax:
 #' tidyr 1.0.0 introduced a new syntax for `nest()` and `unnest()` that's
 #' designed to be more similar to other functions. Converting to the new syntax
-#' should be straightforward (guided by the message you'll recieve) but if
+#' should be straightforward (guided by the message you'll receive) but if
 #' you just need to run an old analysis, you can easily revert to the previous
 #' behaviour using [nest_legacy()] and [unnest_legacy()] as follows:
 #'
@@ -24,46 +28,77 @@
 #'
 #' @section Grouped data frames:
 #' `df %>% nest(data = c(x, y))` specifies the columns to be nested; i.e. the
-#' columns that will appear in the inner data frame. Alternatively, you can
-#' `nest()` a grouped data frame created by [dplyr::group_by()]. The grouping
-#' variables remain in the outer data frame and the others are nested. The
-#' result preserves the grouping of the input.
+#' columns that will appear in the inner data frame. `df %>% nest(.by = c(x,
+#' y))` specifies the columns to nest _by_; i.e. the columns that will remain in
+#' the outer data frame. An alternative way to achieve the latter is to `nest()`
+#' a grouped data frame created by [dplyr::group_by()]. The grouping variables
+#' remain in the outer data frame and the others are nested. The result
+#' preserves the grouping of the input.
 #'
 #' Variables supplied to `nest()` will override grouping variables so that
 #' `df %>% group_by(x, y) %>% nest(data = !z)` will be equivalent to
 #' `df %>% nest(data = !z)`.
 #'
+#' You can't supply `.by` with a grouped data frame, as the groups already
+#' represent what you are nesting by.
+#'
 #' @param .data A data frame.
-#' @param ... <[`tidy-select`][tidyr_tidy_select]> Columns to nest, specified
-#'   using name-variable pairs of the form `new_col = c(col1, col2, col3)`.
-#'   The right hand side can be any valid tidyselect expression.
+#' @param ... <[`tidy-select`][tidyr_tidy_select]> Columns to nest; these will
+#'   appear in the inner data frames.
+#'
+#'   Specified using name-variable pairs of the form
+#'   `new_col = c(col1, col2, col3)`. The right hand side can be any valid
+#'   tidyselect expression.
+#'
+#'   If not supplied, then `...` is derived as all columns _not_ selected by
+#'   `.by`, and will use the column name from `.key`.
 #'
 #'   `r lifecycle::badge("deprecated")`:
 #'   previously you could write `df %>% nest(x, y, z)`.
 #'   Convert to `df %>% nest(data = c(x, y, z))`.
+#' @param .by <[`tidy-select`][tidyr_tidy_select]> Columns to nest _by_; these
+#'   will remain in the outer data frame.
+#'
+#'   `.by` can be used in place of or in conjunction with columns supplied
+#'   through `...`.
+#'
+#'   If not supplied, then `.by` is derived as all columns _not_ selected by
+#'   `...`.
+#' @param .key The name of the resulting nested column. Only applicable when
+#'   `...` isn't specified, i.e. in the case of `df %>% nest(.by = x)`.
+#'
+#'   If `NULL`, then `"data"` will be used by default.
 #' @param .names_sep If `NULL`, the default, the inner names will come from
 #'   the former outer names. If a string, the  new inner names will use the
 #'   outer names with `names_sep` automatically stripped. This makes
 #'   `names_sep` roughly symmetric between nesting and unnesting.
-#' @param .key
-#'   `r lifecycle::badge("deprecated")`:
-#'   No longer needed because of the new `new_col = c(col1, col2,
-#'   col3)` syntax.
 #' @export
 #' @examples
 #' df <- tibble(x = c(1, 1, 1, 2, 2, 3), y = 1:6, z = 6:1)
+#'
+#' # Specify variables to nest using name-variable pairs.
 #' # Note that we get one row of output for each unique combination of
-#' # non-nested variables
+#' # non-nested variables.
 #' df %>% nest(data = c(y, z))
-#' # chop does something similar, but retains individual columns
-#' df %>% chop(c(y, z))
 #'
-#' # use tidyselect syntax and helpers, just like in dplyr::select()
+#' # Specify variables to nest by (rather than variables to nest) using `.by`
+#' df %>% nest(.by = x)
+#'
+#' # In this case, since `...` isn't used you can specify the resulting column
+#' # name with `.key`
+#' df %>% nest(.by = x, .key = "cols")
+#'
+#' # Use tidyselect syntax and helpers, just like in `dplyr::select()`
 #' df %>% nest(data = any_of(c("y", "z")))
 #'
-#' iris %>% nest(data = !Species)
-#' nest_vars <- names(iris)[1:4]
-#' iris %>% nest(data = any_of(nest_vars))
+#' # `...` and `.by` can be used together to drop columns you no longer need,
+#' # or to include the columns you are nesting by in the inner data frame too.
+#' # This drops `z`:
+#' df %>% nest(data = y, .by = x)
+#' # This includes `x` in the inner data frame:
+#' df %>% nest(data = everything(), .by = x)
+#'
+#' # Multiple nesting structures can be specified at once
 #' iris %>%
 #'   nest(petal = starts_with("Petal"), sepal = starts_with("Sepal"))
 #' iris %>%
@@ -74,20 +109,27 @@
 #'   dplyr::group_by(fish) %>%
 #'   nest()
 #'
+#' # That is similar to `nest(.by = )`, except here the result isn't grouped
+#' fish_encounters %>%
+#'   nest(.by = fish)
+#'
 #' # Nesting is often useful for creating per group models
 #' mtcars %>%
-#'   dplyr::group_by(cyl) %>%
-#'   nest() %>%
+#'   nest(.by = cyl) %>%
 #'   dplyr::mutate(models = lapply(data, function(df) lm(mpg ~ wt, data = df)))
-nest <- function(.data, ..., .names_sep = NULL, .key = deprecated()) {
+nest <- function(.data,
+                 ...,
+                 .by = NULL,
+                 .key = NULL,
+                 .names_sep = NULL) {
   cols <- enquos(...)
-
   empty <- names2(cols) == ""
+
   if (any(empty)) {
     cols_good <- cols[!empty]
     cols_bad <- cols[empty]
 
-    .key <- if (missing(.key)) "data" else as.character(ensym(.key))
+    .key <- check_key(.key)
 
     if (length(cols_bad) == 1L) {
       cols_bad <- cols_bad[[1]]
@@ -106,39 +148,52 @@ nest <- function(.data, ..., .names_sep = NULL, .key = deprecated()) {
       i = "Did you want `{(.key)} = {cols_fixed_label}`?"
     ))
 
-    return(nest(.data, !!!cols))
+    return(nest(.data, !!!cols, .by = {{ .by }}))
   }
 
   UseMethod("nest")
 }
 
 #' @export
-nest.data.frame <- function(.data, ..., .names_sep = NULL, .key = deprecated()) {
+nest.data.frame <- function(.data,
+                            ...,
+                            .by = NULL,
+                            .key = NULL,
+                            .names_sep = NULL) {
   # The data frame print handles nested data frames poorly, so we want to
   # convert data frames (but not subclasses) to tibbles
   if (identical(class(.data), "data.frame")) {
     .data <- as_tibble(.data)
   }
 
-  nest.tbl_df(.data, ..., .names_sep = .names_sep, .key = .key)
+  nest.tbl_df(
+    .data,
+    ...,
+    .by = {{ .by }},
+    .key = .key,
+    .names_sep = .names_sep
+  )
 }
 
 #' @export
-nest.tbl_df <- function(.data, ..., .names_sep = NULL, .key = deprecated()) {
-  .key <- check_key(.key)
-  if (missing(...)) {
-    cli::cli_warn(c(
-      "`...` can't be empty for ungrouped data frames.",
-      i = "Did you want `{(.key)} = everything()`?"
-    ))
-    cols <- quos(!!.key := everything())
-  } else {
-    cols <- enquos(...)
-  }
-
+nest.tbl_df <- function(.data,
+                        ...,
+                        .by = NULL,
+                        .key = NULL,
+                        .names_sep = NULL) {
   error_call <- current_env()
 
-  out <- pack(.data, !!!cols, .names_sep = .names_sep, .error_call = error_call)
+  info <- nest_info(.data, ..., .by = {{ .by }}, .key = .key)
+  cols <- info$cols
+  inner <- info$inner
+  outer <- info$outer
+
+  inner <- .data[inner]
+  inner <- pack(inner, !!!cols, .names_sep = .names_sep, .error_call = error_call)
+
+  out <- .data[outer]
+  out <- vec_cbind(out, inner, .name_repair = "check_unique", .error_call = error_call)
+  out <- reconstruct_tibble(.data, out)
   out <- chop(out, cols = all_of(names(cols)), error_call = error_call)
 
   # `nest()` currently doesn't return list-of columns
@@ -150,21 +205,109 @@ nest.tbl_df <- function(.data, ..., .names_sep = NULL, .key = deprecated()) {
 }
 
 #' @export
-nest.grouped_df <- function(.data, ..., .names_sep = NULL, .key = deprecated()) {
+nest.grouped_df <- function(.data,
+                            ...,
+                            .by = NULL,
+                            .key = NULL,
+                            .names_sep = NULL) {
+  by <- enquo(.by)
+  if (!quo_is_null(by)) {
+    cli::cli_abort("Can't supply {.arg .by} when {.arg .data} is a grouped data frame.")
+  }
+
   if (missing(...)) {
     .key <- check_key(.key)
     cols <- setdiff(names(.data), dplyr::group_vars(.data))
     nest.tbl_df(.data, !!.key := all_of(cols), .names_sep = .names_sep)
   } else {
-    NextMethod()
+    nest.tbl_df(.data, ..., .key = .key, .names_sep = .names_sep)
   }
 }
 
-check_key <- function(.key) {
-  if (!is_missing(.key)) {
-    warn("`.key` is deprecated")
-    .key
-  } else {
+nest_info <- function(.data,
+                      ...,
+                      .by = NULL,
+                      .key = NULL,
+                      .error_call = caller_env()) {
+  by <- enquo(.by)
+
+  cols <- enquos(...)
+  n_cols <- length(cols)
+
+  key <- check_key(.key, error_call = .error_call)
+
+  if (n_cols != 0L && !is_default_key(.key)) {
+    warn_unused_key(error_call = .error_call)
+  }
+
+  cols <- lapply(cols, function(col) {
+    names(tidyselect::eval_select(
+      expr = col,
+      data = .data,
+      allow_rename = FALSE,
+      error_call = .error_call
+    ))
+  })
+
+  names <- names(.data)
+
+  outer <- names(tidyselect::eval_select(
+    expr = by,
+    data = .data,
+    allow_rename = FALSE,
+    error_call = .error_call
+  ))
+
+  inner <- list_unchop(cols, ptype = character(), name_spec = zap())
+  inner <- vec_unique(inner)
+
+  if (n_cols == 0L) {
+    # Derive `inner` names from `.by`
+    inner <- setdiff(names, outer)
+    cols <- list2(!!key := inner)
+  }
+
+  if (quo_is_null(by)) {
+    # Derive `outer` names from `...`
+    outer <- setdiff(names, inner)
+  }
+
+  # Regenerate quosures for `pack()`
+  cols <- map(cols, function(col) {
+    quo(all_of(!!col))
+  })
+  cols <- new_quosures(cols)
+
+  list(
+    cols = cols,
+    inner = inner,
+    outer = outer
+  )
+}
+
+warn_unused_key <- function(error_call = caller_env()) {
+  message <- c(
+    "Can't supply both {.arg .key} and {.arg ...}.",
+    i = "{.arg .key} will be ignored."
+  )
+  cli::cli_warn(message, call = error_call)
+}
+
+check_key <- function(key, error_call = caller_env()) {
+  if (is_default_key(key)) {
     "data"
+  } else {
+    check_string(key, allow_empty = FALSE, arg = ".key", call = error_call)
+    key
+  }
+}
+
+is_default_key <- function(key) {
+  if (identical(maybe_missing(key), deprecated())) {
+    # Temporary support for S3 method authors that set `.key = deprecated()`.
+    # Remove this entire helper all methods have been updated.
+    key <- NULL
   }
+
+  is.null(key)
 }