diff --git a/DESCRIPTION b/DESCRIPTION index dec4866cc..d5fba498e 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -21,6 +21,7 @@ BugReports: https://github.com/tidyverse/tidyr/issues Depends: R (>= 3.1) Imports: + cli (>= 3.3.0), dplyr (>= 1.0.0), glue, lifecycle, diff --git a/NAMESPACE b/NAMESPACE index 86adee5e0..e459be0b6 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -101,10 +101,13 @@ export(unnest_) export(unnest_auto) export(unnest_legacy) export(unnest_longer) +export(unnest_tree) export(unnest_wider) export(unpack) import(rlang) import(vctrs) +importFrom(cli,cli_abort) +importFrom(cli,qty) importFrom(dplyr,tbl_vars) importFrom(glue,glue) importFrom(lifecycle,deprecated) diff --git a/NEWS.md b/NEWS.md index 7b4ed45db..2178856b9 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,5 +1,7 @@ # tidyr (development version) +* New `unnest_tree()` to unnest a recursive data frame (@mgirlich, #1384). + * `pivot_longer()` now throws a slightly better error message when `values_ptypes` or `names_ptypes` is provided and the coercion can't be made (#1364). diff --git a/R/tidyr.R b/R/tidyr.R index 7eebc1631..aaa9e0a72 100644 --- a/R/tidyr.R +++ b/R/tidyr.R @@ -58,3 +58,6 @@ tidyselect::one_of #' @importFrom tidyselect starts_with #' @export tidyselect::starts_with + +#' @importFrom cli cli_abort qty +NULL diff --git a/R/unnest-tree.R b/R/unnest-tree.R new file mode 100644 index 000000000..c93df7754 --- /dev/null +++ b/R/unnest-tree.R @@ -0,0 +1,251 @@ +#' Unnest a recursive data frame +#' +#' @param data A data frame. +#' @param id_col <[`tidy-select`][tidyr_tidy_select]> A column that uniquely +#' identifies each observation. +#' @param child_col <[`tidy-select`][tidyr_tidy_select]> Column containing the +#' children of an observation. This must be a list where each element is either +#' `NULL` or a data frame with the same columns as `data`. +#' @param level_to A string (`"level"` by default) specifying the new column to +#' store the level of an observation. Use `NULL` if you don't need this +#' information. +#' @param parent_to A string (`"parent"` by default) specifying the new column +#' storing the parent id of an observation. Use `NULL` if you don't need this +#' information. +#' @param ancestors_to A string (`NULL` by default) specifying the new column +#' storing the ids of its ancestors. Use `NULL` if you don't need this +#' information. +#' +#' @return A data frame. +#' @export +#' +#' @examples +#' df <- tibble( +#' id = 1L, +#' name = "a", +#' children = list( +#' tibble( +#' id = 11:12, +#' name = c("b", "c"), +#' children = list( +#' NULL, +#' tibble( +#' id = 121:122, +#' name = c("d", "e") +#' ) +#' ) +#' ) +#' ) +#' ) +#' +#' unnest_tree( +#' df, +#' id_col = "id", +#' child_col = "children", +#' level_to = "level", +#' parent_to = "parent", +#' ancestors_to = "ancestors" +#' ) +unnest_tree <- function(data, + id_col, + child_col, + level_to = "level", + parent_to = "parent", + ancestors_to = NULL) { + if (!is.data.frame(data)) { + cli_abort("{.arg data} must be a data frame.") + } + + id_col <- names(eval_pull(data, enquo(id_col), "id_col")) + child_col <- names(eval_pull(data, enquo(child_col), "child_col")) + check_arg_different(child_col, id_col) + + if (!is_null(level_to)) { + level_to <- vctrs::vec_cast(level_to, character()) + vctrs::vec_assert(level_to, size = 1L) + } + parent_to <- check_unnest_parent_to(parent_to, data, level_to) + ancestors_to <- check_unnest_ancestors_to(ancestors_to, data, level_to, parent_to) + + call <- current_env() + + level_sizes <- list() + level_parent_ids <- list() + level_ancestors <- list() + level_data <- list() + out_ptype <- vctrs::vec_ptype(dplyr::select(data, -any_of(child_col))) + + level <- 1L + parent_ids <- vctrs::vec_init(data[[id_col]]) + ns <- vctrs::vec_size(data) + cur_ancestors <- vctrs::vec_rep_each(list(NULL), ns) + + while (!is.null(data)) { + children <- data[[child_col]] %||% list() + # TODO this could mention the path? + # -> this would require tracking the current ancestors. Worth it? + vctrs::vec_check_list(children, arg = child_col) + + data <- dplyr::select(data, -any_of(child_col)) + # keep track of the out ptype to error earlier and better error messages (in the future...) + out_ptype <- vctrs::vec_ptype2(out_ptype, data) + level_data[[level]] <- data + # we could also directly repeat the parent ids but it is a bit more efficient + # to store the parent ids and level sizes in a list and expand + repeat them + # in the end + if (!is_null(parent_to)) { + level_sizes[[level]] <- ns + level_parent_ids[[level]] <- parent_ids + } + + if (!is_null(ancestors_to)) { + if (level > 1L) { + ancestors_simple <- purrr::map2(cur_ancestors, vctrs::vec_chop(parent_ids), c) + cur_ancestors <- vctrs::vec_rep_each(ancestors_simple, ns) + } + level_ancestors[[level]] <- cur_ancestors + } + + ns <- vctrs::list_sizes(children) + if (all(ns == 0)) { + break + } + + parent_ids <- data[[id_col]] + # unclass `list_of` to avoid performance hit + children <- purrr::map(children, ~ unclass_list_of(.x, child_col, call = call)) + data <- vctrs::vec_unchop(children) + + level <- level + 1L + } + + out <- vctrs::vec_rbind(!!!level_data, .ptype = out_ptype) + + if (!is_null(level_to)) { + times <- list_sizes(level_data) + levels <- vctrs::vec_seq_along(level_data) + out[[level_to]] <- vctrs::vec_rep_each(levels, times) + } + + if (!is_null(parent_to)) { + parent_ids <- vctrs::vec_c(!!!level_parent_ids, .ptype = out[[id_col]]) + times <- vctrs::vec_c(!!!level_sizes, .ptype = integer()) + out[[parent_to]] <- vctrs::vec_rep_each(parent_ids, times) + } + + if (!is_null(ancestors_to)) { + out[[ancestors_to]] <- vctrs::vec_unchop(level_ancestors) + } + + check_id(out[[id_col]], id_col) + out +} + +unclass_list_of <- function(x, child_col, call = caller_env()) { + if (is_null(x)) { + return(NULL) + } + + if (!inherits(x, "data.frame")) { + # TODO mention path + # TODO mention other type -> need rlang compat file + msg <- "Each child must be NULL or a data frame." + cli_abort(msg, call = call) + } + + # unclass to avoid slow `[[.tbl_df` and `[[<-.tbl_df` + x <- unclass(x) + child_children <- x[[child_col]] + if (inherits(child_children, "vctrs_list_of")) { + x[[child_col]] <- unclass(child_children) + } + + vctrs::new_data_frame(x) +} + +eval_pull <- function(data, col, col_arg) { + # TODO use `eval_pull()` once implemented + # https://github.com/r-lib/tidyselect/issues/189 + col <- tidyselect::eval_select(col, data, allow_rename = FALSE) + if (length(col) != 1L) { + cli_abort("{.arg {col_arg}} must select 1 column, not {length(col)}.") + } + + nm <- colnames(data)[[col]] + set_names(col, nm) +} + +check_unnest_parent_to <- function(parent_to, data, level_to, call = caller_env()) { + if (!is_null(parent_to)) { + parent_to <- vctrs::vec_cast(parent_to, character(), call = call) + vctrs::vec_assert(parent_to, size = 1L, call = call) + check_arg_different(parent_to, level_to, call = call) + check_col_new(data, parent_to, call = call) + } + + parent_to +} + +check_unnest_ancestors_to <- function(ancestors_to, + data, + level_to, + parent_to, + call = caller_env()) { + if (!is_null(ancestors_to)) { + ancestors_to <- vctrs::vec_cast(ancestors_to, character(), call = call) + vctrs::vec_assert(ancestors_to, size = 1L, call = call) + check_arg_different(ancestors_to, level_to, parent_to, call = call) + check_col_new(data, ancestors_to, call = call) + } + + ancestors_to +} + +check_col_new <- function(data, + col, + col_arg = caller_arg(col), + data_arg = "data", + call = caller_env()) { + if (col %in% colnames(data)) { + msg <- "{.arg {col_arg}} must not be a column in {.arg {data_arg}}." + cli_abort(msg, call = call) + } +} + +check_arg_different <- function(arg, + ..., + arg_name = caller_arg(arg), + call = caller_env()) { + other_args <- dots_list(..., .named = TRUE) + + for (i in seq_along(other_args)) { + if (identical(arg, other_args[[i]])) { + other_arg_nm <- names(other_args)[[i]] + msg <- "{.arg {arg_name}} must be different from {.arg {other_arg_nm}}." + cli_abort(msg, call = call) + } + } +} + +check_id <- function(x, x_arg, call = caller_env()) { + incomplete <- !vctrs::vec_detect_complete(x) + if (any(incomplete)) { + incomplete_loc <- which(incomplete) + n <- length(incomplete_loc) + msg <- c( + "Each value of column {.field {x_arg}} must be non-missing.", + i = "{qty(n)}Element{?s} {incomplete_loc} {qty(n)}{?is/are} missing." + ) + cli_abort(msg, call = call) + } + + if (vctrs::vec_duplicate_any(x)) { + duplicated_flag <- vctrs::vec_duplicate_detect(x) + duplicated_loc <- which(duplicated_flag) + msg <- c( + "Each value of column {.field {x_arg}} must be unique.", + i = "The elements at locations {duplicated_loc} are duplicated." + ) + cli_abort(msg, call = call) + } +} diff --git a/man/unnest_tree.Rd b/man/unnest_tree.Rd new file mode 100644 index 000000000..a957151d7 --- /dev/null +++ b/man/unnest_tree.Rd @@ -0,0 +1,71 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/unnest-tree.R +\name{unnest_tree} +\alias{unnest_tree} +\title{Unnest a recursive data frame} +\usage{ +unnest_tree( + data, + id_col, + child_col, + level_to = "level", + parent_to = "parent", + ancestors_to = NULL +) +} +\arguments{ +\item{data}{A data frame.} + +\item{id_col}{<\code{\link[=tidyr_tidy_select]{tidy-select}}> A column that uniquely +identifies each observation.} + +\item{child_col}{<\code{\link[=tidyr_tidy_select]{tidy-select}}> Column containing the +children of an observation. This must be a list where each element is either +\code{NULL} or a data frame with the same columns as \code{data}.} + +\item{level_to}{A string (\code{"level"} by default) specifying the new column to +store the level of an observation. Use \code{NULL} if you don't need this +information.} + +\item{parent_to}{A string (\code{"parent"} by default) specifying the new column +storing the parent id of an observation. Use \code{NULL} if you don't need this +information.} + +\item{ancestors_to}{A string (\code{NULL} by default) specifying the new column +storing the ids of its ancestors. Use \code{NULL} if you don't need this +information.} +} +\value{ +A data frame. +} +\description{ +Unnest a recursive data frame +} +\examples{ +df <- tibble( + id = 1L, + name = "a", + children = list( + tibble( + id = 11:12, + name = c("b", "c"), + children = list( + NULL, + tibble( + id = 121:122, + name = c("d", "e") + ) + ) + ) + ) +) + +unnest_tree( + df, + id_col = "id", + child_col = "children", + level_to = "level", + parent_to = "parent", + ancestors_to = "ancestors" +) +} diff --git a/tests/testthat/_snaps/unnest-tree.md b/tests/testthat/_snaps/unnest-tree.md new file mode 100644 index 000000000..7f6acfc1e --- /dev/null +++ b/tests/testthat/_snaps/unnest-tree.md @@ -0,0 +1,147 @@ +# checks arguments + + Code + (expect_error(unnest_tree(1L))) + Output + + Error in `unnest_tree()`: + ! `data` must be a data frame. + Code + (expect_error(unnest_tree(df, id_col = "not-there"))) + Output + + Error in `chr_as_locations()`: + ! Can't subset columns that don't exist. + x Column `not-there` doesn't exist. + Code + (expect_error(unnest_tree(df, id_col = 1:2))) + Output + + Error in `eval_pull()`: + ! `id_col` must select 1 column, not 2. + Code + (expect_error(unnest_tree(df, id_col, children = "not-there"))) + Output + + Code + (expect_error(unnest_tree(df, id_col, children = 1:2))) + Output + + Code + (expect_error(unnest_tree(df, children, children))) + Output + + Error in `unnest_tree()`: + ! `child_col` must be different from `id_col`. + Code + (expect_error(unnest_tree(df, id, children, level_to = 1L))) + Output + + Error in `unnest_tree()`: + ! Can't convert `level_to` to . + Code + (expect_error(unnest_tree(df, id, children, level_to = c("a", "b")))) + Output + + Error in `unnest_tree()`: + ! `level_to` must have size 1, not size 2. + Code + (expect_error(unnest_tree(df, id, children, parent_to = "level"))) + Output + + Error in `unnest_tree()`: + ! `parent_to` must be different from `level_to`. + Code + (expect_error(unnest_tree(df, id, children, parent_to = 1L))) + Output + + Error in `unnest_tree()`: + ! Can't convert `parent_to` to . + Code + (expect_error(unnest_tree(df, id, children, parent_to = c("a", "b")))) + Output + + Error in `unnest_tree()`: + ! `parent_to` must have size 1, not size 2. + Code + (expect_error(unnest_tree(df, id, children, ancestors_to = "level"))) + Output + + Error in `unnest_tree()`: + ! `ancestors_to` must be different from `level_to`. + Code + (expect_error(unnest_tree(df, id, children, ancestors_to = "parent"))) + Output + + Error in `unnest_tree()`: + ! `ancestors_to` must be different from `parent_to`. + Code + (expect_error(unnest_tree(df, id, children, ancestors_to = 1L))) + Output + + Error in `unnest_tree()`: + ! Can't convert `ancestors_to` to . + Code + (expect_error(unnest_tree(df, id, children, ancestors_to = c("a", "b")))) + Output + + Error in `unnest_tree()`: + ! `ancestors_to` must have size 1, not size 2. + +--- + + Code + NULL + Output + NULL + +# child col type is checked + + Code + (expect_error(unnest_tree(df, id, children))) + Output + + Error in `unnest_tree()`: + ! `children` must be a list, not an integer. + Code + (expect_error(unnest_tree(df2, id, children))) + Output + + Error in `unnest_tree()`: + ! Each child must be NULL or a data frame. + +--- + + Code + (expect_error(unnest_tree(df, id, children))) + Output + + Error in `unnest_tree()`: + ! Each child must be NULL or a data frame. + +# can handle children of differen types + + Code + (expect_error(unnest_tree(df, id, children))) + Output + + Error in `unnest_tree()`: + ! Can't combine `out_ptype$id` and `data$id` . + +# checks ids + + Code + (expect_error(unnest_tree(df_duplicated, id, children))) + Output + + Error in `unnest_tree()`: + ! Each value of column id must be unique. + i The elements at locations 1 and 2 are duplicated. + Code + (expect_error(unnest_tree(df_na, id, children))) + Output + + Error in `unnest_tree()`: + ! Each value of column id must be non-missing. + i Element 2 is missing. + diff --git a/tests/testthat/test-unnest-tree.R b/tests/testthat/test-unnest-tree.R new file mode 100644 index 000000000..0edabbd40 --- /dev/null +++ b/tests/testthat/test-unnest-tree.R @@ -0,0 +1,246 @@ +test_that("checks arguments", { + df <- tibble::tibble( + id = 1, + x = "a", + children = list() + ) + + expect_snapshot({ + (expect_error(unnest_tree(1L))) + + (expect_error(unnest_tree(df, id_col = "not-there"))) + (expect_error(unnest_tree(df, id_col = 1:2))) + + (expect_error(unnest_tree(df, id_col, children = "not-there"))) + (expect_error(unnest_tree(df, id_col, children = 1:2))) + + (expect_error(unnest_tree(df, children, children))) + (expect_error(unnest_tree(df, id, children, level_to = 1L))) + (expect_error(unnest_tree(df, id, children, level_to = c("a", "b")))) + + (expect_error(unnest_tree(df, id, children, parent_to = "level"))) + (expect_error(unnest_tree(df, id, children, parent_to = 1L))) + (expect_error(unnest_tree(df, id, children, parent_to = c("a", "b")))) + + (expect_error(unnest_tree(df, id, children, ancestors_to = "level"))) + (expect_error(unnest_tree(df, id, children, ancestors_to = "parent"))) + (expect_error(unnest_tree(df, id, children, ancestors_to = 1L))) + (expect_error(unnest_tree(df, id, children, ancestors_to = c("a", "b")))) + }) + + expect_snapshot({ + + }) +}) + +test_that("child col type is checked", { + df <- tibble::tibble( + id = 1, + x = "a", + children = 1L + ) + + df2 <- df + df2$children <- list(1L) + + expect_snapshot({ + (expect_error(unnest_tree(df, id, children))) + (expect_error(unnest_tree(df2, id, children))) + }) + + df <- tibble::tibble( + id = 1, + x = "a", + children = list(1L) + ) + + expect_snapshot({ + (expect_error(unnest_tree(df, id, children))) + }) +}) + +test_that("can unnest", { + # simple case + df <- tibble::tibble( + id = 1L, + x = "a", + children = list( + tibble::tibble( + id = 2:3, + x = c("b", "c"), + children = list(NULL) + ) + ) + ) + + expect_equal( + unnest_tree(df, id, children, ancestors_to = "ancestors"), + tibble::tibble( + id = 1:3, + x = c("a", "b", "c"), + level = c(1L, 2L, 2L), + parent = c(NA, 1L, 1L), + ancestors = list(NULL, 1L, 1L) + ) + ) + + # some elements with children, others not + df <- tibble::tibble( + id = 1:2, + x = c("a", "b"), + children = list( + tibble::tibble( + id = 3L, + x = "c", + children = list(NULL) + ), + NULL + ) + ) + + expect_equal( + unnest_tree(df, id, children, ancestors_to = "ancestors"), + tibble::tibble( + id = 1:3, + x = c("a", "b", "c"), + level = c(1L, 1L, 2L), + parent = c(NA, NA, 1L), + ancestors = list(NULL, NULL, 1L) + ) + ) + + # deep nesting + df <- tibble::tibble( + id = 1:2, + x = c("a", "b"), + children = list( + tibble::tibble( + id = 3L, + x = "c", + children = list( + tibble::tibble( + id = 5L, + x = "e", + children = list( + tibble::tibble( + id = 6L, x = "f", children = list(NULL) + ) + ) + ) + ) + ), + tibble::tibble( + id = 4, + x = "d", + children = list(NULL) + ) + ) + ) + + expect_equal( + unnest_tree(df, id, children, ancestors_to = "ancestors"), + tibble::tibble( + id = 1:6, + x = letters[1:6], + level = c(1L, 1L, 2L, 2L, 3L, 4L), + parent = c(NA, NA, 1L, 2L, 3L, 5L), + ancestors = list( + NULL, + NULL, + 1L, + 2L, + c(1L, 3L), + c(1L, 3L, 5L) + ) + ) + ) +}) + +test_that("can handle children of differen types", { + df <- tibble::tibble( + id = 1:2, + x = c("a", "b"), + children = list( + tibble::tibble(id = 3L, x = "c", children = list(NULL)), + tibble::tibble(id = 4.0, x = "d", children = list(NULL)) + ) + ) + + result <- unnest_tree(df, id, children) + expect_identical( + result, + tibble( + id = c(1, 2, 3, 4), + x = c("a", "b", "c", "d"), + level = c(1L, 1L, 2L, 2L), + parent = c(NA, NA, 1, 2) + ) + ) + expect_type(result$id, "double") + + df <- tibble::tibble( + id = 1L, + x = "a", + children = list( + tibble::tibble(id = "a", x = "c", children = list(NULL)) + ) + ) + + # TODO produce a nicer error message here giving the path of the child? + expect_snapshot({ + (expect_error(unnest_tree(df, id, children))) + }) +}) + +test_that("can handle if all childre have no children column", { + df <- tibble::tibble( + id = 1, + x = "a", + children = list(NULL) + ) + expect_equal( + unnest_tree(df, id, children), + tibble(id = 1, x = "a", level = 1L, parent = NA_real_) + ) +}) + +test_that("can handle 0 row data", { + df <- tibble::tibble( + id = integer(), + x = character(), + children = list() + ) + + expect_equal( + unnest_tree(df, id, children, ancestors_to = "ancestors"), + tibble::tibble( + id = integer(), + x = character(), + level = integer(), + parent = integer(), + ancestors = list() + ) + ) +}) + +test_that("checks ids", { + df_duplicated <- tibble::tibble( + id = 1L, + x = "a", + children = list( + tibble::tibble( + id = 1L, + x = "b", + children = list(NULL) + ) + ) + ) + + df_na <- df_duplicated + df_na$children[[1]]$id <- NA + + expect_snapshot({ + (expect_error(unnest_tree(df_duplicated, id, children))) + (expect_error(unnest_tree(df_na, id, children))) + }) +})