Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add unnest_tree() #1386

Closed
wants to merge 8 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions DESCRIPTION
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@ BugReports: https://github.com/tidyverse/tidyr/issues
Depends:
R (>= 3.1)
Imports:
cli (>= 3.3.0),
dplyr (>= 1.0.0),
glue,
lifecycle,
Expand Down
3 changes: 3 additions & 0 deletions NAMESPACE
Original file line number Diff line number Diff line change
Expand Up @@ -101,10 +101,13 @@ export(unnest_)
export(unnest_auto)
export(unnest_legacy)
export(unnest_longer)
export(unnest_tree)
export(unnest_wider)
export(unpack)
import(rlang)
import(vctrs)
importFrom(cli,cli_abort)
importFrom(cli,qty)
importFrom(dplyr,tbl_vars)
importFrom(glue,glue)
importFrom(lifecycle,deprecated)
Expand Down
2 changes: 2 additions & 0 deletions NEWS.md
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
# tidyr (development version)

* New `unnest_tree()` to unnest a recursive data frame (@mgirlich, #1384).

* `pivot_longer()` now throws a slightly better error message when
`values_ptypes` or `names_ptypes` is provided and the coercion can't be made
(#1364).
Expand Down
3 changes: 3 additions & 0 deletions R/tidyr.R
Original file line number Diff line number Diff line change
Expand Up @@ -58,3 +58,6 @@ tidyselect::one_of
#' @importFrom tidyselect starts_with
#' @export
tidyselect::starts_with

#' @importFrom cli cli_abort qty
NULL
251 changes: 251 additions & 0 deletions R/unnest-tree.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,251 @@
#' Unnest a recursive data frame
#'
#' @param data A data frame.
#' @param id_col <[`tidy-select`][tidyr_tidy_select]> A column that uniquely
#' identifies each observation.
#' @param child_col <[`tidy-select`][tidyr_tidy_select]> Column containing the
#' children of an observation. This must be a list where each element is either
#' `NULL` or a data frame with the same columns as `data`.
#' @param level_to A string (`"level"` by default) specifying the new column to
#' store the level of an observation. Use `NULL` if you don't need this
#' information.
#' @param parent_to A string (`"parent"` by default) specifying the new column
#' storing the parent id of an observation. Use `NULL` if you don't need this
#' information.
#' @param ancestors_to A string (`NULL` by default) specifying the new column
#' storing the ids of its ancestors. Use `NULL` if you don't need this
#' information.
#'
#' @return A data frame.
#' @export
#'
#' @examples
#' df <- tibble(
#' id = 1L,
#' name = "a",
#' children = list(
#' tibble(
#' id = 11:12,
#' name = c("b", "c"),
#' children = list(
#' NULL,
#' tibble(
#' id = 121:122,
#' name = c("d", "e")
#' )
#' )
#' )
#' )
#' )
#'
#' unnest_tree(
#' df,
#' id_col = "id",
#' child_col = "children",
#' level_to = "level",
#' parent_to = "parent",
#' ancestors_to = "ancestors"
#' )
unnest_tree <- function(data,
id_col,
child_col,
level_to = "level",
parent_to = "parent",
ancestors_to = NULL) {
if (!is.data.frame(data)) {
cli_abort("{.arg data} must be a data frame.")
}

id_col <- names(eval_pull(data, enquo(id_col), "id_col"))
child_col <- names(eval_pull(data, enquo(child_col), "child_col"))
check_arg_different(child_col, id_col)

if (!is_null(level_to)) {
level_to <- vctrs::vec_cast(level_to, character())
vctrs::vec_assert(level_to, size = 1L)
}
parent_to <- check_unnest_parent_to(parent_to, data, level_to)
ancestors_to <- check_unnest_ancestors_to(ancestors_to, data, level_to, parent_to)

call <- current_env()

level_sizes <- list()
level_parent_ids <- list()
level_ancestors <- list()
level_data <- list()
out_ptype <- vctrs::vec_ptype(dplyr::select(data, -any_of(child_col)))

level <- 1L
parent_ids <- vctrs::vec_init(data[[id_col]])
ns <- vctrs::vec_size(data)
cur_ancestors <- vctrs::vec_rep_each(list(NULL), ns)

while (!is.null(data)) {
children <- data[[child_col]] %||% list()
# TODO this could mention the path?
# -> this would require tracking the current ancestors. Worth it?
vctrs::vec_check_list(children, arg = child_col)

data <- dplyr::select(data, -any_of(child_col))
# keep track of the out ptype to error earlier and better error messages (in the future...)
out_ptype <- vctrs::vec_ptype2(out_ptype, data)
level_data[[level]] <- data
# we could also directly repeat the parent ids but it is a bit more efficient
# to store the parent ids and level sizes in a list and expand + repeat them
# in the end
if (!is_null(parent_to)) {
level_sizes[[level]] <- ns
level_parent_ids[[level]] <- parent_ids
}

if (!is_null(ancestors_to)) {
if (level > 1L) {
ancestors_simple <- purrr::map2(cur_ancestors, vctrs::vec_chop(parent_ids), c)
cur_ancestors <- vctrs::vec_rep_each(ancestors_simple, ns)
}
level_ancestors[[level]] <- cur_ancestors
}

ns <- vctrs::list_sizes(children)
if (all(ns == 0)) {
break
}

parent_ids <- data[[id_col]]
# unclass `list_of` to avoid performance hit
children <- purrr::map(children, ~ unclass_list_of(.x, child_col, call = call))
data <- vctrs::vec_unchop(children)

level <- level + 1L
}

out <- vctrs::vec_rbind(!!!level_data, .ptype = out_ptype)

if (!is_null(level_to)) {
times <- list_sizes(level_data)
levels <- vctrs::vec_seq_along(level_data)
out[[level_to]] <- vctrs::vec_rep_each(levels, times)
}

if (!is_null(parent_to)) {
parent_ids <- vctrs::vec_c(!!!level_parent_ids, .ptype = out[[id_col]])
times <- vctrs::vec_c(!!!level_sizes, .ptype = integer())
out[[parent_to]] <- vctrs::vec_rep_each(parent_ids, times)
}

if (!is_null(ancestors_to)) {
out[[ancestors_to]] <- vctrs::vec_unchop(level_ancestors)
}

check_id(out[[id_col]], id_col)
out
}

unclass_list_of <- function(x, child_col, call = caller_env()) {
if (is_null(x)) {
return(NULL)
}

if (!inherits(x, "data.frame")) {
# TODO mention path
# TODO mention other type -> need rlang compat file
msg <- "Each child must be NULL or a data frame."
cli_abort(msg, call = call)
}

# unclass to avoid slow `[[.tbl_df` and `[[<-.tbl_df`
x <- unclass(x)
Copy link
Contributor Author

@mgirlich mgirlich Aug 25, 2022

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It is not quite as bad as the list_of performance hit but still a lot. Without unclassing this takes

devtools::load_all("~/GitHub/tidyr/")
#> ℹ Loading tidyr
test_tree <- readr::read_rds("~/GitHub/tidyr/test-tree.rds")
bench::mark(
  unnest_tree = unnest_tree(
    test_tree,
    id,
    children
  )
)
#> Warning: Some expressions had a GC in every iteration; so filtering is disabled.
#> # A tibble: 1 × 6
#>   expression       min   median `itr/sec` mem_alloc `gc/sec`
#>   <bch:expr>  <bch:tm> <bch:tm>     <dbl> <bch:byt>    <dbl>
#> 1 unnest_tree    425ms    428ms      2.34    3.89MB     7.01

Created on 2022-08-25 with reprex v2.0.2

child_children <- x[[child_col]]
if (inherits(child_children, "vctrs_list_of")) {
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Removing the vctrs_list_of class has huge impact on performance:

After removing:

devtools::load_all("~/GitHub/tidyr/")
#> ℹ Loading tidyr
test_tree <- readr::read_rds("~/GitHub/tidyr/test-tree.rds")
bench::mark(
  unnest_tree = unnest_tree(
    test_tree,
    id,
    children
  )
)
#> Warning: Some expressions had a GC in every iteration; so filtering is disabled.
#> # A tibble: 1 × 6
#>   expression       min   median `itr/sec` mem_alloc `gc/sec`
#>   <bch:expr>  <bch:tm> <bch:tm>     <dbl> <bch:byt>    <dbl>
#> 1 unnest_tree    170ms    180ms      5.27    3.97MB     7.02

Created on 2022-08-25 with reprex v2.0.2

Before removing:

#> Warning: Some expressions had a GC in every iteration; so filtering is disabled.
#> # A tibble: 1 × 6
#>   expression       min   median `itr/sec` mem_alloc `gc/sec`
#>   <bch:expr>  <bch:tm> <bch:tm>     <dbl> <bch:byt>    <dbl>
#> 1 unnest_tree    26.9s    26.9s    0.0372     508MB     7.62

Created on 2022-08-25 with reprex v2.0.2

x[[child_col]] <- unclass(child_children)
}

vctrs::new_data_frame(x)
}

eval_pull <- function(data, col, col_arg) {
# TODO use `eval_pull()` once implemented
# https://github.com/r-lib/tidyselect/issues/189
col <- tidyselect::eval_select(col, data, allow_rename = FALSE)
if (length(col) != 1L) {
cli_abort("{.arg {col_arg}} must select 1 column, not {length(col)}.")
}

nm <- colnames(data)[[col]]
set_names(col, nm)
}

check_unnest_parent_to <- function(parent_to, data, level_to, call = caller_env()) {
if (!is_null(parent_to)) {
parent_to <- vctrs::vec_cast(parent_to, character(), call = call)
vctrs::vec_assert(parent_to, size = 1L, call = call)
check_arg_different(parent_to, level_to, call = call)
check_col_new(data, parent_to, call = call)
}

parent_to
}

check_unnest_ancestors_to <- function(ancestors_to,
data,
level_to,
parent_to,
call = caller_env()) {
if (!is_null(ancestors_to)) {
ancestors_to <- vctrs::vec_cast(ancestors_to, character(), call = call)
vctrs::vec_assert(ancestors_to, size = 1L, call = call)
check_arg_different(ancestors_to, level_to, parent_to, call = call)
check_col_new(data, ancestors_to, call = call)
}

ancestors_to
}

check_col_new <- function(data,
col,
col_arg = caller_arg(col),
data_arg = "data",
call = caller_env()) {
if (col %in% colnames(data)) {
msg <- "{.arg {col_arg}} must not be a column in {.arg {data_arg}}."
cli_abort(msg, call = call)
}
}

check_arg_different <- function(arg,
...,
arg_name = caller_arg(arg),
call = caller_env()) {
other_args <- dots_list(..., .named = TRUE)

for (i in seq_along(other_args)) {
if (identical(arg, other_args[[i]])) {
other_arg_nm <- names(other_args)[[i]]
msg <- "{.arg {arg_name}} must be different from {.arg {other_arg_nm}}."
cli_abort(msg, call = call)
}
}
}

check_id <- function(x, x_arg, call = caller_env()) {
incomplete <- !vctrs::vec_detect_complete(x)
if (any(incomplete)) {
incomplete_loc <- which(incomplete)
n <- length(incomplete_loc)
msg <- c(
"Each value of column {.field {x_arg}} must be non-missing.",
i = "{qty(n)}Element{?s} {incomplete_loc} {qty(n)}{?is/are} missing."
)
cli_abort(msg, call = call)
}

if (vctrs::vec_duplicate_any(x)) {
duplicated_flag <- vctrs::vec_duplicate_detect(x)
duplicated_loc <- which(duplicated_flag)
msg <- c(
"Each value of column {.field {x_arg}} must be unique.",
i = "The elements at locations {duplicated_loc} are duplicated."
)
cli_abort(msg, call = call)
}
}
71 changes: 71 additions & 0 deletions man/unnest_tree.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

Loading