-
Notifications
You must be signed in to change notification settings - Fork 417
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Add unnest_tree()
#1386
Add unnest_tree()
#1386
Changes from all commits
9bd3c70
c77bfe0
2aa3fc2
d52b4ba
ccb7f61
0abbedc
152000a
05c049b
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,251 @@ | ||
#' Unnest a recursive data frame | ||
#' | ||
#' @param data A data frame. | ||
#' @param id_col <[`tidy-select`][tidyr_tidy_select]> A column that uniquely | ||
#' identifies each observation. | ||
#' @param child_col <[`tidy-select`][tidyr_tidy_select]> Column containing the | ||
#' children of an observation. This must be a list where each element is either | ||
#' `NULL` or a data frame with the same columns as `data`. | ||
#' @param level_to A string (`"level"` by default) specifying the new column to | ||
#' store the level of an observation. Use `NULL` if you don't need this | ||
#' information. | ||
#' @param parent_to A string (`"parent"` by default) specifying the new column | ||
#' storing the parent id of an observation. Use `NULL` if you don't need this | ||
#' information. | ||
#' @param ancestors_to A string (`NULL` by default) specifying the new column | ||
#' storing the ids of its ancestors. Use `NULL` if you don't need this | ||
#' information. | ||
#' | ||
#' @return A data frame. | ||
#' @export | ||
#' | ||
#' @examples | ||
#' df <- tibble( | ||
#' id = 1L, | ||
#' name = "a", | ||
#' children = list( | ||
#' tibble( | ||
#' id = 11:12, | ||
#' name = c("b", "c"), | ||
#' children = list( | ||
#' NULL, | ||
#' tibble( | ||
#' id = 121:122, | ||
#' name = c("d", "e") | ||
#' ) | ||
#' ) | ||
#' ) | ||
#' ) | ||
#' ) | ||
#' | ||
#' unnest_tree( | ||
#' df, | ||
#' id_col = "id", | ||
#' child_col = "children", | ||
#' level_to = "level", | ||
#' parent_to = "parent", | ||
#' ancestors_to = "ancestors" | ||
#' ) | ||
unnest_tree <- function(data, | ||
id_col, | ||
child_col, | ||
level_to = "level", | ||
parent_to = "parent", | ||
ancestors_to = NULL) { | ||
if (!is.data.frame(data)) { | ||
cli_abort("{.arg data} must be a data frame.") | ||
} | ||
|
||
id_col <- names(eval_pull(data, enquo(id_col), "id_col")) | ||
child_col <- names(eval_pull(data, enquo(child_col), "child_col")) | ||
check_arg_different(child_col, id_col) | ||
|
||
if (!is_null(level_to)) { | ||
level_to <- vctrs::vec_cast(level_to, character()) | ||
vctrs::vec_assert(level_to, size = 1L) | ||
} | ||
parent_to <- check_unnest_parent_to(parent_to, data, level_to) | ||
ancestors_to <- check_unnest_ancestors_to(ancestors_to, data, level_to, parent_to) | ||
|
||
call <- current_env() | ||
|
||
level_sizes <- list() | ||
level_parent_ids <- list() | ||
level_ancestors <- list() | ||
level_data <- list() | ||
out_ptype <- vctrs::vec_ptype(dplyr::select(data, -any_of(child_col))) | ||
|
||
level <- 1L | ||
parent_ids <- vctrs::vec_init(data[[id_col]]) | ||
ns <- vctrs::vec_size(data) | ||
cur_ancestors <- vctrs::vec_rep_each(list(NULL), ns) | ||
|
||
while (!is.null(data)) { | ||
children <- data[[child_col]] %||% list() | ||
# TODO this could mention the path? | ||
# -> this would require tracking the current ancestors. Worth it? | ||
vctrs::vec_check_list(children, arg = child_col) | ||
|
||
data <- dplyr::select(data, -any_of(child_col)) | ||
# keep track of the out ptype to error earlier and better error messages (in the future...) | ||
out_ptype <- vctrs::vec_ptype2(out_ptype, data) | ||
level_data[[level]] <- data | ||
# we could also directly repeat the parent ids but it is a bit more efficient | ||
# to store the parent ids and level sizes in a list and expand + repeat them | ||
# in the end | ||
if (!is_null(parent_to)) { | ||
level_sizes[[level]] <- ns | ||
level_parent_ids[[level]] <- parent_ids | ||
} | ||
|
||
if (!is_null(ancestors_to)) { | ||
if (level > 1L) { | ||
ancestors_simple <- purrr::map2(cur_ancestors, vctrs::vec_chop(parent_ids), c) | ||
cur_ancestors <- vctrs::vec_rep_each(ancestors_simple, ns) | ||
} | ||
level_ancestors[[level]] <- cur_ancestors | ||
} | ||
|
||
ns <- vctrs::list_sizes(children) | ||
if (all(ns == 0)) { | ||
break | ||
} | ||
|
||
parent_ids <- data[[id_col]] | ||
# unclass `list_of` to avoid performance hit | ||
children <- purrr::map(children, ~ unclass_list_of(.x, child_col, call = call)) | ||
data <- vctrs::vec_unchop(children) | ||
|
||
level <- level + 1L | ||
} | ||
|
||
out <- vctrs::vec_rbind(!!!level_data, .ptype = out_ptype) | ||
|
||
if (!is_null(level_to)) { | ||
times <- list_sizes(level_data) | ||
levels <- vctrs::vec_seq_along(level_data) | ||
out[[level_to]] <- vctrs::vec_rep_each(levels, times) | ||
} | ||
|
||
if (!is_null(parent_to)) { | ||
parent_ids <- vctrs::vec_c(!!!level_parent_ids, .ptype = out[[id_col]]) | ||
times <- vctrs::vec_c(!!!level_sizes, .ptype = integer()) | ||
out[[parent_to]] <- vctrs::vec_rep_each(parent_ids, times) | ||
} | ||
|
||
if (!is_null(ancestors_to)) { | ||
out[[ancestors_to]] <- vctrs::vec_unchop(level_ancestors) | ||
} | ||
|
||
check_id(out[[id_col]], id_col) | ||
out | ||
} | ||
|
||
unclass_list_of <- function(x, child_col, call = caller_env()) { | ||
if (is_null(x)) { | ||
return(NULL) | ||
} | ||
|
||
if (!inherits(x, "data.frame")) { | ||
# TODO mention path | ||
# TODO mention other type -> need rlang compat file | ||
msg <- "Each child must be NULL or a data frame." | ||
cli_abort(msg, call = call) | ||
} | ||
|
||
# unclass to avoid slow `[[.tbl_df` and `[[<-.tbl_df` | ||
x <- unclass(x) | ||
child_children <- x[[child_col]] | ||
if (inherits(child_children, "vctrs_list_of")) { | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Removing the After removing: devtools::load_all("~/GitHub/tidyr/")
#> ℹ Loading tidyr
test_tree <- readr::read_rds("~/GitHub/tidyr/test-tree.rds")
bench::mark(
unnest_tree = unnest_tree(
test_tree,
id,
children
)
)
#> Warning: Some expressions had a GC in every iteration; so filtering is disabled.
#> # A tibble: 1 × 6
#> expression min median `itr/sec` mem_alloc `gc/sec`
#> <bch:expr> <bch:tm> <bch:tm> <dbl> <bch:byt> <dbl>
#> 1 unnest_tree 170ms 180ms 5.27 3.97MB 7.02 Created on 2022-08-25 with reprex v2.0.2 Before removing: #> Warning: Some expressions had a GC in every iteration; so filtering is disabled.
#> # A tibble: 1 × 6
#> expression min median `itr/sec` mem_alloc `gc/sec`
#> <bch:expr> <bch:tm> <bch:tm> <dbl> <bch:byt> <dbl>
#> 1 unnest_tree 26.9s 26.9s 0.0372 508MB 7.62 Created on 2022-08-25 with reprex v2.0.2 |
||
x[[child_col]] <- unclass(child_children) | ||
} | ||
|
||
vctrs::new_data_frame(x) | ||
} | ||
|
||
eval_pull <- function(data, col, col_arg) { | ||
# TODO use `eval_pull()` once implemented | ||
# https://github.com/r-lib/tidyselect/issues/189 | ||
col <- tidyselect::eval_select(col, data, allow_rename = FALSE) | ||
if (length(col) != 1L) { | ||
cli_abort("{.arg {col_arg}} must select 1 column, not {length(col)}.") | ||
} | ||
|
||
nm <- colnames(data)[[col]] | ||
set_names(col, nm) | ||
} | ||
|
||
check_unnest_parent_to <- function(parent_to, data, level_to, call = caller_env()) { | ||
if (!is_null(parent_to)) { | ||
parent_to <- vctrs::vec_cast(parent_to, character(), call = call) | ||
vctrs::vec_assert(parent_to, size = 1L, call = call) | ||
check_arg_different(parent_to, level_to, call = call) | ||
check_col_new(data, parent_to, call = call) | ||
} | ||
|
||
parent_to | ||
} | ||
|
||
check_unnest_ancestors_to <- function(ancestors_to, | ||
data, | ||
level_to, | ||
parent_to, | ||
call = caller_env()) { | ||
if (!is_null(ancestors_to)) { | ||
ancestors_to <- vctrs::vec_cast(ancestors_to, character(), call = call) | ||
vctrs::vec_assert(ancestors_to, size = 1L, call = call) | ||
check_arg_different(ancestors_to, level_to, parent_to, call = call) | ||
check_col_new(data, ancestors_to, call = call) | ||
} | ||
|
||
ancestors_to | ||
} | ||
|
||
check_col_new <- function(data, | ||
col, | ||
col_arg = caller_arg(col), | ||
data_arg = "data", | ||
call = caller_env()) { | ||
if (col %in% colnames(data)) { | ||
msg <- "{.arg {col_arg}} must not be a column in {.arg {data_arg}}." | ||
cli_abort(msg, call = call) | ||
} | ||
} | ||
|
||
check_arg_different <- function(arg, | ||
..., | ||
arg_name = caller_arg(arg), | ||
call = caller_env()) { | ||
other_args <- dots_list(..., .named = TRUE) | ||
|
||
for (i in seq_along(other_args)) { | ||
if (identical(arg, other_args[[i]])) { | ||
other_arg_nm <- names(other_args)[[i]] | ||
msg <- "{.arg {arg_name}} must be different from {.arg {other_arg_nm}}." | ||
cli_abort(msg, call = call) | ||
} | ||
} | ||
} | ||
|
||
check_id <- function(x, x_arg, call = caller_env()) { | ||
incomplete <- !vctrs::vec_detect_complete(x) | ||
if (any(incomplete)) { | ||
incomplete_loc <- which(incomplete) | ||
n <- length(incomplete_loc) | ||
msg <- c( | ||
"Each value of column {.field {x_arg}} must be non-missing.", | ||
i = "{qty(n)}Element{?s} {incomplete_loc} {qty(n)}{?is/are} missing." | ||
) | ||
cli_abort(msg, call = call) | ||
} | ||
|
||
if (vctrs::vec_duplicate_any(x)) { | ||
duplicated_flag <- vctrs::vec_duplicate_detect(x) | ||
duplicated_loc <- which(duplicated_flag) | ||
msg <- c( | ||
"Each value of column {.field {x_arg}} must be unique.", | ||
i = "The elements at locations {duplicated_loc} are duplicated." | ||
) | ||
cli_abort(msg, call = call) | ||
} | ||
} |
Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
It is not quite as bad as the
list_of
performance hit but still a lot. Without unclassing this takesCreated on 2022-08-25 with reprex v2.0.2