Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Support .keep, .before, and .after in mutate() #802

Merged
merged 8 commits into from
Apr 1, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 8 additions & 0 deletions NEWS.md
Original file line number Diff line number Diff line change
@@ -1,5 +1,13 @@
# dbplyr (development version)

* `mutate()` now supports the arguments `.keep`, `.before`, and `.after`
(@mgirlich, #802).

* Multiple `across()` calls in `mutate()` and `transmute()` can now access
freshly created variables (@mgirlich, #802).

* `transmute()` now keeps grouping variables (@mgirlich, #802).

* Added `copy_inline()` as a `copy_to()` equivalent that does not need write
access (@mgirlich, #628).

Expand Down
179 changes: 135 additions & 44 deletions R/verb-mutate.R
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,23 @@
#' They are translated to computed expressions in the `SELECT` clause of
#' the SQL query.
#'
#' @param .keep `r lifecycle::badge("experimental")`
#' Control which columns from `.data` are retained in the output. Grouping
#' columns and columns created by `...` are always kept.
#'
#' * `"all"` retains all columns from `.data`. This is the default.
#' * `"used"` retains only the columns used in `...` to create new
#' columns. This is useful for checking your work, as it displays inputs
#' and outputs side-by-side.
#' * `"unused"` retains only the columns _not_ used in `...` to create new
#' columns. This is useful if you generate new columns, but no longer need
#' the columns used to generate them.
#' * `"none"` doesn't retain any extra columns from `.data`. Only the grouping
#' variables and columns created by `...` are kept.
#' @param .before,.after `r lifecycle::badge("experimental")`
#' <[`tidy-select`][dplyr_tidy_select]> Optionally, control where new columns
#' should appear (the default is to add to the right hand side). See
#' [relocate()] for more details.
#' @inheritParams arrange.tbl_lazy
#' @inheritParams dplyr::mutate
#' @inherit arrange.tbl_lazy return
Expand All @@ -21,68 +38,142 @@
#' db %>%
#' mutate(x1 = x + 1, x2 = x1 * 2) %>%
#' show_query()
mutate.tbl_lazy <- function(.data, ...) {
dots <- partial_eval_dots(.data, ..., .named = TRUE)
mutate.tbl_lazy <- function(.data,
...,
.keep = c("all", "used", "unused", "none"),
.before = NULL,
.after = NULL) {
keep <- arg_match(.keep)
layer_info <- get_mutate_layers(.data, ...)
used <- layer_info$used_vars
layers <- layer_info$layers

# The layers may contain `var = quo(NULL)` at this point.
# They are removed in `add_select()`.
out <- .data
for (layer in layers) {
out$lazy_query <- add_select(out, layer, "mutate")
}

cols_data <- op_vars(.data)
cols_group <- group_vars(.data)

nest_vars(.data, dots, union(op_vars(.data), op_grps(.data)))
cols_expr <- layer_info$modified_vars
cols_expr_modified <- intersect(cols_expr, cols_data)
cols_expr_new <- setdiff(cols_expr, cols_expr_modified)

cols_used <- setdiff(cols_data, c(cols_group, cols_expr_modified, names(used)[!used]))
cols_unused <- setdiff(cols_data, c(cols_group, cols_expr_modified, names(used)[used]))

.before <- enquo(.before)
.after <- enquo(.after)

if (!quo_is_null(.before) || !quo_is_null(.after)) {
# Only change the order of new columns
out <- relocate(out, all_of(cols_expr_new), .before = !!.before, .after = !!.after)
}

cols_out <- op_vars(out)

if (keep == "all") {
cols_retain <- cols_out
} else if (keep == "used") {
cols_retain <- setdiff(cols_out, cols_unused)
} else if (keep == "unused") {
cols_retain <- setdiff(cols_out, cols_used)
} else if (keep == "none") {
cols_retain <- setdiff(cols_out, c(cols_used, cols_unused))
}


select(out, all_of(cols_retain))
}

#' @export
#' @importFrom dplyr transmute
transmute.tbl_lazy <- function(.data, ...) {
dots <- partial_eval_dots(.data, ..., .named = TRUE)
layer_info <- get_mutate_layers(.data, ...)

for (layer in layer_info$layers) {
.data$lazy_query <- add_select(.data, layer, "mutate")
}

nest_vars(.data, dots, character())
# Retain expression columns in order of their appearance
cols_expr <- layer_info$modified_vars

# Retain untouched group variables up front
cols_group <- group_vars(.data)
cols_group <- setdiff(cols_group, cols_expr)

cols_retain <- c(cols_group, cols_expr)

select(.data, all_of(cols_retain))
}

# helpers -----------------------------------------------------------------

# TODO: refactor to remove `.data` argument and return a list of layers.
nest_vars <- function(.data, dots, all_vars) {
# For each expression, check if it uses any newly created variables.
# If so, nest the mutate()
new_vars <- character()
init <- 0L
get_mutate_layers <- function(.data, ...) {
dots <- enquos(..., .named = TRUE)
grps <- syms(op_grps(.data))
cur_data <- simulate_lazy_tbl(op_vars(.data), grps)

layer_modified_vars <- character()
all_modified_vars <- character()
all_used_vars <- character()
all_vars <- op_vars(.data)
var_is_null <- rep_named(op_vars(.data), FALSE)

cur_layer <- syms(set_names(op_vars(.data)))
layers <- list()

for (i in seq_along(dots)) {
cur_var <- names(dots)[[i]]
used_vars <- all_names(get_expr(dots[[i]]))

if (any(used_vars %in% new_vars)) {
new_actions <- dots[seq2(init, length(dots))][new_vars]
.data$lazy_query <- add_select(.data, carry_over(union(all_vars, used_vars), new_actions), "mutate")
all_vars <- c(all_vars, setdiff(new_vars, all_vars))
new_vars <- cur_var
init <- i
} else {
new_vars <- c(new_vars, cur_var)
quosures <- partial_eval_quo(dots[[i]], cur_data)
if (!is.list(quosures)) {
quosures <- set_names(list(quosures), names(dots)[[i]])
}
}
quosures <- unclass(quosures)

if (init != 0L) {
dots <- dots[-seq2(1L, init - 1)]
}
.data$lazy_query <- add_select(.data, carry_over(all_vars, dots), "mutate")
.data
}
for (k in seq_along(quosures)) {
cur_quo <- quosures[[k]]
cur_var <- names(quosures)[[k]]

# Combine a selection (passed through from subquery)
# with new actions
carry_over <- function(sel = character(), act = list()) {
if (is.null(names(sel))) {
names(sel) <- sel
}
sel <- syms(sel)
if (quo_is_null(cur_quo)) {
var_is_null[[cur_var]] <- TRUE
cur_layer[[cur_var]] <- cur_quo
layer_modified_vars <- setdiff(layer_modified_vars, cur_var)
all_modified_vars <- setdiff(all_modified_vars, cur_var)
next
}

# Keep last of duplicated acts
act <- act[!duplicated(names(act), fromLast = TRUE)]
all_modified_vars <- c(all_modified_vars, setdiff(cur_var, all_modified_vars))

# Preserve order of sel
both <- intersect(names(sel), names(act))
sel[both] <- act[both]
used_vars <- all_names(cur_quo)
all_used_vars <- c(all_used_vars, used_vars)
if (any(used_vars %in% layer_modified_vars)) {
layers <- append(layers, list(cur_layer))

# Adding new variables at end
new <- setdiff(names(act), names(sel))
cur_layer[!var_is_null] <- syms(names(cur_layer)[!var_is_null])
layer_modified_vars <- character()
}

var_is_null[[cur_var]] <- FALSE
cur_layer[[cur_var]] <- cur_quo
layer_modified_vars <- c(layer_modified_vars, cur_var)
}

c(sel, act[new])
all_vars <- names(cur_layer)[!var_is_null]
cur_data <- simulate_lazy_tbl(all_vars, grps)
}

list(
layers = append(layers, list(cur_layer)),
modified_vars = all_modified_vars,
used_vars = set_names(all_vars %in% all_used_vars, all_vars)
)
}

# Build a stand-in lazy table with one column per entry of `vars`, grouped by
# `groups`.
#
# `vars` is a character vector of column names and `groups` a list of symbols
# that is spliced into `group_by()`; `get_mutate_layers()` passes
# `op_vars(.data)` and `syms(op_grps(.data))`. The one-row tibble built here
# only supplies column names and grouping; presumably the cell values are
# never read by callers (verify against `partial_eval_quo()`).
simulate_lazy_tbl <- function(vars, groups) {
  # `set_names(vars)` with no explicit names is expected to name each entry
  # after itself, so the resulting list has one element per column name.
  named_vars <- set_names(vars)
  placeholder <- as_tibble(as.list(named_vars), .name_repair = "minimal")

  lazy <- tbl_lazy(placeholder)
  group_by(lazy, !!!groups)
}
2 changes: 1 addition & 1 deletion R/verb-select.R
Original file line number Diff line number Diff line change
Expand Up @@ -120,7 +120,7 @@ add_select <- function(.data, vars, op = c("select", "mutate")) {
if (length(lazy_query$last_op) == 1 && lazy_query$last_op %in% c("select", "mutate")) {
# Special optimisation when applied to pure projection() - this is
# conservative and we could expand to any op_select() if combined with
# the logic in nest_vars()
# the logic in get_mutate_layers()
select <- lazy_query$select

if (purrr::every(vars, is.symbol)) {
Expand Down
28 changes: 27 additions & 1 deletion man/mutate.tbl_lazy.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

54 changes: 53 additions & 1 deletion tests/testthat/_snaps/verb-mutate.md
Original file line number Diff line number Diff line change
Expand Up @@ -40,9 +40,31 @@
df %>% group_by(g) %>% transmute(across(.fns = ~0))
Output
<SQL>
SELECT 0.0 AS `x`
SELECT `g`, 0.0 AS `x`
FROM `df`

# across() can access previously created variables

Code
remote_query(lf)
Output
<SQL> SELECT `x`, SQRT(`y`) AS `y`
FROM (
SELECT `x`, 2.0 AS `y`
FROM `df`
) `q01`

# new columns take precedence over global variables

Code
remote_query(lf)
Output
<SQL> SELECT `x`, `y`, `y` + 1.0 AS `z`
FROM (
SELECT `x`, 2.0 AS `y`
FROM `df`
) `q01`

# mutate generates subqueries as needed

Code
Expand Down Expand Up @@ -85,3 +107,33 @@
SELECT `y` * 2.0 AS `y`, `x` * 2.0 AS `x`
FROM `df`

# var = NULL works when var is in original data

Code
remote_query(lf)
Output
<SQL> SELECT `x` * 2.0 AS `z`
FROM (
SELECT 2.0 AS `x`
FROM `df`
) `q01`

# var = NULL when var is in final output

Code
remote_query(lf)
Output
<SQL> SELECT `x`, 3.0 AS `y`
FROM `df`

# temp var with nested arguments

Code
remote_query(lf)
Output
<SQL> SELECT `x`, `y` * 2.0 AS `z`
FROM (
SELECT `x`, 2.0 AS `y`
FROM `df`
) `q01`

Loading