Skip to content

Commit

Permalink
Merge pull request #674 from cynkra/f-663-review-key-tracking
Browse files Browse the repository at this point in the history
- `mutate()`, `transmute()`, `distinct()` and `summarize()` now support `dplyr::across()` and extra arguments (#640).
- Key tracking for the first three verbs is less strict and based on name equality (#663).
  • Loading branch information
krlmlr authored Oct 15, 2021
2 parents 989ef1e + cb1822b commit b463461
Show file tree
Hide file tree
Showing 4 changed files with 92 additions and 53 deletions.
27 changes: 10 additions & 17 deletions R/dplyr.R
Original file line number Diff line number Diff line change
Expand Up @@ -35,11 +35,9 @@ mutate.dm <- function(.data, ...) {
#' @export
mutate.zoomed_dm <- function(.data, ...) {
tbl <- tbl_zoomed(.data)
quos <- enquos(..., .named = TRUE)
mutated_tbl <- mutate(tbl, !!!quos)
# all columns that are not touched count as "selected"; names of "selected" are identical to "selected"
# in case no keys are tracked, `set_names(NULL)` would throw an error
selected <- set_names(setdiff(names2(col_tracker_zoomed(.data)), names(quos)))
mutated_tbl <- mutate(tbl, ...)
# #663: user responsibility: those columns are tracked whose names remain
selected <- set_names(intersect(colnames(tbl), colnames(mutated_tbl)))
new_tracked_cols_zoom <- new_tracked_cols(.data, selected)
replace_zoomed_tbl(.data, mutated_tbl, new_tracked_cols_zoom)
}
Expand All @@ -57,10 +55,9 @@ transmute.zoomed_dm <- function(.data, ...) {
groups <- set_names(map_chr(groups(tbl), as_string))
transmuted_tbl <- transmute(tbl, ...)

# columns can vanish, https://github.com/tidyverse/dbplyr/issues/670:
groups <- groups[names(groups) %in% colnames(transmuted_tbl)]

new_tracked_cols_zoom <- new_tracked_cols(.data, groups)
# #663: user responsibility: those columns are tracked whose names remain
selected <- set_names(intersect(colnames(tbl), colnames(transmuted_tbl)))
new_tracked_cols_zoom <- new_tracked_cols(.data, selected)

replace_zoomed_tbl(.data, transmuted_tbl, new_tracked_cols_zoom)
}
Expand Down Expand Up @@ -125,15 +122,10 @@ distinct.dm <- function(.data, ...) {
distinct.zoomed_dm <- function(.data, ..., .keep_all = FALSE) {
tbl <- tbl_zoomed(.data)
distinct_tbl <- distinct(tbl, ..., .keep_all = .keep_all)
# when keeping all columns or empty ellipsis
# (use all columns for distinct)
# all keys columns remain
if (.keep_all || rlang::dots_n(...) == 0) {
return(replace_zoomed_tbl(.data, distinct_tbl))
}

selected <- eval_select_both(quo(c(...)), colnames(tbl))
new_tracked_cols_zoom <- new_tracked_cols(.data, selected$names)
# #663: user responsibility: those columns are tracked whose names remain
selected <- set_names(intersect(colnames(tbl), colnames(distinct_tbl)))
new_tracked_cols_zoom <- new_tracked_cols(.data, selected)

replace_zoomed_tbl(.data, distinct_tbl, new_tracked_cols_zoom)
}
Expand Down Expand Up @@ -277,6 +269,7 @@ summarise.dm <- function(.data, ...) {
summarise.zoomed_dm <- function(.data, ...) {
tbl <- tbl_zoomed(.data)
# groups are "selected"; key tracking will continue for them
# #663: user responsibility: if group columns are manipulated, they are still tracked
groups <- set_names(map_chr(groups(tbl), as_string))
summarized_tbl <- summarize(tbl, ...)
new_tracked_cols_zoom <- new_tracked_cols(.data, groups)
Expand Down
48 changes: 26 additions & 22 deletions tests/testthat/_snaps/dplyr.md
Original file line number Diff line number Diff line change
Expand Up @@ -251,7 +251,7 @@

Code
# mutate()
zoomed_grouped_out_dm %>% mutate(d_mean = mean(d), d = d * 2) %>%
zoomed_grouped_out_dm %>% mutate(d_mean = mean(d)) %>% select(-d) %>%
dm_insert_zoomed("new_tbl") %>% get_all_keys()
Output
$pks
Expand All @@ -278,29 +278,32 @@
6 tf_5 m tf_6 n no_action
Code
zoomed_grouped_in_dm %>% mutate(f = list(g)) %>% dm_insert_zoomed("new_tbl") %>%
get_all_keys()
zoomed_grouped_in_dm %>% mutate(f = paste0(g, g)) %>% dm_insert_zoomed(
"new_tbl") %>% get_all_keys()
Output
$pks
# A tibble: 6 x 2
table pk_col
<chr> <keys>
1 tf_1 a
2 tf_2 c
3 tf_3 f, f1
4 tf_4 h
5 tf_5 k
6 tf_6 o
# A tibble: 7 x 2
table pk_col
<chr> <keys>
1 tf_1 a
2 tf_2 c
3 tf_3 f, f1
4 tf_4 h
5 tf_5 k
6 tf_6 o
7 new_tbl f, f1
$fks
# A tibble: 5 x 5
# A tibble: 7 x 5
child_table child_fk_cols parent_table parent_key_cols on_delete
<chr> <keys> <chr> <keys> <chr>
1 tf_2 d tf_1 a no_action
2 tf_2 e, e1 tf_3 f, f1 no_action
3 tf_4 j, j1 tf_3 f, f1 no_action
4 tf_5 l tf_4 h cascade
5 tf_5 m tf_6 n no_action
6 tf_2 e, e1 new_tbl f, f1 no_action
7 tf_4 j, j1 new_tbl f, f1 no_action
Code
zoomed_grouped_in_dm %>% mutate(g_new = list(g)) %>% dm_insert_zoomed("new_tbl") %>%
Expand Down Expand Up @@ -340,15 +343,16 @@
dm_insert_zoomed("new_tbl") %>% get_all_keys()
Output
$pks
# A tibble: 6 x 2
table pk_col
<chr> <keys>
1 tf_1 a
2 tf_2 c
3 tf_3 f, f1
4 tf_4 h
5 tf_5 k
6 tf_6 o
# A tibble: 7 x 2
table pk_col
<chr> <keys>
1 tf_1 a
2 tf_2 c
3 tf_3 f, f1
4 tf_4 h
5 tf_5 k
6 tf_6 o
7 new_tbl c
$fks
# A tibble: 6 x 5
Expand Down
62 changes: 49 additions & 13 deletions tests/testthat/test-dplyr.R
Original file line number Diff line number Diff line change
Expand Up @@ -218,9 +218,10 @@ test_that("basic test: 'slice()'-methods work", {
)
)

# silent when no PK available anymore
expect_silent(
mutate(zoomed_dm(), c = 1) %>% slice(1:3)
# changed for #663: mutate() tracks now all cols that remain
expect_message(
mutate(zoomed_dm(), c = 1) %>% slice(1:3),
"Keeping PK column"
)

expect_silent(
Expand Down Expand Up @@ -304,7 +305,7 @@ test_that("basic test: 'join()'-methods for `zoomed.dm` work (2)", {

# a former FK-relation could not be tracked
expect_dm_error(
zoomed_dm() %>% mutate(e = e) %>% left_join(tf_3),
zoomed_dm() %>% select(-e) %>% left_join(tf_3),
"fk_not_tracked"
)

Expand Down Expand Up @@ -414,6 +415,40 @@ test_that("basic test: 'join()'-methods for `dm` throws error", {
)
})

test_that("basic test: 'across' works properly", {
expect_equivalent_tbl(
dm_for_filter() %>%
dm_zoom_to(tf_2) %>%
mutate(across(c(1, 3), ~ "C")) %>%
pull_tbl(),
dm_for_filter() %>%
pull_tbl(tf_2) %>%
mutate(across(c(1, 3), ~ "C"))
)

expect_equivalent_tbl(
dm_for_filter() %>%
dm_zoom_to(tf_2) %>%
summarize(across(c(c, e), ~ "C")) %>%
pull_tbl(),
dm_for_filter() %>%
pull_tbl(tf_2) %>%
summarize(across(c(c, e), ~ "C"))
)

expect_equivalent_tbl(
dm_for_filter() %>%
dm_zoom_to(tf_2) %>%
group_by(d) %>%
summarize(across(c(1, 3), ~ "C")) %>%
pull_tbl(),
dm_for_filter() %>%
pull_tbl(tf_2) %>%
group_by(d) %>%
summarize(across(c(1, 3), ~ "C"))
)
})

# test key tracking for all methods ---------------------------------------

# dm_for_filter(), zoomed to tf_2; PK: c; 2 outgoing FKs: d, e; no incoming FKS
Expand Down Expand Up @@ -465,7 +500,7 @@ test_that("key tracking works (2)", {
expect_snapshot({
"transmute()"

# grouped by two key cols: "c" and "e" -> these two remain
# grouped by three key cols: "c", "e", "e1" -> these three remain
zoomed_grouped_out_dm %>%
transmute(d_mean = mean(d)) %>%
dm_insert_zoomed("new_tbl") %>%
Expand All @@ -489,19 +524,22 @@ test_that("key tracking works (4)", {
expect_snapshot({
"mutate()"

# grouped by two key cols: "c" and "e" -> these two remain
# grouped by three key cols: "c", "e" and "e1 -> these three remain
zoomed_grouped_out_dm %>%
mutate(d_mean = mean(d), d = d * 2) %>%
mutate(d_mean = mean(d)) %>%
select(-d) %>%
dm_insert_zoomed("new_tbl") %>%
get_all_keys()

# grouped_by non-key col means, that only key-columns that are not touched remain for mutate()
# grouped_by non-key col means, that only key-columns that remain in the
# result tibble are tracked for mutate()
zoomed_grouped_in_dm %>%
mutate(f = list(g)) %>%
mutate(f = paste0(g, g)) %>%
dm_insert_zoomed("new_tbl") %>%
get_all_keys()

# grouped_by non-key col means, that only key-columns that are not touched remain for
# grouped_by non-key col means, that only key-columns that remain in the
# result tibble are tracked for transmute()
zoomed_grouped_in_dm %>%
mutate(g_new = list(g)) %>%
dm_insert_zoomed("new_tbl") %>%
Expand Down Expand Up @@ -543,9 +581,7 @@ test_that("key tracking works for distinct() and arrange()", {
dm_get_all_fks_impl(),
dm_for_filter() %>%
dm_get_all_fks_impl() %>%
filter(child_table != "tf_2" | parent_table != "tf_3") %>%
# https://github.com/r-lib/vctrs/issues/1371
mutate(child_fk_cols = new_keys(if_else(child_fk_cols == new_keys("d"), list("d_new"), unclass(child_fk_cols))))
filter(child_table != "tf_2")
)

expect_identical(
Expand Down
8 changes: 7 additions & 1 deletion vignettes/tech-dm-zoom.Rmd
Original file line number Diff line number Diff line change
Expand Up @@ -197,11 +197,17 @@ flights_dm %>%

1. Currently not all of the {dplyr}-verbs have their own method for a `zoomed_dm`, so be aware that in some cases it will still be necessary to resort to extracting one or more tables from a `dm` and reinserting a transformed version of theirs into the `dm` eventually.
The supported functions are:
`group_by()`, `ungroup()`, `summarise()`, `mutate()`, `transmute()`, `filter()`, `select()`, `rename()`, `distinct()`, `arrange()`, `slice()`, `left_join()`, `inner_join()`, `full_join()`, `right_join()`, `semi_join()` and `anti_join()`.
`group_by()`, `ungroup()`, `summarise()`, `mutate()`, `transmute()`, `filter()`, `select()`, `relocate()`, `rename()`, `distinct()`, `arrange()`, `slice()`, `left_join()`, `inner_join()`, `full_join()`, `right_join()`, `semi_join()` and `anti_join()`.

1. The same is true for {tidyr}-functions.
Methods are provided for: `unite()` and `separate()`.

1. There might be situations when you would like the key relations to remain intact, but they are dropped nevertheless.
This is because a rigid logic is implemented, that does drop a key when its associated column is acted upon with e.g. a `mutate()` call.
In these cases the key relations will need to be established once more after finishing with the manipulations.

1. For each implemented {dplyr}-verb there is a logic for tracking key relations between the tables.
Up to {dm} version 0.2.4 we tried to track the columns in a very detailed manner. This has become increasingly difficult, especially with `dplyr::across()`. As of {dm} 0.2.5, we give more responsibility to the {dm} user:
Now those columns are tracked whose **names** remain in the resulting table.
Affected by these changes are the methods for: `mutate()`, `transmute()`, `distinct()`.
When using one of these functions, be aware that if you want to replace a key column with a column with a different content but of the same name, this column will automatically become a key column.

0 comments on commit b463461

Please sign in to comment.