From d76fb26c60fbf35c99eb1f3793690c6eae0c60d4 Mon Sep 17 00:00:00 2001 From: Maximilian Girlich Date: Tue, 31 Jan 2023 09:14:01 +0000 Subject: [PATCH 1/3] Move argument `auto_index` in `*_join()` --- NEWS.md | 2 + R/verb-joins.R | 89 ++++++++++++++++++++++++++++++++------------- man/join.tbl_sql.Rd | 75 ++++++++++++++++++++++---------------- 3 files changed, 108 insertions(+), 58 deletions(-) diff --git a/NEWS.md b/NEWS.md index 5a479d8b8..ff3fc6a0d 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,5 +1,7 @@ # dbplyr (development version) +* Moved argument `auto_index` after `...` in `*_join()` (@mgirlich). + * Removed dependency on assertthat (@mgirlich, #1112). * Catch unsupported argument `pivot_wider(id_expand = TRUE)` and diff --git a/R/verb-joins.R b/R/verb-joins.R index 8d813ddc4..fe702036b 100644 --- a/R/verb-joins.R +++ b/R/verb-joins.R @@ -67,11 +67,18 @@ NULL #' @rdname join.tbl_sql #' @export #' @importFrom dplyr inner_join -inner_join.tbl_lazy <- function(x, y, by = NULL, copy = FALSE, +inner_join.tbl_lazy <- function(x, + y, + by = NULL, + copy = FALSE, suffix = NULL, - auto_index = FALSE, ..., - sql_on = NULL, na_matches = c("never", "na"), - x_as = NULL, y_as = NULL) { + ..., + na_matches = c("never", "na"), + sql_on = NULL, + auto_index = FALSE, + x_as = NULL, + y_as = NULL) { + check_dots_empty() x$lazy_query <- add_join( x, y, "inner", @@ -92,11 +99,17 @@ inner_join.tbl_lazy <- function(x, y, by = NULL, copy = FALSE, #' @rdname join.tbl_sql #' @export #' @importFrom dplyr left_join -left_join.tbl_lazy <- function(x, y, by = NULL, copy = FALSE, - suffix = NULL, - auto_index = FALSE, ..., - sql_on = NULL, na_matches = c("never", "na"), - x_as = NULL, y_as = NULL) { +left_join.tbl_lazy <- function(x, + y, + by = NULL, + copy = FALSE, + suffix = NULL, + ..., + na_matches = c("never", "na"), + sql_on = NULL, + auto_index = FALSE, + x_as = NULL, + y_as = NULL) { x$lazy_query <- add_join( x, y, "left", @@ -117,11 +130,17 @@ left_join.tbl_lazy <- function(x, y, by = NULL, copy = FALSE, #' @rdname join.tbl_sql #' @export #' @importFrom dplyr right_join -right_join.tbl_lazy <- function(x, y, by = NULL, copy = FALSE, +right_join.tbl_lazy <- function(x, + y, + by = NULL, + copy = FALSE, suffix = NULL, - auto_index = FALSE, ..., - sql_on = NULL, na_matches = c("never", "na"), - x_as = NULL, y_as = NULL) { + ..., + na_matches = c("never", "na"), + sql_on = NULL, + auto_index = FALSE, + x_as = NULL, + y_as = NULL) { x$lazy_query <- add_join( x, y, "right", @@ -142,11 +161,17 @@ right_join.tbl_lazy <- function(x, y, by = NULL, copy = FALSE, #' @rdname join.tbl_sql #' @export #' @importFrom dplyr full_join -full_join.tbl_lazy <- function(x, y, by = NULL, copy = FALSE, - suffix = NULL, - auto_index = FALSE, ..., - sql_on = NULL, na_matches = c("never", "na"), - x_as = NULL, y_as = NULL) { +full_join.tbl_lazy <- function(x, + y, + by = NULL, + copy = FALSE, + suffix = NULL, + ..., + na_matches = c("never", "na"), + sql_on = NULL, + auto_index = FALSE, + x_as = NULL, + y_as = NULL) { x$lazy_query <- add_join( x, y, "full", @@ -167,10 +192,16 @@ full_join.tbl_lazy <- function(x, y, by = NULL, copy = FALSE, #' @rdname join.tbl_sql #' @export #' @importFrom dplyr semi_join -semi_join.tbl_lazy <- function(x, y, by = NULL, copy = FALSE, - auto_index = FALSE, ..., - sql_on = NULL, na_matches = c("never", "na"), - x_as = NULL, y_as = NULL) { +semi_join.tbl_lazy <- function(x, + y, + by = NULL, + copy = FALSE, + ..., + na_matches = c("never", "na"), + sql_on = NULL, + auto_index = FALSE, + x_as = NULL, + y_as = NULL) { x$lazy_query <- add_semi_join( x, y, anti = FALSE, @@ -190,10 +221,16 @@ semi_join.tbl_lazy <- function(x, y, by = NULL, copy = FALSE, #' @rdname join.tbl_sql #' @export #' @importFrom dplyr anti_join -anti_join.tbl_lazy <- function(x, y, by = NULL, copy = FALSE, - auto_index = FALSE, ..., - sql_on = NULL, na_matches = c("never", "na"), - x_as = NULL, y_as = NULL) { +anti_join.tbl_lazy <- function(x, + y, + by = NULL, + copy = FALSE, + ..., + na_matches = c("never", "na"), + sql_on = NULL, + auto_index = FALSE, + x_as = NULL, + y_as = NULL) { x$lazy_query <- add_semi_join( x, y, anti = TRUE, diff --git a/man/join.tbl_sql.Rd b/man/join.tbl_sql.Rd index ded168630..e0299f88c 100644 --- a/man/join.tbl_sql.Rd +++ b/man/join.tbl_sql.Rd @@ -16,10 +16,10 @@ by = NULL, copy = FALSE, suffix = NULL, - auto_index = FALSE, ..., - sql_on = NULL, na_matches = c("never", "na"), + sql_on = NULL, + auto_index = FALSE, x_as = NULL, y_as = NULL ) @@ -30,10 +30,10 @@ by = NULL, copy = FALSE, suffix = NULL, - auto_index = FALSE, ..., - sql_on = NULL, na_matches = c("never", "na"), + sql_on = NULL, + auto_index = FALSE, x_as = NULL, y_as = NULL ) @@ -44,10 +44,10 @@ by = NULL, copy = FALSE, suffix = NULL, - auto_index = FALSE, ..., - sql_on = NULL, na_matches = c("never", "na"), + sql_on = NULL, + auto_index = FALSE, x_as = NULL, y_as = NULL ) @@ -58,10 +58,10 @@ by = NULL, copy = FALSE, suffix = NULL, - auto_index = FALSE, ..., - sql_on = NULL, na_matches = c("never", "na"), + sql_on = NULL, + auto_index = FALSE, x_as = NULL, y_as = NULL ) @@ -71,10 +71,10 @@ y, by = NULL, copy = FALSE, - auto_index = FALSE, ..., - sql_on = NULL, na_matches = c("never", "na"), + sql_on = NULL, + auto_index = FALSE, x_as = NULL, y_as = NULL ) @@ -84,10 +84,10 @@ y, by = NULL, copy = FALSE, - auto_index = FALSE, ..., - sql_on = NULL, na_matches = c("never", "na"), + sql_on = NULL, + auto_index = FALSE, x_as = NULL, y_as = NULL ) @@ -95,23 +95,34 @@ \arguments{ \item{x, y}{A pair of lazy data frames backed by database queries.} -\item{by}{A character vector of variables to join by. +\item{by}{A join specification created with \code{\link[dplyr:join_by]{join_by()}}, or a character +vector of variables to join by. If \code{NULL}, the default, \verb{*_join()} will perform a natural join, using all -variables in common across \code{x} and \code{y}. A message lists the variables so that you -can check they're correct; suppress the message by supplying \code{by} explicitly. +variables in common across \code{x} and \code{y}. A message lists the variables so +that you can check they're correct; suppress the message by supplying \code{by} +explicitly. -To join by different variables on \code{x} and \code{y}, use a named vector. -For example, \code{by = c("a" = "b")} will match \code{x$a} to \code{y$b}. +To join on different variables between \code{x} and \code{y}, use a \code{\link[dplyr:join_by]{join_by()}} +specification. For example, \code{join_by(a == b)} will match \code{x$a} to \code{y$b}. -To join by multiple variables, use a vector with length > 1. -For example, \code{by = c("a", "b")} will match \code{x$a} to \code{y$a} and \code{x$b} to -\code{y$b}. Use a named vector to match different variables in \code{x} and \code{y}. -For example, \code{by = c("a" = "b", "c" = "d")} will match \code{x$a} to \code{y$b} and -\code{x$c} to \code{y$d}. +To join by multiple variables, use a \code{\link[dplyr:join_by]{join_by()}} specification with +multiple expressions. For example, \code{join_by(a == b, c == d)} will match +\code{x$a} to \code{y$b} and \code{x$c} to \code{y$d}. If the column names are the same between +\code{x} and \code{y}, you can shorten this by listing only the variable names, like +\code{join_by(a, c)}. -To perform a cross-join, generating all combinations of \code{x} and \code{y}, -use \code{by = character()}.} +\code{\link[dplyr:join_by]{join_by()}} can also be used to perform inequality, rolling, and overlap +joins. See the documentation at \link[dplyr:join_by]{?join_by} for details on +these types of joins. + +For simple equality joins, you can alternatively specify a character vector +of variable names to join by. For example, \code{by = c("a", "b")} joins \code{x$a} +to \code{y$a} and \code{x$b} to \code{y$b}. If variable names differ between \code{x} and \code{y}, +use a named character vector like \code{by = c("x_a" = "y_a", "x_b" = "y_b")}. + +To perform a cross-join, generating all combinations of \code{x} and \code{y}, see +\code{\link[dplyr:cross_join]{cross_join()}}.} \item{copy}{If \code{x} and \code{y} are not from the same data source, and \code{copy} is \code{TRUE}, then \code{y} will be copied into a @@ -127,22 +138,22 @@ operation so you must opt into it.} \code{y}, these suffixes will be added to the output to disambiguate them. Should be a character vector of length 2.} -\item{auto_index}{if \code{copy} is \code{TRUE}, automatically create -indices for the variables in \code{by}. This may speed up the join if -there are matching indexes in \code{x}.} - \item{...}{Other parameters passed onto methods.} +\item{na_matches}{Should NA (NULL) values match one another? +The default, "never", is how databases usually work. \code{"na"} makes +the joins behave like the dplyr join functions, \code{\link[=merge]{merge()}}, \code{\link[=match]{match()}}, +and \code{\%in\%}.} + \item{sql_on}{A custom join predicate as an SQL expression. Usually joins use column equality, but you can perform more complex queries by supply \code{sql_on} which should be a SQL expression that uses \code{LHS} and \code{RHS} aliases to refer to the left-hand side or right-hand side of the join respectively.} -\item{na_matches}{Should NA (NULL) values match one another? -The default, "never", is how databases usually work. \code{"na"} makes -the joins behave like the dplyr join functions, \code{\link[=merge]{merge()}}, \code{\link[=match]{match()}}, -and \code{\%in\%}.} +\item{auto_index}{if \code{copy} is \code{TRUE}, automatically create +indices for the variables in \code{by}. This may speed up the join if +there are matching indexes in \code{x}.} \item{x_as, y_as}{Alias to use for \code{x} resp. \code{y}. Defaults to \code{"LHS"} resp. \code{"RHS"}} From 0ac15d1c561f7a33bb9836cf49e69bee1ccfc970 Mon Sep 17 00:00:00 2001 From: Maximilian Girlich Date: Tue, 31 Jan 2023 09:20:38 +0000 Subject: [PATCH 2/3] Check for empty dots in `*_join()` --- R/verb-joins.R | 82 ++++++++++++++++++++++++++++---------------------- 1 file changed, 46 insertions(+), 36 deletions(-) diff --git a/R/verb-joins.R b/R/verb-joins.R index fe702036b..8f4492cd5 100644 --- a/R/verb-joins.R +++ b/R/verb-joins.R @@ -80,17 +80,17 @@ inner_join.tbl_lazy <- function(x, y_as = NULL) { check_dots_empty() x$lazy_query <- add_join( - x, y, - "inner", + x, + y, + type = "inner", by = by, - sql_on = sql_on, copy = copy, suffix = suffix, - auto_index = auto_index, na_matches = na_matches, + sql_on = sql_on, + auto_index = auto_index, x_as = x_as, - y_as = y_as, - ... + y_as = y_as ) x @@ -110,18 +110,19 @@ left_join.tbl_lazy <- function(x, auto_index = FALSE, x_as = NULL, y_as = NULL) { + check_dots_empty() x$lazy_query <- add_join( - x, y, - "left", + x, + y, + type = "left", by = by, - sql_on = sql_on, copy = copy, suffix = suffix, - auto_index = auto_index, na_matches = na_matches, + sql_on = sql_on, + auto_index = auto_index, x_as = x_as, - y_as = y_as, - ... + y_as = y_as ) x @@ -141,18 +142,19 @@ right_join.tbl_lazy <- function(x, auto_index = FALSE, x_as = NULL, y_as = NULL) { + check_dots_empty() x$lazy_query <- add_join( - x, y, - "right", + x, + y, + type = "right", by = by, - sql_on = sql_on, copy = copy, suffix = suffix, - auto_index = auto_index, na_matches = na_matches, + sql_on = sql_on, + auto_index = auto_index, x_as = x_as, - y_as = y_as, - ... + y_as = y_as ) x @@ -172,18 +174,19 @@ full_join.tbl_lazy <- function(x, auto_index = FALSE, x_as = NULL, y_as = NULL) { + check_dots_empty() x$lazy_query <- add_join( - x, y, - "full", + x, + y, + type = "full", by = by, - sql_on = sql_on, copy = copy, suffix = suffix, - auto_index = auto_index, na_matches = na_matches, + sql_on = sql_on, + auto_index = auto_index, x_as = x_as, - y_as = y_as, - ... + y_as = y_as ) x @@ -202,17 +205,18 @@ semi_join.tbl_lazy <- function(x, auto_index = FALSE, x_as = NULL, y_as = NULL) { + check_dots_empty() x$lazy_query <- add_semi_join( - x, y, + x, + y, anti = FALSE, by = by, - sql_on = sql_on, copy = copy, - auto_index = auto_index, na_matches = na_matches, + sql_on = sql_on, + auto_index = auto_index, x_as = x_as, - y_as = y_as, - ... + y_as = y_as ) x @@ -231,27 +235,33 @@ anti_join.tbl_lazy <- function(x, auto_index = FALSE, x_as = NULL, y_as = NULL) { + check_dots_empty() x$lazy_query <- add_semi_join( - x, y, + x, + y, anti = TRUE, by = by, - sql_on = sql_on, copy = copy, - auto_index = auto_index, na_matches = na_matches, + sql_on = sql_on, + auto_index = auto_index, x_as = x_as, - y_as = y_as, - ... + y_as = y_as ) x } -add_join <- function(x, y, type, by = NULL, sql_on = NULL, copy = FALSE, +add_join <- function(x, + y, + type, + by = NULL, + copy = FALSE, suffix = NULL, - auto_index = FALSE, na_matches = "never", + sql_on = NULL, + auto_index = FALSE, x_as = NULL, y_as = NULL, call = caller_env()) { From 35275528311b68219d403cc257a6f7f48f7ba18d Mon Sep 17 00:00:00 2001 From: Maximilian Girlich Date: Tue, 31 Jan 2023 09:22:58 +0000 Subject: [PATCH 3/3] Add issue number --- NEWS.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/NEWS.md b/NEWS.md index ff3fc6a0d..9d6fb691e 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,6 +1,6 @@ # dbplyr (development version) -* Moved argument `auto_index` after `...` in `*_join()` (@mgirlich). +* Moved argument `auto_index` after `...` in `*_join()` (@mgirlich, #1115). * Removed dependency on assertthat (@mgirlich, #1112).