Skip to content

Commit

Permalink
Merge pull request #45 from christophsax/join-using-merge
Browse files Browse the repository at this point in the history
Join using merge.data.table()
  • Loading branch information
krlmlr authored May 11, 2017
2 parents e15fc7d + 4fe5a97 commit 6c81a9a
Show file tree
Hide file tree
Showing 6 changed files with 80 additions and 40 deletions.
5 changes: 3 additions & 2 deletions DESCRIPTION
Original file line number Diff line number Diff line change
Expand Up @@ -11,8 +11,9 @@ Description: This implements the data table back-end for 'dplyr' so that you
License: GPL (>= 2)
Imports:
dplyr (>= 0.5.0),
data.table,
lazyeval
data.table (>= 1.9.6),
lazyeval,
rlang
Suggests:
Lahman,
nycflights13,
Expand Down
1 change: 1 addition & 0 deletions NAMESPACE
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@ export(grouped_dt)
export(is.grouped_dt)
export(src_dt)
export(tbl_dt)
import(rlang)
importFrom(data.table,as.data.table)
importFrom(dplyr,arrange_)
importFrom(dplyr,as.tbl)
Expand Down
6 changes: 6 additions & 0 deletions NEWS.md
Original file line number Diff line number Diff line change
@@ -1,3 +1,9 @@
# dtplyr 0.0.2.9000

- joins use extended `merge.data.table` and the `on=` argument, introduced in
data.table 1.9.6. Avoids copy and allows joins by different keys (#20, #21).


# dtplyr 0.0.2

- This is a compatibility release. It makes dtplyr compatible with
Expand Down
79 changes: 44 additions & 35 deletions R/joins.R
Original file line number Diff line number Diff line change
Expand Up @@ -31,49 +31,58 @@
#' @name join.tbl_dt
NULL

join_dt <- function(op) {
# nocov start
template <- substitute(function(x, y, by = NULL, copy = FALSE, ...) {
by <- dplyr::common_by(by, x, y)
if (!identical(by$x, by$y)) {
stop("Data table joins must be on same key", call. = FALSE)
}
y <- dplyr::auto_copy(x, y, copy = copy)

x <- data.table::copy(x)
y <- data.table::copy(y)
data.table::setkeyv(x, by$x)
data.table::setkeyv(y, by$x)
out <- op
grouped_dt(out, groups(x))
})

f <- eval(template, parent.frame())
attr(f, "srcref") <- NULL # fix so prints correctly
f
# nocov end
}

#' @rdname join.tbl_dt
inner_join.data.table <- join_dt({merge(x, y, by = by$x, allow.cartesian = TRUE)})
inner_join.data.table <- function(x, y, by = NULL, copy = FALSE, ...){
by <- dplyr::common_by(by, x, y)
y <- dplyr::auto_copy(x, y, copy = copy)
out <- merge(x, y, by.x = by$x, by.y = by$y, all = FALSE, allow.cartesian = TRUE)
grouped_dt(out, groups(x))
}

#' @rdname join.tbl_dt
left_join.data.table <- join_dt({merge(x, y, by = by$x, all.x = TRUE, allow.cartesian = TRUE)})
left_join.data.table <- function(x, y, by = NULL, copy = FALSE, ...){
by <- dplyr::common_by(by, x, y)
y <- dplyr::auto_copy(x, y, copy = copy)
out <- merge(x, y, by.x = by$x, by.y = by$y, all.x = TRUE, allow.cartesian = TRUE)
grouped_dt(out, groups(x))
}

#' @rdname join.tbl_dt
right_join.data.table <- join_dt(merge(x, y, by = by$x, all.y = TRUE, allow.cartesian = TRUE))
right_join.data.table <- function(x, y, by = NULL, copy = FALSE, ...){
by <- dplyr::common_by(by, x, y)
y <- dplyr::auto_copy(x, y, copy = copy)
out <- merge(x, y, by.x = by$x, by.y = by$y, all.y = TRUE, allow.cartesian = TRUE)
grouped_dt(out, groups(x))
}

#' @rdname join.tbl_dt
semi_join.data.table <- join_dt({
# http://stackoverflow.com/questions/18969420/perform-a-semi-join-with-data-table
w <- unique(x[y, which = TRUE, allow.cartesian = TRUE])
w <- w[!is.na(w)]
x[w]
})
full_join.data.table <- function(x, y, by = NULL, copy = FALSE, ...){
by <- dplyr::common_by(by, x, y)
y <- dplyr::auto_copy(x, y, copy = copy)
out <- merge(x, y, by.x = by$x, by.y = by$y, all = TRUE, allow.cartesian = TRUE)
grouped_dt(out, groups(x))
}

#' @rdname join.tbl_dt
anti_join.data.table <- join_dt({x[!y, allow.cartesian = TRUE]})
#' @import rlang
semi_join.data.table <- function(x, y, by = NULL, copy = FALSE, ...) {
by <- dplyr::common_by(by, x, y)
y <- dplyr::auto_copy(x, y, copy = copy)
on <- set_names(by$y, by$x)
y_trimmed <- y[, by$y, with = FALSE]
w <- x[y_trimmed, which = TRUE, on = on, nomatch = 0L]
out <- x[sort(unique(w))]
grouped_dt(out, groups(x))
}

#' @rdname join.tbl_dt
# http://stackoverflow.com/a/15170956/946850
full_join.data.table <- join_dt({merge(x, y, by = by$x, all = TRUE, allow.cartesian = TRUE)})
#' @import rlang
anti_join.data.table <- function(x, y, by = NULL, copy = FALSE, ...) {
by <- dplyr::common_by(by, x, y)
y <- dplyr::auto_copy(x, y, copy = copy)
on <- set_names(by$y, by$x)
y_trimmed <- y[, by$y, with = FALSE]
w <- x[!y_trimmed, which = TRUE, on = on]
out <- x[sort(unique(w))]
grouped_dt(out, groups(x))
}
6 changes: 3 additions & 3 deletions man/join.tbl_dt.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

23 changes: 23 additions & 0 deletions tests/testthat/test-joins.R
Original file line number Diff line number Diff line change
Expand Up @@ -28,3 +28,26 @@ test_that("joining data tables returns data tables (#470) and does not modify th
}
}
})


test_that("joining data tables returns same result as dplyr", {
a_dt <- data.table(x = c(1, 1, 2, 3), y = 4:1)
b_dt <- data.table(x = c(1, 2, 2, 4), z = 1:4)

a_df <- as_data_frame(a_dt)
b_df <- as_data_frame(b_dt)

test_join <- function(join_fun) {
out <- join_fun(a_dt, b_dt, "x")
out_dplyr <- tbl_dt(join_fun(a_df, b_df, "x"))
expect_equal(out, out_dplyr)
}

test_join(left_join)
test_join(semi_join)
test_join(right_join)
test_join(full_join)
test_join(inner_join)
test_join(anti_join)
})

0 comments on commit 6c81a9a

Please sign in to comment.