Merge branch 'non-equi-joins'

* non-equi-joins: non-equi joins update to NEWS. #1452. Patching another issue spotted by Jan. Thanks! Update ?data.table with current non-equi join functionality. Limit number of combinations for tests to max of 100. Closes #1257, on=.() syntax is now possible. Added test for join on char type with op other than '=='. Allow only '==' operator for joins on char type. Free allocated variable. Fix for the issue @jan spotted. Added tests. Thanks Jan. Finally, non-equi joins NAs/NaNs correctly in all cases, hopefully. Added a note to self comment to nestedid. Minor: fix code spacing. Adding tests for non-equi joins only for non-NA/NaN cases. Fixing logic for NAs in i. Various improvements and fixes to nestedid. better logic fixes edge cases, also removes for-loop = ~3x faster Just fixing indentation and minor code cleanup. No implementations. thinko! should be seq_len, not seq_along. First stab at non-equi joins
Rdatatable · Apr 6, 2016 · adbf620 · adbf620
2 parents a781d0b + 55a7b55
commit adbf620
Show file tree

Hide file tree

Showing 10 changed files with 561 additions and 129 deletions.
diff --git a/R/bmerge.R b/R/bmerge.R
@@ -1,5 +1,5 @@
 
-bmerge <- function(i, x, leftcols, rightcols, io, xo, roll, rollends, nomatch, verbose)
+bmerge <- function(i, x, leftcols, rightcols, io, xo, roll, rollends, nomatch, ops, nqgrp, nqmaxgrp, verbose)
 {
     # TO DO: rename leftcols to icols, rightcols to xcols
     # NB: io is currently just TRUE or FALSE for whether i is keyed
@@ -89,15 +89,8 @@ bmerge <- function(i, x, leftcols, rightcols, io, xo, roll, rollends, nomatch, v
             set(i, j=lc, value=newval)
         }
     }
-
-    # Now that R doesn't copy named inputs to list(), we can return these as a list()
-    # TO DO: could be allocated inside Cbmerge and returned as list from that
-    f__ = integer(nrow(i))
-    len__ = integer(nrow(i))
-    allLen1 = logical(1)
-
     if (verbose) {last.started.at=proc.time()[3];cat("Starting bmerge ...");flush.console()}
-    .Call(Cbmerge, i, x, as.integer(leftcols), as.integer(rightcols), io<-haskey(i), xo, roll, rollends, nomatch, f__, len__, allLen1)
+    ans = .Call(Cbmerge, i, x, as.integer(leftcols), as.integer(rightcols), io<-haskey(i), xo, roll, rollends, nomatch, ops, nqgrp, nqmaxgrp)
     # NB: io<-haskey(i) necessary for test 579 where the := above change the factor to character and remove i's key
     if (verbose) {cat("done in",round(proc.time()[3]-last.started.at,3),"secs\n");flush.console()}
 
@@ -109,7 +102,7 @@ bmerge <- function(i, x, leftcols, rightcols, io, xo, roll, rollends, nomatch, v
         if (haskey(origi))
             setattr(i, 'sorted', key(origi))
     }    
-    return(list(starts=f__, lens=len__, allLen1=allLen1))
+    return(ans)
 }
 
 
diff --git a/R/data.table.R b/R/data.table.R
@@ -461,6 +461,8 @@ chmatch2 <- function(x, table, nomatch=NA_integer_) {
         xo = NULL
         isub = substitute(i)
         isnull_inames = FALSE
+        nqgrp = integer(0)  # for non-equi join
+        nqmaxgrp = 1L       # for non-equi join
         # Fixes 4994: a case where quoted expression with a "!", ex: expr = quote(!dt1); dt[eval(expr)] requires 
         # the "eval" to be checked before `as.name("!")`. Therefore interchanged.
         restore.N = remove.N = FALSE
@@ -554,7 +556,7 @@ chmatch2 <- function(x, table, nomatch=NA_integer_) {
                 i = as.data.table( unique(RHS) )
                 # To do: wrap isub[[3L]] with as.data.table() first before eval to save copy
                 leftcols = 1L
-                ans = bmerge(i, x, leftcols, rightcols, io<-FALSE, xo, roll=0.0, rollends=c(FALSE,FALSE), nomatch=0L, verbose=verbose)
+                ans = bmerge(i, x, leftcols, rightcols, io<-FALSE, xo, roll=0.0, rollends=c(FALSE,FALSE), nomatch=0L, 1L, nqgrp, nqmaxgrp, verbose=verbose)
                 # No need to shallow copy i before passing to bmerge; we just created i above ourselves
                 i = if (ans$allLen1 && !identical(suppressWarnings(min(ans$starts)), 0L)) ans$starts else vecseq(ans$starts, ans$lens, NULL)
                 if (length(xo)) i = fsort(xo[i]) else i = fsort(i) # fix for #1495
@@ -581,10 +583,10 @@ chmatch2 <- function(x, table, nomatch=NA_integer_) {
             }
         }
         if (is.null(i)) return( null.data.table() )
-	if (is.character(i)) {
-	    isnull_inames = TRUE
-	    i = data.table(V1=i)   # for user convenience; e.g. DT["foo"] without needing DT[.("foo")]
-	} else if (identical(class(i),"list") && length(i)==1L && is.data.frame(i[[1L]])) i = as.data.table(i[[1L]])
+        if (is.character(i)) {
+            isnull_inames = TRUE
+            i = data.table(V1=i)   # for user convenience; e.g. DT["foo"] without needing DT[.("foo")]
+        } else if (identical(class(i),"list") && length(i)==1L && is.data.frame(i[[1L]])) i = as.data.table(i[[1L]])
         else if (identical(class(i),"data.frame")) i = as.data.table(i)   # TO DO: avoid these as.data.table() and use a flag instead
         else if (identical(class(i),"list")) {
             isnull_inames = is.null(names(i))
@@ -595,36 +597,75 @@ chmatch2 <- function(x, table, nomatch=NA_integer_) {
                 stop("When i is a data.table (or character vector), the columns to join by must be specified either using 'on=' argument (see ?data.table) or by keying x (i.e. sorted, and, marked as sorted, see ?setkey). Keyed joins might have further speed benefits on very large data due to x being sorted in RAM.")
             }
             if (!missing(on)) {
-                if (!is.character(on))
-                    stop("'on' argument should be a named atomic vector oc column names indicating which columns in 'i' should be joined with which columns in 'x'.")
-                if (is.null(names(on))) {
-                    if (verbose)
-                        cat("names(on) = NULL. Assigning 'on' to names(on)' as well.\n")
-                    on_names = on
-                    # facilitating adhoc joins without having to name them, #1375
-                    if (isnull_inames) on = paste("V", seq_along(on), sep="")
-                    names(on) = on_names
-                } else {
-                    if (length(empty_names <- which(names(on) == ""))) 
-                        names(on)[empty_names] = on[empty_names]
+                # on = .() is now possible, #1257
+                parse_on <- function(onsub) {
+                    ops = c("==", "<=", "<", ">=", ">", "!=")
+                    pat = paste("(", ops, ")", sep = "", collapse = "|")
+                    if (is.call(onsub) && onsub[[1L]] == "eval") {
+                        onsub = eval(onsub[[2L]], parent.frame(2L), parent.frame(2L))
+                        if (is.call(onsub) && onsub[[1L]] == "eval") onsub = onsub[[2L]]
+                    }
+                    if (is.call(onsub) && as.character(onsub[[1L]]) %in% c("list", ".")) {
+                        spat = paste("[ ]+(", pat, ")[ ]+", sep="")
+                        onsub = lapply(as.list(onsub)[-1L], function(x) gsub(spat, "\\1", deparse(x, width.cutoff=500L)))
+                        onsub = as.call(c(quote(c), onsub))
+                    }
+                    on = eval(onsub, parent.frame(2L), parent.frame(2L))
+                    if (!is.character(on))
+                        stop("'on' argument should be a named atomic vector of column names indicating which columns in 'i' should be joined with which columns in 'x'.")
+                    this_op = regmatches(on, gregexpr(pat, on))
+                    idx = (vapply(this_op, length, 0L) == 0L)
+                    this_op[idx] = "=="
+                    this_op = unlist(this_op, use.names=FALSE)
+                    idx_op = match(this_op, ops, nomatch=0L)
+                    if (any(idx_op %in% c(0L, 6L)))
+                        stop("Invalid operators ", paste(this_op[idx_op==0L], collapse=","), ". Only allowed operators are ", paste(ops[1:5], collapse=""), ".")
+                    if (is.null(names(on))) {
+                        on[idx] = if (isnull_inames) paste(on[idx], paste("V", seq_len(sum(idx)), sep=""), sep="==") else paste(on[idx], on[idx], sep="==")
+                    } else {
+                        on[idx] = paste(names(on)[idx], on[idx], sep="==")
+                    }
+                    split = tstrsplit(on, paste("[ ]*", pat, "[ ]*", sep=""))
+                    on = setattr(split[[2L]], 'names', split[[1L]])
+                    if (length(empty_idx <- which(names(on) == "")))
+                        names(on)[empty_idx] = on[empty_idx]
+                    list(on = on, ops = idx_op)
                 }
+                on_ops = parse_on(substitute(on))
+                on = on_ops[[1L]]
+                ops = on_ops[[2L]]
+                # TODO: collect all '==' ops first to speeden up Cnestedid
                 rightcols = chmatch(names(on), names(x))
                 if (length(nacols <- which(is.na(rightcols))))
                     stop("Column(s) [", paste(names(on)[nacols], collapse=","), "] not found in x")
                 leftcols  = chmatch(unname(on), names(i))
                 if (length(nacols <- which(is.na(leftcols))))
                     stop("Column(s) [", paste(unname(on)[nacols], collapse=","), "] not found in i")
-                # reuse secondary index, #1439
-                if (verbose) cat("Looking for existing (secondary) index... ")
-                xo = attr(attr(x, 'index'), paste("__", names(x)[rightcols], sep="", collapse=""))
-                if (is.null(xo)) {
-                    if (verbose) {
-                        cat("not found.\n")
-                        tt = system.time(xo <- forderv(x, by=rightcols))
-                        cat("forder took", tt["user.self"] + tt["sys.self"], "sec\n")
-                    } else xo = forderv(x, by = rightcols)
+                # figure out the columns on which to compute groups on
+                non_equi = which.first(ops != 1L) # 1 is "==" operator
+                if (!is.na(non_equi)) { # non-equi conditions present.. investigate groups..
+                    nqcols = rightcols[non_equi:length(rightcols)]
+                    nqgrp = .Call(Cnestedid, x, nqcols, forderv(x, nqcols))
+                    if ( (nqmaxgrp <- max(nqgrp)) > 1L) { # got some non-equi join work to do
+                        if ("_nqgrp_" %in% names(x)) stop("Column name '_nqgrp_' is reserved for non-equi joins.")
+                        set(nqx<-shallow(x), j="_nqgrp_", value=nqgrp)
+                        xo = forderv(nqx, c(ncol(nqx), rightcols))
+                    } else nqgrp = integer(0)
+                }
+                if (nqmaxgrp == 1L) { # equi join. Reuse secondary index, #1439
+                    if (verbose) cat("Looking for existing (secondary) index... ")
+                    xo = attr(attr(x, 'index'), paste("__", names(x)[rightcols], sep="", collapse=""))
+                    if (is.null(xo)) {
+                        if (verbose) {
+                            cat("not found.\n")
+                            tt = system.time(xo <- forderv(x, by=rightcols))
+                            cat("forder took", tt["user.self"] + tt["sys.self"], "sec\n")
+                        } else xo = forderv(x, by = rightcols)
+                    } else {
+                        if (verbose) cat("found. Reusing index.\n")
+                    }
                 } else {
-                    if (verbose) cat("found. Reusing index.\n")
+                    if (!missing(by)) stop("by-joins are not yet implemented for multi-group non-equi-joins.")
                 }
             } else if (is.null(xo)) {
                 rightcols = chmatch(key(x),names(x))   # NAs here (i.e. invalid data.table) checked in bmerge()
@@ -642,9 +683,10 @@ chmatch2 <- function(x, table, nomatch=NA_integer_) {
                     txtav = c(names(x)[-rightcols], names(i)[-leftcols])
                     if (missing(j)) j = jsub = as.call(parse(text=paste(".(",paste(txtav, collapse=","),")",sep="")))[[1]]
                 }
+                ops = rep(1L, length(leftcols))
             }
             # Implementation for not-join along with by=.EACHI, #604
-	    if (notjoin && (byjoin || mult != "all")) { # mult != "all" needed for #1571 fix
+            if (notjoin && (byjoin || mult != "all")) { # mult != "all" needed for #1571 fix
                 notjoin = FALSE
                 if (verbose) {last.started.at=proc.time()[3];cat("not-join called with 'by=.EACHI'; Replacing !i with i=setdiff(x,i) ...");flush.console()}
                 orignames = copy(names(i))
@@ -655,10 +697,15 @@ chmatch2 <- function(x, table, nomatch=NA_integer_) {
             }
             io = if (missing(on)) haskey(i) else identical(unname(on), head(key(i), length(on)))
             i = .shallow(i, retain.key = io)
-            ans = bmerge(i, x, leftcols, rightcols, io, xo, roll, rollends, nomatch, verbose=verbose)
+            ans = bmerge(i, x, leftcols, rightcols, io, xo, roll, rollends, nomatch, ops, nqgrp, nqmaxgrp, verbose=verbose)
+            # temp fix for issue spotted by Jan. Ideally would like to avoid this 'setorder', as there's another 
+            # 'setorder' in generating 'irows' below...
+            if (length(ans$indices)) setorder(setDT(ans[1:3]), indices)
+            allLen1 = ans$allLen1
+            allGrp1 = ans$allGrp1
             f__ = ans$starts
             len__ = ans$lens
-            allLen1 = ans$allLen1
+            indices__ = ans$indices
             # length of input nomatch (single 0 or NA) is 1 in both cases.
             # When no match, len__ is 0 for nomatch=0 and 1 for nomatch=NA, so len__ isn't .N
             # If using secondary key of x, f__ will refer to xo
@@ -688,14 +735,20 @@ chmatch2 <- function(x, table, nomatch=NA_integer_) {
                     if (length(irows)) stop("Internal error. irows has length in by=.EACHI")
                 }
             } else {
-        if (!byjoin) { # fix for #1287 and #1271
-            irows = if (mult=="first") f__ else f__+len__-1L
-            if (identical(nomatch,0L)) irows = irows[len__>0L]  # 0s are len 0, so this removes -1 irows
-        } else { if (mult == "last") f__ = f__+len__- 1L } # fix for #1287 and #1271
-        # for test 456, and consistency generally. The if() is for R < 2.15.1 when pmin was enhanced, see v1.8.6.
-        if (length(len__)) len__ = pmin(len__, 1L)
+                if (nqmaxgrp>1L) stop("Non-equi joins don't work with mult='first' and mult='last' yet.")
+                if (!byjoin) { # fix for #1287 and #1271
+                    irows = if (mult=="first") f__ else f__+len__-1L
+                    if (identical(nomatch,0L)) irows = irows[len__>0L]  # 0s are len 0, so this removes -1 irows
+                } else { if (mult == "last") f__ = f__+len__- 1L } # fix for #1287 and #1271
+                # for test 456, and consistency generally. The if() is for R < 2.15.1 when pmin was enhanced, see v1.8.6.
+                if (length(len__)) len__ = pmin(len__, 1L)
+            }
+            if (length(xo) && length(irows)) {
+                irows = xo[irows]   # TO DO: fsort here?
+                if (mult=="all" && nqmaxgrp>1L && length(xo)) {
+                    irows = setorder(setDT(list(indices=rep.int(indices__, len__), irows=irows)))$irows
+                }
             }
-            if (length(xo) && length(irows)) irows = xo[irows]   # TO DO: fsort here?
         } else {
             if (!missing(on)) {
                 stop("logical error. i is not a data.table, but 'on' argument is provided.")
@@ -705,8 +758,8 @@ chmatch2 <- function(x, table, nomatch=NA_integer_) {
             if (!is.logical(i) && !is.numeric(i)) stop("i has not evaluated to logical, integer or double")
             if (is.logical(i)) {
                 if (isTRUE(i)) irows = i = NULL  # fixes #1249
-		else if (identical(i, NA)) irows=i=integer(0)  # DT[NA] thread recycling of NA logical exists,
-							   # but for #1252 and consistency, we need to return 0-rows
+                else if (identical(i, NA)) irows=i=integer(0)  # DT[NA] thread recycling of NA logical exists,
+                                # but for #1252 and consistency, we need to return 0-rows
                 else if (length(i)==nrow(x)) irows = i = which(i)   # e.g. DT[colA>3,which=TRUE]
                                                                # also replacing 'i' here - to save memory, #926.
                 else irows=seq_len(nrow(x))[i]  # e.g. recycling DT[c(TRUE,FALSE),which=TRUE], for completeness 
@@ -1229,15 +1282,15 @@ chmatch2 <- function(x, table, nomatch=NA_integer_) {
             # TO DO: port more of this to C
             ans = vector("list", length(ansvars))
             if (length(i) && length(icols)) {
-                if (allLen1 && (is.na(nomatch) || !any(f__==0L))) {   # nomatch=0 should drop rows in i that have no match
+                if (allLen1 && nqmaxgrp==1L && (is.na(nomatch) || !any(f__==0L))) {   # nomatch=0 should drop rows in i that have no match
                     for (s in seq_along(icols)) {
                         target = icolsAns[s]
                         source = icols[s]
                         ans[[target]] = i[[source]]
                         if (address(ans[[target]]) == address(i[[source]])) ans[[target]] = copy(ans[[target]])
                     }
                 } else {
-                    ii = rep.int(seq_len(nrow(i)),len__)
+		    ii = rep.int(if(nqmaxgrp==1L) seq_len(nrow(i)) else indices__, len__)
                     for (s in seq_along(icols)) {
                         target = icolsAns[s]
                         source = icols[s]
@@ -2702,12 +2755,6 @@ gsd <- function(x, na.rm=FALSE) .Call(Cgsd, x, na.rm)
 gstart <- function(o, f, l, rows) .Call(Cgstart, o, f, l, rows)
 gend <- function() .Call(Cgend)
 
-# rowwise summary functions
-rowmeans <- function(x, na.rm=FALSE) .Call("Crowmeans", x, na.rm)
-rowsums <- function(x, na.rm=FALSE) .Call("Crowsums", x, na.rm)
-rowmins <- function(x, na.rm=FALSE) .Call("Crowmins", x, na.rm)
-rowmaxs <- function(x, na.rm=FALSE) .Call("Crowmaxs", x, na.rm)
-
 isReallyReal <- function(x) {
     .Call(CisReallyReal, x)
 }
diff --git a/R/foverlaps.R b/R/foverlaps.R
@@ -113,7 +113,7 @@ foverlaps <- function(x, y, by.x = if (!is.null(key(x))) key(x) else key(y), by.
     matches <- function(ii, xx, del, ...) {
         cols = setdiff(names(xx), del)
         xx = shallow(xx, cols)
-        ans = bmerge(xx, ii, seq_along(xx), seq_along(xx), haskey(xx), integer(0), verbose=verbose, ...)
+	ans = bmerge(xx, ii, seq_along(xx), seq_along(xx), haskey(xx), integer(0), ops=rep(1L, length(xx)), integer(0), 1L, verbose=verbose, ...)
         # vecseq part should never run here, but still...
         if (ans$allLen1) ans$starts else vecseq(ans$starts, ans$lens, NULL)
     }

diff --git a/README.md b/README.md
@@ -76,6 +76,10 @@
 
   30. x's columns can be referred to in `j` using the prefix `x.` at all times. This is particularly useful when it is necessary to x's column that is *also a join column*. This is a patch addressing [#1615](https://github.com/Rdatatable/data.table/issues/1615).
 
+  31. `on=.()` syntax is now posible, e.g., `X[Y, on=.(x==a, y==b)]`, [#1257](https://github.com/Rdatatable/data.table/issues/1257). Thanks @dselivanov.
+
+  32. Non-equi joins are now possible using the familiar `on=` syntax. With this, the set of binary operators extend from just `==` to `>=`, `>`, `<=`, `<` and `==`. For e.g., `X[Y, on=.(a, b>b)]` looks for `X.a == Y.a` first and within those matching rows for rows where`X.b > Y.b`. Partly addreses [#1452](https://github.com/Rdatatable/data.table/issues/1452).
+
 #### BUG FIXES
 
   1. Now compiles and runs on IBM AIX gcc. Thanks to Vinh Nguyen for investigation and testing, [#1351](https://github.com/Rdatatable/data.table/issues/1351).