Allow x's cols to be referred to using 'x.' prefix, addresses #1615

Rdatatable · Apr 2, 2016 · 620276b · 620276b
1 parent 6dc53e8
commit 620276b
Show file tree

Hide file tree

Showing 3 changed files with 25 additions and 5 deletions.
diff --git a/R/data.table.R b/R/data.table.R
@@ -742,7 +742,7 @@ chmatch2 <- function(x, table, nomatch=NA_integer_) {
     if (missing(j)) {
         # missing(by)==TRUE was already checked above before dealing with i
         if (!length(x)) return(null.data.table())
-        if (!length(leftcols)) {               
+        if (!length(leftcols)) {
             ansvars = names(x)
             jisvars = character()
             xcols = xcolsAns = seq_along(x)
@@ -1036,7 +1036,8 @@ chmatch2 <- function(x, table, nomatch=NA_integer_) {
                     }
                 }
                 # fix for long standing FR/bug, #495 and #484
-                if ( length(othervars <- setdiff(intersect(av, names(x)), c(bynames, ansvars))) ) {
+                allcols = c(names(x), paste("x.",names(x),sep=""), if (is.data.table(i)) c(names(i), paste("i.", names(i), sep="")))
+                if ( length(othervars <- setdiff(intersect(av, allcols), c(bynames, ansvars))) ) {
                     # we've a situation like DT[, c(sum(V1), lapply(.SD, mean)), by=., .SDcols=...] or 
                     # DT[, lapply(.SD, function(x) x *v1), by=, .SDcols=...] etc., 
                     ansvars = union(ansvars, othervars)
@@ -1045,7 +1046,8 @@ chmatch2 <- function(x, table, nomatch=NA_integer_) {
                 # .SDcols might include grouping columns if users wants that, but normally we expect user not to include them in .SDcols
             } else {
                 if (!missing(.SDcols)) warning("This j doesn't use .SD but .SDcols has been supplied. Ignoring .SDcols. See ?data.table.")
-                ansvars = setdiff(intersect(av,c(names(x),names(i),paste("i.",names(i),sep=""))), bynames)
+                allcols = c(names(x), paste("x.",names(x),sep=""), if (is.data.table(i)) c(names(i), paste("i.", names(i), sep="")))
+                ansvars = setdiff(intersect(av,allcols), bynames)
                 if (verbose) cat("Detected that j uses these columns:",if (!length(ansvars)) "<none>" else paste(ansvars,collapse=","),"\n")
                 # using a few named columns will be faster
                 # Consider:   DT[,max(diff(date)),by=list(month=month(date))]
@@ -1061,7 +1063,8 @@ chmatch2 <- function(x, table, nomatch=NA_integer_) {
                     # get('varname') is too difficult to detect which columns are used in general
                     # eval(macro) column names are detected via the  if jsub[[1]]==eval switch earlier above.
                 }
-                ansvars = setdiff(c(names(x), if (is.data.table(i)) c(names(i), paste("i.", names(i), sep=""))),bynames) # fix for bug #5443
+                allcols = c(names(x), paste("x.",names(x),sep=""), if (is.data.table(i)) c(names(i), paste("i.", names(i), sep="")))
+                ansvars = setdiff(allcols,bynames) # fix for bug #5443
                 ansvals = chmatch(ansvars, names(x))
                 if (verbose) cat("New:",paste(ansvars,collapse=","),"\n")
             }
@@ -1175,7 +1178,15 @@ chmatch2 <- function(x, table, nomatch=NA_integer_) {
 
         if (length(ansvars)) {
             w = ansvals
-            if (length(rightcols) && missing(by)) w[ w %in% rightcols ] = NA
+            if (length(rightcols) && missing(by)) {
+                w[ w %in% rightcols ] = NA
+            }
+            # patch for #1615. Allow 'x.' syntax. Only useful during join op when x's join col needs to be used.
+            # Note that I specifically have not implemented x[y, aa, on=c(aa="bb")] to refer to x's join column 
+            # as well because x[i, col] == x[i][, col] will not be TRUE anymore..
+            xjoincols = paste("x.",names(x),sep="")
+            if ( any(xjoinvals <- ansvars %in% xjoincols))
+                w[xjoinvals] = chmatch(ansvars[xjoinvals], xjoincols)
             if (!any(wna <- is.na(w))) {
                 xcols = w
                 xcolsAns = seq_along(ansvars)

diff --git a/README.md b/README.md
@@ -74,6 +74,8 @@
 
   29. New `split` method for data.table. Faster, more flexible and consistent with data.frame method. Closes [#1389](https://github.com/Rdatatable/data.table/issues/1389).
 
+  30. x's columns can be referred to in `j` using the prefix `x.` at all times. This is particularly useful when it is necessary to refer to x's column that is *also a join column*. This is a patch addressing [#1615](https://github.com/Rdatatable/data.table/issues/1615).
+
 #### BUG FIXES
 
   1. Now compiles and runs on IBM AIX gcc. Thanks to Vinh Nguyen for investigation and testing, [#1351](https://github.com/Rdatatable/data.table/issues/1351).

diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw
@@ -8404,6 +8404,13 @@ test(1639.137, sort.by.names(ans), sort.by.names(unlist(split(setDT(df), by=c("p
 test(1639.138, ans, split(as.data.table(df), by=c("product","year")))
 test(1639.139, sort.by.names(ans), sort.by.names(unlist(split(as.data.table(df), by=c("product","year"), flatten=FALSE), recursive = FALSE)))
 
+# allow x's cols (specifically x's join cols) to be referred to using 'x.' syntax
+# patch for #1615. Note that I specifically have not implemented x[y, aa, on=c(aa="bb")] 
+# to refer to x's join column as well because x[i, col] == x[i][, col] will not be TRUE anymore..
+x <- data.table(aa = 1:3, cc = letters[1:3])
+y <- data.table(bb = 3:5, dd = 3:1)
+test(1640.1, x[y, x.aa, on=c(aa="bb")], INT(3,NA,NA))
+test(1640.2, x[y, c(.SD, .(x.aa=x.aa)), on=c(aa="bb")], data.table(aa=3:5, cc=c("c", NA,NA), x.aa=INT(3,NA,NA)))
 
 ##########################