Better (and consistent) handling of duplicate names during subset. Cl…

…oses #5008 and $5688.
Rdatatable · May 23, 2014 · 9f0a7f8 · 9f0a7f8
1 parent f7ec4fb
commit 9f0a7f8
Show file tree

Hide file tree

Showing 6 changed files with 210 additions and 11 deletions.
diff --git a/NEWS b/NEWS
@@ -183,6 +183,14 @@ BUG FIXES
   o  Fixed a (segfault) issue during grouping with assignment by refernece, ex: `DT[, LHS := RHS, by=.]`, where 
      length(RHS) > group size (.N). Closes #5647. Thanks to Zachary Long for reporting on datatable mailing list.
 
+  o  Consistent subset rules on datat.tables with duplicate columns. In short, if indices are directly provided, 'j', 
+     or in .SDcols, then just those columns are either returned (or deleted if you provide -.SDcols or !j). If 
+     instead, column names are given and there are more than one occurrence of that column, then it's hard to 
+     decide which to keep and which to remove on a subset. Therefore, to remove, all occurrences of that column are 
+     removed, and to keep, always the first column is returned each time. Also closes #5688 and #5008.
+     Note that using 'by' to aggregate on duplicate columns may not give intended result still, as it may not operate 
+     on the proper column.
+
 NOTES
 
   o  Reminder: using rolltolast still works but since v1.9.2 now issues the following warning :

diff --git a/R/data.table.R b/R/data.table.R
@@ -312,6 +312,17 @@ data.table = function(..., keep.rownames=FALSE, check.names=FALSE, key=NULL)
     x
 }
 
+# A (relatively) fast (uses DT grouping) wrapper for matching two vectors, BUT:
+# it behaves like 'pmatch' but only the 'exact' matching part. That is, a value in 
+# 'x' is matched to 'table' only once. No index will be present more than once. 
+# This should make it even clearer:
+# chmatch2(c("a", "a"), c("a", "a")) # 1,2 - the second 'a' in 'x' has a 2nd match in 'table'
+# chmatch2(c("a", "a"), c("a", "b")) # 1,NA - the second one doesn't 'see' the first 'a'
+# chmatch2(c("a", "a"), c("a", "a.1")) # 1,NA - this is where it differs from pmatch - we don't need the partial match.
+chmatch2 <- function(x, table, nomatch=NA_integer_) {
+    .Call(Cchmatch2, x, table, as.integer(nomatch)) # this is in 'rbindlist.c' for now.
+}
+
 "[.data.table" = function (x, i, j, by, keyby, with=TRUE, nomatch=getOption("datatable.nomatch"), mult="all", roll=FALSE, rollends=if (roll=="nearest") c(TRUE,TRUE) else if (roll>=0) c(FALSE,TRUE) else c(TRUE,FALSE), which=FALSE, .SDcols, verbose=getOption("datatable.verbose"), allow.cartesian=getOption("datatable.allow.cartesian"), drop=NULL, rolltolast=FALSE)
 {
     # ..selfcount <<- ..selfcount+1  # in dev, we check no self calls, each of which doubles overhead, or could
@@ -371,7 +382,17 @@ data.table = function(..., keep.rownames=FALSE, check.names=FALSE, key=NULL)
     irows = NULL  # Meaning all rows. We avoid creating 1:nrow(x) for efficiency.
     notjoin = FALSE
     rightcols = leftcols = integer(0)
-
+
+    # To take care of duplicate column names properly (see chmatch2 function above `[data.table`) for description
+    dupmatch <- function(x, y, ...) {
+        if (anyDuplicated(x))
+            pmax(chmatch(x,y, ...), chmatch2(x,y,0L))
+        else chmatch(x,y)
+    }
+
+    # setdiff removes duplicate entries, which'll create issues with duplicated names. Use '%chin% instead.
+    dupdiff <- function(x, y) x[!x %chin% y]
+
     if (!missing(i)) {
         isub = substitute(i)
         # Fixes 4994: a case where quoted expression with a "!", ex: expr = quote(!dt1); dt[eval(expr)] requires 
@@ -535,7 +556,6 @@ data.table = function(..., keep.rownames=FALSE, check.names=FALSE, key=NULL)
     byval = NULL
     xnrow = nrow(x)
     xcols = xcolsAns = icols = icolsAns = integer()
-
     if (missing(j)) {
         # missing(by)==TRUE was already checked above before dealing with i
         if (!length(x)) return(null.data.table())
@@ -552,6 +572,7 @@ data.table = function(..., keep.rownames=FALSE, check.names=FALSE, key=NULL)
             icolsAns = c(rightcols, seq.int(ncol(x)+1L, length.out=ncol(i)-length(leftcols)))
             xcols = xcolsAns = seq_along(x)[-rightcols]
         }
+        ansvals = chmatch(ansvars, names(x))
     } else {
         jsub = substitute(j)
         # deconstruct and eval everything with just one argument, then reconstruct back to a call
@@ -585,16 +606,25 @@ data.table = function(..., keep.rownames=FALSE, check.names=FALSE, key=NULL)
                     if (any(is.na(w))) {
                         warning("column(s) not removed because not found: ",paste(j[is.na(w)],collapse=","))
                         w = w[!is.na(w)]
-                    } 
-                    ansvars = if (length(w)) names(x)[-w] else names(x)
+                    }
+                    # changed names(x)[-w] to use 'setdiff'. Here, all instances of the column must be removed.
+                    # Ex: DT <- data.table(x=1, y=2, x=3); DT[, !"x", with=FALSE] should just output 'y'.
+                    # But keep 'dup cols' beause it's basically DT[, !names(DT) %chin% "x", with=FALSE] which'll subset all cols not 'x'.
+                    ansvars = if (length(w)) dupdiff(names(x), names(x)[w]) else names(x)
+                    ansvals = dupmatch(ansvars, names(x))
                 } else {
+                    # once again, use 'setdiff'. Basically, unless indices are specified in `j`, we shouldn't care about duplicated columns.
                     ansvars = j   # x. and i. prefixes may be in here, and they'll be dealt with below 
+                    # dups = FALSE here.. even if DT[, c("x", "x"), with=FALSE], we subset only the first.. No way to tell which one the OP wants without index.
+                    ansvals = chmatch(ansvars, names(x))
                 }
             } else if (is.numeric(j)) {
                 if (any(abs(j) > ncol(x) | j==0L)) stop("j out of bounds")
                 if (any(j<0L) && any(j>0L)) stop("j mixes positive and negative")
                 if (any(j<0L)) j = seq_len(ncol(x))[j]
                 ansvars = names(x)[ if (notj) -j else j ]  # DT[,!"columntoexclude",with=FALSE], if a copy is needed, rather than :=NULL
+                # DT[, c(1,3), with=FALSE] should clearly provide both 'x' columns
+                ansvals = if (notj) setdiff(seq_along(x), as.integer(j)) else as.integer(j)
             }
 
         } else {   # with=TRUE and byjoin could be TRUE
@@ -742,11 +772,15 @@ data.table = function(..., keep.rownames=FALSE, check.names=FALSE, key=NULL)
             av = all.vars(jsub,TRUE)  # TRUE fixes bug #1294 which didn't see b in j=fns[[b]](c)
             if (".SD" %chin% av) {
                 if (missing(.SDcols)) {
-                    ansvars = setdiff(names(x),union(bynames,allbyvars))   # TO DO: allbyvars here for vars used by 'by'. Document.
+                    # here we need to use 'dupdiff' instead of 'setdiff'. Ex: setdiff(c("x", "x"), NULL) will give 'x'.
+                    ansvars = dupdiff(names(x),union(bynames,allbyvars))   # TO DO: allbyvars here for vars used by 'by'. Document.
                     # just using .SD in j triggers all non-by columns in the subset even if some of
                     # those columns are not used. It would be tricky to detect whether the j expression
                     # really does use all of the .SD columns or not, hence .SDcols for grouping
                     # over a subset of columns
+
+                    # all duplicate columns must be matched, because nothing is provided
+                    ansvals = dupmatch(ansvars, names(x))
                 } else {
                     # FR #4979 - negative numeric and character indices for SDcols
                     colsub = substitute(.SDcols)
@@ -755,14 +789,18 @@ data.table = function(..., keep.rownames=FALSE, check.names=FALSE, key=NULL)
                         colm = TRUE
                         .SDcols = eval(colsub[[2L]], parent.frame(), parent.frame())
                     } else colm = FALSE
+                    # if .SDcols is numeric, use 'dupdiff' instead of 'setdiff'
                     if (is.numeric(.SDcols)) {
                         if (length(unique(sign(.SDcols))) != 1L) stop(".SDcols is numeric but has both +ve and -ve indices")
                         if (any(is.na(.SDcols)) || any(abs(.SDcols)>ncol(x)) || any(abs(.SDcols)<1L)) stop(".SDcols is numeric but out of bounds (or NA)")
-                        if (colm) ansvars = setdiff(names(x)[-.SDcols], bynames) else ansvars = names(x)[.SDcols]
+                        if (colm) ansvars = dupdiff(names(x)[-.SDcols], bynames) else ansvars = names(x)[.SDcols]
+                        ansvals = if (colm) setdiff(seq_along(names(x)), c(as.integer(.SDcols), which(names(x) %chin% bynames))) else as.integer(.SDcols)
                     } else {
                         if (!is.character(.SDcols)) stop(".SDcols should be column numbers or names")
                         if (any(is.na(.SDcols)) || any(!.SDcols %chin% names(x))) stop("Some items of .SDcols are not column names (or are NA)")
                         if (colm) ansvars = setdiff(setdiff(names(x), .SDcols), bynames) else ansvars = .SDcols
+                        # dups = FALSE here. DT[, .SD, .SDcols=c("x", "x")] again doesn't really help with which 'x' to keep (and if '-' which x to remove)
+                        ansvals = chmatch(ansvars, names(x))
                     }
                     # .SDcols might include grouping columns if users wants that, but normally we expect user not to include them in .SDcols
                 }
@@ -774,6 +812,7 @@ data.table = function(..., keep.rownames=FALSE, check.names=FALSE, key=NULL)
                 # Consider:   DT[,max(diff(date)),by=list(month=month(date))]
                 # and:        DT[,lapply(.SD,sum),by=month(date)]
                 # We don't want date in .SD in the latter, but we do in the former; hence the union() above.
+                ansvals = chmatch(ansvars, names(x))
             }
             # if (!length(ansvars)) Leave ansvars empty. Important for test 607.
             if ("get" %chin% av) {
@@ -783,6 +822,7 @@ data.table = function(..., keep.rownames=FALSE, check.names=FALSE, key=NULL)
                     # eval(macro) column names are detected via the  if jsub[[1]]==eval switch earlier above.
                 }
                 ansvars = setdiff(c(names(x), if (is.data.table(i)) c(names(i), paste("i.", names(i), sep=""))),bynames) # fix for bug #5443
+                ansvals = chmatch(ansvars, names(x))
                 if (verbose) cat("New:",paste(ansvars,collapse=","),"\n")
             }
 
@@ -887,7 +927,7 @@ data.table = function(..., keep.rownames=FALSE, check.names=FALSE, key=NULL)
         }
 
         if (length(ansvars)) {
-            w = chmatch(ansvars, names(x))
+            w = ansvals
             if (length(rightcols) && missing(by)) w[ w %in% rightcols ] = NA
             if (!any(wna <- is.na(w))) {
                 xcols = w
@@ -1269,8 +1309,8 @@ data.table = function(..., keep.rownames=FALSE, check.names=FALSE, key=NULL)
     if (byjoin) {
         groups = i
         grpcols = leftcols # 'leftcols' are the columns in i involved in the join (either head of key(i) or head along i)
-        jiscols = chmatch(jisvars,names(i))  # integer() if there are no jisvars (usually there aren't, advanced feature)
-        xjiscols = chmatch(xjisvars, names(x))
+        jiscols = dupmatch(jisvars,names(i))  # integer() if there are no jisvars (usually there aren't, advanced feature)
+        xjiscols = dupmatch(xjisvars, names(x))
         SDenv$.xSD = x[min(nrow(i), 1L), xjisvars, with=FALSE]
     } else {
         groups = byval
@@ -2017,7 +2057,7 @@ chgroup = function(x) {
     rbindlist(l, use.names, fill)
 }
 
-rbindlist = function(l, use.names=FALSE, fill=FALSE) {
+rbindlist = function(l, use.names=fill, fill=FALSE) {
     ans = .Call("Crbindlist", l, use.names, fill)
     if (!length(ans)) return(null.data.table())
     setattr(ans,"row.names",.set_row_names(length(ans[[1L]])))

diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw
@@ -4426,6 +4426,91 @@ dt<-data.table(x=c(1:2), y=1:10)
 cp = copy(dt)
 test(1289.2, dt[, z := c(rep(NA, 5),y), by=x], cp[, z := rep(NA_integer_, 10)], warning="RHS 1 is length 10")
 
+########################################
+# Extensve testing for "duplicate" names
+########################################
+# Rules: Basically, if index is directly given in 'j', just those columns are touched/operated on. But if 'column' names are given and there are more than one 
+# occurrence of that column, then it's hard to decide which to keep and which to remove. So, to remove, all are removed, to keep, always the first is kept.
+# 1) when i,j,by are all absent (or) just 'i' is present then ALL duplicate columns are returned.
+# 2) When 'with=FALSE' and 'j' is a character and 'notj' is TRUE, all instances of the column to be removed will be removed.
+# 3) When 'with=FALSE' and 'j' is a character and 'notj' is FALSE, only the first column will be recognised in presence of duplicate columns.
+# 4) When 'with=FALSE' and 'j' is numeric and 'notj' is TRUE, just those indices will be removed.
+# 5) When 'with=FALSE' and 'j' is numeric and 'notj' is FALSE, all columns for indices given, if valid, are returned. (FIXES #5688)
+# 6) When .SD is in 'j', but '.SDcols'  is not present, ALL columns are subset'd - FIXES BUG #5008.
+# 7) When .SD and .SDcols are present and .SDcols is numeric, columns corresponding to the given indices are returned.
+# 8) When .SD and .SDcols are present and .SDcols is character, duplicate column names will only return the first column, each time.
+# 9) When .SD and .SDcols are present and .SDcols is numeric, and it's -SDcols, then just those columns are removed.
+# 10) When .SD and .SDcols are present and .SDcols is character and -SDcols, then all occurrences of that object is removed.
+# 11) When no .SD and no .SDcols and no with=FALSE, only duplicate column names will return only the first column each time.
+# 12) With 'get("col")', it's the same as with all character types.
+# 13) A logical expression in 'j'.
+# 14) Finally, no tests but.. using 'by' with duplicate columns and aggregating may not return the intended result, as it may operate on column names in some cases.
+
+# All points are tested with this example:
+DT  <- data.table(x=1:2, y=3:4, x=5:6, x=7:8, y=9:10, z=11:12)
+DT1 <- data.table(x=1L, y=3L, x=5L, x=7L, y=9L, z=11L)
+DT2 <- data.table(x=2L, y=4L, x=6L, x=8L, y=10L, z=12L)
+ll  <- list(x=1:2, y=3:4, x=5:6, x=7:8, y=9:10, z=11:12)
+
+# case (1)
+test(1290.1, DT[1], DT1)
+test(1290.2, DT[], DT)
+test(1290.3, DT[(TRUE)], DT)
+
+# case (2)
+test(1290.4, DT[, !"x", with=FALSE], as.data.table(ll[c(2,5,6)]))
+test(1290.5, DT[, !"y", with=FALSE], as.data.table(ll[c(1,3,4,6)]))
+test(1290.6, DT[, !c("x", "x"), with=FALSE], as.data.table(ll[c(2,5,6)]))
+test(1290.7, DT[, !c("y", "y"), with=FALSE], as.data.table(ll[c(1,3,4,6)]))
+
+# case (3)
+test(1290.9,  DT[, "x", with=FALSE], as.data.table(ll[1]))
+test(1290.10, DT[, "y", with=FALSE], as.data.table(ll[2]))
+test(1290.11, DT[, c("x", "x"), with=FALSE], as.data.table(ll[c(1,1)]))
+test(1290.12, DT[, c("y", "y"), with=FALSE], as.data.table(ll[c(2,2)]))
+
+# case (4)
+test(1290.13, DT[, !3, with=FALSE], as.data.table(ll[c(1,2,4,5,6)]))
+test(1290.14, DT[, !c(1,1,3,4), with=FALSE], as.data.table(ll[c(2,5,6)]))
+test(1290.15, DT[, !2, with=FALSE], as.data.table(ll[c(1,3,4,5,6)]))
+test(1290.16, DT[, !c(2,5,2), with=FALSE], as.data.table(ll[c(1,3,4,6)]))
+
+# case (5)
+test(1290.17, DT[, 3, with=FALSE], as.data.table(ll[3]))
+test(1290.18, DT[, c(1,1,3,4), with=FALSE], as.data.table(ll[c(1,1,3,4)]))
+test(1290.19, DT[, 2, with=FALSE], as.data.table(ll[2]))
+test(1290.20, DT[, c(2,5,2), with=FALSE], as.data.table(ll[c(2,5,2)]))
+
+# case (6)
+test(1290.21, DT[, .SD], as.data.table(ll))
+test(1290.22, DT[, .SD[1]], DT[1])
+test(1290.23, DT[, .SD[1, !3, with=FALSE]], as.data.table(DT[1, !3, with=FALSE]))
+
+# case (7)
+test(1290.24, DT[, .SD, .SDcols=c(1,1,3,4)], as.data.table(ll[c(1,1,3,4)]))
+
+# case (8)
+test(1290.25, DT[, .SD, .SDcols=c("x", "x", "y")], as.data.table(ll[c(1,1,2)]))
+
+# case (9)
+test(1290.26, DT[, .SD, .SDcols=-c(1,2)], as.data.table(ll[c(-(1:2))]))
+
+# case (10)
+test(1290.27, DT[, .SD, .SDcols=-c("x")], as.data.table(ll[c(2,6)]))
+
+# case (11)
+test(1290.28, DT[, x], ll[[1]])
+test(1290.29, DT[, list(x,x,y,y,y)], as.data.table(ll[c(1,1,2,2,2)]))
+test(1290.30, DT[, list(x,x,y)], as.data.table(ll[c(1,1,2)]))
+
+# cast (12)
+test(1290.31, DT[, get("x")], ll[[1]])
+test(1290.32, DT[, list(get("x"))], setnames(as.data.table(ll[1]), "V1"))
+test(1290.33, DT[, list(get("x"), get("y"))], setnames(as.data.table(ll[1:2]), c("V1", "V2")))
+
+# case (13)
+test(1290.34, DT[, names(DT) == "x", with=FALSE], as.data.table(ll[c(1,3,4)]))
+
 ##########################
 
 # TO DO: Add test for fixed bug #5519 - dcast.data.table returned error when a package imported data.table, but dint happen when "depends" on data.table. This is fixed (commit 1263 v1.9.3), but not sure how to add test.

diff --git a/man/rbindlist.Rd b/man/rbindlist.Rd
@@ -7,7 +7,7 @@
   Same as \code{do.call("rbind", l)} on \code{data.frame}s, but much faster. See \code{DETAILS} for more.
 }
 \usage{
-rbindlist(l, use.names=FALSE, fill=FALSE)
+rbindlist(l, use.names=fill, fill=FALSE)
 rbind(..., use.names=TRUE, fill=FALSE)
 }
 \arguments{

diff --git a/src/init.c b/src/init.c
@@ -45,6 +45,7 @@ SEXP pointWrapper();
 SEXP setNumericRounding();
 SEXP getNumericRounding();
 SEXP binary();
+SEXP chmatch2();
 
 // .Externals
 SEXP fastmean();
@@ -90,6 +91,7 @@ R_CallMethodDef callMethods[] = {
 {"CsetNumericRounding", (DL_FUNC) &setNumericRounding, -1},
 {"CgetNumericRounding", (DL_FUNC) &getNumericRounding, -1},
 {"Cbinary", (DL_FUNC) &binary, -1},
+{"Cchmatch2", (DL_FUNC) &chmatch2, -1},
 {NULL, NULL, 0}
 };
 

diff --git a/src/rbindlist.c b/src/rbindlist.c
@@ -739,3 +739,67 @@ SEXP rbindlist(SEXP l, SEXP sexp_usenames, SEXP sexp_fill) {
     UNPROTECT(protecti);
     return(ans);
 }
+
+// A (relatively) fast (uses DT grouping) wrapper for matching two vectors, BUT:
+// it behaves like 'pmatch' but only the 'exact' matching part. That is, a value in 
+// 'x' is matched to 'table' only once. No index will be present more than once. 
+// This should make it even clearer:
+// chmatch2(c("a", "a"), c("a", "a")) # 1,2 - the second 'a' in 'x' has a 2nd match in 'table'
+// chmatch2(c("a", "a"), c("a", "b")) # 1,NA - the second one doesn't 'see' the first 'a'
+// chmatch2(c("a", "a"), c("a", "a.1")) # 1,NA - this is where it differs from pmatch - we don't need the partial match.
+SEXP chmatch2(SEXP x, SEXP table, SEXP nomatch) {
+
+    R_len_t i, j, k, nx, li, si, oi;
+    SEXP dt, l, ans, order, start, lens, grpid, index;
+    if (TYPEOF(x) != STRSXP) error("'x' must be a character vector");
+    if (TYPEOF(table) != STRSXP) error("'table' must be a character vector");
+    if (TYPEOF(nomatch) != INTSXP || length(nomatch) != 1) error("'nomatch' must be an integer of length 1");
+    if (!length(x)) return(allocVector(INTSXP, 0));
+    nx=length(x);
+    if (!length(table)) {
+        ans = PROTECT(allocVector(INTSXP, nx));
+        for (i=0; i<nx; i++) INTEGER(ans)[i] = INTEGER(nomatch)[0];
+        UNPROTECT(1);
+        return(ans);
+    }
+    // Done with special cases. On to the real deal.
+    l = PROTECT(allocVector(VECSXP, 2));
+    SET_VECTOR_ELT(l, 0, x);
+    SET_VECTOR_ELT(l, 1, table);
+
+    UNPROTECT(1); // l
+    dt = PROTECT(unlist2(l));
+
+    // order - first time
+    order = PROTECT(fast_order(dt, 2));
+    start = PROTECT(getAttrib(order, mkString("starts")));
+    lens  = PROTECT(uniq_lengths(start, length(order))); // length(order) = nrow(dt)
+    grpid = VECTOR_ELT(dt, 1); // dt[2] is unused here.
+    index = VECTOR_ELT(dt, 2);
+
+    // replace dt[1], we don't need it anymore
+    k=0;
+    for (i=0; i<length(lens); i++) {
+        for (j=0; j<INTEGER(lens)[i]; j++) {
+            INTEGER(grpid)[INTEGER(order)[k+j]-1] = j;
+        }
+        k += j;
+    }
+    // order - again
+    UNPROTECT(3); // order, start, lens
+    order = PROTECT(fast_order(dt, 2)); 
+    start = PROTECT(getAttrib(order, mkString("starts")));
+    lens  = PROTECT(uniq_lengths(start, length(order)));
+
+    ans = PROTECT(allocVector(INTSXP, nx));
+    k = 0;
+    for (i=0; i<length(lens); i++) {
+        li = INTEGER(lens)[i];
+        si = INTEGER(start)[i]-1;
+        oi = INTEGER(order)[si]-1;
+        if (oi > nx-1) continue;
+        INTEGER(ans)[oi] = (li == 2) ? INTEGER(index)[INTEGER(order)[si+1]-1]+1 : INTEGER(nomatch)[0];
+    }
+    UNPROTECT(5); // order, start, lens, ans
+    return(ans);
+}