Skip to content

Commit

Permalink
Better (and consistent) handling of duplicate names during subset. Cl…
Browse files Browse the repository at this point in the history
…oses #5008 and $5688.
  • Loading branch information
arunsrinivasan committed May 23, 2014
1 parent f7ec4fb commit 9f0a7f8
Show file tree
Hide file tree
Showing 6 changed files with 210 additions and 11 deletions.
8 changes: 8 additions & 0 deletions NEWS
Original file line number Diff line number Diff line change
Expand Up @@ -183,6 +183,14 @@ BUG FIXES
o Fixed a (segfault) issue during grouping with assignment by refernece, ex: `DT[, LHS := RHS, by=.]`, where
length(RHS) > group size (.N). Closes #5647. Thanks to Zachary Long for reporting on datatable mailing list.

o Consistent subset rules on datat.tables with duplicate columns. In short, if indices are directly provided, 'j',
or in .SDcols, then just those columns are either returned (or deleted if you provide -.SDcols or !j). If
instead, column names are given and there are more than one occurrence of that column, then it's hard to
decide which to keep and which to remove on a subset. Therefore, to remove, all occurrences of that column are
removed, and to keep, always the first column is returned each time. Also closes #5688 and #5008.
Note that using 'by' to aggregate on duplicate columns may not give intended result still, as it may not operate
on the proper column.

NOTES

o Reminder: using rolltolast still works but since v1.9.2 now issues the following warning :
Expand Down
60 changes: 50 additions & 10 deletions R/data.table.R
Original file line number Diff line number Diff line change
Expand Up @@ -312,6 +312,17 @@ data.table = function(..., keep.rownames=FALSE, check.names=FALSE, key=NULL)
x
}

# A (relatively) fast (uses DT grouping) wrapper for matching two vectors, BUT:
# it behaves like 'pmatch' but only the 'exact' matching part. That is, a value in
# 'x' is matched to 'table' only once. No index will be present more than once.
# This should make it even clearer:
# chmatch2(c("a", "a"), c("a", "a")) # 1,2 - the second 'a' in 'x' has a 2nd match in 'table'
# chmatch2(c("a", "a"), c("a", "b")) # 1,NA - the second one doesn't 'see' the first 'a'
# chmatch2(c("a", "a"), c("a", "a.1")) # 1,NA - this is where it differs from pmatch - we don't need the partial match.
chmatch2 <- function(x, table, nomatch=NA_integer_) {
.Call(Cchmatch2, x, table, as.integer(nomatch)) # this is in 'rbindlist.c' for now.
}

"[.data.table" = function (x, i, j, by, keyby, with=TRUE, nomatch=getOption("datatable.nomatch"), mult="all", roll=FALSE, rollends=if (roll=="nearest") c(TRUE,TRUE) else if (roll>=0) c(FALSE,TRUE) else c(TRUE,FALSE), which=FALSE, .SDcols, verbose=getOption("datatable.verbose"), allow.cartesian=getOption("datatable.allow.cartesian"), drop=NULL, rolltolast=FALSE)
{
# ..selfcount <<- ..selfcount+1 # in dev, we check no self calls, each of which doubles overhead, or could
Expand Down Expand Up @@ -371,7 +382,17 @@ data.table = function(..., keep.rownames=FALSE, check.names=FALSE, key=NULL)
irows = NULL # Meaning all rows. We avoid creating 1:nrow(x) for efficiency.
notjoin = FALSE
rightcols = leftcols = integer(0)


# To take care of duplicate column names properly (see chmatch2 function above `[data.table`) for description
dupmatch <- function(x, y, ...) {
if (anyDuplicated(x))
pmax(chmatch(x,y, ...), chmatch2(x,y,0L))
else chmatch(x,y)
}

# setdiff removes duplicate entries, which'll create issues with duplicated names. Use '%chin% instead.
dupdiff <- function(x, y) x[!x %chin% y]

if (!missing(i)) {
isub = substitute(i)
# Fixes 4994: a case where quoted expression with a "!", ex: expr = quote(!dt1); dt[eval(expr)] requires
Expand Down Expand Up @@ -535,7 +556,6 @@ data.table = function(..., keep.rownames=FALSE, check.names=FALSE, key=NULL)
byval = NULL
xnrow = nrow(x)
xcols = xcolsAns = icols = icolsAns = integer()

if (missing(j)) {
# missing(by)==TRUE was already checked above before dealing with i
if (!length(x)) return(null.data.table())
Expand All @@ -552,6 +572,7 @@ data.table = function(..., keep.rownames=FALSE, check.names=FALSE, key=NULL)
icolsAns = c(rightcols, seq.int(ncol(x)+1L, length.out=ncol(i)-length(leftcols)))
xcols = xcolsAns = seq_along(x)[-rightcols]
}
ansvals = chmatch(ansvars, names(x))
} else {
jsub = substitute(j)
# deconstruct and eval everything with just one argument, then reconstruct back to a call
Expand Down Expand Up @@ -585,16 +606,25 @@ data.table = function(..., keep.rownames=FALSE, check.names=FALSE, key=NULL)
if (any(is.na(w))) {
warning("column(s) not removed because not found: ",paste(j[is.na(w)],collapse=","))
w = w[!is.na(w)]
}
ansvars = if (length(w)) names(x)[-w] else names(x)
}
# changed names(x)[-w] to use 'setdiff'. Here, all instances of the column must be removed.
# Ex: DT <- data.table(x=1, y=2, x=3); DT[, !"x", with=FALSE] should just output 'y'.
# But keep 'dup cols' beause it's basically DT[, !names(DT) %chin% "x", with=FALSE] which'll subset all cols not 'x'.
ansvars = if (length(w)) dupdiff(names(x), names(x)[w]) else names(x)
ansvals = dupmatch(ansvars, names(x))
} else {
# once again, use 'setdiff'. Basically, unless indices are specified in `j`, we shouldn't care about duplicated columns.
ansvars = j # x. and i. prefixes may be in here, and they'll be dealt with below
# dups = FALSE here.. even if DT[, c("x", "x"), with=FALSE], we subset only the first.. No way to tell which one the OP wants without index.
ansvals = chmatch(ansvars, names(x))
}
} else if (is.numeric(j)) {
if (any(abs(j) > ncol(x) | j==0L)) stop("j out of bounds")
if (any(j<0L) && any(j>0L)) stop("j mixes positive and negative")
if (any(j<0L)) j = seq_len(ncol(x))[j]
ansvars = names(x)[ if (notj) -j else j ] # DT[,!"columntoexclude",with=FALSE], if a copy is needed, rather than :=NULL
# DT[, c(1,3), with=FALSE] should clearly provide both 'x' columns
ansvals = if (notj) setdiff(seq_along(x), as.integer(j)) else as.integer(j)
}

} else { # with=TRUE and byjoin could be TRUE
Expand Down Expand Up @@ -742,11 +772,15 @@ data.table = function(..., keep.rownames=FALSE, check.names=FALSE, key=NULL)
av = all.vars(jsub,TRUE) # TRUE fixes bug #1294 which didn't see b in j=fns[[b]](c)
if (".SD" %chin% av) {
if (missing(.SDcols)) {
ansvars = setdiff(names(x),union(bynames,allbyvars)) # TO DO: allbyvars here for vars used by 'by'. Document.
# here we need to use 'dupdiff' instead of 'setdiff'. Ex: setdiff(c("x", "x"), NULL) will give 'x'.
ansvars = dupdiff(names(x),union(bynames,allbyvars)) # TO DO: allbyvars here for vars used by 'by'. Document.
# just using .SD in j triggers all non-by columns in the subset even if some of
# those columns are not used. It would be tricky to detect whether the j expression
# really does use all of the .SD columns or not, hence .SDcols for grouping
# over a subset of columns

# all duplicate columns must be matched, because nothing is provided
ansvals = dupmatch(ansvars, names(x))
} else {
# FR #4979 - negative numeric and character indices for SDcols
colsub = substitute(.SDcols)
Expand All @@ -755,14 +789,18 @@ data.table = function(..., keep.rownames=FALSE, check.names=FALSE, key=NULL)
colm = TRUE
.SDcols = eval(colsub[[2L]], parent.frame(), parent.frame())
} else colm = FALSE
# if .SDcols is numeric, use 'dupdiff' instead of 'setdiff'
if (is.numeric(.SDcols)) {
if (length(unique(sign(.SDcols))) != 1L) stop(".SDcols is numeric but has both +ve and -ve indices")
if (any(is.na(.SDcols)) || any(abs(.SDcols)>ncol(x)) || any(abs(.SDcols)<1L)) stop(".SDcols is numeric but out of bounds (or NA)")
if (colm) ansvars = setdiff(names(x)[-.SDcols], bynames) else ansvars = names(x)[.SDcols]
if (colm) ansvars = dupdiff(names(x)[-.SDcols], bynames) else ansvars = names(x)[.SDcols]
ansvals = if (colm) setdiff(seq_along(names(x)), c(as.integer(.SDcols), which(names(x) %chin% bynames))) else as.integer(.SDcols)
} else {
if (!is.character(.SDcols)) stop(".SDcols should be column numbers or names")
if (any(is.na(.SDcols)) || any(!.SDcols %chin% names(x))) stop("Some items of .SDcols are not column names (or are NA)")
if (colm) ansvars = setdiff(setdiff(names(x), .SDcols), bynames) else ansvars = .SDcols
# dups = FALSE here. DT[, .SD, .SDcols=c("x", "x")] again doesn't really help with which 'x' to keep (and if '-' which x to remove)
ansvals = chmatch(ansvars, names(x))
}
# .SDcols might include grouping columns if users wants that, but normally we expect user not to include them in .SDcols
}
Expand All @@ -774,6 +812,7 @@ data.table = function(..., keep.rownames=FALSE, check.names=FALSE, key=NULL)
# Consider: DT[,max(diff(date)),by=list(month=month(date))]
# and: DT[,lapply(.SD,sum),by=month(date)]
# We don't want date in .SD in the latter, but we do in the former; hence the union() above.
ansvals = chmatch(ansvars, names(x))
}
# if (!length(ansvars)) Leave ansvars empty. Important for test 607.
if ("get" %chin% av) {
Expand All @@ -783,6 +822,7 @@ data.table = function(..., keep.rownames=FALSE, check.names=FALSE, key=NULL)
# eval(macro) column names are detected via the if jsub[[1]]==eval switch earlier above.
}
ansvars = setdiff(c(names(x), if (is.data.table(i)) c(names(i), paste("i.", names(i), sep=""))),bynames) # fix for bug #5443
ansvals = chmatch(ansvars, names(x))
if (verbose) cat("New:",paste(ansvars,collapse=","),"\n")
}

Expand Down Expand Up @@ -887,7 +927,7 @@ data.table = function(..., keep.rownames=FALSE, check.names=FALSE, key=NULL)
}

if (length(ansvars)) {
w = chmatch(ansvars, names(x))
w = ansvals
if (length(rightcols) && missing(by)) w[ w %in% rightcols ] = NA
if (!any(wna <- is.na(w))) {
xcols = w
Expand Down Expand Up @@ -1269,8 +1309,8 @@ data.table = function(..., keep.rownames=FALSE, check.names=FALSE, key=NULL)
if (byjoin) {
groups = i
grpcols = leftcols # 'leftcols' are the columns in i involved in the join (either head of key(i) or head along i)
jiscols = chmatch(jisvars,names(i)) # integer() if there are no jisvars (usually there aren't, advanced feature)
xjiscols = chmatch(xjisvars, names(x))
jiscols = dupmatch(jisvars,names(i)) # integer() if there are no jisvars (usually there aren't, advanced feature)
xjiscols = dupmatch(xjisvars, names(x))
SDenv$.xSD = x[min(nrow(i), 1L), xjisvars, with=FALSE]
} else {
groups = byval
Expand Down Expand Up @@ -2017,7 +2057,7 @@ chgroup = function(x) {
rbindlist(l, use.names, fill)
}

rbindlist = function(l, use.names=FALSE, fill=FALSE) {
rbindlist = function(l, use.names=fill, fill=FALSE) {
ans = .Call("Crbindlist", l, use.names, fill)
if (!length(ans)) return(null.data.table())
setattr(ans,"row.names",.set_row_names(length(ans[[1L]])))
Expand Down
85 changes: 85 additions & 0 deletions inst/tests/tests.Rraw
Original file line number Diff line number Diff line change
Expand Up @@ -4426,6 +4426,91 @@ dt<-data.table(x=c(1:2), y=1:10)
cp = copy(dt)
test(1289.2, dt[, z := c(rep(NA, 5),y), by=x], cp[, z := rep(NA_integer_, 10)], warning="RHS 1 is length 10")

########################################
# Extensve testing for "duplicate" names
########################################
# Rules: Basically, if index is directly given in 'j', just those columns are touched/operated on. But if 'column' names are given and there are more than one
# occurrence of that column, then it's hard to decide which to keep and which to remove. So, to remove, all are removed, to keep, always the first is kept.
# 1) when i,j,by are all absent (or) just 'i' is present then ALL duplicate columns are returned.
# 2) When 'with=FALSE' and 'j' is a character and 'notj' is TRUE, all instances of the column to be removed will be removed.
# 3) When 'with=FALSE' and 'j' is a character and 'notj' is FALSE, only the first column will be recognised in presence of duplicate columns.
# 4) When 'with=FALSE' and 'j' is numeric and 'notj' is TRUE, just those indices will be removed.
# 5) When 'with=FALSE' and 'j' is numeric and 'notj' is FALSE, all columns for indices given, if valid, are returned. (FIXES #5688)
# 6) When .SD is in 'j', but '.SDcols' is not present, ALL columns are subset'd - FIXES BUG #5008.
# 7) When .SD and .SDcols are present and .SDcols is numeric, columns corresponding to the given indices are returned.
# 8) When .SD and .SDcols are present and .SDcols is character, duplicate column names will only return the first column, each time.
# 9) When .SD and .SDcols are present and .SDcols is numeric, and it's -SDcols, then just those columns are removed.
# 10) When .SD and .SDcols are present and .SDcols is character and -SDcols, then all occurrences of that object is removed.
# 11) When no .SD and no .SDcols and no with=FALSE, only duplicate column names will return only the first column each time.
# 12) With 'get("col")', it's the same as with all character types.
# 13) A logical expression in 'j'.
# 14) Finally, no tests but.. using 'by' with duplicate columns and aggregating may not return the intended result, as it may operate on column names in some cases.

# All points are tested with this example:
DT <- data.table(x=1:2, y=3:4, x=5:6, x=7:8, y=9:10, z=11:12)
DT1 <- data.table(x=1L, y=3L, x=5L, x=7L, y=9L, z=11L)
DT2 <- data.table(x=2L, y=4L, x=6L, x=8L, y=10L, z=12L)
ll <- list(x=1:2, y=3:4, x=5:6, x=7:8, y=9:10, z=11:12)

# case (1)
test(1290.1, DT[1], DT1)
test(1290.2, DT[], DT)
test(1290.3, DT[(TRUE)], DT)

# case (2)
test(1290.4, DT[, !"x", with=FALSE], as.data.table(ll[c(2,5,6)]))
test(1290.5, DT[, !"y", with=FALSE], as.data.table(ll[c(1,3,4,6)]))
test(1290.6, DT[, !c("x", "x"), with=FALSE], as.data.table(ll[c(2,5,6)]))
test(1290.7, DT[, !c("y", "y"), with=FALSE], as.data.table(ll[c(1,3,4,6)]))

# case (3)
test(1290.9, DT[, "x", with=FALSE], as.data.table(ll[1]))
test(1290.10, DT[, "y", with=FALSE], as.data.table(ll[2]))
test(1290.11, DT[, c("x", "x"), with=FALSE], as.data.table(ll[c(1,1)]))
test(1290.12, DT[, c("y", "y"), with=FALSE], as.data.table(ll[c(2,2)]))

# case (4)
test(1290.13, DT[, !3, with=FALSE], as.data.table(ll[c(1,2,4,5,6)]))
test(1290.14, DT[, !c(1,1,3,4), with=FALSE], as.data.table(ll[c(2,5,6)]))
test(1290.15, DT[, !2, with=FALSE], as.data.table(ll[c(1,3,4,5,6)]))
test(1290.16, DT[, !c(2,5,2), with=FALSE], as.data.table(ll[c(1,3,4,6)]))

# case (5)
test(1290.17, DT[, 3, with=FALSE], as.data.table(ll[3]))
test(1290.18, DT[, c(1,1,3,4), with=FALSE], as.data.table(ll[c(1,1,3,4)]))
test(1290.19, DT[, 2, with=FALSE], as.data.table(ll[2]))
test(1290.20, DT[, c(2,5,2), with=FALSE], as.data.table(ll[c(2,5,2)]))

# case (6)
test(1290.21, DT[, .SD], as.data.table(ll))
test(1290.22, DT[, .SD[1]], DT[1])
test(1290.23, DT[, .SD[1, !3, with=FALSE]], as.data.table(DT[1, !3, with=FALSE]))

# case (7)
test(1290.24, DT[, .SD, .SDcols=c(1,1,3,4)], as.data.table(ll[c(1,1,3,4)]))

# case (8)
test(1290.25, DT[, .SD, .SDcols=c("x", "x", "y")], as.data.table(ll[c(1,1,2)]))

# case (9)
test(1290.26, DT[, .SD, .SDcols=-c(1,2)], as.data.table(ll[c(-(1:2))]))

# case (10)
test(1290.27, DT[, .SD, .SDcols=-c("x")], as.data.table(ll[c(2,6)]))

# case (11)
test(1290.28, DT[, x], ll[[1]])
test(1290.29, DT[, list(x,x,y,y,y)], as.data.table(ll[c(1,1,2,2,2)]))
test(1290.30, DT[, list(x,x,y)], as.data.table(ll[c(1,1,2)]))

# cast (12)
test(1290.31, DT[, get("x")], ll[[1]])
test(1290.32, DT[, list(get("x"))], setnames(as.data.table(ll[1]), "V1"))
test(1290.33, DT[, list(get("x"), get("y"))], setnames(as.data.table(ll[1:2]), c("V1", "V2")))

# case (13)
test(1290.34, DT[, names(DT) == "x", with=FALSE], as.data.table(ll[c(1,3,4)]))

##########################

# TO DO: Add test for fixed bug #5519 - dcast.data.table returned error when a package imported data.table, but dint happen when "depends" on data.table. This is fixed (commit 1263 v1.9.3), but not sure how to add test.
Expand Down
2 changes: 1 addition & 1 deletion man/rbindlist.Rd
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
Same as \code{do.call("rbind", l)} on \code{data.frame}s, but much faster. See \code{DETAILS} for more.
}
\usage{
rbindlist(l, use.names=FALSE, fill=FALSE)
rbindlist(l, use.names=fill, fill=FALSE)
rbind(..., use.names=TRUE, fill=FALSE)
}
\arguments{
Expand Down
2 changes: 2 additions & 0 deletions src/init.c
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,7 @@ SEXP pointWrapper();
SEXP setNumericRounding();
SEXP getNumericRounding();
SEXP binary();
SEXP chmatch2();

// .Externals
SEXP fastmean();
Expand Down Expand Up @@ -90,6 +91,7 @@ R_CallMethodDef callMethods[] = {
{"CsetNumericRounding", (DL_FUNC) &setNumericRounding, -1},
{"CgetNumericRounding", (DL_FUNC) &getNumericRounding, -1},
{"Cbinary", (DL_FUNC) &binary, -1},
{"Cchmatch2", (DL_FUNC) &chmatch2, -1},
{NULL, NULL, 0}
};

Expand Down
64 changes: 64 additions & 0 deletions src/rbindlist.c
Original file line number Diff line number Diff line change
Expand Up @@ -739,3 +739,67 @@ SEXP rbindlist(SEXP l, SEXP sexp_usenames, SEXP sexp_fill) {
UNPROTECT(protecti);
return(ans);
}

// A (relatively) fast (uses DT grouping) wrapper for matching two vectors, BUT:
// it behaves like 'pmatch' but only the 'exact' matching part. That is, a value in
// 'x' is matched to 'table' only once. No index will be present more than once.
// This should make it even clearer:
// chmatch2(c("a", "a"), c("a", "a")) # 1,2 - the second 'a' in 'x' has a 2nd match in 'table'
// chmatch2(c("a", "a"), c("a", "b")) # 1,NA - the second one doesn't 'see' the first 'a'
// chmatch2(c("a", "a"), c("a", "a.1")) # 1,NA - this is where it differs from pmatch - we don't need the partial match.
SEXP chmatch2(SEXP x, SEXP table, SEXP nomatch) {

R_len_t i, j, k, nx, li, si, oi;
SEXP dt, l, ans, order, start, lens, grpid, index;
if (TYPEOF(x) != STRSXP) error("'x' must be a character vector");
if (TYPEOF(table) != STRSXP) error("'table' must be a character vector");
if (TYPEOF(nomatch) != INTSXP || length(nomatch) != 1) error("'nomatch' must be an integer of length 1");
if (!length(x)) return(allocVector(INTSXP, 0));
nx=length(x);
if (!length(table)) {
ans = PROTECT(allocVector(INTSXP, nx));
for (i=0; i<nx; i++) INTEGER(ans)[i] = INTEGER(nomatch)[0];
UNPROTECT(1);
return(ans);
}
// Done with special cases. On to the real deal.
l = PROTECT(allocVector(VECSXP, 2));
SET_VECTOR_ELT(l, 0, x);
SET_VECTOR_ELT(l, 1, table);

UNPROTECT(1); // l
dt = PROTECT(unlist2(l));

// order - first time
order = PROTECT(fast_order(dt, 2));
start = PROTECT(getAttrib(order, mkString("starts")));
lens = PROTECT(uniq_lengths(start, length(order))); // length(order) = nrow(dt)
grpid = VECTOR_ELT(dt, 1); // dt[2] is unused here.
index = VECTOR_ELT(dt, 2);

// replace dt[1], we don't need it anymore
k=0;
for (i=0; i<length(lens); i++) {
for (j=0; j<INTEGER(lens)[i]; j++) {
INTEGER(grpid)[INTEGER(order)[k+j]-1] = j;
}
k += j;
}
// order - again
UNPROTECT(3); // order, start, lens
order = PROTECT(fast_order(dt, 2));
start = PROTECT(getAttrib(order, mkString("starts")));
lens = PROTECT(uniq_lengths(start, length(order)));

ans = PROTECT(allocVector(INTSXP, nx));
k = 0;
for (i=0; i<length(lens); i++) {
li = INTEGER(lens)[i];
si = INTEGER(start)[i]-1;
oi = INTEGER(order)[si]-1;
if (oi > nx-1) continue;
INTEGER(ans)[oi] = (li == 2) ? INTEGER(index)[INTEGER(order)[si+1]-1]+1 : INTEGER(nomatch)[0];
}
UNPROTECT(5); // order, start, lens, ans
return(ans);
}

0 comments on commit 9f0a7f8

Please sign in to comment.