Skip to content

Commit

Permalink
Closes #481 (RForge 5585). DT[, list(list(y)), by=x] issue in >= v3.1…
Browse files Browse the repository at this point in the history
….0 is fixed.
  • Loading branch information
arunsrinivasan committed Jun 24, 2014
1 parent 44c10f9 commit 86276f4
Show file tree
Hide file tree
Showing 3 changed files with 89 additions and 2 deletions.
2 changes: 2 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -234,6 +234,8 @@ DT[, list(.N, mean(y), sum(y)), by=x] # 1.9.3+ - will use GForce.
* Fixed an edge case in `DT[order(.)]` internal optimisation to be consistent with base. Closes [#696](https://github.com/Rdatatable/data.table/issues/696). Thanks to Michael Smith and Garrett See for reporting.
* `DT[, list(list(.)), by=.]` returns correct results in R >=3.1.0 as well. The bug was due to recent (welcoming) changes in R v3.1.0 where `list(.)` does not result in a *copy*. Closes [#481](https://github.com/Rdatatable/data.table/issues/696).
#### NOTES
* Reminder: using `rolltolast` still works but since v1.9.2 now issues the following warning :
Expand Down
8 changes: 8 additions & 0 deletions inst/tests/tests.Rraw
Original file line number Diff line number Diff line change
Expand Up @@ -4833,6 +4833,14 @@ test(1340.16, setorderv(copy(DT), "y", -1L, na.last=FALSE), DT[order(-y, na.last
test(1340.17, setorder(copy(DT), x, na.last=NA), error="na.last must be logical TRUE/FALSE")
test(1340.18, setorderv(copy(DT), "x", na.last=NA), error="na.last must be logical TRUE/FALSE")

# bug #481 - DT[, list(list(.)), by=.] on R v3.1.0
set.seed(1L)
DT <- data.table(x=sample(3,10,TRUE), y=as.numeric(sample(10)))
test(1341.1, DT[, list(list(y)), by=x], data.table(x=unique(DT$x), V1=list(c(3,5,9), c(2,6,4,1), c(10,7,8))))
test(1341.2, DT[, list(list(.I)), by=x], data.table(x=unique(DT$x), V1=list(c(1,5,10), c(2,3,8,9), c(4,6,7))))
f <- function(x) list(x)
test(1341.3, DT[, list(f(y)), by=x], data.table(x=unique(DT$x), V1=list(c(3,5,9), c(2,6,4,1), c(10,7,8))))

##########################


Expand Down
81 changes: 79 additions & 2 deletions src/dogroups.c
Original file line number Diff line number Diff line change
Expand Up @@ -3,10 +3,9 @@
#include <Rinternals.h>
#include <Rdefines.h>
//#include <sys/mman.h>
#include <Rversion.h>
#include <fcntl.h>
#include <time.h>
// #include <signal.h> // the debugging machinery + breakpoint aidee
// raise(SIGINT);

size_t sizes[100]; // max appears to be FUNSXP = 99, see Rinternals.h
SEXP SelfRefSymbol;
Expand Down Expand Up @@ -42,6 +41,10 @@ SEXP dogroups(SEXP dt, SEXP dtcols, SEXP groups, SEXP grpcols, SEXP jiscols, SEX
SEXP names, names2, xknames, bynames, dtnames, ans=NULL, jval, thiscol, SD, BY, N, I, GRP, iSD, xSD, rownames, s, RHS, listwrap, target, source;
SEXP *nameSyms, *xknameSyms;
Rboolean wasvector, firstalloc=FALSE, NullWarnDone=FALSE, recycleWarn=TRUE;
#if defined(R_VERSION) && R_VERSION >= R_Version(3, 1, 0)
SEXP dupcol;
int named=0;
#endif
size_t size; // must be size_t, otherwise bug #5305 (integer overflow in memcpy)
clock_t tstart=0, tblock[10]={0}; int nblock[10]={0};

Expand Down Expand Up @@ -244,6 +247,7 @@ SEXP dogroups(SEXP dt, SEXP dtcols, SEXP groups, SEXP grpcols, SEXP jiscols, SEX

if (LOGICAL(verbose)[0]) tstart = clock(); // call to clock() is more expensive than an 'if'
PROTECT(jval = eval(jexp, env));

if (LOGICAL(verbose)[0]) { tblock[2] += clock()-tstart; nblock[2]++; }

if (isNull(jval)) {
Expand Down Expand Up @@ -418,7 +422,35 @@ SEXP dogroups(SEXP dt, SEXP dtcols, SEXP groups, SEXP grpcols, SEXP jiscols, SEX
warning("Column %d of result for group %d is length %d but the longest column in this result is %d. Recycled leaving remainder of %d items. This warning is once only for the first group with this issue.",j+1,i+1,thislen,maxn,maxn%thislen);
recycleWarn = FALSE;
}
// fix for issues/481
#if defined(R_VERSION) && R_VERSION >= R_Version(3, 1, 0)
// added version because, for ex: DT[, list(list(unique(y))), by=x] gets duplicated
// because unique(y) returns NAMED(2). So, do it only if v>= 3.1.0. If <3.1.0,
// it gets duplicated anyway, so avoid copying twice!
named=0;
if (isNewList(source) && NAMED(source) != 2) {
// NAMED(source) != 2 prevents DT[, list(y), by=x] where 'y' is already a list
// or data.table and 99% of cases won't clear the if-statement above.
dupcol = VECTOR_ELT(source, 0);
named = NAMED(dupcol);
while(isNewList(dupcol)) {
// while loop basically peels each list() layer one by one until there's no
// list() wrapped anymore. Ex: consider DT[, list(list(list(sum(y)))), by=x] -
// here, we don't need to duplicate, but we won't know that until we reach
// 'sum(y)' and know that it's NAMED() != 2.
if (named == 2) break;
else {
dupcol = VECTOR_ELT(dupcol, 0);
named = NAMED(dupcol);
}
}
if (named == 2) source = PROTECT(duplicate(source));
}
memrecycle(target, R_NilValue, thisansloc, maxn, source);
if (named == 2) UNPROTECT(1);
#else
memrecycle(target, R_NilValue, thisansloc, maxn, source);
#endif
}
ansloc += maxn;
if (firstalloc) {
Expand Down Expand Up @@ -495,3 +527,48 @@ SEXP growVector(SEXP x, R_len_t newlen)
}


// benchmark timings for #481 fix:
// old code - no changes, R v3.0.3
// > system.time(dt[, list(list(y)), by=x])
// user system elapsed
// 82.593 0.936 84.314
// > system.time(dt[, list(list(y)), by=x])
// user system elapsed
// 34.558 0.628 35.658
// > system.time(dt[, list(list(y)), by=x])
// user system elapsed
// 37.056 0.315 37.668
//
// All new changes in place, R v3.0.3
// > system.time(dt[, list(list(y)), by=x])
// user system elapsed
// 82.852 0.952 84.575
// > system.time(dt[, list(list(y)), by=x])
// user system elapsed
// 34.600 0.356 35.173
// > system.time(dt[, list(list(y)), by=x])
// user system elapsed
// 36.865 0.514 37.901

// old code - no changes, R v3.1.0 --- BUT RESULTS ARE WRONG!
// > system.time(dt[, list(list(y)), by=x])
// user system elapsed
// 11.022 0.352 11.455
// > system.time(dt[, list(list(y)), by=x])
// user system elapsed
// 10.397 0.119 10.600
// > system.time(dt[, list(list(y)), by=x])
// user system elapsed
// 10.665 0.101 11.013

// All new changes in place, R v3.1.0
// > system.time(dt[, list(list(y)), by=x])
// user system elapsed
// 83.279 1.057 89.856
// > system.time(dt[, list(list(y)), by=x])
// user system elapsed
// 30.569 0.633 31.452
// > system.time(dt[, list(list(y)), by=x])
// user system elapsed
// 30.827 0.239 32.306

0 comments on commit 86276f4

Please sign in to comment.