Skip to content

Commit

Permalink
gmin and gmax done. Partially address #5754 (git #523)
Browse files Browse the repository at this point in the history
  • Loading branch information
arunsrinivasan committed Jun 18, 2014
1 parent 2602f23 commit 0af4511
Show file tree
Hide file tree
Showing 6 changed files with 266 additions and 25 deletions.
5 changes: 3 additions & 2 deletions R/data.table.R
Original file line number Diff line number Diff line change
Expand Up @@ -1250,7 +1250,7 @@ chmatch2 <- function(x, table, nomatch=NA_integer_) {
dotN <- function(x) if (is.name(x) && x == ".N") TRUE else FALSE # For #5760
if (getOption("datatable.optimize")>=2 && !byjoin && !length(irows) && length(f__) && length(ansvars) && !length(lhs)) {
# Apply GForce
gfuns = c("sum","mean",".N") # added .N for #5760
gfuns = c("sum","mean",".N", "min", "max") # added .N for #5760
.ok = function(q) {
if (dotN(q)) return(TRUE) # For #5760
ans = is.call(q) && as.character(q[[1L]]) %chin% gfuns && !is.call(q[[2L]]) && (length(q)==2 || identical("na",substring(names(q)[3L],1,2)))
Expand Down Expand Up @@ -2138,6 +2138,7 @@ setDT <- function(x, giveNames=TRUE, keep.rownames=FALSE) {

gsum = function(x, na.rm=FALSE) .Call(Cgsum, x, na.rm)
gmean = function(x, na.rm=FALSE) .Call(Cgmean, x, na.rm)
gmin = function(x, na.rm=FALSE) .Call(Cgmin, x, na.rm)
gmax = function(x, na.rm=FALSE) .Call(Cgmax, x, na.rm)
gstart = function(o, f, l) .Call(Cgstart, o, f, l)
gend = function() .Call(Cgend)

6 changes: 3 additions & 3 deletions R/test.data.table.R
Original file line number Diff line number Diff line change
@@ -1,13 +1,13 @@

test.data.table = function(verbose=FALSE) {
test.data.table = function(verbose=FALSE, pkg="pkg") {
if (exists("test.data.table",.GlobalEnv,inherits=FALSE)) {
# package developer
if ("package:data.table" %in% search()) stop("data.table package loaded")
if (.Platform$OS.type == "unix" && Sys.info()['sysname'] != "Darwin")
d = path.expand("~/R/gitdatatable/pkg/inst/tests")
else {
if (!"pkg" %in% dir()) stop("'pkg' not in dir()")
d = paste(getwd(),"/pkg/inst/tests",sep="")
if (!pkg %in% dir()) stop(paste(pkg, " not in dir()", sep=""))
d = paste(getwd(),"/", pkg, "/inst/tests",sep="")
}
} else {
# user
Expand Down
4 changes: 3 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -116,7 +116,7 @@ DT[, grep("^b", names(DT)) := NULL] # where no columns start with b
# warns now and returns DT instead of error
```

* GForce now is also optimised for j-expression with `.N`. Closes **#5760** (git [#334](https://github.com/Rdatatable/data.table/issues/334)).
* GForce now is also optimised for j-expression with `.N`. Closes **#5760** (git [#334](https://github.com/Rdatatable/data.table/issues/334) and git [#523](https://github.com/Rdatatable/data.table/issues/523)).
```S
DT[, list(.N, mean(y), sum(y)), by=x] # 1.9.2 - doesn't know to use GForce - will be (relatively) slower
DT[, list(.N, mean(y), sum(y)), by=x] # 1.9.3+ - will use GForce.
Expand All @@ -130,6 +130,8 @@ DT[, list(.N, mean(y), sum(y)), by=x] # 1.9.3+ - will use GForce.

* Looping calls to `unique(DT)` such as in `DT[,unique(.SD),by=group]` is now faster by avoiding internal overhead of calling `[.data.table`. Thanks again to Ron Hylton for highlighting in the [same thread](http://r.789695.n4.nabble.com/data-table-is-asking-for-help-tp4692080.html). His example is reduced from 28 sec to 9 sec, with identical results.

* Following `gsum` and `gmean`, now `gmin` and `gmax` from GForce are also implemented. Closes part of #5754 (git [#523](https://github.com/Rdatatable/data.table/issues/523)). Benchmarks are also provided.
i.e., DT[, list(sum(x), min(y), max(z), .N), by=...] # runs by default using GForce

#### BUG FIXES

Expand Down
40 changes: 40 additions & 0 deletions inst/tests/tests.Rraw
Original file line number Diff line number Diff line change
Expand Up @@ -4630,6 +4630,46 @@ DT = data.table(a=1:3,b=6:1)
test(1312, DT[,setkey(.SD),by=a], error="SD is locked. Using set.*functions on [.]SD is reserved")
# was warning "Already keyed by this key but had invalid row order" due to the key not being cleared after the previous group. A solution could have been to put back the original key on populating .SD for each group. But instead we reserve it for future use and push the user towards doing it a different more efficient way (see Arun's speedups in the datatable-help thread).

# gmin and gmax extensive testing (because there are tricky cases)
DT <- data.table(x=rep(1:6, each=3), y=INT(4,-1,0, NA,4,10, 4,NA,10, 4,10,NA, -2147483647, -2147483647, -2147483647, 2147483647, 2147483647, 2147483647))
# make sure GForce is running
options(datatable.optimize=2L)

# for integers
test(1313.1, DT[, min(y), by=x], DT[, base:::min(y), by=x])
test(1313.2, DT[, max(y), by=x], DT[, base:::max(y), by=x])
test(1313.3, DT[, min(y, na.rm=TRUE), by=x], DT[, base:::min(y, na.rm=TRUE), by=x])
test(1313.4, DT[, max(y, na.rm=TRUE), by=x], DT[, base:::max(y, na.rm=TRUE), by=x])
# testing all NA - GForce automatically converts to numeric.. optimize=1L errors due to change from integer/numeric (like median)
DT[x==6, y := INT(NA)]
test(1313.5, DT[, min(y), by=x], DT[, base:::min(y), by=x])
test(1313.6, DT[, max(y), by=x], DT[, base:::max(y), by=x])
test(1313.7, DT[, min(y, na.rm=TRUE), by=x], data.table(x=1:6, V1=c(-1,4,4,4,-2147483647,Inf)), warning="No non-missing")
test(1313.8, DT[, max(y, na.rm=TRUE), by=x], data.table(x=1:6, V1=c(4,10,10,10,-2147483647,-Inf)), warning="No non-missing")

# for numeric
DT <- data.table(x=rep(1:6, each=3), y=c(4,-1,0, NA,4,10, 4,NA,10, 4,10,NA, -Inf, NA, NA, Inf, NA, NA))
test(1313.9, DT[, min(y), by=x], DT[, base:::min(y), by=x])
test(1313.10, DT[, max(y), by=x], DT[, base:::max(y), by=x])
test(1313.11, DT[, min(y, na.rm=TRUE), by=x], DT[, base:::min(y, na.rm=TRUE), by=x])
test(1313.12, DT[, max(y, na.rm=TRUE), by=x], DT[, base:::max(y, na.rm=TRUE), by=x])
# testing all NA - GForce automatically converts to numeric.. optimize=1L errors due to change from integer/numeric (like median)
DT[x==6, y := NA_real_]
test(1313.13, DT[, min(y), by=x], DT[, base:::min(y), by=x])
test(1313.14, DT[, max(y), by=x], DT[, base:::max(y), by=x])
test(1313.15, DT[, min(y, na.rm=TRUE), by=x], data.table(x=1:6, V1=c(-1,4,4,4,-Inf,Inf)), warning="No non-missing")
test(1313.16, DT[, max(y, na.rm=TRUE), by=x], data.table(x=1:6, V1=c(4,10,10,10,-Inf,-Inf)), warning="No non-missing")

# for date (attribute check.. especially after issues/689 !!!)
DT <- data.table(x = rep(letters[1:2], each=5), y = as.POSIXct('2010-01-01', tz="gmt") + seq(0, 86400*9, 86400))
test(1313.17, DT[, list(y=min(y)), by=x], DT[c(1,6)])
test(1313.18, DT[, list(y=max(y)), by=x], DT[c(5,10)])
DT[c(1,6), y := NA]
test(1313.19, DT[, list(y=min(y)), by=x], DT[c(1,6)])
test(1313.20, DT[, list(y=max(y)), by=x], DT[c(1,6)])
test(1313.21, DT[, list(y=min(y, na.rm=TRUE)), by=x], DT[c(2,7)])
test(1313.22, DT[, list(y=max(y, na.rm=TRUE)), by=x], DT[c(5,1)])


##########################

Expand Down
232 changes: 213 additions & 19 deletions src/gsumm.c
Original file line number Diff line number Diff line change
Expand Up @@ -59,12 +59,12 @@ SEXP gsum(SEXP x, SEXP narm)
for (i=0; i<n; i++) {
thisgrp = grp[i];
if(INTEGER(x)[i] == NA_INTEGER) {
if (!LOGICAL(narm)[0]) s[thisgrp] = NA_REAL; // Let NA_REAL propogate from here. R_NaReal is IEEE.
continue;
}
s[thisgrp] += INTEGER(x)[i]; // no under/overflow here, s is long double (like base)
}
ans = PROTECT(allocVector(INTSXP, ngrp));
if (!LOGICAL(narm)[0]) s[thisgrp] = NA_REAL; // Let NA_REAL propogate from here. R_NaReal is IEEE.
continue;
}
s[thisgrp] += INTEGER(x)[i]; // no under/overflow here, s is long double (like base)
}
ans = PROTECT(allocVector(INTSXP, ngrp));
for (i=0; i<ngrp; i++) {
if (s[i] > INT_MAX || s[i] < INT_MIN) {
warning("Group %d summed to more than type 'integer' can hold so the result has been coerced to 'numeric' automatically, for convenience.", i+1);
Expand All @@ -84,14 +84,14 @@ SEXP gsum(SEXP x, SEXP narm)
for (i=0; i<n; i++) {
thisgrp = grp[i];
if(ISNAN(REAL(x)[i]) && LOGICAL(narm)[0]) continue; // else let NA_REAL propogate from here
s[thisgrp] += REAL(x)[i]; // done in long double, like base
}
for (i=0; i<ngrp; i++) {
if (s[i] > DBL_MAX) REAL(ans)[i] = R_PosInf;
s[thisgrp] += REAL(x)[i]; // done in long double, like base
}
for (i=0; i<ngrp; i++) {
if (s[i] > DBL_MAX) REAL(ans)[i] = R_PosInf;
else if (s[i] < -DBL_MAX) REAL(ans)[i] = R_NegInf;
else REAL(ans)[i] = (double)s[i];
}
break;
break;
default:
free(s);
error("Type '%s' not supported by GForce sum (gsum). Either add the prefix base::sum(.) or turn off GForce optimization using options(datatable.optimize=1)", type2char(TYPEOF(x)));
Expand Down Expand Up @@ -140,18 +140,18 @@ SEXP gmean(SEXP x, SEXP narm)
for (i=0; i<n; i++) {
thisgrp = grp[i];
if(INTEGER(x)[i] == NA_INTEGER) continue;
s[thisgrp] += INTEGER(x)[i]; // no under/overflow here, s is long double
c[thisgrp]++;
}
s[thisgrp] += INTEGER(x)[i]; // no under/overflow here, s is long double
c[thisgrp]++;
}
break;
case REALSXP:
for (i=0; i<n; i++) {
thisgrp = grp[i];
if (ISNAN(REAL(x)[i])) continue;
s[thisgrp] += REAL(x)[i];
c[thisgrp]++;
}
break;
s[thisgrp] += REAL(x)[i];
c[thisgrp]++;
}
break;
default:
free(s); free(c);
error("Type '%s' not supported by GForce mean (gmean) na.rm=TRUE. Either add the prefix base::mean(.) or turn off GForce optimization using options(datatable.optimize=1)", type2char(TYPEOF(x)));
Expand All @@ -170,6 +170,200 @@ SEXP gmean(SEXP x, SEXP narm)
return(ans);
}

// TO DO: gmin, gmax, gsd, gprod, gwhich.min, gwhich.max
// TO DO: gsd, gprod, gwhich.min, gwhich.max

// gmin
SEXP gmin(SEXP x, SEXP narm)
{
if (!isLogical(narm) || LENGTH(narm)!=1 || LOGICAL(narm)[0]==NA_LOGICAL) error("na.rm must be TRUE or FALSE");
if (!isVectorAtomic(x)) error("GForce sum can only be applied to columns, not .SD or similar. To find min of all items in a list such as .SD, either add the prefix base::min(.SD) or turn off GForce optimization using options(datatable.optimize=1). More likely, you may be looking for 'DT[,lappy(.SD,min),by=,.SDcols=]'");
R_len_t i, thisgrp=0;
int n = LENGTH(x);
//clock_t start = clock();
SEXP ans;
if (grpn != length(x)) error("grpn [%d] != length(x) [%d] in gsum", grpn, length(x));
char *update = Calloc(ngrp, char);
if (update == NULL) error("Unable to allocate %d * %d bytes for gsum", ngrp, sizeof(char));
switch(TYPEOF(x)) {
case LGLSXP: case INTSXP:
ans = PROTECT(allocVector(INTSXP, ngrp));
for (i=0; i<ngrp; i++) INTEGER(ans)[i] = 0;
if (!LOGICAL(narm)[0]) {
for (i=0; i<n; i++) {
thisgrp = grp[i];
if (INTEGER(x)[i] != NA_INTEGER && INTEGER(ans)[thisgrp] != NA_INTEGER) {
if ( update[thisgrp] != 1 || INTEGER(ans)[thisgrp] > INTEGER(x)[i] ) {
INTEGER(ans)[thisgrp] = INTEGER(x)[i];
if (update[thisgrp] != 1) update[thisgrp] = 1;
}
} else INTEGER(ans)[thisgrp] = NA_INTEGER;
}
} else {
for (i=0; i<n; i++) {
thisgrp = grp[i];
if (INTEGER(x)[i] != NA_INTEGER) {
if ( update[thisgrp] != 1 || INTEGER(ans)[thisgrp] > INTEGER(x)[i] ) {
INTEGER(ans)[thisgrp] = INTEGER(x)[i];
if (update[thisgrp] != 1) update[thisgrp] = 1;
}
} else {
if (update[thisgrp] != 1) {
INTEGER(ans)[thisgrp] = NA_INTEGER;
}
}
}
for (i=0; i<ngrp; i++) {
if (update[i] != 1) {// equivalent of INTEGER(ans)[thisgrp] == NA_INTEGER
warning("No non-missing values found in at least one group. Coercing to numeric type and returning 'Inf' for such groups to be consistent with base");
UNPROTECT(1);
ans = PROTECT(coerceVector(ans, REALSXP));
for (i=0; i<ngrp; i++) {
if (update[i] != 1) REAL(ans)[i] = R_PosInf;
}
}
}
}
break;
case REALSXP:
ans = PROTECT(allocVector(REALSXP, ngrp));
for (i=0; i<ngrp; i++) REAL(ans)[i] = 0;
if (!LOGICAL(narm)[0]) {
for (i=0; i<n; i++) {
thisgrp = grp[i];
if ( !ISNA(REAL(x)[i]) && !ISNA(REAL(ans)[thisgrp]) ) {
if ( update[thisgrp] != 1 || REAL(ans)[thisgrp] > REAL(x)[i] ) {
REAL(ans)[thisgrp] = REAL(x)[i];
if (update[thisgrp] != 1) update[thisgrp] = 1;
}
} else REAL(ans)[thisgrp] = NA_REAL;
}
} else {
for (i=0; i<n; i++) {
thisgrp = grp[i];
if ( !ISNA(REAL(x)[i]) ) {
if ( update[thisgrp] != 1 || REAL(ans)[thisgrp] > REAL(x)[i] ) {
REAL(ans)[thisgrp] = REAL(x)[i];
if (update[thisgrp] != 1) update[thisgrp] = 1;
}
} else {
if (update[thisgrp] != 1) {
REAL(ans)[thisgrp] = R_PosInf;
}
}
}
// everything taken care of already. Just warn if all NA groups have occurred at least once
for (i=0; i<ngrp; i++) {
if (update[i] != 1) {// equivalent of REAL(ans)[thisgrp] == R_PosInf
warning("No non-missing values found in at least one group. Returning 'Inf' for such groups to be consistent with base");
break;
}
}
}
break;
default:
error("Type '%s' not supported by GForce sum (gsum). Either add the prefix base::sum(.) or turn off GForce optimization using options(datatable.optimize=1)", type2char(TYPEOF(x)));
}
copyMostAttrib(x, ans); // all but names,dim and dimnames. And if so, we want a copy here, not keepattr's SET_ATTRIB.
UNPROTECT(1);
Free(update);
// Rprintf("this gsum took %8.3f\n", 1.0*(clock()-start)/CLOCKS_PER_SEC);
return(ans);
}

// gmax
SEXP gmax(SEXP x, SEXP narm)
{
if (!isLogical(narm) || LENGTH(narm)!=1 || LOGICAL(narm)[0]==NA_LOGICAL) error("na.rm must be TRUE or FALSE");
if (!isVectorAtomic(x)) error("GForce sum can only be applied to columns, not .SD or similar. To find min of all items in a list such as .SD, either add the prefix base::min(.SD) or turn off GForce optimization using options(datatable.optimize=1). More likely, you may be looking for 'DT[,lappy(.SD,min),by=,.SDcols=]'");
R_len_t i, thisgrp=0;
int n = LENGTH(x);
//clock_t start = clock();
SEXP ans;
if (grpn != length(x)) error("grpn [%d] != length(x) [%d] in gsum", grpn, length(x));
char *update = Calloc(ngrp, char);
if (update == NULL) error("Unable to allocate %d * %d bytes for gsum", ngrp, sizeof(char));
switch(TYPEOF(x)) {
case LGLSXP: case INTSXP:
ans = PROTECT(allocVector(INTSXP, ngrp));
for (i=0; i<ngrp; i++) INTEGER(ans)[i] = 0;
if (!LOGICAL(narm)[0]) { // simple case - deal in a straightforward manner first
for (i=0; i<n; i++) {
thisgrp = grp[i];
if (INTEGER(x)[i] != NA_INTEGER && INTEGER(ans)[thisgrp] != NA_INTEGER) {
if ( update[thisgrp] != 1 || INTEGER(ans)[thisgrp] < INTEGER(x)[i] ) {
INTEGER(ans)[thisgrp] = INTEGER(x)[i];
if (update[thisgrp] != 1) update[thisgrp] = 1;
}
} else INTEGER(ans)[thisgrp] = NA_INTEGER;
}
} else {
for (i=0; i<n; i++) {
thisgrp = grp[i];
if (INTEGER(x)[i] != NA_INTEGER) {
if ( update[thisgrp] != 1 || INTEGER(ans)[thisgrp] < INTEGER(x)[i] ) {
INTEGER(ans)[thisgrp] = INTEGER(x)[i];
if (update[thisgrp] != 1) update[thisgrp] = 1;
}
} else {
if (update[thisgrp] != 1) {
INTEGER(ans)[thisgrp] = NA_INTEGER;
}
}
}
for (i=0; i<ngrp; i++) {
if (update[i] != 1) {// equivalent of INTEGER(ans)[thisgrp] == NA_INTEGER
warning("No non-missing values found in at least one group. Coercing to numeric type and returning 'Inf' for such groups to be consistent with base");
UNPROTECT(1);
ans = PROTECT(coerceVector(ans, REALSXP));
for (i=0; i<ngrp; i++) {
if (update[i] != 1) REAL(ans)[i] = -R_PosInf;
}
}
}
}
break;
case REALSXP:
ans = PROTECT(allocVector(REALSXP, ngrp));
for (i=0; i<ngrp; i++) REAL(ans)[i] = 0;
if (!LOGICAL(narm)[0]) {
for (i=0; i<n; i++) {
thisgrp = grp[i];
if ( !ISNA(REAL(x)[i]) && !ISNA(REAL(ans)[thisgrp]) ) {
if ( update[thisgrp] != 1 || REAL(ans)[thisgrp] < REAL(x)[i] ) {
REAL(ans)[thisgrp] = REAL(x)[i];
if (update[thisgrp] != 1) update[thisgrp] = 1;
}
} else REAL(ans)[thisgrp] = NA_REAL;
}
} else {
for (i=0; i<n; i++) {
thisgrp = grp[i];
if ( !ISNA(REAL(x)[i]) ) {
if ( update[thisgrp] != 1 || REAL(ans)[thisgrp] < REAL(x)[i] ) {
REAL(ans)[thisgrp] = REAL(x)[i];
if (update[thisgrp] != 1) update[thisgrp] = 1;
}
} else {
if (update[thisgrp] != 1) {
REAL(ans)[thisgrp] = -R_PosInf;
}
}
}
// everything taken care of already. Just warn if all NA groups have occurred at least once
for (i=0; i<ngrp; i++) {
if (update[i] != 1) { // equivalent of REAL(ans)[thisgrp] == -R_PosInf
warning("No non-missing values found in at least one group. Returning '-Inf' for such groups to be consistent with base");
break;
}
}
}
break;
default:
error("Type '%s' not supported by GForce sum (gsum). Either add the prefix base::sum(.) or turn off GForce optimization using options(datatable.optimize=1)", type2char(TYPEOF(x)));
}
copyMostAttrib(x, ans); // all but names,dim and dimnames. And if so, we want a copy here, not keepattr's SET_ATTRIB.
UNPROTECT(1);
Free(update);
// Rprintf("this gsum took %8.3f\n", 1.0*(clock()-start)/CLOCKS_PER_SEC);
return(ans);
}
4 changes: 4 additions & 0 deletions src/init.c
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,8 @@ SEXP gstart();
SEXP gend();
SEXP gsum();
SEXP gmean();
SEXP gmin();
SEXP gmax();
SEXP isOrderedSubset();
SEXP pointWrapper();
SEXP setNumericRounding();
Expand Down Expand Up @@ -89,6 +91,8 @@ R_CallMethodDef callMethods[] = {
{"Cgend", (DL_FUNC) &gend, -1},
{"Cgsum", (DL_FUNC) &gsum, -1},
{"Cgmean", (DL_FUNC) &gmean, -1},
{"Cgmin", (DL_FUNC) &gmin, -1},
{"Cgmax", (DL_FUNC) &gmax, -1},
{"CisOrderedSubset", (DL_FUNC) &isOrderedSubset, -1},
{"CpointWrapper", (DL_FUNC) &pointWrapper, -1},
{"CsetNumericRounding", (DL_FUNC) &setNumericRounding, -1},
Expand Down

0 comments on commit 0af4511

Please sign in to comment.