gmin and gmax done. Partially address #5754 (git #523)

Rdatatable · Jun 18, 2014 · 0af4511 · 0af4511
1 parent 2602f23
commit 0af4511
Show file tree

Hide file tree

Showing 6 changed files with 266 additions and 25 deletions.
diff --git a/R/data.table.R b/R/data.table.R
@@ -1250,7 +1250,7 @@ chmatch2 <- function(x, table, nomatch=NA_integer_) {
         dotN <- function(x) if (is.name(x) && x == ".N") TRUE else FALSE # For #5760
         if (getOption("datatable.optimize")>=2 && !byjoin && !length(irows) && length(f__) && length(ansvars) && !length(lhs)) {
             # Apply GForce
-            gfuns = c("sum","mean",".N") # added .N for #5760
+            gfuns = c("sum","mean",".N", "min", "max") # added .N for #5760
             .ok = function(q) {
                 if (dotN(q)) return(TRUE) # For #5760
                 ans = is.call(q) && as.character(q[[1L]]) %chin% gfuns && !is.call(q[[2L]]) && (length(q)==2 || identical("na",substring(names(q)[3L],1,2)))
@@ -2138,6 +2138,7 @@ setDT <- function(x, giveNames=TRUE, keep.rownames=FALSE) {
 
 gsum = function(x, na.rm=FALSE) .Call(Cgsum, x, na.rm)
 gmean = function(x, na.rm=FALSE) .Call(Cgmean, x, na.rm)
+gmin = function(x, na.rm=FALSE) .Call(Cgmin, x, na.rm)
+gmax = function(x, na.rm=FALSE) .Call(Cgmax, x, na.rm)
 gstart = function(o, f, l) .Call(Cgstart, o, f, l)
 gend = function() .Call(Cgend)
-
diff --git a/R/test.data.table.R b/R/test.data.table.R
@@ -1,13 +1,13 @@
 
-test.data.table = function(verbose=FALSE) {
+test.data.table = function(verbose=FALSE, pkg="pkg") {
     if (exists("test.data.table",.GlobalEnv,inherits=FALSE)) {
         # package developer
         if ("package:data.table" %in% search()) stop("data.table package loaded")
         if (.Platform$OS.type == "unix" && Sys.info()['sysname'] != "Darwin")
             d = path.expand("~/R/gitdatatable/pkg/inst/tests")
         else {
-            if (!"pkg" %in% dir()) stop("'pkg' not in dir()") 
-            d = paste(getwd(),"/pkg/inst/tests",sep="")
+            if (!pkg %in% dir()) stop(paste(pkg, " not in dir()", sep=""))
+            d = paste(getwd(),"/", pkg, "/inst/tests",sep="")
         }
     } else {
         # user

diff --git a/README.md b/README.md
@@ -116,7 +116,7 @@ DT[, grep("^b", names(DT)) := NULL] # where no columns start with b
 # warns now and returns DT instead of error
 ```
 
-  * GForce now is also optimised for j-expression with `.N`. Closes **#5760** (git [#334](https://github.com/Rdatatable/data.table/issues/334)).
+  * GForce now is also optimised for j-expression with `.N`. Closes **#5760** (git [#334](https://github.com/Rdatatable/data.table/issues/334) and git [#523](https://github.com/Rdatatable/data.table/issues/523)).
 ```S
 DT[, list(.N, mean(y), sum(y)), by=x] # 1.9.2 - doesn't know to use GForce - will be (relatively) slower
 DT[, list(.N, mean(y), sum(y)), by=x] # 1.9.3+ - will use GForce.
@@ -130,6 +130,8 @@ DT[, list(.N, mean(y), sum(y)), by=x] # 1.9.3+ - will use GForce.
 
   * Looping calls to `unique(DT)` such as in `DT[,unique(.SD),by=group]` is now faster by avoiding internal overhead of calling `[.data.table`. Thanks again to Ron Hylton for highlighting in the [same thread](http://r.789695.n4.nabble.com/data-table-is-asking-for-help-tp4692080.html). His example is reduced from 28 sec to 9 sec, with identical results.
 
+  * Following `gsum` and `gmean`, now `gmin` and `gmax` from GForce are also implemented. Closes part of #5754 (git [#523](https://github.com/Rdatatable/data.table/issues/523)). Benchmarks are also provided.
+        i.e., DT[, list(sum(x), min(y), max(z), .N), by=...] # runs by default using GForce
 
 #### BUG FIXES
 

diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw
@@ -4630,6 +4630,46 @@ DT = data.table(a=1:3,b=6:1)
 test(1312, DT[,setkey(.SD),by=a], error="SD is locked. Using set.*functions on [.]SD is reserved")
 # was warning "Already keyed by this key but had invalid row order" due to the key not being cleared after the previous group.  A solution could have been to put back the original key on populating .SD for each group.  But instead we reserve it for future use and push the user towards doing it a different more efficient way (see Arun's speedups in the datatable-help thread).
 
+# gmin and gmax extensive testing (because there are tricky cases)
+DT <- data.table(x=rep(1:6, each=3), y=INT(4,-1,0, NA,4,10, 4,NA,10, 4,10,NA, -2147483647, -2147483647, -2147483647, 2147483647, 2147483647, 2147483647))
+# make sure GForce is running
+options(datatable.optimize=2L)
+
+# for integers
+test(1313.1, DT[, min(y), by=x], DT[, base:::min(y), by=x])
+test(1313.2, DT[, max(y), by=x], DT[, base:::max(y), by=x])
+test(1313.3, DT[, min(y, na.rm=TRUE), by=x], DT[, base:::min(y, na.rm=TRUE), by=x])
+test(1313.4, DT[, max(y, na.rm=TRUE), by=x], DT[, base:::max(y, na.rm=TRUE), by=x])
+# testing all NA - GForce automatically converts to numeric.. optimize=1L errors due to change from integer/numeric (like median)
+DT[x==6, y := INT(NA)]
+test(1313.5, DT[, min(y), by=x], DT[, base:::min(y), by=x])
+test(1313.6, DT[, max(y), by=x], DT[, base:::max(y), by=x])
+test(1313.7, DT[, min(y, na.rm=TRUE), by=x], data.table(x=1:6, V1=c(-1,4,4,4,-2147483647,Inf)), warning="No non-missing")
+test(1313.8, DT[, max(y, na.rm=TRUE), by=x], data.table(x=1:6, V1=c(4,10,10,10,-2147483647,-Inf)), warning="No non-missing")
+
+# for numeric
+DT <- data.table(x=rep(1:6, each=3), y=c(4,-1,0, NA,4,10, 4,NA,10, 4,10,NA, -Inf, NA, NA, Inf, NA, NA))
+test(1313.9, DT[, min(y), by=x], DT[, base:::min(y), by=x])
+test(1313.10, DT[, max(y), by=x], DT[, base:::max(y), by=x])
+test(1313.11, DT[, min(y, na.rm=TRUE), by=x], DT[, base:::min(y, na.rm=TRUE), by=x])
+test(1313.12, DT[, max(y, na.rm=TRUE), by=x], DT[, base:::max(y, na.rm=TRUE), by=x])
+# testing all NA - GForce automatically converts to numeric.. optimize=1L errors due to change from integer/numeric (like median)
+DT[x==6, y := NA_real_]
+test(1313.13, DT[, min(y), by=x], DT[, base:::min(y), by=x])
+test(1313.14, DT[, max(y), by=x], DT[, base:::max(y), by=x])
+test(1313.15, DT[, min(y, na.rm=TRUE), by=x], data.table(x=1:6, V1=c(-1,4,4,4,-Inf,Inf)), warning="No non-missing")
+test(1313.16, DT[, max(y, na.rm=TRUE), by=x], data.table(x=1:6, V1=c(4,10,10,10,-Inf,-Inf)), warning="No non-missing")
+
+# for date (attribute check.. especially after issues/689 !!!)
+DT <- data.table(x = rep(letters[1:2], each=5), y = as.POSIXct('2010-01-01', tz="gmt") + seq(0, 86400*9, 86400))
+test(1313.17, DT[, list(y=min(y)), by=x], DT[c(1,6)])
+test(1313.18, DT[, list(y=max(y)), by=x], DT[c(5,10)])
+DT[c(1,6), y := NA]
+test(1313.19, DT[, list(y=min(y)), by=x], DT[c(1,6)])
+test(1313.20, DT[, list(y=max(y)), by=x], DT[c(1,6)])
+test(1313.21, DT[, list(y=min(y, na.rm=TRUE)), by=x], DT[c(2,7)])
+test(1313.22, DT[, list(y=max(y, na.rm=TRUE)), by=x], DT[c(5,1)])
+
 
 ##########################
 

diff --git a/src/gsumm.c b/src/gsumm.c
@@ -59,12 +59,12 @@ SEXP gsum(SEXP x, SEXP narm)
         for (i=0; i<n; i++) {
             thisgrp = grp[i];
             if(INTEGER(x)[i] == NA_INTEGER) { 
-	            if (!LOGICAL(narm)[0]) s[thisgrp] = NA_REAL;  // Let NA_REAL propogate from here. R_NaReal is IEEE.
-	            continue;
-	        }
-	        s[thisgrp] += INTEGER(x)[i];  // no under/overflow here, s is long double (like base)
-	    }
-	    ans = PROTECT(allocVector(INTSXP, ngrp));
+                if (!LOGICAL(narm)[0]) s[thisgrp] = NA_REAL;  // Let NA_REAL propogate from here. R_NaReal is IEEE.
+                continue;
+            }
+            s[thisgrp] += INTEGER(x)[i];  // no under/overflow here, s is long double (like base)
+        }
+        ans = PROTECT(allocVector(INTSXP, ngrp));
         for (i=0; i<ngrp; i++) {
             if (s[i] > INT_MAX || s[i] < INT_MIN) {
                 warning("Group %d summed to more than type 'integer' can hold so the result has been coerced to 'numeric' automatically, for convenience.", i+1);
@@ -84,14 +84,14 @@ SEXP gsum(SEXP x, SEXP narm)
         for (i=0; i<n; i++) {
             thisgrp = grp[i];
             if(ISNAN(REAL(x)[i]) && LOGICAL(narm)[0]) continue;  // else let NA_REAL propogate from here
-	        s[thisgrp] += REAL(x)[i];  // done in long double, like base
-	    }
-	    for (i=0; i<ngrp; i++) {
-	        if (s[i] > DBL_MAX) REAL(ans)[i] = R_PosInf;
+            s[thisgrp] += REAL(x)[i];  // done in long double, like base
+        }
+        for (i=0; i<ngrp; i++) {
+            if (s[i] > DBL_MAX) REAL(ans)[i] = R_PosInf;
             else if (s[i] < -DBL_MAX) REAL(ans)[i] = R_NegInf;
             else REAL(ans)[i] = (double)s[i];
         }
-	    break;
+        break;
     default:
         free(s);
         error("Type '%s' not supported by GForce sum (gsum). Either add the prefix base::sum(.) or turn off GForce optimization using options(datatable.optimize=1)", type2char(TYPEOF(x)));
@@ -140,18 +140,18 @@ SEXP gmean(SEXP x, SEXP narm)
         for (i=0; i<n; i++) {
             thisgrp = grp[i];
             if(INTEGER(x)[i] == NA_INTEGER) continue;
-	        s[thisgrp] += INTEGER(x)[i];  // no under/overflow here, s is long double
-	        c[thisgrp]++;
-	    }
+            s[thisgrp] += INTEGER(x)[i];  // no under/overflow here, s is long double
+            c[thisgrp]++;
+        }
         break;
     case REALSXP:
         for (i=0; i<n; i++) {
             thisgrp = grp[i];
             if (ISNAN(REAL(x)[i])) continue;
-	        s[thisgrp] += REAL(x)[i];
-	        c[thisgrp]++;
-	    }
-	    break;
+            s[thisgrp] += REAL(x)[i];
+            c[thisgrp]++;
+        }
+        break;
     default:
         free(s); free(c);
         error("Type '%s' not supported by GForce mean (gmean) na.rm=TRUE. Either add the prefix base::mean(.) or turn off GForce optimization using options(datatable.optimize=1)", type2char(TYPEOF(x)));
@@ -170,6 +170,200 @@ SEXP gmean(SEXP x, SEXP narm)
     return(ans);
 }
 
-// TO DO: gmin, gmax, gsd, gprod, gwhich.min, gwhich.max
+// TO DO: gsd, gprod, gwhich.min, gwhich.max
 
+// gmin
+SEXP gmin(SEXP x, SEXP narm)
+{
+    if (!isLogical(narm) || LENGTH(narm)!=1 || LOGICAL(narm)[0]==NA_LOGICAL) error("na.rm must be TRUE or FALSE");
+    if (!isVectorAtomic(x)) error("GForce sum can only be applied to columns, not .SD or similar. To find min of all items in a list such as .SD, either add the prefix base::min(.SD) or turn off GForce optimization using options(datatable.optimize=1). More likely, you may be looking for 'DT[,lappy(.SD,min),by=,.SDcols=]'");
+    R_len_t i, thisgrp=0;
+    int n = LENGTH(x);
+    //clock_t start = clock();
+    SEXP ans;
+    if (grpn != length(x)) error("grpn [%d] != length(x) [%d] in gsum", grpn, length(x));
+    char *update = Calloc(ngrp, char);
+    if (update == NULL) error("Unable to allocate %d * %d bytes for gsum", ngrp, sizeof(char));
+    switch(TYPEOF(x)) {
+    case LGLSXP: case INTSXP:
+        ans = PROTECT(allocVector(INTSXP, ngrp));
+        for (i=0; i<ngrp; i++) INTEGER(ans)[i] = 0;
+        if (!LOGICAL(narm)[0]) {
+            for (i=0; i<n; i++) {
+                thisgrp = grp[i];
+                if (INTEGER(x)[i] != NA_INTEGER && INTEGER(ans)[thisgrp] != NA_INTEGER) {
+                    if ( update[thisgrp] != 1 || INTEGER(ans)[thisgrp] > INTEGER(x)[i] ) {
+                        INTEGER(ans)[thisgrp] = INTEGER(x)[i];
+                        if (update[thisgrp] != 1) update[thisgrp] = 1;
+                    }
+                } else INTEGER(ans)[thisgrp] = NA_INTEGER;
+            }
+        } else {
+            for (i=0; i<n; i++) {
+                thisgrp = grp[i];
+                if (INTEGER(x)[i] != NA_INTEGER) {
+                    if ( update[thisgrp] != 1 || INTEGER(ans)[thisgrp] > INTEGER(x)[i] ) {
+                        INTEGER(ans)[thisgrp] = INTEGER(x)[i];
+                        if (update[thisgrp] != 1) update[thisgrp] = 1;
+                    }
+                } else {
+                    if (update[thisgrp] != 1) {
+                        INTEGER(ans)[thisgrp] = NA_INTEGER;
+                    }
+                }
+            }
+            for (i=0; i<ngrp; i++) {
+                if (update[i] != 1)  {// equivalent of INTEGER(ans)[thisgrp] == NA_INTEGER
+                    warning("No non-missing values found in at least one group. Coercing to numeric type and returning 'Inf' for such groups to be consistent with base");
+                    UNPROTECT(1);
+                    ans = PROTECT(coerceVector(ans, REALSXP));
+                    for (i=0; i<ngrp; i++) {
+                        if (update[i] != 1) REAL(ans)[i] = R_PosInf;
+                    }
+                }
+            }
+        }
+        break;
+    case REALSXP:
+        ans = PROTECT(allocVector(REALSXP, ngrp));
+        for (i=0; i<ngrp; i++) REAL(ans)[i] = 0;
+        if (!LOGICAL(narm)[0]) {
+            for (i=0; i<n; i++) {
+                thisgrp = grp[i];
+                if ( !ISNA(REAL(x)[i]) && !ISNA(REAL(ans)[thisgrp]) ) {
+                    if ( update[thisgrp] != 1 || REAL(ans)[thisgrp] > REAL(x)[i] ) {
+                        REAL(ans)[thisgrp] = REAL(x)[i];
+                        if (update[thisgrp] != 1) update[thisgrp] = 1;
+                    }
+                } else REAL(ans)[thisgrp] = NA_REAL;
+            }
+        } else {
+            for (i=0; i<n; i++) {
+                thisgrp = grp[i];
+                if ( !ISNA(REAL(x)[i]) ) {
+                    if ( update[thisgrp] != 1 || REAL(ans)[thisgrp] > REAL(x)[i] ) {
+                        REAL(ans)[thisgrp] = REAL(x)[i];
+                        if (update[thisgrp] != 1) update[thisgrp] = 1;
+                    }
+                } else {
+                    if (update[thisgrp] != 1) {
+                        REAL(ans)[thisgrp] = R_PosInf;
+                    }
+                }
+            }
+            // everything taken care of already. Just warn if all NA groups have occurred at least once
+            for (i=0; i<ngrp; i++) {
+                if (update[i] != 1)  {// equivalent of REAL(ans)[thisgrp] == R_PosInf
+                    warning("No non-missing values found in at least one group. Returning 'Inf' for such groups to be consistent with base");
+                    break;
+                }
+            }
+        }
+        break;
+    default:
+        error("Type '%s' not supported by GForce sum (gsum). Either add the prefix base::sum(.) or turn off GForce optimization using options(datatable.optimize=1)", type2char(TYPEOF(x)));
+    }
+    copyMostAttrib(x, ans); // all but names,dim and dimnames. And if so, we want a copy here, not keepattr's SET_ATTRIB.
+    UNPROTECT(1);
+    Free(update);
+    // Rprintf("this gsum took %8.3f\n", 1.0*(clock()-start)/CLOCKS_PER_SEC);
+    return(ans);
+}
 
+// gmax
+SEXP gmax(SEXP x, SEXP narm)
+{
+    if (!isLogical(narm) || LENGTH(narm)!=1 || LOGICAL(narm)[0]==NA_LOGICAL) error("na.rm must be TRUE or FALSE");
+    if (!isVectorAtomic(x)) error("GForce sum can only be applied to columns, not .SD or similar. To find min of all items in a list such as .SD, either add the prefix base::min(.SD) or turn off GForce optimization using options(datatable.optimize=1). More likely, you may be looking for 'DT[,lappy(.SD,min),by=,.SDcols=]'");
+    R_len_t i, thisgrp=0;
+    int n = LENGTH(x);
+    //clock_t start = clock();
+    SEXP ans;
+    if (grpn != length(x)) error("grpn [%d] != length(x) [%d] in gsum", grpn, length(x));
+    char *update = Calloc(ngrp, char);
+    if (update == NULL) error("Unable to allocate %d * %d bytes for gsum", ngrp, sizeof(char));
+    switch(TYPEOF(x)) {
+    case LGLSXP: case INTSXP:
+        ans = PROTECT(allocVector(INTSXP, ngrp));
+        for (i=0; i<ngrp; i++) INTEGER(ans)[i] = 0;
+        if (!LOGICAL(narm)[0]) { // simple case - deal in a straightforward manner first
+            for (i=0; i<n; i++) {
+                thisgrp = grp[i];
+                if (INTEGER(x)[i] != NA_INTEGER && INTEGER(ans)[thisgrp] != NA_INTEGER) {
+                    if ( update[thisgrp] != 1 || INTEGER(ans)[thisgrp] < INTEGER(x)[i] ) {
+                        INTEGER(ans)[thisgrp] = INTEGER(x)[i];
+                        if (update[thisgrp] != 1) update[thisgrp] = 1;
+                    }
+                } else  INTEGER(ans)[thisgrp] = NA_INTEGER;
+            }
+        } else {
+            for (i=0; i<n; i++) {
+                thisgrp = grp[i];
+                if (INTEGER(x)[i] != NA_INTEGER) {
+                    if ( update[thisgrp] != 1 || INTEGER(ans)[thisgrp] < INTEGER(x)[i] ) {
+                        INTEGER(ans)[thisgrp] = INTEGER(x)[i];
+                        if (update[thisgrp] != 1) update[thisgrp] = 1;
+                    }
+                } else {
+                    if (update[thisgrp] != 1) {
+                        INTEGER(ans)[thisgrp] = NA_INTEGER;
+                    }
+                }
+            }
+            for (i=0; i<ngrp; i++) {
+                if (update[i] != 1)  {// equivalent of INTEGER(ans)[thisgrp] == NA_INTEGER
+                    warning("No non-missing values found in at least one group. Coercing to numeric type and returning 'Inf' for such groups to be consistent with base");
+                    UNPROTECT(1);
+                    ans = PROTECT(coerceVector(ans, REALSXP));
+                    for (i=0; i<ngrp; i++) {
+                        if (update[i] != 1) REAL(ans)[i] = -R_PosInf;
+                    }
+                }
+            }
+        }
+        break;
+    case REALSXP:
+        ans = PROTECT(allocVector(REALSXP, ngrp));
+        for (i=0; i<ngrp; i++) REAL(ans)[i] = 0;
+        if (!LOGICAL(narm)[0]) {
+            for (i=0; i<n; i++) {
+                thisgrp = grp[i];
+                if ( !ISNA(REAL(x)[i]) && !ISNA(REAL(ans)[thisgrp]) ) {
+                    if ( update[thisgrp] != 1 || REAL(ans)[thisgrp] < REAL(x)[i] ) {
+                        REAL(ans)[thisgrp] = REAL(x)[i];
+                        if (update[thisgrp] != 1) update[thisgrp] = 1;
+                    }
+                } else REAL(ans)[thisgrp] = NA_REAL;
+            }
+        } else {
+            for (i=0; i<n; i++) {
+                thisgrp = grp[i];
+                if ( !ISNA(REAL(x)[i]) ) {
+                    if ( update[thisgrp] != 1 || REAL(ans)[thisgrp] < REAL(x)[i] ) {
+                        REAL(ans)[thisgrp] = REAL(x)[i];
+                        if (update[thisgrp] != 1) update[thisgrp] = 1;
+                    }
+                } else {
+                    if (update[thisgrp] != 1) {
+                        REAL(ans)[thisgrp] = -R_PosInf;
+                    }
+                }
+            }
+            // everything taken care of already. Just warn if all NA groups have occurred at least once
+            for (i=0; i<ngrp; i++) {
+                if (update[i] != 1)  { // equivalent of REAL(ans)[thisgrp] == -R_PosInf
+                    warning("No non-missing values found in at least one group. Returning '-Inf' for such groups to be consistent with base");
+                    break;
+                }
+            }
+        }
+        break;
+    default:
+        error("Type '%s' not supported by GForce sum (gsum). Either add the prefix base::sum(.) or turn off GForce optimization using options(datatable.optimize=1)", type2char(TYPEOF(x)));
+    }
+    copyMostAttrib(x, ans); // all but names,dim and dimnames. And if so, we want a copy here, not keepattr's SET_ATTRIB.
+    UNPROTECT(1);
+    Free(update);
+    // Rprintf("this gsum took %8.3f\n", 1.0*(clock()-start)/CLOCKS_PER_SEC);
+    return(ans);
+}
diff --git a/src/init.c b/src/init.c
@@ -40,6 +40,8 @@ SEXP gstart();
 SEXP gend();
 SEXP gsum();
 SEXP gmean();
+SEXP gmin();
+SEXP gmax();
 SEXP isOrderedSubset();
 SEXP pointWrapper();
 SEXP setNumericRounding();
@@ -89,6 +91,8 @@ R_CallMethodDef callMethods[] = {
 {"Cgend", (DL_FUNC) &gend, -1},
 {"Cgsum", (DL_FUNC) &gsum, -1},
 {"Cgmean", (DL_FUNC) &gmean, -1},
+{"Cgmin", (DL_FUNC) &gmin, -1},
+{"Cgmax", (DL_FUNC) &gmax, -1},
 {"CisOrderedSubset", (DL_FUNC) &isOrderedSubset, -1},
 {"CpointWrapper", (DL_FUNC) &pointWrapper, -1},
 {"CsetNumericRounding", (DL_FUNC) &setNumericRounding, -1},