Skip to content

Commit

Permalink
gforce now optimises 'median' as well, #523.
Browse files Browse the repository at this point in the history
  • Loading branch information
arunsrinivasan committed Oct 30, 2015
1 parent 9362c4d commit d2f7d63
Show file tree
Hide file tree
Showing 7 changed files with 314 additions and 3 deletions.
3 changes: 2 additions & 1 deletion R/data.table.R
Original file line number Diff line number Diff line change
Expand Up @@ -1540,7 +1540,7 @@ chmatch2 <- function(x, table, nomatch=NA_integer_) {
}
} else {
# Apply GForce
gfuns = c("sum","mean",".N", "min", "max") # added .N for #5760
gfuns = c("sum", "mean", "median", ".N", "min", "max") # added .N for #5760
.ok <- function(q) {
if (dotN(q)) return(TRUE) # For #5760
ans = is.call(q) && as.character(q[[1L]]) %chin% gfuns && !is.call(q[[2L]]) && (length(q)==2 || identical("na",substring(names(q)[3L],1,2)))
Expand Down Expand Up @@ -2511,6 +2511,7 @@ rleidv <- function(x, cols=seq_along(x)) {

gsum <- function(x, na.rm=FALSE) .Call(Cgsum, x, na.rm)
gmean <- function(x, na.rm=FALSE) .Call(Cgmean, x, na.rm)
gmedian <- function(x, na.rm=FALSE) .Call(Cgmedian, x, na.rm)
gmin <- function(x, na.rm=FALSE) .Call(Cgmin, x, na.rm)
gmax <- function(x, na.rm=FALSE) .Call(Cgmax, x, na.rm)
gstart <- function(o, f, l) .Call(Cgstart, o, f, l)
Expand Down
2 changes: 2 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,8 @@

9. `print.data.table` now warns when `bit64` package isn't loaded but the `data.table` contains `integer64` columns, [#975](https://github.com/Rdatatable/data.table/issues/975). Thanks to @StephenMcInerney.

10. GForce is now also optimised for `median`. Partly addresses [#523](https://github.com/Rdatatable/data.table/issues/523). Check that issue for benchmarks.

#### BUG FIXES

1. Now compiles and runs on IBM AIX gcc. Thanks to Vinh Nguyen for investigation and testing, [#1351](https://github.com/Rdatatable/data.table/issues/1351).
Expand Down
24 changes: 24 additions & 0 deletions inst/tests/tests.Rraw
Original file line number Diff line number Diff line change
Expand Up @@ -7102,6 +7102,30 @@ test(1578.4, fread(input, blank.lines.skip=TRUE), data.table( a=1:2, b=3:4))
test(1578.5, fread("530_fread.txt", skip=47L), data.table(V1=1:2, V2=3:4))
test(1578.6, fread("530_fread.txt", skip=47L, blank.lines.skip=TRUE), data.table(a=1:2, b=3:4))

# testing gforce::gmedian
dt = data.table(x = sample(letters, 300, TRUE),
i1 = sample(-10:10, 300, TRUE),
i2 = sample(c(-10:10, NA), 300, TRUE),
d1 = as.numeric(sample(-10:10, 300, TRUE)),
d2 = as.numeric(sample(c(NA, NaN, -10:10), 300, TRUE)))
if ('package:bit64' %in% search()) {
dt[, `:=`(d3 = as.integer64(sample(-10:10, 300, TRUE)))]
dt[, `:=`(d4 = as.integer64(sample(c(-10:10,NA), 300, TRUE)))]
}

# make sure gforce is on
optim = getOption("datatable.optimize")
options(datatable.optimize=2L)
test(1579.1, dt[, lapply(.SD, median), by=x],
dt[, lapply(.SD, function(x) median(as.numeric(x))), by=x])
test(1579.2, dt[, lapply(.SD, median, na.rm=TRUE), by=x],
dt[, lapply(.SD, function(x) median(as.numeric(x), na.rm=TRUE)), by=x])
test(1579.3, dt[, lapply(.SD, median), keyby=x],
dt[, lapply(.SD, function(x) median(as.numeric(x))), keyby=x])
test(1579.4, dt[, lapply(.SD, median, na.rm=TRUE), keyby=x],
dt[, lapply(.SD, function(x) median(as.numeric(x), na.rm=TRUE)), keyby=x])
options(datatable.optimize=optim)

##########################


Expand Down
4 changes: 4 additions & 0 deletions src/data.table.h
Original file line number Diff line number Diff line change
Expand Up @@ -76,3 +76,7 @@ SEXP coerce_to_char(SEXP s, SEXP env);

// rbindlist.c
SEXP combineFactorLevels(SEXP factorLevels, int * factorType, Rboolean * isRowOrdered);

// quickselect
double dquickselect(double *x, int n, int k);
double iquickselect(int *x, int n, int k);
180 changes: 178 additions & 2 deletions src/gsumm.c
Original file line number Diff line number Diff line change
Expand Up @@ -3,8 +3,15 @@

static int *grp = NULL; // the group of each x item, like a factor
static int ngrp = 0; // number of groups
static int *grpsize = NULL; // size of each group, used by gmean not gsum
static int *grpsize = NULL; // size of each group, used by gmean (and gmedian) not gsum
static int grpn = 0; // length of underlying x == length(grp)
// for gmedian
static int maxgrpn = 0;
static int *oo = NULL;
static int *ff = NULL;
static int isunsorted = 0;
static union {double d;
long long ll;} u;

SEXP gstart(SEXP o, SEXP f, SEXP l) {
int i, j, g, *this;
Expand All @@ -21,6 +28,7 @@ SEXP gstart(SEXP o, SEXP f, SEXP l) {
grp = malloc(grpn * sizeof(int));
if (!grp) error("Unable to allocate %d * %d bytes in gstart", grpn, sizeof(int));
if (LENGTH(o)) {
isunsorted = 1; // for gmedian
for (g=0; g<ngrp; g++) {
this = INTEGER(o) + INTEGER(f)[g]-1;
for (j=0; j<grpsize[g]; j++) grp[ this[j]-1 ] = g;
Expand All @@ -31,12 +39,17 @@ SEXP gstart(SEXP o, SEXP f, SEXP l) {
for (j=0; j<grpsize[g]; j++) this[j] = g;
}
}
// for gmedian
// initialise maxgrpn
maxgrpn = INTEGER(getAttrib(o, install("maxgrpn")))[0];
oo = INTEGER(o);
ff = INTEGER(f);
// Rprintf("gstart took %8.3f\n", 1.0*(clock()-start)/CLOCKS_PER_SEC);
return(R_NilValue);
}

SEXP gend() {
free(grp); grp = NULL; ngrp = 0;
free(grp); grp = NULL; ngrp = 0; maxgrpn = 0;
return(R_NilValue);
}

Expand Down Expand Up @@ -441,3 +454,166 @@ SEXP gmax(SEXP x, SEXP narm)
// Rprintf("this gmax took %8.3f\n", 1.0*(clock()-start)/CLOCKS_PER_SEC);
return(ans);
}

// gmedian, always returns numeric type (to avoid as.numeric() wrap..)
SEXP gmedian(SEXP x, SEXP narm) {
if (!isLogical(narm) || LENGTH(narm)!=1 || LOGICAL(narm)[0]==NA_LOGICAL) error("na.rm must be TRUE or FALSE");
if (!isVectorAtomic(x)) error("GForce max can only be applied to columns, not .SD or similar. To find max of all items in a list such as .SD, either add the prefix base::max(.SD) or turn off GForce optimization using options(datatable.optimize=1). More likely, you may be looking for 'DT[,lapply(.SD,max),by=,.SDcols=]'");
R_len_t i=0, j=0, k=0, imed=0, thisgrpsize=0, medianindex=0, nacount=0;
double val = 0.0;
Rboolean isna = FALSE, isint64 = FALSE;
SEXP ans, sub, class;
void *ptr;
if (grpn != length(x)) error("grpn [%d] != length(x) [%d] in gmax", grpn, length(x));
switch(TYPEOF(x)) {
case REALSXP:
class = getAttrib(x, R_ClassSymbol);
isint64 = (isString(class) && STRING_ELT(class, 0) == char_integer64);
ans = PROTECT(allocVector(REALSXP, ngrp));
sub = PROTECT(allocVector(REALSXP, maxgrpn)); // allocate once upfront
ptr = DATAPTR(sub);
if (!LOGICAL(narm)[0]) {
for (i=0; i<ngrp; i++) {
isna = FALSE;
thisgrpsize = grpsize[i];
SETLENGTH(sub, thisgrpsize);
for (j=0; j<thisgrpsize; j++) {
k = ff[i]+j-1;
if (isunsorted) k = oo[k]-1;
// TODO: raise this if-statement?
if (!isint64) {
if (ISNAN(REAL(x)[k])) {
REAL(ans)[i] = NA_REAL;
isna = TRUE; break;
} else REAL(sub)[j] = REAL(x)[k];
} else {
u.d = REAL(x)[k];
if (u.ll == NAINT64) {
REAL(ans)[i] = NA_REAL;
isna = TRUE; break;
} else REAL(sub)[j] = (double)u.ll;
}
}
if (isna) continue;
medianindex = (R_len_t)(ceil((double)(thisgrpsize)/2));
REAL(ans)[i] = dquickselect(ptr, thisgrpsize, medianindex-1); // 0-indexed
// all elements to the left of thisgrpsize/2 is < the value at that index
// we just need to get min of last half
if (thisgrpsize % 2 == 0) {
val = REAL(sub)[medianindex]; // 0-indexed
for (imed=medianindex+1; imed<thisgrpsize; imed++) {
val = REAL(sub)[imed] > val ? val : REAL(sub)[imed];
}
REAL(ans)[i] = (REAL(ans)[i] + val)/2.0;
}
}
} else {
for (i=0; i<ngrp; i++) {
nacount = 0;
thisgrpsize = grpsize[i];
for (j=0; j<thisgrpsize; j++) {
k = ff[i]+j-1;
if (isunsorted) k = oo[k]-1;
// TODO: raise this if-statement?
if (!isint64) {
if (ISNAN(REAL(x)[k])) {
nacount++; continue;
} else REAL(sub)[j-nacount] = REAL(x)[k];
} else {
u.d = REAL(x)[k];
if (u.ll == NAINT64) {
nacount++; continue;
} else REAL(sub)[j-nacount] = (double)u.ll;
}
}
if (nacount == thisgrpsize) {
REAL(ans)[i] = NA_REAL; // all NAs
continue;
}
thisgrpsize -= nacount;
SETLENGTH(sub, thisgrpsize);
medianindex = (R_len_t)(ceil((double)(thisgrpsize)/2));
REAL(ans)[i] = dquickselect(ptr, thisgrpsize, medianindex-1);
if (thisgrpsize % 2 == 0) {
// all elements to the left of thisgrpsize/2 is < the value at that index
// we just need to get min of last half
val = REAL(sub)[medianindex]; // 0-indexed
for (imed=medianindex+1; imed<thisgrpsize; imed++) {
val = REAL(sub)[imed] > val ? val : REAL(sub)[imed];
}
REAL(ans)[i] = (REAL(ans)[i] + val)/2.0;
}
}
}
SETLENGTH(sub, maxgrpn);
break;
case LGLSXP: case INTSXP:
ans = PROTECT(allocVector(REALSXP, ngrp));
sub = PROTECT(allocVector(INTSXP, maxgrpn)); // allocate once upfront
ptr = DATAPTR(sub);
if (!LOGICAL(narm)[0]) {
for (i=0; i<ngrp; i++) {
isna = FALSE;
thisgrpsize = grpsize[i];
SETLENGTH(sub, thisgrpsize);
for (j=0; j<thisgrpsize; j++) {
k = ff[i]+j-1;
if (isunsorted) k = oo[k]-1;
if (INTEGER(x)[k] == NA_INTEGER) {
REAL(ans)[i] = NA_REAL;
isna = TRUE; break;
} else INTEGER(sub)[j] = INTEGER(x)[k];
}
if (isna) continue;
medianindex = (R_len_t)(ceil((double)(thisgrpsize)/2));
REAL(ans)[i] = iquickselect(ptr, thisgrpsize, medianindex-1); // 0-indexed
// all elements to the left of thisgrpsize/2 is < the value at that index
// we just need to get min of last half
if (thisgrpsize % 2 == 0) {
val = INTEGER(sub)[medianindex]; // 0-indexed
for (imed=medianindex+1; imed<thisgrpsize; imed++) {
val = INTEGER(sub)[imed] > val ? val : INTEGER(sub)[imed];
}
REAL(ans)[i] = (REAL(ans)[i] + val)/2.0;
}
}
} else {
for (i=0; i<ngrp; i++) {
nacount = 0;
thisgrpsize = grpsize[i];
for (j=0; j<thisgrpsize; j++) {
k = ff[i]+j-1;
if (isunsorted) k = oo[k]-1;
if (INTEGER(x)[k] == NA_INTEGER) {
nacount++; continue;
}
INTEGER(sub)[j-nacount] = INTEGER(x)[k];
}
if (nacount == thisgrpsize) {
REAL(ans)[i] = NA_REAL; // all NAs
continue;
}
thisgrpsize -= nacount;
SETLENGTH(sub, thisgrpsize);
medianindex = (R_len_t)(ceil((double)(thisgrpsize)/2));
REAL(ans)[i] = iquickselect(ptr, thisgrpsize, medianindex-1);
if (thisgrpsize % 2 == 0) {
// all elements to the left of thisgrpsize/2 is < the value at that index
// we just need to get min of last half
val = INTEGER(sub)[medianindex]; // 0-indexed
for (imed=medianindex+1; imed<thisgrpsize; imed++) {
val = INTEGER(sub)[imed] > val ? val : INTEGER(sub)[imed];
}
REAL(ans)[i] = (REAL(ans)[i] + val)/2.0;
}
}
}
SETLENGTH(sub, maxgrpn);
break;
default:
error("Type '%s' not supported by GForce median (gmedian). Either add the prefix base::max(.) or turn off GForce optimization using options(datatable.optimize=1)", type2char(TYPEOF(x)));
}
UNPROTECT(2);
return(ans);
}

2 changes: 2 additions & 0 deletions src/init.c
Original file line number Diff line number Diff line change
Expand Up @@ -60,6 +60,7 @@ SEXP anyNA();
SEXP isReallyReal();
SEXP setlevels();
SEXP rleid();
SEXP gmedian();

// .Externals
SEXP fastmean();
Expand Down Expand Up @@ -122,6 +123,7 @@ R_CallMethodDef callMethods[] = {
{"CisReallyReal", (DL_FUNC) &isReallyReal, -1},
{"Csetlevels", (DL_FUNC) &setlevels, -1},
{"Crleid", (DL_FUNC) &rleid, -1},
{"Cgmedian", (DL_FUNC) &gmedian, -1},

{NULL, NULL, 0}
};
Expand Down
102 changes: 102 additions & 0 deletions src/quickselect.c
Original file line number Diff line number Diff line change
@@ -0,0 +1,102 @@
#include "data.table.h"
#include <Rdefines.h>
//#include <sys/mman.h>
#include <Rversion.h>
#include <fcntl.h>
#include <time.h>

// from good ol' Numerical Recipes in C
#define SWAP(a,b) temp=(a);(a)=(b);(b)=temp;

double dquickselect(double *x, int n, int k) {
unsigned long i,ir,j,l,mid;
double a,temp;

l=0;
ir=n-1;
for(;;) {
if (ir <= l+1) {
if (ir == l+1 && x[ir] < x[l]) {
SWAP(x[l],x[ir]);
}
return x[k];
} else {
mid=(l+ir) >> 1;
SWAP(x[mid],x[l+1]);
if (x[l] > x[ir]) {
SWAP(x[l],x[ir]);
}
if (x[l+1] > x[ir]) {
SWAP(x[l+1],x[ir]);
}
if (x[l] > x[l+1]) {
SWAP(x[l],x[l+1]);
}
i=l+1;
j=ir;
a=x[l+1];
for (;;) {
do i++; while (x[i] < a);
do j--; while (x[j] > a);
if (j < i) break;
SWAP(x[i],x[j]);
}
x[l+1]=x[j];
x[j]=a;
if (j >= k) ir=j-1;
if (j <= k) l=i;
}
}
}

double iquickselect(int *x, int n, int k) {
unsigned long i,ir,j,l,mid;
int a,temp;

l=0;
ir=n-1;
for(;;) {
if (ir <= l+1) {
if (ir == l+1 && x[ir] < x[l]) {
SWAP(x[l],x[ir]);
}
return (double)(x[k]);
} else {
mid=(l+ir) >> 1;
SWAP(x[mid],x[l+1]);
if (x[l] > x[ir]) {
SWAP(x[l],x[ir]);
}
if (x[l+1] > x[ir]) {
SWAP(x[l+1],x[ir]);
}
if (x[l] > x[l+1]) {
SWAP(x[l],x[l+1]);
}
i=l+1;
j=ir;
a=x[l+1];
for (;;) {
do i++; while (x[i] < a);
do j--; while (x[j] > a);
if (j < i) break;
SWAP(x[i],x[j]);
}
x[l+1]=x[j];
x[j]=a;
if (j >= k) ir=j-1;
if (j <= k) l=i;
}
}
}


// SEXP quickselect(SEXP xArg, SEXP n, SEXP k) {

// void *x = DATAPTR(xArg);
// SEXP ans = PROTECT(allocVector(REALSXP, 1L));
// REAL(ans)[0] = quickselectwrapper(x, INTEGER(n)[0], INTEGER(k)[0]-1);

// UNPROTECT(1);
// return(ans);
// }

0 comments on commit d2f7d63

Please sign in to comment.