Skip to content

Commit

Permalink
head(.SD, 1) and tail(.SD,1) are gforce optimised, #523.
Browse files Browse the repository at this point in the history
  • Loading branch information
arunsrinivasan committed Nov 8, 2015
1 parent 3a316c2 commit e615532
Show file tree
Hide file tree
Showing 5 changed files with 225 additions and 26 deletions.
15 changes: 12 additions & 3 deletions R/data.table.R
Original file line number Diff line number Diff line change
Expand Up @@ -1431,7 +1431,7 @@ chmatch2 <- function(x, table, nomatch=NA_integer_) {
jvnames = ansvars
}
} else {
if ( length(jsub) == 3L && (jsub[[1L]] == "[" || jsub[[1L]] == "head") && jsub[[2L]] == ".SD" && (is.numeric(jsub[[3L]]) || jsub[[3L]] == ".N") ) {
if ( length(jsub) == 3L && (jsub[[1L]] == "[" || jsub[[1L]] == "head" || jsub[[1L]] == "tail") && jsub[[2L]] == ".SD" && (is.numeric(jsub[[3L]]) || jsub[[3L]] == ".N") ) {
# optimise .SD[1] or .SD[2L]. Not sure how to test .SD[a] as to whether a is numeric/integer or a data.table, yet.
jsub = as.call(c(quote(list), lapply(ansvars, function(x) { jsub[[2L]] = as.name(x); jsub })))
jvnames = ansvars
Expand Down Expand Up @@ -1542,10 +1542,14 @@ chmatch2 <- function(x, table, nomatch=NA_integer_) {
}
} else {
# Apply GForce
gfuns = c("sum", "mean", "median", ".N", "min", "max") # added .N for #5760
gfuns = c("sum", "mean", "median", ".N", "min", "max", "head", "last", "tail") # added .N for #5760
.ok <- function(q) {
if (dotN(q)) return(TRUE) # For #5760
ans = is.call(q) && as.character(q[[1L]]) %chin% gfuns && !is.call(q[[2L]]) && (length(q)==2 || identical("na",substring(names(q)[3L],1,2)))
cond = is.call(q) && as.character(q[[1L]]) %chin% gfuns && !is.call(q[[2L]])
ans = cond && (length(q)==2 || identical("na",substring(names(q)[3L],1,2)))
if (identical(ans, TRUE)) return(ans)
ans = cond && length(q)==3 && ( as.character(q[[1]]) %chin% c("head", "tail") &&
(identical(q[[3]], 1) || identical(q[[3]], 1L)) )
if (is.na(ans)) ans=FALSE
ans
}
Expand Down Expand Up @@ -2511,6 +2515,11 @@ rleidv <- function(x, cols=seq_along(x)) {
.Call(Crleid, x, -1L)
}

ghead <- function(x, n) .Call(Cghead, x, as.integer(n)) # n is not used at the moment
gtail <- function(x, n) .Call(Cgtail, x, as.integer(n)) # n is not used at the moment
gfirst <- function(x) .Call(Cgfirst, x)
glast <- function(x) .Call(Cglast, x)

gsum <- function(x, na.rm=FALSE) .Call(Cgsum, x, na.rm)
gmean <- function(x, na.rm=FALSE) .Call(Cgmean, x, na.rm)
gmedian <- function(x, na.rm=FALSE) .Call(Cgmedian, x, na.rm)
Expand Down
2 changes: 2 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,8 @@

11. GForce kicks in along with subsets in `i` as well, e.g., `DT[x > 2, mean(y), by=z]`. Partly addresses [#971](https://github.com/Rdatatable/data.table/issues/971).

12. GForce is optimised for `head(.SD, 1)` and `tail(.SD, 1`). Partly addresses [#523](https://github.com/Rdatatable/data.table/issues/523). Check the link for benchmarks.

#### BUG FIXES

1. Now compiles and runs on IBM AIX gcc. Thanks to Vinh Nguyen for investigation and testing, [#1351](https://github.com/Rdatatable/data.table/issues/1351).
Expand Down
52 changes: 51 additions & 1 deletion inst/tests/tests.Rraw
Original file line number Diff line number Diff line change
Expand Up @@ -7094,7 +7094,7 @@ test(1578.4, fread(input, blank.lines.skip=TRUE), data.table( a=1:2, b=3:4))
test(1578.5, fread("530_fread.txt", skip=47L), data.table(V1=1:2, V2=3:4))
test(1578.6, fread("530_fread.txt", skip=47L, blank.lines.skip=TRUE), data.table(a=1:2, b=3:4))

# testing gforce::gmedian
# gforce optimisations
dt = data.table(x = sample(letters, 300, TRUE),
i1 = sample(-10:10, 300, TRUE),
i2 = sample(c(-10:10, NA), 300, TRUE),
Expand All @@ -7108,6 +7108,8 @@ if ('package:bit64' %in% search()) {
# make sure gforce is on
optim = getOption("datatable.optimize")
options(datatable.optimize=2L)

# testing gforce::gmedian
test(1579.1, dt[, lapply(.SD, median), by=x],
dt[, lapply(.SD, function(x) median(as.numeric(x))), by=x])
test(1579.2, dt[, lapply(.SD, median, na.rm=TRUE), by=x],
Expand All @@ -7116,6 +7118,34 @@ test(1579.3, dt[, lapply(.SD, median), keyby=x],
dt[, lapply(.SD, function(x) median(as.numeric(x))), keyby=x])
test(1579.4, dt[, lapply(.SD, median, na.rm=TRUE), keyby=x],
dt[, lapply(.SD, function(x) median(as.numeric(x), na.rm=TRUE)), keyby=x])
ans = capture.output(dt[, lapply(.SD, median), by=x, verbose=TRUE])
test(1579.5, any(grepl("GForce optimized", ans)), TRUE)

# testing gforce::ghead and gforce::gtail
# head(.SD, 1) and tail(.SD, 1) optimisation
test(1579.6, dt[, head(.SD,1), by=x], dt[, utils::head(.SD,1), by=x])
test(1579.7, dt[, head(.SD,1), by=x], dt[, utils::head(.SD,1), by=x])
test(1579.8, dt[, head(.SD,1), keyby=x], dt[, utils::head(.SD,1), keyby=x])
test(1579.9, dt[, head(.SD,1), keyby=x], dt[, utils::head(.SD,1), keyby=x])
test(1579.10, dt[, head(.SD,1L), by=x], dt[, utils::head(.SD,1L), by=x])
test(1579.11, dt[, head(.SD,1L), by=x], dt[, utils::head(.SD,1L), by=x])
test(1579.12, dt[, head(.SD,1L), keyby=x], dt[, utils::head(.SD,1L), keyby=x])
test(1579.13, dt[, head(.SD,1L), keyby=x], dt[, utils::head(.SD,1L), keyby=x])

test(1579.6, dt[, tail(.SD,1), by=x], dt[, utils::tail(.SD,1), by=x])
test(1579.7, dt[, tail(.SD,1), by=x], dt[, utils::tail(.SD,1), by=x])
test(1579.8, dt[, tail(.SD,1), keyby=x], dt[, utils::tail(.SD,1), keyby=x])
test(1579.9, dt[, tail(.SD,1), keyby=x], dt[, utils::tail(.SD,1), keyby=x])
test(1579.10, dt[, tail(.SD,1L), by=x], dt[, utils::tail(.SD,1L), by=x])
test(1579.11, dt[, tail(.SD,1L), by=x], dt[, utils::tail(.SD,1L), by=x])
test(1579.12, dt[, tail(.SD,1L), keyby=x], dt[, utils::tail(.SD,1L), keyby=x])
test(1579.13, dt[, tail(.SD,1L), keyby=x], dt[, utils::tail(.SD,1L), keyby=x])

ans = capture.output(dt[, head(.SD,1), by=x, verbose=TRUE])
test(1579.14, any(grepl("GForce optimized", ans)), TRUE)
ans = capture.output(dt[, tail(.SD,1), by=x, verbose=TRUE])
test(1579.15, any(grepl("GForce optimized", ans)), TRUE)

options(datatable.optimize=optim)

# test for #1419, rleid doesn't remove names attribute
Expand All @@ -7128,6 +7158,7 @@ test(1580, nx, names(x))
# make sure GForce kicks in and the results are identical
dt = dt[, .(x, d1, d2)]
optim = getOption("datatable.optimize")

options(datatable.optimize=1L)
opt1 <- capture.output(ans1 <- dt[x %in% letters[15:20],
c(.N, lapply(.SD, sum, na.rm=TRUE),
Expand All @@ -7147,8 +7178,27 @@ opt2 <- capture.output(ans2 <- dt[x %in% letters[15:20],
test(1581.1, ans1, ans2)
test(1581.2, any(grepl("^Starting dogroups", opt1)), TRUE)
test(1581.3, any(grepl("^GForce optimized j", opt2)), TRUE)

# subsets in 'i' for head and tail
options(datatable.optimize=1L)
opt1 <- capture.output(ans1 <- dt[x %in% letters[15:20], head(.SD,1), by=x, verbose=TRUE])
options(datatable.optimize=2L)
opt2 <- capture.output(ans2 <- dt[x %in% letters[15:20], head(.SD,1), by=x, verbose=TRUE])
test(1581.4, ans1, ans2)
test(1581.5, any(grepl("^Starting dogroups", opt1)), TRUE)
test(1581.6, any(grepl("^GForce optimized j", opt2)), TRUE)

options(datatable.optimize=1L)
opt1 <- capture.output(ans1 <- dt[x %in% letters[15:20], tail(.SD,1), by=x, verbose=TRUE])
options(datatable.optimize=2L)
opt2 <- capture.output(ans2 <- dt[x %in% letters[15:20], tail(.SD,1), by=x, verbose=TRUE])
test(1581.7, ans1, ans2)
test(1581.8, any(grepl("^Starting dogroups", opt1)), TRUE)
test(1581.9, any(grepl("^GForce optimized j", opt2)), TRUE)

options(datatable.optimize=optim)


##########################

# TODO: Tests involving GForce functions needs to be run with optimisation level 1 and 2, so that both functions are tested all the time.
Expand Down
173 changes: 152 additions & 21 deletions src/gsumm.c
Original file line number Diff line number Diff line change
Expand Up @@ -507,20 +507,20 @@ SEXP gmedian(SEXP x, SEXP narm) {
k = (irowslen == -1) ? k : irows[k]-1;
// TODO: raise this if-statement?
if (!isint64) {
if (!ISNAN(REAL(x)[k])) {
REAL(sub)[j] = REAL(x)[k];
} else {
if (!ISNAN(REAL(x)[k])) {
REAL(sub)[j] = REAL(x)[k];
} else {
REAL(ans)[i] = NA_REAL;
isna = TRUE; break;
}
}
} else {
u.d = REAL(x)[k];
if (u.ll != NAINT64) {
REAL(sub)[j] = (double)u.ll;
} else {
if (u.ll != NAINT64) {
REAL(sub)[j] = (double)u.ll;
} else {
REAL(ans)[i] = NA_REAL;
isna = TRUE; break;
}
}
}
}
if (isna) continue;
Expand All @@ -546,14 +546,14 @@ SEXP gmedian(SEXP x, SEXP narm) {
k = (irowslen == -1) ? k : irows[k]-1;
// TODO: raise this if-statement?
if (!isint64) {
if (!ISNAN(REAL(x)[k])) {
REAL(sub)[j-nacount] = REAL(x)[k];
} else { nacount++; continue; }
if (!ISNAN(REAL(x)[k])) {
REAL(sub)[j-nacount] = REAL(x)[k];
} else { nacount++; continue; }
} else {
u.d = REAL(x)[k];
if (u.ll != NAINT64) {
REAL(sub)[j-nacount] = (double)u.ll;
} else { nacount++; continue; }
if (u.ll != NAINT64) {
REAL(sub)[j-nacount] = (double)u.ll;
} else { nacount++; continue; }
}
}
if (nacount == thisgrpsize) {
Expand Down Expand Up @@ -590,12 +590,12 @@ SEXP gmedian(SEXP x, SEXP narm) {
k = ff[i]+j-1;
if (isunsorted) k = oo[k]-1;
k = (irowslen == -1) ? k : irows[k]-1;
if (INTEGER(x)[k] != NA_INTEGER) {
INTEGER(sub)[j] = INTEGER(x)[k];
} else {
if (INTEGER(x)[k] != NA_INTEGER) {
INTEGER(sub)[j] = INTEGER(x)[k];
} else {
REAL(ans)[i] = NA_REAL;
isna = TRUE; break;
}
}
}
if (isna) continue;
medianindex = (R_len_t)(ceil((double)(thisgrpsize)/2));
Expand All @@ -618,9 +618,9 @@ SEXP gmedian(SEXP x, SEXP narm) {
k = ff[i]+j-1;
if (isunsorted) k = oo[k]-1;
k = (irowslen == -1) ? k : irows[k]-1;
if (INTEGER(x)[k] != NA_INTEGER) {
INTEGER(sub)[j-nacount] = INTEGER(x)[k];
} else { nacount++; continue; }
if (INTEGER(x)[k] != NA_INTEGER) {
INTEGER(sub)[j-nacount] = INTEGER(x)[k];
} else { nacount++; continue; }
}
if (nacount == thisgrpsize) {
REAL(ans)[i] = NA_REAL; // all NAs
Expand Down Expand Up @@ -650,3 +650,134 @@ SEXP gmedian(SEXP x, SEXP narm) {
return(ans);
}

SEXP glast(SEXP x) {
if (!isVectorAtomic(x)) error("GForce tail can only be applied to columns, not .SD or similar. To get tail of all items in a list such as .SD, either add the prefix utils::tail(.SD) or turn off GForce optimization using options(datatable.optimize=1).");

R_len_t i,k;
int n = (irowslen == -1) ? length(x) : irowslen;
SEXP ans;
if (grpn != n) error("grpn [%d] != length(x) [%d] in gtail", grpn, n);
switch(TYPEOF(x)) {
case LGLSXP:
ans = PROTECT(allocVector(LGLSXP, ngrp));
for (i=0; i<ngrp; i++) {
k = ff[i]+grpsize[i]-2;
if (isunsorted) k = oo[k]-1;
k = (irowslen == -1) ? k : irows[k]-1;
LOGICAL(ans)[i] = LOGICAL(x)[k];
}
break;
case INTSXP:
ans = PROTECT(allocVector(INTSXP, ngrp));
for (i=0; i<ngrp; i++) {
k = ff[i]+grpsize[i]-2;
if (isunsorted) k = oo[k]-1;
k = (irowslen == -1) ? k : irows[k]-1;
INTEGER(ans)[i] = INTEGER(x)[k];
}
break;
case REALSXP:
ans = PROTECT(allocVector(REALSXP, ngrp));
for (i=0; i<ngrp; i++) {
k = ff[i]+grpsize[i]-2;
if (isunsorted) k = oo[k]-1;
k = (irowslen == -1) ? k : irows[k]-1;
REAL(ans)[i] = REAL(x)[k];
}
break;
case STRSXP:
ans = PROTECT(allocVector(STRSXP, ngrp));
for (i=0; i<ngrp; i++) {
k = ff[i]+grpsize[i]-2;
if (isunsorted) k = oo[k]-1;
k = (irowslen == -1) ? k : irows[k]-1;
SET_STRING_ELT(ans, i, STRING_ELT(x, k));
}
break;
case VECSXP:
ans = PROTECT(allocVector(VECSXP, ngrp));
for (i=0; i<ngrp; i++) {
k = ff[i]+grpsize[i]-2;
if (isunsorted) k = oo[k]-1;
k = (irowslen == -1) ? k : irows[k]-1;
SET_VECTOR_ELT(ans, i, VECTOR_ELT(x, k));
}
break;
default:
error("Type '%s' not supported by GForce tail (gtail). Either add the prefix utils::tail(.) or turn off GForce optimization using options(datatable.optimize=1)", type2char(TYPEOF(x)));
}
copyMostAttrib(x, ans);
UNPROTECT(1);
return(ans);
}

SEXP gfirst(SEXP x) {
if (!isVectorAtomic(x)) error("GForce head can only be applied to columns, not .SD or similar. To get head of all items in a list such as .SD, either add the prefix utils::head(.SD) or turn off GForce optimization using options(datatable.optimize=1).");

R_len_t i,k;
int n = (irowslen == -1) ? length(x) : irowslen;
SEXP ans;
if (grpn != n) error("grpn [%d] != length(x) [%d] in ghead", grpn, n);
switch(TYPEOF(x)) {
case LGLSXP:
ans = PROTECT(allocVector(LGLSXP, ngrp));
for (i=0; i<ngrp; i++) {
k = ff[i]-1;
if (isunsorted) k = oo[k]-1;
k = (irowslen == -1) ? k : irows[k]-1;
LOGICAL(ans)[i] = LOGICAL(x)[k];
}
break;
case INTSXP:
ans = PROTECT(allocVector(INTSXP, ngrp));
for (i=0; i<ngrp; i++) {
k = ff[i]-1;
if (isunsorted) k = oo[k]-1;
k = (irowslen == -1) ? k : irows[k]-1;
INTEGER(ans)[i] = INTEGER(x)[k];
}
break;
case REALSXP:
ans = PROTECT(allocVector(REALSXP, ngrp));
for (i=0; i<ngrp; i++) {
k = ff[i]-1;
if (isunsorted) k = oo[k]-1;
k = (irowslen == -1) ? k : irows[k]-1;
REAL(ans)[i] = REAL(x)[k];
}
break;
case STRSXP:
ans = PROTECT(allocVector(STRSXP, ngrp));
for (i=0; i<ngrp; i++) {
k = ff[i]-1;
if (isunsorted) k = oo[k]-1;
k = (irowslen == -1) ? k : irows[k]-1;
SET_STRING_ELT(ans, i, STRING_ELT(x, k));
}
break;
case VECSXP:
ans = PROTECT(allocVector(VECSXP, ngrp));
for (i=0; i<ngrp; i++) {
k = ff[i]-1;
if (isunsorted) k = oo[k]-1;
k = (irowslen == -1) ? k : irows[k]-1;
SET_VECTOR_ELT(ans, i, VECTOR_ELT(x, k));
}
break;
default:
error("Type '%s' not supported by GForce head (ghead). Either add the prefix utils::head(.) or turn off GForce optimization using options(datatable.optimize=1)", type2char(TYPEOF(x)));
}
copyMostAttrib(x, ans);
UNPROTECT(1);
return(ans);
}

SEXP gtail(SEXP x, SEXP n) {
if (!isInteger(n) || LENGTH(n)!=1 || INTEGER(n)[0]!=1) error("Internal error, gtail is only implemented for n=1. This should have been caught before. Please report to datatable-help.");
return (glast(x));
}

SEXP ghead(SEXP x, SEXP n) {
if (!isInteger(n) || LENGTH(n)!=1 || INTEGER(n)[0]!=1) error("Internal error, ghead is only implemented for n=1. This should have been caught before. Please report to datatable-help.");
return (gfirst(x));
}
9 changes: 8 additions & 1 deletion src/init.c
Original file line number Diff line number Diff line change
Expand Up @@ -61,6 +61,10 @@ SEXP isReallyReal();
SEXP setlevels();
SEXP rleid();
SEXP gmedian();
SEXP gtail();
SEXP ghead();
SEXP glast();
SEXP gfirst();

// .Externals
SEXP fastmean();
Expand Down Expand Up @@ -124,7 +128,10 @@ R_CallMethodDef callMethods[] = {
{"Csetlevels", (DL_FUNC) &setlevels, -1},
{"Crleid", (DL_FUNC) &rleid, -1},
{"Cgmedian", (DL_FUNC) &gmedian, -1},

{"Cgtail", (DL_FUNC) &gtail, -1},
{"Cghead", (DL_FUNC) &ghead, -1},
{"Cglast", (DL_FUNC) &glast, -1},
{"Cgfirst", (DL_FUNC) &gfirst, -1},
{NULL, NULL, 0}
};

Expand Down

0 comments on commit e615532

Please sign in to comment.