Skip to content

Commit

Permalink
Gforce first last, head tail, coverage (#3463)
Browse files Browse the repository at this point in the history
  • Loading branch information
jangorecki authored and mattdowle committed Apr 1, 2019
1 parent 025a911 commit 58c3b27
Show file tree
Hide file tree
Showing 4 changed files with 116 additions and 36 deletions.
2 changes: 2 additions & 0 deletions NEWS.md
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,8 @@

#### BUG FIXES

1. `first`, `last`, `head` and `tail` by group no longer error in some cases, [#2030](https://github.com/Rdatatable/data.table/issues/2030) [#3462](https://github.com/Rdatatable/data.table/issues/3462). Thanks to @franknarf1 for reporting.

#### NOTES

1. `rbindlist`'s `use.names="check"` now emits its message for automatic column names (`"V[0-9]+"`) too, [#3484](https://github.com/Rdatatable/data.table/pull/3484). See news item 5 of v1.12.2 below.
Expand Down
11 changes: 8 additions & 3 deletions R/data.table.R
Original file line number Diff line number Diff line change
Expand Up @@ -1595,7 +1595,12 @@ replace_dot_alias <- function(e) {
jvnames = ansvarsnew
}
} else if (length(as.character(jsub[[1L]])) == 1L) { # Else expect problems with <jsub[[1L]] == >
if (length(jsub) == 3L && (jsub[[1L]] == "[" || jsub[[1L]] == "head" || jsub[[1L]] == "tail") && jsub[[2L]] == ".SD" && (is.numeric(jsub[[3L]]) || jsub[[3L]] == ".N") ) {
subopt = length(jsub) == 3L && jsub[[1L]] == "[" && (is.numeric(jsub[[3L]]) || jsub[[3L]] == ".N")
headopt = jsub[[1L]] == "head" || jsub[[1L]] == "tail"
firstopt = jsub[[1L]] == "first" || jsub[[1L]] == "last" # fix for #2030
if ((length(jsub) >= 2L && jsub[[2L]] == ".SD") &&
(subopt || headopt || firstopt)) {
if (headopt && length(jsub)==2L) jsub[["n"]] = 6L # head-tail n=6 when missing #3462
# optimise .SD[1] or .SD[2L]. Not sure how to test .SD[a] as to whether a is numeric/integer or a data.table, yet.
jsub = as.call(c(quote(list), lapply(ansvarsnew, function(x) { jsub[[2L]] = as.name(x); jsub })))
jvnames = ansvarsnew
Expand Down Expand Up @@ -1694,7 +1699,7 @@ replace_dot_alias <- function(e) {
else
cat("lapply optimization is on, j unchanged as '",deparse(jsub,width.cutoff=200L),"'\n",sep="")
}
dotN <- function(x) if (is.name(x) && x == ".N") TRUE else FALSE # For #5760
dotN <- function(x) is.name(x) && x == ".N" # For #5760
# FR #971, GForce kicks in on all subsets, no joins yet. Although joins could work with
# nomatch=0L even now.. but not switching it on yet, will deal it separately.
if (getOption("datatable.optimize")>=2 && !is.data.table(i) && !byjoin && length(f__) && !length(lhs)) {
Expand All @@ -1712,7 +1717,7 @@ replace_dot_alias <- function(e) {
# Need is.symbol() check. See #1369, #1974 or #2949 issues and explanation below by searching for one of these issues.
cond = is.call(q) && is.symbol(q[[1]]) && (q1c <- as.character(q[[1]])) %chin% gfuns && !is.call(q[[2L]])
# run GForce for simple f(x) calls and f(x, na.rm = TRUE)-like calls
ans = cond && (length(q)==2L || identical("na",substring(names(q)[3L], 1L, 2L)))
ans = cond && (length(q)==2L || identical("na",substring(names(q)[3L], 1L, 2L))) && (!q1c %chin% c("head","tail")) # head-tail uses default value n=6 which as of now should not go gforce
if (identical(ans, TRUE)) return(ans)
# otherwise there must be three arguments, and only in two cases --
# 1) head/tail(x, 1) or 2) x[n], n>0
Expand Down
70 changes: 58 additions & 12 deletions inst/tests/tests.Rraw
Original file line number Diff line number Diff line change
Expand Up @@ -13901,30 +13901,76 @@ test(2013.3, DT[2], error="Column 2 ['b'] is length 4 but column 1 is length 3;
## new fread keepLeadingZeros parameter in v1.12.2
# leading zeros in both integer and float numbers are converted to character when keepLeadingZeros=TRUE
test_data_single <- "0, 00, 01, 00010, 002.01\n"
test(1978.1, fread(test_data_single), data.table(0L, 0L, 1L, 10L, 2.01))
test(1978.2, fread(test_data_single, keepLeadingZeros = FALSE), data.table(0L, 0L, 1L, 10L, 2.01))
test(1978.3, fread(test_data_single, keepLeadingZeros = TRUE), data.table(0L, "00","01","00010","002.01"))
test(2014.1, fread(test_data_single), data.table(0L, 0L, 1L, 10L, 2.01))
test(2014.2, fread(test_data_single, keepLeadingZeros = FALSE), data.table(0L, 0L, 1L, 10L, 2.01))
test(2014.3, fread(test_data_single, keepLeadingZeros = TRUE), data.table(0L, "00","01","00010","002.01"))
# converts whole column to character when keepLeadingZeros = TRUE and at least 1 value contains a leading zero
test_data_mult <- paste0(c(sample(1:100),"0010",sample(1:100)), collapse="\n")
test(1978.4, class(fread(test_data_mult, keepLeadingZeros = TRUE)[[1]]), "character")
test(1978.5, class(fread(test_data_mult, keepLeadingZeros = FALSE)[[1]]), "integer")
test(2014.4, class(fread(test_data_mult, keepLeadingZeros = TRUE)[[1]]), "character")
test(2014.5, class(fread(test_data_mult, keepLeadingZeros = FALSE)[[1]]), "integer")

# rbindlist should drop NA from levels of source factors, relied on by package emil
test(1979, levels(rbindlist( list( data.frame(a=factor("a",levels=c("a",NA),exclude=NULL)) ))$a), "a") # the NA level should not be retained
test(2015, levels(rbindlist( list( data.frame(a=factor("a",levels=c("a",NA),exclude=NULL)) ))$a), "a") # the NA level should not be retained

# better save->load->set(<new column>) message, #2996
DT = data.table(a=1:3)
save(list="DT", file=tt<-tempfile())
rm(DT)
name = load(tt)
test(1980.1, name, "DT")
test(1980.2, DT, data.table(a=1:3))
test(1980.3, DT[2,a:=4L], data.table(a=INT(1,4,3))) # no error for := when existing column
test(1980.4, set(DT,3L,1L,5L), data.table(a=INT(1,4,5))) # no error for set() when existing column
test(1980.5, set(DT,2L,"newCol",5L), error="either been loaded from disk.*or constructed manually.*Please run setDT.*alloc.col.*on it first") # just set()
test(1980.6, DT[2,newCol:=6L], data.table(a=INT(1,4,5), newCol=INT(NA,6L,NA))) # := ok (it changes DT in caller)
test(2016.1, name, "DT")
test(2016.2, DT, data.table(a=1:3))
test(2016.3, DT[2,a:=4L], data.table(a=INT(1,4,3))) # no error for := when existing column
test(2016.4, set(DT,3L,1L,5L), data.table(a=INT(1,4,5))) # no error for set() when existing column
test(2016.5, set(DT,2L,"newCol",5L), error="either been loaded from disk.*or constructed manually.*Please run setDT.*alloc.col.*on it first") # just set()
test(2016.6, DT[2,newCol:=6L], data.table(a=INT(1,4,5), newCol=INT(NA,6L,NA))) # := ok (it changes DT in caller)
unlink(tt)

# gfirst(.SD) throws an error about not using head(.SD, n), but the latter works #2030
DT = data.table(id = c(1L,1L,2L), v = 1:3)
test(2017.1, DT[, first(.SD), by=id, .SDcols="v", verbose=TRUE], data.table(id=1:2, v=c(1L,3L)), output="optimized j to 'list(gfirst(v))'")
test(2017.2, DT[, first(v), by=id, verbose=TRUE], data.table(id=1:2, V1=c(1L,3L)), output="optimized j to 'gfirst(v)'")
test(2017.3, DT[, last(v), by=id, verbose=TRUE], data.table(id=1:2, V1=c(2L,3L)), output="optimized j to 'glast(v)'")
test(2017.4, DT[, v[1L], by=id, verbose=TRUE], data.table(id=1:2, V1=c(1L,3L)), output="optimized j to '`g[`(v, 1L)'")
DT = data.table(id = c(1L,1L,2L), v = 1:3, y = 3:1, z = c(TRUE, TRUE, FALSE), u = c("a","b","c"))
test(2017.5, DT[, first(.SD), by=id, .SDcols=c("v","y","z","u"), verbose=TRUE],
data.table(id=1:2, v=c(1L,3L), y=c(3L,1L), z=c(TRUE,FALSE), u=c("a","c")),
output="optimized j to 'list(gfirst(v), gfirst(y), gfirst(z), gfirst(u))'")
test(2017.6, DT[, last(.SD), by=id, .SDcols=c("v","y","z","u"), verbose=TRUE],
data.table(id=1:2, v=c(2L,3L), y=c(2L,1L), z=c(TRUE,FALSE), u=c("b","c")),
output="optimized j to 'list(glast(v), glast(y), glast(z), glast(u))'")
test(2017.7, DT[, .SD[1L], by=id, .SDcols=c("v","y","z","u"), verbose=TRUE],
data.table(id=1:2, v=c(1L,3L), y=c(3L,1L), z=c(TRUE,FALSE), u=c("a","c")),
output="optimized j to 'list(`g[`(v, 1L), `g[`(y, 1L), `g[`(z, 1L), `g[`(u, 1L))'")
# ghead argument "n" is missing, with no default #3462
DT = data.table(a=c(rep(1L, 7L), rep(2L, 5L)), b=1:12, d=12:1)
test(2018.1, DT[, head(.SD), a, verbose=TRUE],
data.table(a=c(rep(1L, 6L), rep(2L, 5L)), b=c(1:6, 8:12), d=c(12:7, 5:1)),
output=c("lapply optimization changed j from 'head(.SD)' to 'list(head(b, n = 6L), head(d, n = 6L))'",
"GForce is on, left j unchanged"))
test(2018.2, DT[, head(b), a, verbose=TRUE],
data.table(a=c(rep(1L, 6L), rep(2L, 5L)), V1=c(1:6, 8:12)),
output=c("lapply optimization is on, j unchanged as 'head(b)'",
"GForce is on, left j unchanged"))
test(2018.3, DT[, tail(.SD), a], data.table(a=c(rep(1L, 6L), rep(2L, 5L)), b=c(2:7, 8:12), d=c(11:6, 5:1)))
test(2018.4, DT[, tail(b), a], data.table(a=c(rep(1L, 6L), rep(2L, 5L)), V1=c(2:7, 8:12)))
# gforce tests coverage
if (test_bit64) {
DT = data.table(id=c(rep(1L,3), rep(2L, 3)), v=bit64::as.integer64(c(1:3, 4L, 5:6)))
test(2019, DT[2:6, sum(v), id], data.table(id=1:2, V1=bit64::as.integer64(c(5L,15L)))) # gather, case of int64 and irows
}
DT = data.table(id = c(1L,1L,2L), v = c(1i, 2i, 3i))
test(2020.01, DT[, min(v), by=id], error="'complex' not supported by GForce min")
test(2020.02, DT[, max(v), by=id], error="'complex' not supported by GForce max")
test(2020.03, DT[, median(v), by=id], error="'complex' not supported by GForce median")
test(2020.04, DT[, head(v, 1), by=id], error="'complex' not supported by GForce head")
test(2020.05, DT[, tail(v, 1), by=id], error="'complex' not supported by GForce tail")
test(2020.06, DT[, v[1], by=id], error="'complex' not supported by GForce subset")
test(2020.07, DT[, sd(v), by=id], error="'complex' not supported by GForce sd")
test(2020.08, DT[, var(v), by=id], error="'complex' not supported by GForce var")
test(2020.09, DT[, prod(v), by=id], error="'complex' not supported by GForce prod")
DT = data.table(id = c(1L,1L,2L,2L), v = c(1L, 2L, NA, NA))
test(2020.10, DT[, median(v), id], data.table(id=1:2, V1=c(1.5, NA))) # median whole group has NAs


###################################
# Add new tests above this line #
Expand Down
69 changes: 48 additions & 21 deletions src/gsumm.c
Original file line number Diff line number Diff line change
Expand Up @@ -495,8 +495,8 @@ SEXP gmean(SEXP x, SEXP narm)
}
break;
default:
free(s); free(c);
error("Type '%s' not supported by GForce mean (gmean) na.rm=TRUE. Either add the prefix base::mean(.) or turn off GForce optimization using options(datatable.optimize=1)", type2char(TYPEOF(x)));
free(s); free(c); // # nocov because it already stops at gsum, remove nocov if gmean will support a type that gsum wont
error("Type '%s' not supported by GForce mean (gmean) na.rm=TRUE. Either add the prefix base::mean(.) or turn off GForce optimization using options(datatable.optimize=1)", type2char(TYPEOF(x))); // # nocov
}
ans = PROTECT(allocVector(REALSXP, ngrp));
for (int i=0; i<ngrp; i++) {
Expand Down Expand Up @@ -955,32 +955,41 @@ SEXP glast(SEXP x) {
SEXP ans;
if (nrow != n) error("nrow [%d] != length(x) [%d] in gtail", nrow, n);
switch(TYPEOF(x)) {
case LGLSXP:
case LGLSXP: {
const int *ix = LOGICAL(x);
ans = PROTECT(allocVector(LGLSXP, ngrp));
int *ians = LOGICAL(ans);
for (i=0; i<ngrp; i++) {
k = ff[i]+grpsize[i]-2;
if (isunsorted) k = oo[k]-1;
k = (irowslen == -1) ? k : irows[k]-1;
LOGICAL(ans)[i] = LOGICAL(x)[k];
ians[i] = ix[k];
}
}
break;
case INTSXP:
case INTSXP: {
const int *ix = INTEGER(x);
ans = PROTECT(allocVector(INTSXP, ngrp));
int *ians = INTEGER(ans);
for (i=0; i<ngrp; i++) {
k = ff[i]+grpsize[i]-2;
if (isunsorted) k = oo[k]-1;
k = (irowslen == -1) ? k : irows[k]-1;
INTEGER(ans)[i] = INTEGER(x)[k];
ians[i] = ix[k];
}
}
break;
case REALSXP:
case REALSXP: {
const double *dx = REAL(x);
ans = PROTECT(allocVector(REALSXP, ngrp));
double *dans = REAL(ans);
for (i=0; i<ngrp; i++) {
k = ff[i]+grpsize[i]-2;
if (isunsorted) k = oo[k]-1;
k = (irowslen == -1) ? k : irows[k]-1;
REAL(ans)[i] = REAL(x)[k];
dans[i] = dx[k];
}
}
break;
case STRSXP:
ans = PROTECT(allocVector(STRSXP, ngrp));
Expand Down Expand Up @@ -1017,32 +1026,41 @@ SEXP gfirst(SEXP x) {
SEXP ans;
if (nrow != n) error("nrow [%d] != length(x) [%d] in ghead", nrow, n);
switch(TYPEOF(x)) {
case LGLSXP:
case LGLSXP: {
int const *ix = LOGICAL(x);
ans = PROTECT(allocVector(LGLSXP, ngrp));
int *ians = LOGICAL(ans);
for (i=0; i<ngrp; i++) {
k = ff[i]-1;
if (isunsorted) k = oo[k]-1;
k = (irowslen == -1) ? k : irows[k]-1;
LOGICAL(ans)[i] = LOGICAL(x)[k];
ians[i] = ix[k];
}
}
break;
case INTSXP:
case INTSXP: {
const int *ix = INTEGER(x);
ans = PROTECT(allocVector(INTSXP, ngrp));
int *ians = INTEGER(ans);
for (i=0; i<ngrp; i++) {
k = ff[i]-1;
if (isunsorted) k = oo[k]-1;
k = (irowslen == -1) ? k : irows[k]-1;
INTEGER(ans)[i] = INTEGER(x)[k];
ians[i] = ix[k];
}
}
break;
case REALSXP:
case REALSXP: {
const double *dx = REAL(x);
ans = PROTECT(allocVector(REALSXP, ngrp));
double *dans = REAL(ans);
for (i=0; i<ngrp; i++) {
k = ff[i]-1;
if (isunsorted) k = oo[k]-1;
k = (irowslen == -1) ? k : irows[k]-1;
REAL(ans)[i] = REAL(x)[k];
dans[i] = dx[k];
}
}
break;
case STRSXP:
ans = PROTECT(allocVector(STRSXP, ngrp));
Expand Down Expand Up @@ -1088,35 +1106,44 @@ SEXP gnthvalue(SEXP x, SEXP valArg) {
SEXP ans;
if (nrow != n) error("nrow [%d] != length(x) [%d] in ghead", nrow, n);
switch(TYPEOF(x)) {
case LGLSXP:
case LGLSXP: {
const int *ix = LOGICAL(x);
ans = PROTECT(allocVector(LGLSXP, ngrp));
int *ians = LOGICAL(ans);
for (i=0; i<ngrp; i++) {
if (val > grpsize[i]) { LOGICAL(ans)[i] = NA_LOGICAL; continue; }
k = ff[i]+val-2;
if (isunsorted) k = oo[k]-1;
k = (irowslen == -1) ? k : irows[k]-1;
LOGICAL(ans)[i] = LOGICAL(x)[k];
ians[i] = ix[k];
}
}
break;
case INTSXP:
case INTSXP: {
const int *ix = LOGICAL(x);
ans = PROTECT(allocVector(INTSXP, ngrp));
int *ians = LOGICAL(ans);
for (i=0; i<ngrp; i++) {
if (val > grpsize[i]) { INTEGER(ans)[i] = NA_INTEGER; continue; }
k = ff[i]+val-2;
if (isunsorted) k = oo[k]-1;
k = (irowslen == -1) ? k : irows[k]-1;
INTEGER(ans)[i] = INTEGER(x)[k];
ians[i] = ix[k];
}
}
break;
case REALSXP:
case REALSXP: {
const double *dx = REAL(x);
ans = PROTECT(allocVector(REALSXP, ngrp));
double *dans = REAL(ans);
for (i=0; i<ngrp; i++) {
if (val > grpsize[i]) { REAL(ans)[i] = NA_REAL; continue; }
k = ff[i]+val-2;
if (isunsorted) k = oo[k]-1;
k = (irowslen == -1) ? k : irows[k]-1;
REAL(ans)[i] = REAL(x)[k];
dans[i] = dx[k];
}
}
break;
case STRSXP:
ans = PROTECT(allocVector(STRSXP, ngrp));
Expand Down Expand Up @@ -1270,7 +1297,7 @@ SEXP gvarsd1(SEXP x, SEXP narm, Rboolean isSD)
SETLENGTH(sub, maxgrpn);
break;
default:
if (isSD) {
if (!isSD) {
error("Type '%s' not supported by GForce var (gvar). Either add the prefix stats::var(.) or turn off GForce optimization using options(datatable.optimize=1)", type2char(TYPEOF(x)));
} else {
error("Type '%s' not supported by GForce sd (gsd). Either add the prefix stats::sd(.) or turn off GForce optimization using options(datatable.optimize=1)", type2char(TYPEOF(x)));
Expand Down

0 comments on commit 58c3b27

Please sign in to comment.