Skip to content

Commit

Permalink
added infnan_count to forder.c (#3426)
Browse files Browse the repository at this point in the history
  • Loading branch information
mattdowle authored Feb 23, 2019
1 parent be8c7f6 commit 840f43f
Show file tree
Hide file tree
Showing 3 changed files with 26 additions and 15 deletions.
3 changes: 3 additions & 0 deletions NEWS.md
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,8 @@

8. `dcast.data.table` handles sorting of rows with `NA` in result correctly. Closes [#2202](https://github.com/Rdatatable/data.table/issues/2202). Thanks to @Galileo-Galilei for the report.

9. Sorting and grouping a numeric column containing at most one finite value could return incorrect results, [#3381](https://github.com/Rdatatable/data.table/issues/3381); e.g., `data.table(A=c(-Inf,0,Inf), V=1:3)[,sum(V),by=A]` would treat the 3 rows as one group. This was a regression in 1.12.0 only.

#### NOTES

1. When upgrading to 1.12.0 some Windows users might have seen `CdllVersion not found` in some circumstances. We found a way to catch that so the [helpful message](https://twitter.com/MattDowle/status/1084528873549705217) now occurs for those upgrading from versions prior to 1.12.0 too, as well as those upgrading from 1.12.0 to a later version. See item 1 in notes section of 1.12.0 below for more background.
Expand All @@ -48,6 +50,7 @@

10. `foverlaps` provides better error messages when one interval is POSIXct and other isn't. Also, it warns when POSIXct interval columns are not all of same timezone. Closes [#1143](https://github.com/Rdatatable/data.table/issues/1143). Thanks to @DavidArenburg for the report.


### Changes in v1.12.0 (13 Jan 2019)

#### NEW FEATURES
Expand Down
10 changes: 10 additions & 0 deletions inst/tests/tests.Rraw
Original file line number Diff line number Diff line change
Expand Up @@ -13555,6 +13555,16 @@ yp <- data.table(start = tt, end = tt, key=c("start", "end"))
test(1993.1, foverlaps(xp, yp, nomatch = 0L, which=TRUE), data.table(xid=1L, yid=2L), warning="POSIXct interval cols have mixed timezones")
test(1993.2, foverlaps(xp, yp, by.x=c("day", "year")), error="Some interval cols are of type POSIXct while others are not")

# forderv NaN,Inf and Inf when at most 1 finite value is present, #3381. These broke in v1.12.0. They pass in v1.11.8.
test(1994.1, forderv(c(NaN, Inf, -Inf), retGrp=TRUE), structure(INT(1,3,2), starts=1:3, maxgrpn=1L))
test(1994.2, forderv(c(-Inf, 0, Inf), retGrp=TRUE), structure(integer(), starts=1:3, maxgrpn=1L))
test(1994.3, forderv(c(-Inf, Inf), retGrp=TRUE), structure(integer(), starts=1:2, maxgrpn=1L))
test(1994.4, forderv(c(Inf, -Inf), retGrp=TRUE), structure(2:1, starts=1:2, maxgrpn=1L))
test(1994.5, forderv(c(0, NaN), retGrp=TRUE), structure(2:1, starts=1:2, maxgrpn=1L))
test(1994.6, forderv(c(NaN, 0), retGrp=TRUE), structure(integer(), starts=1:2, maxgrpn=1L))
test(1994.7, data.table(A=c(-Inf,21,Inf),V=1:3)[,sum(V),by=A]$V1, 1:3)


###################################
# Add new tests above this line #
###################################
Expand Down
28 changes: 13 additions & 15 deletions src/forder.c
Original file line number Diff line number Diff line change
Expand Up @@ -186,23 +186,22 @@ static void range_i64(int64_t *x, int n, uint64_t *out_min, uint64_t *out_max, i
*out_max = max ^ 0x8000000000000000u;
}

static void range_d(double *x, int n, uint64_t *out_min, uint64_t *out_max, int *out_na_count)
// return range of finite numbers (excluding NA, NaN, -Inf, +Inf) and a count of
// strictly NA (not NaN) so caller knows if it's all NA or any Inf|-Inf|NaN are present
static void range_d(double *x, int n, uint64_t *out_min, uint64_t *out_max, int *out_na_count, int *out_infnan_count)
// return range of finite numbers (excluding NA, NaN, -Inf, +Inf), a count of NA and a count of Inf|-Inf|NaN
{
uint64_t min = 0;
uint64_t max = 0;
int na_count = 0;
uint64_t min=0, max=0;
int na_count=0, infnan_count=0;
int i=0;
while(i<n && !R_FINITE(x[i])) { na_count+=ISNA(x[i]); i++; }
while(i<n && !R_FINITE(x[i])) { ISNA(x[i++]) ? na_count++ : infnan_count++; }
if (i<n) { max = min = dtwiddle(x, i++);}
for(; i<n; i++) {
if (!R_FINITE(x[i])) { na_count+=ISNA(x[i]); continue; }
if (!R_FINITE(x[i])) { ISNA(x[i]) ? na_count++ : infnan_count++; continue; }
uint64_t tmp = dtwiddle(x, i);
if (tmp>max) max=tmp;
else if (tmp<min) min=tmp;
}
*out_na_count = na_count;
*out_infnan_count = infnan_count;
*out_min = min;
*out_max = max;
}
Expand Down Expand Up @@ -483,8 +482,8 @@ SEXP forder(SEXP DT, SEXP by, SEXP retGrpArg, SEXP sortGroupsArg, SEXP ascArg, S
for (int col=0; col<ncol; col++) {
// Rprintf("Finding range of column %d ...\n", col);
SEXP x = VECTOR_ELT(DT,INTEGER(by)[col]-1);
uint64_t min, max; // min and max of non-NA values
int na_count=0;
uint64_t min=0, max=0; // min and max of non-NA finite values
int na_count=0, infnan_count=0;
if (sortType) sortType=INTEGER(ascArg)[col]; // if sortType!=0 (not first-appearance) then +1/-1 comes from ascArg.
//Rprintf("sortType = %d\n", sortType);
switch(TYPEOF(x)) {
Expand All @@ -495,7 +494,7 @@ SEXP forder(SEXP DT, SEXP by, SEXP retGrpArg, SEXP sortGroupsArg, SEXP ascArg, S
if (inherits(x, "integer64")) {
range_i64((int64_t *)REAL(x), nrow, &min, &max, &na_count);
} else {
range_d(REAL(x), nrow, &min, &max, &na_count);
range_d(REAL(x), nrow, &min, &max, &na_count, &infnan_count);
if (min==0 && na_count<nrow) { min=3; max=4; } // column contains no finite numbers and is not-all NA; create dummies to yield positive min-2 later
isReal = true;
}
Expand All @@ -508,10 +507,9 @@ SEXP forder(SEXP DT, SEXP by, SEXP retGrpArg, SEXP sortGroupsArg, SEXP ascArg, S
Error("Column %d of by= (%d) is type '%s', not yet supported", col+1, INTEGER(by)[col], type2char(TYPEOF(x)));
}
TEND(3);
if ((min==0 && na_count==nrow) || (min>0 && min==max && na_count==0)) {
// all same value; skip column as nothing to do
// min==0 implies na_count==n anyway for all types other than real when Inf,-Inf or NaN are present (excluded from [min,max] as well as NA)
if (min==0 && nalast==-1) { for (int i=0; i<nrow; i++) anso[i]=0; }
if (na_count==nrow || (min>0 && min==max && na_count==0 && infnan_count==0)) {
// all same value; skip column as nothing to do; [min,max] is just of finite values (excludes +Inf,-Inf,NaN and NA)
if (na_count==nrow && nalast==-1) { for (int i=0; i<nrow; i++) anso[i]=0; }
if (TYPEOF(x)==STRSXP) free_ustr();
continue;
}
Expand Down

0 comments on commit 840f43f

Please sign in to comment.