added infnan_count to forder.c (#3426)

Rdatatable · Feb 23, 2019 · 840f43f · 840f43f
1 parent be8c7f6
commit 840f43f
Show file tree

Hide file tree

Showing 3 changed files with 26 additions and 15 deletions.
diff --git a/NEWS.md b/NEWS.md
@@ -26,6 +26,8 @@
 
 8. `dcast.data.table` handles sorting of rows with `NA` in result correctly. Closes [#2202](https://github.com/Rdatatable/data.table/issues/2202). Thanks to @Galileo-Galilei for the report.
 
+9. Sorting and grouping a numeric column containing at most one finite value could return incorrect results, [#3381](https://github.com/Rdatatable/data.table/issues/3381); e.g., `data.table(A=c(-Inf,0,Inf), V=1:3)[,sum(V),by=A]` would treat the 3 rows as one group. This was a regression in 1.12.0 only. 
+
 #### NOTES
 
 1. When upgrading to 1.12.0 some Windows users might have seen `CdllVersion not found` in some circumstances. We found a way to catch that so the [helpful message](https://twitter.com/MattDowle/status/1084528873549705217) now occurs for those upgrading from versions prior to 1.12.0 too, as well as those upgrading from 1.12.0 to a later version. See item 1 in notes section of 1.12.0 below for more background.
@@ -48,6 +50,7 @@
 
 10. `foverlaps` provides better error messages when one interval is POSIXct and other isn't. Also, it warns when POSIXct interval columns are not all of same timezone. Closes [#1143](https://github.com/Rdatatable/data.table/issues/1143). Thanks to @DavidArenburg for the report.
 
+
 ### Changes in v1.12.0  (13 Jan 2019)
 
 #### NEW FEATURES

diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw
@@ -13555,6 +13555,16 @@ yp <- data.table(start = tt, end = tt, key=c("start", "end"))
 test(1993.1, foverlaps(xp, yp, nomatch = 0L, which=TRUE), data.table(xid=1L, yid=2L), warning="POSIXct interval cols have mixed timezones")
 test(1993.2, foverlaps(xp, yp, by.x=c("day", "year")), error="Some interval cols are of type POSIXct while others are not")
 
+# forderv NaN,Inf and Inf when at most 1 finite value is present, #3381. These broke in v1.12.0. They pass in v1.11.8.
+test(1994.1, forderv(c(NaN, Inf, -Inf), retGrp=TRUE), structure(INT(1,3,2), starts=1:3, maxgrpn=1L))
+test(1994.2, forderv(c(-Inf, 0, Inf), retGrp=TRUE),   structure(integer(), starts=1:3, maxgrpn=1L))
+test(1994.3, forderv(c(-Inf, Inf), retGrp=TRUE),      structure(integer(), starts=1:2, maxgrpn=1L))
+test(1994.4, forderv(c(Inf, -Inf), retGrp=TRUE),      structure(2:1, starts=1:2, maxgrpn=1L))
+test(1994.5, forderv(c(0, NaN), retGrp=TRUE),         structure(2:1, starts=1:2, maxgrpn=1L))
+test(1994.6, forderv(c(NaN, 0), retGrp=TRUE),         structure(integer(), starts=1:2, maxgrpn=1L))
+test(1994.7, data.table(A=c(-Inf,21,Inf),V=1:3)[,sum(V),by=A]$V1, 1:3)
+
+
 ###################################
 #  Add new tests above this line  #
 ###################################

diff --git a/src/forder.c b/src/forder.c
@@ -186,23 +186,22 @@ static void range_i64(int64_t *x, int n, uint64_t *out_min, uint64_t *out_max, i
   *out_max = max ^ 0x8000000000000000u;
 }
 
-static void range_d(double *x, int n, uint64_t *out_min, uint64_t *out_max, int *out_na_count)
-// return range of finite numbers (excluding NA, NaN, -Inf, +Inf) and a count of
-// strictly NA (not NaN) so caller knows if it's all NA or any Inf|-Inf|NaN are present
+static void range_d(double *x, int n, uint64_t *out_min, uint64_t *out_max, int *out_na_count, int *out_infnan_count)
+// return range of finite numbers (excluding NA, NaN, -Inf, +Inf), a count of NA and a count of Inf|-Inf|NaN
 {
-  uint64_t min = 0;
-  uint64_t max = 0;
-  int na_count = 0;
+  uint64_t min=0, max=0;
+  int na_count=0, infnan_count=0;
   int i=0;
-  while(i<n && !R_FINITE(x[i])) { na_count+=ISNA(x[i]); i++; }
+  while(i<n && !R_FINITE(x[i])) { ISNA(x[i++]) ? na_count++ : infnan_count++; }
   if (i<n) { max = min = dtwiddle(x, i++);}
   for(; i<n; i++) {
-    if (!R_FINITE(x[i])) { na_count+=ISNA(x[i]); continue; }
+    if (!R_FINITE(x[i])) { ISNA(x[i]) ? na_count++ : infnan_count++; continue; }
     uint64_t tmp = dtwiddle(x, i);
     if (tmp>max) max=tmp;
     else if (tmp<min) min=tmp;
   }
   *out_na_count = na_count;
+  *out_infnan_count = infnan_count;
   *out_min = min;
   *out_max = max;
 }
@@ -483,8 +482,8 @@ SEXP forder(SEXP DT, SEXP by, SEXP retGrpArg, SEXP sortGroupsArg, SEXP ascArg, S
   for (int col=0; col<ncol; col++) {
     // Rprintf("Finding range of column %d ...\n", col);
     SEXP x = VECTOR_ELT(DT,INTEGER(by)[col]-1);
-    uint64_t min, max;     // min and max of non-NA values
-    int na_count=0;
+    uint64_t min=0, max=0;     // min and max of non-NA finite values
+    int na_count=0, infnan_count=0;
     if (sortType) sortType=INTEGER(ascArg)[col];  // if sortType!=0 (not first-appearance) then +1/-1 comes from ascArg.
     //Rprintf("sortType = %d\n", sortType);
     switch(TYPEOF(x)) {
@@ -495,7 +494,7 @@ SEXP forder(SEXP DT, SEXP by, SEXP retGrpArg, SEXP sortGroupsArg, SEXP ascArg, S
       if (inherits(x, "integer64")) {
         range_i64((int64_t *)REAL(x), nrow, &min, &max, &na_count);
       } else {
-        range_d(REAL(x), nrow, &min, &max, &na_count);
+        range_d(REAL(x), nrow, &min, &max, &na_count, &infnan_count);
         if (min==0 && na_count<nrow) { min=3; max=4; } // column contains no finite numbers and is not-all NA; create dummies to yield positive min-2 later
         isReal = true;
       }
@@ -508,10 +507,9 @@ SEXP forder(SEXP DT, SEXP by, SEXP retGrpArg, SEXP sortGroupsArg, SEXP ascArg, S
        Error("Column %d of by= (%d) is type '%s', not yet supported", col+1, INTEGER(by)[col], type2char(TYPEOF(x)));
     }
     TEND(3);
-    if ((min==0 && na_count==nrow) || (min>0 && min==max && na_count==0)) {
-      // all same value; skip column as nothing to do
-      // min==0 implies na_count==n anyway for all types other than real when Inf,-Inf or NaN are present (excluded from [min,max] as well as NA)
-      if (min==0 && nalast==-1) { for (int i=0; i<nrow; i++) anso[i]=0; }
+    if (na_count==nrow || (min>0 && min==max && na_count==0 && infnan_count==0)) {
+      // all same value; skip column as nothing to do;  [min,max] is just of finite values (excludes +Inf,-Inf,NaN and NA)
+      if (na_count==nrow && nalast==-1) { for (int i=0; i<nrow; i++) anso[i]=0; }
       if (TYPEOF(x)==STRSXP) free_ustr();
       continue;
     }