nafill gains nan argument (#4025)

Rdatatable · Dec 18, 2019 · 51805cd · 51805cd
1 parent 8ce5f09
commit 51805cd
Show file tree

Hide file tree

Showing 7 changed files with 82 additions and 35 deletions.
diff --git a/NEWS.md b/NEWS.md
@@ -14,6 +14,9 @@
 
 4. `setnames(DT, new=new_names)` (i.e. explicitly named `new=` argument) now works as expected rather than an error message requesting that `old=` be supplied too, [#4041](https://github.com/Rdatatable/data.table/issues/4041). Thanks @Kodiologist for the suggestion.
 
+5. `nafill` and `setnafill` gain `nan` argument to say whether `NaN` should be considered the same as `NA` for filling purposes, [#4020](https://github.com/Rdatatable/data.table/issues/4020). Prior versions had an implicit value of `nan=NaN`; the default is now `nan=NA`, i.e., `NaN` is treated as if it's missing. Thanks @AnonymousBoba for the suggestion. Also, while `nafill` still respects `getOption('datatable.verbose')`, the `verbose` argument has been removed.
+
+
 ## BUG FIXES
 
 1. A NULL timezone on POSIXct was interpreted by `as.IDate` and `as.ITime` as UTC rather than the session's default timezone (`tz=""`) , [#4085](https://github.com/Rdatatable/data.table/issues/4085).
@@ -35,6 +38,7 @@
 
 1. `DT[, {...; .(A,B)}]` (i.e. when `.()` is the final item of a multi-statement `{...}`) now auto-names the columns `A` and `B` (just like `DT[, .(A,B)]`) rather than `V1` and `V2`, [#2478](https://github.com/Rdatatable/data.table/issues/2478) [#609](https://github.com/Rdatatable/data.table/issues/609). Similarly, `DT[, if (.N>1) .(B), by=A]` now auto-names the column `B` rather than `V1`. Explicit names are unaffected; e.g. `DT[, {... y= ...; .(A=C+y)}, by=...]` named the column `A` before, and still does. Thanks also to @renkun-ken for his go-first strong testing which caught an issue not caught by the test suite or by revdep testing, related to NULL being the last item, [#4061](https://github.com/Rdatatable/data.table/issues/4061).
 
+
 ## BUG FIXES
 
 1. `frollapply` could segfault and exceed R's C protect limits, [#3993](https://github.com/Rdatatable/data.table/issues/3993). Thanks to @DavisVaughan for reporting and fixing.

diff --git a/R/shift.R b/R/shift.R
@@ -24,16 +24,16 @@ shift = function(x, n=1L, fill=NA, type=c("lag", "lead", "shift"), give.names=FA
   ans
 }
 
-nafill = function(x, type=c("const","locf","nocb"), fill=NA, verbose=getOption("datatable.verbose")) {
+nafill = function(x, type=c("const","locf","nocb"), fill=NA, nan=NA) {
   type = match.arg(type)
   if (type!="const" && !missing(fill))
     warning("argument 'fill' ignored, only make sense for type='const'")
-  .Call(CnafillR, x, type, fill, FALSE, NULL, verbose)
+  .Call(CnafillR, x, type, fill, nan_is_na(nan), FALSE, NULL)
 }
 
-setnafill = function(x, type=c("const","locf","nocb"), fill=NA, cols=seq_along(x), verbose=getOption("datatable.verbose")) {
+setnafill = function(x, type=c("const","locf","nocb"), fill=NA, nan=NA, cols=seq_along(x)) {
   type = match.arg(type)
   if (type!="const" && !missing(fill))
     warning("argument 'fill' ignored, only make sense for type='const'")
-  invisible(.Call(CnafillR, x, type, fill, TRUE, cols, verbose))
+  invisible(.Call(CnafillR, x, type, fill, nan_is_na(nan), TRUE, cols))
 }
diff --git a/R/utils.R b/R/utils.R
@@ -13,6 +13,13 @@ if (base::getRversion() < "3.5.0") {
 isTRUEorNA    = function(x) is.logical(x) && length(x)==1L && (is.na(x) || x)
 isTRUEorFALSE = function(x) is.logical(x) && length(x)==1L && !is.na(x)
 allNA = function(x) .Call(C_allNAR, x)
+# helper for nan argument (e.g. nafill): TRUE -> treat NaN as NA
+nan_is_na = function(x) {
+  if (length(x) != 1L) stop("Argument 'nan' must be length 1")
+  if (identical(x, NA) || identical(x, NA_real_)) return(TRUE)
+  if (identical(x, NaN)) return(FALSE)
+  stop("Argument 'nan' must be NA or NaN")
+}
 
 if (base::getRversion() < "3.2.0") {  # Apr 2015
   isNamespaceLoaded = function(x) x %chin% loadedNamespaces()

diff --git a/inst/tests/nafill.Rraw b/inst/tests/nafill.Rraw
@@ -44,8 +44,8 @@ z = y
 z[5L] = NaN
 z[2L] = Inf
 z[9L] = -Inf
-test(1.21, nafill(z, "locf"), c(NA,Inf,3,4,NaN,NaN,7,8,-Inf,-Inf)/2)
-test(1.22, nafill(z, "nocb"), c(Inf,Inf,3,4,NaN,7,7,8,-Inf,NA)/2)
+test(1.21, nafill(z, "locf"), c(NA,Inf,3,4,4,4,7,8,-Inf,-Inf)/2)
+test(1.22, nafill(z, "nocb"), c(Inf,Inf,3,4,7,7,7,8,-Inf,NA)/2)
 dt = data.table(x, y, z)
 test(1.31, nafill(dt, "locf"), unname(lapply(dt, nafill, "locf")))
 test(1.32, nafill(dt, "nocb"), unname(lapply(dt, nafill, "nocb")))
@@ -152,12 +152,13 @@ test(4.24, colnamesInt(dt, "a"), error="has no names")
 
 # verbose
 dt = data.table(a=c(1L, 2L, NA_integer_), b=c(1, 2, NA_real_))
-test(5.01, nafill(dt, "locf", verbose=TRUE), output="nafillInteger: took.*nafillDouble: took.*nafillR.*took")
-test(5.02, setnafill(dt, "locf", verbose=TRUE), output="nafillInteger: took.*nafillDouble: took.*nafillR.*took")
-test(5.03, nafill(dt, "locf", verbose=NA), error="verbose must be TRUE or FALSE")
+old=options(datatable.verbose=TRUE)
+test(5.01, nafill(dt, "locf"), output="nafillInteger: took.*nafillDouble: took.*nafillR.*took")
+test(5.02, setnafill(dt, "locf"), output="nafillInteger: took.*nafillDouble: took.*nafillR.*took")
 if (test_bit64) {
-  test(5.04, nafill(as.integer64(c(NA,2,NA,3)), "locf", verbose=TRUE), as.integer64(c(NA,2,2,3)), output="nafillInteger64: took.*nafillR.*took")
+  test(5.03, nafill(as.integer64(c(NA,2,NA,3)), "locf"), as.integer64(c(NA,2,2,3)), output="nafillInteger64: took.*nafillR.*took")
 }
+options(old)
 
 # coerceFill
 if (test_bit64) {
@@ -186,3 +187,19 @@ if (test_bit64) {
   test(6.53, identical(coerceFill(-2147483648), list(NA_integer_, -2147483648,  as.integer64("-2147483648"))))
   test(6.54, identical(coerceFill(-2147483649), list(NA_integer_, -2147483649,  as.integer64("-2147483649"))))
 }
+
+# nan argument to treat NaN as NA in nafill, #4020
+x = c(-1, NA, NaN, 0, 1, 2)
+ans1 = c(-1, 0, 0, 0:2)
+ans2 = c(-1, 0, NaN, 0:2)
+DT = data.table(a=x, b=x)
+test(7.01, nafill(x, fill=0), ans1)
+test(7.02, nafill(x, fill=0, nan=NaN), ans2)
+test(7.03, nafill(x, 'locf'), c(-1, -1, -1, 0:2))
+test(7.04, nafill(x, 'locf', nan=NaN), c(-1, -1, NaN, 0:2))
+test(7.05, nafill(x, 'nocb'), ans1)
+test(7.06, nafill(x, 'nocb', nan=NaN), c(-1, NaN, NaN, 0:2))
+test(7.07, setnafill(DT, fill=0, cols=1L), copy(DT)[ , a := ans1])
+test(7.08, setnafill(DT, fill=0, nan=NaN), copy(DT)[ , c('a', 'b') := .(ans1, ans2)])
+test(7.09, nafill(x, fill=0, nan=c(NA, NaN)), error="Argument 'nan' must be length 1")
+test(7.10, nafill(x, fill=0, nan=Inf), error="Argument 'nan' must be NA or NaN")
diff --git a/man/nafill.Rd b/man/nafill.Rd
@@ -10,20 +10,20 @@
   Fast fill missing values using constant value, \emph{last observation carried forward} or \emph{next observation carried backward}.
 }
 \usage{
-nafill(x, type=c("const","locf","nocb"), fill=NA,
-       verbose=getOption("datatable.verbose"))
-setnafill(x, type=c("const","locf","nocb"), fill=NA, cols=seq_along(x),
-          verbose=getOption("datatable.verbose"))
+nafill(x, type=c("const","locf","nocb"), fill=NA, nan=NA)
+setnafill(x, type=c("const","locf","nocb"), fill=NA, nan=NA, cols=seq_along(x))
 }
 \arguments{
   \item{x}{ vector, list, data.frame or data.table of numeric columns. }
   \item{type}{ character, one of \emph{"const"}, \emph{"locf"} or \emph{"nocb"}. Defaults to \code{"const"}. }
   \item{fill}{ numeric or integer, value to be used to fill when \code{type=="const"}. }
+  \item{nan}{ (numeric \code{x} only) Either \code{NaN} or \code{NA}; if the former, \code{NaN} is treated as distinct from \code{NA}, otherwise, they are treated the same during replacement? }
   \item{cols}{ numeric or character vector specifying columns to be updated. }
-  \item{verbose}{ logical, \code{TRUE} turns on timing messages to the console. }
 }
 \details{
   Only \emph{double} and \emph{integer} data types are currently supported.
+
+  Note that both \code{nafill} and \code{setnafill} provide some verbose output when \code{getOption('datatable.verbose')} is \code{TRUE}.
 }
 \value{
   A list except when the input is a \code{vector} in which case a \code{vector} is returned. For \code{setnafill} the input argument is returned, updated by reference.

diff --git a/src/data.table.h b/src/data.table.h
@@ -202,9 +202,9 @@ SEXP frollfunR(SEXP fun, SEXP obj, SEXP k, SEXP fill, SEXP algo, SEXP align, SEX
 SEXP frollapplyR(SEXP fun, SEXP obj, SEXP k, SEXP fill, SEXP align, SEXP rho);
 
 // nafill.c
-void nafillDouble(double *x, uint_fast64_t nx, unsigned int type, double fill, ans_t *ans, bool verbose);
+void nafillDouble(double *x, uint_fast64_t nx, unsigned int type, double fill, bool nan_is_na, ans_t *ans, bool verbose);
 void nafillInteger(int32_t *x, uint_fast64_t nx, unsigned int type, int32_t fill, ans_t *ans, bool verbose);
-SEXP nafillR(SEXP obj, SEXP type, SEXP fill, SEXP inplace, SEXP cols, SEXP verbose);
+SEXP nafillR(SEXP obj, SEXP type, SEXP fill, SEXP nan_is_na_arg, SEXP inplace, SEXP cols);
 
 // between.c
 SEXP between(SEXP x, SEXP lower, SEXP upper, SEXP incbounds, SEXP NAbounds, SEXP check);

diff --git a/src/nafill.c b/src/nafill.c
@@ -1,22 +1,40 @@
 #include "data.table.h"
 
-void nafillDouble(double *x, uint_fast64_t nx, unsigned int type, double fill, ans_t *ans, bool verbose) {
+void nafillDouble(double *x, uint_fast64_t nx, unsigned int type, double fill, bool nan_is_na, ans_t *ans, bool verbose) {
   double tic=0.0;
   if (verbose)
     tic = omp_get_wtime();
   if (type==0) { // const
-    for (uint_fast64_t i=0; i<nx; i++) {
-      ans->dbl_v[i] = ISNA(x[i]) ? fill : x[i];
+    if (nan_is_na) {
+      for (uint_fast64_t i=0; i<nx; i++) {
+        ans->dbl_v[i] = ISNAN(x[i]) ? fill : x[i];
+      }
+    } else {
+      for (uint_fast64_t i=0; i<nx; i++) {
+        ans->dbl_v[i] = ISNA(x[i]) ? fill : x[i];
+      }
     }
   } else if (type==1) { // locf
     ans->dbl_v[0] = x[0];
-    for (uint_fast64_t i=1; i<nx; i++) {
-      ans->dbl_v[i] = ISNA(x[i]) ? ans->dbl_v[i-1] : x[i];
+    if (nan_is_na) {
+      for (uint_fast64_t i=1; i<nx; i++) {
+        ans->dbl_v[i] = ISNAN(x[i]) ? ans->dbl_v[i-1] : x[i];
+      }
+    } else {
+      for (uint_fast64_t i=1; i<nx; i++) {
+        ans->dbl_v[i] = ISNA(x[i]) ? ans->dbl_v[i-1] : x[i];
+      }
     }
   } else if (type==2) { // nocb
     ans->dbl_v[nx-1] = x[nx-1];
-    for (int_fast64_t i=nx-2; i>=0; i--) {
-      ans->dbl_v[i] = ISNA(x[i]) ? ans->dbl_v[i+1] : x[i];
+    if (nan_is_na) {
+      for (int_fast64_t i=nx-2; i>=0; i--) {
+        ans->dbl_v[i] = ISNAN(x[i]) ? ans->dbl_v[i+1] : x[i];
+      }
+    } else {
+      for (int_fast64_t i=nx-2; i>=0; i--) {
+        ans->dbl_v[i] = ISNA(x[i]) ? ans->dbl_v[i+1] : x[i];
+      }
     }
   }
   if (verbose)
@@ -67,11 +85,9 @@ void nafillInteger64(int64_t *x, uint_fast64_t nx, unsigned int type, int64_t fi
     snprintf(ans->message[0], 500, "%s: took %.3fs\n", __func__, omp_get_wtime()-tic);
 }
 
-SEXP nafillR(SEXP obj, SEXP type, SEXP fill, SEXP inplace, SEXP cols, SEXP verbose) {
+SEXP nafillR(SEXP obj, SEXP type, SEXP fill, SEXP nan_is_na_arg, SEXP inplace, SEXP cols) {
   int protecti=0;
-  if (!IS_TRUE_OR_FALSE(verbose))
-    error("verbose must be TRUE or FALSE");
-  bool bverbose = LOGICAL(verbose)[0];
+  const bool verbose = GetVerbose();
 
   if (!xlength(obj))
     return(obj);
@@ -142,26 +158,29 @@ SEXP nafillR(SEXP obj, SEXP type, SEXP fill, SEXP inplace, SEXP cols, SEXP verbo
     coerceFill(fill, &dfill, &ifill, &i64fill);
 
   double tic=0.0, toc=0.0;
-  if (bverbose)
+  if (verbose)
     tic = omp_get_wtime();
   #pragma omp parallel for if (nx>1) num_threads(getDTthreads())
   for (R_len_t i=0; i<nx; i++) {
     SEXP this_x = VECTOR_ELT(x, i);
     switch (TYPEOF(this_x)) {
     case REALSXP : {
       if (INHERITS(this_x, char_integer64) || INHERITS(this_x, char_nanotime)) {  // inside parallel region so can't call Rinherits()
-        nafillInteger64(i64x[i], inx[i], itype, i64fill, &vans[i], bverbose);
+        nafillInteger64(i64x[i], inx[i], itype, i64fill, &vans[i], verbose);
       } else {
-        nafillDouble(dx[i], inx[i], itype, dfill, &vans[i], bverbose);
+        if (!IS_TRUE_OR_FALSE(nan_is_na_arg))
+          error("nan_is_na must be TRUE or FALSE"); // # nocov
+        bool nan_is_na = LOGICAL(nan_is_na_arg)[0];
+        nafillDouble(dx[i], inx[i], itype, dfill, nan_is_na, &vans[i], verbose);
       }
     } break;
     case INTSXP : {
-      nafillInteger(ix[i], inx[i], itype, ifill, &vans[i], bverbose);
+      nafillInteger(ix[i], inx[i], itype, ifill, &vans[i], verbose);
     } break;
     default: error("Internal error: invalid type argument in nafillR function, should have been caught before. Please report to data.table issue tracker."); // # nocov
     }
   }
-  if (bverbose)
+  if (verbose)
     toc = omp_get_wtime();
 
   if (!binplace) {
@@ -171,9 +190,9 @@ SEXP nafillR(SEXP obj, SEXP type, SEXP fill, SEXP inplace, SEXP cols, SEXP verbo
     }
   }
 
-  ansMsg(vans, nx, bverbose, __func__);
+  ansMsg(vans, nx, verbose, __func__);
 
-  if (bverbose)
+  if (verbose)
     Rprintf("%s: parallel processing of %d column(s) took %.3fs\n", __func__, nx, toc-tic);
 
   UNPROTECT(protecti);