diff --git a/NEWS.md b/NEWS.md index 5016cf79c..8f17827eb 100644 --- a/NEWS.md +++ b/NEWS.md @@ -14,6 +14,9 @@ 4. `setnames(DT, new=new_names)` (i.e. explicitly named `new=` argument) now works as expected rather than an error message requesting that `old=` be supplied too, [#4041](https://github.com/Rdatatable/data.table/issues/4041). Thanks @Kodiologist for the suggestion. +5. `nafill` and `setnafill` gain `nan` argument to say whether `NaN` should be considered the same as `NA` for filling purposes, [#4020](https://github.com/Rdatatable/data.table/issues/4020). Prior versions had an implicit value of `nan=NaN`; the default is now `nan=NA`, i.e., `NaN` is treated as if it's missing. Thanks @AnonymousBoba for the suggestion. Also, while `nafill` still respects `getOption('datatable.verbose')`, the `verbose` argument has been removed. + + ## BUG FIXES 1. A NULL timezone on POSIXct was interpreted by `as.IDate` and `as.ITime` as UTC rather than the session's default timezone (`tz=""`) , [#4085](https://github.com/Rdatatable/data.table/issues/4085). @@ -35,6 +38,7 @@ 1. `DT[, {...; .(A,B)}]` (i.e. when `.()` is the final item of a multi-statement `{...}`) now auto-names the columns `A` and `B` (just like `DT[, .(A,B)]`) rather than `V1` and `V2`, [#2478](https://github.com/Rdatatable/data.table/issues/2478) [#609](https://github.com/Rdatatable/data.table/issues/609). Similarly, `DT[, if (.N>1) .(B), by=A]` now auto-names the column `B` rather than `V1`. Explicit names are unaffected; e.g. `DT[, {... y= ...; .(A=C+y)}, by=...]` named the column `A` before, and still does. Thanks also to @renkun-ken for his go-first strong testing which caught an issue not caught by the test suite or by revdep testing, related to NULL being the last item, [#4061](https://github.com/Rdatatable/data.table/issues/4061). + ## BUG FIXES 1. `frollapply` could segfault and exceed R's C protect limits, [#3993](https://github.com/Rdatatable/data.table/issues/3993). Thanks to @DavisVaughan for reporting and fixing. diff --git a/R/shift.R b/R/shift.R index 125bf41ce..63a1cdec4 100644 --- a/R/shift.R +++ b/R/shift.R @@ -24,16 +24,16 @@ shift = function(x, n=1L, fill=NA, type=c("lag", "lead", "shift"), give.names=FA ans } -nafill = function(x, type=c("const","locf","nocb"), fill=NA, verbose=getOption("datatable.verbose")) { +nafill = function(x, type=c("const","locf","nocb"), fill=NA, nan=NA) { type = match.arg(type) if (type!="const" && !missing(fill)) warning("argument 'fill' ignored, only make sense for type='const'") - .Call(CnafillR, x, type, fill, FALSE, NULL, verbose) + .Call(CnafillR, x, type, fill, nan_is_na(nan), FALSE, NULL) } -setnafill = function(x, type=c("const","locf","nocb"), fill=NA, cols=seq_along(x), verbose=getOption("datatable.verbose")) { +setnafill = function(x, type=c("const","locf","nocb"), fill=NA, nan=NA, cols=seq_along(x)) { type = match.arg(type) if (type!="const" && !missing(fill)) warning("argument 'fill' ignored, only make sense for type='const'") - invisible(.Call(CnafillR, x, type, fill, TRUE, cols, verbose)) + invisible(.Call(CnafillR, x, type, fill, nan_is_na(nan), TRUE, cols)) } diff --git a/R/utils.R b/R/utils.R index 9f36a803c..7a355e4e8 100644 --- a/R/utils.R +++ b/R/utils.R @@ -13,6 +13,13 @@ if (base::getRversion() < "3.5.0") { isTRUEorNA = function(x) is.logical(x) && length(x)==1L && (is.na(x) || x) isTRUEorFALSE = function(x) is.logical(x) && length(x)==1L && !is.na(x) allNA = function(x) .Call(C_allNAR, x) +# helper for nan argument (e.g. nafill): TRUE -> treat NaN as NA +nan_is_na = function(x) { + if (length(x) != 1L) stop("Argument 'nan' must be length 1") + if (identical(x, NA) || identical(x, NA_real_)) return(TRUE) + if (identical(x, NaN)) return(FALSE) + stop("Argument 'nan' must be NA or NaN") +} if (base::getRversion() < "3.2.0") { # Apr 2015 isNamespaceLoaded = function(x) x %chin% loadedNamespaces() diff --git a/inst/tests/nafill.Rraw b/inst/tests/nafill.Rraw index ecaf61049..99a404b4d 100644 --- a/inst/tests/nafill.Rraw +++ b/inst/tests/nafill.Rraw @@ -44,8 +44,8 @@ z = y z[5L] = NaN z[2L] = Inf z[9L] = -Inf -test(1.21, nafill(z, "locf"), c(NA,Inf,3,4,NaN,NaN,7,8,-Inf,-Inf)/2) -test(1.22, nafill(z, "nocb"), c(Inf,Inf,3,4,NaN,7,7,8,-Inf,NA)/2) +test(1.21, nafill(z, "locf"), c(NA,Inf,3,4,4,4,7,8,-Inf,-Inf)/2) +test(1.22, nafill(z, "nocb"), c(Inf,Inf,3,4,7,7,7,8,-Inf,NA)/2) dt = data.table(x, y, z) test(1.31, nafill(dt, "locf"), unname(lapply(dt, nafill, "locf"))) test(1.32, nafill(dt, "nocb"), unname(lapply(dt, nafill, "nocb"))) @@ -152,12 +152,13 @@ test(4.24, colnamesInt(dt, "a"), error="has no names") # verbose dt = data.table(a=c(1L, 2L, NA_integer_), b=c(1, 2, NA_real_)) -test(5.01, nafill(dt, "locf", verbose=TRUE), output="nafillInteger: took.*nafillDouble: took.*nafillR.*took") -test(5.02, setnafill(dt, "locf", verbose=TRUE), output="nafillInteger: took.*nafillDouble: took.*nafillR.*took") -test(5.03, nafill(dt, "locf", verbose=NA), error="verbose must be TRUE or FALSE") +old=options(datatable.verbose=TRUE) +test(5.01, nafill(dt, "locf"), output="nafillInteger: took.*nafillDouble: took.*nafillR.*took") +test(5.02, setnafill(dt, "locf"), output="nafillInteger: took.*nafillDouble: took.*nafillR.*took") if (test_bit64) { - test(5.04, nafill(as.integer64(c(NA,2,NA,3)), "locf", verbose=TRUE), as.integer64(c(NA,2,2,3)), output="nafillInteger64: took.*nafillR.*took") + test(5.03, nafill(as.integer64(c(NA,2,NA,3)), "locf"), as.integer64(c(NA,2,2,3)), output="nafillInteger64: took.*nafillR.*took") } +options(old) # coerceFill if (test_bit64) { @@ -186,3 +187,19 @@ if (test_bit64) { test(6.53, identical(coerceFill(-2147483648), list(NA_integer_, -2147483648, as.integer64("-2147483648")))) test(6.54, identical(coerceFill(-2147483649), list(NA_integer_, -2147483649, as.integer64("-2147483649")))) } + +# nan argument to treat NaN as NA in nafill, #4020 +x = c(-1, NA, NaN, 0, 1, 2) +ans1 = c(-1, 0, 0, 0:2) +ans2 = c(-1, 0, NaN, 0:2) +DT = data.table(a=x, b=x) +test(7.01, nafill(x, fill=0), ans1) +test(7.02, nafill(x, fill=0, nan=NaN), ans2) +test(7.03, nafill(x, 'locf'), c(-1, -1, -1, 0:2)) +test(7.04, nafill(x, 'locf', nan=NaN), c(-1, -1, NaN, 0:2)) +test(7.05, nafill(x, 'nocb'), ans1) +test(7.06, nafill(x, 'nocb', nan=NaN), c(-1, NaN, NaN, 0:2)) +test(7.07, setnafill(DT, fill=0, cols=1L), copy(DT)[ , a := ans1]) +test(7.08, setnafill(DT, fill=0, nan=NaN), copy(DT)[ , c('a', 'b') := .(ans1, ans2)]) +test(7.09, nafill(x, fill=0, nan=c(NA, NaN)), error="Argument 'nan' must be length 1") +test(7.10, nafill(x, fill=0, nan=Inf), error="Argument 'nan' must be NA or NaN") diff --git a/man/nafill.Rd b/man/nafill.Rd index 8d7615e8b..f8afb1dcf 100644 --- a/man/nafill.Rd +++ b/man/nafill.Rd @@ -10,20 +10,20 @@ Fast fill missing values using constant value, \emph{last observation carried forward} or \emph{next observation carried backward}. } \usage{ -nafill(x, type=c("const","locf","nocb"), fill=NA, - verbose=getOption("datatable.verbose")) -setnafill(x, type=c("const","locf","nocb"), fill=NA, cols=seq_along(x), - verbose=getOption("datatable.verbose")) +nafill(x, type=c("const","locf","nocb"), fill=NA, nan=NA) +setnafill(x, type=c("const","locf","nocb"), fill=NA, nan=NA, cols=seq_along(x)) } \arguments{ \item{x}{ vector, list, data.frame or data.table of numeric columns. } \item{type}{ character, one of \emph{"const"}, \emph{"locf"} or \emph{"nocb"}. Defaults to \code{"const"}. } \item{fill}{ numeric or integer, value to be used to fill when \code{type=="const"}. } + \item{nan}{ (numeric \code{x} only) Either \code{NaN} or \code{NA}; if the former, \code{NaN} is treated as distinct from \code{NA}, otherwise, they are treated the same during replacement? } \item{cols}{ numeric or character vector specifying columns to be updated. } - \item{verbose}{ logical, \code{TRUE} turns on timing messages to the console. } } \details{ Only \emph{double} and \emph{integer} data types are currently supported. + + Note that both \code{nafill} and \code{setnafill} provide some verbose output when \code{getOption('datatable.verbose')} is \code{TRUE}. } \value{ A list except when the input is a \code{vector} in which case a \code{vector} is returned. For \code{setnafill} the input argument is returned, updated by reference. diff --git a/src/data.table.h b/src/data.table.h index a93e96a69..d972a2395 100644 --- a/src/data.table.h +++ b/src/data.table.h @@ -202,9 +202,9 @@ SEXP frollfunR(SEXP fun, SEXP obj, SEXP k, SEXP fill, SEXP algo, SEXP align, SEX SEXP frollapplyR(SEXP fun, SEXP obj, SEXP k, SEXP fill, SEXP align, SEXP rho); // nafill.c -void nafillDouble(double *x, uint_fast64_t nx, unsigned int type, double fill, ans_t *ans, bool verbose); +void nafillDouble(double *x, uint_fast64_t nx, unsigned int type, double fill, bool nan_is_na, ans_t *ans, bool verbose); void nafillInteger(int32_t *x, uint_fast64_t nx, unsigned int type, int32_t fill, ans_t *ans, bool verbose); -SEXP nafillR(SEXP obj, SEXP type, SEXP fill, SEXP inplace, SEXP cols, SEXP verbose); +SEXP nafillR(SEXP obj, SEXP type, SEXP fill, SEXP nan_is_na_arg, SEXP inplace, SEXP cols); // between.c SEXP between(SEXP x, SEXP lower, SEXP upper, SEXP incbounds, SEXP NAbounds, SEXP check); diff --git a/src/nafill.c b/src/nafill.c index 841d9f6fb..4792c9c63 100644 --- a/src/nafill.c +++ b/src/nafill.c @@ -1,22 +1,40 @@ #include "data.table.h" -void nafillDouble(double *x, uint_fast64_t nx, unsigned int type, double fill, ans_t *ans, bool verbose) { +void nafillDouble(double *x, uint_fast64_t nx, unsigned int type, double fill, bool nan_is_na, ans_t *ans, bool verbose) { double tic=0.0; if (verbose) tic = omp_get_wtime(); if (type==0) { // const - for (uint_fast64_t i=0; idbl_v[i] = ISNA(x[i]) ? fill : x[i]; + if (nan_is_na) { + for (uint_fast64_t i=0; idbl_v[i] = ISNAN(x[i]) ? fill : x[i]; + } + } else { + for (uint_fast64_t i=0; idbl_v[i] = ISNA(x[i]) ? fill : x[i]; + } } } else if (type==1) { // locf ans->dbl_v[0] = x[0]; - for (uint_fast64_t i=1; idbl_v[i] = ISNA(x[i]) ? ans->dbl_v[i-1] : x[i]; + if (nan_is_na) { + for (uint_fast64_t i=1; idbl_v[i] = ISNAN(x[i]) ? ans->dbl_v[i-1] : x[i]; + } + } else { + for (uint_fast64_t i=1; idbl_v[i] = ISNA(x[i]) ? ans->dbl_v[i-1] : x[i]; + } } } else if (type==2) { // nocb ans->dbl_v[nx-1] = x[nx-1]; - for (int_fast64_t i=nx-2; i>=0; i--) { - ans->dbl_v[i] = ISNA(x[i]) ? ans->dbl_v[i+1] : x[i]; + if (nan_is_na) { + for (int_fast64_t i=nx-2; i>=0; i--) { + ans->dbl_v[i] = ISNAN(x[i]) ? ans->dbl_v[i+1] : x[i]; + } + } else { + for (int_fast64_t i=nx-2; i>=0; i--) { + ans->dbl_v[i] = ISNA(x[i]) ? ans->dbl_v[i+1] : x[i]; + } } } if (verbose) @@ -67,11 +85,9 @@ void nafillInteger64(int64_t *x, uint_fast64_t nx, unsigned int type, int64_t fi snprintf(ans->message[0], 500, "%s: took %.3fs\n", __func__, omp_get_wtime()-tic); } -SEXP nafillR(SEXP obj, SEXP type, SEXP fill, SEXP inplace, SEXP cols, SEXP verbose) { +SEXP nafillR(SEXP obj, SEXP type, SEXP fill, SEXP nan_is_na_arg, SEXP inplace, SEXP cols) { int protecti=0; - if (!IS_TRUE_OR_FALSE(verbose)) - error("verbose must be TRUE or FALSE"); - bool bverbose = LOGICAL(verbose)[0]; + const bool verbose = GetVerbose(); if (!xlength(obj)) return(obj); @@ -142,7 +158,7 @@ SEXP nafillR(SEXP obj, SEXP type, SEXP fill, SEXP inplace, SEXP cols, SEXP verbo coerceFill(fill, &dfill, &ifill, &i64fill); double tic=0.0, toc=0.0; - if (bverbose) + if (verbose) tic = omp_get_wtime(); #pragma omp parallel for if (nx>1) num_threads(getDTthreads()) for (R_len_t i=0; i