Skip to content

Commit

Permalink
nafill gains nan argument (#4025)
Browse files Browse the repository at this point in the history
  • Loading branch information
MichaelChirico authored and mattdowle committed Dec 18, 2019
1 parent 8ce5f09 commit 51805cd
Show file tree
Hide file tree
Showing 7 changed files with 82 additions and 35 deletions.
4 changes: 4 additions & 0 deletions NEWS.md
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,9 @@

4. `setnames(DT, new=new_names)` (i.e. explicitly named `new=` argument) now works as expected rather than an error message requesting that `old=` be supplied too, [#4041](https://github.com/Rdatatable/data.table/issues/4041). Thanks @Kodiologist for the suggestion.

5. `nafill` and `setnafill` gain `nan` argument to say whether `NaN` should be considered the same as `NA` for filling purposes, [#4020](https://github.com/Rdatatable/data.table/issues/4020). Prior versions had an implicit value of `nan=NaN`; the default is now `nan=NA`, i.e., `NaN` is treated as if it's missing. Thanks @AnonymousBoba for the suggestion. Also, while `nafill` still respects `getOption('datatable.verbose')`, the `verbose` argument has been removed.


## BUG FIXES

1. A NULL timezone on POSIXct was interpreted by `as.IDate` and `as.ITime` as UTC rather than the session's default timezone (`tz=""`) , [#4085](https://github.com/Rdatatable/data.table/issues/4085).
Expand All @@ -35,6 +38,7 @@

1. `DT[, {...; .(A,B)}]` (i.e. when `.()` is the final item of a multi-statement `{...}`) now auto-names the columns `A` and `B` (just like `DT[, .(A,B)]`) rather than `V1` and `V2`, [#2478](https://github.com/Rdatatable/data.table/issues/2478) [#609](https://github.com/Rdatatable/data.table/issues/609). Similarly, `DT[, if (.N>1) .(B), by=A]` now auto-names the column `B` rather than `V1`. Explicit names are unaffected; e.g. `DT[, {... y= ...; .(A=C+y)}, by=...]` named the column `A` before, and still does. Thanks also to @renkun-ken for his go-first strong testing which caught an issue not caught by the test suite or by revdep testing, related to NULL being the last item, [#4061](https://github.com/Rdatatable/data.table/issues/4061).


## BUG FIXES

1. `frollapply` could segfault and exceed R's C protect limits, [#3993](https://github.com/Rdatatable/data.table/issues/3993). Thanks to @DavisVaughan for reporting and fixing.
Expand Down
8 changes: 4 additions & 4 deletions R/shift.R
Original file line number Diff line number Diff line change
Expand Up @@ -24,16 +24,16 @@ shift = function(x, n=1L, fill=NA, type=c("lag", "lead", "shift"), give.names=FA
ans
}

nafill = function(x, type=c("const","locf","nocb"), fill=NA, verbose=getOption("datatable.verbose")) {
nafill = function(x, type=c("const","locf","nocb"), fill=NA, nan=NA) {
type = match.arg(type)
if (type!="const" && !missing(fill))
warning("argument 'fill' ignored, only make sense for type='const'")
.Call(CnafillR, x, type, fill, FALSE, NULL, verbose)
.Call(CnafillR, x, type, fill, nan_is_na(nan), FALSE, NULL)
}

setnafill = function(x, type=c("const","locf","nocb"), fill=NA, cols=seq_along(x), verbose=getOption("datatable.verbose")) {
setnafill = function(x, type=c("const","locf","nocb"), fill=NA, nan=NA, cols=seq_along(x)) {
type = match.arg(type)
if (type!="const" && !missing(fill))
warning("argument 'fill' ignored, only make sense for type='const'")
invisible(.Call(CnafillR, x, type, fill, TRUE, cols, verbose))
invisible(.Call(CnafillR, x, type, fill, nan_is_na(nan), TRUE, cols))
}
7 changes: 7 additions & 0 deletions R/utils.R
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,13 @@ if (base::getRversion() < "3.5.0") {
isTRUEorNA = function(x) is.logical(x) && length(x)==1L && (is.na(x) || x)
isTRUEorFALSE = function(x) is.logical(x) && length(x)==1L && !is.na(x)
allNA = function(x) .Call(C_allNAR, x)
# helper for nan argument (e.g. nafill): TRUE -> treat NaN as NA
nan_is_na = function(x) {
if (length(x) != 1L) stop("Argument 'nan' must be length 1")
if (identical(x, NA) || identical(x, NA_real_)) return(TRUE)
if (identical(x, NaN)) return(FALSE)
stop("Argument 'nan' must be NA or NaN")
}

if (base::getRversion() < "3.2.0") { # Apr 2015
isNamespaceLoaded = function(x) x %chin% loadedNamespaces()
Expand Down
29 changes: 23 additions & 6 deletions inst/tests/nafill.Rraw
Original file line number Diff line number Diff line change
Expand Up @@ -44,8 +44,8 @@ z = y
z[5L] = NaN
z[2L] = Inf
z[9L] = -Inf
test(1.21, nafill(z, "locf"), c(NA,Inf,3,4,NaN,NaN,7,8,-Inf,-Inf)/2)
test(1.22, nafill(z, "nocb"), c(Inf,Inf,3,4,NaN,7,7,8,-Inf,NA)/2)
test(1.21, nafill(z, "locf"), c(NA,Inf,3,4,4,4,7,8,-Inf,-Inf)/2)
test(1.22, nafill(z, "nocb"), c(Inf,Inf,3,4,7,7,7,8,-Inf,NA)/2)
dt = data.table(x, y, z)
test(1.31, nafill(dt, "locf"), unname(lapply(dt, nafill, "locf")))
test(1.32, nafill(dt, "nocb"), unname(lapply(dt, nafill, "nocb")))
Expand Down Expand Up @@ -152,12 +152,13 @@ test(4.24, colnamesInt(dt, "a"), error="has no names")

# verbose
dt = data.table(a=c(1L, 2L, NA_integer_), b=c(1, 2, NA_real_))
test(5.01, nafill(dt, "locf", verbose=TRUE), output="nafillInteger: took.*nafillDouble: took.*nafillR.*took")
test(5.02, setnafill(dt, "locf", verbose=TRUE), output="nafillInteger: took.*nafillDouble: took.*nafillR.*took")
test(5.03, nafill(dt, "locf", verbose=NA), error="verbose must be TRUE or FALSE")
old=options(datatable.verbose=TRUE)
test(5.01, nafill(dt, "locf"), output="nafillInteger: took.*nafillDouble: took.*nafillR.*took")
test(5.02, setnafill(dt, "locf"), output="nafillInteger: took.*nafillDouble: took.*nafillR.*took")
if (test_bit64) {
test(5.04, nafill(as.integer64(c(NA,2,NA,3)), "locf", verbose=TRUE), as.integer64(c(NA,2,2,3)), output="nafillInteger64: took.*nafillR.*took")
test(5.03, nafill(as.integer64(c(NA,2,NA,3)), "locf"), as.integer64(c(NA,2,2,3)), output="nafillInteger64: took.*nafillR.*took")
}
options(old)

# coerceFill
if (test_bit64) {
Expand Down Expand Up @@ -186,3 +187,19 @@ if (test_bit64) {
test(6.53, identical(coerceFill(-2147483648), list(NA_integer_, -2147483648, as.integer64("-2147483648"))))
test(6.54, identical(coerceFill(-2147483649), list(NA_integer_, -2147483649, as.integer64("-2147483649"))))
}

# nan argument to treat NaN as NA in nafill, #4020
x = c(-1, NA, NaN, 0, 1, 2)
ans1 = c(-1, 0, 0, 0:2)
ans2 = c(-1, 0, NaN, 0:2)
DT = data.table(a=x, b=x)
test(7.01, nafill(x, fill=0), ans1)
test(7.02, nafill(x, fill=0, nan=NaN), ans2)
test(7.03, nafill(x, 'locf'), c(-1, -1, -1, 0:2))
test(7.04, nafill(x, 'locf', nan=NaN), c(-1, -1, NaN, 0:2))
test(7.05, nafill(x, 'nocb'), ans1)
test(7.06, nafill(x, 'nocb', nan=NaN), c(-1, NaN, NaN, 0:2))
test(7.07, setnafill(DT, fill=0, cols=1L), copy(DT)[ , a := ans1])
test(7.08, setnafill(DT, fill=0, nan=NaN), copy(DT)[ , c('a', 'b') := .(ans1, ans2)])
test(7.09, nafill(x, fill=0, nan=c(NA, NaN)), error="Argument 'nan' must be length 1")
test(7.10, nafill(x, fill=0, nan=Inf), error="Argument 'nan' must be NA or NaN")
10 changes: 5 additions & 5 deletions man/nafill.Rd
Original file line number Diff line number Diff line change
Expand Up @@ -10,20 +10,20 @@
Fast fill missing values using constant value, \emph{last observation carried forward} or \emph{next observation carried backward}.
}
\usage{
nafill(x, type=c("const","locf","nocb"), fill=NA,
verbose=getOption("datatable.verbose"))
setnafill(x, type=c("const","locf","nocb"), fill=NA, cols=seq_along(x),
verbose=getOption("datatable.verbose"))
nafill(x, type=c("const","locf","nocb"), fill=NA, nan=NA)
setnafill(x, type=c("const","locf","nocb"), fill=NA, nan=NA, cols=seq_along(x))
}
\arguments{
\item{x}{ vector, list, data.frame or data.table of numeric columns. }
\item{type}{ character, one of \emph{"const"}, \emph{"locf"} or \emph{"nocb"}. Defaults to \code{"const"}. }
\item{fill}{ numeric or integer, value to be used to fill when \code{type=="const"}. }
\item{nan}{ (numeric \code{x} only) Either \code{NaN} or \code{NA}; if the former, \code{NaN} is treated as distinct from \code{NA}, otherwise, they are treated the same during replacement? }
\item{cols}{ numeric or character vector specifying columns to be updated. }
\item{verbose}{ logical, \code{TRUE} turns on timing messages to the console. }
}
\details{
Only \emph{double} and \emph{integer} data types are currently supported.

Note that both \code{nafill} and \code{setnafill} provide some verbose output when \code{getOption('datatable.verbose')} is \code{TRUE}.
}
\value{
A list except when the input is a \code{vector} in which case a \code{vector} is returned. For \code{setnafill} the input argument is returned, updated by reference.
Expand Down
4 changes: 2 additions & 2 deletions src/data.table.h
Original file line number Diff line number Diff line change
Expand Up @@ -202,9 +202,9 @@ SEXP frollfunR(SEXP fun, SEXP obj, SEXP k, SEXP fill, SEXP algo, SEXP align, SEX
SEXP frollapplyR(SEXP fun, SEXP obj, SEXP k, SEXP fill, SEXP align, SEXP rho);

// nafill.c
void nafillDouble(double *x, uint_fast64_t nx, unsigned int type, double fill, ans_t *ans, bool verbose);
void nafillDouble(double *x, uint_fast64_t nx, unsigned int type, double fill, bool nan_is_na, ans_t *ans, bool verbose);
void nafillInteger(int32_t *x, uint_fast64_t nx, unsigned int type, int32_t fill, ans_t *ans, bool verbose);
SEXP nafillR(SEXP obj, SEXP type, SEXP fill, SEXP inplace, SEXP cols, SEXP verbose);
SEXP nafillR(SEXP obj, SEXP type, SEXP fill, SEXP nan_is_na_arg, SEXP inplace, SEXP cols);

// between.c
SEXP between(SEXP x, SEXP lower, SEXP upper, SEXP incbounds, SEXP NAbounds, SEXP check);
Expand Down
55 changes: 37 additions & 18 deletions src/nafill.c
Original file line number Diff line number Diff line change
@@ -1,22 +1,40 @@
#include "data.table.h"

void nafillDouble(double *x, uint_fast64_t nx, unsigned int type, double fill, ans_t *ans, bool verbose) {
void nafillDouble(double *x, uint_fast64_t nx, unsigned int type, double fill, bool nan_is_na, ans_t *ans, bool verbose) {
double tic=0.0;
if (verbose)
tic = omp_get_wtime();
if (type==0) { // const
for (uint_fast64_t i=0; i<nx; i++) {
ans->dbl_v[i] = ISNA(x[i]) ? fill : x[i];
if (nan_is_na) {
for (uint_fast64_t i=0; i<nx; i++) {
ans->dbl_v[i] = ISNAN(x[i]) ? fill : x[i];
}
} else {
for (uint_fast64_t i=0; i<nx; i++) {
ans->dbl_v[i] = ISNA(x[i]) ? fill : x[i];
}
}
} else if (type==1) { // locf
ans->dbl_v[0] = x[0];
for (uint_fast64_t i=1; i<nx; i++) {
ans->dbl_v[i] = ISNA(x[i]) ? ans->dbl_v[i-1] : x[i];
if (nan_is_na) {
for (uint_fast64_t i=1; i<nx; i++) {
ans->dbl_v[i] = ISNAN(x[i]) ? ans->dbl_v[i-1] : x[i];
}
} else {
for (uint_fast64_t i=1; i<nx; i++) {
ans->dbl_v[i] = ISNA(x[i]) ? ans->dbl_v[i-1] : x[i];
}
}
} else if (type==2) { // nocb
ans->dbl_v[nx-1] = x[nx-1];
for (int_fast64_t i=nx-2; i>=0; i--) {
ans->dbl_v[i] = ISNA(x[i]) ? ans->dbl_v[i+1] : x[i];
if (nan_is_na) {
for (int_fast64_t i=nx-2; i>=0; i--) {
ans->dbl_v[i] = ISNAN(x[i]) ? ans->dbl_v[i+1] : x[i];
}
} else {
for (int_fast64_t i=nx-2; i>=0; i--) {
ans->dbl_v[i] = ISNA(x[i]) ? ans->dbl_v[i+1] : x[i];
}
}
}
if (verbose)
Expand Down Expand Up @@ -67,11 +85,9 @@ void nafillInteger64(int64_t *x, uint_fast64_t nx, unsigned int type, int64_t fi
snprintf(ans->message[0], 500, "%s: took %.3fs\n", __func__, omp_get_wtime()-tic);
}

SEXP nafillR(SEXP obj, SEXP type, SEXP fill, SEXP inplace, SEXP cols, SEXP verbose) {
SEXP nafillR(SEXP obj, SEXP type, SEXP fill, SEXP nan_is_na_arg, SEXP inplace, SEXP cols) {
int protecti=0;
if (!IS_TRUE_OR_FALSE(verbose))
error("verbose must be TRUE or FALSE");
bool bverbose = LOGICAL(verbose)[0];
const bool verbose = GetVerbose();

if (!xlength(obj))
return(obj);
Expand Down Expand Up @@ -142,26 +158,29 @@ SEXP nafillR(SEXP obj, SEXP type, SEXP fill, SEXP inplace, SEXP cols, SEXP verbo
coerceFill(fill, &dfill, &ifill, &i64fill);

double tic=0.0, toc=0.0;
if (bverbose)
if (verbose)
tic = omp_get_wtime();
#pragma omp parallel for if (nx>1) num_threads(getDTthreads())
for (R_len_t i=0; i<nx; i++) {
SEXP this_x = VECTOR_ELT(x, i);
switch (TYPEOF(this_x)) {
case REALSXP : {
if (INHERITS(this_x, char_integer64) || INHERITS(this_x, char_nanotime)) { // inside parallel region so can't call Rinherits()
nafillInteger64(i64x[i], inx[i], itype, i64fill, &vans[i], bverbose);
nafillInteger64(i64x[i], inx[i], itype, i64fill, &vans[i], verbose);
} else {
nafillDouble(dx[i], inx[i], itype, dfill, &vans[i], bverbose);
if (!IS_TRUE_OR_FALSE(nan_is_na_arg))
error("nan_is_na must be TRUE or FALSE"); // # nocov
bool nan_is_na = LOGICAL(nan_is_na_arg)[0];
nafillDouble(dx[i], inx[i], itype, dfill, nan_is_na, &vans[i], verbose);
}
} break;
case INTSXP : {
nafillInteger(ix[i], inx[i], itype, ifill, &vans[i], bverbose);
nafillInteger(ix[i], inx[i], itype, ifill, &vans[i], verbose);
} break;
default: error("Internal error: invalid type argument in nafillR function, should have been caught before. Please report to data.table issue tracker."); // # nocov
}
}
if (bverbose)
if (verbose)
toc = omp_get_wtime();

if (!binplace) {
Expand All @@ -171,9 +190,9 @@ SEXP nafillR(SEXP obj, SEXP type, SEXP fill, SEXP inplace, SEXP cols, SEXP verbo
}
}

ansMsg(vans, nx, bverbose, __func__);
ansMsg(vans, nx, verbose, __func__);

if (bverbose)
if (verbose)
Rprintf("%s: parallel processing of %d column(s) took %.3fs\n", __func__, nx, toc-tic);

UNPROTECT(protecti);
Expand Down

0 comments on commit 51805cd

Please sign in to comment.