diff --git a/NAMESPACE b/NAMESPACE index aa3fb7292..e565c631c 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -48,6 +48,8 @@ S3method(groupingsets, data.table) S3method(cube, data.table) S3method(rollup, data.table) export(frollmean) +export(nafill) +export(setnafill) S3method("[", data.table) S3method("[<-", data.table) diff --git a/NEWS.md b/NEWS.md index d68d35945..435568c11 100644 --- a/NEWS.md +++ b/NEWS.md @@ -35,6 +35,8 @@ 5. `print.data.table()` gains an option to display the timezone of `POSIXct` columns when available, [#2842](https://github.com/Rdatatable/data.table/issues/2842). Thanks to Michael Chirico for reporting and Felipe Parages for the PR. +6. New functions `nafill` and `setnafill`, [#854](https://github.com/Rdatatable/data.table/issues/854). Thanks to Matthieu Gomez for the request and Jan Gorecki for implementing. + #### BUG FIXES 1. `first`, `last`, `head` and `tail` by group no longer error in some cases, [#2030](https://github.com/Rdatatable/data.table/issues/2030) [#3462](https://github.com/Rdatatable/data.table/issues/3462). Thanks to @franknarf1 for reporting. diff --git a/R/shift.R b/R/shift.R index 4f6e7bdc7..536e75cbe 100644 --- a/R/shift.R +++ b/R/shift.R @@ -23,3 +23,21 @@ shift <- function(x, n=1L, fill=NA, type=c("lag", "lead", "shift"), give.names=F } ans } + +nafill = function(x, type=c("const","locf","nocb"), fill=NA, verbose=getOption("datatable.verbose")) { + type = match.arg(type) + if (type!="const" && !missing(fill)) + warning("argument 'fill' ignored, only make sense for type='const'") + .Call(CnafillR, x, type, fill, FALSE, NULL, verbose) +} + +setnafill = function(x, type=c("const","locf","nocb"), fill=NA, cols=seq_along(x), verbose=getOption("datatable.verbose")) { + type = match.arg(type) + if (type!="const" && !missing(fill)) + warning("argument 'fill' ignored, only make sense for type='const'") + invisible(.Call(CnafillR, x, type, fill, TRUE, cols, verbose)) +} + +colnamesInt = function(x, cols) { + .Call(CcolnamesInt, x, cols) +} diff --git a/inst/tests/nafill.Rraw b/inst/tests/nafill.Rraw new file mode 100644 index 000000000..2851f5726 --- /dev/null +++ b/inst/tests/nafill.Rraw @@ -0,0 +1,110 @@ +require(methods) +if (exists("test.data.table", .GlobalEnv, inherits=FALSE)) { + if (!identical(suppressWarnings(packageDescription("data.table")), NA)) { + remove.packages("data.table") + stop("This is dev mode but data.table was installed. Uninstalled it. Please q() this R session and try cc() again. The installed namespace causes problems in dev mode for the S4 tests.\n") + } + if ((tt<-compiler::enableJIT(-1))>0) + cat("This is dev mode and JIT is enabled (level ", tt, ") so there will be a brief pause around the first test.\n", sep="") +} else { + require(data.table) + test = data.table:::test + INT = data.table:::INT + colnamesInt = data.table:::colnamesInt +} + +x = 1:10 +x[c(1:2, 5:6, 9:10)] = NA +test(1.01, nafill(x, "locf"), INT(NA,NA,3,4,4,4,7,8,8,8)) +test(1.02, nafill(x, "nocb"), INT(3,3,3,4,7,7,7,8,NA,NA)) +test(1.03, nafill(x, fill=0L), INT(0,0,3,4,0,0,7,8,0,0)) +test(1.04, nafill(x, fill=5), INT(5,5,3,4,5,5,7,8,5,5)) +test(1.05, nafill(x, fill=NA_integer_), x) +test(1.06, nafill(x, fill=NA), x) +test(1.07, nafill(x, fill=NA_real_), x) +test(1.08, nafill(x, fill=Inf), x) +test(1.09, nafill(x, fill=NaN), x) +y = x/2 +test(1.11, nafill(y, "locf"), c(NA,NA,3,4,4,4,7,8,8,8)/2) +test(1.12, nafill(y, "nocb"), c(3,3,3,4,7,7,7,8,NA,NA)/2) +test(1.13, nafill(y, fill=0L), c(0,0,3,4,0,0,7,8,0,0)/2) +test(1.14, nafill(y, fill=5/2), c(5,5,3,4,5,5,7,8,5,5)/2) +test(1.15, nafill(y, fill=NA_integer_), y) +test(1.16, nafill(y, fill=NA), y) +test(1.17, nafill(y, fill=NA_real_), y) +test(1.18, nafill(y, fill=Inf), c(Inf,Inf,3,4,Inf,Inf,7,8,Inf,Inf)/2) +test(1.19, nafill(y, fill=NaN), c(NaN,NaN,3,4,NaN,NaN,7,8,NaN,NaN)/2) +z = y +z[5L] = NaN +z[2L] = Inf +z[9L] = -Inf +test(1.21, nafill(z, "locf"), c(NA,Inf,3,4,NaN,NaN,7,8,-Inf,-Inf)/2) +test(1.22, nafill(z, "nocb"), c(Inf,Inf,3,4,NaN,7,7,8,-Inf,NA)/2) +dt = data.table(x, y, z) +test(1.31, nafill(dt, "locf"), unname(lapply(dt, nafill, "locf"))) +test(1.32, nafill(dt, "nocb"), unname(lapply(dt, nafill, "nocb"))) +test(1.33, nafill(dt, fill=0), unname(lapply(dt, nafill, fill=0))) +l = list(x, y[1:8], z[1:6]) +test(1.41, nafill(l, "locf"), lapply(l, nafill, "locf")) +test(1.42, nafill(l, "nocb"), lapply(l, nafill, "nocb")) +test(1.43, nafill(l, fill=0), lapply(l, nafill, fill=0)) + +# setnafill +dt = data.table(V1=1:10, V2=10:1, V3=1:10/2) +dt[c(1L,4:5,9:10), V1:=NA][c(2:3, 5:6, 10L), V2:=NA][c(1:2, 5:6, 9:10), V3:=NA] +db = copy(dt) +test(2.01, {setnafill(dt, fill=0); dt}, as.data.table(nafill(db, fill=0))) +dt = copy(db) +test(2.02, {setnafill(dt, "locf"); dt}, as.data.table(nafill(db, "locf"))) +dt = copy(db) +test(2.03, {setnafill(dt, "nocb"); dt}, as.data.table(nafill(db, "nocb"))) +dt = copy(db) +test(2.04, {setnafill(dt, fill=0, cols=c("V2","V3")); dt}, db[, c(list(V1), nafill(.SD, fill=0)), .SDcols=c("V2","V3")]) +dt = copy(db) +test(2.05, {setnafill(dt, "locf", cols=c("V2","V3")); dt}, db[, c(list(V1), nafill(.SD, "locf")), .SDcols=c("V2","V3")]) +dt = copy(db) +test(2.06, {setnafill(dt, "nocb", cols=c("V2","V3")); dt}, db[, c(list(V1), nafill(.SD, "nocb")), .SDcols=c("V2","V3")]) +db[, "V4" := c(letters[1:3],NA,letters[5:7],NA,letters[9:10])] +dt = copy(db) +test(2.07, {setnafill(dt, "locf", cols=c("V2","V3")); dt}, db[, c(list(V1), nafill(.SD, "locf"), list(V4)), .SDcols=c("V2","V3")]) + +# exceptions test coverage +x = 1:10 +test(3.01, nafill(x, "locf", fill=0L), nafill(x, "locf"), warning="argument 'fill' ignored") +test(3.02, setnafill(list(copy(x)), "locf", fill=0L), setnafill(list(copy(x)), "locf"), warning="argument 'fill' ignored") +test(3.03, setnafill(x, "locf"), error="in-place update is supported only for list") +test(3.04, nafill(letters[1:5], fill=0), error="must be numeric type, or list/data.table") +test(3.05, setnafill(list(letters[1:5]), fill=0), error="must be numeric type, or list/data.table") +test(3.06, nafill(x, fill=1:2), error="fill must be a vector of length 1") +test(3.07, nafill(x, fill="asd"), error="fill must be numeric") + +# colnamesInt helper +dt = data.table(a=1, b=2, d=3) +test(4.01, colnamesInt(dt, "a"), 1L) +test(4.02, colnamesInt(dt, 1L), 1L) +test(4.03, colnamesInt(dt, 1), 1L) +test(4.04, colnamesInt(dt, c("a","d")), c(1L, 3L)) +test(4.05, colnamesInt(dt, c(1L, 3L)), c(1L, 3L)) +test(4.06, colnamesInt(dt, c(1, 3)), c(1L, 3L)) +test(4.07, colnamesInt(dt, c("a", "e")), error="argument specify non existing column") +test(4.08, colnamesInt(dt, c(1L, 4L)), error="argument specify non existing column") +test(4.09, colnamesInt(dt, c(1, 4)), error="argument specify non existing column") +test(4.10, colnamesInt(dt, c("a", NA)), error="argument specify non existing column") +test(4.11, colnamesInt(dt, c(1L, NA)), error="argument specify non existing column") +test(4.12, colnamesInt(dt, c(1, NA)), error="argument specify non existing column") +test(4.13, colnamesInt(dt, c("a","d","a")), error="argument specify duplicated column") +test(4.14, colnamesInt(dt, c(1L, 3L, 1L)), error="argument specify duplicated column") +test(4.15, colnamesInt(dt, c(1, 3, 1)), error="argument specify duplicated column") +test(4.16, colnamesInt(dt, list("a")), error="argument must be character or numeric") +test(4.17, colnamesInt(dt, NA), error="argument must be character or numeric") +test(4.18, colnamesInt(dt, character()), integer()) +test(4.19, colnamesInt(dt, numeric()), integer()) +test(4.20, colnamesInt(dt, integer()), integer()) +test(4.21, colnamesInt(dt, NULL), seq_along(dt)) + +# verbose +dt = data.table(a=c(1L, 2L, NA_integer_), b=c(1, 2, NA_real_)) +test(5.01, nafill(dt, "locf", verbose=TRUE), output="nafillInteger: took.*nafillDouble: took.*nafillR.*took") +test(5.02, setnafill(dt, "locf", verbose=TRUE), output="nafillInteger: took.*nafillDouble: took.*nafillR.*took") +test(5.03, nafill(dt, "locf", verbose=NA), error="verbose must be TRUE or FALSE") + diff --git a/man/nafill.Rd b/man/nafill.Rd new file mode 100644 index 000000000..8d7615e8b --- /dev/null +++ b/man/nafill.Rd @@ -0,0 +1,45 @@ +\name{nafill} +\alias{nafill} +\alias{fill} +\alias{setnafill} +\alias{locf} +\alias{nocb} +\alias{na.fill} +\title{Fill missing values} +\description{ + Fast fill missing values using constant value, \emph{last observation carried forward} or \emph{next observation carried backward}. +} +\usage{ +nafill(x, type=c("const","locf","nocb"), fill=NA, + verbose=getOption("datatable.verbose")) +setnafill(x, type=c("const","locf","nocb"), fill=NA, cols=seq_along(x), + verbose=getOption("datatable.verbose")) +} +\arguments{ + \item{x}{ vector, list, data.frame or data.table of numeric columns. } + \item{type}{ character, one of \emph{"const"}, \emph{"locf"} or \emph{"nocb"}. Defaults to \code{"const"}. } + \item{fill}{ numeric or integer, value to be used to fill when \code{type=="const"}. } + \item{cols}{ numeric or character vector specifying columns to be updated. } + \item{verbose}{ logical, \code{TRUE} turns on timing messages to the console. } +} +\details{ + Only \emph{double} and \emph{integer} data types are currently supported. +} +\value{ + A list except when the input is a \code{vector} in which case a \code{vector} is returned. For \code{setnafill} the input argument is returned, updated by reference. +} +\examples{ +x = 1:10 +x[c(1:2, 5:6, 9:10)] = NA +nafill(x, "locf") + +dt = data.table(v1=x, v2=shift(x)/2, v3=shift(x, -1L)/2) +nafill(dt, "nocb") + +setnafill(dt, "locf", cols=c("v2","v3")) +dt +} +\seealso{ + \code{\link{shift}}, \code{\link{data.table}} +} +\keyword{ data } diff --git a/src/data.table.h b/src/data.table.h index 6875bef72..b0cdc444d 100644 --- a/src/data.table.h +++ b/src/data.table.h @@ -169,3 +169,9 @@ void fadaptiverollmeanExact(double *x, uint_fast64_t nx, double_ans_t *ans, int // frollR.c SEXP frollfunR(SEXP fun, SEXP obj, SEXP k, SEXP fill, SEXP algo, SEXP align, SEXP narm, SEXP hasNA, SEXP adaptive, SEXP verbose); + +// nafill.c +SEXP colnamesInt(SEXP x, SEXP cols); +void nafillDouble(double *x, uint_fast64_t nx, unsigned int type, double fill, ans_t *ans, bool verbose); +void nafillInteger(int32_t *x, uint_fast64_t nx, unsigned int type, int32_t fill, ans_t *ans, bool verbose); +SEXP nafillR(SEXP obj, SEXP type, SEXP fill, SEXP inplace, SEXP cols, SEXP verbose); diff --git a/src/init.c b/src/init.c index b3c307bad..7039bf747 100644 --- a/src/init.c +++ b/src/init.c @@ -80,6 +80,8 @@ SEXP hasOpenMP(); SEXP uniqueNlogical(); SEXP frollfunR(); SEXP dllVersion(); +SEXP nafillR(); +SEXP colnamesInt(); // .Externals SEXP fastmean(); @@ -162,6 +164,8 @@ R_CallMethodDef callMethods[] = { {"CuniqueNlogical", (DL_FUNC) &uniqueNlogical, -1}, {"CfrollfunR", (DL_FUNC) &frollfunR, -1}, {"CdllVersion", (DL_FUNC) &dllVersion, -1}, +{"CnafillR", (DL_FUNC) &nafillR, -1}, +{"CcolnamesInt", (DL_FUNC) &colnamesInt, -1}, {NULL, NULL, 0} }; diff --git a/src/nafill.c b/src/nafill.c new file mode 100644 index 000000000..51cf02364 --- /dev/null +++ b/src/nafill.c @@ -0,0 +1,194 @@ +#include "data.table.h" +#include + +SEXP colnamesInt(SEXP x, SEXP cols) { + if (!isNewList(x)) error("'x' argument must be data.table"); + int protecti=0; + R_len_t nx = length(x); + SEXP ricols = R_NilValue; + if (isNull(cols)) { // seq_along(x) + ricols = PROTECT(allocVector(INTSXP, nx)); protecti++; + int *icols = INTEGER(ricols); + for (int i=0; inx) || (icols[i]<1)) error("'cols' argument specify non existing column(s)"); // handles NAs also + } else if (isString(cols)) { + SEXP xnames = PROTECT(getAttrib(x, R_NamesSymbol)); protecti++; + if (isNull(xnames)) error("'x' argument data.table has no names"); + ricols = PROTECT(chmatch(cols, xnames, 0)); protecti++; + int *icols = INTEGER(ricols); + for (int i=0; idbl_v[i] = ISNA(x[i]) ? fill : x[i]; + } + } else if (type==1) { // locf + ans->dbl_v[0] = x[0]; + for (uint_fast64_t i=1; idbl_v[i] = ISNA(x[i]) ? ans->dbl_v[i-1] : x[i]; + } + } else if (type==2) { // nocb + ans->dbl_v[nx-1] = x[nx-1]; + for (int_fast64_t i=nx-2; i>=0; i--) { + ans->dbl_v[i] = ISNA(x[i]) ? ans->dbl_v[i+1] : x[i]; + } + } + if (verbose) sprintf(ans->message[0], "%s: took %.3fs\n", __func__, omp_get_wtime()-tic); +} + +void nafillInteger(int32_t *x, uint_fast64_t nx, unsigned int type, int32_t fill, ans_t *ans, bool verbose) { + double tic; + if (verbose) tic = omp_get_wtime(); + if (type==0) { // const + for (uint_fast64_t i=0; iint_v[i] = x[i]==NA_INTEGER ? fill : x[i]; + } + } else if (type==1) { // locf + ans->int_v[0] = x[0]; + for (uint_fast64_t i=1; iint_v[i] = x[i]==NA_INTEGER ? ans->int_v[i-1] : x[i]; + } + } else if (type==2) { // nocb + ans->int_v[nx-1] = x[nx-1]; + for (int_fast64_t i=nx-2; i>=0; i--) { + ans->int_v[i] = x[i]==NA_INTEGER ? ans->int_v[i+1] : x[i]; + } + } + if (verbose) sprintf(ans->message[0], "%s: took %.3fs\n", __func__, omp_get_wtime()-tic); +} + +SEXP nafillR(SEXP obj, SEXP type, SEXP fill, SEXP inplace, SEXP cols, SEXP verbose) { + int protecti=0; + if (!isLogical(verbose) || length(verbose)!=1 || LOGICAL(verbose)[0]==NA_LOGICAL) + error("verbose must be TRUE or FALSE"); + bool bverbose = LOGICAL(verbose)[0]; + + if (!xlength(obj)) return(obj); + + bool binplace = LOGICAL(inplace)[0]; + SEXP x = R_NilValue; + if (isVectorAtomic(obj)) { + if (binplace) { + error("'x' argument is atomic vector, in-place update is supported only for list/data.table"); + } else if (!(isReal(obj) || isInteger(obj))) { + error("'x' argument must be numeric type, or list/data.table of numeric types"); + } + x = PROTECT(allocVector(VECSXP, 1)); protecti++; // wrap into list + SET_VECTOR_ELT(x, 0, obj); + } else { + SEXP ricols = PROTECT(colnamesInt(obj, cols)); protecti++; // nafill cols=NULL which turns into seq_along(obj) + x = PROTECT(allocVector(VECSXP, length(ricols))); protecti++; + int *icols = INTEGER(ricols); + for (int i=0; i1) schedule(auto) num_threads(getDTthreads()) + for (R_len_t i=0; i