diff --git a/R/fcast.R b/R/fcast.R index 011b0f847..c9a854135 100644 --- a/R/fcast.R +++ b/R/fcast.R @@ -93,8 +93,8 @@ aggregate_funs <- function(funs, vals, sep="_", ...) { dcast.data.table <- function(data, formula, fun.aggregate = NULL, sep = "_", ..., margins = NULL, subset = NULL, fill = NULL, drop = TRUE, value.var = guess(data), verbose = getOption("datatable.verbose")) { if (!is.data.table(data)) stop("'data' must be a data.table.") if (anyDuplicated(names(data))) stop('data.table to cast must have unique column names') - drop = as.logical(drop[1]) - if (is.na(drop)) stop("'drop' must be logical TRUE/FALSE") + drop = as.logical(rep(drop, length.out=2L)) + if (any(is.na(drop))) stop("'drop' must be logical TRUE/FALSE") lvals = value_vars(value.var, names(data)) valnames = unique(unlist(lvals)) lvars = check_formula(formula, names(data), valnames) @@ -177,14 +177,15 @@ dcast.data.table <- function(data, formula, fun.aggregate = NULL, sep = "_", ... if (length(rhsnames)) { lhs = shallow(dat, lhsnames); rhs = shallow(dat, rhsnames); val = shallow(dat, valnames) # handle drop=TRUE/FALSE - Update: Logic moved to R, AND faster than previous version. Take that... old me :-). - if (drop) { + if (all(drop)) { map = setDT(lapply(list(lhsnames, rhsnames), function(cols) frankv(dat, cols=cols, ties.method="dense"))) maporder = lapply(map, order_) mapunique = lapply(seq_along(map), function(i) .Call(CsubsetVector, map[[i]], maporder[[i]])) lhs = .Call(CsubsetDT, lhs, maporder[[1L]], seq_along(lhs)) rhs = .Call(CsubsetDT, rhs, maporder[[2L]], seq_along(rhs)) } else { - lhs_ = cj_uniq(lhs); rhs_ = cj_uniq(rhs) + lhs_ = if (!drop[1L]) cj_uniq(lhs) else setkey(unique(lhs, by=names(lhs))) + rhs_ = if (!drop[2L]) cj_uniq(rhs) else setkey(unique(rhs, by=names(rhs))) map = vector("list", 2L) .Call(Csetlistelt, map, 1L, lhs_[lhs, which=TRUE]) .Call(Csetlistelt, map, 2L, rhs_[rhs, which=TRUE]) diff --git a/README.md b/README.md index d53f69c08..40225f993 100644 --- a/README.md +++ b/README.md @@ -51,6 +51,8 @@ 18. `merge.data.table` by default also checks for common key columns between the two `data.table`s before resulting in error when `by` or `by.x, by.y` arguments are not provided, [#1517](https://github.com/Rdatatable/data.table/issues/1517). Thanks @DavidArenburg. + 19. `dcast.data.table` now allows `drop = c(FALSE, TRUE)` and `drop = c(TRUE, FALSE)`. The former only fills all missing combinations of formula LHS, where as the latter fills only all missing combinations of formula RHS. Thanks to Ananda Mahto for [this SO post](http://stackoverflow.com/q/34830908/559784) and to Jaap for filing [#1512](https://github.com/Rdatatable/data.table/issues/1512). + #### BUG FIXES 1. Now compiles and runs on IBM AIX gcc. Thanks to Vinh Nguyen for investigation and testing, [#1351](https://github.com/Rdatatable/data.table/issues/1351). diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index 5a1aa4246..e3688c222 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -7368,6 +7368,15 @@ dt1 = data.table(x=c(1,1,2), y=1:3) dt2 = data.table(x=c(2,3,4), z=4:6) test(1595, merge(dt1,dt2), merge(dt1,dt2, by="x")) +# FR 1512, drop argument for dcast.data.table +DT <- data.table(v1 = c(1.1, 1.1, 1.1, 2.2, 2.2, 2.2), + v2 = factor(c(1L, 1L, 1L, 3L, 3L, 3L), levels=1:3), + v3 = factor(c(2L, 3L, 5L, 1L, 2L, 6L), levels=1:6), + v4 = c(3L, 2L, 2L, 5L, 4L, 3L)) +ans1 <- dcast(DT, v1+v2~v3, value.var="v4", drop=FALSE) +test(1596.1, dcast(DT, v1+v2~v3, value.var="v4", drop=c(FALSE, TRUE)), ans1[, -6, with=FALSE]) +test(1596.2, dcast(DT, v1+v2~v3, value.var="v4", drop=c(TRUE, FALSE)), ans1[c(1,6)]) + ########################## # TODO: Tests involving GForce functions needs to be run with optimisation level 1 and 2, so that both functions are tested all the time. diff --git a/man/dcast.data.table.Rd b/man/dcast.data.table.Rd index 7bfd127bf..d859ea961 100644 --- a/man/dcast.data.table.Rd +++ b/man/dcast.data.table.Rd @@ -28,7 +28,10 @@ \item{margins}{Not implemented yet. Should take variable names to compute margins on. A value of \code{TRUE} would compute all margins.} \item{subset}{Specified if casting should be done on subset of the data. Ex: subset = .(col1 <= 5) or subset = .(variable != "January").} \item{fill}{Value to fill missing cells with. If \code{fun.aggregate} is present, takes the value by applying the function on 0-length vector.} - \item{drop}{\code{FALSE} will cast by including all missing combinations.} + \item{drop}{\code{FALSE} will cast by including all missing combinations. + + \bold{NEW:} Following \href{https://github.com/Rdatatable/data.table/issues/1512}{#1512}, \code{c(FALSE, TRUE)} will only include all missing combinations of formula \code{LHS}. And \code{c(TRUE, FALSE)} will only include all missing combinations of formula RHS. See examples.} + \item{value.var}{Name of the column whose values will be filled to cast. Function `guess()` tries to, well, guess this column automatically, if none is provided. \bold{NEW}: it is possible to cast multiple \code{value.var} columns simultaneously now. See \code{examples}.} @@ -68,6 +71,17 @@ dcast(DT, diet+chick ~ time, drop=FALSE, fill=0) # using subset dcast(DT, chick ~ time, fun=mean, subset=.(time < 10 & chick < 20)) +# drop argument, #1512 +DT <- data.table(v1 = c(1.1, 1.1, 1.1, 2.2, 2.2, 2.2), + v2 = factor(c(1L, 1L, 1L, 3L, 3L, 3L), levels=1:3), + v3 = factor(c(2L, 3L, 5L, 1L, 2L, 6L), levels=1:6), + v4 = c(3L, 2L, 2L, 5L, 4L, 3L)) +# drop=TRUE +dcast(DT, v1 + v2 ~ v3) # default is drop=TRUE +dcast(DT, v1 + v2 ~ v3, drop=FALSE) # all missing combinations of both LHS and RHS +dcast(DT, v1 + v2 ~ v3, drop=c(FALSE, TRUE)) # all missing combinations of only LHS +dcast(DT, v1 + v2 ~ v3, drop=c(TRUE, FALSE)) # all missing combinations of only RHS + \dontrun{ # benchmark against reshape2's dcast, minimum of 3 runs set.seed(45)