dcast takes multiple 'value.vars' and fun.agg. Closes #739, #716.

Rdatatable · Nov 14, 2014 · fc753c2 · fc753c2
1 parent ef61dcc
commit fc753c2
Show file tree

Hide file tree

Showing 4 changed files with 260 additions and 392 deletions.
diff --git a/R/fcast.R b/R/fcast.R
@@ -12,118 +12,190 @@ dcast.data.table <- function(data, formula, fun.aggregate = NULL, ..., margins =
     subset = NULL, fill = NULL, drop = TRUE, value.var = guess(data), verbose = getOption("datatable.verbose")) {
     if (!is.data.table(data)) stop("'data' must be a data.table.")
     if (anyDuplicated(names(data))) stop('data.table to cast must have unique column names')
-    is.formula <- function(x) class(x) == "formula"
-    strip <- function(x) gsub("[[:space:]]*", "", x)
-    if (is.formula(formula)) formula <- deparse(formula, 500)
-    if (is.character(formula)) {
-        ff <- strsplit(strip(formula), "~", fixed=TRUE)[[1]]
-        if (length(ff) > 2)
-            stop("Cast formula length is > 2, must be = 2.")
-        ff <- strsplit(ff, "+", fixed=TRUE)
-        setattr(ff, 'names', c("ll", "rr"))
-        ff <- lapply(ff, function(x) x[x != "."])
-        ff_ <- unlist(ff, use.names=FALSE)
-        replace_dots <- function(x) {
-            if (!is.list(x)) x = as.list(x)
-            for (i in seq_along(x)) {
-                if (x[[i]] == "...") 
-                    x[[i]] = setdiff(names(data), c(value.var, ff_))
+    drop = as.logical(drop[1])
+    if (is.na(drop)) stop("'drop' must be logical TRUE/FALSE")
+    # formula - UPDATE: formula handles expressions as well
+    deparser <- function(call, allvars) {
+        lapply(call, function(this) {
+            if (is.call(this)) {
+                if (this[[1L]] == quote(`+`))
+                    unlist(deparser(as.list(this)[-1L], allvars))
+                else if (this[[1L]] == quote(`.`))
+                    this[[2L]]
+                else stop("Cast formula should be of the form LHS ~ RHS, with '+' being the only operator allowed to separate items in LHS and RHS, for e.g., a + b ~ c. Use .() to construct expressions, for e.g., a + b ~ c + .(paste(d, 1:5)).")
+            } else if (is.name(this)) {
+                if (this == quote(`...`)) {
+                    subvars = setdiff(names(data), allvars)
+                    lapply(subvars, as.name)
+                } else if (this != quote(`.`)) this
             }
-            unlist(x)
-        }
-        ff <- lapply(ff, replace_dots)
-    } else stop("Invalid formula.")
-    ff_ <- unlist(ff, use.names=FALSE)
-    if (length(is_wrong <- which(is.na(chmatch(ff_, names(data))))) > 0) stop("Column '", ff_[is_wrong[1]], "' not found.")
-    if (length(ff$ll) == 0) stop("LHS of formula evaluates to 'character(0)', invalid formula.")
-    if (length(value.var) != 1 || !is.character(value.var)) stop("'value.var' must be a character vector of length 1.")
-    if (is.na(chmatch(value.var, names(data)))) stop("'value.var' column '", value.var, "' not found.")
-    if (any(unlist(lapply(as.list(data)[ff_], class), use.names=FALSE) == "list")) 
-        stop("Only 'value.var' column maybe of type 'list'. This may change in the future.")
-    drop <- as.logical(drop[1])
-    if (is.na(drop)) stop("'drop' must be TRUE/FALSE")
-
-    # subset
-    m <- as.list(match.call()[-1])
-    subset <- m$subset[[2]]
+        })
+    }
+    if (is.character(formula)) formula = as.formula(formula)
+    if (class(formula) != "formula" || length(formula) != 3L)
+        stop("Invalid formula. Cast formula should be of the form LHS ~ RHS, for e.g., a + b ~ c.")
+    expr = as.list(formula)[-1L]
+    vars_ = unlist(lapply(expr, function(x) { x=all.vars(x); x[!x %chin% c(".", "...")] }))
+    expr = deparser(expr, c(vars_, value.var))
+    expr_ = unlist(expr)
+    setattr(expr, 'names', c("lhs", "rhs"))
+    # value.var - UPDATE: more than one value.var possible now.
+    if (!is.character(value.var)) 
+        stop("value.var must be a character vector (column names) of positive length (> 0).")
+    value.var = unique(value.var)
+    iswrong = which(!value.var %in% names(data))
+    if (length(iswrong))
+        stop("value.var values [", paste(value.var[iswrong], collapse=", "), "] are not found in 'data'.")
+    val_ = lapply(value.var, as.name)
+    # get 'dat'
+    dat = lapply(c(expr_, val_), function(x) {
+            val = eval(x, data, parent.frame())
+            if (is.list(val)) stop("Only 'value.var' column maybe of type list")
+            if (is.function(val)) stop("Column [", deparse(x), "] not found or of unknown type.")
+            val
+        })
+    # Have to take care of duplicate names, and provide names for expression columns properly.
+    allnames = make.unique(sapply(expr_, function(x) all.names(x, max.names=1L)), sep="_")
+    lhsnames = head(allnames, length(expr[[1L]]))
+    rhsnames = tail(allnames, -length(expr[[1L]]))
+    if (any(value.var %chin% allnames)) {
+        value.var = tail(make.unique(c(allnames, value.var)), -length(allnames))
+    }
+    setattr(dat, 'names', c(allnames, value.var))
+    setDT(dat)
+    # subset - UPDATE: subset on `dat`, not `data`; `data` may've much more columns not in formula.
+    m <- as.list(match.call()[-1L])
+    subset <- m[["subset"]][[2L]]
     if (!is.null(subset)) {
         if (is.name(subset)) subset = as.call(list(quote(`(`), subset))
-        data = data[eval(subset, data, parent.frame()), unique(c(ff_, value.var)), with=FALSE]
+        idx = which(eval(subset, data, parent.frame())) # any advantage thro' secondary keys?
+        dat = .Call(CsubsetDT, dat, idx, seq_along(dat))
     }
-    if (nrow(data) == 0L || ncol(data) == 0L) stop("Can't 'cast' on an empty data.table")
-
-    # set 'fun.aggregate = length' if max group size > 1
-    fun.null=FALSE
-    if (is.null(fun.aggregate)) {
-        fun.null=TRUE
-        oo = forderv(data, by=ff_, retGrp=TRUE)
+    if (!nrow(dat) || !ncol(dat)) stop("Can not cast an empty data.table")
+    # fun.aggregate - UPDATE: more than one fun.aggregate possible now.
+    # if fun.aggregate is NULL, set to 'length' if maxgrp > 1L (to be consistent with reshape2::dcast)
+    # E.g., fun = quote(funs(length, .(mean), bla=.(function(x) as.numeric(median(x, na.rm=FALSE)))))
+    # E.g., fun = quote(funs(length, .(mean), .(function(x) 1L)))
+    # E.g., fun = quote(funs(length, .(mean, vars="d")))
+    fun.call = m[["fun.aggregate"]]
+    fill.default = NULL
+    if (is.null(fun.call)) {
+        oo = forderv(dat, by=c(allnames), retGrp=TRUE)
         if (attr(oo, 'maxgrpn') > 1L) {
             message("Aggregate function missing, defaulting to 'length'")
-            fun.aggregate <- length
-            m[["fun.aggregate"]] = quote(length)
+            fun.call = quote(length)
         }
     }
-    fill.default <- NULL
-    if (!is.null(fun.aggregate)) { # construct the 'call'
-        fill.default = fun.aggregate(data[[value.var]][0], ...)
-        if (!length(fill.default) && (is.null(fill) || !length(fill)))
-            stop("Aggregating function provided to argument 'fun.aggregate' should always return a length 1 vector, but returns 0-length value for fun.aggregate(", typeof(data[[value.var]]), "(0)).", " This value will have to be used to fill missing combinations, if any, and therefore can not be of length 0. Either override by setting the 'fill' argument explicitly or modify your function to handle this case appropriately.")
-        args <- c("data", "formula", "margins", "subset", "fill", "value.var", "verbose", "drop")
-        m <- m[setdiff(names(m), args)]
-        .CASTcall = as.call(c(m[1], as.name(value.var), m[-1])) # issues/713
-        .CASTcall = as.call(c(as.name("list"), setattr(list(.CASTcall), 'names', value.var)))
-        # workaround until #5191 (issues/497) is fixed
-        if (length(intersect(value.var, ff_))) 
-            .CASTcall = as.call(list(as.name("{"), as.name(".SD"), .CASTcall))
-    }
-    # special case
-    if (length(ff$rr) == 0) {
-        if (is.null(fun.aggregate))
-            ans = data[, c(ff$ll, value.var), with=FALSE]
-        else {
-            # workaround until #5191 (issues/497) is fixed
-            if (length(intersect(value.var, ff_))) ans = data[, eval(.CASTcall), by=c(ff$ll), .SDcols=value.var]
-            else ans = data[, eval(.CASTcall), by=c(ff$ll)]
+    if (!is.null(fun.call)) {
+        cast_funs <- function(fun, allvars, ...) {
+            funname = names(fun); fun = fun[[1L]]
+            dots = list(...)
+            fsymbol <- function(fun, vars) {
+                allfuns = lapply(vars, function(x) {
+                    expr = list(fun, as.name(x))
+                    if (length(dots)) expr = c(expr, dots)
+                    as.call(expr)
+                })
+                setattr(allfuns, 'names', paste(if (is.null(funname) || funname == "") 
+                    all.names(fun)[1L] else funname, vars, sep="_"))
+            }
+            if (is.name(fun)) fun = fsymbol(fun, allvars)
+            else if (is.call(fun)) {
+                if (fun[[1L]] == ".") {
+                    thisvars = eval(fun[["vars"]])
+                    fun = fun[[2L]]
+                } else thisvars = NULL
+                if (is.null(thisvars)) thisvars = allvars
+                else if (length(absent <- which(!thisvars %chin% allvars)))
+                    stop("Columns specified in 'vars' argument should be present in 'value.var'. Values [", paste(thisvars[absent], collapse=", "), "] are not present.")
+                fun = fsymbol(fun, thisvars)
+            } else stop("Invalid format for function in fun.aggregate. Please see DETAILS and EXAMPLE sections of ?dcast.data.table.")
+            fun
         }
-        if (anyDuplicated(names(ans))) {
-            message("Duplicate column names found in cast data.table. Setting unique names using 'make.unique'")   
-            setnames(ans, make.unique(names(ans)))
+        if (is.call(fun.call) && fun.call[[1L]] == "eval") fun.call = eval(fun.call[[2L]], parent.frame(), parent.frame())
+        fun.list = if (is.call(fun.call) && fun.call[[1L]] == "funs") as.list(fun.call)[-1L] else list(fun.call)
+        fun.list = unlist(lapply(seq_along(fun.list), function(i) cast_funs(fun.list[i], value.var, ...)), use.names=TRUE)
+        fun.call = as.call(c(quote(list), fun.list))
+
+        # fill argument : UPDATE: much easier way to get this done.
+        errmsg = "Aggregating function(s) should take vector inputs and return a single value (length=1). However, function(s) returns length!=1. This value will have to be used to fill any missing combinations, and therefore must be length=1. Either override by setting the 'fill' argument explicitly or modify your function to handle this case appropriately."
+        if (is.null(fill)) {
+            tryCatch(fill.default <- dat[0][, eval(fun.call)], warning = function(x) stop(errmsg, call.=FALSE))
+            if (nrow(fill.default) != 1L) stop(errmsg, call.=FALSE)
         }
-        if (!identical(key(ans), ff$ll)) setkeyv(ans, names(ans)[seq_along(ff$ll)])
-        return(ans)
-    }
-    # aggregation moved to R now that 'adhoc-by' is crazy fast!
-    if (!is.null(fun.aggregate)) {
-        if (length(intersect(value.var, ff_))) {
-            data = data[, eval(.CASTcall), by=c(ff_), .SDcols=value.var]
-            value.var = tail(make.unique(names(data)), 1L)
-            setnames(data, ncol(data), value.var)
+        if (!any(value.var %chin% allnames)) {
+            dat = dat[, eval(fun.call), by=c(allnames)]
+        } else {
+            dat = dat[, { .SD; eval(fun.call) }, by=c(allnames), .SDcols = value.var]
         }
-        else data = data[, eval(.CASTcall), by=c(ff_)]
-        setkeyv(data, ff_)
-        # issues/693
-        fun_agg_chk <- function(x) {
-            # sorted now, 'forderv' should be as fast as uniqlist+uniqlengths
-            oo = forderv(data, by=key(data), retGrp=TRUE)
-            attr(oo, 'maxgrpn') > 1L
+    }
+    order_ <- function(x) {
+        o = forderv(x, retGrp=TRUE, sort=TRUE)
+        idx = attr(o, 'starts')
+        if (!length(o)) o = seq_along(x)
+        o[idx] # subsetVector retains attributes, using R's subset for now
+    }
+
+    cj_uniq <- function(DT) {
+        do.call("CJ", lapply(DT, function(x) 
+            if (is.factor(x)) {
+                xint = seq_along(levels(x))
+                setattr(xint, 'levels', levels(x))
+                setattr(xint, 'class', class(x))
+            } else .Call(CsubsetVector, x, order_(x))
+    ))}
+    valnames = setdiff(names(dat), allnames)
+    # 'dat' != 'data'? then setkey to speed things up (slightly), else ad-hoc (for now). Still very fast!
+    if (!is.null(fun.call) || !is.null(subset)) 
+        setkeyv(dat, allnames)
+    if (length(rhsnames)) {
+        lhs = shallow(dat, lhsnames); rhs = shallow(dat, rhsnames); val = shallow(dat, valnames)
+        # handle drop=TRUE/FALSE - Update: Logic moved to R, AND faster than previous version. Take that... old me :-).
+        if (drop) {
+            map = setDT(lapply(list(lhsnames, rhsnames), function(by) frankv(dat, by=by, ties.method="dense")))
+            maporder = lapply(map, order_)
+            mapunique = lapply(seq_along(map), function(i) .Call(CsubsetVector, map[[i]], maporder[[i]]))
+            lhs = .Call(CsubsetDT, lhs, maporder[[1L]], seq_along(lhs))
+            rhs = .Call(CsubsetDT, rhs, maporder[[2L]], seq_along(rhs))
+        } else {
+            lhs_ = cj_uniq(lhs); rhs_ = cj_uniq(rhs)
+            map = vector("list", 2L)
+            .Call(Csetlistelt, map, 1L, lhs_[lhs, which=TRUE])
+            .Call(Csetlistelt, map, 2L, rhs_[rhs, which=TRUE])
+            setDT(map)
+            mapunique = vector("list", 2L)
+            .Call(Csetlistelt, mapunique, 1L, seq_len(nrow(lhs_)))
+            .Call(Csetlistelt, mapunique, 2L, seq_len(nrow(rhs_)))
+            lhs = lhs_; rhs = rhs_
         }
-        if (!fun.null && fun_agg_chk(data))
-            stop("Aggregating function provided to argument 'fun.aggregate' should always return a length 1 vector for each group, but returns length != 1 for atleast one group. Please have a look at the DETAILS section of ?dcast.data.table ")
+        maplen = sapply(mapunique, length)
+        idx = do.call("CJ", mapunique)[map, I := .I][["I"]] # TO DO: move this to C and avoid materialising the Cross Join.
+        ans = .Call("Cfcast", lhs, val, maplen[[1L]], maplen[[2L]], idx, fill, fill.default, is.null(fun.call))
+        allcols = do.call("paste", c(rhs, sep="_"))
+        if (length(valnames) > 1L)
+            allcols = do.call("paste", c(setcolorder(CJ(valnames, allcols, sorted=FALSE), 2:1), sep="_"))
+        setattr(ans, 'names', c(lhsnames, allcols))
+        setDT(ans); setattr(ans, 'sorted', lhsnames)
     } else {
-        if (is.null(subset))
-            data = data[, unique(c(ff_, value.var)), with=FALSE] # data is untouched so far. subset only required columns
-        if (length(oo)) .Call(Creorder, data, oo)
-        setattr(data, 'sorted', ff_)
-    }
-    .CASTenv = new.env(parent=parent.frame())
-    assign("forder", forderv, .CASTenv)
-    assign("CJ", CJ, .CASTenv)
-    ans <- .Call("Cfcast", data, ff$ll, ff$rr, value.var, fill, fill.default, is.null(fun.aggregate), .CASTenv, drop)
-    setDT(ans)
-    if (anyDuplicated(names(ans))) {
-        message("Duplicate column names found in cast data.table. Setting unique names using 'make.unique'")
-        setnames(ans, make.unique(names(ans)))
+        # formula is of the form x + y ~ . (rare case)
+        if (drop) {
+            if (is.null(subset) && is.null(fun.call)) {
+                dat = copy(dat) # can't be avoided
+                setkeyv(dat, lhsnames)
+            }
+            ans = dat
+        } else {
+            lhs = shallow(dat, lhsnames)
+            val = shallow(dat, valnames)
+            lhs_ = cj_uniq(lhs)
+            idx = lhs_[lhs, I := .I][["I"]]
+            lhs_[, I := NULL]
+            ans = .Call("Cfcast", lhs_, val, nrow(lhs_), 1L, idx, fill, fill.default, is.null(fun.call))
+            setDT(ans); setattr(ans, 'sorted', lhsnames)
+            setnames(ans, c(lhsnames, valnames))
+        }
+        if (length(valnames) == 1L)
+            setnames(ans, valnames, value.var)
     }
-    setattr(ans, 'sorted', names(ans)[seq_along(ff$ll)])
-    ans
+    return (ans)
 }
diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw
@@ -3427,8 +3427,8 @@ set.seed(1)
 x <- data.table(a=rep(1:5, each=5), b=runif(25))
 ### adding all extra arguments for no verbose during "test.data.table()" to all dcast.data.table tests
 test(1150.1, dcast.data.table(x, " a~ . ", value.var="b", fun=length), data.table(a=1:5, b=5L, key="a"))
-test(1150.2, dcast.data.table(x, "a ~  c ", value.var="b"), error="Column 'c' not found")
-test(1150.3, dcast.data.table(x, a ~  a, value.var="c"), error="'value.var' column 'c' not found")
+test(1150.2, dcast.data.table(x, "a ~  c ", value.var="b"), error="not found or of unknown type")
+test(1150.3, dcast.data.table(x, a ~  a, value.var="c"), error="are not found in 'data'.")
 
 # test uniqlengths
 set.seed(45)
@@ -3746,7 +3746,11 @@ if ("package:reshape2" %in% search()) {
                  a3=sample(letters[1:3], 10, TRUE), # no factor
                  b=factor(sample(tail(letters, 5), 10, replace=TRUE)))
     dt <- as.data.table(df)
-    test(1198.3, setkey(setDT(dcast(df, a1+a2+a3~b, drop=FALSE, value.var="b")), a1,a2,a3), dcast.data.table(dt, a1+a2+a3~b, drop=FALSE, value.var="b"))
+    ans1 <- setkey(setDT(dcast(df, a1+a2+a3~b, drop=FALSE, value.var="b")), a1,a2,a3)
+    # dcast.data.table preserves attributes wherever possible, even "factors"...
+    ans2 <- dcast.data.table(dt, a1+a2+a3~b, drop=FALSE, value.var="b")
+    ans2[, c("v", "x", "y", "z") := lapply(.SD, as.character), .SDcols=c("v", "x", "y", "z")]
+    test(1198.3, ans1, ans2)
 }
 
 DT = data.table(id=1:2, val1=6:1, val2=6:1)   # 5380
@@ -4531,7 +4535,7 @@ test(1294.31, dt[, e := list("bla2")]$e, rep("bla2", 3))
 
 # FR #5357, when LHS evaluates to integer(0), provide warning and return dt, not an error.
 dt = data.table(a = 1:5, b1 = 1:5, b2 = 1:5)
-test(1295, dt[, grep("c", names(d)) := NULL], dt, warning="length(LHS) = 0, meaning no columns to delete or assign RHS to")
+test(1295, dt[, grep("c", names(dt)) := NULL], dt, warning="length(LHS) = 0, meaning no columns to delete or assign RHS to")
 
 # Updating logical column in one-row DT (corruption of new R 3.1 internal globals for TRUE, FALSE and NA)
 DT = data.table(a=1:6, b=c(TRUE,FALSE))
@@ -4633,7 +4637,7 @@ test(1313.22, DT[, list(y=max(y, na.rm=TRUE)), by=x], DT[c(5,10)])
 
 # bug git #693 - dcast.data.table error message improvement:
 dt <- data.table(x=c(1,1), y=c(2,2), z = 3:4)
-test(1314, dcast.data.table(dt, x ~ y, value.var="z", fun.aggregate=identity), error="Aggregating function provided to argument 'fun.aggregate' should always return a length 1 vector")
+test(1314, dcast.data.table(dt, x ~ y, value.var="z", fun.aggregate=identity), error="Aggregating function(s) should take vector inputs and return a single value ")
 
 # bug #688 - preserving attributes
 DT = data.table(id = c(1,1,2,2), ty = c("a","b","a","b"), da = as.Date("2014-06-20"))
@@ -4832,7 +4836,7 @@ test(1346.2, dcast.data.table(DT, id ~ k, value.var="v", subset=.(bla), fun.aggr
 
 # issues/715
 DT <- data.table(id=rep(1:2, c(3,2)), k=c(letters[1:3], letters[1:2]), v=1:5)
-test(1347.1, dcast.data.table(DT, id ~ k, fun.aggregate=last, value.var="v"), error="Aggregating function provided to argument 'fun.aggregate' should always return a length 1 vector")
+test(1347.1, dcast.data.table(DT, id ~ k, fun.aggregate=last, value.var="v"), error="Aggregating function(s) should take vector inputs and return a single value")
 test(1347.2, dcast.data.table(DT, id ~ k, fun.aggregate=last, value.var="v", fill=NA_integer_), data.table(id=1:2, a=c(1L, 4L), b=c(2L,5L), c=c(3L,NA_integer_), key="id"))
 
 # .N now available in i

diff --git a/src/data.table.h b/src/data.table.h
@@ -34,6 +34,7 @@ SEXP reorder(SEXP x, SEXP order);
 
 // fcast.c
 SEXP vec_init(R_len_t n, SEXP val);
+SEXP coerce_to_char(SEXP s, SEXP env);
 
 // vecseq.c
 SEXP vecseq(SEXP x, SEXP len, SEXP clamp);
@@ -62,9 +63,6 @@ SEXP dogroups(SEXP dt, SEXP dtcols, SEXP groups, SEXP grpcols, SEXP jiscols, SEX
 // bmerge.c
 SEXP bmerge(SEXP left, SEXP right, SEXP leftcols, SEXP rightcols, SEXP isorted, SEXP xoArg, SEXP rollarg, SEXP rollends, SEXP nomatch, SEXP retFirst, SEXP retLength, SEXP allLen1);
 
-// fcast.c
-SEXP coerce_to_char(SEXP s, SEXP env);
-
 // frank.c
 SEXP dt_na(SEXP x, SEXP cols);