diff --git a/NEWS.md b/NEWS.md index e86960c8d..439ea4b4e 100644 --- a/NEWS.md +++ b/NEWS.md @@ -12,6 +12,8 @@ 4. `NA` in `between`'s `lower` and `upper` are now taken as missing bounds and return `TRUE` rather than than `NA`. This is now documented. +5. `.SDcols` in `[.data.table` now accepts `patterns`, similar to the existing usage in `melt.data.table`, for filtering columns according to a pattern, concisely and dynamically, [#1878](https://github.com/Rdatatable/data.table/issues/1878). Thanks to many people for pushing for this and @MichaelChirico for ultimately filing the PR. See `?data.table` for full details and examples. + #### BUG FIXES 1. Providing an `i` subset expression when attempting to delete a column correctly failed with helpful error, but when the column was missing too created a new column full of `NULL` values, [#3089](https://github.com/Rdatatable/data.table/issues/3089). Thanks to Michael Chirico for reporting. diff --git a/R/data.table.R b/R/data.table.R index 87dbb7e1f..a9139a5d9 100644 --- a/R/data.table.R +++ b/R/data.table.R @@ -1016,8 +1016,14 @@ chmatch2 <- function(x, table, nomatch=NA_integer_) { # .SDcols is of the format a:b .SDcols = eval(colsub, setattr(as.list(seq_along(x)), 'names', names(x)), parent.frame()) } else { - .SDcols = eval(colsub, parent.frame(), parent.frame()) + if (is.call(colsub) && colsub[[1L]] == "patterns") { + # each pattern gives a new filter condition, intersect the end result + .SDcols = Reduce(intersect, do_patterns(colsub, names(x))) + } else { + .SDcols = eval(colsub, parent.frame(), parent.frame()) + } } + if (!length(.SDcols)) return(null.data.table()) if (anyNA(.SDcols)) stop(".SDcols missing at the following indices: ", brackify(which(is.na(.SDcols)))) if (is.logical(.SDcols)) { diff --git a/R/fmelt.R b/R/fmelt.R index 8f55368b3..8e747a24c 100644 --- a/R/fmelt.R +++ b/R/fmelt.R @@ -34,18 +34,7 @@ melt.data.table <- function(data, id.vars, measure.vars, variable.name = "variab if 
(missing(measure.vars)) measure.vars = NULL measure.sub = substitute(measure.vars) if (is.call(measure.sub) && measure.sub[[1L]] == "patterns") { - measure.sub = as.list(measure.sub)[-1L] - idx = which(names(measure.sub) == "cols") - if (length(idx)) { - cols = eval(measure.sub[["cols"]], parent.frame()) - measure.sub = measure.sub[-idx] - } else cols = names(data) - pats = lapply(measure.sub, eval, parent.frame()) - measure.vars = patterns(pats, cols=cols) - # replace with lengths when R 3.2.0 dependency arrives - if (length(idx <- which(sapply(measure.vars, length) == 0L))) - stop('Pattern', if (length(idx) > 1L) 's', ' not found: [', - paste(pats[idx], collapse = ', '), ']') + measure.vars = do_patterns(measure.sub, names(data)) } if (is.list(measure.vars) && length(measure.vars) > 1L) { meas.nm = names(measure.vars) diff --git a/R/utils.R b/R/utils.R index 01b9cdf2d..0a83bb4e1 100644 --- a/R/utils.R +++ b/R/utils.R @@ -93,3 +93,23 @@ brackify = function(x) { if (length(x) > 10L) x = c(x[1:10], '...') sprintf('[%s]', paste(x, collapse = ', ')) } + +# patterns done via NSE in melt.data.table and .SDcols in `[.data.table` +do_patterns = function(pat_sub, all_cols) { + # received as substitute(patterns(...)) + pat_sub = as.list(pat_sub)[-1L] + # identify cols = argument if present + idx = which(names(pat_sub) == "cols") + if (length(idx)) { + cols = eval(pat_sub[["cols"]], parent.frame(2L)) + pat_sub = pat_sub[-idx] + } else cols = all_cols + pats = lapply(pat_sub, eval, parent.frame(2L)) + matched = patterns(pats, cols=cols) + # replace with lengths when R 3.2.0 dependency arrives + if (length(idx <- which(sapply(matched, length) == 0L))) + stop('Pattern', if (length(idx) > 1L) 's', ' not found: [', + paste(pats[idx], collapse = ', '), ']') + + return(matched) +} diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index c20448a64..46a45d3db 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -12468,6 +12468,43 @@ gs = groupingsets(d, j = 
sum(val), by = c("a", "b", "c"), character()), id=TRUE) test(1961, cb, gs) +# #3185 -- .SDcols = integer(0L) completes gracefully +DT = data.table(a = 1:10) +test(1962.1, DT[ , .SD, .SDcols = integer(0L)], data.table(NULL)) +test(1962.2, DT[ , .SD, .SDcols = character(0L)], data.table(NULL)) + +# #1878 -- patterns API in .SDcols +library(data.table) +DT = data.table( + i = 1:10, + c = c("a", "b", "c", "d", "e", "f", "g", "h", "i", "j"), + V1 = c(0.4, -0.1, -1.1, -2.6, -0.1, -1.3, 0.3, -2.1, -0.6, 0.9), + V2 = c(-0.1, -2.5, -1, -0.1, -0.5, -0.7, -1, -2.1, 2.7, -1.2), + V3 = c(1.1, -1.6, 0.7, 1.6, -1.4, 1, -0.6, 1.2, -0.8, 0.1), + V4 = c(1.3, -0.8, 2.3, -0.7, 0.5, 0.5, 0.2, 0.7, -1.4, 0.8), + V5 = c(-0.1, -0.5, 1.5, -0.5, 1.9, 0.2, -0.1, -0.7, -1.7, -0.9), + V6 = c(0.8, -1.3, -0.7, -0.3, 1.4, 0.7, 0.4, 0.3, -1.6, -1.3), + V7 = c(-0.1, 0.8, 0.7, -0.2, -2, 0.5, 0.4, -0.2, -1.2, -0.7), + V8 = c(0.7, -1, 1.3, 0.5, 0.2, 0.8, 0.6, -1.4, -2, -0.1), + V9 = c(0.2, -0.1, 1.2, -0.5, 1.4, 1, 0.2, 0.7, 0.4, 1.6), + V10 = c(0.8, 0.7, -1.2, -0.9, -0.6, 0.4, -2.3, 2.2, 0.5, -1.4) +) + +test(1963.1, DT[ , lapply(.SD, sum), .SDcols = patterns('^V')], + data.table(V1 = -6.3, V2 = -6.5, V3 = 1.3, V4 = 3.4, V5 = -0.9, + V6 = -1.6, V7 = -2, V8 = -0.4, V9 = 6.1, V10 = -1.8)) +# multiple patterns --> intersection of patterns +test(1963.2, DT[ , lapply(.SD, sum), .SDcols = patterns('^V[0-4]', '^V[5-9]')], + data.table(NULL)) +test(1963.3, DT[ , lapply(.SD, sum), .SDcols = patterns('^V[02468]', '^V[48]')], + data.table(V4 = 3.4, V8 = -0.4)) + +# also with !/- inversion +test(1963.4, DT[ , lapply(.SD, sum), .SDcols = !patterns('^c|i')], + data.table(V1 = -6.3, V2 = -6.5, V3 = 1.3, V4 = 3.4, V5 = -0.9, + V6 = -1.6, V7 = -2, V8 = -0.4, V9 = 6.1, V10 = -1.8)) + + ################################### # Add new tests above this line # diff --git a/man/data.table.Rd b/man/data.table.Rd index c9ca636ef..d93d9a1d1 100644 --- a/man/data.table.Rd +++ b/man/data.table.Rd @@ -141,7 +141,13 @@ 
data.table(\dots, keep.rownames=FALSE, check.names=FALSE, key=NULL, stringsAsFac \item{.SDcols}{ Specifies the columns of \code{x} to be included in the special symbol \code{\link{.SD}} which stands for \code{Subset of data.table}. May be character column names or numeric positions. This is useful for speed when applying a function through a subset of (possible very many) columns; e.g., \code{DT[, lapply(.SD, sum), by="x,y", .SDcols=301:350]}. - For convenient interactive use, the form \code{startcol:endcol} is also allowed (as in \code{by}), e.g., \code{DT[, lapply(.SD, sum), by=x:y, .SDcols=a:f]} + For convenient interactive use, the form \code{startcol:endcol} is also allowed (as in \code{by}), e.g., \code{DT[, lapply(.SD, sum), by=x:y, .SDcols=a:f]}. + + Inversion (column dropping instead of keeping) can be accomplished by prepending the argument with \code{!} or \code{-} (there's no difference between these), e.g. \code{.SDcols = !c('x', 'y')}. + + Finally, you can filter columns to include in \code{.SD} according to regular expressions via \code{.SDcols=patterns(regex1, regex2, ...)}. The included columns will be the \emph{intersection} of the columns identified by each pattern; pattern unions can easily be specified with \code{|} in a regex. You can also invert a pattern as usual with \code{.SDcols = !patterns(...)}. + + Empty \code{.SDcols} will return an empty \code{data.table}. } \item{verbose}{ \code{TRUE} turns on status and information messages to the console. Turn this on by default using \code{options(datatable.verbose=TRUE)}. The quantity and types of verbosity may be expanded in future. 
@@ -357,16 +363,18 @@ kDT[!.("a")] # not join kDT[!"a"] # same # more on special symbols, see also ?"special-symbols" -DT[.N] # last row -DT[, .N] # total number of rows in DT -DT[, .N, by=x] # number of rows in each group -DT[, .SD, .SDcols=x:y] # select columns 'x' and 'y' -DT[, .SD[1]] # first row of all columns -DT[, .SD[1], by=x] # first row of 'y' and 'v' for each group in 'x' -DT[, c(.N, lapply(.SD, sum)), by=x] # get rows *and* sum columns 'v' and 'y' by group -DT[, .I[1], by=x] # row number in DT corresponding to each group -DT[, grp := .GRP, by=x] # add a group counter column -X[, DT[.BY, y, on="x"], by=x] # join within each group +DT[.N] # last row +DT[, .N] # total number of rows in DT +DT[, .N, by=x] # number of rows in each group +DT[, .SD, .SDcols=x:y] # select columns 'x' through 'y' +DT[ , .SD, .SDcols = !x:y] # drop columns 'x' through 'y' +DT[ , .SD, .SDcols = patterns('^[xv]')] # select columns matching '^x' or '^v' +DT[, .SD[1]] # first row of all columns +DT[, .SD[1], by=x] # first row of 'y' and 'v' for each group in 'x' +DT[, c(.N, lapply(.SD, sum)), by=x] # get rows *and* sum columns 'v' and 'y' by group +DT[, .I[1], by=x] # row number in DT corresponding to each group +DT[, grp := .GRP, by=x] # add a group counter column +X[, DT[.BY, y, on="x"], by=x] # join within each group # add/update/delete by reference (see ?assign) print(DT[, z:=42L]) # add new column by reference