From 54c6dc494e8cf15eeb55cb83762cd57ad6a4ee21 Mon Sep 17 00:00:00 2001 From: Michael Chirico Date: Thu, 18 Jan 2018 08:31:50 +0800 Subject: [PATCH] Closes #1547 -- improve ability to name value columns with melt (#2568) --- NEWS.md | 2 ++ R/fmelt.R | 23 ++++++++++++++++++++--- inst/tests/tests.Rraw | 36 ++++++++++++++++++++++++++++++++++++ man/melt.data.table.Rd | 26 ++++++++++++++------------ 4 files changed, 72 insertions(+), 15 deletions(-) diff --git a/NEWS.md b/NEWS.md index 2d6c56804..61764961d 100644 --- a/NEWS.md +++ b/NEWS.md @@ -61,6 +61,8 @@ 13. `unique(DT)` now returns `DT` early when there are no duplicates to save RAM, [#2013](https://github.com/Rdatatable/data.table/issues/2013). Thanks to Michael Chirico for the PR. +14. `melt.data.table` now offers friendlier functionality for providing `value.name` for `list` input to `measure.vars`, [#1547](https://github.com/Rdatatable/data.table/issues/1547). Thanks @MichaelChirico and @franknarf1 for the suggestion and use cases, @jangorecki and @mrdwab for implementation feedback, and @MichaelChirico for ultimate implementation. + #### BUG FIXES 1. The new quote rules handles this single field `"Our Stock Screen Delivers an Israeli Software Company (MNDO, CTCH)<\/a> SmallCapInvestor.com - Thu, May 19, 2011 10:02 AM EDT<\/cite><\/div>Yesterday in \""Google, But for Finding diff --git a/R/fmelt.R b/R/fmelt.R index 813a6992e..4aaf0e262 100644 --- a/R/fmelt.R +++ b/R/fmelt.R @@ -7,7 +7,9 @@ melt <- function(data, ..., na.rm = FALSE, value.name = "value") { } patterns <- function(..., cols=character(0)) { - p = unlist(list(...), use.names=FALSE) + # if ... 
has no names, names(list(...)) will be ""; + # this assures they'll be NULL instead + p = unlist(list(...), use.names = any(nzchar(names(...)))) if (!is.character(p)) stop("Input patterns must be of type character.") lapply(p, grep, cols) @@ -31,8 +33,23 @@ melt.data.table <- function(data, id.vars, measure.vars, variable.name = "variab measure.vars = patterns(pats, cols=cols) } if (is.list(measure.vars) && length(measure.vars) > 1L) { - if (length(value.name) == 1L) - value.name = paste(value.name, seq_along(measure.vars), sep="") + meas.nm = names(measure.vars) + if (is.null(meas.nm)) { + # user-provided or default stub + if (length(value.name) == 1L) { + value.name = paste0(value.name, seq_along(measure.vars)) + } + } else { + if (length(value.name) > 1L) { + warning("'value.name' provided in both 'measure.vars'", + "and 'value.name argument'; value provided in", + "'measure.vars' is given precedence.") + } + if (any(is.na(meas.nm)) || !all(nzchar(meas.nm))) { + stop("Please provide a name to each element of 'measure.vars'.") + } + value.name = meas.nm + } } ans <- .Call(Cfmelt, data, id.vars, measure.vars, as.logical(variable.factor), as.logical(value.factor), diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index 0f268c504..ccc2a615f 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -11277,6 +11277,42 @@ cat(data, file=(f<-tempfile()), sep="\n") test(1865, fread(f, header=FALSE), error="Too many fields.*Read all 7 expected.*but more are present.*a,b,c,d,e,f,g,") unlink(f) +# "Natural" provision of value.name in measure.vars list, #1547 and #2551 +DT = data.table( + meas1_jan = 0.45, meas1_feb = 0.38, meas1_mar = 0.62, + meas2_jan = 0.42, meas2_feb = 0.48, meas2_mar = 0.46, + meas3_jan = 0.54, meas3_feb = 0.47 +) +DTout = data.table( + variable = factor(1:3), + jan = c(0.45, 0.42, 0.54), + feb = c(0.38, 0.48, 0.47), + mar = c(0.62, 0.46, NA) +) +test(1866.1, melt(DT, measure.vars = patterns(jan="_jan", feb="_feb", mar="_mar")), DTout) 
+mvlist = list( + jan = sprintf('meas%d_jan', 1:3), + feb = sprintf('meas%d_feb', 1:3), + mar = sprintf('meas%d_mar', 1:2) +) +test(1866.2, melt(DT, measure.vars = mvlist), DTout) +test(1866.3, melt(DT, measure.vars = mvlist, value.name = c('a', 'b', 'c')), + DTout, warning = 'value.name.*given precedence') +names(mvlist) = NULL +names(mvlist)[1L] = 'jan' # NA names +test(1866.4, melt(DT, measure.vars = mvlist), error = 'Please provide a name') +names(mvlist) = NULL +names(mvlist) = c('jan', '', '') #partially-missing names +test(1866.5, melt(DT, measure.vars = mvlist), error = 'Please provide a name') +# previously untested behavior used in ?patterns +DT = data.table(x1=1:5, x2=6:10, y1=letters[1:5], y2=letters[6:10]) +DTout = data.table( + variable = factor(c(1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L)), + value1 = 1:10, + value2 = c("a", "b", "c", "d", "e", "f", "g", "h", "i", "j") +) +test(1866.6, melt(DT, measure.vars = patterns("^x", "^y", cols=names(DT))), DTout) + ########################## diff --git a/man/melt.data.table.Rd b/man/melt.data.table.Rd index e02914d06..84adfbd10 100644 --- a/man/melt.data.table.Rd +++ b/man/melt.data.table.Rd @@ -23,18 +23,18 @@ load \code{reshape2} package \emph{before} loading \code{data.table}. \item{id.vars}{vector of id variables. Can be integer (corresponding id column numbers) or character (id column names) vector. If missing, all non-measure columns will be assigned to it. If integer, must be positive; see Details. } -\item{measure.vars}{vector of measure variables. Can be integer (corresponding -measure column numbers) or character (measure column names) vector. If missing, -all non-id columns will be assigned to it. - -\code{measure.vars} also now accepts a list of character/integer vectors to -melt into multiple columns - i.e., melt into more than one \code{value} columns -simultaneously. Use \code{\link{patterns}} to provide multiple patterns -conveniently. 
See also \code{Examples}.} -\item{variable.name}{name for the measured variable names column. The default -name is 'variable'.} -\item{value.name}{name for the molten data values column. The default name is -'value'.} +\item{measure.vars}{Measure variables for \code{melt}ing. Can be missing, vector, list, or pattern-based. + + \itemize{ + \item{ When missing, \code{measure.vars} will become all columns outside \code{id.vars}. } + \item{ Vector can be \code{integer} (implying column numbers) or \code{character} (column names). } + \item{ \code{list} is a generalization of the vector version -- each element of the list (which should be \code{integer} or \code{character} as above) will become a \code{melt}ed column. } + \item{ Pattern-based column matching can be achieved with the regular expression-based \code{\link{patterns}} syntax; multiple patterns will produce multiple columns. } + } + + For convenience/clarity in the case of multiple \code{melt}ed columns, resulting column names can be supplied as names to the elements of \code{measure.vars} (in the \code{list} and \code{patterns} usages). See also \code{Examples}. } +\item{variable.name}{name for the measured variable names column. The default name is \code{'variable'}.} +\item{value.name}{name for the molten data values column(s). The default name is \code{'value'}. Multiple names can be provided here for the case when \code{measure.vars} is a \code{list}, though note well that the names provided in \code{measure.vars} take precedence. 
} \item{na.rm}{If \code{TRUE}, \code{NA} values will be removed from the molten data.} \item{variable.factor}{If \code{TRUE}, the \code{variable} column will be @@ -127,6 +127,8 @@ melt(DT, id=1, measure=c("c_1", "i_2"), na.rm=TRUE) # remove NA melt(DT, id=1:2, measure=patterns("^f_", "^d_"), value.factor=TRUE) # same as above, but provide list of columns directly by column names or indices melt(DT, id=1:2, measure=list(3:4, c("d_1", "d_2")), value.factor=TRUE) +# same as above, but provide names directly: +melt(DT, id=1:2, measure=patterns(f="^f_", d="^d_"), value.factor=TRUE) # na.rm=TRUE removes rows with NAs in any 'value' columns melt(DT, id=1:2, measure=patterns("f_", "d_"), value.factor=TRUE, na.rm=TRUE)