From 54c6dc494e8cf15eeb55cb83762cd57ad6a4ee21 Mon Sep 17 00:00:00 2001
From: Michael Chirico <michaelchirico4@gmail.com>
Date: Thu, 18 Jan 2018 08:31:50 +0800
Subject: [PATCH] Closes #1547 -- improve ability to name value columns with
 melt (#2568)

---
 NEWS.md                |  2 ++
 R/fmelt.R              | 23 ++++++++++++++++++++---
 inst/tests/tests.Rraw  | 36 ++++++++++++++++++++++++++++++++++++
 man/melt.data.table.Rd | 26 ++++++++++++++------------
 4 files changed, 72 insertions(+), 15 deletions(-)

diff --git a/NEWS.md b/NEWS.md
index 2d6c56804..61764961d 100644
--- a/NEWS.md
+++ b/NEWS.md
@@ -61,6 +61,8 @@
 
 13. `unique(DT)` now returns `DT` early when there are no duplicates to save RAM, [#2013](https://github.com/Rdatatable/data.table/issues/2013). Thanks to Michael Chirico for the PR. 
 
+14. `melt.data.table` now offers friendlier functionality for providing `value.name` for `list` input to `measure.vars`, [#1547](https://github.com/Rdatatable/data.table/issues/1547). Thanks @MichaelChirico and @franknarf1 for the suggestion and use cases, @jangorecki and @mrdwab for implementation feedback, and @MichaelChirico for ultimate implementation.
+
 #### BUG FIXES
 
 1. The new quote rules handles this single field `"Our Stock Screen Delivers an Israeli Software Company (MNDO, CTCH)<\/a> SmallCapInvestor.com - Thu, May 19, 2011 10:02 AM EDT<\/cite><\/div>Yesterday in \""Google, But for Finding
diff --git a/R/fmelt.R b/R/fmelt.R
index 813a6992e..4aaf0e262 100644
--- a/R/fmelt.R
+++ b/R/fmelt.R
@@ -7,7 +7,9 @@ melt <- function(data, ..., na.rm = FALSE, value.name = "value") {
 }
 
 patterns <- function(..., cols=character(0)) {
-  p = unlist(list(...), use.names=FALSE)
+  # keep names only when at least one supplied name is non-empty;
+  #   otherwise the result is unnamed (names are NULL rather than all "")
+  p = unlist(list(...), use.names = any(nzchar(names(...))))
   if (!is.character(p))
     stop("Input patterns must be of type character.")
   lapply(p, grep, cols)
@@ -31,8 +33,23 @@ melt.data.table <- function(data, id.vars, measure.vars, variable.name = "variab
     measure.vars = patterns(pats, cols=cols)
   }
   if (is.list(measure.vars) && length(measure.vars) > 1L) {
-    if (length(value.name) == 1L)
-      value.name = paste(value.name, seq_along(measure.vars), sep="")
+    meas.nm = names(measure.vars)
+    if (is.null(meas.nm)) {
+      # user-provided or default stub
+      if (length(value.name) == 1L) {
+        value.name = paste0(value.name, seq_along(measure.vars))
+      }
+    } else {
+      if (length(value.name) > 1L) {  # names(measure.vars) override value.name below
+        warning("'value.name' provided in both 'measure.vars' ",  # pieces are joined with no separator
+                "and 'value.name' argument; value provided in ",
+                "'measure.vars' is given precedence.")
+      }
+      if (any(is.na(meas.nm)) || !all(nzchar(meas.nm))) {
+        stop("Please provide a name to each element of 'measure.vars'.")
+      }
+      value.name = meas.nm
+    }
   }
   ans <- .Call(Cfmelt, data, id.vars, measure.vars,
       as.logical(variable.factor), as.logical(value.factor),
diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw
index 0f268c504..ccc2a615f 100644
--- a/inst/tests/tests.Rraw
+++ b/inst/tests/tests.Rraw
@@ -11277,6 +11277,42 @@ cat(data, file=(f<-tempfile()), sep="\n")
 test(1865, fread(f, header=FALSE), error="Too many fields.*Read all 7 expected.*but more are present.*a,b,c,d,e,f,g,")
 unlink(f)
 
+# "Natural" provision of value.name in measure.vars list, #1547 and #2551
+DT = data.table(
+  meas1_jan = 0.45, meas1_feb = 0.38, meas1_mar = 0.62,
+  meas2_jan = 0.42, meas2_feb = 0.48, meas2_mar = 0.46,
+  meas3_jan = 0.54, meas3_feb = 0.47
+)
+DTout = data.table(
+  variable = factor(1:3),
+  jan = c(0.45, 0.42, 0.54),
+  feb = c(0.38, 0.48, 0.47),
+  mar = c(0.62, 0.46, NA)
+)
+test(1866.1, melt(DT, measure.vars = patterns(jan="_jan", feb="_feb", mar="_mar")), DTout)
+mvlist = list(
+  jan = sprintf('meas%d_jan', 1:3),
+  feb = sprintf('meas%d_feb', 1:3),
+  mar = sprintf('meas%d_mar', 1:2)
+)
+test(1866.2, melt(DT, measure.vars = mvlist), DTout)
+test(1866.3, melt(DT, measure.vars = mvlist, value.name = c('a', 'b', 'c')),
+             DTout, warning = 'value.name.*given precedence')
+names(mvlist) = NULL
+names(mvlist)[1L] = 'jan' # NA names
+test(1866.4, melt(DT, measure.vars = mvlist), error = 'Please provide a name')
+names(mvlist) = NULL
+names(mvlist) = c('jan', '', '') #partially-missing names
+test(1866.5, melt(DT, measure.vars = mvlist), error = 'Please provide a name')
+# previously untested behavior used in ?patterns
+DT = data.table(x1=1:5, x2=6:10, y1=letters[1:5], y2=letters[6:10])
+DTout = data.table(
+  variable = factor(c(1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L)),
+  value1 = 1:10,
+  value2 = c("a", "b", "c", "d", "e", "f", "g", "h", "i", "j")
+)
+test(1866.6, melt(DT, measure.vars = patterns("^x", "^y", cols=names(DT))), DTout)
+
 
 ##########################
 
diff --git a/man/melt.data.table.Rd b/man/melt.data.table.Rd
index e02914d06..84adfbd10 100644
--- a/man/melt.data.table.Rd
+++ b/man/melt.data.table.Rd
@@ -23,18 +23,18 @@ load \code{reshape2} package \emph{before} loading \code{data.table}.
 \item{id.vars}{vector of id variables. Can be integer (corresponding id
 column numbers) or character (id column names) vector. If missing, all
 non-measure columns will be assigned to it. If integer, must be positive; see Details. }
-\item{measure.vars}{vector of measure variables. Can be integer (corresponding
-measure column numbers) or character (measure column names) vector. If missing,
-all non-id columns will be assigned to it.
-
-\code{measure.vars} also now accepts a list of character/integer vectors to
-melt into multiple columns - i.e., melt into more than one \code{value} columns
-simultaneously. Use \code{\link{patterns}} to provide multiple patterns
-conveniently. See also \code{Examples}.}
-\item{variable.name}{name for the measured variable names column. The default
-name is 'variable'.}
-\item{value.name}{name for the molten data values column. The default name is
-'value'.}
+\item{measure.vars}{Measure variables for \code{melt}ing. Can be missing, vector, list, or pattern-based.
+
+  \itemize{
+    \item{ When missing, \code{measure.vars} will become all columns outside \code{id.vars}. }
+    \item{ Vector can be \code{integer} (implying column numbers) or \code{character} (column names). }
+    \item{ \code{list} is a generalization of the vector version -- each element of the list (which should be \code{integer} or \code{character} as above) will become a \code{melt}ed column. }
+    \item{ Pattern-based column matching can be achieved with the regular expression-based \code{\link{patterns}} syntax; multiple patterns will produce multiple columns. }
+  }
+
+    For convenience/clarity in the case of multiple \code{melt}ed columns, resulting column names can be supplied as names to the elements of \code{measure.vars} (in the \code{list} and \code{patterns} usages). See also \code{Examples}. }
+\item{variable.name}{name for the measured variable names column. The default name is \code{'variable'}.}
+\item{value.name}{name for the molten data values column(s). The default name is \code{'value'}. Multiple names can be provided here for the case when \code{measure.vars} is a \code{list}, though note well that the names provided in \code{measure.vars} take precedence. }
 \item{na.rm}{If \code{TRUE}, \code{NA} values will be removed from the molten
 data.}
 \item{variable.factor}{If \code{TRUE}, the \code{variable} column will be
@@ -127,6 +127,8 @@ melt(DT, id=1, measure=c("c_1", "i_2"), na.rm=TRUE) # remove NA
 melt(DT, id=1:2, measure=patterns("^f_", "^d_"), value.factor=TRUE)
 # same as above, but provide list of columns directly by column names or indices
 melt(DT, id=1:2, measure=list(3:4, c("d_1", "d_2")), value.factor=TRUE)
+# same as above, but provide names directly:
+melt(DT, id=1:2, measure=patterns(f="^f_", d="^d_"), value.factor=TRUE)
 
 # na.rm=TRUE removes rows with NAs in any 'value' columns
 melt(DT, id=1:2, measure=patterns("f_", "d_"), value.factor=TRUE, na.rm=TRUE)