mplatzer · mplatzer · Mar 13, 2017 · Mar 13, 2017 · Mar 13, 2017 · Mar 13, 2017
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -1,7 +1,7 @@
 Package: BTYDplus
 Type: Package
 Title: Probabilistic Models for Assessing and Predicting your Customer Base
-Version: 1.1.1
+Version: 1.1.2
 Authors@R: person("Michael", "Platzer", email = "michael.platzer@gmail.com", role = c("aut", "cre"))
 Description: Provides advanced statistical methods to describe and predict customers'
   purchase behavior in a non-contractual setting. It uses historic transaction records to fit a

diff --git a/NEWS b/NEWS
@@ -1,3 +1,7 @@
+1.1.2
+- fix `mcmc.ExpectedCumulativeTransactions` for Pareto/NBD (Abe) with covariates (#55)
+- add ability to provide covariates to `abe.generateData`
+
 1.1.1
 - added `sales.x` to `elog2cbs` output, which sums over sales in calibration, but excludes initial transaction (thanks to msinjin for the suggestion)
 

diff --git a/R/mcmc.R b/R/mcmc.R
@@ -207,6 +207,8 @@ mcmc.plotPActiveDiagnostic <- function(cbs, xstar, title = "Diagnostic Plot for
 #' @param x Number of transactions for which probability is calculated. May also
 #'   be a vector.
 #' @param sample_size Sample size for estimating the probability distribution.
+#' @param covariates (optional) Matrix of covariates, for Pareto/NBD (Abe)
+#'   model, passed to \code{\link{abe.GenerateData}} for simulating data.
 #' @return \eqn{P(X(t)=x)}. If either \code{t} or \code{x} is a vector, then the
 #'   output will be a vector as well. If both are vectors, the output will be a
 #'   matrix.
@@ -218,7 +220,7 @@ mcmc.plotPActiveDiagnostic <- function(cbs, xstar, title = "Diagnostic Plot for
 #'   mcmc = 200, burnin = 100, thin = 20, chains = 1) # short MCMC to run demo fast
 #' mcmc.pmf(param.draws, t = 52, x = 0:6)
 #' mcmc.pmf(param.draws, t = c(26, 52), x = 0:6)
-mcmc.pmf <- function(draws, t, x, sample_size = 10000) {
+mcmc.pmf <- function(draws, t, x, sample_size = 10000, covariates = NULL) {
   cohort_draws <- as.matrix(draws$level_2)
   nr_of_draws <- nrow(cohort_draws)
   # use posterior mean
@@ -238,7 +240,8 @@ mcmc.pmf <- function(draws, t, x, sample_size = 10000) {
                                p["cov_log_lambda_log_mu"],
                                p["var_log_mu"]),
                              ncol = 2)
-      abe.GenerateData(n = n, T.cal = 0, T.star = unique(t), params = params)$cbs
+      abe.GenerateData(n = n, T.cal = 0, T.star = unique(t), params = params,
+                       covariates = covariates)$cbs
     }
   }))
   pmf <- sapply(1:length(t), function(idx) {
@@ -302,6 +305,8 @@ mcmc.Expectation <- function(draws, t, sample_size = 10000) {
 #' @param n.periods.final Number of time periods in the calibration and holdout
 #'   periods.
 #' @param sample_size Sample size for estimating the probability distribution.
+#' @param covariates (optional) Matrix of covariates, for Pareto/NBD (Abe)
+#'   model, passed to \code{\link{abe.GenerateData}} for simulating data.
 #' @return Numeric vector of expected cumulative total repeat transactions by
 #'   all customers.
 #' @export
@@ -314,7 +319,8 @@ mcmc.Expectation <- function(draws, t, sample_size = 10000) {
 #' # weeks, with every eigth week being reported.
 #' mcmc.ExpectedCumulativeTransactions(param.draws,
 #'   T.cal = cbs$T.cal, T.tot = 104, n.periods.final = 104/8, sample_size = 1000)
-mcmc.ExpectedCumulativeTransactions <- function(draws, T.cal, T.tot, n.periods.final, sample_size = 10000) {
+mcmc.ExpectedCumulativeTransactions <- function(draws, T.cal, T.tot, n.periods.final,
+                                                sample_size = 10000, covariates = NULL) {
   if (any(T.cal < 0) || !is.numeric(T.cal))
     stop("T.cal must be numeric and may not contain negative numbers.")
   if (length(T.tot) > 1 || T.tot < 0 || !is.numeric(T.tot))
@@ -333,13 +339,14 @@ mcmc.ExpectedCumulativeTransactions <- function(draws, T.cal, T.tot, n.periods.f
     } else if (model == "abe") {
       p <- as.list(cohort_draws[i, ])
       params <- list()
-      params$beta  <- matrix(p[grepl("^log\\_", names(p))], byrow = TRUE, ncol = 2)
-      params$gamma <- matrix(c(p["var_log_lambda"],
+      params$beta  <- matrix(as.numeric(p[grepl("^log\\_", names(p))]), byrow = TRUE, ncol = 2)
+      params$gamma <- matrix(as.numeric(c(p["var_log_lambda"],
                                p["cov_log_lambda_log_mu"],
                                p["cov_log_lambda_log_mu"],
-                               p["var_log_mu"]),
+                               p["var_log_mu"])),
                              ncol = 2)
-      elog <- abe.GenerateData(n = n, T.cal = T.tot, T.star = 0, params = params)$elog
+      elog <- abe.GenerateData(n = n, T.cal = T.tot, T.star = 0, params = params,
+                               covariates = covariates)$elog
     }
     setDT(elog)
     elog$cust <- paste0(elog$cust, "_", i)
@@ -389,6 +396,8 @@ mcmc.ExpectedCumulativeTransactions <- function(draws, T.cal, T.tot, n.periods.f
 #' @param ymax Upper boundary for y axis.
 #' @param sample_size Sample size for estimating the probability distribution.
 #'   See \code{\link{mcmc.ExpectedCumulativeTransactions}}.
+#' @param covariates (optional) Matrix of covariates, for Pareto/NBD (Abe)
+#'   model, passed to \code{\link{abe.GenerateData}} for simulating data.
 #' @return Matrix containing actual and expected cumulative repeat transactions.
 #' @export
 #' @seealso \code{\link{mcmc.PlotTrackingInc}}
@@ -407,10 +416,11 @@ mcmc.ExpectedCumulativeTransactions <- function(draws, T.cal, T.tot, n.periods.f
 mcmc.PlotTrackingCum <- function(draws, T.cal, T.tot, actual.cu.tracking.data,
                                  xlab = "Week", ylab = "Cumulative Transactions",
                                  xticklab = NULL, title = "Tracking Cumulative Transactions",
-                                 ymax = NULL, sample_size = 10000) {
+                                 ymax = NULL, sample_size = 10000, covariates = NULL) {
 
   actual <- actual.cu.tracking.data
-  expected <- mcmc.ExpectedCumulativeTransactions(draws, T.cal, T.tot, length(actual), sample_size = sample_size)
+  expected <- mcmc.ExpectedCumulativeTransactions(draws, T.cal, T.tot, length(actual),
+                                                  sample_size = sample_size, covariates = covariates)
 
   dc.PlotTracking(actual = actual, expected = expected, T.cal = T.cal,
                   xlab = xlab, ylab = ylab, title = title,
@@ -445,6 +455,8 @@ mcmc.PlotTrackingCum <- function(draws, T.cal, T.tot, actual.cu.tracking.data,
 #' @param ymax Upper boundary for y axis.
 #' @param sample_size Sample size for estimating the probability distribution.
 #'   See \code{\link{mcmc.ExpectedCumulativeTransactions}}.
+#' @param covariates (optional) Matrix of covariates, for Pareto/NBD (Abe)
+#'   model, passed to \code{\link{abe.GenerateData}} for simulating data.
 #' @return Matrix containing actual and expected incremental repeat
 #'   transactions.
 #' @export
@@ -464,10 +476,11 @@ mcmc.PlotTrackingCum <- function(draws, T.cal, T.tot, actual.cu.tracking.data,
 mcmc.PlotTrackingInc <- function(draws, T.cal, T.tot, actual.inc.tracking.data,
                                  xlab = "Week", ylab = "Transactions",
                                  xticklab = NULL, title = "Tracking Weekly Transactions",
-                                 ymax = NULL, sample_size = 10000) {
+                                 ymax = NULL, sample_size = 10000, covariates = NULL) {
 
   actual <- actual.inc.tracking.data
-  expected_cum <- mcmc.ExpectedCumulativeTransactions(draws, T.cal, T.tot, length(actual), sample_size = sample_size)
+  expected_cum <- mcmc.ExpectedCumulativeTransactions(draws, T.cal, T.tot, length(actual),
+                                                      sample_size = sample_size, covariates = covariates)
   expected <- BTYD::dc.CumulativeToIncremental(expected_cum)
 
   dc.PlotTracking(actual = actual, expected = expected, T.cal = T.cal,

diff --git a/R/pareto-nbd-abe.R b/R/pareto-nbd-abe.R
@@ -261,6 +261,7 @@ abe.mcmc.DrawParameters <- function(cal.cbs, covariates = c(), mcmc = 2500, burn
 #' @param T.star Length of holdout period. This may be a vector.
 #' @param params A list of model parameters: \code{beta} and \code{gamma}.
 #' @param date.zero Initial date for cohort start. Can be of class character, Date or POSIXt.
+#' @param covariates Provide matrix of customer covariates. If NULL then random covariate values between [-1,1] are drawn.
 #' @return List of length 2:
 #' \item{\code{cbs}}{A data.frame with a row for each customer and the summary statistic as columns.}
 #' \item{\code{elog}}{A data.frame with a row for each transaction, and columns \code{cust}, \code{date} and \code{t}.}
@@ -273,7 +274,7 @@ abe.mcmc.DrawParameters <- function(cal.cbs, covariates = c(), mcmc = 2500, burn
 #' data <- abe.GenerateData(n = 2000, T.cal = 32, T.star = 32, params)
 #' cbs <- data$cbs  # customer by sufficient summary statistic - one row per customer
 #' elog <- data$elog  # Event log - one row per event/purchase
-abe.GenerateData <- function(n, T.cal, T.star, params, date.zero = "2000-01-01") {
+abe.GenerateData <- function(n, T.cal, T.star, params, date.zero = "2000-01-01", covariates = NULL) {
 
   # set start date for each customer, so that they share same T.cal date
   T.cal.fix <- max(T.cal)
@@ -285,9 +286,24 @@ abe.GenerateData <- function(n, T.cal, T.star, params, date.zero = "2000-01-01")
     params$beta <- matrix(params$beta, nrow = 1, ncol = 2)
 
   nr_covars <- nrow(params$beta)
-  covars <- matrix(c(rep(1, n), runif( (nr_covars - 1) * n, -1, 1)), nrow = n, ncol = nr_covars)
-  colnames(covars) <- paste("covariate", 0:(nr_covars - 1), sep = "_")
-  colnames(covars)[1] <- "intercept"
+  if (!is.null(covariates)) {
+    # ensure that provided covariates are in matrix format, with intercept
+    covars <- covariates
+    if (is.data.frame(covars)) covars <- as.matrix(covars)
+    if (!is.matrix(covars)) covars <- matrix(covars, ncol = 1, dimnames = list(NULL, "covariate_1"))
+    if (!all(covars[, 1] == 1)) covars <- cbind("intercept" = rep(1, nrow(covars)), covars)
+    if (is.null(colnames(covars)) & ncol(covars) > 1)
+      colnames(covars)[-1] <- paste("covariate", 1:(nr_covars - 1), sep = "_")
+    if (nr_covars != ncol(covars))
+      stop("provided number of covariate columns does not match implied covariate number by parameter `beta`")
+    if (n != nrow(covars))
+      covars <- covars[sample(1:nrow(covars), n, replace = TRUE), ]
+  } else {
+    # simulate covariates, if not provided
+    covars <- matrix(c(rep(1, n), runif( (nr_covars - 1) * n, -1, 1)), nrow = n, ncol = nr_covars)
+    colnames(covars) <- paste("covariate", 0:(nr_covars - 1), sep = "_")
+    colnames(covars)[1] <- "intercept"
+  }
 
   # sample log-normal distributed parameters lambda/mu for each customer
   thetas <- exp( (covars %*% params$beta) + mvtnorm::rmvnorm(n, mean = c(0, 0), sigma = params$gamma))

diff --git a/man/abe.GenerateData.Rd b/man/abe.GenerateData.Rd
diff --git a/man/mcmc.ExpectedCumulativeTransactions.Rd b/man/mcmc.ExpectedCumulativeTransactions.Rd
diff --git a/man/mcmc.PlotTrackingCum.Rd b/man/mcmc.PlotTrackingCum.Rd
diff --git a/man/mcmc.PlotTrackingInc.Rd b/man/mcmc.PlotTrackingInc.Rd
diff --git a/man/mcmc.pmf.Rd b/man/mcmc.pmf.Rd
diff --git a/tests/testthat/test-pareto-nbd-abe.R b/tests/testthat/test-pareto-nbd-abe.R
@@ -3,21 +3,50 @@ context("mcmc")
 test_that("Pareto/NBD (Abe) MCMC", {
   skip_on_cran()
 
+  # test basic data simulation
+  n <- 100
+  params <- list()
+  params$beta <- matrix(c(0.18, -2.5, 0.5, -0.3, -0.2, 0.8), byrow = TRUE, ncol = 2)
+  params$gamma <- matrix(c(0.05, 0.1, 0.1, 0.2), ncol = 2)
+  expect_silent(abe.GenerateData(n, 36, 36, params,
+                                 covariates = matrix(c(rnorm(n), runif(n)), ncol = 2)))
+  expect_silent(abe.GenerateData(n, 36, 36, params,
+                                 covariates = matrix(c(rnorm(n), runif(n)), ncol = 2,
+                                                     dimnames = list(NULL, c("x1", "x2")))))
+  expect_error(abe.GenerateData(n, 36, 36, params,
+                                covariates = matrix(c(rnorm(n)), ncol = 1,
+                                                    dimnames = list(NULL, c("x1")))),
+               "covariate columns")
+  expect_error(abe.GenerateData(n, 36, 36, params,
+                                covariates = matrix(c(rnorm(n), runif(n), runif(n)), ncol = 3,
+                                                    dimnames = list(NULL, c("x1", "x2", "x3")))),
+               "covariate columns")
+  params$beta <- params$beta[1:2, ]
+  expect_silent(abe.GenerateData(n, 36, 36, params, covariates = rnorm(n)))
+  expect_silent(abe.GenerateData(n, 36, 36, params, covariates = matrix(rnorm(n), ncol = 1,
+                                                                  dimnames = list(NULL, "x1"))))
+
   # generate artificial Pareto/NBD (Abe) with 2 covariates
   set.seed(1)
   params <- list()
   params$beta <- matrix(c(0.18, -2.5, 0.5, -0.3, -0.2, 0.8), byrow = TRUE, ncol = 2)
   params$gamma <- matrix(c(0.05, 0.1, 0.1, 0.2), ncol = 2)
-  expect_silent(abe.GenerateData(n = 100, T.cal = 32, T.star = c(16, 32), params))
   n <- 5000
   sim <- abe.GenerateData(n,
                           round(runif(n, 36, 96) / 12) * 12,
                           36,
                           params,
                           "2010-01-01")
+  # TODO: test param recoverability with covars being provided to abe.GenerateData
+  # covars <- matrix(c(runif(n, 0, 10), runif(n, 10, 20)), ncol = 2,
+  #                  dimnames = list(NULL, c("x1", "x2")))
+
   cbs <- sim$cbs
+  expect_true(all(c("intercept", "covariate_1", "covariate_2") %in% names(cbs)))
 
   # test basic parameter estimation
+  draws <- abe.mcmc.DrawParameters(as.data.table(cbs),
+                                   mcmc = 10, burnin = 0, thin = 1, mc.cores = 1)
   draws <- abe.mcmc.DrawParameters(as.data.table(cbs), covariates = c("covariate_1"),
                                    mcmc = 10, burnin = 0, thin = 1, mc.cores = 1)
 
@@ -57,4 +86,21 @@ test_that("Pareto/NBD (Abe) MCMC", {
   expect_true(all(cbs$x.star == round(cbs$x.star)))
   expect_true(all(cbs$palive >= 0 & cbs$palive <= 1))
 
+  # check tracking plots
+  inc <- elog2inc(sim$elog)
+  mat <- mcmc.PlotTrackingInc(draws,
+                              T.cal = cbs$T.cal,
+                              T.tot = max(cbs$T.cal + cbs$T.star),
+                              actual.inc.tracking.data = inc,
+                              covariates = cbs[, c("covariate_1", "covariate_2")])
+  mat.sum <- apply(mat, 1, sum)
+  expect_equal(mat[1], mat[2], tolerance = 0.1)
+  cum <- elog2cum(sim$elog)
+  mat <- mcmc.PlotTrackingCum(draws,
+                              T.cal = cbs$T.cal,
+                              T.tot = max(cbs$T.cal + cbs$T.star),
+                              actual.cu.tracking.data = cum,
+                              covariates = cbs[, c("covariate_1", "covariate_2")])
+  expect_equal(mat[, ncol(mat) - 1], mat[, ncol(mat) - 1], tolerance = 0.1)
+
 })