Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

fix mcmc.ExpectedCumulativeTransactions for Pareto/NBD (Abe) #57

Merged
merged 3 commits into from
Mar 13, 2017
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion DESCRIPTION
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
Package: BTYDplus
Type: Package
Title: Probabilistic Models for Assessing and Predicting your Customer Base
Version: 1.1.1
Version: 1.1.2
Authors@R: person("Michael", "Platzer", email = "michael.platzer@gmail.com", role = c("aut", "cre"))
Description: Provides advanced statistical methods to describe and predict customers'
purchase behavior in a non-contractual setting. It uses historic transaction records to fit a
Expand Down
4 changes: 4 additions & 0 deletions NEWS
Original file line number Diff line number Diff line change
@@ -1,3 +1,7 @@
1.1.2
- fix `mcmc.ExpectedCumulativeTransactions` for Pareto/NBD (Abe) with covariates (#55)
- add ability to provide covariates to `abe.generateData`

1.1.1
- added `sales.x` to `elog2cbs` output, which sums over sales in calibration, but excludes initial transaction (thanks to msinjin for the suggestion)

Expand Down
35 changes: 24 additions & 11 deletions R/mcmc.R
Original file line number Diff line number Diff line change
Expand Up @@ -207,6 +207,8 @@ mcmc.plotPActiveDiagnostic <- function(cbs, xstar, title = "Diagnostic Plot for
#' @param x Number of transactions for which probability is calculated. May also
#' be a vector.
#' @param sample_size Sample size for estimating the probability distribution.
#' @param covariates (optional) Matrix of covariates, for Pareto/NBD (Abe)
#' model, passed to \code{\link{abe.GenerateData}} for simulating data.
#' @return \eqn{P(X(t)=x)}. If either \code{t} or \code{x} is a vector, then the
#' output will be a vector as well. If both are vectors, the output will be a
#' matrix.
Expand All @@ -218,7 +220,7 @@ mcmc.plotPActiveDiagnostic <- function(cbs, xstar, title = "Diagnostic Plot for
#' mcmc = 200, burnin = 100, thin = 20, chains = 1) # short MCMC to run demo fast
#' mcmc.pmf(param.draws, t = 52, x = 0:6)
#' mcmc.pmf(param.draws, t = c(26, 52), x = 0:6)
mcmc.pmf <- function(draws, t, x, sample_size = 10000) {
mcmc.pmf <- function(draws, t, x, sample_size = 10000, covariates = NULL) {
cohort_draws <- as.matrix(draws$level_2)
nr_of_draws <- nrow(cohort_draws)
# use posterior mean
Expand All @@ -238,7 +240,8 @@ mcmc.pmf <- function(draws, t, x, sample_size = 10000) {
p["cov_log_lambda_log_mu"],
p["var_log_mu"]),
ncol = 2)
abe.GenerateData(n = n, T.cal = 0, T.star = unique(t), params = params)$cbs
abe.GenerateData(n = n, T.cal = 0, T.star = unique(t), params = params,
covariates = covariates)$cbs
}
}))
pmf <- sapply(1:length(t), function(idx) {
Expand Down Expand Up @@ -302,6 +305,8 @@ mcmc.Expectation <- function(draws, t, sample_size = 10000) {
#' @param n.periods.final Number of time periods in the calibration and holdout
#' periods.
#' @param sample_size Sample size for estimating the probability distribution.
#' @param covariates (optional) Matrix of covariates, for Pareto/NBD (Abe)
#' model, passed to \code{\link{abe.GenerateData}} for simulating data.
#' @return Numeric vector of expected cumulative total repeat transactions by
#' all customers.
#' @export
Expand All @@ -314,7 +319,8 @@ mcmc.Expectation <- function(draws, t, sample_size = 10000) {
#' # weeks, with every eigth week being reported.
#' mcmc.ExpectedCumulativeTransactions(param.draws,
#' T.cal = cbs$T.cal, T.tot = 104, n.periods.final = 104/8, sample_size = 1000)
mcmc.ExpectedCumulativeTransactions <- function(draws, T.cal, T.tot, n.periods.final, sample_size = 10000) {
mcmc.ExpectedCumulativeTransactions <- function(draws, T.cal, T.tot, n.periods.final,
sample_size = 10000, covariates = NULL) {
if (any(T.cal < 0) || !is.numeric(T.cal))
stop("T.cal must be numeric and may not contain negative numbers.")
if (length(T.tot) > 1 || T.tot < 0 || !is.numeric(T.tot))
Expand All @@ -333,13 +339,14 @@ mcmc.ExpectedCumulativeTransactions <- function(draws, T.cal, T.tot, n.periods.f
} else if (model == "abe") {
p <- as.list(cohort_draws[i, ])
params <- list()
params$beta <- matrix(p[grepl("^log\\_", names(p))], byrow = TRUE, ncol = 2)
params$gamma <- matrix(c(p["var_log_lambda"],
params$beta <- matrix(as.numeric(p[grepl("^log\\_", names(p))]), byrow = TRUE, ncol = 2)
params$gamma <- matrix(as.numeric(c(p["var_log_lambda"],
p["cov_log_lambda_log_mu"],
p["cov_log_lambda_log_mu"],
p["var_log_mu"]),
p["var_log_mu"])),
ncol = 2)
elog <- abe.GenerateData(n = n, T.cal = T.tot, T.star = 0, params = params)$elog
elog <- abe.GenerateData(n = n, T.cal = T.tot, T.star = 0, params = params,
covariates = covariates)$elog
}
setDT(elog)
elog$cust <- paste0(elog$cust, "_", i)
Expand Down Expand Up @@ -389,6 +396,8 @@ mcmc.ExpectedCumulativeTransactions <- function(draws, T.cal, T.tot, n.periods.f
#' @param ymax Upper boundary for y axis.
#' @param sample_size Sample size for estimating the probability distribution.
#' See \code{\link{mcmc.ExpectedCumulativeTransactions}}.
#' @param covariates (optional) Matrix of covariates, for Pareto/NBD (Abe)
#' model, passed to \code{\link{abe.GenerateData}} for simulating data.
#' @return Matrix containing actual and expected cumulative repeat transactions.
#' @export
#' @seealso \code{\link{mcmc.PlotTrackingInc}}
Expand All @@ -407,10 +416,11 @@ mcmc.ExpectedCumulativeTransactions <- function(draws, T.cal, T.tot, n.periods.f
mcmc.PlotTrackingCum <- function(draws, T.cal, T.tot, actual.cu.tracking.data,
xlab = "Week", ylab = "Cumulative Transactions",
xticklab = NULL, title = "Tracking Cumulative Transactions",
ymax = NULL, sample_size = 10000) {
ymax = NULL, sample_size = 10000, covariates = NULL) {

actual <- actual.cu.tracking.data
expected <- mcmc.ExpectedCumulativeTransactions(draws, T.cal, T.tot, length(actual), sample_size = sample_size)
expected <- mcmc.ExpectedCumulativeTransactions(draws, T.cal, T.tot, length(actual),
sample_size = sample_size, covariates = covariates)

dc.PlotTracking(actual = actual, expected = expected, T.cal = T.cal,
xlab = xlab, ylab = ylab, title = title,
Expand Down Expand Up @@ -445,6 +455,8 @@ mcmc.PlotTrackingCum <- function(draws, T.cal, T.tot, actual.cu.tracking.data,
#' @param ymax Upper boundary for y axis.
#' @param sample_size Sample size for estimating the probability distribution.
#' See \code{\link{mcmc.ExpectedCumulativeTransactions}}.
#' @param covariates (optional) Matrix of covariates, for Pareto/NBD (Abe)
#' model, passed to \code{\link{abe.GenerateData}} for simulating data.
#' @return Matrix containing actual and expected incremental repeat
#' transactions.
#' @export
Expand All @@ -464,10 +476,11 @@ mcmc.PlotTrackingCum <- function(draws, T.cal, T.tot, actual.cu.tracking.data,
mcmc.PlotTrackingInc <- function(draws, T.cal, T.tot, actual.inc.tracking.data,
xlab = "Week", ylab = "Transactions",
xticklab = NULL, title = "Tracking Weekly Transactions",
ymax = NULL, sample_size = 10000) {
ymax = NULL, sample_size = 10000, covariates = NULL) {

actual <- actual.inc.tracking.data
expected_cum <- mcmc.ExpectedCumulativeTransactions(draws, T.cal, T.tot, length(actual), sample_size = sample_size)
expected_cum <- mcmc.ExpectedCumulativeTransactions(draws, T.cal, T.tot, length(actual),
sample_size = sample_size, covariates = covariates)
expected <- BTYD::dc.CumulativeToIncremental(expected_cum)

dc.PlotTracking(actual = actual, expected = expected, T.cal = T.cal,
Expand Down
24 changes: 20 additions & 4 deletions R/pareto-nbd-abe.R
Original file line number Diff line number Diff line change
Expand Up @@ -261,6 +261,7 @@ abe.mcmc.DrawParameters <- function(cal.cbs, covariates = c(), mcmc = 2500, burn
#' @param T.star Length of holdout period. This may be a vector.
#' @param params A list of model parameters: \code{beta} and \code{gamma}.
#' @param date.zero Initial date for cohort start. Can be of class character, Date or POSIXt.
#' @param covariates Provide matrix of customer covariates. If NULL then random covariate values between [-1,1] are drawn.
#' @return List of length 2:
#' \item{\code{cbs}}{A data.frame with a row for each customer and the summary statistic as columns.}
#' \item{\code{elog}}{A data.frame with a row for each transaction, and columns \code{cust}, \code{date} and \code{t}.}
Expand All @@ -273,7 +274,7 @@ abe.mcmc.DrawParameters <- function(cal.cbs, covariates = c(), mcmc = 2500, burn
#' data <- abe.GenerateData(n = 2000, T.cal = 32, T.star = 32, params)
#' cbs <- data$cbs # customer by sufficient summary statistic - one row per customer
#' elog <- data$elog # Event log - one row per event/purchase
abe.GenerateData <- function(n, T.cal, T.star, params, date.zero = "2000-01-01") {
abe.GenerateData <- function(n, T.cal, T.star, params, date.zero = "2000-01-01", covariates = NULL) {

# set start date for each customer, so that they share same T.cal date
T.cal.fix <- max(T.cal)
Expand All @@ -285,9 +286,24 @@ abe.GenerateData <- function(n, T.cal, T.star, params, date.zero = "2000-01-01")
params$beta <- matrix(params$beta, nrow = 1, ncol = 2)

nr_covars <- nrow(params$beta)
covars <- matrix(c(rep(1, n), runif( (nr_covars - 1) * n, -1, 1)), nrow = n, ncol = nr_covars)
colnames(covars) <- paste("covariate", 0:(nr_covars - 1), sep = "_")
colnames(covars)[1] <- "intercept"
if (!is.null(covariates)) {
# ensure that provided covariates are in matrix format, with intercept
covars <- covariates
if (is.data.frame(covars)) covars <- as.matrix(covars)
if (!is.matrix(covars)) covars <- matrix(covars, ncol = 1, dimnames = list(NULL, "covariate_1"))
if (!all(covars[, 1] == 1)) covars <- cbind("intercept" = rep(1, nrow(covars)), covars)
if (is.null(colnames(covars)) & ncol(covars) > 1)
colnames(covars)[-1] <- paste("covariate", 1:(nr_covars - 1), sep = "_")
if (nr_covars != ncol(covars))
stop("provided number of covariate columns does not match implied covariate number by parameter `beta`")
if (n != nrow(covars))
covars <- covars[sample(1:nrow(covars), n, replace = TRUE), ]
} else {
# simulate covariates, if not provided
covars <- matrix(c(rep(1, n), runif( (nr_covars - 1) * n, -1, 1)), nrow = n, ncol = nr_covars)
colnames(covars) <- paste("covariate", 0:(nr_covars - 1), sep = "_")
colnames(covars)[1] <- "intercept"
}

# sample log-normal distributed parameters lambda/mu for each customer
thetas <- exp( (covars %*% params$beta) + mvtnorm::rmvnorm(n, mean = c(0, 0), sigma = params$gamma))
Expand Down
5 changes: 4 additions & 1 deletion man/abe.GenerateData.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

5 changes: 4 additions & 1 deletion man/mcmc.ExpectedCumulativeTransactions.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

5 changes: 4 additions & 1 deletion man/mcmc.PlotTrackingCum.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

5 changes: 4 additions & 1 deletion man/mcmc.PlotTrackingInc.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

5 changes: 4 additions & 1 deletion man/mcmc.pmf.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

48 changes: 47 additions & 1 deletion tests/testthat/test-pareto-nbd-abe.R
Original file line number Diff line number Diff line change
Expand Up @@ -3,21 +3,50 @@ context("mcmc")
test_that("Pareto/NBD (Abe) MCMC", {
skip_on_cran()

# test basic data simulation
n <- 100
params <- list()
params$beta <- matrix(c(0.18, -2.5, 0.5, -0.3, -0.2, 0.8), byrow = TRUE, ncol = 2)
params$gamma <- matrix(c(0.05, 0.1, 0.1, 0.2), ncol = 2)
expect_silent(abe.GenerateData(n, 36, 36, params,
covariates = matrix(c(rnorm(n), runif(n)), ncol = 2)))
expect_silent(abe.GenerateData(n, 36, 36, params,
covariates = matrix(c(rnorm(n), runif(n)), ncol = 2,
dimnames = list(NULL, c("x1", "x2")))))
expect_error(abe.GenerateData(n, 36, 36, params,
covariates = matrix(c(rnorm(n)), ncol = 1,
dimnames = list(NULL, c("x1")))),
"covariate columns")
expect_error(abe.GenerateData(n, 36, 36, params,
covariates = matrix(c(rnorm(n), runif(n), runif(n)), ncol = 3,
dimnames = list(NULL, c("x1", "x2", "x3")))),
"covariate columns")
params$beta <- params$beta[1:2, ]
expect_silent(abe.GenerateData(n, 36, 36, params, covariates = rnorm(n)))
expect_silent(abe.GenerateData(n, 36, 36, params, covariates = matrix(rnorm(n), ncol = 1,
dimnames = list(NULL, "x1"))))

# generate artificial Pareto/NBD (Abe) with 2 covariates
set.seed(1)
params <- list()
params$beta <- matrix(c(0.18, -2.5, 0.5, -0.3, -0.2, 0.8), byrow = TRUE, ncol = 2)
params$gamma <- matrix(c(0.05, 0.1, 0.1, 0.2), ncol = 2)
expect_silent(abe.GenerateData(n = 100, T.cal = 32, T.star = c(16, 32), params))
n <- 5000
sim <- abe.GenerateData(n,
round(runif(n, 36, 96) / 12) * 12,
36,
params,
"2010-01-01")
# TODO: test param recoverability with covars being provided to abe.GenerateData
# covars <- matrix(c(runif(n, 0, 10), runif(n, 10, 20)), ncol = 2,
# dimnames = list(NULL, c("x1", "x2")))

cbs <- sim$cbs
expect_true(all(c("intercept", "covariate_1", "covariate_2") %in% names(cbs)))

# test basic parameter estimation
draws <- abe.mcmc.DrawParameters(as.data.table(cbs),
mcmc = 10, burnin = 0, thin = 1, mc.cores = 1)
draws <- abe.mcmc.DrawParameters(as.data.table(cbs), covariates = c("covariate_1"),
mcmc = 10, burnin = 0, thin = 1, mc.cores = 1)

Expand Down Expand Up @@ -57,4 +86,21 @@ test_that("Pareto/NBD (Abe) MCMC", {
expect_true(all(cbs$x.star == round(cbs$x.star)))
expect_true(all(cbs$palive >= 0 & cbs$palive <= 1))

# check tracking plots
inc <- elog2inc(sim$elog)
mat <- mcmc.PlotTrackingInc(draws,
T.cal = cbs$T.cal,
T.tot = max(cbs$T.cal + cbs$T.star),
actual.inc.tracking.data = inc,
covariates = cbs[, c("covariate_1", "covariate_2")])
mat.sum <- apply(mat, 1, sum)
expect_equal(mat[1], mat[2], tolerance = 0.1)
cum <- elog2cum(sim$elog)
mat <- mcmc.PlotTrackingCum(draws,
T.cal = cbs$T.cal,
T.tot = max(cbs$T.cal + cbs$T.star),
actual.cu.tracking.data = cum,
covariates = cbs[, c("covariate_1", "covariate_2")])
expect_equal(mat[, ncol(mat) - 1], mat[, ncol(mat) - 1], tolerance = 0.1)

})