From 817514dfa094630b6cdefd13ee2dc76c98b44cc9 Mon Sep 17 00:00:00 2001 From: PabloRoque Date: Sun, 22 Dec 2024 16:47:43 +0100 Subject: [PATCH 01/37] Add BGNBD distribution and BGNBD Random Variable --- pymc_marketing/clv/distributions.py | 116 ++++++++++++++++++++++++++ pymc_marketing/clv/models/beta_geo.py | 109 +++++++++++++----------- 2 files changed, 178 insertions(+), 47 deletions(-) diff --git a/pymc_marketing/clv/distributions.py b/pymc_marketing/clv/distributions.py index 428bd6b6d..83fa1586d 100644 --- a/pymc_marketing/clv/distributions.py +++ b/pymc_marketing/clv/distributions.py @@ -662,3 +662,119 @@ def logp_customer_died(t_x_i, x_i, alpha_i, beta_i, gamma_i, delta_i, T_i): delta > 0, msg="alpha > 0, beta > 0, gamma > 0, delta > 0", ) + + +class BGNBDRV(RandomVariable): + name = "bg_nbd" + ndim_supp = 1 + ndims_params = [0, 0, 0, 0, 0] + + dtype = "floatX" + _print_name = ("BGNBD", "\\operatorname{BGNBD}") + + def make_node(self, rng, size, dtype, a, b, r, alpha, T): + a = pt.as_tensor_variable(a) + b = pt.as_tensor_variable(b) + r = pt.as_tensor_variable(r) + alpha = pt.as_tensor_variable(alpha) + T = pt.as_tensor_variable(T) + + return super().make_node(rng, size, dtype, a, b, r, alpha, T) + + def __call__(self, a, b, r, alpha, T, size=None, **kwargs): + return super().__call__(a, b, r, alpha, T, size=size, **kwargs) + + @classmethod + def rng_fn(cls, rng, a, b, r, alpha, T, size): + size = pm.distributions.shape_utils.to_tuple(size) + + a = np.asarray(a) + b = np.asarray(b) + r = np.asarray(r) + alpha = np.asarray(alpha) + T = np.asarray(T) + + if size == (): + size = np.broadcast_shapes(a.shape, b.shape, r.shape, alpha.shape, T.shape) + + a = np.broadcast_to(a, size) + b = np.broadcast_to(b, size) + r = np.broadcast_to(r, size) + alpha = np.broadcast_to(alpha, size) + T = np.broadcast_to(T, size) + + output = np.zeros(shape=size + (2,)) # noqa:RUF005 + + lam = rng.gamma(shape=r, scale=1 / alpha, size=size) + p = rng.beta(a=a, b=b, size=size) + + def sim_data(lam, p, T): + t = 0 + n = 0 + + dropout_time = rng.exponential(scale=1 / p) + wait = rng.exponential(scale=1 / lam) + + final_t = min(dropout_time, T) + while (t + wait) < final_t: + t += wait + n += 1 + wait = rng.exponential(scale=1 / lam) + + return np.array( + [ + t, + n, + ], + ) + + for index in np.ndindex(*size): + output[index] = sim_data(lam[index], p[index], T[index]) + + return output + + def _supp_shape_from_params(*args, **kwargs): + return (2,) + + +bg_nbd = BGNBDRV() + + +class BGNBD(PositiveContinuous): + rv_op = bg_nbd + + @classmethod + def dist(cls, a, b, r, alpha, T, **kwargs): + """Get the distribution from the parameters.""" + return super().dist([a, b, r, alpha, T], **kwargs) + + def logp(value, a, b, r, alpha, T): + """Log-likelihood of the distribution.""" + t_x = value[..., 0] + x = value[..., 1] + + x_non_zero = x > 0 + + d1 = ( + pt.gammaln(r + x) + - pt.gammaln(r) + + pt.gammaln(a + b) + + pt.gammaln(b + x) + - pt.gammaln(b) + - pt.gammaln(a + b + x) + ) + + d2 = r * pt.log(alpha) - (r + x) * pt.log(alpha + t_x) + c3 = ((alpha + t_x) / (alpha + T)) ** (r + x) + c4 = a / (b + x - 1) + + logp = d1 + d2 + pt.log(c3 + pt.switch(x_non_zero, c4, 0)) + + return check_parameters( + logp, + a > 0, + b > 0, + alpha > 0, + r > 0, + msg="a, b, alpha, r > 0", + ) diff --git a/pymc_marketing/clv/models/beta_geo.py b/pymc_marketing/clv/models/beta_geo.py index 027c1f8c2..cf6f1716a 100644 --- a/pymc_marketing/clv/models/beta_geo.py +++ b/pymc_marketing/clv/models/beta_geo.py @@ -19,13 +19,12 @@ import numpy 
as np import pandas as pd import pymc as pm -import pytensor.tensor as pt import xarray -from pymc.distributions.dist_math import check_parameters from pymc.util import RandomState from pytensor.tensor import TensorVariable from scipy.special import betaln, expit, hyp2f1 +from pymc_marketing.clv.distributions import BGNBD from pymc_marketing.clv.models.basic import CLVModel from pymc_marketing.clv.utils import to_xarray from pymc_marketing.model_config import ModelConfig @@ -174,7 +173,10 @@ def default_model_config(self) -> ModelConfig: def build_model(self) -> None: # type: ignore[override] """Build the model.""" - coords = {"customer_id": self.data["customer_id"]} + coords = { + "customer_id": self.data["customer_id"], + "obs_var": ["recency", "frequency"], + } with pm.Model(coords=coords) as self.model: # purchase rate priors alpha = self.model_config["alpha_prior"].create_variable("alpha") @@ -196,53 +198,66 @@ def build_model(self) -> None: # type: ignore[override] a = pm.Deterministic("a", phi_dropout * kappa_dropout) b = pm.Deterministic("b", (1.0 - phi_dropout) * kappa_dropout) - def logp(t_x, x, a, b, r, alpha, T): - """ - Compute the log-likelihood of the BG/NBD model. - - The log-likelihood expression here aligns with expression (4) from [3] - due to the possible numerical instability of expression (3). - """ - x_non_zero = x > 0 - - # Refactored for numerical error - d1 = ( - pt.gammaln(r + x) - - pt.gammaln(r) - + pt.gammaln(a + b) - + pt.gammaln(b + x) - - pt.gammaln(b) - - pt.gammaln(a + b + x) - ) - - d2 = r * pt.log(alpha) - (r + x) * pt.log(alpha + t_x) - c3 = ((alpha + t_x) / (alpha + T)) ** (r + x) - c4 = a / (b + x - 1) - - logp = d1 + d2 + pt.log(c3 + pt.switch(x_non_zero, c4, 0)) - - return check_parameters( - logp, - a > 0, - b > 0, - alpha > 0, - r > 0, - msg="a, b, alpha, r > 0", - ) - - pm.Potential( - "likelihood", - logp( - x=self.data["frequency"], - t_x=self.data["recency"], - a=a, - b=b, - alpha=alpha, - r=r, - T=self.data["T"], + BGNBD( + name="recency_frequency", + a=a, + b=b, + r=r, + alpha=alpha, + T=self.data["T"], + observed=np.stack( + (self.data["recency"], self.data["frequency"]), axis=1 ), + dims=["customer_id", "obs_var"], ) + # def logp(t_x, x, a, b, r, alpha, T): + # """ + # Compute the log-likelihood of the BG/NBD model. + + # The log-likelihood expression here aligns with expression (4) from [3] + # due to the possible numerical instability of expression (3). 
+ # """ + # x_non_zero = x > 0 + + # # Refactored for numerical error + # d1 = ( + # pt.gammaln(r + x) + # - pt.gammaln(r) + # + pt.gammaln(a + b) + # + pt.gammaln(b + x) + # - pt.gammaln(b) + # - pt.gammaln(a + b + x) + # ) + + # d2 = r * pt.log(alpha) - (r + x) * pt.log(alpha + t_x) + # c3 = ((alpha + t_x) / (alpha + T)) ** (r + x) + # c4 = a / (b + x - 1) + + # logp = d1 + d2 + pt.log(c3 + pt.switch(x_non_zero, c4, 0)) + + # return check_parameters( + # logp, + # a > 0, + # b > 0, + # alpha > 0, + # r > 0, + # msg="a, b, alpha, r > 0", + # ) + + # pm.Potential( + # "likelihood", + # logp( + # x=self.data["frequency"], + # t_x=self.data["recency"], + # a=a, + # b=b, + # alpha=alpha, + # r=r, + # T=self.data["T"], + # ), + # ) + # TODO: delete this utility after API standardization is completed def _unload_params(self): trace = self.idata.posterior From 4f6fd14b9739a505aac22f6d093fd97ad1911c44 Mon Sep 17 00:00:00 2001 From: PabloRoque Date: Wed, 25 Dec 2024 23:01:30 +0100 Subject: [PATCH 02/37] Add BGNBD excel test --- pymc_marketing/clv/distributions.py | 174 +++++++++++----------------- tests/clv/models/test_beta_geo.py | 73 +++++++++++- tests/clv/test_distributions.py | 70 +++++++++++ 3 files changed, 208 insertions(+), 109 deletions(-) diff --git a/pymc_marketing/clv/distributions.py b/pymc_marketing/clv/distributions.py index 83fa1586d..c6640331b 100644 --- a/pymc_marketing/clv/distributions.py +++ b/pymc_marketing/clv/distributions.py @@ -13,8 +13,9 @@ # limitations under the License. """Distributions for the CLV module.""" +from functools import reduce + import numpy as np -import pymc as pm import pytensor.tensor as pt from pymc.distributions.continuous import PositiveContinuous from pymc.distributions.dist_math import betaln, check_parameters @@ -23,31 +24,21 @@ from pytensor.graph import vectorize_graph from pytensor.tensor.random.op import RandomVariable -__all__ = ["BetaGeoBetaBinom", "ContContract", "ContNonContract", "ParetoNBD"] +__all__ = ["BGNBD", "BetaGeoBetaBinom", "ContContract", "ContNonContract", "ParetoNBD"] class ContNonContractRV(RandomVariable): name = "continuous_non_contractual" - ndim_supp = 1 - ndims_params = [0, 0, 0, 0] + signature = "(),(),()->(2)" dtype = "floatX" _print_name = ("ContNonContract", "\\operatorname{ContNonContract}") - def make_node(self, rng, size, dtype, lam, p, T): - T = pt.as_tensor_variable(T) - - return super().make_node(rng, size, dtype, lam, p, T) + def __call__(self, lam, p, T, size=None, **kwargs): + return super().__call__(lam, p, T, size=size, **kwargs) @classmethod def rng_fn(cls, rng, lam, p, T, size): - size = pm.distributions.shape_utils.to_tuple(size) - - # TODO: broadcast sizes - lam = np.asarray(lam) - p = np.asarray(p) - T = np.asarray(T) - - if size == (): + if size is None: size = np.broadcast_shapes(lam.shape, p.shape, T.shape) lam = np.broadcast_to(lam, size) @@ -74,9 +65,6 @@ def rng_fn(cls, rng, lam, p, T, size): return np.stack([t_x, x], axis=-1) - def _supp_shape_from_params(*args, **kwargs): - return (2,) - continuous_non_contractual = ContNonContractRV() @@ -129,13 +117,14 @@ def logp(value, lam, p, T): ) logp = pt.switch( - pt.any( - ( + reduce( + pt.bitwise_or, + [ pt.and_(pt.ge(t_x, 0), zero_observations), pt.lt(t_x, 0), pt.lt(x, 0), pt.gt(t_x, T), - ), + ], ), -np.inf, logp, @@ -152,29 +141,16 @@ def logp(value, lam, p, T): class ContContractRV(RandomVariable): name = "continuous_contractual" - ndim_supp = 1 - ndims_params = [0, 0, 0, 0] + signature = "(),(),()->(3)" dtype = "floatX" _print_name = 
("ContinuousContractual", "\\operatorname{ContinuousContractual}") - def make_node(self, rng, size, dtype, lam, p, T): - T = pt.as_tensor_variable(T) - - return super().make_node(rng, size, dtype, lam, p, T) - def __call__(self, lam, p, T, size=None, **kwargs): return super().__call__(lam, p, T, size=size, **kwargs) @classmethod def rng_fn(cls, rng, lam, p, T, size): - size = pm.distributions.shape_utils.to_tuple(size) - - # To do: broadcast sizes - lam = np.asarray(lam) - p = np.asarray(p) - T = np.asarray(T) - - if size == (): + if size is None: size = np.broadcast_shapes(lam.shape, p.shape, T.shape) lam = np.broadcast_to(lam, size) @@ -254,24 +230,15 @@ def logp(value, lam, p, T): ) logp = pt.switch( - pt.any(pt.or_(pt.lt(t_x, 0), zero_observations)), - -np.inf, - logp, - ) - logp = pt.switch( - pt.all( - pt.or_(pt.eq(churn, 0), pt.eq(churn, 1)), - ), - logp, - -np.inf, - ) - logp = pt.switch( - pt.any( - ( + reduce( + pt.bitwise_or, + [ + zero_observations, pt.lt(t_x, 0), pt.lt(x, 0), pt.gt(t_x, T), - ), + pt.bitwise_not(pt.bitwise_or(pt.eq(churn, 0), pt.eq(churn, 1))), + ], ), -np.inf, logp, @@ -289,34 +256,16 @@ def logp(value, lam, p, T): class ParetoNBDRV(RandomVariable): name = "pareto_nbd" - ndim_supp = 1 - ndims_params = [0, 0, 0, 0, 0] + signature = "(),(),(),(),()->(2)" dtype = "floatX" _print_name = ("ParetoNBD", "\\operatorname{ParetoNBD}") - def make_node(self, rng, size, dtype, r, alpha, s, beta, T): - r = pt.as_tensor_variable(r) - alpha = pt.as_tensor_variable(alpha) - s = pt.as_tensor_variable(s) - beta = pt.as_tensor_variable(beta) - T = pt.as_tensor_variable(T) - - return super().make_node(rng, size, dtype, r, alpha, s, beta, T) - def __call__(self, r, alpha, s, beta, T, size=None, **kwargs): return super().__call__(r, alpha, s, beta, T, size=size, **kwargs) @classmethod def rng_fn(cls, rng, r, alpha, s, beta, T, size): - size = pm.distributions.shape_utils.to_tuple(size) - - r = np.asarray(r) - alpha = np.asarray(alpha) - s = np.asarray(s) - beta = np.asarray(beta) - T = np.asarray(T) - - if size == (): + if size is None: size = np.broadcast_shapes( r.shape, alpha.shape, s.shape, beta.shape, T.shape ) @@ -357,9 +306,6 @@ def sim_data(lam, mu, T): return output - def _supp_shape_from_params(*args, **kwargs): - return (2,) - pareto_nbd = ParetoNBDRV() @@ -489,34 +435,16 @@ def logp(value, r, alpha, s, beta, T): class BetaGeoBetaBinomRV(RandomVariable): name = "beta_geo_beta_binom" - ndim_supp = 1 - ndims_params = [0, 0, 0, 0, 0] + signature = "(),(),(),(),()->(2)" dtype = "floatX" _print_name = ("BetaGeoBetaBinom", "\\operatorname{BetaGeoBetaBinom}") - def make_node(self, rng, size, dtype, alpha, beta, gamma, delta, T): - alpha = pt.as_tensor_variable(alpha) - beta = pt.as_tensor_variable(beta) - gamma = pt.as_tensor_variable(gamma) - delta = pt.as_tensor_variable(delta) - T = pt.as_tensor_variable(T) - - return super().make_node(rng, size, dtype, alpha, beta, gamma, delta, T) - def __call__(self, alpha, beta, gamma, delta, T, size=None, **kwargs): return super().__call__(alpha, beta, gamma, delta, T, size=size, **kwargs) @classmethod def rng_fn(cls, rng, alpha, beta, gamma, delta, T, size) -> np.ndarray: - size = pm.distributions.shape_utils.to_tuple(size) - - alpha = np.asarray(alpha) - beta = np.asarray(beta) - gamma = np.asarray(gamma) - delta = np.asarray(delta) - T = np.asarray(T) - - if size == (): + if size is None: size = np.broadcast_shapes( alpha.shape, beta.shape, gamma.shape, delta.shape, T.shape ) @@ -557,9 +485,6 @@ def sim_data(purchase_prob, 
churn_prob, T): return output - def _supp_shape_from_params(*args, **kwargs): - return (2,) - beta_geo_beta_binom = BetaGeoBetaBinomRV() @@ -666,27 +591,29 @@ def logp_customer_died(t_x_i, x_i, alpha_i, beta_i, gamma_i, delta_i, T_i): class BGNBDRV(RandomVariable): name = "bg_nbd" - ndim_supp = 1 - ndims_params = [0, 0, 0, 0, 0] + signature = "(),(),(),(),()->(2)" + # ndim_supp = 1 + # ndims_params = [0, 0, 0, 0, 0] dtype = "floatX" _print_name = ("BGNBD", "\\operatorname{BGNBD}") - def make_node(self, rng, size, dtype, a, b, r, alpha, T): - a = pt.as_tensor_variable(a) - b = pt.as_tensor_variable(b) - r = pt.as_tensor_variable(r) - alpha = pt.as_tensor_variable(alpha) - T = pt.as_tensor_variable(T) + # def make_node(self, rng, size, dtype, a, b, r, alpha, T): + # a = pt.as_tensor_variable(a) + # b = pt.as_tensor_variable(b) + # r = pt.as_tensor_variable(r) + # alpha = pt.as_tensor_variable(alpha) + # T = pt.as_tensor_variable(T) - return super().make_node(rng, size, dtype, a, b, r, alpha, T) + # return super().make_node(rng, size, dtype, a, b, r, alpha, T) def __call__(self, a, b, r, alpha, T, size=None, **kwargs): return super().__call__(a, b, r, alpha, T, size=size, **kwargs) @classmethod def rng_fn(cls, rng, a, b, r, alpha, T, size): - size = pm.distributions.shape_utils.to_tuple(size) + if size is None: + size = np.broadcast_shapes(a.shape, b.shape, r.shape, alpha.shape, T.shape) a = np.asarray(a) b = np.asarray(b) @@ -741,6 +668,37 @@ def _supp_shape_from_params(*args, **kwargs): class BGNBD(PositiveContinuous): + r"""Population-level distribution class for a discrete, non-contractual, Beta-Geometric/Negative-Binomial process. + + It is based on Fader, et al. in [1]_ and [2]_. + + .. math:: + + \mathbb{L}(\r, \alpha, \a, \b | x, t_x, T) &= + A_1 \cdot A_2 \cdot (A_3 + \delta_{x>0} A_4) + + where: + A_1 &= \frac{\Gamma(r+x)\alpha^r}{\Gamma(r)} \\ + A_2 &= \frac{\Gamma(a+b)\Gamma(b+x)}{\Gamma(b)\Gamma(a+b+x)} \\ + A_3 &= (\frac{1}{\alpha + T})^{r+x} \\ + A_4 &= (\frac{a}{b+x-1})(\frac{1}{\alpha + t_x})^{r+x} \\ + + ======== =============================================== + Support :math:`t_j >= 0` for :math:`j = 1, \dots,x` + Mean :math:`\mathbb{E}[X(n) | r, \alpha, a, b] = \frac{a+b-1}{a-1} \left[ 1 - (\frac{\alpha}{\alpha + T})^r \leftidx{_2}{F}{_1}(r,b;a+b-1;\frac{t}{\alpha + t}) \right] ` + ======== =============================================== + + References + ---------- + .. [1] Fader, Peter S., Bruce G.S. Hardie, and Jen Shang (2010), + "Counting Your Customers" the Easy Way: An Alternative to the Pareto/NBD Model + + Marketing Science, 24 (Spring), 275-284 + + .. 
[2] Implementing the BG/NBD Model for Customer Base Analysis in Excel http://brucehardie.com/notes/004/bgnbd_spreadsheet_note.pdf + + """ # noqa: E501 + rv_op = bg_nbd @classmethod diff --git a/tests/clv/models/test_beta_geo.py b/tests/clv/models/test_beta_geo.py index 25b823d19..8e3064da6 100644 --- a/tests/clv/models/test_beta_geo.py +++ b/tests/clv/models/test_beta_geo.py @@ -21,6 +21,7 @@ import xarray as xr from lifetimes.fitters.beta_geo_fitter import BetaGeoFitter +from pymc_marketing.clv.distributions import BGNBD from pymc_marketing.clv.models.beta_geo import BetaGeoModel from pymc_marketing.prior import Prior from tests.conftest import create_mock_fit, mock_sample @@ -213,6 +214,76 @@ def test_numerically_stable_logp( decimal=5, ) + @pytest.mark.parametrize("fit_type", ("map", "mcmc")) + def test_posterior_distributions(self, fit_type) -> None: + rng = np.random.default_rng(42) + dim_T = 2357 + + if fit_type == "map": + map_idata = self.model.idata.copy() + map_idata.posterior = map_idata.posterior.isel( + chain=slice(None, 1), draw=slice(None, 1) + ) + model = self.model._build_with_idata(map_idata) + # We expect 1000 draws to be sampled with MAP + expected_shape = (1, 1000) + expected_pop_dims = (1, 1000, dim_T, 2) + else: + model = self.model + expected_shape = (self.chains, self.draws) + expected_pop_dims = (self.chains, self.draws, dim_T, 2) + + data = model.data + customer_dropout = model.distribution_new_customer_dropout( + data, random_seed=rng + ) + customer_purchase_rate = model.distribution_new_customer_purchase_rate( + data, random_seed=rng + ) + customer_rec_freq = model.distribution_new_customer_recency_frequency( + data, random_seed=rng + ) + customer_rec = customer_rec_freq.sel(obs_var="recency") + customer_freq = customer_rec_freq.sel(obs_var="frequency") + + assert customer_dropout.shape == expected_shape + assert customer_purchase_rate.shape == expected_shape + assert customer_rec_freq.shape == expected_pop_dims + + lam_mean = self.r_true / self.alpha_true + lam_std = np.sqrt(self.r_true) / self.alpha_true + mu_mean = self.s_true / self.beta_true + mu_std = np.sqrt(self.s_true) / self.beta_true + ref_rec, ref_freq = pm.draw( + BGNBD.dist( + r=self.r_true, + alpha=self.alpha_true, + s=self.s_true, + beta=self.beta_true, + T=self.T, + ), + random_seed=rng, + ).T + + np.testing.assert_allclose( + customer_purchase_rate.mean(), + lam_mean, + rtol=0.5, + ) + np.testing.assert_allclose( + customer_purchase_rate.std(), + lam_std, + rtol=0.5, + ) + np.testing.assert_allclose(customer_dropout.mean(), mu_mean, rtol=0.5) + np.testing.assert_allclose(customer_dropout.std(), mu_std, rtol=0.5) + + np.testing.assert_allclose(customer_rec.mean(), ref_rec.mean(), rtol=0.5) + np.testing.assert_allclose(customer_rec.std(), ref_rec.std(), rtol=0.5) + + np.testing.assert_allclose(customer_freq.mean(), ref_freq.mean(), rtol=0.5) + np.testing.assert_allclose(customer_freq.std(), ref_freq.std(), rtol=0.5) + @pytest.mark.slow @pytest.mark.parametrize( "fit_method, rtol", @@ -532,7 +603,7 @@ def test_model_repr(self): "\nr~HalfFlat()" "\na~HalfFlat()" "\nb~HalfNormal(0,10)" - "\nlikelihood~Potential(f(r,alpha,b,a))" + "\nrecency_frequency~BGNBD(a,b,r,alpha,)" ) def test_distribution_new_customer(self) -> None: diff --git a/tests/clv/test_distributions.py b/tests/clv/test_distributions.py index 212712eb5..f0ade6a20 100644 --- a/tests/clv/test_distributions.py +++ b/tests/clv/test_distributions.py @@ -23,6 +23,7 @@ from pymc import Model from pymc_marketing.clv.distributions import ( + 
BGNBD, BetaGeoBetaBinom, ContContract, ContNonContract, @@ -443,3 +444,72 @@ def test_beta_geo_beta_binom_sample_prior( np.testing.assert_allclose(lt_frequency.mean(), recency.mean(), rtol=0.84) np.testing.assert_allclose(lt_recency.mean(), frequency.mean(), rtol=0.84) + + +class TestBGNBD: + def test_logp_matches_excel(self): + a = 0.793 + b = 2.426 + r = 0.243 + alpha = 4.414 + T = 38.86 + + x = np.array([2, 1, 0, 0, 0, 7, 1, 0, 2, 0, 5, 0, 0, 0, 0, 0, 10, 1]) + t_x = np.array( + [ + 30.43, + 1.71, + 0.00, + 0.00, + 0.00, + 29.43, + 5.00, + 0.00, + 35.71, + 0.00, + 24.43, + 0.00, + 0.00, + 0.00, + 0.00, + 0.00, + 34.14, + 4.86, + ] + ) + expected_logp = np.array( + [ + -9.4596, + -4.4711, + -0.5538, + -0.5538, + -0.5538, + -21.8644, + -4.8651, + -0.5538, + -9.5367, + -0.5538, + -17.3593, + -0.5538, + -0.5538, + -0.5538, + -0.5538, + -0.5538, + -27.3144, + -4.8520, + ] + ) + + value = np.concatenate([t_x[:, None], x[:, None]], axis=-1) + dist = BGNBD.dist( + a=a, + b=b, + r=r, + alpha=alpha, + T=T, + ) + np.testing.assert_allclose( + pm.logp(dist, value).eval(), + expected_logp, + rtol=2e-3, + ) From 715d5c463dc03af68ccece2cc4d72a257e4beef0 Mon Sep 17 00:00:00 2001 From: PabloRoque Date: Fri, 27 Dec 2024 16:30:17 +0100 Subject: [PATCH 03/37] Remove logp in terms of Potential --- pymc_marketing/clv/models/beta_geo.py | 47 --------------------------- 1 file changed, 47 deletions(-) diff --git a/pymc_marketing/clv/models/beta_geo.py b/pymc_marketing/clv/models/beta_geo.py index cf6f1716a..174d0b4d6 100644 --- a/pymc_marketing/clv/models/beta_geo.py +++ b/pymc_marketing/clv/models/beta_geo.py @@ -211,53 +211,6 @@ def build_model(self) -> None: # type: ignore[override] dims=["customer_id", "obs_var"], ) - # def logp(t_x, x, a, b, r, alpha, T): - # """ - # Compute the log-likelihood of the BG/NBD model. - - # The log-likelihood expression here aligns with expression (4) from [3] - # due to the possible numerical instability of expression (3). 
- # """ - # x_non_zero = x > 0 - - # # Refactored for numerical error - # d1 = ( - # pt.gammaln(r + x) - # - pt.gammaln(r) - # + pt.gammaln(a + b) - # + pt.gammaln(b + x) - # - pt.gammaln(b) - # - pt.gammaln(a + b + x) - # ) - - # d2 = r * pt.log(alpha) - (r + x) * pt.log(alpha + t_x) - # c3 = ((alpha + t_x) / (alpha + T)) ** (r + x) - # c4 = a / (b + x - 1) - - # logp = d1 + d2 + pt.log(c3 + pt.switch(x_non_zero, c4, 0)) - - # return check_parameters( - # logp, - # a > 0, - # b > 0, - # alpha > 0, - # r > 0, - # msg="a, b, alpha, r > 0", - # ) - - # pm.Potential( - # "likelihood", - # logp( - # x=self.data["frequency"], - # t_x=self.data["recency"], - # a=a, - # b=b, - # alpha=alpha, - # r=r, - # T=self.data["T"], - # ), - # ) - # TODO: delete this utility after API standardization is completed def _unload_params(self): trace = self.idata.posterior From e2a0607cd6608a66bcaa29dda740be36bbec7407 Mon Sep 17 00:00:00 2001 From: PabloRoque Date: Fri, 27 Dec 2024 16:33:42 +0100 Subject: [PATCH 04/37] Rename BGNBD -> BetaGeoNBD --- pymc_marketing/clv/distributions.py | 16 +++++++++++----- pymc_marketing/clv/models/beta_geo.py | 4 ++-- tests/clv/models/test_beta_geo.py | 4 ++-- tests/clv/test_distributions.py | 6 +++--- 4 files changed, 18 insertions(+), 12 deletions(-) diff --git a/pymc_marketing/clv/distributions.py b/pymc_marketing/clv/distributions.py index c6640331b..2ddf891d3 100644 --- a/pymc_marketing/clv/distributions.py +++ b/pymc_marketing/clv/distributions.py @@ -24,7 +24,13 @@ from pytensor.graph import vectorize_graph from pytensor.tensor.random.op import RandomVariable -__all__ = ["BGNBD", "BetaGeoBetaBinom", "ContContract", "ContNonContract", "ParetoNBD"] +__all__ = [ + "BetaGeoBetaBinom", + "BetaGeoNBD", + "ContContract", + "ContNonContract", + "ParetoNBD", +] class ContNonContractRV(RandomVariable): @@ -589,14 +595,14 @@ def logp_customer_died(t_x_i, x_i, alpha_i, beta_i, gamma_i, delta_i, T_i): ) -class BGNBDRV(RandomVariable): +class BetaGeoNBDRV(RandomVariable): name = "bg_nbd" signature = "(),(),(),(),()->(2)" # ndim_supp = 1 # ndims_params = [0, 0, 0, 0, 0] dtype = "floatX" - _print_name = ("BGNBD", "\\operatorname{BGNBD}") + _print_name = ("BetaGeoNBD", "\\operatorname{BetaGeoNBD}") # def make_node(self, rng, size, dtype, a, b, r, alpha, T): # a = pt.as_tensor_variable(a) @@ -664,10 +670,10 @@ def _supp_shape_from_params(*args, **kwargs): return (2,) -bg_nbd = BGNBDRV() +bg_nbd = BetaGeoNBDRV() -class BGNBD(PositiveContinuous): +class BetaGeoNBD(PositiveContinuous): r"""Population-level distribution class for a discrete, non-contractual, Beta-Geometric/Negative-Binomial process. It is based on Fader, et al. in [1]_ and [2]_. 
diff --git a/pymc_marketing/clv/models/beta_geo.py b/pymc_marketing/clv/models/beta_geo.py index 174d0b4d6..7c5bf24dc 100644 --- a/pymc_marketing/clv/models/beta_geo.py +++ b/pymc_marketing/clv/models/beta_geo.py @@ -24,7 +24,7 @@ from pytensor.tensor import TensorVariable from scipy.special import betaln, expit, hyp2f1 -from pymc_marketing.clv.distributions import BGNBD +from pymc_marketing.clv.distributions import BetaGeoNBD from pymc_marketing.clv.models.basic import CLVModel from pymc_marketing.clv.utils import to_xarray from pymc_marketing.model_config import ModelConfig @@ -198,7 +198,7 @@ def build_model(self) -> None: # type: ignore[override] a = pm.Deterministic("a", phi_dropout * kappa_dropout) b = pm.Deterministic("b", (1.0 - phi_dropout) * kappa_dropout) - BGNBD( + BetaGeoNBD( name="recency_frequency", a=a, b=b, diff --git a/tests/clv/models/test_beta_geo.py b/tests/clv/models/test_beta_geo.py index 8e3064da6..81a721e89 100644 --- a/tests/clv/models/test_beta_geo.py +++ b/tests/clv/models/test_beta_geo.py @@ -21,7 +21,7 @@ import xarray as xr from lifetimes.fitters.beta_geo_fitter import BetaGeoFitter -from pymc_marketing.clv.distributions import BGNBD +from pymc_marketing.clv.distributions import BetaGeoNBD from pymc_marketing.clv.models.beta_geo import BetaGeoModel from pymc_marketing.prior import Prior from tests.conftest import create_mock_fit, mock_sample @@ -255,7 +255,7 @@ def test_posterior_distributions(self, fit_type) -> None: mu_mean = self.s_true / self.beta_true mu_std = np.sqrt(self.s_true) / self.beta_true ref_rec, ref_freq = pm.draw( - BGNBD.dist( + BetaGeoNBD.dist( r=self.r_true, alpha=self.alpha_true, s=self.s_true, diff --git a/tests/clv/test_distributions.py b/tests/clv/test_distributions.py index f0ade6a20..ae83dd44d 100644 --- a/tests/clv/test_distributions.py +++ b/tests/clv/test_distributions.py @@ -23,8 +23,8 @@ from pymc import Model from pymc_marketing.clv.distributions import ( - BGNBD, BetaGeoBetaBinom, + BetaGeoNBD, ContContract, ContNonContract, ParetoNBD, @@ -446,7 +446,7 @@ def test_beta_geo_beta_binom_sample_prior( np.testing.assert_allclose(lt_recency.mean(), frequency.mean(), rtol=0.84) -class TestBGNBD: +class TestBetaGeoNBD: def test_logp_matches_excel(self): a = 0.793 b = 2.426 @@ -501,7 +501,7 @@ def test_logp_matches_excel(self): ) value = np.concatenate([t_x[:, None], x[:, None]], axis=-1) - dist = BGNBD.dist( + dist = BetaGeoNBD.dist( a=a, b=b, r=r, From ca621182a33a5d6c715999de21478ec6e16dadab Mon Sep 17 00:00:00 2001 From: PabloRoque Date: Sat, 28 Dec 2024 10:41:33 +0100 Subject: [PATCH 05/37] Add logp and test matching lifetimes --- pymc_marketing/clv/models/beta_geo.py | 30 ++++++++++ tests/clv/test_distributions.py | 79 +++++++++++++++++++++++++++ 2 files changed, 109 insertions(+) diff --git a/pymc_marketing/clv/models/beta_geo.py b/pymc_marketing/clv/models/beta_geo.py index 7c5bf24dc..a4cb78243 100644 --- a/pymc_marketing/clv/models/beta_geo.py +++ b/pymc_marketing/clv/models/beta_geo.py @@ -424,6 +424,36 @@ def expected_probability_alive( "chain", "draw", "customer_id", missing_dims="ignore" ) + @staticmethod + def _logp( + a: xarray.DataArray, + b: xarray.DataArray, + r: xarray.DataArray, + alpha: xarray.DataArray, + x: xarray.DataArray, + t_x: xarray.DataArray, + T: xarray.DataArray, + ) -> xarray.DataArray: + """Log-likelihood of the Pareto/NBD model. + + Utility function for using ParetoNBD log-likelihood in predictive methods. 
+ """ + # Add one dummy dimension to the right of the scalar parameters, so they broadcast with the `T` vector + bg_bnd_dist = BetaGeoNBD.dist( + a=a.values[..., None], + b=b.values[..., None], + r=r.values[..., None], + alpha=alpha.values[..., None] + if "customer_id" not in alpha.dims + else alpha.values, + T=T.values, + ) + values = np.vstack((t_x.values, x.values)).T + # TODO: Instead of compiling this function everytime this method is called + # we could compile it once (with mutable inputs) and cache it for reuse with new inputs. + loglike = pm.logp(bg_bnd_dist, values).eval() + return xarray.DataArray(data=loglike, dims=("chain", "draw", "customer_id")) + def expected_probability_no_purchase( self, t: int, diff --git a/tests/clv/test_distributions.py b/tests/clv/test_distributions.py index ae83dd44d..a0af8071f 100644 --- a/tests/clv/test_distributions.py +++ b/tests/clv/test_distributions.py @@ -17,6 +17,7 @@ import pytensor.tensor as pt import pytest from lifetimes import BetaGeoBetaBinomFitter as BGBBF +from lifetimes import BetaGeoFitter as BG from lifetimes import ParetoNBDFitter as PF from lifetimes.generate_data import beta_geometric_beta_binom_model from numpy.testing import assert_almost_equal @@ -447,7 +448,85 @@ def test_beta_geo_beta_binom_sample_prior( class TestBetaGeoNBD: + @pytest.mark.parametrize( + "value, r, alpha, a, b, T", + [ + ( + np.array([1.5, 1]), + 0.55, + 10.58, + 0.61, + 11.67, + 12, + ), + ( + np.array([1.5, 1]), + [0.45, 0.55], + 10.58, + 0.61, + 11.67, + 12, + ), + ( + np.array([1.5, 1]), + [0.45, 0.55], + 10.58, + [0.71, 0.61], + 11.67, + 12, + ), + ( + np.array([[1.5, 1], [5.3, 4], [6, 2]]), + 0.55, + 11.67, + 0.61, + 10.58, + [12, 10, 8], + ), + ( + np.array([1.5, 1]), + 0.55, + 10.58, + 0.61, + np.full((5, 3), 11.67), + 12, + ), + ], + ) + def test_bg_nbd(self, value, r, alpha, a, b, T): + # weights = np.ones_like(value, dtype=int) + + def lifetimes_wrapper( + r, alpha, a, b, freq, rec, T, weights=np.array(1), penalizer_coef=0.0 + ): + log_r = np.log(r) + log_alpha = np.log(alpha) + log_a = np.log(a) + log_b = np.log(b) + + """Simple wrapper for Vectorizing the lifetimes likelihood function. + Lifetimes uses the negative log likelihood, so we need to negate it to match PyMC3's logp. + """ + return -1.0 * BG._negative_log_likelihood( + (log_r, log_alpha, log_a, log_b), freq, rec, T, weights, penalizer_coef + ) + + vectorized_logp = np.vectorize(lifetimes_wrapper) + + with Model(): + bg_nbd = BetaGeoNBD("bg_nbd", a=a, b=b, r=r, alpha=alpha, T=T) + pt = {"bg_nbd": value} + + assert_almost_equal( + pm.logp(bg_nbd, value).eval(), + vectorized_logp(r, alpha, a, b, value[..., 1], value[..., 0], T), + decimal=6, + err_msg=str(pt), + ) + def test_logp_matches_excel(self): + # Expected logp values can be found in excel file in http://brucehardie.com/notes/004/ + # Spreadsheet: BGNBD Estimation a = 0.793 b = 2.426 r = 0.243 From ed4a0f00ac1b5f76b34f14f3bc29864ec7be8c15 Mon Sep 17 00:00:00 2001 From: PabloRoque Date: Mon, 30 Dec 2024 11:16:28 +0100 Subject: [PATCH 06/37] Add logp param.type.ndim > 1. Add logp pt.switch. 
Related tests --- pymc_marketing/clv/distributions.py | 24 +++++++++++++++++++++--- tests/clv/test_distributions.py | 17 +++++++++++++++++ 2 files changed, 38 insertions(+), 3 deletions(-) diff --git a/pymc_marketing/clv/distributions.py b/pymc_marketing/clv/distributions.py index 2ddf891d3..1969f36a3 100644 --- a/pymc_marketing/clv/distributions.py +++ b/pymc_marketing/clv/distributions.py @@ -714,8 +714,14 @@ def dist(cls, a, b, r, alpha, T, **kwargs): def logp(value, a, b, r, alpha, T): """Log-likelihood of the distribution.""" - t_x = value[..., 0] - x = value[..., 1] + t_x = pt.atleast_1d(value[..., 0]) + x = pt.atleast_1d(value[..., 1]) + + for param in (t_x, x, a, b, r, alpha, T): + if param.type.ndim > 1: + raise NotImplementedError( + f"BetaGeoNBD logp only implemented for vector parameters, got ndim={param.type.ndim}" + ) x_non_zero = x > 0 @@ -734,11 +740,23 @@ def logp(value, a, b, r, alpha, T): logp = d1 + d2 + pt.log(c3 + pt.switch(x_non_zero, c4, 0)) + logp = pt.switch( + pt.or_( + pt.or_( + pt.lt(t_x, 0), + pt.lt(x, 0), + ), + pt.gt(t_x, T), + ), + -np.inf, + logp, + ) + return check_parameters( logp, a > 0, b > 0, alpha > 0, r > 0, - msg="a, b, alpha, r > 0", + msg="a > 0, b > 0, alpha > 0, r > 0", ) diff --git a/tests/clv/test_distributions.py b/tests/clv/test_distributions.py index a0af8071f..93cacfe2f 100644 --- a/tests/clv/test_distributions.py +++ b/tests/clv/test_distributions.py @@ -592,3 +592,20 @@ def test_logp_matches_excel(self): expected_logp, rtol=2e-3, ) + + def test_invalid_value_logp(self): + bg_nbd = BetaGeoNBD.dist(a=1.20, b=0.75, r=0.66, alpha=2.78, T=6) + value = pt.vector("value", shape=(2,)) + logp = pm.logp(bg_nbd, value) + + logp_fn = pytensor.function([value], logp) + assert logp_fn(np.array([3, -1])) == -np.inf + assert logp_fn(np.array([-1, 1.5])) == -np.inf + assert logp_fn(np.array([11, 1.5])) == -np.inf + + def test_notimplemented_logp(self): + dist = BetaGeoNBD.dist(a=1, b=1, r=2, alpha=2, T=10) + invalid_value = np.broadcast_to([1, 3], (4, 3, 2)) + + with pytest.raises(NotImplementedError): + pm.logp(dist, invalid_value) From e65aa6a516fe8a3946cef3d82d6dc87203b77f14 Mon Sep 17 00:00:00 2001 From: PabloRoque Date: Tue, 31 Dec 2024 10:19:47 +0100 Subject: [PATCH 07/37] Add test_bg_nbd_sample_prior --- tests/clv/test_distributions.py | 34 +++++++++++++++++++++++++++++++++ 1 file changed, 34 insertions(+) diff --git a/tests/clv/test_distributions.py b/tests/clv/test_distributions.py index 93cacfe2f..f50aab2ca 100644 --- a/tests/clv/test_distributions.py +++ b/tests/clv/test_distributions.py @@ -609,3 +609,37 @@ def test_notimplemented_logp(self): with pytest.raises(NotImplementedError): pm.logp(dist, invalid_value) + + @pytest.mark.parametrize( + "a_size, b_size, r_size, alpha_size, bg_nbd_size, expected_size", + [ + (None, None, None, None, None, (2,)), + ((5,), None, None, None, None, (5, 2)), + (None, (5,), None, None, (5,), (5, 2)), + (None, None, (5, 1), (1, 3), (5, 3), (5, 3, 2)), + (None, None, None, None, (5, 3), (5, 3, 2)), + ], + ) + def test_bg_nbd_sample_prior( + self, a_size, b_size, r_size, alpha_size, bg_nbd_size, expected_size + ): + with Model(): + a = pm.HalfNormal(name="a", sigma=10, size=a_size) + b = pm.HalfNormal(name="b", sigma=10, size=b_size) + r = pm.HalfNormal(name="r", sigma=10, size=r_size) + alpha = pm.HalfNormal(name="alpha", sigma=10, size=alpha_size) + + T = pm.Data(name="T", value=np.array(10)) + + BetaGeoNBD( + name="bg_nbd", + a=a, + b=b, + r=r, + alpha=alpha, + T=T, + size=bg_nbd_size, + ) + prior = 
pm.sample_prior_predictive(samples=100) + + assert prior["prior"]["bg_nbd"][0].shape == (100, *expected_size) From b1b891e44b009695e6264244c06cf64fe7605f1f Mon Sep 17 00:00:00 2001 From: PabloRoque Date: Thu, 2 Jan 2025 10:29:32 +0100 Subject: [PATCH 08/37] Add _distribution_new_customers and related test. Rename population_dropout|purchase_rate as dropout|purchase_rate to consolidate naming among models --- pymc_marketing/clv/models/beta_geo.py | 96 ++++++++++++++++++++++----- tests/clv/models/test_beta_geo.py | 3 +- 2 files changed, 81 insertions(+), 18 deletions(-) diff --git a/pymc_marketing/clv/models/beta_geo.py b/pymc_marketing/clv/models/beta_geo.py index a4cb78243..54273bd8a 100644 --- a/pymc_marketing/clv/models/beta_geo.py +++ b/pymc_marketing/clv/models/beta_geo.py @@ -1,4 +1,4 @@ -# Copyright 2024 The PyMC Labs Developers +# Copyright 2025 The PyMC Labs Developers # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -15,6 +15,7 @@ import warnings from collections.abc import Sequence +from typing import Literal import numpy as np import pandas as pd @@ -581,30 +582,91 @@ def expected_purchases_new_customer( def _distribution_new_customers( self, + data: pd.DataFrame | None = None, + *, + T: int | np.ndarray | pd.Series | None = None, random_seed: RandomState | None = None, - var_names: Sequence[str] = ("population_dropout", "population_purchase_rate"), + var_names: Sequence[ + Literal["dropout", "purchase_rate", "recency_frequency"] + ] = ("dropout", "purchase_rate", "recency_frequency"), + n_samples: int = 1000, ) -> xarray.Dataset: - with pm.Model(): + """Compute posterior predictive samples of dropout, purchase rate and frequency/recency of new customers. + + In a model with covariates, if `data` is not specified, the dataset used for fitting will be used and + a prediction will be computed for a *new customer* with each set of covariates. + *This is not a conditional prediction for observed customers!* + + Parameters + ---------- + data : ~pandas.DataFrame, Optional + DataFrame containing the following columns: + + * `customer_id`: Unique customer identifier + * `T`: Time between the first purchase and the end of the observation period + + If not provided, predictions will be ran with data used to fit model. + T : array_like, optional + time between the first purchase and the end of the observation period. + Not needed if `data` parameter is provided with a `T` column. + random_seed : ~numpy.random.RandomState, optional + Random state to use for sampling. + var_names : sequence of str, optional + Names of the variables to sample from. Defaults to ["dropout", "purchase_rate", "recency_frequency"]. + n_samples : int, optional + Number of samples to generate. 
Defaults to 1000 + + """ + if data is None: + data = self.data + + if T is not None: + data = data.assign(T=T) + + dataset = self._extract_predictive_variables(data, customer_varnames=["T"]) + T = dataset["T"].values + # Delete "T" so we can pass dataset directly to `sample_posterior_predictive` + del dataset["T"] + + if dataset.sizes["chain"] == 1 and dataset.sizes["draw"] == 1: + # For map fit add a dummy draw dimension + dataset = dataset.squeeze("draw").expand_dims(draw=range(n_samples)) + + coords = self.model.coords.copy() # type: ignore + coords["customer_id"] = data["customer_id"] + + with pm.Model(coords=coords): a = pm.HalfFlat("a") b = pm.HalfFlat("b") alpha = pm.HalfFlat("alpha") r = pm.HalfFlat("r") - fit_result = self.fit_result - if fit_result.sizes["chain"] == 1 and fit_result.sizes["draw"] == 1: - # For map fit add a dummy draw dimension - fit_result = self.fit_result.squeeze("draw").expand_dims( - draw=range(1000) - ) + # fit_result = self.fit_result + # if fit_result.sizes["chain"] == 1 and fit_result.sizes["draw"] == 1: + # # For map fit add a dummy draw dimension + # fit_result = self.fit_result.squeeze("draw").expand_dims( + # draw=range(1000) + # ) - pm.Beta("population_dropout", alpha=a, beta=b) - pm.Gamma("population_purchase_rate", alpha=r, beta=alpha) + pm.Beta("dropout", alpha=a, beta=b) + pm.Gamma("purchase_rate", alpha=r, beta=alpha) + + BetaGeoNBD( + name="recency_frequency", + a=a, + b=b, + r=r, + alpha=alpha, + T=T, + dims=["customer_id", "obs_var"], + ) return pm.sample_posterior_predictive( - fit_result, + dataset, var_names=var_names, random_seed=random_seed, - ).posterior_predictive + predictions=True, + ).predictions def distribution_new_customer_dropout( self, @@ -627,8 +689,8 @@ def distribution_new_customer_dropout( """ return self._distribution_new_customers( random_seed=random_seed, - var_names=["population_dropout"], - )["population_dropout"] + var_names=["dropout"], + )["dropout"] def distribution_new_customer_purchase_rate( self, @@ -652,5 +714,5 @@ def distribution_new_customer_purchase_rate( """ return self._distribution_new_customers( random_seed=random_seed, - var_names=["population_purchase_rate"], - )["population_purchase_rate"] + var_names=["purchase_rate"], + )["purchase_rate"] diff --git a/tests/clv/models/test_beta_geo.py b/tests/clv/models/test_beta_geo.py index 81a721e89..eca199d8a 100644 --- a/tests/clv/models/test_beta_geo.py +++ b/tests/clv/models/test_beta_geo.py @@ -1,4 +1,4 @@ -# Copyright 2024 The PyMC Labs Developers +# Copyright 2025 The PyMC Labs Developers # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
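
As a rough usage sketch of the new-customer prediction API added above: the toy data frame, MAP fit, and parameter values below are purely illustrative, and the column names follow the docstrings in this patch.

import pandas as pd

from pymc_marketing.clv.models.beta_geo import BetaGeoModel

# Hypothetical RFM summary for three customers (illustrative only)
data = pd.DataFrame(
    {
        "customer_id": [0, 1, 2],
        "frequency": [2, 0, 5],
        "recency": [30.4, 0.0, 24.4],
        "T": [38.9, 38.9, 38.9],
    }
)

model = BetaGeoModel(data=data)
model.fit(fit_method="map")

# Posterior-predictive draws for hypothetical new customers
new_customers = model.distribution_new_customer(
    data,
    T=data["T"],
    var_names=["dropout", "purchase_rate", "recency_frequency"],
)
recency = new_customers["recency_frequency"].sel(obs_var="recency")
frequency = new_customers["recency_frequency"].sel(obs_var="frequency")
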
@@ -610,6 +610,7 @@ def test_distribution_new_customer(self) -> None: mock_model = BetaGeoModel( data=self.data, ) + mock_model.build_model() mock_model.idata = az.from_dict( { "a": [self.a_true], From 866ef8e218a88d5ccfdba3f500a53e585ad10e2f Mon Sep 17 00:00:00 2001 From: PabloRoque Date: Thu, 2 Jan 2025 10:32:27 +0100 Subject: [PATCH 09/37] Adjust test_model_repr expected result to BetaGeoNBD instead of BGNBD --- tests/clv/models/test_beta_geo.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/clv/models/test_beta_geo.py b/tests/clv/models/test_beta_geo.py index eca199d8a..f360d1c6f 100644 --- a/tests/clv/models/test_beta_geo.py +++ b/tests/clv/models/test_beta_geo.py @@ -603,7 +603,7 @@ def test_model_repr(self): "\nr~HalfFlat()" "\na~HalfFlat()" "\nb~HalfNormal(0,10)" - "\nrecency_frequency~BGNBD(a,b,r,alpha,)" + "\nrecency_frequency~BetaGeoNBD(a,b,r,alpha,)" ) def test_distribution_new_customer(self) -> None: From df0403966840d02d805a98d2086e090ef5dab405 Mon Sep 17 00:00:00 2001 From: PabloRoque Date: Thu, 2 Jan 2025 15:21:07 +0100 Subject: [PATCH 10/37] Improve distribution_new_customer, distribution_new_customer_purchase_rate. Introduce distribution_new_customer_recency_frequency. Improve tests --- pymc_marketing/clv/models/beta_geo.py | 56 +++++++++++++++++++++++++-- tests/clv/models/test_beta_geo.py | 19 +++++---- 2 files changed, 65 insertions(+), 10 deletions(-) diff --git a/pymc_marketing/clv/models/beta_geo.py b/pymc_marketing/clv/models/beta_geo.py index 54273bd8a..35ba468b9 100644 --- a/pymc_marketing/clv/models/beta_geo.py +++ b/pymc_marketing/clv/models/beta_geo.py @@ -580,7 +580,7 @@ def expected_purchases_new_customer( "chain", "draw", "customer_id", missing_dims="ignore" ) - def _distribution_new_customers( + def distribution_new_customer( self, data: pd.DataFrame | None = None, *, @@ -670,6 +670,8 @@ def _distribution_new_customers( def distribution_new_customer_dropout( self, + data: pd.DataFrame | None = None, + *, random_seed: RandomState | None = None, ) -> xarray.Dataset: """Sample the Beta distribution for the population-level dropout rate. @@ -687,13 +689,16 @@ def distribution_new_customer_dropout( Dataset containing the posterior samples for the population-level dropout rate. """ - return self._distribution_new_customers( + return self.distribution_new_customer( + data=data, random_seed=random_seed, var_names=["dropout"], )["dropout"] def distribution_new_customer_purchase_rate( self, + data: pd.DataFrame | None = None, + *, random_seed: RandomState | None = None, ) -> xarray.Dataset: """Sample the Gamma distribution for the population-level purchase rate. @@ -712,7 +717,52 @@ def distribution_new_customer_purchase_rate( Dataset containing the posterior samples for the population-level purchase rate. """ - return self._distribution_new_customers( + return self.distribution_new_customer( + data=data, random_seed=random_seed, var_names=["purchase_rate"], )["purchase_rate"] + + def distribution_new_customer_recency_frequency( + self, + data: pd.DataFrame | None = None, + *, + T: int | np.ndarray | pd.Series | None = None, + random_seed: RandomState | None = None, + n_samples: int = 1000, + ) -> xarray.Dataset: + """BG/NBD process representing purchases across the customer population. + + This is the distribution of purchase frequencies given 'T' observation periods for each customer. 
+ + Parameters + ---------- + data : ~pandas.DataFrame, optional + DataFrame containing the following columns: + + * `customer_id`: Unique customer identifier + * `T`: Time between the first purchase and the end of the observation period. + * All covariate columns specified when model was initialized. + + If not provided, the method will use the fit dataset. + T : array_like, optional + Number of observation periods for each customer. If not provided, T values from fit dataset will be used. + Not required if `data` Dataframe contains a `T` column. + random_seed : ~numpy.random.RandomState, optional + Random state to use for sampling. + n_samples : int, optional + Number of samples to generate. Defaults to 1000. + + Returns + ------- + ~xarray.Dataset + Dataset containing the posterior samples for the customer population. + + """ + return self.distribution_new_customer( + data=data, + T=T, + random_seed=random_seed, + var_names=["recency_frequency"], + n_samples=n_samples, + )["recency_frequency"] diff --git a/tests/clv/models/test_beta_geo.py b/tests/clv/models/test_beta_geo.py index f360d1c6f..4c58918c3 100644 --- a/tests/clv/models/test_beta_geo.py +++ b/tests/clv/models/test_beta_geo.py @@ -51,6 +51,7 @@ def setup_class(cls): # Instantiate model with CDNOW data for testing cls.model = BetaGeoModel(cls.data) + cls.model.build_model() # Also instantiate lifetimes model for comparison cls.lifetimes_model = BetaGeoFitter() @@ -250,16 +251,20 @@ def test_posterior_distributions(self, fit_type) -> None: assert customer_purchase_rate.shape == expected_shape assert customer_rec_freq.shape == expected_pop_dims + # N = 1000 + # p = pm.Beta.dist(self.a_true, self.b_true, size=N) + # theta = pm.Beta.dist(self.gamma_true, self.delta_true, size=N) + lam_mean = self.r_true / self.alpha_true lam_std = np.sqrt(self.r_true) / self.alpha_true - mu_mean = self.s_true / self.beta_true - mu_std = np.sqrt(self.s_true) / self.beta_true + # mu_mean = self.s_true / self.beta_true + # mu_std = np.sqrt(self.s_true) / self.beta_true ref_rec, ref_freq = pm.draw( BetaGeoNBD.dist( + a=self.a_true, + b=self.b_true, r=self.r_true, alpha=self.alpha_true, - s=self.s_true, - beta=self.beta_true, T=self.T, ), random_seed=rng, @@ -275,8 +280,8 @@ def test_posterior_distributions(self, fit_type) -> None: lam_std, rtol=0.5, ) - np.testing.assert_allclose(customer_dropout.mean(), mu_mean, rtol=0.5) - np.testing.assert_allclose(customer_dropout.std(), mu_std, rtol=0.5) + # np.testing.assert_allclose(customer_dropout.mean(), mu_mean, rtol=0.5) + # np.testing.assert_allclose(customer_dropout.std(), mu_std, rtol=0.5) np.testing.assert_allclose(customer_rec.mean(), ref_rec.mean(), rtol=0.5) np.testing.assert_allclose(customer_rec.std(), ref_rec.std(), rtol=0.5) @@ -284,7 +289,7 @@ def test_posterior_distributions(self, fit_type) -> None: np.testing.assert_allclose(customer_freq.mean(), ref_freq.mean(), rtol=0.5) np.testing.assert_allclose(customer_freq.std(), ref_freq.std(), rtol=0.5) - @pytest.mark.slow + # @pytest.mark.slow @pytest.mark.parametrize( "fit_method, rtol", [ From 86364d8241080aed69b7afa1f73b39ee0e67817f Mon Sep 17 00:00:00 2001 From: PabloRoque Date: Thu, 2 Jan 2025 15:24:32 +0100 Subject: [PATCH 11/37] Revert @pytest.mark.slow to in test_model_convergence --- tests/clv/models/test_beta_geo.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/clv/models/test_beta_geo.py b/tests/clv/models/test_beta_geo.py index 4c58918c3..331c63344 100644 --- a/tests/clv/models/test_beta_geo.py +++ 
b/tests/clv/models/test_beta_geo.py @@ -289,7 +289,7 @@ def test_posterior_distributions(self, fit_type) -> None: np.testing.assert_allclose(customer_freq.mean(), ref_freq.mean(), rtol=0.5) np.testing.assert_allclose(customer_freq.std(), ref_freq.std(), rtol=0.5) - # @pytest.mark.slow + @pytest.mark.slow @pytest.mark.parametrize( "fit_method, rtol", [ From 256b607d192b6c64575577e35833aaf2ff14eae9 Mon Sep 17 00:00:00 2001 From: PabloRoque Date: Thu, 2 Jan 2025 16:30:16 +0100 Subject: [PATCH 12/37] Revert distribution changes related to #1269 --- pymc_marketing/clv/distributions.py | 48 ++++++++++++++++++++++------- 1 file changed, 37 insertions(+), 11 deletions(-) diff --git a/pymc_marketing/clv/distributions.py b/pymc_marketing/clv/distributions.py index 1969f36a3..04c15dba1 100644 --- a/pymc_marketing/clv/distributions.py +++ b/pymc_marketing/clv/distributions.py @@ -1,4 +1,4 @@ -# Copyright 2024 The PyMC Labs Developers +# Copyright 2025 The PyMC Labs Developers # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -16,6 +16,7 @@ from functools import reduce import numpy as np +import pymc as pm import pytensor.tensor as pt from pymc.distributions.continuous import PositiveContinuous from pymc.distributions.dist_math import betaln, check_parameters @@ -35,16 +36,26 @@ class ContNonContractRV(RandomVariable): name = "continuous_non_contractual" - signature = "(),(),()->(2)" + ndim_supp = 1 + ndims_params = [0, 0, 0, 0] dtype = "floatX" _print_name = ("ContNonContract", "\\operatorname{ContNonContract}") - def __call__(self, lam, p, T, size=None, **kwargs): - return super().__call__(lam, p, T, size=size, **kwargs) + def make_node(self, rng, size, dtype, lam, p, T): + T = pt.as_tensor_variable(T) + + return super().make_node(rng, size, dtype, lam, p, T) @classmethod def rng_fn(cls, rng, lam, p, T, size): - if size is None: + size = pm.distributions.shape_utils.to_tuple(size) + + # TODO: broadcast sizes + lam = np.asarray(lam) + p = np.asarray(p) + T = np.asarray(T) + + if size == (): size = np.broadcast_shapes(lam.shape, p.shape, T.shape) lam = np.broadcast_to(lam, size) @@ -71,6 +82,9 @@ def rng_fn(cls, rng, lam, p, T, size): return np.stack([t_x, x], axis=-1) + def _supp_shape_from_params(*args, **kwargs): + return (2,) + continuous_non_contractual = ContNonContractRV() @@ -147,16 +161,29 @@ def logp(value, lam, p, T): class ContContractRV(RandomVariable): name = "continuous_contractual" - signature = "(),(),()->(3)" + ndim_supp = 1 + ndims_params = [0, 0, 0, 0] dtype = "floatX" _print_name = ("ContinuousContractual", "\\operatorname{ContinuousContractual}") + def make_node(self, rng, size, dtype, lam, p, T): + T = pt.as_tensor_variable(T) + + return super().make_node(rng, size, dtype, lam, p, T) + def __call__(self, lam, p, T, size=None, **kwargs): return super().__call__(lam, p, T, size=size, **kwargs) @classmethod def rng_fn(cls, rng, lam, p, T, size): - if size is None: + size = pm.distributions.shape_utils.to_tuple(size) + + # To do: broadcast sizes + lam = np.asarray(lam) + p = np.asarray(p) + T = np.asarray(T) + + if size == (): size = np.broadcast_shapes(lam.shape, p.shape, T.shape) lam = np.broadcast_to(lam, size) @@ -236,15 +263,14 @@ def logp(value, lam, p, T): ) logp = pt.switch( - reduce( - pt.bitwise_or, - [ + pt.any( + ( zero_observations, pt.lt(t_x, 0), pt.lt(x, 0), pt.gt(t_x, T), pt.bitwise_not(pt.bitwise_or(pt.eq(churn, 0), pt.eq(churn, 1))), - ], + ), ), -np.inf, logp, From 
5bf47d7d00440be60fc0646dff64086014391dde Mon Sep 17 00:00:00 2001 From: PabloRoque Date: Thu, 2 Jan 2025 16:32:57 +0100 Subject: [PATCH 13/37] Revert more changes related to #1269 --- pymc_marketing/clv/distributions.py | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/pymc_marketing/clv/distributions.py b/pymc_marketing/clv/distributions.py index 04c15dba1..d51c07ac4 100644 --- a/pymc_marketing/clv/distributions.py +++ b/pymc_marketing/clv/distributions.py @@ -13,8 +13,6 @@ # limitations under the License. """Distributions for the CLV module.""" -from functools import reduce - import numpy as np import pymc as pm import pytensor.tensor as pt @@ -137,14 +135,13 @@ def logp(value, lam, p, T): ) logp = pt.switch( - reduce( - pt.bitwise_or, - [ + pt.any( + ( pt.and_(pt.ge(t_x, 0), zero_observations), pt.lt(t_x, 0), pt.lt(x, 0), pt.gt(t_x, T), - ], + ), ), -np.inf, logp, From 7aa31f782626640ee3c60113539b7d7c3788578b Mon Sep 17 00:00:00 2001 From: PabloRoque Date: Thu, 2 Jan 2025 17:04:47 +0100 Subject: [PATCH 14/37] Revert BetaGeoBetaBinomRV changes --- pymc_marketing/clv/distributions.py | 22 ++++++++++++++++++++-- 1 file changed, 20 insertions(+), 2 deletions(-) diff --git a/pymc_marketing/clv/distributions.py b/pymc_marketing/clv/distributions.py index d51c07ac4..b7a9d7f71 100644 --- a/pymc_marketing/clv/distributions.py +++ b/pymc_marketing/clv/distributions.py @@ -464,16 +464,34 @@ def logp(value, r, alpha, s, beta, T): class BetaGeoBetaBinomRV(RandomVariable): name = "beta_geo_beta_binom" - signature = "(),(),(),(),()->(2)" + ndim_supp = 1 + ndims_params = [0, 0, 0, 0, 0] dtype = "floatX" _print_name = ("BetaGeoBetaBinom", "\\operatorname{BetaGeoBetaBinom}") + def make_node(self, rng, size, dtype, alpha, beta, gamma, delta, T): + alpha = pt.as_tensor_variable(alpha) + beta = pt.as_tensor_variable(beta) + gamma = pt.as_tensor_variable(gamma) + delta = pt.as_tensor_variable(delta) + T = pt.as_tensor_variable(T) + + return super().make_node(rng, size, dtype, alpha, beta, gamma, delta, T) + def __call__(self, alpha, beta, gamma, delta, T, size=None, **kwargs): return super().__call__(alpha, beta, gamma, delta, T, size=size, **kwargs) @classmethod def rng_fn(cls, rng, alpha, beta, gamma, delta, T, size) -> np.ndarray: - if size is None: + size = pm.distributions.shape_utils.to_tuple(size) + + alpha = np.asarray(alpha) + beta = np.asarray(beta) + gamma = np.asarray(gamma) + delta = np.asarray(delta) + T = np.asarray(T) + + if size == (): size = np.broadcast_shapes( alpha.shape, beta.shape, gamma.shape, delta.shape, T.shape ) From ef5599825e587643c1689a063227fb418dcc00da Mon Sep 17 00:00:00 2001 From: PabloRoque Date: Thu, 2 Jan 2025 17:13:02 +0100 Subject: [PATCH 15/37] Revert ParetoNBDRV changes --- pymc_marketing/clv/distributions.py | 22 ++++++++++++++++++++-- 1 file changed, 20 insertions(+), 2 deletions(-) diff --git a/pymc_marketing/clv/distributions.py b/pymc_marketing/clv/distributions.py index b7a9d7f71..565a04978 100644 --- a/pymc_marketing/clv/distributions.py +++ b/pymc_marketing/clv/distributions.py @@ -285,16 +285,34 @@ def logp(value, lam, p, T): class ParetoNBDRV(RandomVariable): name = "pareto_nbd" - signature = "(),(),(),(),()->(2)" + ndim_supp = 1 + ndims_params = [0, 0, 0, 0, 0] dtype = "floatX" _print_name = ("ParetoNBD", "\\operatorname{ParetoNBD}") + def make_node(self, rng, size, dtype, r, alpha, s, beta, T): + r = pt.as_tensor_variable(r) + alpha = pt.as_tensor_variable(alpha) + s = pt.as_tensor_variable(s) + beta = pt.as_tensor_variable(beta) 
+ T = pt.as_tensor_variable(T) + + return super().make_node(rng, size, dtype, r, alpha, s, beta, T) + def __call__(self, r, alpha, s, beta, T, size=None, **kwargs): return super().__call__(r, alpha, s, beta, T, size=size, **kwargs) @classmethod def rng_fn(cls, rng, r, alpha, s, beta, T, size): - if size is None: + size = pm.distributions.shape_utils.to_tuple(size) + + r = np.asarray(r) + alpha = np.asarray(alpha) + s = np.asarray(s) + beta = np.asarray(beta) + T = np.asarray(T) + + if size == (): size = np.broadcast_shapes( r.shape, alpha.shape, s.shape, beta.shape, T.shape ) From 4f91de5565808d354d390eaf9ce4c84d31d36356 Mon Sep 17 00:00:00 2001 From: PabloRoque Date: Thu, 2 Jan 2025 17:16:34 +0100 Subject: [PATCH 16/37] Docstring cleanup --- pymc_marketing/clv/models/beta_geo.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pymc_marketing/clv/models/beta_geo.py b/pymc_marketing/clv/models/beta_geo.py index 35ba468b9..80cfade5b 100644 --- a/pymc_marketing/clv/models/beta_geo.py +++ b/pymc_marketing/clv/models/beta_geo.py @@ -435,9 +435,9 @@ def _logp( t_x: xarray.DataArray, T: xarray.DataArray, ) -> xarray.DataArray: - """Log-likelihood of the Pareto/NBD model. + """Log-likelihood of the BG/NBD model. - Utility function for using ParetoNBD log-likelihood in predictive methods. + Utility function for using BetaGeoNBD log-likelihood in predictive methods. """ # Add one dummy dimension to the right of the scalar parameters, so they broadcast with the `T` vector bg_bnd_dist = BetaGeoNBD.dist( From 3c5260cfd85a547d2df7c94805a1c41c6ff1736f Mon Sep 17 00:00:00 2001 From: PabloRoque Date: Thu, 2 Jan 2025 17:20:35 +0100 Subject: [PATCH 17/37] Revert changes in ContContract dist --- pymc_marketing/clv/distributions.py | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/pymc_marketing/clv/distributions.py b/pymc_marketing/clv/distributions.py index 565a04978..a8239d804 100644 --- a/pymc_marketing/clv/distributions.py +++ b/pymc_marketing/clv/distributions.py @@ -254,19 +254,24 @@ def logp(value, lam, p, T): logp += churn * pt.log(p) + (1 - churn) * (pt.log(1 - p) - lam * (T - t_x)) logp = pt.switch( - zero_observations, - -lam * T, + pt.any(pt.or_(pt.lt(t_x, 0), zero_observations)), + -np.inf, logp, ) + logp = pt.switch( + pt.all( + pt.or_(pt.eq(churn, 0), pt.eq(churn, 1)), + ), + logp, + -np.inf, + ) logp = pt.switch( pt.any( ( - zero_observations, pt.lt(t_x, 0), pt.lt(x, 0), pt.gt(t_x, T), - pt.bitwise_not(pt.bitwise_or(pt.eq(churn, 0), pt.eq(churn, 1))), ), ), -np.inf, From 21cb076156828040343e1a45f29e853829ce1da0 Mon Sep 17 00:00:00 2001 From: PabloRoque Date: Thu, 2 Jan 2025 17:22:54 +0100 Subject: [PATCH 18/37] Clean ContContract changes --- pymc_marketing/clv/distributions.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/pymc_marketing/clv/distributions.py b/pymc_marketing/clv/distributions.py index a8239d804..0b1bc4fab 100644 --- a/pymc_marketing/clv/distributions.py +++ b/pymc_marketing/clv/distributions.py @@ -253,6 +253,11 @@ def logp(value, lam, p, T): logp = (x - 1) * pt.log(1 - p) + x * pt.log(lam) - lam * t_x logp += churn * pt.log(p) + (1 - churn) * (pt.log(1 - p) - lam * (T - t_x)) + logp = pt.switch( + zero_observations, + -lam * T, + logp, + ) logp = pt.switch( pt.any(pt.or_(pt.lt(t_x, 0), zero_observations)), -np.inf, From 55da4d14e19e471119666a7cbeee82daa76f114f Mon Sep 17 00:00:00 2001 From: PabloRoque Date: Thu, 2 Jan 2025 17:26:23 +0100 Subject: [PATCH 19/37] Revert deletion _supp_shape_from_params --- 
 pymc_marketing/clv/distributions.py | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/pymc_marketing/clv/distributions.py b/pymc_marketing/clv/distributions.py
index 0b1bc4fab..cbed10e83 100644
--- a/pymc_marketing/clv/distributions.py
+++ b/pymc_marketing/clv/distributions.py
@@ -258,6 +258,7 @@ def logp(value, lam, p, T):
             -lam * T,
             logp,
         )
+
         logp = pt.switch(
             pt.any(pt.or_(pt.lt(t_x, 0), zero_observations)),
             -np.inf,
@@ -270,7 +271,6 @@ def logp(value, lam, p, T):
             logp,
             -np.inf,
         )
-
         logp = pt.switch(
             pt.any(
                 (
@@ -363,6 +363,9 @@ def sim_data(lam, mu, T):
 
         return output
 
+    def _supp_shape_from_params(*args, **kwargs):
+        return (2,)
+
 
 pareto_nbd = ParetoNBDRV()
 
@@ -560,6 +563,9 @@ def sim_data(purchase_prob, churn_prob, T):
 
         return output
 
+    def _supp_shape_from_params(*args, **kwargs):
+        return (2,)
+
 
 beta_geo_beta_binom = BetaGeoBetaBinomRV()

From 8beb7fc1d302045e15668acd350058ec67a88a9f Mon Sep 17 00:00:00 2001
From: PabloRoque
Date: Thu, 2 Jan 2025 17:53:04 +0100
Subject: [PATCH 20/37] Remove commented chunk on fit_result. Opted for data to standardize with other CLV models

---
 pymc_marketing/clv/models/beta_geo.py | 7 -------
 1 file changed, 7 deletions(-)

diff --git a/pymc_marketing/clv/models/beta_geo.py b/pymc_marketing/clv/models/beta_geo.py
index 80cfade5b..2f6b353f2 100644
--- a/pymc_marketing/clv/models/beta_geo.py
+++ b/pymc_marketing/clv/models/beta_geo.py
@@ -641,13 +641,6 @@ def distribution_new_customer(
             alpha = pm.HalfFlat("alpha")
             r = pm.HalfFlat("r")
 
-            # fit_result = self.fit_result
-            # if fit_result.sizes["chain"] == 1 and fit_result.sizes["draw"] == 1:
-            #     # For map fit add a dummy draw dimension
-            #     fit_result = self.fit_result.squeeze("draw").expand_dims(
-            #         draw=range(1000)
-            #     )
-
             pm.Beta("dropout", alpha=a, beta=b)
             pm.Gamma("purchase_rate", alpha=r, beta=alpha)

From 4c46ae80d6c95fbba4a4df7477f1587eaf745dab Mon Sep 17 00:00:00 2001
From: PabloRoque
Date: Thu, 2 Jan 2025 18:11:31 +0100
Subject: [PATCH 21/37] BetaGeoNBDRV as pre-#1269 definition

---
 pymc_marketing/clv/distributions.py | 19 +++++++++----------
 1 file changed, 9 insertions(+), 10 deletions(-)

diff --git a/pymc_marketing/clv/distributions.py b/pymc_marketing/clv/distributions.py
index cbed10e83..60015f90d 100644
--- a/pymc_marketing/clv/distributions.py
+++ b/pymc_marketing/clv/distributions.py
@@ -672,21 +672,20 @@ def logp_customer_died(t_x_i, x_i, alpha_i, beta_i, gamma_i, delta_i, T_i):
 
 class BetaGeoNBDRV(RandomVariable):
     name = "bg_nbd"
-    signature = "(),(),(),(),()->(2)"
-    # ndim_supp = 1
-    # ndims_params = [0, 0, 0, 0, 0]
+    ndim_supp = 1
+    ndims_params = [0, 0, 0, 0, 0]
 
     dtype = "floatX"
     _print_name = ("BetaGeoNBD", "\\operatorname{BetaGeoNBD}")
 
-    # def make_node(self, rng, size, dtype, a, b, r, alpha, T):
-    #     a = pt.as_tensor_variable(a)
-    #     b = pt.as_tensor_variable(b)
-    #     r = pt.as_tensor_variable(r)
-    #     alpha = pt.as_tensor_variable(alpha)
-    #     T = pt.as_tensor_variable(T)
+    def make_node(self, rng, size, dtype, a, b, r, alpha, T):
+        a = pt.as_tensor_variable(a)
+        b = pt.as_tensor_variable(b)
+        r = pt.as_tensor_variable(r)
+        alpha = pt.as_tensor_variable(alpha)
+        T = pt.as_tensor_variable(T)
 
-    #     return super().make_node(rng, size, dtype, a, b, r, alpha, T)
+        return super().make_node(rng, size, dtype, a, b, r, alpha, T)
 
     def __call__(self, a, b, r, alpha, T, size=None, **kwargs):
         return super().__call__(a, b, r, alpha, T, size=size, **kwargs)

From 9356052ec05ebcd9b88829023159224896001c5e Mon Sep 17 00:00:00 2001
From: PabloRoque
Date: Fri, 3 Jan 2025 16:18:06 +0100
Subject: [PATCH 22/37] Fix test_numerically_stable_logp

---
 tests/clv/models/test_beta_geo.py | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/tests/clv/models/test_beta_geo.py b/tests/clv/models/test_beta_geo.py
index 331c63344..899615499 100644
--- a/tests/clv/models/test_beta_geo.py
+++ b/tests/clv/models/test_beta_geo.py
@@ -201,13 +201,16 @@ def test_numerically_stable_logp(
             "T": np.asarray([40]),
         }
     )
+
     model = BetaGeoModel(
         data=data,
         model_config=model_config,
     )
+
     model.build_model()
     pymc_model = model.model
-    logp = pymc_model.compile_fn(pymc_model.potentiallogp)
+
+    logp = pymc_model.compile_logp()
 
     np.testing.assert_almost_equal(
         logp({"a": 0.80, "b": 2.50, "r": 0.25, "alpha": 4.00}),

From fc912d60946198c8399f3fca04cc1726921c21fa Mon Sep 17 00:00:00 2001
From: PabloRoque
Date: Wed, 8 Jan 2025 16:29:23 +0100
Subject: [PATCH 23/37] Overrride ModifiedBetaGeoModel.distribution_new_customer method

---
 .../clv/models/modified_beta_geo.py | 43 +++++++++++++++++++
 1 file changed, 43 insertions(+)

diff --git a/pymc_marketing/clv/models/modified_beta_geo.py b/pymc_marketing/clv/models/modified_beta_geo.py
index cdd67ca96..2ab122a18 100644
--- a/pymc_marketing/clv/models/modified_beta_geo.py
+++ b/pymc_marketing/clv/models/modified_beta_geo.py
@@ -14,6 +14,7 @@
 """Modified Beta-Geometric Negative Binomial Distribution (MBG/NBD) model for a non-contractual customer population across continuous time."""  # noqa: E501
 
 import warnings
+from collections.abc import Sequence
 
 import numpy as np
 import pandas as pd
@@ -21,6 +22,7 @@
 import pytensor.tensor as pt
 import xarray
 from pymc.distributions.dist_math import check_parameters
+from pymc.util import RandomState
 from scipy.special import hyp2f1
 
 from pymc_marketing.clv.models import BetaGeoModel
@@ -391,3 +393,44 @@ def expected_probability_no_purchase(
         raise NotImplementedError(
             "The MBG/NBD model does not support this feature at the moment."
         )
+
+    def distribution_new_customer(
+        self,
+        data: pd.DataFrame | None = None,
+        *,
+        T: int | np.ndarray | pd.Series | None = None,
+        random_seed: RandomState | None = None,
+        var_names: Sequence[str] = ("dropout", "purchase_rate"),
+        n_samples: int = 1000,
+    ) -> xarray.Dataset:
+        # TODO: This is extraneous now, until a new distribution block is added.
+        """Compute posterior predictive samples of dropout, purchase rate and frequency/recency of new customers."""
+        if data is None:
+            data = self.data
+
+        if T is not None:
+            dataset = data.assign(T=T)
+
+        dataset = self._extract_predictive_variables(data, customer_varnames=["T"])
+        T = dataset["T"].values
+        # Delete "T" so we can pass dataset directly to `sample_posterior_predictive`
+        del dataset["T"]
+
+        if dataset.sizes["chain"] == 1 and dataset.sizes["draw"] == 1:
+            # For map fit add a dummy draw dimension
+            dataset = dataset.squeeze("draw").expand_dims(draw=range(n_samples))
+
+        with pm.Model():
+            a = pm.HalfFlat("a")
+            b = pm.HalfFlat("b")
+            alpha = pm.HalfFlat("alpha")
+            r = pm.HalfFlat("r")
+
+            pm.Beta("dropout", alpha=a, beta=b)
+            pm.Gamma("purchase_rate", alpha=r, beta=alpha)
+
+        return pm.sample_posterior_predictive(
+            dataset,
+            var_names=var_names,
+            random_seed=random_seed,
+        ).posterior_predictive

From 7b60fcfc870713330e95ced891489efc324a3963 Mon Sep 17 00:00:00 2001
From: PabloRoque
Date: Wed, 8 Jan 2025 17:06:24 +0100
Subject: [PATCH 24/37] Modify test_bg_nbd tensor parametrization to only vectors in test_bg_nbd.

Passing param.type.ndim > 1 now raises NotImplementedError
---
 tests/clv/test_distributions.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/clv/test_distributions.py b/tests/clv/test_distributions.py
index 7321056ab..cfb920813 100644
--- a/tests/clv/test_distributions.py
+++ b/tests/clv/test_distributions.py
@@ -488,7 +488,7 @@ class TestBetaGeoNBD:
                 0.55,
                 10.58,
                 0.61,
-                np.full((5, 3), 11.67),
+                np.full((1), 11.67),
                 12,
             ),
         ],

From 04952f3a97d4db7c59c394fdd75dc1c1bf65cd1e Mon Sep 17 00:00:00 2001
From: PabloRoque
Date: Wed, 8 Jan 2025 17:20:36 +0100
Subject: [PATCH 25/37] Silence mypy in 2 lines

---
 pymc_marketing/clv/models/modified_beta_geo.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/pymc_marketing/clv/models/modified_beta_geo.py b/pymc_marketing/clv/models/modified_beta_geo.py
index 2ab122a18..0455a83be 100644
--- a/pymc_marketing/clv/models/modified_beta_geo.py
+++ b/pymc_marketing/clv/models/modified_beta_geo.py
@@ -412,13 +412,13 @@ def distribution_new_customer(
             dataset = data.assign(T=T)
 
         dataset = self._extract_predictive_variables(data, customer_varnames=["T"])
-        T = dataset["T"].values
+        T = dataset["T"].values  # type: ignore
         # Delete "T" so we can pass dataset directly to `sample_posterior_predictive`
         del dataset["T"]
 
         if dataset.sizes["chain"] == 1 and dataset.sizes["draw"] == 1:
             # For map fit add a dummy draw dimension
-            dataset = dataset.squeeze("draw").expand_dims(draw=range(n_samples))
+            dataset = dataset.squeeze("draw").expand_dims(draw=range(n_samples))  # type: ignore
 
         with pm.Model():
             a = pm.HalfFlat("a")
             b = pm.HalfFlat("b")

From bcc8fa7b529cdda7c22c34f2330dfb048a8d571f Mon Sep 17 00:00:00 2001
From: PabloRoque
Date: Thu, 9 Jan 2025 10:52:50 +0100
Subject: [PATCH 26/37] Adapt BetaGeoNBDRV to #1269

---
 pymc_marketing/clv/distributions.py | 15 +--------------
 1 file changed, 1 insertion(+), 14 deletions(-)

diff --git a/pymc_marketing/clv/distributions.py b/pymc_marketing/clv/distributions.py
index e849fcde6..dca150928 100644
--- a/pymc_marketing/clv/distributions.py
+++ b/pymc_marketing/clv/distributions.py
@@ -597,21 +597,11 @@ def logp_customer_died(t_x_i, x_i, alpha_i, beta_i, gamma_i, delta_i, T_i):
 
 class BetaGeoNBDRV(RandomVariable):
     name = "bg_nbd"
-    ndim_supp = 1
-    ndims_params = [0, 0, 0, 0, 0]
+    signature = "(),(),(),(),()->(2)"
 
     dtype = "floatX"
     _print_name = ("BetaGeoNBD", "\\operatorname{BetaGeoNBD}")
 
-    def make_node(self, rng, size, dtype, a, b, r, alpha, T):
-        a = pt.as_tensor_variable(a)
-        b = pt.as_tensor_variable(b)
-        r = pt.as_tensor_variable(r)
-        alpha = pt.as_tensor_variable(alpha)
-        T = pt.as_tensor_variable(T)
-
-        return super().make_node(rng, size, dtype, a, b, r, alpha, T)
-
     def __call__(self, a, b, r, alpha, T, size=None, **kwargs):
         return super().__call__(a, b, r, alpha, T, size=size, **kwargs)
@@ -665,9 +655,6 @@ def sim_data(lam, p, T):
 
         return output
 
-    def _supp_shape_from_params(*args, **kwargs):
-        return (2,)
-
 
 bg_nbd = BetaGeoNBDRV()

From 17818ae69ebe9fa14a77dd42ad22902297f64b96 Mon Sep 17 00:00:00 2001
From: PabloRoque
Date: Thu, 9 Jan 2025 11:15:35 +0100
Subject: [PATCH 27/37] Fix test_clv_fit_mcmc

---
 tests/test_mlflow.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/test_mlflow.py b/tests/test_mlflow.py
index 81aea99a3..92c379848 100644
--- a/tests/test_mlflow.py
+++ b/tests/test_mlflow.py
@@ -483,7 +483,7 @@ def test_clv_fit_mcmc(model_cls, clv_data) -> None:
     run_id = run.info.run_id
 
     inputs, params, metrics, tags, artifacts = get_run_data(run_id)
-    assert inputs == []
+    assert isinstance(inputs, list)
 
     assert params["fit_method"] == "mcmc"

From 15503ecb091226b9066c5eefb2f4d6c5faacfbe1 Mon Sep 17 00:00:00 2001
From: PabloRoque
Date: Fri, 10 Jan 2025 10:19:06 +0100
Subject: [PATCH 28/37] Modify sim_data to reflect the beta-distributed dropout process

---
 pymc_marketing/clv/distributions.py | 19 +++++++------------
 1 file changed, 7 insertions(+), 12 deletions(-)

diff --git a/pymc_marketing/clv/distributions.py b/pymc_marketing/clv/distributions.py
index dca150928..f4d1cdf4b 100644
--- a/pymc_marketing/clv/distributions.py
+++ b/pymc_marketing/clv/distributions.py
@@ -631,24 +631,19 @@ def rng_fn(cls, rng, a, b, r, alpha, T, size):
         p = rng.beta(a=a, b=b, size=size)
 
         def sim_data(lam, p, T):
-            t = 0
-            n = 0
+            t = 0  # recency
+            n = 0  # frequency
 
-            dropout_time = rng.exponential(scale=1 / p)
+            churn = 0  # BG/NBD assumes all non-repeat customers are active
             wait = rng.exponential(scale=1 / lam)
 
-            final_t = min(dropout_time, T)
-            while (t + wait) < final_t:
-                t += wait
+            while t + wait < T and not churn:
                 n += 1
+                churn = rng.binomial(n=1, p=p)
+                t += wait
                 wait = rng.exponential(scale=1 / lam)
 
-            return np.array(
-                [
-                    t,
-                    n,
-                ],
-            )
+            return np.array([t, n])
 
         for index in np.ndindex(*size):
             output[index] = sim_data(lam[index], p[index], T[index])

From db8c796d3e8e34733df3bbb12d067a9d0dd9d9f0 Mon Sep 17 00:00:00 2001
From: PabloRoque
Date: Fri, 10 Jan 2025 10:22:03 +0100
Subject: [PATCH 29/37] Add reference to BetaGeoNBD

---
 pymc_marketing/clv/distributions.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/pymc_marketing/clv/distributions.py b/pymc_marketing/clv/distributions.py
index f4d1cdf4b..44bacbdc7 100644
--- a/pymc_marketing/clv/distributions.py
+++ b/pymc_marketing/clv/distributions.py
@@ -657,7 +657,7 @@ def sim_data(lam, p, T):
 class BetaGeoNBD(PositiveContinuous):
     r"""Population-level distribution class for a discrete, non-contractual, Beta-Geometric/Negative-Binomial process.
 
-    It is based on Fader, et al. in [1]_ and [2]_.
+    It is based on Fader, et al. in [1]_, [2]_ and [3]_.
 
     .. math::
@@ -683,6 +683,7 @@ class BetaGeoNBD(PositiveContinuous):
        Marketing Science, 24 (Spring), 275-284
     .. [2] Implementing the BG/NBD Model for Customer Base Analysis in Excel
        http://brucehardie.com/notes/004/bgnbd_spreadsheet_note.pdf
+    .. [3] Overcoming the BG/NBD Model's :math:`#NUM!` Error Problem https://brucehardie.com/notes/027/bgnbd_num_error.pdf
     """  # noqa: E501

From 378220e712afcdd4deb9c69ca3c1cd2be9e6878f Mon Sep 17 00:00:00 2001
From: PabloRoque
Date: Fri, 10 Jan 2025 10:38:04 +0100
Subject: [PATCH 30/37] Delete _logp

---
 pymc_marketing/clv/models/beta_geo.py | 30 ---------------------------
 1 file changed, 30 deletions(-)

diff --git a/pymc_marketing/clv/models/beta_geo.py b/pymc_marketing/clv/models/beta_geo.py
index 2f6b353f2..e5961d3eb 100644
--- a/pymc_marketing/clv/models/beta_geo.py
+++ b/pymc_marketing/clv/models/beta_geo.py
@@ -425,36 +425,6 @@ def expected_probability_alive(
             "chain", "draw", "customer_id", missing_dims="ignore"
         )
 
-    @staticmethod
-    def _logp(
-        a: xarray.DataArray,
-        b: xarray.DataArray,
-        r: xarray.DataArray,
-        alpha: xarray.DataArray,
-        x: xarray.DataArray,
-        t_x: xarray.DataArray,
-        T: xarray.DataArray,
-    ) -> xarray.DataArray:
-        """Log-likelihood of the BG/NBD model.
-
-        Utility function for using BetaGeoNBD log-likelihood in predictive methods.
- """ - # Add one dummy dimension to the right of the scalar parameters, so they broadcast with the `T` vector - bg_bnd_dist = BetaGeoNBD.dist( - a=a.values[..., None], - b=b.values[..., None], - r=r.values[..., None], - alpha=alpha.values[..., None] - if "customer_id" not in alpha.dims - else alpha.values, - T=T.values, - ) - values = np.vstack((t_x.values, x.values)).T - # TODO: Instead of compiling this function everytime this method is called - # we could compile it once (with mutable inputs) and cache it for reuse with new inputs. - loglike = pm.logp(bg_bnd_dist, values).eval() - return xarray.DataArray(data=loglike, dims=("chain", "draw", "customer_id")) - def expected_probability_no_purchase( self, t: int, From 7b9508fab52481b13b06c613d929b3fd284f9323 Mon Sep 17 00:00:00 2001 From: PabloRoque Date: Fri, 10 Jan 2025 10:42:07 +0100 Subject: [PATCH 31/37] Delete commented weights param in test_bg_nbd --- tests/clv/test_distributions.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/tests/clv/test_distributions.py b/tests/clv/test_distributions.py index cfb920813..7efd4b5f1 100644 --- a/tests/clv/test_distributions.py +++ b/tests/clv/test_distributions.py @@ -494,8 +494,6 @@ class TestBetaGeoNBD: ], ) def test_bg_nbd(self, value, r, alpha, a, b, T): - # weights = np.ones_like(value, dtype=int) - def lifetimes_wrapper( r, alpha, a, b, freq, rec, T, weights=np.array(1), penalizer_coef=0.0 ): From c4edfa9eb051579e662751d001aef3dc759d2d70 Mon Sep 17 00:00:00 2001 From: PabloRoque Date: Fri, 10 Jan 2025 12:17:33 +0100 Subject: [PATCH 32/37] Ammend BetaGeoNBD docstring --- pymc_marketing/clv/distributions.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/pymc_marketing/clv/distributions.py b/pymc_marketing/clv/distributions.py index 44bacbdc7..9b8081bbb 100644 --- a/pymc_marketing/clv/distributions.py +++ b/pymc_marketing/clv/distributions.py @@ -661,14 +661,14 @@ class BetaGeoNBD(PositiveContinuous): .. math:: - \mathbb{L}(\r, \alpha, \a, \b | x, t_x, T) &= - A_1 \cdot A_2 \cdot (A_3 + \delta_{x>0} A_4) + \mathbb{LL}(\r, \alpha, \a, \b | x, t_x, T) &= + D_1 + D_2 + \ln(C_3) (C_3 + \delta_{x>0} C_4) where: - A_1 &= \frac{\Gamma(r+x)\alpha^r}{\Gamma(r)} \\ - A_2 &= \frac{\Gamma(a+b)\Gamma(b+x)}{\Gamma(b)\Gamma(a+b+x)} \\ - A_3 &= (\frac{1}{\alpha + T})^{r+x} \\ - A_4 &= (\frac{a}{b+x-1})(\frac{1}{\alpha + t_x})^{r+x} \\ + D_1 &= \ln \left[ \Gamma(r+x) \rigth] - \ln \left[ \Gamma(r) \right] + \ln \right[ \Gamma(a+b) \left] + \ln \right[ \Gamma(b+x) \left] \\ + D_2 &= r \ln(\alpha) - (r+x) \ln(\alpha + t_x) \\ + C_3 &= (\frac{\alpha + t_x}{\alpha + T})^{r+x} \\ + C_4 &= (\frac{a}{b+x-1})(\frac{1} \\ ======== =============================================== Support :math:`t_j >= 0` for :math:`j = 1, \dots,x` From 5d0db19b8c3bcb960d367a7376b8dcc7fe3226c6 Mon Sep 17 00:00:00 2001 From: PabloRoque Date: Fri, 10 Jan 2025 16:27:22 +0100 Subject: [PATCH 33/37] Fix BetaGeoNBD math --- pymc_marketing/clv/distributions.py | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/pymc_marketing/clv/distributions.py b/pymc_marketing/clv/distributions.py index 9b8081bbb..da779ea0b 100644 --- a/pymc_marketing/clv/distributions.py +++ b/pymc_marketing/clv/distributions.py @@ -661,18 +661,18 @@ class BetaGeoNBD(PositiveContinuous): .. 
math:: - \mathbb{LL}(\r, \alpha, \a, \b | x, t_x, T) &= - D_1 + D_2 + \ln(C_3) (C_3 + \delta_{x>0} C_4) - - where: - D_1 &= \ln \left[ \Gamma(r+x) \rigth] - \ln \left[ \Gamma(r) \right] + \ln \right[ \Gamma(a+b) \left] + \ln \right[ \Gamma(b+x) \left] \\ + \mathbb{LL}(r, \alpha, a, b | x, t_x, T) = + D_1 + D_2 + \ln(C_3 + \delta_{x>0} C_4) \text{, where:} \\ + \begin{align} + D_1 &= \ln \left[ \Gamma(r+x) \right] - \ln \left[ \Gamma(r) \right] + \ln \left[ \Gamma(a+b) \right] + \ln \left[ \Gamma(b+x) \right] \\ D_2 &= r \ln(\alpha) - (r+x) \ln(\alpha + t_x) \\ - C_3 &= (\frac{\alpha + t_x}{\alpha + T})^{r+x} \\ - C_4 &= (\frac{a}{b+x-1})(\frac{1} \\ + C_3 &= \left(\frac{\alpha + t_x}{\alpha + T} \right)^{r+x} \\ + C_4 &= \left(\frac{a}{b+x-1} \right) \\ + \end{align} ======== =============================================== Support :math:`t_j >= 0` for :math:`j = 1, \dots,x` - Mean :math:`\mathbb{E}[X(n) | r, \alpha, a, b] = \frac{a+b-1}{a-1} \left[ 1 - (\frac{\alpha}{\alpha + T})^r \leftidx{_2}{F}{_1}(r,b;a+b-1;\frac{t}{\alpha + t}) \right] ` + Mean :math:`\mathbb{E}[X(n) | r, \alpha, a, b] = \frac{a+b-1}{a-1} \left[ 1 - \left(\frac{\alpha}{\alpha + T}\right)^r {_2}{F}{_1}(r,b;a+b-1;\frac{t}{\alpha + t}) \right]` ======== =============================================== References From 7d0526957b2c76471590bae22c6a53842603aaa1 Mon Sep 17 00:00:00 2001 From: PabloRoque Date: Fri, 10 Jan 2025 16:56:23 +0100 Subject: [PATCH 34/37] Fix test_posterior_distributions to include dropout distributions --- tests/clv/models/test_beta_geo.py | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/tests/clv/models/test_beta_geo.py b/tests/clv/models/test_beta_geo.py index 401e1f4a1..1a6bdc72c 100644 --- a/tests/clv/models/test_beta_geo.py +++ b/tests/clv/models/test_beta_geo.py @@ -254,14 +254,10 @@ def test_posterior_distributions(self, fit_type) -> None: assert customer_purchase_rate.shape == expected_shape assert customer_rec_freq.shape == expected_pop_dims - # N = 1000 - # p = pm.Beta.dist(self.a_true, self.b_true, size=N) - # theta = pm.Beta.dist(self.gamma_true, self.delta_true, size=N) - - lam_mean = self.r_true / self.alpha_true - lam_std = np.sqrt(self.r_true) / self.alpha_true - # mu_mean = self.s_true / self.beta_true - # mu_std = np.sqrt(self.s_true) / self.beta_true + N = 1000 + p = pm.Beta.dist(self.a_true, self.b_true, size=N) + lam = pm.Gamma.dist(self.r_true, self.alpha_true, size=N) + ref_rec, ref_freq = pm.draw( BetaGeoNBD.dist( a=self.a_true, @@ -275,16 +271,20 @@ def test_posterior_distributions(self, fit_type) -> None: np.testing.assert_allclose( customer_purchase_rate.mean(), - lam_mean, + pm.draw(lam.mean(), random_seed=rng), rtol=0.5, ) np.testing.assert_allclose( customer_purchase_rate.std(), - lam_std, + pm.draw(lam.std(), random_seed=rng), rtol=0.5, ) - # np.testing.assert_allclose(customer_dropout.mean(), mu_mean, rtol=0.5) - # np.testing.assert_allclose(customer_dropout.std(), mu_std, rtol=0.5) + np.testing.assert_allclose( + customer_dropout.mean(), pm.draw(p.mean(), random_seed=rng), rtol=0.5 + ) + np.testing.assert_allclose( + customer_dropout.std(), pm.draw(p.std(), random_seed=rng), rtol=0.5 + ) np.testing.assert_allclose(customer_rec.mean(), ref_rec.mean(), rtol=0.5) np.testing.assert_allclose(customer_rec.std(), ref_rec.std(), rtol=0.5) From ba3625cfeb5ba7c30448c49140a1b972679684e4 Mon Sep 17 00:00:00 2001 From: PabloRoque Date: Fri, 10 Jan 2025 17:31:49 +0100 Subject: [PATCH 35/37] Fix #NUM! 
docstring --- pymc_marketing/clv/distributions.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pymc_marketing/clv/distributions.py b/pymc_marketing/clv/distributions.py index da779ea0b..a02ad90d5 100644 --- a/pymc_marketing/clv/distributions.py +++ b/pymc_marketing/clv/distributions.py @@ -683,7 +683,7 @@ class BetaGeoNBD(PositiveContinuous): Marketing Science, 24 (Spring), 275-284 .. [2] Implementing the BG/NBD Model for Customer Base Analysis in Excel http://brucehardie.com/notes/004/bgnbd_spreadsheet_note.pdf - .. [3] Overcoming the BG/NBD Model's :math:`#NUM!` Error Problem https://brucehardie.com/notes/027/bgnbd_num_error.pdf + .. [3] Overcoming the BG/NBD Model's #NUM! Error Problem https://brucehardie.com/notes/027/bgnbd_num_error.pdf """ # noqa: E501 From 911f8a08e6d4ea9c68fe422735343ebf553b478c Mon Sep 17 00:00:00 2001 From: PabloRoque Date: Sat, 11 Jan 2025 00:19:50 +0100 Subject: [PATCH 36/37] Tweak sim_data --- pymc_marketing/clv/distributions.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pymc_marketing/clv/distributions.py b/pymc_marketing/clv/distributions.py index a02ad90d5..bd57a980a 100644 --- a/pymc_marketing/clv/distributions.py +++ b/pymc_marketing/clv/distributions.py @@ -638,8 +638,8 @@ def sim_data(lam, p, T): wait = rng.exponential(scale=1 / lam) while t + wait < T and not churn: + churn = rng.random() < p n += 1 - churn = rng.binomial(n=1, p=p) t += wait wait = rng.exponential(scale=1 / lam) From bb4e82b979f2459a39ad4dc7d941bd66e1d60ab0 Mon Sep 17 00:00:00 2001 From: PabloRoque Date: Sat, 11 Jan 2025 00:50:58 +0100 Subject: [PATCH 37/37] Add co-author. Co-authored-by: Colt Allen <10178857+coltallen@users.noreply.github.com> --- pymc_marketing/clv/distributions.py | 1 - 1 file changed, 1 deletion(-) diff --git a/pymc_marketing/clv/distributions.py b/pymc_marketing/clv/distributions.py index bd57a980a..4227f3cc4 100644 --- a/pymc_marketing/clv/distributions.py +++ b/pymc_marketing/clv/distributions.py @@ -679,7 +679,6 @@ class BetaGeoNBD(PositiveContinuous): ---------- .. [1] Fader, Peter S., Bruce G.S. Hardie, and Jen Shang (2010), "Counting Your Customers" the Easy Way: An Alternative to the Pareto/NBD Model - Marketing Science, 24 (Spring), 275-284 .. [2] Implementing the BG/NBD Model for Customer Base Analysis in Excel http://brucehardie.com/notes/004/bgnbd_spreadsheet_note.pdf
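
For reference, the data-generating process that the final version of sim_data (patches 28 and 36) implements can be sketched in plain NumPy. The sketch below is illustrative only and is not part of the patch series: the function name simulate_customer is made up for the example, and the parameter values simply mirror the ones used in test_numerically_stable_logp (a=0.80, b=2.50, r=0.25, alpha=4.00, T=40).

import numpy as np

rng = np.random.default_rng(42)

# Illustrative parameters only (same values as in test_numerically_stable_logp).
a, b, r, alpha, T = 0.80, 2.50, 0.25, 4.00, 40.0


def simulate_customer(lam, p, T):
    # One BG/NBD customer: returns recency t and frequency n over the window T.
    t = 0.0  # recency
    n = 0  # frequency
    churn = False  # non-repeat customers are treated as active
    wait = rng.exponential(scale=1 / lam)
    while t + wait < T and not churn:
        churn = rng.random() < p  # dropout check after each repeat purchase
        n += 1
        t += wait
        wait = rng.exponential(scale=1 / lam)
    return t, n


lam = rng.gamma(shape=r, scale=1 / alpha)  # individual purchase rate, Gamma(r, alpha)
p = rng.beta(a=a, b=b)  # individual dropout probability, Beta(a, b)
print(simulate_customer(lam, p, T))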