From 6fec04ff3614b017cf9f23895a580b3220e73965 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Tom=C3=A1s=20Capretto?=
Date: Fri, 13 Jan 2023 17:57:19 -0300
Subject: [PATCH] Add default handler for posterior predictive distribution
 (#625)

* Ideas on how to generalize pps

* Make posterior predictive more general

* Remove commented code

* Make predict work with multivariate families

* Add docstrings

* Update changelog

* Make posterior predictive more robust. Make Categorical a univariate
  family. Remove cyclic import issues

* For some things, the Categorical family must be treated as multivariate

---
 Changelog.md                      |   1 +
 bambi/backend/model_components.py |   3 +-
 bambi/backend/terms.py            |   7 +-
 bambi/defaults/families.py        |   3 +-
 bambi/families/family.py          | 129 +++++++++++++++++++++++++-
 bambi/families/multivariate.py    | 100 +--------------------
 bambi/families/univariate.py      | 144 ++++++++++++------------------
 bambi/model_components.py         |   2 +-
 tests/test_predict.py             |  12 ++-
 9 files changed, 210 insertions(+), 191 deletions(-)

diff --git a/Changelog.md b/Changelog.md
index 08f4367d1..60f0e41e3 100644
--- a/Changelog.md
+++ b/Changelog.md
@@ -5,6 +5,7 @@
 ### New features
 
 * Refactored the codebase to support distributional models (#607)
+* Added a default method to handle posterior predictive sampling for custom families (#625)
 
 ### Maintenance and fixes
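The practical payoff of this change is that a custom family assembled from Bambi's public building blocks no longer needs its own `posterior_predictive()` override. A minimal sketch of that workflow is shown below; the data, the `"my-t"` name, and the dict-of-links argument are illustrative assumptions based on the post-#607 API, not part of this patch:

```python
import bambi as bmb
import numpy as np
import pandas as pd

# Hypothetical data
rng = np.random.default_rng(1234)
data = pd.DataFrame({"x": rng.normal(size=100)})
data["y"] = 2 + 1.5 * data["x"] + rng.normal(scale=0.5, size=100)

# A Student-t family rebuilt from public pieces. No posterior_predictive()
# override is defined anywhere: the new default handler takes care of it.
likelihood = bmb.Likelihood("StudentT", params=["mu", "sigma", "nu"], parent="mu")
family = bmb.Family("my-t", likelihood, {"mu": "identity", "sigma": "log", "nu": "log"})
priors = {"sigma": bmb.Prior("HalfNormal", sigma=1), "nu": bmb.Prior("Gamma", alpha=2, beta=0.1)}

model = bmb.Model("y ~ x", data, family=family, priors=priors)
idata = model.fit(tune=100, draws=100)
model.predict(idata, kind="pps")  # served by the default Family.posterior_predictive()
```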
diff --git a/bambi/backend/model_components.py b/bambi/backend/model_components.py
index ac22a9f61..6fe2f8568 100644
--- a/bambi/backend/model_components.py
+++ b/bambi/backend/model_components.py
@@ -5,6 +5,7 @@
 from bambi.backend.terms import CommonTerm, GroupSpecificTerm, InterceptTerm, ResponseTerm
 from bambi.backend.utils import get_distribution
 from bambi.families.multivariate import MultivariateFamily
+from bambi.families.univariate import Categorical
 from bambi.utils import get_aliased_name
 
 
@@ -118,7 +119,7 @@ def build_group_specific_terms(self, pymc_backend, bmb_model):
             if predictor.ndim > 1:
                 for col in range(predictor.shape[1]):
                     self.output += coef[:, col] * predictor[:, col]
-            elif isinstance(bmb_model.family, MultivariateFamily):
+            elif isinstance(bmb_model.family, (MultivariateFamily, Categorical)):
                 self.output += coef * predictor[:, np.newaxis]
             else:
                 self.output += coef * predictor
diff --git a/bambi/backend/terms.py b/bambi/backend/terms.py
index 874ba3bc2..07c5b2ba5 100644
--- a/bambi/backend/terms.py
+++ b/bambi/backend/terms.py
@@ -5,6 +5,7 @@
 
 from bambi.backend.utils import has_hyperprior, get_distribution
 from bambi.families.multivariate import MultivariateFamily
+from bambi.families.univariate import Categorical
 from bambi.priors import Prior
 from bambi.utils import get_aliased_name
 
@@ -36,7 +37,7 @@ def build(self, spec):
 
         # Dims of the response variable
         response_dims = []
-        if isinstance(spec.family, MultivariateFamily):
+        if isinstance(spec.family, (MultivariateFamily, Categorical)):
             response_dims = list(spec.response_component.response_term.coords)
             response_dims_n = len(spec.response_component.response_term.coords[response_dims[0]])
             # Arguments may be of shape (a,) but we need them to be of shape (a, b)
@@ -99,7 +100,7 @@ def build(self, spec):
 
         # Dims of the response variable (e.g. categorical)
         response_dims = []
-        if isinstance(spec.family, MultivariateFamily):
+        if isinstance(spec.family, (MultivariateFamily, Categorical)):
             response_dims = list(spec.response_component.response_term.coords)
 
         dims = list(self.coords) + response_dims
@@ -172,7 +173,7 @@ def build(self, spec):
         dist = get_distribution(self.term.prior.name)
         label = self.name
         # Prepends one dimension if response is multivariate
-        if isinstance(spec.family, MultivariateFamily):
+        if isinstance(spec.family, (MultivariateFamily, Categorical)):
             dims = list(spec.response_component.response_term.coords)
             dist = dist(label, dims=dims, **self.term.prior.args)[np.newaxis, :]
         else:
diff --git a/bambi/defaults/families.py b/bambi/defaults/families.py
index a259603a8..567611ddb 100644
--- a/bambi/defaults/families.py
+++ b/bambi/defaults/families.py
@@ -4,6 +4,7 @@
     Bernoulli,
     Beta,
     Binomial,
+    Categorical,
     Gamma,
     Gaussian,
     NegativeBinomial,
@@ -13,7 +14,7 @@
     VonMises,
     Wald,
 )
-from bambi.families.multivariate import Categorical, Multinomial
+from bambi.families.multivariate import Multinomial
 
 
 # fmt: off
diff --git a/bambi/families/family.py b/bambi/families/family.py
index b3d5ab5c8..f3b8cec26 100644
--- a/bambi/families/family.py
+++ b/bambi/families/family.py
@@ -1,7 +1,11 @@
 from typing import Dict, Union
 
+import numpy as np
+import pymc as pm
+import xarray as xr
+
 from bambi.families.link import Link
-from bambi.utils import get_auxiliary_parameters
+from bambi.utils import get_auxiliary_parameters, get_aliased_name
 
 
 class Family:
@@ -101,9 +105,132 @@ def set_default_priors(self, priors):
         priors = {k: v for k, v in priors.items() if k in auxiliary_parameters}
         self.default_priors.update(priors)
 
+    def posterior_predictive(self, model, posterior, **kwargs):  # pylint: disable = unused-argument
+        """Get draws from the posterior predictive distribution
+
+        This method works for almost all families. It grabs the draws for the parameters
+        needed by the response distribution and then samples from the posterior predictive
+        distribution using `pm.draw()`. It won't work when the response distribution requires
+        parameters that are not available in `posterior`.
+
+        Parameters
+        ----------
+        model : bambi.Model
+            The model
+        posterior : xr.Dataset
+            The xarray dataset that contains the draws for all the parameters in the posterior.
+            It must contain the parameters that are needed in the distribution of the response,
+            or the parameters needed to derive them.
+
+        Returns
+        -------
+        xr.DataArray
+            A data array with the draws from the posterior predictive distribution
+        """
+        response_dist = get_response_dist(model.family)
+        params = model.family.likelihood.params
+        response_aliased_name = get_aliased_name(model.response_component.response_term)
+
+        kwargs = {}
+        output_dataset_list = []
+
+        # In the posterior xr.Dataset we need to consider aliases,
+        # but we don't use aliases when passing kwargs to the PyMC distribution
+        for param in params:
+            # Extract posterior draws for the parent parameter
+            if param == model.family.likelihood.parent:
+                component = model.components[model.response_name]
+                var_name = f"{response_aliased_name}_mean"
+                kwargs[param] = posterior[var_name].to_numpy()
+                output_dataset_list.append(posterior[var_name])
+            else:
+                # Extract posterior draws for non-parent parameters
+                component = model.components[param]
+                component_aliased_name = component.alias if component.alias else param
+                var_name = f"{response_aliased_name}_{component_aliased_name}"
+                if var_name in posterior:
+                    kwargs[param] = posterior[var_name].to_numpy()
+                    output_dataset_list.append(posterior[var_name])
+                elif hasattr(component, "prior") and isinstance(component.prior, (int, float)):
+                    kwargs[param] = np.asarray(component.prior)
+
+        # Determine the maximum number of dimensions among the parameter arrays
+        ndims_max = max(x.ndim for x in kwargs.values())
+
+        # Append dimensions when needed. Required to make `pm.draw()` work.
+        for key, values in kwargs.items():
+            kwargs[key] = expand_array(values, ndims_max)
+
+        # NOTE: Wouldn't it be better to always use parametrizations compatible with PyMC?
+        # The current approach allows more flexibility, but it's more painful.
+        if hasattr(model.family, "transform_backend_kwargs"):
+            kwargs = model.family.transform_backend_kwargs(kwargs)
+
+        output_array = pm.draw(response_dist.dist(**kwargs))
+        output_coords_all = xr.merge(output_dataset_list).coords
+
+        if hasattr(model.family, "KIND") and model.family.KIND == "Multivariate":
+            coord_names = (
+                "chain",
+                "draw",
+                response_aliased_name + "_obs",
+                response_aliased_name + "_mean_dim",
+            )
+        else:  # Assume it's a univariate family
+            coord_names = ("chain", "draw", response_aliased_name + "_obs")
+
+        output_coords = {}
+        for coord_name in coord_names:
+            output_coords[coord_name] = output_coords_all[coord_name]
+        return xr.DataArray(output_array, coords=output_coords)
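The heart of the method above is a two-step pattern: align every parameter's posterior draws on a common set of axes, then let PyMC sample the response distribution once, evaluated at those draws. Outside of Bambi, with made-up shapes (2 chains × 100 draws × 50 observations), the pattern looks like this:

```python
import numpy as np
import pymc as pm

# Posterior draws for a Gaussian response: mu varies per observation,
# sigma is a scalar per (chain, draw) pair.
mu = np.random.normal(size=(2, 100, 50))         # (chain, draw, obs)
sigma = np.abs(np.random.normal(size=(2, 100)))  # (chain, draw)

# What expand_array() does below: append a trailing axis so sigma
# broadcasts against mu inside the PyMC distribution.
sigma = sigma[..., np.newaxis]                   # (chain, draw, 1)

# A single pm.draw() yields one posterior predictive sample per posterior
# draw; no family-specific sampling code is required.
pps = pm.draw(pm.Normal.dist(mu=mu, sigma=sigma))
print(pps.shape)  # (2, 100, 50)
```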
+
     def __str__(self):
         msg_list = [f"Family: {self.name}", f"Likelihood: {self.likelihood}", f"Link: {self.link}"]
         return "\n".join(msg_list)
 
     def __repr__(self):
         return self.__str__()
+
+
+def get_response_dist(family):
+    """Get the PyMC distribution for the response
+
+    Parameters
+    ----------
+    family : bambi.Family
+        The family for which the response distribution is wanted
+
+    Returns
+    -------
+    pm.Distribution
+        The response distribution
+    """
+    if family.likelihood.dist:
+        dist = family.likelihood.dist
+    else:
+        dist = getattr(pm, family.likelihood.name)
+    return dist
+
+
+def expand_array(x, ndim):
+    """Add dimensions to an array to match the number of desired dimensions
+
+    If x.ndim < ndim, it adds ndim - x.ndim dimensions after the last axis. If not, the array
+    is left untouched.
+
+    Parameters
+    ----------
+    x : np.ndarray
+        The array
+    ndim : int
+        The number of desired dimensions
+
+    Returns
+    -------
+    np.ndarray
+        The array with the expanded dimensions
+    """
+    if x.ndim == ndim:
+        return x
+    dims_to_expand = tuple(range(ndim - 1, x.ndim - 1, -1))
+    return np.expand_dims(x, dims_to_expand)
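For intuition on the helper just added: `expand_array()` only ever appends trailing axes, which is exactly what NumPy broadcasting needs to line up the leading `(chain, draw)` axes. A standalone demonstration, copying the function body verbatim:

```python
import numpy as np

def expand_array(x, ndim):
    # Same logic as the helper above
    if x.ndim == ndim:
        return x
    dims_to_expand = tuple(range(ndim - 1, x.ndim - 1, -1))
    return np.expand_dims(x, dims_to_expand)

a = np.zeros((2, 100))           # e.g. a scalar parameter: (chain, draw)
print(expand_array(a, 3).shape)  # (2, 100, 1): broadcasts against (2, 100, 50)

b = np.zeros(())                 # e.g. a constant prior turned into a 0-d array
print(expand_array(b, 3).shape)  # (1, 1, 1): broadcasts against anything
```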
diff --git a/bambi/families/multivariate.py b/bambi/families/multivariate.py
index 17f2dc5d6..d8ac71364 100644
--- a/bambi/families/multivariate.py
+++ b/bambi/families/multivariate.py
@@ -1,94 +1,14 @@
 # pylint: disable=unused-argument
-import pytensor.tensor as pt
 import numpy as np
 import xarray as xr
+import pytensor.tensor as pt
 
 from bambi.families.family import Family
 from bambi.utils import extract_argument_names, extra_namespace, get_aliased_name
 
 
 class MultivariateFamily(Family):
-    def posterior_predictive(self, model, posterior):
-        raise NotImplementedError
-
-
-class Categorical(MultivariateFamily):
-    SUPPORTED_LINKS = {"p": ["softmax"]}
-    UFUNC_KWARGS = {"axis": -1}
-
-    def transform_linear_predictor(self, model, linear_predictor):
-        response_name = get_aliased_name(model.response_component.response_term)
-        response_levels_dim = response_name + "_dim"
-        linear_predictor = linear_predictor.pad({response_levels_dim: (1, 0)}, constant_values=0)
-        return linear_predictor
-
-    def transform_coords(self, model, mean):
-        # The mean has the reference level in the dimension, a new name is needed
-        response_name = get_aliased_name(model.response_component.response_term)
-        response_levels_dim = response_name + "_dim"
-        response_levels_dim_complete = response_name + "_mean_dim"
-        levels_complete = model.response_component.response_term.levels
-        mean = mean.rename({response_levels_dim: response_levels_dim_complete})
-        mean = mean.assign_coords({response_levels_dim_complete: levels_complete})
-        return mean
-
-    # NOTE: Check posterior predictive
-    def posterior_predictive(self, model, posterior, **kwargs):
-        def draw_categorical_samples(probability_matrix, items):
-            # https://stackoverflow.com/questions/34187130
-            # probability_matrix is a matrix of shape (n_chain * n_draw, n_levels)
-            cumsum = probability_matrix.cumsum(axis=1)
-            idx = np.random.rand(probability_matrix.shape[0])[:, np.newaxis]
-            idx = (cumsum < idx).sum(axis=1)
-            return items[idx]
-
-        response_name = get_aliased_name(model.response_component.response_term)
-        response_dim = response_name + "_obs"
-        response_levels = np.arange(len(model.response_component.response_term.levels))
-        mean = posterior[response_name + "_mean"]
-
-        mean = mean.to_numpy()
-        shape = mean.shape
-
-        # Stack chains and draws
-        mean = mean.reshape((mean.shape[0] * mean.shape[1], mean.shape[2], mean.shape[3]))
-        draws_n = mean.shape[0]
-        obs_n = mean.shape[1]
-
-        pps = np.empty((draws_n, obs_n), dtype=int)
-        for idx in range(obs_n):
-            pps[:, idx] = draw_categorical_samples(mean[:, idx, :], response_levels)
-
-        pps = pps.reshape((shape[0], shape[1], obs_n))
-        pps = xr.DataArray(
-            pps,
-            coords={
-                "chain": np.arange(shape[0]),
-                "draw": np.arange(shape[1]),
-                response_dim: np.arange(obs_n),
-            },
-        )
-        return pps
-
-    def get_data(self, response):
-        return np.nonzero(response.term.data)[1]
-
-    def get_coords(self, response):
-        name = response.name + "_dim"
-        return {name: [level for level in response.levels if level != response.reference]}
-
-    def get_reference(self, response):
-        return get_reference_level(response.term)
-
-    @staticmethod
-    def transform_backend_nu(nu, data):
-        # Add column of zeros to the linear predictor for the reference level (the first one)
-        shape = (data.shape[0], 1)
-
-        # The first line makes sure the intercept-only models work
-        nu = np.ones(shape) * nu  # (response_levels, ) -> (n, response_levels)
-        nu = pt.concatenate([np.zeros(shape), nu], axis=1)
-        return nu
+    KIND = "Multivariate"
 
 
 class Multinomial(MultivariateFamily):
@@ -176,19 +96,3 @@ def transform_backend_nu(nu, data):
         nu = np.ones(shape) * nu  # (response_levels, ) -> (n, response_levels)
         nu = pt.concatenate([np.zeros(shape), nu], axis=1)
         return nu
-
-
-# pylint: disable = protected-access
-def get_reference_level(term):
-    if term.kind != "categoric":
-        return None
-
-    if term.levels is None:
-        return None
-
-    levels = term.levels
-    intermediate_data = term.components[0]._intermediate_data
-    if hasattr(intermediate_data, "_contrast"):
-        return intermediate_data._contrast.reference
-
-    return levels[0]
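The `transform_backend_nu` logic kept in `Multinomial` (and mirrored in the new univariate `Categorical` below) encodes the usual identifiability trick: the linear predictor is only estimated for the K − 1 non-reference levels, and a column of zeros is prepended so the softmax pins the reference level. A NumPy-only sketch of the idea (the real code uses `pytensor.tensor`):

```python
import numpy as np

def softmax(eta, axis=-1):
    eta = eta - eta.max(axis=axis, keepdims=True)
    out = np.exp(eta)
    return out / out.sum(axis=axis, keepdims=True)

nu = np.array([[0.8, -0.3], [1.5, 0.2]])  # (n, K - 1): non-reference levels only

# Mirror transform_backend_nu(): prepend a zeros column for the reference level
nu_full = np.concatenate([np.zeros((nu.shape[0], 1)), nu], axis=1)  # (n, K)

p = softmax(nu_full)
print(p.round(3))     # column 0 is the reference level
print(p.sum(axis=1))  # rows sum to 1
```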
diff --git a/bambi/families/univariate.py b/bambi/families/univariate.py
index 21bf1c70a..f177cc477 100644
--- a/bambi/families/univariate.py
+++ b/bambi/families/univariate.py
@@ -1,14 +1,13 @@
 import numpy as np
 import xarray as xr
-from scipy import stats
+import pytensor.tensor as pt
 
 from bambi.families.family import Family
 from bambi.utils import get_aliased_name
 
 
 class UnivariateFamily(Family):
-    def posterior_predictive(self, model, posterior, **kwargs):
-        raise NotImplementedError
+    KIND = "Univariate"
 
 
 class AsymmetricLaplace(UnivariateFamily):
@@ -19,24 +18,10 @@ class AsymmetricLaplace(UnivariateFamily):
         "q": ["logit", "probit", "cloglog"],
     }
 
-    def posterior_predictive(self, model, posterior, **kwargs):
-        """Sample from posterior predictive distribution"""
-        response_name = get_aliased_name(model.response_component.response_term)
-        mean = posterior[response_name + "_mean"]
-        b = posterior[response_name + "_b"]
-        kappa = posterior[response_name + "_kappa"]
-        return xr.apply_ufunc(stats.laplace_asymmetric.rvs, kappa, mean, b)
-
 
 class Bernoulli(UnivariateFamily):
     SUPPORTED_LINKS = {"p": ["identity", "logit", "probit", "cloglog"]}
 
-    def posterior_predictive(self, model, posterior, **kwargs):
-        """Sample from posterior predictive distribution"""
-        response_name = get_aliased_name(model.response_component.response_term)
-        mean = posterior[response_name + "_mean"]
-        return xr.apply_ufunc(np.random.binomial, 1, mean)
-
     def get_data(self, response):
         if response.term.data.ndim == 1:
             return response.term.data
@@ -52,14 +37,6 @@ def get_success_level(self, response):
 class Beta(UnivariateFamily):
     SUPPORTED_LINKS = {"mu": ["logit", "probit", "cloglog"], "kappa": ["log"]}
 
-    def posterior_predictive(self, model, posterior, **kwargs):
-        response_name = get_aliased_name(model.response_component.response_term)
-        mean = posterior[response_name + "_mean"]
-        kappa = posterior[response_name + "_kappa"]
-        alpha = mean * kappa
-        beta = (1 - mean) * kappa
-        return xr.apply_ufunc(np.random.beta, alpha, beta)
-
     @staticmethod
     def transform_backend_kwargs(kwargs):
         mu = kwargs.pop("mu")
@@ -92,15 +69,49 @@ def transform_backend_kwargs(kwargs):
         return kwargs
 
 
-class Gamma(UnivariateFamily):
-    SUPPORTED_LINKS = {"mu": ["identity", "log", "inverse"], "alpha": ["log"]}
+class Categorical(UnivariateFamily):
+    SUPPORTED_LINKS = {"p": ["softmax"]}
+    UFUNC_KWARGS = {"axis": -1}
 
-    def posterior_predictive(self, model, posterior, **kwargs):
+    def transform_linear_predictor(self, model, linear_predictor):
         response_name = get_aliased_name(model.response_component.response_term)
-        mean = posterior[response_name + "_mean"]
-        alpha = posterior[response_name + "_alpha"]
-        beta = alpha / mean
-        return xr.apply_ufunc(np.random.gamma, alpha, 1 / beta)
+        response_levels_dim = response_name + "_dim"
+        linear_predictor = linear_predictor.pad({response_levels_dim: (1, 0)}, constant_values=0)
+        return linear_predictor
+
+    def transform_coords(self, model, mean):
+        # The mean has the reference level in the dimension, a new name is needed
+        response_name = get_aliased_name(model.response_component.response_term)
+        response_levels_dim = response_name + "_dim"
+        response_levels_dim_complete = response_name + "_mean_dim"
+        levels_complete = model.response_component.response_term.levels
+        mean = mean.rename({response_levels_dim: response_levels_dim_complete})
+        mean = mean.assign_coords({response_levels_dim_complete: levels_complete})
+        return mean
+
+    def get_data(self, response):
+        return np.nonzero(response.term.data)[1]
+
+    def get_coords(self, response):
+        name = response.name + "_dim"
+        return {name: [level for level in response.levels if level != response.reference]}
+
+    def get_reference(self, response):
+        return get_reference_level(response.term)
+
+    @staticmethod
+    def transform_backend_nu(nu, data):
+        # Add column of zeros to the linear predictor for the reference level (the first one)
+        shape = (data.shape[0], 1)
+
+        # The first line makes sure intercept-only models work
+        nu = np.ones(shape) * nu  # (response_levels, ) -> (n, response_levels)
+        nu = pt.concatenate([np.zeros(shape), nu], axis=1)
+        return nu
+
+
+class Gamma(UnivariateFamily):
+    SUPPORTED_LINKS = {"mu": ["identity", "log", "inverse"], "alpha": ["log"]}
 
     @staticmethod
     def transform_backend_kwargs(kwargs):
@@ -114,83 +125,30 @@ def transform_backend_kwargs(kwargs):
 class Gaussian(UnivariateFamily):
     SUPPORTED_LINKS = {"mu": ["identity", "log", "inverse"], "sigma": ["log"]}
 
-    def posterior_predictive(self, model, posterior, **kwargs):
-        "Sample from posterior predictive distribution"
-        response_name = get_aliased_name(model.response_component.response_term)
-        mean = posterior[response_name + "_mean"]
-        sigma = posterior[response_name + "_sigma"]
-        return xr.apply_ufunc(np.random.normal, mean, sigma)
-
 
 class NegativeBinomial(UnivariateFamily):
     SUPPORTED_LINKS = {"mu": ["identity", "log", "cloglog"], "alpha": ["log"]}
 
-    def posterior_predictive(self, model, posterior, **kwargs):
-        response_name = get_aliased_name(model.response_component.response_term)
-        mean = posterior[response_name + "_mean"]
-        n = posterior[response_name + "_alpha"]
-        p = n / (mean + n)
-        return xr.apply_ufunc(np.random.negative_binomial, n, p)
-
 
 class Laplace(UnivariateFamily):
     SUPPORTED_LINKS = {"mu": ["identity", "log", "inverse"], "b": ["log"]}
 
-    def posterior_predictive(self, model, posterior, **kwargs):
-        "Sample from posterior predictive distribution"
-        response_name = get_aliased_name(model.response_component.response_term)
-        mean = posterior[response_name + "_mean"]
-        b = posterior[response_name + "_b"]
-        return xr.apply_ufunc(np.random.laplace, mean, b)
-
 
 class Poisson(UnivariateFamily):
     SUPPORTED_LINKS = {"mu": ["identity", "log"]}
 
-    def posterior_predictive(self, model, posterior, **kwargs):
-        response_name = get_aliased_name(model.response_component.response_term)
-        mean = posterior[response_name + "_mean"]
-        return xr.apply_ufunc(np.random.poisson, mean)
-
 
 class StudentT(UnivariateFamily):
     SUPPORTED_LINKS = {"mu": ["identity", "log", "inverse"], "sigma": ["log"], "nu": ["log"]}
 
-    def posterior_predictive(self, model, posterior, **kwargs):
-        response_name = get_aliased_name(model.response_component.response_term)
-        mean = posterior[response_name + "_mean"]
-        sigma = posterior[response_name + "_sigma"]
-        nu_component = model.components["nu"]
-
-        # Constant component with fixed value
-        if hasattr(nu_component, "prior") and isinstance(nu_component.prior, (int, float)):
-            nu = nu_component.prior
-        # Either constant or distributional, but non-constant value
-        else:
-            nu = posterior[response_name + "_nu"]
-
-        return xr.apply_ufunc(stats.t.rvs, nu, mean, sigma)
-
 
 class VonMises(UnivariateFamily):
     SUPPORTED_LINKS = {"mu": ["identity", "tan_2"], "kappa": ["log"]}
 
-    def posterior_predictive(self, model, posterior, **kwargs):
-        response_name = get_aliased_name(model.response_component.response_term)
-        mean = posterior[response_name + "_mean"]
-        kappa = posterior[response_name + "_kappa"]
-        return xr.apply_ufunc(np.random.vonmises, mean, kappa)
-
 
 class Wald(UnivariateFamily):
     SUPPORTED_LINKS = {"mu": ["inverse", "inverse_squared", "identity", "log"], "lam": ["log"]}
 
-    def posterior_predictive(self, model, posterior, **kwargs):
-        response_name = get_aliased_name(model.response_component.response_term)
-        mean = posterior[response_name + "_mean"]
-        lam = posterior[response_name + "_lam"]
-        return xr.apply_ufunc(np.random.wald, mean, lam)
-
 
 # pylint: disable = protected-access
 def get_success_level(term):
@@ -206,3 +164,19 @@ def get_success_level(term):
         return intermediate_data._contrast.reference
 
     return levels[0]
+
+
+# pylint: disable = protected-access
+def get_reference_level(term):
+    if term.kind != "categoric":
+        return None
+
+    if term.levels is None:
+        return None
+
+    levels = term.levels
+    intermediate_data = term.components[0]._intermediate_data
+    if hasattr(intermediate_data, "_contrast"):
+        return intermediate_data._contrast.reference
+
+    return levels[0]
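With `Categorical` relocated here and `pm.Categorical` resolvable through `get_response_dist()`, the hand-rolled cumulative-sum sampler deleted from `multivariate.py` reduces to a single `pm.draw()` call inside the default handler. A sketch with simulated posterior means (shapes are assumptions):

```python
import numpy as np
import pymc as pm

# Simulated posterior mean draws for a 4-level response:
# (chain, draw, obs, level), with the last axis summing to 1.
p = np.random.dirichlet(np.ones(4), size=(2, 100, 50))

# pm.Categorical treats the last axis as the event axis, so one pm.draw()
# replaces the manual inverse-CDF loop that this patch removes.
pps = pm.draw(pm.Categorical.dist(p=p))
print(pps.shape)       # (2, 100, 50)
print(np.unique(pps))  # [0 1 2 3]
```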
diff --git a/bambi/model_components.py b/bambi/model_components.py
index 2c55d72a0..6f04eb4e0 100644
--- a/bambi/model_components.py
+++ b/bambi/model_components.py
@@ -162,7 +162,7 @@ def predict(self, idata, data=None, include_group_specific=True):
         to_stack_dims = ("chain", "draw")
         design_matrix_dims = (response_dim, "__variables__")
 
-        if isinstance(self.spec.family, multivariate.MultivariateFamily):
+        if isinstance(self.spec.family, (multivariate.MultivariateFamily, univariate.Categorical)):
             to_stack_dims = to_stack_dims + (response_levels_dim,)
             linear_predictor_dims = linear_predictor_dims + (response_levels_dim,)
diff --git a/tests/test_predict.py b/tests/test_predict.py
index ae1ba6924..22c2c807c 100644
--- a/tests/test_predict.py
+++ b/tests/test_predict.py
@@ -189,6 +189,16 @@ def test_predict_t(data_numeric_xy):
     model.predict(idata, kind="mean", data=data.iloc[:20, :])
     model.predict(idata, kind="pps", data=data.iloc[:20, :])
 
+    # A case where the prior for one of the parameters is constant
+    model = Model("y ~ x", data, family="t", priors={"nu": 4})
+    idata = model.fit(tune=100, draws=100)
+
+    model.predict(idata, kind="mean")
+    model.predict(idata, kind="pps")
+
+    model.predict(idata, kind="mean", data=data.iloc[:20, :])
+    model.predict(idata, kind="pps", data=data.iloc[:20, :])
+
 
 def test_predict_wald(data_gamma):
     data = data_gamma
@@ -233,7 +243,7 @@ def test_posterior_predictive_categorical(inhaler):
     model = Model("rating ~ period", data=inhaler, family="categorical")
     idata = model.fit(tune=100, draws=100)
     model.predict(idata, kind="pps")
-    pps = idata.posterior_predictive["rating"].values
+    pps = idata.posterior_predictive["rating"].to_numpy()
 
     assert pps.shape[-1] == inhaler.shape[0]
     assert (np.unique(pps) == [0, 1, 2, 3]).all()
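The new `test_predict_t` case exercises the constant-parameter branch of the default handler: with `priors={"nu": 4}` there are no posterior draws for `nu`, so the handler falls back to the scalar (`np.asarray(component.prior)`) and relies on broadcasting. In isolation, with assumed shapes:

```python
import numpy as np
import pymc as pm

mu = np.random.normal(size=(2, 100, 50))                          # (chain, draw, obs)
sigma = np.abs(np.random.normal(size=(2, 100)))[..., np.newaxis]  # (chain, draw, 1)
nu = np.asarray(4)  # constant prior: a 0-d array that broadcasts against everything

pps = pm.draw(pm.StudentT.dist(nu=nu, mu=mu, sigma=sigma))
print(pps.shape)  # (2, 100, 50)
```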