Add default handler for posterior predictive distribution (#625)
* Ideas on how to generalize pps

* Make posterior predictive more general

* Remove commented code

* Make predict work with multivariate families

* Add docstrings

* update changelog

* Make posterior predictive more robust. Make categorical a univariate family. Remove cyclic import issues

* For some things, Categorical family must be treated as multivariate
tomicapretto authored Jan 13, 2023
1 parent d9df772 commit 6fec04f
Showing 9 changed files with 210 additions and 191 deletions.
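For context on what the new default handler enables, here is a rough usage sketch: a custom family built on an existing PyMC distribution no longer needs to implement its own `posterior_predictive` method for `Model.predict(..., kind="pps")` to work. The `Likelihood`/`Family` constructor arguments and the exponential toy model below are illustrative assumptions, not code from this commit.

```python
import bambi as bmb
import numpy as np
import pandas as pd

# Hypothetical custom family backed by PyMC's Exponential distribution.
# Argument names are assumptions; check the Bambi docs for the exact API.
likelihood = bmb.Likelihood("Exponential", params=["lam"], parent="lam")
family = bmb.Family("my_exponential", likelihood, bmb.Link("log"))

rng = np.random.default_rng(0)
data = pd.DataFrame({"x": rng.normal(size=100)})
data["y"] = rng.exponential(scale=1.0, size=100)

model = bmb.Model("y ~ x", data, family=family)
idata = model.fit()

# Previously this required the family to define `posterior_predictive` itself;
# with the default handler, samples are drawn via `pm.draw()` on the response
# distribution.
model.predict(idata, kind="pps")
```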
1 change: 1 addition & 0 deletions Changelog.md
@@ -5,6 +5,7 @@
### New features

* Refactored the codebase to support distributional models (#607)
* Added a default method to handle posterior predictive sampling for custom families (#625)

### Maintenance and fixes

3 changes: 2 additions & 1 deletion bambi/backend/model_components.py
@@ -5,6 +5,7 @@
from bambi.backend.terms import CommonTerm, GroupSpecificTerm, InterceptTerm, ResponseTerm
from bambi.backend.utils import get_distribution
from bambi.families.multivariate import MultivariateFamily
from bambi.families.univariate import Categorical
from bambi.utils import get_aliased_name


@@ -118,7 +119,7 @@ def build_group_specific_terms(self, pymc_backend, bmb_model):
            if predictor.ndim > 1:
                for col in range(predictor.shape[1]):
                    self.output += coef[:, col] * predictor[:, col]
            elif isinstance(bmb_model.family, MultivariateFamily):
            elif isinstance(bmb_model.family, (MultivariateFamily, Categorical)):
                self.output += coef * predictor[:, np.newaxis]
            else:
                self.output += coef * predictor
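The widened `isinstance` check routes categorical responses through the `np.newaxis` branch. The reason is purely broadcasting: for multivariate (and now categorical) families the coefficient carries an extra response-level axis, so the 1-D predictor needs a trailing axis to multiply against it. A NumPy sketch of the shapes (names and sizes are illustrative only):

```python
import numpy as np

n_obs, n_levels = 5, 3
predictor = np.random.normal(size=n_obs)            # shape (n_obs,)

# Univariate family: coefficient is 1-D, a plain elementwise product works.
coef_uni = np.random.normal(size=n_obs)
out_uni = coef_uni * predictor                       # shape (n_obs,)

# Multivariate/categorical family: the coefficient has a response-level axis,
# so the predictor needs a trailing axis of length 1 to broadcast against it.
coef_multi = np.random.normal(size=(n_obs, n_levels))
out_multi = coef_multi * predictor[:, np.newaxis]    # shape (n_obs, n_levels)

assert out_uni.shape == (n_obs,)
assert out_multi.shape == (n_obs, n_levels)
```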
7 changes: 4 additions & 3 deletions bambi/backend/terms.py
@@ -5,6 +5,7 @@

from bambi.backend.utils import has_hyperprior, get_distribution
from bambi.families.multivariate import MultivariateFamily
from bambi.families.univariate import Categorical
from bambi.priors import Prior
from bambi.utils import get_aliased_name

@@ -36,7 +37,7 @@ def build(self, spec):

        # Dims of the response variable
        response_dims = []
        if isinstance(spec.family, MultivariateFamily):
        if isinstance(spec.family, (MultivariateFamily, Categorical)):
            response_dims = list(spec.response_component.response_term.coords)
            response_dims_n = len(spec.response_component.response_term.coords[response_dims[0]])
            # Arguments may be of shape (a,) but we need them to be of shape (a, b)
@@ -99,7 +100,7 @@ def build(self, spec):

        # Dims of the response variable (e.g. categorical)
        response_dims = []
        if isinstance(spec.family, MultivariateFamily):
        if isinstance(spec.family, (MultivariateFamily, Categorical)):
            response_dims = list(spec.response_component.response_term.coords)

        dims = list(self.coords) + response_dims
@@ -172,7 +173,7 @@ def build(self, spec):
        dist = get_distribution(self.term.prior.name)
        label = self.name
        # Prepends one dimension if response is multivariate
        if isinstance(spec.family, MultivariateFamily):
        if isinstance(spec.family, (MultivariateFamily, Categorical)):
            dims = list(spec.response_component.response_term.coords)
            dist = dist(label, dims=dims, **self.term.prior.args)[np.newaxis, :]
        else:
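The intercept change is the mirror image of the predictor expansion above: for a multivariate or categorical response the intercept is a vector with one entry per response level, and prepending an axis lets it broadcast across the observations of an `(n_obs, n_levels)` linear predictor. A NumPy analogue (shapes illustrative only):

```python
import numpy as np

n_obs, n_levels = 5, 3
linear_predictor = np.zeros((n_obs, n_levels))

# One intercept per response level, e.g. per non-reference category.
intercept = np.random.normal(size=n_levels)          # shape (n_levels,)

# (n_levels,) -> (1, n_levels), which broadcasts over the observation axis.
linear_predictor = linear_predictor + intercept[np.newaxis, :]
assert linear_predictor.shape == (n_obs, n_levels)
```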
3 changes: 2 additions & 1 deletion bambi/defaults/families.py
@@ -4,6 +4,7 @@
    Bernoulli,
    Beta,
    Binomial,
    Categorical,
    Gamma,
    Gaussian,
    NegativeBinomial,
@@ -13,7 +14,7 @@
    VonMises,
    Wald,
)
from bambi.families.multivariate import Categorical, Multinomial
from bambi.families.multivariate import Multinomial


# fmt: off
129 changes: 128 additions & 1 deletion bambi/families/family.py
@@ -1,7 +1,11 @@
from typing import Dict, Union

import numpy as np
import pymc as pm
import xarray as xr

from bambi.families.link import Link
from bambi.utils import get_auxiliary_parameters
from bambi.utils import get_auxiliary_parameters, get_aliased_name


class Family:
@@ -101,9 +105,132 @@ def set_default_priors(self, priors):
        priors = {k: v for k, v in priors.items() if k in auxiliary_parameters}
        self.default_priors.update(priors)

    def posterior_predictive(self, model, posterior, **kwargs):  # pylint: disable = unused-argument
        """Get draws from the posterior predictive distribution

        This function works for almost all the families. It grabs the draws for the parameters
        needed in the response distribution, and then gets samples from the posterior predictive
        distribution using `pm.draw()`. It won't work when the response distribution requires
        parameters that are not available in `posterior`.

        Parameters
        ----------
        model : bambi.Model
            The model
        posterior : xr.Dataset
            The xarray dataset that contains the draws for all the parameters in the posterior.
            It must contain the parameters that are needed in the distribution of the response, or
            the parameters that allow to derive them.

        Returns
        -------
        xr.DataArray
            A data array with the draws from the posterior predictive distribution
        """
        response_dist = get_response_dist(model.family)
        params = model.family.likelihood.params
        response_aliased_name = get_aliased_name(model.response_component.response_term)

        kwargs = {}
        output_dataset_list = []

        # In the posterior xr.Dataset we need to consider aliases.
        # But we don't use aliases when passing kwargs to the PyMC distribution
        for param in params:
            # Extract posterior draws for the parent parameter
            if param == model.family.likelihood.parent:
                component = model.components[model.response_name]
                var_name = f"{response_aliased_name}_mean"
                kwargs[param] = posterior[var_name].to_numpy()
                output_dataset_list.append(posterior[var_name])
            else:
                # Extract posterior draws for non-parent parameters
                component = model.components[param]
                component_aliased_name = component.alias if component.alias else param
                var_name = f"{response_aliased_name}_{component_aliased_name}"
                if var_name in posterior:
                    kwargs[param] = posterior[var_name].to_numpy()
                    output_dataset_list.append(posterior[var_name])
                elif hasattr(component, "prior") and isinstance(component.prior, (int, float)):
                    kwargs[param] = np.asarray(component.prior)

        # Determine the array with largest number of dimensions
        ndims_max = max(x.ndim for x in kwargs.values())

        # Append a dimension when needed. Required to make `pm.draw()` work.
        for key, values in kwargs.items():
            kwargs[key] = expand_array(values, ndims_max)

        # NOTE: Wouldn't it be better to always use parametrizations compatible with PyMC?
        # The current approach allows more flexibility, but it's more painful.
        if hasattr(model.family, "transform_backend_kwargs"):
            kwargs = model.family.transform_backend_kwargs(kwargs)

        output_array = pm.draw(response_dist.dist(**kwargs))
        output_coords_all = xr.merge(output_dataset_list).coords

        if hasattr(model.family, "KIND") and model.family.KIND == "Multivariate":
            coord_names = (
                "chain",
                "draw",
                response_aliased_name + "_obs",
                response_aliased_name + "_mean_dim",
            )
        else:  # Assume it's univariate family
            coord_names = ("chain", "draw", response_aliased_name + "_obs")

        output_coords = {}
        for coord_name in coord_names:
            output_coords[coord_name] = output_coords_all[coord_name]
        return xr.DataArray(output_array, coords=output_coords)

    def __str__(self):
        msg_list = [f"Family: {self.name}", f"Likelihood: {self.likelihood}", f"Link: {self.link}"]
        return "\n".join(msg_list)

    def __repr__(self):
        return self.__str__()


def get_response_dist(family):
    """Get the PyMC distribution for the response

    Parameters
    ----------
    family : bambi.Family
        The family for which the response distribution is wanted

    Returns
    -------
    pm.Distribution
        The response distribution
    """
    if family.likelihood.dist:
        dist = family.likelihood.dist
    else:
        dist = getattr(pm, family.likelihood.name)
    return dist


def expand_array(x, ndim):
    """Add dimensions to an array to match the number of desired dimensions

    If x.ndim < ndim, it adds ndim - x.ndim dimensions after the last axis. If not, it is left
    untouched.

    Parameters
    ----------
    x : np.ndarray
        The array
    ndim : int
        The number of desired dimensions

    Returns
    -------
    np.ndarray
        The array with the expanded dimensions
    """
    if x.ndim == ndim:
        return x
    dims_to_expand = tuple(range(ndim - 1, x.ndim - 1, -1))
    return np.expand_dims(x, dims_to_expand)
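A stripped-down illustration of what the default handler does: collect posterior draws of the distribution parameters, pad trailing axes so they broadcast (as `expand_array` does), and take one forward draw per posterior draw with `pm.draw()`. The arrays below stand in for posterior draws of a Gaussian model; names and shapes are assumptions for illustration.

```python
import numpy as np
import pymc as pm

chains, draws, n_obs = 2, 100, 10

# Stand-ins for posterior draws: `mu` is the parent parameter (one value per
# observation), `sigma` is a scalar auxiliary parameter per draw.
mu = np.random.normal(size=(chains, draws, n_obs))
sigma = np.abs(np.random.normal(size=(chains, draws))) + 0.1

# Same idea as `expand_array`: append trailing axes so all parameters share the
# same number of dimensions and broadcast against each other.
sigma = np.expand_dims(sigma, -1)                 # (chains, draws) -> (chains, draws, 1)

# One forward draw from the response distribution per posterior draw.
pps = pm.draw(pm.Normal.dist(mu=mu, sigma=sigma))
print(pps.shape)                                  # (2, 100, 10)
```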
100 changes: 2 additions & 98 deletions bambi/families/multivariate.py
@@ -1,94 +1,14 @@
# pylint: disable=unused-argument
import pytensor.tensor as pt
import numpy as np
import xarray as xr
import pytensor.tensor as pt

from bambi.families.family import Family
from bambi.utils import extract_argument_names, extra_namespace, get_aliased_name


class MultivariateFamily(Family):
    def posterior_predictive(self, model, posterior):
        raise NotImplementedError


class Categorical(MultivariateFamily):
    SUPPORTED_LINKS = {"p": ["softmax"]}
    UFUNC_KWARGS = {"axis": -1}

    def transform_linear_predictor(self, model, linear_predictor):
        response_name = get_aliased_name(model.response_component.response_term)
        response_levels_dim = response_name + "_dim"
        linear_predictor = linear_predictor.pad({response_levels_dim: (1, 0)}, constant_values=0)
        return linear_predictor

    def transform_coords(self, model, mean):
        # The mean has the reference level in the dimension, a new name is needed
        response_name = get_aliased_name(model.response_component.response_term)
        response_levels_dim = response_name + "_dim"
        response_levels_dim_complete = response_name + "_mean_dim"
        levels_complete = model.response_component.response_term.levels
        mean = mean.rename({response_levels_dim: response_levels_dim_complete})
        mean = mean.assign_coords({response_levels_dim_complete: levels_complete})
        return mean

    # NOTE: Check posterior predictive
    def posterior_predictive(self, model, posterior, **kwargs):
        def draw_categorical_samples(probability_matrix, items):
            # https://stackoverflow.com/questions/34187130
            # probability_matrix is a matrix of shape (n_chain * n_draw, n_levels)
            cumsum = probability_matrix.cumsum(axis=1)
            idx = np.random.rand(probability_matrix.shape[0])[:, np.newaxis]
            idx = (cumsum < idx).sum(axis=1)
            return items[idx]

        response_name = get_aliased_name(model.response_component.response_term)
        response_dim = response_name + "_obs"
        response_levels = np.arange(len(model.response_component.response_term.levels))
        mean = posterior[response_name + "_mean"]

        mean = mean.to_numpy()
        shape = mean.shape

        # Stack chains and draws
        mean = mean.reshape((mean.shape[0] * mean.shape[1], mean.shape[2], mean.shape[3]))
        draws_n = mean.shape[0]
        obs_n = mean.shape[1]

        pps = np.empty((draws_n, obs_n), dtype=int)
        for idx in range(obs_n):
            pps[:, idx] = draw_categorical_samples(mean[:, idx, :], response_levels)

        pps = pps.reshape((shape[0], shape[1], obs_n))
        pps = xr.DataArray(
            pps,
            coords={
                "chain": np.arange(shape[0]),
                "draw": np.arange(shape[1]),
                response_dim: np.arange(obs_n),
            },
        )
        return pps

    def get_data(self, response):
        return np.nonzero(response.term.data)[1]

    def get_coords(self, response):
        name = response.name + "_dim"
        return {name: [level for level in response.levels if level != response.reference]}

    def get_reference(self, response):
        return get_reference_level(response.term)

    @staticmethod
    def transform_backend_nu(nu, data):
        # Add column of zeros to the linear predictor for the reference level (the first one)
        shape = (data.shape[0], 1)

        # The first line makes sure the intercept-only models work
        nu = np.ones(shape) * nu  # (response_levels, ) -> (n, response_levels)
        nu = pt.concatenate([np.zeros(shape), nu], axis=1)
        return nu
    KIND = "Multivariate"


class Multinomial(MultivariateFamily):
@@ -176,19 +96,3 @@ def transform_backend_nu(nu, data):
        nu = np.ones(shape) * nu  # (response_levels, ) -> (n, response_levels)
        nu = pt.concatenate([np.zeros(shape), nu], axis=1)
        return nu


# pylint: disable = protected-access
def get_reference_level(term):
    if term.kind != "categoric":
        return None

    if term.levels is None:
        return None

    levels = term.levels
    intermediate_data = term.components[0]._intermediate_data
    if hasattr(intermediate_data, "_contrast"):
        return intermediate_data._contrast.reference

    return levels[0]
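The removed `draw_categorical_samples` helper used the cumulative-sum (inverse-CDF) trick to sample one category per row of a probability matrix. After this change those draws come from the default handler, which for the categorical case amounts to `pm.draw()` on a `Categorical` distribution. A small comparison sketch (illustrative only, not part of the commit):

```python
import numpy as np
import pymc as pm

rng = np.random.default_rng(0)
p = rng.dirichlet(np.ones(4), size=1000)          # (n_draws, n_levels) probabilities

# The removed approach: inverse-CDF sampling via row-wise cumulative sums.
cumsum = p.cumsum(axis=1)
u = rng.random(p.shape[0])[:, np.newaxis]
idx_manual = (cumsum < u).sum(axis=1)

# Roughly what the default handler does now.
idx_pymc = pm.draw(pm.Categorical.dist(p=p))

# Both produce draws over the same support {0, 1, 2, 3}.
print(np.bincount(idx_manual, minlength=4) / len(idx_manual))
print(np.bincount(idx_pymc, minlength=4) / len(idx_pymc))
```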