Add default handler for posterior predictive distribution (#625)
* Ideas on how to generalize pps

* Make posterior predictive more general

* Remove commented code

* Make predict work with multivariate families

* Add docstrings

* update changelog

* Make posterior predictive more robust. Make categorical a univariate family. Remove cyclic import issues

* For some things, Categorical family must be treated as multivariate
tomicapretto authored Jan 13, 2023
1 parent d9df772 commit 6fec04f
Showing 9 changed files with 210 additions and 191 deletions.
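For context on what the new default handler enables, here is a rough usage sketch: a custom family built on an existing PyMC distribution no longer needs to implement its own `posterior_predictive` method for `Model.predict(..., kind="pps")` to work. The `Likelihood`/`Family` constructor arguments and the exponential toy model below are illustrative assumptions, not code from this commit.

```python
import bambi as bmb
import numpy as np
import pandas as pd

# Hypothetical custom family backed by PyMC's Exponential distribution.
# Argument names are assumptions; check the Bambi docs for the exact API.
likelihood = bmb.Likelihood("Exponential", params=["lam"], parent="lam")
family = bmb.Family("my_exponential", likelihood, bmb.Link("log"))

rng = np.random.default_rng(0)
data = pd.DataFrame({"x": rng.normal(size=100)})
data["y"] = rng.exponential(scale=1.0, size=100)

model = bmb.Model("y ~ x", data, family=family)
idata = model.fit()

# Previously this required the family to define `posterior_predictive` itself;
# with the default handler, samples are drawn via `pm.draw()` on the response
# distribution.
model.predict(idata, kind="pps")
```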
1 change: 1 addition & 0 deletions Changelog.md
@@ -5,6 +5,7 @@
### New features

* Refactored the codebase to support distributional models (#607)
* Added a default method to handle posterior predictive sampling for custom families (#625)

### Maintenance and fixes

3 changes: 2 additions & 1 deletion bambi/backend/model_components.py
@@ -5,6 +5,7 @@
from bambi.backend.terms import CommonTerm, GroupSpecificTerm, InterceptTerm, ResponseTerm
from bambi.backend.utils import get_distribution
from bambi.families.multivariate import MultivariateFamily
from bambi.families.univariate import Categorical
from bambi.utils import get_aliased_name


@@ -118,7 +119,7 @@ def build_group_specific_terms(self, pymc_backend, bmb_model):
            if predictor.ndim > 1:
                for col in range(predictor.shape[1]):
                    self.output += coef[:, col] * predictor[:, col]
            elif isinstance(bmb_model.family, MultivariateFamily):
            elif isinstance(bmb_model.family, (MultivariateFamily, Categorical)):
                self.output += coef * predictor[:, np.newaxis]
            else:
                self.output += coef * predictor
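The widened `isinstance` check routes categorical responses through the `np.newaxis` branch. The reason is purely broadcasting: for multivariate (and now categorical) families the coefficient carries an extra response-level axis, so the 1-D predictor needs a trailing axis to multiply against it. A NumPy sketch of the shapes (names and sizes are illustrative only):

```python
import numpy as np

n_obs, n_levels = 5, 3
predictor = np.random.normal(size=n_obs)            # shape (n_obs,)

# Univariate family: coefficient is 1-D, a plain elementwise product works.
coef_uni = np.random.normal(size=n_obs)
out_uni = coef_uni * predictor                       # shape (n_obs,)

# Multivariate/categorical family: the coefficient has a response-level axis,
# so the predictor needs a trailing axis of length 1 to broadcast against it.
coef_multi = np.random.normal(size=(n_obs, n_levels))
out_multi = coef_multi * predictor[:, np.newaxis]    # shape (n_obs, n_levels)

assert out_uni.shape == (n_obs,)
assert out_multi.shape == (n_obs, n_levels)
```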
7 changes: 4 additions & 3 deletions bambi/backend/terms.py
@@ -5,6 +5,7 @@

from bambi.backend.utils import has_hyperprior, get_distribution
from bambi.families.multivariate import MultivariateFamily
from bambi.families.univariate import Categorical
from bambi.priors import Prior
from bambi.utils import get_aliased_name

@@ -36,7 +37,7 @@ def build(self, spec):

        # Dims of the response variable
        response_dims = []
        if isinstance(spec.family, MultivariateFamily):
        if isinstance(spec.family, (MultivariateFamily, Categorical)):
            response_dims = list(spec.response_component.response_term.coords)
            response_dims_n = len(spec.response_component.response_term.coords[response_dims[0]])
            # Arguments may be of shape (a,) but we need them to be of shape (a, b)
@@ -99,7 +100,7 @@ def build(self, spec):

        # Dims of the response variable (e.g. categorical)
        response_dims = []
        if isinstance(spec.family, MultivariateFamily):
        if isinstance(spec.family, (MultivariateFamily, Categorical)):
            response_dims = list(spec.response_component.response_term.coords)

        dims = list(self.coords) + response_dims
@@ -172,7 +173,7 @@ def build(self, spec):
        dist = get_distribution(self.term.prior.name)
        label = self.name
        # Prepends one dimension if response is multivariate
        if isinstance(spec.family, MultivariateFamily):
        if isinstance(spec.family, (MultivariateFamily, Categorical)):
            dims = list(spec.response_component.response_term.coords)
            dist = dist(label, dims=dims, **self.term.prior.args)[np.newaxis, :]
        else:
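The intercept change is the mirror image of the predictor expansion above: for a multivariate or categorical response the intercept is a vector with one entry per response level, and prepending an axis lets it broadcast across the observations of an `(n_obs, n_levels)` linear predictor. A NumPy analogue (shapes illustrative only):

```python
import numpy as np

n_obs, n_levels = 5, 3
linear_predictor = np.zeros((n_obs, n_levels))

# One intercept per response level, e.g. per non-reference category.
intercept = np.random.normal(size=n_levels)          # shape (n_levels,)

# (n_levels,) -> (1, n_levels), which broadcasts over the observation axis.
linear_predictor = linear_predictor + intercept[np.newaxis, :]
assert linear_predictor.shape == (n_obs, n_levels)
```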
3 changes: 2 additions & 1 deletion bambi/defaults/families.py
@@ -4,6 +4,7 @@
    Bernoulli,
    Beta,
    Binomial,
    Categorical,
    Gamma,
    Gaussian,
    NegativeBinomial,
@@ -13,7 +14,7 @@
    VonMises,
    Wald,
)
from bambi.families.multivariate import Categorical, Multinomial
from bambi.families.multivariate import Multinomial


# fmt: off
129 changes: 128 additions & 1 deletion bambi/families/family.py
@@ -1,7 +1,11 @@
from typing import Dict, Union

import numpy as np
import pymc as pm
import xarray as xr

from bambi.families.link import Link
from bambi.utils import get_auxiliary_parameters
from bambi.utils import get_auxiliary_parameters, get_aliased_name


class Family:
@@ -101,9 +105,132 @@ def set_default_priors(self, priors):
        priors = {k: v for k, v in priors.items() if k in auxiliary_parameters}
        self.default_priors.update(priors)

    def posterior_predictive(self, model, posterior, **kwargs):  # pylint: disable = unused-argument
        """Get draws from the posterior predictive distribution

        This function works for almost all the families. It grabs the draws for the parameters
        needed in the response distribution, and then gets samples from the posterior predictive
        distribution using `pm.draw()`. It won't work when the response distribution requires
        parameters that are not available in `posterior`.

        Parameters
        ----------
        model : bambi.Model
            The model
        posterior : xr.Dataset
            The xarray dataset that contains the draws for all the parameters in the posterior.
            It must contain the parameters that are needed in the distribution of the response, or
            the parameters that allow to derive them.

        Returns
        -------
        xr.DataArray
            A data array with the draws from the posterior predictive distribution
        """
        response_dist = get_response_dist(model.family)
        params = model.family.likelihood.params
        response_aliased_name = get_aliased_name(model.response_component.response_term)

        kwargs = {}
        output_dataset_list = []

        # In the posterior xr.Dataset we need to consider aliases.
        # But we don't use aliases when passing kwargs to the PyMC distribution
        for param in params:
            # Extract posterior draws for the parent parameter
            if param == model.family.likelihood.parent:
                component = model.components[model.response_name]
                var_name = f"{response_aliased_name}_mean"
                kwargs[param] = posterior[var_name].to_numpy()
                output_dataset_list.append(posterior[var_name])
            else:
                # Extract posterior draws for non-parent parameters
                component = model.components[param]
                component_aliased_name = component.alias if component.alias else param
                var_name = f"{response_aliased_name}_{component_aliased_name}"
                if var_name in posterior:
                    kwargs[param] = posterior[var_name].to_numpy()
                    output_dataset_list.append(posterior[var_name])
                elif hasattr(component, "prior") and isinstance(component.prior, (int, float)):
                    kwargs[param] = np.asarray(component.prior)

        # Determine the array with largest number of dimensions
        ndims_max = max(x.ndim for x in kwargs.values())

        # Append a dimension when needed. Required to make `pm.draw()` work.
        for key, values in kwargs.items():
            kwargs[key] = expand_array(values, ndims_max)

        # NOTE: Wouldn't it be better to always use parametrizations compatible with PyMC?
        # The current approach allows more flexibility, but it's more painful.
        if hasattr(model.family, "transform_backend_kwargs"):
            kwargs = model.family.transform_backend_kwargs(kwargs)

        output_array = pm.draw(response_dist.dist(**kwargs))
        output_coords_all = xr.merge(output_dataset_list).coords

        if hasattr(model.family, "KIND") and model.family.KIND == "Multivariate":
            coord_names = (
                "chain",
                "draw",
                response_aliased_name + "_obs",
                response_aliased_name + "_mean_dim",
            )
        else:  # Assume it's univariate family
            coord_names = ("chain", "draw", response_aliased_name + "_obs")

        output_coords = {}
        for coord_name in coord_names:
            output_coords[coord_name] = output_coords_all[coord_name]
        return xr.DataArray(output_array, coords=output_coords)

    def __str__(self):
        msg_list = [f"Family: {self.name}", f"Likelihood: {self.likelihood}", f"Link: {self.link}"]
        return "\n".join(msg_list)

    def __repr__(self):
        return self.__str__()


def get_response_dist(family):
    """Get the PyMC distribution for the response

    Parameters
    ----------
    family : bambi.Family
        The family for which the response distribution is wanted

    Returns
    -------
    pm.Distribution
        The response distribution
    """
    if family.likelihood.dist:
        dist = family.likelihood.dist
    else:
        dist = getattr(pm, family.likelihood.name)
    return dist


def expand_array(x, ndim):
    """Add dimensions to an array to match the number of desired dimensions

    If x.ndim < ndim, it adds ndim - x.ndim dimensions after the last axis. If not, it is left
    untouched.

    Parameters
    ----------
    x : np.ndarray
        The array
    ndim : int
        The number of desired dimensions

    Returns
    -------
    np.ndarray
        The array with the expanded dimensions
    """
    if x.ndim == ndim:
        return x
    dims_to_expand = tuple(range(ndim - 1, x.ndim - 1, -1))
    return np.expand_dims(x, dims_to_expand)
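A stripped-down illustration of what the default handler does: collect posterior draws of the distribution parameters, pad trailing axes so they broadcast (as `expand_array` does), and take one forward draw per posterior draw with `pm.draw()`. The arrays below stand in for posterior draws of a Gaussian model; names and shapes are assumptions for illustration.

```python
import numpy as np
import pymc as pm

chains, draws, n_obs = 2, 100, 10

# Stand-ins for posterior draws: `mu` is the parent parameter (one value per
# observation), `sigma` is a scalar auxiliary parameter per draw.
mu = np.random.normal(size=(chains, draws, n_obs))
sigma = np.abs(np.random.normal(size=(chains, draws))) + 0.1

# Same idea as `expand_array`: append trailing axes so all parameters share the
# same number of dimensions and broadcast against each other.
sigma = np.expand_dims(sigma, -1)                 # (chains, draws) -> (chains, draws, 1)

# One forward draw from the response distribution per posterior draw.
pps = pm.draw(pm.Normal.dist(mu=mu, sigma=sigma))
print(pps.shape)                                  # (2, 100, 10)
```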
100 changes: 2 additions & 98 deletions bambi/families/multivariate.py
@@ -1,94 +1,14 @@
# pylint: disable=unused-argument
import pytensor.tensor as pt
import numpy as np
import xarray as xr
import pytensor.tensor as pt

from bambi.families.family import Family
from bambi.utils import extract_argument_names, extra_namespace, get_aliased_name


class MultivariateFamily(Family):
    def posterior_predictive(self, model, posterior):
        raise NotImplementedError


class Categorical(MultivariateFamily):
    SUPPORTED_LINKS = {"p": ["softmax"]}
    UFUNC_KWARGS = {"axis": -1}

    def transform_linear_predictor(self, model, linear_predictor):
        response_name = get_aliased_name(model.response_component.response_term)
        response_levels_dim = response_name + "_dim"
        linear_predictor = linear_predictor.pad({response_levels_dim: (1, 0)}, constant_values=0)
        return linear_predictor

    def transform_coords(self, model, mean):
        # The mean has the reference level in the dimension, a new name is needed
        response_name = get_aliased_name(model.response_component.response_term)
        response_levels_dim = response_name + "_dim"
        response_levels_dim_complete = response_name + "_mean_dim"
        levels_complete = model.response_component.response_term.levels
        mean = mean.rename({response_levels_dim: response_levels_dim_complete})
        mean = mean.assign_coords({response_levels_dim_complete: levels_complete})
        return mean

    # NOTE: Check posterior predictive
    def posterior_predictive(self, model, posterior, **kwargs):
        def draw_categorical_samples(probability_matrix, items):
            # https://stackoverflow.com/questions/34187130
            # probability_matrix is a matrix of shape (n_chain * n_draw, n_levels)
            cumsum = probability_matrix.cumsum(axis=1)
            idx = np.random.rand(probability_matrix.shape[0])[:, np.newaxis]
            idx = (cumsum < idx).sum(axis=1)
            return items[idx]

        response_name = get_aliased_name(model.response_component.response_term)
        response_dim = response_name + "_obs"
        response_levels = np.arange(len(model.response_component.response_term.levels))
        mean = posterior[response_name + "_mean"]

        mean = mean.to_numpy()
        shape = mean.shape

        # Stack chains and draws
        mean = mean.reshape((mean.shape[0] * mean.shape[1], mean.shape[2], mean.shape[3]))
        draws_n = mean.shape[0]
        obs_n = mean.shape[1]

        pps = np.empty((draws_n, obs_n), dtype=int)
        for idx in range(obs_n):
            pps[:, idx] = draw_categorical_samples(mean[:, idx, :], response_levels)

        pps = pps.reshape((shape[0], shape[1], obs_n))
        pps = xr.DataArray(
            pps,
            coords={
                "chain": np.arange(shape[0]),
                "draw": np.arange(shape[1]),
                response_dim: np.arange(obs_n),
            },
        )
        return pps

    def get_data(self, response):
        return np.nonzero(response.term.data)[1]

    def get_coords(self, response):
        name = response.name + "_dim"
        return {name: [level for level in response.levels if level != response.reference]}

    def get_reference(self, response):
        return get_reference_level(response.term)

    @staticmethod
    def transform_backend_nu(nu, data):
        # Add column of zeros to the linear predictor for the reference level (the first one)
        shape = (data.shape[0], 1)

        # The first line makes sure the intercept-only models work
        nu = np.ones(shape) * nu  # (response_levels, ) -> (n, response_levels)
        nu = pt.concatenate([np.zeros(shape), nu], axis=1)
        return nu
    KIND = "Multivariate"


class Multinomial(MultivariateFamily):
@@ -176,19 +96,3 @@ def transform_backend_nu(nu, data):
        nu = np.ones(shape) * nu  # (response_levels, ) -> (n, response_levels)
        nu = pt.concatenate([np.zeros(shape), nu], axis=1)
        return nu


# pylint: disable = protected-access
def get_reference_level(term):
    if term.kind != "categoric":
        return None

    if term.levels is None:
        return None

    levels = term.levels
    intermediate_data = term.components[0]._intermediate_data
    if hasattr(intermediate_data, "_contrast"):
        return intermediate_data._contrast.reference

    return levels[0]
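The removed `draw_categorical_samples` helper used the cumulative-sum (inverse-CDF) trick to sample one category per row of a probability matrix. After this change those draws come from the default handler, which for the categorical case amounts to `pm.draw()` on a `Categorical` distribution. A small comparison sketch (illustrative only, not part of the commit):

```python
import numpy as np
import pymc as pm

rng = np.random.default_rng(0)
p = rng.dirichlet(np.ones(4), size=1000)          # (n_draws, n_levels) probabilities

# The removed approach: inverse-CDF sampling via row-wise cumulative sums.
cumsum = p.cumsum(axis=1)
u = rng.random(p.shape[0])[:, np.newaxis]
idx_manual = (cumsum < u).sum(axis=1)

# Roughly what the default handler does now.
idx_pymc = pm.draw(pm.Categorical.dist(p=p))

# Both produce draws over the same support {0, 1, 2, 3}.
print(np.bincount(idx_manual, minlength=4) / len(idx_manual))
print(np.bincount(idx_pymc, minlength=4) / len(idx_pymc))
```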