From 861fb85ca09e2c65473b9d5f8d8b97a3871d1f63 Mon Sep 17 00:00:00 2001 From: Ricardo Vieira Date: Mon, 17 Apr 2023 16:54:09 +0200 Subject: [PATCH] Add GeneralizedPoisson distribution --- docs/api_reference.rst | 1 + pymc_experimental/distributions/__init__.py | 1 + pymc_experimental/distributions/discrete.py | 172 ++++++++++++++++++ .../tests/distributions/test_discrete.py | 118 ++++++++++++ 4 files changed, 292 insertions(+) create mode 100644 pymc_experimental/distributions/discrete.py create mode 100644 pymc_experimental/tests/distributions/test_discrete.py diff --git a/docs/api_reference.rst b/docs/api_reference.rst index 2d85c3ab4..5fee57f3e 100644 --- a/docs/api_reference.rst +++ b/docs/api_reference.rst @@ -29,6 +29,7 @@ Distributions :toctree: generated/ GenExtreme + GeneralizedPoisson histogram_utils.histogram_approximation diff --git a/pymc_experimental/distributions/__init__.py b/pymc_experimental/distributions/__init__.py index fe7c70379..b5e8e9df8 100644 --- a/pymc_experimental/distributions/__init__.py +++ b/pymc_experimental/distributions/__init__.py @@ -18,6 +18,7 @@ """ from pymc_experimental.distributions.continuous import GenExtreme +from pymc_experimental.distributions.discrete import GeneralizedPoisson __all__ = [ "GenExtreme", diff --git a/pymc_experimental/distributions/discrete.py b/pymc_experimental/distributions/discrete.py new file mode 100644 index 000000000..b4dbadc7d --- /dev/null +++ b/pymc_experimental/distributions/discrete.py @@ -0,0 +1,172 @@ +# Copyright 2023 The PyMC Developers +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np +import pymc as pm +from pymc.distributions.dist_math import check_parameters, factln, logpow +from pymc.distributions.shape_utils import rv_size_is_none +from pytensor import tensor as pt +from pytensor.tensor.random.op import RandomVariable + + +class GeneralizedPoissonRV(RandomVariable): + name = "generalized_poisson" + ndim_supp = 0 + ndims_params = [0, 0] + dtype = "int64" + _print_name = ("GeneralizedPoisson", "\\operatorname{GeneralizedPoisson}") + + @classmethod + def rng_fn(cls, rng, theta, lam, size): + theta = np.asarray(theta) + lam = np.asarray(lam) + + if size is not None: + dist_size = size + else: + dist_size = np.broadcast_shapes(theta.shape, lam.shape) + + # A mix of 2 algorithms described by Famoye (1997) is used depending on parameter values + # 0: Inverse method, computed on the log scale. Used when lam <= 0. + # 1: Branching method. Used when lambda > 0. + x = np.empty(dist_size) + idxs_mask = np.broadcast_to(lam < 0, dist_size) + if np.any(idxs_mask): + x[idxs_mask] = cls._inverse_rng_fn(rng, theta, lam, dist_size, idxs_mask=idxs_mask)[ + idxs_mask + ] + idxs_mask = ~idxs_mask + if np.any(idxs_mask): + x[idxs_mask] = cls._branching_rng_fn(rng, theta, lam, dist_size, idxs_mask=idxs_mask)[ + idxs_mask + ] + return x + + @classmethod + def _inverse_rng_fn(cls, rng, theta, lam, dist_size, idxs_mask): + log_u = np.log(rng.uniform(size=dist_size)) + pos_lam = lam > 0 + abs_log_lam = np.log(np.abs(lam)) + theta_m_lam = theta - lam + log_s = -theta + log_p = log_s.copy() + x_ = 0 + x = np.zeros(dist_size) + below_cutpoint = log_s < log_u + with np.errstate(divide="ignore", invalid="ignore"): + while np.any(below_cutpoint[idxs_mask]): + x_ += 1 + x[below_cutpoint] += 1 + log_c = np.log(theta_m_lam + lam * x_) + # Compute log(1 + lam / C) + log1p_lam_m_C = np.where( + pos_lam, + np.log1p(np.exp(abs_log_lam - log_c)), + pm.math.log1mexp_numpy(abs_log_lam - log_c, negative_input=True), + ) + log_p = log_c + log1p_lam_m_C * (x_ - 1) + log_p - np.log(x_) - lam + log_s = np.logaddexp(log_s, log_p) + below_cutpoint = log_s < log_u + return x + + @classmethod + def _branching_rng_fn(cls, rng, theta, lam, dist_size, idxs_mask): + lam_ = np.abs(lam) # This algorithm is only valid for positive lam + y = rng.poisson(theta, size=dist_size) + x = y.copy() + higher_than_zero = y > 0 + while np.any(higher_than_zero[idxs_mask]): + y = rng.poisson(lam_ * y) + x[higher_than_zero] = x[higher_than_zero] + y[higher_than_zero] + higher_than_zero = y > 0 + return x + + +generalized_poisson = GeneralizedPoissonRV() + + +class GeneralizedPoisson(pm.distributions.Discrete): + R""" + Generalized Poisson. + Used to model count data that can be either overdispersed or underdispersed. + Offers greater flexibility than the standard Poisson which assumes equidispersion, + where the mean is equal to the variance. + The pmf of this distribution is + + .. math:: f(x \mid \mu, \lambda) = + \frac{\mu (\mu + \lambda x)^{x-1} e^{-\mu - \lambda x}}{x!} + ======== ====================================== + Support :math:`x \in \mathbb{N}_0` + Mean :math:`\frac{\mu}{1 - \lambda}` + Variance :math:`\frac{\mu}{(1 - \lambda)^3}` + ======== ====================================== + + Parameters + ---------- + mu : tensor_like of float + Mean parameter (mu > 0). + lam : tensor_like of float + Dispersion parameter (max(-1, -mu/4) <= lam <= 1). + + Notes + ----- + When lam = 0, the Generalized Poisson reduces to the standard Poisson with the same mu. + When lam < 0, the mean is greater than the variance (underdispersion). + When lam > 0, the mean is less than the variance (overdispersion). + + References + ---------- + The PMF is taken from [1] and the random generator function is adapted from [2]. + .. [1] Consul, PoC, and Felix Famoye. "Generalized Poisson regression model." + Communications in Statistics-Theory and Methods 21.1 (1992): 89-109. + .. [2] Famoye, Felix. "Generalized Poisson random variate generation." American + Journal of Mathematical and Management Sciences 17.3-4 (1997): 219-237. + """ + + rv_op = generalized_poisson + + @classmethod + def dist(cls, mu, lam, **kwargs): + mu = pt.as_tensor_variable(mu) + lam = pt.as_tensor_variable(lam) + return super().dist([mu, lam], **kwargs) + + def moment(rv, size, mu, lam): + mean = pt.floor(mu / (1 - lam)) + if not rv_size_is_none(size): + mean = pt.full(size, mean) + return mean + + def logp(value, mu, lam): + mu_lam_value = mu + lam * value + logprob = np.log(mu) + logpow(mu_lam_value, value - 1) - mu_lam_value - factln(value) + + # Probability is 0 when value > m, where m is the largest positive integer for + # which mu + m * lam > 0 (when lam < 0). + logprob = pt.switch( + pt.or_( + mu_lam_value < 0, + value < 0, + ), + -np.inf, + logprob, + ) + + return check_parameters( + logprob, + 0 < mu, + pt.abs(lam) <= 1, + (-mu / 4) <= lam, + msg="0 < mu, max(-1, -mu/4)) <= lam <= 1", + ) diff --git a/pymc_experimental/tests/distributions/test_discrete.py b/pymc_experimental/tests/distributions/test_discrete.py new file mode 100644 index 000000000..4ac9db016 --- /dev/null +++ b/pymc_experimental/tests/distributions/test_discrete.py @@ -0,0 +1,118 @@ +# Copyright 2023 The PyMC Developers +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import numpy as np +import pymc as pm +import pytensor +import pytensor.tensor as pt +import pytest +import scipy.stats +from pymc.logprob.utils import ParameterValueError +from pymc.testing import ( + BaseTestDistributionRandom, + Domain, + Rplus, + assert_moment_is_expected, + discrete_random_tester, +) + +from pymc_experimental.distributions import GeneralizedPoisson + + +class TestGeneralizedPoisson: + class TestRandomVariable(BaseTestDistributionRandom): + pymc_dist = GeneralizedPoisson + pymc_dist_params = {"mu": 4.0, "lam": 1.0} + expected_rv_op_params = {"mu": 4.0, "lam": 1.0} + tests_to_run = [ + "check_pymc_params_match_rv_op", + "check_rv_size", + ] + + def test_random_matches_poisson(self): + discrete_random_tester( + dist=self.pymc_dist, + paramdomains={"mu": Rplus, "lam": Domain([0], edges=(None, None))}, + ref_rand=lambda mu, lam, size: scipy.stats.poisson.rvs(mu, size=size), + ) + + @pytest.mark.parametrize("mu", (2.5, 20, 50)) + def test_random_lam_expected_moments(self, mu): + lam = np.array([-0.9, -0.7, -0.2, 0, 0.2, 0.7, 0.9]) + dist = self.pymc_dist.dist(mu=mu, lam=lam, size=(10_000, len(lam))) + draws = dist.eval() + + expected_mean = mu / (1 - lam) + np.testing.assert_allclose(draws.mean(0), expected_mean, rtol=1e-1) + + expected_std = np.sqrt(mu / (1 - lam) ** 3) + np.testing.assert_allclose(draws.std(0), expected_std, rtol=1e-1) + + def test_logp_matches_poisson(self): + # We are only checking this distribution for lambda=0 where it's equivalent to Poisson. + mu = pt.scalar("mu") + lam = pt.scalar("lam") + value = pt.vector("value") + + logp = pm.logp(GeneralizedPoisson.dist(mu, lam), value) + logp_fn = pytensor.function([value, mu, lam], logp) + + test_value = np.array([0, 1, 2, 30]) + for test_mu in (0.01, 0.1, 0.9, 1, 1.5, 20, 100): + np.testing.assert_allclose( + logp_fn(test_value, test_mu, lam=0), + scipy.stats.poisson.logpmf(test_value, test_mu), + ) + + # Check out-of-bounds values + value = pt.scalar("value") + logp = pm.logp(GeneralizedPoisson.dist(mu, lam), value) + logp_fn = pytensor.function([value, mu, lam], logp) + + logp_fn(-1, mu=5, lam=0) == -np.inf + logp_fn(9, mu=5, lam=-1) == -np.inf + + # Check mu/lam restrictions + with pytest.raises(ParameterValueError): + logp_fn(1, mu=1, lam=2) + + with pytest.raises(ParameterValueError): + logp_fn(1, mu=0, lam=0) + + with pytest.raises(ParameterValueError): + logp_fn(1, mu=1, lam=-1) + + def test_logp_lam_expected_moments(self): + mu = 30 + lam = np.array([-0.9, -0.7, -0.2, 0, 0.2, 0.7, 0.9]) + with pm.Model(): + x = GeneralizedPoisson("x", mu=mu, lam=lam) + trace = pm.sample(chains=1, draws=10_000, random_seed=96).posterior + + expected_mean = mu / (1 - lam) + np.testing.assert_allclose(trace["x"].mean(("chain", "draw")), expected_mean, rtol=1e-1) + + expected_std = np.sqrt(mu / (1 - lam) ** 3) + np.testing.assert_allclose(trace["x"].std(("chain", "draw")), expected_std, rtol=1e-1) + + @pytest.mark.parametrize( + "mu, lam, size, expected", + [ + (50, [-0.6, 0, 0.6], None, np.floor(50 / (1 - np.array([-0.6, 0, 0.6])))), + ([5, 50], -0.1, (4, 2), np.full((4, 2), np.floor(np.array([5, 50]) / 1.1))), + ], + ) + def test_moment(self, mu, lam, size, expected): + with pm.Model() as model: + GeneralizedPoisson("x", mu=mu, lam=lam, size=size) + assert_moment_is_expected(model, expected)