Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add HyperGeometric Distribution to pymc3.distributions.discrete #3504 #4108

Closed
wants to merge 8 commits into from
1 change: 1 addition & 0 deletions RELEASE-NOTES.md
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@

### New features
- `sample_posterior_predictive_w` can now feed on `xarray.Dataset` - e.g. from `InferenceData.posterior`. (see [#4042](https://github.com/pymc-devs/pymc3/pull/4042))
- Support HyperGeometric Distribution through `pymc3.distributions.discrete.HyperGeometric`. (see [#4108](https://github.com/pymc-devs/pymc3/pull/4108))


## PyMC3 3.9.3 (11 August 2020)
Expand Down
2 changes: 2 additions & 0 deletions pymc3/distributions/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -63,6 +63,7 @@
from .discrete import ZeroInflatedBinomial
from .discrete import DiscreteUniform
from .discrete import Geometric
from .discrete import HyperGeometric
from .discrete import Categorical
from .discrete import OrderedLogistic

Expand Down Expand Up @@ -136,6 +137,7 @@
'ZeroInflatedBinomial',
'DiscreteUniform',
'Geometric',
'HyperGeometric',
'Categorical',
'OrderedLogistic',
'DensityDist',
Expand Down
107 changes: 106 additions & 1 deletion pymc3/distributions/discrete.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,8 @@
__all__ = ['Binomial', 'BetaBinomial', 'Bernoulli', 'DiscreteWeibull',
'Poisson', 'NegativeBinomial', 'ConstantDist', 'Constant',
'ZeroInflatedPoisson', 'ZeroInflatedBinomial', 'ZeroInflatedNegativeBinomial',
'DiscreteUniform', 'Geometric', 'Categorical', 'OrderedLogistic']
'DiscreteUniform', 'Geometric', 'HyperGeometric', 'Categorical',
'OrderedLogistic']


class Binomial(Discrete):
Expand Down Expand Up @@ -819,6 +820,110 @@ def _repr_latex_(self, name=None, dist=None):
get_variable_name(p))


class HyperGeometric(Discrete):
R"""
Hypergeometric log-likelihood.
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
Hypergeometric log-likelihood.
Discrete hypergeometric distribution.

Isn't this better?


The probability of x successes in a sequence of n Bernoulli
trials (That is, sample size = n) - where the population
size is N, containing a total of k successful individuals.
The process is carried out without replacement.
Comment on lines +863 to +866
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
The probability of x successes in a sequence of n Bernoulli
trials (That is, sample size = n) - where the population
size is N, containing a total of k successful individuals.
The process is carried out without replacement.
The probability of :math:`x` successes in a sequence of :math:`n` bernoulli
trials taken without replacement from a population of :math:`N` objects,
containing :math:`k` good (or successful or Type I) objects.

Nitpick. Not a blocking comment.


The pmf of this distribution is
.. math:: f(x \mid N, n, k) = \frac{\binom{k}{x}\binom{N-k}{n-x}}{\binom{N}{n}}
.. plot::
Comment on lines +868 to +870
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
The pmf of this distribution is
.. math:: f(x \mid N, n, k) = \frac{\binom{k}{x}\binom{N-k}{n-x}}{\binom{N}{n}}
.. plot::
The pmf of this distribution is
.. math:: f(x \mid N, n, k) = \frac{\binom{k}{x}\binom{N-k}{n-x}}{\binom{N}{n}}
.. plot::

Nitpick. Not a blocking comment

import matplotlib.pyplot as plt
import numpy as np
import scipy.stats as st
plt.style.use('seaborn-darkgrid')
x = np.arange(1, 15)
N = 50
k = 10
for n in [20, 25]:
pmf = st.hypergeom.pmf(x, N, k, n)
plt.plot(x, pmf, '-o', label='n = {}'.format(n))
plt.plot(x, pmf, '-o', label='N = {}'.format(N))
plt.plot(x, pmf, '-o', label='k = {}'.format(k))
plt.xlabel('x', fontsize=12)
plt.ylabel('f(x)', fontsize=12)
plt.legend(loc=1)
plt.show()

======== =============================
Support :math:`x \in \mathbb{N}_{>0}`
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Are you sure this is right? On the Wikipedia page, support is given as x in [max(0, n - N + k), min(k, n)].

Mean :math:`\dfrac{n.k}{N}`
Variance :math:`\dfrac{(N-n).n.k.(N-k)}{(N-1).N^2}`
Comment on lines +890 to +891
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
Mean :math:`\dfrac{n.k}{N}`
Variance :math:`\dfrac{(N-n).n.k.(N-k)}{(N-1).N^2}`
Mean :math:`\dfrac{nk}{N}`
Variance :math:`\dfrac{(N-n)nk(N-k)}{(N-1)N^2}`

======== =============================

Parameters
----------
N : integer
Total size of the population
n : integer
Number of samples drawn from the population
k : integer
Number of successful individuals in the population
"""

def __init__(self, N, k, n, *args, **kwargs):
super().__init__(*args, **kwargs)
self.N = N = tt.as_tensor_variable(intX(N))
self.k = k = tt.as_tensor_variable(intX(k))
self.n = n = tt.as_tensor_variable(intX(n))
Comment on lines +906 to +908
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
self.N = N = tt.as_tensor_variable(intX(N))
self.k = k = tt.as_tensor_variable(intX(k))
self.n = n = tt.as_tensor_variable(intX(n))
self.N = intX(N)
self.k = intX(k)
self.n = intX(n)

Nitpick. Not a blocking comment.

self.mode = intX(tt.floor((n + 1)*(k + 1)/(N + 2)))

def random(self, point=None, size=None):
r"""
Draw random values from HyperGeometric distribution.
Parameters
----------
Comment on lines +913 to +915
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
Draw random values from HyperGeometric distribution.
Parameters
----------
Draw random values from HyperGeometric distribution.
Parameters
----------

Nitpick. Not a blocking comment

point : dict, optional
Dict of variable values on which random values are to be
conditioned (uses default point if not specified).
size : int, optional
Desired size of random sample (returns one sample if not
specified).
Returns
-------
array
Comment on lines +921 to +924
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
specified).
Returns
-------
array
specified).
Returns
-------
array

Nitpick. Not a blocking comment

"""
N, n, k = draw_values([self.N, self.n, self.k], point=point, size=size)
return generate_samples(np.random.hypergeometric, N, n, k,
dist_shape=self.shape,
size=size)

def logp(self, value):
r"""
Calculate log-probability of HyperGeometric distribution at specified value.
Parameters
----------
Comment on lines +933 to +935
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
Calculate log-probability of HyperGeometric distribution at specified value.
Parameters
----------
Calculate log-probability of HyperGeometric distribution at specified value.
Parameters
----------

Nitpick. Not a blocking comment

value : numeric
Value(s) for which log-probability is calculated. If the log probabilities for multiple
values are desired the values must be provided in a numpy array or theano tensor
Returns
-------
Comment on lines +938 to +940
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
values are desired the values must be provided in a numpy array or theano tensor
Returns
-------
values are desired the values must be provided in a numpy array or theano tensor
Returns
-------

Nitpick. Not a blocking comment

TensorVariable
"""
N = self.N
k = self.k
n = self.n
return bound(binomln(k, value) + binomln(N - k, n - value) - binomln(N, n),
0 <= k, k <= N, 0 <= n, 0 <= N, n - N + k <= value, 0 <= value,
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

You are already testing that 0 <= k and k <= N, so you should not have to include 0 <= N

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Ah, point! Removing the unnecessary condition.

value <= k, value <= n)

def _repr_latex_(self, name=None, dist=None):
if dist is None:
dist = self
N = dist.N
n = dist.n
k = dist.k
name = r'\text{%s}' % name
return r'${} \sim \text{{HyperGeometric}}(\mathit{{N}}={},~\mathit{{n}}={},~\mathit{{k}}={})$'.format(name,
get_variable_name(N),
get_variable_name(n),
get_variable_name(k))


class DiscreteUniform(Discrete):
R"""
Discrete uniform distribution.
Expand Down
5 changes: 5 additions & 0 deletions pymc3/tests/test_distributions.py
Original file line number Diff line number Diff line change
Expand Up @@ -75,6 +75,7 @@
Rice,
Kumaraswamy,
Moyal,
HyperGeometric
)

from ..distributions import continuous
Expand Down Expand Up @@ -817,6 +818,10 @@ def test_geometric(self):
Geometric, Nat, {"p": Unit}, lambda value, p: np.log(sp.geom.pmf(value, p))
)

def test_hypergeometric(self):
self.pymc3_matches_scipy(HyperGeometric, Nat, {'N': NatSmall, 'n': NatSmall, 'k': NatSmall},
lambda value, N, n, k: sp.hypergeom.logpmf(value, N, k, n))

def test_negative_binomial(self):
def test_fun(value, mu, alpha):
return sp.nbinom.logpmf(value, alpha, 1 - mu / (mu + alpha))
Expand Down
9 changes: 9 additions & 0 deletions pymc3/tests/test_distributions_random.py
Original file line number Diff line number Diff line change
Expand Up @@ -467,6 +467,11 @@ class TestGeometric(BaseTestCases.BaseTestCase):
distribution = pm.Geometric
params = {'p': 0.5}


class TestHyperGeometric(BaseTestCases.BaseTestCase):
distribution = pm.HyperGeometric
params = {'N': 50, 'n' : 25, 'k' :10}


class TestMoyal(BaseTestCases.BaseTestCase):
distribution = pm.Moyal
Expand Down Expand Up @@ -657,6 +662,10 @@ def ref_rand(size, alpha, mu):
def test_geometric(self):
pymc3_random_discrete(pm.Geometric, {'p': Unit}, size=500, fails=50, ref_rand=nr.geometric)

def test_hypergeometric(self):
pymc3_random_discrete(pm.HyperGeometric, {'N': Nat, 'n': Nat, 'k': Nat}, size=500, fails=50,
ref_rand=nr.hypergeometric)

def test_discrete_uniform(self):
def ref_rand(size, lower, upper):
return st.randint.rvs(lower, upper + 1, size=size)
Expand Down