Add numerically safe ordered probit distribution. (#4232)
* Add numerically safer ordered probit distribution.

* Fix styling issue.

* Use smaller values for probit & cutpoint testing

* fix import sorting

* Typo fix

* add a line to RELEASE-NOTES.md

Co-authored-by: Michael Osthege <michael.osthege@outlook.com>
Co-authored-by: Michael Osthege <m.osthege@fz-juelich.de>
3 people authored Dec 16, 2020
1 parent dbcc49e commit 9dbf9ea
Showing 5 changed files with 197 additions and 1 deletion.
2 changes: 2 additions & 0 deletions RELEASE-NOTES.md
@@ -9,6 +9,8 @@ This is the first release to support Python3.9 and to drop Python3.6.
- Make `sample_shape` same across all contexts in `draw_values` (see [#4305](https://github.com/pymc-devs/pymc3/pull/4305)).
- Removed `theanof.set_theano_config` because it illegally touched Theano's privates (see [#4329](https://github.com/pymc-devs/pymc3/pull/4329)).

### New Features
- `OrderedProbit` distribution added (see [#4232](https://github.com/pymc-devs/pymc3/pull/4232)).

## PyMC3 3.10.0 (7 December 2020)

2 changes: 2 additions & 0 deletions pymc3/distributions/__init__.py
@@ -61,6 +61,7 @@
HyperGeometric,
NegativeBinomial,
OrderedLogistic,
OrderedProbit,
Poisson,
ZeroInflatedBinomial,
ZeroInflatedNegativeBinomial,
@@ -140,6 +141,7 @@
"HyperGeometric",
"Categorical",
"OrderedLogistic",
"OrderedProbit",
"DensityDist",
"Distribution",
"Continuous",
131 changes: 131 additions & 0 deletions pymc3/distributions/discrete.py
@@ -24,7 +24,10 @@
binomln,
bound,
factln,
log_diff_normal_cdf,
logpow,
normal_lccdf,
normal_lcdf,
random_choice,
)
from pymc3.distributions.distribution import Discrete, draw_values, generate_samples
@@ -1638,3 +1641,131 @@ def __init__(self, eta, cutpoints, *args, **kwargs):
p = p_cum[..., 1:] - p_cum[..., :-1]

super().__init__(p=p, *args, **kwargs)


class OrderedProbit(Categorical):
R"""
Ordered Probit log-likelihood.
Useful for regression on ordinal data whose values range
from 1 to K as a function of some predictor, :math:`\eta`. The
cutpoints, :math:`c`, separate which ranges of :math:`\eta` are
mapped to which of the K observed dependent variables. The number
of cutpoints is K - 1. It is recommended that the cutpoints are
constrained to be ordered.
To stabilize the computation, the log-likelihood is computed in
log space using the scaled complementary error function `erfcx`.
.. math::
f(k \mid \eta, c) = \left\{
\begin{array}{l}
1 - \text{normal_cdf}(0, \sigma, \eta - c_1)
\,, \text{if } k = 0 \\
\text{normal_cdf}(0, \sigma, \eta - c_{k - 1}) -
\text{normal_cdf}(0, \sigma, \eta - c_{k})
\,, \text{if } 0 < k < K \\
\text{normal_cdf}(0, \sigma, \eta - c_{K - 1})
\,, \text{if } k = K \\
\end{array}
\right.
Parameters
----------
eta : float
The predictor.
cutpoints: array
The length K - 1 array of cutpoints which break :math:`\eta` into
ranges. Do not explicitly set the first and last elements of
:math:`c` to negative and positive infinity.
sigma: float
The standard deviation of the probit function.
Example
--------
.. code:: python
# Generate data for a simple 1 dimensional example problem
n1_c = 300; n2_c = 300; n3_c = 300
cluster1 = np.random.randn(n1_c) + -1
cluster2 = np.random.randn(n2_c) + 0
cluster3 = np.random.randn(n3_c) + 2
x = np.concatenate((cluster1, cluster2, cluster3))
y = np.concatenate((1*np.ones(n1_c),
2*np.ones(n2_c),
3*np.ones(n3_c))) - 1
# Ordered probit regression
with pm.Model() as model:
cutpoints = pm.Normal("cutpoints", mu=[-1,1], sigma=10, shape=2,
transform=pm.distributions.transforms.ordered)
y_ = pm.OrderedProbit("y", cutpoints=cutpoints, eta=x, observed=y)
tr = pm.sample(1000)
# Plot the results
plt.hist(cluster1, 30, alpha=0.5);
plt.hist(cluster2, 30, alpha=0.5);
plt.hist(cluster3, 30, alpha=0.5);
plt.hist(tr["cutpoints"][:,0], 80, alpha=0.2, color='k');
plt.hist(tr["cutpoints"][:,1], 80, alpha=0.2, color='k');
"""

def __init__(self, eta, cutpoints, *args, **kwargs):

self.eta = tt.as_tensor_variable(floatX(eta))
self.cutpoints = tt.as_tensor_variable(cutpoints)

probits = tt.shape_padright(self.eta) - self.cutpoints
_log_p = tt.concatenate(
[
tt.shape_padright(normal_lccdf(0, 1, probits[..., 0])),
log_diff_normal_cdf(0, 1, probits[..., :-1], probits[..., 1:]),
tt.shape_padright(normal_lcdf(0, 1, probits[..., -1])),
],
axis=-1,
)
_log_p = tt.as_tensor_variable(floatX(_log_p))

self._log_p = _log_p
self.mode = tt.argmax(_log_p, axis=-1)
p = tt.exp(_log_p)

super().__init__(p=p, *args, **kwargs)

def logp(self, value):
r"""
Calculate log-probability of Ordered Probit distribution at specified value.
Parameters
----------
value: numeric
Value(s) for which log-probability is calculated. If the log probabilities for multiple
values are desired, the values must be provided in a numpy array or theano tensor.
Returns
-------
TensorVariable
"""
logp = self._log_p
k = self.k

# Clip values before using them for indexing
value_clip = tt.clip(value, 0, k - 1)

if logp.ndim > 1:
if logp.ndim > value_clip.ndim:
value_clip = tt.shape_padleft(value_clip, logp.ndim - value_clip.ndim)
elif logp.ndim < value_clip.ndim:
logp = tt.shape_padleft(logp, value_clip.ndim - logp.ndim)
pattern = (logp.ndim - 1,) + tuple(range(logp.ndim - 1))
a = take_along_axis(
logp.dimshuffle(pattern),
value_clip,
)
else:
a = logp[value_clip]

return bound(a, value >= 0, value <= (k - 1))
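
The piecewise formula in the `OrderedProbit` docstring above can be sanity-checked outside of Theano. The following is a minimal NumPy/SciPy sketch, not part of this commit: it computes the same K category probabilities that `__init__` builds, but in plain probability space instead of the stabilized log space used by the production code (the helper name `ordered_probit_probs` is hypothetical).

import numpy as np
from scipy.stats import norm

def ordered_probit_probs(eta, cutpoints):
    """Category probabilities p_0, ..., p_{K-1} for a single predictor value eta."""
    probits = eta - np.asarray(cutpoints)  # shape (K - 1,), mirrors tt.shape_padright(eta) - cutpoints
    cdf = norm.cdf(probits)
    return np.concatenate((
        [1.0 - cdf[0]],       # k = 0:         1 - Phi(eta - c_1)
        cdf[:-1] - cdf[1:],   # 0 < k < K - 1: Phi(eta - c_k) - Phi(eta - c_{k+1})
        [cdf[-1]],            # k = K - 1:     Phi(eta - c_{K-1})
    ))

p = ordered_probit_probs(eta=0.3, cutpoints=[-1.0, 1.0])
print(p, p.sum())  # three non-negative probabilities summing to 1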
40 changes: 40 additions & 0 deletions pymc3/distributions/dist_math.py
@@ -134,6 +134,46 @@ def normal_lccdf(mu, sigma, x):
)


def log_diff_normal_cdf(mu, sigma, x, y):
"""
Compute :math:`\\log(\\Phi(\\frac{x - \\mu}{\\sigma}) - \\Phi(\\frac{y - \\mu}{\\sigma}))` safely in log space.
Parameters
----------
mu: float
mean
sigma: float
std
x: float
y: float
must be strictly less than x.
Returns
-------
log (\\Phi(x) - \\Phi(y))
"""
x = (x - mu) / sigma / tt.sqrt(2.0)
y = (y - mu) / sigma / tt.sqrt(2.0)

# To stabilize the computation, consider these three regions:
# 1) x > y > 0 => Use erf(x) = 1 - e^{-x^2} erfcx(x) and erf(y) = 1 - e^{-y^2} erfcx(y)
# 2) 0 > x > y => Use erf(x) = e^{-x^2} erfcx(-x) - 1 and erf(y) = e^{-y^2} erfcx(-y) - 1
#    (the constant terms cancel in the difference)
# 3) x > 0 > y => Naive formula log( (erf(x) - erf(y)) / 2 ) works fine.
return tt.log(0.5) + tt.switch(
tt.gt(y, 0),
-tt.square(y) + tt.log(tt.erfcx(y) - tt.exp(tt.square(y) - tt.square(x)) * tt.erfcx(x)),
tt.switch(
tt.lt(x, 0), # 0 > x > y
-tt.square(x)
+ tt.log(tt.erfcx(-x) - tt.exp(tt.square(x) - tt.square(y)) * tt.erfcx(-y)),
tt.log(tt.erf(x) - tt.erf(y)),  # x > 0 > y
),
)
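
# --- Illustration (not part of this diff): a standalone NumPy/SciPy sketch of the same
# --- three-region strategy, handy for checking log_diff_normal_cdf numerically.
# --- The helper name below is hypothetical and only assumes SciPy is installed.
def _log_diff_normal_cdf_np(mu, sigma, x, y):
    import numpy as np
    from scipy.special import erf, erfcx

    # log(Phi((x - mu) / sigma) - Phi((y - mu) / sigma)) for scalars with y < x
    x = (x - mu) / sigma / np.sqrt(2.0)
    y = (y - mu) / sigma / np.sqrt(2.0)
    if y > 0:  # region 1: both arguments positive, rewrite erf via erfcx on the right tail
        return np.log(0.5) - y ** 2 + np.log(erfcx(y) - np.exp(y ** 2 - x ** 2) * erfcx(x))
    elif x < 0:  # region 2: both arguments negative, mirror the same trick
        return np.log(0.5) - x ** 2 + np.log(erfcx(-x) - np.exp(x ** 2 - y ** 2) * erfcx(-y))
    else:  # region 3: arguments straddle zero, the naive formula is already stable
        return np.log(0.5) + np.log(erf(x) - erf(y))

# Example: Phi(40) - Phi(39) underflows to 0 in float64, so the naive log is -inf,
# while _log_diff_normal_cdf_np(0.0, 1.0, 40.0, 39.0) returns roughly -765.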


def sigma2rho(sigma):
"""
`sigma -> rho` theano converter
23 changes: 22 additions & 1 deletion pymc3/tests/test_distributions.py
@@ -26,7 +26,7 @@
from numpy import array, exp, inf, log
from numpy.testing import assert_allclose, assert_almost_equal, assert_equal
from scipy import integrate
from scipy.special import logit
from scipy.special import erf, logit

import pymc3 as pm

@@ -74,6 +74,7 @@
NegativeBinomial,
Normal,
OrderedLogistic,
OrderedProbit,
Pareto,
Poisson,
Rice,
@@ -431,6 +432,17 @@ def orderedlogistic_logpdf(value, eta, cutpoints):
return np.where(np.all(ps >= 0), np.log(p), -np.inf)


def invprobit(x):
return (erf(x / np.sqrt(2)) + 1) / 2


def orderedprobit_logpdf(value, eta, cutpoints):
c = np.concatenate(([-np.inf], cutpoints, [np.inf]))
ps = np.array([invprobit(eta - cc) - invprobit(eta - cc1) for cc, cc1 in zip(c[:-1], c[1:])])
p = ps[value]
return np.where(np.all(ps >= 0), np.log(p), -np.inf)


class Simplex:
def __init__(self, n):
self.vals = list(simplex_values(n))
@@ -1536,6 +1548,15 @@ def test_orderedlogistic(self, n):
lambda value, eta, cutpoints: orderedlogistic_logpdf(value, eta, cutpoints),
)

@pytest.mark.parametrize("n", [2, 3, 4])
def test_orderedprobit(self, n):
self.pymc3_matches_scipy(
OrderedProbit,
Domain(range(n), "int64"),
{"eta": Runif, "cutpoints": UnitSortedVector(n - 1)},
lambda value, eta, cutpoints: orderedprobit_logpdf(value, eta, cutpoints),
)

def test_densitydist(self):
def logp(x):
return -log(2 * 0.5) - abs(x - 0.5) / 0.5
