From 79499a7d8586da5a66dc8e8cfccc1f6d1f9b753c Mon Sep 17 00:00:00 2001 From: Christoph Sawade Date: Mon, 7 Jul 2014 10:14:31 +0200 Subject: [PATCH] Add functions for the hypergeometric distribution - add support - add mode, skewness, kurtosis - add tests --- doc/source/univariate.rst | 6 ++++- src/univariate/hypergeometric.jl | 26 +++++++++++++++--- src/univariates.jl | 1 + test/hypergeometric.jl | 46 ++++++++++++++++++++++++++++++++ test/noncentralhypergeometric.jl | 4 +-- test/runtests.jl | 1 + test/univariate.jl | 1 + 7 files changed, 79 insertions(+), 6 deletions(-) create mode 100644 test/hypergeometric.jl diff --git a/doc/source/univariate.rst b/doc/source/univariate.rst index 2835733f8c..628ebf4b60 100644 --- a/doc/source/univariate.rst +++ b/doc/source/univariate.rst @@ -298,7 +298,11 @@ A `Geometric distribution ` Hypergeometric Distribution ~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -A `Hypergeometric distribution `_ describes the number of successes in *n* draws without replacement from a finite population containing *s* successes and *f* failures. +A `Hypergeometric distribution `_ describes the number of successes in *n* draws without replacement from a finite population containing *s* successes and *f* failures. The probability mass function is: + +.. math:: + + P(X=x) = {{{s \choose x} {f \choose {n-x}}}\over {s+f \choose n}}, \quad x \in [\max(0, n - f), \min(n, s)] .. code-block:: julia diff --git a/src/univariate/hypergeometric.jl b/src/univariate/hypergeometric.jl index 9ac5ae2ddc..7c2ef4e865 100644 --- a/src/univariate/hypergeometric.jl +++ b/src/univariate/hypergeometric.jl @@ -13,10 +13,12 @@ end @_jl_dist_3p Hypergeometric hyper -function insupport(d::Hypergeometric, x::Real) - isinteger(x) && zero(x) <= x <= d.n && (d.n - d.nf) <= x <= d.ns -end +# handling support +minimum(d::Hypergeometric) = max(0, d.n - d.nf) +maximum(d::Hypergeometric) = min(d.ns, d.n) +support(d::Hypergeometric) = minimum(d):maximum(d) +# properties mean(d::Hypergeometric) = d.n * d.ns / (d.ns + d.nf) function var(d::Hypergeometric) @@ -24,3 +26,21 @@ function var(d::Hypergeometric) p = d.ns / N d.n * p * (1.0 - p) * (N - d.n) / (N - 1.0) end +mode(d::Hypergeometric) = int(floor((d.n+1) * (d.ns+1) / (d.ns+d.nf+2))) + +function modes(d::Hypergeometric) + if (d.ns == d.nf) && mod(d.n, 2) == 1 + [(d.n-1)/2, (d.n+1)/2] + else + [mode(d)] + end +end + +skewness(d::Hypergeometric) = (d.nf-d.ns)*sqrt(d.ns+d.nf-1)*(d.ns+d.nf-2*d.n)/sqrt(d.n*d.ns*d.nf*(d.ns+d.nf-d.n))/(d.ns+d.nf-2) +function kurtosis(d::Hypergeometric) + N = d.ns + d.nf + a = (N-1) * N^2 * (N * (N+1) - 6*d.ns * (N-d.ns) - 6*d.n*(N-d.n)) + 6*d.n*d.ns*(d.nf)*(N-d.n)*(5*N-6) + b = (d.n*d.ns*(N-d.ns) * (N-d.n)*(N-2)*(N-3)) + a/b +end + diff --git a/src/univariates.jl b/src/univariates.jl index 4b5e48f8cd..997bc7068c 100644 --- a/src/univariates.jl +++ b/src/univariates.jl @@ -56,6 +56,7 @@ proper_kurtosis(d::Distribution) = kurtosis(d, false) ## pdf, cdf, and friends logpdf(d::UnivariateDistribution, x::Number) = log(pdf(d, x)) +cdf(d::DiscreteUnivariateDistribution, k::Real) = sum([pdf(d,i) for i in minimum(d):k]) ccdf(d::UnivariateDistribution, q::Real) = 1.0 - cdf(d, q) cquantile(d::UnivariateDistribution, p::Real) = quantile(d, 1.0 - p) diff --git a/test/hypergeometric.jl b/test/hypergeometric.jl new file mode 100644 index 0000000000..f07bac5ab5 --- /dev/null +++ b/test/hypergeometric.jl @@ -0,0 +1,46 @@ +using Distributions +using Base.Test + +# simple example +ns = 4 +nf = 6 +n = 3 + +d = Hypergeometric(ns, nf, n) + +@test_approx_eq mean(d) 6.0/5.0 +@test_approx_eq var(d) 14.0/25.0 +@test_approx_eq skewness(d) 1.0/(2.0*sqrt(14.0)) +@test_approx_eq kurtosis(d) 128.0/49.0-3 +@test mode(d) == 1 + +@test quantile(d, 0.05) == 0 +@test quantile(d, 0.95) == 2 + +expected = [1/3 1 3/5 1/15]/2 +@test_approx_eq logpdf(d, 0:3) log(expected) +@test_approx_eq pdf(d, 0:3) expected +@test_approx_eq cdf(d, 0:3) cumsum(expected,2) + + +# http://en.wikipedia.org/wiki/Fisher's_noncentral_hypergeometric_distribution +# (distributions are both equal to the (central) hypergeometric distribution when the odds ratio is 1.) +ns = 80 +nf = 60 +n = 100 + +d = Hypergeometric(ns,nf,n) + +@test_approx_eq mean(d) 400.0/7 +@test_approx_eq var(d) 48000.0/6811.0 +@test_approx_eq skewness(d) sqrt(139/30)/92 +@test_approx_eq kurtosis(d) 44952739/15124800-3 +@test mode(d) == 57 + +@test quantile(d, 0.05) == 53 +@test quantile(d, 0.95) == 62 + +expected = [0.10934614 0.13729286 0.14961947 0.14173721 0.11682889 0.08382473] +@test_approx_eq_eps logpdf(d, 55:60) log(expected) 1e-7 +@test_approx_eq_eps pdf(d, 55:60) expected 1e-7 +@test_approx_eq_eps cdf(d, 55:60) cumsum(expected,2)+cdf(d, 54) 1e-7 diff --git a/test/noncentralhypergeometric.jl b/test/noncentralhypergeometric.jl index d4b46bef32..dc7bdd0dbf 100644 --- a/test/noncentralhypergeometric.jl +++ b/test/noncentralhypergeometric.jl @@ -40,7 +40,7 @@ ref = Hypergeometric(ns,nf,n) @test_approx_eq_eps cdf(d, 51) cdf(ref, 51) 1e-7 @test_approx_eq_eps quantile(d, 0.05) quantile(ref, 0.05) 1e-7 @test_approx_eq_eps quantile(d, 0.95) quantile(ref, 0.95) 1e-7 -@test mode(d) == 57 +@test mode(d) == mode(ref) ## Wallenius' noncentral hypergeometric distribution ns = 80 @@ -81,4 +81,4 @@ ref = Hypergeometric(ns,nf,n) @test_approx_eq_eps cdf(d, 51) cdf(ref, 51) 1e-7 @test_approx_eq_eps quantile(d, 0.05) quantile(ref, 0.05) 1e-7 @test_approx_eq_eps quantile(d, 0.95) quantile(ref, 0.95) 1e-7 -@test mode(d) == 57 \ No newline at end of file +@test mode(d) == mode(ref) \ No newline at end of file diff --git a/test/runtests.jl b/test/runtests.jl index cf63a4423f..2e47897132 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -13,6 +13,7 @@ tests = [ "truncate", "multinomial", "dirichlet", + "hypergeometric", "multivariate", "mvnormal", "mvtdist", diff --git a/test/univariate.jl b/test/univariate.jl index f278a59f87..8bd5c3f03f 100644 --- a/test/univariate.jl +++ b/test/univariate.jl @@ -69,6 +69,7 @@ distlist = [Arcsine(), Hypergeometric(3.0, 2.0, 2.0), Hypergeometric(2.0, 3.0, 2.0), Hypergeometric(2.0, 2.0, 3.0), + Hypergeometric(60.0, 80.0, 100.0), InverseGaussian(1.0,1.0), InverseGaussian(2.0,7.0), InverseGamma(1.0, 1.0),