From 6998ba440323af766c06e9bed55bfaa3330a3e07 Mon Sep 17 00:00:00 2001
From: Erik Osheim
Date: Fri, 8 May 2020 23:45:34 -0400
Subject: [PATCH] Add various statistical distributions to Gen.

This commit adds support for several statistical distributions:

 - Gaussian (normal)
 - Exponential
 - Geometric
 - Poisson
 - Binomial

These distributions are likely to be useful for writing generators, as
well as for fine-grained control of numerical ranges.

Currently, authors using Gen.choose have to decide whether to forbid
testing with large Double values (by using a small range), or to allow
large values (in which case their test will be dominated by large
values). One solution is to use distributions where large values are
possible but rare, such as an exponential distribution.

Similarly, authors can use a geometric distribution to generate the
size of a collection which is usually small, but may occasionally be
large.
---
 .../org/scalacheck/StatSpecification.scala    |  87 +++++++++
 src/main/scala/org/scalacheck/Gen.scala       | 165 +++++++++++++++++-
 2 files changed, 244 insertions(+), 8 deletions(-)
 create mode 100644 jvm/src/test/scala/org/scalacheck/StatSpecification.scala

diff --git a/jvm/src/test/scala/org/scalacheck/StatSpecification.scala b/jvm/src/test/scala/org/scalacheck/StatSpecification.scala
new file mode 100644
index 000000000..b19e25c63
--- /dev/null
+++ b/jvm/src/test/scala/org/scalacheck/StatSpecification.scala
@@ -0,0 +1,87 @@
+package org.scalacheck
+
+import Prop.{forAllNoShrink => forAll}
+
+object StatsSpecification extends Properties("Stats") {
+
+  // each test run generates 5k samples, so only do 10 of them.
+  override def overrideParameters(ps: Test.Parameters): Test.Parameters =
+    ps.withMinSuccessfulTests(10)
+
+  // we sample the distribution 5000 times, and expect the mean and
+  // standard deviation to be within ±10% of the true value.
+  val Samples = 5000
+  val ErrorRate = 0.1
+
+  // we'll generate relatively small, well-behaved mean values.
+  val genMean = Gen.choose(10.0, 20.0)
+
+  // generate a number of trials for use with binomial
+  val genTrials = Gen.choose(10, 30)
+
+  // generate a probability value
+  val genP = Gen.choose(0.2, 0.8)
+
+  property("prob") =
+    forAll(genP) { p =>
+      val gen = Gen.prob(p).map(b => if (b) 1.0 else 0.0)
+      check(gen, mean = p, stdDev = Math.sqrt(p * (1.0 - p)))
+    }
+
+  property("gaussian") =
+    forAll(genMean, genMean) { (mean, stdDev) =>
+      val gen = Gen.gaussian(mean, stdDev)
+      check(gen, mean, stdDev)
+    }
+
+  property("exponential") =
+    forAll(genMean) { mean =>
+      val gen = Gen.exponential(1.0 / mean)
+      check(gen, mean = mean, stdDev = mean)
+    }
+
+  property("geometric") =
+    forAll(genMean) { mean =>
+      val gen = Gen.geometric(mean).map(_.toDouble)
+      val p = 1.0 / (mean + 1.0)
+      val stdDev = Math.sqrt((1.0 - p) / (p * p))
+      check(gen, mean, stdDev)
+    }
+
+  property("poisson") =
+    forAll(genMean) { rate =>
+      val gen = Gen.poisson(rate).map(_.toDouble)
+      check(gen, mean = rate, stdDev = Math.sqrt(rate))
+    }
+
+  property("binomial") =
+    forAll(genTrials, genP) { (trials, p) =>
+      val gen = Gen.binomial(Gen.prob(p), trials).map(_.toDouble)
+      val mean = trials * p
+      val stdDev = Math.sqrt(trials * p * (1.0 - p))
+      check(gen, mean, stdDev)
+    }
+
+  def check(gen: Gen[Double], mean: Double, stdDev: Double): Prop = {
+    val (e1, e2) = (mean * ErrorRate, stdDev * ErrorRate)
+    val (μ, σ) = computeStats(gen, Samples)
+    (mean ± e1).contains(μ) && (stdDev ± e2).contains(σ)
+  }
+
+  def computeStats(g: Gen[Double], samples: Int): (Double, Double) = {
+    val vg = Gen.buildableOfN[Vector[Double], Double](samples, g)
+    val xs = vg.sample.get
+    val mean = xs.sum / xs.size
+    val stdDev = Math.sqrt(xs.iterator.map(x => Math.pow(x - mean, 2)).sum / xs.size)
+    (mean, stdDev)
+  }
+
+  case class Bounds(min: Double, max: Double) {
+    def contains(x: Double): Prop =
+      Prop(min <= x && x <= max) :| s"($min <= $x <= $max) was false"
+  }
+
+  implicit class MakeBounds(val n: Double) extends AnyVal {
+    def ±(error: Double): Bounds = Bounds(n - error, n + error)
+  }
+}
diff --git a/src/main/scala/org/scalacheck/Gen.scala b/src/main/scala/org/scalacheck/Gen.scala
index 3e997daf7..cd508c625 100644
--- a/src/main/scala/org/scalacheck/Gen.scala
+++ b/src/main/scala/org/scalacheck/Gen.scala
@@ -30,16 +30,13 @@ sealed abstract class Gen[+T] extends Serializable { self =>
 
   import Gen.{R, gen}
 
-  /** Just an alias */
-  private type P = Gen.Parameters
-
   // This is no longer used but preserved here for binary compatibility.
   private[scalacheck] def sieveCopy(x: Any): Boolean = true
 
   // If you implement new Gen[_] directly (instead of using
   // combinators), make sure to use p.initialSeed or p.useInitialSeed
   // in the implementation, instead of using seed directly.
-  private[scalacheck] def doApply(p: P, seed: Seed): R[T]
+  private[scalacheck] def doApply(p: Gen.Parameters, seed: Seed): R[T]
 
   //// Public interface ////
 
@@ -106,7 +103,7 @@ sealed abstract class Gen[+T] extends Serializable { self =>
    * This method is identical to [Gen.filter].
    */
   def suchThat(f: T => Boolean): Gen[T] = new Gen[T] {
-    def doApply(p: P, seed: Seed): Gen.R[T] =
+    def doApply(p: Gen.Parameters, seed: Seed): Gen.R[T] =
       p.useInitialSeed(seed) { (p0, s0) =>
         val r = self.doApply(p0, s0)
         r.copy(r = r.retrieve.filter(f))
@@ -126,7 +123,7 @@ sealed abstract class Gen[+T] extends Serializable { self =>
    */
   def retryUntil(p: T => Boolean, maxTries: Int): Gen[T] = {
     require(maxTries > 0)
-    def loop(params: P, seed: Seed, tries: Int): R[T] =
+    def loop(params: Gen.Parameters, seed: Seed, tries: Int): R[T] =
       if (tries > maxTries) throw RetryUntilException(tries) else {
         val r = self.doApply(params, seed)
         if (r.retrieve.exists(p)) r else loop(params, r.seed, tries + 1)
@@ -175,7 +172,7 @@ sealed abstract class Gen[+T] extends Serializable { self =>
 
   /** Put a label on the generator to make test reports clearer */
   def label(l: String): Gen[T] = new Gen[T] {
-    def doApply(p: P, seed: Seed) =
+    def doApply(p: Gen.Parameters, seed: Seed) =
       p.useInitialSeed(seed) { (p0, s0) =>
         val r = self.doApply(p0, s0)
         r.copy(l = r.labels + l)
@@ -240,7 +237,7 @@ object Gen extends GenArities with GenVersionSpecific {
           r(None, seed).copy(l = labels)
         case Some(t) =>
           val r = f(t)
-          r.copy(l = labels | r.labels, sd = r.seed)
+          r.copy(l = labels | r.labels)
       }
   }
 
@@ -947,6 +944,158 @@ object Gen extends GenArities with GenVersionSpecific {
 
   //// Number Generators ////
 
+  /**
+   * Generate a uniformly-distributed Long.
+   *
+   * This method is equally likely to generate every possible Long
+   * value.
+   */
+  val long: Gen[Long] =
+    gen { (_, s0) =>
+      val (n, s1) = s0.long
+      r(Some(n), s1)
+    }
+
+  /**
+   * Generate a Double uniformly-distributed in [0, 1).
+   *
+   * This method will generate one of 2^53 distinct Double values in
+   * the unit interval.
+   */
+  val double: Gen[Double] =
+    gen { (_, s0) =>
+      val (x, s1) = s0.double
+      r(Some(x), s1)
+    }
+
+  /**
+   * Generates a Boolean which has the given chance to be true.
+   *
+   * - prob(1.0) is always true
+   * - prob(0.5) is true 50% of the time
+   * - prob(0.1) is true 10% of the time
+   * - prob(0.0) is never true
+   */
+  def prob(chance: Double): Gen[Boolean] =
+    if (chance <= 0.0) Gen.const(false)
+    else if (chance >= 1.0) Gen.const(true)
+    else gen { (_, s0) =>
+      val (x, s1) = s0.double
+      r(Some(x < chance), s1)
+    }
+
+  /**
+   * Generates Double values according to the given gaussian
+   * distribution, specified by its mean and standard deviation.
+   *
+   * Gaussian distributions are also called normal distributions.
+   *
+   * The range of values is theoretically (-∞, ∞) but 99.7% of all
+   * values will be contained within (mean ± 3 * stdDev).
+   */
+  def gaussian(mean: Double, stdDev: Double): Gen[Double] = {
+    def loop(s0: Seed): R[Double] = {
+      val (x0, s1) = s0.double
+      val (y0, s2) = s1.double
+      val x = x0 * 2.0 - 1.0
+      val y = y0 * 2.0 - 1.0
+      val s = x * x + y * y
+      if (s >= 1.0 || s == 0.0) {
+        loop(s2)
+      } else {
+        val scale = stdDev * Math.sqrt(-2.0 * Math.log(s) / s)
+        val res = x * scale + mean // dropping y * scale + mean
+        r(Some(res), s2)
+      }
+    }
+    gen((_, seed) => loop(seed))
+  }
+
+  /**
+   * Generates Double values according to the given exponential
+   * distribution, specified by its rate parameter.
+   *
+   * The mean and standard deviation are both equal to 1/rate.
+   *
+   * The range of values is [0, ∞).
+   */
+  def exponential(rate: Double): Gen[Double] = {
+    require(rate > 0.0, s"rate must be positive (got: $rate)")
+    val mean = 1.0 / rate
+    gen { (_, s0) =>
+      val (x, s1) = s0.double
+      r(Some(-Math.log1p(-x) * mean), s1) // -log(1 - x): x is in [0, 1), so this stays finite when x == 0.0
+    }
+  }
+
+  /**
+   * Generates Int values according to the given geometric
+   * distribution, specified by its mean.
+   *
+   * This distribution represents the expected number of failures
+   * before a successful test, where the probability of a successful
+   * test is p = 1 / (mean + 1).
+   *
+   * The ideal range of values is [0, ∞), although the largest value
+   * that can be produced here is 2147483647 (Int.MaxValue).
+   */
+  def geometric(mean: Double): Gen[Int] = {
+    require(mean > 0.0, s"mean must be positive (got: $mean)")
+    val p = 1.0 / (mean + 1.0)
+    val lognp = Math.log1p(-p) // log(1 - p)
+    gen { (_, s0) =>
+      val (u, s1) = s0.double
+      r(Some(Math.floor(Math.log1p(-u) / lognp).toInt), s1) // log(1 - u): avoids log(0) when u == 0.0
+    }
+  }
+
+  /**
+   * Generates Int values according to the given Poisson distribution,
+   * specified by its rate parameter.
+   *
+   * The mean equals the rate; the standard deviation is sqrt(rate).
+   *
+   * In principle any positive value is a valid rate parameter.
+   * However, our method of generating values cannot handle large
+   * rates, so we require rate <= 745.
+   */
+  def poisson(rate: Double): Gen[Int] = {
+    require(0 < rate && rate <= 745.0, s"rate must be between 0 and 745 (got $rate)")
+    val L = Math.exp(-rate)
+    def loop(s0: Seed, k: Int, p: Double): R[Int] =
+      if (p <= L) {
+        r(Some(k - 1), s0)
+      } else {
+        val (x, s1) = s0.double
+        loop(s1, k + 1, p * x)
+      }
+
+    gen((_, s) => loop(s, 0, 1.0))
+  }
+
+  /**
+   * Generates Int values according to the given binomial
+   * distribution, specified by the number of trials to conduct, and
+   * the probability of a true test.
+   *
+   * This distribution counts the number of trials which were
+   * successful according to a given test probability.
+   *
+   * The range of values is [0, trials].
+   */
+  def binomial(test: Gen[Boolean], trials: Int): Gen[Int] = {
+    def loop(ps: Gen.Parameters, s: Seed, i: Int, n: Int): R[Int] =
+      if (i >= trials) {
+        r(Some(n), s)
+      } else {
+        val r = test.doPureApply(ps, s)
+        val success = r.retrieve.get
+        loop(ps, r.seed, i + 1, if (success) n + 1 else n)
+      }
+    gen((ps, s) => loop(ps, s, 0, 0))
+  }
+
+
   /** Generates positive numbers of uniform distribution, with an
    * upper bound of the generation size parameter. */
   def posNum[T](implicit num: Numeric[T], c: Choose[T]): Gen[T] = {