From bf2d79a5fbd529fdc4880644fa62d42e0fbd034d Mon Sep 17 00:00:00 2001 From: lhgravendeel Date: Mon, 21 May 2018 21:34:31 +0200 Subject: [PATCH] Fixing quantile calculation See addthis/stream-lib#138 --- .../analytics/stream/quantile/TDigest.java | 73 ++++----- .../stream/quantile/TDigestTest.java | 144 ++++++++++++++++++ 2 files changed, 175 insertions(+), 42 deletions(-) diff --git a/src/main/java/com/clearspring/analytics/stream/quantile/TDigest.java b/src/main/java/com/clearspring/analytics/stream/quantile/TDigest.java index c0aa150a5..6e2bd19fd 100644 --- a/src/main/java/com/clearspring/analytics/stream/quantile/TDigest.java +++ b/src/main/java/com/clearspring/analytics/stream/quantile/TDigest.java @@ -268,61 +268,50 @@ public double cdf(double x) { } /** + * Get the desired quantile. + * + * NOTE: The quantile can exceed the min/max of the input data points, + * especially if the amount of data points or centroids is small and the quantile is close to 0 or 1. + * This method of quantile calculation uses linear interpolation to construct the inverse CDF, + * where the first centroid's probability is at (centroid's weight / weight sum) / 2. + * * @param q The quantile desired. Can be in the range [0,1]. * @return The minimum value x such that we think that the proportion of samples is <= x is q. */ public double quantile(double q) { GroupTree values = summary; - Preconditions.checkArgument(values.size() > 1); + Preconditions.checkArgument(values.size() > 0); Iterator it = values.iterator(); Group center = it.next(); - Group leading = it.next(); if (!it.hasNext()) { - // only two centroids because of size limits - // both a and b have to have just a single element - double diff = (leading.mean() - center.mean()) / 2; - if (q > 0.75) { - return leading.mean() + diff * (4 * q - 3); - } else { - return center.mean() + diff * (4 * q - 1); - } - } else { - q *= count; - double right = (leading.mean() - center.mean()) / 2; - // we have nothing else to go on so make left hanging width same as right to start - double left = right; - - double t = center.count(); - while (it.hasNext()) { - if (t + center.count() / 2 >= q) { - // left side of center - return center.mean() - left * 2 * (q - t) / center.count(); - } else if (t + leading.count() >= q) { - // right of b but left of the left-most thing beyond - return center.mean() + right * 2.0 * (center.count() - (q - t)) / center.count(); - } - t += center.count(); + // If there is only 1 centroid, always use its value + return center.mean(); + } + Group leading = it.next(); + q *= count; + + // First pivot happens at the second centroid + double nextPivot = center.count() + leading.count() / 2.0; + if (q <= nextPivot || !it.hasNext()) { + // This quantile doesn't exceed the next pivot, or there is no next pivot. + // In either case, the slope can be derived from the first two centroids. + double slope = 2.0 * (leading.mean() - center.mean()) / (double) (leading.count() + center.count()); + return leading.mean() - (nextPivot - q) * slope; + } - center = leading; - leading = it.next(); - left = right; - right = (leading.mean() - center.mean()) / 2; - } - // ran out of data ... assume final width is symmetrical + while (it.hasNext()) { + double prevPivot = nextPivot; center = leading; - left = right; - if (t + center.count() / 2 >= q) { - // left side of center - return center.mean() - left * 2 * (q - t) / center.count(); - } else if (t + leading.count() >= q) { - // right of center but left of leading - return center.mean() + right * 2.0 * (center.count() - (q - t)) / center.count(); - } else { - // shouldn't be possible - return 1; + leading = it.next(); + nextPivot = prevPivot + ((center.count() + leading.count()) / 2.0); + if (q <= nextPivot || !it.hasNext()) { + double slope = 2.0 * (leading.mean() - center.mean()) / (double) (leading.count() + center.count()); + return leading.mean() - (nextPivot - q) * slope; } } + // shouldn't be possible + return 1; } public int centroidCount() { diff --git a/src/test/java/com/clearspring/analytics/stream/quantile/TDigestTest.java b/src/test/java/com/clearspring/analytics/stream/quantile/TDigestTest.java index 610deed8e..ed7714dc4 100644 --- a/src/test/java/com/clearspring/analytics/stream/quantile/TDigestTest.java +++ b/src/test/java/com/clearspring/analytics/stream/quantile/TDigestTest.java @@ -532,6 +532,150 @@ public void testMerge() { } } + /** + * Test a situation with just 1 datapoint, expecting 1 centroid to be generated and quantiles to be equal to the single data point. + */ + @Test + public void testOnePointQuantiles() { + TDigest digest = new TDigest(100); + digest.add(10); + + assertEquals(1, digest.centroidCount()); + verifyPercentilesIncremental(digest); + + assertEquals(10, digest.quantile(0.5), 0.001); + assertEquals(10, digest.quantile(0), 0.001); + assertEquals(10, digest.quantile(1), 0.001); + } + + /** + * Test a situation with just 2 datapoints, expecting 2 centroids to be generated and quantiles to be outside the min/max. + */ + @Test + public void testTwoQuantiles() { + TDigest digest = new TDigest(100); + digest.add(10); + digest.add(20); + + assertEquals(2, digest.centroidCount()); + verifyPercentilesIncremental(digest); + + assertEquals("Median is expected in the middle", 15.0, digest.quantile(0.5), 0.001); + + //NOTE: The quantiles for 0.0 and 1.0 will be outside the min/max. The calculation is only based on the centroids. + // Especially in the case of merged or serialized TDigests, the min/max information will be lost, so this can't be applied reliably + assertEquals("Left side is considered half the distance between the two points to the left", 5.0, digest.quantile(0), 0.001); + assertEquals("Right side is considered half the distance between the two points to the right",25.0, digest.quantile(1), 0.001); + } + + /** + * Test a situation with just 2 datapoints, with varying weights. + */ + @Test + public void testTwoQuantilesVaryingWeight() { + TDigest digest = new TDigest(100); + digest.add(10, 3); + digest.add(20, 1); + + assertEquals(2, digest.centroidCount()); + verifyPercentilesIncremental(digest); + + assertEquals(2.5, digest.quantile(0), 0.001); + assertEquals(12.5, digest.quantile(0.5), 0.001); + assertEquals(22.5, digest.quantile(1), 0.001); + } + + /** + * Test quantiles with 3 data points of equal weight. + */ + @Test + public void testQuantiles() { + TDigest digest = new TDigest(100); + digest.add(0); + digest.add(10); + digest.add(20); + + assertEquals(3, digest.centroidCount()); + verifyPercentilesIncremental(digest); + + assertEquals(-5.0, digest.quantile(0), 0.001); + assertEquals(10.0, digest.quantile(0.5), 0.001); + assertEquals(25.0, digest.quantile(1), 0.001); + } + + /** + * Test quantiles with 3 data points if varying weight. + */ + @Test + public void testVaryingWeightQuantiles() { + TDigest digest = new TDigest(100); + digest.add(0, 3); + digest.add(10, 1); + digest.add(20, 1); + + assertEquals(3, digest.centroidCount()); + verifyPercentilesIncremental(digest); + + assertEquals(-7.5, digest.quantile(0), 0.001); + assertEquals(5.0, digest.quantile(0.5), 0.001); + assertEquals(25.0, digest.quantile(1), 0.001); + + // Altering only the last datapoint should not change the median + digest = new TDigest(100); + digest.add(0, 3); + digest.add(10, 1); + digest.add(30, 1); + + assertEquals(3, digest.centroidCount()); + verifyPercentilesIncremental(digest); + + assertEquals(-7.5, digest.quantile(0), 0.001); + assertEquals(5.0, digest.quantile(0.5), 0.001); + assertEquals(40.0, digest.quantile(1), 0.001); + } + + /** + * Does basic sanity testing for a particular small example that used to fail. + * See https://github.com/addthis/stream-lib/issues/138 + */ + @Test + public void testThreePointExample() { + TDigest tdigest = new TDigest(100); + double x0 = 0.18615591526031494; + double x1 = 0.4241943657398224; + double x2 = 0.8813006281852722; + + tdigest.add(x0); + tdigest.add(x1); + tdigest.add(x2); + + double p10 = tdigest.quantile(0.1); + double p50 = tdigest.quantile(0.5); + double p90 = tdigest.quantile(0.9); + double p95 = tdigest.quantile(0.95); + double p99 = tdigest.quantile(0.99); + + assertTrue("ordering of quantiles", p10 <= p50); + assertTrue("ordering of quantiles", p50 <= p90); + assertTrue("ordering of quantiles", p90 <= p95); + assertTrue("ordering of quantiles", p95 <= p99); + + assertEquals("Median should be the middle value, as all weights are equal", x1, p50, 0.001); + } + + /** + * Verify that all percentiles from 0 to 100 (inclusive on both sides) are monotonically increasing (weakly increasing). + * @param digest + */ + private void verifyPercentilesIncremental(TDigest digest) { + double prevPercentile = digest.quantile(0); + for (int i = 1; i <= 100; i++) { + double percentile = digest.quantile(i / 100.0); + assertTrue("At percentile " + i + ", prev was " + prevPercentile + ", new is: " + percentile, percentile >= prevPercentile); + prevPercentile = percentile; + } + } + private double cdf(final double x, List data) { int n1 = 0; int n2 = 0;