Skip to content

Commit

Permalink
Fixing quantile calculation
Browse files Browse the repository at this point in the history
  • Loading branch information
lhgravendeel committed May 21, 2018
1 parent 6e0edb5 commit bf2d79a
Show file tree
Hide file tree
Showing 2 changed files with 175 additions and 42 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -268,61 +268,50 @@ public double cdf(double x) {
}

/**
* Get the desired quantile.
*
* NOTE: The quantile can exceed the min/max of the input data points,
* especially if the amount of data points or centroids is small and the quantile is close to 0 or 1.
* This method of quantile calculation uses linear interpolation to construct the inverse CDF,
* where the first centroid's probability is at (centroid's weight / weight sum) / 2.
*
* @param q The quantile desired. Can be in the range [0,1].
* @return The minimum value x such that we think that the proportion of samples is <= x is q.
*/
public double quantile(double q) {
GroupTree values = summary;
Preconditions.checkArgument(values.size() > 1);
Preconditions.checkArgument(values.size() > 0);

Iterator<Group> it = values.iterator();
Group center = it.next();
Group leading = it.next();
if (!it.hasNext()) {
// only two centroids because of size limits
// both a and b have to have just a single element
double diff = (leading.mean() - center.mean()) / 2;
if (q > 0.75) {
return leading.mean() + diff * (4 * q - 3);
} else {
return center.mean() + diff * (4 * q - 1);
}
} else {
q *= count;
double right = (leading.mean() - center.mean()) / 2;
// we have nothing else to go on so make left hanging width same as right to start
double left = right;

double t = center.count();
while (it.hasNext()) {
if (t + center.count() / 2 >= q) {
// left side of center
return center.mean() - left * 2 * (q - t) / center.count();
} else if (t + leading.count() >= q) {
// right of b but left of the left-most thing beyond
return center.mean() + right * 2.0 * (center.count() - (q - t)) / center.count();
}
t += center.count();
// If there is only 1 centroid, always use its value
return center.mean();
}
Group leading = it.next();
q *= count;

// First pivot happens at the second centroid
double nextPivot = center.count() + leading.count() / 2.0;
if (q <= nextPivot || !it.hasNext()) {
// This quantile doesn't exceed the next pivot, or there is no next pivot.
// In either case, the slope can be derived from the first two centroids.
double slope = 2.0 * (leading.mean() - center.mean()) / (double) (leading.count() + center.count());
return leading.mean() - (nextPivot - q) * slope;
}

center = leading;
leading = it.next();
left = right;
right = (leading.mean() - center.mean()) / 2;
}
// ran out of data ... assume final width is symmetrical
while (it.hasNext()) {
double prevPivot = nextPivot;
center = leading;
left = right;
if (t + center.count() / 2 >= q) {
// left side of center
return center.mean() - left * 2 * (q - t) / center.count();
} else if (t + leading.count() >= q) {
// right of center but left of leading
return center.mean() + right * 2.0 * (center.count() - (q - t)) / center.count();
} else {
// shouldn't be possible
return 1;
leading = it.next();
nextPivot = prevPivot + ((center.count() + leading.count()) / 2.0);
if (q <= nextPivot || !it.hasNext()) {
double slope = 2.0 * (leading.mean() - center.mean()) / (double) (leading.count() + center.count());
return leading.mean() - (nextPivot - q) * slope;
}
}
// shouldn't be possible
return 1;
}

public int centroidCount() {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -532,6 +532,150 @@ public void testMerge() {
}
}

/**
* Test a situation with just 1 datapoint, expecting 1 centroid to be generated and quantiles to be equal to the single data point.
*/
@Test
public void testOnePointQuantiles() {
TDigest digest = new TDigest(100);
digest.add(10);

assertEquals(1, digest.centroidCount());
verifyPercentilesIncremental(digest);

assertEquals(10, digest.quantile(0.5), 0.001);
assertEquals(10, digest.quantile(0), 0.001);
assertEquals(10, digest.quantile(1), 0.001);
}

/**
* Test a situation with just 2 datapoints, expecting 2 centroids to be generated and quantiles to be outside the min/max.
*/
@Test
public void testTwoQuantiles() {
TDigest digest = new TDigest(100);
digest.add(10);
digest.add(20);

assertEquals(2, digest.centroidCount());
verifyPercentilesIncremental(digest);

assertEquals("Median is expected in the middle", 15.0, digest.quantile(0.5), 0.001);

//NOTE: The quantiles for 0.0 and 1.0 will be outside the min/max. The calculation is only based on the centroids.
// Especially in the case of merged or serialized TDigests, the min/max information will be lost, so this can't be applied reliably
assertEquals("Left side is considered half the distance between the two points to the left", 5.0, digest.quantile(0), 0.001);
assertEquals("Right side is considered half the distance between the two points to the right",25.0, digest.quantile(1), 0.001);
}

/**
* Test a situation with just 2 datapoints, with varying weights.
*/
@Test
public void testTwoQuantilesVaryingWeight() {
TDigest digest = new TDigest(100);
digest.add(10, 3);
digest.add(20, 1);

assertEquals(2, digest.centroidCount());
verifyPercentilesIncremental(digest);

assertEquals(2.5, digest.quantile(0), 0.001);
assertEquals(12.5, digest.quantile(0.5), 0.001);
assertEquals(22.5, digest.quantile(1), 0.001);
}

/**
* Test quantiles with 3 data points of equal weight.
*/
@Test
public void testQuantiles() {
TDigest digest = new TDigest(100);
digest.add(0);
digest.add(10);
digest.add(20);

assertEquals(3, digest.centroidCount());
verifyPercentilesIncremental(digest);

assertEquals(-5.0, digest.quantile(0), 0.001);
assertEquals(10.0, digest.quantile(0.5), 0.001);
assertEquals(25.0, digest.quantile(1), 0.001);
}

/**
* Test quantiles with 3 data points if varying weight.
*/
@Test
public void testVaryingWeightQuantiles() {
TDigest digest = new TDigest(100);
digest.add(0, 3);
digest.add(10, 1);
digest.add(20, 1);

assertEquals(3, digest.centroidCount());
verifyPercentilesIncremental(digest);

assertEquals(-7.5, digest.quantile(0), 0.001);
assertEquals(5.0, digest.quantile(0.5), 0.001);
assertEquals(25.0, digest.quantile(1), 0.001);

// Altering only the last datapoint should not change the median
digest = new TDigest(100);
digest.add(0, 3);
digest.add(10, 1);
digest.add(30, 1);

assertEquals(3, digest.centroidCount());
verifyPercentilesIncremental(digest);

assertEquals(-7.5, digest.quantile(0), 0.001);
assertEquals(5.0, digest.quantile(0.5), 0.001);
assertEquals(40.0, digest.quantile(1), 0.001);
}

/**
* Does basic sanity testing for a particular small example that used to fail.
* See https://github.com/addthis/stream-lib/issues/138
*/
@Test
public void testThreePointExample() {
TDigest tdigest = new TDigest(100);
double x0 = 0.18615591526031494;
double x1 = 0.4241943657398224;
double x2 = 0.8813006281852722;

tdigest.add(x0);
tdigest.add(x1);
tdigest.add(x2);

double p10 = tdigest.quantile(0.1);
double p50 = tdigest.quantile(0.5);
double p90 = tdigest.quantile(0.9);
double p95 = tdigest.quantile(0.95);
double p99 = tdigest.quantile(0.99);

assertTrue("ordering of quantiles", p10 <= p50);
assertTrue("ordering of quantiles", p50 <= p90);
assertTrue("ordering of quantiles", p90 <= p95);
assertTrue("ordering of quantiles", p95 <= p99);

assertEquals("Median should be the middle value, as all weights are equal", x1, p50, 0.001);
}

/**
* Verify that all percentiles from 0 to 100 (inclusive on both sides) are monotonically increasing (weakly increasing).
* @param digest
*/
private void verifyPercentilesIncremental(TDigest digest) {
double prevPercentile = digest.quantile(0);
for (int i = 1; i <= 100; i++) {
double percentile = digest.quantile(i / 100.0);
assertTrue("At percentile " + i + ", prev was " + prevPercentile + ", new is: " + percentile, percentile >= prevPercentile);
prevPercentile = percentile;
}
}

private double cdf(final double x, List<Double> data) {
int n1 = 0;
int n2 = 0;
Expand Down

0 comments on commit bf2d79a

Please sign in to comment.