Skip to content

Commit

Permalink
Fix errors in hyperloglog implementation
Browse files Browse the repository at this point in the history
This change fixes bugs where the bias corrected estimated was first
being calculated incorrectly, and then secondly not used.  Tests have
been updated to reflect the new implementation, and a new test has
been added to randomly verify bias corrected results have an error
less than 5%.
  • Loading branch information
Brian Rowe committed Sep 9, 2022
1 parent b6baa47 commit 0d199d4
Show file tree
Hide file tree
Showing 3 changed files with 40 additions and 7 deletions.
10 changes: 5 additions & 5 deletions crates/hyperloglogplusplus/src/dense.rs
Original file line number Diff line number Diff line change
Expand Up @@ -101,7 +101,7 @@ impl<'s> Storage<'s> {
if h <= self.threshold() {
h as u64
} else {
e as u64
e_p as u64
}
}

Expand Down Expand Up @@ -156,7 +156,7 @@ impl<'s> Storage<'s> {

let mut value = 0.0;
for i in 0..6 {
value += distances[i] * bias_data[i];
value += distances[i] * bias_data[neighbors[i]];
}

return value;
Expand Down Expand Up @@ -296,7 +296,7 @@ mod tests {
hll.add_hash(hash(i));
}
// FIXME examine in more detail
assert_eq!(hll.estimate_count(), 469);
assert_eq!(hll.estimate_count(), 430);
}

#[test]
Expand Down Expand Up @@ -332,7 +332,7 @@ mod tests {
for i in 0..10000 {
hll.add_hash(hash(i));
}
assert_eq!(hll.estimate_count(), 11513);
assert_eq!(hll.estimate_count(), 11347);
}

#[test]
Expand All @@ -350,7 +350,7 @@ mod tests {
for i in 0..100000 {
hll.add_hash(hash(i));
}
assert_eq!(hll.estimate_count(), 126_448);
assert_eq!(hll.estimate_count(), 117304);
}

#[test]
Expand Down
2 changes: 1 addition & 1 deletion crates/hyperloglogplusplus/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -319,7 +319,7 @@ mod tests {
for i in 0..100_000 {
hll.add(&i);
}
assert_eq!(hll.estimate_count(), 126_448);
assert_eq!(hll.estimate_count(), 117_304);
assert!(!hll.is_sparse());
assert_eq!(hll.num_bytes(), 49_153);
assert!(hll.num_bytes() <= (1 << 16) * 6 / 8 + 1)
Expand Down
35 changes: 34 additions & 1 deletion extension/src/hyperloglog.rs
Original file line number Diff line number Diff line change
Expand Up @@ -437,6 +437,7 @@ mod tests {
use pgx::*;
use super::*;
use pgx_macros::pg_test;
use rand::distributions::{Distribution, Uniform};

#[pg_test]
fn test_hll_aggregate() {
Expand Down Expand Up @@ -770,7 +771,7 @@ mod tests {
.first()
.get_one::<i64>();

assert_eq!(count, Some(152));
assert_eq!(count, Some(153));
}

});
Expand Down Expand Up @@ -855,6 +856,38 @@ mod tests {
});
}

#[pg_test]
fn bias_correct_values_accurate() {
const NUM_BIAS_TRIALS: usize = 5;
const MAX_TRIAL_ERROR: f64 = 0.05;
Spi::execute(|client| {
// This should match THRESHOLD_DATA_VEC from b=12-18
let thresholds = vec![3100, 6500, 11500, 20000, 50000, 120000, 350000];
let rand_precision: Uniform<usize> = Uniform::new_inclusive(12, 18);
let mut rng = rand::thread_rng();
for _ in 0..NUM_BIAS_TRIALS {
let precision = rand_precision.sample(&mut rng);
let rand_cardinality: Uniform<usize> =
Uniform::new_inclusive(thresholds[precision - 12], 5 * (1 << precision));
let cardinality = rand_cardinality.sample(&mut rng);
let query = format!(
"SELECT hyperloglog({}, v) -> distinct_count() FROM generate_series(1, {}) v",
1 << precision,
cardinality
);

let estimate = client
.select(&query, None, None)
.first()
.get_one::<i64>()
.unwrap();

let error = (estimate as f64 / cardinality as f64).abs() - 1.;
assert!(error < MAX_TRIAL_ERROR, "hyperloglog with {} buckets on cardinality {} gave a result of {}. Resulting error {} exceeds max allowed ({})", 2^precision, cardinality, estimate, error, MAX_TRIAL_ERROR);
}
});
}

// FIXME these tests don't run on CI
// #[pg_test(error = "Invalid value for size 262145. Size must be between 16 and 262144, though less than 1024 not recommended")]
// fn test_hll_error_too_large() {
Expand Down

0 comments on commit 0d199d4

Please sign in to comment.