-
Notifications
You must be signed in to change notification settings - Fork 115
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Bloom refactor #39
Bloom refactor #39
Changes from all commits
f047dd7
922b53e
c409219
cb90396
d538b8b
3ad7990
fafa7f5
ea447ee
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,53 @@ | ||
use super::BloomFilter; | ||
|
||
impl BloomFilter { | ||
// Technically, we need 3 out of 4 to calculate the other. | ||
// But we often want many of these to be either minimum itself or to allow other values to be minima. | ||
// So we often only need 2 out of 4 ot calculate other two values. | ||
// n: number of items (expected) in filter. | ||
// p: (target) false positive rate | ||
// m: number of bits in filter. | ||
// k: number of hashers | ||
// n = ceil(m / (-k / log(1 - exp(log(p) / k)))) | ||
// p = pow(1 - exp(-k / (m / n)), k) | ||
// m = ceil((n * log(p)) / log(1 / pow(2, log(2)))); | ||
// k = round((m / n) * log(2)); | ||
|
||
pub fn optimal_number_of_hashers(size_in_bytes: usize, expected_elements: usize) -> usize { | ||
use std::f64::consts::LN_2; | ||
let n = expected_elements as f64; | ||
let m = (size_in_bytes * 8) as f64; | ||
let k = (m / n) * (LN_2); | ||
k.ceil() as usize | ||
} | ||
|
||
pub fn prob_of_false_positive( | ||
size_in_bytes: usize, | ||
expected_elements: usize, | ||
num_hashers: usize, | ||
) -> f64 { | ||
let n = expected_elements as f64; | ||
let m = (size_in_bytes * 8) as f64; | ||
let k = num_hashers as f64; | ||
(1.0 - (1.0 - (1.0 / m)).powf(k * n)).powf(k) | ||
} | ||
|
||
pub fn suggest_size_in_bytes( | ||
expected_elements: usize, | ||
desired_false_positive_rate: f64, | ||
) -> usize { | ||
let min_size: usize = 1 << 20; // 1MiB | ||
let max_size: usize = usize::MAX / 2; // 9E18 bytes 8exbi-bytes | ||
let mut size_in_bytes = min_size; | ||
while size_in_bytes < max_size | ||
&& Self::prob_of_false_positive( | ||
size_in_bytes, | ||
expected_elements, | ||
Self::optimal_number_of_hashers(size_in_bytes, expected_elements), | ||
) > desired_false_positive_rate | ||
{ | ||
size_in_bytes *= 2; | ||
} | ||
size_in_bytes | ||
} | ||
} |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,56 +1,97 @@ | ||
#[cfg(test)] | ||
mod tests { | ||
use super::super::BloomFilter; | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Using There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I think that was part of a refactor into separate file. |
||
use super::BloomFilter; | ||
// n: number of items in filter. p: false positive rate | ||
// m: number of bits in filter. k: number of hashers | ||
// n = ceil(m / (-k / log(1 - exp(log(p) / k)))) | ||
// p = pow(1 - exp(-k / (m / n)), k) | ||
// m = ceil((n * log(p)) / log(1 / pow(2, log(2)))); | ||
// k = round((m / n) * log(2)); | ||
|
||
// n: number of items in filter. p: false positive rate | ||
// m: number of bits in filter. k: number of hashers | ||
// n = ceil(m / (-k / log(1 - exp(log(p) / k)))) | ||
// p = pow(1 - exp(-k / (m / n)), k) | ||
#[cfg(test)] | ||
pub fn simplified_suggest_size(expected_elements: usize, target_fp_rate: f64) -> usize { | ||
// m = ceil((n * log(p)) / log(1 / pow(2, log(2)))); | ||
// k = round((m / n) * log(2)); | ||
use std::f64::consts::LN_2; | ||
let theoretical_optimum = (expected_elements as f64 * target_fp_rate.ln() / (-LN_2 * LN_2)) | ||
.ceil() | ||
.div_euclid(8.0) as usize; | ||
let suggested_size = theoretical_optimum.next_power_of_two(); | ||
|
||
#[test] | ||
fn bloom_optimal_hasher_number() { | ||
let size_in_bytes = 1_000_000_000; | ||
let expected_elements = 1_000_000_000; | ||
assert_eq!( | ||
BloomFilter::optimal_number_of_hashers(size_in_bytes, expected_elements), | ||
6 | ||
); | ||
assert_eq!( | ||
BloomFilter::optimal_number_of_hashers(1_000_000, 500_000), | ||
12 | ||
) | ||
let min_size: usize = 1 << 20; //1 MiB | ||
let max_size: usize = usize::MAX / 2; // 9E18 bytes 8exbi-bytes | ||
suggested_size.clamp(min_size, max_size) | ||
} | ||
|
||
#[test] | ||
fn bloom_optimal_hasher_number() { | ||
let size_in_bytes = 1_000_000_000; | ||
let expected_elements = 1_000_000_000; | ||
assert_eq!( | ||
BloomFilter::optimal_number_of_hashers(size_in_bytes, expected_elements), | ||
6 | ||
); | ||
assert_eq!( | ||
BloomFilter::optimal_number_of_hashers(1_000_000, 500_000), | ||
12 | ||
) | ||
} | ||
#[test] | ||
fn bloom_test_prob_of_false_positive() { | ||
// calculated from https://hur.st/bloomfilter/ | ||
let size_in_bytes = 1_000_000_000; | ||
let expected_elements = 1_000_000_000; | ||
let num_hashers = 8; | ||
assert_eq!( | ||
BloomFilter::prob_of_false_positive(size_in_bytes, expected_elements, num_hashers), | ||
0.025_491_740_593_406_025 as f64 | ||
); | ||
assert_eq!( | ||
BloomFilter::prob_of_false_positive(1_048_576, 524288, 2), | ||
0.013_806_979_447_406_826 as f64 | ||
) | ||
} | ||
|
||
#[test] | ||
fn bloom_suggest_size() { | ||
// it's hard to derive this exactly since the algorithm is doing closest power of 2 | ||
// instead of exact theoretical optimum | ||
|
||
// Define a struct to hold test case data | ||
struct TestCase { | ||
elements: usize, | ||
fp_rate: f64, | ||
expected_size: usize, | ||
} | ||
#[test] | ||
fn bloom_test_prob_of_false_positive() { | ||
// calculated from https://hur.st/bloomfilter/ | ||
let size_in_bytes = 1_000_000_000; | ||
let expected_elements = 1_000_000_000; | ||
let num_hashers = 8; | ||
|
||
// Create a vector of test cases | ||
let test_cases = vec![ | ||
// test for minimum size | ||
TestCase { | ||
elements: 4_000, | ||
fp_rate: 1E-7, | ||
expected_size: 1024 * 1024, | ||
}, | ||
// test for average size | ||
TestCase { | ||
elements: 1_000_000, | ||
fp_rate: 1E-4, | ||
expected_size: 4_194_304, | ||
}, | ||
// Add more test cases here as needed | ||
]; | ||
|
||
for test_case in test_cases { | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Doing a loop over test cases means that if the first test case fails, none of the others will run. I think a better approach would be to create a separate method per test, and each such test would create a There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I think it was a rough draft to illustrate potential test cases. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Parametrized tests sound good too. |
||
let tested_size = BloomFilter::suggest_size_in_bytes(test_case.elements, test_case.fp_rate); | ||
let simplified_size = simplified_suggest_size(test_case.elements, test_case.fp_rate); | ||
|
||
assert_eq!( | ||
BloomFilter::prob_of_false_positive(size_in_bytes, expected_elements, num_hashers), | ||
0.025_491_740_593_406_025 as f64 | ||
tested_size, test_case.expected_size, | ||
"Failed for elements: {}, fp_rate: {}", | ||
test_case.elements, test_case.fp_rate | ||
); | ||
assert_eq!( | ||
BloomFilter::prob_of_false_positive(1_048_576, 524288, 2), | ||
0.013_806_979_447_406_826 as f64 | ||
) | ||
} | ||
|
||
#[test] | ||
fn bloom_suggest_size() { | ||
// it's hard to derive this exactly since the algorithm is doing closest power of 2 | ||
// instead of exact theoretical optimum | ||
let expected_elements = 1_000_000; | ||
let desired_false_positive_rate = 0.0001 as f64; | ||
let theoretical_optimum = ((expected_elements as f64 * desired_false_positive_rate.ln()) | ||
/ f64::ln(1.0 / 2.0f64.powf(2.0f64.ln()))) | ||
.ceil() | ||
.div_euclid(8f64) as usize; | ||
let suggested_size = | ||
BloomFilter::suggest_size_in_bytes(expected_elements, desired_false_positive_rate); | ||
assert_eq!(suggested_size, 4_194_304); | ||
assert_eq!(suggested_size, theoretical_optimum.next_power_of_two()) | ||
tested_size, simplified_size, | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. If There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. i generally agree. |
||
"Failed for elements: {}, fp_rate: {}", | ||
test_case.elements, test_case.fp_rate | ||
); | ||
} | ||
} |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
How does moving the maths functions here help with sharing them to other possible implementations in the future? The only way this maths can be reused is if the other bloom filter implementations keep a copy of
BloomFilter
internally. I don't think it's a good idea to do that.Maybe you can put the maths logic in a new struct. Then it would make sense for multiple filter implementations to consume the maths logic.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
that makes sense as well