diff --git a/Cargo.lock b/Cargo.lock index 840684b2d03..15cb17c8817 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -309,6 +309,10 @@ dependencies = [ "rustc_version 0.1.7 (registry+https://github.com/rust-lang/crates.io-index)", ] +[[package]] +name = "ethcore-bloom-journal" +version = "0.1.0" + [[package]] name = "ethcore-dapps" version = "1.4.0" @@ -528,6 +532,7 @@ dependencies = [ "env_logger 0.3.3 (registry+https://github.com/rust-lang/crates.io-index)", "eth-secp256k1 0.5.4 (git+https://github.com/ethcore/rust-secp256k1)", "ethcore-bigint 0.1.0", + "ethcore-bloom-journal 0.1.0", "ethcore-devtools 1.4.0", "heapsize 0.3.6 (registry+https://github.com/rust-lang/crates.io-index)", "itertools 0.4.13 (registry+https://github.com/rust-lang/crates.io-index)", diff --git a/util/Cargo.toml b/util/Cargo.toml index 81916555c6e..520a4e0038e 100644 --- a/util/Cargo.toml +++ b/util/Cargo.toml @@ -34,6 +34,7 @@ using_queue = { path = "using_queue" } table = { path = "table" } ansi_term = "0.7" tiny-keccak= "1.0" +ethcore-bloom-journal = { path = "bloom" } [features] default = [] diff --git a/util/bloom/Cargo.toml b/util/bloom/Cargo.toml new file mode 100644 index 00000000000..5397c691b92 --- /dev/null +++ b/util/bloom/Cargo.toml @@ -0,0 +1,9 @@ +[project] +name = "ethcore-bloom-journal" +version = "0.1.0" +authors = ["Ethcore"] +description = "Journaling bloom filter" +license = "GPL3" + +[lib] +path = "src/lib.rs" diff --git a/util/bloom/src/lib.rs b/util/bloom/src/lib.rs new file mode 100644 index 00000000000..5824376519c --- /dev/null +++ b/util/bloom/src/lib.rs @@ -0,0 +1,247 @@ +// Copyright 2015, 2016 Ethcore (UK) Ltd. +// This file is part of Parity. + +// Parity is free software: you can redistribute it and/or modify +// it under the terms of the GNU General Public License as published by +// the Free Software Foundation, either version 3 of the License, or +// (at your option) any later version. 
// Parity is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
// GNU General Public License for more details.

// You should have received a copy of the GNU General Public License
// along with Parity.  If not, see <http://www.gnu.org/licenses/>.

use std::cmp;
use std::mem;
use std::f64;
// `SipHasher` is flagged as deprecated by newer toolchains. It is kept on
// purpose: bit positions are derived directly from its output, so swapping the
// hasher would change where items land in the bitmap.
#[allow(deprecated)]
use std::hash::{Hash, Hasher, SipHasher};
use std::collections::HashSet;

/// Number of bits stored in one element of the bit vector.
const BITS_PER_ELEM: usize = 64;

/// Bit vector with journalling.
///
/// Bits are packed into `u64` words. Every time a bit is set, the index of the
/// word that holds it is recorded; the recorded words can then be collected
/// with `drain`.
struct BitVecJournal {
    /// Packed bits, 64 per element.
    elems: Vec<u64>,
    /// Indices into `elems` that were touched by `set` since the last `drain`.
    journal: HashSet<usize>,
}

impl BitVecJournal {
    /// Creates a zeroed bit vector able to hold `size` bits.
    ///
    /// `size` is a number of bits; the backing storage is rounded up to a
    /// whole number of 64-bit words.
    pub fn new(size: usize) -> BitVecJournal {
        // Same as ceil(size / 64), written so it cannot overflow for large `size`.
        let extra = if size % BITS_PER_ELEM > 0 { 1 } else { 0 };
        BitVecJournal {
            elems: vec![0u64; size / BITS_PER_ELEM + extra],
            journal: HashSet::new(),
        }
    }

    /// Creates a bit vector from previously saved words. The journal starts empty.
    pub fn from_parts(parts: &[u64]) -> BitVecJournal {
        BitVecJournal {
            elems: parts.to_vec(),
            journal: HashSet::new(),
        }
    }

    /// Sets the bit at `index` and journals the word that contains it.
    ///
    /// Panics if `index` is beyond the end of the vector.
    pub fn set(&mut self, index: usize) {
        let e_index = index / BITS_PER_ELEM;
        let bit_index = index % BITS_PER_ELEM;
        self.elems[e_index] |= 1u64 << bit_index;
        self.journal.insert(e_index);
    }

    /// Returns whether the bit at `index` is set.
    ///
    /// Panics if `index` is beyond the end of the vector.
    pub fn get(&self, index: usize) -> bool {
        let e_index = index / BITS_PER_ELEM;
        let bit_index = index % BITS_PER_ELEM;
        self.elems[e_index] & (1u64 << bit_index) != 0
    }

    /// Empties the journal and returns `(word index, current word value)` for
    /// every word touched since the previous drain. The order is unspecified
    /// because the journal is a `HashSet`.
    pub fn drain(&mut self) -> Vec<(usize, u64)> {
        let journal = mem::replace(&mut self.journal, HashSet::new());
        let elems = &self.elems;
        journal.into_iter().map(|idx| (idx, elems[idx])).collect::<Vec<(usize, u64)>>()
    }

    /// Returns the ratio of set bits to the total number of stored bits.
    ///
    /// An empty vector has no set bits, so it reports `0.0` rather than the
    /// NaN a 0/0 division would yield.
    pub fn saturation(&self) -> f64 {
        if self.elems.is_empty() {
            return 0.0;
        }
        let set_bits = self.elems.iter().fold(0u64, |acc, e| acc + e.count_ones() as u64);
        set_bits as f64 / (self.elems.len() * BITS_PER_ELEM) as f64
    }
}

/// Bloom filter structure
#[allow(deprecated)] // see the note on the `SipHasher` import
pub struct Bloom {
    /// Backing bit vector; journals the words modified by `set`.
    bitmap: BitVecJournal,
    /// Number of addressable bits; hash values are reduced modulo this.
    bitmap_bits: u64,
    /// Number of bit positions derived per item.
    k_num: u32,
    /// Hashers used for the first two bit positions.
    /// NOTE(review): both are created by `SipHasher::new()` without an explicit
    /// key, so they start from the same state. Changing that would move the
    /// bit positions of existing filters - confirm before touching.
    sips: [SipHasher; 2],
}

#[allow(deprecated)] // see the note on the `SipHasher` import
impl Bloom {
    /// Create a new bloom filter structure.
    /// bitmap_size is the size in bytes (not bits) that will be allocated in memory
    /// items_count is an estimation of the maximum number of items to store.
    ///
    /// Note that `from_parts` derives the bit count as `parts.len() * 64`, so a
    /// filter created here keeps the same bit count when restored from its
    /// words only if `bitmap_size` is a multiple of 8.
    ///
    /// # Panics
    ///
    /// Panics if `bitmap_size` or `items_count` is zero.
    pub fn new(bitmap_size: usize, items_count: usize) -> Bloom {
        assert!(bitmap_size > 0 && items_count > 0);
        let bitmap_bits = (bitmap_size as u64) * 8u64;
        let k_num = Bloom::optimal_k_num(bitmap_bits, items_count);
        let bitmap = BitVecJournal::new(bitmap_bits as usize);
        let sips = [Bloom::sip_new(), Bloom::sip_new()];
        Bloom {
            bitmap: bitmap,
            bitmap_bits: bitmap_bits,
            k_num: k_num,
            sips: sips,
        }
    }

    /// Initializes bloom filter from saved state.
    /// parts are the 64-bit words of the bitmap; the filter addresses
    /// `parts.len() * 64` bits.
    /// k_num is the number of hash functions the saved filter was using.
    ///
    /// # Panics
    ///
    /// Construction itself never panics, but `set` and `check` panic
    /// (remainder by zero) on a filter built from an empty `parts` slice
    /// when `k_num` is non-zero.
    pub fn from_parts(parts: &[u64], k_num: u32) -> Bloom {
        let bitmap_size = parts.len() * 8;
        let bitmap_bits = (bitmap_size as u64) * 8u64;
        let bitmap = BitVecJournal::from_parts(parts);
        let sips = [Bloom::sip_new(), Bloom::sip_new()];
        Bloom {
            bitmap: bitmap,
            bitmap_bits: bitmap_bits,
            k_num: k_num,
            sips: sips,
        }
    }

    /// Create a new bloom filter structure.
    /// items_count is an estimation of the maximum number of items to store.
    /// fp_p is the wanted rate of false positives, in ]0.0, 1.0[
    ///
    /// # Panics
    ///
    /// Panics if `items_count` is zero or `fp_p` is outside ]0.0, 1.0[.
    pub fn new_for_fp_rate(items_count: usize, fp_p: f64) -> Bloom {
        let bitmap_size = Bloom::compute_bitmap_size(items_count, fp_p);
        Bloom::new(bitmap_size, items_count)
    }

    /// Compute a recommended bitmap size (in bytes) for items_count items
    /// and a fp_p rate of false positives.
    /// fp_p has to be within the ]0.0, 1.0[ range.
    ///
    /// # Panics
    ///
    /// Panics if `items_count` is zero or `fp_p` is outside ]0.0, 1.0[.
    pub fn compute_bitmap_size(items_count: usize, fp_p: f64) -> usize {
        assert!(items_count > 0);
        assert!(fp_p > 0.0 && fp_p < 1.0);
        let log2 = f64::consts::LN_2;
        let log2_2 = log2 * log2;
        // bits = -n * ln(p) / ln(2)^2; the extra factor 8 converts bits to bytes.
        ((items_count as f64) * fp_p.ln() / (-8.0 * log2_2)).ceil() as usize
    }

    /// Records the presence of an item.
    pub fn set<T>(&mut self, item: T)
        where T: Hash
    {
        let mut hashes = [0u64, 0u64];
        for k_i in 0..self.k_num {
            let bit_offset = (self.bloom_hash(&mut hashes, &item, k_i) % self.bitmap_bits) as usize;
            self.bitmap.set(bit_offset);
        }
    }

    /// Check if an item is present in the set.
    /// There can be false positives, but no false negatives.
    pub fn check<T>(&self, item: T) -> bool
        where T: Hash
    {
        let mut hashes = [0u64, 0u64];
        for k_i in 0..self.k_num {
            let bit_offset = (self.bloom_hash(&mut hashes, &item, k_i) % self.bitmap_bits) as usize;
            if !self.bitmap.get(bit_offset) {
                return false;
            }
        }
        true
    }

    /// Return the number of bits in the filter
    pub fn number_of_bits(&self) -> u64 {
        self.bitmap_bits
    }

    /// Return the number of hash functions used for `check` and `set`
    pub fn number_of_hash_functions(&self) -> u32 {
        self.k_num
    }

    /// Number of hash functions for `bitmap_bits` bits and `items_count`
    /// expected items: ceil(m / n * ln 2), but never less than 1.
    fn optimal_k_num(bitmap_bits: u64, items_count: usize) -> u32 {
        let m = bitmap_bits as f64;
        let n = items_count as f64;
        let k_num = (m / n * 2.0f64.ln()).ceil() as u32;
        cmp::max(k_num, 1)
    }

    /// Returns the `k_i`-th hash of `item`.
    ///
    /// For `k_i` 0 and 1 the item is hashed with a copy of the corresponding
    /// hasher and the result is cached in `hashes`. For larger `k_i` the value
    /// is derived from those two cached hashes, so callers must request
    /// `k_i = 0` and `k_i = 1` first (as the `0..k_num` loops in `set` and
    /// `check` do).
    fn bloom_hash<T>(&self, hashes: &mut [u64; 2], item: &T, k_i: u32) -> u64
        where T: Hash
    {
        if k_i < 2 {
            // Work on a clone so the stored hasher keeps its initial state.
            let mut sip = self.sips[k_i as usize].clone();
            item.hash(&mut sip);
            let hash = sip.finish();
            hashes[k_i as usize] = hash;
            hash
        } else {
            // h0 + (k_i * h1 mod (2^64 - 59)), with wrapping arithmetic.
            hashes[0].wrapping_add((k_i as u64).wrapping_mul(hashes[1]) % 0xffffffffffffffc5)
        }
    }

    /// Creates a hasher in its default (unkeyed) state.
    fn sip_new() -> SipHasher {
        SipHasher::new()
    }

    /// Drains the bloom journal returning the updated bloom part
    pub fn drain_journal(&mut self) -> BloomJournal {
        BloomJournal {
            entries: self.bitmap.drain(),
            hash_functions: self.k_num,
        }
    }

    /// Returns the ratio of set bits in the bloom filter to the total bits
    pub fn saturation(&self) -> f64 {
        self.bitmap.saturation()
    }
}

/// Bloom journal
///
/// The bloom parts (64-bit words of the bitmap) that were updated since the
/// last drain, together with the number of hash functions of the filter they
/// came from.
pub struct BloomJournal {
    /// Number of hash functions used by the originating filter.
    pub hash_functions: u32,
    /// `(bloom part index, bloom part value)` pairs, in unspecified order.
    pub entries: Vec<(usize, u64)>,
}


#[cfg(test)]
mod tests {
    use super::Bloom;

    #[test]
    fn get_set() {
        let mut bloom = Bloom::new(10, 80);
        let key = vec![115u8, 99];
        assert!(!bloom.check(&key));
        bloom.set(&key);
        assert!(bloom.check(&key));
    }

    #[test]
    fn journalling() {
        let initial = vec![0u64; 8];
        let mut bloom = Bloom::from_parts(&initial, 3);
        bloom.set(&vec![5u8, 4]);
        let drain = bloom.drain_journal();

        assert_eq!(2, drain.entries.len())
    }

    #[test]
    fn saturation() {
        let initial = vec![0u64; 8];
        let mut bloom = Bloom::from_parts(&initial, 3);
        bloom.set(&vec![5u8, 4]);

        let full = bloom.saturation();
        // 2/8/64 = 0.00390625
        assert!(full >= 0.0039f64 && full <= 0.004f64);
    }

    #[test]
    fn saturation_of_new_filter() {
        // 64 bytes are 512 bits, so a single set bit already gives 1/512.
        let mut bloom = Bloom::new(64, 100);
        bloom.set(&vec![5u8, 4]);

        assert!(bloom.saturation() >= 1f64 / 512f64);
    }
}