Skip to content

Commit

Permalink
feat: add cuckoofilter to nippy-jar (#4533)
Browse files Browse the repository at this point in the history
  • Loading branch information
joshieDo authored Sep 11, 2023
1 parent 8c58aae commit c8f6307
Show file tree
Hide file tree
Showing 6 changed files with 242 additions and 56 deletions.
25 changes: 25 additions & 0 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 2 additions & 0 deletions crates/storage/nippy-jar/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,8 @@ thiserror = "1.0"
bincode = "1.3"
serde = { version = "1.0", features = ["derive"] }
bytes = "1.5"
cuckoofilter = { version = "0.5.0", features = ["serde_support", "serde_bytes"] }
tempfile = "3.4"

[features]
default = []
2 changes: 1 addition & 1 deletion crates/storage/nippy-jar/src/compression/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,7 @@ pub trait Compression {
/// Compression backends selectable through this enum.
///
/// `Zstd` is the only functional variant; `Unused` is a placeholder.
#[derive(Debug, PartialEq, Serialize, Deserialize)]
pub enum Compressors {
/// Zstd-backed compressor.
Zstd(Zstd),
// Avoids irrefutable let errors. Remove this after adding another one.
Unused,
}

Expand Down
87 changes: 87 additions & 0 deletions crates/storage/nippy-jar/src/filter/cuckoo.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,87 @@
use super::Filter;
use crate::NippyJarError;
use cuckoofilter::{self, CuckooFilter, ExportedCuckooFilter};
use serde::{Deserialize, Deserializer, Serialize, Serializer};
use std::collections::hash_map::DefaultHasher;

/// [CuckooFilter](https://www.cs.cmu.edu/~dga/papers/cuckoo-conext2014.pdf).
///
/// It builds and provides an approximated set-membership filter to answer queries such as
/// "Does this element belong to this set?". Has a theoretical 3% false positive rate.
pub struct Cuckoo {
/// Remaining number of elements that can be added.
///
/// This is necessary because the inner implementation will fail on adding an element past
/// capacity, **but it will still add it and evict another element**:
/// [source](https://github.com/axiomhq/rust-cuckoofilter/tree/624da891bed1dd5d002c8fa92ce0dcd301975561#notes--todos)
remaining: usize,

/// The underlying cuckoo filter holding the inserted elements.
filter: CuckooFilter<DefaultHasher>, // TODO does it need an actual hasher?
}

impl Cuckoo {
    /// Creates an empty filter that accepts at most `max_capacity` elements.
    ///
    /// The inner filter is allocated with the same capacity, and the insertion budget
    /// (`remaining`) starts at `max_capacity`.
    pub fn new(max_capacity: usize) -> Self {
        let filter = CuckooFilter::with_capacity(max_capacity);

        Self { remaining: max_capacity, filter }
    }
}

impl Filter for Cuckoo {
    /// Inserts `element` into the filter, spending one unit of the insertion budget.
    ///
    /// # Errors
    ///
    /// Returns [`NippyJarError::FilterMaxCapacity`] when the budget is exhausted. Errors
    /// reported by the inner filter are converted into [`NippyJarError`].
    fn add(&mut self, element: &[u8]) -> Result<(), NippyJarError> {
        // The budget is charged before the insert is attempted, so it stays charged even
        // when the inner filter reports an error.
        self.remaining =
            self.remaining.checked_sub(1).ok_or(NippyJarError::FilterMaxCapacity)?;

        self.filter.add(element).map_err(NippyJarError::from)
    }

    /// Reports whether `element` is (probably) in the filter.
    ///
    /// This implementation always returns `Ok`.
    fn contains(&self, element: &[u8]) -> Result<bool, NippyJarError> {
        let found = self.filter.contains(element);
        Ok(found)
    }
}

impl std::fmt::Debug for Cuckoo {
    /// Formats the remaining budget and the filter's memory usage; the filter contents
    /// themselves are omitted (the output is marked non-exhaustive).
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        let mut out = f.debug_struct("Cuckoo");
        out.field("remaining", &self.remaining);
        out.field("filter_size", &self.filter.memory_usage());
        out.finish_non_exhaustive()
    }
}

impl PartialEq for Cuckoo {
    /// Compares two filters by remaining budget and, in test builds only, by exported
    /// contents.
    ///
    /// # Panics
    ///
    /// Outside of test builds this panics whenever both sides have the same `remaining`
    /// value: comparing the contents requires exporting both filters, which is expensive,
    /// so it is only allowed in tests. A mismatch in `remaining` returns `false` without
    /// panicking.
    fn eq(&self, other: &Self) -> bool {
        // Cheap check first; it also keeps the non-test panic off the "obviously
        // different" path.
        if self.remaining != other.remaining {
            return false
        }

        #[cfg(not(test))]
        {
            unimplemented!("No way to figure it out without exporting (expensive), so only allow direct comparison on a test")
        }
        #[cfg(test)]
        {
            let lhs = self.filter.export();
            let rhs = other.filter.export();
            lhs.length == rhs.length && lhs.values == rhs.values
        }
    }
}

impl<'de> Deserialize<'de> for Cuckoo {
    /// Rebuilds a [`Cuckoo`] from a serialized `(remaining, exported filter)` pair,
    /// the same layout written by its `Serialize` implementation.
    fn deserialize<D>(deserializer: D) -> Result<Self, D::Error>
    where
        D: Deserializer<'de>,
    {
        <(usize, ExportedCuckooFilter)>::deserialize(deserializer)
            .map(|(remaining, exported)| Self { remaining, filter: exported.into() })
    }
}

impl Serialize for Cuckoo {
    /// Writes the filter as a `(remaining, exported filter)` pair.
    ///
    /// Potentially expensive, but should be used only when creating the file.
    fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
    where
        S: Serializer,
    {
        let snapshot = (self.remaining, self.filter.export());
        snapshot.serialize(serializer)
    }
}
36 changes: 36 additions & 0 deletions crates/storage/nippy-jar/src/filter/mod.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
use crate::NippyJarError;
use serde::{Deserialize, Serialize};

mod cuckoo;
pub use cuckoo::Cuckoo;

/// Common interface for set-membership filters over byte-slice elements.
pub trait Filter {
/// Add element to the inclusion list.
///
/// # Errors
///
/// Returns a [`NippyJarError`] if the element cannot be added; for example, the
/// `Cuckoo` implementation reports `NippyJarError::FilterMaxCapacity` once its
/// capacity is used up.
fn add(&mut self, element: &[u8]) -> Result<(), NippyJarError>;

/// Checks if the element belongs to the inclusion list. There might be false positives.
///
/// # Errors
///
/// Returns a [`NippyJarError`] if the implementation fails to perform the lookup.
fn contains(&self, element: &[u8]) -> Result<bool, NippyJarError>;
}

/// Serializable wrapper over the available [`Filter`] implementations.
#[derive(Debug, Serialize, Deserialize, PartialEq)]
pub enum Filters {
/// Filter backed by [`Cuckoo`].
Cuckoo(Cuckoo),
// Avoids irrefutable let errors. Remove this after adding another one.
Unused,
}

impl Filter for Filters {
    /// Forwards the insertion to the wrapped filter.
    ///
    /// # Panics
    ///
    /// Panics (`todo!`) on the placeholder [`Filters::Unused`] variant.
    fn add(&mut self, element: &[u8]) -> Result<(), NippyJarError> {
        match self {
            Self::Unused => todo!(),
            Self::Cuckoo(inner) => inner.add(element),
        }
    }

    /// Forwards the membership query to the wrapped filter.
    ///
    /// # Panics
    ///
    /// Panics (`todo!`) on the placeholder [`Filters::Unused`] variant.
    fn contains(&self, element: &[u8]) -> Result<bool, NippyJarError> {
        match self {
            Self::Unused => todo!(),
            Self::Cuckoo(inner) => inner.contains(element),
        }
    }
}
Loading

0 comments on commit c8f6307

Please sign in to comment.