Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Arbitrary bit sizes for slots in the counting bloom filter #12

Merged
merged 1 commit into from
Feb 16, 2014
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
96 changes: 60 additions & 36 deletions gossip/filter.go
Original file line number Diff line number Diff line change
Expand Up @@ -20,10 +20,10 @@ import (
"math"
)

// Filter is a counting bloom filter, used to approximate the number
// filter is a counting bloom filter, used to approximate the number
// of differences between InfoStores from different nodes, with
// minimal network overhead.
type Filter struct {
type filter struct {
K uint32 // Number of hashes
N uint32 // Number of insertions
R uint32 // Number of putative removals
Expand Down Expand Up @@ -56,19 +56,15 @@ func computeOptimalValues(N uint32, maxFP float64) (uint32, uint32) {
return M, K2
}

// NewFilter allocates and returns a new filter with expected number of
// newFilter allocates and returns a new filter with expected number of
// insertions N, Number of bits per slot B, and expected value of a false
// positive < maxFP.
func NewFilter(N uint32, B uint32, maxFP float64) (*Filter, error) {
// positive < maxFP. Number of bits must be 0 < B <= 8.
func newFilter(N uint32, B uint32, maxFP float64) (*filter, error) {
if N == 0 {
return nil, fmt.Errorf("number of insertions (N) must be > 0")
}
// TODO(spencer): we probably would be well-served using a 3-bit
// filter, so we should relax the following constraint and get a
// little bit fancier with the bit arithmetic to handle cross-byte
// slot values.
if B != 1 && B != 2 && B != 4 && B != 8 {
return nil, fmt.Errorf("number of bits (%d) must be a divisor of 8", B)
if B == 0 || B > 8 {
return nil, fmt.Errorf("number of bits (%d) must be 0 < B <= 8", B)
}
if maxFP <= 0 || maxFP >= 1 {
return nil, fmt.Errorf("max false positives must be 0 <= maxFP < 1: %f", maxFP)
Expand All @@ -77,7 +73,7 @@ func NewFilter(N uint32, B uint32, maxFP float64) (*Filter, error) {
maxCount := uint32((1 << B) - 1)
numBytes := (M*B + 7) / 8
bytes := make([]byte, numBytes, numBytes)
return &Filter{
return &filter{
K: K,
B: B,
M: M,
Expand All @@ -87,32 +83,60 @@ func NewFilter(N uint32, B uint32, maxFP float64) (*Filter, error) {
}, nil
}

// visitSlotBytes visits each byte (either one or two) that make up
// the bits in the specified slot. "fn" is invoked on each byte in
// turn, with values supplied for byteIndex, byteOffset, bitMask,
// and valBitOffset.
func (f *filter) visitSlotBytes(slot uint32, fn func(uint32, uint32, uint32, uint32)) {
bitIndex := slot * f.B
byteIndex := bitIndex / 8
byteOffset := bitIndex % 8

// Things are tricky here because we deal with crossing byte boundaries.
lastBit := byteOffset + f.B
valBitOffset := uint32(0)
for bit := byteOffset; bit < lastBit; {
b := lastBit - bit
if b > 8-byteOffset {
b = 8 - byteOffset
}
bitMask := uint32((1 << b) - 1)

fn(byteIndex, byteOffset, bitMask, valBitOffset) // call supplied method

bit += b
valBitOffset += b
byteIndex++
byteOffset = (byteOffset + b) % 8
}
}

// incrementSlot increments slot value by the specified amount, bounding at
// maximum slot value.
func (f *Filter) incrementSlot(slot uint32, incr int32) {
func (f *filter) incrementSlot(slot uint32, incr int32) {
val := int32(f.getSlot(slot)) + incr
if val > int32(f.MaxCount) {
val = int32(f.MaxCount)
} else if val < 0 {
val = 0
}
bitIndex := slot * f.B
byteIndex := bitIndex / 8
byteOffset := bitIndex % 8
f.Data[byteIndex] = byte(uint32(f.Data[byteIndex]) & ^(f.MaxCount << byteOffset))
f.Data[byteIndex] = byte(uint32(f.Data[byteIndex]) | uint32(val)<<byteOffset)
f.visitSlotBytes(slot, func(byteIndex uint32, byteOffset uint32, bitMask uint32, valBitOffset uint32) {
f.Data[byteIndex] = byte(uint32(f.Data[byteIndex]) & ^(bitMask << byteOffset))
f.Data[byteIndex] = byte(uint32(f.Data[byteIndex]) | (uint32(val>>valBitOffset)&bitMask)<<byteOffset)
})
}

// getSlot returns the slot value.
func (f *Filter) getSlot(slot uint32) uint32 {
bitIndex := slot * f.B
byteIndex := bitIndex / 8
byteOffset := bitIndex % 8
return (uint32(f.Data[byteIndex]) & (f.MaxCount << byteOffset)) >> byteOffset
func (f *filter) getSlot(slot uint32) uint32 {
val := uint32(0)
f.visitSlotBytes(slot, func(byteIndex uint32, byteOffset uint32, bitMask uint32, valBitOffset uint32) {
val |= ((uint32(f.Data[byteIndex]) & (bitMask << byteOffset)) >> byteOffset) << valBitOffset
})
return val
}

// AddKey adds the key to the filter.
func (f *Filter) AddKey(key string) {
// addKey adds the key to the filter.
func (f *filter) addKey(key string) {
f.hasher.hashKey(key)
for i := uint32(0); i < f.K; i++ {
slot := f.hasher.getHash(i) % f.M
Expand All @@ -121,9 +145,9 @@ func (f *Filter) AddKey(key string) {
f.N++
}

// HasKey checks whether key has been added to the filter. The chance this
// hasKey checks whether key has been added to the filter. The chance this
// method returns an incorrect value is given by ProbFalsePositive().
func (f *Filter) HasKey(key string) bool {
func (f *filter) hasKey(key string) bool {
f.hasher.hashKey(key)
for i := uint32(0); i < f.K; i++ {
slot := f.hasher.getHash(i) % f.M
Expand All @@ -134,11 +158,11 @@ func (f *Filter) HasKey(key string) bool {
return true
}

// RemoveKey removes a key by first verifying it's likely been seen and then
// removeKey removes a key by first verifying it's likely been seen and then
// decrementing each of the slots it hashes to. Returns true if the key was
// "removed"; false otherwise.
func (f *Filter) RemoveKey(key string) bool {
if f.HasKey(key) {
func (f *filter) removeKey(key string) bool {
if f.hasKey(key) {
f.hasher.hashKey(key)
for i := uint32(0); i < f.K; i++ {
slot := f.hasher.getHash(i) % f.M
Expand All @@ -150,18 +174,18 @@ func (f *Filter) RemoveKey(key string) bool {
return false
}

// ProbFalsePositive returns the probability the filter returns a false
// probFalsePositive returns the probability the filter returns a false
// positive.
func (f *Filter) ProbFalsePositive() float64 {
func (f *filter) probFalsePositive() float64 {
if f.R != 0 {
return probFalsePositive(f.ApproximateInsertions(), f.K, f.M)
return probFalsePositive(f.approximateInsertions(), f.K, f.M)
}
return probFalsePositive(f.N, f.K, f.M)
}

// ApproximateInsertions determines the approximate number of items
// inserted into the Filter after removals.
func (f *Filter) ApproximateInsertions() uint32 {
// approximateInsertions determines the approximate number of items
// inserted into the filter after removals.
func (f *filter) approximateInsertions() uint32 {
count := uint32(0)
for i := uint32(0); i < f.M; i++ {
count += f.getSlot(i)
Expand Down
132 changes: 72 additions & 60 deletions gossip/filter_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -53,16 +53,19 @@ func TestOptimalValues(t *testing.T) {

// TestNewFilter verifies bad inputs, optimal values, size of slots data.
func TestNewFilter(t *testing.T) {
if _, err := NewFilter(0, 3, 0.10); err == nil {
t.Error("NewFilter should not accept N == 0")
if _, err := newFilter(0, 3, 0.10); err == nil {
t.Error("newFilter should not accept N == 0")
}
if _, err := NewFilter(1, 3, 0.10); err == nil {
t.Error("NewFilter should not accept bits B which are non-divisor of 8")
if _, err := newFilter(1, 0, 0.10); err == nil {
t.Error("newFilter should not accept 0 bits")
}
if _, err := NewFilter(1, 16, 0.10); err == nil {
t.Error("NewFilter should not accept bits B which are > 8")
if _, err := newFilter(1, 9, 0.10); err == nil {
t.Error("newFilter should not accept more than 8 bits")
}
f, err := NewFilter(1000, 8, 0.01)
if _, err := newFilter(1, 16, 0.10); err == nil {
t.Error("newFilter should not accept bits B which are > 8")
}
f, err := newFilter(1000, 8, 0.01)
if err != nil {
t.Error("unable to create a filter")
}
Expand All @@ -74,15 +77,13 @@ func TestNewFilter(t *testing.T) {
t.Error("slots data should require M bytes")
}

// Try some fractional byte slot sizes.
bits := []uint32{1, 2, 4}
for _, B := range bits {
f, err := NewFilter(1000, B, 0.01)
// Try all byte slot sizes.
for B := uint32(1); B <= 8; B++ {
f, err := newFilter(1000, B, 0.01)
if err != nil {
t.Error("unable to create a filter")
}
slotsPerByte := 8 / B
expSize := int((M + slotsPerByte - 1) / slotsPerByte)
expSize := int((M*B + 7) / 8)
if len(f.Data) != expSize {
t.Error("slot sizes don't match", len(f.Data), expSize)
}
Expand All @@ -94,90 +95,101 @@ func TestNewFilter(t *testing.T) {

// TestSlots tests slot increment and slot count fetching.
func TestSlots(t *testing.T) {
f, err := NewFilter(1000, 4, 0.01)
if err != nil {
t.Error("unable to create a filter")
}
// Verify all slots empty.
for i := 0; i < len(f.Data); i++ {
if f.Data[i] != 0 {
t.Errorf("slot %d not empty", i)
for b := 1; b <= 8; b++ {
f, err := newFilter(10, uint32(b), 0.10)
if err != nil {
t.Error("unable to create a filter")
}
// Verify all slots empty.
for i := 0; i < len(f.Data); i++ {
if f.Data[i] != 0 {
t.Errorf("slot %d not empty", i)
}
}
// Increment each slot and verify.
for s := uint32(0); s < f.M; s++ {
f.incrementSlot(s, 1)
if f.getSlot(s) != 1 {
t.Errorf("slot value %d != 1", f.getSlot(s))
}
// Increment past max count.
f.incrementSlot(s, int32(f.MaxCount))
if f.getSlot(s) != f.MaxCount {
t.Errorf("slot value should be max %d != %d", f.getSlot(s), f.MaxCount)
}
// Decrement once.
f.incrementSlot(s, -1)
if f.getSlot(s) != f.MaxCount-1 {
t.Errorf("slot value should be max-1 %d != %d", f.getSlot(s), f.MaxCount-1)
}
// Decrement past 0.
f.incrementSlot(s, -int32(f.MaxCount))
if f.getSlot(s) != 0 {
t.Errorf("slot value should be 0 %d != 0", f.getSlot(s))
}
// Increment all slots up to MaxCount and verify gets.
for i := uint32(0); i < f.MaxCount; i++ {
f.incrementSlot(s, 1)
if f.getSlot(s) != i+1 {
t.Errorf("slot value should be %d != %d", i+1, f.getSlot(s))
}
}
}
}
// Increment a slot and verify.
f.incrementSlot(0, 1)
if f.getSlot(0) != 1 {
t.Errorf("slot value %d != 1", f.getSlot(0))
}
// Increment past max count.
f.incrementSlot(0, int32(f.MaxCount))
if f.getSlot(0) != f.MaxCount {
t.Errorf("slot value should be max %d != %d", f.getSlot(0), f.MaxCount)
}
// Decrement once.
f.incrementSlot(0, -1)
if f.getSlot(0) != f.MaxCount-1 {
t.Errorf("slot value should be max %d != %d", f.getSlot(0), f.MaxCount-1)
}
// Decrement past 0.
f.incrementSlot(0, -int32(f.MaxCount))
if f.getSlot(0) != 0 {
t.Errorf("slot value should be max %d != 0", f.getSlot(0))
}
}

// TestKeys adds keys, tests existence, and removes keys.
func TestKeys(t *testing.T) {
f, err := NewFilter(1000, 4, 0.01)
f, err := newFilter(1000, 4, 0.01)
if err != nil {
t.Error("unable to create a filter")
}
if f.HasKey("a") {
if f.hasKey("a") {
t.Error("filter shouldn't contain key a")
}
if f.AddKey("a"); !f.HasKey("a") {
if f.addKey("a"); !f.hasKey("a") {
t.Error("filter should contain key a")
}
if f.HasKey("b") {
if f.hasKey("b") {
t.Error("filter should contain key b")
}
if f.RemoveKey("a"); f.HasKey("a") {
if f.removeKey("a"); f.hasKey("a") {
t.Error("filter shouldn't contain key a after removal")
}
// Add key twice, verify it still exists after one removal.
f.AddKey("a")
f.AddKey("a")
f.RemoveKey("a")
if !f.HasKey("a") {
f.addKey("a")
f.addKey("a")
f.removeKey("a")
if !f.hasKey("a") {
t.Error("filter should still contain key a")
}
}

// TestFalsePositives adds many keys and verifies false positive probability.
func TestFalsePositives(t *testing.T) {
f, err := NewFilter(1000, 4, 0.01)
f, err := newFilter(1000, 4, 0.01)
if err != nil {
t.Error("unable to create a filter")
}
lastFP := float64(0)
for i := 0; i < 1000; i++ {
f.AddKey(fmt.Sprintf("key-%d", i))
if f.ProbFalsePositive() < lastFP {
f.addKey(fmt.Sprintf("key-%d", i))
if f.probFalsePositive() < lastFP {
t.Error("P(FP) should increase")
}
lastFP = f.ProbFalsePositive()
lastFP = f.probFalsePositive()
}
for i := 0; i < 1000; i++ {
if !f.HasKey(fmt.Sprintf("key-%d", i)) {
if !f.hasKey(fmt.Sprintf("key-%d", i)) {
t.Error("could not find key-", i)
}
}
// Measure false positive rate empirically and verify
// against filter's math.
probFP := f.ProbFalsePositive()
probFP := f.probFalsePositive()
countFP := 0
for i := 0; i < 1000; i++ {
if f.HasKey(fmt.Sprintf("nonkey-%d", i)) {
if f.hasKey(fmt.Sprintf("nonkey-%d", i)) {
countFP++
}
}
Expand All @@ -191,13 +203,13 @@ func TestFalsePositives(t *testing.T) {
// TestApproximateInsertions adds many keys with an overloaded filter and
// verifies that approximation degrades gracefully.
func TestApproximateInsertions(t *testing.T) {
f, err := NewFilter(10, 4, 0.10)
f, err := newFilter(10, 4, 0.10)
if err != nil {
t.Error("unable to create a filter")
}
for i := 0; i <= 200; i++ {
f.AddKey(fmt.Sprintf("key-%d", i))
diff := i + 1 - int(f.ApproximateInsertions())
f.addKey(fmt.Sprintf("key-%d", i))
diff := i + 1 - int(f.approximateInsertions())
if i > 150 && diff == 0 {
t.Error("expected some approximation error at 150 insertions")
}
Expand Down
Loading