From 13f5cfad1898d61de2fcb1f1dd3c6749ab3f8f09 Mon Sep 17 00:00:00 2001
From: Spencer Kimball <spencer.kimball@gmail.com>
Date: Sat, 15 Feb 2014 23:36:37 -0800
Subject: [PATCH] Arbitrary bit sizes for slots in the counting bloom filter

Relax the divisible-by restriction on the number of bits for each slot.
I suspect the sweet spot for these things will be 3 bits, so this change
seemed worth doing. It was trickier than expected. I created a
"visitSlotBytes" method to refactor incrementSlot and getSlot to avoid
duplicated code as it had gotten complicated.

Fixed visibility issues with filter class.
---
 gossip/filter.go         |  96 +++++++++++++++++-----------
 gossip/filter_test.go    | 132 +++++++++++++++++++++------------------
 gossip/infostore.go      |  14 ++---
 gossip/infostore_test.go |  24 +++----
 4 files changed, 151 insertions(+), 115 deletions(-)

diff --git a/gossip/filter.go b/gossip/filter.go
index 0cedc4fdef1..1de9c61e82e 100644
--- a/gossip/filter.go
+++ b/gossip/filter.go
@@ -20,10 +20,10 @@ import (
 	"math"
 )
 
-// Filter is a counting bloom filter, used to approximate the number
+// filter is a counting bloom filter, used to approximate the number
 // of differences between InfoStores from different nodes, with
 // minimal network overhead.
-type Filter struct {
+type filter struct {
 	K        uint32  // Number of hashes
 	N        uint32  // Number of insertions
 	R        uint32  // Number of putative removals
@@ -56,19 +56,15 @@ func computeOptimalValues(N uint32, maxFP float64) (uint32, uint32) {
 	return M, K2
 }
 
-// NewFilter allocates and returns a new filter with expected number of
+// newFilter allocates and returns a new filter with expected number of
 // insertions N, Number of bits per slot B, and expected value of a false
-// positive < maxFP.
-func NewFilter(N uint32, B uint32, maxFP float64) (*Filter, error) {
+// positive < maxFP. Number of bits must be 0 < B <= 8.
+func newFilter(N uint32, B uint32, maxFP float64) (*filter, error) {
 	if N == 0 {
 		return nil, fmt.Errorf("number of insertions (N) must be > 0")
 	}
-	// TODO(spencer): we probably would be well-served using a 3-bit
-	// filter, so we should relax the following constraint and get a
-	// little bit fancier with the bit arithmetic to handle cross-byte
-	// slot values.
-	if B != 1 && B != 2 && B != 4 && B != 8 {
-		return nil, fmt.Errorf("number of bits (%d) must be a divisor of 8", B)
+	if B == 0 || B > 8 {
+		return nil, fmt.Errorf("number of bits (%d) must be 0 < B <= 8", B)
 	}
 	if maxFP <= 0 || maxFP >= 1 {
 		return nil, fmt.Errorf("max false positives must be 0 <= maxFP < 1: %f", maxFP)
@@ -77,7 +73,7 @@ func NewFilter(N uint32, B uint32, maxFP float64) (*Filter, error) {
 	maxCount := uint32((1 << B) - 1)
 	numBytes := (M*B + 7) / 8
 	bytes := make([]byte, numBytes, numBytes)
-	return &Filter{
+	return &filter{
 		K:        K,
 		B:        B,
 		M:        M,
@@ -87,32 +83,60 @@ func NewFilter(N uint32, B uint32, maxFP float64) (*Filter, error) {
 	}, nil
 }
 
+// visitSlotBytes visits each byte (either one or two) that make up
+// the bits in the specified slot. "fn" is invoked on each byte in
+// turn, with values supplied for byteIndex, byteOffset, bitMask,
+// and valBitOffset.
+func (f *filter) visitSlotBytes(slot uint32, fn func(uint32, uint32, uint32, uint32)) {
+	bitIndex := slot * f.B
+	byteIndex := bitIndex / 8
+	byteOffset := bitIndex % 8
+
+	// Things are tricky here because we deal with crossing byte boundaries.
+	lastBit := byteOffset + f.B
+	valBitOffset := uint32(0)
+	for bit := byteOffset; bit < lastBit; {
+		b := lastBit - bit
+		if b > 8-byteOffset {
+			b = 8 - byteOffset
+		}
+		bitMask := uint32((1 << b) - 1)
+
+		fn(byteIndex, byteOffset, bitMask, valBitOffset) // call supplied method
+
+		bit += b
+		valBitOffset += b
+		byteIndex++
+		byteOffset = (byteOffset + b) % 8
+	}
+}
+
 // incrementSlot increments slot value by the specified amount, bounding at
 // maximum slot value.
-func (f *Filter) incrementSlot(slot uint32, incr int32) {
+func (f *filter) incrementSlot(slot uint32, incr int32) {
 	val := int32(f.getSlot(slot)) + incr
 	if val > int32(f.MaxCount) {
 		val = int32(f.MaxCount)
 	} else if val < 0 {
 		val = 0
 	}
-	bitIndex := slot * f.B
-	byteIndex := bitIndex / 8
-	byteOffset := bitIndex % 8
-	f.Data[byteIndex] = byte(uint32(f.Data[byteIndex]) & ^(f.MaxCount << byteOffset))
-	f.Data[byteIndex] = byte(uint32(f.Data[byteIndex]) | uint32(val)<<byteOffset)
+	f.visitSlotBytes(slot, func(byteIndex uint32, byteOffset uint32, bitMask uint32, valBitOffset uint32) {
+		f.Data[byteIndex] = byte(uint32(f.Data[byteIndex]) & ^(bitMask << byteOffset))
+		f.Data[byteIndex] = byte(uint32(f.Data[byteIndex]) | (uint32(val>>valBitOffset)&bitMask)<<byteOffset)
+	})
 }
 
 // getSlot returns the slot value.
-func (f *Filter) getSlot(slot uint32) uint32 {
-	bitIndex := slot * f.B
-	byteIndex := bitIndex / 8
-	byteOffset := bitIndex % 8
-	return (uint32(f.Data[byteIndex]) & (f.MaxCount << byteOffset)) >> byteOffset
+func (f *filter) getSlot(slot uint32) uint32 {
+	val := uint32(0)
+	f.visitSlotBytes(slot, func(byteIndex uint32, byteOffset uint32, bitMask uint32, valBitOffset uint32) {
+		val |= ((uint32(f.Data[byteIndex]) & (bitMask << byteOffset)) >> byteOffset) << valBitOffset
+	})
+	return val
 }
 
-// AddKey adds the key to the filter.
-func (f *Filter) AddKey(key string) {
+// addKey adds the key to the filter.
+func (f *filter) addKey(key string) {
 	f.hasher.hashKey(key)
 	for i := uint32(0); i < f.K; i++ {
 		slot := f.hasher.getHash(i) % f.M
@@ -121,9 +145,9 @@ func (f *Filter) AddKey(key string) {
 	f.N++
 }
 
-// HasKey checks whether key has been added to the filter. The chance this
+// hasKey checks whether key has been added to the filter. The chance this
 // method returns an incorrect value is given by ProbFalsePositive().
-func (f *Filter) HasKey(key string) bool {
+func (f *filter) hasKey(key string) bool {
 	f.hasher.hashKey(key)
 	for i := uint32(0); i < f.K; i++ {
 		slot := f.hasher.getHash(i) % f.M
@@ -134,11 +158,11 @@ func (f *Filter) HasKey(key string) bool {
 	return true
 }
 
-// RemoveKey removes a key by first verifying it's likely been seen and then
+// removeKey removes a key by first verifying it's likely been seen and then
 // decrementing each of the slots it hashes to. Returns true if the key was
 // "removed"; false otherwise.
-func (f *Filter) RemoveKey(key string) bool {
-	if f.HasKey(key) {
+func (f *filter) removeKey(key string) bool {
+	if f.hasKey(key) {
 		f.hasher.hashKey(key)
 		for i := uint32(0); i < f.K; i++ {
 			slot := f.hasher.getHash(i) % f.M
@@ -150,18 +174,18 @@ func (f *Filter) RemoveKey(key string) bool {
 	return false
 }
 
-// ProbFalsePositive returns the probability the filter returns a false
+// probFalsePositive returns the probability the filter returns a false
 // positive.
-func (f *Filter) ProbFalsePositive() float64 {
+func (f *filter) probFalsePositive() float64 {
 	if f.R != 0 {
-		return probFalsePositive(f.ApproximateInsertions(), f.K, f.M)
+		return probFalsePositive(f.approximateInsertions(), f.K, f.M)
 	}
 	return probFalsePositive(f.N, f.K, f.M)
 }
 
-// ApproximateInsertions determines the approximate number of items
-// inserted into the Filter after removals.
-func (f *Filter) ApproximateInsertions() uint32 {
+// approximateInsertions determines the approximate number of items
+// inserted into the filter after removals.
+func (f *filter) approximateInsertions() uint32 {
 	count := uint32(0)
 	for i := uint32(0); i < f.M; i++ {
 		count += f.getSlot(i)
diff --git a/gossip/filter_test.go b/gossip/filter_test.go
index 0858c665265..e4a7e5a0c85 100644
--- a/gossip/filter_test.go
+++ b/gossip/filter_test.go
@@ -53,16 +53,19 @@ func TestOptimalValues(t *testing.T) {
 
 // TestNewFilter verifies bad inputs, optimal values, size of slots data.
 func TestNewFilter(t *testing.T) {
-	if _, err := NewFilter(0, 3, 0.10); err == nil {
-		t.Error("NewFilter should not accept N == 0")
+	if _, err := newFilter(0, 3, 0.10); err == nil {
+		t.Error("newFilter should not accept N == 0")
 	}
-	if _, err := NewFilter(1, 3, 0.10); err == nil {
-		t.Error("NewFilter should not accept bits B which are non-divisor of 8")
+	if _, err := newFilter(1, 0, 0.10); err == nil {
+		t.Error("newFilter should not accept 0 bits")
 	}
-	if _, err := NewFilter(1, 16, 0.10); err == nil {
-		t.Error("NewFilter should not accept bits B which are > 8")
+	if _, err := newFilter(1, 9, 0.10); err == nil {
+		t.Error("newFilter should not accept more than 8 bits")
 	}
-	f, err := NewFilter(1000, 8, 0.01)
+	if _, err := newFilter(1, 16, 0.10); err == nil {
+		t.Error("newFilter should not accept bits B which are > 8")
+	}
+	f, err := newFilter(1000, 8, 0.01)
 	if err != nil {
 		t.Error("unable to create a filter")
 	}
@@ -74,15 +77,13 @@ func TestNewFilter(t *testing.T) {
 		t.Error("slots data should require M bytes")
 	}
 
-	// Try some fractional byte slot sizes.
-	bits := []uint32{1, 2, 4}
-	for _, B := range bits {
-		f, err := NewFilter(1000, B, 0.01)
+	// Try all byte slot sizes.
+	for B := uint32(1); B <= 8; B++ {
+		f, err := newFilter(1000, B, 0.01)
 		if err != nil {
 			t.Error("unable to create a filter")
 		}
-		slotsPerByte := 8 / B
-		expSize := int((M + slotsPerByte - 1) / slotsPerByte)
+		expSize := int((M*B + 7) / 8)
 		if len(f.Data) != expSize {
 			t.Error("slot sizes don't match", len(f.Data), expSize)
 		}
@@ -94,90 +95,101 @@ func TestNewFilter(t *testing.T) {
 
 // TestSlots tests slot increment and slot count fetching.
 func TestSlots(t *testing.T) {
-	f, err := NewFilter(1000, 4, 0.01)
-	if err != nil {
-		t.Error("unable to create a filter")
-	}
-	// Verify all slots empty.
-	for i := 0; i < len(f.Data); i++ {
-		if f.Data[i] != 0 {
-			t.Errorf("slot %d not empty", i)
+	for b := 1; b <= 8; b++ {
+		f, err := newFilter(10, uint32(b), 0.10)
+		if err != nil {
+			t.Error("unable to create a filter")
+		}
+		// Verify all slots empty.
+		for i := 0; i < len(f.Data); i++ {
+			if f.Data[i] != 0 {
+				t.Errorf("slot %d not empty", i)
+			}
+		}
+		// Increment each slot and verify.
+		for s := uint32(0); s < f.M; s++ {
+			f.incrementSlot(s, 1)
+			if f.getSlot(s) != 1 {
+				t.Errorf("slot value %d != 1", f.getSlot(s))
+			}
+			// Increment past max count.
+			f.incrementSlot(s, int32(f.MaxCount))
+			if f.getSlot(s) != f.MaxCount {
+				t.Errorf("slot value should be max %d != %d", f.getSlot(s), f.MaxCount)
+			}
+			// Decrement once.
+			f.incrementSlot(s, -1)
+			if f.getSlot(s) != f.MaxCount-1 {
+				t.Errorf("slot value should be max-1 %d != %d", f.getSlot(s), f.MaxCount-1)
+			}
+			// Decrement past 0.
+			f.incrementSlot(s, -int32(f.MaxCount))
+			if f.getSlot(s) != 0 {
+				t.Errorf("slot value should be 0 %d != 0", f.getSlot(s))
+			}
+			// Increment all slots up to MaxCount and verify gets.
+			for i := uint32(0); i < f.MaxCount; i++ {
+				f.incrementSlot(s, 1)
+				if f.getSlot(s) != i+1 {
+					t.Errorf("slot value should be %d != %d", i+1, f.getSlot(s))
+				}
+			}
 		}
-	}
-	// Increment a slot and verify.
-	f.incrementSlot(0, 1)
-	if f.getSlot(0) != 1 {
-		t.Errorf("slot value %d != 1", f.getSlot(0))
-	}
-	// Increment past max count.
-	f.incrementSlot(0, int32(f.MaxCount))
-	if f.getSlot(0) != f.MaxCount {
-		t.Errorf("slot value should be max %d != %d", f.getSlot(0), f.MaxCount)
-	}
-	// Decrement once.
-	f.incrementSlot(0, -1)
-	if f.getSlot(0) != f.MaxCount-1 {
-		t.Errorf("slot value should be max %d != %d", f.getSlot(0), f.MaxCount-1)
-	}
-	// Decrement past 0.
-	f.incrementSlot(0, -int32(f.MaxCount))
-	if f.getSlot(0) != 0 {
-		t.Errorf("slot value should be max %d != 0", f.getSlot(0))
 	}
 }
 
 // TestKeys adds keys, tests existence, and removes keys.
 func TestKeys(t *testing.T) {
-	f, err := NewFilter(1000, 4, 0.01)
+	f, err := newFilter(1000, 4, 0.01)
 	if err != nil {
 		t.Error("unable to create a filter")
 	}
-	if f.HasKey("a") {
+	if f.hasKey("a") {
 		t.Error("filter shouldn't contain key a")
 	}
-	if f.AddKey("a"); !f.HasKey("a") {
+	if f.addKey("a"); !f.hasKey("a") {
 		t.Error("filter should contain key a")
 	}
-	if f.HasKey("b") {
+	if f.hasKey("b") {
 		t.Error("filter should contain key b")
 	}
-	if f.RemoveKey("a"); f.HasKey("a") {
+	if f.removeKey("a"); f.hasKey("a") {
 		t.Error("filter shouldn't contain key a after removal")
 	}
 	// Add key twice, verify it still exists after one removal.
-	f.AddKey("a")
-	f.AddKey("a")
-	f.RemoveKey("a")
-	if !f.HasKey("a") {
+	f.addKey("a")
+	f.addKey("a")
+	f.removeKey("a")
+	if !f.hasKey("a") {
 		t.Error("filter should still contain key a")
 	}
 }
 
 // TestFalsePositives adds many keys and verifies false positive probability.
 func TestFalsePositives(t *testing.T) {
-	f, err := NewFilter(1000, 4, 0.01)
+	f, err := newFilter(1000, 4, 0.01)
 	if err != nil {
 		t.Error("unable to create a filter")
 	}
 	lastFP := float64(0)
 	for i := 0; i < 1000; i++ {
-		f.AddKey(fmt.Sprintf("key-%d", i))
-		if f.ProbFalsePositive() < lastFP {
+		f.addKey(fmt.Sprintf("key-%d", i))
+		if f.probFalsePositive() < lastFP {
 			t.Error("P(FP) should increase")
 		}
-		lastFP = f.ProbFalsePositive()
+		lastFP = f.probFalsePositive()
 	}
 	for i := 0; i < 1000; i++ {
-		if !f.HasKey(fmt.Sprintf("key-%d", i)) {
+		if !f.hasKey(fmt.Sprintf("key-%d", i)) {
 			t.Error("could not find key-", i)
 		}
 	}
 	// Measure false positive rate empirically and verify
 	// against filter's math.
-	probFP := f.ProbFalsePositive()
+	probFP := f.probFalsePositive()
 	countFP := 0
 	for i := 0; i < 1000; i++ {
-		if f.HasKey(fmt.Sprintf("nonkey-%d", i)) {
+		if f.hasKey(fmt.Sprintf("nonkey-%d", i)) {
 			countFP++
 		}
 	}
@@ -191,13 +203,13 @@ func TestFalsePositives(t *testing.T) {
 // TestApproximateInsertions adds many keys with an overloaded filter and
 // verifies that approximation degrades gracefully.
 func TestApproximateInsertions(t *testing.T) {
-	f, err := NewFilter(10, 4, 0.10)
+	f, err := newFilter(10, 4, 0.10)
 	if err != nil {
 		t.Error("unable to create a filter")
 	}
 	for i := 0; i <= 200; i++ {
-		f.AddKey(fmt.Sprintf("key-%d", i))
-		diff := i + 1 - int(f.ApproximateInsertions())
+		f.addKey(fmt.Sprintf("key-%d", i))
+		diff := i + 1 - int(f.approximateInsertions())
 		if i > 150 && diff == 0 {
 			t.Error("expected some approximation error at 150 insertions")
 		}
diff --git a/gossip/infostore.go b/gossip/infostore.go
index 3b0415e6aad..b39360e8787 100644
--- a/gossip/infostore.go
+++ b/gossip/infostore.go
@@ -40,7 +40,7 @@ type InfoStore struct {
 // Parameters to tune bloom filters returned by the store.
 const (
 	// filterBits is the default number of bits per byte slot.
-	filterBits = 4
+	filterBits = 3
 	// filterMaxFP is the default upper bound for a false positive's value.
 	filterMaxFP = 0.025
 )
@@ -291,15 +291,15 @@ func (is *InfoStore) delta(seq int64) (*InfoStore, error) {
 // buildFilter builds a bloom filter containing the keys held in the
 // store which arrived within the specified number of maximum hops.
 // Filters are passed to peer nodes in order to evaluate gossip candidates.
-func (is *InfoStore) buildFilter(maxHops uint32) (*Filter, error) {
-	f, err := NewFilter(is.infoCount(), filterBits, filterMaxFP)
+func (is *InfoStore) buildFilter(maxHops uint32) (*filter, error) {
+	f, err := newFilter(is.infoCount(), filterBits, filterMaxFP)
 	if err != nil {
 		return nil, err
 	}
 
 	err = is.visitInfos(nil, func(info *Info) error {
 		if info.Hops <= maxHops {
-			f.AddKey(info.Key)
+			f.addKey(info.Key)
 		}
 		return nil
 	})
@@ -311,12 +311,12 @@ func (is *InfoStore) buildFilter(maxHops uint32) (*Filter, error) {
 // infostore. Each key from the infostore is "removed" from the
 // filter, to yield a filter with remains approximately equal to the
 // keys contained in the filter but not present in the infostore.
-func (is *InfoStore) diffFilter(f *Filter, maxHops uint32) (uint32, error) {
+func (is *InfoStore) diffFilter(f *filter, maxHops uint32) (uint32, error) {
 	err := is.visitInfos(nil, func(info *Info) error {
 		if info.Hops <= maxHops {
-			f.RemoveKey(info.Key)
+			f.removeKey(info.Key)
 		}
 		return nil
 	})
-	return f.ApproximateInsertions(), err
+	return f.approximateInsertions(), err
 }
diff --git a/gossip/infostore_test.go b/gossip/infostore_test.go
index 6bfb4862fc1..33405d58984 100644
--- a/gossip/infostore_test.go
+++ b/gossip/infostore_test.go
@@ -413,19 +413,19 @@ func TestBuildFilter(t *testing.T) {
 	}
 
 	for i := 0; i < 10; i++ {
-		if !f.HasKey(fmt.Sprintf("a.%d", i)) {
-			t.Error("filter should contain key a.", i)
+		if !f.hasKey(fmt.Sprintf("a.%d", i)) {
+			t.Errorf("filter should contain key a.%d", i)
 		}
-		if !f.HasKey(fmt.Sprintf("b.%d", i)) {
-			t.Error("filter should contain key b.", i)
+		if !f.hasKey(fmt.Sprintf("b.%d", i)) {
+			t.Errorf("filter should contain key b.%d", i)
 		}
-		if !f.HasKey(fmt.Sprintf("c.%d", i)) {
-			t.Error("filter should contain key c.", i)
+		if !f.hasKey(fmt.Sprintf("c.%d", i)) {
+			t.Errorf("filter should contain key c.%d", i)
 		}
 	}
 
 	// Verify non-keys are not present.
-	if f.HasKey("d.1") || f.HasKey("d.2") {
+	if f.hasKey("d.1") || f.hasKey("d.2") {
 		t.Error("filter should not contain d.1 or d.2")
 	}
 }
@@ -440,10 +440,10 @@ func TestFilterMaxHops(t *testing.T) {
 		t.Fatal("unable to build filter:", err)
 	}
 
-	if !f.HasKey("a.1") || !f.HasKey("b.1") {
+	if !f.hasKey("a.1") || !f.hasKey("b.1") {
 		t.Error("filter should have low-hops keys for a and b")
 	}
-	if f.HasKey("a.2") || f.HasKey("b.2") {
+	if f.hasKey("a.2") || f.hasKey("b.2") {
 		t.Error("filter shouldn't have high-hops keys for a and b")
 	}
 }
@@ -476,7 +476,7 @@ func TestDiff(t *testing.T) {
 	}
 
 	// An empty filter returns 0 diff.
-	f, err = NewFilter(10, 2, 0.01)
+	f, err = newFilter(10, 2, 0.01)
 	if err != nil {
 		t.Fatal("could not create filter:", err)
 	}
@@ -487,12 +487,12 @@ func TestDiff(t *testing.T) {
 
 	// Create a filter with just the non-group items of the infostore
 	// and diff: expect empty as infostore contains everything.
-	f, err = NewFilter(10, 2, 0.01)
+	f, err = newFilter(10, 2, 0.01)
 	if err != nil {
 		t.Fatal("could not create filter:", err)
 	}
 	for _, info := range is.Infos {
-		f.AddKey(info.Key)
+		f.addKey(info.Key)
 	}
 	diff, err = is.diffFilter(f, 1)
 	if diff != 0 || err != nil {