From 13f5cfad1898d61de2fcb1f1dd3c6749ab3f8f09 Mon Sep 17 00:00:00 2001 From: Spencer Kimball Date: Sat, 15 Feb 2014 23:36:37 -0800 Subject: [PATCH] Arbitrary bit sizes for slots in the counting bloom filter Relax the divisible-by restriction on the number of bits for each slot. I suspect the sweet spot for these things will be 3 bits, so this change seemed worth doing. It was trickier than expected. I created a "visitSlotBytes" method to refactor incrementSlot and getSlot to avoid duplicated code as it had gotten complicated. Fixed visibility issues with filter class. --- gossip/filter.go | 96 +++++++++++++++++----------- gossip/filter_test.go | 132 +++++++++++++++++++++------------------ gossip/infostore.go | 14 ++--- gossip/infostore_test.go | 24 +++---- 4 files changed, 151 insertions(+), 115 deletions(-) diff --git a/gossip/filter.go b/gossip/filter.go index 0cedc4fdef1..1de9c61e82e 100644 --- a/gossip/filter.go +++ b/gossip/filter.go @@ -20,10 +20,10 @@ import ( "math" ) -// Filter is a counting bloom filter, used to approximate the number +// filter is a counting bloom filter, used to approximate the number // of differences between InfoStores from different nodes, with // minimal network overhead. -type Filter struct { +type filter struct { K uint32 // Number of hashes N uint32 // Number of insertions R uint32 // Number of putative removals @@ -56,19 +56,15 @@ func computeOptimalValues(N uint32, maxFP float64) (uint32, uint32) { return M, K2 } -// NewFilter allocates and returns a new filter with expected number of +// newFilter allocates and returns a new filter with expected number of // insertions N, Number of bits per slot B, and expected value of a false -// positive < maxFP. -func NewFilter(N uint32, B uint32, maxFP float64) (*Filter, error) { +// positive < maxFP. Number of bits must be 0 < B <= 8. +func newFilter(N uint32, B uint32, maxFP float64) (*filter, error) { if N == 0 { return nil, fmt.Errorf("number of insertions (N) must be > 0") } - // TODO(spencer): we probably would be well-served using a 3-bit - // filter, so we should relax the following constraint and get a - // little bit fancier with the bit arithmetic to handle cross-byte - // slot values. - if B != 1 && B != 2 && B != 4 && B != 8 { - return nil, fmt.Errorf("number of bits (%d) must be a divisor of 8", B) + if B == 0 || B > 8 { + return nil, fmt.Errorf("number of bits (%d) must be 0 < B <= 8", B) } if maxFP <= 0 || maxFP >= 1 { return nil, fmt.Errorf("max false positives must be 0 <= maxFP < 1: %f", maxFP) @@ -77,7 +73,7 @@ func NewFilter(N uint32, B uint32, maxFP float64) (*Filter, error) { maxCount := uint32((1 << B) - 1) numBytes := (M*B + 7) / 8 bytes := make([]byte, numBytes, numBytes) - return &Filter{ + return &filter{ K: K, B: B, M: M, @@ -87,32 +83,60 @@ func NewFilter(N uint32, B uint32, maxFP float64) (*Filter, error) { }, nil } +// visitSlotBytes visits each byte (either one or two) that make up +// the bits in the specified slot. "fn" is invoked on each byte in +// turn, with values supplied for byteIndex, byteOffset, bitMask, +// and valBitOffset. +func (f *filter) visitSlotBytes(slot uint32, fn func(uint32, uint32, uint32, uint32)) { + bitIndex := slot * f.B + byteIndex := bitIndex / 8 + byteOffset := bitIndex % 8 + + // Things are tricky here because we deal with crossing byte boundaries. + lastBit := byteOffset + f.B + valBitOffset := uint32(0) + for bit := byteOffset; bit < lastBit; { + b := lastBit - bit + if b > 8-byteOffset { + b = 8 - byteOffset + } + bitMask := uint32((1 << b) - 1) + + fn(byteIndex, byteOffset, bitMask, valBitOffset) // call supplied method + + bit += b + valBitOffset += b + byteIndex++ + byteOffset = (byteOffset + b) % 8 + } +} + // incrementSlot increments slot value by the specified amount, bounding at // maximum slot value. -func (f *Filter) incrementSlot(slot uint32, incr int32) { +func (f *filter) incrementSlot(slot uint32, incr int32) { val := int32(f.getSlot(slot)) + incr if val > int32(f.MaxCount) { val = int32(f.MaxCount) } else if val < 0 { val = 0 } - bitIndex := slot * f.B - byteIndex := bitIndex / 8 - byteOffset := bitIndex % 8 - f.Data[byteIndex] = byte(uint32(f.Data[byteIndex]) & ^(f.MaxCount << byteOffset)) - f.Data[byteIndex] = byte(uint32(f.Data[byteIndex]) | uint32(val)<>valBitOffset)&bitMask)<> byteOffset +func (f *filter) getSlot(slot uint32) uint32 { + val := uint32(0) + f.visitSlotBytes(slot, func(byteIndex uint32, byteOffset uint32, bitMask uint32, valBitOffset uint32) { + val |= ((uint32(f.Data[byteIndex]) & (bitMask << byteOffset)) >> byteOffset) << valBitOffset + }) + return val } -// AddKey adds the key to the filter. -func (f *Filter) AddKey(key string) { +// addKey adds the key to the filter. +func (f *filter) addKey(key string) { f.hasher.hashKey(key) for i := uint32(0); i < f.K; i++ { slot := f.hasher.getHash(i) % f.M @@ -121,9 +145,9 @@ func (f *Filter) AddKey(key string) { f.N++ } -// HasKey checks whether key has been added to the filter. The chance this +// hasKey checks whether key has been added to the filter. The chance this // method returns an incorrect value is given by ProbFalsePositive(). -func (f *Filter) HasKey(key string) bool { +func (f *filter) hasKey(key string) bool { f.hasher.hashKey(key) for i := uint32(0); i < f.K; i++ { slot := f.hasher.getHash(i) % f.M @@ -134,11 +158,11 @@ func (f *Filter) HasKey(key string) bool { return true } -// RemoveKey removes a key by first verifying it's likely been seen and then +// removeKey removes a key by first verifying it's likely been seen and then // decrementing each of the slots it hashes to. Returns true if the key was // "removed"; false otherwise. -func (f *Filter) RemoveKey(key string) bool { - if f.HasKey(key) { +func (f *filter) removeKey(key string) bool { + if f.hasKey(key) { f.hasher.hashKey(key) for i := uint32(0); i < f.K; i++ { slot := f.hasher.getHash(i) % f.M @@ -150,18 +174,18 @@ func (f *Filter) RemoveKey(key string) bool { return false } -// ProbFalsePositive returns the probability the filter returns a false +// probFalsePositive returns the probability the filter returns a false // positive. -func (f *Filter) ProbFalsePositive() float64 { +func (f *filter) probFalsePositive() float64 { if f.R != 0 { - return probFalsePositive(f.ApproximateInsertions(), f.K, f.M) + return probFalsePositive(f.approximateInsertions(), f.K, f.M) } return probFalsePositive(f.N, f.K, f.M) } -// ApproximateInsertions determines the approximate number of items -// inserted into the Filter after removals. -func (f *Filter) ApproximateInsertions() uint32 { +// approximateInsertions determines the approximate number of items +// inserted into the filter after removals. +func (f *filter) approximateInsertions() uint32 { count := uint32(0) for i := uint32(0); i < f.M; i++ { count += f.getSlot(i) diff --git a/gossip/filter_test.go b/gossip/filter_test.go index 0858c665265..e4a7e5a0c85 100644 --- a/gossip/filter_test.go +++ b/gossip/filter_test.go @@ -53,16 +53,19 @@ func TestOptimalValues(t *testing.T) { // TestNewFilter verifies bad inputs, optimal values, size of slots data. func TestNewFilter(t *testing.T) { - if _, err := NewFilter(0, 3, 0.10); err == nil { - t.Error("NewFilter should not accept N == 0") + if _, err := newFilter(0, 3, 0.10); err == nil { + t.Error("newFilter should not accept N == 0") } - if _, err := NewFilter(1, 3, 0.10); err == nil { - t.Error("NewFilter should not accept bits B which are non-divisor of 8") + if _, err := newFilter(1, 0, 0.10); err == nil { + t.Error("newFilter should not accept 0 bits") } - if _, err := NewFilter(1, 16, 0.10); err == nil { - t.Error("NewFilter should not accept bits B which are > 8") + if _, err := newFilter(1, 9, 0.10); err == nil { + t.Error("newFilter should not accept more than 8 bits") } - f, err := NewFilter(1000, 8, 0.01) + if _, err := newFilter(1, 16, 0.10); err == nil { + t.Error("newFilter should not accept bits B which are > 8") + } + f, err := newFilter(1000, 8, 0.01) if err != nil { t.Error("unable to create a filter") } @@ -74,15 +77,13 @@ func TestNewFilter(t *testing.T) { t.Error("slots data should require M bytes") } - // Try some fractional byte slot sizes. - bits := []uint32{1, 2, 4} - for _, B := range bits { - f, err := NewFilter(1000, B, 0.01) + // Try all byte slot sizes. + for B := uint32(1); B <= 8; B++ { + f, err := newFilter(1000, B, 0.01) if err != nil { t.Error("unable to create a filter") } - slotsPerByte := 8 / B - expSize := int((M + slotsPerByte - 1) / slotsPerByte) + expSize := int((M*B + 7) / 8) if len(f.Data) != expSize { t.Error("slot sizes don't match", len(f.Data), expSize) } @@ -94,90 +95,101 @@ func TestNewFilter(t *testing.T) { // TestSlots tests slot increment and slot count fetching. func TestSlots(t *testing.T) { - f, err := NewFilter(1000, 4, 0.01) - if err != nil { - t.Error("unable to create a filter") - } - // Verify all slots empty. - for i := 0; i < len(f.Data); i++ { - if f.Data[i] != 0 { - t.Errorf("slot %d not empty", i) + for b := 1; b <= 8; b++ { + f, err := newFilter(10, uint32(b), 0.10) + if err != nil { + t.Error("unable to create a filter") + } + // Verify all slots empty. + for i := 0; i < len(f.Data); i++ { + if f.Data[i] != 0 { + t.Errorf("slot %d not empty", i) + } + } + // Increment each slot and verify. + for s := uint32(0); s < f.M; s++ { + f.incrementSlot(s, 1) + if f.getSlot(s) != 1 { + t.Errorf("slot value %d != 1", f.getSlot(s)) + } + // Increment past max count. + f.incrementSlot(s, int32(f.MaxCount)) + if f.getSlot(s) != f.MaxCount { + t.Errorf("slot value should be max %d != %d", f.getSlot(s), f.MaxCount) + } + // Decrement once. + f.incrementSlot(s, -1) + if f.getSlot(s) != f.MaxCount-1 { + t.Errorf("slot value should be max-1 %d != %d", f.getSlot(s), f.MaxCount-1) + } + // Decrement past 0. + f.incrementSlot(s, -int32(f.MaxCount)) + if f.getSlot(s) != 0 { + t.Errorf("slot value should be 0 %d != 0", f.getSlot(s)) + } + // Increment all slots up to MaxCount and verify gets. + for i := uint32(0); i < f.MaxCount; i++ { + f.incrementSlot(s, 1) + if f.getSlot(s) != i+1 { + t.Errorf("slot value should be %d != %d", i+1, f.getSlot(s)) + } + } } - } - // Increment a slot and verify. - f.incrementSlot(0, 1) - if f.getSlot(0) != 1 { - t.Errorf("slot value %d != 1", f.getSlot(0)) - } - // Increment past max count. - f.incrementSlot(0, int32(f.MaxCount)) - if f.getSlot(0) != f.MaxCount { - t.Errorf("slot value should be max %d != %d", f.getSlot(0), f.MaxCount) - } - // Decrement once. - f.incrementSlot(0, -1) - if f.getSlot(0) != f.MaxCount-1 { - t.Errorf("slot value should be max %d != %d", f.getSlot(0), f.MaxCount-1) - } - // Decrement past 0. - f.incrementSlot(0, -int32(f.MaxCount)) - if f.getSlot(0) != 0 { - t.Errorf("slot value should be max %d != 0", f.getSlot(0)) } } // TestKeys adds keys, tests existence, and removes keys. func TestKeys(t *testing.T) { - f, err := NewFilter(1000, 4, 0.01) + f, err := newFilter(1000, 4, 0.01) if err != nil { t.Error("unable to create a filter") } - if f.HasKey("a") { + if f.hasKey("a") { t.Error("filter shouldn't contain key a") } - if f.AddKey("a"); !f.HasKey("a") { + if f.addKey("a"); !f.hasKey("a") { t.Error("filter should contain key a") } - if f.HasKey("b") { + if f.hasKey("b") { t.Error("filter should contain key b") } - if f.RemoveKey("a"); f.HasKey("a") { + if f.removeKey("a"); f.hasKey("a") { t.Error("filter shouldn't contain key a after removal") } // Add key twice, verify it still exists after one removal. - f.AddKey("a") - f.AddKey("a") - f.RemoveKey("a") - if !f.HasKey("a") { + f.addKey("a") + f.addKey("a") + f.removeKey("a") + if !f.hasKey("a") { t.Error("filter should still contain key a") } } // TestFalsePositives adds many keys and verifies false positive probability. func TestFalsePositives(t *testing.T) { - f, err := NewFilter(1000, 4, 0.01) + f, err := newFilter(1000, 4, 0.01) if err != nil { t.Error("unable to create a filter") } lastFP := float64(0) for i := 0; i < 1000; i++ { - f.AddKey(fmt.Sprintf("key-%d", i)) - if f.ProbFalsePositive() < lastFP { + f.addKey(fmt.Sprintf("key-%d", i)) + if f.probFalsePositive() < lastFP { t.Error("P(FP) should increase") } - lastFP = f.ProbFalsePositive() + lastFP = f.probFalsePositive() } for i := 0; i < 1000; i++ { - if !f.HasKey(fmt.Sprintf("key-%d", i)) { + if !f.hasKey(fmt.Sprintf("key-%d", i)) { t.Error("could not find key-", i) } } // Measure false positive rate empirically and verify // against filter's math. - probFP := f.ProbFalsePositive() + probFP := f.probFalsePositive() countFP := 0 for i := 0; i < 1000; i++ { - if f.HasKey(fmt.Sprintf("nonkey-%d", i)) { + if f.hasKey(fmt.Sprintf("nonkey-%d", i)) { countFP++ } } @@ -191,13 +203,13 @@ func TestFalsePositives(t *testing.T) { // TestApproximateInsertions adds many keys with an overloaded filter and // verifies that approximation degrades gracefully. func TestApproximateInsertions(t *testing.T) { - f, err := NewFilter(10, 4, 0.10) + f, err := newFilter(10, 4, 0.10) if err != nil { t.Error("unable to create a filter") } for i := 0; i <= 200; i++ { - f.AddKey(fmt.Sprintf("key-%d", i)) - diff := i + 1 - int(f.ApproximateInsertions()) + f.addKey(fmt.Sprintf("key-%d", i)) + diff := i + 1 - int(f.approximateInsertions()) if i > 150 && diff == 0 { t.Error("expected some approximation error at 150 insertions") } diff --git a/gossip/infostore.go b/gossip/infostore.go index 3b0415e6aad..b39360e8787 100644 --- a/gossip/infostore.go +++ b/gossip/infostore.go @@ -40,7 +40,7 @@ type InfoStore struct { // Parameters to tune bloom filters returned by the store. const ( // filterBits is the default number of bits per byte slot. - filterBits = 4 + filterBits = 3 // filterMaxFP is the default upper bound for a false positive's value. filterMaxFP = 0.025 ) @@ -291,15 +291,15 @@ func (is *InfoStore) delta(seq int64) (*InfoStore, error) { // buildFilter builds a bloom filter containing the keys held in the // store which arrived within the specified number of maximum hops. // Filters are passed to peer nodes in order to evaluate gossip candidates. -func (is *InfoStore) buildFilter(maxHops uint32) (*Filter, error) { - f, err := NewFilter(is.infoCount(), filterBits, filterMaxFP) +func (is *InfoStore) buildFilter(maxHops uint32) (*filter, error) { + f, err := newFilter(is.infoCount(), filterBits, filterMaxFP) if err != nil { return nil, err } err = is.visitInfos(nil, func(info *Info) error { if info.Hops <= maxHops { - f.AddKey(info.Key) + f.addKey(info.Key) } return nil }) @@ -311,12 +311,12 @@ func (is *InfoStore) buildFilter(maxHops uint32) (*Filter, error) { // infostore. Each key from the infostore is "removed" from the // filter, to yield a filter with remains approximately equal to the // keys contained in the filter but not present in the infostore. -func (is *InfoStore) diffFilter(f *Filter, maxHops uint32) (uint32, error) { +func (is *InfoStore) diffFilter(f *filter, maxHops uint32) (uint32, error) { err := is.visitInfos(nil, func(info *Info) error { if info.Hops <= maxHops { - f.RemoveKey(info.Key) + f.removeKey(info.Key) } return nil }) - return f.ApproximateInsertions(), err + return f.approximateInsertions(), err } diff --git a/gossip/infostore_test.go b/gossip/infostore_test.go index 6bfb4862fc1..33405d58984 100644 --- a/gossip/infostore_test.go +++ b/gossip/infostore_test.go @@ -413,19 +413,19 @@ func TestBuildFilter(t *testing.T) { } for i := 0; i < 10; i++ { - if !f.HasKey(fmt.Sprintf("a.%d", i)) { - t.Error("filter should contain key a.", i) + if !f.hasKey(fmt.Sprintf("a.%d", i)) { + t.Errorf("filter should contain key a.%d", i) } - if !f.HasKey(fmt.Sprintf("b.%d", i)) { - t.Error("filter should contain key b.", i) + if !f.hasKey(fmt.Sprintf("b.%d", i)) { + t.Errorf("filter should contain key b.%d", i) } - if !f.HasKey(fmt.Sprintf("c.%d", i)) { - t.Error("filter should contain key c.", i) + if !f.hasKey(fmt.Sprintf("c.%d", i)) { + t.Errorf("filter should contain key c.%d", i) } } // Verify non-keys are not present. - if f.HasKey("d.1") || f.HasKey("d.2") { + if f.hasKey("d.1") || f.hasKey("d.2") { t.Error("filter should not contain d.1 or d.2") } } @@ -440,10 +440,10 @@ func TestFilterMaxHops(t *testing.T) { t.Fatal("unable to build filter:", err) } - if !f.HasKey("a.1") || !f.HasKey("b.1") { + if !f.hasKey("a.1") || !f.hasKey("b.1") { t.Error("filter should have low-hops keys for a and b") } - if f.HasKey("a.2") || f.HasKey("b.2") { + if f.hasKey("a.2") || f.hasKey("b.2") { t.Error("filter shouldn't have high-hops keys for a and b") } } @@ -476,7 +476,7 @@ func TestDiff(t *testing.T) { } // An empty filter returns 0 diff. - f, err = NewFilter(10, 2, 0.01) + f, err = newFilter(10, 2, 0.01) if err != nil { t.Fatal("could not create filter:", err) } @@ -487,12 +487,12 @@ func TestDiff(t *testing.T) { // Create a filter with just the non-group items of the infostore // and diff: expect empty as infostore contains everything. - f, err = NewFilter(10, 2, 0.01) + f, err = newFilter(10, 2, 0.01) if err != nil { t.Fatal("could not create filter:", err) } for _, info := range is.Infos { - f.AddKey(info.Key) + f.addKey(info.Key) } diff, err = is.diffFilter(f, 1) if diff != 0 || err != nil {