From f3b41318a22e72b6416a561c2ded28f94a1048d4 Mon Sep 17 00:00:00 2001 From: Daniel Lemire Date: Tue, 26 Sep 2023 10:31:19 -0400 Subject: [PATCH] Fixing issue 35 --- binaryfusefilter.go | 13 +-- binaryfusefilter_test.go | 22 +++++ fusefilter.go | 207 --------------------------------------- fusefilter_test.go | 150 ---------------------------- xorfilter.go | 29 ++++-- xorfilter_test.go | 49 +++++++++ 6 files changed, 97 insertions(+), 373 deletions(-) delete mode 100644 fusefilter.go delete mode 100644 fusefilter_test.go diff --git a/binaryfusefilter.go b/binaryfusefilter.go index 3f06016..fd76263 100644 --- a/binaryfusefilter.go +++ b/binaryfusefilter.go @@ -1,9 +1,9 @@ package xorfilter import ( + "errors" "math" "math/bits" - "sort" ) type BinaryFuse8 struct { @@ -114,13 +114,8 @@ func PopulateBinaryFuse8(keys []uint64) (*BinaryFuse8, error) { iterations += 1 if iterations > MaxIterations { // The probability of this happening is lower than the - // the cosmic-ray probability (i.e., a cosmic ray corrupts your system), - // but if it happens, we just fill the fingerprint with ones which - // will flag all possible keys as 'possible', ensuring a correct result. - for i := 0; i < len(filter.Fingerprints); i++ { - filter.Fingerprints[i] = ^uint8(0) - } - return filter, nil + // the cosmic-ray probability (i.e., a cosmic ray corrupts your system). + return nil, errors.New("too many iterations") } blockBits := 1 @@ -252,7 +247,7 @@ func PopulateBinaryFuse8(keys []uint64) (*BinaryFuse8, error) { // manage to remove them all. We may simply sort the key to // solve the issue. This will run in time O(n log n) and it // mutates the input. - sort.Slice(keys, func(i, j int) bool { return keys[i] < keys[j] }) + keys = pruneDuplicates(keys) } for i := uint32(0); i < size; i++ { reverseOrder[i] = 0 diff --git a/binaryfusefilter_test.go b/binaryfusefilter_test.go index 9b57539..6c8f138 100644 --- a/binaryfusefilter_test.go +++ b/binaryfusefilter_test.go @@ -5,6 +5,7 @@ import ( "math/rand" "testing" + "github.com/cespare/xxhash" "github.com/stretchr/testify/assert" ) @@ -305,3 +306,24 @@ func BenchmarkBinaryFuse8Contains50000000(b *testing.B) { binaryfusedbig.Contains(rand.Uint64()) } } + +func Test_Issue35(t *testing.T) { + for test := 0; test < 100; test++ { + hashes := make([]uint64, 0) + for i := 0; i < 40000; i++ { + v := encode(int32(rand.Intn(10)), int32(rand.Intn(100000))) + hashes = append(hashes, xxhash.Sum64(v)) + } + inner, err := PopulateBinaryFuse8(hashes) + if err != nil { + panic(err) + } + for i, d := range hashes { + e := inner.Contains(d) + if !e { + panic(i) + } + + } + } +} diff --git a/fusefilter.go b/fusefilter.go deleted file mode 100644 index 9a8928b..0000000 --- a/fusefilter.go +++ /dev/null @@ -1,207 +0,0 @@ -package xorfilter - -import ( - "errors" -) - -// The Fuse8 xor filter uses 8-bit fingerprints. It offers the same <0.4% false-positive probability -// as the xor filter, but uses less space (~9.1 bits/entry vs ~9.9 bits/entry). -// -// The Fuse8 xor filter uses the fuse data structure, which requires a large number of keys to be -// operational. Experimentally, this number is somewhere >1e5. For smaller key sets, prefer thhe -// Xor8 filter. -// -// For more information on the fuse graph data structure, see https://arxiv.org/abs/1907.04749. -// This implementation is referenced from the C implementation at https://github.com/FastFilter/xor_singleheader/pull/11. -type Fuse8 struct { - Seed uint64 - SegmentLength uint32 - Fingerprints []uint8 -} - -type h012 struct { - h0 uint32 - h1 uint32 - h2 uint32 -} - -const ARITY = 3 -const SEGMENT_COUNT = 100 -const SLOTS = SEGMENT_COUNT + ARITY - 1 - -// Contains returns `true` if key is part of the set with a false positive probability of <0.4%. -func (filter *Fuse8) Contains(key uint64) bool { - hash := mixsplit(key, filter.Seed) - f := uint8(fingerprint(hash)) - r0 := uint32(hash) - r1 := uint32(rotl64(hash, 21)) - r2 := uint32(rotl64(hash, 42)) - r3 := uint32((0xBF58476D1CE4E5B9 * hash) >> 32) - seg := reduce(r0, SEGMENT_COUNT) - h0 := seg*filter.SegmentLength + reduce(r1, filter.SegmentLength) - h1 := (seg+1)*filter.SegmentLength + reduce(r2, filter.SegmentLength) - h2 := (seg+2)*filter.SegmentLength + reduce(r3, filter.SegmentLength) - return f == (filter.Fingerprints[h0] ^ filter.Fingerprints[h1] ^ - filter.Fingerprints[h2]) -} - -func (filter *Fuse8) makeKeyHashes(k uint64) hashes { - hash := mixsplit(k, filter.Seed) - answer := hashes{} - answer.h = hash - r0 := uint32(hash) - r1 := uint32(rotl64(hash, 21)) - r2 := uint32(rotl64(hash, 42)) - r3 := uint32((0xBF58476D1CE4E5B9 * hash) >> 32) - seg := reduce(r0, SEGMENT_COUNT) - answer.h0 = (seg+0)*filter.SegmentLength + reduce(r1, filter.SegmentLength) - answer.h1 = (seg+1)*filter.SegmentLength + reduce(r2, filter.SegmentLength) - answer.h2 = (seg+2)*filter.SegmentLength + reduce(r3, filter.SegmentLength) - return answer -} - -func (filter *Fuse8) geth012(hash uint64) h012 { - answer := h012{} - r0 := uint32(hash) - r1 := uint32(rotl64(hash, 21)) - r2 := uint32(rotl64(hash, 42)) - r3 := uint32((0xBF58476D1CE4E5B9 * hash) >> 32) - seg := reduce(r0, SEGMENT_COUNT) - answer.h0 = (seg+0)*filter.SegmentLength + reduce(r1, filter.SegmentLength) - answer.h1 = (seg+1)*filter.SegmentLength + reduce(r2, filter.SegmentLength) - answer.h2 = (seg+2)*filter.SegmentLength + reduce(r3, filter.SegmentLength) - return answer -} - -// Populate fills the filter with provided keys. For best results, -// the caller should avoid having too many duplicated keys. -// The function may return an error if the set is empty. -func PopulateFuse8(keys []uint64) (*Fuse8, error) { - - const FUSE_OVERHEAD = 1.0 / 0.879 - const FUSE_CONSTANT = 1024 // todo: determine value - // ref: Algorithm 3 - size := len(keys) - if size == 0 { - return nil, errors.New("provide a non-empty set") - } - - capacity := uint32(FUSE_OVERHEAD*float64(size) + FUSE_CONSTANT) - capacity = capacity / SLOTS * SLOTS - rngcounter := uint64(1) - - filter := &Fuse8{} - filter.SegmentLength = capacity / SLOTS - filter.Fingerprints = make([]uint8, capacity, capacity) - filter.Seed = splitmix64(&rngcounter) - - H := make([]xorset, capacity, capacity) - Q := make([]keyindex, capacity, capacity) - stack := make([]keyindex, size, size) - iterations := 0 - for true { - iterations += 1 - if iterations > MaxIterations { - // The probability of this happening is lower than the - // the cosmic-ray probability (i.e., a cosmic ray corrupts your system), - // but if it happens, we just fill the fingerprint with ones which - // will flag all possible keys as 'possible', ensuring a correct result. - for i := 0; i < len(filter.Fingerprints); i++ { - filter.Fingerprints[i] = ^uint8(0) - } - return filter, nil - } - - // Add all keys to the construction array. - for _, key := range keys { - hs := filter.makeKeyHashes(key) - - H[hs.h0].xormask ^= hs.h - H[hs.h0].count++ - H[hs.h1].xormask ^= hs.h - H[hs.h1].count++ - H[hs.h2].xormask ^= hs.h - H[hs.h2].count++ - } - - Qsize := 0 - // Add sets with one key to the queue. - for i := uint32(0); i < capacity; i++ { - if H[i].count == 1 { - Q[Qsize].index = i - Q[Qsize].hash = H[i].xormask - Qsize++ - } - } - - stacksize := 0 - for Qsize > 0 { - Qsize-- - ki := Q[Qsize] - index := ki.index - if H[index].count == 0 { - continue // not actually possible after the initial scan - } - - hash := ki.hash - hs := filter.geth012(hash) - - stack[stacksize] = ki - stacksize++ - - // Remove key added to stack from all sets in the construction array and - // enqueue sets that now have one key. - H[hs.h0].xormask ^= hash - H[hs.h0].count-- - if H[hs.h0].count == 1 { - Q[Qsize].index = hs.h0 - Q[Qsize].hash = H[hs.h0].xormask - Qsize++ - } - H[hs.h1].xormask ^= hash - H[hs.h1].count-- - if H[hs.h1].count == 1 { - Q[Qsize].index = hs.h1 - Q[Qsize].hash = H[hs.h1].xormask - Qsize++ - } - H[hs.h2].xormask ^= hash - H[hs.h2].count-- - if H[hs.h2].count == 1 { - Q[Qsize].index = hs.h2 - Q[Qsize].hash = H[hs.h2].xormask - Qsize++ - } - } - - if stacksize == size { - // Success - break - } - - for i := range H { - H[i] = xorset{0, 0} - } - filter.Seed = splitmix64(&rngcounter) - } - - // ref: Algorithm 4 - stacksize := size - for stacksize > 0 { - stacksize-- - ki := stack[stacksize] - hs := filter.geth012(ki.hash) - fp := uint8(fingerprint(ki.hash)) - switch ki.index { - case hs.h0: - fp ^= filter.Fingerprints[hs.h1] ^ filter.Fingerprints[hs.h2] - case hs.h1: - fp ^= filter.Fingerprints[hs.h0] ^ filter.Fingerprints[hs.h2] - default: - fp ^= filter.Fingerprints[hs.h0] ^ filter.Fingerprints[hs.h1] - } - filter.Fingerprints[ki.index] = fp - } - - return filter, nil -} diff --git a/fusefilter_test.go b/fusefilter_test.go deleted file mode 100644 index a724e52..0000000 --- a/fusefilter_test.go +++ /dev/null @@ -1,150 +0,0 @@ -package xorfilter - -import ( - "fmt" - "math/rand" - "testing" - - "github.com/stretchr/testify/assert" -) - -func TestFuse8Basic(t *testing.T) { - keys := make([]uint64, NUM_KEYS) - for i := range keys { - keys[i] = rand.Uint64() - } - filter, _ := PopulateFuse8(keys) - for _, v := range keys { - assert.Equal(t, true, filter.Contains(v)) - } - falsesize := 10000000 - matches := 0 - bpv := float64(len(filter.Fingerprints)) * 8.0 / float64(NUM_KEYS) - fmt.Println("Fuse8 filter:") - fmt.Println("bits per entry ", bpv) - for i := 0; i < falsesize; i++ { - v := rand.Uint64() - if filter.Contains(v) { - matches++ - } - } - fpp := float64(matches) * 100.0 / float64(falsesize) - fmt.Println("false positive rate ", fpp) - assert.Equal(t, true, fpp < 0.40) - cut := 1000 - if cut > NUM_KEYS { - cut = NUM_KEYS - } - keys = keys[:cut] - for trial := 0; trial < 10; trial++ { - rand.Seed(int64(trial)) - for i := range keys { - keys[i] = rand.Uint64() - } - filter, _ = PopulateFuse8(keys) - for _, v := range keys { - assert.Equal(t, true, filter.Contains(v)) - } - - } -} - -func TestFuse8Small(t *testing.T) { - keys := make([]uint64, SMALL_NUM_KEYS) - for i := range keys { - keys[i] = rand.Uint64() - } - filter, _ := PopulateFuse8(keys) - for _, v := range keys { - assert.Equal(t, true, filter.Contains(v)) - } - falsesize := 10000000 - matches := 0 - for i := 0; i < falsesize; i++ { - v := rand.Uint64() - if filter.Contains(v) { - matches++ - } - } - fpp := float64(matches) * 100.0 / float64(falsesize) - assert.Equal(t, true, fpp < 0.40) - cut := 1000 - if cut > SMALL_NUM_KEYS { - cut = SMALL_NUM_KEYS - } - keys = keys[:cut] - for trial := 0; trial < 10; trial++ { - rand.Seed(int64(trial)) - for i := range keys { - keys[i] = rand.Uint64() - } - filter, _ = PopulateFuse8(keys) - for _, v := range keys { - assert.Equal(t, true, filter.Contains(v)) - } - - } -} - -func BenchmarkConstructFuse8(b *testing.B) { - bigrandomarrayInit() - b.ResetTimer() - b.ReportAllocs() - for n := 0; n < b.N; n++ { - PopulateFuse8(bigrandomarray) - } -} - -func BenchmarkFuse8Populate10000000(b *testing.B) { - keys := make([]uint64, NUM_KEYS, NUM_KEYS) - for i := range keys { - keys[i] = rand.Uint64() - } - b.ResetTimer() - for n := 0; n < b.N; n++ { - PopulateFuse8(keys) - } -} - -func Test_DuplicateKeysFuse(t *testing.T) { - keys := []uint64{1, 77, 31, 241, 303, 303} - _, err := PopulateFuse8(keys) - if err != nil { - t.Fatalf("Unexpected error: %v", err) - } -} - -func BenchmarkFuse8Contains1000000(b *testing.B) { - keys := make([]uint64, NUM_KEYS, NUM_KEYS) - for i := range keys { - keys[i] = rand.Uint64() - } - filter, _ := PopulateFuse8(keys) - - b.ResetTimer() - for n := 0; n < b.N; n++ { - filter.Contains(keys[n%len(keys)]) - } -} - -var fusedbig *Fuse8 - -func fusedbigInit() { - fmt.Println("Fuse setup") - keys := make([]uint64, 50000000, 50000000) - for i := range keys { - keys[i] = rand.Uint64() - } - fusedbig, _ = PopulateFuse8(keys) - fmt.Println("Fuse setup ok") -} - -func BenchmarkFuse8Contains50000000(b *testing.B) { - if fusedbig == nil { - fusedbigInit() - } - b.ResetTimer() - for n := 0; n < b.N; n++ { - fusedbig.Contains(rand.Uint64()) - } -} diff --git a/xorfilter.go b/xorfilter.go index 838e572..4d4c294 100644 --- a/xorfilter.go +++ b/xorfilter.go @@ -3,6 +3,7 @@ package xorfilter import ( "errors" "math" + "sort" ) func murmur64(h uint64) uint64 { @@ -141,13 +142,8 @@ func Populate(keys []uint64) (*Xor8, error) { iterations += 1 if iterations > MaxIterations { // The probability of this happening is lower than the - // the cosmic-ray probability (i.e., a cosmic ray corrupts your system), - // but if it happens, we just fill the fingerprint with ones which - // will flag all possible keys as 'possible', ensuring a correct result. - for i := 0; i < len(filter.Fingerprints); i++ { - filter.Fingerprints[i] = ^uint8(0) - } - return filter, nil + // the cosmic-ray probability (i.e., a cosmic ray corrupts your system). + return nil, errors.New("too many iterations") } for i := 0; i < size; i++ { @@ -261,6 +257,11 @@ func Populate(keys []uint64) (*Xor8, error) { break } + if iterations == 10 { + keys = pruneDuplicates(keys) + size = len(keys) + } + sets0 = resetSets(sets0) sets1 = resetSets(sets1) sets2 = resetSets(sets2) @@ -284,3 +285,17 @@ func Populate(keys []uint64) (*Xor8, error) { } return filter, nil } + +func pruneDuplicates(array []uint64) []uint64 { + sort.Slice(array, func(i, j int) bool { + return array[i] < array[j] + }) + pos := 0 + for i := 1; i < len(array); i++ { + if array[i] != array[pos] { + array[pos+1] = array[i] + pos += 1 + } + } + return array[:pos+1] +} diff --git a/xorfilter_test.go b/xorfilter_test.go index 4eb2fbf..6d08b3f 100644 --- a/xorfilter_test.go +++ b/xorfilter_test.go @@ -5,7 +5,9 @@ import ( "math/rand" "testing" "time" + "unsafe" + "github.com/cespare/xxhash" "github.com/stretchr/testify/assert" ) @@ -101,6 +103,13 @@ func BenchmarkPopulate100000(b *testing.B) { } } +func encode(v1, v2 int32) []byte { + v := make([]byte, 8) + v = append(v, unsafe.Slice((*byte)(unsafe.Pointer(&v1)), 4)...) + v = append(v, unsafe.Slice((*byte)(unsafe.Pointer(&v2)), 4)...) + return v +} + // credit: el10savio func Test_DuplicateKeys(t *testing.T) { keys := []uint64{1, 77, 31, 241, 303, 303} @@ -169,3 +178,43 @@ func BenchmarkXor8bigContains50000000(b *testing.B) { xor8big.Contains(rand.Uint64()) } } + +func TestfsdIssue35_basic(t *testing.T) { + hashes := make([]uint64, 0) + for i := 0; i < 2000; i++ { + v := encode(int32(rand.Intn(10)), int32(rand.Intn(100000))) + hashes = append(hashes, xxhash.Sum64(v)) + } + inner, err := Populate(hashes) + if err != nil { + panic(err) + } + for i, d := range hashes { + e := inner.Contains(d) + fmt.Println("checking ", d) + if !e { + panic(i) + } + } +} + +func Test_Issue35_basic(t *testing.T) { + for test := 0; test < 100; test++ { + hashes := make([]uint64, 0) + for i := 0; i < 40000; i++ { + v := encode(int32(rand.Intn(10)), int32(rand.Intn(100000))) + hashes = append(hashes, xxhash.Sum64(v)) + } + inner, err := PopulateBinaryFuse8(hashes) + if err != nil { + panic(err) + } + for i, d := range hashes { + e := inner.Contains(d) + if !e { + panic(i) + } + + } + } +}