From f3b41318a22e72b6416a561c2ded28f94a1048d4 Mon Sep 17 00:00:00 2001
From: Daniel Lemire <daniel@lemire.me>
Date: Tue, 26 Sep 2023 10:31:19 -0400
Subject: [PATCH] Fixing issue 35

---
 binaryfusefilter.go      |  13 +--
 binaryfusefilter_test.go |  22 +++++
 fusefilter.go            | 207 ---------------------------------------
 fusefilter_test.go       | 150 ----------------------------
 xorfilter.go             |  29 ++++--
 xorfilter_test.go        |  49 +++++++++
 6 files changed, 97 insertions(+), 373 deletions(-)
 delete mode 100644 fusefilter.go
 delete mode 100644 fusefilter_test.go

diff --git a/binaryfusefilter.go b/binaryfusefilter.go
index 3f06016..fd76263 100644
--- a/binaryfusefilter.go
+++ b/binaryfusefilter.go
@@ -1,9 +1,9 @@
 package xorfilter
 
 import (
+	"errors"
 	"math"
 	"math/bits"
-	"sort"
 )
 
 type BinaryFuse8 struct {
@@ -114,13 +114,8 @@ func PopulateBinaryFuse8(keys []uint64) (*BinaryFuse8, error) {
 		iterations += 1
 		if iterations > MaxIterations {
 			// The probability of this happening is lower than the
-			// the cosmic-ray probability (i.e., a cosmic ray corrupts your system),
-			// but if it happens, we just fill the fingerprint with ones which
-			// will flag all possible keys as 'possible', ensuring a correct result.
-			for i := 0; i < len(filter.Fingerprints); i++ {
-				filter.Fingerprints[i] = ^uint8(0)
-			}
-			return filter, nil
+			// cosmic-ray probability (i.e., a cosmic ray corrupts your system).
+			return nil, errors.New("too many iterations")
 		}
 
 		blockBits := 1
@@ -252,7 +247,7 @@ func PopulateBinaryFuse8(keys []uint64) (*BinaryFuse8, error) {
 			// manage to remove them all. We may simply sort the key to
 			// solve the issue. This will run in time O(n log n) and it
 			// mutates the input.
-			sort.Slice(keys, func(i, j int) bool { return keys[i] < keys[j] })
+			keys = pruneDuplicates(keys)
 		}
 		for i := uint32(0); i < size; i++ {
 			reverseOrder[i] = 0
diff --git a/binaryfusefilter_test.go b/binaryfusefilter_test.go
index 9b57539..6c8f138 100644
--- a/binaryfusefilter_test.go
+++ b/binaryfusefilter_test.go
@@ -5,6 +5,7 @@ import (
 	"math/rand"
 	"testing"
 
+	"github.com/cespare/xxhash"
 	"github.com/stretchr/testify/assert"
 )
 
@@ -305,3 +306,24 @@ func BenchmarkBinaryFuse8Contains50000000(b *testing.B) {
 		binaryfusedbig.Contains(rand.Uint64())
 	}
 }
+
+// Test_Issue35 builds BinaryFuse8 filters from hashed keys that may repeat
+// (issue 35) and checks that every input key is reported as present.
+func Test_Issue35(t *testing.T) {
+	for test := 0; test < 100; test++ {
+		hashes := make([]uint64, 0, 40000)
+		for i := 0; i < 40000; i++ {
+			v := encode(int32(rand.Intn(10)), int32(rand.Intn(100000)))
+			hashes = append(hashes, xxhash.Sum64(v))
+		}
+		inner, err := PopulateBinaryFuse8(hashes)
+		if err != nil {
+			t.Fatalf("round %d: PopulateBinaryFuse8: %v", test, err)
+		}
+		for i, d := range hashes {
+			if !inner.Contains(d) {
+				t.Fatalf("round %d: key %d (%d) not found", test, i, d)
+			}
+		}
+	}
+}
diff --git a/fusefilter.go b/fusefilter.go
deleted file mode 100644
index 9a8928b..0000000
--- a/fusefilter.go
+++ /dev/null
@@ -1,207 +0,0 @@
-package xorfilter
-
-import (
-	"errors"
-)
-
-// The Fuse8 xor filter uses 8-bit fingerprints. It offers the same <0.4% false-positive probability
-// as the xor filter, but uses less space (~9.1 bits/entry vs ~9.9 bits/entry).
-//
-// The Fuse8 xor filter uses the fuse data structure, which requires a large number of keys to be
-// operational. Experimentally, this number is somewhere >1e5. For smaller key sets, prefer thhe
-// Xor8 filter.
-//
-// For more information on the fuse graph data structure, see https://arxiv.org/abs/1907.04749.
-// This implementation is referenced from the C implementation at https://github.com/FastFilter/xor_singleheader/pull/11.
-type Fuse8 struct {
-	Seed          uint64
-	SegmentLength uint32
-	Fingerprints  []uint8
-}
-
-type h012 struct {
-	h0 uint32
-	h1 uint32
-	h2 uint32
-}
-
-const ARITY = 3
-const SEGMENT_COUNT = 100
-const SLOTS = SEGMENT_COUNT + ARITY - 1
-
-// Contains returns `true` if key is part of the set with a false positive probability of <0.4%.
-func (filter *Fuse8) Contains(key uint64) bool {
-	hash := mixsplit(key, filter.Seed)
-	f := uint8(fingerprint(hash))
-	r0 := uint32(hash)
-	r1 := uint32(rotl64(hash, 21))
-	r2 := uint32(rotl64(hash, 42))
-	r3 := uint32((0xBF58476D1CE4E5B9 * hash) >> 32)
-	seg := reduce(r0, SEGMENT_COUNT)
-	h0 := seg*filter.SegmentLength + reduce(r1, filter.SegmentLength)
-	h1 := (seg+1)*filter.SegmentLength + reduce(r2, filter.SegmentLength)
-	h2 := (seg+2)*filter.SegmentLength + reduce(r3, filter.SegmentLength)
-	return f == (filter.Fingerprints[h0] ^ filter.Fingerprints[h1] ^
-		filter.Fingerprints[h2])
-}
-
-func (filter *Fuse8) makeKeyHashes(k uint64) hashes {
-	hash := mixsplit(k, filter.Seed)
-	answer := hashes{}
-	answer.h = hash
-	r0 := uint32(hash)
-	r1 := uint32(rotl64(hash, 21))
-	r2 := uint32(rotl64(hash, 42))
-	r3 := uint32((0xBF58476D1CE4E5B9 * hash) >> 32)
-	seg := reduce(r0, SEGMENT_COUNT)
-	answer.h0 = (seg+0)*filter.SegmentLength + reduce(r1, filter.SegmentLength)
-	answer.h1 = (seg+1)*filter.SegmentLength + reduce(r2, filter.SegmentLength)
-	answer.h2 = (seg+2)*filter.SegmentLength + reduce(r3, filter.SegmentLength)
-	return answer
-}
-
-func (filter *Fuse8) geth012(hash uint64) h012 {
-	answer := h012{}
-	r0 := uint32(hash)
-	r1 := uint32(rotl64(hash, 21))
-	r2 := uint32(rotl64(hash, 42))
-	r3 := uint32((0xBF58476D1CE4E5B9 * hash) >> 32)
-	seg := reduce(r0, SEGMENT_COUNT)
-	answer.h0 = (seg+0)*filter.SegmentLength + reduce(r1, filter.SegmentLength)
-	answer.h1 = (seg+1)*filter.SegmentLength + reduce(r2, filter.SegmentLength)
-	answer.h2 = (seg+2)*filter.SegmentLength + reduce(r3, filter.SegmentLength)
-	return answer
-}
-
-// Populate fills the filter with provided keys. For best results,
-// the caller should avoid having too many duplicated keys.
-// The function may return an error if the set is empty.
-func PopulateFuse8(keys []uint64) (*Fuse8, error) {
-
-	const FUSE_OVERHEAD = 1.0 / 0.879
-	const FUSE_CONSTANT = 1024 // todo: determine value
-	// ref: Algorithm 3
-	size := len(keys)
-	if size == 0 {
-		return nil, errors.New("provide a non-empty set")
-	}
-
-	capacity := uint32(FUSE_OVERHEAD*float64(size) + FUSE_CONSTANT)
-	capacity = capacity / SLOTS * SLOTS
-	rngcounter := uint64(1)
-
-	filter := &Fuse8{}
-	filter.SegmentLength = capacity / SLOTS
-	filter.Fingerprints = make([]uint8, capacity, capacity)
-	filter.Seed = splitmix64(&rngcounter)
-
-	H := make([]xorset, capacity, capacity)
-	Q := make([]keyindex, capacity, capacity)
-	stack := make([]keyindex, size, size)
-	iterations := 0
-	for true {
-		iterations += 1
-		if iterations > MaxIterations {
-			// The probability of this happening is lower than the
-			// the cosmic-ray probability (i.e., a cosmic ray corrupts your system),
-			// but if it happens, we just fill the fingerprint with ones which
-			// will flag all possible keys as 'possible', ensuring a correct result.
-			for i := 0; i < len(filter.Fingerprints); i++ {
-				filter.Fingerprints[i] = ^uint8(0)
-			}
-			return filter, nil
-		}
-
-		// Add all keys to the construction array.
-		for _, key := range keys {
-			hs := filter.makeKeyHashes(key)
-
-			H[hs.h0].xormask ^= hs.h
-			H[hs.h0].count++
-			H[hs.h1].xormask ^= hs.h
-			H[hs.h1].count++
-			H[hs.h2].xormask ^= hs.h
-			H[hs.h2].count++
-		}
-
-		Qsize := 0
-		// Add sets with one key to the queue.
-		for i := uint32(0); i < capacity; i++ {
-			if H[i].count == 1 {
-				Q[Qsize].index = i
-				Q[Qsize].hash = H[i].xormask
-				Qsize++
-			}
-		}
-
-		stacksize := 0
-		for Qsize > 0 {
-			Qsize--
-			ki := Q[Qsize]
-			index := ki.index
-			if H[index].count == 0 {
-				continue // not actually possible after the initial scan
-			}
-
-			hash := ki.hash
-			hs := filter.geth012(hash)
-
-			stack[stacksize] = ki
-			stacksize++
-
-			// Remove key added to stack from all sets in the construction array and
-			// enqueue sets that now have one key.
-			H[hs.h0].xormask ^= hash
-			H[hs.h0].count--
-			if H[hs.h0].count == 1 {
-				Q[Qsize].index = hs.h0
-				Q[Qsize].hash = H[hs.h0].xormask
-				Qsize++
-			}
-			H[hs.h1].xormask ^= hash
-			H[hs.h1].count--
-			if H[hs.h1].count == 1 {
-				Q[Qsize].index = hs.h1
-				Q[Qsize].hash = H[hs.h1].xormask
-				Qsize++
-			}
-			H[hs.h2].xormask ^= hash
-			H[hs.h2].count--
-			if H[hs.h2].count == 1 {
-				Q[Qsize].index = hs.h2
-				Q[Qsize].hash = H[hs.h2].xormask
-				Qsize++
-			}
-		}
-
-		if stacksize == size {
-			// Success
-			break
-		}
-
-		for i := range H {
-			H[i] = xorset{0, 0}
-		}
-		filter.Seed = splitmix64(&rngcounter)
-	}
-
-	// ref: Algorithm 4
-	stacksize := size
-	for stacksize > 0 {
-		stacksize--
-		ki := stack[stacksize]
-		hs := filter.geth012(ki.hash)
-		fp := uint8(fingerprint(ki.hash))
-		switch ki.index {
-		case hs.h0:
-			fp ^= filter.Fingerprints[hs.h1] ^ filter.Fingerprints[hs.h2]
-		case hs.h1:
-			fp ^= filter.Fingerprints[hs.h0] ^ filter.Fingerprints[hs.h2]
-		default:
-			fp ^= filter.Fingerprints[hs.h0] ^ filter.Fingerprints[hs.h1]
-		}
-		filter.Fingerprints[ki.index] = fp
-	}
-
-	return filter, nil
-}
diff --git a/fusefilter_test.go b/fusefilter_test.go
deleted file mode 100644
index a724e52..0000000
--- a/fusefilter_test.go
+++ /dev/null
@@ -1,150 +0,0 @@
-package xorfilter
-
-import (
-	"fmt"
-	"math/rand"
-	"testing"
-
-	"github.com/stretchr/testify/assert"
-)
-
-func TestFuse8Basic(t *testing.T) {
-	keys := make([]uint64, NUM_KEYS)
-	for i := range keys {
-		keys[i] = rand.Uint64()
-	}
-	filter, _ := PopulateFuse8(keys)
-	for _, v := range keys {
-		assert.Equal(t, true, filter.Contains(v))
-	}
-	falsesize := 10000000
-	matches := 0
-	bpv := float64(len(filter.Fingerprints)) * 8.0 / float64(NUM_KEYS)
-	fmt.Println("Fuse8 filter:")
-	fmt.Println("bits per entry ", bpv)
-	for i := 0; i < falsesize; i++ {
-		v := rand.Uint64()
-		if filter.Contains(v) {
-			matches++
-		}
-	}
-	fpp := float64(matches) * 100.0 / float64(falsesize)
-	fmt.Println("false positive rate ", fpp)
-	assert.Equal(t, true, fpp < 0.40)
-	cut := 1000
-	if cut > NUM_KEYS {
-		cut = NUM_KEYS
-	}
-	keys = keys[:cut]
-	for trial := 0; trial < 10; trial++ {
-		rand.Seed(int64(trial))
-		for i := range keys {
-			keys[i] = rand.Uint64()
-		}
-		filter, _ = PopulateFuse8(keys)
-		for _, v := range keys {
-			assert.Equal(t, true, filter.Contains(v))
-		}
-
-	}
-}
-
-func TestFuse8Small(t *testing.T) {
-	keys := make([]uint64, SMALL_NUM_KEYS)
-	for i := range keys {
-		keys[i] = rand.Uint64()
-	}
-	filter, _ := PopulateFuse8(keys)
-	for _, v := range keys {
-		assert.Equal(t, true, filter.Contains(v))
-	}
-	falsesize := 10000000
-	matches := 0
-	for i := 0; i < falsesize; i++ {
-		v := rand.Uint64()
-		if filter.Contains(v) {
-			matches++
-		}
-	}
-	fpp := float64(matches) * 100.0 / float64(falsesize)
-	assert.Equal(t, true, fpp < 0.40)
-	cut := 1000
-	if cut > SMALL_NUM_KEYS {
-		cut = SMALL_NUM_KEYS
-	}
-	keys = keys[:cut]
-	for trial := 0; trial < 10; trial++ {
-		rand.Seed(int64(trial))
-		for i := range keys {
-			keys[i] = rand.Uint64()
-		}
-		filter, _ = PopulateFuse8(keys)
-		for _, v := range keys {
-			assert.Equal(t, true, filter.Contains(v))
-		}
-
-	}
-}
-
-func BenchmarkConstructFuse8(b *testing.B) {
-	bigrandomarrayInit()
-	b.ResetTimer()
-	b.ReportAllocs()
-	for n := 0; n < b.N; n++ {
-		PopulateFuse8(bigrandomarray)
-	}
-}
-
-func BenchmarkFuse8Populate10000000(b *testing.B) {
-	keys := make([]uint64, NUM_KEYS, NUM_KEYS)
-	for i := range keys {
-		keys[i] = rand.Uint64()
-	}
-	b.ResetTimer()
-	for n := 0; n < b.N; n++ {
-		PopulateFuse8(keys)
-	}
-}
-
-func Test_DuplicateKeysFuse(t *testing.T) {
-	keys := []uint64{1, 77, 31, 241, 303, 303}
-	_, err := PopulateFuse8(keys)
-	if err != nil {
-		t.Fatalf("Unexpected error: %v", err)
-	}
-}
-
-func BenchmarkFuse8Contains1000000(b *testing.B) {
-	keys := make([]uint64, NUM_KEYS, NUM_KEYS)
-	for i := range keys {
-		keys[i] = rand.Uint64()
-	}
-	filter, _ := PopulateFuse8(keys)
-
-	b.ResetTimer()
-	for n := 0; n < b.N; n++ {
-		filter.Contains(keys[n%len(keys)])
-	}
-}
-
-var fusedbig *Fuse8
-
-func fusedbigInit() {
-	fmt.Println("Fuse setup")
-	keys := make([]uint64, 50000000, 50000000)
-	for i := range keys {
-		keys[i] = rand.Uint64()
-	}
-	fusedbig, _ = PopulateFuse8(keys)
-	fmt.Println("Fuse setup ok")
-}
-
-func BenchmarkFuse8Contains50000000(b *testing.B) {
-	if fusedbig == nil {
-		fusedbigInit()
-	}
-	b.ResetTimer()
-	for n := 0; n < b.N; n++ {
-		fusedbig.Contains(rand.Uint64())
-	}
-}
diff --git a/xorfilter.go b/xorfilter.go
index 838e572..4d4c294 100644
--- a/xorfilter.go
+++ b/xorfilter.go
@@ -3,6 +3,7 @@ package xorfilter
 import (
 	"errors"
 	"math"
+	"sort"
 )
 
 func murmur64(h uint64) uint64 {
@@ -141,13 +142,8 @@ func Populate(keys []uint64) (*Xor8, error) {
 		iterations += 1
 		if iterations > MaxIterations {
 			// The probability of this happening is lower than the
-			// the cosmic-ray probability (i.e., a cosmic ray corrupts your system),
-			// but if it happens, we just fill the fingerprint with ones which
-			// will flag all possible keys as 'possible', ensuring a correct result.
-			for i := 0; i < len(filter.Fingerprints); i++ {
-				filter.Fingerprints[i] = ^uint8(0)
-			}
-			return filter, nil
+			// cosmic-ray probability (i.e., a cosmic ray corrupts your system).
+			return nil, errors.New("too many iterations")
 		}
 
 		for i := 0; i < size; i++ {
@@ -261,6 +257,11 @@ func Populate(keys []uint64) (*Xor8, error) {
 			break
 		}
 
+		if iterations == 10 {
+			keys = pruneDuplicates(keys)
+			size = len(keys)
+		}
+
 		sets0 = resetSets(sets0)
 		sets1 = resetSets(sets1)
 		sets2 = resetSets(sets2)
@@ -284,3 +285,17 @@ func Populate(keys []uint64) (*Xor8, error) {
 	}
 	return filter, nil
 }
+
+// pruneDuplicates sorts array in place and returns the prefix holding its distinct values.
+func pruneDuplicates(array []uint64) []uint64 {
+	sort.Slice(array, func(i, j int) bool { return array[i] < array[j] })
+	pos := 0 // number of distinct values kept so far; stays 0 for empty input
+	for _, v := range array {
+		// Sorted input puts equal values next to each other: compare with the last kept one.
+		if pos == 0 || v != array[pos-1] {
+			array[pos] = v
+			pos++
+		}
+	}
+	return array[:pos]
+}
diff --git a/xorfilter_test.go b/xorfilter_test.go
index 4eb2fbf..6d08b3f 100644
--- a/xorfilter_test.go
+++ b/xorfilter_test.go
@@ -5,7 +5,9 @@ import (
 	"math/rand"
 	"testing"
 	"time"
+	"unsafe"
 
+	"github.com/cespare/xxhash"
 	"github.com/stretchr/testify/assert"
 )
 
@@ -101,6 +103,13 @@ func BenchmarkPopulate100000(b *testing.B) {
 	}
 }
 
+// encode packs v1 then v2 into 8 bytes (native byte order) for use as a hash input.
+func encode(v1, v2 int32) []byte {
+	// Start empty: make([]byte, 8) would prepend eight zero bytes to every key.
+	v := append(make([]byte, 0, 8), unsafe.Slice((*byte)(unsafe.Pointer(&v1)), 4)...)
+	return append(v, unsafe.Slice((*byte)(unsafe.Pointer(&v2)), 4)...)
+}
+
 // credit: el10savio
 func Test_DuplicateKeys(t *testing.T) {
 	keys := []uint64{1, 77, 31, 241, 303, 303}
@@ -169,3 +178,43 @@ func BenchmarkXor8bigContains50000000(b *testing.B) {
 		xor8big.Contains(rand.Uint64())
 	}
 }
+
+// Test_Issue35_Xor8 builds an Xor8 filter from hashed keys that may repeat
+// (issue 35) and checks that every input key is reported as present.
+func Test_Issue35_Xor8(t *testing.T) {
+	hashes := make([]uint64, 0, 2000)
+	for i := 0; i < 2000; i++ {
+		v := encode(int32(rand.Intn(10)), int32(rand.Intn(100000)))
+		hashes = append(hashes, xxhash.Sum64(v))
+	}
+	inner, err := Populate(hashes)
+	if err != nil {
+		t.Fatalf("Populate: %v", err)
+	}
+	for i, d := range hashes {
+		if !inner.Contains(d) {
+			t.Fatalf("key %d (%d) not found", i, d)
+		}
+	}
+}
+
+// Test_Issue35_basic repeats the issue 35 scenario: keys that may repeat must all be found.
+// NOTE(review): calls PopulateBinaryFuse8 in the Xor8 test file; confirm Populate was not meant.
+func Test_Issue35_basic(t *testing.T) {
+	for test := 0; test < 100; test++ {
+		hashes := make([]uint64, 0, 40000)
+		for i := 0; i < 40000; i++ {
+			v := encode(int32(rand.Intn(10)), int32(rand.Intn(100000)))
+			hashes = append(hashes, xxhash.Sum64(v))
+		}
+		inner, err := PopulateBinaryFuse8(hashes)
+		if err != nil {
+			t.Fatalf("round %d: PopulateBinaryFuse8: %v", test, err)
+		}
+		for i, d := range hashes {
+			if !inner.Contains(d) {
+				t.Fatalf("round %d: key %d (%d) not found", test, i, d)
+			}
+		}
+	}
+}