From f8cedef1e7037f467509653ad6bf2d941834f27c Mon Sep 17 00:00:00 2001
From: Elad Gabay
Date: Sun, 13 Aug 2023 20:14:37 +0300
Subject: [PATCH] Enhance Support for Larger Datasets and Buckets in Encoding

This commit improves encoding by supporting numbers of items and buckets
that exceed max(uint32). Formerly the encoding used uint32 for counts,
although the filter structure already supported larger values through uint.
Until now the filter only partially supported larger datasets: not all
buckets were utilized (note the changes in `generateIndexTagHash`,
`altIndex`, and `indexHash`). Now all references to bucket indices and item
counts explicitly use uint64, and a new encoding format accommodates the
larger filters. To distinguish the legacy format (up to max(uint32) items)
from the new one, a prefix marker is introduced. Decoding seamlessly
supports both formats, and the Encode method takes a legacy boolean
parameter to allow gradual adoption.
---
 cuckoofilter.go      | 182 +++++++++++++++++++++++++++++++------------
 cuckoofilter_test.go | 135 ++++++++++++++++----------------
 packedtable.go       |  98 ++++++++++++++---------
 singletable.go       | 106 +++++++++++++++----------
 util.go              |  14 ++--
 5 files changed, 337 insertions(+), 198 deletions(-)

diff --git a/cuckoofilter.go b/cuckoofilter.go
index b03092f..65fc1b8 100644
--- a/cuckoofilter.go
+++ b/cuckoofilter.go
@@ -11,36 +11,39 @@ import (
 	"errors"
 	"fmt"
 	"io"
+	"math"
 
 	"github.com/dgryski/go-metro"
 )
 
 // maximum number of cuckoo kicks before claiming failure
-const kMaxCuckooCount uint = 500
+const kMaxCuckooCount uint64 = 500
+
+type TableType uint32
 
 const (
 	// TableTypeSingle normal single table
-	TableTypeSingle = 0
+	TableTypeSingle TableType = 0
 	// TableTypePacked packed table, use semi-sort to save 1 bit per item
-	TableTypePacked = 1
+	TableTypePacked TableType = 1
 )
 
 type table interface {
-	Init(tagsPerBucket, bitsPerTag, num uint, initialBucketsHint []byte) error
-	NumBuckets() uint
-	FindTagInBuckets(i1, i2 uint, tag uint32) bool
-	DeleteTagFromBucket(i uint, tag uint32) bool
-	InsertTagToBucket(i uint, tag uint32, kickOut bool, oldTag *uint32) bool
-	SizeInTags() uint
-	SizeInBytes() uint
+	Init(tagsPerBucket, bitsPerTag uint, num uint64, initialBucketsHint []byte) error
+	NumBuckets() uint64
+	FindTagInBuckets(i1, i2 uint64, tag uint32) bool
+	DeleteTagFromBucket(i uint64, tag uint32) bool
+	InsertTagToBucket(i uint64, tag uint32, kickOut bool, oldTag *uint32) bool
+	SizeInTags() uint64
+	SizeInBytes() uint64
 	Info() string
 	BitsPerItem() uint
-	Reader() (io.Reader, uint)
-	Decode([]byte) error
+	Reader(legacy bool) (io.Reader, uint64)
+	Decode(legacy bool, _ []byte) error
 	Reset()
 }
 
-func getTable(tableType uint) interface{} {
+func getTable(tableType TableType) interface{} {
 	switch tableType {
 	case TableTypePacked:
 		return NewPackedTable()
@@ -50,30 +53,28 @@ func getTable(tableType uint) interface{} {
 }
 
 type victimCache struct {
-	index uint
+	index uint64
 	tag   uint32
 	used  bool
 }
 
-const filterMetadataSize = 3*bytesPerUint32 + 1
-
 // Filter cuckoo filter type struct
 type Filter struct {
 	victim   victimCache
-	numItems uint
+	numItems uint64
 	table    table
 }
 
-//NewFilter return a new initialized filter
+// NewFilter return a new initialized filter
 /*
 	tagsPerBucket: num of tags for each bucket, which is b in paper. tag is fingerprint, which is f in paper.
-	bitPerItem: num of bits for each item, which is length of tag(fingerprint)
+	bitPerItem: num of bits for each item, which is length of tag(fingerprint) (max is 32)
 	maxNumKeys: num of keys that filter will store. this value should close to and lower
 	nextPow2(maxNumKeys/tagsPerBucket) * maxLoadFactor. cause table.NumBuckets is always a power of two
 */
-func NewFilter(tagsPerBucket, bitsPerItem, maxNumKeys, tableType uint) *Filter {
-	numBuckets := getNextPow2(uint64(maxNumKeys / tagsPerBucket))
-	if float64(maxNumKeys)/float64(numBuckets*tagsPerBucket) > maxLoadFactor(tagsPerBucket) {
+func NewFilter(tagsPerBucket, bitsPerItem uint, maxNumKeys uint64, tableType TableType) *Filter {
+	numBuckets := getNextPow2(maxNumKeys / uint64(tagsPerBucket))
+	if float64(maxNumKeys)/float64(numBuckets*uint64(tagsPerBucket)) > maxLoadFactor(tagsPerBucket) {
 		numBuckets <<= 1
 	}
 	if numBuckets == 0 {
@@ -86,30 +87,46 @@ func NewFilter(tagsPerBucket, bitsPerItem, maxNumKeys, tableType uint) *Filter {
 	}
 }
 
-func (f *Filter) indexHash(hv uint32) uint {
+func (f *Filter) indexHash(hv uint64) uint64 {
 	// table.NumBuckets is always a power of two, so modulo can be replaced with bitwise-and:
-	return uint(hv) & (f.table.NumBuckets() - 1)
+	return hv & (f.table.NumBuckets() - 1)
 }
 
-func (f *Filter) tagHash(hv uint32) uint32 {
-	return hv%((1<<f.table.BitsPerItem())-1) + 1
+func (f *Filter) tagHash(hv uint64) uint32 {
+	return uint32(hv)%((1<<f.table.BitsPerItem())-1) + 1
 }
 
-func (f *Filter) generateIndexTagHash(item []byte) (index uint, tag uint32) {
-	hash := metro.Hash64(item, 1337)
-	index = f.indexHash(uint32(hash >> 32))
-	tag = f.tagHash(uint32(hash))
+func (f *Filter) generateIndexTagHash(item []byte) (index uint64, tag uint32) {
+	// For backward compatibility with existing filters before adding the support of more than max uint32 items,
+	// we need to use the old hash function.
+	if f.table.SizeInTags()-1 <= math.MaxUint32 {
+		hash := metro.Hash64(item, 1337)
+		index = f.indexHash(hash >> 32)
+		tag = f.tagHash(hash)
+		return
+	}
+
+	hash1, hash2 := metro.Hash128(item, 1337)
+	index = f.indexHash(hash1)
+	tag = f.tagHash(hash2)
 	return
 }
 
-func (f *Filter) altIndex(index uint, tag uint32) uint {
-	// 0x5bd1e995 is the hash constant from MurmurHash2
-	return f.indexHash(uint32(index) ^ (tag * 0x5bd1e995))
+func (f *Filter) altIndex(index uint64, tag uint32) uint64 {
+	// For backward compatibility with existing filters before adding the support of more than max uint32 items,
+	// we need to use the old hash function.
+	if f.table.SizeInTags()-1 <= math.MaxUint32 {
+		// 0x5bd1e995 is the hash constant from MurmurHash2.
+		return f.indexHash(uint64(uint32(index) ^ (tag * 0x5bd1e995)))
+	}
+
+	// 0xc6a4a7935bd1e995 is the hash constant from MurmurHash64A.
+ return f.indexHash(index ^ (uint64(tag) * 0xc6a4a7935bd1e995)) } // Size return num of items that filter store -func (f *Filter) Size() uint { - var c uint +func (f *Filter) Size() uint64 { + var c uint64 if f.victim.used { c = 1 } @@ -122,7 +139,7 @@ func (f *Filter) LoadFactor() float64 { } // SizeInBytes return bytes occupancy of filter's table -func (f *Filter) SizeInBytes() uint { +func (f *Filter) SizeInBytes() uint64 { return f.table.SizeInBytes() } @@ -148,12 +165,12 @@ func (f *Filter) AddUnique(item []byte) bool { return f.Add(item) } -func (f *Filter) addImpl(i uint, tag uint32) bool { +func (f *Filter) addImpl(i uint64, tag uint32) bool { curIndex := i curTag := tag var oldTag uint32 - var count uint + var count uint64 var kickOut bool for count = 0; count < kMaxCuckooCount; count++ { kickOut = count > 0 @@ -255,8 +272,8 @@ func (f *Filter) Info() string { } // Encode returns a byte slice representing a Cuckoo filter -func (f *Filter) Encode() ([]byte, error) { - filterReader, filterSize := f.EncodeReader() +func (f *Filter) Encode(legacy bool) ([]byte, error) { + filterReader, filterSize := f.EncodeReader(legacy) buf := make([]byte, filterSize) if _, err := io.ReadFull(filterReader, buf); err != nil { return nil, err @@ -264,10 +281,48 @@ func (f *Filter) Encode() ([]byte, error) { return buf, nil } +const ( + // uint32(numItems), uint32(victim.index), uint32(victim.tag), byte(victim.used) + filterMetadataSizeLegacy = 3*bytesPerUint32 + 1 + + // uint64(numItems), uint64(victim.index), uint32(victim.tag), byte(victim.used) + filterMetadataSize = 2*bytesPerUint64 + bytesPerUint32 + 1 +) + +// In the legacy serialization format, there are 3 uint32s and then a byte for "victimUsed" which is a boolean 0 or 1. +// We need a way to distinguish between the legacy format and the new format, so we can use that byte as a marker +// (0/1 means the legacy format, other value means the new format). +// In the new format the first 13 bytes are not used, just markers, the actual serialization starts after the marker. +// The marker is hex of "IMNOTLEGACY!!" :). +var newFormatMarker = [filterMetadataSizeLegacy]byte{0x49, 0x4D, 0x4E, 0x4F, 0x54, 0x4C, 0x45, 0x47, 0x41, 0x43, 0x59, 0x21, 0x21} + // EncodeReader returns a reader representing a Cuckoo filter -func (f *Filter) EncodeReader() (io.Reader, uint) { +func (f *Filter) EncodeReader(legacy bool) (io.Reader, uint64) { + if legacy { + return f.encodeReaderLegacyMaxUint32() + } + var metadata [filterMetadataSize]byte + for i, n := range []uint64{f.numItems, f.victim.index} { + binary.LittleEndian.PutUint64(metadata[i*bytesPerUint64:], n) + } + + binary.LittleEndian.PutUint32(metadata[2*bytesPerUint64:], f.victim.tag) + + victimUsed := byte(0) + if f.victim.used { + victimUsed = byte(1) + } + metadata[2*bytesPerUint64+bytesPerUint32] = victimUsed + tableReader, tableEncodedSize := f.table.Reader(false) + return io.MultiReader(bytes.NewReader(newFormatMarker[:]), bytes.NewReader(metadata[:]), tableReader), uint64(len(newFormatMarker)) + uint64(len(metadata)) + tableEncodedSize +} + +// encodeReaderLegacyMaxUint32 returns a reader representing a Cuckoo filter encoded in the legacy mode that supports up to max(uint32) items. 
+func (f *Filter) encodeReaderLegacyMaxUint32() (io.Reader, uint64) { + var metadata [filterMetadataSizeLegacy]byte + for i, n := range []uint32{uint32(f.numItems), uint32(f.victim.index), f.victim.tag} { binary.LittleEndian.PutUint32(metadata[i*bytesPerUint32:], n) } @@ -277,8 +332,8 @@ func (f *Filter) EncodeReader() (io.Reader, uint) { victimUsed = byte(1) } metadata[bytesPerUint32*3] = victimUsed - tableReader, tableEncodedSize := f.table.Reader() - return io.MultiReader(bytes.NewReader(metadata[:]), tableReader), uint(len(metadata)) + tableEncodedSize + tableReader, tableEncodedSize := f.table.Reader(true) + return io.MultiReader(bytes.NewReader(metadata[:]), tableReader), uint64(len(metadata)) + tableEncodedSize } // Decode returns a Cuckoo Filter using a copy of the provided byte slice. @@ -293,13 +348,42 @@ func DecodeFrom(b []byte) (*Filter, error) { if len(b) < 20 { return nil, errors.New("unexpected bytes length") } - numItems := uint(binary.LittleEndian.Uint32(b[0*bytesPerUint32:])) - curIndex := uint(binary.LittleEndian.Uint32(b[1*bytesPerUint32:])) - curTag := binary.LittleEndian.Uint32(b[2*1*bytesPerUint32:]) - used := b[12] == byte(1) - tableType := uint(b[13]) + + curOffset := uint64(0) + legacy := uint(b[len(newFormatMarker)-1]) <= 1 + + // Skip the marker if it's the new format. + if !legacy { + curOffset += uint64(len(newFormatMarker)) + } + + var numItems uint64 + if legacy { + numItems = uint64(binary.LittleEndian.Uint32(b[curOffset:])) + curOffset += bytesPerUint32 + } else { + numItems = binary.LittleEndian.Uint64(b[curOffset:]) + curOffset += bytesPerUint64 + } + + var curIndex uint64 + if legacy { + curIndex = uint64(binary.LittleEndian.Uint32(b[curOffset:])) + curOffset += bytesPerUint32 + } else { + curIndex = binary.LittleEndian.Uint64(b[curOffset:]) + curOffset += bytesPerUint64 + } + + curTag := binary.LittleEndian.Uint32(b[curOffset:]) + curOffset += bytesPerUint32 + + used := b[curOffset] == byte(1) + + tableOffset := curOffset + 1 + tableType := TableType(b[tableOffset]) table := getTable(tableType).(table) - if err := table.Decode(b[13:]); err != nil { + if err := table.Decode(legacy, b[tableOffset:]); err != nil { return nil, err } return &Filter{ diff --git a/cuckoofilter_test.go b/cuckoofilter_test.go index b94a317..d2b2c33 100644 --- a/cuckoofilter_test.go +++ b/cuckoofilter_test.go @@ -19,7 +19,8 @@ const size = 100000 var ( testBucketSize = []uint{2, 4, 8} testFingerprintSize = []uint{2, 4, 5, 6, 7, 8, 9, 10, 12, 13, 16, 17, 23, 31, 32} - testTableType = []uint{TableTypeSingle, TableTypePacked} + encodeLegacy = []bool{true, false} + testTableType = []TableType{TableTypeSingle, TableTypePacked} ) func TestFilter(t *testing.T) { @@ -29,82 +30,84 @@ func TestFilter(t *testing.T) { for _, b := range testBucketSize { for _, f := range testFingerprintSize { for _, table := range testTableType { - if f == 2 && table == TableTypePacked { - continue - } - if table == TableTypePacked && b != 4 { - continue - } - cf := NewFilter(b, f, 8190, table) - // fmt.Println(cf.Info()) - a := make([][]byte, 0) - for i := uint(0); i < insertNum; i++ { - _, _ = io.ReadFull(rand.Reader, hash[:]) - if cf.AddUnique(hash[:]) { - tmp := make([]byte, 32) - copy(tmp, hash[:]) - a = append(a, tmp) + for _, legacy := range encodeLegacy { + if f == 2 && table == TableTypePacked { + continue + } + if table == TableTypePacked && b != 4 { + continue + } + cf := NewFilter(b, f, 8190, table) + // fmt.Println(cf.Info()) + a := make([][]byte, 0) + for i := uint(0); i < insertNum; i++ { + 
_, _ = io.ReadFull(rand.Reader, hash[:]) + if cf.AddUnique(hash[:]) { + tmp := make([]byte, 32) + copy(tmp, hash[:]) + a = append(a, tmp) + } } - } - count := cf.Size() - if count != uint(len(a)) { - t.Errorf("Expected count = %d, instead count = %d, b %v f %v", uint(len(a)), count, b, f) - return - } + count := cf.Size() + if count != uint64(len(a)) { + t.Errorf("Expected count = %d, instead count = %d, b %v f %v", uint(len(a)), count, b, f) + return + } - encodedBytes, err := cf.Encode() - if err != nil { - t.Fatalf("err %v", err) - } - if len(encodedBytes) != cap(encodedBytes) { - t.Fatalf("len(%d) != cap(%d)", len(encodedBytes), cap(encodedBytes)) - } - ncf, err := Decode(encodedBytes) - if err != nil || !reflect.DeepEqual(cf, ncf) { - t.Errorf("Expected epual, err %v", err) - return - } + encodedBytes, err := cf.Encode(legacy) + if err != nil { + t.Fatalf("err %v", err) + } + if len(encodedBytes) != cap(encodedBytes) { + t.Fatalf("len(%d) != cap(%d)", len(encodedBytes), cap(encodedBytes)) + } + ncf, err := Decode(encodedBytes) + if err != nil || !reflect.DeepEqual(cf, ncf) { + t.Errorf("Expected epual, err %v", err) + return + } - encodedBytes, err = cf.Encode() - if err != nil { - t.Fatalf("err %v", err) - } - ncf, err = DecodeFrom(encodedBytes) - if err != nil || !reflect.DeepEqual(cf, ncf) { - t.Errorf("Expected epual, err %v", err) - return - } + encodedBytes, err = cf.Encode(legacy) + if err != nil { + t.Fatalf("err %v", err) + } + ncf, err = DecodeFrom(encodedBytes) + if err != nil || !reflect.DeepEqual(cf, ncf) { + t.Errorf("Expected epual, err %v", err) + return + } - filterReader, _ := cf.EncodeReader() - bytesFromReader, err := io.ReadAll(filterReader) - if err != nil { - t.Fatalf("Error reading from reader") - } - if !bytes.Equal(bytesFromReader, encodedBytes) { - t.Fatalf("Expected to be equal") - } + filterReader, _ := cf.EncodeReader(legacy) + bytesFromReader, err := io.ReadAll(filterReader) + if err != nil { + t.Fatalf("Error reading from reader") + } + if !bytes.Equal(bytesFromReader, encodedBytes) { + t.Fatalf("Expected to be equal") + } - fmt.Println(cf.Info()) - cf.BitsPerItem() - cf.SizeInBytes() - cf.LoadFactor() + fmt.Println(cf.Info()) + cf.BitsPerItem() + cf.SizeInBytes() + cf.LoadFactor() + + for _, v := range a { + if !cf.Contain(v) { + t.Errorf("Expected contain, instead not contain, b %v f %v table type %v", b, f, table) + return + } + cf.Delete(v) + } - for _, v := range a { - if !cf.Contain(v) { - t.Errorf("Expected contain, instead not contain, b %v f %v table type %v", b, f, table) + count = cf.Size() + if count != 0 { + t.Errorf("Expected count = 0, instead count == %d, b %v f %v table type %v", count, b, f, table) return } - cf.Delete(v) - } - count = cf.Size() - if count != 0 { - t.Errorf("Expected count = 0, instead count == %d, b %v f %v table type %v", count, b, f, table) - return + fmt.Printf("Filter bucketSize %v fingerprintSize %v tableType %v falsePositive Rate %v \n", b, f, table, cf.FalsePositiveRate()) } - - fmt.Printf("Filter bucketSize %v fingerprintSize %v tableType %v falsePositive Rate %v \n", b, f, table, cf.FalsePositiveRate()) } } } diff --git a/packedtable.go b/packedtable.go index dd2ea34..076d03b 100644 --- a/packedtable.go +++ b/packedtable.go @@ -21,10 +21,11 @@ type PackedTable struct { kDirBitsMask uint32 bitsPerTag uint - len uint - numBuckets uint + len uint64 + numBuckets uint64 buckets []byte perm PermEncoding + totalSlots uint64 } // NewPackedTable return a packedTable @@ -39,7 +40,7 @@ const ( ) // Init init table -func 
(p *PackedTable) Init(_, bitsPerTag, num uint, initialBucketsHint []byte) error { +func (p *PackedTable) Init(_, bitsPerTag uint, num uint64, initialBucketsHint []byte) error { p.bitsPerTag = bitsPerTag p.numBuckets = num @@ -48,28 +49,29 @@ func (p *PackedTable) Init(_, bitsPerTag, num uint, initialBucketsHint []byte) e p.kBytesPerBucket = (p.kBitsPerBucket + 7) >> 3 p.kDirBitsMask = ((1 << p.kDirBitsPerTag) - 1) << cFpSize // NOTE: use 7 extra bytes to avoid overrun as we always read a uint64 - p.len = (p.kBitsPerBucket*p.numBuckets+7)>>3 + 7 + p.len = (uint64(p.kBitsPerBucket)*p.numBuckets+7)>>3 + 7 buckets, err := getBucketsFromHint(initialBucketsHint, p.len) if err != nil { return err } p.buckets = buckets p.perm.Init() + p.totalSlots = tagsPerPTable * p.numBuckets return nil } // NumBuckets return num of table buckets -func (p *PackedTable) NumBuckets() uint { +func (p *PackedTable) NumBuckets() uint64 { return p.numBuckets } // SizeInTags return num of tags that table can store -func (p *PackedTable) SizeInTags() uint { - return tagsPerPTable * p.numBuckets +func (p *PackedTable) SizeInTags() uint64 { + return p.totalSlots } // SizeInBytes return bytes occupancy of table -func (p *PackedTable) SizeInBytes() uint { +func (p *PackedTable) SizeInBytes() uint64 { return p.len } @@ -79,9 +81,9 @@ func (p *PackedTable) BitsPerItem() uint { } // PrintBucket print a bucket -func (p *PackedTable) PrintBucket(i uint) { - pos := p.kBitsPerBucket * i / bitsPerByte - fmt.Printf("\tbucketBits =%x\n", p.buckets[pos:pos+p.kBytesPerBucket]) +func (p *PackedTable) PrintBucket(i uint64) { + pos := uint64(p.kBitsPerBucket) * i / bitsPerByte + fmt.Printf("\tbucketBits =%x\n", p.buckets[pos:pos+uint64(p.kBytesPerBucket)]) var tags [tagsPerPTable]uint32 p.ReadBucket(i, &tags) p.PrintTags(tags) @@ -118,10 +120,10 @@ func (p *PackedTable) sortTags(tags *[tagsPerPTable]uint32) { // ReadBucket read and decode the bucket i, pass the 4 decoded tags to the 2nd arg // bucket bits = 12 codeword bits + dir bits of tag1 + dir bits of tag2 ... 
-func (p *PackedTable) ReadBucket(i uint, tags *[tagsPerPTable]uint32) { +func (p *PackedTable) ReadBucket(i uint64, tags *[tagsPerPTable]uint32) { var codeword uint16 var lowBits [tagsPerPTable]uint8 - pos := i * p.kBitsPerBucket >> 3 + pos := i * uint64(p.kBitsPerBucket) >> 3 switch p.bitsPerTag { case 5: // 1 dirBits per tag, 16 bits per bucket @@ -205,13 +207,13 @@ func (p *PackedTable) ReadBucket(i uint, tags *[tagsPerPTable]uint32) { tags[3] |= uint32(lowBits[3]) } -func (p *PackedTable) readOutBytes(i, pos uint) (uint64, uint64, uint) { - rShift := (p.kBitsPerBucket * i) & (bitsPerByte - 1) +func (p *PackedTable) readOutBytes(i, pos uint64) (uint64, uint64, uint64) { + rShift := (uint64(p.kBitsPerBucket) * i) & (bitsPerByte - 1) // tag is max 32bit, store 31bit per tag, so max occupies 16 bytes - kBytes := (rShift + p.kBitsPerBucket + 7) / bitsPerByte + kBytes := (rShift + uint64(p.kBitsPerBucket) + 7) / bitsPerByte var u1, u2 uint64 - for k := uint(0); k < kBytes; k++ { + for k := uint64(0); k < kBytes; k++ { if k < bytesPerUint64 { u1 |= uint64(p.buckets[pos+k]) << (k * bitsPerByte) } else { @@ -223,7 +225,7 @@ func (p *PackedTable) readOutBytes(i, pos uint) (uint64, uint64, uint) { } // WriteBucket write tags into bucket i -func (p *PackedTable) WriteBucket(i uint, tags [tagsPerPTable]uint32) { +func (p *PackedTable) WriteBucket(i uint64, tags [tagsPerPTable]uint32) { p.sortTags(&tags) /* put in direct bits for each tag*/ @@ -242,7 +244,7 @@ func (p *PackedTable) WriteBucket(i uint, tags [tagsPerPTable]uint32) { // note that : tags[j] = lowBits[j] | highBits[j] codeword := p.perm.Encode(lowBits) - pos := i * p.kBitsPerBucket >> 3 + pos := i * uint64(p.kBitsPerBucket) >> 3 switch p.kBitsPerBucket { case 16: // 1 dirBits per tag @@ -342,11 +344,11 @@ func (p *PackedTable) WriteBucket(i uint, tags [tagsPerPTable]uint32) { } } -func (p *PackedTable) writeInBytes(i, pos uint, codeword uint16, highBits [tagsPerPTable]uint32) { - rShift := (p.kBitsPerBucket * i) & (bitsPerByte - 1) - lShift := (rShift + p.kBitsPerBucket) & (bitsPerByte - 1) +func (p *PackedTable) writeInBytes(i, pos uint64, codeword uint16, highBits [tagsPerPTable]uint32) { + rShift := (uint64(p.kBitsPerBucket) * i) & (bitsPerByte - 1) + lShift := (rShift + uint64(p.kBitsPerBucket)) & (bitsPerByte - 1) // tag is max 32bit, store 31bit per tag, so max occupies 16 bytes - kBytes := (rShift + p.kBitsPerBucket + 7) / bitsPerByte + kBytes := (rShift + uint64(p.kBitsPerBucket) + 7) / bitsPerByte rMask := uint8(0xff) >> (bitsPerByte - rShift) lMask := uint8(0xff) << lShift @@ -374,7 +376,7 @@ func (p *PackedTable) writeInBytes(i, pos uint, codeword uint16, highBits [tagsP } } - for k := uint(0); k < kBytes; k++ { + for k := uint64(0); k < kBytes; k++ { if k < bytesPerUint64 { p.buckets[pos+k] = byte(u1 >> (k * bitsPerByte)) } else { @@ -386,7 +388,7 @@ func (p *PackedTable) writeInBytes(i, pos uint, codeword uint16, highBits [tagsP } // FindTagInBuckets find if tag in bucket i1 i2 -func (p *PackedTable) FindTagInBuckets(i1, i2 uint, tag uint32) bool { +func (p *PackedTable) FindTagInBuckets(i1, i2 uint64, tag uint32) bool { var tags1, tags2 [tagsPerPTable]uint32 p.ReadBucket(i1, &tags1) p.ReadBucket(i2, &tags2) @@ -397,7 +399,7 @@ func (p *PackedTable) FindTagInBuckets(i1, i2 uint, tag uint32) bool { } // DeleteTagFromBucket delete tag from bucket i -func (p *PackedTable) DeleteTagFromBucket(i uint, tag uint32) bool { +func (p *PackedTable) DeleteTagFromBucket(i uint64, tag uint32) bool { var tags [tagsPerPTable]uint32 
p.ReadBucket(i, &tags) for j := 0; j < tagsPerPTable; j++ { @@ -411,7 +413,7 @@ func (p *PackedTable) DeleteTagFromBucket(i uint, tag uint32) bool { } // InsertTagToBucket insert tag into bucket i -func (p *PackedTable) InsertTagToBucket(i uint, tag uint32, kickOut bool, oldTag *uint32) bool { +func (p *PackedTable) InsertTagToBucket(i uint64, tag uint32, kickOut bool, oldTag *uint32) bool { var tags [tagsPerPTable]uint32 p.ReadBucket(i, &tags) for j := 0; j < tagsPerPTable; j++ { @@ -447,20 +449,44 @@ func (p *PackedTable) Info() string { p.bitsPerTag, p.kDirBitsPerTag, p.numBuckets, p.SizeInTags()) } -const packedTableMetadataSize = 2+bytesPerUint32 +const ( + packedTableMetadataSize = 2 + bytesPerUint64 + packedTableMetadataSizeLegacy = 2 + bytesPerUint32 +) + +// Reader returns a reader representing a TableBucket +func (p *PackedTable) Reader(legacy bool) (io.Reader, uint64) { + var metadata []byte + if legacy { + metadata = make([]byte, packedTableMetadataSizeLegacy) + } else { + metadata = make([]byte, packedTableMetadataSize) + } -// Encode returns a byte slice representing a TableBucket -func (p *PackedTable) Reader() (io.Reader, uint) { - var metadata [packedTableMetadataSize]byte metadata[0] = uint8(TableTypePacked) metadata[1] = uint8(p.bitsPerTag) - binary.LittleEndian.PutUint32(metadata[2:], uint32(p.numBuckets)) - return io.MultiReader(bytes.NewReader(metadata[:]), bytes.NewReader(p.buckets)), uint(len(metadata) + len(p.buckets)) + + if legacy { + binary.LittleEndian.PutUint32(metadata[2:], uint32(p.numBuckets)) + } else { + binary.LittleEndian.PutUint64(metadata[2:], p.numBuckets) + } + + return io.MultiReader(bytes.NewReader(metadata[:]), bytes.NewReader(p.buckets)), uint64(len(metadata)) + uint64(len(p.buckets)) } // Decode parse a byte slice into a TableBucket -func (p *PackedTable) Decode(b []byte) error { +func (p *PackedTable) Decode(legacy bool, b []byte) error { bitsPerTag := uint(b[1]) - numBuckets := uint(binary.LittleEndian.Uint32(b[2:])) - return p.Init(0, bitsPerTag, numBuckets, b[6:]) + + var numBuckets, offset uint64 + if legacy { + numBuckets = uint64(binary.LittleEndian.Uint32(b[2:])) + offset = packedTableMetadataSizeLegacy + } else { + numBuckets = binary.LittleEndian.Uint64(b[2:]) + offset = packedTableMetadataSize + } + + return p.Init(0, bitsPerTag, numBuckets, b[offset:]) } diff --git a/singletable.go b/singletable.go index b41c63c..b30b3cc 100644 --- a/singletable.go +++ b/singletable.go @@ -16,11 +16,12 @@ import ( // SingleTable the most naive table implementation: one huge bit array type SingleTable struct { kTagsPerBucket uint - numBuckets uint + numBuckets uint64 bitsPerTag uint tagMask uint32 bucket []byte - len uint + len uint64 + totalSlots uint64 } // NewSingleTable return a singleTable @@ -29,34 +30,35 @@ func NewSingleTable() *SingleTable { } // Init init table -func (t *SingleTable) Init(tagsPerBucket, bitsPerTag, num uint, initialBucketsHint []byte) error { +func (t *SingleTable) Init(tagsPerBucket, bitsPerTag uint, num uint64, initialBucketsHint []byte) error { t.bitsPerTag = bitsPerTag t.numBuckets = num t.kTagsPerBucket = tagsPerBucket t.tagMask = (1 << bitsPerTag) - 1 - t.len = (t.bitsPerTag*t.kTagsPerBucket*t.numBuckets + 7) >> 3 + t.len = (uint64(t.bitsPerTag)*uint64(t.kTagsPerBucket)*t.numBuckets + 7) >> 3 buckets, err := getBucketsFromHint(initialBucketsHint, t.len) if err != nil { return err } t.bucket = buckets + t.totalSlots = uint64(t.kTagsPerBucket) * t.numBuckets return nil } // NumBuckets return num of table buckets -func 
(t *SingleTable) NumBuckets() uint { +func (t *SingleTable) NumBuckets() uint64 { return t.numBuckets } // SizeInBytes return bytes occupancy of table -func (t *SingleTable) SizeInBytes() uint { +func (t *SingleTable) SizeInBytes() uint64 { return t.len } // SizeInTags return num of tags that table can store -func (t *SingleTable) SizeInTags() uint { - return t.kTagsPerBucket * t.numBuckets +func (t *SingleTable) SizeInTags() uint64 { + return t.totalSlots } // BitsPerItem return bits occupancy per item of table @@ -65,8 +67,8 @@ func (t *SingleTable) BitsPerItem() uint { } // ReadTag read tag from bucket(i,j) -func (t *SingleTable) ReadTag(i, j uint) uint32 { - pos := (i*t.bitsPerTag*t.kTagsPerBucket + t.bitsPerTag*j) / bitsPerByte +func (t *SingleTable) ReadTag(i, j uint64) uint32 { + pos := (i*uint64(t.bitsPerTag)*uint64(t.kTagsPerBucket) + uint64(t.bitsPerTag)*j) / bitsPerByte var tag uint32 /* following code only works for little-endian */ switch t.bitsPerTag { @@ -89,12 +91,12 @@ func (t *SingleTable) ReadTag(i, j uint) uint32 { return tag & t.tagMask } -func (t *SingleTable) readOutBytes(i, j, pos uint) uint32 { - rShift := (i*t.bitsPerTag*t.kTagsPerBucket + t.bitsPerTag*j) & (bitsPerByte - 1) +func (t *SingleTable) readOutBytes(i, j, pos uint64) uint32 { + rShift := (i*uint64(t.bitsPerTag)*uint64(t.kTagsPerBucket) + uint64(t.bitsPerTag)*j) & (bitsPerByte - 1) // tag is max 32bit, so max occupies 5 bytes - kBytes := (rShift + t.bitsPerTag + 7) / bitsPerByte + kBytes := (rShift + uint64(t.bitsPerTag) + 7) / bitsPerByte var tmp uint64 - for k := uint(0); k < kBytes; k++ { + for k := uint64(0); k < kBytes; k++ { tmp |= uint64(t.bucket[pos+k]) << (bitsPerByte * k) } tmp >>= rShift @@ -102,8 +104,8 @@ func (t *SingleTable) readOutBytes(i, j, pos uint) uint32 { } // WriteTag write tag into bucket(i,j) -func (t *SingleTable) WriteTag(i, j uint, n uint32) { - pos := (i*t.bitsPerTag*t.kTagsPerBucket + t.bitsPerTag*j) / bitsPerByte +func (t *SingleTable) WriteTag(i, j uint64, n uint32) { + pos := (i*uint64(t.bitsPerTag)*uint64(t.kTagsPerBucket) + uint64(t.bitsPerTag)*j) / bitsPerByte tag := n & t.tagMask /* following code only works for little-endian */ switch t.bitsPerTag { @@ -146,11 +148,11 @@ func (t *SingleTable) WriteTag(i, j uint, n uint32) { } } -func (t *SingleTable) writeInBytes(i, j, pos uint, tag uint32) { - rShift := (i*t.bitsPerTag*t.kTagsPerBucket + t.bitsPerTag*j) & (bitsPerByte - 1) - lShift := (rShift + t.bitsPerTag) & (bitsPerByte - 1) +func (t *SingleTable) writeInBytes(i, j, pos uint64, tag uint32) { + rShift := (i*uint64(t.bitsPerTag)*uint64(t.kTagsPerBucket) + uint64(t.bitsPerTag)*j) & (bitsPerByte - 1) + lShift := (rShift + uint64(t.bitsPerTag)) & (bitsPerByte - 1) // tag is max 32bit, so max occupies 5 bytes - kBytes := (rShift + t.bitsPerTag + 7) / bitsPerByte + kBytes := (rShift + uint64(t.bitsPerTag) + 7) / bitsPerByte rMask := uint8(0xff) >> (bitsPerByte - rShift) lMask := uint8(0xff) << lShift @@ -163,15 +165,15 @@ func (t *SingleTable) writeInBytes(i, j, pos uint, tag uint32) { tmp |= uint64(t.bucket[pos+end]&lMask) << (end * bitsPerByte) tmp |= uint64(tag) << rShift - for k := uint(0); k < kBytes; k++ { + for k := uint64(0); k < kBytes; k++ { t.bucket[pos+k] = byte(tmp >> (k * bitsPerByte)) } } // FindTagInBuckets find if tag in bucket i1 i2 -func (t *SingleTable) FindTagInBuckets(i1, i2 uint, tag uint32) bool { - var j uint - for j = 0; j < t.kTagsPerBucket; j++ { +func (t *SingleTable) FindTagInBuckets(i1, i2 uint64, tag uint32) bool { + var j uint64 + for j 
= 0; j < uint64(t.kTagsPerBucket); j++ { if t.ReadTag(i1, j) == tag || t.ReadTag(i2, j) == tag { return true } @@ -180,9 +182,9 @@ func (t *SingleTable) FindTagInBuckets(i1, i2 uint, tag uint32) bool { } // DeleteTagFromBucket delete tag from bucket i -func (t *SingleTable) DeleteTagFromBucket(i uint, tag uint32) bool { - var j uint - for j = 0; j < t.kTagsPerBucket; j++ { +func (t *SingleTable) DeleteTagFromBucket(i uint64, tag uint32) bool { + var j uint64 + for j = 0; j < uint64(t.kTagsPerBucket); j++ { if t.ReadTag(i, j) == tag { t.WriteTag(i, j, 0) return true @@ -192,16 +194,16 @@ func (t *SingleTable) DeleteTagFromBucket(i uint, tag uint32) bool { } // InsertTagToBucket insert tag into bucket i -func (t *SingleTable) InsertTagToBucket(i uint, tag uint32, kickOut bool, oldTag *uint32) bool { - var j uint - for j = 0; j < t.kTagsPerBucket; j++ { +func (t *SingleTable) InsertTagToBucket(i uint64, tag uint32, kickOut bool, oldTag *uint32) bool { + var j uint64 + for j = 0; j < uint64(t.kTagsPerBucket); j++ { if t.ReadTag(i, j) == 0 { t.WriteTag(i, j, tag) return true } } if kickOut { - r := uint(rand.Int31()) % t.kTagsPerBucket + r := uint64(uint(rand.Int31()) % t.kTagsPerBucket) *oldTag = t.ReadTag(i, r) t.WriteTag(i, r, tag) } @@ -224,22 +226,46 @@ func (t *SingleTable) Info() string { t.bitsPerTag, t.kTagsPerBucket, t.numBuckets, t.SizeInTags()) } -const singleTableMetadataSize = 3 + bytesPerUint32 +const ( + singleTableMetadataSize = 3 + bytesPerUint64 + singleTableMetadataSizeLegacy = 3 + bytesPerUint32 +) + +// Reader returns a reader representing a TableBucket +func (t *SingleTable) Reader(legacy bool) (io.Reader, uint64) { + var metadata []byte + if legacy { + metadata = make([]byte, singleTableMetadataSizeLegacy) + } else { + metadata = make([]byte, singleTableMetadataSize) + } -// Encode returns a byte slice representing a TableBucket -func (t *SingleTable) Reader() (io.Reader, uint) { - var metadata [singleTableMetadataSize]byte metadata[0] = uint8(TableTypeSingle) metadata[1] = uint8(t.kTagsPerBucket) metadata[2] = uint8(t.bitsPerTag) - binary.LittleEndian.PutUint32(metadata[3:], uint32(t.numBuckets)) - return io.MultiReader(bytes.NewReader(metadata[:]), bytes.NewReader(t.bucket)), uint(len(metadata) + len(t.bucket)) + + if legacy { + binary.LittleEndian.PutUint32(metadata[3:], uint32(t.numBuckets)) + } else { + binary.LittleEndian.PutUint64(metadata[3:], t.numBuckets) + } + + return io.MultiReader(bytes.NewReader(metadata[:]), bytes.NewReader(t.bucket)), uint64(len(metadata)) + uint64(len(t.bucket)) } // Decode parse a byte slice into a TableBucket -func (t *SingleTable) Decode(b []byte) error { +func (t *SingleTable) Decode(legacy bool, b []byte) error { tagsPerBucket := uint(b[1]) bitsPerTag := uint(b[2]) - numBuckets := uint(binary.LittleEndian.Uint32(b[3:])) - return t.Init(tagsPerBucket, bitsPerTag, numBuckets, b[7:]) + + var numBuckets, offset uint64 + if legacy { + numBuckets = uint64(binary.LittleEndian.Uint32(b[3:])) + offset = singleTableMetadataSizeLegacy + } else { + numBuckets = binary.LittleEndian.Uint64(b[3:]) + offset = singleTableMetadataSize + } + + return t.Init(tagsPerBucket, bitsPerTag, numBuckets, b[offset:]) } diff --git a/util.go b/util.go index 9777123..9695228 100644 --- a/util.go +++ b/util.go @@ -8,12 +8,12 @@ package cuckoo import "fmt" const ( - bitsPerByte = 8 - bytesPerUint64 = 8 - bytesPerUint32 = 4 + bitsPerByte uint64 = 8 + bytesPerUint64 = 8 + bytesPerUint32 = 4 ) -func getNextPow2(n uint64) uint { +func getNextPow2(n uint64) uint64 { n-- 
n |= n >> 1 n |= n >> 2 @@ -22,7 +22,7 @@ func getNextPow2(n uint64) uint { n |= n >> 16 n |= n >> 32 n++ - return uint(n) + return n } func maxLoadFactor(tagsPerBucket uint) float64 { @@ -36,12 +36,12 @@ func maxLoadFactor(tagsPerBucket uint) float64 { } } -func getBucketsFromHint(initialBucketsHint []byte, expectedLength uint) ([]byte, error) { +func getBucketsFromHint(initialBucketsHint []byte, expectedLength uint64) ([]byte, error) { result := initialBucketsHint if len(result) == 0 { result = make([]byte, expectedLength) } - if uint(len(result)) != expectedLength { + if uint64(len(result)) != expectedLength { return nil, fmt.Errorf("buckets length should be %d but got %d", expectedLength, len(result)) } return result, nil
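
For reviewers trying out this change, the round trip below is a minimal sketch of the
patched API. The import path and alias are assumptions for illustration; NewFilter,
Encode, Decode, Contain, and the marker byte behavior are taken from the diff above.

package main

import (
	"fmt"
	"log"

	cuckoo "github.com/linvon/cuckoo-filter" // import path assumed for this sketch
)

func main() {
	// 4 tags per bucket, 9-bit fingerprints, ~1M keys, single-table layout.
	cf := cuckoo.NewFilter(4, 9, 1<<20, cuckoo.TableTypeSingle)
	cf.Add([]byte("hello"))

	// New uint64-based format: the serialization is prefixed with the 13-byte marker.
	newBytes, err := cf.Encode(false)
	if err != nil {
		log.Fatal(err)
	}

	// Legacy uint32-based format: readable by pre-patch versions of the library.
	legacyBytes, err := cf.Encode(true)
	if err != nil {
		log.Fatal(err)
	}

	// Byte 12 disambiguates the two layouts: in the legacy format it is the
	// victimUsed boolean (0 or 1); in the new format it is the final marker
	// byte '!' (0x21).
	fmt.Printf("legacy[12]=%#x new[12]=%#x\n", legacyBytes[12], newBytes[12])

	// Decoding is format-agnostic: both encodings restore an equivalent filter.
	for _, buf := range [][]byte{legacyBytes, newBytes} {
		f, err := cuckoo.Decode(buf)
		if err != nil {
			log.Fatal(err)
		}
		fmt.Println(f.Contain([]byte("hello"))) // true
	}
}

Note that encoding with legacy=true casts the counts down to uint32, so it is only
appropriate for filters that stay within the old max(uint32) limits; legacy=false
should be preferred once all readers have picked up this change.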