Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

fix(cmSketch) #326

Closed
wants to merge 3 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
104 changes: 92 additions & 12 deletions sketch.go
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,8 @@ type cmSketch struct {

const (
// cmDepth is the number of counter copies to store (think of it as rows).
// This value hasn't changed in years. The functions below using `fourIndexes`
// use that fact to unwind the loops.
cmDepth = 4
)

Expand All @@ -56,29 +58,107 @@ func newCmSketch(numCounters int64) *cmSketch {
// Cryptographic precision not needed
source := rand.New(rand.NewSource(time.Now().UnixNano())) //nolint:gosec
for i := 0; i < cmDepth; i++ {
sketch.seed[i] = source.Uint64()
sketch.seed[i] = spread(source.Uint64())
sketch.rows[i] = newCmRow(numCounters)
}
return sketch
}

// Increment increments the count(ers) for the specified key.
func circRightShift(x uint64, shift uint) uint64 {
return (x << (64 - shift)) | (x >> shift)
}
func circLeftShift(x uint64, shift uint) uint64 {
return (x << shift) | (x >> (64 - shift))
}

// Applies a supplemental hash function to a given hashCode, which defends against poor quality
// hash functions.
func spread(x uint64) uint64 {
x = (circRightShift(x, 16) ^ x) * 0x45d9f3b
x = (circRightShift(x, 16) ^ x) * 0x45d9f3b
return circRightShift(x, 16) ^ x
}

func leftAndRight(x uint64) (l, r uint64) {
l = x<<32 | (x & 0x00000000ffffffff)
r = x>>32 | (x & 0xffffffff00000000)
return
}

// fourIndexes returns the four indexes to use against the four rows for the given key.
// Because many 64 keys come in with low entropy in some portions of the 64 bits, some work is done
// to spread any entropy at all across all four index results, with the goal of minimizing the
// number of times similar keys results in similar row indexes. No tests against zero are needed so
// there are no branch predictions to stall the instruction pipeline.
func (s *cmSketch) fourIndexes(x uint64) (a, b, c, d uint64) {
x = spread(x)
l, r := leftAndRight(x)

l = circLeftShift(l, 3)
r = circRightShift(r, 5)

l = circLeftShift(l, 8)
a = (l ^ r)
l = circLeftShift(l, 8)
b = (l ^ (r >> 8))
l = circLeftShift(l, 8)
c = (l ^ (r >> 16))
l = circLeftShift(l, 8)
d = (l ^ (r >> 24))

// Interestingly, the seeds don't do anything important if the incoming hash doesn't cause
// different parts of them to be used. The seed is meant to further swivel the counter index
// per row, but if a given row's seed is used each time, unaltered for a row, it serves no
// purpose. An improvement below uses the seeds based on the hash low bits.
// a ^= s.seed[0]
// b ^= s.seed[1]
// c ^= s.seed[2]
// d ^= s.seed[3]

// Use the hash's lowest 6 bits for an int in range [0,63] that is then used to shift right
// each seed. The low bits of the hash are used to determine rotation amount of each row's seed
// before the seed is applied to the four index computers. These computers are actually
// swivelers, causing the index that is created via s.mask to be swiveled.
a ^= circRightShift(s.seed[0], uint(x&63))
b ^= circRightShift(s.seed[1], uint(x&63))
c ^= circRightShift(s.seed[2], uint(x&63))
d ^= circRightShift(s.seed[3], uint(x&63))
return
}

// Increment increments the counters for the specified key.
func (s *cmSketch) Increment(hashed uint64) {
for i := range s.rows {
s.rows[i].increment((hashed ^ s.seed[i]) & s.mask)
}
a, b, c, d := s.fourIndexes(hashed)
m := s.mask

s.rows[0].increment(a & m)
s.rows[1].increment(b & m)
s.rows[2].increment(c & m)
s.rows[3].increment(d & m)
}

// Estimate returns the value of the specified key.
// It does this by calculating the index for each row that `Increment` would have used for the
// specified key and returning the lowest of the four counters.
func (s *cmSketch) Estimate(hashed uint64) int64 {
min := byte(255)
for i := range s.rows {
val := s.rows[i].get((hashed ^ s.seed[i]) & s.mask)
if val < min {
min = val
}
a, b, c, d := s.fourIndexes(hashed)
m := s.mask

// find the smallest counter value from all the rows
v0 := s.rows[0].get(a & m)
v1 := s.rows[1].get(b & m)
v2 := s.rows[2].get(c & m)
v3 := s.rows[3].get(d & m)
if v1 < v0 {
v0 = v1
}
if v3 < v2 {
v2 = v3
}
if v2 < v0 {
return int64(v2)
}
return int64(min)
return int64(v0)
}

// Reset halves all counter values.
Expand Down
93 changes: 87 additions & 6 deletions sketch_test.go
Original file line number Diff line number Diff line change
@@ -1,6 +1,9 @@
package ristretto

import (
"math/rand"
"sort"
"strings"
"testing"

"github.com/stretchr/testify/require"
Expand All @@ -16,17 +19,95 @@ func TestSketch(t *testing.T) {
newCmSketch(0)
}

// Temporary comment. This test is fixed, kind of.
// It will now report when two or more rows are identical but that isn't what
// should be tested. What should be tested is that the rows end up having unique
// values from each other. If the rows have identical counters, but they are in
// different positions, that doesn't help; no reason to have multiple rows if
// the increment for one row always changes counters like to does for the next
// row. So the actual test that shows whether row counters are being incremented
// uniquely is the new one below: TestSketchRowUniqueness. There the counters in
// a row are sorted before being compared.
func TestSketchIncrement(t *testing.T) {
s := newCmSketch(16)
s.Increment(1)
s.Increment(5)
s.Increment(9)
pseudoRandomIncrements(s, anyseed, anycount)
for i := 0; i < cmDepth; i++ {
if s.rows[i].string() != s.rows[0].string() {
break
rowi := s.rows[i].string()
for j := 0; j < i; j++ {
rowj := s.rows[j].string()
require.False(t, rowi == rowj, "identical rows, bad hashing")
}
}
}

const anyseed = int64(990099)
const anycount = int(100) // Hopefully not large enough to max out a counter

func pseudoRandomIncrements(s *cmSketch, seed int64, count int) {
r := rand.New(rand.NewSource(anyseed))
for n := 0; n < count; n++ {
s.Increment(r.Uint64())
}
}

// Bad hashing increments because there is very little entropy
// between the values. This is used to test how well the sketch
// uses multiple rows when there is little entropy between the hashes
// being incremented with.
func badHashingIncrements(s *cmSketch, count int) {
for hashed := 0; hashed < count; hashed++ {
s.Increment(uint64(hashed + 1))
}
}

func TestSketchRowUniqueness(t *testing.T) {
// Test the row uniqueness twice.
// Once when the hashes being added are pretty random
// which we would normally expect.
// And once when the hashes added are not normal, maybe
// they are all low numbers for example.
t.Run("WithGoodHashing", func(t *testing.T) {
s := newCmSketch(16)
pseudoRandomIncrements(s, anyseed, anycount)
testSketchRowUniqueness1(t, s)
})
t.Run("WithBadHashing", func(t *testing.T) {
s := newCmSketch(16)
badHashingIncrements(s, anycount)
testSketchRowUniqueness1(t, s)
})
}

func testSketchRowUniqueness1(t *testing.T, s *cmSketch) {
// Perform test like the one above, TestSketchIncrement, but
// compare the rows counters, once the counters are sorted.
// If after several insertions, the rows have the same counters
// in them, the hashing scheme is likely not actually
// creating uniqueness between rows.

var unswiv [cmDepth]string
for i := 0; i < cmDepth; i++ {
unswiv[i] = s.rows[i].string()
}
// Now perform a kind of unswivel on each row, so counters are in ascending order.
for i := 0; i < cmDepth; i++ {
fields := strings.Split(unswiv[i], " ")
sort.Strings(fields)
unswiv[i] = strings.Join(fields, " ")
}
identical := 0
for i := 0; i < cmDepth; i++ {
t.Logf("s.rows[%d] %s, unswiv[%d] %s", i, s.rows[i].string(), i, unswiv[i])

for j := 0; j < i; j++ {
if unswiv[i] == unswiv[j] {
// Even one would be suspect. But count here so we can see how many rows look the same.
identical++
break // break out of j loop, knowing i looks like any earlier is enough, don't want to double count
}
}
require.False(t, i == cmDepth-1, "identical rows, bad seeding")
}
require.Zero(t, identical, "%d unswiveled rows looked like earlier unswiveled rows", identical)
}

func TestSketchEstimate(t *testing.T) {
Expand Down