dgraph-io · FrankReh · Feb 18, 2022 · Dec 23, 2022 · Mar 16, 2023
@@ -42,6 +42,8 @@ type cmSketch struct {
 
 const (
 	// cmDepth is the number of counter copies to store (think of it as rows).
+	// This value hasn't changed in years. The functions below using `fourIndexes`
+	// use that fact to unwind the loops.
 	cmDepth = 4
 )
 
@@ -56,29 +58,107 @@ func newCmSketch(numCounters int64) *cmSketch {
 	// Cryptographic precision not needed
 	source := rand.New(rand.NewSource(time.Now().UnixNano())) //nolint:gosec
 	for i := 0; i < cmDepth; i++ {
-		sketch.seed[i] = source.Uint64()
+		sketch.seed[i] = spread(source.Uint64())
 		sketch.rows[i] = newCmRow(numCounters)
 	}
 	return sketch
 }
 
-// Increment increments the count(ers) for the specified key.
+func circRightShift(x uint64, shift uint) uint64 {
+	return (x << (64 - shift)) | (x >> shift)
+}
+func circLeftShift(x uint64, shift uint) uint64 {
+	return (x << shift) | (x >> (64 - shift))
+}
+
+//  Applies a supplemental hash function to a given hashCode, which defends against poor quality
+//  hash functions.
+func spread(x uint64) uint64 {
+	x = (circRightShift(x, 16) ^ x) * 0x45d9f3b
+	x = (circRightShift(x, 16) ^ x) * 0x45d9f3b
+	return circRightShift(x, 16) ^ x
+}
+
+func leftAndRight(x uint64) (l, r uint64) {
+	l = x<<32 | (x & 0x00000000ffffffff)
+	r = x>>32 | (x & 0xffffffff00000000)
+	return
+}
+
+// fourIndexes returns the four indexes to use against the four rows for the given key.
+// Because many 64 keys come in with low entropy in some portions of the 64 bits, some work is done
+// to spread any entropy at all across all four index results, with the goal of minimizing the
+// number of times similar keys results in similar row indexes. No tests against zero are needed so
+// there are no branch predictions to stall the instruction pipeline.
+func (s *cmSketch) fourIndexes(x uint64) (a, b, c, d uint64) {
+	x = spread(x)
+	l, r := leftAndRight(x)
+
+	l = circLeftShift(l, 3)
+	r = circRightShift(r, 5)
+
+	l = circLeftShift(l, 8)
+	a = (l ^ r)
+	l = circLeftShift(l, 8)
+	b = (l ^ (r >> 8))
+	l = circLeftShift(l, 8)
+	c = (l ^ (r >> 16))
+	l = circLeftShift(l, 8)
+	d = (l ^ (r >> 24))
+
+	// Interestingly, the seeds don't do anything important if the incoming hash doesn't cause
+	// different parts of them to be used. The seed is meant to further swivel the counter index
+	// per row, but if a given row's seed is used each time, unaltered for a row, it serves no
+	// purpose. An improvement below uses the seeds based on the hash low bits.
+	// a ^= s.seed[0]
+	// b ^= s.seed[1]
+	// c ^= s.seed[2]
+	// d ^= s.seed[3]
+
+	// Use the hash's lowest 6 bits for an int in range [0,63] that is then used to shift right
+	// each seed. The low bits of the hash are used to determine rotation amount of each row's seed
+	// before the seed is applied to the four index computers. These computers are actually
+	// swivelers, causing the index that is created via s.mask to be swiveled.
+	a ^= circRightShift(s.seed[0], uint(x&63))
+	b ^= circRightShift(s.seed[1], uint(x&63))
+	c ^= circRightShift(s.seed[2], uint(x&63))
+	d ^= circRightShift(s.seed[3], uint(x&63))
+	return
+}
+
+// Increment increments the counters for the specified key.
 func (s *cmSketch) Increment(hashed uint64) {
-	for i := range s.rows {
-		s.rows[i].increment((hashed ^ s.seed[i]) & s.mask)
-	}
+	a, b, c, d := s.fourIndexes(hashed)
+	m := s.mask
+
+	s.rows[0].increment(a & m)
+	s.rows[1].increment(b & m)
+	s.rows[2].increment(c & m)
+	s.rows[3].increment(d & m)
 }
 
 // Estimate returns the value of the specified key.
+// It does this by calculating the index for each row that `Increment` would have used for the
+// specified key and returning the lowest of the four counters.
 func (s *cmSketch) Estimate(hashed uint64) int64 {
-	min := byte(255)
-	for i := range s.rows {
-		val := s.rows[i].get((hashed ^ s.seed[i]) & s.mask)
-		if val < min {
-			min = val
-		}
+	a, b, c, d := s.fourIndexes(hashed)
+	m := s.mask
+
+	// find the smallest counter value from all the rows
+	v0 := s.rows[0].get(a & m)
+	v1 := s.rows[1].get(b & m)
+	v2 := s.rows[2].get(c & m)
+	v3 := s.rows[3].get(d & m)
+	if v1 < v0 {
+		v0 = v1
+	}
+	if v3 < v2 {
+		v2 = v3
+	}
+	if v2 < v0 {
+		return int64(v2)
 	}
-	return int64(min)
+	return int64(v0)
 }
 
 // Reset halves all counter values.

@@ -1,6 +1,9 @@
 package ristretto
 
 import (
+	"math/rand"
+	"sort"
+	"strings"
 	"testing"
 
 	"github.com/stretchr/testify/require"
@@ -16,17 +19,95 @@ func TestSketch(t *testing.T) {
 	newCmSketch(0)
 }
 
+// Temporary comment. This test is fixed, kind of.
+// It will now report when two or more rows are identical but that isn't what
+// should be tested. What should be tested is that the rows end up having unique
+// values from each other. If the rows have identical counters, but they are in
+// different positions, that doesn't help; no reason to have multiple rows if
+// the increment for one row always changes counters like to does for the next
+// row. So the actual test that shows whether row counters are being incremented
+// uniquely is the new one below: TestSketchRowUniqueness. There the counters in
+// a row are sorted before being compared.
 func TestSketchIncrement(t *testing.T) {
 	s := newCmSketch(16)
-	s.Increment(1)
-	s.Increment(5)
-	s.Increment(9)
+	pseudoRandomIncrements(s, anyseed, anycount)
 	for i := 0; i < cmDepth; i++ {
-		if s.rows[i].string() != s.rows[0].string() {
-			break
+		rowi := s.rows[i].string()
+		for j := 0; j < i; j++ {
+			rowj := s.rows[j].string()
+			require.False(t, rowi == rowj, "identical rows, bad hashing")
+		}
+	}
+}
+
+const anyseed = int64(990099)
+const anycount = int(100) // Hopefully not large enough to max out a counter
+
+func pseudoRandomIncrements(s *cmSketch, seed int64, count int) {
+	r := rand.New(rand.NewSource(anyseed))
+	for n := 0; n < count; n++ {
+		s.Increment(r.Uint64())
+	}
+}
+
+// Bad hashing increments because there is very little entropy
+// between the values. This is used to test how well the sketch
+// uses multiple rows when there is little entropy between the hashes
+// being incremented with.
+func badHashingIncrements(s *cmSketch, count int) {
+	for hashed := 0; hashed < count; hashed++ {
+		s.Increment(uint64(hashed + 1))
+	}
+}
+
+func TestSketchRowUniqueness(t *testing.T) {
+	// Test the row uniqueness twice.
+	// Once when the hashes being added are pretty random
+	// which we would normally expect.
+	// And once when the hashes added are not normal, maybe
+	// they are all low numbers for example.
+	t.Run("WithGoodHashing", func(t *testing.T) {
+		s := newCmSketch(16)
+		pseudoRandomIncrements(s, anyseed, anycount)
+		testSketchRowUniqueness1(t, s)
+	})
+	t.Run("WithBadHashing", func(t *testing.T) {
+		s := newCmSketch(16)
+		badHashingIncrements(s, anycount)
+		testSketchRowUniqueness1(t, s)
+	})
+}
+
+func testSketchRowUniqueness1(t *testing.T, s *cmSketch) {
+	// Perform test like the one above, TestSketchIncrement, but
+	// compare the rows counters, once the counters are sorted.
+	// If after several insertions, the rows have the same counters
+	// in them, the hashing scheme is likely not actually
+	// creating uniqueness between rows.
+
+	var unswiv [cmDepth]string
+	for i := 0; i < cmDepth; i++ {
+		unswiv[i] = s.rows[i].string()
+	}
+	// Now perform a kind of unswivel on each row, so counters are in ascending order.
+	for i := 0; i < cmDepth; i++ {
+		fields := strings.Split(unswiv[i], " ")
+		sort.Strings(fields)
+		unswiv[i] = strings.Join(fields, " ")
+	}
+	identical := 0
+	for i := 0; i < cmDepth; i++ {
+		t.Logf("s.rows[%d] %s, unswiv[%d] %s", i, s.rows[i].string(), i, unswiv[i])
+
+		for j := 0; j < i; j++ {
+			if unswiv[i] == unswiv[j] {
+				// Even one would be suspect. But count here so we can see how many rows look the same.
+				identical++
+				break // break out of j loop, knowing i looks like any earlier is enough, don't want to double count
+			}
 		}
-		require.False(t, i == cmDepth-1, "identical rows, bad seeding")
 	}
+	require.Zero(t, identical, "%d unswiveled rows looked like earlier unswiveled rows", identical)
 }
 
 func TestSketchEstimate(t *testing.T) {