From 854612bcf1a4702c5b4a330ebe0444706ee1d797 Mon Sep 17 00:00:00 2001
From: Alexander Petrov <alldroll@gmail.com>
Date: Fri, 20 Sep 2019 13:09:59 +0100
Subject: [PATCH 1/4] Implement shotgun4Intersect

---
 setutil.go      | 98 +++++++++++++++++++++++++++++++++++++++++++++++++
 setutil_test.go | 79 ++++++++++++++++++++++++++++++++++-----
 2 files changed, 168 insertions(+), 9 deletions(-)

diff --git a/setutil.go b/setutil.go
index 3e8c01dd..2419c866 100644
--- a/setutil.go
+++ b/setutil.go
@@ -582,6 +582,104 @@ mainwhile:
 	return pos
 }
 
+// shotgun4Intersect performs intersection between small and large arrays described in
+// https://lemire.me/blog/2019/01/16/faster-intersections-between-sorted-arrays-with-shotgun/
+func shotgun4Intersect(small, large, buf []uint16) int {
+	if len(small) == 0 {
+		return 0
+	}
+
+	nS, nL := len(small), len(large)
+	buf = buf[:cap(buf)]
+	idxS, idxL := 0, 0
+	pos := 0
+
+	for (idxS+4 <= nS) && idxL < nL {
+		t1, t2, t3, t4 := small[idxS], small[idxS+1], small[idxS+2], small[idxS+3]
+		idx1, idx2, idx3, idx4 := idxL, idxL, idxL, idxL
+		n := nL - idxL
+
+		for n > 1 {
+			m := n >> 1
+
+			if large[idx1+m] < t1 {
+				idx1 += m
+			}
+
+			if large[idx2+m] < t2 {
+				idx2 += m
+			}
+
+			if large[idx3+m] < t3 {
+				idx3 += m
+			}
+
+			if large[idx4+m] < t4 {
+				idx4 += m
+			}
+
+			n -= m
+		}
+
+		if large[idx1] < t1 {
+			idx1++
+		}
+
+		if large[idx2] < t2 {
+			idx2++
+		}
+
+		if large[idx3] < t3 {
+			idx3++
+		}
+
+		if large[idx4] < t4 {
+			idx4++
+		}
+
+		if idx1 < nL && large[idx1] == t1 {
+			buf[pos] = t1
+			pos++
+		}
+
+		if idx2 < nL && large[idx2] == t2 {
+			buf[pos] = t2
+			pos++
+		}
+
+		if idx3 < nL && large[idx3] == t3 {
+			buf[pos] = t3
+			pos++
+		}
+
+		if idx4 < nL && large[idx4] == t4 {
+			buf[pos] = t4
+			pos++
+		}
+
+		idxS += 4
+		idxL = idx4
+	}
+
+	for idxS < nS && idxL < nL {
+		s := small[idxS]
+		idxL = advanceUntil(large, idxL, nL, s)
+
+		if idxL == nL {
+			break
+		}
+
+		if large[idxL] == s {
+			buf[pos] = s
+			pos++
+		}
+
+		idxS++
+	}
+
+	return pos
+}
+
 func binarySearch(array []uint16, ikey uint16) int {
 	low := 0
 	high := len(array) - 1
diff --git a/setutil_test.go b/setutil_test.go
index b037e02a..5b8c508f 100644
--- a/setutil_test.go
+++ b/setutil_test.go
@@ -4,6 +4,7 @@ package roaring
 
 import (
 	"github.com/stretchr/testify/assert"
+	"math/rand"
 	"testing"
 )
 
@@ -92,16 +93,31 @@ func TestSetUtilIntersection(t *testing.T) {
 	assert.Equal(t, expectedresult, result)
 }
 
-func TestSetUtilIntersection2(t *testing.T) {
-	data1 := []uint16{0, 2, 4, 6, 8, 10, 12, 14, 16, 18}
-	data2 := []uint16{0, 3, 6, 9, 12, 15, 18}
-	result := make([]uint16, 0, len(data1)+len(data2))
-	expectedresult := []uint16{0, 6, 12, 18}
-	nl := intersection2by2(data1, data2, result)
-	result = result[:nl]
-	result = result[:len(expectedresult)]
+func TestSetUtilIntersectionCases(t *testing.T) {
+	cases := []struct {
+		name string
+		algo func(a, b, buf []uint16) int
+	}{
+		{
+			name: "onesidedgallopingintersect2by2",
+			algo: onesidedgallopingintersect2by2,
+		},
+		{
+			name: "shotgun4Intersect",
+			algo: shotgun4Intersect,
+		},
+	}
 
-	assert.Equal(t, expectedresult, result)
+	data1 := []uint16{0, 3, 6, 9, 12, 15, 18}
+	data2 := []uint16{0, 2, 4, 6, 8, 10, 12, 14, 16, 18}
+	expected := []uint16{0, 6, 12, 18}
+
+	for _, c := range cases {
+		result := make([]uint16, 0, len(data1)+len(data2))
+		n := c.algo(data1, data2, result)
+
+		assert.Equalf(t, expected, result[:n], "failed algorithm: %s", c.name)
+	}
 }
 
 func TestSetUtilBinarySearch(t *testing.T) {
@@ -119,3 +135,48 @@ func TestSetUtilBinarySearch(t *testing.T) {
 		}
 	}
 }
+
+func BenchmarkIntersectAlgorithms(b *testing.B) {
+	sz1 := 1000
+	s1 := make([]uint16, sz1)
+
+	sz2 := MaxUint16
+	s2 := make([]uint16, sz2)
+
+	for i := 0; i < sz2; i++ {
+		s2[i] = uint16(i)
+	}
+
+	r := rand.New(rand.NewSource(0))
+	k := 0
+
+	for i := 0; i < sz1 && k < sz2; i++ {
+		n := r.Intn(100)
+		k += n
+
+		// prevent adding duplicates
+		if n == 0 && i > 0 {
+			k++
+		}
+
+		s1[i] = uint16(s2[k])
+	}
+
+	buf := make([]uint16, sz1+sz2)
+
+	b.Run("onesidedgallopingintersect2by2", func(b *testing.B) {
+		b.ResetTimer()
+
+		for i := 0; i < b.N; i++ {
+			onesidedgallopingintersect2by2(s1, s2, buf)
+		}
+	})
+
+	b.Run("shotgun4", func(b *testing.B) {
+		b.ResetTimer()
+
+		for i := 0; i < b.N; i++ {
+			shotgun4Intersect(s1, s2, buf)
+		}
+	})
+}

From e03e38d189f0910b3df9ebf7a293dd7d1d6618ca Mon Sep 17 00:00:00 2001
From: Daniel Lemire <lemire@gmail.com>
Date: Wed, 25 Sep 2019 13:42:11 -0400
Subject: [PATCH 2/4] Proving that shotgun works.

---
 serialization_test.go |  2 +-
 setutil.go            | 76 ++++++++++++++++++++++-----------------
 setutil_test.go       | 83 ++++++++++++++++++++++++++++++++++---------
 3 files changed, 111 insertions(+), 50 deletions(-)

diff --git a/serialization_test.go b/serialization_test.go
index 80750736..50392d5b 100644
--- a/serialization_test.go
+++ b/serialization_test.go
@@ -419,7 +419,7 @@ func singleSliceInArray() (*Bitmap, []*Bitmap) {
 
 func singleSlice() *Bitmap {
 	slice := make([]byte, 2)
-	return &Bitmap{highlowcontainer:roaringArray{keys: []uint16{0}, containers: []container{&arrayContainer{ byteSliceAsUint16Slice(slice)}}}}
+	return &Bitmap{highlowcontainer: roaringArray{keys: []uint16{0}, containers: []container{&arrayContainer{byteSliceAsUint16Slice(slice)}}}}
 }
 
 func TestByteSliceAsUint64Slice(t *testing.T) {
diff --git a/setutil.go b/setutil.go
index 2419c866..1bd7cbd7 100644
--- a/setutil.go
+++ b/setutil.go
@@ -582,6 +582,11 @@ mainwhile:
 	return pos
 }
 
+// branchlessComparator returns -1 if x < y, and zero otherwise.
+func branchlessComparator(x, y uint16) int {
+	return (int(x) - int(y)) >> 63
+}
+
 // shotgun4Intersect performs intersection between small and large arrays described in
 // https://lemire.me/blog/2019/01/16/faster-intersections-between-sorted-arrays-with-shotgun/
 func shotgun4Intersect(small, large, buf []uint16) int {
@@ -601,58 +606,65 @@ func shotgun4Intersect(small, large, buf []uint16) int {
 
 		for n > 1 {
 			m := n >> 1
+			l1, l2, l3, l4 := large[idx1+m], large[idx2+m], large[idx3+m], large[idx4+m]
+			idx1 += branchlessComparator(l1, t1) & m
+			idx2 += branchlessComparator(l2, t2) & m
+			idx3 += branchlessComparator(l3, t3) & m
+			idx4 += branchlessComparator(l4, t4) & m
+			n -= m
+		}
 
-			if large[idx1+m] < t1 {
-				idx1 += m
+		l1, l2, l3, l4 := large[idx1], large[idx2], large[idx3], large[idx4]
+		if idx4+1 < nL { // common case
+			idx1 -= branchlessComparator(l1, t1)
+			idx2 -= branchlessComparator(l2, t2)
+			idx3 -= branchlessComparator(l3, t3)
+			idx4 -= branchlessComparator(l4, t4)
+			l1, l2, l3, l4 = large[idx1], large[idx2], large[idx3], large[idx4]
+		} else { // slow path
+			if l1 < t1 {
+				idx1++
+				if idx1 < nL {
+					l1 = large[idx1]
+				}
 			}
-
-			if large[idx2+m] < t2 {
-				idx2 += m
+			if l2 < t2 {
+				idx2++
+				if idx2 < nL {
+					l2 = large[idx2]
+				}
 			}
-
-			if large[idx3+m] < t3 {
-				idx3 += m
+			if l3 < t3 {
+				idx3++
+				if idx3 < nL {
+					l3 = large[idx3]
+				}
 			}
-
-			if large[idx4+m] < t4 {
-				idx4 += m
+			if l4 < t4 {
+				idx4++
+				if idx4 < nL {
+					l4 = large[idx4]
+				}
 			}
 
-			n -= m
-		}
-
-		if large[idx1] < t1 {
-			idx1++
-		}
-
-		if large[idx2] < t2 {
-			idx2++
-		}
-
-		if large[idx3] < t3 {
-			idx3++
-		}
-
-		if large[idx4] < t4 {
-			idx4++
 		}
 
-		if idx1 < nL && large[idx1] == t1 {
+		if l1 == t1 {
 			buf[pos] = t1
 			pos++
 		}
 
-		if idx2 < nL && large[idx2] == t2 {
+		if l2 == t2 {
 			buf[pos] = t2
 			pos++
 		}
 
-		if idx3 < nL && large[idx3] == t3 {
+		if l3 == t3 {
 			buf[pos] = t3
 			pos++
 		}
 
-		if idx4 < nL && large[idx4] == t4 {
+		if l4 == t4 {
 			buf[pos] = t4
 			pos++
 		}
diff --git a/setutil_test.go b/setutil_test.go
index 5b8c508f..3d90d4fa 100644
--- a/setutil_test.go
+++ b/setutil_test.go
@@ -5,6 +5,7 @@ package roaring
 import (
 	"github.com/stretchr/testify/assert"
 	"math/rand"
+	"sort"
 	"testing"
 )
 
@@ -93,6 +94,7 @@ func TestSetUtilIntersection(t *testing.T) {
 	assert.Equal(t, expectedresult, result)
 }
 
+// go test -run TestSetUtilIntersectionCases
 func TestSetUtilIntersectionCases(t *testing.T) {
 	cases := []struct {
 		name string
@@ -136,47 +138,94 @@ func TestSetUtilBinarySearch(t *testing.T) {
 	}
 }
 
+// go test  -bench BenchmarkIntersectAlgorithms -run -
 func BenchmarkIntersectAlgorithms(b *testing.B) {
-	sz1 := 1000
+	// sz1 is the small array
+	sz1 := 64 // this should not be *too* large
 	s1 := make([]uint16, sz1)
 
-	sz2 := MaxUint16
+	// to get more realistic results, we try different
+	// large array sizes. Our benchmark is going to be
+	// an average of those...
+
+	sz2 := 3000
 	s2 := make([]uint16, sz2)
 
-	for i := 0; i < sz2; i++ {
-		s2[i] = uint16(i)
-	}
+	sz3 := 2040
+	s3 := make([]uint16, sz3)
 
-	r := rand.New(rand.NewSource(0))
-	k := 0
+	sz4 := 1200
+	s4 := make([]uint16, sz4)
 
-	for i := 0; i < sz1 && k < sz2; i++ {
-		n := r.Intn(100)
-		k += n
+	r := rand.New(rand.NewSource(1234))
 
-		// prevent adding duplicates
-		if n == 0 && i > 0 {
-			k++
-		}
+	// We are going to populate our large arrays with
+	// random data. Importantly, we need to sort.
+	// There might be a few duplicates, by random chance,
+	// but it should not affect results too much.
 
-		s1[i] = uint16(s2[k])
+	for i := 0; i < sz2; i++ {
+		s2[i] = uint16(r.Intn(MaxUint16))
+	}
+	sort.Sort(uint16Slice(s2))
+
+	for i := 0; i < sz3; i++ {
+		s3[i] = uint16(r.Intn(MaxUint16))
 	}
+	sort.Sort(uint16Slice(s3))
 
-	buf := make([]uint16, sz1+sz2)
+	for i := 0; i < sz4; i++ {
+		s4[i] = uint16(r.Intn(MaxUint16))
+	}
+	sort.Sort(uint16Slice(s4))
+
+	buf := make([]uint16, sz1+sz2+sz3+sz4)
+	commonseed := 123456
+	r = rand.New(rand.NewSource(commonseed)) // we set the same seed in both instances
 
 	b.Run("onesidedgallopingintersect2by2", func(b *testing.B) {
+
 		b.ResetTimer()
 
 		for i := 0; i < b.N; i++ {
+			// this is important: you want to start with a new
+			// small array each time otherwise onesidedgallopingintersect2by2
+			// might benefit from nearly perfect branch prediction, making
+			// the benchmark unrealistic.
+			// This needs to be super fast, which it should be if sz1 is
+			// small enough.
+			for i := 0; i < sz1; i++ {
+				// This needs to be super fast
+				s1[i] = uint16(r.Intn(MaxUint16))
+			}
+			sort.Sort(uint16Slice(s1)) // There might be duplicates, ignore them
+
 			onesidedgallopingintersect2by2(s1, s2, buf)
+			onesidedgallopingintersect2by2(s1, s3, buf)
+			onesidedgallopingintersect2by2(s1, s4, buf)
+
 		}
 	})
+	r = rand.New(rand.NewSource(commonseed)) // we set the same seed in both instances
 
 	b.Run("shotgun4", func(b *testing.B) {
 		b.ResetTimer()
-
 		for i := 0; i < b.N; i++ {
+			// this is important: you want to start with a new
+			// small array each time otherwise onesidedgallopingintersect2by2
+			// might benefit from nearly perfect branch prediction, making
+			// the benchmark unrealistic.
+			// This needs to be super fast, which it should be if sz1 is
+			// small enough.
+			for i := 0; i < sz1; i++ {
+				s1[i] = uint16(r.Intn(MaxUint16))
+			}
+			sort.Sort(uint16Slice(s1)) // There might be duplicates, ignore them
+
 			shotgun4Intersect(s1, s2, buf)
+			shotgun4Intersect(s1, s3, buf)
+			shotgun4Intersect(s1, s4, buf)
+
 		}
 	})
 }

From fede5f78aec554cc8bbacda3d74dc91beba3499b Mon Sep 17 00:00:00 2001
From: Daniel Lemire <lemire@gmail.com>
Date: Wed, 25 Sep 2019 13:49:58 -0400
Subject: [PATCH 3/4] Simple fix.

---
 setutil_test.go | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/setutil_test.go b/setutil_test.go
index 3d90d4fa..a2f391ed 100644
--- a/setutil_test.go
+++ b/setutil_test.go
@@ -181,7 +181,7 @@ func BenchmarkIntersectAlgorithms(b *testing.B) {
 
 	buf := make([]uint16, sz1+sz2+sz3+sz4)
 	commonseed := 123456
-	r = rand.New(rand.NewSource(commonseed)) // we set the same seed in both instances
+	r = rand.New(rand.NewSource(int64(commonseed))) // we set the same seed in both instances
 
 	b.Run("onesidedgallopingintersect2by2", func(b *testing.B) {
 
@@ -206,7 +206,7 @@ func BenchmarkIntersectAlgorithms(b *testing.B) {
 
 		}
 	})
-	r = rand.New(rand.NewSource(commonseed)) // we set the same seed in both instances
+	r = rand.New(rand.NewSource(int64(commonseed))) // we set the same seed in both instances
 
 	b.Run("shotgun4", func(b *testing.B) {
 		b.ResetTimer()

From f4760127538ca2fa412056292a29a60ce2431344 Mon Sep 17 00:00:00 2001
From: Alexander Petrov <alldroll@gmail.com>
Date: Tue, 8 Oct 2019 21:19:25 +0100
Subject: [PATCH 4/4] Added more unit tests, fixed bug with shotgun4Intersect
 (case {1}, {1})

---
 setutil.go      |  5 +++-
 setutil_test.go | 67 ++++++++++++++++++++++++++++++++++++++++++-------
 2 files changed, 62 insertions(+), 10 deletions(-)

diff --git a/setutil.go b/setutil.go
index 1bd7cbd7..ca9c977d 100644
--- a/setutil.go
+++ b/setutil.go
@@ -675,7 +675,10 @@ func shotgun4Intersect(small, large, buf []uint16) int {
 
 	for idxS < nS && idxL < nL {
 		s := small[idxS]
-		idxL = advanceUntil(large, idxL, nL, s)
+
+		if s > large[idxL] {
+			idxL = advanceUntil(large, idxL, nL, s)
+		}
 
 		if idxL == nL {
 			break
diff --git a/setutil_test.go b/setutil_test.go
index a2f391ed..ae7cbad0 100644
--- a/setutil_test.go
+++ b/setutil_test.go
@@ -3,10 +3,11 @@ package roaring
 // to run just these tests: go test -run TestSetUtil*
 
 import (
-	"github.com/stretchr/testify/assert"
 	"math/rand"
 	"sort"
 	"testing"
+
+	"github.com/stretchr/testify/assert"
 )
 
 func TestSetUtilDifference(t *testing.T) {
@@ -96,7 +97,7 @@ func TestSetUtilIntersection(t *testing.T) {
 
 // go test -run TestSetUtilIntersectionCases
 func TestSetUtilIntersectionCases(t *testing.T) {
-	cases := []struct {
+	algorithms := []struct {
 		name string
 		algo func(a, b, buf []uint16) int
 	}{
@@ -110,15 +111,63 @@ func TestSetUtilIntersectionCases(t *testing.T) {
 		},
 	}
 
-	data1 := []uint16{0, 3, 6, 9, 12, 15, 18}
-	data2 := []uint16{0, 2, 4, 6, 8, 10, 12, 14, 16, 18}
-	expected := []uint16{0, 6, 12, 18}
+	cases := []struct {
+		a, b, expected []uint16
+	}{
+		{
+			a:        []uint16{},
+			b:        []uint16{},
+			expected: []uint16{},
+		},
+		{
+			a:        []uint16{1},
+			b:        []uint16{1},
+			expected: []uint16{1},
+		},
+		{
+			a:        []uint16{1},
+			b:        []uint16{2},
+			expected: []uint16{},
+		},
+		{
+			a:        []uint16{1, 2},
+			b:        []uint16{2, 3},
+			expected: []uint16{2},
+		},
+		{
+			a:        []uint16{1, 2, 3},
+			b:        []uint16{0, 2, 4, 6, 8, 10, 12, 14, 16, 18},
+			expected: []uint16{2},
+		},
+		{
+			a:        []uint16{0, 3, 6, 9, 12, 15, 18},
+			b:        []uint16{0, 2, 4, 6, 8, 10, 12, 14, 16, 18},
+			expected: []uint16{0, 6, 12, 18},
+		},
+		{
+			a:        []uint16{0, 3, 6, 9, 12, 15, 18},
+			b:        []uint16{0, 3, 6, 9, 12, 15, 18},
+			expected: []uint16{0, 3, 6, 9, 12, 15, 18},
+		},
+		{
+			a:        []uint16{1, 2, 3, 5, 7, 11, 13, 16, 30, 40, 100, 131, 200},
+			b:        []uint16{10, 60, 100},
+			expected: []uint16{100},
+		},
+		{
+			a:        []uint16{10, 60, 100},
+			b:        []uint16{1, 2, 3, 5, 7, 11, 13, 16, 30, 40, 100, 131, 200},
+			expected: []uint16{100},
+		},
+	}
 
-	for _, c := range cases {
-		result := make([]uint16, 0, len(data1)+len(data2))
-		n := c.algo(data1, data2, result)
+	for _, a := range algorithms {
+		for i, c := range cases {
+			result := make([]uint16, 0, len(c.a)+len(c.b))
+			n := a.algo(c.a, c.b, result)
 
-		assert.Equalf(t, expected, result[:n], "failed algorithm: %s", c.name)
+			assert.Equalf(t, c.expected, result[:n], "test %d fail, algorithm: %s", i+1, a.name)
+		}
 	}
 }