From 2a6a739eb6752ba86b0e31b1ab833431b59898c9 Mon Sep 17 00:00:00 2001
From: Matt Robenolt <matt@ydekproductions.com>
Date: Thu, 4 Jul 2024 15:39:09 -0700
Subject: [PATCH] go/mysql: performance optimizations in protocol encoding
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This employs a couple of tricks that, combined, proved fruitful:

* Swapping to binary.LittleEndian.Put* on the basic calls gets us a free
  boost while removing code. The main win from this swap is the slice
  boundary check, resulting in a massive boost. I kept it inlined, but
  added my own boundary checking in `writeLenEncInt` since swapping it
  out here resulted in a very minor performance regression from the
  current results. I assume this comes from the extra coercion needed to
  the uint* type, and another reslice.
* Reslicing the byte slice early so all future operations work on
  0-index rather than pos+ indexing. This seemed to be a pretty sizeable
  win: later operations no longer need an addition to determine each
  index, since the offsets become constants.
* Read path employs the same early reslicing, but already has explicit
  bounds checks.
* Rewrite `writeZeroes` to utilize the Go memclr optimization.

```
$ benchstat {old,new}.txt
goos: darwin
goarch: arm64
pkg: vitess.io/vitess/go/mysql
                                 │    old.txt     │               new.txt                │
                                 │     sec/op     │    sec/op     vs base                │
EncWriteInt/16-bit-10               0.4685n ±  0%   0.3516n ± 0%  -24.94% (p=0.000 n=10)
EncWriteInt/16-bit-lenencoded-10     2.049n ±  0%    2.049n ± 0%        ~ (p=0.972 n=10)
EncWriteInt/24-bit-lenencoded-10     1.987n ±  0%    2.056n ± 0%   +3.45% (p=0.000 n=10)
EncWriteInt/32-bit-10               0.7819n ±  0%   0.3906n ± 0%  -50.05% (p=0.000 n=10)
EncWriteInt/64-bit-10               1.4080n ±  0%   0.4684n ± 0%  -66.73% (p=0.000 n=10)
EncWriteInt/64-bit-lenencoded-10     3.126n ±  0%    2.051n ± 0%  -34.40% (p=0.000 n=10)
EncWriteZeroes/4-bytes-10           2.5030n ±  0%   0.3123n ± 0%  -87.52% (p=0.000 n=10)
EncWriteZeroes/10-bytes-10          4.3815n ±  0%   0.3120n ± 0%  -92.88% (p=0.000 n=10)
EncWriteZeroes/23-bytes-10          8.4575n ±  0%   0.3124n ± 0%  -96.31% (p=0.000 n=10)
EncWriteZeroes/55-bytes-10         20.8750n ± 10%   0.6245n ± 0%  -97.01%
EncReadInt/16-bit-10                 2.050n ±  0%    2.068n ± 1%   +0.90% (p=0.001 n=10)
EncReadInt/24-bit-10                 2.034n ±  0%    2.050n ± 0%   +0.76% (p=0.000 n=10)
EncReadInt/64-bit-10                 2.819n ±  1%    2.187n ± 0%  -22.41% (p=0.000 n=10)
geomean                              2.500n         0.8363n       -66.55%
```

Signed-off-by: Matt Robenolt <matt@ydekproductions.com>
---
 go/mysql/encoding.go      | 111 +++++++++++++++--------------
 go/mysql/encoding_test.go | 142 ++++++++++++++++++++++++++++++++++++--
 2 files changed, 197 insertions(+), 56 deletions(-)

diff --git a/go/mysql/encoding.go b/go/mysql/encoding.go
index c79580acb39..6b33ffabfc2 100644
--- a/go/mysql/encoding.go
+++ b/go/mysql/encoding.go
@@ -47,31 +47,37 @@ func lenEncIntSize(i uint64) int {
 }
 
 func writeLenEncInt(data []byte, pos int, i uint64) int {
+	// reslice at pos to avoid doing arithmetic below
+	data = data[pos:]
+
 	switch {
 	case i < 251:
-		data[pos] = byte(i)
+		data[0] = byte(i)
 		return pos + 1
 	case i < 1<<16:
-		data[pos] = 0xfc
-		data[pos+1] = byte(i)
-		data[pos+2] = byte(i >> 8)
+		_ = data[2] // early bounds check
+		data[0] = 0xfc
+		data[1] = byte(i)
+		data[2] = byte(i >> 8)
 		return pos + 3
 	case i < 1<<24:
-		data[pos] = 0xfd
-		data[pos+1] = byte(i)
-		data[pos+2] = byte(i >> 8)
-		data[pos+3] = byte(i >> 16)
+		_ = data[3] // early bounds check
+		data[0] = 0xfd
+		data[1] = byte(i)
+		data[2] = byte(i >> 8)
+		data[3] = byte(i >> 16)
 		return pos + 4
 	default:
-		data[pos] = 0xfe
-		data[pos+1] = byte(i)
-		data[pos+2] = byte(i >> 8)
-		data[pos+3] = byte(i >> 16)
-		data[pos+4] = byte(i >> 24)
-		data[pos+5] = byte(i >> 32)
-		data[pos+6] = byte(i >> 40)
-		data[pos+7] = byte(i >> 48)
-		data[pos+8] = byte(i >> 56)
+		_ = data[8] // early bounds check
+		data[0] = 0xfe
+		data[1] = byte(i)
+		data[2] = byte(i >> 8)
+		data[3] = byte(i >> 16)
+		data[4] = byte(i >> 24)
+		data[5] = byte(i >> 32)
+		data[6] = byte(i >> 40)
+		data[7] = byte(i >> 48)
+		data[8] = byte(i >> 56)
 		return pos + 9
 	}
 }
@@ -101,28 +107,17 @@ func writeByte(data []byte, pos int, value byte) int {
 }
 
 func writeUint16(data []byte, pos int, value uint16) int {
-	data[pos] = byte(value)
-	data[pos+1] = byte(value >> 8)
+	binary.LittleEndian.PutUint16(data[pos:], value)
 	return pos + 2
 }
 
 func writeUint32(data []byte, pos int, value uint32) int {
-	data[pos] = byte(value)
-	data[pos+1] = byte(value >> 8)
-	data[pos+2] = byte(value >> 16)
-	data[pos+3] = byte(value >> 24)
+	binary.LittleEndian.PutUint32(data[pos:], value)
 	return pos + 4
 }
 
 func writeUint64(data []byte, pos int, value uint64) int {
-	data[pos] = byte(value)
-	data[pos+1] = byte(value >> 8)
-	data[pos+2] = byte(value >> 16)
-	data[pos+3] = byte(value >> 24)
-	data[pos+4] = byte(value >> 32)
-	data[pos+5] = byte(value >> 40)
-	data[pos+6] = byte(value >> 48)
-	data[pos+7] = byte(value >> 56)
+	binary.LittleEndian.PutUint64(data[pos:], value)
 	return pos + 8
 }
 
@@ -137,10 +132,16 @@ func writeLenEncString(data []byte, pos int, value string) int {
 }
 
 func writeZeroes(data []byte, pos int, len int) int {
-	for i := 0; i < len; i++ {
-		data[pos+i] = 0
+	// XXX: This implementation is optimized to leverage
+	// the go compiler's memclr pattern, see: https://github.com/golang/go/issues/5373
+	end := pos + len
+	data = data[pos:end]
+
+	for i := range data {
+		data[i] = 0
 	}
-	return pos + len
+
+	return end
 }
 
 //
@@ -228,6 +229,7 @@ func readFixedLenUint64(data []byte) (uint64, bool) {
 	case 3: // 2 bytes
 		return uint64(binary.LittleEndian.Uint16(data[1:])), true
 	case 4: // 3 bytes
+		_ = data[3] // early bounds check
 		return uint64(data[1]) |
 			uint64(data[2])<<8 |
 			uint64(data[3])<<16, true
@@ -242,37 +244,42 @@ func readLenEncInt(data []byte, pos int) (uint64, int, bool) {
 	if pos >= len(data) {
 		return 0, 0, false
 	}
-	switch data[pos] {
+
+	// reslice to avoid arithmetic below
+	data = data[pos:]
+
+	switch data[0] {
 	case 0xfc:
 		// Encoded in the next 2 bytes.
-		if pos+2 >= len(data) {
+		if 2 >= len(data) {
 			return 0, 0, false
 		}
-		return uint64(data[pos+1]) |
-			uint64(data[pos+2])<<8, pos + 3, true
+		return uint64(data[1]) |
+			uint64(data[2])<<8, pos + 3, true
 	case 0xfd:
 		// Encoded in the next 3 bytes.
-		if pos+3 >= len(data) {
+		if 3 >= len(data) {
 			return 0, 0, false
 		}
-		return uint64(data[pos+1]) |
-			uint64(data[pos+2])<<8 |
-			uint64(data[pos+3])<<16, pos + 4, true
+		return uint64(data[1]) |
+			uint64(data[2])<<8 |
+			uint64(data[3])<<16, pos + 4, true
 	case 0xfe:
 		// Encoded in the next 8 bytes.
-		if pos+8 >= len(data) {
+		if 8 >= len(data) {
 			return 0, 0, false
 		}
-		return uint64(data[pos+1]) |
-			uint64(data[pos+2])<<8 |
-			uint64(data[pos+3])<<16 |
-			uint64(data[pos+4])<<24 |
-			uint64(data[pos+5])<<32 |
-			uint64(data[pos+6])<<40 |
-			uint64(data[pos+7])<<48 |
-			uint64(data[pos+8])<<56, pos + 9, true
+		return uint64(data[1]) |
+			uint64(data[2])<<8 |
+			uint64(data[3])<<16 |
+			uint64(data[4])<<24 |
+			uint64(data[5])<<32 |
+			uint64(data[6])<<40 |
+			uint64(data[7])<<48 |
+			uint64(data[8])<<56, pos + 9, true
+	default:
+		return uint64(data[0]), pos + 1, true
 	}
-	return uint64(data[pos]), pos + 1, true
 }
 
 func readLenEncString(data []byte, pos int) (string, int, bool) {
diff --git a/go/mysql/encoding_test.go b/go/mysql/encoding_test.go
index c0081a6455b..41f6c416993 100644
--- a/go/mysql/encoding_test.go
+++ b/go/mysql/encoding_test.go
@@ -96,7 +96,6 @@ func TestEncUint16(t *testing.T) {
 
 	_, _, ok = readUint16(data, 9)
 	assert.False(t, ok, "readUint16 returned ok=true for shorter value")
-
 }
 
 func TestEncBytes(t *testing.T) {
@@ -122,7 +121,6 @@ func TestEncBytes(t *testing.T) {
 
 	_, _, ok = readBytes(data, 9, 2)
 	assert.False(t, ok, "readBytes returned ok=true for shorter value")
-
 }
 
 func TestEncUint32(t *testing.T) {
@@ -145,7 +143,6 @@ func TestEncUint32(t *testing.T) {
 
 	_, _, ok = readUint32(data, 7)
 	assert.False(t, ok, "readUint32 returned ok=true for shorter value")
-
 }
 
 func TestEncUint64(t *testing.T) {
@@ -169,7 +166,6 @@ func TestEncUint64(t *testing.T) {
 
 	_, _, ok = readUint64(data, 7)
 	assert.False(t, ok, "readUint64 returned ok=true for shorter value")
-
 }
 
 func TestEncString(t *testing.T) {
@@ -317,3 +313,141 @@ func TestEncString(t *testing.T) {
 		}
 	}
 }
+
+func TestWriteZeroes(t *testing.T) {
+	buf := make([]byte, 32) // scratch buffer shared by both subtests
+	resetBuf := func() { // refills buf with 'f' so untouched bytes are detectable
+		t.Helper()
+		for i := range len(buf) {
+			buf[i] = 'f'
+		}
+	}
+
+	allMatch := func(b []byte, c byte) bool { // reports whether every byte of b equals c
+		for i := range b {
+			if b[i] != c {
+				return false
+			}
+		}
+		return true
+	}
+
+	t.Run("0-offset", func(t *testing.T) {
+		for _, size := range []int{4, 10, 23, 24, 25, 26, 27} {
+			resetBuf()
+			pos := writeZeroes(buf, 0, size)
+			assert.Equal(t, size, pos, "expected to advance pos to %d, got %d", size, pos)
+			assert.True(t, allMatch(buf[:pos], 0), "buffer should be zeroes, %v", buf[:pos])
+			assert.True(t, allMatch(buf[pos:], 'f'), "buffer should be dirty, %v", buf[pos:])
+		}
+	})
+
+	t.Run("3-offset", func(t *testing.T) {
+		offset := 3 // writes start here; buf[:offset] must stay untouched
+		for _, size := range []int{4, 10, 23, 24, 25, 26, 27} {
+			resetBuf()
+			pos := writeZeroes(buf, offset, size)
+			assert.Equal(t, offset+size, pos, "expected to advance pos to %d, got %d", offset+size, pos)
+			assert.True(t, allMatch(buf[:offset], 'f'), "buffer should be dirty, %v", buf[:offset])
+			assert.True(t, allMatch(buf[offset:pos], 0), "buffer should be zeroes, %v", buf[offset:pos])
+			assert.True(t, allMatch(buf[pos:], 'f'), "buffer should be dirty, %v", buf[pos:])
+		}
+	})
+}
+
+func BenchmarkEncWriteInt(b *testing.B) {
+	buf := make([]byte, 16)
+
+	b.Run("16-bit", func(b *testing.B) {
+		value := uint16(0x0100)
+		for range b.N {
+			_ = writeUint16(buf, 0, value)
+		}
+	})
+
+	b.Run("16-bit-lenencoded", func(b *testing.B) {
+		value := uint64(0x0100)
+		for range b.N {
+			_ = writeLenEncInt(buf, 0, value)
+		}
+	})
+
+	b.Run("24-bit-lenencoded", func(b *testing.B) {
+		value := uint64(0xabcdef)
+		for range b.N {
+			_ = writeLenEncInt(buf, 0, value)
+		}
+	})
+
+	b.Run("32-bit", func(b *testing.B) {
+		value := uint32(0xabcdef)
+		for range b.N {
+			_ = writeUint32(buf, 0, value)
+		}
+	})
+
+	b.Run("64-bit", func(b *testing.B) {
+		value := uint64(0xa0a1a2a3a4a5a6a7)
+		for range b.N {
+			_ = writeUint64(buf, 0, value)
+		}
+	})
+
+	b.Run("64-bit-lenencoded", func(b *testing.B) {
+		value := uint64(0xa0a1a2a3a4a5a6a7)
+		for range b.N {
+			_ = writeLenEncInt(buf, 0, value)
+		}
+	})
+}
+
+func BenchmarkEncWriteZeroes(b *testing.B) {
+	buf := make([]byte, 128)
+
+	b.Run("4-bytes", func(b *testing.B) {
+		for range b.N {
+			_ = writeZeroes(buf, 16, 4)
+		}
+	})
+
+	b.Run("10-bytes", func(b *testing.B) {
+		for range b.N {
+			_ = writeZeroes(buf, 16, 10)
+		}
+	})
+
+	b.Run("23-bytes", func(b *testing.B) {
+		for range b.N {
+			_ = writeZeroes(buf, 16, 23)
+		}
+	})
+
+	b.Run("55-bytes", func(b *testing.B) {
+		for range b.N {
+			_ = writeZeroes(buf, 16, 55)
+		}
+	})
+}
+
+func BenchmarkEncReadInt(b *testing.B) {
+	b.Run("16-bit", func(b *testing.B) {
+		data := []byte{0xfc, 0xfb, 0x00}
+		for range b.N {
+			_, _, _ = readLenEncInt(data, 0)
+		}
+	})
+
+	b.Run("24-bit", func(b *testing.B) {
+		data := []byte{0xfd, 0x00, 0x00, 0x01}
+		for range b.N {
+			_, _, _ = readLenEncInt(data, 0)
+		}
+	})
+
+	b.Run("64-bit", func(b *testing.B) {
+		data := []byte{0xfe, 0xa7, 0xa6, 0xa5, 0xa4, 0xa3, 0xa2, 0xa1, 0xa0}
+		for range b.N {
+			_, _, _ = readLenEncInt(data, 0)
+		}
+	})
+}