From 2a6a739eb6752ba86b0e31b1ab833431b59898c9 Mon Sep 17 00:00:00 2001 From: Matt Robenolt Date: Thu, 4 Jul 2024 15:39:09 -0700 Subject: [PATCH] go/mysql: performance optimizations in protocol encoding MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This employs a couple of tricks that, combined, seemed fruitful: * Swapping to binary.LittleEndian.Put* on the basic calls gets us a free boost while removing code. The main win from this swap is the slice boundary check, resulting in a massive boost. I kept it inlined, but added my own boundary checking in `writeLenEncInt` since swapping it out here resulted in a very minor performance regression from the current results. I assume this comes from the extra coercion needed to the uint* type, and another reslice. * Reslicing the byte slice early so all future operations work on 0-index rather than pos+ indexing. This seemed to be a pretty sizeable win without needing to do more addition on every operation later to determine the index; the indexes get swapped out for constants. * Read path employs the same early reslicing, but already has explicit bounds checks. * Rewrite `writeZeroes` to utilize the Go memclr optimization. 
``` $ benchstat {old,new}.txt goos: darwin goarch: arm64 pkg: vitess.io/vitess/go/mysql │ old.txt │ new.txt │ │ sec/op │ sec/op vs base │ EncWriteInt/16-bit-10 0.4685n ± 0% 0.3516n ± 0% -24.94% (p=0.000 n=10) EncWriteInt/16-bit-lenencoded-10 2.049n ± 0% 2.049n ± 0% ~ (p=0.972 n=10) EncWriteInt/24-bit-lenencoded-10 1.987n ± 0% 2.056n ± 0% +3.45% (p=0.000 n=10) EncWriteInt/32-bit-10 0.7819n ± 0% 0.3906n ± 0% -50.05% (p=0.000 n=10) EncWriteInt/64-bit-10 1.4080n ± 0% 0.4684n ± 0% -66.73% (p=0.000 n=10) EncWriteInt/64-bit-lenencoded-10 3.126n ± 0% 2.051n ± 0% -34.40% (p=0.000 n=10) EncWriteZeroes/4-bytes-10 2.5030n ± 0% 0.3123n ± 0% -87.52% (p=0.000 n=10) EncWriteZeroes/10-bytes-10 4.3815n ± 0% 0.3120n ± 0% -92.88% (p=0.000 n=10) EncWriteZeroes/23-bytes-10 8.4575n ± 0% 0.3124n ± 0% -96.31% (p=0.000 n=10) EncWriteZeroes/55-bytes-10 20.8750n ± 10% 0.6245n ± 0% -97.01% EncReadInt/16-bit-10 2.050n ± 0% 2.068n ± 1% +0.90% (p=0.001 n=10) EncReadInt/24-bit-10 2.034n ± 0% 2.050n ± 0% +0.76% (p=0.000 n=10) EncReadInt/64-bit-10 2.819n ± 1% 2.187n ± 0% -22.41% (p=0.000 n=10) geomean 2.500n 0.8363n -66.55% ``` Signed-off-by: Matt Robenolt --- go/mysql/encoding.go | 111 +++++++++++++++-------------- go/mysql/encoding_test.go | 142 ++++++++++++++++++++++++++++++++++++-- 2 files changed, 197 insertions(+), 56 deletions(-) diff --git a/go/mysql/encoding.go b/go/mysql/encoding.go index c79580acb39..6b33ffabfc2 100644 --- a/go/mysql/encoding.go +++ b/go/mysql/encoding.go @@ -47,31 +47,37 @@ func lenEncIntSize(i uint64) int { } func writeLenEncInt(data []byte, pos int, i uint64) int { + // reslice at pos to avoid doing arithmetic below + data = data[pos:] + switch { case i < 251: - data[pos] = byte(i) + data[0] = byte(i) return pos + 1 case i < 1<<16: - data[pos] = 0xfc - data[pos+1] = byte(i) - data[pos+2] = byte(i >> 8) + _ = data[2] // early bounds check + data[0] = 0xfc + data[1] = byte(i) + data[2] = byte(i >> 8) return pos + 3 case i < 1<<24: - data[pos] = 0xfd - data[pos+1] = 
byte(i) - data[pos+2] = byte(i >> 8) - data[pos+3] = byte(i >> 16) + _ = data[3] // early bounds check + data[0] = 0xfd + data[1] = byte(i) + data[2] = byte(i >> 8) + data[3] = byte(i >> 16) return pos + 4 default: - data[pos] = 0xfe - data[pos+1] = byte(i) - data[pos+2] = byte(i >> 8) - data[pos+3] = byte(i >> 16) - data[pos+4] = byte(i >> 24) - data[pos+5] = byte(i >> 32) - data[pos+6] = byte(i >> 40) - data[pos+7] = byte(i >> 48) - data[pos+8] = byte(i >> 56) + _ = data[8] // early bounds check + data[0] = 0xfe + data[1] = byte(i) + data[2] = byte(i >> 8) + data[3] = byte(i >> 16) + data[4] = byte(i >> 24) + data[5] = byte(i >> 32) + data[6] = byte(i >> 40) + data[7] = byte(i >> 48) + data[8] = byte(i >> 56) return pos + 9 } } @@ -101,28 +107,17 @@ func writeByte(data []byte, pos int, value byte) int { } func writeUint16(data []byte, pos int, value uint16) int { - data[pos] = byte(value) - data[pos+1] = byte(value >> 8) + binary.LittleEndian.PutUint16(data[pos:], value) return pos + 2 } func writeUint32(data []byte, pos int, value uint32) int { - data[pos] = byte(value) - data[pos+1] = byte(value >> 8) - data[pos+2] = byte(value >> 16) - data[pos+3] = byte(value >> 24) + binary.LittleEndian.PutUint32(data[pos:], value) return pos + 4 } func writeUint64(data []byte, pos int, value uint64) int { - data[pos] = byte(value) - data[pos+1] = byte(value >> 8) - data[pos+2] = byte(value >> 16) - data[pos+3] = byte(value >> 24) - data[pos+4] = byte(value >> 32) - data[pos+5] = byte(value >> 40) - data[pos+6] = byte(value >> 48) - data[pos+7] = byte(value >> 56) + binary.LittleEndian.PutUint64(data[pos:], value) return pos + 8 } @@ -137,10 +132,16 @@ func writeLenEncString(data []byte, pos int, value string) int { } func writeZeroes(data []byte, pos int, len int) int { - for i := 0; i < len; i++ { - data[pos+i] = 0 + // XXX: This implementation is optimized to leverage + // the go compiler's memclr pattern, see: https://github.com/golang/go/issues/5373 + end := pos + len + 
data = data[pos:end] + + for i := range data { + data[i] = 0 } - return pos + len + + return end } // @@ -228,6 +229,7 @@ func readFixedLenUint64(data []byte) (uint64, bool) { case 3: // 2 bytes return uint64(binary.LittleEndian.Uint16(data[1:])), true case 4: // 3 bytes + _ = data[3] // early bounds check return uint64(data[1]) | uint64(data[2])<<8 | uint64(data[3])<<16, true @@ -242,37 +244,42 @@ func readLenEncInt(data []byte, pos int) (uint64, int, bool) { if pos >= len(data) { return 0, 0, false } - switch data[pos] { + + // reslice to avoid arithmetic below + data = data[pos:] + + switch data[0] { case 0xfc: // Encoded in the next 2 bytes. - if pos+2 >= len(data) { + if 2 >= len(data) { return 0, 0, false } - return uint64(data[pos+1]) | - uint64(data[pos+2])<<8, pos + 3, true + return uint64(data[1]) | + uint64(data[2])<<8, pos + 3, true case 0xfd: // Encoded in the next 3 bytes. - if pos+3 >= len(data) { + if 3 >= len(data) { return 0, 0, false } - return uint64(data[pos+1]) | - uint64(data[pos+2])<<8 | - uint64(data[pos+3])<<16, pos + 4, true + return uint64(data[1]) | + uint64(data[2])<<8 | + uint64(data[3])<<16, pos + 4, true case 0xfe: // Encoded in the next 8 bytes. 
- if pos+8 >= len(data) { + if 8 >= len(data) { return 0, 0, false } - return uint64(data[pos+1]) | - uint64(data[pos+2])<<8 | - uint64(data[pos+3])<<16 | - uint64(data[pos+4])<<24 | - uint64(data[pos+5])<<32 | - uint64(data[pos+6])<<40 | - uint64(data[pos+7])<<48 | - uint64(data[pos+8])<<56, pos + 9, true + return uint64(data[1]) | + uint64(data[2])<<8 | + uint64(data[3])<<16 | + uint64(data[4])<<24 | + uint64(data[5])<<32 | + uint64(data[6])<<40 | + uint64(data[7])<<48 | + uint64(data[8])<<56, pos + 9, true + default: + return uint64(data[0]), pos + 1, true } - return uint64(data[pos]), pos + 1, true } func readLenEncString(data []byte, pos int) (string, int, bool) { diff --git a/go/mysql/encoding_test.go b/go/mysql/encoding_test.go index c0081a6455b..41f6c416993 100644 --- a/go/mysql/encoding_test.go +++ b/go/mysql/encoding_test.go @@ -96,7 +96,6 @@ func TestEncUint16(t *testing.T) { _, _, ok = readUint16(data, 9) assert.False(t, ok, "readUint16 returned ok=true for shorter value") - } func TestEncBytes(t *testing.T) { @@ -122,7 +121,6 @@ func TestEncBytes(t *testing.T) { _, _, ok = readBytes(data, 9, 2) assert.False(t, ok, "readBytes returned ok=true for shorter value") - } func TestEncUint32(t *testing.T) { @@ -145,7 +143,6 @@ func TestEncUint32(t *testing.T) { _, _, ok = readUint32(data, 7) assert.False(t, ok, "readUint32 returned ok=true for shorter value") - } func TestEncUint64(t *testing.T) { @@ -169,7 +166,6 @@ func TestEncUint64(t *testing.T) { _, _, ok = readUint64(data, 7) assert.False(t, ok, "readUint64 returned ok=true for shorter value") - } func TestEncString(t *testing.T) { @@ -317,3 +313,141 @@ func TestEncString(t *testing.T) { } } } + +func TestWriteZeroes(t *testing.T) { + buf := make([]byte, 32) + resetBuf := func() { + t.Helper() + for i := range len(buf) { + buf[i] = 'f' + } + } + + allMatch := func(b []byte, c byte) bool { + for i := range b { + if b[i] != c { + return false + } + } + return true + } + + t.Run("0-offset", func(t 
*testing.T) { + for _, size := range []int{4, 10, 23, 24, 25, 26, 27} { + resetBuf() + pos := writeZeroes(buf, 0, size) + assert.Equal(t, size, pos, "expected to advance pos to %d, got %d", size, pos) + assert.True(t, allMatch(buf[:pos], 0), "buffer should be zeroes, %v", buf[:pos]) + assert.True(t, allMatch(buf[pos:], 'f'), "buffer should be dirty, %v", buf[pos:]) + } + }) + + t.Run("3-offset", func(t *testing.T) { + offset := 3 + for _, size := range []int{4, 10, 23, 24, 25, 26, 27} { + resetBuf() + pos := writeZeroes(buf, offset, size) + assert.Equal(t, offset+size, pos, "expected to advance pos to %d, got %d", offset+size, pos) + assert.True(t, allMatch(buf[:offset], 'f'), "buffer should be dirty, %v", buf[offset:pos]) + assert.True(t, allMatch(buf[offset:pos], 0), "buffer should be zeroes, %v", buf[:pos]) + assert.True(t, allMatch(buf[pos:], 'f'), "buffer should be dirty, %v", buf[pos:]) + } + }) +} + +func BenchmarkEncWriteInt(b *testing.B) { + buf := make([]byte, 16) + + b.Run("16-bit", func(b *testing.B) { + value := uint16(0x0100) + for range b.N { + _ = writeUint16(buf, 0, value) + } + }) + + b.Run("16-bit-lenencoded", func(b *testing.B) { + value := uint64(0x0100) + for range b.N { + _ = writeLenEncInt(buf, 0, value) + } + }) + + b.Run("24-bit-lenencoded", func(b *testing.B) { + value := uint64(0xabcdef) + for range b.N { + _ = writeLenEncInt(buf, 0, value) + } + }) + + b.Run("32-bit", func(b *testing.B) { + value := uint32(0xabcdef) + for range b.N { + _ = writeUint32(buf, 0, value) + } + }) + + b.Run("64-bit", func(b *testing.B) { + value := uint64(0xa0a1a2a3a4a5a6a7) + for range b.N { + _ = writeUint64(buf, 0, value) + } + }) + + b.Run("64-bit-lenencoded", func(b *testing.B) { + value := uint64(0xa0a1a2a3a4a5a6a7) + for range b.N { + _ = writeLenEncInt(buf, 0, value) + } + }) +} + +func BenchmarkEncWriteZeroes(b *testing.B) { + buf := make([]byte, 128) + + b.Run("4-bytes", func(b *testing.B) { + for range b.N { + _ = writeZeroes(buf, 16, 4) + } + }) + 
+ b.Run("10-bytes", func(b *testing.B) { + for range b.N { + _ = writeZeroes(buf, 16, 10) + } + }) + + b.Run("23-bytes", func(b *testing.B) { + for range b.N { + _ = writeZeroes(buf, 16, 23) + } + }) + + b.Run("55-bytes", func(b *testing.B) { + for range b.N { + _ = writeZeroes(buf, 16, 55) + } + }) +} + +func BenchmarkEncReadInt(b *testing.B) { + b.Run("16-bit", func(b *testing.B) { + data := []byte{0xfc, 0xfb, 0x00} + for range b.N { + _, _, _ = readLenEncInt(data, 0) + } + }) + + b.Run("24-bit", func(b *testing.B) { + data := []byte{0xfd, 0x00, 0x00, 0x01} + for range b.N { + _, _, _ = readLenEncInt(data, 0) + } + }) + + b.Run("64-bit", func(b *testing.B) { + data := []byte{0xfe, 0xa7, 0xa6, 0xa5, 0xa4, 0xa3, 0xa2, 0xa1, 0xa0} + for range b.N { + _, _, _ = readLenEncInt(data, 0) + } + }) +}