Closed
Description
Go version
go version go1.24.2 darwin/amd64
Output of go env
in your module/workspace:
AR='ar'
CC='cc'
CGO_CFLAGS='-O2 -g'
CGO_CPPFLAGS=''
CGO_CXXFLAGS='-O2 -g'
CGO_ENABLED='1'
CGO_FFLAGS='-O2 -g'
CGO_LDFLAGS='-O2 -g'
CXX='c++'
GCCGO='gccgo'
GO111MODULE=''
GOAMD64='v1'
GOARCH='amd64'
GOAUTH='netrc'
GOBIN='/Users/jcretel/go/bin'
GOCACHE='/Users/jcretel/Library/Caches/go-build'
GOCACHEPROG=''
GODEBUG=''
GOENV='/Users/jcretel/Library/Application Support/go/env'
GOEXE=''
GOEXPERIMENT=''
GOFIPS140='off'
GOFLAGS=''
GOGCCFLAGS='-fPIC -arch x86_64 -m64 -pthread -fno-caret-diagnostics -Qunused-arguments -fmessage-length=0 -ffile-prefix-map=/var/folders/0k/mmhg_4vd4rxdzzxp8hr1564r0000gn/T/go-build2984084042=/tmp/go-build -gno-record-gcc-switches -fno-common'
GOHOSTARCH='amd64'
GOHOSTOS='darwin'
GOINSECURE=''
GOMOD='/Users/jcretel/Desktop/genDecodeRune/go.mod'
GOMODCACHE='/Users/jcretel/go/pkg/mod'
GONOPROXY=''
GONOSUMDB=''
GOOS='darwin'
GOPATH='/Users/jcretel/go'
GOPRIVATE=''
GOPROXY='https://proxy.golang.org,direct'
GOROOT='/usr/local/Cellar/go/1.24.2/libexec'
GOSUMDB='sum.golang.org'
GOTELEMETRY='on'
GOTELEMETRYDIR='/Users/jcretel/Library/Application Support/go/telemetry'
GOTMPDIR=''
GOTOOLCHAIN='auto'
GOTOOLDIR='/usr/local/Cellar/go/1.24.2/libexec/pkg/tool/darwin_amd64'
GOVCS=''
GOVERSION='go1.24.2'
GOWORK=''
PKG_CONFIG='pkg-config'
What did you do?
(The complete code is available in this Gist.)
I eliminated the code duplication between utf8.DecodeRune
and utf8.DecodeRuneInString
by delegating to a generic function with a string | []byte
type constraint:
package utf8
// DecodeRune decodes the first UTF-8 encoded rune in p. It forwards p to the
// generic genDecodeRune and returns that function's rune and size unchanged.
func DecodeRune(p []byte) (r rune, size int) {
return genDecodeRune(p)
}
// DecodeRuneInString is the string counterpart of DecodeRune: it forwards s to
// the generic genDecodeRune and returns that function's rune and size unchanged.
func DecodeRuneInString(s string) (r rune, size int) {
return genDecodeRune(s)
}
// genDecodeRune is the shared implementation behind DecodeRune and
// DecodeRuneInString. It decodes the rune that starts at s[0] and returns it
// together with the number of bytes consumed.
//
// Results:
//   - (RuneError, 0) when s is empty;
//   - (RuneError, 1) when s is shorter than the length announced by the first
//     byte, or when a continuation byte falls outside its accepted range;
//   - otherwise the decoded rune and its width, from 1 to 4 bytes.
//
// The lookup tables and constants used here (first, as, acceptRanges, locb,
// hicb, mask2, mask3, mask4, maskx) are declared elsewhere in the package.
func genDecodeRune[ByteString string | []byte](s ByteString) (r rune, size int) {
// copied from https://cs.opensource.google/go/go/+/refs/tags/go1.24.2:src/unicode/utf8/utf8.go;l=205
n := len(s)
if n < 1 {
return RuneError, 0
}
s0 := s[0]
// x is the table entry describing the first byte.
x := first[s0]
if x >= as {
// The following code simulates an additional check for x == xx and
// handling the ASCII and invalid cases accordingly. This mask-and-or
// approach prevents an additional branch.
// NOTE(review): presumably as marks ASCII and xx marks an invalid first
// byte, differing only in the low bit; confirm against the table
// definitions, which are not shown here.
mask := rune(x) << 31 >> 31 // Arithmetic shift: all bits clear if the low bit of x is 0, all bits set if it is 1.
// Selects s[0] when mask is zero and RuneError when mask is all ones.
return rune(s[0])&^mask | RuneError&mask, 1
}
// The low three bits of x give the encoded length; the high four bits pick
// the accepted range for the second byte.
sz := int(x & 7)
accept := acceptRanges[x>>4]
if n < sz {
// Input is shorter than the sequence length announced by the first byte.
return RuneError, 1
}
s1 := s[1]
if s1 < accept.lo || accept.hi < s1 {
return RuneError, 1
}
if sz <= 2 { // <= instead of == to help the compiler eliminate some bounds checks
return rune(s0&mask2)<<6 | rune(s1&maskx), 2
}
s2 := s[2]
if s2 < locb || hicb < s2 {
return RuneError, 1
}
if sz <= 3 {
return rune(s0&mask3)<<12 | rune(s1&maskx)<<6 | rune(s2&maskx), 3
}
s3 := s[3]
if s3 < locb || hicb < s3 {
return RuneError, 1
}
// Four-byte sequence: combine the payload bits of all four bytes.
return rune(s0&mask4)<<18 | rune(s1&maskx)<<12 | rune(s2&maskx)<<6 | rune(s3&maskx), 4
}
// rest omitted
I also wrote some benchmarks:
package utf8_test
import (
"strconv"
"testing"
. "gist.github.com/jub0bs/8caddb242ddd776f5b30d5fd2bbb836e"
)
// ValidTest pairs an input string with a flag recording whether that input
// is expected to be valid UTF-8.
type ValidTest struct {
	in  string
	out bool
}

// validTests is the input table shared by the table-driven benchmarks.
// Entries cover empty, ASCII, multi-byte, truncated, out-of-range, overlong
// and surrogate inputs.
var validTests = []ValidTest{
	{in: "", out: true},
	{in: "a", out: true},
	{in: "abc", out: true},
	{in: "Ж", out: true},
	{in: "ЖЖ", out: true},
	{in: "брэд-ЛГТМ", out: true},
	{in: "☺☻☹", out: true},
	{in: "aa\xe2", out: false},
	{in: "B\xfa", out: false},
	{in: "B\xfaC", out: false},
	{in: "a\uFFFDb", out: true},
	{in: "\xF4\x8F\xBF\xBF", out: true},      // U+10FFFF
	{in: "\xF4\x90\x80\x80", out: false},     // U+10FFFF+1; out of range
	{in: "\xF7\xBF\xBF\xBF", out: false},     // 0x1FFFFF; out of range
	{in: "\xFB\xBF\xBF\xBF\xBF", out: false}, // 0x3FFFFFF; out of range
	{in: "\xc0\x80", out: false},             // U+0000 encoded in two bytes: incorrect
	{in: "\xed\xa0\x80", out: false},         // U+D800 high surrogate (sic)
	{in: "\xed\xbf\xbf", out: false},         // U+DFFF low surrogate (sic)
}
// BenchmarkDecodeRune times DecodeRune on the byte-slice form of every
// validTests input. Each input runs as its own sub-benchmark, named after
// the quoted input string.
func BenchmarkDecodeRune(b *testing.B) {
	for i := range validTests {
		p := []byte(validTests[i].in)
		name := strconv.Quote(validTests[i].in)
		b.Run(name, func(b *testing.B) {
			for b.Loop() {
				DecodeRune(p)
			}
		})
	}
}
// BenchmarkDecodeRuneInString times DecodeRuneInString on every validTests
// input. Each input runs as its own sub-benchmark, named after the quoted
// input string.
func BenchmarkDecodeRuneInString(b *testing.B) {
	for i := range validTests {
		tt := validTests[i]
		name := strconv.Quote(tt.in)
		b.Run(name, func(b *testing.B) {
			for b.Loop() {
				DecodeRuneInString(tt.in)
			}
		})
	}
}
// BenchmarkDecodeASCIIRune times DecodeRune on a one-byte slice holding the
// ASCII letter 'a'.
func BenchmarkDecodeASCIIRune(b *testing.B) {
	ascii := []byte("a")
	for b.Loop() {
		DecodeRune(ascii)
	}
}
// BenchmarkDecodeJapaneseRune times DecodeRune on the UTF-8 bytes of the
// single character "本".
func BenchmarkDecodeJapaneseRune(b *testing.B) {
	const sample = "本"
	p := []byte(sample)
	for b.Loop() {
		DecodeRune(p)
	}
}
What did you see happen?
I see quite a large performance gap between Go 1.24.2 ("old") and this implementation ("new"), which is slower in every benchmark:
goos: darwin
goarch: amd64
pkg: gist.github.com/jub0bs/8caddb242ddd776f5b30d5fd2bbb836e
cpu: Intel(R) Core(TM) i7-6700HQ CPU @ 2.60GHz
│ old │ new │
│ sec/op │ sec/op vs base │
DecodeRune/""-8 1.966n ± 2% 3.286n ± 1% +67.18% (p=0.000 n=10)
DecodeRune/"a"-8 2.650n ± 1% 3.841n ± 1% +44.96% (p=0.000 n=10)
DecodeRune/"abc"-8 2.647n ± 1% 3.831n ± 0% +44.68% (p=0.000 n=10)
DecodeRune/"Ж"-8 3.818n ± 1% 5.014n ± 1% +31.34% (p=0.000 n=10)
DecodeRune/"ЖЖ"-8 3.821n ± 1% 4.997n ± 1% +30.78% (p=0.000 n=10)
DecodeRune/"брэд-ЛГТМ"-8 3.822n ± 1% 4.982n ± 0% +30.35% (p=0.000 n=10)
DecodeRune/"☺☻☹"-8 4.407n ± 0% 5.577n ± 1% +26.56% (p=0.000 n=10)
DecodeRune/"aa\xe2"-8 2.646n ± 0% 3.826n ± 0% +44.62% (p=0.000 n=10)
DecodeRune/"B\xfa"-8 2.636n ± 0% 3.845n ± 2% +45.86% (p=0.000 n=10)
DecodeRune/"B\xfaC"-8 2.643n ± 1% 3.844n ± 1% +45.47% (p=0.000 n=10)
DecodeRune/"a�b"-8 2.639n ± 0% 3.828n ± 1% +45.05% (p=0.000 n=10)
DecodeRune/"\U0010ffff"-8 4.686n ± 0% 6.227n ± 1% +32.87% (p=0.000 n=10)
DecodeRune/"\xf4\x90\x80\x80"-8 3.526n ± 0% 5.071n ± 1% +43.82% (p=0.000 n=10)
DecodeRune/"\xf7\xbf\xbf\xbf"-8 2.646n ± 1% 4.317n ± 1% +63.16% (p=0.000 n=10)
DecodeRune/"\xfb\xbf\xbf\xbf\xbf"-8 2.643n ± 1% 4.293n ± 1% +62.43% (p=0.000 n=10)
DecodeRune/"\xc0\x80"-8 2.635n ± 1% 3.821n ± 1% +45.02% (p=0.000 n=10)
DecodeRune/"\xed\xa0\x80"-8 3.512n ± 1% 4.702n ± 0% +33.87% (p=0.000 n=10)
DecodeRune/"\xed\xbf\xbf"-8 3.514n ± 1% 4.682n ± 1% +33.21% (p=0.000 n=10)
DecodeRuneInString/""-8 2.347n ± 1% 3.051n ± 1% +29.97% (p=0.000 n=10)
DecodeRuneInString/"a"-8 2.471n ± 2% 3.681n ± 1% +48.97% (p=0.000 n=10)
DecodeRuneInString/"abc"-8 2.454n ± 1% 3.672n ± 0% +49.66% (p=0.000 n=10)
DecodeRuneInString/"Ж"-8 3.815n ± 0% 4.699n ± 1% +23.18% (p=0.000 n=10)
DecodeRuneInString/"ЖЖ"-8 3.816n ± 1% 4.705n ± 1% +23.28% (p=0.000 n=10)
DecodeRuneInString/"брэд-ЛГТМ"-8 3.824n ± 1% 4.725n ± 1% +23.56% (p=0.000 n=10)
DecodeRuneInString/"☺☻☹"-8 4.114n ± 0% 5.293n ± 1% +28.65% (p=0.000 n=10)
DecodeRuneInString/"aa\xe2"-8 2.445n ± 0% 3.719n ± 2% +52.14% (p=0.000 n=10)
DecodeRuneInString/"B\xfa"-8 2.450n ± 0% 3.688n ± 2% +50.54% (p=0.000 n=10)
DecodeRuneInString/"B\xfaC"-8 2.447n ± 0% 3.681n ± 0% +50.40% (p=0.000 n=10)
DecodeRuneInString/"a�b"-8 2.446n ± 0% 3.681n ± 1% +50.49% (p=0.000 n=10)
DecodeRuneInString/"\U0010ffff"-8 4.517n ± 1% 5.723n ± 1% +26.71% (p=0.000 n=10)
DecodeRuneInString/"\xf4\x90\x80\x80"-8 3.224n ± 1% 4.412n ± 1% +36.85% (p=0.000 n=10)
DecodeRuneInString/"\xf7\xbf\xbf\xbf"-8 2.439n ± 0% 3.679n ± 1% +50.84% (p=0.000 n=10)
DecodeRuneInString/"\xfb\xbf\xbf\xbf\xbf"-8 2.443n ± 1% 3.678n ± 1% +50.55% (p=0.000 n=10)
DecodeRuneInString/"\xc0\x80"-8 2.445n ± 1% 3.682n ± 0% +50.62% (p=0.000 n=10)
DecodeRuneInString/"\xed\xa0\x80"-8 3.245n ± 1% 4.414n ± 1% +36.04% (p=0.000 n=10)
DecodeRuneInString/"\xed\xbf\xbf"-8 3.227n ± 0% 4.410n ± 0% +36.66% (p=0.000 n=10)
DecodeASCIIRune-8 2.490n ± 0% 4.117n ± 0% +65.39% (p=0.000 n=10)
DecodeJapaneseRune-8 4.111n ± 0% 5.288n ± 0% +28.66% (p=0.000 n=10)
geomean 3.016n 4.258n +41.19%
What did you expect to see?
I expected to observe no such performance degradation between Go 1.24.2 and this implementation. It is not only disappointing but difficult to explain; see this thread on Gophers Slack. Such a generic constraint, if it were "free", would unlock much code deduplication in the standard library (see also net/textproto.TrimString and net/textproto.TrimBytes) and beyond.
Related: #56948