Skip to content

cmd/compile: a string | []byte generic constraint isn't a zero-cost abstraction #73417

Closed
@jub0bs

Description

@jub0bs

Go version

go version go1.24.2 darwin/amd64

Output of go env in your module/workspace:

AR='ar'
CC='cc'
CGO_CFLAGS='-O2 -g'
CGO_CPPFLAGS=''
CGO_CXXFLAGS='-O2 -g'
CGO_ENABLED='1'
CGO_FFLAGS='-O2 -g'
CGO_LDFLAGS='-O2 -g'
CXX='c++'
GCCGO='gccgo'
GO111MODULE=''
GOAMD64='v1'
GOARCH='amd64'
GOAUTH='netrc'
GOBIN='/Users/jcretel/go/bin'
GOCACHE='/Users/jcretel/Library/Caches/go-build'
GOCACHEPROG=''
GODEBUG=''
GOENV='/Users/jcretel/Library/Application Support/go/env'
GOEXE=''
GOEXPERIMENT=''
GOFIPS140='off'
GOFLAGS=''
GOGCCFLAGS='-fPIC -arch x86_64 -m64 -pthread -fno-caret-diagnostics -Qunused-arguments -fmessage-length=0 -ffile-prefix-map=/var/folders/0k/mmhg_4vd4rxdzzxp8hr1564r0000gn/T/go-build2984084042=/tmp/go-build -gno-record-gcc-switches -fno-common'
GOHOSTARCH='amd64'
GOHOSTOS='darwin'
GOINSECURE=''
GOMOD='/Users/jcretel/Desktop/genDecodeRune/go.mod'
GOMODCACHE='/Users/jcretel/go/pkg/mod'
GONOPROXY=''
GONOSUMDB=''
GOOS='darwin'
GOPATH='/Users/jcretel/go'
GOPRIVATE=''
GOPROXY='https://proxy.golang.org,direct'
GOROOT='/usr/local/Cellar/go/1.24.2/libexec'
GOSUMDB='sum.golang.org'
GOTELEMETRY='on'
GOTELEMETRYDIR='/Users/jcretel/Library/Application Support/go/telemetry'
GOTMPDIR=''
GOTOOLCHAIN='auto'
GOTOOLDIR='/usr/local/Cellar/go/1.24.2/libexec/pkg/tool/darwin_amd64'
GOVCS=''
GOVERSION='go1.24.2'
GOWORK=''
PKG_CONFIG='pkg-config'

What did you do?

(The complete code is available in this Gist.)

I eliminated the code duplication between utf8.DecodeRune and utf8.DecodeRuneInString by delegating to a generic function with a string | []byte type constraint:

package utf8

// DecodeRune decodes the leading rune of the byte slice p and returns it
// together with the size value reported by genDecodeRune (0 for empty
// input, otherwise 1 to 4). It is a thin wrapper that instantiates the
// generic genDecodeRune with []byte.
func DecodeRune(p []byte) (r rune, size int) {
	return genDecodeRune(p)
}

// DecodeRuneInString is the string counterpart of DecodeRune: it decodes
// the leading rune of s and returns it together with the size value
// reported by genDecodeRune (0 for empty input, otherwise 1 to 4). It is a
// thin wrapper that instantiates the generic genDecodeRune with string.
func DecodeRuneInString(s string) (r rune, size int) {
	return genDecodeRune(s)
}

// genDecodeRune is the shared implementation behind DecodeRune and
// DecodeRuneInString. It accepts either a string or a []byte and decodes
// the rune that starts at s[0].
//
// It returns (RuneError, 0) when s is empty. It returns (RuneError, 1) when
// s is shorter than the length announced by the first byte, or when a
// following byte lies outside the range permitted for it. Otherwise it
// returns the decoded rune and a size of 1 to 4.
//
// The lookup tables and constants used below (first, as, acceptRanges,
// locb, hicb, maskx, mask2, mask3, mask4, RuneError) are declared in the
// part of the package that is not shown here.
func genDecodeRune[ByteString string | []byte](s ByteString) (r rune, size int) {
	// copied from https://cs.opensource.google/go/go/+/refs/tags/go1.24.2:src/unicode/utf8/utf8.go;l=205
	n := len(s)
	if n < 1 {
		return RuneError, 0
	}
	s0 := s[0]
	// x is the table entry for the first byte; its low bits and high nibble
	// are unpacked further down into the sequence size and accept-range index.
	x := first[s0]
	if x >= as {
		// The following code simulates an additional check for x == xx and
		// handling the ASCII and invalid cases accordingly. This mask-and-or
		// approach prevents an additional branch.
		// rune is 32 bits wide, so shifting the low bit of x up to the sign
		// position and arithmetically back down yields 0x00000000 or
		// 0xFFFFFFFF (all bits clear or all bits set).
		mask := rune(x) << 31 >> 31 // Create 0x0000 or 0xFFFF.
		return rune(s[0])&^mask | RuneError&mask, 1
	}
	// Low three bits of x: number of bytes in the sequence.
	sz := int(x & 7)
	// High nibble of x: index of the permitted range for the second byte.
	accept := acceptRanges[x>>4]
	if n < sz {
		// Input ends before the announced sequence length.
		return RuneError, 1
	}
	s1 := s[1]
	if s1 < accept.lo || accept.hi < s1 {
		return RuneError, 1
	}
	if sz <= 2 { // <= instead of == to help the compiler eliminate some bounds checks
		return rune(s0&mask2)<<6 | rune(s1&maskx), 2
	}
	s2 := s[2]
	// Third and fourth bytes are checked against the fixed locb..hicb range.
	if s2 < locb || hicb < s2 {
		return RuneError, 1
	}
	if sz <= 3 {
		return rune(s0&mask3)<<12 | rune(s1&maskx)<<6 | rune(s2&maskx), 3
	}
	s3 := s[3]
	if s3 < locb || hicb < s3 {
		return RuneError, 1
	}
	return rune(s0&mask4)<<18 | rune(s1&maskx)<<12 | rune(s2&maskx)<<6 | rune(s3&maskx), 4
}

// rest omitted

I also wrote some benchmarks:

package utf8_test

import (
	"strconv"
	"testing"

	. "gist.github.com/jub0bs/8caddb242ddd776f5b30d5fd2bbb836e"
)

// ValidTest is one entry of the validTests table.
type ValidTest struct {
	// in is the input string; the benchmarks below decode its first rune.
	in  string
	// out is presumably the expected UTF-8 validity of in; it is not read
	// by any of the benchmarks visible here.
	out bool
}

// validTests is the input corpus for the table-driven benchmarks below. It
// mixes empty, ASCII, multi-byte, truncated, out-of-range, overlong and
// surrogate inputs; each entry becomes one sub-benchmark.
var validTests = []ValidTest{
	{"", true},
	{"a", true},
	{"abc", true},
	{"Ж", true},
	{"ЖЖ", true},
	{"брэд-ЛГТМ", true},
	{"☺☻☹", true},
	{"aa\xe2", false},
	{string([]byte{66, 250}), false},
	{string([]byte{66, 250, 67}), false},
	{"a\uFFFDb", true},
	{string("\xF4\x8F\xBF\xBF"), true},      // U+10FFFF
	{string("\xF4\x90\x80\x80"), false},     // U+10FFFF+1; out of range
	{string("\xF7\xBF\xBF\xBF"), false},     // 0x1FFFFF; out of range
	{string("\xFB\xBF\xBF\xBF\xBF"), false}, // 0x3FFFFFF; out of range
	{string("\xc0\x80"), false},             // U+0000 encoded in two bytes: incorrect
	{string("\xed\xa0\x80"), false},         // U+D800 high surrogate (sic)
	{string("\xed\xbf\xbf"), false},         // U+DFFF low surrogate (sic)
}

// BenchmarkDecodeRune runs one sub-benchmark per validTests entry, named
// after the quoted input, and times a single DecodeRune call on that input
// per iteration.
func BenchmarkDecodeRune(b *testing.B) {
	for _, tc := range validTests {
		// Convert once, before the sub-benchmark starts, so the string-to-slice
		// copy is not part of the measured loop.
		in := []byte(tc.in)
		b.Run(strconv.Quote(tc.in), func(b *testing.B) {
			for b.Loop() {
				DecodeRune(in)
			}
		})
	}
}

// BenchmarkDecodeRuneInString runs one sub-benchmark per validTests entry,
// named after the quoted input, and times a single DecodeRuneInString call
// on that input per iteration.
func BenchmarkDecodeRuneInString(b *testing.B) {
	for _, tc := range validTests {
		b.Run(strconv.Quote(tc.in), func(b *testing.B) {
			for b.Loop() {
				DecodeRuneInString(tc.in)
			}
		})
	}
}

// BenchmarkDecodeASCIIRune times DecodeRune on a one-byte slice holding the
// ASCII letter 'a'.
func BenchmarkDecodeASCIIRune(b *testing.B) {
	a := []byte{'a'}
	for b.Loop() {
		DecodeRune(a)
	}
}

// BenchmarkDecodeJapaneseRune times DecodeRune on the bytes of the single
// character "本", which occupies three bytes in UTF-8.
func BenchmarkDecodeJapaneseRune(b *testing.B) {
	nihon := []byte("本")
	for b.Loop() {
		DecodeRune(nihon)
	}
}

What did you see happen?

Quite a large performance gap between Go 1.24.2 and this implementation:

goos: darwin
goarch: amd64
pkg: gist.github.com/jub0bs/8caddb242ddd776f5b30d5fd2bbb836e
cpu: Intel(R) Core(TM) i7-6700HQ CPU @ 2.60GHz
                                            │     old     │                 new                 │
                                            │   sec/op    │   sec/op     vs base                │
DecodeRune/""-8                               1.966n ± 2%   3.286n ± 1%  +67.18% (p=0.000 n=10)
DecodeRune/"a"-8                              2.650n ± 1%   3.841n ± 1%  +44.96% (p=0.000 n=10)
DecodeRune/"abc"-8                            2.647n ± 1%   3.831n ± 0%  +44.68% (p=0.000 n=10)
DecodeRune/"Ж"-8                              3.818n ± 1%   5.014n ± 1%  +31.34% (p=0.000 n=10)
DecodeRune/"ЖЖ"-8                             3.821n ± 1%   4.997n ± 1%  +30.78% (p=0.000 n=10)
DecodeRune/"брэд-ЛГТМ"-8                      3.822n ± 1%   4.982n ± 0%  +30.35% (p=0.000 n=10)
DecodeRune/"☺☻☹"-8                            4.407n ± 0%   5.577n ± 1%  +26.56% (p=0.000 n=10)
DecodeRune/"aa\xe2"-8                         2.646n ± 0%   3.826n ± 0%  +44.62% (p=0.000 n=10)
DecodeRune/"B\xfa"-8                          2.636n ± 0%   3.845n ± 2%  +45.86% (p=0.000 n=10)
DecodeRune/"B\xfaC"-8                         2.643n ± 1%   3.844n ± 1%  +45.47% (p=0.000 n=10)
DecodeRune/"a�b"-8                            2.639n ± 0%   3.828n ± 1%  +45.05% (p=0.000 n=10)
DecodeRune/"\U0010ffff"-8                     4.686n ± 0%   6.227n ± 1%  +32.87% (p=0.000 n=10)
DecodeRune/"\xf4\x90\x80\x80"-8               3.526n ± 0%   5.071n ± 1%  +43.82% (p=0.000 n=10)
DecodeRune/"\xf7\xbf\xbf\xbf"-8               2.646n ± 1%   4.317n ± 1%  +63.16% (p=0.000 n=10)
DecodeRune/"\xfb\xbf\xbf\xbf\xbf"-8           2.643n ± 1%   4.293n ± 1%  +62.43% (p=0.000 n=10)
DecodeRune/"\xc0\x80"-8                       2.635n ± 1%   3.821n ± 1%  +45.02% (p=0.000 n=10)
DecodeRune/"\xed\xa0\x80"-8                   3.512n ± 1%   4.702n ± 0%  +33.87% (p=0.000 n=10)
DecodeRune/"\xed\xbf\xbf"-8                   3.514n ± 1%   4.682n ± 1%  +33.21% (p=0.000 n=10)
DecodeRuneInString/""-8                       2.347n ± 1%   3.051n ± 1%  +29.97% (p=0.000 n=10)
DecodeRuneInString/"a"-8                      2.471n ± 2%   3.681n ± 1%  +48.97% (p=0.000 n=10)
DecodeRuneInString/"abc"-8                    2.454n ± 1%   3.672n ± 0%  +49.66% (p=0.000 n=10)
DecodeRuneInString/"Ж"-8                      3.815n ± 0%   4.699n ± 1%  +23.18% (p=0.000 n=10)
DecodeRuneInString/"ЖЖ"-8                     3.816n ± 1%   4.705n ± 1%  +23.28% (p=0.000 n=10)
DecodeRuneInString/"брэд-ЛГТМ"-8              3.824n ± 1%   4.725n ± 1%  +23.56% (p=0.000 n=10)
DecodeRuneInString/"☺☻☹"-8                    4.114n ± 0%   5.293n ± 1%  +28.65% (p=0.000 n=10)
DecodeRuneInString/"aa\xe2"-8                 2.445n ± 0%   3.719n ± 2%  +52.14% (p=0.000 n=10)
DecodeRuneInString/"B\xfa"-8                  2.450n ± 0%   3.688n ± 2%  +50.54% (p=0.000 n=10)
DecodeRuneInString/"B\xfaC"-8                 2.447n ± 0%   3.681n ± 0%  +50.40% (p=0.000 n=10)
DecodeRuneInString/"a�b"-8                    2.446n ± 0%   3.681n ± 1%  +50.49% (p=0.000 n=10)
DecodeRuneInString/"\U0010ffff"-8             4.517n ± 1%   5.723n ± 1%  +26.71% (p=0.000 n=10)
DecodeRuneInString/"\xf4\x90\x80\x80"-8       3.224n ± 1%   4.412n ± 1%  +36.85% (p=0.000 n=10)
DecodeRuneInString/"\xf7\xbf\xbf\xbf"-8       2.439n ± 0%   3.679n ± 1%  +50.84% (p=0.000 n=10)
DecodeRuneInString/"\xfb\xbf\xbf\xbf\xbf"-8   2.443n ± 1%   3.678n ± 1%  +50.55% (p=0.000 n=10)
DecodeRuneInString/"\xc0\x80"-8               2.445n ± 1%   3.682n ± 0%  +50.62% (p=0.000 n=10)
DecodeRuneInString/"\xed\xa0\x80"-8           3.245n ± 1%   4.414n ± 1%  +36.04% (p=0.000 n=10)
DecodeRuneInString/"\xed\xbf\xbf"-8           3.227n ± 0%   4.410n ± 0%  +36.66% (p=0.000 n=10)
DecodeASCIIRune-8                             2.490n ± 0%   4.117n ± 0%  +65.39% (p=0.000 n=10)
DecodeJapaneseRune-8                          4.111n ± 0%   5.288n ± 0%  +28.66% (p=0.000 n=10)
geomean                                       3.016n        4.258n       +41.19%

What did you expect to see?

I expected to observe no such performance degradation between Go 1.24.2 and this implementation. It is not only disappointing but difficult to explain; see this thread on Gophers Slack. Such a generic constraint, if it were "free", would unlock much code deduplication in the standard library (see also net/textproto.TrimString and net/textproto.TrimBytes) and beyond.

Related: #56948

Metadata

Metadata

Assignees

No one assigned

    Labels

    compiler/runtime: Issues related to the Go compiler and/or runtime.

    Type

    No type

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions