server.go: use worker goroutines for fewer stack allocations (#3204)
Currently (go1.13.4), the default stack size for newly spawned
goroutines is 2048 bytes. This is insufficient when processing gRPC
requests, as we often require more than 4 KiB stacks. This causes the
Go runtime to call runtime.morestack at least twice per RPC, which
hurts performance needlessly because each stack reallocation requires
internal work such as copying the stack and updating pointers to the
new addresses.

Since this stack growth is guaranteed to happen at least twice per RPC,
reusing goroutines gives us two wins:

  1. The stack is already grown to 8 KiB after the first RPC, so
     subsequent RPCs do not call runtime.morestack.
  2. We eliminate the need to spawn a new goroutine for each request
     (even though they're relatively inexpensive).
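
To make the pattern concrete, here is a minimal, self-contained sketch of
goroutine reuse (an illustration only, not the server.go code below; the
worker count and the plain func() closures are assumptions for the example):

package main

import "sync"

func main() {
	const numWorkers = 4 // assumed value, purely for the sketch
	work := make(chan func())
	var wg sync.WaitGroup

	// Long-lived workers: each goroutine's stack is grown by the first
	// request it handles and stays grown for all subsequent requests.
	for i := 0; i < numWorkers; i++ {
		go func() {
			for fn := range work {
				fn()
			}
		}()
	}

	for i := 0; i < 100; i++ {
		wg.Add(1)
		work <- func() {
			defer wg.Done()
			// handle one RPC here
		}
	}
	wg.Wait()
	close(work)
}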

Performance improves across the board. The improvement is especially
visible in small, unary requests as the overhead of stack reallocation
is higher, percentage-wise. QPS is up anywhere between 3% and 5%
depending on the number of concurrent RPC requests in flight. Latency is
down ~3%. There is even a 1% decrease in memory footprint in some cases,
though that is an unintended but happy coincidence.

unary-networkMode_none-bufConn_false-keepalive_false-benchTime_1m0s-trace_false-latency_0s-kbps_0-MTU_0-maxConcurrentCalls_8-reqSize_1B-respSize_1B-compressor_off-channelz_false-preloader_false
               Title       Before        After Percentage
            TotalOps      2613512      2701705     3.37%
             SendOps            0            0      NaN%
             RecvOps            0            0      NaN%
            Bytes/op      8657.00      8654.17    -0.03%
           Allocs/op       173.37       173.28     0.00%
             ReqT/op    348468.27    360227.33     3.37%
            RespT/op    348468.27    360227.33     3.37%
            50th-Lat    174.601µs    167.378µs    -4.14%
            90th-Lat    233.132µs    229.087µs    -1.74%
            99th-Lat     438.98µs    441.857µs     0.66%
             Avg-Lat    183.263µs     177.26µs    -3.28%
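
The new behavior is opt-in via the NumStreamWorkers server option added in
this change and is marked experimental. A hedged usage sketch follows (the
listener address and the service registration are assumptions, not part of
this commit; one worker per CPU follows the preliminary experiments noted in
the TODO below):

package main

import (
	"log"
	"net"
	"runtime"

	"google.golang.org/grpc"
)

func main() {
	lis, err := net.Listen("tcp", ":50051") // assumed address
	if err != nil {
		log.Fatalf("failed to listen: %v", err)
	}
	// Experimental option introduced by this commit; the zero value (default)
	// keeps the old one-goroutine-per-stream behavior.
	s := grpc.NewServer(grpc.NumStreamWorkers(uint32(runtime.NumCPU())))
	// Register services here, e.g. pb.RegisterGreeterServer(s, &greeterServer{}) (assumed).
	if err := s.Serve(lis); err != nil {
		log.Fatalf("failed to serve: %v", err)
	}
}
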
adtac committed Apr 23, 2020
1 parent 29f40a4 commit a0cdc21
Showing 1 changed file with 96 additions and 4 deletions.
server.go
@@ -42,6 +42,7 @@ import (
"google.golang.org/grpc/grpclog"
"google.golang.org/grpc/internal/binarylog"
"google.golang.org/grpc/internal/channelz"
"google.golang.org/grpc/internal/grpcrand"
"google.golang.org/grpc/internal/grpcsync"
"google.golang.org/grpc/internal/transport"
"google.golang.org/grpc/keepalive"
@@ -87,6 +88,12 @@ type service struct {
	mdata interface{}
}

type serverWorkerData struct {
	st     transport.ServerTransport
	wg     *sync.WaitGroup
	stream *transport.Stream
}

// Server is a gRPC server to serve RPC requests.
type Server struct {
	opts serverOptions
@@ -107,6 +114,8 @@ type Server struct {

	channelzID int64 // channelz unique identification number
	czData     *channelzData

	serverWorkerChannels []chan *serverWorkerData
}

type serverOptions struct {
@@ -133,6 +142,7 @@ type serverOptions struct {
	connectionTimeout time.Duration
	maxHeaderListSize *uint32
	headerTableSize   *uint32
	numServerWorkers  uint32
}

var defaultServerOptions = serverOptions{
@@ -410,6 +420,66 @@ func HeaderTableSize(s uint32) ServerOption {
	})
}

// NumStreamWorkers returns a ServerOption that sets the number of worker
// goroutines that should be used to process incoming streams. Setting this to
// zero (default) will disable workers and spawn a new goroutine for each
// stream.
//
// This API is EXPERIMENTAL.
func NumStreamWorkers(numServerWorkers uint32) ServerOption {
	// TODO: If/when this API gets stabilized (i.e. stream workers become the
	// only way streams are processed), change the behavior of the zero value to
	// a sane default. Preliminary experiments suggest that a value equal to the
	// number of CPUs available is most performant; requires thorough testing.
	return newFuncServerOption(func(o *serverOptions) {
		o.numServerWorkers = numServerWorkers
	})
}

// serverWorkerResetThreshold defines how often the stack must be reset. Every
// N requests, by spawning a new goroutine in its place, a worker can reset its
// stack so that large stacks don't live in memory forever. 2^16 should allow
// each goroutine stack to live for at least a few seconds in a typical
// workload (assuming a QPS of a few thousand requests/sec).
const serverWorkerResetThreshold = 1 << 16

// serverWorker blocks on a chan *serverWorkerData forever and waits for
// data to be fed by serveStreams. This allows different requests to be
// processed by the same goroutine, removing the need for expensive stack
// re-allocations (see the runtime.morestack problem [1]).
//
// [1] https://github.com/golang/go/issues/18138
func (s *Server) serverWorker(ch chan *serverWorkerData) {
	// To make sure all server workers don't reset at the same time, choose a
	// random number of iterations before resetting.
	threshold := serverWorkerResetThreshold + grpcrand.Intn(serverWorkerResetThreshold)
	for completed := 0; completed < threshold; completed++ {
		data, ok := <-ch
		if !ok {
			return
		}
		s.handleStream(data.st, data.stream, s.traceInfo(data.st, data.stream))
		data.wg.Done()
	}
	go s.serverWorker(ch)
}

// initServerWorkers creates worker goroutines and channels to process incoming
// connections to reduce the time spent overall on runtime.morestack.
func (s *Server) initServerWorkers() {
	s.serverWorkerChannels = make([]chan *serverWorkerData, s.opts.numServerWorkers)
	for i := uint32(0); i < s.opts.numServerWorkers; i++ {
		s.serverWorkerChannels[i] = make(chan *serverWorkerData)
		go s.serverWorker(s.serverWorkerChannels[i])
	}
}

func (s *Server) stopServerWorkers() {
	for i := uint32(0); i < s.opts.numServerWorkers; i++ {
		close(s.serverWorkerChannels[i])
	}
}

// NewServer creates a gRPC server which has no service registered and has not
// started to accept requests yet.
func NewServer(opt ...ServerOption) *Server {
@@ -434,6 +504,10 @@ func NewServer(opt ...ServerOption) *Server {
		s.events = trace.NewEventLog("grpc.Server", fmt.Sprintf("%s:%d", file, line))
	}

	if s.opts.numServerWorkers > 0 {
		s.initServerWorkers()
	}

	if channelz.IsOn() {
		s.channelzID = channelz.RegisterServer(&channelzServer{s}, "")
	}
@@ -739,12 +813,27 @@ func (s *Server) newHTTP2Transport(c net.Conn, authInfo credentials.AuthInfo) tr
func (s *Server) serveStreams(st transport.ServerTransport) {
	defer st.Close()
	var wg sync.WaitGroup

	var roundRobinCounter uint32
	st.HandleStreams(func(stream *transport.Stream) {
		wg.Add(1)
		go func() {
			defer wg.Done()
			s.handleStream(st, stream, s.traceInfo(st, stream))
		}()
		if s.opts.numServerWorkers > 0 {
			data := &serverWorkerData{st: st, wg: &wg, stream: stream}
			select {
			case s.serverWorkerChannels[atomic.AddUint32(&roundRobinCounter, 1)%s.opts.numServerWorkers] <- data:
			default:
				// If all stream workers are busy, fallback to the default code path.
				go func() {
					s.handleStream(st, stream, s.traceInfo(st, stream))
					wg.Done()
				}()
			}
		} else {
			go func() {
				defer wg.Done()
				s.handleStream(st, stream, s.traceInfo(st, stream))
			}()
		}
	}, func(ctx context.Context, method string) context.Context {
		if !EnableTracing {
			return ctx
@@ -1507,6 +1596,9 @@ func (s *Server) Stop() {
	for c := range st {
		c.Close()
	}
	if s.opts.numServerWorkers > 0 {
		s.stopServerWorkers()
	}

	s.mu.Lock()
	if s.events != nil {