Skip to content

Commit

Permalink
grpc: support channel idleness
Browse files Browse the repository at this point in the history
  • Loading branch information
easwars committed May 12, 2023
1 parent 5c4bee5 commit 3649ed2
Show file tree
Hide file tree
Showing 14 changed files with 1,406 additions and 143 deletions.
233 changes: 185 additions & 48 deletions balancer_conn_wrappers.go
Original file line number Diff line number Diff line change
Expand Up @@ -46,36 +46,53 @@ import (
// It uses the gracefulswitch.Balancer internally to ensure that balancer
// switches happen in a graceful manner.
type ccBalancerWrapper struct {
cc *ClientConn
// The following fields are initialized when the wrapper is created and are
// read-only afterwards, and therefore can be accessed without a mutex.
cc *ClientConn
opts balancer.BuildOptions

// Outgoing (gRPC --> balancer) calls are guaranteed to execute in a
// mutually exclusive manner as they are scheduled on the
// CallbackSerializer. Fields accessed *only* in serializer callbacks, can
// therefore be accessed without a mutex.
serializer *grpcsync.CallbackSerializer
serializerCancel context.CancelFunc
balancer *gracefulswitch.Balancer
curBalancerName string
// mutually exclusive manner as they are scheduled in the serializer. Fields
// accessed *only* in these serializer callbacks, can therefore be accessed
// without a mutex.
balancer *gracefulswitch.Balancer
curBalancerName string

// During the window of time when the channel is entering idle and the
// underlying balancer is being shut down, we keep track of whether the
// channel is in idle mode to ensure that calls from the underlying balancer
// are not forwarded to grpc.

// mu guards access to the below fields. The serializer and its cancel
// function need to be mutex protected because they are overwritten when the
// wrapper exits idle mode.
mu sync.Mutex
serializer *grpcsync.CallbackSerializer // To serialize all outgoing calls.
serializerCancel context.CancelFunc // To close the serializer at close/enterIdle time.
isIdleOrClosed bool
}

// newCCBalancerWrapper creates a new balancer wrapper. The underlying balancer
// is not created until the switchTo() method is invoked.
func newCCBalancerWrapper(cc *ClientConn, bopts balancer.BuildOptions) *ccBalancerWrapper {
ctx, cancel := context.WithCancel(context.Background())
ccb := &ccBalancerWrapper{
cc: cc,
serializer: grpcsync.NewCallbackSerializer(ctx),
serializerCancel: cancel,
cc: cc,
opts: bopts,
}

ccb.balancer = gracefulswitch.NewBalancer(ccb, bopts)
ctx, cancel := context.WithCancel(context.Background())
ccb.serializer = grpcsync.NewCallbackSerializer(ctx)
ccb.serializerCancel = cancel
return ccb
}

// updateClientConnState is invoked by grpc to push a ClientConnState update to
// the underlying balancer.
func (ccb *ccBalancerWrapper) updateClientConnState(ccs *balancer.ClientConnState) error {
ccb.mu.Lock()
errCh := make(chan error, 1)
ccb.serializer.Schedule(func(_ context.Context) {
ok := ccb.serializer.Schedule(func(_ context.Context) {
// If the addresses specified in the update contain addresses of type
// "grpclb" and the selected LB policy is not "grpclb", these addresses
// will be filtered out and ccs will be modified with the updated
Expand All @@ -92,16 +109,19 @@ func (ccb *ccBalancerWrapper) updateClientConnState(ccs *balancer.ClientConnStat
}
errCh <- ccb.balancer.UpdateClientConnState(*ccs)
})

// If the balancer wrapper is closed when waiting for this state update to
// be handled, the callback serializer will be closed as well, and we can
// rely on its Done channel to ensure that we don't block here forever.
select {
case err := <-errCh:
return err
case <-ccb.serializer.Done:
return nil
if !ok {
// If we are unable to schedule a function with the serializer, it
// indicates that it has been closed. A serializer is only closed when
// the wrapper is closed or is in idle.
ccb.mu.Unlock()
return fmt.Errorf("grpc: cannot send state update to a closed or idle balancer")
}
ccb.mu.Unlock()

// We get here only if the above call to Schedule succeeds, in which case it
// is guaranteed that the scheduled function will run. Therefore it is safe
// to block on this channel.
return <-errCh
}

// updateSubConnState is invoked by grpc to push a subConn state update to the
Expand All @@ -120,21 +140,19 @@ func (ccb *ccBalancerWrapper) updateSubConnState(sc balancer.SubConn, s connecti
if sc == nil {
return
}
ccb.mu.Lock()
ccb.serializer.Schedule(func(_ context.Context) {
ccb.balancer.UpdateSubConnState(sc, balancer.SubConnState{ConnectivityState: s, ConnectionError: err})
})
}

func (ccb *ccBalancerWrapper) exitIdle() {
ccb.serializer.Schedule(func(_ context.Context) {
ccb.balancer.ExitIdle()
})
ccb.mu.Unlock()
}

// resolverError queues a serializer callback that hands err to the underlying
// balancer's ResolverError method.
//
// The mutex is held only while the callback is being queued, because the
// serializer field is replaced when the wrapper exits idle mode. The result of
// Schedule is not inspected, so the error is dropped if the serializer no
// longer accepts callbacks.
func (ccb *ccBalancerWrapper) resolverError(err error) {
	forward := func(context.Context) {
		ccb.balancer.ResolverError(err)
	}

	ccb.mu.Lock()
	defer ccb.mu.Unlock()
	ccb.serializer.Schedule(forward)
}

// switchTo is invoked by grpc to instruct the balancer wrapper to switch to the
Expand All @@ -148,41 +166,142 @@ func (ccb *ccBalancerWrapper) resolverError(err error) {
// the ccBalancerWrapper keeps track of the current LB policy name, and skips
// the graceful balancer switching process if the name does not change.
func (ccb *ccBalancerWrapper) switchTo(name string) {
	// Runs inside the serializer, so curBalancerName can be read without
	// holding the mutex.
	doSwitch := func(context.Context) {
		// TODO: Other languages use case-sensitive balancer registries. We should
		// switch as well. See: https://github.com/grpc/grpc-go/issues/5288.
		if !strings.EqualFold(ccb.curBalancerName, name) {
			ccb.buildLoadBalancingPolicy(name)
		}
	}

	// The mutex only protects the serializer field while the callback is being
	// queued; the switch itself happens later, in the serializer.
	ccb.mu.Lock()
	defer ccb.mu.Unlock()
	ccb.serializer.Schedule(doSwitch)
}

// Use the default LB policy, pick_first, if no LB policy with name is
// found in the registry.
builder := balancer.Get(name)
if builder == nil {
channelz.Warningf(logger, ccb.cc.channelzID, "Channel switches to new LB policy %q, since the specified LB policy %q was not registered", PickFirstBalancerName, name)
builder = newPickfirstBuilder()
} else {
channelz.Infof(logger, ccb.cc.channelzID, "Channel switches to new LB policy %q", name)
}
// buildLoadBalancingPolicy switches the gracefulswitch balancer over to the LB
// policy registered under name:
//   - the builder is looked up in the balancer registry; if none is registered
//     under name, the default pick_first builder is used instead.
//   - the gracefulswitch balancer is told to switch to that builder, which is
//     what actually builds the new balancer.
//   - on success, `curBalancerName` is set to the name reported by the builder
//     that was used. On failure the error is logged and `curBalancerName` is
//     left untouched.
//
// Must be called from a serializer callback.
func (ccb *ccBalancerWrapper) buildLoadBalancingPolicy(name string) {
	lbBuilder := balancer.Get(name)
	if lbBuilder != nil {
		channelz.Infof(logger, ccb.cc.channelzID, "Channel switches to new LB policy %q", name)
	} else {
		channelz.Warningf(logger, ccb.cc.channelzID, "Channel switches to new LB policy %q, since the specified LB policy %q was not registered", PickFirstBalancerName, name)
		lbBuilder = newPickfirstBuilder()
	}

	err := ccb.balancer.SwitchTo(lbBuilder)
	if err == nil {
		ccb.curBalancerName = lbBuilder.Name()
		return
	}
	channelz.Errorf(logger, ccb.cc.channelzID, "Channel failed to build new LB policy %q: %v", name, err)
}

// close queues the close/enter-idle callback on the serializer and blocks
// until that callback has finished. If the callback cannot be queued, the
// wrapper is already closed or in idle mode and close returns immediately.
func (ccb *ccBalancerWrapper) close() {
	closed := make(chan struct{})

	// The mutex guards the serializer field only while the callback is being
	// queued. It is released before waiting, because the callback itself needs
	// to acquire it.
	ccb.mu.Lock()
	scheduled := ccb.serializer.Schedule(ccb.handleCloseAndEnterIdle(closed))
	ccb.mu.Unlock()

	if scheduled {
		<-closed
	}
}

// enterIdleMode is called by grpc once idle_timeout expires and the channel
// moves into idle mode. It logs the transition and then goes through the same
// path as close(), so it does not return until the balancer has been closed
// (or immediately, if the wrapper was already closed or idle).
func (ccb *ccBalancerWrapper) enterIdleMode() {
	const msg = "ccBalancerWrapper: entering idle mode"
	channelz.Info(logger, ccb.cc.channelzID, msg)
	ccb.close()
}

// handleCloseAndEnterIdle returns the serializer callback that shuts the
// wrapper down, either because the channel is closing or because it is
// entering idle mode.
//
// When run, the callback cancels the serializer, marks the wrapper as idle or
// closed, closes the underlying balancer, and finally closes done so that the
// caller waiting on it can proceed.
func (ccb *ccBalancerWrapper) handleCloseAndEnterIdle(done chan struct{}) func(context.Context) {
	// Nothing is logged here: this function only builds the callback, and it
	// is invoked by close() while ccb.mu is held.
	return func(context.Context) {
		channelz.Info(logger, ccb.cc.channelzID, "ccBalancerWrapper: closing serializer and balancer")

		ccb.mu.Lock()
		// Close the serializer to ensure that no more calls from gRPC are sent
		// to the balancer.
		ccb.serializerCancel()
		ccb.isIdleOrClosed = true
		ccb.mu.Unlock()

		// The balancer is closed after the mutex is released. The
		// balancer-facing methods on this wrapper take the same mutex via
		// isIdleOrClosedLocked, so a balancer calling them from its Close()
		// would otherwise block forever.
		ccb.balancer.Close()
		close(done)
	}
}

// exitIdleMode is invoked by grpc when the channel exits idle mode either
// because of an RPC or because of an invocation of the Connect() API. This
// recreates the balancer that was closed previously when entering idle mode.
//
// If the channel is not in idle mode, we know for a fact that we are here as a
// result of the user calling the Connect() method on the ClientConn. In this
// case, we can simply forward the call to the underlying balancer, instructing
// it to reconnect to the backends.
func (ccb *ccBalancerWrapper) exitIdleMode() {
channelz.Info(logger, ccb.cc.channelzID, "ccBalancerWrapper: exiting idle mode")

ccb.mu.Lock()
if ccb.isIdleOrClosed {
ctx, cancel := context.WithCancel(context.Background())
ccb.serializer = grpcsync.NewCallbackSerializer(ctx)
ccb.serializerCancel = cancel
}

if err := ccb.balancer.SwitchTo(builder); err != nil {
channelz.Errorf(logger, ccb.cc.channelzID, "Channel failed to build new LB policy %q: %v", name, err)
done := make(chan struct{})
ccb.serializer.Schedule(func(_ context.Context) {
defer close(done)

ccb.mu.Lock()
defer ccb.mu.Unlock()

if !ccb.isIdleOrClosed {
ccb.balancer.ExitIdle()
return
}
ccb.curBalancerName = builder.Name()

// Gracefulswitch balancer does not support a switchTo operation after
// being closed. Hence we need to create a new one here.
ccb.balancer = gracefulswitch.NewBalancer(ccb, ccb.opts)
ccb.buildLoadBalancingPolicy(ccb.curBalancerName)
ccb.isIdleOrClosed = false
})
ccb.mu.Unlock()

<-done
}

func (ccb *ccBalancerWrapper) close() {
// Close the serializer to ensure that no more calls from gRPC are sent to
// the balancer. We don't have to worry about suppressing calls from a
// closed balancer because these are handled by the ClientConn (balancer
// wrapper is only ever closed when the ClientConn is closed).
ccb.serializerCancel()
<-ccb.serializer.Done
ccb.balancer.Close()
// isIdleOrClosedLocked reports whether the wrapper is currently marked as idle
// or closed.
//
// NOTE: despite the "Locked" suffix, this method acquires ccb.mu itself, so it
// must NOT be called with ccb.mu already held.
func (ccb *ccBalancerWrapper) isIdleOrClosedLocked() bool {
	ccb.mu.Lock()
	inactive := ccb.isIdleOrClosed
	ccb.mu.Unlock()
	return inactive
}

// We cannot schedule incoming calls on the serializer because the outgoing
// call updateClientConnState is blocked on the serializer. When this call is
// forwarded to the actual balancer, it might call NewSubConn inline, which
// also needs to block.
// But holding the lock for the duration of the call is fine because:
// a: the lock is held on the outgoing side only to queue callbacks in the serializer
// b: holding the lock for the whole duration is necessary here to guarantee
// that balancer close/enterIdle does not happen after we check for
// `isInactive`, but before we do the actual operation required for the
// incoming call.
//
// However, holding the lock for the whole duration does not work:
// - when a balancer switch happens, the old balancer is closed, wherein it might remove subConns
// - removeSubConn is handled with the lock held and, as part of it, UpdateSubConnState is pushed by grpc
// - updateSubConnState tries to grab the lock to add a callback to the serializer, but it deadlocks
//
// The graceful switch balancer drops calls from the balancer once close is
// called. So, we might not have to check for `isInactiveLocked()` here.
// No, we need it, since nothing stops balancers from calling any methods on
// the balancer.ClientConn interface inline from their Close(), like
// removeAddrConn etc.
func (ccb *ccBalancerWrapper) NewSubConn(addrs []resolver.Address, opts balancer.NewSubConnOptions) (balancer.SubConn, error) {
if len(addrs) <= 0 {
return nil, fmt.Errorf("grpc: cannot create SubConn with empty address list")
Expand All @@ -200,6 +319,12 @@ func (ccb *ccBalancerWrapper) NewSubConn(addrs []resolver.Address, opts balancer
}

func (ccb *ccBalancerWrapper) RemoveSubConn(sc balancer.SubConn) {
/*
if ccb.isIdleOrClosedLocked() {
return
}
*/

acbw, ok := sc.(*acBalancerWrapper)
if !ok {
return
Expand All @@ -208,6 +333,10 @@ func (ccb *ccBalancerWrapper) RemoveSubConn(sc balancer.SubConn) {
}

func (ccb *ccBalancerWrapper) UpdateAddresses(sc balancer.SubConn, addrs []resolver.Address) {
if ccb.isIdleOrClosedLocked() {
return
}

acbw, ok := sc.(*acBalancerWrapper)
if !ok {
return
Expand All @@ -216,6 +345,10 @@ func (ccb *ccBalancerWrapper) UpdateAddresses(sc balancer.SubConn, addrs []resol
}

func (ccb *ccBalancerWrapper) UpdateState(s balancer.State) {
if ccb.isIdleOrClosedLocked() {
return
}

// Update picker before updating state. Even though the ordering here does
// not matter, it can lead to multiple calls of Pick in the common start-up
// case where we wait for ready and then perform an RPC. If the picker is
Expand All @@ -226,6 +359,10 @@ func (ccb *ccBalancerWrapper) UpdateState(s balancer.State) {
}

// ResolveNow forwards a re-resolution request from the balancer to the
// ClientConn. The request is dropped without any effect while the wrapper is
// idle or closed.
func (ccb *ccBalancerWrapper) ResolveNow(o resolver.ResolveNowOptions) {
	if !ccb.isIdleOrClosedLocked() {
		ccb.cc.resolveNow(o)
	}
}

Expand Down
5 changes: 5 additions & 0 deletions call.go
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,11 @@ import (
//
// All errors returned by Invoke are compatible with the status package.
func (cc *ClientConn) Invoke(ctx context.Context, method string, args, reply interface{}, opts ...CallOption) error {
if err := cc.idlenessMgr.onCallBegin(); err != nil {
return err
}
defer cc.idlenessMgr.onCallEnd()

// allow interceptor to see all applicable call options, which means those
// configured as defaults from dial option as well as per-call options
opts = combine(cc.dopts.callOptions, opts)
Expand Down
Loading

0 comments on commit 3649ed2

Please sign in to comment.