Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

client: backoff before reconnecting if an HTTP2 server preface was not received #1648

Merged
merged 11 commits into from
Dec 1, 2017
245 changes: 176 additions & 69 deletions clientconn.go
Original file line number Diff line number Diff line change
Expand Up @@ -98,7 +98,8 @@ type dialOptions struct {
// This is to support v1 balancer.
balancerBuilder balancer.Builder
// This is to support grpclb.
resolverBuilder resolver.Builder
resolverBuilder resolver.Builder
waitForHandshake bool
}

const (
Expand All @@ -109,6 +110,15 @@ const (
// DialOption configures how we set up the connection.
type DialOption func(*dialOptions)

// WithWaitForHandshake blocks until the initial settings frame is received from the
// server before assigning RPCs to the connection.
// Experimental API.
func WithWaitForHandshake() DialOption {
return func(o *dialOptions) {
o.waitForHandshake = true
}
}

// WithWriteBufferSize lets you set the size of write buffer, this determines how much data can be batched
// before doing a write on the wire.
func WithWriteBufferSize(s int) DialOption {
Expand Down Expand Up @@ -240,7 +250,7 @@ func WithBackoffConfig(b BackoffConfig) DialOption {
return withBackoff(b)
}

// withBackoff sets the backoff strategy used for retries after a
// withBackoff sets the backoff strategy used for connectRetryNum after a
// failed connection attempt.
//
// This can be exported if arbitrary backoff strategies are allowed by gRPC.
Expand Down Expand Up @@ -804,6 +814,7 @@ func (ac *addrConn) tryUpdateAddrs(addrs []resolver.Address) bool {
grpclog.Infof("addrConn: tryUpdateAddrs curAddrFound: %v", curAddrFound)
if curAddrFound {
ac.addrs = addrs
ac.reconnectIdx = 0 // Start reconnecting from beginning in the new list.
}

return curAddrFound
Expand Down Expand Up @@ -899,22 +910,31 @@ type addrConn struct {
ctx context.Context
cancel context.CancelFunc

cc *ClientConn
curAddr resolver.Address
addrs []resolver.Address
dopts dialOptions
events trace.EventLog
acbw balancer.SubConn
cc *ClientConn
addrs []resolver.Address
dopts dialOptions
events trace.EventLog
acbw balancer.SubConn

mu sync.Mutex
state connectivity.State
mu sync.Mutex
curAddr resolver.Address
reconnectIdx int // The index in addrs list to start reconnecting from.
state connectivity.State
// ready is closed and becomes nil when a new transport is up or failed
// due to timeout.
ready chan struct{}
transport transport.ClientTransport

// The reason this addrConn is torn down.
tearDownErr error

connectRetryNum int
// backoffDeadline is the time until which resetTransport needs to
// wait before increasing connectRetryNum count.
backoffDeadline time.Time
// connectDeadline is the time by which all connection
// negotiations must complete.
connectDeadline time.Time
}

// adjustParams updates parameters used to create transports upon
Expand Down Expand Up @@ -949,6 +969,15 @@ func (ac *addrConn) errorf(format string, a ...interface{}) {

// resetTransport recreates a transport to the address for ac. The old
// transport will close itself on error or when the clientconn is closed.
// The created transport must receive initial settings frame from the server.
// In case that doesnt happen, transportMonitor will kill the newly created
// transport after connectDeadline has expired.
// In case there was an error on the transport before the settings frame was
// received, resetTransport resumes connecting to backends after the one that
// was previously connected to. In case end of the list is reached, resetTransport
// backs off until the original deadline.
// If the DialOption WithWaitForHandshake was set, resetTrasport returns
// successfully only after server settings are received.
//
// TODO(bar) make sure all state transitions are valid.
func (ac *addrConn) resetTransport() error {
Expand All @@ -962,19 +991,38 @@ func (ac *addrConn) resetTransport() error {
ac.ready = nil
}
ac.transport = nil
ac.curAddr = resolver.Address{}
ridx := ac.reconnectIdx
ac.mu.Unlock()
ac.cc.mu.RLock()
ac.dopts.copts.KeepaliveParams = ac.cc.mkp
ac.cc.mu.RUnlock()
for retries := 0; ; retries++ {
sleepTime := ac.dopts.bs.backoff(retries)
timeout := minConnectTimeout
var backoffDeadline, connectDeadline time.Time
for connectRetryNum := 0; ; connectRetryNum++ {
ac.mu.Lock()
if timeout < time.Duration(int(sleepTime)/len(ac.addrs)) {
timeout = time.Duration(int(sleepTime) / len(ac.addrs))
if ac.backoffDeadline.IsZero() {
// This means either a successful HTTP2 connection was established
// or this is the first time this addrConn is trying to establish a
// connection.
backoffFor := ac.dopts.bs.backoff(connectRetryNum) // time.Duration.
// This will be the duration that dial gets to finish.
dialDuration := minConnectTimeout
if backoffFor > dialDuration {
// Give dial more time as we keep failing to connect.
dialDuration = backoffFor
}
start := time.Now()
backoffDeadline = start.Add(backoffFor)
connectDeadline = start.Add(dialDuration)
ridx = 0 // Start connecting from the beginning.
} else {
// Continue trying to conect with the same deadlines.
connectRetryNum = ac.connectRetryNum
backoffDeadline = ac.backoffDeadline
connectDeadline = ac.connectDeadline
ac.backoffDeadline = time.Time{}
ac.connectDeadline = time.Time{}
ac.connectRetryNum = 0
}
connectTime := time.Now()
if ac.state == connectivity.Shutdown {
ac.mu.Unlock()
return errConnClosing
Expand All @@ -989,94 +1037,153 @@ func (ac *addrConn) resetTransport() error {
copy(addrsIter, ac.addrs)
copts := ac.dopts.copts
ac.mu.Unlock()
for _, addr := range addrsIter {
connected, err := ac.createTransport(connectRetryNum, ridx, backoffDeadline, connectDeadline, addrsIter, copts)
if err != nil {
return err
}
if connected {
return nil
}
}
}

// createTransport creates a connection to one of the backends in addrs.
// It returns true if a connection was established.
func (ac *addrConn) createTransport(connectRetryNum, ridx int, backoffDeadline, connectDeadline time.Time, addrs []resolver.Address, copts transport.ConnectOptions) (bool, error) {
for i := ridx; i < len(addrs); i++ {
addr := addrs[i]
target := transport.TargetInfo{
Addr: addr.Addr,
Metadata: addr.Metadata,
Authority: ac.cc.authority,
}
done := make(chan struct{})
onPrefaceReceipt := func() {
close(done)
ac.mu.Lock()
if ac.state == connectivity.Shutdown {
// ac.tearDown(...) has been invoked.
ac.mu.Unlock()
return errConnClosing
if !ac.backoffDeadline.IsZero() {
// If we haven't already started reconnecting to
// other backends.
// Note, this can happen when writer notices an error
// and triggers resetTransport while at the same time
// reader receives the preface and invokes this closure.
ac.backoffDeadline = time.Time{}
ac.connectDeadline = time.Time{}
ac.connectRetryNum = 0
}
ac.mu.Unlock()
sinfo := transport.TargetInfo{
Addr: addr.Addr,
Metadata: addr.Metadata,
Authority: ac.cc.authority,
}
newTransport, err := transport.NewClientTransport(ac.cc.ctx, sinfo, copts, timeout)
if err != nil {
if e, ok := err.(transport.ConnectionError); ok && !e.Temporary() {
ac.mu.Lock()
if ac.state != connectivity.Shutdown {
ac.state = connectivity.TransientFailure
ac.cc.handleSubConnStateChange(ac.acbw, ac.state)
}
ac.mu.Unlock()
return err
}
grpclog.Warningf("grpc: addrConn.resetTransport failed to create client transport: %v; Reconnecting to %v", err, addr)
}
// Do not cancel in the success path because of
// this issue in Go1.6: https://github.com/golang/go/issues/15078.
connectCtx, cancel := context.WithDeadline(ac.ctx, connectDeadline)
newTr, err := transport.NewClientTransport(connectCtx, ac.cc.ctx, target, copts, onPrefaceReceipt)
if err != nil {
cancel()
if e, ok := err.(transport.ConnectionError); ok && !e.Temporary() {
ac.mu.Lock()
if ac.state == connectivity.Shutdown {
// ac.tearDown(...) has been invoked.
ac.mu.Unlock()
return errConnClosing
if ac.state != connectivity.Shutdown {
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Should this be TransientFailure?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

If the state was Shutdown we don't want to put it to TransientFailure

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Makes sense. If we are here the only possibilities are Connecting or Shutdown.

ac.state = connectivity.TransientFailure
ac.cc.handleSubConnStateChange(ac.acbw, ac.state)
}
ac.mu.Unlock()
continue
return false, err
}
ac.mu.Lock()
ac.printf("ready")
if ac.state == connectivity.Shutdown {
// ac.tearDown(...) has been invoked.
ac.mu.Unlock()
newTransport.Close()
return errConnClosing
}
ac.state = connectivity.Ready
ac.cc.handleSubConnStateChange(ac.acbw, ac.state)
t := ac.transport
ac.transport = newTransport
if t != nil {
t.Close()
}
ac.curAddr = addr
if ac.ready != nil {
close(ac.ready)
ac.ready = nil
return false, errConnClosing
}
ac.mu.Unlock()
return nil
grpclog.Warningf("grpc: addrConn.createTransport failed to connect to %v. Err :%v. Reconnecting...", addr, err)
continue
}
if ac.dopts.waitForHandshake {
select {
case <-done:
case <-connectCtx.Done():
// Didn't receive server preface, must kill this new transport now.
grpclog.Warningf("grpc: addrConn.createTransport failed to receive server preface before deadline.")
newTr.Close()
break
case <-ac.ctx.Done():
}
}
ac.mu.Lock()
ac.state = connectivity.TransientFailure
if ac.state == connectivity.Shutdown {
ac.mu.Unlock()
// ac.tearDonn(...) has been invoked.
newTr.Close()
return false, errConnClosing
}
ac.printf("ready")
ac.state = connectivity.Ready
ac.cc.handleSubConnStateChange(ac.acbw, ac.state)
ac.cc.resolveNow(resolver.ResolveNowOption{})
ac.transport = newTr
ac.curAddr = addr
if ac.ready != nil {
close(ac.ready)
ac.ready = nil
}
ac.connectRetryNum = connectRetryNum
ac.backoffDeadline = backoffDeadline
ac.connectDeadline = connectDeadline
ac.reconnectIdx = i + 1 // Start reconnecting from the next backend in the list.
ac.mu.Unlock()
timer := time.NewTimer(sleepTime - time.Since(connectTime))
select {
case <-timer.C:
case <-ac.ctx.Done():
timer.Stop()
return ac.ctx.Err()
}
return true, nil
}
ac.mu.Lock()
ac.state = connectivity.TransientFailure
ac.cc.handleSubConnStateChange(ac.acbw, ac.state)
ac.cc.resolveNow(resolver.ResolveNowOption{})
if ac.ready != nil {
close(ac.ready)
ac.ready = nil
}
ac.mu.Unlock()
timer := time.NewTimer(backoffDeadline.Sub(time.Now()))
select {
case <-timer.C:
case <-ac.ctx.Done():
timer.Stop()
return false, ac.ctx.Err()
}
return false, nil
}

// Run in a goroutine to track the error in transport and create the
// new transport if an error happens. It returns when the channel is closing.
func (ac *addrConn) transportMonitor() {
for {
var timer *time.Timer
var cdeadline <-chan time.Time
ac.mu.Lock()
t := ac.transport
if !ac.connectDeadline.IsZero() {
timer = time.NewTimer(ac.connectDeadline.Sub(time.Now()))
cdeadline = timer.C
}
ac.mu.Unlock()
// Block until we receive a goaway or an error occurs.
select {
case <-t.GoAway():
case <-t.Error():
case <-cdeadline:
ac.mu.Lock()
// This implies that client received server preface.
if ac.backoffDeadline.IsZero() {
ac.mu.Unlock()
continue
}
ac.mu.Unlock()
timer = nil
// No server preface received until deadline.
// Kill the connection.
grpclog.Warningf("grpc: addrConn.transportMonitor didn't get server preface after waiting. Closing the new transport now.")
t.Close()
}
if timer != nil {
timer.Stop()
}
// If a GoAway happened, regardless of error, adjust our keepalive
// parameters as appropriate.
Expand Down
Loading