diff --git a/sd/consul/instancer.go b/sd/consul/instancer.go
index 38b18f0d3..6eb87a450 100644
--- a/sd/consul/instancer.go
+++ b/sd/consul/instancer.go
@@ -3,12 +3,14 @@ package consul
 import (
 	"fmt"
 	"io"
+	"time"
 
 	consul "github.com/hashicorp/consul/api"
 
 	"github.com/go-kit/kit/log"
 	"github.com/go-kit/kit/sd"
 	"github.com/go-kit/kit/sd/internal/instance"
+	"github.com/go-kit/kit/util/conn"
 )
 
 const defaultIndex = 0
@@ -59,6 +61,7 @@ func (s *Instancer) loop(lastIndex uint64) {
 	var (
 		instances []string
 		err       error
+		d         time.Duration = 10 * time.Millisecond
 	)
 	for {
 		instances, lastIndex, err = s.getInstances(lastIndex, s.quitc)
@@ -67,9 +70,12 @@ func (s *Instancer) loop(lastIndex uint64) {
 			return // stopped via quitc
 		case err != nil:
 			s.logger.Log("err", err)
+			time.Sleep(d)
+			d = conn.Exponential(d)
 			s.cache.Update(sd.Event{Err: err})
 		default:
 			s.cache.Update(sd.Event{Instances: instances})
+			d = 10 * time.Millisecond
 		}
 	}
 }
diff --git a/util/conn/manager.go b/util/conn/manager.go
index 0b7db6281..725cbbc7a 100644
--- a/util/conn/manager.go
+++ b/util/conn/manager.go
@@ -2,6 +2,7 @@ package conn
 
 import (
 	"errors"
+	"math/rand"
 	"net"
 	"time"
 
@@ -103,7 +104,7 @@ func (m *Manager) loop() {
 		case conn = <-connc:
 			if conn == nil {
 				// didn't work
-				backoff = exponential(backoff) // wait longer
+				backoff = Exponential(backoff) // wait longer
 				reconnectc = m.after(backoff)  // try again
 			} else {
 				// worked!
@@ -132,12 +133,22 @@ func dial(d Dialer, network, address string, logger log.Logger) net.Conn {
 	return conn
 }
 
-func exponential(d time.Duration) time.Duration {
-	d *= 2
+// Exponential takes a duration and returns another one that is twice as long, +/- 50%,
+// capped at one minute. It is used to provide backoff for operations that may fail and
+// should avoid thundering herds. A duration of one minute or more yields exactly one
+// minute, and a zero duration stays zero.
+// See https://aws.amazon.com/blogs/architecture/exponential-backoff-and-jitter/ for rationale.
+func Exponential(d time.Duration) time.Duration {
+	if d >= time.Minute {
+		// Already at the cap; returning here also keeps 2*d from overflowing int64.
+		return time.Minute
+	}
+	jitter := rand.Float64() + 0.5 // uniform in [0.5, 1.5)
+	d = time.Duration(float64(2*d) * jitter)
 	if d > time.Minute {
 		d = time.Minute
 	}
 	return d
 }
 
 // ErrConnectionUnavailable is returned by the Manager's Write method when the