Skip to content

Commit

Permalink
Fix issue caused by sole server marked as failed under load
Browse files Browse the repository at this point in the history
If health checks are failing for all servers, make a second pass through the server list with health-checks ignored before returning failure

Signed-off-by: Brad Davidson <brad.davidson@rancher.com>
  • Loading branch information
brandond committed May 29, 2024
1 parent ed23a2b commit 5e42769
Showing 1 changed file with 7 additions and 2 deletions.
9 changes: 7 additions & 2 deletions pkg/agent/loadbalancer/loadbalancer.go
Original file line number Diff line number Diff line change
Expand Up @@ -158,14 +158,15 @@ func (lb *LoadBalancer) dialContext(ctx context.Context, network, _ string) (net
lb.mutex.RLock()
defer lb.mutex.RUnlock()

var allChecksFailed bool
startIndex := lb.nextServerIndex
for {
targetServer := lb.currentServerAddress

server := lb.servers[targetServer]
if server == nil || targetServer == "" {
logrus.Debugf("Nil server for load balancer %s: %s", lb.serviceName, targetServer)
} else if server.healthCheck() {
} else if allChecksFailed || server.healthCheck() {
conn, err := server.dialContext(ctx, network, targetServer)
if err == nil {
return conn, nil
Expand All @@ -189,7 +190,11 @@ func (lb *LoadBalancer) dialContext(ctx context.Context, network, _ string) (net
startIndex = maxIndex
}
if lb.nextServerIndex == startIndex {
return nil, errors.New("all servers failed")
if allChecksFailed {
return nil, errors.New("all servers failed")
}
logrus.Debugf("Health checks for all servers have failed, retrying with health checks ignored")
allChecksFailed = true
}
}
}
Expand Down

0 comments on commit 5e42769

Please sign in to comment.