Skip to content

Commit

Permalink
Fix infinite failure on Kubernetes watch
Browse files Browse the repository at this point in the history
  • Loading branch information
vjsamuel committed Mar 9, 2018
1 parent 299f3b4 commit 095e2d0
Show file tree
Hide file tree
Showing 2 changed files with 23 additions and 1 deletion.
1 change: 1 addition & 0 deletions CHANGELOG.asciidoc
Original file line number Diff line number Diff line change
Expand Up @@ -87,6 +87,7 @@ https://github.com/elastic/beats/compare/v6.0.0-beta2...master[Check the HEAD di
- Fix conditions checking on autodiscover Docker labels. {pull}6412[6412]
- Fix for kafka logger. {pull}6430[6430]
- Remove double slashes in Windows service script. {pull}6491[6491]
- Fix infinite failure on Kubernetes watch. {pull}6504[6504]

*Auditbeat*

Expand Down
23 changes: 22 additions & 1 deletion libbeat/common/kubernetes/watcher.go
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@ package kubernetes
import (
"context"
"fmt"
"io"
"time"

"github.com/ericchiang/k8s"
Expand All @@ -11,6 +12,9 @@ import (
"github.com/elastic/beats/libbeat/logp"
)

// Max back off time for retries
const maxBackoff = 30 * time.Second

// filterByNode builds a query option that sets the "fieldSelector" query
// parameter to "spec.nodeName=<node>".
func filterByNode(node string) k8s.Option {
	selector := "spec.nodeName=" + node
	return k8s.QueryParam("fieldSelector", selector)
}
Expand Down Expand Up @@ -161,6 +165,9 @@ func (w *watcher) Start() error {
}

func (w *watcher) watch() {
// Failures counter, do exponential backoff on retries
var failures uint

for {
select {
case <-w.ctx.Done():
Expand All @@ -176,7 +183,8 @@ func (w *watcher) watch() {
//watch failures should be logged and gracefully failed over as metadata retrieval
//should never stop.
logp.Err("kubernetes: Watching API error %v", err)
time.Sleep(time.Second)
backoff(failures)
failures++
continue
}

Expand All @@ -186,8 +194,14 @@ func (w *watcher) watch() {
if err != nil {
logp.Err("kubernetes: Watching API error %v", err)
watcher.Close()
if !(err == io.EOF || err == io.ErrUnexpectedEOF) {
// This is an error event which can be recovered by moving to the latest resource version
logp.Info("kubernetes: Ignoring event, moving to most recent resource version")
w.lastResourceVersion = ""
}
break
}
failures = 0
switch eventType {
case k8s.EventAdded:
w.onAdd(r)
Expand All @@ -205,3 +219,10 @@ func (w *watcher) watch() {
// Stop invokes the watcher's stop function.
// NOTE(review): presumably w.stop cancels w.ctx so that the watch loop exits
// on its next iteration — confirm where w.stop is assigned.
func (w *watcher) Stop() {
	w.stop()
}
// backoff blocks the calling goroutine before the next retry attempt.
//
// The delay starts at one second for failures == 0, doubles with every
// additional consecutive failure, and is capped at maxBackoff.
func backoff(failures uint) {
	// Double step by step and stop as soon as the cap is reached instead of
	// computing 1<<failures directly: the shift is evaluated as an int64
	// time.Duration, so for large failure counts it overflows (or becomes 0
	// once failures >= 64). The wrapped value can be negative or smaller than
	// maxBackoff, which would make time.Sleep return immediately and turn the
	// retry loop into a busy loop.
	wait := time.Second
	for i := uint(0); i < failures && wait < maxBackoff; i++ {
		wait *= 2
	}
	if wait > maxBackoff {
		wait = maxBackoff
	}
	time.Sleep(wait)
}

0 comments on commit 095e2d0

Please sign in to comment.