Flow: Improve Kubernetes log collection #5623
Diff 1: the loki.source.kubernetes component gains cluster awareness.

@@ -18,13 +18,15 @@ import (
 	"github.com/grafana/agent/component/discovery"
 	"github.com/grafana/agent/component/loki/source/kubernetes/kubetail"
 	"github.com/grafana/agent/pkg/flow/logging/level"
+	"github.com/grafana/agent/service/cluster"
 	"k8s.io/client-go/kubernetes"
 )

 func init() {
 	component.Register(component.Registration{
-		Name: "loki.source.kubernetes",
-		Args: Arguments{},
+		Name:          "loki.source.kubernetes",
+		Args:          Arguments{},
+		NeedsServices: []string{cluster.ServiceName},

 		Build: func(opts component.Options, args component.Arguments) (component.Component, error) {
 			return New(opts, args.(Arguments))
@@ -40,6 +42,8 @@ type Arguments struct {

 	// Client settings to connect to Kubernetes.
 	Client commonk8s.ClientArguments `river:"client,block,optional"`
+
+	Clustering cluster.ComponentBlock `river:"clustering,block,optional"`
 }

 // DefaultArguments holds default settings for loki.source.kubernetes.
@@ -57,6 +61,7 @@ type Component struct {
 	log       log.Logger
 	opts      component.Options
 	positions positions.Positions
+	cluster   cluster.Cluster

 	mut  sync.Mutex
 	args Arguments
@@ -72,6 +77,7 @@ type Component struct {
 var (
 	_ component.Component      = (*Component)(nil)
 	_ component.DebugComponent = (*Component)(nil)
+	_ cluster.Component        = (*Component)(nil)
 )

 // New creates a new loki.source.kubernetes component.
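The var block extended above is Go's standard compile-time interface check: assigning a typed nil to a blank variable of the interface type makes the build fail if the type ever stops implementing the interface, which is how the new cluster.Component assertion guards the NotifyClusterChange method added further down. A generic sketch of the idiom (the Greeter interface and english type are illustrative, not part of the agent):

	package main

	import "fmt"

	type Greeter interface{ Greet() string }

	type english struct{}

	func (english) Greet() string { return "hello" }

	// If english ever stops implementing Greeter, this line fails to
	// compile. The blank identifier means the check costs nothing at
	// runtime and allocates nothing.
	var _ Greeter = english{}

	func main() {
		fmt.Println(english{}.Greet())
	}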
@@ -89,7 +95,13 @@ func New(o component.Options, args Arguments) (*Component, error) {
 		return nil, err
 	}

+	data, err := o.GetServiceData(cluster.ServiceName)
+	if err != nil {
+		return nil, err
+	}
+
 	c := &Component{
+		cluster: data.(cluster.Cluster),
 		log:     o.Logger,
 		opts:    o,
 		handler: loki.NewLogsReceiver(),
@@ -168,28 +180,43 @@ func (c *Component) Update(args component.Arguments) error {
 		// No-op: manager already exists and options didn't change.
 	}

-	// Convert input targets into targets to give to tailer.
-	targets := make([]*kubetail.Target, 0, len(newArgs.Targets))
+	c.resyncTargets(newArgs.Targets)
+	c.args = newArgs
+	return nil
+}

-	for _, inTarget := range newArgs.Targets {
-		lset := inTarget.Labels()
+func (c *Component) resyncTargets(targets []discovery.Target) {
+	distTargets := discovery.NewDistributedTargets(c.args.Clustering.Enabled, c.cluster, targets)
+	targets = distTargets.Get()
+
+	tailTargets := make([]*kubetail.Target, 0, len(targets))
+	for _, target := range targets {
+		lset := target.Labels()
 		processed, err := kubetail.PrepareLabels(lset, c.opts.ID)
 		if err != nil {
 			// TODO(rfratto): should this set the health of the component?
 			level.Error(c.log).Log("msg", "failed to process input target", "target", lset.String(), "err", err)
 			continue
 		}
-		targets = append(targets, kubetail.NewTarget(lset, processed))
+		tailTargets = append(tailTargets, kubetail.NewTarget(lset, processed))
 	}

 	// This will never fail because it only fails if the context gets canceled.
 	//
 	// TODO(rfratto): should we have a generous update timeout to prevent this
 	// from potentially hanging forever?
-	_ = c.tailer.SyncTargets(context.Background(), targets)
+	_ = c.tailer.SyncTargets(context.Background(), tailTargets)
 }

-	c.args = newArgs
-	return nil
+// NotifyClusterChange implements cluster.Component.
+func (c *Component) NotifyClusterChange() {
+	c.mut.Lock()
+	defer c.mut.Unlock()
+
+	if !c.args.Clustering.Enabled {
+		return
+	}
+	c.resyncTargets(c.args.Targets)
+}

 // getTailerOptions gets tailer options from arguments. If args hasn't changed

Review thread on the SyncTargets TODO:

Comment: Since this is called while holding the component mutex, I'd say yes, we should have a timeout. Even if we can only log it.

Reply: I'm nervous about that; maybe it should be detached from the mutex instead. If the sync times out and fails, the targets won't ever converge unless there's another discovery update or the cluster updates again. While discovery updates are common for us, they might not be in more stable environments. So the risk of a timeout causing targets to never converge worries me. WDYT?
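For illustration only, the bounded-sync variant debated in the thread above might look like the following fragment at the end of resyncTargets; the 30-second deadline and the error logging are assumptions, not something this PR ships:

	// Bound the sync so a wedged API server can't hold the component
	// mutex forever. On timeout, targets stay stale until the next
	// discovery or cluster update, which is the convergence risk
	// raised in the thread above.
	ctx, cancel := context.WithTimeout(context.Background(), 30*time.Second)
	defer cancel()
	if err := c.tailer.SyncTargets(ctx, tailTargets); err != nil {
		level.Error(c.log).Log("msg", "failed to sync tailer targets", "err", err)
	}

Separately, the clustering mechanics behind discovery.NewDistributedTargets are not shown in this diff. As a rough mental model, each peer hashes every discovered target and keeps only the targets it owns, so the union across peers covers the full target set exactly once. A self-contained sketch under that assumption (the target type, keepLocal function, and hash-mod ownership rule are invented for illustration; the real cluster service uses consistent hashing so that peer changes move few targets):

	package main

	import (
		"fmt"
		"hash/fnv"
	)

	// target stands in for discovery.Target; real targets are label sets.
	type target string

	// keepLocal returns the subset of targets owned by peer selfIndex out
	// of numPeers, using a simple hash-mod ownership rule.
	func keepLocal(targets []target, selfIndex, numPeers int) []target {
		var owned []target
		for _, t := range targets {
			h := fnv.New32a()
			h.Write([]byte(t))
			if int(h.Sum32())%numPeers == selfIndex {
				owned = append(owned, t)
			}
		}
		return owned
	}

	func main() {
		all := []target{"pod-a", "pod-b", "pod-c", "pod-d"}
		for peer := 0; peer < 2; peer++ {
			fmt.Printf("peer %d tails: %v\n", peer, keepLocal(all, peer, 2))
		}
	}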
Diff 2: a new rolling average calculator in package kubetail.

@@ -0,0 +1,89 @@ (new file)

package kubetail

import (
	"sync"
	"time"
)

// rollingAverageCalculator calculates a rolling average between points in
// time.
//
// rollingAverageCalculator stores a circular buffer where the differences
// between timestamps are kept.
type rollingAverageCalculator struct {
	mtx        sync.Mutex
	window     []time.Duration
	windowSize int

	minEntries      int
	minDuration     time.Duration
	defaultDuration time.Duration

	currentIndex  int
	prevTimestamp time.Time
}

func newRollingAverageCalculator(windowSize, minEntries int, minDuration, defaultDuration time.Duration) *rollingAverageCalculator {
	return &rollingAverageCalculator{
		windowSize:      windowSize,
		window:          make([]time.Duration, windowSize),
		minEntries:      minEntries,
		minDuration:     minDuration,
		defaultDuration: defaultDuration,
		currentIndex:    -1,
	}
}

// AddTimestamp adds a new timestamp to the rollingAverageCalculator. If there
// is a previous timestamp, the difference between timestamps is calculated
// and stored in the window.
func (r *rollingAverageCalculator) AddTimestamp(timestamp time.Time) {
	r.mtx.Lock()
	defer func() {
		r.prevTimestamp = timestamp
		r.mtx.Unlock()
	}()

	// First timestamp
	if r.currentIndex == -1 && r.prevTimestamp.Equal(time.Time{}) {
		return
	}

	r.currentIndex++
	if r.currentIndex >= r.windowSize {
		r.currentIndex = 0
	}

	r.window[r.currentIndex] = timestamp.Sub(r.prevTimestamp)
}

// GetAverage calculates the average of all the durations in the window.
func (r *rollingAverageCalculator) GetAverage() time.Duration {
	r.mtx.Lock()
	defer r.mtx.Unlock()

	var total time.Duration
	count := 0
	for _, v := range r.window {
		if v != 0 {
			total += v
			count++
		}
	}
	if count == 0 || count < r.minEntries {
		return r.defaultDuration
	}
	d := total / time.Duration(count)
	if d < r.minDuration {
		return r.minDuration
	}
	return d
}

// GetLast gets the last timestamp added to the rollingAverageCalculator.
func (r *rollingAverageCalculator) GetLast() time.Time {
	r.mtx.Lock()
	defer r.mtx.Unlock()

	return r.prevTimestamp
}
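A hypothetical usage sketch for the calculator above; it assumes same-package access (the type is unexported) and invents the scenario of estimating log arrival cadence to pick a re-poll delay, which this diff does not show:

	package kubetail

	import (
		"fmt"
		"time"
	)

	// demoRollingAverage feeds evenly spaced timestamps into the
	// calculator and reads back the average gap.
	func demoRollingAverage() {
		calc := newRollingAverageCalculator(
			10,             // windowSize: remember the last 10 gaps
			2,              // minEntries: need 2 gaps before trusting the average
			time.Second,    // minDuration: lower bound on the result
			10*time.Second, // defaultDuration: returned until enough samples exist
		)

		start := time.Unix(0, 0)
		for i := 0; i < 5; i++ {
			calc.AddTimestamp(start.Add(time.Duration(i) * time.Second))
		}
		fmt.Println(calc.GetAverage()) // 1s: four one-second gaps averaged
	}

Until minEntries gaps have been seen, callers get defaultDuration instead of a noisy average, and minDuration keeps a burst of tightly spaced logs from driving the interval toward zero.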
Diff 3: tests for the rolling average calculator.

@@ -0,0 +1,86 @@ (new file)

package kubetail

import (
	"testing"
	"time"

	"github.com/stretchr/testify/assert"
)

func Test(t *testing.T) {
	tests := []struct {
		name       string
		windowSize int
		minEntries int
		input      []int64
		expected   time.Duration
	}{
		{
			name:       "empty, expect default",
			windowSize: 10,
			input:      []int64{},
			expected:   time.Duration(10),
		},
		{
			name:       "one sample, not enough, expect default",
			windowSize: 5,
			input:      []int64{10},
			expected:   time.Duration(10),
		},
		{
			name:       "partially full",
			windowSize: 10,
			input:      []int64{10, 20, 30, 40, 50},
			expected:   time.Duration(10),
		},
		{
			name:       "completely full",
			windowSize: 5,
			input:      []int64{10, 20, 30, 40, 50, 60},
			expected:   time.Duration(10),
		},
		{
			name:       "rollover simple",
			windowSize: 5,
			input:      []int64{10, 20, 30, 40, 50, 60},
			expected:   time.Duration(10),
		},
		{
			name:       "rollover complex: make sure first value is ignored",
			windowSize: 5,
			input:      []int64{0, 40, 50, 60, 70, 80, 90},
			expected:   time.Duration(10),
		},
		{
			name:       "complex",
			windowSize: 5,
			// 40 +1 +4 +45 +5 = 95, 95/5 = 19
			input:    []int64{10, 50, 51, 55, 100, 105},
			expected: time.Duration(19),
		},
		{
			name:       "complex 2",
			windowSize: 10,
			// outside of window |
			// 40 +1 +4 +45 +|5 +5 +90 +100 +150 +300 +5 +45 +50 +149 = 899
			input:    []int64{10, 50, 51, 55, 100, 105, 110, 200, 300, 450, 750, 755, 800, 850, 999},
			expected: time.Duration(89), // Integer math result is truncated, not rounded.
		},
		{
			name:       "below min duration",
			windowSize: 5,
			input:      []int64{0, 1, 2, 3, 4, 5, 6},
			expected:   time.Duration(2),
		},
	}
	for _, test := range tests {
		t.Run(test.name, func(t *testing.T) {
			c := newRollingAverageCalculator(test.windowSize, test.minEntries, time.Duration(2), time.Duration(10))
			for _, v := range test.input {
				c.AddTimestamp(time.Unix(0, v))
			}
			avg := c.GetAverage()
			assert.Equal(t, test.expected, avg)
		})
	}
}
Review thread on the PrepareLabels error handling:

Comment: In what situations will PrepareLabels error? Is there a case where we discard a target and increment a counter or something?

Reply: Looks like it fails based on validations. In this particular case, the component would've been given a bad target.
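The validations themselves are not shown in this diff. Purely as a hypothetical sketch of what validation-driven target rejection can look like (the required label names, the map representation, and prepareLabelsSketch are invented, not kubetail's actual checks):

	package main

	import (
		"errors"
		"fmt"
	)

	// requiredKeys is a hypothetical set of labels a Kubernetes tailing
	// target must carry before a tailer can be built for it.
	var requiredKeys = []string{
		"__meta_kubernetes_namespace",
		"__meta_kubernetes_pod_name",
	}

	// prepareLabelsSketch rejects targets missing a required label, the
	// shape of failure the review thread describes as "a bad target".
	func prepareLabelsSketch(lset map[string]string) error {
		for _, k := range requiredKeys {
			if lset[k] == "" {
				return errors.New("missing required label " + k)
			}
		}
		return nil
	}

	func main() {
		err := prepareLabelsSketch(map[string]string{
			"__meta_kubernetes_namespace": "default",
		})
		fmt.Println(err) // missing required label __meta_kubernetes_pod_name
	}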