Propagate Metric status changes to the API. #7525

Merged
80 changes: 63 additions & 17 deletions pkg/autoscaler/metrics/collector.go
@@ -18,7 +18,6 @@ package metrics

import (
"errors"
"fmt"
"sync"
"time"

@@ -90,6 +89,9 @@ type Collector interface {
Record(key types.NamespacedName, stat Stat)
// Delete deletes a Metric and halts collection.
Delete(string, string) error
// Watch registers a singleton function to call when a specific collector's status changes.
// The passed name is the namespace/name of the metric owned by the respective collector.
Watch(func(types.NamespacedName))
}

// MetricClient surfaces the metrics that can be obtained via the collector.
@@ -112,6 +114,9 @@ type MetricCollector struct {

collections map[types.NamespacedName]*collection
collectionsMutex sync.RWMutex

watcherMutex sync.RWMutex
watcher func(types.NamespacedName)
}

var _ Collector = (*MetricCollector)(nil)
@@ -143,7 +148,7 @@ func (c *MetricCollector) CreateOrUpdate(metric *av1alpha1.Metric) error {
if exists {
collection.updateScraper(scraper)
collection.updateMetric(metric)
return nil
return collection.lastError()
}

c.collectionsMutex.Lock()
@@ -153,10 +158,10 @@ func (c *MetricCollector) CreateOrUpdate(metric *av1alpha1.Metric) error {
if exists {
collection.updateScraper(scraper)
collection.updateMetric(metric)
return nil
return collection.lastError()
}

c.collections[key] = newCollection(metric, scraper, c.tickProvider, c.logger)
c.collections[key] = newCollection(metric, scraper, c.tickProvider, c.Inform, c.logger)
return nil
}

@@ -183,6 +188,27 @@ func (c *MetricCollector) Record(key types.NamespacedName, stat Stat) {
}
}

// Watch registers a singleton function to call when collector status changes.
func (c *MetricCollector) Watch(fn func(types.NamespacedName)) {
c.watcherMutex.Lock()
defer c.watcherMutex.Unlock()

if c.watcher != nil {
c.logger.Fatal("Multiple calls to Watch() not supported")
}
c.watcher = fn
}

// Inform sends an update to the registered watcher function, if it is set.
func (c *MetricCollector) Inform(event types.NamespacedName) {
c.watcherMutex.RLock()
defer c.watcherMutex.RUnlock()

if c.watcher != nil {
c.watcher(event)
}
}
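
For context, a minimal usage sketch of the new Watch/Inform pair — not part of this PR; the package and helper names are made up, and the enqueue function is a placeholder for whatever the caller uses. The controller is expected to register the single watcher, and each collection calls Inform with its Metric's key whenever its error state flips.

package metricwiring

import (
	"k8s.io/apimachinery/pkg/types"
	"knative.dev/serving/pkg/autoscaler/metrics"
)

// wireWatcher is a hypothetical helper showing how the singleton watcher is
// meant to be registered. Watch may only be called once per collector; a
// second call logs a fatal error.
func wireWatcher(collector *metrics.MetricCollector, enqueue func(types.NamespacedName)) {
	collector.Watch(func(key types.NamespacedName) {
		// In the real wiring (pkg/reconciler/metric/controller.go) this is
		// impl.EnqueueKey, which re-runs ReconcileKind for that Metric so the
		// latest scrape error is surfaced through CreateOrUpdate's return value.
		enqueue(key)
	})
}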

// StableAndPanicConcurrency returns both the stable and the panic concurrency.
// It may truncate metric buckets as a side-effect.
func (c *MetricCollector) StableAndPanicConcurrency(key types.NamespacedName, now time.Time) (float64, float64, error) {
@@ -224,6 +250,9 @@ type collection struct {
metricMutex sync.RWMutex
metric *av1alpha1.Metric

errMutex sync.RWMutex
lastErr error

scraperMutex sync.RWMutex
scraper StatsScraper
concurrencyBuckets *aggregation.TimedFloat64Buckets
@@ -249,7 +278,8 @@ func (c *collection) getScraper() StatsScraper {

// newCollection creates a new collection, which uses the given scraper to
// collect stats every scrapeTickInterval.
func newCollection(metric *av1alpha1.Metric, scraper StatsScraper, tickFactory func(time.Duration) *time.Ticker, logger *zap.SugaredLogger) *collection {
func newCollection(metric *av1alpha1.Metric, scraper StatsScraper, tickFactory func(time.Duration) *time.Ticker,
callback func(types.NamespacedName), logger *zap.SugaredLogger) *collection {
c := &collection{
metric: metric,
concurrencyBuckets: aggregation.NewTimedFloat64Buckets(
Expand All @@ -265,8 +295,8 @@ func newCollection(metric *av1alpha1.Metric, scraper StatsScraper, tickFactory f
stopCh: make(chan struct{}),
}

logger = logger.Named("collector").With(
zap.String(logkey.Key, fmt.Sprintf("%s/%s", metric.Namespace, metric.Name)))
key := types.NamespacedName{Namespace: metric.Namespace, Name: metric.Name}
logger = logger.Named("collector").With(zap.String(logkey.Key, key.String()))
Contributor: I'd just go WithValue.

Contributor (Author): We do the zap.String thing everywhere we use logkey.Key and a stringified "namespace/name".

c.grp.Add(1)
go func() {
@@ -282,21 +312,17 @@ func newCollection(metric *av1alpha1.Metric, scraper StatsScraper, tickFactory f
currentMetric := c.currentMetric()
if currentMetric.Spec.ScrapeTarget == "" {
// Don't scrape empty target service.
if c.updateLastError(nil) {
callback(key)
}
continue
}
stat, err := c.getScraper().Scrape(currentMetric.Spec.StableWindow)
if err != nil {
copy := metric.DeepCopy()
switch {
case err == ErrFailedGetEndpoints:
copy.Status.MarkMetricNotReady("NoEndpoints", ErrFailedGetEndpoints.Error())
case err == ErrDidNotReceiveStat:
copy.Status.MarkMetricFailed("DidNotReceiveStat", ErrDidNotReceiveStat.Error())
default:
copy.Status.MarkMetricNotReady("CreateOrUpdateFailed", "Collector has failed.")
}
logger.Errorw("Failed to scrape metrics", zap.Error(err))
c.updateMetric(copy)
}
if c.updateLastError(err) {
callback(key)
}
if stat != emptyStat {
c.record(stat)
@@ -328,6 +354,26 @@ func (c *collection) currentMetric() *av1alpha1.Metric {
return c.metric
}

// updateLastError updates the last error returned from the scraper
// and returns true if the error or error state changed.
func (c *collection) updateLastError(err error) bool {
c.errMutex.Lock()
defer c.errMutex.Unlock()

if c.lastErr == err {
return false
}
c.lastErr = err
return true
}

func (c *collection) lastError() error {
c.errMutex.RLock()
defer c.errMutex.RUnlock()

return c.lastErr
}
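
The change detection above means the watcher is only poked on transitions — including recovery back to a nil error — rather than on every failed scrape. A small standalone sketch of the same idea (hypothetical names, not part of this PR):

package errstate

import "sync"

// state is a stripped-down stand-in for collection's lastErr tracking.
type state struct {
	mu   sync.Mutex
	last error
}

// set records err and reports whether the error state changed. The comparison
// is by identity, so a repeated sentinel error (e.g. ErrFailedGetEndpoints on
// every tick) does not re-notify, while a flip back to nil does.
func (s *state) set(err error) bool {
	s.mu.Lock()
	defer s.mu.Unlock()
	if s.last == err {
		return false
	}
	s.last = err
	return true
}

With this, the scrape loop can call updateLastError unconditionally and only invoke the callback when the Metric's status would actually change, as the diff above does with `if c.updateLastError(err) { callback(key) }`.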

// record adds a stat to the current collection.
func (c *collection) record(stat Stat) {
// Proxied requests have been counted at the activator. Subtract
121 changes: 39 additions & 82 deletions pkg/autoscaler/metrics/collector_test.go
@@ -23,12 +23,9 @@ import (
"time"

"github.com/google/go-cmp/cmp"
corev1 "k8s.io/api/core/v1"
"k8s.io/apimachinery/pkg/api/equality"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/types"
"k8s.io/apimachinery/pkg/util/wait"
duckv1 "knative.dev/pkg/apis/duck/v1"
. "knative.dev/pkg/logging/testing"
av1alpha1 "knative.dev/serving/pkg/apis/autoscaling/v1alpha1"
"knative.dev/serving/pkg/apis/serving"
@@ -387,115 +384,75 @@ func TestMetricCollectorRecord(t *testing.T) {
}

func TestMetricCollectorError(t *testing.T) {
testMetric := &av1alpha1.Metric{
ObjectMeta: metav1.ObjectMeta{
Namespace: fake.TestNamespace,
Name: fake.TestRevision,
Labels: map[string]string{
serving.RevisionLabelKey: fake.TestRevision,
},
},
Spec: av1alpha1.MetricSpec{
ScrapeTarget: fake.TestRevision + "-zhudex",
},
}

errOther := errors.New("foo")

testCases := []struct {
name string
scraper *testScraper
metric *av1alpha1.Metric
expectedMetricStatus duckv1.Status
name string
scraper *testScraper
expectedError error
}{{
name: "Failed to get endpoints scraper error",
scraper: &testScraper{
s: func() (Stat, error) {
return emptyStat, ErrFailedGetEndpoints
},
},
metric: &av1alpha1.Metric{
ObjectMeta: metav1.ObjectMeta{
Namespace: fake.TestNamespace,
Name: fake.TestRevision,
Labels: map[string]string{
serving.RevisionLabelKey: fake.TestRevision,
},
},
Spec: av1alpha1.MetricSpec{
ScrapeTarget: fake.TestRevision + "-zhudex",
},
},
expectedMetricStatus: duckv1.Status{
Conditions: duckv1.Conditions{{
Type: av1alpha1.MetricConditionReady,
Status: corev1.ConditionUnknown,
Reason: "NoEndpoints",
Message: ErrFailedGetEndpoints.Error(),
}},
},
expectedError: ErrFailedGetEndpoints,
}, {
name: "Did not receive stat scraper error",
scraper: &testScraper{
s: func() (Stat, error) {
return emptyStat, ErrDidNotReceiveStat
},
},
metric: &av1alpha1.Metric{
ObjectMeta: metav1.ObjectMeta{
Namespace: fake.TestNamespace,
Name: fake.TestRevision,
Labels: map[string]string{
serving.RevisionLabelKey: fake.TestRevision,
},
},
Spec: av1alpha1.MetricSpec{
ScrapeTarget: fake.TestRevision + "-zhudex",
},
},
expectedMetricStatus: duckv1.Status{
Conditions: duckv1.Conditions{{
Type: av1alpha1.MetricConditionReady,
Status: corev1.ConditionFalse,
Reason: "DidNotReceiveStat",
Message: ErrDidNotReceiveStat.Error(),
}},
},
expectedError: ErrDidNotReceiveStat,
}, {
name: "Other scraper error",
scraper: &testScraper{
s: func() (Stat, error) {
return emptyStat, errors.New("foo")
},
},
metric: &av1alpha1.Metric{
ObjectMeta: metav1.ObjectMeta{
Namespace: fake.TestNamespace,
Name: fake.TestRevision,
Labels: map[string]string{
serving.RevisionLabelKey: fake.TestRevision,
},
return emptyStat, errOther
},
Spec: av1alpha1.MetricSpec{
ScrapeTarget: fake.TestRevision + "-zhudex",
},
},
expectedMetricStatus: duckv1.Status{
Conditions: duckv1.Conditions{{
Type: av1alpha1.MetricConditionReady,
Status: corev1.ConditionUnknown,
Reason: "CreateOrUpdateFailed",
Message: "Collector has failed.",
}},
},
expectedError: errOther,
}}

logger := TestLogger(t)
for _, test := range testCases {
t.Run(test.name, func(t *testing.T) {
factory := scraperFactory(test.scraper, nil)
coll := NewMetricCollector(factory, logger)
coll.CreateOrUpdate(test.metric)
key := types.NamespacedName{Namespace: test.metric.Namespace, Name: test.metric.Name}

var got duckv1.Status
wait.PollImmediate(10*time.Millisecond, 2*time.Second, func() (bool, error) {
collection, ok := coll.collections[key]
if ok {
got = collection.currentMetric().Status.Status
return equality.Semantic.DeepEqual(got, test.expectedMetricStatus), nil
}
return false, nil
watchCh := make(chan types.NamespacedName)
coll.Watch(func(key types.NamespacedName) {
watchCh <- key
})
if !equality.Semantic.DeepEqual(got, test.expectedMetricStatus) {
t.Errorf("Got = %#v, want: %#v, diff:\n%q", got, test.expectedMetricStatus, cmp.Diff(got, test.expectedMetricStatus))
coll.CreateOrUpdate(testMetric)
key := types.NamespacedName{Namespace: testMetric.Namespace, Name: testMetric.Name}

// Expect an event to be propagated because we're erroring.
event := <-watchCh
if event != key {
t.Fatalf("Event = %v, want %v", event, key)
}
coll.Delete(test.metric.Namespace, test.metric.Name)

// Make sure the error is surfaced via 'CreateOrUpdate', which is called in the reconciler.
if err := coll.CreateOrUpdate(testMetric); err != test.expectedError {
t.Fatalf("CreateOrUpdate = %v, want %v", err, test.expectedError)
}

coll.Delete(testMetric.Namespace, testMetric.Name)
})
}
}
2 changes: 2 additions & 0 deletions pkg/reconciler/metric/controller.go
@@ -66,5 +66,7 @@ func NewController(
},
})

collector.Watch(impl.EnqueueKey)
Contributor: Here it presumes metrics name == pa name. Probably fine, but 🤷‍♂

Contributor (Author): I don't think it does? The collector manages metrics and this keys off of metrics. Both are Metric names.

return impl
}
16 changes: 12 additions & 4 deletions pkg/reconciler/metric/metric.go
@@ -18,7 +18,6 @@ package metric

import (
"context"
"fmt"

"knative.dev/serving/pkg/apis/autoscaling/v1alpha1"
"knative.dev/serving/pkg/autoscaler/metrics"
@@ -40,9 +39,18 @@ func (r *reconciler) ReconcileKind(ctx context.Context, metric *v1alpha1.Metric)
metric.Status.InitializeConditions()

if err := r.collector.CreateOrUpdate(metric); err != nil {
// If create or update fails, we won't be able to collect at all.
metric.Status.MarkMetricFailed("CollectionFailed", "Failed to reconcile metric collection")
return fmt.Errorf("failed to initiate or update scraping: %w", err)
switch err {
case metrics.ErrFailedGetEndpoints:
metric.Status.MarkMetricNotReady("NoEndpoints", err.Error())
case metrics.ErrDidNotReceiveStat:
metric.Status.MarkMetricFailed("DidNotReceiveStat", err.Error())
default:
metric.Status.MarkMetricFailed("CollectionFailed",
"Failed to reconcile metric collection: "+err.Error())
}

// We don't return an error because retrying is of no use. We'll be poked by collector on a change.
return nil
}

metric.Status.MarkMetricReady()
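
For reference, the error handling in ReconcileKind could also be factored into a small helper — a sketch only, not part of this PR; the helper name is made up and the mapping simply mirrors the switch above:

package metric

import (
	"knative.dev/serving/pkg/apis/autoscaling/v1alpha1"
	"knative.dev/serving/pkg/autoscaler/metrics"
)

// markCollectionError is a hypothetical refactor that maps a scrape error from
// the collector onto the Metric's status conditions instead of returning it.
func markCollectionError(metric *v1alpha1.Metric, err error) {
	switch err {
	case metrics.ErrFailedGetEndpoints:
		// No endpoints yet is treated as a transient, "unknown" state.
		metric.Status.MarkMetricNotReady("NoEndpoints", err.Error())
	case metrics.ErrDidNotReceiveStat:
		// Endpoints exist but never reported a stat: a hard failure.
		metric.Status.MarkMetricFailed("DidNotReceiveStat", err.Error())
	default:
		metric.Status.MarkMetricFailed("CollectionFailed",
			"Failed to reconcile metric collection: "+err.Error())
	}
}

Either way, the design point from the inline comment stands: the reconciler returns nil instead of the scrape error because an immediate retry would not help; the collector's watcher callback re-enqueues the Metric as soon as its error state actually changes.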