diff --git a/go.mod b/go.mod index 8bab58e35..4d563cbf1 100644 --- a/go.mod +++ b/go.mod @@ -10,6 +10,7 @@ require ( github.com/evanphx/json-patch v5.6.0+incompatible github.com/go-logr/logr v1.4.1 github.com/google/go-cmp v0.6.0 + github.com/prometheus/client_golang v1.18.0 github.com/spf13/afero v1.11.0 golang.org/x/time v0.5.0 google.golang.org/grpc v1.61.0 @@ -19,6 +20,7 @@ require ( k8s.io/apiextensions-apiserver v0.29.1 k8s.io/apimachinery v0.29.1 k8s.io/client-go v0.29.1 + k8s.io/component-base v0.29.1 k8s.io/klog/v2 v2.110.1 sigs.k8s.io/controller-runtime v0.17.0 sigs.k8s.io/controller-tools v0.14.0 @@ -31,6 +33,7 @@ require ( github.com/Azure/go-ansiterm v0.0.0-20230124172434-306776ec8161 // indirect github.com/Microsoft/go-winio v0.6.1 // indirect github.com/beorn7/perks v1.0.1 // indirect + github.com/blang/semver/v4 v4.0.0 // indirect github.com/bufbuild/protocompile v0.6.0 // indirect github.com/cespare/xxhash/v2 v2.2.0 // indirect github.com/containerd/log v0.1.0 // indirect @@ -87,7 +90,6 @@ require ( github.com/pkg/browser v0.0.0-20240102092130-5ac0b6a4141c // indirect github.com/pkg/errors v0.9.1 // indirect github.com/pkg/profile v1.7.0 // indirect - github.com/prometheus/client_golang v1.18.0 // indirect github.com/prometheus/client_model v0.5.0 // indirect github.com/prometheus/common v0.45.0 // indirect github.com/prometheus/procfs v0.12.0 // indirect @@ -123,7 +125,6 @@ require ( gopkg.in/inf.v0 v0.9.1 // indirect gopkg.in/yaml.v2 v2.4.0 // indirect gopkg.in/yaml.v3 v3.0.1 // indirect - k8s.io/component-base v0.29.1 // indirect k8s.io/kube-openapi v0.0.0-20231010175941-2dd684a91f00 // indirect k8s.io/utils v0.0.0-20230726121419-3b25d923346b // indirect sigs.k8s.io/json v0.0.0-20221116044647-bc3834ca7abd // indirect diff --git a/go.sum b/go.sum index 9a5cd7427..46cd0c925 100644 --- a/go.sum +++ b/go.sum @@ -10,6 +10,8 @@ github.com/Microsoft/go-winio v0.6.1 h1:9/kr64B9VUZrLm5YYwbGtUJnMgqWVOdUAXu6Migc github.com/Microsoft/go-winio v0.6.1/go.mod h1:LRdKpFKfdobln8UmuiYcKPot9D2v6svN5+sAH+4kjUM= github.com/beorn7/perks v1.0.1 h1:VlbKKnNfV8bJzeqoa4cOKqO6bYr3WgKZxO8Z16+hsOM= github.com/beorn7/perks v1.0.1/go.mod h1:G2ZrVWU2WbWT9wwq4/hrbKbnv/1ERSJQ0ibhJ6rlkpw= +github.com/blang/semver/v4 v4.0.0 h1:1PFHFE6yCCTv8C1TeyNNarDzntLi7wMI5i/pzqYIsAM= +github.com/blang/semver/v4 v4.0.0/go.mod h1:IbckMUScFkM3pff0VJDNKRiT6TG/YpiHIM2yvyW5YoQ= github.com/bufbuild/buf v1.27.2 h1:uX2kvZfPfRoOsrxUW4LwpykSyH+wI5dUnIG0QWHDCCU= github.com/bufbuild/buf v1.27.2/go.mod h1:7RImDhFDqhEsdK5wbuMhoVSlnrMggGGcd3s9WozvHtM= github.com/bufbuild/protocompile v0.6.0 h1:Uu7WiSQ6Yj9DbkdnOe7U4mNKp58y9WDMKDn28/ZlunY= diff --git a/pkg/controller/options.go b/pkg/controller/options.go index 630216979..91132def2 100644 --- a/pkg/controller/options.go +++ b/pkg/controller/options.go @@ -26,6 +26,8 @@ import ( "github.com/crossplane/crossplane-runtime/pkg/feature" "github.com/crossplane/crossplane-runtime/pkg/logging" "github.com/crossplane/crossplane-runtime/pkg/ratelimiter" + "github.com/crossplane/crossplane-runtime/pkg/reconciler/managed" + "github.com/crossplane/crossplane-runtime/pkg/statemetrics" ) // DefaultOptions returns a functional set of options with conservative @@ -61,6 +63,9 @@ type Options struct { // ESSOptions for External Secret Stores. ESSOptions *ESSOptions + + // MetricOptions for recording metrics. + MetricOptions *MetricOptions } // ForControllerRuntime extracts options for controller-runtime. @@ -79,3 +84,15 @@ type ESSOptions struct { TLSConfig *tls.Config TLSSecretName *string } + +// MetricOptions for recording metrics. +type MetricOptions struct { + // PollStateMetricInterval at which each controller should record state + PollStateMetricInterval time.Duration + + // MetricsRecorder to use for recording metrics. + MRMetrics managed.MetricRecorder + + // MRStateMetrics to use for recording state metrics. + MRStateMetrics *statemetrics.MRStateMetrics +} diff --git a/pkg/reconciler/managed/metrics.go b/pkg/reconciler/managed/metrics.go new file mode 100644 index 000000000..fe5f717be --- /dev/null +++ b/pkg/reconciler/managed/metrics.go @@ -0,0 +1,178 @@ +/* +Copyright 2024 The Crossplane Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package managed + +import ( + "sync" + "time" + + "github.com/prometheus/client_golang/prometheus" + corev1 "k8s.io/api/core/v1" + kmetrics "k8s.io/component-base/metrics" + + xpv1 "github.com/crossplane/crossplane-runtime/apis/common/v1" + "github.com/crossplane/crossplane-runtime/pkg/resource" +) + +const subSystem = "crossplane" + +// MetricRecorder records the managed resource metrics. +type MetricRecorder interface { //nolint:interfacebloat // The first two methods are coming from Prometheus + Describe(ch chan<- *prometheus.Desc) + Collect(ch chan<- prometheus.Metric) + + recordUnchanged(name string) + recordFirstTimeReconciled(managed resource.Managed) + recordFirstTimeReady(managed resource.Managed) + recordDrift(managed resource.Managed) + recordDeleted(managed resource.Managed) +} + +// MRMetricRecorder records the lifecycle metrics of managed resources. +type MRMetricRecorder struct { + firstObservation sync.Map + lastObservation sync.Map + + mrDetected *prometheus.HistogramVec + mrFirstTimeReady *prometheus.HistogramVec + mrDeletion *prometheus.HistogramVec + mrDrift *prometheus.HistogramVec +} + +// NewMRMetricRecorder returns a new MRMetricRecorder which records metrics for managed resources. +func NewMRMetricRecorder() *MRMetricRecorder { + return &MRMetricRecorder{ + mrDetected: prometheus.NewHistogramVec(prometheus.HistogramOpts{ + Subsystem: subSystem, + Name: "managed_resource_first_time_to_reconcile_seconds", + Help: "The time it took for a managed resource to be detected by the controller", + Buckets: kmetrics.ExponentialBuckets(10e-9, 10, 10), + }, []string{"gvk"}), + mrFirstTimeReady: prometheus.NewHistogramVec(prometheus.HistogramOpts{ + Subsystem: subSystem, + Name: "managed_resource_first_time_to_readiness_seconds", + Help: "The time it took for a managed resource to become ready first time after creation", + Buckets: []float64{1, 5, 10, 15, 30, 60, 120, 300, 600, 1800, 3600}, + }, []string{"gvk"}), + mrDeletion: prometheus.NewHistogramVec(prometheus.HistogramOpts{ + Subsystem: subSystem, + Name: "managed_resource_deletion_seconds", + Help: "The time it took for a managed resource to be deleted", + Buckets: []float64{1, 5, 10, 15, 30, 60, 120, 300, 600, 1800, 3600}, + }, []string{"gvk"}), + mrDrift: prometheus.NewHistogramVec(prometheus.HistogramOpts{ + Subsystem: subSystem, + Name: "managed_resource_drift_seconds", + Help: "ALPHA: How long since the previous successful reconcile when a resource was found to be out of sync; excludes restart of the provider", + Buckets: kmetrics.ExponentialBuckets(10e-9, 10, 10), + }, []string{"gvk"}), + } +} + +// Describe sends the super-set of all possible descriptors of metrics +// collected by this Collector to the provided channel and returns once +// the last descriptor has been sent. +func (r *MRMetricRecorder) Describe(ch chan<- *prometheus.Desc) { + r.mrDetected.Describe(ch) + r.mrFirstTimeReady.Describe(ch) + r.mrDeletion.Describe(ch) + r.mrDrift.Describe(ch) +} + +// Collect is called by the Prometheus registry when collecting +// metrics. The implementation sends each collected metric via the +// provided channel and returns once the last metric has been sent. +func (r *MRMetricRecorder) Collect(ch chan<- prometheus.Metric) { + r.mrDetected.Collect(ch) + r.mrFirstTimeReady.Collect(ch) + r.mrDeletion.Collect(ch) + r.mrDrift.Collect(ch) +} + +func (r *MRMetricRecorder) recordUnchanged(name string) { + r.lastObservation.Store(name, time.Now()) +} + +func (r *MRMetricRecorder) recordFirstTimeReconciled(managed resource.Managed) { + if managed.GetCondition(xpv1.TypeSynced).Status == corev1.ConditionUnknown { + r.mrDetected.With(getLabels(managed)).Observe(time.Since(managed.GetCreationTimestamp().Time).Seconds()) + r.firstObservation.Store(managed.GetName(), time.Now()) // this is the first time we reconciled on this resource + } +} + +func (r *MRMetricRecorder) recordDrift(managed resource.Managed) { + name := managed.GetName() + last, ok := r.lastObservation.Load(name) + if !ok { + return + } + lt, ok := last.(time.Time) + if !ok { + return + } + + r.mrDrift.With(getLabels(managed)).Observe(time.Since(lt).Seconds()) + + r.lastObservation.Store(name, time.Now()) +} + +func (r *MRMetricRecorder) recordDeleted(managed resource.Managed) { + r.mrDeletion.With(getLabels(managed)).Observe(time.Since(managed.GetDeletionTimestamp().Time).Seconds()) +} + +func (r *MRMetricRecorder) recordFirstTimeReady(managed resource.Managed) { + // Note that providers may set the ready condition to "True", so we need + // to check the value here to send the ready metric + if managed.GetCondition(xpv1.TypeReady).Status == corev1.ConditionTrue { + _, ok := r.firstObservation.Load(managed.GetName()) // This map is used to identify the first time to readiness + if !ok { + return + } + r.mrFirstTimeReady.With(getLabels(managed)).Observe(time.Since(managed.GetCreationTimestamp().Time).Seconds()) + r.firstObservation.Delete(managed.GetName()) + } +} + +// A NopMetricRecorder does nothing. +type NopMetricRecorder struct{} + +// NewNopMetricRecorder returns a MRMetricRecorder that does nothing. +func NewNopMetricRecorder() *NopMetricRecorder { + return &NopMetricRecorder{} +} + +// Describe does nothing. +func (r *NopMetricRecorder) Describe(_ chan<- *prometheus.Desc) {} + +// Collect does nothing. +func (r *NopMetricRecorder) Collect(_ chan<- prometheus.Metric) {} + +func (r *NopMetricRecorder) recordUnchanged(_ string) {} + +func (r *NopMetricRecorder) recordFirstTimeReconciled(_ resource.Managed) {} + +func (r *NopMetricRecorder) recordDrift(_ resource.Managed) {} + +func (r *NopMetricRecorder) recordDeleted(_ resource.Managed) {} + +func (r *NopMetricRecorder) recordFirstTimeReady(_ resource.Managed) {} + +func getLabels(r resource.Managed) prometheus.Labels { + return prometheus.Labels{ + "gvk": r.GetObjectKind().GroupVersionKind().String(), + } +} diff --git a/pkg/reconciler/managed/reconciler.go b/pkg/reconciler/managed/reconciler.go index e0325eae7..f559e77a4 100644 --- a/pkg/reconciler/managed/reconciler.go +++ b/pkg/reconciler/managed/reconciler.go @@ -485,8 +485,9 @@ type Reconciler struct { supportedManagementPolicies []sets.Set[xpv1.ManagementAction] - log logging.Logger - record event.Recorder + log logging.Logger + record event.Recorder + metricRecorder MetricRecorder } type mrManaged struct { @@ -544,6 +545,13 @@ func WithPollInterval(after time.Duration) ReconcilerOption { } } +// WithMetricRecorder configures the Reconciler to use the supplied MetricRecorder. +func WithMetricRecorder(recorder MetricRecorder) ReconcilerOption { + return func(r *Reconciler) { + r.metricRecorder = recorder + } +} + // PollIntervalHook represents the function type passed to the // WithPollIntervalHook option to support dynamic computation of the poll // interval. @@ -701,6 +709,7 @@ func NewReconciler(m manager.Manager, of resource.ManagedKind, o ...ReconcilerOp supportedManagementPolicies: defaultSupportedManagementPolicies(), log: logging.NewNopLogger(), record: event.NewNopRecorder(), + metricRecorder: NewNopMetricRecorder(), } for _, ro := range o { @@ -734,6 +743,8 @@ func (r *Reconciler) Reconcile(ctx context.Context, req reconcile.Request) (resu return reconcile.Result{}, errors.Wrap(resource.IgnoreNotFound(err), errGetManaged) } + r.metricRecorder.recordFirstTimeReconciled(managed) + record := r.record.WithAnnotations("external-name", meta.GetExternalName(managed)) log = log.WithValues( "uid", managed.GetUID(), @@ -823,6 +834,7 @@ func (r *Reconciler) Reconcile(ctx context.Context, req reconcile.Request) (resu // details and removed our finalizer. If we assume we were the only // controller that added a finalizer to this resource then it should no // longer exist and thus there is no point trying to update its status. + r.metricRecorder.recordDeleted(managed) log.Debug("Successfully deleted managed resource") return reconcile.Result{Requeue: false}, nil } @@ -994,6 +1006,7 @@ func (r *Reconciler) Reconcile(ctx context.Context, req reconcile.Request) (resu // removed our finalizer. If we assume we were the only controller that // added a finalizer to this resource then it should no longer exist and // thus there is no point trying to update its status. + r.metricRecorder.recordDeleted(managed) log.Debug("Successfully deleted managed resource") return reconcile.Result{Requeue: false}, nil } @@ -1150,6 +1163,14 @@ func (r *Reconciler) Reconcile(ctx context.Context, req reconcile.Request) (resu reconcileAfter := r.pollIntervalHook(managed, r.pollInterval) log.Debug("External resource is up to date", "requeue-after", time.Now().Add(reconcileAfter)) managed.SetConditions(xpv1.ReconcileSuccess()) + r.metricRecorder.recordFirstTimeReady(managed) + + // record that we intentionally did not update the managed resource + // because no drift was detected. We call this so late in the reconcile + // because all the cases above could contribute (for different reasons) + // that the external object would not have been updated. + r.metricRecorder.recordUnchanged(managed.GetName()) + return reconcile.Result{RequeueAfter: reconcileAfter}, errors.Wrap(r.client.Status().Update(ctx, managed), errUpdateManagedStatus) } @@ -1178,6 +1199,9 @@ func (r *Reconciler) Reconcile(ctx context.Context, req reconcile.Request) (resu return reconcile.Result{Requeue: true}, errors.Wrap(r.client.Status().Update(ctx, managed), errUpdateManagedStatus) } + // record the drift after the successful update. + r.metricRecorder.recordDrift(managed) + if _, err := r.managed.PublishConnection(ctx, managed, update.ConnectionDetails); err != nil { // If this is the first time we encounter this issue we'll be requeued // implicitly when we update our status with the new error condition. If diff --git a/pkg/statemetrics/mr_state_metrics.go b/pkg/statemetrics/mr_state_metrics.go new file mode 100644 index 000000000..eaa444ee5 --- /dev/null +++ b/pkg/statemetrics/mr_state_metrics.go @@ -0,0 +1,145 @@ +/* +Copyright 2024 The Crossplane Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package statemetrics + +import ( + "context" + "time" + + "github.com/prometheus/client_golang/prometheus" + corev1 "k8s.io/api/core/v1" + "sigs.k8s.io/controller-runtime/pkg/client" + + xpv1 "github.com/crossplane/crossplane-runtime/apis/common/v1" + "github.com/crossplane/crossplane-runtime/pkg/logging" + "github.com/crossplane/crossplane-runtime/pkg/resource" +) + +// MRStateMetrics holds Prometheus metrics for managed resources. +type MRStateMetrics struct { + Exists *prometheus.GaugeVec + Ready *prometheus.GaugeVec + Synced *prometheus.GaugeVec +} + +// NewMRStateMetrics returns a new MRStateMetrics. +func NewMRStateMetrics() *MRStateMetrics { + return &MRStateMetrics{ + Exists: prometheus.NewGaugeVec(prometheus.GaugeOpts{ + Subsystem: subSystem, + Name: "managed_resource_exists", + Help: "The number of managed resources that exist", + }, []string{"gvk"}), + Ready: prometheus.NewGaugeVec(prometheus.GaugeOpts{ + Subsystem: subSystem, + Name: "managed_resource_ready", + Help: "The number of managed resources in Ready=True state", + }, []string{"gvk"}), + Synced: prometheus.NewGaugeVec(prometheus.GaugeOpts{ + Subsystem: subSystem, + Name: "managed_resource_synced", + Help: "The number of managed resources in Synced=True state", + }, []string{"gvk"}), + } +} + +// Describe sends the super-set of all possible descriptors of metrics +// collected by this Collector to the provided channel and returns once +// the last descriptor has been sent. +func (r *MRStateMetrics) Describe(ch chan<- *prometheus.Desc) { + r.Exists.Describe(ch) + r.Ready.Describe(ch) + r.Synced.Describe(ch) +} + +// Collect is called by the Prometheus registry when collecting +// metrics. The implementation sends each collected metric via the +// provided channel and returns once the last metric has been sent. +func (r *MRStateMetrics) Collect(ch chan<- prometheus.Metric) { + r.Exists.Collect(ch) + r.Ready.Collect(ch) + r.Synced.Collect(ch) +} + +// A MRStateRecorder records the state of managed resources. +type MRStateRecorder struct { + client client.Client + log logging.Logger + interval time.Duration + managedList resource.ManagedList + + metrics *MRStateMetrics +} + +// NewMRStateRecorder returns a new MRStateRecorder which records the state of managed resources. +func NewMRStateRecorder(client client.Client, log logging.Logger, metrics *MRStateMetrics, managedList resource.ManagedList, interval time.Duration) *MRStateRecorder { + return &MRStateRecorder{ + client: client, + log: log, + metrics: metrics, + managedList: managedList, + interval: interval, + } +} + +// Record records the state of managed resources. +func (r *MRStateRecorder) Record(ctx context.Context, mrList resource.ManagedList) error { + if err := r.client.List(ctx, mrList); err != nil { + r.log.Info("Failed to list managed resources", "error", err) + return err + } + + mrs := mrList.GetItems() + if len(mrs) == 0 { + return nil + } + + label := mrs[0].GetObjectKind().GroupVersionKind().String() + r.metrics.Exists.WithLabelValues(label).Set(float64(len(mrs))) + + var numReady, numSynced float64 = 0, 0 + for _, o := range mrs { + if o.GetCondition(xpv1.TypeReady).Status == corev1.ConditionTrue { + numReady++ + } + + if o.GetCondition(xpv1.TypeSynced).Status == corev1.ConditionTrue { + numSynced++ + } + } + + r.metrics.Ready.WithLabelValues(label).Set(numReady) + r.metrics.Synced.WithLabelValues(label).Set(numSynced) + + return nil +} + +// Start records state of managed resources with given interval. +func (r *MRStateRecorder) Start(ctx context.Context) error { + ticker := time.NewTicker(r.interval) + for { + select { + case <-ticker.C: + if err := r.Record(ctx, r.managedList); err != nil { + return err + } + case <-ctx.Done(): + ticker.Stop() + return nil + } + } +} diff --git a/pkg/statemetrics/state_recorder.go b/pkg/statemetrics/state_recorder.go new file mode 100644 index 000000000..8604e81ff --- /dev/null +++ b/pkg/statemetrics/state_recorder.go @@ -0,0 +1,46 @@ +/* +Copyright 2024 The Crossplane Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +// Package statemetrics contains utilities for recording Crossplane resource state metrics. +package statemetrics + +import ( + "context" + + "k8s.io/apimachinery/pkg/runtime/schema" +) + +const subSystem = "crossplane" + +// A StateRecorder records the state of given GroupVersionKind. +type StateRecorder interface { + Record(ctx context.Context, gvk schema.GroupVersionKind) + Start(ctx context.Context) error +} + +// A NopStateRecorder does nothing. +type NopStateRecorder struct{} + +// NewNopStateRecorder returns a NopStateRecorder that does nothing. +func NewNopStateRecorder() *NopStateRecorder { + return &NopStateRecorder{} +} + +// Record does nothing. +func (r *NopStateRecorder) Record(_ context.Context, _ schema.GroupVersionKind) {} + +// Start does nothing. +func (r *NopStateRecorder) Start(_ context.Context) error { return nil }