Skip to content

Commit

Permalink
Fix error when the operator metrics ServiceMonitor already exists
Browse files Browse the repository at this point in the history
Signed-off-by: Israel Blancas <iblancas@redhat.com>
  • Loading branch information
iblancasa committed Nov 11, 2024
1 parent 7c79f2d commit fe7f908
Show file tree
Hide file tree
Showing 4 changed files with 148 additions and 10 deletions.
19 changes: 19 additions & 0 deletions .chloggen/3446.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
# One of 'breaking', 'deprecation', 'new_component', 'enhancement', 'bug_fix'
change_type: 'bug_fix'

# The name of the component, or a single word describing the area of concern, (e.g. collector, target allocator, auto-instrumentation, opamp, github action)
component: operator

# A brief description of the change. Surround your text with quotes ("") if it needs to start with a backtick (`).
note: Fix a crash on operator pod restart when the Service Monitor for the operator metrics had already been created by another operator pod.

# One or more tracking issues related to the change
issues: [3446]

# (Optional) One or more lines of additional information to render under the primary note.
# These lines will be padded with 2 spaces and then inserted directly into the document.
# Use pipe (|) for multiline entries.
subtext: |
  Previously, the operator failed when the pod was restarted and the Service Monitor for operator metrics had already been created by another operator pod.
  The operator now sets an owner reference on the Service Monitor and tolerates the case where the Service Monitor already exists.
51 changes: 48 additions & 3 deletions internal/operator-metrics/metrics.go
Original file line number Diff line number Diff line change
Expand Up @@ -19,8 +19,11 @@ import (
"fmt"
"os"

"github.com/go-logr/logr"
monitoringv1 "github.com/prometheus-operator/prometheus-operator/pkg/apis/monitoring/v1"
appsv1 "k8s.io/api/apps/v1"
corev1 "k8s.io/api/core/v1"
apierrors "k8s.io/apimachinery/pkg/api/errors"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/runtime"
"k8s.io/apimachinery/pkg/util/intstr"
Expand Down Expand Up @@ -51,26 +54,35 @@ var _ manager.Runnable = &OperatorMetrics{}

type OperatorMetrics struct {
kubeClient client.Client
log logr.Logger
}

func NewOperatorMetrics(config *rest.Config, scheme *runtime.Scheme) (OperatorMetrics, error) {
func NewOperatorMetrics(config *rest.Config, scheme *runtime.Scheme, log logr.Logger) (OperatorMetrics, error) {
kubeClient, err := client.New(config, client.Options{Scheme: scheme})
if err != nil {
return OperatorMetrics{}, err
}

return OperatorMetrics{
kubeClient: kubeClient,
log: log,
}, nil
}

func (om OperatorMetrics) Start(ctx context.Context) error {
rawNamespace, err := os.ReadFile(namespaceFile)
if err != nil {
return fmt.Errorf("error reading namespace file: %w", err)
om.log.Error(err, "error reading namespace file. No ServiceMonitor will be created")
return nil
}
namespace := string(rawNamespace)

ownerRef, err := om.getOwnerReferences(ctx, namespace)
if err != nil {
om.log.Error(err, "error getting owner references. No ServiceMonitor will be created")
return nil
}

var tlsConfig *monitoringv1.TLSConfig

if om.caConfigMapExists() {
Expand Down Expand Up @@ -101,6 +113,7 @@ func (om OperatorMetrics) Start(ctx context.Context) error {
"app.kubernetes.io/part-of": "opentelemetry-operator",
"control-plane": "controller-manager",
},
OwnerReferences: []metav1.OwnerReference{ownerRef},
},
Spec: monitoringv1.ServiceMonitorSpec{
Selector: metav1.LabelSelector{
Expand All @@ -123,7 +136,8 @@ func (om OperatorMetrics) Start(ctx context.Context) error {
}

err = om.kubeClient.Create(ctx, &sm)
if err != nil {
// The ServiceMonitor can be already there if this is a restart
if err != nil && !apierrors.IsAlreadyExists(err) {
return fmt.Errorf("error creating service monitor: %w", err)
}

Expand All @@ -143,3 +157,34 @@ func (om OperatorMetrics) caConfigMapExists() bool {
}, &corev1.ConfigMap{},
) == nil
}

// getOwnerReferences looks up the operator Deployment in the given namespace
// (matched by its well-known labels) and builds an OwnerReference pointing at
// it, so that the ServiceMonitor created for operator metrics is garbage
// collected together with the operator Deployment.
func (om OperatorMetrics) getOwnerReferences(ctx context.Context, namespace string) (metav1.OwnerReference, error) {
	var deploymentList appsv1.DeploymentList

	listOptions := []client.ListOption{
		client.InNamespace(namespace),
		client.MatchingLabels(map[string]string{
			"app.kubernetes.io/name": "opentelemetry-operator",
			"control-plane":          "controller-manager",
		}),
	}

	// Wrap the error so the caller's log line says what we were doing and where.
	if err := om.kubeClient.List(ctx, &deploymentList, listOptions...); err != nil {
		return metav1.OwnerReference{}, fmt.Errorf("listing operator deployments in namespace %q: %w", namespace, err)
	}

	if len(deploymentList.Items) == 0 {
		return metav1.OwnerReference{}, fmt.Errorf("no deployments found with the specified label in namespace %q", namespace)
	}
	// If more than one Deployment matches, any item identifies a valid owner;
	// use the first one.
	deployment := &deploymentList.Items[0]

	ownerRef := metav1.OwnerReference{
		APIVersion: "apps/v1",
		Kind:       "Deployment",
		Name:       deployment.Name,
		UID:        deployment.UID,
	}

	return ownerRef, nil
}
86 changes: 80 additions & 6 deletions internal/operator-metrics/metrics_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -17,26 +17,30 @@ package operatormetrics
import (
"context"
"os"
"reflect"
"testing"
"time"

"github.com/go-logr/logr"
monitoringv1 "github.com/prometheus-operator/prometheus-operator/pkg/apis/monitoring/v1"
"github.com/stretchr/testify/assert"
"github.com/stretchr/testify/require"
appsv1 "k8s.io/api/apps/v1"
corev1 "k8s.io/api/core/v1"
apierrors "k8s.io/apimachinery/pkg/api/errors"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/runtime"
"k8s.io/apimachinery/pkg/types"
"k8s.io/apimachinery/pkg/util/wait"
"k8s.io/client-go/rest"
"sigs.k8s.io/controller-runtime/pkg/client"
"sigs.k8s.io/controller-runtime/pkg/client/fake"
)

func TestNewOperatorMetrics(t *testing.T) {
config := &rest.Config{}
scheme := runtime.NewScheme()
metrics, err := NewOperatorMetrics(config, scheme)
metrics, err := NewOperatorMetrics(config, scheme, logr.Discard())
assert.NoError(t, err)
assert.NotNil(t, metrics.kubeClient)
}
Expand All @@ -53,12 +57,15 @@ func TestOperatorMetrics_Start(t *testing.T) {
namespaceFile = tmpFile.Name()

scheme := runtime.NewScheme()
err = corev1.AddToScheme(scheme)
require.NoError(t, err)
err = monitoringv1.AddToScheme(scheme)
require.NoError(t, err)
require.NoError(t, corev1.AddToScheme(scheme))
require.NoError(t, appsv1.AddToScheme(scheme))
require.NoError(t, monitoringv1.AddToScheme(scheme))

client := fake.NewClientBuilder().WithScheme(scheme).Build()
client := fake.NewClientBuilder().WithScheme(scheme).WithRuntimeObjects(
&appsv1.Deployment{
ObjectMeta: metav1.ObjectMeta{Name: "opentelemetry-operator", Namespace: "test-namespace", Labels: map[string]string{"app.kubernetes.io/name": "opentelemetry-operator", "control-plane": "controller-manager"}},
},
).Build()

metrics := OperatorMetrics{kubeClient: client}

Expand Down Expand Up @@ -125,3 +132,70 @@ func TestOperatorMetrics_caConfigMapExists(t *testing.T) {
metricsWithoutConfigMap := OperatorMetrics{kubeClient: clientWithoutConfigMap}
assert.False(t, metricsWithoutConfigMap.caConfigMapExists())
}

// TestOperatorMetrics_getOwnerReferences verifies that the operator Deployment
// is resolved into an OwnerReference, and that an error is returned when no
// matching Deployment exists in the namespace.
func TestOperatorMetrics_getOwnerReferences(t *testing.T) {
	tests := []struct {
		name      string
		namespace string
		objects   []client.Object
		want      metav1.OwnerReference
		wantErr   bool
	}{
		{
			name:      "successful owner reference retrieval",
			namespace: "test-namespace",
			objects: []client.Object{
				&appsv1.Deployment{
					ObjectMeta: metav1.ObjectMeta{
						Name:      "opentelemetry-operator",
						Namespace: "test-namespace",
						UID:       "test-uid",
						Labels: map[string]string{
							"app.kubernetes.io/name": "opentelemetry-operator",
							"control-plane":          "controller-manager",
						},
					},
				},
			},
			want: metav1.OwnerReference{
				APIVersion: "apps/v1",
				Kind:       "Deployment",
				Name:       "opentelemetry-operator",
				UID:        "test-uid",
			},
			wantErr: false,
		},
		{
			name:      "no deployments found",
			namespace: "test-namespace",
			objects:   []client.Object{},
			want:      metav1.OwnerReference{},
			wantErr:   true,
		},
	}

	for _, tt := range tests {
		t.Run(tt.name, func(t *testing.T) {
			scheme := runtime.NewScheme()
			// Fail fast on scheme registration errors instead of ignoring them,
			// consistent with the other tests in this file.
			require.NoError(t, appsv1.AddToScheme(scheme))
			fakeClient := fake.NewClientBuilder().
				WithScheme(scheme).
				WithObjects(tt.objects...).
				Build()

			om := OperatorMetrics{
				kubeClient: fakeClient,
				log:        logr.Discard(),
			}

			got, err := om.getOwnerReferences(context.Background(), tt.namespace)
			if (err != nil) != tt.wantErr {
				t.Errorf("getOwnerReferences() error = %v, wantErr %v", err, tt.wantErr)
				return
			}
			if !reflect.DeepEqual(got, tt.want) {
				t.Errorf("getOwnerReferences() got = %v, want %v", got, tt.want)
			}
		})
	}
}
2 changes: 1 addition & 1 deletion main.go
Original file line number Diff line number Diff line change
Expand Up @@ -424,7 +424,7 @@ func main() {
}

if cfg.PrometheusCRAvailability() == prometheus.Available {
operatorMetrics, opError := operatormetrics.NewOperatorMetrics(mgr.GetConfig(), scheme)
operatorMetrics, opError := operatormetrics.NewOperatorMetrics(mgr.GetConfig(), scheme, ctrl.Log.WithName("operator-metrics-sm"))
if opError != nil {
setupLog.Error(opError, "Failed to create the operator metrics SM")
}
Expand Down

0 comments on commit fe7f908

Please sign in to comment.