Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

✨ Detect OOMkilled status for the operator itself #939

Merged
merged 12 commits into from
Nov 20, 2023
2 changes: 2 additions & 0 deletions api/v1alpha2/mondooauditconfig_types.go
Original file line number Diff line number Diff line change
Expand Up @@ -199,6 +199,8 @@ const (
AdmissionDegraded MondooAuditConfigConditionType = "AdmissionDegraded"
// Indicates whether the Admission controller is Degraded because of the ScanAPI
ScanAPIDegraded MondooAuditConfigConditionType = "ScanAPIDegraded"
// Indicates whether the operator itself is Degraded
MondooOperaotrDegraded MondooAuditConfigConditionType = "MondooOperatorDegraded"
// MondooIntegrationDegraded will hold the status for any issues encountered while trying to CheckIn()
// on behalf of the Mondoo integration MRN
MondooIntegrationDegraded MondooAuditConfigConditionType = "IntegrationDegraded"
Expand Down
22 changes: 22 additions & 0 deletions cmd/mondoo-operator/operator/cmd.go
Original file line number Diff line number Diff line change
Expand Up @@ -135,6 +135,28 @@ func init() {
return err
}

// Check whether the mondoo-operator crashed because of OOMKilled
setupLog.Info("Checking whether mondoo-operator was terminated before")

k8sConfig, err := ctrl.GetConfig()
if err != nil {
setupLog.Error(err, "unable to get k8s config")
return err
}
// use separate client to prevent errors due to cache
// "the cache is not started, can not read objects"
// https://sdk.operatorframework.io/docs/building-operators/golang/references/client/#non-default-client
client, err := client.New(k8sConfig, client.Options{Scheme: scheme})
if err != nil {
setupLog.Error(err, "unable to create non-caching k8s client")
return err
}
err = checkForTerminatedState(ctx, client, v, setupLog)
if err != nil {
setupLog.Error(err, "unable to check for terminated state of mondoo-operator-controller")
return err
}

if err = resource_monitor.RegisterResourceMonitors(mgr, scanApiStore); err != nil {
setupLog.Error(err, "unable to register resource monitors", "controller", "resource_monitor")
return err
Expand Down
118 changes: 118 additions & 0 deletions cmd/mondoo-operator/operator/operator_status.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,118 @@
// Copyright (c) Mondoo, Inc.
// SPDX-License-Identifier: BUSL-1.1

package operator

import (
"context"

"github.com/go-logr/logr"
"k8s.io/apimachinery/pkg/api/errors"

k8sv1alpha2 "go.mondoo.com/mondoo-operator/api/v1alpha2"
"go.mondoo.com/mondoo-operator/controllers"
"go.mondoo.com/mondoo-operator/controllers/status"
"go.mondoo.com/mondoo-operator/pkg/utils/k8s"
"go.mondoo.com/mondoo-operator/pkg/utils/mondoo"
corev1 "k8s.io/api/core/v1"
"k8s.io/apimachinery/pkg/labels"
"k8s.io/apimachinery/pkg/types"
k8sversion "k8s.io/apimachinery/pkg/version"
"sigs.k8s.io/controller-runtime/pkg/client"
)

// checkForTerminatedState records, on every MondooAuditConfig, whether the
// mondoo-operator "manager" container has been terminated before.
//
// For each MondooAuditConfig it lists the pods labelled
// "app.kubernetes.io/name=mondoo-operator" in the audit config's namespace,
// picks the newest one and inspects the status of its "manager" container:
//   - a current or last terminated state marks the operator as degraded;
//   - a restart count of zero marks the operator as available;
//   - anything else leaves the existing condition untouched.
//
// Whenever the condition was touched, the MondooAuditConfig status is updated
// through nonCacheClient and the status is reported upstream.
//
// nonCacheClient is used for all reads and writes; v is only forwarded to the
// status reporter. The first error encountered is logged and returned. A
// missing MondooOperatorConfig is not an error; the zero-value config is used.
func checkForTerminatedState(ctx context.Context, nonCacheClient client.Client, v *k8sversion.Info, logger logr.Logger) error {
	statusReport := status.NewStatusReporter(nonCacheClient, controllers.MondooClientBuilder, v)

	config := &k8sv1alpha2.MondooOperatorConfig{}
	if err := nonCacheClient.Get(ctx, types.NamespacedName{Name: k8sv1alpha2.MondooOperatorConfigName}, config); err != nil {
		if !errors.IsNotFound(err) {
			logger.Error(err, "Failed to check for MondooOpertorConfig")
			return err
		}
		logger.Info("MondooOperatorConfig not found, using defaults")
	}

	mondooAuditConfigs := &k8sv1alpha2.MondooAuditConfigList{}
	if err := nonCacheClient.List(ctx, mondooAuditConfigs); err != nil {
		logger.Error(err, "error listing MondooAuditConfigs")
		return err
	}

	// Index into the slice instead of ranging by value so we neither copy each
	// item nor take the address of the loop variable.
	for i := range mondooAuditConfigs.Items {
		mondooAuditConfig := &mondooAuditConfigs.Items[i]
		// Untouched snapshot handed to the status update alongside the modified object.
		mondooAuditConfigCopy := mondooAuditConfig.DeepCopy()

		podList := &corev1.PodList{}
		listOpts := &client.ListOptions{
			Namespace: mondooAuditConfig.Namespace,
			LabelSelector: labels.SelectorFromSet(map[string]string{
				"app.kubernetes.io/name": "mondoo-operator",
			}),
		}
		if err := nonCacheClient.List(ctx, podList, listOpts); err != nil {
			logger.Error(err, "failed to list pods", "Mondoo.Namespace", mondooAuditConfig.Namespace, "Mondoo.Name", mondooAuditConfig.Name)
			return err
		}

		// A MondooAuditConfig may live in a namespace without any operator pod.
		// There is nothing to inspect then, and picking the "newest" pod from an
		// empty list must not be attempted.
		if len(podList.Items) == 0 {
			logger.Info("no mondoo-operator pod found, skipping terminated state check", "Mondoo.Namespace", mondooAuditConfig.Namespace, "Mondoo.Name", mondooAuditConfig.Name)
			continue
		}
		currentPod := k8s.GetNewestPodFromList(podList)
		if currentPod == nil {
			continue
		}

		for _, containerStatus := range currentPod.Status.ContainerStatuses {
			if containerStatus.Name != "manager" {
				continue
			}

			switch {
			case containerStatus.State.Terminated != nil || containerStatus.LastTerminationState.Terminated != nil:
				logger.Info("mondoo-operator was terminated before")
				updateOperatorConditions(mondooAuditConfig, true, currentPod)
			case containerStatus.RestartCount == 0:
				// Neither terminated state is set here, so the container is on its first run.
				logger.Info("mondoo-operator is running or starting", "state", containerStatus.State)
				updateOperatorConditions(mondooAuditConfig, false, &corev1.Pod{})
			default:
				// Restarted without a recorded termination: leave the condition as it is.
				continue
			}

			if err := mondoo.UpdateMondooAuditStatus(ctx, nonCacheClient, mondooAuditConfigCopy, mondooAuditConfig, logger); err != nil {
				logger.Error(err, "failed to update status for MondooAuditConfig")
				return err
			}
			// Report upstream before we get OOMkilled again
			if err := statusReport.Report(ctx, *mondooAuditConfig, *config); err != nil {
				logger.Error(err, "failed to report status upstream")
				return err
			}
			break
		}
	}
	return nil
}

// updateOperatorConditions sets the MondooOperaotrDegraded condition on config.
//
// With degradedStatus == false the condition is set to ConditionFalse with the
// "MondooOperatorAvailable" reason; pod is not read in that case.
//
// With degradedStatus == true the condition is set to ConditionTrue with the
// "MondooOperatorUnavailable" reason. If any container of pod has a current or
// last terminated state with exit code 137, the message additionally names OOM
// as the cause, the pod name is recorded as affected, and the memory limit of
// the container with the same name is recorded. The memory limit stays empty
// when the pod spec has no container of that name.
func updateOperatorConditions(config *k8sv1alpha2.MondooAuditConfig, degradedStatus bool, pod *corev1.Pod) {
	msg := "Mondoo Operator controller is available"
	reason := "MondooOperatorAvailable"
	status := corev1.ConditionFalse
	updateCheck := mondoo.UpdateConditionIfReasonOrMessageChange
	affectedPods := []string{}
	memoryLimit := ""

	if degradedStatus {
		msg = "Mondoo Operator controller is unavailable"
		reason = "MondooOperatorUnavailable"
		status = corev1.ConditionTrue

		if pod != nil {
			for _, containerStatus := range pod.Status.ContainerStatuses {
				lastTerminated := containerStatus.LastTerminationState.Terminated
				nowTerminated := containerStatus.State.Terminated
				killed := (lastTerminated != nil && lastTerminated.ExitCode == 137) ||
					(nowTerminated != nil && nowTerminated.ExitCode == 137)
				if !killed {
					continue
				}

				msg = "Mondoo Operator controller is unavailable due to OOM"
				affectedPods = append(affectedPods, pod.Name)

				// Match the container by name: the status slice and the spec slice
				// are separate lists, so the status index must not be used to index
				// into the spec (it could pick the wrong container or be out of range).
				for j := range pod.Spec.Containers {
					if pod.Spec.Containers[j].Name == containerStatus.Name {
						memoryLimit = pod.Spec.Containers[j].Resources.Limits.Memory().String()
						break
					}
				}
				break
			}
		}
	}

	config.Status.Conditions = mondoo.SetMondooAuditCondition(config.Status.Conditions, k8sv1alpha2.MondooOperaotrDegraded, status, reason, msg, updateCheck, affectedPods, memoryLimit)
}
160 changes: 160 additions & 0 deletions cmd/mondoo-operator/operator/operator_status_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,160 @@
// Copyright (c) Mondoo, Inc.
// SPDX-License-Identifier: BUSL-1.1

package operator

import (
"context"
"testing"
"time"

"github.com/go-logr/zapr"
"github.com/golang/mock/gomock"
"github.com/stretchr/testify/suite"
"go.uber.org/zap"

corev1 "k8s.io/api/core/v1"
"k8s.io/apimachinery/pkg/api/resource"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/runtime"
"k8s.io/apimachinery/pkg/version"
clientgoscheme "k8s.io/client-go/kubernetes/scheme"
"sigs.k8s.io/controller-runtime/pkg/client"
"sigs.k8s.io/controller-runtime/pkg/client/fake"

mondoov1alpha2 "go.mondoo.com/mondoo-operator/api/v1alpha2"
scanapistoremock "go.mondoo.com/mondoo-operator/controllers/resource_monitor/scan_api_store/mock"
"go.mondoo.com/mondoo-operator/controllers/scanapi"
"go.mondoo.com/mondoo-operator/pkg/utils/mondoo"
fakeMondoo "go.mondoo.com/mondoo-operator/pkg/utils/mondoo/fake"
"go.mondoo.com/mondoo-operator/pkg/utils/test"
"go.mondoo.com/mondoo-operator/tests/framework/utils"
)

// DeploymentHandlerSuite is the testify suite holding the fixtures for the
// operator status tests in this file.
type DeploymentHandlerSuite struct {
	suite.Suite

	// Suite-wide fixtures, assigned in SetupSuite.
	ctx                    context.Context
	scheme                 *runtime.Scheme
	containerImageResolver mondoo.ContainerImageResolver
	mockCtrl               *gomock.Controller
	scanApiStoreMock       *scanapistoremock.MockScanApiStore

	// Per-test fixtures, reassigned in BeforeTest.
	auditConfig       mondoov1alpha2.MondooAuditConfig
	fakeClientBuilder *fake.ClientBuilder
}

// SetupSuite initialises the suite-wide fixtures: a background context, the
// client-go scheme extended with the Mondoo v1alpha2 types, a no-op container
// image resolver, and a gomock controller with a mocked scan API store.
func (s *DeploymentHandlerSuite) SetupSuite() {
	s.ctx = context.Background()

	// Register the Mondoo API types on the shared client-go scheme.
	s.scheme = clientgoscheme.Scheme
	registerErr := mondoov1alpha2.AddToScheme(s.scheme)
	s.Require().NoError(registerErr)

	s.containerImageResolver = fakeMondoo.NewNoOpContainerImageResolver()

	ctrl := gomock.NewController(s.T())
	s.mockCtrl = ctrl
	s.scanApiStoreMock = scanapistoremock.NewMockScanApiStore(ctrl)
}

// BeforeTest rebuilds the per-test fixtures: a fresh default audit config and
// a fake client builder pre-seeded with the scan API token secret for that
// audit config and the kube-system namespace object.
func (s *DeploymentHandlerSuite) BeforeTest(suiteName, testName string) {
	s.auditConfig = utils.DefaultAuditConfig("mondoo-operator", true, false, false, false)

	tokenSecret := &corev1.Secret{
		ObjectMeta: metav1.ObjectMeta{
			Name:      scanapi.TokenSecretName(s.auditConfig.Name),
			Namespace: s.auditConfig.Namespace,
		},
		Data: map[string][]byte{"token": []byte("token")},
	}

	s.fakeClientBuilder = fake.NewClientBuilder().WithObjects(tokenSecret, test.TestKubeSystemNamespace())
}

// AfterTest finishes the gomock controller once a test has run.
func (s *DeploymentHandlerSuite) AfterTest(suiteName, testName string) {
	ctrl := s.mockCtrl
	ctrl.Finish()
}

// TestOOMDetect runs checkForTerminatedState twice against a fake client.
//
// First pass: the operator pod's "manager" container has a last termination
// state with exit code 137, so the first condition on the MondooAuditConfig
// must report the OOM message, the affected pod and the container memory limit.
//
// Second pass: the termination state is cleared and the container is marked
// running, so the same condition must flip back to "available".
//
// Preconditions for indexing (no error, exactly one audit config, at least one
// condition) use Require so a failure stops the test instead of panicking on
// an out-of-range index.
func (s *DeploymentHandlerSuite) TestOOMDetect() {
	mondooAuditConfig := &s.auditConfig

	oomPod := &corev1.Pod{
		ObjectMeta: metav1.ObjectMeta{
			Name:      "mondoo-operator-123",
			Namespace: s.auditConfig.Namespace,
			Labels:    map[string]string{"app.kubernetes.io/name": "mondoo-operator"},
			CreationTimestamp: metav1.Time{
				Time: time.Now(),
			},
		},
		Spec: corev1.PodSpec{
			Containers: []corev1.Container{
				{
					Name: "manager",
					Resources: corev1.ResourceRequirements{
						Limits: corev1.ResourceList{
							corev1.ResourceMemory: *resource.NewQuantity(1, resource.BinarySI),
						},
					},
				},
			},
		},
		Status: corev1.PodStatus{
			ContainerStatuses: []corev1.ContainerStatus{
				{
					Name: "manager",
					LastTerminationState: corev1.ContainerState{
						Terminated: &corev1.ContainerStateTerminated{
							ExitCode: 137,
						},
					},
				},
			},
		},
	}

	// This is needed because of https://github.com/kubernetes-sigs/controller-runtime/issues/2362
	objs := []client.Object{mondooAuditConfig, oomPod}
	k8sClient := s.fakeClientBuilder.WithScheme(clientgoscheme.Scheme).WithStatusSubresource(objs...).WithObjects(objs...).Build()

	v := &version.Info{}
	cfg := zap.NewDevelopmentConfig()
	cfg.InitialFields = map[string]interface{}{
		"controller": "terminated-test",
	}
	zapLog, err := cfg.Build()
	s.Require().NoError(err, "failed to set up logging for test cases")
	testLogger := zapr.NewLogger(zapLog)

	// listAuditConfigs fetches all MondooAuditConfigs and fails the test right
	// away unless there is exactly one with at least one condition, so the
	// callers below can index safely.
	listAuditConfigs := func() *mondoov1alpha2.MondooAuditConfigList {
		list := &mondoov1alpha2.MondooAuditConfigList{}
		s.Require().NoError(k8sClient.List(s.ctx, list))
		s.Require().Len(list.Items, 1)
		s.Require().NotEmpty(list.Items[0].Status.Conditions)
		return list
	}

	// First pass: the pod reports a previous termination with exit code 137.
	s.Require().NoError(checkForTerminatedState(s.ctx, k8sClient, v, testLogger))

	condition := listAuditConfigs().Items[0].Status.Conditions[0]
	s.Equal("Mondoo Operator controller is unavailable due to OOM", condition.Message)
	s.Len(condition.AffectedPods, 1)
	s.Contains(condition.AffectedPods, "mondoo-operator-123")
	containerMemory := oomPod.Spec.Containers[0].Resources.Limits.Memory()
	s.Equal(containerMemory.String(), condition.MemoryLimit)
	s.Equal("MondooOperatorUnavailable", condition.Reason)
	s.Equal(corev1.ConditionTrue, condition.Status)

	// Second pass: clear the termination record and mark the container running.
	oomPod.Status.ContainerStatuses[0].LastTerminationState = corev1.ContainerState{}
	oomPod.Status.ContainerStatuses[0].State.Running = &corev1.ContainerStateRunning{}
	s.Require().NoError(k8sClient.Status().Update(s.ctx, oomPod))

	s.Require().NoError(checkForTerminatedState(s.ctx, k8sClient, v, testLogger))

	condition = listAuditConfigs().Items[0].Status.Conditions[0]
	s.Equal("Mondoo Operator controller is available", condition.Message)
	s.Len(condition.AffectedPods, 0)
	s.Equal("", condition.MemoryLimit)
	s.Equal("MondooOperatorAvailable", condition.Reason)
	s.Equal(corev1.ConditionFalse, condition.Status)
}

func TestOperatorSuite(t *testing.T) {
suite.Run(t, new(DeploymentHandlerSuite))
}
1 change: 0 additions & 1 deletion controllers/metrics/metrics.go
Original file line number Diff line number Diff line change
Expand Up @@ -70,7 +70,6 @@ func (mr *MetricsReconciler) Start(ctx context.Context) error {
}

func (mr *MetricsReconciler) metricsLoop() {
mr.log.Info("Updating metrics")
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This one was filling up the logs. We don't have a debug level, so I removed it.

mondooAuditConfigs := &v1alpha2.MondooAuditConfigList{}
if err := mr.Client.List(mr.ctx, mondooAuditConfigs); err != nil {
mr.log.Error(err, "error listing MondooAuditConfigs")
Expand Down
5 changes: 5 additions & 0 deletions controllers/mondooauditconfig_controller_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -260,6 +260,11 @@ func TestTokenRegistration(t *testing.T) {
Identifier: status.ScanApiIdentifier,
Status: mondooclient.MessageStatus_MESSAGE_INFO,
},
{
Message: "No status reported yet",
Identifier: status.MondooOperatorIdentifier,
Status: mondooclient.MessageStatus_MESSAGE_UNKNOWN,
},
},
},
LastState: status.OperatorCustomState{
Expand Down
Loading