Skip to content

Commit

Permalink
✨ Detect OOMkilled status for the operator itself (#939)
Browse files Browse the repository at this point in the history
* ✨ Report on OOMkilled status
* ✨ Detect OOMkilled status for the operator itself

---------

Signed-off-by: Christian Zunker <christian@mondoo.com>
  • Loading branch information
czunker authored Nov 20, 2023
1 parent 0286e14 commit 4d7bf13
Show file tree
Hide file tree
Showing 12 changed files with 460 additions and 6 deletions.
2 changes: 2 additions & 0 deletions api/v1alpha2/mondooauditconfig_types.go
Original file line number Diff line number Diff line change
Expand Up @@ -199,6 +199,8 @@ const (
AdmissionDegraded MondooAuditConfigConditionType = "AdmissionDegraded"
// Indicates whether the Admission controller is Degraded because of the ScanAPI
ScanAPIDegraded MondooAuditConfigConditionType = "ScanAPIDegraded"
// Indicates whether the operator itself is Degraded
MondooOperaotrDegraded MondooAuditConfigConditionType = "MondooOperatorDegraded"
// MondooIntegrationDegraded will hold the status for any issues encountered while trying to CheckIn()
// on behalf of the Mondoo integration MRN
MondooIntegrationDegraded MondooAuditConfigConditionType = "IntegrationDegraded"
Expand Down
22 changes: 22 additions & 0 deletions cmd/mondoo-operator/operator/cmd.go
Original file line number Diff line number Diff line change
Expand Up @@ -135,6 +135,28 @@ func init() {
return err
}

// Check whether the mondoo-operator crashed because of OOMKilled
setupLog.Info("Checking whether mondoo-operator was terminated before")

k8sConfig, err := ctrl.GetConfig()
if err != nil {
setupLog.Error(err, "unable to get k8s config")
return err
}
// use separate client to prevent errors due to cache
// "the cache is not started, can not read objects"
// https://sdk.operatorframework.io/docs/building-operators/golang/references/client/#non-default-client
client, err := client.New(k8sConfig, client.Options{Scheme: scheme})
if err != nil {
setupLog.Error(err, "unable to create non-caching k8s client")
return err
}
err = checkForTerminatedState(ctx, client, v, setupLog)
if err != nil {
setupLog.Error(err, "unable to check for terminated state of mondoo-operator-controller")
return err
}

if err = resource_monitor.RegisterResourceMonitors(mgr, scanApiStore); err != nil {
setupLog.Error(err, "unable to register resource monitors", "controller", "resource_monitor")
return err
Expand Down
118 changes: 118 additions & 0 deletions cmd/mondoo-operator/operator/operator_status.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,118 @@
// Copyright (c) Mondoo, Inc.
// SPDX-License-Identifier: BUSL-1.1

package operator

import (
"context"

"github.com/go-logr/logr"
"k8s.io/apimachinery/pkg/api/errors"

k8sv1alpha2 "go.mondoo.com/mondoo-operator/api/v1alpha2"
"go.mondoo.com/mondoo-operator/controllers"
"go.mondoo.com/mondoo-operator/controllers/status"
"go.mondoo.com/mondoo-operator/pkg/utils/k8s"
"go.mondoo.com/mondoo-operator/pkg/utils/mondoo"
corev1 "k8s.io/api/core/v1"
"k8s.io/apimachinery/pkg/labels"
"k8s.io/apimachinery/pkg/types"
k8sversion "k8s.io/apimachinery/pkg/version"
"sigs.k8s.io/controller-runtime/pkg/client"
)

// checkForTerminatedState records, on every MondooAuditConfig, whether the
// "manager" container of the newest mondoo-operator pod in that config's
// namespace was terminated before, and reports the resulting status upstream.
//
// For each MondooAuditConfig it lists the pods labelled
// "app.kubernetes.io/name=mondoo-operator" in the config's namespace, picks the
// newest one and looks at the "manager" container status:
//   - a current or last terminated state marks the operator as degraded;
//   - a restart count of zero marks the operator as available;
//   - anything else leaves the conditions untouched.
//
// nonCacheClient is used for all reads and writes; v is handed to the status
// reporter. The first error encountered (reading the operator config, listing
// objects, updating the status or reporting upstream) is logged and returned.
// A missing MondooOperatorConfig is not an error: the zero-value config is used.
func checkForTerminatedState(ctx context.Context, nonCacheClient client.Client, v *k8sversion.Info, logger logr.Logger) error {
	statusReport := status.NewStatusReporter(nonCacheClient, controllers.MondooClientBuilder, v)

	config := &k8sv1alpha2.MondooOperatorConfig{}
	if err := nonCacheClient.Get(ctx, types.NamespacedName{Name: k8sv1alpha2.MondooOperatorConfigName}, config); err != nil {
		if !errors.IsNotFound(err) {
			logger.Error(err, "Failed to check for MondooOperatorConfig")
			return err
		}
		logger.Info("MondooOperatorConfig not found, using defaults")
	}

	mondooAuditConfigs := &k8sv1alpha2.MondooAuditConfigList{}
	if err := nonCacheClient.List(ctx, mondooAuditConfigs); err != nil {
		logger.Error(err, "error listing MondooAuditConfigs")
		return err
	}

	// Iterate by index so the pointer handed to the helpers refers to the list
	// element itself rather than to a shared range variable.
	for i := range mondooAuditConfigs.Items {
		mondooAuditConfig := &mondooAuditConfigs.Items[i]
		// Untouched snapshot used as the "before" state for the status update.
		mondooAuditConfigCopy := mondooAuditConfig.DeepCopy()

		podList := &corev1.PodList{}
		listOpts := &client.ListOptions{
			Namespace: mondooAuditConfig.Namespace,
			LabelSelector: labels.SelectorFromSet(map[string]string{
				"app.kubernetes.io/name": "mondoo-operator",
			}),
		}
		if err := nonCacheClient.List(ctx, podList, listOpts); err != nil {
			logger.Error(err, "failed to list pods", "Mondoo.Namespace", mondooAuditConfig.Namespace, "Mondoo.Name", mondooAuditConfig.Name)
			return err
		}

		currentPod := k8s.GetNewestPodFromList(podList)
		if currentPod == nil {
			// NOTE(review): what the helper returns for an empty list is not
			// visible here; guard so a namespace without operator pods cannot
			// cause a nil dereference during start-up.
			continue
		}

		for _, containerStatus := range currentPod.Status.ContainerStatuses {
			if containerStatus.Name != "manager" {
				continue
			}

			switch {
			case containerStatus.State.Terminated != nil || containerStatus.LastTerminationState.Terminated != nil:
				logger.Info("mondoo-operator was terminated before")
				updateOperatorConditions(mondooAuditConfig, true, currentPod)
			case containerStatus.RestartCount == 0:
				logger.Info("mondoo-operator is running or starting", "state", containerStatus.State)
				updateOperatorConditions(mondooAuditConfig, false, &corev1.Pod{})
			default:
				// Restarted without any recorded termination: nothing to update.
				continue
			}

			if err := mondoo.UpdateMondooAuditStatus(ctx, nonCacheClient, mondooAuditConfigCopy, mondooAuditConfig, logger); err != nil {
				logger.Error(err, "failed to update status for MondooAuditConfig")
				return err
			}
			// Report upstream before we get OOMkilled again
			if err := statusReport.Report(ctx, *mondooAuditConfig, *config); err != nil {
				logger.Error(err, "failed to report status upstream")
				return err
			}
			break
		}
	}
	return nil
}

// updateOperatorConditions sets the MondooOperaotrDegraded condition on config.
//
// When degradedStatus is false the condition is set to False with reason
// "MondooOperatorAvailable". When it is true the condition is set to True with
// reason "MondooOperatorUnavailable"; if additionally any container status of
// pod shows exit code 137 in its current or last terminated state, the message
// mentions OOM, the pod name is recorded as affected pod and the memory limit
// of the matching container is recorded.
//
// pod is only inspected when degradedStatus is true; a nil pod is tolerated.
func updateOperatorConditions(config *k8sv1alpha2.MondooAuditConfig, degradedStatus bool, pod *corev1.Pod) {
	msg := "Mondoo Operator controller is available"
	reason := "MondooOperatorAvailable"
	status := corev1.ConditionFalse
	updateCheck := mondoo.UpdateConditionIfReasonOrMessageChange
	affectedPods := []string{}
	memoryLimit := ""

	if degradedStatus {
		msg = "Mondoo Operator controller is unavailable"
		reason = "MondooOperatorUnavailable"
		status = corev1.ConditionTrue

		if pod != nil {
			for _, containerStatus := range pod.Status.ContainerStatuses {
				lastTerm := containerStatus.LastTerminationState.Terminated
				curTerm := containerStatus.State.Terminated
				killed := (lastTerm != nil && lastTerm.ExitCode == 137) ||
					(curTerm != nil && curTerm.ExitCode == 137)
				if !killed {
					continue
				}

				msg = "Mondoo Operator controller is unavailable due to OOM"
				affectedPods = append(affectedPods, pod.Name)

				// Match the spec container by name: the status list is not
				// guaranteed to have the same order or length as
				// pod.Spec.Containers, so reusing the status index could read
				// another container's limit or run out of range.
				for j := range pod.Spec.Containers {
					if pod.Spec.Containers[j].Name == containerStatus.Name {
						memoryLimit = pod.Spec.Containers[j].Resources.Limits.Memory().String()
						break
					}
				}
				break
			}
		}
	}

	config.Status.Conditions = mondoo.SetMondooAuditCondition(config.Status.Conditions, k8sv1alpha2.MondooOperaotrDegraded, status, reason, msg, updateCheck, affectedPods, memoryLimit)
}
160 changes: 160 additions & 0 deletions cmd/mondoo-operator/operator/operator_status_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,160 @@
// Copyright (c) Mondoo, Inc.
// SPDX-License-Identifier: BUSL-1.1

package operator

import (
"context"
"testing"
"time"

"github.com/go-logr/zapr"
"github.com/golang/mock/gomock"
"github.com/stretchr/testify/suite"
"go.uber.org/zap"

corev1 "k8s.io/api/core/v1"
"k8s.io/apimachinery/pkg/api/resource"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/runtime"
"k8s.io/apimachinery/pkg/version"
clientgoscheme "k8s.io/client-go/kubernetes/scheme"
"sigs.k8s.io/controller-runtime/pkg/client"
"sigs.k8s.io/controller-runtime/pkg/client/fake"

mondoov1alpha2 "go.mondoo.com/mondoo-operator/api/v1alpha2"
scanapistoremock "go.mondoo.com/mondoo-operator/controllers/resource_monitor/scan_api_store/mock"
"go.mondoo.com/mondoo-operator/controllers/scanapi"
"go.mondoo.com/mondoo-operator/pkg/utils/mondoo"
fakeMondoo "go.mondoo.com/mondoo-operator/pkg/utils/mondoo/fake"
"go.mondoo.com/mondoo-operator/pkg/utils/test"
"go.mondoo.com/mondoo-operator/tests/framework/utils"
)

// DeploymentHandlerSuite is the testify suite that exercises
// checkForTerminatedState against a fake Kubernetes client.
type DeploymentHandlerSuite struct {
suite.Suite
// ctx is the context passed to the client calls; set in SetupSuite.
ctx context.Context
// scheme is the client-go scheme with the Mondoo v1alpha2 types added; set in SetupSuite.
scheme *runtime.Scheme
// containerImageResolver is the no-op resolver created in SetupSuite.
containerImageResolver mondoo.ContainerImageResolver

// auditConfig is reset to the default audit config in BeforeTest.
auditConfig mondoov1alpha2.MondooAuditConfig
// fakeClientBuilder is recreated in BeforeTest, pre-loaded with the token
// secret and the kube-system namespace object.
fakeClientBuilder *fake.ClientBuilder
// mockCtrl is the gomock controller created in SetupSuite and finished in AfterTest.
mockCtrl *gomock.Controller
// scanApiStoreMock is the mock scan API store bound to mockCtrl.
scanApiStoreMock *scanapistoremock.MockScanApiStore
}

// SetupSuite runs once before the suite: it prepares the context, registers
// the Mondoo v1alpha2 types in the client-go scheme and creates the no-op
// image resolver, the gomock controller and the scan API store mock.
func (s *DeploymentHandlerSuite) SetupSuite() {
	s.ctx = context.Background()

	sharedScheme := clientgoscheme.Scheme
	s.scheme = sharedScheme
	registerErr := mondoov1alpha2.AddToScheme(sharedScheme)
	s.Require().NoError(registerErr)

	s.containerImageResolver = fakeMondoo.NewNoOpContainerImageResolver()

	ctrl := gomock.NewController(s.T())
	s.mockCtrl = ctrl
	s.scanApiStoreMock = scanapistoremock.NewMockScanApiStore(ctrl)
}

// BeforeTest runs before every test: it resets the audit config to the default
// one and creates a fresh fake client builder seeded with the scan API token
// secret and the kube-system namespace object.
func (s *DeploymentHandlerSuite) BeforeTest(suiteName, testName string) {
	s.auditConfig = utils.DefaultAuditConfig("mondoo-operator", true, false, false, false)

	tokenSecret := &corev1.Secret{
		ObjectMeta: metav1.ObjectMeta{
			Name:      scanapi.TokenSecretName(s.auditConfig.Name),
			Namespace: s.auditConfig.Namespace,
		},
		Data: map[string][]byte{"token": []byte("token")},
	}

	builder := fake.NewClientBuilder()
	s.fakeClientBuilder = builder.WithObjects(tokenSecret, test.TestKubeSystemNamespace())
}

// AfterTest runs after every test and calls Finish on the gomock controller
// that was created in SetupSuite.
func (s *DeploymentHandlerSuite) AfterTest(suiteName, testName string) {
s.mockCtrl.Finish()
}

// TestOOMDetect verifies that checkForTerminatedState marks the
// MondooAuditConfig as degraded due to OOM when the operator pod's "manager"
// container last terminated with exit code 137, and that the condition is
// cleared again once the pod reports a running container without a previous
// termination.
//
// Preconditions that guard later indexing (no error, exactly one audit config,
// at least one condition) use Require so that a violation fails the test
// cleanly instead of panicking on an out-of-range access.
func (s *DeploymentHandlerSuite) TestOOMDetect() {
	mondooAuditConfig := &s.auditConfig

	oomPod := &corev1.Pod{
		ObjectMeta: metav1.ObjectMeta{
			Name:      "mondoo-operator-123",
			Namespace: s.auditConfig.Namespace,
			Labels:    map[string]string{"app.kubernetes.io/name": "mondoo-operator"},
			CreationTimestamp: metav1.Time{
				Time: time.Now(),
			},
		},
		Spec: corev1.PodSpec{
			Containers: []corev1.Container{
				{
					Name: "manager",
					Resources: corev1.ResourceRequirements{
						Limits: corev1.ResourceList{
							corev1.ResourceMemory: *resource.NewQuantity(1, resource.BinarySI),
						},
					},
				},
			},
		},
		Status: corev1.PodStatus{
			ContainerStatuses: []corev1.ContainerStatus{
				{
					Name: "manager",
					LastTerminationState: corev1.ContainerState{
						Terminated: &corev1.ContainerStateTerminated{
							ExitCode: 137,
						},
					},
				},
			},
		},
	}

	// This is needed because of https://github.com/kubernetes-sigs/controller-runtime/issues/2362
	objs := []client.Object{mondooAuditConfig, oomPod}
	k8sClient := s.fakeClientBuilder.WithScheme(clientgoscheme.Scheme).WithStatusSubresource(objs...).WithObjects(objs...).Build()

	v := &version.Info{}
	cfg := zap.NewDevelopmentConfig()
	cfg.InitialFields = map[string]interface{}{
		"controller": "terminated-test",
	}
	zapLog, err := cfg.Build()
	s.Require().NoError(err, "failed to set up logging for test cases")
	testLogger := zapr.NewLogger(zapLog)

	// First pass: the pod shows a previous termination with exit code 137.
	err = checkForTerminatedState(s.ctx, k8sClient, v, testLogger)
	s.Require().NoError(err)

	mondooAuditConfigs := &mondoov1alpha2.MondooAuditConfigList{}
	err = k8sClient.List(s.ctx, mondooAuditConfigs)
	s.Require().NoError(err)
	s.Require().Len(mondooAuditConfigs.Items, 1)
	s.Require().NotEmpty(mondooAuditConfigs.Items[0].Status.Conditions)

	condition := mondooAuditConfigs.Items[0].Status.Conditions[0]
	s.Equal("Mondoo Operator controller is unavailable due to OOM", condition.Message)
	s.Len(condition.AffectedPods, 1)
	s.Contains(condition.AffectedPods, "mondoo-operator-123")
	containerMemory := oomPod.Spec.Containers[0].Resources.Limits.Memory()
	s.Equal(containerMemory.String(), condition.MemoryLimit)
	s.Equal("MondooOperatorUnavailable", condition.Reason)
	s.Equal(corev1.ConditionTrue, condition.Status)

	// Second pass: clear the termination record and mark the container running.
	oomPod.Status.ContainerStatuses[0].LastTerminationState = corev1.ContainerState{}
	oomPod.Status.ContainerStatuses[0].State.Running = &corev1.ContainerStateRunning{}
	s.Require().NoError(k8sClient.Status().Update(s.ctx, oomPod))

	err = checkForTerminatedState(s.ctx, k8sClient, v, testLogger)
	s.Require().NoError(err)

	mondooAuditConfigs = &mondoov1alpha2.MondooAuditConfigList{}
	err = k8sClient.List(s.ctx, mondooAuditConfigs)
	s.Require().NoError(err)
	s.Require().Len(mondooAuditConfigs.Items, 1)
	s.Require().NotEmpty(mondooAuditConfigs.Items[0].Status.Conditions)

	condition = mondooAuditConfigs.Items[0].Status.Conditions[0]
	s.Equal("Mondoo Operator controller is available", condition.Message)
	s.Len(condition.AffectedPods, 0)
	s.Equal("", condition.MemoryLimit)
	s.Equal("MondooOperatorAvailable", condition.Reason)
	s.Equal(corev1.ConditionFalse, condition.Status)
}

// TestOperatorSuite is the go test entry point that runs DeploymentHandlerSuite.
func TestOperatorSuite(t *testing.T) {
suite.Run(t, new(DeploymentHandlerSuite))
}
1 change: 0 additions & 1 deletion controllers/metrics/metrics.go
Original file line number Diff line number Diff line change
Expand Up @@ -70,7 +70,6 @@ func (mr *MetricsReconciler) Start(ctx context.Context) error {
}

func (mr *MetricsReconciler) metricsLoop() {
mr.log.Info("Updating metrics")
mondooAuditConfigs := &v1alpha2.MondooAuditConfigList{}
if err := mr.Client.List(mr.ctx, mondooAuditConfigs); err != nil {
mr.log.Error(err, "error listing MondooAuditConfigs")
Expand Down
5 changes: 5 additions & 0 deletions controllers/mondooauditconfig_controller_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -260,6 +260,11 @@ func TestTokenRegistration(t *testing.T) {
Identifier: status.ScanApiIdentifier,
Status: mondooclient.MessageStatus_MESSAGE_INFO,
},
{
Message: "No status reported yet",
Identifier: status.MondooOperatorIdentifier,
Status: mondooclient.MessageStatus_MESSAGE_UNKNOWN,
},
},
},
LastState: status.OperatorCustomState{
Expand Down
Loading

0 comments on commit 4d7bf13

Please sign in to comment.