Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

fix(instrumentor): reconcile device on restarts #1710

Merged
merged 11 commits into from
Nov 10, 2024
23 changes: 11 additions & 12 deletions common/consts/consts.go
Original file line number Diff line number Diff line change
Expand Up @@ -5,18 +5,17 @@ import (
)

const (
CurrentNamespaceEnvVar = "CURRENT_NS"
DefaultOdigosNamespace = "odigos-system"
OdigosConfigurationName = "odigos-config"
OdigosConfigurationFileName = "config.yaml"
OTLPPort = 4317
OTLPHttpPort = 4318
PprofOdigosPort = 6060
OdigosInstrumentationLabel = "odigos-instrumentation"
InstrumentationEnabled = "enabled"
InstrumentationDisabled = "disabled"
OdigosReportedNameAnnotation = "odigos.io/reported-name"
EbpfInstrumentationAnnotation = "instrumentation.odigos.io/ebpf" // deprecated.
CurrentNamespaceEnvVar = "CURRENT_NS"
DefaultOdigosNamespace = "odigos-system"
OdigosConfigurationName = "odigos-config"
OdigosConfigurationFileName = "config.yaml"
OTLPPort = 4317
OTLPHttpPort = 4318
PprofOdigosPort = 6060
OdigosInstrumentationLabel = "odigos-instrumentation"
InstrumentationEnabled = "enabled"
InstrumentationDisabled = "disabled"
OdigosReportedNameAnnotation = "odigos.io/reported-name"

// Used to store the original value of the environment variable in the pod manifest.
// This is used to restore the original value when an instrumentation is removed
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,7 @@ func (r *CollectorsGroupReconciler) Reconcile(ctx context.Context, req ctrl.Requ
logger.V(0).Info("Reconciling instrumented applications after node collectors group became ready", "count", len(instApps.Items))

var reconcileErr error
var gotConflict bool

for _, runtimeDetails := range instApps.Items {
var currentInstApp odigosv1.InstrumentedApplication
Expand All @@ -67,9 +68,17 @@ func (r *CollectorsGroupReconciler) Reconcile(ctx context.Context, req ctrl.Requ

err = reconcileSingleWorkload(ctx, r.Client, &currentInstApp, isDataCollectionReady)
if err != nil {
if apierrors.IsConflict(err) {
gotConflict = true
}
reconcileErr = errors.Join(reconcileErr, err)
}
}

if gotConflict && reconcileErr == nil {
// if we got a conflict and no other error, we will request a requeue and not return an error
// so we can retry the reconciliation but not have logs filled with errors
return ctrl.Result{Requeue: true}, nil
}
return ctrl.Result{}, reconcileErr
}
52 changes: 19 additions & 33 deletions instrumentor/controllers/instrumentationdevice/common.go
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,6 @@ import (
"errors"

odigosv1 "github.com/odigos-io/odigos/api/odigos/v1alpha1"
"github.com/odigos-io/odigos/common/consts"
"github.com/odigos-io/odigos/instrumentor/controllers/utils"
"github.com/odigos-io/odigos/instrumentor/controllers/utils/versionsupport"
"github.com/odigos-io/odigos/instrumentor/instrumentation"
Expand Down Expand Up @@ -42,15 +41,6 @@ var (
GetDefaultSDKs = sdks.GetDefaultSDKs
)

func clearInstrumentationEbpf(obj client.Object) {
annotations := obj.GetAnnotations()
if annotations == nil {
return
}

delete(annotations, consts.EbpfInstrumentationAnnotation)
}

func isDataCollectionReady(ctx context.Context, c client.Client) bool {
logger := log.FromContext(ctx)

Expand Down Expand Up @@ -165,36 +155,32 @@ func removeInstrumentationDeviceFromWorkload(ctx context.Context, kubeClient cli
return client.IgnoreNotFound(err)
}

result, err := controllerutil.CreateOrPatch(ctx, kubeClient, workloadObj, func() error {

// clear old ebpf instrumentation annotation, just in case it still exists
clearInstrumentationEbpf(workloadObj)
podSpec, err := getPodSpecFromObject(workloadObj)
if err != nil {
return err
}
// If instrumentation device is removed successfully, remove odigos.io/inject-instrumentation label to disable the webhook
instrumentation.RemoveInjectInstrumentationLabel(podSpec)

instrumentation.RevertInstrumentationDevices(podSpec)

err = instrumentation.RevertEnvOverwrites(workloadObj, podSpec)
if err != nil {
return err
}
podSpec, err := getPodSpecFromObject(workloadObj)
if err != nil {
return err
}
// If instrumentation device is removed successfully, remove odigos.io/inject-instrumentation label to disable the webhook
webhookLabelRemoved := instrumentation.RemoveInjectInstrumentationLabel(podSpec)
deviceRemoved := instrumentation.RevertInstrumentationDevices(podSpec)
envChanged, err := instrumentation.RevertEnvOverwrites(workloadObj, podSpec)
if err != nil {
return err
}

// if we didn't change anything, we don't need to update the object
// skip the api-server call, return no-op and skip the log message
if !webhookLabelRemoved && !deviceRemoved && !envChanged {
return nil
})
}

err = kubeClient.Update(ctx, workloadObj)
if err != nil {
// if the update fails due to a conflict, the controller will retry the operation
return err
}

modified := result != controllerutil.OperationResultNone
if modified {
logger := log.FromContext(ctx)
logger.V(0).Info("removed instrumentation device from workload", "namespace", workloadObj.GetNamespace(), "kind", workloadObj.GetObjectKind(), "name", workloadObj.GetName(), "reason", uninstrumentReason)
}
logger := log.FromContext(ctx)
logger.V(0).Info("removed instrumentation device from workload", "namespace", workloadObj.GetNamespace(), "kind", workloadObj.GetObjectKind(), "name", workloadObj.GetName(), "reason", uninstrumentReason)

return nil
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@ import (
"context"

odigosv1 "github.com/odigos-io/odigos/api/odigos/v1alpha1"
apierrors "k8s.io/apimachinery/pkg/api/errors"
"k8s.io/apimachinery/pkg/runtime"
ctrl "sigs.k8s.io/controller-runtime"
"sigs.k8s.io/controller-runtime/pkg/client"
Expand All @@ -23,15 +24,24 @@ func (r *InstrumentationRuleReconciler) Reconcile(ctx context.Context, req ctrl.
}
isNodeCollectorReady := isDataCollectionReady(ctx, r.Client)

gotConflict := false
for _, runtimeDetails := range instApps.Items {
err := reconcileSingleWorkload(ctx, r.Client, &runtimeDetails, isNodeCollectorReady)
if err != nil {
return ctrl.Result{}, err
if apierrors.IsConflict(err) {
gotConflict = true
} else {
return ctrl.Result{}, err
}
}
}

logger := log.FromContext(ctx)
logger.V(0).Info("InstrumentationRule changed, recalculating instrumentation device for potential changes of otel sdks")

return ctrl.Result{}, nil
if gotConflict {
return ctrl.Result{Requeue: true}, nil
} else {
return ctrl.Result{}, nil
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@ import (
"context"

odigosv1 "github.com/odigos-io/odigos/api/odigos/v1alpha1"
"github.com/odigos-io/odigos/k8sutils/pkg/utils"
"github.com/odigos-io/odigos/k8sutils/pkg/workload"
apierrors "k8s.io/apimachinery/pkg/api/errors"
"k8s.io/apimachinery/pkg/runtime"
Expand Down Expand Up @@ -52,15 +53,10 @@ func (r *InstrumentedApplicationReconciler) Reconcile(ctx context.Context, req c
return ctrl.Result{}, err
}
err = removeInstrumentationDeviceFromWorkload(ctx, r.Client, req.Namespace, workloadKind, workloadName, ApplyInstrumentationDeviceReasonNoRuntimeDetails)
if err != nil {
logger.Error(err, "error removing instrumentation")
return ctrl.Result{}, err
}

return ctrl.Result{}, nil
return utils.RetryOnConflict(err)
}

isNodeCollectorReady := isDataCollectionReady(ctx, r.Client)
err = reconcileSingleWorkload(ctx, r.Client, &runtimeDetails, isNodeCollectorReady)
return ctrl.Result{}, err
return utils.RetryOnConflict(err)
}
7 changes: 6 additions & 1 deletion instrumentor/controllers/instrumentationdevice/manager.go
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,12 @@ type workloadPodTemplatePredicate struct {
}

func (w workloadPodTemplatePredicate) Create(e event.CreateEvent) bool {
return false
// when instrumentor restarts, this case will be triggered as workloads objects are being added to the cache.
// in this case, we need to reconcile the workload, and guarantee that the device is injected or removed
// based on the current state of the cluster.
// if the instrumented application is deleted but the device is not cleaned,
// the instrumented application controller will not be invoked after restart, which is why we need to handle this case here.
return true
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Maybe we should add a check here that the odigos label is present?
Since after this change, when the instrumentor is starting this will cause it to have an event for each workload in the cluster - even those which odigos does not care about.
For those the new check added in this PR below if apierrors.IsNotFound(err) will be true.
That means the even for workloads which were not instrumented at all we will call removeInstrumentationDeviceFromWorkload.

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We need to handle workloads which has no label.

Consider the following sequence:

  • label is removed from a workload which has the device
  • instrumented application deleted
  • instrumentor went down

When the instrumentor starts, we need to remove the device from this workload. if we add the filter you suggested, nothing will remove the device from this workload

Copy link
Contributor

@RonFed RonFed Nov 9, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

How about checking if there is a device present in the pod spec? Since we have the workload object here I think we can check that.

I assume it will be fine with the current approach as well, but since this has the potential of bringing a huge batch of events, we should try to handle them as fast as possible by filtering here.

}

func (w workloadPodTemplatePredicate) Update(e event.UpdateEvent) bool {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,9 @@ import (
"context"

odigosv1 "github.com/odigos-io/odigos/api/odigos/v1alpha1"
"github.com/odigos-io/odigos/k8sutils/pkg/utils"
"github.com/odigos-io/odigos/k8sutils/pkg/workload"
apierrors "k8s.io/apimachinery/pkg/api/errors"
"k8s.io/apimachinery/pkg/types"
ctrl "sigs.k8s.io/controller-runtime"
"sigs.k8s.io/controller-runtime/pkg/client"
Expand All @@ -17,7 +19,7 @@ type DeploymentReconciler struct {
func (r *DeploymentReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.Result, error) {
instrumentedAppName := workload.CalculateWorkloadRuntimeObjectName(req.Name, workload.WorkloadKindDeployment)
err := reconcileSingleInstrumentedApplicationByName(ctx, r.Client, instrumentedAppName, req.Namespace)
return ctrl.Result{}, err
return utils.RetryOnConflict(err)
}

type DaemonSetReconciler struct {
Expand All @@ -27,7 +29,7 @@ type DaemonSetReconciler struct {
func (r *DaemonSetReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.Result, error) {
instrumentedAppName := workload.CalculateWorkloadRuntimeObjectName(req.Name, workload.WorkloadKindDaemonSet)
err := reconcileSingleInstrumentedApplicationByName(ctx, r.Client, instrumentedAppName, req.Namespace)
return ctrl.Result{}, err
return utils.RetryOnConflict(err)
}

type StatefulSetReconciler struct {
Expand All @@ -37,16 +39,26 @@ type StatefulSetReconciler struct {
func (r *StatefulSetReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.Result, error) {
instrumentedAppName := workload.CalculateWorkloadRuntimeObjectName(req.Name, workload.WorkloadKindStatefulSet)
err := reconcileSingleInstrumentedApplicationByName(ctx, r.Client, instrumentedAppName, req.Namespace)
return ctrl.Result{}, err
return utils.RetryOnConflict(err)
}

func reconcileSingleInstrumentedApplicationByName(ctx context.Context, k8sClient client.Client, instrumentedAppName string, namespace string) error {
var instrumentedApplication odigosv1.InstrumentedApplication
err := k8sClient.Get(ctx, types.NamespacedName{Name: instrumentedAppName, Namespace: namespace}, &instrumentedApplication)
if err != nil {
// changes in workload when there is no instrumented application is not interesting
return client.IgnoreNotFound(err)
if apierrors.IsNotFound(err) {
// if there is no instrumented application, make sure the device is removed from the workload pod template manifest
workloadName, workloadKind, err := workload.ExtractWorkloadInfoFromRuntimeObjectName(instrumentedAppName)
if err != nil {
return err
}
err = removeInstrumentationDeviceFromWorkload(ctx, k8sClient, namespace, workloadKind, workloadName, ApplyInstrumentationDeviceReasonNoRuntimeDetails)
return err
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

this means that a conflict error will be printed by the controller runtime - right?

} else {
return err
}
Comment on lines +49 to +59
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

nit:

Suggested change
if apierrors.IsNotFound(err) {
// if there is no instrumented application, make sure the device is removed from the workload pod template manifest
workloadName, workloadKind, err := workload.ExtractWorkloadInfoFromRuntimeObjectName(instrumentedAppName)
if err != nil {
return err
}
err = removeInstrumentationDeviceFromWorkload(ctx, k8sClient, namespace, workloadKind, workloadName, ApplyInstrumentationDeviceReasonNoRuntimeDetails)
return err
} else {
return err
}
if apierrors.IsNotFound(err) {
// if there is no instrumented application, make sure the device is removed from the workload pod template manifest
workloadName, workloadKind, err := workload.ExtractWorkloadInfoFromRuntimeObjectName(instrumentedAppName)
if err != nil {
return err
}
err = removeInstrumentationDeviceFromWorkload(ctx, k8sClient, namespace, workloadKind, workloadName, ApplyInstrumentationDeviceReasonNoRuntimeDetails)
}
return err

}
isNodeCollectorReady := isDataCollectionReady(ctx, k8sClient)

return reconcileSingleWorkload(ctx, k8sClient, &instrumentedApplication, isNodeCollectorReady)
}
24 changes: 17 additions & 7 deletions instrumentor/instrumentation/instrumentation.go
Original file line number Diff line number Diff line change
Expand Up @@ -81,12 +81,13 @@ func ApplyInstrumentationDevicesToPodTemplate(original *corev1.PodTemplateSpec,
// this function restores a workload manifest env vars to their original values.
// it is used when the instrumentation is removed from the workload.
// the original values are read from the annotation which was saved when the instrumentation was applied.
func RevertEnvOverwrites(obj client.Object, podSpec *corev1.PodTemplateSpec) error {
func RevertEnvOverwrites(obj client.Object, podSpec *corev1.PodTemplateSpec) (bool, error) {
manifestEnvOriginal, err := envoverwrite.NewOrigWorkloadEnvValues(obj.GetAnnotations())
if err != nil {
return err
return false, err
}

changed := false
for iContainer, c := range podSpec.Spec.Containers {
containerOriginalEnv := manifestEnvOriginal.GetContainerStoredEnvs(c.Name)
newContainerEnvs := make([]corev1.EnvVar, 0, len(c.Env))
Expand All @@ -102,32 +103,37 @@ func RevertEnvOverwrites(obj client.Object, podSpec *corev1.PodTemplateSpec) err
// if the value is nil, the env var was not set by the user to begin with.
// we will simply not append it to the new envs to achieve the same effect.
}
changed = true
} else {
newContainerEnvs = append(newContainerEnvs, envVar)
}
}
podSpec.Spec.Containers[iContainer].Env = newContainerEnvs
}

manifestEnvOriginal.DeleteFromObj(obj)
annotationRemoved := manifestEnvOriginal.DeleteFromObj(obj)

return nil
return changed || annotationRemoved, nil
}

func RevertInstrumentationDevices(original *corev1.PodTemplateSpec) {
func RevertInstrumentationDevices(original *corev1.PodTemplateSpec) bool {
changed := false
for _, container := range original.Spec.Containers {
for resourceName := range container.Resources.Limits {
if strings.HasPrefix(string(resourceName), common.OdigosResourceNamespace) {
delete(container.Resources.Limits, resourceName)
changed = true
}
}
// Is it needed?
for resourceName := range container.Resources.Requests {
if strings.HasPrefix(string(resourceName), common.OdigosResourceNamespace) {
delete(container.Resources.Requests, resourceName)
changed = true
}
}
}
return changed
}

func getLanguageOfContainer(instrumentation *odigosv1.InstrumentedApplication, containerName string) common.ProgrammingLanguage {
Expand Down Expand Up @@ -238,8 +244,12 @@ func SetInjectInstrumentationLabel(original *corev1.PodTemplateSpec) {
}

// RemoveInjectInstrumentationLabel removes the "odigos.io/inject-instrumentation" label if it exists.
func RemoveInjectInstrumentationLabel(original *corev1.PodTemplateSpec) {
func RemoveInjectInstrumentationLabel(original *corev1.PodTemplateSpec) bool {
if original.Labels != nil {
delete(original.Labels, "odigos.io/inject-instrumentation")
if _, ok := original.Labels["odigos.io/inject-instrumentation"]; ok {
delete(original.Labels, "odigos.io/inject-instrumentation")
return true
}
}
return false
}
9 changes: 7 additions & 2 deletions k8sutils/pkg/envoverwrite/origenv.go
Original file line number Diff line number Diff line change
Expand Up @@ -95,11 +95,16 @@ func (o *OrigWorkloadEnvValues) SerializeToAnnotation(obj metav1.Object) error {
return nil
}

func (o *OrigWorkloadEnvValues) DeleteFromObj(obj metav1.Object) {
func (o *OrigWorkloadEnvValues) DeleteFromObj(obj metav1.Object) bool {
currentAnnotations := obj.GetAnnotations()
if currentAnnotations == nil {
return
return false
}

if _, ok := currentAnnotations[consts.ManifestEnvOriginalValAnnotation]; !ok {
return false
}

delete(currentAnnotations, consts.ManifestEnvOriginalValAnnotation)
return true
}
Loading