From 3629ae8caae21b7806ddfc3ab4464ab33b27bdcd Mon Sep 17 00:00:00 2001 From: Amir Blum Date: Fri, 25 Oct 2024 09:04:23 +0300 Subject: [PATCH 01/13] feat: configurable collector own metrics port on CRD --- .../crd/bases/odigos.io_collectorsgroups.yaml | 6 ++++++ .../odigos/v1alpha1/collectorsgroupspec.go | 11 +++++++++- api/odigos/v1alpha1/collectorsgroup_types.go | 4 ++++ .../controllers/datacollection/configmap.go | 20 +++++++++++-------- .../controllers/gateway/configmap_test.go | 12 +++++------ common/odigos_config.go | 12 +++++++++++ k8sutils/pkg/utils/collectorgroup_util.go | 15 +++++++++++--- .../controllers/collectorgroups/cluster.go | 11 +++++++++- .../collectorgroups/datacollection.go | 13 +++++++++--- .../controllers/collectorsgroup_controller.go | 11 +++------- .../controllers/destination_controller.go | 18 ++++++----------- .../instrumentationconfig_controller.go | 15 ++------------ 12 files changed, 93 insertions(+), 55 deletions(-) diff --git a/api/config/crd/bases/odigos.io_collectorsgroups.yaml b/api/config/crd/bases/odigos.io_collectorsgroups.yaml index 5628d2883..5dbd74892 100644 --- a/api/config/crd/bases/odigos.io_collectorsgroups.yaml +++ b/api/config/crd/bases/odigos.io_collectorsgroups.yaml @@ -42,6 +42,12 @@ spec: spec: description: CollectorsGroupSpec defines the desired state of Collector properties: + collectorOwnMetricsPort: + description: |- + The port to use for exposing the collector's own metrics as a prometheus endpoint. + Default when unset is 55682. + format: int32 + type: integer role: enum: - CLUSTER_GATEWAY diff --git a/api/generated/odigos/applyconfiguration/odigos/v1alpha1/collectorsgroupspec.go b/api/generated/odigos/applyconfiguration/odigos/v1alpha1/collectorsgroupspec.go index 29d343f93..776732e4a 100644 --- a/api/generated/odigos/applyconfiguration/odigos/v1alpha1/collectorsgroupspec.go +++ b/api/generated/odigos/applyconfiguration/odigos/v1alpha1/collectorsgroupspec.go @@ -24,7 +24,8 @@ import ( // CollectorsGroupSpecApplyConfiguration represents a declarative configuration of the CollectorsGroupSpec type for use // with apply. type CollectorsGroupSpecApplyConfiguration struct { - Role *v1alpha1.CollectorsGroupRole `json:"role,omitempty"` + Role *v1alpha1.CollectorsGroupRole `json:"role,omitempty"` + CollectorOwnMetricsPort *int32 `json:"collectorOwnMetricsPort,omitempty"` } // CollectorsGroupSpecApplyConfiguration constructs a declarative configuration of the CollectorsGroupSpec type for use with @@ -40,3 +41,11 @@ func (b *CollectorsGroupSpecApplyConfiguration) WithRole(value v1alpha1.Collecto b.Role = &value return b } + +// WithCollectorOwnMetricsPort sets the CollectorOwnMetricsPort field in the declarative configuration to the given value +// and returns the receiver, so that objects can be built by chaining "With" function invocations. +// If called multiple times, the CollectorOwnMetricsPort field is set to the value of the last call. +func (b *CollectorsGroupSpecApplyConfiguration) WithCollectorOwnMetricsPort(value int32) *CollectorsGroupSpecApplyConfiguration { + b.CollectorOwnMetricsPort = &value + return b +} diff --git a/api/odigos/v1alpha1/collectorsgroup_types.go b/api/odigos/v1alpha1/collectorsgroup_types.go index 3a9787de2..3bce0fdb9 100644 --- a/api/odigos/v1alpha1/collectorsgroup_types.go +++ b/api/odigos/v1alpha1/collectorsgroup_types.go @@ -33,6 +33,10 @@ const ( // CollectorsGroupSpec defines the desired state of Collector type CollectorsGroupSpec struct { Role CollectorsGroupRole `json:"role"` + + // The port to use for exposing the collector's own metrics as a prometheus endpoint. + // Default when unset is 55682. + CollectorOwnMetricsPort int32 `json:"collectorOwnMetricsPort,omitempty"` } // CollectorsGroupStatus defines the observed state of Collector diff --git a/autoscaler/controllers/datacollection/configmap.go b/autoscaler/controllers/datacollection/configmap.go index c5fd60505..17074c63b 100644 --- a/autoscaler/controllers/datacollection/configmap.go +++ b/autoscaler/controllers/datacollection/configmap.go @@ -98,7 +98,7 @@ func createConfigMap(desired *v1.ConfigMap, ctx context.Context, c client.Client func getDesiredConfigMap(apps *odigosv1.InstrumentedApplicationList, dests *odigosv1.DestinationList, processors []*odigosv1.Processor, datacollection *odigosv1.CollectorsGroup, scheme *runtime.Scheme, setTracesLoadBalancer bool, disableNameProcessor bool) (*v1.ConfigMap, error) { - cmData, err := calculateConfigMapData(apps, dests, processors, setTracesLoadBalancer, disableNameProcessor) + cmData, err := calculateConfigMapData(datacollection, apps, dests, processors, setTracesLoadBalancer, disableNameProcessor) if err != nil { return nil, err } @@ -124,16 +124,20 @@ func getDesiredConfigMap(apps *odigosv1.InstrumentedApplicationList, dests *odig return &desired, nil } -func calculateConfigMapData(apps *odigosv1.InstrumentedApplicationList, dests *odigosv1.DestinationList, processors []*odigosv1.Processor, +func calculateConfigMapData(collectorsGroup *odigosv1.CollectorsGroup, apps *odigosv1.InstrumentedApplicationList, dests *odigosv1.DestinationList, processors []*odigosv1.Processor, setTracesLoadBalancer bool, disableNameProcessor bool) (string, error) { + // TODO: make collectorsGroup.Spec.CollectorOwnMetricsPort required and concile default values elsewhere + ownMetricsPort := int32(55682) + if collectorsGroup.Spec.CollectorOwnMetricsPort != 0 { + ownMetricsPort = collectorsGroup.Spec.CollectorOwnMetricsPort + } + empty := struct{}{} processorsCfg, tracesProcessors, metricsProcessors, logsProcessors, errs := config.GetCrdProcessorsConfigMap(commonconf.ToProcessorConfigurerArray(processors)) - if errs != nil { - for name, err := range errs { - log.Log.V(0).Info(err.Error(), "processor", name) - } + for name, err := range errs { + log.Log.V(0).Info(err.Error(), "processor", name) } if !disableNameProcessor { @@ -214,7 +218,7 @@ func calculateConfigMapData(apps *odigosv1.InstrumentedApplicationList, dests *o "scrape_interval": "10s", "static_configs": []config.GenericMap{ { - "targets": []string{"127.0.0.1:8888"}, + "targets": []string{fmt.Sprintf("127.0.0.1:%d", ownMetricsPort)}, }, }, "metric_relabel_configs": []config.GenericMap{ @@ -247,7 +251,7 @@ func calculateConfigMapData(apps *odigosv1.InstrumentedApplicationList, dests *o Extensions: []string{"health_check"}, Telemetry: config.Telemetry{ Metrics: config.GenericMap{ - "address": "0.0.0.0:8888", + "address": fmt.Sprintf("0.0.0.0:%d", ownMetricsPort), }, Resource: map[string]*string{ // The collector add "otelcol" as a service name, so we need to remove it diff --git a/autoscaler/controllers/gateway/configmap_test.go b/autoscaler/controllers/gateway/configmap_test.go index b4bbe628c..88ef1968b 100644 --- a/autoscaler/controllers/gateway/configmap_test.go +++ b/autoscaler/controllers/gateway/configmap_test.go @@ -12,7 +12,7 @@ func TestAddSelfTelemetryPipeline(t *testing.T) { cases := []struct { name string cfg *config.Config - err error + err error }{ { name: "no pipeline", @@ -66,7 +66,7 @@ func TestAddSelfTelemetryPipeline(t *testing.T) { }, Processors: config.GenericMap{ "memory_limiter": config.GenericMap{ - "check_interval": "1s", + "check_interval": "1s", }, "resource/odigos-version": config.GenericMap{ "attributes": []config.GenericMap{ @@ -98,7 +98,7 @@ func TestAddSelfTelemetryPipeline(t *testing.T) { }, }, } - + for _, tc := range cases { t.Run(tc.name, func(t *testing.T) { c := tc.cfg @@ -115,15 +115,15 @@ func TestAddSelfTelemetryPipeline(t *testing.T) { assert.Equal(t, []string{"prometheus"}, c.Service.Pipelines["metrics/otelcol"].Receivers) assert.Equal(t, []string{"resource/pod-name"}, c.Service.Pipelines["metrics/otelcol"].Processors) assert.Equal(t, []string{"otlp/ui"}, c.Service.Pipelines["metrics/otelcol"].Exporters) - assert.Equal(t, "0.0.0.0:8888", c.Service.Telemetry.Metrics["address"]) + assert.Equal(t, "0.0.0.0:55682", c.Service.Telemetry.Metrics["address"]) for pipelineName, pipeline := range c.Service.Pipelines { if pipelineName == "metrics/otelcol" { assert.NotContains(t, pipeline.Processors, "odigostrafficmetrics") } else { - assert.Equal(t, pipeline.Processors[len(pipeline.Processors) - 1], "odigostrafficmetrics") + assert.Equal(t, pipeline.Processors[len(pipeline.Processors)-1], "odigostrafficmetrics") } } }) } -} \ No newline at end of file +} diff --git a/common/odigos_config.go b/common/odigos_config.go index d10427edb..d8650e5cd 100644 --- a/common/odigos_config.go +++ b/common/odigos_config.go @@ -2,6 +2,17 @@ package common type ProfileName string +type CollectorNodeConfiguration struct { + + // Each node collector, running as a daemonset, runs on the host network, + // and exposes prometheus metrics endpoint on this a dedicated port. + // When unset, the default port is 55682. + // Because it shares the port network with the host, + // if some other process is using the port, the node collector will not start. + // This option allows to set a different port for the node collector to overcome this issue if encountered. + CollectorOwnMetricsPort int `json:"collectorOwnMetricsPort,omitempty"` +} + type CollectorGatewayConfiguration struct { // RequestMemoryMiB is the memory request for the cluster gateway collector deployment. // it will be embedded in the deployment as a resource request of the form "memory: Mi" @@ -38,6 +49,7 @@ type OdigosConfiguration struct { AutoscalerImage string `json:"autoscalerImage,omitempty"` DefaultSDKs map[ProgrammingLanguage]OtelSdk `json:"defaultSDKs,omitempty"` CollectorGateway *CollectorGatewayConfiguration `json:"collectorGateway,omitempty"` + CollectorNode *CollectorNodeConfiguration `json:"collectorNode,omitempty"` Profiles []ProfileName `json:"profiles,omitempty"` // this is internal currently, and is not exposed on the CLI / helm diff --git a/k8sutils/pkg/utils/collectorgroup_util.go b/k8sutils/pkg/utils/collectorgroup_util.go index eff0b9122..1647386cd 100644 --- a/k8sutils/pkg/utils/collectorgroup_util.go +++ b/k8sutils/pkg/utils/collectorgroup_util.go @@ -2,15 +2,24 @@ package utils import ( "context" + odigosv1 "github.com/odigos-io/odigos/api/odigos/v1alpha1" "k8s.io/apimachinery/pkg/api/errors" "sigs.k8s.io/controller-runtime/pkg/client" "sigs.k8s.io/controller-runtime/pkg/log" ) -func CreateCollectorGroup(ctx context.Context, c client.Client, collectorGroup *odigosv1.CollectorsGroup) error { - log.FromContext(ctx).Info("Creating collector group", "collectorGroupName", collectorGroup.Name) - return c.Create(ctx, collectorGroup) +func ApplyCollectorGroup(ctx context.Context, c client.Client, collectorGroup *odigosv1.CollectorsGroup) error { + logger := log.FromContext(ctx).WithValues("collectorGroupName", collectorGroup.Name) + logger.Info("Applying collector group", "collectorGroupName", collectorGroup.Name) + + err := c.Patch(ctx, collectorGroup, client.Apply, client.ForceOwnership, client.FieldOwner("scheduler")) + if err != nil { + logger.Error(err, "Failed to apply collector group") + return err + } + + return nil } func GetCollectorGroup(ctx context.Context, c client.Client, namespace string, collectorGroupName string) (*odigosv1.CollectorsGroup, error) { diff --git a/scheduler/controllers/collectorgroups/cluster.go b/scheduler/controllers/collectorgroups/cluster.go index e528bb84b..864e7c590 100644 --- a/scheduler/controllers/collectorgroups/cluster.go +++ b/scheduler/controllers/collectorgroups/cluster.go @@ -6,14 +6,23 @@ import ( metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" ) +// The cluster gateway collector runs as a deployment and the pod is exposed as a service. +// Thus it cannot collide with other ports on the same node, and we can use an handy default port. +const ClusterCollectorDefaultOwnMetricsPort = 8888 + func NewClusterCollectorGroup(namespace string) *odigosv1.CollectorsGroup { return &odigosv1.CollectorsGroup{ + TypeMeta: metav1.TypeMeta{ + Kind: "CollectorsGroup", + APIVersion: "odigos.io/v1alpha1", + }, ObjectMeta: metav1.ObjectMeta{ Name: consts.OdigosClusterCollectorCollectorGroupName, Namespace: namespace, }, Spec: odigosv1.CollectorsGroupSpec{ - Role: odigosv1.CollectorsGroupRoleClusterGateway, + Role: odigosv1.CollectorsGroupRoleClusterGateway, + CollectorOwnMetricsPort: ClusterCollectorDefaultOwnMetricsPort, }, } } diff --git a/scheduler/controllers/collectorgroups/datacollection.go b/scheduler/controllers/collectorgroups/datacollection.go index b87da7b6f..71a6b508d 100644 --- a/scheduler/controllers/collectorgroups/datacollection.go +++ b/scheduler/controllers/collectorgroups/datacollection.go @@ -7,18 +7,25 @@ import ( metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" ) +const NodeCollectorDefaultOwnMetricsPort = 55682 + func NewNodeCollectorGroup() *odigosv1.CollectorsGroup { return &odigosv1.CollectorsGroup{ + TypeMeta: metav1.TypeMeta{ + Kind: "CollectorsGroup", + APIVersion: "odigos.io/v1alpha1", + }, ObjectMeta: metav1.ObjectMeta{ Name: consts.OdigosNodeCollectorDaemonSetName, Namespace: env.GetCurrentNamespace(), }, Spec: odigosv1.CollectorsGroupSpec{ - Role: odigosv1.CollectorsGroupRoleNodeCollector, + Role: odigosv1.CollectorsGroupRoleNodeCollector, + CollectorOwnMetricsPort: NodeCollectorDefaultOwnMetricsPort, }, } } -func ShouldCreateNodeCollectorGroup(gatewayReady bool, dataCollectionExists bool, numberofInstrumentedApps int) bool { - return gatewayReady && !dataCollectionExists && numberofInstrumentedApps > 0 +func ShouldHaveNodeCollectorGroup(gatewayReady bool, numberofInstrumentedApps int) bool { + return gatewayReady && numberofInstrumentedApps > 0 } diff --git a/scheduler/controllers/collectorsgroup_controller.go b/scheduler/controllers/collectorsgroup_controller.go index 4dfa5eb98..24ca9e1d6 100644 --- a/scheduler/controllers/collectorsgroup_controller.go +++ b/scheduler/controllers/collectorsgroup_controller.go @@ -71,7 +71,6 @@ func (r *CollectorsGroupReconciler) Reconcile(ctx context.Context, req ctrl.Requ } gatewayReady := false - dataCollectionExists := false for _, collectorGroup := range collectorGroups.Items { err := r.applyNewCollectorRoleNames(ctx, &collectorGroup) if err != nil { @@ -82,10 +81,6 @@ func (r *CollectorsGroupReconciler) Reconcile(ctx context.Context, req ctrl.Requ if collectorGroup.Spec.Role == odigosv1.CollectorsGroupRoleClusterGateway && collectorGroup.Status.Ready { gatewayReady = true } - - if collectorGroup.Spec.Role == odigosv1.CollectorsGroupRoleNodeCollector { - dataCollectionExists = true - } } var instApps odigosv1.InstrumentationConfigList @@ -94,10 +89,10 @@ func (r *CollectorsGroupReconciler) Reconcile(ctx context.Context, req ctrl.Requ return ctrl.Result{}, err } - if collectorgroups.ShouldCreateNodeCollectorGroup(gatewayReady, dataCollectionExists, len(instApps.Items)) { - err = utils.CreateCollectorGroup(ctx, r.Client, collectorgroups.NewNodeCollectorGroup()) + if collectorgroups.ShouldHaveNodeCollectorGroup(gatewayReady, len(instApps.Items)) { + err = utils.ApplyCollectorGroup(ctx, r.Client, collectorgroups.NewNodeCollectorGroup()) if err != nil { - logger.Error(err, "failed to create data collection collector group") + logger.Error(err, "failed to apply node collector group") return ctrl.Result{}, err } } diff --git a/scheduler/controllers/destination_controller.go b/scheduler/controllers/destination_controller.go index a1af2d6ae..679215a71 100644 --- a/scheduler/controllers/destination_controller.go +++ b/scheduler/controllers/destination_controller.go @@ -18,6 +18,7 @@ package controllers import ( "context" + "github.com/odigos-io/odigos/k8sutils/pkg/utils" odigosv1 "github.com/odigos-io/odigos/api/odigos/v1alpha1" @@ -49,22 +50,15 @@ func (r *DestinationReconciler) Reconcile(ctx context.Context, req ctrl.Request) } if len(dests.Items) > 0 { - var collectorGroups odigosv1.CollectorsGroupList - err := r.List(ctx, &collectorGroups, client.InNamespace(req.Namespace)) + logger.V(0).Info("destinations found, syncing cluster collector group") + err := utils.ApplyCollectorGroup(ctx, r.Client, collectorgroups.NewClusterCollectorGroup(req.Namespace)) if err != nil { - logger.Error(err, "failed to list collectors groups") + logger.Error(err, "failed to sync cluster collector group") return ctrl.Result{}, err } - - if len(collectorGroups.Items) == 0 { - logger.V(0).Info("destinations found, but no collectors groups found, creating gateway") - err = utils.CreateCollectorGroup(ctx, r.Client, collectorgroups.NewClusterCollectorGroup(req.Namespace)) - if err != nil { - logger.Error(err, "failed to create gateway") - return ctrl.Result{}, err - } - } } + // once the gateway is created, it is not deleted, even if there are no destinations. + // we might want to re-consider this behavior. return ctrl.Result{}, nil } diff --git a/scheduler/controllers/instrumentationconfig_controller.go b/scheduler/controllers/instrumentationconfig_controller.go index 096a284ec..f47141113 100644 --- a/scheduler/controllers/instrumentationconfig_controller.go +++ b/scheduler/controllers/instrumentationconfig_controller.go @@ -52,19 +52,8 @@ func (r *InstrumentationConfigReconciler) Reconcile(ctx context.Context, req ctr return ctrl.Result{}, err } - dataCollectionExists := true - _, err = utils.GetCollectorGroup(ctx, r.Client, namespace, consts.OdigosNodeCollectorDaemonSetName) - if err != nil { - if errors.IsNotFound(err) { - dataCollectionExists = false - } else { - logger.Error(err, "failed to get collector group", "collectorGroupName", consts.OdigosNodeCollectorCollectorGroupName) - return ctrl.Result{}, err - } - } - - if nodeCollectorGroupUtil.ShouldCreateNodeCollectorGroup(clusterCollectorGroup.Status.Ready, dataCollectionExists, numberOfInstrumentedApps) { - err = utils.CreateCollectorGroup(ctx, r.Client, nodeCollectorGroupUtil.NewNodeCollectorGroup()) + if nodeCollectorGroupUtil.ShouldHaveNodeCollectorGroup(clusterCollectorGroup.Status.Ready, numberOfInstrumentedApps) { + err = utils.ApplyCollectorGroup(ctx, r.Client, nodeCollectorGroupUtil.NewNodeCollectorGroup()) if err != nil { logger.Error(err, "failed to create data collection collector group") return ctrl.Result{}, err From 6325e7e9027dc12bd4cc4fe57eaeb73fd3c58b05 Mon Sep 17 00:00:00 2001 From: Amir Blum Date: Fri, 25 Oct 2024 09:30:16 +0300 Subject: [PATCH 02/13] feat: use gateway own telemetry port from CRD --- .../controllers/collectorsgroup_controller.go | 4 +++- autoscaler/controllers/gateway/configmap.go | 22 ++++++++++--------- autoscaler/controllers/gateway/service.go | 6 ++--- 3 files changed, 18 insertions(+), 14 deletions(-) diff --git a/autoscaler/controllers/collectorsgroup_controller.go b/autoscaler/controllers/collectorsgroup_controller.go index 2e8726849..ab11a7db7 100644 --- a/autoscaler/controllers/collectorsgroup_controller.go +++ b/autoscaler/controllers/collectorsgroup_controller.go @@ -102,6 +102,8 @@ func (r *CollectorsGroupReconciler) Reconcile(ctx context.Context, req ctrl.Requ func (r *CollectorsGroupReconciler) SetupWithManager(mgr ctrl.Manager) error { return ctrl.NewControllerManagedBy(mgr). For(&odigosv1.CollectorsGroup{}). - WithEventFilter(&onlyCreatePredicate{}). + // we assume everything in the collectorsgroup spec is the configuration for the collectors to generate. + // thus, we need to monitor any change to the spec which is what the generation field is for. + WithEventFilter(&predicate.GenerationChangedPredicate{}). Complete(r) } diff --git a/autoscaler/controllers/gateway/configmap.go b/autoscaler/controllers/gateway/configmap.go index 61a28e857..ab9a23af2 100644 --- a/autoscaler/controllers/gateway/configmap.go +++ b/autoscaler/controllers/gateway/configmap.go @@ -28,12 +28,12 @@ const ( ) var ( - errNoPipelineConfigured = errors.New("no pipeline was configured, cannot add self telemetry pipeline") + errNoPipelineConfigured = errors.New("no pipeline was configured, cannot add self telemetry pipeline") errNoReceiversConfigured = errors.New("no receivers were configured, cannot add self telemetry pipeline") errNoExportersConfigured = errors.New("no exporters were configured, cannot add self telemetry pipeline") ) -func addSelfTelemetryPipeline(c *config.Config) error { +func addSelfTelemetryPipeline(c *config.Config, ownTelemetryPort int32) error { if c.Service.Pipelines == nil { return errNoPipelineConfigured } @@ -47,18 +47,18 @@ func addSelfTelemetryPipeline(c *config.Config) error { "config": config.GenericMap{ "scrape_configs": []config.GenericMap{ { - "job_name": "otelcol", + "job_name": "otelcol", "scrape_interval": "10s", "static_configs": []config.GenericMap{ { - "targets": []string{"127.0.0.1:8888"}, + "targets": []string{fmt.Sprintf("127.0.0.1:%d", ownTelemetryPort)}, }, }, "metric_relabel_configs": []config.GenericMap{ { "source_labels": []string{"__name__"}, - "regex": "(.*odigos.*|^otelcol_processor_accepted.*|^otelcol_exporter_sent.*)", - "action": "keep", + "regex": "(.*odigos.*|^otelcol_processor_accepted.*|^otelcol_exporter_sent.*)", + "action": "keep", }, }, }, @@ -91,13 +91,13 @@ func addSelfTelemetryPipeline(c *config.Config) error { }, } c.Service.Pipelines["metrics/otelcol"] = config.Pipeline{ - Receivers: []string{"prometheus/self-metrics"}, + Receivers: []string{"prometheus/self-metrics"}, Processors: []string{"resource/pod-name"}, - Exporters: []string{"otlp/odigos-own-telemetry-ui"}, + Exporters: []string{"otlp/odigos-own-telemetry-ui"}, } c.Service.Telemetry.Metrics = config.GenericMap{ - "address": "0.0.0.0:8888", + "address": fmt.Sprintf("0.0.0.0:%d", ownTelemetryPort), } for pipelineName, pipeline := range c.Service.Pipelines { @@ -126,7 +126,9 @@ func syncConfigMap(dests *odigosv1.DestinationList, allProcessors *odigosv1.Proc common.ToExporterConfigurerArray(dests), common.ToProcessorConfigurerArray(processors), memoryLimiterConfiguration, - addSelfTelemetryPipeline, + func(c *config.Config) error { + return addSelfTelemetryPipeline(c, gateway.Spec.CollectorOwnMetricsPort) + }, ) if err != nil { logger.Error(err, "Failed to calculate config") diff --git a/autoscaler/controllers/gateway/service.go b/autoscaler/controllers/gateway/service.go index e747c2f57..e0c168f2c 100644 --- a/autoscaler/controllers/gateway/service.go +++ b/autoscaler/controllers/gateway/service.go @@ -54,7 +54,7 @@ func syncService(gateway *odigosv1.CollectorsGroup, ctx context.Context, c clien } result, err := controllerutil.CreateOrPatch(ctx, c, gatewaySvc, func() error { - updateGatewaySvc(gatewaySvc) + updateGatewaySvc(gatewaySvc, gateway) return nil }) @@ -67,7 +67,7 @@ func syncService(gateway *odigosv1.CollectorsGroup, ctx context.Context, c clien return gatewaySvc, nil } -func updateGatewaySvc(svc *v1.Service) { +func updateGatewaySvc(svc *v1.Service, collectorsGroup *odigosv1.CollectorsGroup) { svc.Spec.Ports = []v1.ServicePort{ { Name: "otlp", @@ -83,7 +83,7 @@ func updateGatewaySvc(svc *v1.Service) { }, { Name: "metrics", - Port: 8888, + Port: collectorsGroup.Spec.CollectorOwnMetricsPort, }, } From 4c5aeb9846c8523ba6179dc6972e002e175f3a78 Mon Sep 17 00:00:00 2001 From: Amir Blum Date: Fri, 25 Oct 2024 10:25:07 +0300 Subject: [PATCH 03/13] feat: allow configuring collector own metrics port form odigos config --- common/odigos_config.go | 2 +- k8sutils/pkg/utils/config_util.go | 2 ++ .../collectorgroups/datacollection.go | 13 ++++++++++--- .../controllers/collectorsgroup_controller.go | 8 +++++++- .../instrumentationconfig_controller.go | 8 +++++++- scheduler/main.go | 19 +++++++++++++++++++ 6 files changed, 46 insertions(+), 6 deletions(-) diff --git a/common/odigos_config.go b/common/odigos_config.go index d8650e5cd..55152ec72 100644 --- a/common/odigos_config.go +++ b/common/odigos_config.go @@ -10,7 +10,7 @@ type CollectorNodeConfiguration struct { // Because it shares the port network with the host, // if some other process is using the port, the node collector will not start. // This option allows to set a different port for the node collector to overcome this issue if encountered. - CollectorOwnMetricsPort int `json:"collectorOwnMetricsPort,omitempty"` + CollectorOwnMetricsPort int32 `json:"collectorOwnMetricsPort,omitempty"` } type CollectorGatewayConfiguration struct { diff --git a/k8sutils/pkg/utils/config_util.go b/k8sutils/pkg/utils/config_util.go index 3845f60ec..cd60ff399 100644 --- a/k8sutils/pkg/utils/config_util.go +++ b/k8sutils/pkg/utils/config_util.go @@ -2,6 +2,7 @@ package utils import ( "context" + "fmt" "github.com/odigos-io/odigos/common" "github.com/odigos-io/odigos/common/consts" @@ -19,6 +20,7 @@ func GetCurrentOdigosConfig(ctx context.Context, k8sClient client.Client) (commo if err := k8sClient.Get(ctx, types.NamespacedName{Namespace: odigosSystemNamespaceName, Name: consts.OdigosConfigurationName}, &configMap); err != nil { return odigosConfig, err } + fmt.Println(configMap.Data[consts.OdigosConfigurationFileName]) if err := yaml.Unmarshal([]byte(configMap.Data[consts.OdigosConfigurationFileName]), &odigosConfig); err != nil { return odigosConfig, err } diff --git a/scheduler/controllers/collectorgroups/datacollection.go b/scheduler/controllers/collectorgroups/datacollection.go index 71a6b508d..409298c09 100644 --- a/scheduler/controllers/collectorgroups/datacollection.go +++ b/scheduler/controllers/collectorgroups/datacollection.go @@ -2,14 +2,21 @@ package collectorgroups import ( odigosv1 "github.com/odigos-io/odigos/api/odigos/v1alpha1" + "github.com/odigos-io/odigos/common" "github.com/odigos-io/odigos/k8sutils/pkg/consts" "github.com/odigos-io/odigos/k8sutils/pkg/env" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" ) -const NodeCollectorDefaultOwnMetricsPort = 55682 +const NodeCollectorDefaultOwnMetricsPort = int32(55682) + +func NewNodeCollectorGroup(odigosConfig common.OdigosConfiguration) *odigosv1.CollectorsGroup { + + ownMetricsPort := NodeCollectorDefaultOwnMetricsPort + if odigosConfig.CollectorNode != nil && odigosConfig.CollectorNode.CollectorOwnMetricsPort != 0 { + ownMetricsPort = odigosConfig.CollectorNode.CollectorOwnMetricsPort + } -func NewNodeCollectorGroup() *odigosv1.CollectorsGroup { return &odigosv1.CollectorsGroup{ TypeMeta: metav1.TypeMeta{ Kind: "CollectorsGroup", @@ -21,7 +28,7 @@ func NewNodeCollectorGroup() *odigosv1.CollectorsGroup { }, Spec: odigosv1.CollectorsGroupSpec{ Role: odigosv1.CollectorsGroupRoleNodeCollector, - CollectorOwnMetricsPort: NodeCollectorDefaultOwnMetricsPort, + CollectorOwnMetricsPort: ownMetricsPort, }, } } diff --git a/scheduler/controllers/collectorsgroup_controller.go b/scheduler/controllers/collectorsgroup_controller.go index 24ca9e1d6..424c3b654 100644 --- a/scheduler/controllers/collectorsgroup_controller.go +++ b/scheduler/controllers/collectorsgroup_controller.go @@ -89,8 +89,14 @@ func (r *CollectorsGroupReconciler) Reconcile(ctx context.Context, req ctrl.Requ return ctrl.Result{}, err } + odigosConfig, err := utils.GetCurrentOdigosConfig(ctx, r.Client) + if err != nil { + logger.Error(err, "failed to get odigos config") + return ctrl.Result{}, err + } + if collectorgroups.ShouldHaveNodeCollectorGroup(gatewayReady, len(instApps.Items)) { - err = utils.ApplyCollectorGroup(ctx, r.Client, collectorgroups.NewNodeCollectorGroup()) + err = utils.ApplyCollectorGroup(ctx, r.Client, collectorgroups.NewNodeCollectorGroup(odigosConfig)) if err != nil { logger.Error(err, "failed to apply node collector group") return ctrl.Result{}, err diff --git a/scheduler/controllers/instrumentationconfig_controller.go b/scheduler/controllers/instrumentationconfig_controller.go index f47141113..090fd3b15 100644 --- a/scheduler/controllers/instrumentationconfig_controller.go +++ b/scheduler/controllers/instrumentationconfig_controller.go @@ -52,8 +52,14 @@ func (r *InstrumentationConfigReconciler) Reconcile(ctx context.Context, req ctr return ctrl.Result{}, err } + odigosConfig, err := utils.GetCurrentOdigosConfig(ctx, r.Client) + if err != nil { + logger.Error(err, "failed to get odigos config") + return ctrl.Result{}, err + } + if nodeCollectorGroupUtil.ShouldHaveNodeCollectorGroup(clusterCollectorGroup.Status.Ready, numberOfInstrumentedApps) { - err = utils.ApplyCollectorGroup(ctx, r.Client, nodeCollectorGroupUtil.NewNodeCollectorGroup()) + err = utils.ApplyCollectorGroup(ctx, r.Client, nodeCollectorGroupUtil.NewNodeCollectorGroup(odigosConfig)) if err != nil { logger.Error(err, "failed to create data collection collector group") return ctrl.Result{}, err diff --git a/scheduler/main.go b/scheduler/main.go index 9ae9e330c..6dc53d004 100644 --- a/scheduler/main.go +++ b/scheduler/main.go @@ -22,6 +22,8 @@ import ( "github.com/go-logr/zapr" bridge "github.com/odigos-io/opentelemetry-zap-bridge" + "sigs.k8s.io/controller-runtime/pkg/cache" + "sigs.k8s.io/controller-runtime/pkg/client" metricsserver "sigs.k8s.io/controller-runtime/pkg/metrics/server" // Import all Kubernetes client auth plugins (e.g. Azure, GCP, OIDC, etc.) @@ -29,6 +31,10 @@ import ( _ "k8s.io/client-go/plugin/pkg/client/auth" odigosv1 "github.com/odigos-io/odigos/api/odigos/v1alpha1" + "github.com/odigos-io/odigos/common/consts" + "github.com/odigos-io/odigos/k8sutils/pkg/env" + corev1 "k8s.io/api/core/v1" + "k8s.io/apimachinery/pkg/fields" "k8s.io/apimachinery/pkg/runtime" utilruntime "k8s.io/apimachinery/pkg/util/runtime" clientgoscheme "k8s.io/client-go/kubernetes/scheme" @@ -71,11 +77,24 @@ func main() { logger := zapr.NewLogger(zapLogger) ctrl.SetLogger(logger) + odigosNs := env.GetCurrentNamespace() + nsSelector := client.InNamespace(odigosNs).AsSelector() + nameSelector := fields.OneTermEqualSelector("metadata.name", consts.OdigosConfigurationName) + odigosConfigSelector := fields.AndSelectors(nsSelector, nameSelector) + mgr, err := ctrl.NewManager(ctrl.GetConfigOrDie(), ctrl.Options{ Scheme: scheme, Metrics: metricsserver.Options{ BindAddress: metricsAddr, }, + Cache: cache.Options{ + DefaultTransform: cache.TransformStripManagedFields(), + ByObject: map[client.Object]cache.ByObject{ + &corev1.ConfigMap{}: { + Field: odigosConfigSelector, + }, + }, + }, HealthProbeBindAddress: probeAddr, LeaderElection: enableLeaderElection, LeaderElectionID: "ce024640.odigos.io", From ff7de28aa8b1b63aa6c26fe1f75fc75280b29185 Mon Sep 17 00:00:00 2001 From: Amir Blum Date: Fri, 25 Oct 2024 10:40:08 +0300 Subject: [PATCH 04/13] refactor: make the cg CRD config required --- api/config/crd/bases/odigos.io_collectorsgroups.yaml | 6 +++--- api/odigos/v1alpha1/collectorsgroup_types.go | 3 +-- autoscaler/controllers/datacollection/configmap.go | 6 +----- common/odigos_config.go | 1 - scheduler/go.mod | 4 ++-- 5 files changed, 7 insertions(+), 13 deletions(-) diff --git a/api/config/crd/bases/odigos.io_collectorsgroups.yaml b/api/config/crd/bases/odigos.io_collectorsgroups.yaml index 5dbd74892..551a8892e 100644 --- a/api/config/crd/bases/odigos.io_collectorsgroups.yaml +++ b/api/config/crd/bases/odigos.io_collectorsgroups.yaml @@ -43,9 +43,8 @@ spec: description: CollectorsGroupSpec defines the desired state of Collector properties: collectorOwnMetricsPort: - description: |- - The port to use for exposing the collector's own metrics as a prometheus endpoint. - Default when unset is 55682. + description: The port to use for exposing the collector's own metrics + as a prometheus endpoint. format: int32 type: integer role: @@ -54,6 +53,7 @@ spec: - NODE_COLLECTOR type: string required: + - collectorOwnMetricsPort - role type: object status: diff --git a/api/odigos/v1alpha1/collectorsgroup_types.go b/api/odigos/v1alpha1/collectorsgroup_types.go index 3bce0fdb9..6830c5678 100644 --- a/api/odigos/v1alpha1/collectorsgroup_types.go +++ b/api/odigos/v1alpha1/collectorsgroup_types.go @@ -35,8 +35,7 @@ type CollectorsGroupSpec struct { Role CollectorsGroupRole `json:"role"` // The port to use for exposing the collector's own metrics as a prometheus endpoint. - // Default when unset is 55682. - CollectorOwnMetricsPort int32 `json:"collectorOwnMetricsPort,omitempty"` + CollectorOwnMetricsPort int32 `json:"collectorOwnMetricsPort"` } // CollectorsGroupStatus defines the observed state of Collector diff --git a/autoscaler/controllers/datacollection/configmap.go b/autoscaler/controllers/datacollection/configmap.go index 17074c63b..49c4c7515 100644 --- a/autoscaler/controllers/datacollection/configmap.go +++ b/autoscaler/controllers/datacollection/configmap.go @@ -127,11 +127,7 @@ func getDesiredConfigMap(apps *odigosv1.InstrumentedApplicationList, dests *odig func calculateConfigMapData(collectorsGroup *odigosv1.CollectorsGroup, apps *odigosv1.InstrumentedApplicationList, dests *odigosv1.DestinationList, processors []*odigosv1.Processor, setTracesLoadBalancer bool, disableNameProcessor bool) (string, error) { - // TODO: make collectorsGroup.Spec.CollectorOwnMetricsPort required and concile default values elsewhere - ownMetricsPort := int32(55682) - if collectorsGroup.Spec.CollectorOwnMetricsPort != 0 { - ownMetricsPort = collectorsGroup.Spec.CollectorOwnMetricsPort - } + ownMetricsPort := collectorsGroup.Spec.CollectorOwnMetricsPort empty := struct{}{} diff --git a/common/odigos_config.go b/common/odigos_config.go index 55152ec72..999ca8754 100644 --- a/common/odigos_config.go +++ b/common/odigos_config.go @@ -6,7 +6,6 @@ type CollectorNodeConfiguration struct { // Each node collector, running as a daemonset, runs on the host network, // and exposes prometheus metrics endpoint on this a dedicated port. - // When unset, the default port is 55682. // Because it shares the port network with the host, // if some other process is using the port, the node collector will not start. // This option allows to set a different port for the node collector to overcome this issue if encountered. diff --git a/scheduler/go.mod b/scheduler/go.mod index dcc459f56..ef8bb2b33 100644 --- a/scheduler/go.mod +++ b/scheduler/go.mod @@ -5,10 +5,12 @@ go 1.22.0 require ( github.com/go-logr/zapr v1.3.0 github.com/odigos-io/odigos/api v0.0.0 + github.com/odigos-io/odigos/common v0.0.0 github.com/odigos-io/odigos/k8sutils v0.0.0 github.com/odigos-io/opentelemetry-zap-bridge v0.0.5 github.com/onsi/ginkgo v1.16.5 github.com/onsi/gomega v1.34.2 + k8s.io/api v0.31.0 k8s.io/apimachinery v0.31.0 k8s.io/client-go v0.31.0 sigs.k8s.io/controller-runtime v0.19.0 @@ -50,7 +52,6 @@ require ( github.com/modern-go/reflect2 v1.0.2 // indirect github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822 // indirect github.com/nxadm/tail v1.4.8 // indirect - github.com/odigos-io/odigos/common v0.0.0 // indirect github.com/pkg/errors v0.9.1 // indirect github.com/prometheus/client_golang v1.19.1 // indirect github.com/prometheus/client_model v0.6.1 // indirect @@ -82,7 +83,6 @@ require ( gopkg.in/tomb.v1 v1.0.0-20141024135613-dd632973f1e7 // indirect gopkg.in/yaml.v2 v2.4.0 // indirect gopkg.in/yaml.v3 v3.0.1 // indirect - k8s.io/api v0.31.0 // indirect k8s.io/apiextensions-apiserver v0.31.0 // indirect k8s.io/klog/v2 v2.130.1 // indirect k8s.io/kube-openapi v0.0.0-20240228011516-70dd3763d340 // indirect From 410a896a92090ab2e9881323735abfc27268ae45 Mon Sep 17 00:00:00 2001 From: Amir Blum Date: Fri, 25 Oct 2024 12:20:16 +0300 Subject: [PATCH 05/13] feat: helm and odigos config reconcile --- helm/odigos/templates/odigos-config-cm.yaml | 6 + helm/odigos/values.yaml | 8 + .../controllers/collectorsgroup_controller.go | 114 ----------- .../instrumentationconfig_controller.go | 76 ------- .../controllers/nodecollector_controller.go | 189 ++++++++++++++++++ scheduler/main.go | 9 +- 6 files changed, 204 insertions(+), 198 deletions(-) delete mode 100644 scheduler/controllers/collectorsgroup_controller.go delete mode 100644 scheduler/controllers/instrumentationconfig_controller.go create mode 100644 scheduler/controllers/nodecollector_controller.go diff --git a/helm/odigos/templates/odigos-config-cm.yaml b/helm/odigos/templates/odigos-config-cm.yaml index 024c3287e..1ea0ec880 100644 --- a/helm/odigos/templates/odigos-config-cm.yaml +++ b/helm/odigos/templates/odigos-config-cm.yaml @@ -30,6 +30,12 @@ data: goMemLimitMiB: {{ . }} {{- end }} {{- end }} + {{- if .Values.collectorNode }} + collectorNode: + {{- with .Values.collectorNode.collectorOwnMetricsPort }} + collectorOwnMetricsPort: {{ . }} + {{- end }} + {{- end }} instrumentorImage: {{ .Values.instrumentor.image.repository }} telemetryEnabled: {{ .Values.telemetry.enabled }} openshiftEnabled: {{ .Values.openshift.enabled }} diff --git a/helm/odigos/values.yaml b/helm/odigos/values.yaml index 01651fcc6..464141507 100644 --- a/helm/odigos/values.yaml +++ b/helm/odigos/values.yaml @@ -38,6 +38,14 @@ collectorGateway: # if not specified, it will be set to 80% of the hard limit of the memory limiter. goMemLimitMiB: 340 +collectorNode: + # Each node collector, running as a daemonset, uses the host network + # and exposes prometheus metrics endpoint on a dedicated port. + # Because it shares the port network with the host (k8s node), + # if some other process is using the port, the node collector will fail to start. + # Use this option to set a specific port for the node collector to control the port it will bind to. + collectorOwnMetricsPort: 55682 + autoscaler: image: repository: keyval/odigos-autoscaler diff --git a/scheduler/controllers/collectorsgroup_controller.go b/scheduler/controllers/collectorsgroup_controller.go deleted file mode 100644 index 424c3b654..000000000 --- a/scheduler/controllers/collectorsgroup_controller.go +++ /dev/null @@ -1,114 +0,0 @@ -/* -Copyright 2022. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -*/ - -package controllers - -import ( - "context" - - odigosv1 "github.com/odigos-io/odigos/api/odigos/v1alpha1" - "github.com/odigos-io/odigos/k8sutils/pkg/utils" - "github.com/odigos-io/odigos/scheduler/controllers/collectorgroups" - "k8s.io/apimachinery/pkg/runtime" - ctrl "sigs.k8s.io/controller-runtime" - "sigs.k8s.io/controller-runtime/pkg/client" - "sigs.k8s.io/controller-runtime/pkg/log" -) - -// CollectorsGroupReconciler reconciles a CollectorsGroup object -type CollectorsGroupReconciler struct { - client.Client - Scheme *runtime.Scheme -} - -// up until v1.0.31, the collectors group role names were "GATEWAY" and "DATA_COLLECTION". -// in v1.0.32, the role names were changed to "CLUSTER_GATEWAY" and "NODE_COLLECTOR", -// due to adding the Processor CRD which uses these role names. -// the new names are more descriptive and are preparations for future roles. -// unfortunately, the role names are used in the collectorgroup CR, which needs to be updated -// when a user upgrades from <=v1.0.31 to >=v1.0.32. -// this function is responsible to do this update. -// once we drop support for <=v1.0.31, we can remove this function. -func (r *CollectorsGroupReconciler) applyNewCollectorRoleNames(ctx context.Context, collectorGroup *odigosv1.CollectorsGroup) error { - if collectorGroup.Spec.Role == "GATEWAY" { - logger := log.FromContext(ctx) - logger.Info("updating collector group role name", "old", "GATEWAY", "new", "CLUSTER_GATEWAY") - collectorGroup.Spec.Role = odigosv1.CollectorsGroupRoleClusterGateway - return r.Update(ctx, collectorGroup) - } - if collectorGroup.Spec.Role == "DATA_COLLECTION" { - logger := log.FromContext(ctx) - logger.Info("updating collector group role name", "old", "DATA_COLLECTION", "new", "NODE_COLLECTOR") - collectorGroup.Spec.Role = odigosv1.CollectorsGroupRoleNodeCollector - return r.Update(ctx, collectorGroup) - } - return nil -} - -// +kubebuilder:rbac:groups=odigos.io,resources=collectorsgroups,verbs=get;list;watch;create;update;patch;delete -// +kubebuilder:rbac:groups=odigos.io,resources=collectorsgroups/status,verbs=get;update;patch -// +kubebuilder:rbac:groups=odigos.io,resources=collectorsgroups/finalizers,verbs=update -func (r *CollectorsGroupReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.Result, error) { - logger := log.FromContext(ctx) - var collectorGroups odigosv1.CollectorsGroupList - err := r.List(ctx, &collectorGroups, client.InNamespace(req.Namespace)) - if err != nil { - logger.Error(err, "failed to list collectors groups") - return ctrl.Result{}, err - } - - gatewayReady := false - for _, collectorGroup := range collectorGroups.Items { - err := r.applyNewCollectorRoleNames(ctx, &collectorGroup) - if err != nil { - logger.Error(err, "failed to apply new collector role names") - return ctrl.Result{}, err - } - - if collectorGroup.Spec.Role == odigosv1.CollectorsGroupRoleClusterGateway && collectorGroup.Status.Ready { - gatewayReady = true - } - } - - var instApps odigosv1.InstrumentationConfigList - if err = r.List(ctx, &instApps); err != nil { - logger.Error(err, "failed to list InstrumentationConfigs") - return ctrl.Result{}, err - } - - odigosConfig, err := utils.GetCurrentOdigosConfig(ctx, r.Client) - if err != nil { - logger.Error(err, "failed to get odigos config") - return ctrl.Result{}, err - } - - if collectorgroups.ShouldHaveNodeCollectorGroup(gatewayReady, len(instApps.Items)) { - err = utils.ApplyCollectorGroup(ctx, r.Client, collectorgroups.NewNodeCollectorGroup(odigosConfig)) - if err != nil { - logger.Error(err, "failed to apply node collector group") - return ctrl.Result{}, err - } - } - - return ctrl.Result{}, nil -} - -// SetupWithManager sets up the controller with the Manager. -func (r *CollectorsGroupReconciler) SetupWithManager(mgr ctrl.Manager) error { - return ctrl.NewControllerManagedBy(mgr). - For(&odigosv1.CollectorsGroup{}). - Complete(r) -} diff --git a/scheduler/controllers/instrumentationconfig_controller.go b/scheduler/controllers/instrumentationconfig_controller.go deleted file mode 100644 index 090fd3b15..000000000 --- a/scheduler/controllers/instrumentationconfig_controller.go +++ /dev/null @@ -1,76 +0,0 @@ -package controllers - -import ( - "context" - - odigosv1 "github.com/odigos-io/odigos/api/odigos/v1alpha1" - "github.com/odigos-io/odigos/k8sutils/pkg/consts" - "github.com/odigos-io/odigos/k8sutils/pkg/env" - "github.com/odigos-io/odigos/k8sutils/pkg/utils" - nodeCollectorGroupUtil "github.com/odigos-io/odigos/scheduler/controllers/collectorgroups" - "k8s.io/apimachinery/pkg/api/errors" - "k8s.io/apimachinery/pkg/runtime" - ctrl "sigs.k8s.io/controller-runtime" - "sigs.k8s.io/controller-runtime/pkg/client" - "sigs.k8s.io/controller-runtime/pkg/log" -) - -type InstrumentationConfigReconciler struct { - client.Client - Scheme *runtime.Scheme - ImagePullSecrets []string - OdigosVersion string -} - -func (r *InstrumentationConfigReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.Result, error) { - logger := log.FromContext(ctx) - logger.V(0).Info("Reconciling InstrumentationConfig") - - namespace := env.GetCurrentNamespace() - - var instrumentedConfigs odigosv1.InstrumentationConfigList - err := r.List(ctx, &instrumentedConfigs) - if err != nil { - logger.Error(err, "failed to list InstrumentationConfigs") - return ctrl.Result{}, err - } - numberOfInstrumentedApps := len(instrumentedConfigs.Items) - - if numberOfInstrumentedApps == 0 { - if err = utils.DeleteCollectorGroup(ctx, r.Client, namespace, consts.OdigosNodeCollectorCollectorGroupName); err != nil { - return ctrl.Result{}, err - } - } - - clusterCollectorGroup, err := utils.GetCollectorGroup(ctx, r.Client, namespace, consts.OdigosClusterCollectorCollectorGroupName) - if err != nil { - if errors.IsNotFound(err) { - logger.V(3).Info("collector group doesn't exist", "collectorGroupName", clusterCollectorGroup) - return ctrl.Result{}, nil - } - logger.Error(err, "failed to get collector group", "collectorGroupName", consts.OdigosClusterCollectorCollectorGroupName) - return ctrl.Result{}, err - } - - odigosConfig, err := utils.GetCurrentOdigosConfig(ctx, r.Client) - if err != nil { - logger.Error(err, "failed to get odigos config") - return ctrl.Result{}, err - } - - if nodeCollectorGroupUtil.ShouldHaveNodeCollectorGroup(clusterCollectorGroup.Status.Ready, numberOfInstrumentedApps) { - err = utils.ApplyCollectorGroup(ctx, r.Client, nodeCollectorGroupUtil.NewNodeCollectorGroup(odigosConfig)) - if err != nil { - logger.Error(err, "failed to create data collection collector group") - return ctrl.Result{}, err - } - } - - return ctrl.Result{}, nil -} - -func (r *InstrumentationConfigReconciler) SetupWithManager(mgr ctrl.Manager) error { - return ctrl.NewControllerManagedBy(mgr). - For(&odigosv1.InstrumentationConfig{}). - Complete(r) -} diff --git a/scheduler/controllers/nodecollector_controller.go b/scheduler/controllers/nodecollector_controller.go new file mode 100644 index 000000000..2fd622a8e --- /dev/null +++ b/scheduler/controllers/nodecollector_controller.go @@ -0,0 +1,189 @@ +package controllers + +import ( + "context" + + odigosv1 "github.com/odigos-io/odigos/api/odigos/v1alpha1" + consts "github.com/odigos-io/odigos/common/consts" + k8sutilsconsts "github.com/odigos-io/odigos/k8sutils/pkg/consts" + "github.com/odigos-io/odigos/k8sutils/pkg/env" + "github.com/odigos-io/odigos/k8sutils/pkg/utils" + nodeCollectorGroupUtil "github.com/odigos-io/odigos/scheduler/controllers/collectorgroups" + corev1 "k8s.io/api/core/v1" + "k8s.io/apimachinery/pkg/api/errors" + "k8s.io/apimachinery/pkg/runtime" + ctrl "sigs.k8s.io/controller-runtime" + "sigs.k8s.io/controller-runtime/pkg/client" + "sigs.k8s.io/controller-runtime/pkg/event" + "sigs.k8s.io/controller-runtime/pkg/log" + "sigs.k8s.io/controller-runtime/pkg/predicate" +) + +type NodeCollectorsGroupReconciler struct { + client.Client + Scheme *runtime.Scheme + ImagePullSecrets []string + OdigosVersion string +} + +// makes sure that the controller only reacts to events related to the odigos-config configmap +// and does not trigger on other configmaps +type odigosConfigPredicate struct{} + +func (i *odigosConfigPredicate) Create(e event.CreateEvent) bool { + return e.Object.GetName() == consts.OdigosConfigurationName +} + +func (i *odigosConfigPredicate) Update(e event.UpdateEvent) bool { + return e.ObjectNew.GetName() == consts.OdigosConfigurationName +} + +func (i *odigosConfigPredicate) Delete(e event.DeleteEvent) bool { + return e.Object.GetName() == consts.OdigosConfigurationName +} + +func (i *odigosConfigPredicate) Generic(e event.GenericEvent) bool { + return e.Object.GetName() == consts.OdigosConfigurationName +} + +var _ predicate.Predicate = &odigosConfigPredicate{} + +// For instrumentation configs, we only care if the object exists or not, since we count if there are more than 0. +// thus, we can filter out all updates events which will not affect reconciliation +type existingPredicate struct{} + +func (i *existingPredicate) Create(e event.CreateEvent) bool { + return true +} + +func (i *existingPredicate) Update(e event.UpdateEvent) bool { + return false +} + +func (i *existingPredicate) Delete(e event.DeleteEvent) bool { + return true +} + +func (i *existingPredicate) Generic(e event.GenericEvent) bool { + return false +} + +var _ predicate.Predicate = &existingPredicate{} + +// this predicate filters collectorsgroup events. +// it will only forward events that are: +// 1. for cluster collector group +// 2. If the cluster collector group was not ready and now it is ready +type clusterCollectorBecomesReadyPredicate struct{} + +func (i *clusterCollectorBecomesReadyPredicate) Create(e event.CreateEvent) bool { + return false +} + +func (i *clusterCollectorBecomesReadyPredicate) Update(e event.UpdateEvent) bool { + if e.ObjectNew.GetName() != k8sutilsconsts.OdigosClusterCollectorCollectorGroupName { + return false + } + + oldCollectorGroup, ok := e.ObjectOld.(*odigosv1.CollectorsGroup) + if !ok { + return false + } + newCollectorGroup, ok := e.ObjectNew.(*odigosv1.CollectorsGroup) + if !ok { + return false + } + + return !oldCollectorGroup.Status.Ready && newCollectorGroup.Status.Ready +} + +func (i *clusterCollectorBecomesReadyPredicate) Delete(e event.DeleteEvent) bool { + return false +} + +func (i *clusterCollectorBecomesReadyPredicate) Generic(e event.GenericEvent) bool { + return false +} + +var _ predicate.Predicate = &clusterCollectorBecomesReadyPredicate{} + +func (r *NodeCollectorsGroupReconciler) Reconcile(ctx context.Context, _ ctrl.Request) (ctrl.Result, error) { + logger := log.FromContext(ctx) + logger.V(0).Info("Reconciling NodeCollectorsGroup controller") + + namespace := env.GetCurrentNamespace() + + var instrumentedConfigs odigosv1.InstrumentationConfigList + err := r.List(ctx, &instrumentedConfigs) + if err != nil { + logger.Error(err, "failed to list InstrumentationConfigs") + return ctrl.Result{}, err + } + numberOfInstrumentedApps := len(instrumentedConfigs.Items) + + if numberOfInstrumentedApps == 0 { + if err = utils.DeleteCollectorGroup(ctx, r.Client, namespace, k8sutilsconsts.OdigosNodeCollectorCollectorGroupName); err != nil { + return ctrl.Result{}, err + } + } + + clusterCollectorGroup, err := utils.GetCollectorGroup(ctx, r.Client, namespace, k8sutilsconsts.OdigosClusterCollectorCollectorGroupName) + if err != nil { + if errors.IsNotFound(err) { + logger.V(3).Info("collector group doesn't exist", "collectorGroupName", clusterCollectorGroup) + return ctrl.Result{}, nil + } + logger.Error(err, "failed to get collector group", "collectorGroupName", k8sutilsconsts.OdigosClusterCollectorCollectorGroupName) + return ctrl.Result{}, err + } + + odigosConfig, err := utils.GetCurrentOdigosConfig(ctx, r.Client) + if err != nil { + logger.Error(err, "failed to get odigos config") + return ctrl.Result{}, err + } + + if nodeCollectorGroupUtil.ShouldHaveNodeCollectorGroup(clusterCollectorGroup.Status.Ready, numberOfInstrumentedApps) { + err = utils.ApplyCollectorGroup(ctx, r.Client, nodeCollectorGroupUtil.NewNodeCollectorGroup(odigosConfig)) + if err != nil { + logger.Error(err, "failed to create data collection collector group") + return ctrl.Result{}, err + } + } + + return ctrl.Result{}, nil +} + +func (r *NodeCollectorsGroupReconciler) SetupWithManager(mgr ctrl.Manager) error { + + // here we enumerate the inputs events that the controller when data collection collector group should be updated + + err := ctrl.NewControllerManagedBy(mgr). + For(&odigosv1.InstrumentationConfig{}). + Named("nodecollectorgroup-instrumentationconfig"). + WithEventFilter(&existingPredicate{}). + Complete(r) + if err != nil { + return err + } + + err = ctrl.NewControllerManagedBy(mgr). + For(&corev1.ConfigMap{}). + Named("nodecollectorgroup-odigosconfig"). + WithEventFilter(&odigosConfigPredicate{}). + Complete(r) + if err != nil { + return err + } + + err = ctrl.NewControllerManagedBy(mgr). + For(&odigosv1.CollectorsGroup{}). + Named("nodecollectorgroup-collectorsgroup"). + WithEventFilter(&clusterCollectorBecomesReadyPredicate{}). + Complete(r) + if err != nil { + return err + } + + return nil +} diff --git a/scheduler/main.go b/scheduler/main.go index 6dc53d004..21c1ad2a1 100644 --- a/scheduler/main.go +++ b/scheduler/main.go @@ -104,13 +104,6 @@ func main() { os.Exit(1) } - if err = (&controllers.CollectorsGroupReconciler{ - Client: mgr.GetClient(), - Scheme: mgr.GetScheme(), - }).SetupWithManager(mgr); err != nil { - setupLog.Error(err, "unable to create controller", "controller", "CollectorsGroup") - os.Exit(1) - } if err = (&controllers.DestinationReconciler{ Client: mgr.GetClient(), Scheme: mgr.GetScheme(), @@ -118,7 +111,7 @@ func main() { setupLog.Error(err, "unable to create controller", "controller", "Destination") os.Exit(1) } - if err = (&controllers.InstrumentationConfigReconciler{ + if err = (&controllers.NodeCollectorsGroupReconciler{ Client: mgr.GetClient(), Scheme: mgr.GetScheme(), }).SetupWithManager(mgr); err != nil { From 11fa0d307a1cc264e6c350b4fca0edd1c3a2ee25 Mon Sep 17 00:00:00 2001 From: Amir Blum Date: Fri, 25 Oct 2024 12:23:10 +0300 Subject: [PATCH 06/13] fix: remove debug print --- k8sutils/pkg/utils/config_util.go | 2 -- 1 file changed, 2 deletions(-) diff --git a/k8sutils/pkg/utils/config_util.go b/k8sutils/pkg/utils/config_util.go index cd60ff399..3845f60ec 100644 --- a/k8sutils/pkg/utils/config_util.go +++ b/k8sutils/pkg/utils/config_util.go @@ -2,7 +2,6 @@ package utils import ( "context" - "fmt" "github.com/odigos-io/odigos/common" "github.com/odigos-io/odigos/common/consts" @@ -20,7 +19,6 @@ func GetCurrentOdigosConfig(ctx context.Context, k8sClient client.Client) (commo if err := k8sClient.Get(ctx, types.NamespacedName{Namespace: odigosSystemNamespaceName, Name: consts.OdigosConfigurationName}, &configMap); err != nil { return odigosConfig, err } - fmt.Println(configMap.Data[consts.OdigosConfigurationFileName]) if err := yaml.Unmarshal([]byte(configMap.Data[consts.OdigosConfigurationFileName]), &odigosConfig); err != nil { return odigosConfig, err } From 674e01ed0e6f6a7aeca3d9b3871d27f337f4f51e Mon Sep 17 00:00:00 2001 From: Amir Blum Date: Fri, 25 Oct 2024 12:29:34 +0300 Subject: [PATCH 07/13] fix: remove unused predicate --- .../controllers/collectorsgroup_controller.go | 23 ------------------- 1 file changed, 23 deletions(-) diff --git a/autoscaler/controllers/collectorsgroup_controller.go b/autoscaler/controllers/collectorsgroup_controller.go index ab11a7db7..f46ec50bd 100644 --- a/autoscaler/controllers/collectorsgroup_controller.go +++ b/autoscaler/controllers/collectorsgroup_controller.go @@ -22,7 +22,6 @@ import ( odigosv1 "github.com/odigos-io/odigos/api/odigos/v1alpha1" "github.com/odigos-io/odigos/autoscaler/controllers/datacollection" "github.com/odigos-io/odigos/autoscaler/controllers/gateway" - "sigs.k8s.io/controller-runtime/pkg/event" "sigs.k8s.io/controller-runtime/pkg/predicate" "sigs.k8s.io/controller-runtime/pkg/log" @@ -32,28 +31,6 @@ import ( "sigs.k8s.io/controller-runtime/pkg/client" ) -type onlyCreatePredicate struct { - predicate.Funcs -} - -func (i *onlyCreatePredicate) Create(e event.CreateEvent) bool { - return true -} - -func (i *onlyCreatePredicate) Update(e event.UpdateEvent) bool { - return false -} - -func (i *onlyCreatePredicate) Delete(e event.DeleteEvent) bool { - return false -} - -func (i *onlyCreatePredicate) Generic(e event.GenericEvent) bool { - return false -} - -var _ predicate.Predicate = &onlyCreatePredicate{} - // CollectorsGroupReconciler reconciles a CollectorsGroup object type CollectorsGroupReconciler struct { client.Client From b177635209fba06d7358f2ad9eed8a916f046258 Mon Sep 17 00:00:00 2001 From: Amir Blum Date: Fri, 25 Oct 2024 12:31:37 +0300 Subject: [PATCH 08/13] chore: move const default port to k8sutils --- autoscaler/controllers/gateway/configmap_test.go | 4 +++- k8sutils/pkg/consts/consts.go | 7 ++++--- scheduler/controllers/collectorgroups/datacollection.go | 4 +--- 3 files changed, 8 insertions(+), 7 deletions(-) diff --git a/autoscaler/controllers/gateway/configmap_test.go b/autoscaler/controllers/gateway/configmap_test.go index 88ef1968b..3a037adb8 100644 --- a/autoscaler/controllers/gateway/configmap_test.go +++ b/autoscaler/controllers/gateway/configmap_test.go @@ -1,9 +1,11 @@ package gateway import ( + "fmt" "testing" "github.com/odigos-io/odigos/common/config" + "github.com/odigos-io/odigos/k8sutils/pkg/consts" "github.com/stretchr/testify/assert" ) @@ -115,7 +117,7 @@ func TestAddSelfTelemetryPipeline(t *testing.T) { assert.Equal(t, []string{"prometheus"}, c.Service.Pipelines["metrics/otelcol"].Receivers) assert.Equal(t, []string{"resource/pod-name"}, c.Service.Pipelines["metrics/otelcol"].Processors) assert.Equal(t, []string{"otlp/ui"}, c.Service.Pipelines["metrics/otelcol"].Exporters) - assert.Equal(t, "0.0.0.0:55682", c.Service.Telemetry.Metrics["address"]) + assert.Equal(t, fmt.Sprintf("0.0.0.0:%d", consts.OdigosNodeCollectorOwnTelemetryPortDefault), c.Service.Telemetry.Metrics["address"]) for pipelineName, pipeline := range c.Service.Pipelines { if pipelineName == "metrics/otelcol" { assert.NotContains(t, pipeline.Processors, "odigostrafficmetrics") diff --git a/k8sutils/pkg/consts/consts.go b/k8sutils/pkg/consts/consts.go index f27b4c7c5..052b0fc44 100644 --- a/k8sutils/pkg/consts/consts.go +++ b/k8sutils/pkg/consts/consts.go @@ -24,9 +24,10 @@ const ( ) const ( - OdigosNodeCollectorDaemonSetName = "odigos-data-collection" - OdigosNodeCollectorConfigMapName = OdigosNodeCollectorDaemonSetName - OdigosNodeCollectorCollectorGroupName = OdigosNodeCollectorDaemonSetName + OdigosNodeCollectorDaemonSetName = "odigos-data-collection" + OdigosNodeCollectorConfigMapName = OdigosNodeCollectorDaemonSetName + OdigosNodeCollectorCollectorGroupName = OdigosNodeCollectorDaemonSetName + OdigosNodeCollectorOwnTelemetryPortDefault = int32(55682) OdigosNodeCollectorConfigMapKey = "conf" // this key is different than the cluster collector value. not sure why ) diff --git a/scheduler/controllers/collectorgroups/datacollection.go b/scheduler/controllers/collectorgroups/datacollection.go index 409298c09..3d3631d84 100644 --- a/scheduler/controllers/collectorgroups/datacollection.go +++ b/scheduler/controllers/collectorgroups/datacollection.go @@ -8,11 +8,9 @@ import ( metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" ) -const NodeCollectorDefaultOwnMetricsPort = int32(55682) - func NewNodeCollectorGroup(odigosConfig common.OdigosConfiguration) *odigosv1.CollectorsGroup { - ownMetricsPort := NodeCollectorDefaultOwnMetricsPort + ownMetricsPort := consts.OdigosNodeCollectorOwnTelemetryPortDefault if odigosConfig.CollectorNode != nil && odigosConfig.CollectorNode.CollectorOwnMetricsPort != 0 { ownMetricsPort = odigosConfig.CollectorNode.CollectorOwnMetricsPort } From f52f1166d8b44622dd102f418ed0bf327507d63b Mon Sep 17 00:00:00 2001 From: Amir Blum Date: Fri, 25 Oct 2024 15:41:17 +0300 Subject: [PATCH 09/13] fix: values syntax error --- helm/odigos/values.yaml | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/helm/odigos/values.yaml b/helm/odigos/values.yaml index 464141507..5d4f2f56c 100644 --- a/helm/odigos/values.yaml +++ b/helm/odigos/values.yaml @@ -40,10 +40,10 @@ collectorGateway: collectorNode: # Each node collector, running as a daemonset, uses the host network - # and exposes prometheus metrics endpoint on a dedicated port. - # Because it shares the port network with the host (k8s node), - # if some other process is using the port, the node collector will fail to start. - # Use this option to set a specific port for the node collector to control the port it will bind to. + # and exposes prometheus metrics endpoint on a dedicated port. + # Because it shares the port network with the host (k8s node), + # if some other process is using the port, the node collector will fail to start. + # Use this option to set a specific port for the node collector to control the port it will bind to. collectorOwnMetricsPort: 55682 autoscaler: From 613ccf7f4ac1e9c3f62824f90f527d0afb68082d Mon Sep 17 00:00:00 2001 From: Amir Blum Date: Fri, 25 Oct 2024 15:45:09 +0300 Subject: [PATCH 10/13] chore: aling docs for new config option --- api/odigos/v1alpha1/collectorsgroup_types.go | 1 + common/odigos_config.go | 8 ++------ helm/odigos/values.yaml | 7 ++----- 3 files changed, 5 insertions(+), 11 deletions(-) diff --git a/api/odigos/v1alpha1/collectorsgroup_types.go b/api/odigos/v1alpha1/collectorsgroup_types.go index 6830c5678..e04fc37ca 100644 --- a/api/odigos/v1alpha1/collectorsgroup_types.go +++ b/api/odigos/v1alpha1/collectorsgroup_types.go @@ -35,6 +35,7 @@ type CollectorsGroupSpec struct { Role CollectorsGroupRole `json:"role"` // The port to use for exposing the collector's own metrics as a prometheus endpoint. + // This can be used to resolve conflicting ports when a collector is using the host network. CollectorOwnMetricsPort int32 `json:"collectorOwnMetricsPort"` } diff --git a/common/odigos_config.go b/common/odigos_config.go index 999ca8754..5509be2ce 100644 --- a/common/odigos_config.go +++ b/common/odigos_config.go @@ -3,12 +3,8 @@ package common type ProfileName string type CollectorNodeConfiguration struct { - - // Each node collector, running as a daemonset, runs on the host network, - // and exposes prometheus metrics endpoint on this a dedicated port. - // Because it shares the port network with the host, - // if some other process is using the port, the node collector will not start. - // This option allows to set a different port for the node collector to overcome this issue if encountered. + // The port to use for exposing the collector's own metrics as a prometheus endpoint. + // This can be used to resolve conflicting ports when a collector is using the host network. CollectorOwnMetricsPort int32 `json:"collectorOwnMetricsPort,omitempty"` } diff --git a/helm/odigos/values.yaml b/helm/odigos/values.yaml index 5d4f2f56c..5f386788c 100644 --- a/helm/odigos/values.yaml +++ b/helm/odigos/values.yaml @@ -39,11 +39,8 @@ collectorGateway: goMemLimitMiB: 340 collectorNode: - # Each node collector, running as a daemonset, uses the host network - # and exposes prometheus metrics endpoint on a dedicated port. - # Because it shares the port network with the host (k8s node), - # if some other process is using the port, the node collector will fail to start. - # Use this option to set a specific port for the node collector to control the port it will bind to. + # The port to use for exposing the collector's own metrics as a prometheus endpoint. + # This can be used to resolve conflicting ports when a collector is using the host network. collectorOwnMetricsPort: 55682 autoscaler: From 59933581f3997f55d7b6111e51b23041c53037f0 Mon Sep 17 00:00:00 2001 From: Amir Blum Date: Fri, 25 Oct 2024 16:19:11 +0300 Subject: [PATCH 11/13] fix: helm values syntax --- helm/odigos/values.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/helm/odigos/values.yaml b/helm/odigos/values.yaml index 5f386788c..e407e5dcb 100644 --- a/helm/odigos/values.yaml +++ b/helm/odigos/values.yaml @@ -40,7 +40,7 @@ collectorGateway: collectorNode: # The port to use for exposing the collector's own metrics as a prometheus endpoint. - # This can be used to resolve conflicting ports when a collector is using the host network. + # This can be used to resolve conflicting ports when a collector is using the host network. collectorOwnMetricsPort: 55682 autoscaler: From 27ccd4c2c53c66570d496fd6d72f4f84335c76c9 Mon Sep 17 00:00:00 2001 From: Amir Blum Date: Fri, 25 Oct 2024 16:21:19 +0300 Subject: [PATCH 12/13] fix: api all --- api/config/crd/bases/odigos.io_collectorsgroups.yaml | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/api/config/crd/bases/odigos.io_collectorsgroups.yaml b/api/config/crd/bases/odigos.io_collectorsgroups.yaml index 551a8892e..0fc09ae44 100644 --- a/api/config/crd/bases/odigos.io_collectorsgroups.yaml +++ b/api/config/crd/bases/odigos.io_collectorsgroups.yaml @@ -43,8 +43,9 @@ spec: description: CollectorsGroupSpec defines the desired state of Collector properties: collectorOwnMetricsPort: - description: The port to use for exposing the collector's own metrics - as a prometheus endpoint. + description: |- + The port to use for exposing the collector's own metrics as a prometheus endpoint. + This can be used to resolve conflicting ports when a collector is using the host network. format: int32 type: integer role: From 9a644902e34f5f79acd970966b7c614d40b14ba7 Mon Sep 17 00:00:00 2001 From: Amir Blum Date: Fri, 25 Oct 2024 16:29:47 +0300 Subject: [PATCH 13/13] chore: nicer logging --- k8sutils/pkg/utils/collectorgroup_util.go | 2 +- scheduler/controllers/nodecollector_controller.go | 1 - 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/k8sutils/pkg/utils/collectorgroup_util.go b/k8sutils/pkg/utils/collectorgroup_util.go index 1647386cd..b62b6e145 100644 --- a/k8sutils/pkg/utils/collectorgroup_util.go +++ b/k8sutils/pkg/utils/collectorgroup_util.go @@ -10,7 +10,7 @@ import ( ) func ApplyCollectorGroup(ctx context.Context, c client.Client, collectorGroup *odigosv1.CollectorsGroup) error { - logger := log.FromContext(ctx).WithValues("collectorGroupName", collectorGroup.Name) + logger := log.FromContext(ctx) logger.Info("Applying collector group", "collectorGroupName", collectorGroup.Name) err := c.Patch(ctx, collectorGroup, client.Apply, client.ForceOwnership, client.FieldOwner("scheduler")) diff --git a/scheduler/controllers/nodecollector_controller.go b/scheduler/controllers/nodecollector_controller.go index 2fd622a8e..f58c87652 100644 --- a/scheduler/controllers/nodecollector_controller.go +++ b/scheduler/controllers/nodecollector_controller.go @@ -109,7 +109,6 @@ var _ predicate.Predicate = &clusterCollectorBecomesReadyPredicate{} func (r *NodeCollectorsGroupReconciler) Reconcile(ctx context.Context, _ ctrl.Request) (ctrl.Result, error) { logger := log.FromContext(ctx) - logger.V(0).Info("Reconciling NodeCollectorsGroup controller") namespace := env.GetCurrentNamespace()