diff --git a/api/nvidia/v1/clusterpolicy_types.go b/api/nvidia/v1/clusterpolicy_types.go
index 07e424761..7f2490829 100644
--- a/api/nvidia/v1/clusterpolicy_types.go
+++ b/api/nvidia/v1/clusterpolicy_types.go
@@ -124,6 +124,40 @@ func (r Runtime) String() string {
 	}
 }
 
+// ServiceMonitorConfig defines configuration options for the ServiceMonitor
+// deployed for NVIDIA GPU Operator resources
+type ServiceMonitorConfig struct {
+	// Enabled indicates if ServiceMonitor is deployed
+	// +operator-sdk:gen-csv:customresourcedefinitions.specDescriptors=true
+	// +operator-sdk:gen-csv:customresourcedefinitions.specDescriptors.displayName="Enable deployment of ServiceMonitor"
+	// +operator-sdk:gen-csv:customresourcedefinitions.specDescriptors.x-descriptors="urn:alm:descriptor:com.tectonic.ui:booleanSwitch"
+	Enabled *bool `json:"enabled,omitempty"`
+
+	// Interval which metrics should be scraped from. If not specified Prometheus’ global scrape interval is used.
+	// Supported units: y, w, d, h, m, s, ms
+	// +operator-sdk:gen-csv:customresourcedefinitions.specDescriptors=true
+	// +operator-sdk:gen-csv:customresourcedefinitions.specDescriptors.displayName="Interval which metrics should be scraped from"
+	// +operator-sdk:gen-csv:customresourcedefinitions.specDescriptors.x-descriptors="urn:alm:descriptor:com.tectonic.ui:text"
+	Interval promv1.Duration `json:"interval,omitempty"`
+
+	// HonorLabels chooses the metric’s labels on collisions with target labels.
+	// +operator-sdk:gen-csv:customresourcedefinitions.specDescriptors=true
+	// +operator-sdk:gen-csv:customresourcedefinitions.specDescriptors.displayName="Choose the metric's label on collisions with target labels"
+	// +operator-sdk:gen-csv:customresourcedefinitions.specDescriptors.x-descriptors="urn:alm:descriptor:com.tectonic.ui:booleanSwitch"
+	HonorLabels *bool `json:"honorLabels,omitempty"`
+
+	// AdditionalLabels to add to ServiceMonitor instance
+	// +operator-sdk:gen-csv:customresourcedefinitions.specDescriptors=true
+	// +operator-sdk:gen-csv:customresourcedefinitions.specDescriptors.displayName="Additional labels to add to ServiceMonitor instance"
+	// +operator-sdk:gen-csv:customresourcedefinitions.specDescriptors.x-descriptors="urn:alm:descriptor:com.tectonic.ui:text"
+	AdditionalLabels map[string]string `json:"additionalLabels,omitempty"`
+
+	// Relabelings allows to rewrite labels on metric sets
+	// +operator-sdk:gen-csv:customresourcedefinitions.specDescriptors=true
+	// +operator-sdk:gen-csv:customresourcedefinitions.specDescriptors.displayName="Relabelings allows to rewrite labels on metric sets"
+	Relabelings []*promv1.RelabelConfig `json:"relabelings,omitempty"`
+}
+
 // OperatorSpec describes configuration options for the operator
 type OperatorSpec struct {
 	// +kubebuilder:validation:Enum=docker;crio;containerd
@@ -143,6 +177,11 @@ type OperatorSpec struct {
 	// queryable and should be preserved when modifying objects.
 	Annotations map[string]string `json:"annotations,omitempty"`
 
+	// Optional: ServiceMonitor configuration for NVIDIA GPU Operator
+	// +operator-sdk:gen-csv:customresourcedefinitions.specDescriptors=true
+	// +operator-sdk:gen-csv:customresourcedefinitions.specDescriptors.displayName="ServiceMonitor configuration for NVIDIA GPU Operator"
+	ServiceMonitor *ServiceMonitorConfig `json:"serviceMonitor,omitempty"`
+
 	// UseOpenShiftDriverToolkit indicates if DriverToolkit image should be used on OpenShift to build and install driver modules
 	// +operator-sdk:gen-csv:customresourcedefinitions.specDescriptors=true
 	// +operator-sdk:gen-csv:customresourcedefinitions.specDescriptors.displayName="On OpenShift, enable DriverToolkit image to build and install driver modules"
@@ -901,7 +940,7 @@ type DCGMExporterSpec struct {
 	// Optional: ServiceMonitor configuration for NVIDIA DCGM Exporter
 	// +operator-sdk:gen-csv:customresourcedefinitions.specDescriptors=true
 	// +operator-sdk:gen-csv:customresourcedefinitions.specDescriptors.displayName="ServiceMonitor configuration for NVIDIA DCGM Exporter"
-	ServiceMonitor *DCGMExporterServiceMonitorConfig `json:"serviceMonitor,omitempty"`
+	ServiceMonitor *ServiceMonitorConfig `json:"serviceMonitor,omitempty"`
 }
 
 // DCGMExporterMetricsConfig defines metrics to be collected by NVIDIA DCGM Exporter
@@ -914,40 +953,6 @@ type DCGMExporterMetricsConfig struct {
 	Name string `json:"name,omitempty"`
 }
 
-// DCGMExporterServiceMonitorConfig defines configuration options for the ServiceMonitor
-// deployed for DCGM Exporter
-type DCGMExporterServiceMonitorConfig struct {
-	// Enabled indicates if ServiceMonitor is deployed for NVIDIA DCGM Exporter
-	// +operator-sdk:gen-csv:customresourcedefinitions.specDescriptors=true
-	// +operator-sdk:gen-csv:customresourcedefinitions.specDescriptors.displayName="Enable deployment of NVIDIA DCGM Exporter ServiceMonitor"
-	// +operator-sdk:gen-csv:customresourcedefinitions.specDescriptors.x-descriptors="urn:alm:descriptor:com.tectonic.ui:booleanSwitch"
-	Enabled *bool `json:"enabled,omitempty"`
-
-	// Interval which metrics should be scraped from NVIDIA DCGM Exporter. If not specified Prometheus’ global scrape interval is used.
-	// Supported units: y, w, d, h, m, s, ms
-	// +operator-sdk:gen-csv:customresourcedefinitions.specDescriptors=true
-	// +operator-sdk:gen-csv:customresourcedefinitions.specDescriptors.displayName="Interval which metrics should be scraped from NVDIA DCGM Exporter"
-	// +operator-sdk:gen-csv:customresourcedefinitions.specDescriptors.x-descriptors="urn:alm:descriptor:com.tectonic.ui:text"
-	Interval promv1.Duration `json:"interval,omitempty"`
-
-	// HonorLabels chooses the metric’s labels on collisions with target labels.
-	// +operator-sdk:gen-csv:customresourcedefinitions.specDescriptors=true
-	// +operator-sdk:gen-csv:customresourcedefinitions.specDescriptors.displayName="Choose the metric's label on collisions with target labels"
-	// +operator-sdk:gen-csv:customresourcedefinitions.specDescriptors.x-descriptors="urn:alm:descriptor:com.tectonic.ui:booleanSwitch"
-	HonorLabels *bool `json:"honorLabels,omitempty"`
-
-	// AdditionalLabels to add to ServiceMonitor instance for NVIDIA DCGM Exporter
-	// +operator-sdk:gen-csv:customresourcedefinitions.specDescriptors=true
-	// +operator-sdk:gen-csv:customresourcedefinitions.specDescriptors.displayName="Additional labels to add to ServiceMonitor instance for NVIDIA DCGM Exporter"
-	// +operator-sdk:gen-csv:customresourcedefinitions.specDescriptors.x-descriptors="urn:alm:descriptor:com.tectonic.ui:text"
-	AdditionalLabels map[string]string `json:"additionalLabels,omitempty"`
-
-	// Relabelings allows to rewrite labels on metric sets for NVIDIA DCGM Exporter
-	// +operator-sdk:gen-csv:customresourcedefinitions.specDescriptors=true
-	// +operator-sdk:gen-csv:customresourcedefinitions.specDescriptors.displayName="Relabelings allows to rewrite labels on metric sets for NVIDIA DCGM Exporter"
-	Relabelings []*promv1.RelabelConfig `json:"relabelings,omitempty"`
-}
-
 // DCGMSpec defines the properties for NVIDIA DCGM deployment
 type DCGMSpec struct {
 	// Enabled indicates if deployment of NVIDIA DCGM Hostengine as a separate pod is enabled.
@@ -2040,10 +2045,12 @@ func (dcgm *DCGMSpec) IsEnabled() bool {
 	return *dcgm.Enabled
 }
 
-// IsEnabled returns true if ServiceMonitor for DCGM Exporter is enabled through gpu-operator
-func (sm *DCGMExporterServiceMonitorConfig) IsEnabled() bool {
-	if sm.Enabled == nil {
-		// ServiceMonitor for DCGM Exporter is disabled by default
+// IsEnabled returns true if ServiceMonitor is enabled through gpu-operator.
+// It is safe to call on a nil receiver, which is what a caller holds when an
+// optional serviceMonitor block is omitted from the spec.
+func (sm *ServiceMonitorConfig) IsEnabled() bool {
+	if sm == nil || sm.Enabled == nil {
+		// ServiceMonitor is disabled by default
 		return false
 	}
 	return *sm.Enabled
diff --git a/api/nvidia/v1/zz_generated.deepcopy.go b/api/nvidia/v1/zz_generated.deepcopy.go
index 6d876f675..3dafbc088 100644
--- a/api/nvidia/v1/zz_generated.deepcopy.go
+++ b/api/nvidia/v1/zz_generated.deepcopy.go
@@ -273,49 +273,6 @@ func (in *DCGMExporterMetricsConfig) DeepCopy() *DCGMExporterMetricsConfig {
 	return out
 }
 
-// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil.
-func (in *DCGMExporterServiceMonitorConfig) DeepCopyInto(out *DCGMExporterServiceMonitorConfig) {
-	*out = *in
-	if in.Enabled != nil {
-		in, out := &in.Enabled, &out.Enabled
-		*out = new(bool)
-		**out = **in
-	}
-	if in.HonorLabels != nil {
-		in, out := &in.HonorLabels, &out.HonorLabels
-		*out = new(bool)
-		**out = **in
-	}
-	if in.AdditionalLabels != nil {
-		in, out := &in.AdditionalLabels, &out.AdditionalLabels
-		*out = make(map[string]string, len(*in))
-		for key, val := range *in {
-			(*out)[key] = val
-		}
-	}
-	if in.Relabelings != nil {
-		in, out := &in.Relabelings, &out.Relabelings
-		*out = make([]*monitoringv1.RelabelConfig, len(*in))
-		for i := range *in {
-			if (*in)[i] != nil {
-				in, out := &(*in)[i], &(*out)[i]
-				*out = new(monitoringv1.RelabelConfig)
-				(*in).DeepCopyInto(*out)
-			}
-		}
-	}
-}
-
-// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new DCGMExporterServiceMonitorConfig.
-func (in *DCGMExporterServiceMonitorConfig) DeepCopy() *DCGMExporterServiceMonitorConfig {
-	if in == nil {
-		return nil
-	}
-	out := new(DCGMExporterServiceMonitorConfig)
-	in.DeepCopyInto(out)
-	return out
-}
-
 // DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil.
 func (in *DCGMExporterSpec) DeepCopyInto(out *DCGMExporterSpec) {
 	*out = *in
@@ -351,7 +308,7 @@ func (in *DCGMExporterSpec) DeepCopyInto(out *DCGMExporterSpec) {
 	}
 	if in.ServiceMonitor != nil {
 		in, out := &in.ServiceMonitor, &out.ServiceMonitor
-		*out = new(DCGMExporterServiceMonitorConfig)
+		*out = new(ServiceMonitorConfig)
 		(*in).DeepCopyInto(*out)
 	}
 }
@@ -1126,6 +1083,11 @@ func (in *OperatorSpec) DeepCopyInto(out *OperatorSpec) {
 			(*out)[key] = val
 		}
 	}
+	if in.ServiceMonitor != nil {
+		in, out := &in.ServiceMonitor, &out.ServiceMonitor
+		*out = new(ServiceMonitorConfig)
+		(*in).DeepCopyInto(*out)
+	}
 	if in.UseOpenShiftDriverToolkit != nil {
 		in, out := &in.UseOpenShiftDriverToolkit, &out.UseOpenShiftDriverToolkit
 		*out = new(bool)
@@ -1307,6 +1269,49 @@ func (in *SandboxWorkloadsSpec) DeepCopy() *SandboxWorkloadsSpec {
 	return out
 }
 
+// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil.
+func (in *ServiceMonitorConfig) DeepCopyInto(out *ServiceMonitorConfig) {
+	*out = *in
+	if in.Enabled != nil {
+		in, out := &in.Enabled, &out.Enabled
+		*out = new(bool)
+		**out = **in
+	}
+	if in.HonorLabels != nil {
+		in, out := &in.HonorLabels, &out.HonorLabels
+		*out = new(bool)
+		**out = **in
+	}
+	if in.AdditionalLabels != nil {
+		in, out := &in.AdditionalLabels, &out.AdditionalLabels
+		*out = make(map[string]string, len(*in))
+		for key, val := range *in {
+			(*out)[key] = val
+		}
+	}
+	if in.Relabelings != nil {
+		in, out := &in.Relabelings, &out.Relabelings
+		*out = make([]*monitoringv1.RelabelConfig, len(*in))
+		for i := range *in {
+			if (*in)[i] != nil {
+				in, out := &(*in)[i], &(*out)[i]
+				*out = new(monitoringv1.RelabelConfig)
+				(*in).DeepCopyInto(*out)
+			}
+		}
+	}
+}
+
+// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new ServiceMonitorConfig.
+func (in *ServiceMonitorConfig) DeepCopy() *ServiceMonitorConfig {
+	if in == nil {
+		return nil
+	}
+	out := new(ServiceMonitorConfig)
+	in.DeepCopyInto(out)
+	return out
+}
+
 // DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil.
 func (in *ToolkitSpec) DeepCopyInto(out *ToolkitSpec) {
 	*out = *in
diff --git a/config/crd/bases/nvidia.com_clusterpolicies.yaml b/config/crd/bases/nvidia.com_clusterpolicies.yaml
index 8ee8e9a8a..a70b4b8fa 100644
--- a/config/crd/bases/nvidia.com_clusterpolicies.yaml
+++ b/config/crd/bases/nvidia.com_clusterpolicies.yaml
@@ -389,11 +389,9 @@ spec:
                         additionalProperties:
                           type: string
                         description: AdditionalLabels to add to ServiceMonitor instance
-                          for NVIDIA DCGM Exporter
                         type: object
                       enabled:
                         description: Enabled indicates if ServiceMonitor is deployed
-                          for NVIDIA DCGM Exporter
                         type: boolean
                       honorLabels:
                         description: HonorLabels chooses the metric’s labels on collisions
@@ -401,13 +399,13 @@ spec:
                         type: boolean
                       interval:
                         description: |-
-                          Interval which metrics should be scraped from NVIDIA DCGM Exporter. If not specified Prometheus’ global scrape interval is used.
+                          Interval which metrics should be scraped from. If not specified Prometheus’ global scrape interval is used.
                           Supported units: y, w, d, h, m, s, ms
                         pattern: ^(0|(([0-9]+)y)?(([0-9]+)w)?(([0-9]+)d)?(([0-9]+)h)?(([0-9]+)m)?(([0-9]+)s)?(([0-9]+)ms)?)$
                         type: string
                       relabelings:
                         description: Relabelings allows to rewrite labels on metric
-                          sets for NVIDIA DCGM Exporter
+                          sets
                         items:
                           description: |-
                             RelabelConfig allows dynamic rewriting of the label set for targets, alerts,
@@ -1553,6 +1551,124 @@ spec:
                   runtimeClass:
                     default: nvidia
                     type: string
+                  serviceMonitor:
+                    description: 'Optional: ServiceMonitor configuration for NVIDIA
+                      GPU Operator'
+                    properties:
+                      additionalLabels:
+                        additionalProperties:
+                          type: string
+                        description: AdditionalLabels to add to ServiceMonitor instance
+                        type: object
+                      enabled:
+                        description: Enabled indicates if ServiceMonitor is deployed
+                        type: boolean
+                      honorLabels:
+                        description: HonorLabels chooses the metric’s labels on collisions
+                          with target labels.
+                        type: boolean
+                      interval:
+                        description: |-
+                          Interval which metrics should be scraped from. If not specified Prometheus’ global scrape interval is used.
+                          Supported units: y, w, d, h, m, s, ms
+                        pattern: ^(0|(([0-9]+)y)?(([0-9]+)w)?(([0-9]+)d)?(([0-9]+)h)?(([0-9]+)m)?(([0-9]+)s)?(([0-9]+)ms)?)$
+                        type: string
+                      relabelings:
+                        description: Relabelings allows to rewrite labels on metric
+                          sets
+                        items:
+                          description: |-
+                            RelabelConfig allows dynamic rewriting of the label set for targets, alerts,
+                            scraped samples and remote write samples.
+
+
+                            More info: https://prometheus.io/docs/prometheus/latest/configuration/configuration/#relabel_config
+                          properties:
+                            action:
+                              default: replace
+                              description: |-
+                                Action to perform based on the regex matching.
+
+
+                                `Uppercase` and `Lowercase` actions require Prometheus >= v2.36.0.
+                                `DropEqual` and `KeepEqual` actions require Prometheus >= v2.41.0.
+
+
+                                Default: "Replace"
+                              enum:
+                              - replace
+                              - Replace
+                              - keep
+                              - Keep
+                              - drop
+                              - Drop
+                              - hashmod
+                              - HashMod
+                              - labelmap
+                              - LabelMap
+                              - labeldrop
+                              - LabelDrop
+                              - labelkeep
+                              - LabelKeep
+                              - lowercase
+                              - Lowercase
+                              - uppercase
+                              - Uppercase
+                              - keepequal
+                              - KeepEqual
+                              - dropequal
+                              - DropEqual
+                              type: string
+                            modulus:
+                              description: |-
+                                Modulus to take of the hash of the source label values.
+
+
+                                Only applicable when the action is `HashMod`.
+                              format: int64
+                              type: integer
+                            regex:
+                              description: Regular expression against which the extracted
+                                value is matched.
+                              type: string
+                            replacement:
+                              description: |-
+                                Replacement value against which a Replace action is performed if the
+                                regular expression matches.
+
+
+                                Regex capture groups are available.
+                              type: string
+                            separator:
+                              description: Separator is the string between concatenated
+                                SourceLabels.
+                              type: string
+                            sourceLabels:
+                              description: |-
+                                The source labels select values from existing labels. Their content is
+                                concatenated using the configured Separator and matched against the
+                                configured regular expression.
+                              items:
+                                description: |-
+                                  LabelName is a valid Prometheus label name which may only contain ASCII
+                                  letters, numbers, as well as underscores.
+                                pattern: ^[a-zA-Z_][a-zA-Z0-9_]*$
+                                type: string
+                              type: array
+                            targetLabel:
+                              description: |-
+                                Label to which the resulting string is written in a replacement.
+
+
+                                It is mandatory for `Replace`, `HashMod`, `Lowercase`, `Uppercase`,
+                                `KeepEqual` and `DropEqual` actions.
+
+
+                                Regex capture groups are available.
+                              type: string
+                          type: object
+                        type: array
+                    type: object
                   use_ocp_driver_toolkit:
                     description: UseOpenShiftDriverToolkit indicates if DriverToolkit
                       image should be used on OpenShift to build and install driver
diff --git a/controllers/object_controls.go b/controllers/object_controls.go
index f9af46f26..6ad183b37 100644
--- a/controllers/object_controls.go
+++ b/controllers/object_controls.go
@@ -4492,6 +4492,55 @@ func crdExists(n ClusterPolicyController, name string) (bool, error) {
 	return true, nil
 }
 
+// serviceMonitorCustomEdits applies the user-supplied ServiceMonitorConfig
+// overrides (interval, honorLabels, additional labels, relabelings) onto
+// currentState, the ServiceMonitor object being prepared by the caller.
+//
+// A nil desiredState (the optional serviceMonitor block was left out of the
+// ClusterPolicy) or a nil currentState is a no-op. Endpoint-level overrides
+// are applied to the first endpoint only and are skipped when there is none.
+func serviceMonitorCustomEdits(desiredState *gpuv1.ServiceMonitorConfig, currentState *promv1.ServiceMonitor) {
+	if desiredState == nil || currentState == nil {
+		return
+	}
+
+	if len(desiredState.AdditionalLabels) > 0 {
+		// Writing to a nil map panics, so allocate it before merging.
+		if currentState.ObjectMeta.Labels == nil {
+			currentState.ObjectMeta.Labels = make(map[string]string, len(desiredState.AdditionalLabels))
+		}
+		for key, value := range desiredState.AdditionalLabels {
+			currentState.ObjectMeta.Labels[key] = value
+		}
+	}
+
+	if len(currentState.Spec.Endpoints) == 0 {
+		return
+	}
+	endpoint := &currentState.Spec.Endpoints[0]
+
+	if desiredState.Interval != "" {
+		endpoint.Interval = desiredState.Interval
+	}
+
+	if desiredState.HonorLabels != nil {
+		endpoint.HonorLabels = *desiredState.HonorLabels
+	}
+
+	if desiredState.Relabelings != nil {
+		// The code this helper replaces stored a []promv1.RelabelConfig here,
+		// while the spec carries pointers: copy by value, leaving nil entries
+		// as zero-value configs exactly as before.
+		relabelConfigs := make([]promv1.RelabelConfig, len(desiredState.Relabelings))
+		for i, relabel := range desiredState.Relabelings {
+			if relabel != nil {
+				relabelConfigs[i] = *relabel
+			}
+		}
+		endpoint.RelabelConfigs = relabelConfigs
+	}
+}
+
 // ServiceMonitor creates ServiceMonitor object
 func ServiceMonitor(n ClusterPolicyController) (gpuv1.State, error) {
 	ctx := n.ctx
@@ -4541,30 +4590,21 @@ func ServiceMonitor(n ClusterPolicyController) (gpuv1.State, error) {
 		}
 
 		// Apply custom edits for DCGM Exporter
-		if serviceMonitor.Interval != "" {
-			obj.Spec.Endpoints[0].Interval = serviceMonitor.Interval
-		}
-
-		if serviceMonitor.HonorLabels != nil {
-			obj.Spec.Endpoints[0].HonorLabels = *serviceMonitor.HonorLabels
-		}
+		serviceMonitorCustomEdits(serviceMonitor, obj)
+	}
 
-		if serviceMonitor.AdditionalLabels != nil {
-			for key, value := range serviceMonitor.AdditionalLabels {
-				obj.ObjectMeta.Labels[key] = value
-			}
-		}
-		if serviceMonitor.Relabelings != nil {
-			relabelConfigs := make([]promv1.RelabelConfig, len(serviceMonitor.Relabelings))
-			for i, relabel := range serviceMonitor.Relabelings {
-				if relabel != nil {
-					relabelConfigs[i] = *relabel
-				}
-			}
-			obj.Spec.Endpoints[0].RelabelConfigs = relabelConfigs
+	if n.stateNames[state] == "state-operator-metrics" {
+		serviceMonitor := n.singleton.Spec.Operator.ServiceMonitor
+		// if ServiceMonitor CRD is missing, assume prometheus is not setup and ignore CR creation
+		if !serviceMonitorCRDExists {
+			logger.V(1).Info("ServiceMonitor CRD is missing, ignoring creation of CR for operator-metrics")
+			return gpuv1.Ready, nil
 		}
+		obj.Spec.NamespaceSelector.MatchNames = []string{obj.Namespace}
+		serviceMonitorCustomEdits(serviceMonitor, obj)
 	}
-	if n.stateNames[state] == "state-operator-metrics" || n.stateNames[state] == "state-node-status-exporter" {
+
+	if n.stateNames[state] == "state-node-status-exporter" {
 		// if ServiceMonitor CRD is missing, assume prometheus is not setup and ignore CR creation
 		if !serviceMonitorCRDExists {
-			logger.V(1).Info("ServiceMonitor CRD is missing, ignoring creation of CR for operator-metrics")
+			logger.V(1).Info("ServiceMonitor CRD is missing, ignoring creation of CR for node-status-exporter")
diff --git a/deployments/gpu-operator/templates/clusterpolicy.yaml b/deployments/gpu-operator/templates/clusterpolicy.yaml
index af9e87c38..a06c998a9 100644
--- a/deployments/gpu-operator/templates/clusterpolicy.yaml
+++ b/deployments/gpu-operator/templates/clusterpolicy.yaml
@@ -46,6 +46,9 @@ spec:
     {{- if .Values.operator.use_ocp_driver_toolkit }}
     use_ocp_driver_toolkit: {{ .Values.operator.use_ocp_driver_toolkit }}
     {{- end }}
+    {{- if .Values.operator.serviceMonitor }}
+    serviceMonitor: {{ toYaml .Values.operator.serviceMonitor | nindent 6 }}
+    {{- end }}
   daemonsets:
     labels:
       {{- include "gpu-operator.operand-labels" . | nindent 6 }}