Skip to content

Commit 059e589

Browse files
committed
feat: automatically setup and inject prometheus configuration
Signed-off-by: Julien Mancuso <jmancuso@nvidia.com>
1 parent cb5a657 commit 059e589

File tree

11 files changed

+136
-95
lines changed

11 files changed

+136
-95
lines changed

deploy/cloud/helm/platform/README.md

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -84,9 +84,11 @@ The Dynamo Platform Helm chart deploys the complete Dynamo Cloud infrastructure
8484
| dynamo-operator.dynamo.istio.gateway | string | `nil` | Istio gateway name for routing |
8585
| dynamo-operator.dynamo.ingressHostSuffix | string | `""` | Host suffix for generated ingress hostnames |
8686
| dynamo-operator.dynamo.virtualServiceSupportsHTTPS | bool | `false` | Whether VirtualServices should support HTTPS routing |
87+
| dynamo-operator.dynamo.metrics.prometheusEndpoint | string | `""` | Prometheus endpoint that services can use to retrieve metrics. If set, the Dynamo operator automatically injects the PROMETHEUS_ENDPOINT environment variable into the services it manages. Users can override the injected value by setting PROMETHEUS_ENDPOINT in the corresponding deployment's environment variables. |
8788
| grove.enabled | bool | `false` | Whether to enable Grove for multi-node inference coordination, if enabled, the Grove operator will be deployed cluster-wide |
8889
| kai-scheduler.enabled | bool | `false` | Whether to enable Kai Scheduler for intelligent resource allocation, if enabled, the Kai Scheduler operator will be deployed cluster-wide |
8990
| etcd.enabled | bool | `true` | Whether to enable etcd deployment, disable if you want to use an external etcd instance |
91+
| etcd.image.repository | string | `"bitnamilegacy/etcd"` | Uses the legacy Bitnami repository until we migrate to the new "secure" repository, following the Bitnami brownout announcement: https://github.com/bitnami/charts/tree/main/bitnami/etcd#%EF%B8%8F-important-notice-upcoming-changes-to-the-bitnami-catalog |
9092
| nats.enabled | bool | `true` | Whether to enable NATS deployment, disable if you want to use an external NATS instance |
9193

9294
### NATS Configuration

deploy/cloud/helm/platform/components/operator/templates/deployment.yaml

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -107,6 +107,9 @@ spec:
107107
{{- if .Values.modelExpressURL }}
108108
- --model-express-url={{ .Values.modelExpressURL }}
109109
{{- end }}
110+
{{- if .Values.dynamo.metrics.prometheusEndpoint }}
111+
- --prometheus-endpoint={{ .Values.dynamo.metrics.prometheusEndpoint }}
112+
{{- end }}
110113
command:
111114
- /manager
112115
env:
Lines changed: 81 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,81 @@
1+
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2+
# SPDX-License-Identifier: Apache-2.0
3+
#
4+
# Licensed under the Apache License, Version 2.0 (the "License");
5+
# you may not use this file except in compliance with the License.
6+
# You may obtain a copy of the License at
7+
#
8+
# http://www.apache.org/licenses/LICENSE-2.0
9+
#
10+
# Unless required by applicable law or agreed to in writing, software
11+
# distributed under the License is distributed on an "AS IS" BASIS,
12+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
# See the License for the specific language governing permissions and
14+
# limitations under the License.
15+
{{- if .Capabilities.APIVersions.Has "monitoring.coreos.com/v1" }}
16+
apiVersion: monitoring.coreos.com/v1
17+
kind: PodMonitor
18+
metadata:
19+
name: dynamo-frontend
20+
spec:
21+
{{- if .Values.namespaceRestriction.enabled }}
22+
namespaceSelector:
23+
matchNames:
24+
- {{ .Release.Namespace }}
25+
{{- else }}
26+
namespaceSelector:
27+
any: true
28+
{{- end }}
29+
podMetricsEndpoints:
30+
- interval: 30s
31+
path: /metrics
32+
port: http
33+
selector:
34+
matchLabels:
35+
nvidia.com/dynamo-component-type: frontend
36+
nvidia.com/metrics-enabled: "true"
37+
---
38+
apiVersion: monitoring.coreos.com/v1
39+
kind: PodMonitor
40+
metadata:
41+
name: dynamo-worker
42+
spec:
43+
{{- if .Values.namespaceRestriction.enabled }}
44+
namespaceSelector:
45+
matchNames:
46+
- {{ .Release.Namespace }}
47+
{{- else }}
48+
namespaceSelector:
49+
any: true
50+
{{- end }}
51+
podMetricsEndpoints:
52+
- interval: 30s
53+
path: /metrics
54+
port: system
55+
selector:
56+
matchLabels:
57+
nvidia.com/dynamo-component-type: worker
58+
nvidia.com/metrics-enabled: "true"
59+
---
60+
apiVersion: monitoring.coreos.com/v1
61+
kind: PodMonitor
62+
metadata:
63+
name: dynamo-planner
64+
spec:
65+
{{- if .Values.namespaceRestriction.enabled }}
66+
namespaceSelector:
67+
matchNames:
68+
- {{ .Release.Namespace }}
69+
{{- else }}
70+
namespaceSelector:
71+
any: true
72+
{{- end }}
73+
podMetricsEndpoints:
74+
- interval: 30s
75+
path: /metrics
76+
port: metrics
77+
selector:
78+
matchLabels:
79+
nvidia.com/dynamo-component-type: planner
80+
nvidia.com/metrics-enabled: "true"
81+
{{- end }}

deploy/cloud/helm/platform/components/operator/values.yaml

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -75,9 +75,6 @@ controllerManager:
7575
annotations: {}
7676

7777
dynamo:
78-
imageBuilder:
79-
serviceAccount:
80-
annotations: {}
8178
components:
8279
serviceAccount:
8380
annotations: {}
@@ -98,6 +95,9 @@ dynamo:
9895
password: ""
9996
existingSecretName: ''
10097
secure: true
98+
99+
metrics:
100+
prometheusEndpoint: ""
101101

102102

103103
#imagePullSecrets: []

deploy/cloud/helm/platform/values.yaml

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -111,6 +111,11 @@ dynamo-operator:
111111
# -- Whether VirtualServices should support HTTPS routing
112112
virtualServiceSupportsHTTPS: false
113113

114+
# Metrics configuration
115+
metrics:
116+
# -- Prometheus endpoint that services can use to retrieve metrics. If set, the Dynamo operator automatically injects the PROMETHEUS_ENDPOINT environment variable into the services it manages. Users can override the injected value by setting PROMETHEUS_ENDPOINT in the corresponding deployment's environment variables.
117+
prometheusEndpoint: ""
118+
114119

115120
# Grove component - distributed inference orchestration
116121
grove:
@@ -130,8 +135,7 @@ etcd:
130135
enabled: true
131136

132137
image:
133-
# -- following bitnami announcement for brownout - https://github.com/bitnami/charts/tree/main/bitnami/etcd#%EF%B8%8F-important-notice-upcoming-changes-to-the-bitnami-catalog
134-
# -- we need to use the legacy repository until we migrate to the new "secure" repository
138+
# -- Uses the legacy Bitnami repository until we migrate to the new "secure" repository, following the Bitnami brownout announcement: https://github.com/bitnami/charts/tree/main/bitnami/etcd#%EF%B8%8F-important-notice-upcoming-changes-to-the-bitnami-catalog
135139
repository: bitnamilegacy/etcd
136140
tag: 3.5.18-debian-12-r5
137141

deploy/cloud/operator/cmd/main.go

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -132,6 +132,7 @@ func main() {
132132
var ingressHostSuffix string
133133
var groveTerminationDelay time.Duration
134134
var modelExpressURL string
135+
var prometheusEndpoint string
135136
flag.StringVar(&metricsAddr, "metrics-bind-address", ":8080", "The address the metric endpoint binds to.")
136137
flag.StringVar(&probeAddr, "health-probe-bind-address", ":8081", "The address the probe endpoint binds to.")
137138
flag.BoolVar(&enableLeaderElection, "leader-elect", false,
@@ -161,6 +162,8 @@ func main() {
161162
"The termination delay for Grove PodGangSets")
162163
flag.StringVar(&modelExpressURL, "model-express-url", "",
163164
"URL of the Model Express server to inject into all pods")
165+
flag.StringVar(&prometheusEndpoint, "prometheus-endpoint", "",
166+
"URL of the Prometheus endpoint to use for metrics")
164167
opts := zap.Options{
165168
Development: true,
166169
}
@@ -196,7 +199,8 @@ func main() {
196199
IngressControllerTLSSecret: ingressControllerTLSSecretName,
197200
IngressHostSuffix: ingressHostSuffix,
198201
},
199-
ModelExpressURL: modelExpressURL,
202+
ModelExpressURL: modelExpressURL,
203+
PrometheusEndpoint: prometheusEndpoint,
200204
}
201205

202206
mainCtx := ctrl.SetupSignalHandler()

deploy/cloud/operator/internal/controller_common/predicate.go

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -58,6 +58,8 @@ type Config struct {
5858
IngressConfig IngressConfig
5959
// ModelExpressURL is the URL of the Model Express server to inject into all pods
6060
ModelExpressURL string
61+
// PrometheusEndpoint is the URL of the Prometheus endpoint; when non-empty it is injected into containers as the PROMETHEUS_ENDPOINT environment variable
62+
PrometheusEndpoint string
6163
}
6264

6365
type IngressConfig struct {

deploy/cloud/operator/internal/dynamo/graph.go

Lines changed: 12 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -660,26 +660,35 @@ func isWorkerComponent(componentType string) bool {
660660

661661
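// addStandardEnvVars adds the standard environment variables common to both Grove and Controller (NATS_SERVER, ETCD_ENDPOINTS, MODEL_EXPRESS_URL, PROMETHEUS_ENDPOINT) to the container; each one is added only when its config value is non-empty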
662662
func addStandardEnvVars(container *corev1.Container, controllerConfig controller_common.Config) {
663+
standardEnvVars := []corev1.EnvVar{}
663664
if controllerConfig.NatsAddress != "" {
664-
container.Env = append(container.Env, corev1.EnvVar{
665+
standardEnvVars = append(standardEnvVars, corev1.EnvVar{
665666
Name: "NATS_SERVER",
666667
Value: controllerConfig.NatsAddress,
667668
})
668669
}
669670

670671
if controllerConfig.EtcdAddress != "" {
671-
container.Env = append(container.Env, corev1.EnvVar{
672+
standardEnvVars = append(standardEnvVars, corev1.EnvVar{
672673
Name: "ETCD_ENDPOINTS",
673674
Value: controllerConfig.EtcdAddress,
674675
})
675676
}
676677

677678
if controllerConfig.ModelExpressURL != "" {
678-
container.Env = append(container.Env, corev1.EnvVar{
679+
standardEnvVars = append(standardEnvVars, corev1.EnvVar{
679680
Name: "MODEL_EXPRESS_URL",
680681
Value: controllerConfig.ModelExpressURL,
681682
})
682683
}
684+
if controllerConfig.PrometheusEndpoint != "" {
685+
standardEnvVars = append(standardEnvVars, corev1.EnvVar{
686+
Name: "PROMETHEUS_ENDPOINT",
687+
Value: controllerConfig.PrometheusEndpoint,
688+
})
689+
}
690+
// merge the env vars to allow users to override the standard env vars
691+
container.Env = MergeEnvs(standardEnvVars, container.Env)
683692
}
684693

685694
// GenerateBasePodSpec creates a basic PodSpec with common logic shared between controller and grove

deploy/cloud/operator/internal/dynamo/graph_test.go

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1071,6 +1071,7 @@ func TestGenerateGrovePodGangSet(t *testing.T) {
10711071
Grove: controller_common.GroveConfig{
10721072
TerminationDelay: 15 * time.Minute,
10731073
},
1074+
PrometheusEndpoint: "http://localhost:9090",
10741075
},
10751076
dynamoDeployment: &v1alpha1.DynamoGraphDeployment{
10761077
ObjectMeta: metav1.ObjectMeta{
@@ -1348,6 +1349,10 @@ func TestGenerateGrovePodGangSet(t *testing.T) {
13481349
Name: "MODEL_EXPRESS_URL",
13491350
Value: "model-express-url",
13501351
},
1352+
{
1353+
Name: "PROMETHEUS_ENDPOINT",
1354+
Value: "http://localhost:9090",
1355+
},
13511356
},
13521357
Resources: corev1.ResourceRequirements{
13531358
Requests: corev1.ResourceList{
@@ -1483,6 +1488,10 @@ func TestGenerateGrovePodGangSet(t *testing.T) {
14831488
Name: "MODEL_EXPRESS_URL",
14841489
Value: "model-express-url",
14851490
},
1491+
{
1492+
Name: "PROMETHEUS_ENDPOINT",
1493+
Value: "http://localhost:9090",
1494+
},
14861495
},
14871496
Resources: corev1.ResourceRequirements{
14881497
Requests: corev1.ResourceList{

docs/guides/dynamo_deploy/installation_guide.md

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -63,9 +63,8 @@ helm fetch https://helm.ngc.nvidia.com/nvidia/ai-dynamo/charts/dynamo-crds-${REL
6363
helm install dynamo-crds dynamo-crds-${RELEASE_VERSION}.tgz --namespace default
6464

6565
# 3. Install Platform
66-
kubectl create namespace ${NAMESPACE}
6766
helm fetch https://helm.ngc.nvidia.com/nvidia/ai-dynamo/charts/dynamo-platform-${RELEASE_VERSION}.tgz
68-
helm install dynamo-platform dynamo-platform-${RELEASE_VERSION}.tgz --namespace ${NAMESPACE}
67+
helm install dynamo-platform dynamo-platform-${RELEASE_VERSION}.tgz --namespace ${NAMESPACE} --create-namespace
6968
```
7069

7170
> [!TIP]

0 commit comments

Comments
 (0)