Skip to content

Commit 11d7cd0

Browse files
julienmancuso authored and indrajit96 committed
feat: automatically setup and inject prometheus configuration (#2912)
Signed-off-by: Julien Mancuso <jmancuso@nvidia.com> Signed-off-by: Indrajit Bhosale <iamindrajitb@gmail.com>
1 parent d934e41 commit 11d7cd0

File tree

16 files changed

+208
-104
lines changed

16 files changed

+208
-104
lines changed

components/backends/vllm/deploy/disagg_planner.yaml

Lines changed: 0 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -53,9 +53,6 @@ spec:
5353
mainContainer:
5454
image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.4.1
5555
workingDir: /workspace/components/planner/src/dynamo/planner
56-
ports:
57-
- name: metrics
58-
containerPort: 9085
5956
command:
6057
- /bin/sh
6158
- -c
@@ -66,7 +63,6 @@ spec:
6663
--backend=vllm
6764
--adjustment-interval=60
6865
--profile-results-dir=/workspace/profiling_results
69-
--prometheus-port=9085
7066
Prometheus: # NOTE: this is set on Prometheus to ensure a service is created for the Prometheus component. This is a workaround and should be managed differently.
7167
dynamoNamespace: vllm-disagg-planner
7268
componentType: frontend

deploy/cloud/helm/platform/README.md

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -84,9 +84,11 @@ The Dynamo Platform Helm chart deploys the complete Dynamo Cloud infrastructure
8484
| dynamo-operator.dynamo.istio.gateway | string | `nil` | Istio gateway name for routing |
8585
| dynamo-operator.dynamo.ingressHostSuffix | string | `""` | Host suffix for generated ingress hostnames |
8686
| dynamo-operator.dynamo.virtualServiceSupportsHTTPS | bool | `false` | Whether VirtualServices should support HTTPS routing |
87+
| dynamo-operator.dynamo.metrics.prometheusEndpoint | string | `""` | Endpoint that services can use to retrieve metrics. If set, the Dynamo operator automatically injects the PROMETHEUS_ENDPOINT environment variable into the services it manages. Users can override the injected value by setting PROMETHEUS_ENDPOINT in the corresponding deployment's environment variables. |
8788
| grove.enabled | bool | `false` | Whether to enable Grove for multi-node inference coordination, if enabled, the Grove operator will be deployed cluster-wide |
8889
| kai-scheduler.enabled | bool | `false` | Whether to enable Kai Scheduler for intelligent resource allocation, if enabled, the Kai Scheduler operator will be deployed cluster-wide |
8990
| etcd.enabled | bool | `true` | Whether to enable etcd deployment, disable if you want to use an external etcd instance |
91+
| etcd.image.repository | string | `"bitnamilegacy/etcd"` | Following the Bitnami brownout announcement (see https://github.com/bitnami/charts/tree/main/bitnami/etcd#%EF%B8%8F-important-notice-upcoming-changes-to-the-bitnami-catalog ), we need to use the legacy repository until we migrate to the new "secure" repository. |
9092
| nats.enabled | bool | `true` | Whether to enable NATS deployment, disable if you want to use an external NATS instance |
9193

9294
### NATS Configuration

deploy/cloud/helm/platform/components/operator/templates/deployment.yaml

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -107,6 +107,9 @@ spec:
107107
{{- if .Values.modelExpressURL }}
108108
- --model-express-url={{ .Values.modelExpressURL }}
109109
{{- end }}
110+
{{- if .Values.dynamo.metrics.prometheusEndpoint }}
111+
- --prometheus-endpoint={{ .Values.dynamo.metrics.prometheusEndpoint }}
112+
{{- end }}
110113
command:
111114
- /manager
112115
env:
Lines changed: 81 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,81 @@
1+
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2+
# SPDX-License-Identifier: Apache-2.0
3+
#
4+
# Licensed under the Apache License, Version 2.0 (the "License");
5+
# you may not use this file except in compliance with the License.
6+
# You may obtain a copy of the License at
7+
#
8+
# http://www.apache.org/licenses/LICENSE-2.0
9+
#
10+
# Unless required by applicable law or agreed to in writing, software
11+
# distributed under the License is distributed on an "AS IS" BASIS,
12+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
# See the License for the specific language governing permissions and
14+
# limitations under the License.
15+
{{- if .Capabilities.APIVersions.Has "monitoring.coreos.com/v1" }}
16+
apiVersion: monitoring.coreos.com/v1
17+
kind: PodMonitor
18+
metadata:
19+
name: dynamo-frontend
20+
spec:
21+
{{- if .Values.namespaceRestriction.enabled }}
22+
namespaceSelector:
23+
matchNames:
24+
- {{ .Release.Namespace }}
25+
{{- else }}
26+
namespaceSelector:
27+
any: true
28+
{{- end }}
29+
podMetricsEndpoints:
30+
- interval: 30s
31+
path: /metrics
32+
port: http
33+
selector:
34+
matchLabels:
35+
nvidia.com/dynamo-component-type: frontend
36+
nvidia.com/metrics-enabled: "true"
37+
---
38+
apiVersion: monitoring.coreos.com/v1
39+
kind: PodMonitor
40+
metadata:
41+
name: dynamo-worker
42+
spec:
43+
{{- if .Values.namespaceRestriction.enabled }}
44+
namespaceSelector:
45+
matchNames:
46+
- {{ .Release.Namespace }}
47+
{{- else }}
48+
namespaceSelector:
49+
any: true
50+
{{- end }}
51+
podMetricsEndpoints:
52+
- interval: 30s
53+
path: /metrics
54+
port: system
55+
selector:
56+
matchLabels:
57+
nvidia.com/dynamo-component-type: worker
58+
nvidia.com/metrics-enabled: "true"
59+
---
60+
apiVersion: monitoring.coreos.com/v1
61+
kind: PodMonitor
62+
metadata:
63+
name: dynamo-planner
64+
spec:
65+
{{- if .Values.namespaceRestriction.enabled }}
66+
namespaceSelector:
67+
matchNames:
68+
- {{ .Release.Namespace }}
69+
{{- else }}
70+
namespaceSelector:
71+
any: true
72+
{{- end }}
73+
podMetricsEndpoints:
74+
- interval: 30s
75+
path: /metrics
76+
port: metrics
77+
selector:
78+
matchLabels:
79+
nvidia.com/dynamo-component-type: planner
80+
nvidia.com/metrics-enabled: "true"
81+
{{- end }}

deploy/cloud/helm/platform/components/operator/values.yaml

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -75,9 +75,6 @@ controllerManager:
7575
annotations: {}
7676

7777
dynamo:
78-
imageBuilder:
79-
serviceAccount:
80-
annotations: {}
8178
components:
8279
serviceAccount:
8380
annotations: {}
@@ -99,6 +96,9 @@ dynamo:
9996
existingSecretName: ''
10097
secure: true
10198

99+
metrics:
100+
prometheusEndpoint: ""
101+
102102

103103
#imagePullSecrets: []
104104
kubernetesClusterDomain: cluster.local

deploy/cloud/helm/platform/values.yaml

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -111,6 +111,11 @@ dynamo-operator:
111111
# -- Whether VirtualServices should support HTTPS routing
112112
virtualServiceSupportsHTTPS: false
113113

114+
# Metrics configuration
115+
metrics:
116+
# -- Endpoint that services can use to retrieve metrics. If set, the Dynamo operator automatically injects the PROMETHEUS_ENDPOINT environment variable into the services it manages. Users can override the injected value by setting PROMETHEUS_ENDPOINT in the corresponding deployment's environment variables.
117+
prometheusEndpoint: ""
118+
114119

115120
# Grove component - distributed inference orchestration
116121
grove:
@@ -130,8 +135,7 @@ etcd:
130135
enabled: true
131136

132137
image:
133-
# -- following bitnami announcement for brownout - https://github.com/bitnami/charts/tree/main/bitnami/etcd#%EF%B8%8F-important-notice-upcoming-changes-to-the-bitnami-catalog
134-
# -- we need to use the legacy repository until we migrate to the new "secure" repository
138+
# -- Following the Bitnami brownout announcement (see https://github.com/bitnami/charts/tree/main/bitnami/etcd#%EF%B8%8F-important-notice-upcoming-changes-to-the-bitnami-catalog ), we need to use the legacy repository until we migrate to the new "secure" repository.
135139
repository: bitnamilegacy/etcd
136140
tag: 3.5.18-debian-12-r5
137141

deploy/cloud/operator/Makefile

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -57,7 +57,7 @@ ensure-yq:
5757
fi
5858

5959
.PHONY: manifests
60-
manifests: controller-gen ensure-yq generate-api-docs ## Generate WebhookConfiguration, ClusterRole and CustomResourceDefinition objects.
60+
manifests: controller-gen ensure-yq ## Generate WebhookConfiguration, ClusterRole and CustomResourceDefinition objects.
6161
# Use a large maxDescLen to ensure all field comments are included as OpenAPI descriptions
6262
$(CONTROLLER_GEN) rbac:roleName=manager-role crd:maxDescLen=100000 webhook paths="./..." output:crd:artifacts:config=config/crd/bases
6363
echo "Removing name from mainContainer required fields"

deploy/cloud/operator/cmd/main.go

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -132,6 +132,7 @@ func main() {
132132
var ingressHostSuffix string
133133
var groveTerminationDelay time.Duration
134134
var modelExpressURL string
135+
var prometheusEndpoint string
135136
flag.StringVar(&metricsAddr, "metrics-bind-address", ":8080", "The address the metric endpoint binds to.")
136137
flag.StringVar(&probeAddr, "health-probe-bind-address", ":8081", "The address the probe endpoint binds to.")
137138
flag.BoolVar(&enableLeaderElection, "leader-elect", false,
@@ -161,6 +162,8 @@ func main() {
161162
"The termination delay for Grove PodGangSets")
162163
flag.StringVar(&modelExpressURL, "model-express-url", "",
163164
"URL of the Model Express server to inject into all pods")
165+
flag.StringVar(&prometheusEndpoint, "prometheus-endpoint", "",
166+
"URL of the Prometheus endpoint to use for metrics")
164167
opts := zap.Options{
165168
Development: true,
166169
}
@@ -196,7 +199,8 @@ func main() {
196199
IngressControllerTLSSecret: ingressControllerTLSSecretName,
197200
IngressHostSuffix: ingressHostSuffix,
198201
},
199-
ModelExpressURL: modelExpressURL,
202+
ModelExpressURL: modelExpressURL,
203+
PrometheusEndpoint: prometheusEndpoint,
200204
}
201205

202206
mainCtx := ctrl.SetupSignalHandler()

deploy/cloud/operator/internal/consts/consts.go

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,9 @@ const (
1616
DynamoServicePortName = "http"
1717
DynamoContainerPortName = "http"
1818

19+
DynamoPlannerMetricsPort = 9085
20+
DynamoMetricsPortName = "metrics"
21+
1922
DynamoSystemPort = 9090
2023
DynamoSystemPortName = "system"
2124

deploy/cloud/operator/internal/controller_common/predicate.go

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -58,6 +58,8 @@ type Config struct {
5858
IngressConfig IngressConfig
5959
// ModelExpressURL is the URL of the Model Express server to inject into all pods
6060
ModelExpressURL string
61+
// PrometheusEndpoint is the URL of the Prometheus endpoint to use for metrics
62+
PrometheusEndpoint string
6163
}
6264

6365
type IngressConfig struct {

0 commit comments

Comments
 (0)