Skip to content

Commit b9a205d

Browse files
committed
feat: automatically set up and inject Prometheus configuration
Signed-off-by: Julien Mancuso <jmancuso@nvidia.com>
1 parent 059e589 commit b9a205d

File tree

10 files changed

+123
-57
lines changed

10 files changed

+123
-57
lines changed

components/backends/vllm/deploy/disagg_planner.yaml

Lines changed: 0 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -53,9 +53,6 @@ spec:
5353
mainContainer:
5454
image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.4.1
5555
workingDir: /workspace/components/planner/src/dynamo/planner
56-
ports:
57-
- name: metrics
58-
containerPort: 9085
5956
command:
6057
- /bin/sh
6158
- -c
@@ -66,7 +63,6 @@ spec:
6663
--backend=vllm
6764
--adjustment-interval=60
6865
--profile-results-dir=/workspace/profiling_results
69-
--prometheus-port=9085
7066
Prometheus: # NOTE: this is set on Prometheus to ensure a service is created for the Prometheus component. This is a workaround and should be managed differently.
7167
dynamoNamespace: vllm-disagg-planner
7268
componentType: frontend

deploy/cloud/helm/platform/components/operator/templates/prometheus.yaml

Lines changed: 1 addition & 46 deletions
Original file line numberDiff line numberDiff line change
@@ -16,51 +16,7 @@
1616
apiVersion: monitoring.coreos.com/v1
1717
kind: PodMonitor
1818
metadata:
19-
name: dynamo-frontend
20-
spec:
21-
{{- if .Values.namespaceRestriction.enabled }}
22-
namespaceSelector:
23-
matchNames:
24-
- {{ .Release.Namespace }}
25-
{{- else }}
26-
namespaceSelector:
27-
any: true
28-
{{- end }}
29-
podMetricsEndpoints:
30-
- interval: 30s
31-
path: /metrics
32-
port: http
33-
selector:
34-
matchLabels:
35-
nvidia.com/dynamo-component-type: frontend
36-
nvidia.com/metrics-enabled: "true"
37-
---
38-
apiVersion: monitoring.coreos.com/v1
39-
kind: PodMonitor
40-
metadata:
41-
name: dynamo-worker
42-
spec:
43-
{{- if .Values.namespaceRestriction.enabled }}
44-
namespaceSelector:
45-
matchNames:
46-
- {{ .Release.Namespace }}
47-
{{- else }}
48-
namespaceSelector:
49-
any: true
50-
{{- end }}
51-
podMetricsEndpoints:
52-
- interval: 30s
53-
path: /metrics
54-
port: system
55-
selector:
56-
matchLabels:
57-
nvidia.com/dynamo-component-type: worker
58-
nvidia.com/metrics-enabled: "true"
59-
---
60-
apiVersion: monitoring.coreos.com/v1
61-
kind: PodMonitor
62-
metadata:
63-
name: dynamo-planner
19+
name: dynamo-metrics
6420
spec:
6521
{{- if .Values.namespaceRestriction.enabled }}
6622
namespaceSelector:
@@ -76,6 +32,5 @@ spec:
7632
port: metrics
7733
selector:
7834
matchLabels:
79-
nvidia.com/dynamo-component-type: planner
8035
nvidia.com/metrics-enabled: "true"
8136
{{- end }}

deploy/cloud/helm/platform/components/operator/values.yaml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -95,7 +95,7 @@ dynamo:
9595
password: ""
9696
existingSecretName: ''
9797
secure: true
98-
98+
9999
metrics:
100100
prometheusEndpoint: ""
101101

deploy/cloud/operator/internal/consts/consts.go

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,10 @@ const (
1616
DynamoServicePortName = "http"
1717
DynamoContainerPortName = "http"
1818

19+
DynamoMetricsPortName = "metrics"
20+
21+
DynamoPlannerMetricsPort = 9085
22+
1923
DynamoSystemPort = 9090
2024
DynamoSystemPortName = "system"
2125

deploy/cloud/operator/internal/dynamo/component_frontend.go

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -30,13 +30,17 @@ func (f *FrontendDefaults) GetBaseContainer(context ComponentContext) (corev1.Co
3030
container.Command = []string{"python3"}
3131
container.Args = []string{"-m", "dynamo.frontend"}
3232

33-
// Add HTTP port
3433
container.Ports = []corev1.ContainerPort{
3534
{
3635
Protocol: corev1.ProtocolTCP,
3736
Name: commonconsts.DynamoContainerPortName,
3837
ContainerPort: int32(commonconsts.DynamoServicePort),
3938
},
39+
{
40+
Protocol: corev1.ProtocolTCP,
41+
Name: commonconsts.DynamoMetricsPortName,
42+
ContainerPort: int32(commonconsts.DynamoServicePort),
43+
},
4044
}
4145

4246
// Add frontend-specific defaults

deploy/cloud/operator/internal/dynamo/component_planner.go

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,8 @@
66
package dynamo
77

88
import (
9+
"fmt"
10+
911
commonconsts "github.com/ai-dynamo/dynamo/deploy/cloud/operator/internal/consts"
1012
corev1 "k8s.io/api/core/v1"
1113
)
@@ -21,6 +23,19 @@ func NewPlannerDefaults() *PlannerDefaults {
2123

2224
// GetBaseContainer returns the base container for the planner component.
// It starts from the container produced by getCommonContainer, sets a single
// TCP container port named commonconsts.DynamoMetricsPortName on
// commonconsts.DynamoPlannerMetricsPort, and appends a PROMETHEUS_PORT
// environment variable carrying the same port number as a decimal string.
// The returned error is always nil.
func (p *PlannerDefaults) GetBaseContainer(context ComponentContext) (corev1.Container, error) {
2325
container := p.getCommonContainer(context)
26+
container.Ports = []corev1.ContainerPort{ // assigned, not appended: replaces any ports already on the common container
27+
{
28+
Protocol: corev1.ProtocolTCP,
29+
Name: commonconsts.DynamoMetricsPortName,
30+
ContainerPort: int32(commonconsts.DynamoPlannerMetricsPort),
31+
},
32+
}
33+
container.Env = append(container.Env, []corev1.EnvVar{ // existing env entries are kept; PROMETHEUS_PORT is added last
34+
{
35+
Name: "PROMETHEUS_PORT",
36+
Value: fmt.Sprintf("%d", commonconsts.DynamoPlannerMetricsPort), // same constant as the container port above
37+
},
38+
}...)
2439
return container, nil
2540
}
2641

deploy/cloud/operator/internal/dynamo/component_planner_test.go

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,8 +6,10 @@
66
package dynamo
77

88
import (
9+
"fmt"
910
"testing"
1011

12+
commonconsts "github.com/ai-dynamo/dynamo/deploy/cloud/operator/internal/consts"
1113
"github.com/google/go-cmp/cmp"
1214
corev1 "k8s.io/api/core/v1"
1315
)
@@ -46,10 +48,14 @@ func TestPlannerDefaults_GetBaseContainer(t *testing.T) {
4648
"/bin/sh",
4749
"-c",
4850
},
51+
Ports: []corev1.ContainerPort{
52+
{Name: commonconsts.DynamoMetricsPortName, ContainerPort: commonconsts.DynamoPlannerMetricsPort, Protocol: corev1.ProtocolTCP},
53+
},
4954
Env: []corev1.EnvVar{
5055
{Name: "DYN_NAMESPACE", Value: "dynamo-namespace"},
5156
{Name: "DYN_PARENT_DGD_K8S_NAME", Value: "name"},
5257
{Name: "DYN_PARENT_DGD_K8S_NAMESPACE", Value: "namespace"},
58+
{Name: "PROMETHEUS_PORT", Value: fmt.Sprintf("%d", commonconsts.DynamoPlannerMetricsPort)},
5359
},
5460
},
5561
},

deploy/cloud/operator/internal/dynamo/component_worker.go

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -25,13 +25,17 @@ func NewWorkerDefaults() *WorkerDefaults {
2525
func (w *WorkerDefaults) GetBaseContainer(context ComponentContext) (corev1.Container, error) {
2626
container := w.getCommonContainer(context)
2727

28-
// Add system port
2928
container.Ports = []corev1.ContainerPort{
3029
{
3130
Protocol: corev1.ProtocolTCP,
3231
Name: commonconsts.DynamoSystemPortName,
3332
ContainerPort: int32(commonconsts.DynamoSystemPort),
3433
},
34+
{
35+
Protocol: corev1.ProtocolTCP,
36+
Name: commonconsts.DynamoMetricsPortName,
37+
ContainerPort: int32(commonconsts.DynamoSystemPort),
38+
},
3539
}
3640

3741
container.LivenessProbe = &corev1.Probe{

0 commit comments

Comments
 (0)