Skip to content

Commit f50860e

Browse files
mohammedabdulwahhab, hhzhang16
authored and committed
feat: add crds for vllm and llm examples (#1766)
Signed-off-by: mohammedabdulwahhab <furkhan324@berkeley.edu> Co-authored-by: Hannah Zhang <hannahz@nvidia.com> Co-authored-by: hhzhang16 <54051230+hhzhang16@users.noreply.github.com>
1 parent a19249d commit f50860e

File tree

10 files changed

+1246
-0
lines changed

10 files changed

+1246
-0
lines changed

examples/llm/deploy/agg.yaml

Lines changed: 100 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,100 @@
1+
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2+
# SPDX-License-Identifier: Apache-2.0
3+
#
4+
# Licensed under the Apache License, Version 2.0 (the "License");
5+
# you may not use this file except in compliance with the License.
6+
# You may obtain a copy of the License at
7+
#
8+
# http://www.apache.org/licenses/LICENSE-2.0
9+
#
10+
# Unless required by applicable law or agreed to in writing, software
11+
# distributed under the License is distributed on an "AS IS" BASIS,
12+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
# See the License for the specific language governing permissions and
14+
# limitations under the License.
15+
# Aggregated LLM example: a DynamoGraphDeployment with three services
# (Frontend, Processor, VllmWorker), each started from the graphs.agg module.
apiVersion: nvidia.com/v1alpha1
kind: DynamoGraphDeployment
metadata:
  name: llm-agg
spec:
  envs:
    # JSON-encoded per-component settings. The value is single-quoted so the
    # embedded double quotes need no escaping; keep it on one line.
    - name: DYN_DEPLOYMENT_CONFIG
      value: '{"Common":{"model":"deepseek-ai/DeepSeek-R1-Distill-Llama-8B","block-size":64,"max-model-len":16384},"Frontend":{"served_model_name":"deepseek-ai/DeepSeek-R1-Distill-Llama-8B","endpoint":"dynamo.Processor.chat/completions","port":8000},"Processor":{"router":"round-robin","router-num-threads":4,"common-configs":["model","block-size","max-model-len"]},"VllmWorker":{"enforce-eager":true,"max-num-batched-tokens":16384,"enable-prefix-caching":true,"ServiceArgs":{"workers":1,"resources":{"gpu":"1"}},"common-configs":["model","block-size","max-model-len"]},"Planner":{"environment":"kubernetes","no-operation":true}}'
  services:
    # Frontend: the only service with componentType "main".
    Frontend:
      dynamoNamespace: llm-agg
      componentType: main
      replicas: 1
      resources:
        requests:
          cpu: "1"
          memory: "2Gi"
        limits:
          cpu: "1"
          memory: "2Gi"
      extraPodSpec:
        mainContainer:
          image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:latest
          workingDir: /workspace/examples/llm
          args:
            - dynamo
            - serve
            - graphs.agg:Frontend
            - --system-app-port
            # Quoted so the port is passed as a string argument.
            - "5000"
            - --enable-system-app
            - --use-default-health-checks
            - --service-name
            - Frontend
    # Processor: CPU-only worker with the same 1 CPU / 2Gi sizing as Frontend.
    Processor:
      dynamoNamespace: llm-agg
      componentType: worker
      replicas: 1
      resources:
        requests:
          cpu: "1"
          memory: "2Gi"
        limits:
          cpu: "1"
          memory: "2Gi"
      extraPodSpec:
        mainContainer:
          image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:latest
          workingDir: /workspace/examples/llm
          args:
            - dynamo
            - serve
            - graphs.agg:Processor
            - --system-app-port
            - "5000"
            - --enable-system-app
            - --use-default-health-checks
            - --service-name
            - Processor
    # VllmWorker: requests and is limited to one nvidia.com/gpu.
    # No componentType is set for this service.
    VllmWorker:
      # NOTE(review): presumably supplies the Hugging Face token - confirm.
      envFromSecret: hf-token-secret
      dynamoNamespace: llm-agg
      replicas: 1
      resources:
        requests:
          cpu: "10"
          memory: "20Gi"
          nvidia.com/gpu: "1"
        limits:
          cpu: "10"
          memory: "20Gi"
          nvidia.com/gpu: "1"
      extraPodSpec:
        mainContainer:
          image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:latest
          workingDir: /workspace/examples/llm
          args:
            - dynamo
            - serve
            - graphs.agg:VllmWorker
            - --system-app-port
            - "5000"
            - --enable-system-app
            - --use-default-health-checks
            - --service-name
            - VllmWorker

examples/llm/deploy/agg_router.yaml

Lines changed: 125 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,125 @@
1+
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2+
# SPDX-License-Identifier: Apache-2.0
3+
#
4+
# Licensed under the Apache License, Version 2.0 (the "License");
5+
# you may not use this file except in compliance with the License.
6+
# You may obtain a copy of the License at
7+
#
8+
# http://www.apache.org/licenses/LICENSE-2.0
9+
#
10+
# Unless required by applicable law or agreed to in writing, software
11+
# distributed under the License is distributed on an "AS IS" BASIS,
12+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
# See the License for the specific language governing permissions and
14+
# limitations under the License.
15+
# Aggregated LLM example with a router: a DynamoGraphDeployment with four
# services (Frontend, Processor, Router, VllmWorker), each started from the
# graphs.agg_router module.
apiVersion: nvidia.com/v1alpha1
kind: DynamoGraphDeployment
metadata:
  # NOTE(review): the services below use dynamoNamespace "llm-agg-router",
  # while this resource is named "agg-router" - confirm this is intended.
  name: agg-router
spec:
  envs:
    # JSON-encoded per-component settings. Single-quoted, so the backslashes
    # in the nested kv-transfer-config string are literal; keep it on one line.
    - name: DYN_DEPLOYMENT_CONFIG
      value: '{"Common":{"model":"deepseek-ai/DeepSeek-R1-Distill-Llama-8B","router":"kv","block-size":64,"max-model-len":16384,"kv-transfer-config":"{\"kv_connector\":\"DynamoNixlConnector\"}"},"Frontend":{"served_model_name":"deepseek-ai/DeepSeek-R1-Distill-Llama-8B","endpoint":"dynamo.Processor.chat/completions","port":8000},"Processor":{"common-configs":["model","block-size","max-model-len","router"]},"Router":{"min-workers":1,"softmax-sample":true,"common-configs":["model","block-size","router"]},"VllmWorker":{"enforce-eager":true,"max-num-batched-tokens":16384,"enable-prefix-caching":true,"tensor-parallel-size":1,"ServiceArgs":{"workers":1,"resources":{"gpu":"1"}},"common-configs":["model","block-size","max-model-len","router","kv-transfer-config"]},"Planner":{"environment":"kubernetes","no-operation":true}}'
  services:
    # Frontend: the only service with componentType "main".
    Frontend:
      dynamoNamespace: llm-agg-router
      componentType: main
      replicas: 1
      resources:
        requests:
          cpu: "1"
          memory: "2Gi"
        limits:
          cpu: "1"
          memory: "2Gi"
      extraPodSpec:
        mainContainer:
          image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:latest
          workingDir: /workspace/examples/llm
          args:
            - dynamo
            - serve
            - graphs.agg_router:Frontend
            - --system-app-port
            # Quoted so the port is passed as a string argument.
            - "5000"
            - --enable-system-app
            - --use-default-health-checks
            - --service-name
            - Frontend
    # Processor: CPU-only worker, 1 CPU / 2Gi.
    Processor:
      dynamoNamespace: llm-agg-router
      componentType: worker
      replicas: 1
      resources:
        requests:
          cpu: "1"
          memory: "2Gi"
        limits:
          cpu: "1"
          memory: "2Gi"
      extraPodSpec:
        mainContainer:
          image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:latest
          workingDir: /workspace/examples/llm
          args:
            - dynamo
            - serve
            - graphs.agg_router:Processor
            - --system-app-port
            - "5000"
            - --enable-system-app
            - --use-default-health-checks
            - --service-name
            - Processor
    # Router: CPU-only worker, 1 CPU / 2Gi.
    Router:
      dynamoNamespace: llm-agg-router
      componentType: worker
      replicas: 1
      resources:
        requests:
          cpu: "1"
          memory: "2Gi"
        limits:
          cpu: "1"
          memory: "2Gi"
      extraPodSpec:
        mainContainer:
          image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:latest
          workingDir: /workspace/examples/llm
          args:
            - dynamo
            - serve
            - graphs.agg_router:Router
            - --system-app-port
            - "5000"
            - --enable-system-app
            - --use-default-health-checks
            - --service-name
            - Router
    # VllmWorker: requests and is limited to one nvidia.com/gpu.
    # No componentType is set for this service.
    VllmWorker:
      # NOTE(review): presumably supplies the Hugging Face token - confirm.
      envFromSecret: hf-token-secret
      dynamoNamespace: llm-agg-router
      replicas: 1
      resources:
        requests:
          cpu: "10"
          memory: "20Gi"
          nvidia.com/gpu: "1"
        limits:
          cpu: "10"
          memory: "20Gi"
          nvidia.com/gpu: "1"
      extraPodSpec:
        mainContainer:
          image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:latest
          workingDir: /workspace/examples/llm
          args:
            - dynamo
            - serve
            - graphs.agg_router:VllmWorker
            - --system-app-port
            - "5000"
            - --enable-system-app
            - --use-default-health-checks
            - --service-name
            - VllmWorker

examples/llm/deploy/disagg.yaml

Lines changed: 127 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,127 @@
1+
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2+
# SPDX-License-Identifier: Apache-2.0
3+
#
4+
# Licensed under the Apache License, Version 2.0 (the "License");
5+
# you may not use this file except in compliance with the License.
6+
# You may obtain a copy of the License at
7+
#
8+
# http://www.apache.org/licenses/LICENSE-2.0
9+
#
10+
# Unless required by applicable law or agreed to in writing, software
11+
# distributed under the License is distributed on an "AS IS" BASIS,
12+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
# See the License for the specific language governing permissions and
14+
# limitations under the License.
15+
# Disaggregated LLM example: a DynamoGraphDeployment with four services
# (Frontend, Processor, VllmWorker, PrefillWorker), each started from the
# graphs.disagg module.
apiVersion: nvidia.com/v1alpha1
kind: DynamoGraphDeployment
metadata:
  name: llm-disagg
spec:
  envs:
    # JSON-encoded per-component settings. Single-quoted, so the backslashes
    # in the nested kv-transfer-config string are literal; keep it on one line.
    - name: DYN_DEPLOYMENT_CONFIG
      value: '{"Common":{"model":"deepseek-ai/DeepSeek-R1-Distill-Llama-8B","block-size":64,"max-model-len":16384,"kv-transfer-config":"{\"kv_connector\":\"DynamoNixlConnector\"}"},"Frontend":{"served_model_name":"deepseek-ai/DeepSeek-R1-Distill-Llama-8B","endpoint":"dynamo.Processor.chat/completions","port":8000},"Processor":{"router":"round-robin","common-configs":["model","block-size"]},"VllmWorker":{"remote-prefill":true,"conditional-disagg":true,"max-local-prefill-length":10,"max-prefill-queue-size":2,"ServiceArgs":{"workers":1,"resources":{"gpu":"1"}},"common-configs":["model","block-size","max-model-len","kv-transfer-config"]},"PrefillWorker":{"max-num-batched-tokens":16384,"ServiceArgs":{"workers":1,"resources":{"gpu":"1"}},"common-configs":["model","block-size","max-model-len","kv-transfer-config"]},"Planner":{"environment":"kubernetes","no-operation":true}}'
  services:
    # Frontend: the only service with componentType "main".
    Frontend:
      dynamoNamespace: llm-disagg
      componentType: main
      replicas: 1
      resources:
        requests:
          cpu: "1"
          memory: "2Gi"
        limits:
          cpu: "1"
          memory: "2Gi"
      extraPodSpec:
        mainContainer:
          image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:latest
          workingDir: /workspace/examples/llm
          args:
            - dynamo
            - serve
            - graphs.disagg:Frontend
            - --system-app-port
            # Quoted so the port is passed as a string argument.
            - "5000"
            - --enable-system-app
            - --use-default-health-checks
            - --service-name
            - Frontend
    # Processor: CPU-only worker, 1 CPU / 2Gi.
    Processor:
      dynamoNamespace: llm-disagg
      componentType: worker
      replicas: 1
      resources:
        requests:
          cpu: "1"
          memory: "2Gi"
        limits:
          cpu: "1"
          memory: "2Gi"
      extraPodSpec:
        mainContainer:
          image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:latest
          workingDir: /workspace/examples/llm
          args:
            - dynamo
            - serve
            - graphs.disagg:Processor
            - --system-app-port
            - "5000"
            - --enable-system-app
            - --use-default-health-checks
            - --service-name
            - Processor
    # VllmWorker: requests and is limited to one nvidia.com/gpu.
    # No componentType is set for this service.
    VllmWorker:
      # NOTE(review): presumably supplies the Hugging Face token - confirm.
      envFromSecret: hf-token-secret
      dynamoNamespace: llm-disagg
      replicas: 1
      resources:
        requests:
          cpu: "10"
          memory: "20Gi"
          nvidia.com/gpu: "1"
        limits:
          cpu: "10"
          memory: "20Gi"
          nvidia.com/gpu: "1"
      extraPodSpec:
        mainContainer:
          image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:latest
          workingDir: /workspace/examples/llm
          args:
            - dynamo
            - serve
            - graphs.disagg:VllmWorker
            - --system-app-port
            - "5000"
            - --enable-system-app
            - --use-default-health-checks
            - --service-name
            - VllmWorker
    # PrefillWorker: same sizing and secret reference as VllmWorker.
    # No componentType is set for this service.
    PrefillWorker:
      envFromSecret: hf-token-secret
      dynamoNamespace: llm-disagg
      replicas: 1
      resources:
        requests:
          cpu: "10"
          memory: "20Gi"
          nvidia.com/gpu: "1"
        limits:
          cpu: "10"
          memory: "20Gi"
          nvidia.com/gpu: "1"
      extraPodSpec:
        mainContainer:
          image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:latest
          workingDir: /workspace/examples/llm
          args:
            - dynamo
            - serve
            - graphs.disagg:PrefillWorker
            - --system-app-port
            - "5000"
            - --enable-system-app
            - --use-default-health-checks
            - --service-name
            - PrefillWorker

0 commit comments

Comments
 (0)