Commit 1823990

[docs][serve.llm] Add cross-node TP/PP and custom placement group documentation
Signed-off-by: Nikhil Ghosh <nikhil@anyscale.com>
1 parent f9c009a commit 1823990

3 files changed: +266 -1 lines changed

doc/BUILD.bazel

Lines changed: 1 addition & 0 deletions
@@ -299,6 +299,7 @@ py_test_run_all_subdirectory(
     "source/serve/doc_code/stable_diffusion.py",
     "source/serve/doc_code/object_detection.py",
     "source/serve/doc_code/vllm_example.py",
+    "source/serve/doc_code/cross_node_parallelism_example.py",
     "source/serve/doc_code/llm/llm_yaml_config_example.py",
     "source/serve/doc_code/llm/qwen_example.py",
 ],
doc/source/serve/doc_code/cross_node_parallelism_example.py

Lines changed: 261 additions & 0 deletions
@@ -0,0 +1,261 @@
# flake8: noqa
"""
Cross-node parallelism examples for Ray Serve LLM.

Demonstrates tensor parallelism (TP), pipeline parallelism (PP), and custom
placement group strategies for multi-node LLM deployments.
"""

# __cross_node_tp_example_start__
from ray import serve
from ray.serve.llm import LLMConfig, build_openai_app

# Configure a model with tensor parallelism across 2 GPUs.
# Tensor parallelism splits model weights across GPUs.
llm_config = LLMConfig(
    model_loading_config=dict(
        model_id="llama-3.1-8b",
        model_source="meta-llama/Llama-3.1-8B-Instruct",
    ),
    deployment_config=dict(
        autoscaling_config=dict(
            min_replicas=1,
            max_replicas=2,
        )
    ),
    accelerator_type="L4",
    engine_kwargs=dict(
        tensor_parallel_size=2,
        distributed_executor_backend="ray",
        max_model_len=8192,
    ),
)

# Deploy the application
app = build_openai_app({"llm_configs": [llm_config]})
serve.run(app, blocking=True)
# __cross_node_tp_example_end__

# __cross_node_pp_example_start__
from ray import serve
from ray.serve.llm import LLMConfig, build_openai_app

# Configure a model with pipeline parallelism across 2 GPUs.
# Pipeline parallelism splits model layers across GPUs.
llm_config = LLMConfig(
    model_loading_config=dict(
        model_id="llama-3.1-8b",
        model_source="meta-llama/Llama-3.1-8B-Instruct",
    ),
    deployment_config=dict(
        autoscaling_config=dict(
            min_replicas=1,
            max_replicas=1,
        )
    ),
    accelerator_type="L4",
    engine_kwargs=dict(
        pipeline_parallel_size=2,
        distributed_executor_backend="ray",
        max_model_len=8192,
    ),
)

# Deploy the application
app = build_openai_app({"llm_configs": [llm_config]})
serve.run(app, blocking=True)
# __cross_node_pp_example_end__

# __cross_node_tp_pp_example_start__
from ray import serve
from ray.serve.llm import LLMConfig, build_openai_app

# Configure a model with both tensor and pipeline parallelism.
# This example uses 4 GPUs total (2 TP * 2 PP).
llm_config = LLMConfig(
    model_loading_config=dict(
        model_id="llama-3.1-8b",
        model_source="meta-llama/Llama-3.1-8B-Instruct",
    ),
    deployment_config=dict(
        autoscaling_config=dict(
            min_replicas=1,
            max_replicas=1,
        )
    ),
    accelerator_type="L4",
    engine_kwargs=dict(
        tensor_parallel_size=2,
        pipeline_parallel_size=2,
        distributed_executor_backend="ray",
        max_model_len=8192,
        enable_chunked_prefill=True,
        max_num_batched_tokens=4096,
    ),
)

# Deploy the application
app = build_openai_app({"llm_configs": [llm_config]})
serve.run(app, blocking=True)
# __cross_node_tp_pp_example_end__

# __custom_placement_group_pack_example_start__
from ray import serve
from ray.serve.llm import LLMConfig, build_openai_app

# Configure a model with a custom placement group using the PACK strategy.
# PACK tries to place workers on as few nodes as possible for locality.
llm_config = LLMConfig(
    model_loading_config=dict(
        model_id="llama-3.1-8b",
        model_source="meta-llama/Llama-3.1-8B-Instruct",
    ),
    deployment_config=dict(
        autoscaling_config=dict(
            min_replicas=1,
            max_replicas=1,
        )
    ),
    accelerator_type="L4",
    engine_kwargs=dict(
        tensor_parallel_size=2,
        distributed_executor_backend="ray",
        max_model_len=8192,
    ),
    placement_group_config=dict(
        bundles=[{"GPU": 1, "CPU": 2}] * 2,
        strategy="PACK",
    ),
)

# Deploy the application
app = build_openai_app({"llm_configs": [llm_config]})
serve.run(app, blocking=True)
# __custom_placement_group_pack_example_end__

# __custom_placement_group_spread_example_start__
from ray import serve
from ray.serve.llm import LLMConfig, build_openai_app

# Configure a model with a custom placement group using the SPREAD strategy.
# SPREAD distributes workers across nodes for fault tolerance.
llm_config = LLMConfig(
    model_loading_config=dict(
        model_id="llama-3.1-8b",
        model_source="meta-llama/Llama-3.1-8B-Instruct",
    ),
    deployment_config=dict(
        autoscaling_config=dict(
            min_replicas=1,
            max_replicas=1,
        )
    ),
    accelerator_type="L4",
    engine_kwargs=dict(
        tensor_parallel_size=4,
        distributed_executor_backend="ray",
        max_model_len=8192,
    ),
    placement_group_config=dict(
        bundles=[{"GPU": 1, "CPU": 2}] * 4,
        strategy="SPREAD",
    ),
)

# Deploy the application
app = build_openai_app({"llm_configs": [llm_config]})
serve.run(app, blocking=True)
# __custom_placement_group_spread_example_end__

# __custom_placement_group_strict_pack_example_start__
from ray import serve
from ray.serve.llm import LLMConfig, build_openai_app

# Configure a model with a custom placement group using the STRICT_PACK strategy.
# STRICT_PACK ensures all workers are placed on the same node.
llm_config = LLMConfig(
    model_loading_config=dict(
        model_id="llama-3.1-8b",
        model_source="meta-llama/Llama-3.1-8B-Instruct",
    ),
    deployment_config=dict(
        autoscaling_config=dict(
            min_replicas=1,
            max_replicas=2,
        )
    ),
    accelerator_type="A100",
    engine_kwargs=dict(
        tensor_parallel_size=2,
        distributed_executor_backend="ray",
        max_model_len=8192,
    ),
    placement_group_config=dict(
        bundles=[{"GPU": 1, "CPU": 2}] * 2,
        strategy="STRICT_PACK",
    ),
)

# Deploy the application
app = build_openai_app({"llm_configs": [llm_config]})
serve.run(app, blocking=True)
# __custom_placement_group_strict_pack_example_end__

# __yaml_cross_node_tp_pp_example_start__
# config.yaml
# applications:
# - args:
#     llm_configs:
#     - model_loading_config:
#         model_id: llama-3.1-8b
#         model_source: meta-llama/Llama-3.1-8B-Instruct
#       accelerator_type: L4
#       deployment_config:
#         autoscaling_config:
#           min_replicas: 1
#           max_replicas: 1
#       engine_kwargs:
#         tensor_parallel_size: 2
#         pipeline_parallel_size: 2
#         distributed_executor_backend: ray
#         max_model_len: 8192
#         enable_chunked_prefill: true
#         max_num_batched_tokens: 4096
#   import_path: ray.serve.llm:build_openai_app
#   name: llm_app
#   route_prefix: "/"
# __yaml_cross_node_tp_pp_example_end__

# __yaml_custom_placement_group_example_start__
# config.yaml
# applications:
# - args:
#     llm_configs:
#     - model_loading_config:
#         model_id: llama-3.1-8b
#         model_source: meta-llama/Llama-3.1-8B-Instruct
#       accelerator_type: L4
#       deployment_config:
#         autoscaling_config:
#           min_replicas: 1
#           max_replicas: 1
#       engine_kwargs:
#         tensor_parallel_size: 4
#         distributed_executor_backend: ray
#         max_model_len: 8192
#       placement_group_config:
#         bundles:
#         - GPU: 1
#           CPU: 2
#         - GPU: 1
#           CPU: 2
#         - GPU: 1
#           CPU: 2
#         - GPU: 1
#           CPU: 2
#         strategy: SPREAD
#   import_path: ray.serve.llm:build_openai_app
#   name: llm_app
#   route_prefix: "/"
# __yaml_custom_placement_group_example_end__
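For illustration only (not part of the diff above): once one of these applications is deployed, for example with `serve deploy config.yaml`, it serves an OpenAI-compatible HTTP API. The following minimal client sketch assumes the default Serve HTTP address (http://localhost:8000), the route prefix "/" from the YAML examples, the `llama-3.1-8b` model_id configured above, and that the `openai` Python client is installed.

# Illustrative client sketch; endpoint address, route prefix, and model_id are
# assumptions based on the examples above -- adjust for your deployment.
from openai import OpenAI

client = OpenAI(base_url="http://localhost:8000/v1", api_key="FAKE_KEY")

response = client.chat.completions.create(
    model="llama-3.1-8b",  # matches model_id in the LLMConfig above
    messages=[{"role": "user", "content": "What is pipeline parallelism?"}],
)
print(response.choices[0].message.content)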

doc/source/serve/llm/index.md

Lines changed: 4 additions & 1 deletion
@@ -11,6 +11,8 @@ Ray Serve LLM APIs allow users to deploy multiple LLM models together with a fam
 - 🔌 OpenAI compatible
 - 🔄 Multi-LoRA support with shared base models
 - 🚀 Engine agnostic architecture (i.e. vLLM, SGLang, etc)
+- 🔗 Cross-node tensor and pipeline parallelism
+- ⚙️ Custom {ref}`placement group strategies <pgroup-strategy>` for fine-grained resource control

 ## Requirements

@@ -48,9 +50,10 @@ The LLMConfig class specifies model details such as:
 
 - Model loading sources (HuggingFace or cloud storage)
 - Hardware requirements (accelerator type)
-- Engine arguments (e.g. vLLM engine kwargs)
+- Engine arguments (e.g. vLLM engine kwargs, tensor/pipeline parallelism)
 - LoRA multiplexing configuration
 - Serve auto-scaling parameters
+- Placement group configuration for multi-node deployments
 
 ```{toctree}
 :hidden:
