[docs][serve.llm] Add cross-node TP/PP and custom placement group documentation

nrghosh · nrghosh · commit 182399018d02 · 2025-10-17T00:58:17.000-07:00
Signed-off-by: Nikhil Ghosh <nikhil@anyscale.com>
diff --git a/doc/BUILD.bazel b/doc/BUILD.bazel
@@ -299,6 +299,7 @@ py_test_run_all_subdirectory(
         "source/serve/doc_code/stable_diffusion.py",
         "source/serve/doc_code/object_detection.py",
         "source/serve/doc_code/vllm_example.py",
+        "source/serve/doc_code/cross_node_parallelism_example.py",
         "source/serve/doc_code/llm/llm_yaml_config_example.py",
         "source/serve/doc_code/llm/qwen_example.py",
     ],
diff --git a/doc/source/serve/doc_code/cross_node_parallelism_example.py b/doc/source/serve/doc_code/cross_node_parallelism_example.py
@@ -0,0 +1,261 @@
+# flake8: noqa
+"""
+Cross-node parallelism examples for Ray Serve LLM.
+
+Demonstrates tensor parallelism (TP), pipeline parallelism (PP), and custom
+placement group strategies for multi-node LLM deployments.
+"""
+
+# __cross_node_tp_example_start__
+import vllm
+from ray import serve
+from ray.serve.llm import LLMConfig, build_openai_app
+
+# Configure a model with tensor parallelism across 2 GPUs per replica
+# Tensor parallelism shards the weights of each layer across GPUs
+llm_config = LLMConfig(
+    model_loading_config=dict(
+        model_id="llama-3.1-8b",
+        model_source="meta-llama/Llama-3.1-8B-Instruct",
+    ),
+    deployment_config=dict(
+        autoscaling_config=dict(
+            min_replicas=1,
+            max_replicas=2,
+        )
+    ),
+    accelerator_type="L4",
+    engine_kwargs=dict(
+        tensor_parallel_size=2,
+        distributed_executor_backend="ray",
+        max_model_len=8192,
+    ),
+)
+
+# Deploy the application
+app = build_openai_app({"llm_configs": [llm_config]})
+serve.run(app, blocking=True)
+# __cross_node_tp_example_end__
+
+# __cross_node_pp_example_start__
+from ray import serve
+from ray.serve.llm import LLMConfig, build_openai_app
+
+# Configure a model with pipeline parallelism across 2 GPUs
+# Pipeline parallelism places consecutive groups of layers (stages) on different GPUs
+llm_config = LLMConfig(
+    model_loading_config=dict(
+        model_id="llama-3.1-8b",
+        model_source="meta-llama/Llama-3.1-8B-Instruct",
+    ),
+    deployment_config=dict(
+        autoscaling_config=dict(
+            min_replicas=1,
+            max_replicas=1,
+        )
+    ),
+    accelerator_type="L4",
+    engine_kwargs=dict(
+        pipeline_parallel_size=2,
+        distributed_executor_backend="ray",
+        max_model_len=8192,
+    ),
+)
+
+# Deploy the application
+app = build_openai_app({"llm_configs": [llm_config]})
+serve.run(app, blocking=True)
+# __cross_node_pp_example_end__
+
+# __cross_node_tp_pp_example_start__
+from ray import serve
+from ray.serve.llm import LLMConfig, build_openai_app
+
+# Configure a model with both tensor and pipeline parallelism
+# This example uses 4 GPUs total (2 TP * 2 PP)
+llm_config = LLMConfig(
+    model_loading_config=dict(
+        model_id="llama-3.1-8b",
+        model_source="meta-llama/Llama-3.1-8B-Instruct",
+    ),
+    deployment_config=dict(
+        autoscaling_config=dict(
+            min_replicas=1,
+            max_replicas=1,
+        )
+    ),
+    accelerator_type="L4",
+    engine_kwargs=dict(
+        tensor_parallel_size=2,
+        pipeline_parallel_size=2,
+        distributed_executor_backend="ray",
+        max_model_len=8192,
+        enable_chunked_prefill=True,
+        max_num_batched_tokens=4096,
+    ),
+)
+
+# Deploy the application
+app = build_openai_app({"llm_configs": [llm_config]})
+serve.run(app, blocking=True)
+# __cross_node_tp_pp_example_end__
+
+# __custom_placement_group_pack_example_start__
+from ray import serve
+from ray.serve.llm import LLMConfig, build_openai_app
+
+# Configure a model with custom placement group using PACK strategy
+# PACK tries to place workers on as few nodes as possible for locality
+llm_config = LLMConfig(
+    model_loading_config=dict(
+        model_id="llama-3.1-8b",
+        model_source="meta-llama/Llama-3.1-8B-Instruct",
+    ),
+    deployment_config=dict(
+        autoscaling_config=dict(
+            min_replicas=1,
+            max_replicas=1,
+        )
+    ),
+    accelerator_type="L4",
+    engine_kwargs=dict(
+        tensor_parallel_size=2,
+        distributed_executor_backend="ray",
+        max_model_len=8192,
+    ),
+    placement_group_config=dict(
+        bundles=[{"GPU": 1, "CPU": 2}] * 2,
+        strategy="PACK",
+    ),
+)
+
+# Deploy the application
+app = build_openai_app({"llm_configs": [llm_config]})
+serve.run(app, blocking=True)
+# __custom_placement_group_pack_example_end__
+
+# __custom_placement_group_spread_example_start__
+from ray import serve
+from ray.serve.llm import LLMConfig, build_openai_app
+
+# Configure a model with custom placement group using SPREAD strategy
+# SPREAD distributes workers across distinct nodes (best effort) for fault tolerance
+llm_config = LLMConfig(
+    model_loading_config=dict(
+        model_id="llama-3.1-8b",
+        model_source="meta-llama/Llama-3.1-8B-Instruct",
+    ),
+    deployment_config=dict(
+        autoscaling_config=dict(
+            min_replicas=1,
+            max_replicas=1,
+        )
+    ),
+    accelerator_type="L4",
+    engine_kwargs=dict(
+        tensor_parallel_size=4,
+        distributed_executor_backend="ray",
+        max_model_len=8192,
+    ),
+    placement_group_config=dict(
+        bundles=[{"GPU": 1, "CPU": 2}] * 4,
+        strategy="SPREAD",
+    ),
+)
+
+# Deploy the application
+app = build_openai_app({"llm_configs": [llm_config]})
+serve.run(app, blocking=True)
+# __custom_placement_group_spread_example_end__
+
+# __custom_placement_group_strict_pack_example_start__
+from ray import serve
+from ray.serve.llm import LLMConfig, build_openai_app
+
+# Configure a model with custom placement group using STRICT_PACK strategy
+# STRICT_PACK ensures all workers are placed on the same node
+llm_config = LLMConfig(
+    model_loading_config=dict(
+        model_id="llama-3.1-8b",
+        model_source="meta-llama/Llama-3.1-8B-Instruct",
+    ),
+    deployment_config=dict(
+        autoscaling_config=dict(
+            min_replicas=1,
+            max_replicas=2,
+        )
+    ),
+    accelerator_type="A100",
+    engine_kwargs=dict(
+        tensor_parallel_size=2,
+        distributed_executor_backend="ray",
+        max_model_len=8192,
+    ),
+    placement_group_config=dict(
+        bundles=[{"GPU": 1, "CPU": 2}] * 2,
+        strategy="STRICT_PACK",
+    ),
+)
+
+# Deploy the application
+app = build_openai_app({"llm_configs": [llm_config]})
+serve.run(app, blocking=True)
+# __custom_placement_group_strict_pack_example_end__
+
+# __yaml_cross_node_tp_pp_example_start__
+# config.yaml
+# applications:
+# - args:
+#     llm_configs:
+#       - model_loading_config:
+#           model_id: llama-3.1-8b
+#           model_source: meta-llama/Llama-3.1-8B-Instruct
+#         accelerator_type: L4
+#         deployment_config:
+#           autoscaling_config:
+#             min_replicas: 1
+#             max_replicas: 1
+#         engine_kwargs:
+#           tensor_parallel_size: 2
+#           pipeline_parallel_size: 2
+#           distributed_executor_backend: ray
+#           max_model_len: 8192
+#           enable_chunked_prefill: true
+#           max_num_batched_tokens: 4096
+#   import_path: ray.serve.llm:build_openai_app
+#   name: llm_app
+#   route_prefix: "/"
+# __yaml_cross_node_tp_pp_example_end__
+
+# __yaml_custom_placement_group_example_start__
+# config.yaml
+# applications:
+# - args:
+#     llm_configs:
+#       - model_loading_config:
+#           model_id: llama-3.1-8b
+#           model_source: meta-llama/Llama-3.1-8B-Instruct
+#         accelerator_type: L4
+#         deployment_config:
+#           autoscaling_config:
+#             min_replicas: 1
+#             max_replicas: 1
+#         engine_kwargs:
+#           tensor_parallel_size: 4
+#           distributed_executor_backend: ray
+#           max_model_len: 8192
+#         placement_group_config:
+#           bundles:
+#             - GPU: 1
+#               CPU: 2
+#             - GPU: 1
+#               CPU: 2
+#             - GPU: 1
+#               CPU: 2
+#             - GPU: 1
+#               CPU: 2
+#           strategy: SPREAD
+#   import_path: ray.serve.llm:build_openai_app
+#   name: llm_app
+#   route_prefix: "/"
+# __yaml_custom_placement_group_example_end__
diff --git a/doc/source/serve/llm/index.md b/doc/source/serve/llm/index.md
@@ -11,6 +11,8 @@ Ray Serve LLM APIs allow users to deploy multiple LLM models together with a fam
 - 🔌 OpenAI compatible
 - 🔄 Multi-LoRA support with shared base models
 - 🚀 Engine agnostic architecture (i.e. vLLM, SGLang, etc)
+- 🔗 Cross-node tensor and pipeline parallelism
+- ⚙️ Custom {ref}`placement group strategies <pgroup-strategy>` for fine-grained resource control
 
 ## Requirements
 
@@ -48,9 +50,10 @@ The LLMConfig class specifies model details such as:
 
 - Model loading sources (HuggingFace or cloud storage)
 - Hardware requirements (accelerator type)
-- Engine arguments (e.g. vLLM engine kwargs)
+- Engine arguments (e.g. vLLM engine kwargs, tensor/pipeline parallelism)
 - LoRA multiplexing configuration
 - Serve auto-scaling parameters
+- Placement group configuration for multi-node deployments
 
 ```{toctree}
 :hidden: