11#! /bin/bash
2+ #
3+ # Helper script to manually start or join a Ray cluster for online serving of vLLM models.
4+ # Run this script first on the head node, and then on each worker node, passing the IP address
5+ # of the head node so the worker can join the cluster.
6+ #
7+ # Subcommands:
8+ # leader: Launches a Ray head node and blocks until the cluster reaches the expected size (head + workers).
9+ # worker: Starts a worker node that connects to an existing Ray head node.
10+ #
11+ # Example usage:
12+ # On the head node machine, start the Ray head node process and run a vLLM server.
13+ # ./multi-node-serving.sh leader --ray_port=6379 --ray_cluster_size=<SIZE> [<extra ray args>] && \
14+ # python3 -m vllm.entrypoints.openai.api_server --port 8080 --model meta-llama/Meta-Llama-3.1-405B-Instruct --tensor-parallel-size 8 --pipeline_parallel_size 2
15+ #
16+ # On each worker node, start the Ray worker node process.
17+ # ./multi-node-serving.sh worker --ray_address=<HEAD_NODE_IP> --ray_port=6379 [<extra ray args>]
18+ #
19+ # About Ray:
20+ # Ray is an open-source distributed execution framework that simplifies
21+ # distributed computing. Learn more:
22+ # https://ray.io/
223
3- subcommand=$1
4- shift
524
6- ray_port=6379
7- ray_init_timeout=300
8- declare -a start_params
25+ subcommand=$1 # Either "leader" or "worker".
26+ shift # Remove the subcommand from the argument list.
927
28+ ray_port=6379 # Port used by the Ray head node.
29+ ray_init_timeout=300 # Seconds to wait before timing out.
30+ declare -a start_params # Parameters forwarded to the underlying 'ray start' command.
31+
32+ # Handle the worker subcommand.
1033case " $subcommand " in
1134 worker)
1235 ray_address=" "
@@ -32,6 +55,7 @@ case "$subcommand" in
3255 exit 1
3356 fi
3457
58+ # Retry until the worker node connects to the head node or the timeout expires.
3559 for (( i= 0 ; i < $ray_init_timeout ; i+= 5 )) ; do
3660 ray start --address=$ray_address :$ray_port --block " ${start_params[@]} "
3761 if [ $? -eq 0 ]; then
@@ -45,6 +69,7 @@ case "$subcommand" in
4569 exit 1
4670 ;;
4771
72+ # Handle the leader subcommand.
4873 leader)
4974 ray_cluster_size=" "
5075 while [ $# -gt 0 ]; do
@@ -69,10 +94,10 @@ case "$subcommand" in
6994 exit 1
7095 fi
7196
72- # start the ray daemon
97+ # Start the Ray head node.
7398 ray start --head --port=$ray_port " ${start_params[@]} "
7499
75- # wait until all workers are active
100+ # Poll Ray until the number of alive nodes (head + workers) equals the expected cluster size or the timeout expires.
76101 for (( i= 0 ; i < $ray_init_timeout ; i+= 5 )) ; do
77102 active_nodes=` python3 -c ' import ray; ray.init(); print(sum(node["Alive"] for node in ray.nodes()))' `
78103 if [ $active_nodes -eq $ray_cluster_size ]; then
0 commit comments