Skip to content

Commit b97791f

Browse files
crypdick authored and epwalsh committed
[Docs] Improve documentation for ray cluster launcher helper script (vllm-project#20602)
Signed-off-by: Ricardo Decal <rdecal@anyscale.com>
1 parent aae9781 commit b97791f

File tree

1 file changed

+62
-12
lines changed

1 file changed

+62
-12
lines changed

examples/online_serving/run_cluster.sh

Lines changed: 62 additions & 12 deletions
Original file line number | Diff line number | Diff line change
@@ -1,47 +1,97 @@
11
#!/bin/bash
2+
#
3+
# Launch a Ray cluster inside Docker for vLLM inference.
4+
#
5+
# This script can start either a head node or a worker node, depending on the
6+
# --head or --worker flag provided as the third positional argument.
7+
#
8+
# Usage:
9+
# 1. Designate one machine as the head node and execute:
10+
# bash run_cluster.sh \
11+
# vllm/vllm-openai \
12+
# <head_node_ip> \
13+
# --head \
14+
# /abs/path/to/huggingface/cache \
15+
# -e VLLM_HOST_IP=<head_node_ip>
16+
#
17+
# 2. On every worker machine, execute:
18+
# bash run_cluster.sh \
19+
# vllm/vllm-openai \
20+
# <head_node_ip> \
21+
# --worker \
22+
# /abs/path/to/huggingface/cache \
23+
# -e VLLM_HOST_IP=<worker_node_ip>
24+
#
25+
# Each worker requires a unique VLLM_HOST_IP value.
26+
# Keep each terminal session open. Closing a session stops the associated Ray
27+
# node and thereby shuts down the entire cluster.
28+
# Every machine must be reachable at the supplied IP address.
29+
#
30+
# The container is named "node-<random_suffix>". To open a shell inside
31+
# a container after launch, use:
32+
# docker exec -it node-<random_suffix> /bin/bash
33+
#
34+
# Then, you can execute vLLM commands on the Ray cluster as if it were a
35+
# single machine, e.g. vllm serve ...
36+
#
37+
# To stop the container, use:
38+
# docker stop node-<random_suffix>
239

3-
# Check for minimum number of required arguments
40+
# Check for minimum number of required arguments.
441
if [ $# -lt 4 ]; then
5-
echo "Usage: $0 docker_image head_node_address --head|--worker path_to_hf_home [additional_args...]"
42+
echo "Usage: $0 docker_image head_node_ip --head|--worker path_to_hf_home [additional_args...]"
643
exit 1
744
fi
845

9-
# Assign the first three arguments and shift them away
46+
# Extract the mandatory positional arguments and remove them from $@.
1047
DOCKER_IMAGE="$1"
1148
HEAD_NODE_ADDRESS="$2"
12-
NODE_TYPE="$3" # Should be --head or --worker
49+
NODE_TYPE="$3" # Should be --head or --worker.
1350
PATH_TO_HF_HOME="$4"
1451
shift 4
1552

16-
# Additional arguments are passed directly to the Docker command
53+
# Preserve any extra arguments so they can be forwarded to Docker.
1754
ADDITIONAL_ARGS=("$@")
1855

19-
# Validate node type
56+
# Validate the NODE_TYPE argument.
2057
if [ "${NODE_TYPE}" != "--head" ] && [ "${NODE_TYPE}" != "--worker" ]; then
2158
echo "Error: Node type must be --head or --worker"
2259
exit 1
2360
fi
2461

25-
# Define a function to cleanup on EXIT signal
62+
# Generate a unique container name with random suffix.
63+
# Docker container names must be unique on each host.
64+
# The random suffix allows multiple Ray containers to run simultaneously on the same machine,
65+
# for example, on a multi-GPU machine.
66+
CONTAINER_NAME="node-${RANDOM}"
67+
68+
# Define a cleanup routine that removes the container when the script exits.
69+
# This prevents orphaned containers from accumulating if the script is interrupted.
2670
cleanup() {
27-
docker stop node
28-
docker rm node
71+
docker stop "${CONTAINER_NAME}"
72+
docker rm "${CONTAINER_NAME}"
2973
}
3074
trap cleanup EXIT
3175

32-
# Command setup for head or worker node
76+
# Build the Ray start command based on the node role.
77+
# The head node manages the cluster and accepts connections on port 6379,
78+
# while workers connect to the head's address.
3379
RAY_START_CMD="ray start --block"
3480
if [ "${NODE_TYPE}" == "--head" ]; then
3581
RAY_START_CMD+=" --head --port=6379"
3682
else
3783
RAY_START_CMD+=" --address=${HEAD_NODE_ADDRESS}:6379"
3884
fi
3985

40-
# Run the docker command with the user specified parameters and additional arguments
86+
# Launch the container with the assembled parameters.
87+
# --network host: Allows Ray nodes to communicate directly via host networking
88+
# --shm-size 10.24g: Increases shared memory
89+
# --gpus all: Gives container access to all GPUs on the host
90+
# -v HF_HOME: Mounts HuggingFace cache to avoid re-downloading models
4191
docker run \
4292
--entrypoint /bin/bash \
4393
--network host \
44-
--name node \
94+
--name "${CONTAINER_NAME}" \
4595
--shm-size 10.24g \
4696
--gpus all \
4797
-v "${PATH_TO_HF_HOME}:/root/.cache/huggingface" \

0 commit comments

Comments
 (0)