diff --git a/docs/serving/parallelism_scaling.md b/docs/serving/parallelism_scaling.md
index a32840ea73b9..339a5b814021 100644
--- a/docs/serving/parallelism_scaling.md
+++ b/docs/serving/parallelism_scaling.md
@@ -62,7 +62,7 @@ If a single node lacks sufficient GPUs to hold the model, deploy vLLM across mul
 
 ### What is Ray?
 
-Ray is a distributed computing framework for scaling Python programs. Multi-node vLLM deployments require Ray as the runtime engine.
+Ray is a distributed computing framework for scaling Python programs. Multi-node vLLM deployments can use Ray as the runtime engine.
 
 vLLM uses Ray to manage the distributed execution of tasks across multiple nodes and control where execution happens.
 
@@ -130,6 +130,28 @@ vllm serve /path/to/the/model/in/the/container \
     --distributed-executor-backend ray
 ```
 
+### Running vLLM with multiprocessing
+
+Besides Ray, multi-node vLLM deployments can also use `multiprocessing` as the runtime engine. Here's an example that deploys a model across 2 nodes (8 GPUs per node) with `tp_size=8` and `pp_size=2`.
+
+Choose one node as the head node and run:
+
+```bash
+vllm serve /path/to/the/model/in/the/container \
+    --tensor-parallel-size 8 --pipeline-parallel-size 2 \
+    --nnodes 2 --node-rank 0 \
+    --master-addr <head-node-ip>
+```
+
+On the other worker node, run:
+
+```bash
+vllm serve /path/to/the/model/in/the/container \
+    --tensor-parallel-size 8 --pipeline-parallel-size 2 \
+    --nnodes 2 --node-rank 1 \
+    --master-addr <head-node-ip> --headless
+```
+
 ## Optimizing network communication for tensor parallelism
 
 Efficient tensor parallelism requires fast inter-node communication, preferably through high-speed network adapters such as InfiniBand.
diff --git a/vllm/v1/executor/multiproc_executor.py b/vllm/v1/executor/multiproc_executor.py
index b42d026a3e15..f81b5df96d4b 100644
--- a/vllm/v1/executor/multiproc_executor.py
+++ b/vllm/v1/executor/multiproc_executor.py
@@ -124,9 +124,7 @@ def _init_executor(self) -> None:
         # Set multiprocessing envs
         set_multiprocessing_worker_envs()
 
-        # Multiprocessing-based executor does not support multi-node setting.
-        # Since it only works for single node, we can use the loopback address
-        # get_loopback_ip() for communication.
+        # Use the loopback address get_loopback_ip() for communication.
         distributed_init_method = get_distributed_init_method(
             get_loopback_ip(), get_open_port()
         )
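
For a concrete picture of the executor-side change, here is a minimal sketch under stated assumptions: it re-implements `get_open_port` and `get_distributed_init_method` as local stand-ins for the vLLM utilities referenced in the hunk above, and the `choose_init_method` helper with its `nnodes`/`master_addr` handling is hypothetical rather than the actual vLLM logic. The point it illustrates is that single-node workers can rendezvous over the loopback address, while multi-node runs need an address every node can reach, which is what `--master-addr` supplies.

```python
# Illustrative sketch only; helper names are local stand-ins, not vLLM code.
import socket
from typing import Optional


def get_open_port() -> int:
    """Ask the OS for a free TCP port (stand-in for the vLLM utility)."""
    with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
        s.bind(("", 0))
        return s.getsockname()[1]


def get_distributed_init_method(ip: str, port: int) -> str:
    """Build a torch.distributed-style TCP init method string."""
    return f"tcp://{ip}:{port}"


def choose_init_method(nnodes: int,
                       master_addr: Optional[str],
                       master_port: Optional[int] = None) -> str:
    """Pick the rendezvous address for worker processes (hypothetical logic)."""
    if nnodes <= 1 or master_addr is None:
        # Single node: every worker lives on this host, so the loopback
        # address and any free local port are sufficient.
        return get_distributed_init_method("127.0.0.1", get_open_port())
    # Multi-node: all ranks must agree on the head node's address and port
    # (29500 is used here only as an assumed default).
    return get_distributed_init_method(master_addr, master_port or 29500)


if __name__ == "__main__":
    print(choose_init_method(nnodes=1, master_addr=None))
    print(choose_init_method(nnodes=2, master_addr="10.0.0.1", master_port=29500))
```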