24 changes: 23 additions & 1 deletion docs/serving/parallelism_scaling.md
@@ -62,7 +62,7 @@ If a single node lacks sufficient GPUs to hold the model, deploy vLLM across multiple nodes.

### What is Ray?

- Ray is a distributed computing framework for scaling Python programs. Multi-node vLLM deployments require Ray as the runtime engine.
+ Ray is a distributed computing framework for scaling Python programs. Multi-node vLLM deployments can use Ray as the runtime engine.

vLLM uses Ray to manage the distributed execution of tasks across multiple nodes and control where execution happens.
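
For readers unfamiliar with Ray, here is a minimal, generic sketch of remote task execution (not part of the original docs; it assumes only that the `ray` package is installed):

```python
# Not vLLM-specific: a minimal Ray sketch showing how remote tasks are scheduled
# across a cluster. ray.init() connects to an existing cluster if one is
# configured, otherwise it starts a local one.
import ray

ray.init()

@ray.remote
def square(x: int) -> int:
    # Ray runs this function on whichever worker it schedules the task to.
    return x * x

# Dispatch four tasks across the available workers and gather the results.
futures = [square.remote(i) for i in range(4)]
print(ray.get(futures))  # [0, 1, 4, 9]
```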

@@ -130,6 +130,28 @@ vllm serve /path/to/the/model/in/the/container \
--distributed-executor-backend ray
```

### Running vLLM with multiprocessing

Besides Ray, multi-node vLLM deployments can also use `multiprocessing` as the runtime engine. Here is an example of deploying a model across 2 nodes (8 GPUs per node) with `tp_size=8` and `pp_size=2`.

Choose one node as the head node and run:

```bash
vllm serve /path/to/the/model/in/the/container \
--tensor-parallel-size 8 --pipeline-parallel-size 2 \
--nnodes 2 --node-rank 0 \
--master-addr <HEAD_NODE_IP>
```

On the other worker node, run:

```bash
vllm serve /path/to/the/model/in/the/container \
--tensor-parallel-size 8 --pipeline-parallel-size 2 \
--nnodes 2 --node-rank 1 \
--master-addr <HEAD_NODE_IP> --headless
```
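
Once both commands are running, the head node should expose vLLM's OpenAI-compatible API. A quick smoke test (not part of the original docs; it assumes the default port 8000 and the `openai` Python client):

```python
# Not from the original docs: verify that the two-node deployment came up.
# Assumes the server listens on the default port 8000 on the head node.
from openai import OpenAI

client = OpenAI(base_url="http://<HEAD_NODE_IP>:8000/v1", api_key="EMPTY")

# List the served models; the model path passed to `vllm serve` should appear here.
print(client.models.list())
```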

## Optimizing network communication for tensor parallelism

Efficient tensor parallelism requires fast inter-node communication, preferably through high-speed network adapters such as InfiniBand.
4 changes: 1 addition & 3 deletions vllm/v1/executor/multiproc_executor.py
@@ -124,9 +124,7 @@ def _init_executor(self) -> None:
        # Set multiprocessing envs
        set_multiprocessing_worker_envs()

-        # Multiprocessing-based executor does not support multi-node setting.
-        # Since it only works for single node, we can use the loopback address
-        # get_loopback_ip() for communication.
+        # use the loopback address get_loopback_ip() for communication.
        distributed_init_method = get_distributed_init_method(
            get_loopback_ip(), get_open_port()
Comment on lines +127 to 129

P1: Fix multi-node init address for multiprocessing backend

Multi-node serving with the multiprocessing executor still cannot work because the distributed process group is initialized with get_distributed_init_method(get_loopback_ip(), get_open_port()), forcing every node to bind to 127.0.0.1 and a local port instead of the configured master_addr. When following the new multi-node docs (nnodes>1, differing node_ranks), each node forms its own local group and torch.distributed.init_process_group never connects across nodes, so startup will hang/fail. The init method needs to use the shared master address/port for multi-node runs.

        )
Comment on lines +127 to 130

critical

The distributed_init_method is hardcoded to use get_loopback_ip(), which is only suitable for single-node deployments. For multi-node deployments to work as described in the new documentation, this needs to use the master_addr and master_port from the parallel configuration.

Without this change, the multi-node feature with the multiprocessing backend will fail to initialize the distributed process group correctly across nodes.

Suggested change
-        # use the loopback address get_loopback_ip() for communication.
-        distributed_init_method = get_distributed_init_method(
-            get_loopback_ip(), get_open_port()
-        )
+        if self.parallel_config.nnodes > 1:
+            distributed_init_method = get_distributed_init_method(
+                self.parallel_config.master_addr, self.parallel_config.master_port)
+        else:
+            # use the loopback address get_loopback_ip() for communication.
+            distributed_init_method = get_distributed_init_method(
+                get_loopback_ip(), get_open_port())
