From aa131a94410683b0a02e74fed2ce95e6c2b6b030 Mon Sep 17 00:00:00 2001
From: conroy-cheers <conroy@corncheese.org>
Date: Sat, 14 Jun 2025 14:22:38 +1000
Subject: [PATCH] [Fix]: Fall back to Gloo when NCCL backend is unavailable

Signed-off-by: conroy-cheers <conroy@corncheese.org>
---
 vllm/distributed/parallel_state.py | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/vllm/distributed/parallel_state.py b/vllm/distributed/parallel_state.py
index 10f87c49baa9..126160b09553 100644
--- a/vllm/distributed/parallel_state.py
+++ b/vllm/distributed/parallel_state.py
@@ -938,6 +938,13 @@ def init_distributed_environment(
         assert distributed_init_method is not None, (
             "distributed_init_method must be provided when initializing "
             "distributed environment")
+        if not torch.distributed.is_backend_available(backend):
+            logger.warning(
+                "Distributed backend %s is not available; "
+                "falling back to gloo.", backend)
+            assert torch.distributed.is_gloo_available(), (
+                "Fallback Gloo backend is not available.")
+            backend = "gloo"
         # this backend is used for WORLD
         torch.distributed.init_process_group(
             backend=backend,