From aa131a94410683b0a02e74fed2ce95e6c2b6b030 Mon Sep 17 00:00:00 2001 From: conroy-cheers Date: Sat, 14 Jun 2025 14:22:38 +1000 Subject: [PATCH] [Fix]: Fall back to Gloo when NCCL backend is unavailable Signed-off-by: conroy-cheers --- vllm/distributed/parallel_state.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/vllm/distributed/parallel_state.py b/vllm/distributed/parallel_state.py index 10f87c49baa9..126160b09553 100644 --- a/vllm/distributed/parallel_state.py +++ b/vllm/distributed/parallel_state.py @@ -938,6 +938,13 @@ def init_distributed_environment( assert distributed_init_method is not None, ( "distributed_init_method must be provided when initializing " "distributed environment") + if not torch.distributed.is_backend_available(backend): + logger.warning( + "Distributed backend %s is not available; " + "falling back to gloo.", backend) + assert torch.distributed.is_gloo_available(), ( + "Fallback Gloo backend is not available.") + backend = "gloo" # this backend is used for WORLD torch.distributed.init_process_group( backend=backend,