diff --git a/vllm/distributed/device_communicators/pynccl.py b/vllm/distributed/device_communicators/pynccl.py
index 9434867e1b120..f21fcd262d810 100644
--- a/vllm/distributed/device_communicators/pynccl.py
+++ b/vllm/distributed/device_communicators/pynccl.py
@@ -200,6 +200,10 @@ def from_torch(cls, op: ReduceOp) -> int:
     ncclDataType_t, ctypes.c_void_p, ctypes.c_void_p
 ]
 
+# be cautious! this is a collective call, it will block until all
+# processes in the communicator have called this function.
+# because Python object destruction can happen in random order,
+# it is better not to call it at all.
 # equivalent to c declaration:
 # ncclResult_t ncclCommDestroy(ncclComm_t comm);
 _c_ncclCommDestroy = nccl.ncclCommDestroy
@@ -278,11 +282,3 @@ def all_reduce(self,
                              ncclDataTypeEnum.from_torch(tensor.dtype),
                              ncclRedOpTypeEnum.from_torch(op), self.comm,
                              ctypes.c_void_p(stream.cuda_stream)))
-
-    def __del__(self):
-        # `dist` module might have been already destroyed
-        if hasattr(dist, 'destroy_process_group'):
-            dist.destroy_process_group()
-        # function might have been already destroyed
-        if _c_ncclCommDestroy is not None:
-            _c_ncclCommDestroy(self.comm)
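
For reference, below is a minimal, self-contained sketch (not part of the patch) of the failure mode the new comment warns about: a blocking, collective teardown inside __del__ can deadlock when ranks destroy objects in different orders. A multiprocessing.Barrier stands in for the collective semantics of ncclCommDestroy, and the Comm class and worker function are hypothetical stand-ins, not vLLM code.

# Illustrative sketch (not part of the patch): why a blocking collective
# call inside __del__ can hang. multiprocessing.Barrier plays the role of
# ncclCommDestroy here: it only returns once every participant has reached
# it. Rank 1 never constructs the object, mimicking a rank whose interpreter
# destroys objects in a different order (or exits early), so rank 0 blocks
# inside __del__ and never finishes.
import multiprocessing as mp


class Comm:  # hypothetical stand-in for PyNcclCommunicator
    def __init__(self, barrier):
        self.barrier = barrier

    def __del__(self):
        # "collective" teardown: returns only when all ranks have arrived
        self.barrier.wait()


def worker(rank, barrier):
    if rank == 0:
        comm = Comm(barrier)
        del comm  # rank 0 waits here for a peer that never shows up
    print(f"rank {rank} finished")  # rank 0 never prints this


if __name__ == "__main__":
    barrier = mp.Barrier(2)
    procs = [mp.Process(target=worker, args=(r, barrier)) for r in range(2)]
    for p in procs:
        p.start()
    for p in procs:
        p.join(timeout=5)  # rank 0 is still stuck after the timeout
    for p in procs:
        if p.is_alive():
            p.terminate()  # clean up the deadlocked rank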