Skip to content

Commit

Permalink
[Core][Distributed] fix pynccl del error (vllm-project#4508)
Browse files Browse the repository at this point in the history
  • Loading branch information
youkaichao authored and dtrifiro committed May 7, 2024
1 parent 8fd058c commit ee497f9
Showing 1 changed file with 4 additions and 8 deletions.
12 changes: 4 additions & 8 deletions vllm/distributed/device_communicators/pynccl.py
Original file line number Diff line number Diff line change
Expand Up @@ -200,6 +200,10 @@ def from_torch(cls, op: ReduceOp) -> int:
ncclDataType_t, ctypes.c_void_p, ctypes.c_void_p
]

# Be cautious! This is a collective call: it blocks until all
# processes in the communicator have called this function.
# Because Python object destruction can happen in arbitrary order,
# it is better not to call it at all.
# equivalent to c declaration:
# ncclResult_t ncclCommDestroy(ncclComm_t comm);
_c_ncclCommDestroy = nccl.ncclCommDestroy
Expand Down Expand Up @@ -278,11 +282,3 @@ def all_reduce(self,
ncclDataTypeEnum.from_torch(tensor.dtype),
ncclRedOpTypeEnum.from_torch(op), self.comm,
ctypes.c_void_p(stream.cuda_stream)))

def __del__(self):
# `dist` module might have been already destroyed
if hasattr(dist, 'destroy_process_group'):
dist.destroy_process_group()
# function might have been already destroyed
if _c_ncclCommDestroy is not None:
_c_ncclCommDestroy(self.comm)

0 comments on commit ee497f9

Please sign in to comment.