Skip to content

Commit

Permalink
common/cuda: Fix near-hang when remote side has exited
Browse files (browse the repository at this point in the history)
When closing CUDA IPC mappings, ignore errors caused by the remote side having already exited.
openmpi/ompi#3244
  • Loading branch information
sjeaugey authored Mar 31, 2017
1 parent 5f6ba81 commit d361907
Showing 1 changed file with 4 additions and 4 deletions.
8 changes: 4 additions & 4 deletions opal/mca/common/cuda/common_cuda.c
Original file line number Diff line number Diff line change
Expand Up @@ -1157,10 +1157,10 @@ int cuda_closememhandle(void *reg_data, mca_rcache_base_registration_t *reg)
if (ctx_ok) {
result = cuFunc.cuIpcCloseMemHandle((CUdeviceptr)cuda_reg->base.alloc_base);
if (OPAL_UNLIKELY(CUDA_SUCCESS != result)) {
opal_show_help("help-mpi-common-cuda.txt", "cuIpcCloseMemHandle failed",
true, result, cuda_reg->base.alloc_base);
opal_output(0, "Sleep on %d", getpid());
sleep(20);
if (CUDA_ERROR_DEINITIALIZED != result) {
opal_show_help("help-mpi-common-cuda.txt", "cuIpcCloseMemHandle failed",
true, result, cuda_reg->base.alloc_base);
}
/* We will just continue on and hope things continue to work. */
} else {
opal_output_verbose(10, mca_common_cuda_output,
Expand Down

0 comments on commit d361907

Please sign in to comment.