diff --git a/opal/mca/accelerator/cuda/accelerator_cuda.c b/opal/mca/accelerator/cuda/accelerator_cuda.c index 9a955bac596..4a8be9cc046 100644 --- a/opal/mca/accelerator/cuda/accelerator_cuda.c +++ b/opal/mca/accelerator/cuda/accelerator_cuda.c @@ -195,14 +195,18 @@ static int accelerator_cuda_check_addr(const void *addr, int *dev_id, uint64_t * return 0; } } - + /* First access on a device pointer finalizes CUDA support initialization. */ + opal_accelerator_cuda_delayed_init(); return 1; } static int accelerator_cuda_create_stream(int dev_id, opal_accelerator_stream_t **stream) { CUresult result; - + int delayed_init = opal_accelerator_cuda_delayed_init(); + if (OPAL_UNLIKELY(0 != delayed_init)) { + return delayed_init; + } *stream = (opal_accelerator_stream_t*)OBJ_NEW(opal_accelerator_cuda_stream_t); if (NULL == *stream) { return OPAL_ERR_OUT_OF_RESOURCE; @@ -248,6 +252,10 @@ OBJ_CLASS_INSTANCE( static int accelerator_cuda_create_event(int dev_id, opal_accelerator_event_t **event) { CUresult result; + int delayed_init = opal_accelerator_cuda_delayed_init(); + if (OPAL_UNLIKELY(0 != delayed_init)) { + return delayed_init; + } *event = (opal_accelerator_event_t*)OBJ_NEW(opal_accelerator_cuda_event_t); if (NULL == *event) { @@ -340,6 +348,11 @@ static int accelerator_cuda_memcpy_async(int dest_dev_id, int src_dev_id, void * { CUresult result; + int delayed_init = opal_accelerator_cuda_delayed_init(); + if (OPAL_UNLIKELY(0 != delayed_init)) { + return delayed_init; + } + if (NULL == stream || NULL == dest || NULL == src || size <= 0) { return OPAL_ERR_BAD_PARAM; } @@ -358,6 +371,11 @@ static int accelerator_cuda_memcpy(int dest_dev_id, int src_dev_id, void *dest, { CUresult result; + int delayed_init = opal_accelerator_cuda_delayed_init(); + if (OPAL_UNLIKELY(0 != delayed_init)) { + return delayed_init; + } + if (NULL == dest || NULL == src || size <= 0) { return OPAL_ERR_BAD_PARAM; } @@ -391,6 +409,11 @@ static int accelerator_cuda_memmove(int dest_dev_id, int src_dev_id, void *dest, CUdeviceptr tmp; CUresult result; + int delayed_init = opal_accelerator_cuda_delayed_init(); + if (OPAL_UNLIKELY(0 != delayed_init)) { + return delayed_init; + } + if (NULL == dest || NULL == src || size <= 0) { return OPAL_ERR_BAD_PARAM; } @@ -425,6 +448,11 @@ static int accelerator_cuda_mem_alloc(int dev_id, void **ptr, size_t size) { CUresult result; + int delayed_init = opal_accelerator_cuda_delayed_init(); + if (OPAL_UNLIKELY(0 != delayed_init)) { + return delayed_init; + } + if (NULL == ptr || 0 == size) { return OPAL_ERR_BAD_PARAM; } @@ -434,7 +462,7 @@ static int accelerator_cuda_mem_alloc(int dev_id, void **ptr, size_t size) if (OPAL_UNLIKELY(CUDA_SUCCESS != result)) { opal_show_help("help-accelerator-cuda.txt", "cuMemAlloc failed", true, OPAL_PROC_MY_HOSTNAME, result); - return result; + return OPAL_ERROR; } } return 0; @@ -448,7 +476,7 @@ static int accelerator_cuda_mem_release(int dev_id, void *ptr) if (OPAL_UNLIKELY(CUDA_SUCCESS != result)) { opal_show_help("help-accelerator-cuda.txt", "cuMemFree failed", true, OPAL_PROC_MY_HOSTNAME, result); - return result; + return OPAL_ERROR; } } return 0; @@ -459,6 +487,11 @@ static int accelerator_cuda_get_address_range(int dev_id, const void *ptr, void { CUresult result; + int delayed_init = opal_accelerator_cuda_delayed_init(); + if (OPAL_UNLIKELY(0 != delayed_init)) { + return delayed_init; + } + if (NULL == ptr || NULL == base || NULL == size) { return OPAL_ERR_BAD_PARAM; } @@ -479,6 +512,11 @@ static int accelerator_cuda_get_address_range(int dev_id, const void *ptr, void static int accelerator_cuda_host_register(int dev_id, void *ptr, size_t size) { CUresult result; + int delayed_init = opal_accelerator_cuda_delayed_init(); + if (OPAL_UNLIKELY(0 != delayed_init)) { + return delayed_init; + } + if (NULL == ptr && size > 0) { return OPAL_ERR_BAD_PARAM; } @@ -487,7 +525,7 @@ static int accelerator_cuda_host_register(int dev_id, void *ptr, size_t size) if (OPAL_UNLIKELY(CUDA_SUCCESS != result)) { opal_show_help("help-accelerator-cuda.txt", "cuMemHostRegister failed", true, ptr, size, OPAL_PROC_MY_HOSTNAME, result); - return result; + return OPAL_ERROR; } return OPAL_SUCCESS; @@ -501,7 +539,7 @@ static int accelerator_cuda_host_unregister(int dev_id, void *ptr) if (OPAL_UNLIKELY(CUDA_SUCCESS != result)) { opal_show_help("help-accelerator-cuda.txt", "cuMemHostUnregister failed", true, ptr, OPAL_PROC_MY_HOSTNAME, result); - return result; + return OPAL_ERROR; } } return OPAL_SUCCESS; @@ -512,6 +550,11 @@ static int accelerator_cuda_get_device(int *dev_id) CUdevice cuDev; CUresult result; + int delayed_init = opal_accelerator_cuda_delayed_init(); + if (OPAL_UNLIKELY(0 != delayed_init)) { + return delayed_init; + } + if (NULL == dev_id) { return OPAL_ERR_BAD_PARAM; } @@ -520,7 +563,7 @@ static int accelerator_cuda_get_device(int *dev_id) if (OPAL_UNLIKELY(CUDA_SUCCESS != result)) { opal_show_help("help-accelerator-cuda.txt", "cuCtxGetDevice failed", true, result); - return result; + return OPAL_ERROR; } *dev_id = cuDev; return 0; @@ -530,6 +573,11 @@ static int accelerator_cuda_device_can_access_peer(int *access, int dev1, int de { CUresult result; + int delayed_init = opal_accelerator_cuda_delayed_init(); + if (OPAL_UNLIKELY(0 != delayed_init)) { + return delayed_init; + } + if (NULL == access) { return OPAL_ERR_BAD_PARAM; } @@ -538,7 +586,7 @@ static int accelerator_cuda_device_can_access_peer(int *access, int dev1, int de if (OPAL_UNLIKELY(CUDA_SUCCESS != result)) { opal_show_help("help-accelerator-cuda.txt", "cuDeviceCanAccessPeer failed", true, OPAL_PROC_MY_HOSTNAME, result); - return result; + return OPAL_ERROR; } return 0; } @@ -554,18 +602,24 @@ static int accelerator_cuda_get_buffer_id(int dev_id, const void *addr, opal_acc { CUresult result; int enable = 1; + + int delayed_init = opal_accelerator_cuda_delayed_init(); + if (OPAL_UNLIKELY(0 != delayed_init)) { + return delayed_init; + } + result = cuPointerGetAttribute((unsigned long long *)buf_id, CU_POINTER_ATTRIBUTE_BUFFER_ID, (CUdeviceptr) addr); if (OPAL_UNLIKELY(result != CUDA_SUCCESS)) { opal_show_help("help-accelerator-cuda.txt", "bufferID failed", true, OPAL_PROC_MY_HOSTNAME, result); - return result; + return OPAL_ERROR; } result = cuPointerSetAttribute(&enable, CU_POINTER_ATTRIBUTE_SYNC_MEMOPS, (CUdeviceptr) addr); if (OPAL_UNLIKELY(CUDA_SUCCESS != result)) { opal_show_help("help-accelerator-cuda.txt", "cuPointerSetAttribute failed", true, OPAL_PROC_MY_HOSTNAME, result, addr); - return result; + return OPAL_ERROR; } return OPAL_SUCCESS; } diff --git a/opal/mca/accelerator/cuda/accelerator_cuda.h b/opal/mca/accelerator/cuda/accelerator_cuda.h index 8efde778761..694a4192231 100644 --- a/opal/mca/accelerator/cuda/accelerator_cuda.h +++ b/opal/mca/accelerator/cuda/accelerator_cuda.h @@ -45,6 +45,8 @@ OPAL_DECLSPEC extern opal_accelerator_cuda_component_t mca_accelerator_cuda_comp OPAL_DECLSPEC extern opal_accelerator_base_module_t opal_accelerator_cuda_module; +OPAL_DECLSPEC extern int opal_accelerator_cuda_delayed_init(void); + END_C_DECLS #endif /* MCA_ACCELERATOR_CUDA_H */ diff --git a/opal/mca/accelerator/cuda/accelerator_cuda_component.c b/opal/mca/accelerator/cuda/accelerator_cuda_component.c index 2ffeebafd00..d48e29c9f65 100644 --- a/opal/mca/accelerator/cuda/accelerator_cuda_component.c +++ b/opal/mca/accelerator/cuda/accelerator_cuda_component.c @@ -31,12 +31,16 @@ #include "opal/util/printf.h" #include "opal/util/proc.h" #include "opal/util/show_help.h" - +#include "opal/sys/atomic.h" /* Define global variables, used in accelerator_cuda.c */ CUstream opal_accelerator_cuda_memcpy_stream = NULL; opal_mutex_t opal_accelerator_cuda_stream_lock = {0}; +/* Initialization lock for delayed cuda initialization */ +static opal_mutex_t accelerator_cuda_init_lock; +static bool accelerator_cuda_init_complete = false; + #define STRINGIFY2(x) #x #define STRINGIFY(x) STRINGIFY2(x) @@ -115,19 +119,22 @@ static int accelerator_cuda_component_register(void) return OPAL_SUCCESS; } -static opal_accelerator_base_module_t* accelerator_cuda_init(void) +int opal_accelerator_cuda_delayed_init() { - int retval, i, j; - CUresult result; + int result = OPAL_SUCCESS; CUcontext cuContext; - OBJ_CONSTRUCT(&opal_accelerator_cuda_stream_lock, opal_mutex_t); + /* Double checked locking to avoid having to + * grab locks post lazy-initialization. */ + opal_atomic_rmb(); + if (true == accelerator_cuda_init_complete) { + return OPAL_SUCCESS; + } + OPAL_THREAD_LOCK(&accelerator_cuda_init_lock); - /* First check if the support is enabled. In the case that the user has - * turned it off, we do not need to continue with any CUDA specific - * initialization. Do this after MCA parameter registration. */ - if (!opal_cuda_support) { - return NULL; + /* If already initialized, just exit */ + if (true == accelerator_cuda_init_complete) { + goto out; } /* Check to see if this process is running in a CUDA context. If @@ -135,10 +142,11 @@ static opal_accelerator_base_module_t* accelerator_cuda_init(void) result = cuCtxGetCurrent(&cuContext); if (CUDA_SUCCESS != result) { opal_output_verbose(20, opal_accelerator_base_framework.framework_output, "CUDA: cuCtxGetCurrent failed"); - return NULL; + goto out; } else if ((CUDA_SUCCESS == result) && (NULL == cuContext)) { opal_output_verbose(20, opal_accelerator_base_framework.framework_output, "CUDA: cuCtxGetCurrent returned NULL context"); - return NULL; + result = OPAL_ERROR; + goto out; } else { opal_output_verbose(20, opal_accelerator_base_framework.framework_output, "CUDA: cuCtxGetCurrent succeeded"); } @@ -148,7 +156,7 @@ static opal_accelerator_base_module_t* accelerator_cuda_init(void) if (OPAL_UNLIKELY(result != CUDA_SUCCESS)) { opal_show_help("help-accelerator-cuda.txt", "cuStreamCreate failed", true, OPAL_PROC_MY_HOSTNAME, result); - return NULL; + goto out; } result = cuMemHostRegister(&checkmem, sizeof(int), 0); @@ -162,7 +170,26 @@ static opal_accelerator_base_module_t* accelerator_cuda_init(void) opal_output_verbose(20, opal_accelerator_base_framework.framework_output, "CUDA: cuMemHostRegister OK on test region"); } + result = OPAL_SUCCESS; + opal_atomic_wmb(); + accelerator_cuda_init_complete = true; +out: + OPAL_THREAD_UNLOCK(&accelerator_cuda_init_lock); + return result; +} + +static opal_accelerator_base_module_t* accelerator_cuda_init(void) +{ + OBJ_CONSTRUCT(&opal_accelerator_cuda_stream_lock, opal_mutex_t); + OBJ_CONSTRUCT(&accelerator_cuda_init_lock, opal_mutex_t); + /* First check if the support is enabled. In the case that the user has + * turned it off, we do not need to continue with any CUDA specific + * initialization. Do this after MCA parameter registration. */ + if (!opal_cuda_support) { + return NULL; + } + opal_accelerator_cuda_delayed_init(); return &opal_accelerator_cuda_module; } @@ -183,5 +210,6 @@ static void accelerator_cuda_finalize(opal_accelerator_base_module_t* module) } OBJ_DESTRUCT(&opal_accelerator_cuda_stream_lock); + OBJ_DESTRUCT(&accelerator_cuda_init_lock); return; }