From fad962b8cb2cc7c0333e166a56b2e1144cd92869 Mon Sep 17 00:00:00 2001
From: Quincey Koziol
Date: Thu, 27 Jul 2023 10:36:54 -0500
Subject: [PATCH] Check for CUDA devices with nvmlDeviceGetCount_v2() first

Checking with the lightweight nvmlDeviceGetCount_v2() call first allows
us to avoid the more expensive call to cudaGetDeviceCount() when there
are no NVIDIA devices on the node.

Signed-off-by: Quincey Koziol
---
 configure.ac    |   2 +-
 src/hmem_cuda.c | 130 ++++++++++++++++++++++++++++++++++++++----------
 2 files changed, 104 insertions(+), 28 deletions(-)

diff --git a/configure.ac b/configure.ac
index f886a6e7d21..450997db9fa 100644
--- a/configure.ac
+++ b/configure.ac
@@ -595,7 +595,7 @@ AS_IF([test x"$with_cuda" != x"no"],
 			[cuda_runtime.h],
 			[cudart],
 			[cudaMemcpy],
-			[-lcuda],
+			[-lcuda -lnvidia-ml],
 			[$with_cuda],
 			[],
 			[have_cuda=1])
diff --git a/src/hmem_cuda.c b/src/hmem_cuda.c
index 1acca937489..551f63a3eba 100644
--- a/src/hmem_cuda.c
+++ b/src/hmem_cuda.c
@@ -42,14 +42,15 @@
 
 #include <cuda.h>
 #include <cuda_runtime.h>
+#include <nvml.h>
 
 #if ENABLE_CUDA_DLOPEN
 #include <dlfcn.h>
 #endif
 
 /*
- * Convenience higher-order macros for enumerating CUDA driver/runtime API
- * function names
+ * Convenience higher-order macros for enumerating CUDA driver/runtime and
+ * NVML API function names
  */
 #define CUDA_DRIVER_FUNCS_DEF(_)	\
 	_(cuGetErrorName)		\
@@ -75,6 +76,11 @@
 	_(cudaIpcGetMemHandle) \
 	_(cudaIpcCloseMemHandle)
 
+#define NVML_FUNCS_DEF(_)	\
+	_(nvmlInit_v2)	\
+	_(nvmlDeviceGetCount_v2)	\
+	_(nvmlShutdown)
+
 static struct {
 	int device_count;
 	bool p2p_access_supported;
@@ -82,13 +88,15 @@ static struct {
 	bool use_ipc;
 	void *driver_handle;
 	void *runtime_handle;
+	void *nvml_handle;
 } cuda_attr = {
 	.device_count = -1,
 	.p2p_access_supported = false,
 	.use_gdrcopy = false,
 	.use_ipc = false,
 	.driver_handle = NULL,
-	.runtime_handle = NULL
+	.runtime_handle = NULL,
+	.nvml_handle = NULL
 };
 
 static struct {
@@ -123,12 +131,16 @@ static struct {
 	cudaError_t (*cudaIpcGetMemHandle)(cudaIpcMemHandle_t *handle,
 					   void *devptr);
 	cudaError_t (*cudaIpcCloseMemHandle)(void *devptr);
+	nvmlReturn_t (*nvmlInit_v2)(void);
+	nvmlReturn_t (*nvmlDeviceGetCount_v2)(unsigned int *deviceCount);
+	nvmlReturn_t (*nvmlShutdown)(void);
 } cuda_ops
 #if !ENABLE_CUDA_DLOPEN
 #define CUDA_OPS_INIT(sym) .sym = sym,
 = {
 	CUDA_DRIVER_FUNCS_DEF(CUDA_OPS_INIT)
 	CUDA_RUNTIME_FUNCS_DEF(CUDA_OPS_INIT)
+	NVML_FUNCS_DEF(CUDA_OPS_INIT)
 }
 #endif
 ;
@@ -243,6 +255,21 @@ static cudaError_t ofi_cudaGetDeviceCount(int *count)
 	return cuda_ops.cudaGetDeviceCount(count);
 }
 
+static nvmlReturn_t ofi_nvmlInit_v2(void)
+{
+	return cuda_ops.nvmlInit_v2();
+}
+
+static nvmlReturn_t ofi_nvmlDeviceGetCount_v2(unsigned int *count)
+{
+	return cuda_ops.nvmlDeviceGetCount_v2(count);
+}
+
+static nvmlReturn_t ofi_nvmlShutdown(void)
+{
+	return cuda_ops.nvmlShutdown();
+}
+
 cudaError_t ofi_cudaMalloc(void **ptr, size_t size)
 {
 	return cuda_ops.cudaMalloc(ptr, size);
@@ -384,12 +411,13 @@ int cuda_get_base_addr(const void *ptr, void **base, size_t *size)
 		if (!cuda_ops.sym) { \
 			FI_WARN(&core_prov, FI_LOG_CORE, \
 				"Failed to find " #sym "\n"); \
-			goto err_dlclose_cuda_driver; \
+			goto err_dlclose_nvml_lib; \
 		} \
 	} while (0);
 
 #define CUDA_DRIVER_FUNCS_DLOPEN(sym) CUDA_FUNCS_DLOPEN(driver, sym)
 #define CUDA_RUNTIME_FUNCS_DLOPEN(sym) CUDA_FUNCS_DLOPEN(runtime, sym)
+#define NVML_LIB_FUNCS_DLOPEN(sym) CUDA_FUNCS_DLOPEN(nvml, sym)
 
 static int cuda_hmem_dl_init(void)
 {
@@ -411,11 +439,21 @@ static int cuda_hmem_dl_init(void)
 		goto err_dlclose_cuda_runtime;
 	}
 
+	cuda_attr.nvml_handle = dlopen("libnvidia-ml.so", RTLD_NOW);
+	if (!cuda_attr.nvml_handle) {
+		FI_WARN(&core_prov, FI_LOG_CORE,
+			"Failed to dlopen libnvidia-ml.so\n");
+		goto err_dlclose_cuda_driver;
+	}
+
 	CUDA_DRIVER_FUNCS_DEF(CUDA_DRIVER_FUNCS_DLOPEN)
 	CUDA_RUNTIME_FUNCS_DEF(CUDA_RUNTIME_FUNCS_DLOPEN)
+	NVML_FUNCS_DEF(NVML_LIB_FUNCS_DLOPEN)
 
 	return FI_SUCCESS;
 
+err_dlclose_nvml_lib:
+	dlclose(cuda_attr.nvml_handle);
 err_dlclose_cuda_driver:
 	dlclose(cuda_attr.driver_handle);
 err_dlclose_cuda_runtime:
@@ -430,6 +468,7 @@ static int cuda_hmem_dl_init(void)
 static void cuda_hmem_dl_cleanup(void)
 {
 #if ENABLE_CUDA_DLOPEN
+	dlclose(cuda_attr.nvml_handle);
 	dlclose(cuda_attr.driver_handle);
 	dlclose(cuda_attr.runtime_handle);
 #endif
@@ -437,31 +476,68 @@ static void cuda_hmem_dl_cleanup(void)
 
 static int cuda_hmem_verify_devices(void)
 {
-	cudaError_t cuda_ret;
+	nvmlReturn_t nvml_ret;
+	cudaError_t cuda_ret;
+	unsigned int nvml_device_count = 0;
 
-	/* Verify CUDA compute-capable devices are present on the host. */
-	cuda_ret = ofi_cudaGetDeviceCount(&cuda_attr.device_count);
-	switch (cuda_ret) {
-	case cudaSuccess:
-		break;
-
-	case cudaErrorNoDevice:
-		return -FI_ENOSYS;
-
-	default:
-		FI_WARN(&core_prov, FI_LOG_CORE,
-			"Failed to perform cudaGetDeviceCount: %s:%s\n",
-			ofi_cudaGetErrorName(cuda_ret),
-			ofi_cudaGetErrorString(cuda_ret));
-		return -FI_EIO;
-	}
-
-	FI_INFO(&core_prov, FI_LOG_CORE,
-		"Number of CUDA devices detected: %d\n",
-		cuda_attr.device_count);
+	/* Check w/ nvmlDeviceGetCount_v2() first, to avoid more expensive
+	 * call to cudaGetDeviceCount() when possible.
+	 */
 
-	if (cuda_attr.device_count <= 0)
-		return -FI_ENOSYS;
+	/* Make certain that the NVML routines are initialized */
+	nvml_ret = ofi_nvmlInit_v2();
+	if (nvml_ret != NVML_SUCCESS)
+		return -FI_ENOSYS;
+
+	/* Verify NVIDIA devices are present on the host. */
+	nvml_ret = ofi_nvmlDeviceGetCount_v2(&nvml_device_count);
+	if (nvml_ret != NVML_SUCCESS) {
+		ofi_nvmlShutdown();
+		return -FI_ENOSYS;
+	}
+
+	/* Make certain that the NVML routines get shutdown */
+	/* Note: nvmlInit / Shutdown calls are refcounted, so no harm in
+	 * calling nvmlShutdown here, if the user has called nvmlInit.
+	 */
+	nvml_ret = ofi_nvmlShutdown();
+	if (nvml_ret != NVML_SUCCESS)
+		return -FI_ENOSYS;
+
+	FI_INFO(&core_prov, FI_LOG_CORE,
+		"Number of NVIDIA devices detected: %u\n",
+		nvml_device_count);
+
+	/* If NVIDIA devices are present, now perform more expensive check
+	 * for actual GPUs.
+	 */
+	if (nvml_device_count > 0) {
+		/* Verify CUDA compute-capable devices are present on the host. */
+		cuda_ret = ofi_cudaGetDeviceCount(&cuda_attr.device_count);
+		switch (cuda_ret) {
+		case cudaSuccess:
+			break;
+
+		case cudaErrorNoDevice:
+			return -FI_ENOSYS;
+
+		default:
+			FI_WARN(&core_prov, FI_LOG_CORE,
+				"Failed to perform cudaGetDeviceCount: %s:%s\n",
+				ofi_cudaGetErrorName(cuda_ret),
+				ofi_cudaGetErrorString(cuda_ret));
+			return -FI_EIO;
+		}
+
+		FI_INFO(&core_prov, FI_LOG_CORE,
+			"Number of CUDA devices detected: %d\n",
+			cuda_attr.device_count);
+	} else {
+		cuda_attr.device_count = 0;
+	}
+
+	if (cuda_attr.device_count <= 0)
+		return -FI_ENOSYS;
 
 	return FI_SUCCESS;
 }
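
For readers outside the libfabric tree, the sketch below shows the probe order this patch introduces in cuda_hmem_verify_devices() as a standalone C program: ask NVML for the device count first, and only pay for cudaGetDeviceCount() when NVML reports at least one NVIDIA device. It is a minimal illustration, not part of the patch: it links NVML and the CUDA runtime directly (e.g. -lnvidia-ml -lcudart) instead of going through the dlopen wrappers used when ENABLE_CUDA_DLOPEN is set, and the probe_cuda_devices() helper and its return convention are invented for the example.

/*
 * Minimal standalone sketch of the NVML-first probe (illustrative only).
 * Build roughly as: gcc probe.c -I<cuda include dir> -lnvidia-ml -lcudart
 */
#include <stdio.h>

#include <cuda_runtime.h>
#include <nvml.h>

static int probe_cuda_devices(void)
{
	nvmlReturn_t nvml_ret;
	unsigned int nvml_count = 0;
	cudaError_t cuda_ret;
	int cuda_count = 0;

	/* Cheap check: NVML only talks to the kernel driver. */
	nvml_ret = nvmlInit_v2();
	if (nvml_ret != NVML_SUCCESS)
		return 0;	/* no driver -> no devices */

	nvml_ret = nvmlDeviceGetCount_v2(&nvml_count);

	/* nvmlInit/nvmlShutdown are reference counted, so shutting down
	 * here does not disturb a caller that initialized NVML itself. */
	nvmlShutdown();

	if (nvml_ret != NVML_SUCCESS || nvml_count == 0)
		return 0;

	/* Only now pay for CUDA runtime initialization. */
	cuda_ret = cudaGetDeviceCount(&cuda_count);
	if (cuda_ret != cudaSuccess)
		return 0;

	return cuda_count;
}

int main(void)
{
	printf("usable CUDA devices: %d\n", probe_cuda_devices());
	return 0;
}

As in the patch, the early nvmlShutdown() call relies on NVML's init/shutdown reference counting, so an application that keeps its own NVML session open is unaffected by the probe.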