Skip to content

Commit

Permalink
Allow selection of a particular GPU (via the mask).
Browse files Browse the repository at this point in the history
Signed-off-by: George Bosilca <gbosilca@nvidia.com>
  • Loading branch information
bosilca committed Jul 28, 2024
1 parent c403e17 commit 194c274
Showing 1 changed file with 13 additions and 22 deletions.
35 changes: 13 additions & 22 deletions parsec/mca/device/cuda/device_cuda_component.c
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,8 @@ int parsec_cuda_max_streams = PARSEC_GPU_MAX_STREAMS;
int parsec_cuda_memory_block_size, parsec_cuda_memory_percentage, parsec_cuda_memory_number_of_blocks;
char* parsec_cuda_lib_path = NULL;

-static int cuda_mask;
+static int parsec_device_cuda_mask = 0xFF;
+static int parsec_device_cuda_avail = 0;
static int parsec_cuda_sort_pending;

#if defined(PARSEC_PROF_TRACE)
Expand Down Expand Up @@ -104,10 +105,10 @@ static int device_cuda_component_query(mca_base_module_t **module, int *priority
else
parsec_device_cuda_component.modules = NULL;

-for( i = j = 0; i < parsec_device_cuda_enabled; i++ ) {
+for( i = j = 0; i < parsec_device_cuda_avail; i++ ) {

/* Allow fine grain selection of the GPU's */
-if( !((1 << i) & cuda_mask) ) continue;
+if( !((1 << i) & parsec_device_cuda_mask) ) continue;

rc = parsec_cuda_module_init(i, &parsec_device_cuda_component.modules[j]);
if( PARSEC_SUCCESS != rc ) {
Expand Down Expand Up @@ -139,7 +140,7 @@ static int device_cuda_component_register(void)
false, false, -1, &parsec_device_cuda_enabled);
(void)parsec_mca_param_reg_int_name("device_cuda", "mask",
"The bitwise mask of CUDA devices to be enabled (default all)",
-false, false, 0xffffffff, &cuda_mask);
+false, false, 0xffffffff, &parsec_device_cuda_mask);
(void)parsec_mca_param_reg_int_name("device_cuda", "nvlink_mask",
"What devices are allowed to use NVLINK if available (default all)",
false, false, 0xffffffff, &parsec_cuda_nvlink_mask);
Expand Down Expand Up @@ -185,15 +186,14 @@ static int device_cuda_component_register(void)
static int device_cuda_component_open(void)
{
cudaError_t cudastatus;
-int ndevices;

if( 0 == parsec_device_cuda_enabled ) {
return MCA_ERROR; /* Nothing to do around here */
}

-cudastatus = cudaGetDeviceCount( &ndevices );
+cudastatus = cudaGetDeviceCount(&parsec_device_cuda_avail);
if( cudaErrorNoDevice == (cudaError_t) cudastatus ) {
-ndevices = 0;
+parsec_device_cuda_avail = 0;
/* This is normal on machines with no GPUs, let it flow
* to do the normal checks vis-a-vis the number of requested
* devices and issue a warning only when not fulfilling
Expand All @@ -208,31 +208,22 @@ static int device_cuda_component_open(void)
} );
}

-if( ndevices > parsec_device_cuda_enabled ) {
-if( 0 < parsec_device_cuda_enabled ) {
-ndevices = parsec_device_cuda_enabled;
-}
-} else if (ndevices < parsec_device_cuda_enabled ) {
+/* Update the number of GPU for the upper layer */
+if (parsec_device_cuda_avail < parsec_device_cuda_enabled ) {
if( 0 < parsec_device_cuda_enabled ) {
-if( 0 == ndevices ) {
+if( 0 == parsec_device_cuda_avail ) {
parsec_warning("User requested %d CUDA devices, but none are available on %s."
" CUDA support will be therefore disabled.",
parsec_device_cuda_enabled, parsec_hostname);
} else {
parsec_warning("User requested %d CUDA devices, but only %d are available on %s.",
-parsec_device_cuda_enabled, ndevices, parsec_hostname);
+parsec_device_cuda_enabled, parsec_device_cuda_avail, parsec_hostname);
}
-parsec_mca_param_set_int(parsec_device_cuda_enabled_index, ndevices);
}
+parsec_mca_param_set_int(parsec_device_cuda_enabled_index, parsec_device_cuda_avail);
}

-/* Update the number of GPU for the upper layer */
-parsec_device_cuda_enabled = ndevices;
-if( 0 == ndevices ) {
-return MCA_ERROR;
-}
-
-return MCA_SUCCESS;
+return (0 == parsec_device_cuda_avail) ? MCA_ERROR : MCA_SUCCESS;
}

/**
Expand Down

0 comments on commit 194c274

Please sign in to comment.