Skip to content

Commit

Permalink
Add a configure option to enable GPU-aware communications.
Browse files Browse the repository at this point in the history
Signed-off-by: George Bosilca <gbosilca@nvidia.com>
  • Loading branch information
bosilca committed Aug 8, 2024
1 parent 10ca380 commit b998764
Show file tree
Hide file tree
Showing 6 changed files with 32 additions and 6 deletions.
3 changes: 3 additions & 0 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -165,6 +165,9 @@ option(PARSEC_DIST_WITH_MPI
if(PARSEC_DIST_WITH_MPI AND 0)
message(FATAL_ERROR "PARSEC_DIST_WITH_MPI and PARSEC_DIST_WITH_OTHER are mutually exclusive, please select only one")
endif()
# Build-time switch stating whether the MPI library can be handed GPU buffers directly.
# When it is not defined, the runtime default of parsec_mpi_allow_gpu_memory_communications
# is reset to 0 before the "mpi"/"gpu_aware" MCA parameter is registered.
# Note: the trailing backslash joins the two lines of the help text, so the space
# before it is required to keep the two sentences separated.
option(PARSEC_MPI_IS_GPU_AWARE
"Build PaRSEC assuming the MPI library is GPU-aware, i.e. it can move data directly to and from GPU memory. \
As of today (mid 2024), while most MPI libraries support such an option, they require a single process per GPU." ON)
option(PARSEC_DIST_THREAD
"Use an extra thread to progress the data movements" ON)
option(PARSEC_DIST_PRIORITIES
Expand Down
1 change: 1 addition & 0 deletions parsec/include/parsec/parsec_options.h.in
Original file line number Diff line number Diff line change
Expand Up @@ -71,6 +71,7 @@

/* Communication engine */
#cmakedefine PARSEC_DIST_WITH_MPI
#cmakedefine PARSEC_MPI_IS_GPU_AWARE
#cmakedefine PARSEC_DIST_THREAD
#cmakedefine PARSEC_DIST_PRIORITIES
#cmakedefine PARSEC_DIST_COLLECTIVES
Expand Down
11 changes: 7 additions & 4 deletions parsec/mca/device/device_gpu.c
Original file line number Diff line number Diff line change
Expand Up @@ -954,9 +954,9 @@ parsec_device_data_reserve_space( parsec_device_gpu_module_t* gpu_device,

/* Skip CTL flows only */
if(PARSEC_FLOW_ACCESS_NONE == (PARSEC_FLOW_ACCESS_MASK & flow->flow_flags)) {
gpu_task->flow_nb_elts[i] = 0; /* assume there is nothing to transfer to the GPU */
gpu_task->flow_nb_elts[i] = 0; /* assume there is nothing to transfer to the GPU */
continue;
}
}

PARSEC_DEBUG_VERBOSE(20, parsec_gpu_output_stream,
"GPU[%d:%s]:%s: Investigating flow %s:%d",
Expand All @@ -971,7 +971,7 @@ parsec_device_data_reserve_space( parsec_device_gpu_module_t* gpu_device,
gpu_device->super.device_index, gpu_device->super.name, task_name,
flow->name, i, gpu_elem,
this_task->data[i].data_in->data_transfer_status == PARSEC_DATA_STATUS_UNDER_TRANSFER ? " [in transfer]" : "");
this_task->data[i].data_out = this_task->data[i].data_in;
this_task->data[i].data_out = this_task->data[i].data_in;
continue;
}
master = this_task->data[i].data_in->original;
Expand Down Expand Up @@ -2477,7 +2477,10 @@ parsec_device_kernel_epilog( parsec_device_gpu_module_t *gpu_device,
gpu_copy->coherency_state = PARSEC_DATA_COHERENCY_SHARED;
assert(PARSEC_DATA_STATUS_UNDER_TRANSFER == cpu_copy->data_transfer_status);
cpu_copy->data_transfer_status = PARSEC_DATA_STATUS_COMPLETE_TRANSFER;

if( 0 == (parsec_mpi_allow_gpu_memory_communications & PARSEC_RUNTIME_SEND_FROM_GPU_MEMORY) ) {
/* Report the CPU copy as the output of the task. */
this_task->data[i].data_out = cpu_copy;
}
PARSEC_DEBUG_VERBOSE(20, parsec_gpu_output_stream,
"GPU copy %p [ref_count %d] moved to the read LRU in %s",
gpu_copy, gpu_copy->super.super.obj_reference_count, __func__);
Expand Down
9 changes: 9 additions & 0 deletions parsec/parsec_internal.h
Original file line number Diff line number Diff line change
Expand Up @@ -200,6 +200,9 @@ PARSEC_DECLSPEC PARSEC_OBJ_CLASS_DECLARATION(parsec_taskpool_t);
#define PARSEC_DEPENDENCIES_STARTUP_TASK ((parsec_dependency_t)(1<<29))
#define PARSEC_DEPENDENCIES_BITMASK (~(PARSEC_DEPENDENCIES_TASK_DONE|PARSEC_DEPENDENCIES_IN_DONE|PARSEC_DEPENDENCIES_STARTUP_TASK))

/**
 * Bit flags combined (bitwise OR) in parsec_mpi_allow_gpu_memory_communications.
 *
 * - PARSEC_RUNTIME_SEND_FROM_GPU_MEMORY: when this bit is clear, the GPU kernel
 *   epilog reports the CPU copy (instead of the GPU copy) as the output of the task.
 * - PARSEC_RUNTIME_RECV_FROM_GPU_MEMORY: despite the "FROM" in its name, this bit
 *   governs receiving *into* GPU memory. When it is clear, the buffer for incoming
 *   remote data is allocated on device 0 rather than on the preferred device
 *   (device 0 is presumably the CPU device - TODO confirm).
 */
#define PARSEC_RUNTIME_SEND_FROM_GPU_MEMORY 0x00000002
#define PARSEC_RUNTIME_RECV_FROM_GPU_MEMORY 0x00000001

/**
* This structure is used internally by the parsec_dependencies_t structures
*/
Expand Down Expand Up @@ -491,6 +494,12 @@ PARSEC_DECLSPEC extern int parsec_slow_bind_warning;
* the scheduler, but can provide a better cache reuse.
*/
PARSEC_DECLSPEC extern int parsec_runtime_keep_highest_priority_task;
/**
 * Global configuration bitmask controlling whether communications are allowed
 * to send data directly from GPU memory and/or receive data directly into GPU
 * memory. Its value is a bitwise OR of PARSEC_RUNTIME_SEND_FROM_GPU_MEMORY and
 * PARSEC_RUNTIME_RECV_FROM_GPU_MEMORY; a cleared bit disables the corresponding
 * direction, and 0 disables both.
 */
PARSEC_DECLSPEC extern int parsec_mpi_allow_gpu_memory_communications;

/**
* Description of the state of the task. It indicates what will be the next
Expand Down
10 changes: 10 additions & 0 deletions parsec/parsec_mpi_funnelled.c
Original file line number Diff line number Diff line change
Expand Up @@ -201,6 +201,8 @@ parsec_list_t mpi_funnelled_dynamic_sendreq_fifo; /* ordered non threaded fifo *
parsec_list_t mpi_funnelled_dynamic_recvreq_fifo; /* ordered non threaded fifo */
parsec_mempool_t *mpi_funnelled_dynamic_req_mempool = NULL;

/* Bitmask of PARSEC_RUNTIME_SEND_FROM_GPU_MEMORY (0x2) and
 * PARSEC_RUNTIME_RECV_FROM_GPU_MEMORY (0x1). The initial value 3 sets both bits,
 * i.e. sending from and receiving into GPU memory are both allowed.
 * mpi_funneled_init_once() resets it to 0 when PARSEC_MPI_IS_GPU_AWARE is not
 * defined, then registers it as the storage of the "mpi"/"gpu_aware" MCA parameter. */
int parsec_mpi_allow_gpu_memory_communications = 3;

/* This structure is used to save all the information necessary to
* invoke a callback after a MPI_Request is satisfied
*/
Expand Down Expand Up @@ -506,6 +508,14 @@ static int mpi_funneled_init_once(parsec_context_t* context)
MAX_MPI_TAG, (unsigned int)MAX_MPI_TAG, MAX_MPI_TAG / MAX_DEP_OUT_COUNT);
}

#if !defined(PARSEC_MPI_IS_GPU_AWARE)
parsec_mpi_allow_gpu_memory_communications = 0;
#endif
parsec_mca_param_reg_int_name("mpi", "gpu_aware",
"Enabled if PaRSEC should allow MPI to move data directly from or to GPU memory. Otherwise, all data"
" movements will transit through CPU memory, and will always have a backup copy there. Accepted values "
"are ORed between 1 for receiving into GPU memory and 2 for sending from GPU memory",
false, false, parsec_mpi_allow_gpu_memory_communications, &parsec_mpi_allow_gpu_memory_communications);
(void)context;
return 0;
}
Expand Down
4 changes: 2 additions & 2 deletions parsec/remote_dep_mpi.c
Original file line number Diff line number Diff line change
Expand Up @@ -2110,8 +2110,8 @@ static void remote_dep_mpi_get_start(parsec_execution_stream_t* es,
/* prepare the local receiving data */
assert(NULL == deps->output[k].data.data); /* we do not support in-place tiles now, make sure it doesn't happen yet */
if(NULL == deps->output[k].data.data) {
deps->output[k].data.data = remote_dep_copy_allocate(&deps->output[k].data.remote,
deps->output[k].data.preferred_device);
int best_device = (parsec_mpi_allow_gpu_memory_communications & PARSEC_RUNTIME_RECV_FROM_GPU_MEMORY) ? deps->output[k].data.preferred_device : 0;
deps->output[k].data.data = remote_dep_copy_allocate(&deps->output[k].data.remote, best_device);
}
/* Mark the data as under transfer */
deps->output[k].data.data->data_transfer_status = PARSEC_DATA_STATUS_UNDER_TRANSFER;
Expand Down

0 comments on commit b998764

Please sign in to comment.