Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add support for batched tasks and for CUDA-aware communications #4

Open
wants to merge 14 commits into
base: master
Choose a base branch
from
Open
11 changes: 11 additions & 0 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -165,6 +165,9 @@ option(PARSEC_DIST_WITH_MPI
if(PARSEC_DIST_WITH_MPI AND 0)
message(FATAL_ERROR "PARSEC_DIST_WITH_MPI and PARSEC_DIST_WITH_OTHER are mutually exclusive, please select only one")
endif()
option(PARSEC_MPI_IS_GPU_AWARE
"Build PaRSEC assuming the MPI library is GPU-aware, aka. can move data directly to and from GPU memory.\
As of today (mid 2024), while most MPI libraries support such an option, they require a single process per GPU" ON)
option(PARSEC_DIST_THREAD
"Use an extra thread to progress the data movements" ON)
option(PARSEC_DIST_PRIORITIES
Expand All @@ -181,6 +184,8 @@ option(PARSEC_GPU_ALLOC_PER_TILE
mark_as_advanced(PARSEC_GPU_ALLOC_PER_TILE)
option(PARSEC_GPU_WITH_CUDA
"Enable GPU support using CUDA kernels" ON)
option(PARSEC_GPU_WITH_CUDA_BATCH
"Enable the runtime support for batched kernels" ON)
option(PARSEC_GPU_WITH_HIP
"Enable GPU support using HIP kernels" ON)
option(PARSEC_GPU_WITH_LEVEL_ZERO
Expand Down Expand Up @@ -729,6 +734,12 @@ int main(int argc, char *argv[]) {
endif (CUDAToolkit_FOUND)
set(PARSEC_HAVE_CU_COMPILER ${CMAKE_CUDA_COMPILER} CACHE BOOL "True if PaRSEC provide support for compiling .cu files")
endif( PARSEC_GPU_WITH_CUDA )
if( PARSEC_GPU_WITH_CUDA_BATCH )
if( NOT PARSEC_HAVE_CUDA)
message(FATAL_ERROR "PARSEC_GPU_WITH_CUDA_BATCH requires PARSEC_GPU_WITH_CUDA. Enable both or none")
endif( NOT PARSEC_HAVE_CUDA)
set(PARSEC_HAVE_CUDA_BATCH True CACHE BOOL "True if support for batched CUDA has been enabled")
endif( PARSEC_GPU_WITH_CUDA_BATCH )

if( PARSEC_GPU_WITH_HIP )
# This is kinda ugly but the PATH and HINTS don't get transmitted to sub-dependents
Expand Down
4 changes: 4 additions & 0 deletions cmake_modules/PaRSECConfig.cmake.in
Original file line number Diff line number Diff line change
Expand Up @@ -65,6 +65,10 @@ endif(@PARSEC_DIST_WITH_MPI@)
if(@PARSEC_HAVE_CUDA@)
find_package(CUDAToolkit REQUIRED)
set(PARSEC_HAVE_CUDA TRUE)

if(@PARSEC_HAVE_CUDA_BATCH@)
set(PARSEC_HAVE_CUDA_BATCH TRUE)
endif(@PARSEC_HAVE_CUDA_BATCH@)
endif(@PARSEC_HAVE_CUDA@)

if(@PARSEC_HAVE_HIP@)
Expand Down
1 change: 1 addition & 0 deletions parsec/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -238,6 +238,7 @@ if( BUILD_PARSEC )
$<$<BOOL:${PARSEC_HAVE_OTF2}>:OTF2::OTF2>
$<$<BOOL:${MPI_C_FOUND}>:MPI::MPI_C>
$<$<BOOL:${PARSEC_HAVE_CUDA}>:CUDA::cudart>
$<$<BOOL:${PARSEC_HAVE_CUDA}>:cuda>
$<$<BOOL:${PARSEC_HAVE_HIP}>:hip::host>
${EXTRA_LIBS}
INTERFACE
Expand Down
123 changes: 99 additions & 24 deletions parsec/arena.c
Original file line number Diff line number Diff line change
Expand Up @@ -235,43 +235,118 @@ int parsec_arena_allocate_device_private(parsec_data_copy_t *copy,
return PARSEC_SUCCESS;
}

parsec_data_copy_t *parsec_arena_get_copy(parsec_arena_t *arena,
size_t count, int device,
parsec_datatype_t dtt)
#include "parsec/utils/zone_malloc.h"
#include "mca/device/device_gpu.h"

#if defined(PARSEC_DEBUG)
static int64_t parsec_countable_incoming_message = 0xF000000000000000;
#endif /* defined(PARSEC_DEBUG) */

static inline parsec_data_copy_t *
parsec_arena_internal_copy_new(parsec_arena_t *arena,
parsec_data_t *data,
size_t count, int device,
parsec_datatype_t dtt)
{
parsec_data_t *data;
parsec_data_copy_t *copy;
int rc;


data = parsec_data_new();
parsec_data_copy_t *copy = NULL;
parsec_data_t* ldata = data;
if( NULL == data ) {
ldata = parsec_data_new();
if( NULL == ldata ) {
return NULL;
}
#if defined(PARSEC_DEBUG)
/* Name the data with a default key to facilitate debugging */
ldata->key = (uint64_t)parsec_atomic_fetch_inc_int64(&parsec_countable_incoming_message);
ldata->key |= ((uint64_t)device) << 56;
#endif /* defined(PARSEC_DEBUG) */
}
if( 0 == device ) {
copy = parsec_data_copy_new(ldata, device, dtt,
PARSEC_DATA_FLAG_PARSEC_OWNED | PARSEC_DATA_FLAG_PARSEC_MANAGED | PARSEC_DATA_FLAG_ARENA);
if (NULL == copy) {
goto free_and_return;
}
int rc = parsec_arena_allocate_device_private(copy, arena, count, device, dtt);
if (PARSEC_SUCCESS != rc) {
goto free_and_return;
}
return copy;
}
/**
* This part is not really nice, it breaks the separation between devices, and how their memory is
* managed. But, it should give nice performance improvements if the communication layer is
* capable of sending or receiving data directly to and from the accelerator memory. The only drawback
* is that once the GPU memory is full, this will fail, so the software will fall back to the
* prior behavior, going through the CPU memory.
*
* The zone deallocation is not symmetric, it will happen in the GPU management, when the data copies
* are released from the different LRU lists.
*/
parsec_device_gpu_module_t *gpu_device = (parsec_device_gpu_module_t *)parsec_mca_device_get(device);
if (NULL == gpu_device) {
return NULL;
}
size_t size = count * arena->elem_size;
void* device_private = zone_malloc(gpu_device->memory, size);
if( NULL == device_private ) {
PARSEC_DEBUG_VERBOSE(10, parsec_debug_output, "Arena:\tallocate data copy on device %d of size %zu from zone %p failed (out of memory)\n",
device, size, (void *)copy->arena_chunk);
goto free_and_return;
}
copy = parsec_data_copy_new(ldata, device, dtt,
PARSEC_DATA_FLAG_PARSEC_OWNED | PARSEC_DATA_FLAG_PARSEC_MANAGED);
if (NULL == copy) {
PARSEC_DEBUG_VERBOSE(10, parsec_debug_output, "Arena:\tallocate data copy on device %d of size %zu from zone %p failed to allocate copy (out of memory)\n",
device, size, (void *)copy->arena_chunk);
zone_free(gpu_device->memory, device_private);
goto free_and_return;
}
copy->dtt = dtt;
copy->device_private = device_private;
copy->arena_chunk = (parsec_arena_chunk_t*)gpu_device->memory;
PARSEC_DEBUG_VERBOSE(10, parsec_debug_output, "Arena:\tallocate data copy on device %d of size %zu from zone %p, "
"data ptr %p",
device, size, (void*)copy->arena_chunk, (void*)copy->device_private);
copy->version = 0;
copy->coherency_state = PARSEC_DATA_COHERENCY_INVALID;
copy->original->owner_device = device;
copy->original->preferred_device = device;
return copy;
free_and_return:
if( NULL != copy )
PARSEC_OBJ_RELEASE(copy);
if( NULL == data)
PARSEC_OBJ_RELEASE(ldata); /* release the locally allocated data */
return NULL;
}

copy = parsec_data_copy_new( data, device, dtt,
PARSEC_DATA_FLAG_ARENA |
PARSEC_DATA_FLAG_PARSEC_OWNED |
PARSEC_DATA_FLAG_PARSEC_MANAGED);
parsec_data_copy_t *
parsec_arena_get_new_copy(parsec_arena_t *arena,
size_t count, int device,
parsec_datatype_t dtt)
{
parsec_data_copy_t *dev0_copy, *copy;

if(NULL == copy) {
PARSEC_OBJ_RELEASE(data);
dev0_copy = parsec_arena_internal_copy_new(arena, NULL, count, 0 /* first allocate the copy on the device 0 */, dtt);
if( NULL == dev0_copy ) {
return NULL;
}
dev0_copy->coherency_state = PARSEC_DATA_COHERENCY_INVALID;
dev0_copy->version = 0; /* start from somewhere */
if( 0 == device ) {
return dev0_copy;
}

rc = parsec_arena_allocate_device_private(copy, arena, count, device, dtt);

copy = parsec_arena_internal_copy_new(arena, dev0_copy->original, count, device, dtt);
if( NULL == copy ) {
copy = dev0_copy; /* return the main memory data copy */
}
/* This data is going to be released once all copies are released
* It does not exist without at least a copy, and we don't give the
* pointer to the user, so we must remove our retain from it
*/
PARSEC_OBJ_RELEASE(data);

if( PARSEC_SUCCESS != rc ) {
PARSEC_OBJ_RELEASE(copy);
return NULL;
}

PARSEC_OBJ_RELEASE(dev0_copy->original);
return copy;
}

Expand Down
10 changes: 5 additions & 5 deletions parsec/arena.h
Original file line number Diff line number Diff line change
Expand Up @@ -133,15 +133,15 @@ int parsec_arena_construct_ex(parsec_arena_t* arena,
* enough resource to allocate a new data copy of this type.
*/

parsec_data_copy_t *parsec_arena_get_copy(parsec_arena_t *arena,
size_t count, int device,
parsec_datatype_t dtt);
parsec_data_copy_t *parsec_arena_get_new_copy(parsec_arena_t *arena,
size_t count, int device,
parsec_datatype_t dtt);

/**
* @brief Allocates memory for a given data copy. This is a function used by
* DSLs to set the memory associated with a data copy they have created.
* It is also used by parsec_arena_get_copy.
*
* It is also used by parsec_arena_get_new_copy.
*
* @param copy the (empty) data copy to allocate memory for. NB: the @p original
* field of this data copy must be set. The operation overwrites the device
* dtt and count of this data copy, as well as the device_private pointer.
Expand Down
6 changes: 4 additions & 2 deletions parsec/class/info.c
Original file line number Diff line number Diff line change
Expand Up @@ -217,7 +217,7 @@ void parsec_info_object_array_init(parsec_info_object_array_t *oa, parsec_info_t
if(oa->known_infos == 0)
oa->info_objects = NULL;
else
oa->info_objects = calloc(sizeof(void*), oa->known_infos);
oa->info_objects = calloc(oa->known_infos, sizeof(void*));
oa->infos = nfo;
oa->cons_obj = cons_obj;
}
Expand Down Expand Up @@ -265,7 +265,7 @@ static void parsec_ioa_resize_and_rdlock(parsec_info_object_array_t *oa, parsec_
oa->info_objects = realloc(oa->info_objects, sizeof(void *) * ns);
memset(&oa->info_objects[oa->known_infos - 1], 0, ns - oa->known_infos);
} else {
oa->info_objects = calloc(sizeof(void*), ns);
oa->info_objects = calloc(ns, sizeof(void*));
}
oa->known_infos = ns;
}
Expand Down Expand Up @@ -312,6 +312,8 @@ void *parsec_info_get(parsec_info_object_array_t *oa, parsec_info_id_t iid)
if(NULL == ie->constructor)
return ret;
nio = ie->constructor(oa->cons_obj, ie->cons_data);
if( NULL == nio )
return ret;
ret = parsec_info_test_and_set(oa, iid, nio, NULL);
if(ret != nio && NULL != ie->destructor) {
ie->destructor(nio, ie->des_data);
Expand Down
Loading
Loading