Skip to content

Commit

Permalink
Transfer data to and from GPU.
Browse files Browse the repository at this point in the history
This is a multi-part patch that allows the CPU to prepare a data copy
mapped onto a device.

1. The first question is: how is such a device selected?

The allocation of such a copy happens well before the scheduler is invoked
for a task, in fact before the task is even ready. Thus, we need to
decide on the location of this copy based only on static information,
such as the task affinity. Therefore, this approach only works for
owner-compute types of tasks, where the task will be executed on the
device that owns the data used for the task affinity.

2. Pass the correct data copy across the entire system, instead of
   falling back to the data copy of device 0 (CPU memory).

Signed-off-by: George Bosilca <gbosilca@nvidia.com>
  • Loading branch information
bosilca committed Aug 5, 2024
1 parent 9079ec6 commit 2c499e6
Show file tree
Hide file tree
Showing 23 changed files with 637 additions and 263 deletions.
1 change: 1 addition & 0 deletions parsec/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -238,6 +238,7 @@ if( BUILD_PARSEC )
$<$<BOOL:${PARSEC_HAVE_OTF2}>:OTF2::OTF2>
$<$<BOOL:${MPI_C_FOUND}>:MPI::MPI_C>
$<$<BOOL:${PARSEC_HAVE_CUDA}>:CUDA::cudart>
$<$<BOOL:${PARSEC_HAVE_CUDA}>:cuda>
$<$<BOOL:${PARSEC_HAVE_HIP}>:hip::host>
${EXTRA_LIBS}
INTERFACE
Expand Down
114 changes: 90 additions & 24 deletions parsec/arena.c
Original file line number Diff line number Diff line change
Expand Up @@ -235,43 +235,109 @@ int parsec_arena_allocate_device_private(parsec_data_copy_t *copy,
return PARSEC_SUCCESS;
}

parsec_data_copy_t *parsec_arena_get_copy(parsec_arena_t *arena,
size_t count, int device,
parsec_datatype_t dtt)
#include "parsec/utils/zone_malloc.h"
#include "mca/device/device_gpu.h"

/**
 * @brief Create a data copy holding @p count arena elements on @p device.
 *
 * For device 0 (main memory) the storage comes from the arena through
 * parsec_arena_allocate_device_private(). For any other device the storage is
 * carved directly out of that device's memory zone with zone_malloc().
 *
 * @param arena  arena providing the element size (and the storage on device 0).
 * @param data   data the copy is attached to. When NULL, a new parsec_data_t
 *               is created for the copy, and released again if anything fails.
 * @param count  number of arena elements to allocate.
 * @param device index of the device that will hold the copy.
 * @param dtt    datatype recorded in the copy.
 * @return the new copy, or NULL when the device cannot be found or an
 *         allocation fails.
 */
static inline parsec_data_copy_t *
parsec_arena_internal_copy_new(parsec_arena_t *arena,
                               parsec_data_t *data,
                               size_t count, int device,
                               parsec_datatype_t dtt)
{
    parsec_data_copy_t *copy = NULL;
    parsec_data_t *ldata = data;
    if( NULL == data ) {
        ldata = parsec_data_new();
        if( NULL == ldata ) {
            return NULL;
        }
    }
    if( 0 == device ) {
        copy = parsec_data_copy_new(ldata, device, dtt,
                                    PARSEC_DATA_FLAG_PARSEC_OWNED | PARSEC_DATA_FLAG_PARSEC_MANAGED | PARSEC_DATA_FLAG_ARENA);
        if( NULL == copy ) {
            goto free_and_return;
        }
        int rc = parsec_arena_allocate_device_private(copy, arena, count, device, dtt);
        if( PARSEC_SUCCESS != rc ) {
            goto free_and_return;
        }
        return copy;
    }
    /**
     * This part is not really nice: it breaks the separation between devices and how their memory
     * is managed. But it should give nice performance improvements if the communication layer is
     * capable of sending or receiving data directly to and from the accelerator memory. The only
     * drawback is that once the GPU memory is full this will fail, so the software will fall back
     * to the prior behavior, going through the CPU memory.
     *
     * The zone deallocation is not symmetric: it will happen in the GPU management, when the data
     * copies are released from the different LRU lists.
     */
    parsec_device_gpu_module_t *gpu_device = (parsec_device_gpu_module_t *)parsec_mca_device_get(device);
    if( NULL == gpu_device ) {
        /* Leave through the common exit so a locally created data is not leaked. */
        goto free_and_return;
    }
    size_t size = count * arena->elem_size;
    void *device_private = zone_malloc(gpu_device->memory, size);
    if( NULL == device_private ) {
        /* No copy exists yet: report the zone through the device, not through the copy. */
        PARSEC_DEBUG_VERBOSE(10, parsec_debug_output, "Arena:\tallocate data copy on device %d of size %zu from zone %p failed (out of memory)\n",
                             device, size, (void *)gpu_device->memory);
        goto free_and_return;
    }
    copy = parsec_data_copy_new(ldata, device, dtt,
                                PARSEC_DATA_FLAG_PARSEC_OWNED | PARSEC_DATA_FLAG_PARSEC_MANAGED);
    if( NULL == copy ) {
        /* copy is NULL here, so it must not be dereferenced for the message. */
        PARSEC_DEBUG_VERBOSE(10, parsec_debug_output, "Arena:\tallocate data copy on device %d of size %zu from zone %p failed to allocate copy (out of memory)\n",
                             device, size, (void *)gpu_device->memory);
        zone_free(gpu_device->memory, device_private);
        goto free_and_return;
    }
    copy->dtt = dtt;
    copy->device_private = device_private;
    /* For device copies arena_chunk records the zone the memory was taken from. */
    copy->arena_chunk = (parsec_arena_chunk_t *)gpu_device->memory;
    PARSEC_DEBUG_VERBOSE(10, parsec_debug_output, "Arena:\tallocate data copy on device %d of size %zu from zone %p, "
                         "data ptr %p",
                         device, size, (void *)copy->arena_chunk, (void *)copy->device_private);
    copy->version = 0;
    copy->coherency_state = PARSEC_DATA_COHERENCY_INVALID;
    copy->original->owner_device = device;
    copy->original->preferred_device = device;
    return copy;

  free_and_return:
    if( NULL != copy ) {
        PARSEC_OBJ_RELEASE(copy);
    }
    if( NULL == data ) {
        PARSEC_OBJ_RELEASE(ldata);  /* release the locally allocated data */
    }
    return NULL;
}

copy = parsec_data_copy_new( data, device, dtt,
PARSEC_DATA_FLAG_ARENA |
PARSEC_DATA_FLAG_PARSEC_OWNED |
PARSEC_DATA_FLAG_PARSEC_MANAGED);
/**
 * @brief Create a new data, with a copy on device 0 and, when requested, a
 *        second copy on @p device.
 *
 * The device 0 copy is always created first. When @p device is not 0 a copy
 * on that device is attempted as well; if that fails the device 0 copy is
 * returned instead.
 *
 * @param arena  arena used to size and allocate the copies.
 * @param count  number of arena elements.
 * @param device device on which the returned copy is preferably located.
 * @param dtt    datatype recorded in the copies.
 * @return the copy on @p device, the device 0 copy as a fallback, or NULL if
 *         even the device 0 copy could not be allocated.
 */
parsec_data_copy_t *
parsec_arena_get_new_copy(parsec_arena_t *arena,
                          size_t count, int device,
                          parsec_datatype_t dtt)
{
    parsec_data_copy_t *dev0_copy, *copy;

    dev0_copy = parsec_arena_internal_copy_new(arena, NULL, count, 0 /* first allocate the copy on the device 0 */, dtt);
    if( NULL == dev0_copy ) {
        return NULL;
    }
    dev0_copy->coherency_state = PARSEC_DATA_COHERENCY_INVALID;
    dev0_copy->version = 0;  /* start from somewhere */
    if( 0 == device ) {
        /* NOTE(review): this path returns without the PARSEC_OBJ_RELEASE of the
         * data performed at the end of the function - confirm that keeping the
         * extra reference here is intended. */
        return dev0_copy;
    }

    copy = parsec_arena_internal_copy_new(arena, dev0_copy->original, count, device, dtt);
    if( NULL == copy ) {
        copy = dev0_copy;  /* return the main memory data copy */
    }
    /* This data is going to be released once all copies are released.
     * It does not exist without at least a copy, and we don't give the
     * pointer to the user, so we must remove our retain from it.
     */
    PARSEC_OBJ_RELEASE(dev0_copy->original);
    return copy;
}

Expand Down
10 changes: 5 additions & 5 deletions parsec/arena.h
Original file line number Diff line number Diff line change
Expand Up @@ -133,15 +133,15 @@ int parsec_arena_construct_ex(parsec_arena_t* arena,
* enough resource to allocate a new data copy of this type.
*/

parsec_data_copy_t *parsec_arena_get_copy(parsec_arena_t *arena,
size_t count, int device,
parsec_datatype_t dtt);
parsec_data_copy_t *parsec_arena_get_new_copy(parsec_arena_t *arena,
size_t count, int device,
parsec_datatype_t dtt);

/**
* @brief Allocates memory for a given data copy. This is a function used by
* DSLs to set the memory associated with a data copy they have created.
* It is also used by parsec_arena_get_copy.
*
* It is also used by parsec_arena_get_new_copy.
*
* @param copy the (empty) data copy to allocate memory for. NB: the @p original
* field of this data copy must be set. The operation overwrites the device
* dtt and count of this data copy, as well as the device_private pointer.
Expand Down
106 changes: 84 additions & 22 deletions parsec/data.c
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@
#include "parsec/sys/atomic.h"
#include "parsec/remote_dep.h"
#include "parsec/parsec_internal.h"
#include "parsec/utils/zone_malloc.h"

static parsec_lifo_t parsec_data_lifo;
static parsec_lifo_t parsec_data_copies_lifo;
Expand Down Expand Up @@ -65,6 +66,7 @@ static void parsec_data_construct(parsec_data_t* obj )
obj->preferred_device = -1;
obj->key = 0;
obj->nb_elts = 0;
obj->nb_copies = 0;
for( uint32_t i = 0; i < parsec_nb_devices;
obj->device_copies[i] = NULL, i++ );
obj->dc = NULL;
Expand Down Expand Up @@ -99,11 +101,12 @@ static void parsec_data_destruct(parsec_data_t* obj )
* GPU copies are normally stored in LRU lists, and must be
* destroyed by the release list to free the memory on the device
*/
PARSEC_OBJ_RELEASE( copy );
PARSEC_DATA_COPY_RELEASE(copy);
}
}
assert(NULL == obj->device_copies[i]);
}
assert(0 == obj->nb_copies);
}

PARSEC_OBJ_CLASS_INSTANCE(parsec_data_t, parsec_object_t,
Expand Down Expand Up @@ -161,8 +164,8 @@ void parsec_data_delete(parsec_data_t* data)

inline int
parsec_data_copy_attach(parsec_data_t* data,
parsec_data_copy_t* copy,
uint8_t device)
parsec_data_copy_t* copy,
uint8_t device)
{
assert(NULL == copy->original);
assert(NULL == copy->older);
Expand All @@ -175,6 +178,7 @@ parsec_data_copy_attach(parsec_data_t* data,
copy->older = NULL;
return PARSEC_ERROR;
}
parsec_atomic_fetch_add_int32(&data->nb_copies, 1);
PARSEC_OBJ_RETAIN(data);
return PARSEC_SUCCESS;
}
Expand All @@ -192,6 +196,7 @@ int parsec_data_copy_detach(parsec_data_t* data,
return PARSEC_ERR_NOT_FOUND;
}
data->device_copies[device] = copy->older;
parsec_atomic_fetch_add_int32(&data->nb_copies, -1);

copy->original = NULL;
copy->older = NULL;
Expand Down Expand Up @@ -221,7 +226,7 @@ parsec_data_copy_t* parsec_data_copy_new(parsec_data_t* data, uint8_t device,
}
copy->flags = flags;
if( PARSEC_SUCCESS != parsec_data_copy_attach(data, copy, device) ) {
PARSEC_OBJ_RELEASE(copy);
PARSEC_DATA_COPY_RELEASE(copy);
return NULL;
}
copy->dtt = dtt;
Expand Down Expand Up @@ -330,6 +335,12 @@ int parsec_data_start_transfer_ownership_to_copy(parsec_data_t* data,
copy = data->device_copies[device];
assert( NULL != copy );

if( valid_copy == device ) {
PARSEC_DEBUG_VERBOSE(10, parsec_debug_output,
"DEV[%d]: already has ownership of data %p to copy %p in mode %d",
device, data, copy, access_mode);
goto bookkeeping;
}
PARSEC_DEBUG_VERBOSE(10, parsec_debug_output,
"DEV[%d]: start transfer ownership of data %p to copy %p in mode %d",
device, data, copy, access_mode);
Expand Down Expand Up @@ -417,6 +428,7 @@ int parsec_data_start_transfer_ownership_to_copy(parsec_data_t* data,
}
}

bookkeeping:
if( PARSEC_FLOW_ACCESS_READ & access_mode ) {
copy->readers++;
}
Expand All @@ -435,40 +447,52 @@ int parsec_data_start_transfer_ownership_to_copy(parsec_data_t* data,
return valid_copy;
}

/**
 * Map a coherency state to the one-letter code used in debug dumps:
 * 'I' invalid, 'O' owned, 'E' exclusive, 'S' shared, and 'X' for any
 * other value.
 */
static char dump_coherency_codex(parsec_data_coherency_t state)
{
    /* Checked in order; the first matching entry wins. */
    const struct {
        parsec_data_coherency_t value;
        char                    letter;
    } known_states[] = {
        { PARSEC_DATA_COHERENCY_INVALID,   'I' },
        { PARSEC_DATA_COHERENCY_OWNED,     'O' },
        { PARSEC_DATA_COHERENCY_EXCLUSIVE, 'E' },
        { PARSEC_DATA_COHERENCY_SHARED,    'S' },
    };
    const unsigned int nb_known = (unsigned int)(sizeof(known_states) / sizeof(known_states[0]));

    for( unsigned int k = 0; k < nb_known; k++ ) {
        if( known_states[k].value == state ) {
            return known_states[k].letter;
        }
    }
    return 'X';
}

/**
 * Print a two-line description of a data copy through parsec_debug_verbose():
 * device index, reference count, coherency state, readers, version, transfer
 * status, flags, and the older/original/arena/device_private pointers.
 * The line is prefixed with "*" when the copy's device is the owner device of
 * its original data.
 *
 * @param copy the data copy to describe.
 */
void parsec_data_copy_dump(parsec_data_copy_t* copy)
{
    /* These only ever point to string literals, hence const. */
    const char *transfer = "---", *coherency = "undef";
    /* One slot per flag: Arena, Transit, Managed, Owned; '-' when unset. */
    char flags[] = "----";

    switch(copy->data_transfer_status) {
    case PARSEC_DATA_STATUS_NOT_TRANSFER:      transfer = "no";  break;
    case PARSEC_DATA_STATUS_UNDER_TRANSFER:    transfer = "yes"; break;
    case PARSEC_DATA_STATUS_COMPLETE_TRANSFER: transfer = "no";  break;
    default: break;  /* unknown status: keep the "---" placeholder */
    }
    if( copy->flags & PARSEC_DATA_FLAG_ARENA )          flags[0] = 'A';
    if( copy->flags & PARSEC_DATA_FLAG_TRANSIT )        flags[1] = 'T';
    if( copy->flags & PARSEC_DATA_FLAG_PARSEC_MANAGED ) flags[2] = 'M';
    if( copy->flags & PARSEC_DATA_FLAG_PARSEC_OWNED )   flags[3] = 'O';

    if( PARSEC_DATA_COHERENCY_INVALID   == copy->coherency_state ) coherency = "invalid";
    if( PARSEC_DATA_COHERENCY_OWNED     == copy->coherency_state ) coherency = "owned";
    if( PARSEC_DATA_COHERENCY_EXCLUSIVE == copy->coherency_state ) coherency = "exclusive";
    if( PARSEC_DATA_COHERENCY_SHARED    == copy->coherency_state ) coherency = "shared";

    /* %p requires a void* argument, hence the explicit casts. */
    parsec_debug_verbose(0, 0, "%s [%d]: copy %p [ref %d] coherency %s readers %d version %u transit %s flags %s\n"
                         " older %p orig %p arena %p dev_priv %p\n",
                         ((NULL != copy->original) && (copy->original->owner_device == copy->device_index)) ? "*" : " ",
                         (int)copy->device_index, (void *)copy, copy->super.super.obj_reference_count, coherency, copy->readers, copy->version, transfer, flags,
                         (void *)copy->older, (void *)copy->original, (void *)copy->arena_chunk, copy->device_private);
}

/**
 * Print a description of a data (reference count, key, owner and preferred
 * devices, number of copies, data collection and element count) followed by
 * a description of the copy registered for each device, if any.
 *
 * @param data the data to describe.
 */
void parsec_data_dump(parsec_data_t* data)
{
    /* NOTE(review): "%lu" assumes data->key is an unsigned long - confirm against its declaration. */
    parsec_debug_verbose(0, 0, "data %p [ref %d] key %lu owner dev %d pref dev %d copies %d dc %p [# elems %zu]\n",
                         (void *)data, data->super.obj_reference_count, data->key, data->owner_device, data->preferred_device, data->nb_copies,
                         (void *)data->dc, data->nb_elts);

    for( uint32_t i = 0; i < parsec_nb_devices; i++ ) {
        if( NULL != data->device_copies[i] ) {
            parsec_data_copy_dump(data->device_copies[i]);
        }
    }
}

/**
 * Function form of the PARSEC_DATA_GET_COPY macro.
 *
 * @param data   the data to query.
 * @param device the device index passed to PARSEC_DATA_GET_COPY.
 * @return whatever PARSEC_DATA_GET_COPY(data, device) evaluates to.
 */
parsec_data_copy_t*
parsec_data_get_copy(parsec_data_t* data, uint32_t device)
{
    return PARSEC_DATA_GET_COPY(data, device);
}

/**
 * Function form of the PARSEC_DATA_COPY_RELEASE macro.
 *
 * @param copy the data copy to release.
 */
void parsec_data_copy_release(parsec_data_copy_t* copy)
{
/* TODO: Move the copy back to the CPU before destroying it */
PARSEC_DATA_COPY_RELEASE(copy);
}

Expand Down Expand Up @@ -509,7 +533,7 @@ parsec_data_create( parsec_data_t **holder,

if( !parsec_atomic_cas_ptr(holder, NULL, data) ) {
parsec_data_copy_detach(data, data_copy, 0);
PARSEC_OBJ_RELEASE(data_copy);
PARSEC_DATA_COPY_RELEASE(data_copy);
data = *holder;
}
} else {
Expand Down Expand Up @@ -560,3 +584,41 @@ parsec_data_destroy( parsec_data_t *data )
#endif
PARSEC_OBJ_RELEASE(data);
}

#include "parsec/utils/debug.h"

/**
 * @brief Release a data that is kept alive only by its own copies.
 *
 * Nothing is done unless the reference count of @p data equals its number of
 * copies and every copy at the head of a device list has a reference count
 * of 1.
 *
 * @param data the data to examine.
 * @return 1 if the copies were released/detached, 0 if the data or one of its
 *         copies is still referenced from elsewhere and nothing was done.
 */
int parsec_data_release_self_contained_data(parsec_data_t *data)
{
    if( data->super.obj_reference_count != data->nb_copies ) return 0;
    parsec_data_copy_t *copy;
    PARSEC_DEBUG_VERBOSE(1, parsec_debug_output, "Examine the status of data %p with %d copies and refcounts at %s:%d\n",
                         (void *)data, data->nb_copies, __FILE__, __LINE__);
    /* This data is only referenced by its own copies. If these copies are also only referenced by
     * the data, then we can release them all.
     */
    for( uint32_t i = 0; i < parsec_nb_devices; i++ ) {
        if( NULL == (copy = data->device_copies[i]) ) continue;
        if( copy->super.super.obj_reference_count > 1 )
            return 0;
    }
    /* The message is about the data itself; the former `(COPY)` argument named no existing identifier. */
    PARSEC_DEBUG_VERBOSE(1, parsec_debug_output, "Force the release of data %p at %s:%d", (void *)data, __FILE__, __LINE__);
    for( uint32_t i = 0; i < parsec_nb_devices; i++ ) {
        if( NULL == (copy = data->device_copies[i]) ) continue;
        assert(1 == copy->super.super.obj_reference_count);
        if( 0 == copy->device_index ) {
            PARSEC_OBJ_RELEASE(copy);
            /* Dropping the last reference is expected to reset the pointer. */
            assert(NULL == copy);
        } else {
            /* Do not release data copies that do not belong to the CPU or really bad things will happen.
             * Only the device manager can release these copies; the best we can do here is to detach them
             * from the data and release their device memory.
             */
            parsec_data_copy_detach(data, copy, copy->device_index);
            /* For these copies arena_chunk is used as the zone the memory was allocated from. */
            zone_free((zone_malloc_t *)copy->arena_chunk, copy->device_private);
            copy->device_private = NULL;
            copy->arena_chunk = NULL;
        }
    }
    return 1;
}

10 changes: 5 additions & 5 deletions parsec/data.h
Original file line number Diff line number Diff line change
Expand Up @@ -31,9 +31,9 @@ typedef uint8_t parsec_data_coherency_t;
#define PARSEC_DATA_COHERENCY_SHARED ((parsec_data_coherency_t)0x4)

typedef uint8_t parsec_data_status_t;
#define PARSEC_DATA_STATUS_NOT_TRANSFER ((parsec_data_coherency_t)0x0)
#define PARSEC_DATA_STATUS_UNDER_TRANSFER ((parsec_data_coherency_t)0x1)
#define PARSEC_DATA_STATUS_COMPLETE_TRANSFER ((parsec_data_coherency_t)0x2)
#define PARSEC_DATA_STATUS_NOT_TRANSFER ((parsec_data_status_t)0x0)
#define PARSEC_DATA_STATUS_UNDER_TRANSFER ((parsec_data_status_t)0x1)
#define PARSEC_DATA_STATUS_COMPLETE_TRANSFER ((parsec_data_status_t)0x2)
/**
* Data copies have three levels of 'ownership':
* - a data copy can be owned and managed by PaRSEC.
Expand Down Expand Up @@ -124,8 +124,8 @@ PARSEC_DECLSPEC void
parsec_data_end_transfer_ownership_to_copy(parsec_data_t* data,
uint8_t device,
uint8_t access_mode);
PARSEC_DECLSPEC void parsec_dump_data_copy(parsec_data_copy_t* copy);
PARSEC_DECLSPEC void parsec_dump_data(parsec_data_t* copy);
PARSEC_DECLSPEC void parsec_data_copy_dump(parsec_data_copy_t *copy);
PARSEC_DECLSPEC void parsec_data_dump(parsec_data_t* copy);

PARSEC_DECLSPEC parsec_data_t *
parsec_data_create( parsec_data_t **holder,
Expand Down
Loading

0 comments on commit 2c499e6

Please sign in to comment.