Skip to content

Commit

Permalink
Transfer data to and from GPU.
Browse files Browse the repository at this point in the history
This is a multi-part patch that allows the CPU to prepare a data copy
mapped onto a device.

1. The first question is: how is such a device selected?

The allocation of such a copy happens well before the scheduler is invoked
for a task, in fact before the task is even ready. Thus, we need to
decide on the location of this copy based only on static
information, such as the task affinity. Therefore, this approach only
works for owner-computes tasks, where the task will be executed
on the device that owns the data used for the task affinity.

2. Pass the correct data copy across the entire system, instead of
   falling back to the data copy of device 0 (CPU memory).

Signed-off-by: George Bosilca <gbosilca@nvidia.com>
  • Loading branch information
bosilca committed Aug 1, 2024
1 parent 879fad3 commit 2eb2f04
Show file tree
Hide file tree
Showing 14 changed files with 320 additions and 91 deletions.
1 change: 1 addition & 0 deletions parsec/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -238,6 +238,7 @@ if( BUILD_PARSEC )
$<$<BOOL:${PARSEC_HAVE_OTF2}>:OTF2::OTF2>
$<$<BOOL:${MPI_C_FOUND}>:MPI::MPI_C>
$<$<BOOL:${PARSEC_HAVE_CUDA}>:CUDA::cudart>
$<$<BOOL:${PARSEC_HAVE_CUDA}>:cuda>
$<$<BOOL:${PARSEC_HAVE_HIP}>:hip::host>
${EXTRA_LIBS}
INTERFACE
Expand Down
90 changes: 68 additions & 22 deletions parsec/arena.c
Original file line number Diff line number Diff line change
Expand Up @@ -235,43 +235,89 @@ int parsec_arena_allocate_device_private(parsec_data_copy_t *copy,
return PARSEC_SUCCESS;
}

parsec_data_copy_t *parsec_arena_get_copy(parsec_arena_t *arena,
size_t count, int device,
parsec_datatype_t dtt)
#include "parsec/utils/zone_malloc.h"
#include "mca/device/device_gpu.h"

static inline parsec_data_copy_t *
parsec_arena_internal_copy_new(parsec_arena_t *arena,
parsec_data_t *data,
size_t count, int device,
parsec_datatype_t dtt)
{
parsec_data_t *data;
parsec_data_copy_t *copy;
int rc;


data = parsec_data_new();
if( NULL == data ) {
data = parsec_data_new();
if( NULL == data ) {
return NULL;
}
}
copy = parsec_data_copy_new(data, device, dtt,
PARSEC_DATA_FLAG_PARSEC_OWNED | PARSEC_DATA_FLAG_PARSEC_MANAGED);

if (NULL == copy) {
PARSEC_OBJ_RELEASE(data);
return NULL;
}
if( 0 == device ) {
copy->flags |= PARSEC_DATA_FLAG_ARENA; /* allocated from an arena */
int rc = parsec_arena_allocate_device_private(copy, arena, count, device, dtt);
if (PARSEC_SUCCESS != rc) {
PARSEC_OBJ_RELEASE(copy);
return NULL;
}
return copy;
}
parsec_device_gpu_module_t *gpu_device = (parsec_device_gpu_module_t*)parsec_mca_device_get(device);
if( NULL == gpu_device ) {
PARSEC_OBJ_RELEASE(copy);
return NULL;
}
size_t size = count * arena->elem_size;

copy = parsec_data_copy_new( data, device, dtt,
PARSEC_DATA_FLAG_ARENA |
PARSEC_DATA_FLAG_PARSEC_OWNED |
PARSEC_DATA_FLAG_PARSEC_MANAGED);
copy->dtt = dtt;
copy->device_private = zone_malloc(gpu_device->memory, size);
copy->arena_chunk = (parsec_arena_chunk_t*)gpu_device->memory;
PARSEC_DEBUG_VERBOSE(10, parsec_debug_output, "Arena:\tallocate data copy on device %d of size %zu from zone %p, "
"data ptr %p",
device, size, (void*)copy->arena_chunk, (void*)copy->device_private);

if(NULL == copy) {
PARSEC_OBJ_RELEASE(data);
if( NULL == copy->device_private ) {
PARSEC_OBJ_RELEASE(copy);
return NULL;
}
copy->version = 0;
copy->coherency_state = PARSEC_DATA_COHERENCY_INVALID;
copy->original->owner_device = device;
copy->original->preferred_device = device;
return copy;
}

rc = parsec_arena_allocate_device_private(copy, arena, count, device, dtt);
parsec_data_copy_t *parsec_arena_get_copy(parsec_arena_t *arena,
size_t count, int device,
parsec_datatype_t dtt)
{
parsec_data_copy_t *dev0_copy, *copy;

dev0_copy = parsec_arena_internal_copy_new(arena, NULL, count, 0 /* first allocate the copy on the device 0 */, dtt);
if( NULL == dev0_copy ) {
return NULL;
}
dev0_copy->coherency_state = PARSEC_DATA_COHERENCY_INVALID;
dev0_copy->version = 0; /* start from somewhere */
if( 0 == device ) {
return dev0_copy;
}

copy = parsec_arena_internal_copy_new(arena, dev0_copy->original, count, device, dtt);
if( NULL == copy ) {
PARSEC_OBJ_RELEASE(dev0_copy);
return NULL;
}
/* This data is going to be released once all copies are released
* It does not exist without at least a copy, and we don't give the
* pointer to the user, so we must remove our retain from it
*/
PARSEC_OBJ_RELEASE(data);

if( PARSEC_SUCCESS != rc ) {
PARSEC_OBJ_RELEASE(copy);
return NULL;
}

PARSEC_OBJ_RELEASE(dev0_copy->original);
return copy;
}

Expand Down
7 changes: 7 additions & 0 deletions parsec/data.c
Original file line number Diff line number Diff line change
Expand Up @@ -330,6 +330,12 @@ int parsec_data_start_transfer_ownership_to_copy(parsec_data_t* data,
copy = data->device_copies[device];
assert( NULL != copy );

if( valid_copy == device ) {
PARSEC_DEBUG_VERBOSE(10, parsec_debug_output,
"DEV[%d]: already has ownership of data %p to copy %p in mode %d",
device, data, copy, access_mode);
goto bookkeeping;
}
PARSEC_DEBUG_VERBOSE(10, parsec_debug_output,
"DEV[%d]: start transfer ownership of data %p to copy %p in mode %d",
device, data, copy, access_mode);
Expand Down Expand Up @@ -417,6 +423,7 @@ int parsec_data_start_transfer_ownership_to_copy(parsec_data_t* data,
}
}

bookkeeping:
if( PARSEC_FLOW_ACCESS_READ & access_mode ) {
copy->readers++;
}
Expand Down
6 changes: 3 additions & 3 deletions parsec/data.h
Original file line number Diff line number Diff line change
Expand Up @@ -31,9 +31,9 @@ typedef uint8_t parsec_data_coherency_t;
#define PARSEC_DATA_COHERENCY_SHARED ((parsec_data_coherency_t)0x4)

typedef uint8_t parsec_data_status_t;
#define PARSEC_DATA_STATUS_NOT_TRANSFER ((parsec_data_coherency_t)0x0)
#define PARSEC_DATA_STATUS_UNDER_TRANSFER ((parsec_data_coherency_t)0x1)
#define PARSEC_DATA_STATUS_COMPLETE_TRANSFER ((parsec_data_coherency_t)0x2)
#define PARSEC_DATA_STATUS_NOT_TRANSFER ((parsec_data_status_t)0x0)
#define PARSEC_DATA_STATUS_UNDER_TRANSFER ((parsec_data_status_t)0x1)
#define PARSEC_DATA_STATUS_COMPLETE_TRANSFER ((parsec_data_status_t)0x2)
/**
* Data copies have three levels of 'ownership':
* - a data copy can be owned and managed by PaRSEC.
Expand Down
1 change: 0 additions & 1 deletion parsec/data_dist/matrix/two_dim_rectangle_cyclic.h
Original file line number Diff line number Diff line change
Expand Up @@ -45,7 +45,6 @@ typedef struct parsec_matrix_block_cyclic {
* @param dc matrix description structure, already allocated, that will be initialized
* @param mtype type of data used for this matrix
* @param storage type of storage of data
* @param nodes number of nodes
* @param myrank rank of the local node (as of mpi rank)
* @param mb number of rows in a tile
* @param nb number of columns in a tile
Expand Down
19 changes: 16 additions & 3 deletions parsec/interfaces/ptg/ptg-compiler/jdf2c.c
Original file line number Diff line number Diff line change
Expand Up @@ -672,6 +672,17 @@ static char* dump_local_assignments( void** elem, void* arg )
if( dos > 0 ) {
string_arena_init(info->sa);
string_arena_add_string(info->sa, "const int %s = %s%s.value;", def->name, info->holder, def->name);
#if 0
jdf_expr_t* type_str = jdf_find_property( def->properties, "type", NULL );
if( NULL == type_str ) {
string_arena_add_string(info->sa, "const int %s = %s%s.value;", def->name, info->holder, def->name);
} else {
expr_info_t expr_info = {.sa = info->sa, .prefix = "", .suffix = "", .assignments = "locals"};
string_arena_add_string(info->sa, "const %s %s = %s%s.value;",
dump_expr((void**)type_str, &expr_info),
def->name, info->holder, def->name);
}
#endif
if( dos > 1 )
string_arena_add_string(info->sa, " (void)%s;", def->name);
return string_arena_get_string(info->sa);
Expand Down Expand Up @@ -5649,12 +5660,14 @@ jdf_generate_code_call_initialization(const jdf_t *jdf, const jdf_call_t *call,

/* Code to create & fulfill a reshape promise locally in case this input dependency is typed */
jdf_generate_code_reshape_input_from_dep(jdf, f, flow, dl, spaces);
coutput("%s this_task->data._f_%s.data_out = parsec_data_get_copy(chunk->original, target_device);\n"
"#if defined(PARSEC_PROF_GRAPHER) && defined(PARSEC_PROF_TRACE)\n"
/* We don't have to set the data_out anymore */
//coutput("%s this_task->data._f_%s.data_out = parsec_data_get_copy(chunk->original, target_device);\n",
// spaces, flow->varname);

coutput("#if defined(PARSEC_PROF_GRAPHER) && defined(PARSEC_PROF_TRACE)\n"
"%s parsec_prof_grapher_data_input(chunk->original, (parsec_task_t*)this_task, &%s, 0);\n"
"#endif\n"
"%s }\n",
spaces, flow->varname,
spaces, JDF_OBJECT_ONAME( flow ),
spaces);
}
Expand Down
Loading

0 comments on commit 2eb2f04

Please sign in to comment.