Skip to content

Commit

Permalink
Walk forward and backward when creating a w2r task
Browse files Browse the repository at this point in the history
Discarded data sits toward the end of the LRU, while the data
to be evicted is at the front. We walk the list both forward and backward
to collect the discarded data from the back, until we either meet the
pivot or have found enough data to evict. If any data was discarded, we
don't evict.

Signed-off-by: Joseph Schuchart <joseph.schuchart@stonybrook.edu>
  • Loading branch information
devreal committed Dec 23, 2024
1 parent ae94def commit 53b7721
Show file tree
Hide file tree
Showing 4 changed files with 99 additions and 41 deletions.
3 changes: 3 additions & 0 deletions parsec/mca/device/cuda/device_cuda_component.c
Original file line number Diff line number Diff line change
Expand Up @@ -161,6 +161,9 @@ static int device_cuda_component_register(void)
(void)parsec_mca_param_reg_int_name("device_cuda", "max_number_of_ejected_data",
"Sets up the maximum number of blocks that can be ejected from GPU memory",
false, false, MAX_PARAM_COUNT, &parsec_gpu_d2h_max_flows);
(void)parsec_mca_param_reg_int_name("device_cuda", "max_number_of_discarded_data",
"Sets up the maximum number of discarded blocks to be collected at once",
false, false, MAX_PARAM_COUNT, &parsec_gpu_d2h_max_discarded);
(void)parsec_mca_param_reg_int_name("device_cuda", "max_streams",
"Maximum number of Streams to use for the GPU engine; 2 streams are used for communication between host and device, so the minimum is 3",
false, false, PARSEC_GPU_MAX_STREAMS, &parsec_cuda_max_streams);
Expand Down
1 change: 1 addition & 0 deletions parsec/mca/device/device_gpu.h
Original file line number Diff line number Diff line change
Expand Up @@ -279,6 +279,7 @@ typedef struct parsec_gpu_workspace_s {
/* Output stream handed to the GPU debug traces (PARSEC_DEBUG_VERBOSE). */
PARSEC_DECLSPEC extern int parsec_gpu_output_stream;
PARSEC_DECLSPEC extern int parsec_gpu_verbosity;
/* Maximum number of blocks that can be ejected from GPU memory by one
 * device-to-host (w2r) task; exposed as the "max_number_of_ejected_data"
 * MCA parameter of the GPU device components. */
PARSEC_DECLSPEC extern int32_t parsec_gpu_d2h_max_flows;
/* Maximum number of discarded blocks collected at once while building a
 * w2r task; exposed as the "max_number_of_discarded_data" MCA parameter.
 * A value of 0 disables the collection of discarded data. */
PARSEC_DECLSPEC extern int32_t parsec_gpu_d2h_max_discarded;

/**
* Debugging functions.
Expand Down
3 changes: 3 additions & 0 deletions parsec/mca/device/level_zero/device_level_zero_component.c
Original file line number Diff line number Diff line change
Expand Up @@ -271,6 +271,9 @@ static int device_level_zero_component_register(void)
(void)parsec_mca_param_reg_int_name("device_level_zero", "max_number_of_ejected_data",
"Sets up the maximum number of blocks that can be ejected from GPU memory",
false, false, MAX_PARAM_COUNT, &parsec_gpu_d2h_max_flows);
(void)parsec_mca_param_reg_int_name("device_level_zero", "max_number_of_discarded_data",
"Sets up the maximum number of discarded blocks to be collected at once",
false, false, MAX_PARAM_COUNT, &parsec_gpu_d2h_max_discarded);
(void)parsec_mca_param_reg_int_name("device_level_zero", "max_streams",
"Maximum number of Streams to use for the GPU engine; 2 streams are used for communication between host and device, so the minimum is 3",
false, false, PARSEC_GPU_MAX_STREAMS, &parsec_level_zero_max_streams);
Expand Down
133 changes: 92 additions & 41 deletions parsec/mca/device/transfer_gpu.c
Original file line number Diff line number Diff line change
Expand Up @@ -179,6 +179,7 @@ static const parsec_symbol_t symb_gpu_d2h_task_param = {
};

/* Maximum number of blocks that can be ejected from GPU memory by a single
 * D2H task. Registered by the device components as the
 * "max_number_of_ejected_data" MCA parameter; the w2r task creation clamps
 * it to MAX_PARAM_COUNT.
 * NOTE(review): the 0 below is presumably replaced by the value set at MCA
 * registration -- confirm. */
int32_t parsec_gpu_d2h_max_flows = 0;
/* Maximum number of discarded blocks to be collected at once, registered as
 * the "max_number_of_discarded_data" MCA parameter. When it is 0 the w2r
 * task creation never releases discarded copies (discarding is disabled). */
int32_t parsec_gpu_d2h_max_discarded = 0;

static const parsec_task_class_t parsec_gpu_d2h_task_class = {
.name = "GPU D2H data transfer",
Expand Down Expand Up @@ -215,6 +216,16 @@ static const parsec_task_class_t parsec_gpu_d2h_task_class = {
#endif
};

/**
 * Unlink a GPU copy of discarded data from the ring it is chained in and
 * pass it to parsec_device_release_gpu_copy().
 *
 * @param gpu_device device on whose behalf the copy is released; its index
 *                   and name also appear in the debug trace
 * @param gpu_copy   GPU data copy to unlink and release
 *
 * NOTE(review): no lock is taken in this helper -- confirm that callers
 * guarantee exclusive access to gpu_copy and its original data.
 */
static inline void
release_discarded_data(parsec_device_gpu_module_t *gpu_device,
                       parsec_gpu_data_copy_t *gpu_copy)
{
    parsec_list_item_t *lru_item = (parsec_list_item_t *)gpu_copy;

    /* take the copy out of the ring, then flag it as a standalone item */
    parsec_list_item_ring_chop(lru_item);
    PARSEC_LIST_ITEM_SINGLETON(gpu_copy);

    PARSEC_DEBUG_VERBOSE(10, parsec_gpu_output_stream,
                         "D2H[%d:%s] GPU data copy %p of discarded data %p will be released",
                         gpu_device->super.device_index, gpu_device->super.name,
                         gpu_copy, gpu_copy->original);

    parsec_device_release_gpu_copy(gpu_device, gpu_copy);
}

/**
* Transfer at most the MAX_PARAM_COUNT oldest data from the GPU back
Expand All @@ -227,58 +238,98 @@ parsec_gpu_create_w2r_task(parsec_device_gpu_module_t *gpu_device,
{
parsec_gpu_task_t *w2r_task = NULL;
parsec_gpu_d2h_task_t *d2h_task = NULL;
parsec_gpu_data_copy_t *gpu_copy, *cpu_copy;
parsec_list_item_t* item = (parsec_list_item_t*)gpu_device->gpu_mem_owned_lru.ghost_element.list_next;
parsec_gpu_data_copy_t *fwd_gpu_copy = NULL, *fwd_cpu_copy = NULL, *rev_gpu_copy = NULL, *rev_cpu_copy = NULL;
parsec_list_item_t* fwd = (parsec_list_item_t*)gpu_device->gpu_mem_owned_lru.ghost_element.list_next;
parsec_list_item_t* rev = (parsec_list_item_t*)gpu_device->gpu_mem_owned_lru.ghost_element.list_prev;
int nb_cleaned = 0;
int nb_discarded = 0;
int nb_candidates = 0;
const int max_flows = (parsec_gpu_d2h_max_flows < MAX_PARAM_COUNT) ? parsec_gpu_d2h_max_flows : MAX_PARAM_COUNT;
/* store candidates in an array without unlinking them so we can easily abandon them */
parsec_gpu_data_copy_t *candidates[MAX_PARAM_COUNT];

/* Find a data copy that has no pending users on the GPU, and can be
* safely moved back on the main memory */
while(nb_cleaned < parsec_gpu_d2h_max_flows) {
* safely moved back on the main memory.
* Also look for data that was discarded and can be released immediately.
*
* Observation: data to be evicted is more likely at the front of the list
* while data that is discarded is more likely at the end
* (since it was likely discarded shortly after being used)
* so we search from the front and the back. */
while(nb_candidates < max_flows &&
/* allow discarding to be disabled */
(parsec_gpu_d2h_max_discarded == 0 || nb_discarded < parsec_gpu_d2h_max_discarded)) {
/* Break at the end of the list */
if( item == &(gpu_device->gpu_mem_owned_lru.ghost_element) ) {
if( fwd == &gpu_device->gpu_mem_owned_lru.ghost_element ) {
break;
}
gpu_copy = (parsec_gpu_data_copy_t*)item;
cpu_copy = gpu_copy->original->device_copies[0];
parsec_atomic_lock( &gpu_copy->original->lock );
/* get the next item before altering the next pointer */
item = (parsec_list_item_t*)item->list_next; /* conversion needed for volatile */
if (cpu_copy->flags & PARSEC_DATA_FLAG_DISCARDED) {
parsec_list_item_ring_chop((parsec_list_item_t*)gpu_copy);
PARSEC_LIST_ITEM_SINGLETON(gpu_copy);
PARSEC_DEBUG_VERBOSE(10, parsec_gpu_output_stream,
"D2H[%d:%s] GPU data copy %p of discarded data %p will be released",
gpu_device->super.device_index, gpu_device->super.name, gpu_copy, gpu_copy->original);
parsec_atomic_unlock( &gpu_copy->original->lock );
parsec_device_release_gpu_copy(gpu_device, gpu_copy);
} else if( 0 == gpu_copy->readers ) {
if( PARSEC_UNLIKELY(NULL == d2h_task) ) { /* allocate on-demand */
d2h_task = (parsec_gpu_d2h_task_t*)parsec_thread_mempool_allocate(es->context_mempool);
if( PARSEC_UNLIKELY(NULL == d2h_task) ) { /* we're running out of memory. Bail out. */
parsec_atomic_unlock( &gpu_copy->original->lock );
return NULL;
}
PARSEC_OBJ_CONSTRUCT(d2h_task, parsec_task_t);
}
parsec_list_item_ring_chop((parsec_list_item_t*)gpu_copy);
PARSEC_LIST_ITEM_SINGLETON(gpu_copy);
gpu_copy->readers++;
d2h_task->data[nb_cleaned].data_out = gpu_copy;
gpu_copy->data_transfer_status = PARSEC_DATA_STATUS_UNDER_TRANSFER; /* mark the copy as in transfer */
parsec_atomic_unlock( &gpu_copy->original->lock );
PARSEC_DEBUG_VERBOSE(10, parsec_gpu_output_stream, "D2H[%d:%s] task %p:\tdata %d -> %p [%p] readers %d",
gpu_device->super.device_index, gpu_device->super.name, (void*)d2h_task,
nb_cleaned, gpu_copy, gpu_copy->original, gpu_copy->readers);
nb_cleaned++;
if (MAX_PARAM_COUNT == nb_cleaned)
if (fwd == rev || fwd->list_next == rev) {
/* break at median if we discarded data */
if (nb_discarded > 0) {
break;
} else {
parsec_atomic_unlock( &gpu_copy->original->lock );
}
/* otherwise stop walking backwards because we already
* looked for discarded data on the way */
rev = NULL;
rev_gpu_copy = NULL;
rev_cpu_copy = NULL;
}

fwd_gpu_copy = (parsec_gpu_data_copy_t*)fwd;
fwd_cpu_copy = fwd_gpu_copy->original->device_copies[0];
/* get the next item before altering the next pointer */
fwd = (parsec_list_item_t*)fwd->list_next; /* conversion needed for volatile */
if (NULL != rev) {
rev_gpu_copy = (parsec_gpu_data_copy_t*)rev;
rev_cpu_copy = rev_gpu_copy->original->device_copies[0];
rev = (parsec_list_item_t*)rev->list_prev; // cast for volatile
}
if (parsec_gpu_d2h_max_discarded && fwd_cpu_copy->flags & PARSEC_DATA_FLAG_DISCARDED) {
release_discarded_data(gpu_device, fwd_gpu_copy);
++nb_discarded;
} else if( max_flows > nb_candidates && 0 == fwd_gpu_copy->readers ) {
/* store the candidates but leave them in the LRU */
candidates[nb_candidates] = fwd_gpu_copy;
nb_candidates++;
}
if (parsec_gpu_d2h_max_discarded &&
NULL != rev_cpu_copy &&
rev_cpu_copy->flags & PARSEC_DATA_FLAG_DISCARDED) {
release_discarded_data(gpu_device, rev_gpu_copy);
++nb_discarded;
}
}

if( nb_discarded > 0 || nb_candidates == 0 ) {
/* we discarded some data, don't bother pushing out */
return NULL;
}

if( 0 == nb_cleaned )
d2h_task = (parsec_gpu_d2h_task_t*)parsec_thread_mempool_allocate(es->context_mempool);
if( PARSEC_UNLIKELY(NULL == d2h_task) ) { /* we're running out of memory. Bail out. */
return NULL;
}
PARSEC_OBJ_CONSTRUCT(d2h_task, parsec_task_t);

for (int i = 0; i < nb_candidates; ++i) {
parsec_gpu_data_copy_t *gpu_copy = candidates[i];
parsec_atomic_lock( &gpu_copy->original->lock );
if (PARSEC_UNLIKELY(gpu_copy->readers != 0)) {
/* gained a reader, ignore */
parsec_atomic_unlock( &gpu_copy->original->lock );
continue;
}
parsec_list_item_ring_chop((parsec_list_item_t*)gpu_copy);
PARSEC_LIST_ITEM_SINGLETON(gpu_copy);
gpu_copy->readers++;
d2h_task->data[nb_cleaned].data_out = gpu_copy;
gpu_copy->data_transfer_status = PARSEC_DATA_STATUS_UNDER_TRANSFER; /* mark the copy as in transfer */
parsec_atomic_unlock( &gpu_copy->original->lock );
nb_cleaned++;
PARSEC_DEBUG_VERBOSE(10, parsec_gpu_output_stream, "D2H[%d:%s] task %p:\tdata %d -> %p [%p] readers %d",
gpu_device->super.device_index, gpu_device->super.name, (void*)d2h_task,
nb_cleaned, gpu_copy, gpu_copy->original, gpu_copy->readers);
}

d2h_task->priority = INT32_MAX;
d2h_task->task_class = &parsec_gpu_d2h_task_class;
Expand Down

0 comments on commit 53b7721

Please sign in to comment.