From a6bbc4fa74d895fc9424e25f003350ef7178c009 Mon Sep 17 00:00:00 2001 From: George Bosilca Date: Thu, 25 Jul 2024 20:55:14 -0700 Subject: [PATCH 01/14] Add support for batched tasks. The idea is the following: - tasks incarnations (aka. BODY) can be marked with the "batch" property allowing the runtime to provide the task with the entire list of ready tasks of the execution stream instead of just extracting the head. - this list of ready tasks is in fact a ring, that can then be trimmed by the kernel and divided into batch and the rest. The rest of the tasks will be left in the ring, while the batch group will be submitted for execution. - the kernel also needs to provide a callback into the gpu_task complete_stage, such that the runtime can call the specialized function able to complete all batched tasks. Signed-off-by: George Bosilca --- CMakeLists.txt | 8 +++++ cmake_modules/PaRSECConfig.cmake.in | 4 +++ parsec/include/parsec/parsec_options.h.in | 1 + parsec/interfaces/dtd/insert_function.c | 21 +++++++------ parsec/interfaces/ptg/ptg-compiler/jdf2c.c | 31 ++++++++++++++----- parsec/mca/device/CMakeLists.txt | 3 ++ .../mca/device/cuda/device_cuda_component.c | 2 +- parsec/mca/device/device.c | 22 ++++++------- parsec/mca/device/device.h | 3 ++ parsec/mca/device/device_gpu.c | 26 +++++++++++----- .../level_zero/device_level_zero_component.c | 2 +- .../device/template/device_template_module.c | 2 +- .../pins/ptg_to_dtd/pins_ptg_to_dtd_module.c | 6 ++-- tests/dsl/dtd/dtd_test_cuda_task_insert.c | 4 +-- tests/dsl/dtd/dtd_test_new_tile.c | 4 +-- tests/dsl/dtd/dtd_test_simple_gemm.c | 4 +-- tests/runtime/cuda/get_best_device_check.jdf | 4 +-- tests/runtime/cuda/nvlink_wrapper.c | 6 ++-- tests/runtime/cuda/stress_wrapper.c | 2 +- 19 files changed, 101 insertions(+), 54 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 339913987..35cbb152b 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -181,6 +181,8 @@ option(PARSEC_GPU_ALLOC_PER_TILE mark_as_advanced(PARSEC_GPU_ALLOC_PER_TILE) option(PARSEC_GPU_WITH_CUDA "Enable GPU support using CUDA kernels" ON) +option(PARSEC_GPU_WITH_CUDA_BATCH + "Enable the runtime support for batched kernels" ON) option(PARSEC_GPU_WITH_HIP "Enable GPU support using HIP kernels" ON) option(PARSEC_GPU_WITH_LEVEL_ZERO @@ -729,6 +731,12 @@ int main(int argc, char *argv[]) { endif (CUDAToolkit_FOUND) set(PARSEC_HAVE_CU_COMPILER ${CMAKE_CUDA_COMPILER} CACHE BOOL "True if PaRSEC provide support for compiling .cu files") endif( PARSEC_GPU_WITH_CUDA ) + if( PARSEC_GPU_WITH_CUDA_BATCH ) + if( NOT PARSEC_HAVE_CUDA) + message(FATAL_ERROR "PARSEC_GPU_WITH_CUDA_BATCH requires PARSEC_GPU_WITH_CUDA. 
Enable both or none") + endif( NOT PARSEC_HAVE_CUDA) + set(PARSEC_HAVE_CUDA_BATCH True CACHE BOOL "True if support for batched CUDA has been enabled") + endif( PARSEC_GPU_WITH_CUDA_BATCH ) if( PARSEC_GPU_WITH_HIP ) # This is kinda ugly but the PATH and HINTS don't get transmitted to sub-dependents diff --git a/cmake_modules/PaRSECConfig.cmake.in b/cmake_modules/PaRSECConfig.cmake.in index 5d7827bee..3dd2bc459 100644 --- a/cmake_modules/PaRSECConfig.cmake.in +++ b/cmake_modules/PaRSECConfig.cmake.in @@ -65,6 +65,10 @@ endif(@PARSEC_DIST_WITH_MPI@) if(@PARSEC_HAVE_CUDA@) find_package(CUDAToolkit REQUIRED) set(PARSEC_HAVE_CUDA TRUE) + + if(@PARSEC_HAVE_CUDA_BATCH@) + set(PARSEC_HAVE_CUDA_BATCH TRUE) + endif(@PARSEC_HAVE_CUDA_BATCH@) endif(@PARSEC_HAVE_CUDA@) if(@PARSEC_HAVE_HIP@) diff --git a/parsec/include/parsec/parsec_options.h.in b/parsec/include/parsec/parsec_options.h.in index 85af2db8e..a5a143e12 100644 --- a/parsec/include/parsec/parsec_options.h.in +++ b/parsec/include/parsec/parsec_options.h.in @@ -130,6 +130,7 @@ #cmakedefine PARSEC_HAVE_DEV_CPU_SUPPORT #cmakedefine PARSEC_HAVE_DEV_RECURSIVE_SUPPORT #cmakedefine PARSEC_HAVE_DEV_CUDA_SUPPORT +#cmakedefine PARSEC_HAVE_DEV_CUDA_BATCH_SUPPORT #cmakedefine PARSEC_HAVE_DEV_HIP_SUPPORT #cmakedefine PARSEC_HAVE_DEV_LEVEL_ZERO_SUPPORT #cmakedefine PARSEC_HAVE_DEV_OPENCL_SUPPORT diff --git a/parsec/interfaces/dtd/insert_function.c b/parsec/interfaces/dtd/insert_function.c index d1eac58a8..91543fdfd 100644 --- a/parsec/interfaces/dtd/insert_function.c +++ b/parsec/interfaces/dtd/insert_function.c @@ -1477,7 +1477,7 @@ parsec_dtd_startup(parsec_context_t *context, if( !(tp->devices_index_mask & (1 << device->device_index))) continue; /* not supported */ // If CUDA is enabled, let the CUDA device activated for this // taskpool. 
- if( PARSEC_DEV_CUDA == device->type ) continue; + if( PARSEC_DEV_CUDA & device->type ) continue; if( NULL != device->taskpool_register ) if( PARSEC_SUCCESS != device->taskpool_register(device, (parsec_taskpool_t *)tp)) { @@ -2355,8 +2355,8 @@ int parsec_dtd_task_class_add_chore(parsec_taskpool_t *tp, /* We assume that incarnations is big enough, because it has been pre-allocated * with PARSEC_DEV_MAX_NB_TYPE+1 chores, as this is a DTD task class */ incarnations = (__parsec_chore_t*)dtd_tc->super.incarnations; - for(i = 0; i < PARSEC_DEV_MAX_NB_TYPE && incarnations[i].type != PARSEC_DEV_NONE; i++) { - if( incarnations[i].type == device_type ) { + for(i = 0; i < PARSEC_DEV_MAX_NB_TYPE && (incarnations[i].type & PARSEC_DEV_ANY_TYPE) != PARSEC_DEV_NONE; i++) { + if( incarnations[i].type & PARSEC_DEV_ANY_TYPE & device_type ) { parsec_warning("A chore for this device type has already been added to task class '%s'\n", tc->name); return PARSEC_ERROR; @@ -2369,7 +2369,7 @@ int parsec_dtd_task_class_add_chore(parsec_taskpool_t *tp, } incarnations[i].type = device_type; - if(PARSEC_DEV_CUDA == device_type) { + if(PARSEC_DEV_CUDA & device_type) { incarnations[i].hook = parsec_dtd_gpu_task_submit; dtd_tc->gpu_func_ptr = (parsec_advance_task_function_t)function; } @@ -3258,19 +3258,20 @@ __parsec_dtd_taskpool_create_task(parsec_taskpool_t *tp, dtd_tc = parsec_dtd_create_task_classv(name_of_kernel, nb_params, params); tc = &dtd_tc->super; - __parsec_chore_t **incarnations = (__parsec_chore_t **)&tc->incarnations; - (*incarnations)[0].type = device_type; - if( device_type == PARSEC_DEV_CUDA ) { + __parsec_chore_t *incarnations = (__parsec_chore_t *)tc->incarnations; + incarnations[0].type = device_type; + if( device_type & PARSEC_DEV_CUDA ) { /* Special case for CUDA: we need an intermediate */ - (*incarnations)[0].hook = parsec_dtd_gpu_task_submit; + incarnations[0].hook = parsec_dtd_gpu_task_submit; dtd_tc->gpu_func_ptr = (parsec_advance_task_function_t)fpointer; } else { /* Default case: the user-provided function is directly the hook to call */ - (*incarnations)[0].hook = fpointer; // We can directly call the CPU hook + incarnations[0].hook = fpointer; // We can directly call the CPU hook dtd_tc->cpu_func_ptr = fpointer; } - (*incarnations)[1].type = PARSEC_DEV_NONE; + incarnations[1].type = PARSEC_DEV_NONE; + incarnations[1].hook = NULL; /* Bookkeeping of the task class */ parsec_dtd_register_task_class(&dtd_tp->super, fkey, tc); diff --git a/parsec/interfaces/ptg/ptg-compiler/jdf2c.c b/parsec/interfaces/ptg/ptg-compiler/jdf2c.c index ec53abeff..3730303d3 100644 --- a/parsec/interfaces/ptg/ptg-compiler/jdf2c.c +++ b/parsec/interfaces/ptg/ptg-compiler/jdf2c.c @@ -3938,6 +3938,7 @@ jdf_generate_function_incarnation_list( const jdf_t *jdf, jdf_def_list_t* dyld_property; jdf_def_list_t* evaluate_property = NULL; jdf_def_list_t* device_property = NULL; + jdf_def_list_t* batch_property = NULL; (void)jdf; string_arena_add_string(sa, "static const __parsec_chore_t __%s_chores[] ={\n", base_name); @@ -3945,18 +3946,32 @@ jdf_generate_function_incarnation_list( const jdf_t *jdf, jdf_find_property(body->properties, "type", &type_property); jdf_find_property(body->properties, "dyld", &dyld_property); jdf_find_property(body->properties, JDF_BODY_PROP_EVALUATE, &evaluate_property); - if( NULL == type_property) { + jdf_find_property(body->properties, "batch", &batch_property); + if (NULL == type_property) + { string_arena_add_string(sa, "#if defined(PARSEC_HAVE_DEV_CPU_SUPPORT)\n"); string_arena_add_string(sa, " 
{ .type = PARSEC_DEV_CPU,\n"); string_arena_add_string(sa, " .evaluate = (parsec_evaluate_function_t*)%s,\n", (NULL == evaluate_property) ? "NULL" : evaluate_property->expr->jdf_c_code.fname); string_arena_add_string(sa, " .hook = (parsec_hook_t*)hook_of_%s },\n", base_name); string_arena_add_string(sa, "#endif /* defined(PARSEC_HAVE_DEV_CPU_SUPPORT) */\n"); - } else { + if( NULL != batch_property ) { + fprintf(stderr, + "Error: batched property (%s) not allowed for devices other than accelerators in body of task %s at line %d\n", + batch_property->expr->jdf_var, f->fname, JDF_OBJECT_LINENO(body)); + assert( NULL != batch_property ); + } + } + else + { char* dev_upper = strdup_upper(type_property->expr->jdf_var); string_arena_add_string(sa, "#if defined(PARSEC_HAVE_DEV_%s_SUPPORT)\n", dev_upper); - string_arena_add_string(sa, " { .type = PARSEC_DEV_%s,\n", dev_upper); + string_arena_add_string(sa, " { .type = PARSEC_DEV_%s", dev_upper); + if( NULL != batch_property) { + string_arena_add_string(sa, " | PARSEC_DEV_CHORE_ALLOW_BATCH"); + } + string_arena_add_string(sa, ",\n"); if( NULL == dyld_property ) { string_arena_add_string(sa, " .dyld = NULL,\n"); } else { @@ -4491,7 +4506,7 @@ static void jdf_generate_startup_hook( const jdf_t *jdf ) " parsec_task_class_t* tc = (parsec_task_class_t*)__parsec_tp->super.super.task_classes_array[i];\n" " __parsec_chore_t* chores = (__parsec_chore_t*)tc->incarnations;\n" " uint32_t idx = 0, j;\n" - " for( j = 0; PARSEC_DEV_NONE != chores[j].type; j++ ) {\n" + " for( j = 0; PARSEC_DEV_NONE != (chores[j].type & PARSEC_DEV_ANY_TYPE); j++ ) {\n" " if( !(supported_dev & chores[j].type) ) continue;\n" " if( j != idx ) {\n" " chores[idx] = chores[j];\n" @@ -4680,7 +4695,7 @@ static void jdf_generate_constructor( const jdf_t* jdf ) coutput(" for( i = 0; i < __parsec_tp->super.super.nb_task_classes; i++ ) {\n" " __parsec_tp->super.super.task_classes_array[i] = tc = malloc(sizeof(parsec_task_class_t));\n" " memcpy(tc, %s_task_classes[i], sizeof(parsec_task_class_t));\n" - " for( j = 0; PARSEC_DEV_NONE != tc->incarnations[j].type; j++); /* compute the number of incarnations */\n" + " for( j = 0; PARSEC_DEV_NONE != (tc->incarnations[j].type & PARSEC_DEV_ANY_TYPE); j++); /* compute the number of incarnations */\n" " tc->incarnations = (__parsec_chore_t*)malloc((j+1) * sizeof(__parsec_chore_t));\n " " memcpy((__parsec_chore_t*)tc->incarnations, %s_task_classes[i]->incarnations, (j+1) * sizeof(__parsec_chore_t));\n\n" " /* Add a placeholder for initialization and startup task */\n" @@ -6731,8 +6746,8 @@ static void jdf_generate_code_hook_gpu(const jdf_t *jdf, coutput(" /* Pointer to dynamic gpu function */\n" " {\n" " int chore_idx = 0;\n" - " for ( ; PARSEC_DEV_NONE != this_task->task_class->incarnations[chore_idx].type; ++chore_idx) {\n" - " if (this_task->task_class->incarnations[chore_idx].type == PARSEC_DEV_%s) break;\n" + " for ( ; PARSEC_DEV_NONE != (this_task->task_class->incarnations[chore_idx].type & PARSEC_DEV_ANY_TYPE); ++chore_idx) {\n" + " if (this_task->task_class->incarnations[chore_idx].type & PARSEC_DEV_%s) break;\n" " }\n" " /* The void* cast prevents the compiler from complaining about the type change */\n" " parsec_body.dyld_fn = (%s)(void*)this_task->task_class->incarnations[chore_idx].dyld_fn;\n" @@ -6983,7 +6998,7 @@ static void jdf_generate_code_hook(const jdf_t *jdf, coutput("#if defined(PARSEC_HAVE_DEV_%s_SUPPORT)\n", type_upper); if( NULL != type_property) { - if (!strcasecmp(type_property->expr->jdf_var, "cuda") + if 
(!strncasecmp(type_property->expr->jdf_var, "cuda", 4) /* for batched */ || !strcasecmp(type_property->expr->jdf_var, "hip")) { jdf_generate_code_hook_gpu(jdf, f, body, name); goto hook_end_block; diff --git a/parsec/mca/device/CMakeLists.txt b/parsec/mca/device/CMakeLists.txt index 85d23ba0d..0fef7a88a 100644 --- a/parsec/mca/device/CMakeLists.txt +++ b/parsec/mca/device/CMakeLists.txt @@ -14,6 +14,9 @@ set(PARSEC_HAVE_DEV_RECURSIVE_SUPPORT 0 CACHE BOOL "PaRSEC has support for Recu if(PARSEC_HAVE_CUDA) set(PARSEC_HAVE_DEV_CUDA_SUPPORT 1 CACHE BOOL "PaRSEC support for CUDA") endif(PARSEC_HAVE_CUDA) +if(PARSEC_HAVE_CUDA_BATCH) + set(PARSEC_HAVE_DEV_CUDA_BATCH_SUPPORT 1 CACHE BOOL "PaRSEC support for batched CUDA") +endif(PARSEC_HAVE_CUDA_BATCH) if(PARSEC_HAVE_HIP) set(PARSEC_HAVE_DEV_HIP_SUPPORT 1 CACHE BOOL "PaRSEC support for HIP") endif(PARSEC_HAVE_HIP) diff --git a/parsec/mca/device/cuda/device_cuda_component.c b/parsec/mca/device/cuda/device_cuda_component.c index 4dc516e05..c06ba416b 100644 --- a/parsec/mca/device/cuda/device_cuda_component.c +++ b/parsec/mca/device/cuda/device_cuda_component.c @@ -276,7 +276,7 @@ static int device_cuda_component_close(void) /* Check that no CUDA devices are still registered with PaRSEC */ for(i = 0; i < parsec_mca_device_enabled(); i++) { if( NULL == (cdev = (parsec_device_cuda_module_t*)parsec_mca_device_get(i)) ) continue; - if(PARSEC_DEV_CUDA != cdev->super.super.type) continue; + if(PARSEC_DEV_CUDA & cdev->super.super.type) continue; PARSEC_DEBUG_VERBOSE(0, parsec_gpu_output_stream, "GPU[%d:%s] CUDA device %d still registered with PaRSEC at the end of CUDA finalize.\n" diff --git a/parsec/mca/device/device.c b/parsec/mca/device/device.c index d7ea17a99..d1d42dbf7 100644 --- a/parsec/mca/device/device.c +++ b/parsec/mca/device/device.c @@ -107,7 +107,7 @@ int parsec_select_best_device( parsec_task_t* this_task ) { /* Run the evaluates for the incarnation types to determine if they can * execute this task */ - for(chore_id = 0; PARSEC_DEV_NONE != tc->incarnations[chore_id].type; chore_id++) { + for(chore_id = 0; PARSEC_DEV_NONE != (tc->incarnations[chore_id].type & PARSEC_DEV_ANY_TYPE); chore_id++) { if( 0 == (this_task->chore_mask & (1<incarnations[chore_id].hook ) continue; /* dyld hook not found during initialization */ @@ -116,7 +116,7 @@ int parsec_select_best_device( parsec_task_t* this_task ) { if( PARSEC_HOOK_RETURN_DONE != rc ) { if( PARSEC_HOOK_RETURN_NEXT != rc ) { PARSEC_DEBUG_VERBOSE(5, parsec_device_output, "Failed to evaluate %s[%d] chore %d", - tmp, tc->incarnations[chore_id].type, + tmp, tc->incarnations[chore_id].type & PARSEC_DEV_ANY_TYPE, chore_id); } /* Mark this chore as tested */ @@ -124,7 +124,7 @@ int parsec_select_best_device( parsec_task_t* this_task ) { continue; } } - valid_types |= tc->incarnations[chore_id].type; /* the eval accepted the type, but no device specified yet */ + valid_types |= (tc->incarnations[chore_id].type & PARSEC_DEV_ANY_TYPE); /* the eval accepted the type, but no device specified yet */ /* Evaluate may have picked a device, abide by it */ if( NULL != this_task->selected_device ) { assert( this_task->selected_device->type & valid_types ); @@ -140,7 +140,7 @@ int parsec_select_best_device( parsec_task_t* this_task ) { if (PARSEC_DEV_CPU == valid_types) { /* shortcut for CPU only tasks */ this_task->selected_device = dev = parsec_mca_device_get(0); this_task->load = 0; - for(chore_id = 0; tc->incarnations[chore_id].type != PARSEC_DEV_CPU; chore_id++); + for(chore_id = 0; 
!(tc->incarnations[chore_id].type & PARSEC_DEV_CPU); chore_id++); this_task->selected_chore = chore_id; PARSEC_DEBUG_VERBOSE(80, parsec_device_output, "%s: Task %s cpu-only task set selected_device %d:%s", __func__, tmp, dev->device_index, dev->name); @@ -226,7 +226,7 @@ int parsec_select_best_device( parsec_task_t* this_task ) { /* Skip the device if no incarnations for its type */ if(!(dev->type & valid_types)) continue; /* Skip recursive devices: time estimates are computed on the associated CPU device */ - if(dev->type == PARSEC_DEV_RECURSIVE) continue; + if(dev->type & PARSEC_DEV_RECURSIVE) continue; eta = dev->device_load + time_estimate(this_task, dev); if( best_eta > eta ) { @@ -244,14 +244,14 @@ int parsec_select_best_device( parsec_task_t* this_task ) { goto no_valid_device; this_task->selected_device = parsec_mca_device_get(best_index); - assert( this_task->selected_device->type != PARSEC_DEV_RECURSIVE ); + assert( !(this_task->selected_device->type & PARSEC_DEV_RECURSIVE) ); } device_selected: dev = this_task->selected_device; assert( NULL != dev ); assert( tp->devices_index_mask & (1 << dev->device_index) ); - for(chore_id = 0; tc->incarnations[chore_id].type != dev->type; chore_id++) + for(chore_id = 0; !(tc->incarnations[chore_id].type & dev->type); chore_id++) assert(PARSEC_DEV_NONE != tc->incarnations[chore_id].type /* we have selected this device, so there *must* be an incarnation that matches */); this_task->selected_chore = chore_id; this_task->load = time_estimate(this_task, dev); @@ -740,8 +740,8 @@ int parsec_mca_device_registration_complete(parsec_context_t* context) for( uint32_t i = 0; i < parsec_nb_devices; i++ ) { parsec_device_module_t* device = parsec_devices[i]; if( NULL == device ) continue; - if( PARSEC_DEV_RECURSIVE == device->type ) continue; - if( PARSEC_DEV_CPU == device->type ) { + if( PARSEC_DEV_RECURSIVE & device->type ) continue; + if( PARSEC_DEV_CPU & device->type ) { c = 0; for(int p = 0; p < context->nb_vp; p++) c += context->virtual_processes[p]->nb_cores; @@ -760,7 +760,7 @@ int parsec_mca_device_registration_complete(parsec_context_t* context) for( uint32_t i = 0; i < parsec_nb_devices; i++ ) { parsec_device_module_t* device = parsec_devices[i]; if( NULL == device ) continue; - if( PARSEC_DEV_RECURSIVE == device->type ) continue; + if( PARSEC_DEV_RECURSIVE & device->type ) continue; device->time_estimate_default = total_gflops_fp64/(double)device->gflops_fp64; parsec_debug_verbose(6, parsec_device_output, " Dev[%d] default-time-estimate %-4"PRId64" <- double %-8"PRId64" single %-8"PRId64" tensor %-8"PRId64" half %-8"PRId64" %s", i, device->time_estimate_default, device->gflops_fp64, device->gflops_fp32, device->gflops_tf32, device->gflops_fp16, device->gflops_guess? 
"GUESSED": ""); @@ -925,7 +925,7 @@ device_taskpool_register_static(parsec_device_module_t* device, parsec_taskpool_ continue; __parsec_chore_t* chores = (__parsec_chore_t*)tc->incarnations; for( j = 0; NULL != chores[j].hook; j++ ) { - if( chores[j].type != device->type ) + if( !(chores[j].type & device->type) ) continue; if( NULL != chores[j].dyld_fn ) { continue; /* the function has been set for another device of the same type */ diff --git a/parsec/mca/device/device.h b/parsec/mca/device/device.h index bbd26cf00..d471eff9b 100644 --- a/parsec/mca/device/device.h +++ b/parsec/mca/device/device.h @@ -65,10 +65,13 @@ typedef struct parsec_device_base_component_2_0_0 parsec_device_base_component_t #define PARSEC_DEV_CUDA ((uint8_t)(1 << 2)) #define PARSEC_DEV_HIP ((uint8_t)(1 << 3)) #define PARSEC_DEV_LEVEL_ZERO ((uint8_t)(1 << 4)) +#define PARSEC_DEV_CUDA_BATCH ((uint8_t)(1 << 5)) #define PARSEC_DEV_TEMPLATE ((uint8_t)(1 << 7)) #define PARSEC_DEV_ANY_TYPE ((uint8_t) 0x3f) #define PARSEC_DEV_ALL ((uint8_t) 0x3f) #define PARSEC_DEV_MAX_NB_TYPE (7) +/* The following flags are extensions to the device type */ +#define PARSEC_DEV_CHORE_ALLOW_BATCH ((uint32_t)0x00000100) #define PARSEC_DEV_GPU_MASK (PARSEC_DEV_CUDA|PARSEC_DEV_HIP|PARSEC_DEV_LEVEL_ZERO) #define PARSEC_DEV_IS_GPU(t) (0 != ((t) & PARSEC_DEV_GPU_MASK)) diff --git a/parsec/mca/device/device_gpu.c b/parsec/mca/device/device_gpu.c index 24e40d5fd..d44a3ed2b 100644 --- a/parsec/mca/device/device_gpu.c +++ b/parsec/mca/device/device_gpu.c @@ -526,7 +526,7 @@ parsec_device_taskpool_register(parsec_device_module_t* device, const parsec_task_class_t* tc = tp->task_classes_array[i]; __parsec_chore_t* chores = (__parsec_chore_t*)tc->incarnations; for( j = 0; NULL != chores[j].hook; j++ ) { - if( chores[j].type != device->type ) + if( !(chores[j].type & device->type) ) continue; if( NULL != chores[j].dyld_fn ) { /* the function has been set for another device of the same type */ @@ -1180,7 +1180,7 @@ parsec_default_gpu_stage_in(parsec_gpu_task_t *gtask, src_dev = (parsec_device_gpu_module_t*)parsec_mca_device_get(source->device_index); dst_dev = (parsec_device_gpu_module_t*)parsec_mca_device_get(dest->device_index); - if(src_dev->super.type == dst_dev->super.type) { + if((src_dev->super.type & PARSEC_DEV_ANY_TYPE) == (dst_dev->super.type & PARSEC_DEV_ANY_TYPE)) { assert( src_dev->peer_access_mask & (1 << dst_dev->super.device_index) ); dir = parsec_device_gpu_transfer_direction_d2d; } else { @@ -1231,7 +1231,7 @@ parsec_default_gpu_stage_out(parsec_gpu_task_t *gtask, count = (source->original->nb_elts <= dest->original->nb_elts) ? 
source->original->nb_elts : dest->original->nb_elts; - if( src_dev->super.type == dst_dev->super.type ) { + if( (src_dev->super.type & PARSEC_DEV_ANY_TYPE) == (dst_dev->super.type & PARSEC_DEV_ANY_TYPE) ) { assert( src_dev->peer_access_mask & (1 << dst_dev->super.device_index) ); dir = parsec_device_gpu_transfer_direction_d2d; } else { @@ -1352,7 +1352,7 @@ parsec_device_data_stage_in( parsec_device_gpu_module_t* gpu_device, PARSEC_DEBUG_VERBOSE(30, parsec_gpu_output_stream, "GPU[%d:%s]:\tSelecting candidate data copy %p [ref_count %d] on data %p", gpu_device->super.device_index, gpu_device->super.name, task_data->data_in, task_data->data_in->super.super.obj_reference_count, original); - if( gpu_device->super.type == candidate_dev->super.type ) { + if( (gpu_device->super.type & PARSEC_DEV_ANY_TYPE) == (candidate_dev->super.type & PARSEC_DEV_ANY_TYPE) ) { if( gpu_device->peer_access_mask & (1 << candidate_dev->super.device_index) ) { /* We can directly do D2D, so let's skip the selection */ PARSEC_DEBUG_VERBOSE(30, parsec_gpu_output_stream, @@ -1502,7 +1502,8 @@ parsec_device_data_stage_in( parsec_device_gpu_module_t* gpu_device, gpu_device->super.device_index, gpu_device->super.name, rc, __func__, __LINE__, candidate->device_private, candidate_dev->super.device_index, candidate_dev->super.name, gpu_elem->device_private, gpu_device->super.device_index, gpu_device->super.name, - nb_elts, (candidate_dev->super.type != gpu_device->super.type)? "H2D": "D2D"); + nb_elts, + (candidate_dev->super.type & gpu_device->super.type & PARSEC_DEV_ANY_TYPE)? "D2D": "H2D"); parsec_atomic_unlock( &original->lock ); assert(0); return PARSEC_HOOK_RETURN_ERROR; @@ -1868,15 +1869,26 @@ parsec_device_progress_stream( parsec_device_gpu_module_t* gpu_device, } } - grab_a_task: + grab_a_task: if( NULL == stream->tasks[stream->start] ) { /* there is room on the stream */ task = (parsec_gpu_task_t*)parsec_list_pop_front(stream->fifo_pending); /* get the best task */ } if( NULL == task ) { /* No tasks, we're done */ return PARSEC_HOOK_RETURN_DONE; } - PARSEC_LIST_ITEM_SINGLETON((parsec_list_item_t*)task); + /* Should we allow the tasks to be batched */ + if (PARSEC_GPU_TASK_TYPE_KERNEL == task->task_type ) { + if( PARSEC_DEV_CHORE_ALLOW_BATCH & task->ec->task_class->incarnations[0].type ) { + /* Don't singleton the task, allowing the kernel to extract the tasks it wants + * from the task ring, and singleton it or replace it with the aggregated tasks + * as necessary. 
+ */ + goto move_forward_with_this_task; + } + } + PARSEC_LIST_ITEM_SINGLETON((parsec_list_item_t *)task); + move_forward_with_this_task: assert( NULL == stream->tasks[stream->start] ); schedule_task: diff --git a/parsec/mca/device/level_zero/device_level_zero_component.c b/parsec/mca/device/level_zero/device_level_zero_component.c index 7fe35b6a9..a1b30285e 100644 --- a/parsec/mca/device/level_zero/device_level_zero_component.c +++ b/parsec/mca/device/level_zero/device_level_zero_component.c @@ -410,7 +410,7 @@ static int device_level_zero_component_close(void) /* Check that no LEVEL_ZERO devices are still registered with PaRSEC */ for(i = 0; i < parsec_mca_device_enabled(); i++) { if( NULL == (cdev = (parsec_device_level_zero_module_t*)parsec_mca_device_get(i)) ) continue; - if(PARSEC_DEV_LEVEL_ZERO != cdev->super.super.type) continue; + if( !(PARSEC_DEV_LEVEL_ZERO & cdev->super.super.type) ) continue; PARSEC_DEBUG_VERBOSE(0, parsec_gpu_output_stream, "GPU[%d] LEVEL_ZERO device still registered with PaRSEC at the end of LEVEL_ZERO finalize.\n" diff --git a/parsec/mca/device/template/device_template_module.c b/parsec/mca/device/template/device_template_module.c index 3ad780551..7d88529f4 100644 --- a/parsec/mca/device/template/device_template_module.c +++ b/parsec/mca/device/template/device_template_module.c @@ -78,7 +78,7 @@ parsec_template_taskpool_register(parsec_device_module_t* device, const parsec_task_class_t* tc = tp->task_classes_array[i]; __parsec_chore_t* chores = (__parsec_chore_t*)tc->incarnations; for( j = 0; NULL != chores[j].hook; j++ ) { - if( chores[j].type != device->type ) + if( chores[j].type & device->type ) continue; if( NULL != chores[j].dyld_fn ) { /* the function has been set for another device of the same type */ diff --git a/parsec/mca/pins/ptg_to_dtd/pins_ptg_to_dtd_module.c b/parsec/mca/pins/ptg_to_dtd/pins_ptg_to_dtd_module.c index 8e53cb653..e40afcfe2 100644 --- a/parsec/mca/pins/ptg_to_dtd/pins_ptg_to_dtd_module.c +++ b/parsec/mca/pins/ptg_to_dtd/pins_ptg_to_dtd_module.c @@ -86,7 +86,7 @@ copy_chores(parsec_taskpool_t *tp, parsec_dtd_taskpool_t *dtd_tp) parsec_hook_t **hook_not_const = (parsec_hook_t **)&(tp->task_classes_array[i]->incarnations[j].hook); /* saving the CPU hook only */ - if (tp->task_classes_array[i]->incarnations[j].type == PARSEC_DEV_CPU) { + if (tp->task_classes_array[i]->incarnations[j].type & PARSEC_DEV_CPU) { dtd_tp->actual_hook[i].hook = tp->task_classes_array[i]->incarnations[j].hook; } /* copying the fake hook in all the hooks (CPU, GPU etc) */ @@ -312,8 +312,8 @@ parsec_dtd_taskpool_insert_task_ptg_to_dtd( parsec_dtd_taskpool_t *dtd_tp, free(params); __parsec_chore_t *incarnations = (__parsec_chore_t *)tc->incarnations; - for(int i = 0; PARSEC_DEV_NONE != incarnations[i].type; i++ ) { - if( PARSEC_DEV_CPU == incarnations[i].type ) { + for(int i = 0; PARSEC_DEV_NONE != (incarnations[i].type & PARSEC_DEV_ANY_TYPE); i++ ) { + if( PARSEC_DEV_CPU & incarnations[i].type ) { incarnations[i] = dtd_chore_for_testing; } } diff --git a/tests/dsl/dtd/dtd_test_cuda_task_insert.c b/tests/dsl/dtd/dtd_test_cuda_task_insert.c index 9b689db73..9b7d9783d 100644 --- a/tests/dsl/dtd/dtd_test_cuda_task_insert.c +++ b/tests/dsl/dtd/dtd_test_cuda_task_insert.c @@ -622,7 +622,7 @@ int get_nb_cuda_devices() for( int dev = 0; dev < (int)parsec_nb_devices; dev++ ) { parsec_device_module_t *device = parsec_mca_device_get(dev); - if( PARSEC_DEV_CUDA == device->type ) { + if( PARSEC_DEV_CUDA & device->type ) { nb++; } } @@ -638,7 +638,7 @@ int 
*get_cuda_device_index() int i = 0; for( int dev = 0; dev < (int)parsec_nb_devices; dev++ ) { parsec_device_module_t *device = parsec_mca_device_get(dev); - if( PARSEC_DEV_CUDA == device->type ) { + if( PARSEC_DEV_CUDA & device->type ) { dev_index[i++] = device->device_index; } } diff --git a/tests/dsl/dtd/dtd_test_new_tile.c b/tests/dsl/dtd/dtd_test_new_tile.c index 36f6b99da..014b8ed81 100644 --- a/tests/dsl/dtd/dtd_test_new_tile.c +++ b/tests/dsl/dtd/dtd_test_new_tile.c @@ -277,7 +277,7 @@ int main(int argc, char **argv) #if defined(PARSEC_HAVE_DEV_CUDA_SUPPORT) && defined(PARSEC_HAVE_CU_COMPILER) for(unsigned int i = 0; i < parsec_nb_devices; i++) { parsec_device_module_t *dev = parsec_mca_device_get(i); - if( dev->type == PARSEC_DEV_CUDA ) + if( dev->type & PARSEC_DEV_CUDA ) nb_gpus++; } if(nb_gpus > 0) { @@ -286,7 +286,7 @@ int main(int argc, char **argv) nb_gpus = 0; for(unsigned int i = 0; i < parsec_nb_devices; i++) { parsec_device_module_t *dev = parsec_mca_device_get(i); - if( dev->type == PARSEC_DEV_CUDA) { + if( dev->type & PARSEC_DEV_CUDA) { cudaError_t status; parsec_device_cuda_module_t *gpu_device = (parsec_device_cuda_module_t *)dev; status = cudaSetDevice( gpu_device->cuda_index ); diff --git a/tests/dsl/dtd/dtd_test_simple_gemm.c b/tests/dsl/dtd/dtd_test_simple_gemm.c index 9ac71a079..f15aba062 100644 --- a/tests/dsl/dtd/dtd_test_simple_gemm.c +++ b/tests/dsl/dtd/dtd_test_simple_gemm.c @@ -327,7 +327,7 @@ int get_nb_gpu_devices() for( int dev = 0; dev < (int)parsec_nb_devices; dev++ ) { parsec_device_module_t *device = parsec_mca_device_get(dev); - if( PARSEC_DEV_CUDA == device->type ) { + if( PARSEC_DEV_CUDA & device->type ) { nb++; } } @@ -343,7 +343,7 @@ int *get_gpu_device_index() int i = 0; for( int dev = 0; dev < (int)parsec_nb_devices; dev++ ) { parsec_device_module_t *device = parsec_mca_device_get(dev); - if( PARSEC_DEV_CUDA == device->type ) { + if( PARSEC_DEV_CUDA & device->type ) { dev_index[i++] = device->device_index; } } diff --git a/tests/runtime/cuda/get_best_device_check.jdf b/tests/runtime/cuda/get_best_device_check.jdf index 65025d8e1..b3167c451 100644 --- a/tests/runtime/cuda/get_best_device_check.jdf +++ b/tests/runtime/cuda/get_best_device_check.jdf @@ -148,7 +148,7 @@ parsec_get_best_device_check_New(parsec_tiled_matrix_t *dcA, int *info) int nb = 0; for(int i = 0; i < (int)parsec_nb_devices; i++) { parsec_device_module_t *device = parsec_mca_device_get(i); - if( PARSEC_DEV_CUDA == device->type ) { + if( PARSEC_DEV_CUDA & device->type ) { nb++; } } @@ -160,7 +160,7 @@ parsec_get_best_device_check_New(parsec_tiled_matrix_t *dcA, int *info) nb = 0; for(int i = 0; i < (int)parsec_nb_devices; i++) { parsec_device_module_t *device = parsec_mca_device_get(i); - if( PARSEC_DEV_CUDA == device->type ) { + if( PARSEC_DEV_CUDA & device->type ) { dev_index[nb++] = device->device_index; } } diff --git a/tests/runtime/cuda/nvlink_wrapper.c b/tests/runtime/cuda/nvlink_wrapper.c index abc4b19c9..3b95a4d80 100644 --- a/tests/runtime/cuda/nvlink_wrapper.c +++ b/tests/runtime/cuda/nvlink_wrapper.c @@ -72,7 +72,7 @@ __parsec_nvlink_destructor( parsec_nvlink_taskpool_t* nvlink_taskpool) userM = nvlink_taskpool->_g_userM; for(g = 0, dev = 0; dev < (int)parsec_nb_devices; dev++) { parsec_device_cuda_module_t *cuda_device = (parsec_device_cuda_module_t*)parsec_mca_device_get(dev); - if( PARSEC_DEV_CUDA == cuda_device->super.super.type ) { + if( PARSEC_DEV_CUDA & cuda_device->super.super.type ) { parsec_data_t *dta = ((parsec_dc_t*)userM)->data_of((parsec_dc_t*)userM, g, 
userM->super.super.myrank); parsec_data_copy_t *gpu_copy = parsec_data_get_copy(dta, cuda_device->super.super.device_index); cudaError_t status = cudaSetDevice( cuda_device->cuda_index ); @@ -108,7 +108,7 @@ parsec_taskpool_t* testing_nvlink_New( parsec_context_t *ctx, int depth, int mb nb = 0; for(dev = 0; dev < (int)parsec_nb_devices; dev++) { parsec_device_module_t *device = parsec_mca_device_get(dev); - if( PARSEC_DEV_CUDA == device->type ) { + if( PARSEC_DEV_CUDA & device->type ) { dev_index[nb++] = device->device_index; } } @@ -168,7 +168,7 @@ parsec_taskpool_t* testing_nvlink_New( parsec_context_t *ctx, int depth, int mb * in the JDF, this also pins the task on the GPU that we chose to host the tile */ for(int g = 0, dev = 0; dev < (int)parsec_nb_devices; dev++) { parsec_device_cuda_module_t *cuda_device = (parsec_device_cuda_module_t*)parsec_mca_device_get(dev); - if( PARSEC_DEV_CUDA == cuda_device->super.super.type ) { + if( PARSEC_DEV_CUDA & cuda_device->super.super.type ) { /* We get the data from the data collection */ parsec_data_t *dta = ((parsec_dc_t*)userM)->data_of((parsec_dc_t*)userM, g, ctx->my_rank); /* The corresponding data copy on CPU RAM */ diff --git a/tests/runtime/cuda/stress_wrapper.c b/tests/runtime/cuda/stress_wrapper.c index ee40903e0..c688fcda4 100644 --- a/tests/runtime/cuda/stress_wrapper.c +++ b/tests/runtime/cuda/stress_wrapper.c @@ -41,7 +41,7 @@ parsec_taskpool_t* testing_stress_New( parsec_context_t *ctx, int depth, int mb nb = 0; for(dev = 0; dev < (int)parsec_nb_devices; dev++) { parsec_device_module_t *device = parsec_mca_device_get(dev); - if( PARSEC_DEV_CUDA == device->type ) { + if( PARSEC_DEV_CUDA & device->type ) { dev_index[nb++] = device->device_index; } } From b52a683832e0f2519f6061c81967c0e6cf5c4b57 Mon Sep 17 00:00:00 2001 From: George Bosilca Date: Fri, 26 Jul 2024 00:48:27 -0700 Subject: [PATCH 02/14] A small example pf batching. 
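
In outline (condensed from the stage_custom.jdf changes below; the handling of un-batchable tasks and error checking are elided), a BODY opts in with the batch property, trims the ring of ready gpu_tasks it was handed, registers the completion callback, and issues one kernel per batched task:

    BODY [type=CUDA
          batch = true            /* the runtime hands over the whole ready ring */
          dyld=cublasDgemm dyldtype=cublas_dgemm_t]
    {
        /* trim the ring: keep up to 5 tasks of the same task class attached to
         * gpu_task, leave the rest on the stream's pending list */
        gpu_task->complete_stage = complete_batched_callback;
        parsec_gpu_task_t *current = gpu_task;
        do {
            /* issue one cublasDgemm per batched task on parsec_body.stream */
            current = (parsec_gpu_task_t*)current->list_item.list_next;
        } while( current != gpu_task );
    }
    END
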
Signed-off-by: George Bosilca --- tests/runtime/cuda/stage_custom.jdf | 68 +++++++++++++++++++++++++---- 1 file changed, 59 insertions(+), 9 deletions(-) diff --git a/tests/runtime/cuda/stage_custom.jdf b/tests/runtime/cuda/stage_custom.jdf index 7df99800f..6ff3e2467 100644 --- a/tests/runtime/cuda/stage_custom.jdf +++ b/tests/runtime/cuda/stage_custom.jdf @@ -45,7 +45,7 @@ stage_stride_in(parsec_gpu_task_t *gtask, dc = (parsec_tiled_matrix_t*)gtask->flow_dc[i]; elem_sz = parsec_datadist_getsizeoftype(dc->mtype); in_elem_dev = (parsec_device_gpu_module_t*)parsec_mca_device_get( copy_in->device_index); - if(in_elem_dev->super.type != PARSEC_DEV_CUDA ){ + if( !(in_elem_dev->super.type & PARSEC_DEV_CUDA) ) { /* copy width bytes heigth times, skipping pitch - width bytes every time */ size_t dpitch = dc->mb * elem_sz; size_t spitch = dc->llm * elem_sz; @@ -117,6 +117,18 @@ typedef void (*cublas_dgemm_t) ( char TRANSA, char TRANSB, int m, int n, int k, /* Pre-declare function used as a property of some parameterized task */ static int64_t gemm_time_estimate(const parsec_task_t *task, parsec_device_module_t *dev); +static int +complete_batched_callback(parsec_device_gpu_module_t *dev, + parsec_gpu_task_t ** gpu_task, + parsec_gpu_exec_stream_t *gpu_stream) +{ + printf("complete_batched_callback\n"); + (void)dev; (void) gpu_task; (void)gpu_stream; + parsec_list_item_t* output_stream_ghost = &dev->exec_stream[1]->fifo_pending->ghost_element; + parsec_list_item_ring_merge(output_stream_ghost, &(*gpu_task)->list_item); + *gpu_task = NULL; + return PARSEC_HOOK_RETURN_DONE; +} %} %option no_taskpool_instance = true /* can be anything */ @@ -146,8 +158,38 @@ RW A <- descA(m, k) -> A TASK_CHECK(m,k) BODY [type=CUDA + batch = true dyld=cublasDgemm dyldtype=cublas_dgemm_t] { + parsec_list_item_t* ghost_elem = (parsec_list_item_t*)gpu_task->list_item.list_prev; + if( ghost_elem != gpu_task->list_item.list_next ) { /* more than one gpu_task in the ring */ + printf("We got a list of tasks!!! Let's batch them together (up to 5)\n\n"); + parsec_list_item_t* ring = parsec_list_item_ring_chop((parsec_list_item_t*)gpu_task); + parsec_gpu_task_t* task = (parsec_gpu_task_t*)ring; + int how_many = 1; /* we start with the current gpu_task */ + do { + if( gpu_task->ec->task_class == task->ec->task_class ) { + /* same task class as the current one, possible to batch */ + ring = parsec_list_item_ring_chop((parsec_list_item_t*)task); + (void)parsec_list_item_ring_push(&gpu_task->list_item, (parsec_list_item_t*)task); + how_many++; /* one more into the batch */ + if( (5 == how_many) || (NULL == ring) ) { + /* let's stop here for now */ + break; + } + } else { + ring = (parsec_list_item_t*)ring->list_next; /* more to the next task */ + } + task = (parsec_gpu_task_t*)ring; + } while(ring != ghost_elem); + /* we now have two separated task rings: the gpu_task with all the tasks that will be batched + * and the ring that has all the remaining items in the list (including the list's ghost_elem). + * The remaining list is already stored in the gpu_stream->fifo_pending. 
+ */ + gpu_task->complete_stage = complete_batched_callback; + } else { + parsec_list_item_singleton(&gpu_task->list_item); + } double lalpha = 1.0; double lbeta = 2.0; int tempmm = descA->mb; @@ -155,14 +197,22 @@ BODY [type=CUDA cublasStatus_t status; cublasSetKernelStream( parsec_body.stream ); - parsec_body.dyld_fn( 'N', 'N', - tempmm, tempmm, tempmm, - lalpha, (double*)A, ldam, - (double*)A, ldam, - lbeta, (double*)A, ldam ); - status = cublasGetError(); - PARSEC_CUDA_CHECK_ERROR( "cublasDgemm", status, - {return PARSEC_HOOK_RETURN_ERROR;} ); + + parsec_gpu_task_t* current_gpu_task = gpu_task; + do { + __parsec_stage_custom_TASK_GPU_task_t *task = (__parsec_stage_custom_TASK_GPU_task_t*)current_gpu_task->ec; + _f_A = task->data._f_A.data_out; + A = PARSEC_DATA_COPY_GET_PTR(_f_A); + parsec_body.dyld_fn( 'N', 'N', + tempmm, tempmm, tempmm, + lalpha, (double*)A, ldam, + (double*)A, ldam, + lbeta, (double*)A, ldam ); + status = cublasGetError(); + PARSEC_CUDA_CHECK_ERROR( "cublasDgemm", status, + {return PARSEC_HOOK_RETURN_ERROR;} ); + current_gpu_task = (parsec_gpu_task_t*)current_gpu_task->list_item.list_next; + } while( current_gpu_task != gpu_task ); } END From eebe8c502c835bc67a3fb5f054e0b7972b31313a Mon Sep 17 00:00:00 2001 From: George Bosilca Date: Fri, 26 Jul 2024 01:44:44 -0700 Subject: [PATCH 03/14] Few fixes to the batch list manipulations. Signed-off-by: George Bosilca --- parsec/mca/device/device_gpu.c | 1 + tests/runtime/cuda/stage_custom.jdf | 31 ++++++++++++++++++----------- 2 files changed, 20 insertions(+), 12 deletions(-) diff --git a/parsec/mca/device/device_gpu.c b/parsec/mca/device/device_gpu.c index d44a3ed2b..01f538bdb 100644 --- a/parsec/mca/device/device_gpu.c +++ b/parsec/mca/device/device_gpu.c @@ -1834,6 +1834,7 @@ parsec_device_progress_stream( parsec_device_gpu_module_t* gpu_device, if( 1 == rc ) { /* Save the task for the next step */ task = *out_task = stream->tasks[stream->end]; + printf("Complete task %p on stream %s{%p}\n", (void*)task, stream->name, (void*)stream); PARSEC_DEBUG_VERBOSE(19, parsec_gpu_output_stream, "GPU[%d:%s]: Completed %s on stream %s{%p}", gpu_device->super.device_index, gpu_device->super.name, diff --git a/tests/runtime/cuda/stage_custom.jdf b/tests/runtime/cuda/stage_custom.jdf index 6ff3e2467..027d09b5c 100644 --- a/tests/runtime/cuda/stage_custom.jdf +++ b/tests/runtime/cuda/stage_custom.jdf @@ -161,34 +161,41 @@ BODY [type=CUDA batch = true dyld=cublasDgemm dyldtype=cublas_dgemm_t] { - parsec_list_item_t* ghost_elem = (parsec_list_item_t*)gpu_task->list_item.list_prev; - if( ghost_elem != gpu_task->list_item.list_next ) { /* more than one gpu_task in the ring */ - printf("We got a list of tasks!!! 
Let's batch them together (up to 5)\n\n"); - parsec_list_item_t* ring = parsec_list_item_ring_chop((parsec_list_item_t*)gpu_task); - parsec_gpu_task_t* task = (parsec_gpu_task_t*)ring; + parsec_list_item_singleton(&gpu_task->list_item); + parsec_list_item_t* store_back = NULL; + if( !parsec_list_nolock_is_empty(gpu_stream->fifo_pending) ) { /* more than one gpu_task in the ring */ int how_many = 1; /* we start with the current gpu_task */ do { + parsec_list_item_t* item = parsec_list_pop_front(gpu_stream->fifo_pending); + parsec_list_item_singleton(item); + parsec_gpu_task_t* task = (parsec_gpu_task_t*)item; + if( gpu_task->ec->task_class == task->ec->task_class ) { /* same task class as the current one, possible to batch */ - ring = parsec_list_item_ring_chop((parsec_list_item_t*)task); (void)parsec_list_item_ring_push(&gpu_task->list_item, (parsec_list_item_t*)task); how_many++; /* one more into the batch */ - if( (5 == how_many) || (NULL == ring) ) { + printf("Add task %p to the %p batch\n", task, gpu_task); + if( 5 == how_many ) { /* let's stop here for now */ break; } } else { - ring = (parsec_list_item_t*)ring->list_next; /* more to the next task */ + if( NULL == store_back ) { + store_back = item; + } else { + parsec_list_item_ring_push(store_back, item); /* build the list of un-batcheable tasks */ + } } - task = (parsec_gpu_task_t*)ring; - } while(ring != ghost_elem); + } while( !parsec_list_nolock_is_empty(gpu_stream->fifo_pending) ); /* we now have two separated task rings: the gpu_task with all the tasks that will be batched * and the ring that has all the remaining items in the list (including the list's ghost_elem). * The remaining list is already stored in the gpu_stream->fifo_pending. */ gpu_task->complete_stage = complete_batched_callback; - } else { - parsec_list_item_singleton(&gpu_task->list_item); + printf("submit multiple tasks into one %p on stream %s{%p}\n", gpu_task, gpu_stream->name, (void*)gpu_stream); + if( NULL != store_back ) { + parsec_list_push_back(gpu_stream->fifo_pending, store_back); + } } double lalpha = 1.0; double lbeta = 2.0; From 6183645cb703f37a169ab481d62f02e56ffdd38c Mon Sep 17 00:00:00 2001 From: George Bosilca Date: Fri, 26 Jul 2024 08:44:42 -0700 Subject: [PATCH 04/14] Working state. The issue was that I forgot to clean the complete_stage after the callback, so it got called multiple times during the different completion stages of the task (completion of the execution and then later completion of the d2h transfers). 
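
The resulting pattern, condensed from the stage_custom.jdf callback changed below (only the lines relevant to the fix are shown):

    static int
    complete_batched_callback(parsec_device_gpu_module_t *dev,
                              parsec_gpu_task_t **gpu_task,
                              parsec_gpu_exec_stream_t *gpu_stream)
    {
        /* ... hand the ring of batched tasks over to the output (d2h) stream ... */
        (*gpu_task)->complete_stage = NULL;  /* run once: the later completion
                                              * stages of the same task (e.g. the
                                              * d2h transfers) must not re-invoke
                                              * this callback */
        *gpu_task = NULL;
        return PARSEC_HOOK_RETURN_DONE;
    }
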
Signed-off-by: George Bosilca --- parsec/mca/device/device_gpu.c | 1 - tests/runtime/cuda/stage_custom.jdf | 10 +++++++--- 2 files changed, 7 insertions(+), 4 deletions(-) diff --git a/parsec/mca/device/device_gpu.c b/parsec/mca/device/device_gpu.c index 01f538bdb..d44a3ed2b 100644 --- a/parsec/mca/device/device_gpu.c +++ b/parsec/mca/device/device_gpu.c @@ -1834,7 +1834,6 @@ parsec_device_progress_stream( parsec_device_gpu_module_t* gpu_device, if( 1 == rc ) { /* Save the task for the next step */ task = *out_task = stream->tasks[stream->end]; - printf("Complete task %p on stream %s{%p}\n", (void*)task, stream->name, (void*)stream); PARSEC_DEBUG_VERBOSE(19, parsec_gpu_output_stream, "GPU[%d:%s]: Completed %s on stream %s{%p}", gpu_device->super.device_index, gpu_device->super.name, diff --git a/tests/runtime/cuda/stage_custom.jdf b/tests/runtime/cuda/stage_custom.jdf index 027d09b5c..1a4b704ba 100644 --- a/tests/runtime/cuda/stage_custom.jdf +++ b/tests/runtime/cuda/stage_custom.jdf @@ -122,10 +122,12 @@ complete_batched_callback(parsec_device_gpu_module_t *dev, parsec_gpu_task_t ** gpu_task, parsec_gpu_exec_stream_t *gpu_stream) { - printf("complete_batched_callback\n"); + PARSEC_DEBUG_VERBOSE((10, parsec_debug_output, "complete_batched_callback for batched task %p on stream %s{%p}\n", + gpu_task, gpu_stream->name, (void*)gpu_stream)); (void)dev; (void) gpu_task; (void)gpu_stream; parsec_list_item_t* output_stream_ghost = &dev->exec_stream[1]->fifo_pending->ghost_element; parsec_list_item_ring_merge(output_stream_ghost, &(*gpu_task)->list_item); + (*gpu_task)->complete_stage = NULL; *gpu_task = NULL; return PARSEC_HOOK_RETURN_DONE; } @@ -174,7 +176,8 @@ BODY [type=CUDA /* same task class as the current one, possible to batch */ (void)parsec_list_item_ring_push(&gpu_task->list_item, (parsec_list_item_t*)task); how_many++; /* one more into the batch */ - printf("Add task %p to the %p batch\n", task, gpu_task); + PARSEC_DEBUG_VERBOSE((10, parsec_debug_output, "Add task %p to the %p batch on stream %s{%p}\n", + task, gpu_task, gpu_stream->name, (void*)gpu_stream)); if( 5 == how_many ) { /* let's stop here for now */ break; @@ -192,7 +195,8 @@ BODY [type=CUDA * The remaining list is already stored in the gpu_stream->fifo_pending. */ gpu_task->complete_stage = complete_batched_callback; - printf("submit multiple tasks into one %p on stream %s{%p}\n", gpu_task, gpu_stream->name, (void*)gpu_stream); + PARSEC_DEBUG_VERBOSE((10, parsec_debug_output, "submit multiple tasks into one %p on stream %s{%p}\n", + gpu_task, gpu_stream->name, (void*)gpu_stream)); if( NULL != store_back ) { parsec_list_push_back(gpu_stream->fifo_pending, store_back); } From c403e176538604e6c5cab67fb83b161becde98a0 Mon Sep 17 00:00:00 2001 From: George Bosilca Date: Sat, 27 Jul 2024 21:31:46 -0700 Subject: [PATCH 05/14] Fix use of calloc. The count is first, then the sizeof. 
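
For example (one of the call sites fixed below, from parsec/class/info.c):

    /* calloc(count, size): the element count comes first, the element size second */
    oa->info_objects = calloc(oa->known_infos, sizeof(void*));
    /* was: calloc(sizeof(void*), oa->known_infos) */

Both orders allocate and zero the same number of bytes here, but the swapped form reads backwards and is flagged by newer compilers (e.g. GCC 14's -Wcalloc-transposed-args).
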
Signed-off-by: George Bosilca --- parsec/class/info.c | 4 ++-- parsec/data_dist/matrix/map_operator.c | 6 +++--- parsec/data_dist/matrix/vector_two_dim_cyclic.c | 2 +- parsec/interfaces/dtd/insert_function.c | 5 +++-- parsec/interfaces/ptg/ptg-compiler/jdf2c.c | 6 +++--- parsec/maxheap.c | 4 ++-- parsec/mca/device/device.c | 2 +- parsec/mca/sched/lfq/sched_lfq_module.c | 2 +- parsec/mca/sched/lhq/sched_lhq_module.c | 2 +- parsec/mca/sched/ltq/sched_ltq_module.c | 2 +- parsec/mca/sched/pbq/sched_pbq_module.c | 2 +- parsec/parsec_internal.h | 3 +-- tests/class/atomics.c | 6 +++--- tests/class/future.c | 2 +- tests/class/future_datacopy.c | 2 +- tests/class/hash.c | 6 +++--- tests/class/lifo.c | 4 ++-- tests/class/list.c | 4 ++-- tests/class/rwlock.c | 2 +- tests/collections/redistribute/redistribute_check2.jdf | 2 +- tests/collections/reshape/common.c | 2 +- tests/dsl/dtd/dtd_test_new_tile.c | 2 +- tests/dsl/ptg/branching/branching_data.c | 2 +- tests/dsl/ptg/choice/main.c | 2 +- tests/profiling/async.jdf | 2 +- tools/profiling/dbpreader.c | 2 +- 26 files changed, 40 insertions(+), 40 deletions(-) diff --git a/parsec/class/info.c b/parsec/class/info.c index e9c45cb51..640c9d591 100644 --- a/parsec/class/info.c +++ b/parsec/class/info.c @@ -217,7 +217,7 @@ void parsec_info_object_array_init(parsec_info_object_array_t *oa, parsec_info_t if(oa->known_infos == 0) oa->info_objects = NULL; else - oa->info_objects = calloc(sizeof(void*), oa->known_infos); + oa->info_objects = calloc(oa->known_infos, sizeof(void*)); oa->infos = nfo; oa->cons_obj = cons_obj; } @@ -265,7 +265,7 @@ static void parsec_ioa_resize_and_rdlock(parsec_info_object_array_t *oa, parsec_ oa->info_objects = realloc(oa->info_objects, sizeof(void *) * ns); memset(&oa->info_objects[oa->known_infos - 1], 0, ns - oa->known_infos); } else { - oa->info_objects = calloc(sizeof(void*), ns); + oa->info_objects = calloc(ns, sizeof(void*)); } oa->known_infos = ns; } diff --git a/parsec/data_dist/matrix/map_operator.c b/parsec/data_dist/matrix/map_operator.c index 1d6af9196..280bb614c 100644 --- a/parsec/data_dist/matrix/map_operator.c +++ b/parsec/data_dist/matrix/map_operator.c @@ -106,7 +106,7 @@ static const parsec_symbol_t symb_column = { .flags = PARSEC_SYMBOL_IS_STANDALONE }; -static inline int affinity_of_map_operator(parsec_task_t *this_task, +static inline int affinity_of_map_operator(const parsec_task_t *this_task, parsec_data_ref_t *ref) { const parsec_map_operator_taskpool_t *__tp = (const parsec_map_operator_taskpool_t*)this_task->taskpool; @@ -117,7 +117,7 @@ static inline int affinity_of_map_operator(parsec_task_t *this_task, return 1; } -static inline int initial_data_of_map_operator(parsec_task_t *this_task, +static inline int initial_data_of_map_operator(const parsec_task_t *this_task, parsec_data_ref_t *refs) { int __flow_nb = 0; @@ -134,7 +134,7 @@ static inline int initial_data_of_map_operator(parsec_task_t *this_task, return __flow_nb; } -static inline int final_data_of_map_operator(parsec_task_t *this_task, +static inline int final_data_of_map_operator(const parsec_task_t *this_task, parsec_data_ref_t *data_refs) { int __flow_nb = 0; diff --git a/parsec/data_dist/matrix/vector_two_dim_cyclic.c b/parsec/data_dist/matrix/vector_two_dim_cyclic.c index 1c0b70a74..838f02f25 100644 --- a/parsec/data_dist/matrix/vector_two_dim_cyclic.c +++ b/parsec/data_dist/matrix/vector_two_dim_cyclic.c @@ -251,7 +251,7 @@ static parsec_data_t* vector_twoDBC_data_of(parsec_data_collection_t *desc, ...) 
#endif /* Compute the local tile row */ - assert( dc->super.bsiz == dc->super.mb ); + assert( dc->super.bsiz == (size_t)dc->super.mb ); local_m = m / dc->lcm; diff --git a/parsec/interfaces/dtd/insert_function.c b/parsec/interfaces/dtd/insert_function.c index 91543fdfd..04ca48c3d 100644 --- a/parsec/interfaces/dtd/insert_function.c +++ b/parsec/interfaces/dtd/insert_function.c @@ -410,7 +410,7 @@ parsec_dtd_taskpool_destructor(parsec_dtd_taskpool_t *tp) free((void *)tp->super.profiling_array); #endif /* defined(PARSEC_PROF_TRACE) */ - if( tp->super.taskpool_name != NULL ) { + if( NULL != tp->super.taskpool_name) { free(tp->super.taskpool_name); tp->super.taskpool_name = NULL; } @@ -2303,6 +2303,7 @@ static parsec_hook_return_t parsec_dtd_gpu_task_submit(parsec_execution_stream_t return device->kernel_scheduler(device, es, gpu_task); #else parsec_warning("DTD: Selected best device is a GPU, but no GPU is supported at compile time. Falling back to CPU"); + (void)this_task; return PARSEC_HOOK_RETURN_NEXT; #endif } @@ -3464,7 +3465,7 @@ parsec_arena_datatype_t *parsec_dtd_create_arena_datatype(parsec_context_t *ctx, if(NULL != new_adt) return NULL; #endif - new_adt = calloc(sizeof(parsec_arena_datatype_t), 1); + new_adt = calloc(1, sizeof(parsec_arena_datatype_t)); if(NULL == new_adt) return NULL; new_adt->ht_item.key = my_id; diff --git a/parsec/interfaces/ptg/ptg-compiler/jdf2c.c b/parsec/interfaces/ptg/ptg-compiler/jdf2c.c index 3730303d3..8d85fe6e5 100644 --- a/parsec/interfaces/ptg/ptg-compiler/jdf2c.c +++ b/parsec/interfaces/ptg/ptg-compiler/jdf2c.c @@ -2044,7 +2044,7 @@ static void jdf_generate_affinity( const jdf_t *jdf, const jdf_function_entry_t assert( NULL == data_affinity->var ); } - coutput("static inline int %s(%s *this_task,\n" + coutput("static inline int %s(const %s *this_task,\n" " parsec_data_ref_t *ref)\n" "{\n" " const __parsec_%s_internal_taskpool_t *__parsec_tp = (const __parsec_%s_internal_taskpool_t*)this_task->taskpool;\n", @@ -8428,14 +8428,14 @@ int jdf_force_termdet_dynamic(jdf_t* jdf) return rc; } - termdet_expr = (jdf_expr_t *)calloc(sizeof(jdf_expr_t), 1); + termdet_expr = (jdf_expr_t *)calloc(1, sizeof(jdf_expr_t)); termdet_expr->op = JDF_STRING; termdet_expr->local_variables = NULL; termdet_expr->scope = -1; termdet_expr->alias = NULL; termdet_expr->jdf_var = strdup(JDF_PROP_TERMDET_DYNAMIC); - property = (jdf_def_list_t*)calloc(sizeof(jdf_def_list_t), 1); + property = (jdf_def_list_t*)calloc(1, sizeof(jdf_def_list_t)); property->expr = termdet_expr; property->name = strdup(JDF_PROP_TERMDET_NAME); property->next = jdf->global_properties; diff --git a/parsec/maxheap.c b/parsec/maxheap.c index 46d2cf14a..d6dc7b3b4 100644 --- a/parsec/maxheap.c +++ b/parsec/maxheap.c @@ -25,7 +25,7 @@ static inline int hiBit(unsigned int n) parsec_heap_t* heap_create(void) { - parsec_heap_t* heap = calloc(sizeof(parsec_heap_t), 1); + parsec_heap_t* heap = calloc(1, sizeof(parsec_heap_t)); /* Point back to the parent structure */ heap->list_item.list_next = (parsec_list_item_t*)heap; heap->list_item.list_prev = (parsec_list_item_t*)heap; @@ -67,7 +67,7 @@ void heap_insert(parsec_heap_t * heap, parsec_task_t * elem) } parents_size = level_counter; - parsec_task_t ** parents = calloc(sizeof(parsec_task_t *), level_counter); + parsec_task_t ** parents = calloc(level_counter, sizeof(parsec_task_t *)); // now the bitmask is two places farther than we want it, so back down bitmask = bitmask >> 2; diff --git a/parsec/mca/device/device.c b/parsec/mca/device/device.c index 
d1d42dbf7..25436f26e 100644 --- a/parsec/mca/device/device.c +++ b/parsec/mca/device/device.c @@ -408,7 +408,7 @@ void parsec_compute_best_unit( uint64_t length, float* updated_value, char** bes void parsec_devices_save_statistics(uint64_t **pstats) { if(NULL == *pstats) { - *pstats = (uint64_t*)calloc(sizeof(uint64_t), parsec_nb_devices * 6 /* see below for the number of arrays */); + *pstats = (uint64_t*)calloc(parsec_nb_devices * 6 /* see below for the number of arrays */, sizeof(uint64_t)); } else { memset(*pstats, 0, parsec_nb_devices * sizeof(uint64_t) * 6); diff --git a/parsec/mca/sched/lfq/sched_lfq_module.c b/parsec/mca/sched/lfq/sched_lfq_module.c index 721146ec2..ce20ed9dc 100644 --- a/parsec/mca/sched/lfq/sched_lfq_module.c +++ b/parsec/mca/sched/lfq/sched_lfq_module.c @@ -64,7 +64,7 @@ static int flow_lfq_init(parsec_execution_stream_t* es, struct parsec_barrier_t* vp = es->virtual_process; /* Every flow creates its own local object */ - sched_obj = (parsec_mca_sched_local_queues_scheduler_object_t*)calloc(sizeof(parsec_mca_sched_local_queues_scheduler_object_t), 1); + sched_obj = (parsec_mca_sched_local_queues_scheduler_object_t*)calloc(1, sizeof(parsec_mca_sched_local_queues_scheduler_object_t)); es->scheduler_object = sched_obj; if( 0 == es->th_id ) { /* And flow 0 creates the system_queue */ sched_obj->system_queue = PARSEC_OBJ_NEW(parsec_dequeue_t); diff --git a/parsec/mca/sched/lhq/sched_lhq_module.c b/parsec/mca/sched/lhq/sched_lhq_module.c index 6d9ec2424..e21bd81bd 100644 --- a/parsec/mca/sched/lhq/sched_lhq_module.c +++ b/parsec/mca/sched/lhq/sched_lhq_module.c @@ -70,7 +70,7 @@ static int flow_lhq_init(parsec_execution_stream_t* ces, struct parsec_barrier_t /* First of all, we allocate the scheduling object memory for all threads */ es = vp->execution_streams[t]; - sched_obj = (parsec_mca_sched_local_queues_scheduler_object_t*)calloc(sizeof(parsec_mca_sched_local_queues_scheduler_object_t), 1); + sched_obj = (parsec_mca_sched_local_queues_scheduler_object_t*)calloc(1, sizeof(parsec_mca_sched_local_queues_scheduler_object_t)); es->scheduler_object = sched_obj; if( es->th_id == 0 ) { diff --git a/parsec/mca/sched/ltq/sched_ltq_module.c b/parsec/mca/sched/ltq/sched_ltq_module.c index f1bf081b5..81c009e76 100644 --- a/parsec/mca/sched/ltq/sched_ltq_module.c +++ b/parsec/mca/sched/ltq/sched_ltq_module.c @@ -63,7 +63,7 @@ static int flow_ltq_init(parsec_execution_stream_t* es, struct parsec_barrier_t* uint32_t queue_size; parsec_vp_t * vp = es->virtual_process; - sched_obj = (parsec_mca_sched_local_queues_scheduler_object_t*)calloc(sizeof(parsec_mca_sched_local_queues_scheduler_object_t), 1); + sched_obj = (parsec_mca_sched_local_queues_scheduler_object_t*)calloc(1, sizeof(parsec_mca_sched_local_queues_scheduler_object_t)); es->scheduler_object = sched_obj; if( es->th_id == 0 ) { diff --git a/parsec/mca/sched/pbq/sched_pbq_module.c b/parsec/mca/sched/pbq/sched_pbq_module.c index e604e8dc0..2c3b94865 100644 --- a/parsec/mca/sched/pbq/sched_pbq_module.c +++ b/parsec/mca/sched/pbq/sched_pbq_module.c @@ -61,7 +61,7 @@ static int flow_pbq_init(parsec_execution_stream_t* es, struct parsec_barrier_t* uint32_t queue_size = 0; /* Every flow creates its own local object */ - sched_obj = (parsec_mca_sched_local_queues_scheduler_object_t*)calloc(sizeof(parsec_mca_sched_local_queues_scheduler_object_t), 1); + sched_obj = (parsec_mca_sched_local_queues_scheduler_object_t*)calloc(1, sizeof(parsec_mca_sched_local_queues_scheduler_object_t)); es->scheduler_object = sched_obj; if( 
es->th_id == 0 ) { /* And flow 0 creates the system_queue */ sched_obj->system_queue = PARSEC_OBJ_NEW(parsec_dequeue_t); diff --git a/parsec/parsec_internal.h b/parsec/parsec_internal.h index 970587260..90bcafbca 100644 --- a/parsec/parsec_internal.h +++ b/parsec/parsec_internal.h @@ -337,8 +337,7 @@ typedef struct parsec_data_ref_s { parsec_data_key_t key; } parsec_data_ref_t; -typedef int (parsec_data_ref_fn_t)(parsec_task_t *task, - parsec_data_ref_t *ref); +typedef int (parsec_data_ref_fn_t)(const parsec_task_t *task, parsec_data_ref_t *ref); #define PARSEC_HAS_IN_IN_DEPENDENCIES 0x0001 #define PARSEC_HAS_OUT_OUT_DEPENDENCIES 0x0002 diff --git a/tests/class/atomics.c b/tests/class/atomics.c index fc7f0ecbb..74c7e0695 100644 --- a/tests/class/atomics.c +++ b/tests/class/atomics.c @@ -242,9 +242,9 @@ int main(int argc, char *argv[]) } } - threads = calloc(sizeof(pthread_t), nb_tests); - params = calloc(sizeof(param_t), nb_tests); - values = calloc(sizeof(values_t), nb_tests+2); + threads = calloc(nb_tests, sizeof(pthread_t)); + params = calloc(nb_tests, sizeof(param_t)); + values = calloc(nb_tests+2, sizeof(values_t)); gettimeofday(&now, NULL); srand48(now.tv_usec ^ getpid()); diff --git a/tests/class/future.c b/tests/class/future.c index ff3ee4f05..887ca9e14 100644 --- a/tests/class/future.c +++ b/tests/class/future.c @@ -119,7 +119,7 @@ int main(int argc, char* argv[]) c_fut = PARSEC_OBJ_NEW(parsec_countable_future_t); parsec_future_init(c_fut, NULL, cores); printf("running with %d cores and %d copies\n", cores, ncopy); - threads = calloc(sizeof(pthread_t), cores); + threads = calloc(cores, sizeof(pthread_t)); fut_array = malloc(cores*ncopy*sizeof(parsec_base_future_t*)); data = malloc(cores*ncopy*sizeof(int)); diff --git a/tests/class/future_datacopy.c b/tests/class/future_datacopy.c index effad1fe7..28af1262d 100644 --- a/tests/class/future_datacopy.c +++ b/tests/class/future_datacopy.c @@ -162,7 +162,7 @@ int main(int argc, char* argv[]) } printf("running with %d cores and %d copies\n", cores, ncopy); - threads = calloc(sizeof(pthread_t), cores); + threads = calloc(cores, sizeof(pthread_t)); fut_array = malloc(ncopy*sizeof(parsec_datacopy_future_t*)); data = malloc(cores*ncopy*sizeof(int)); diff --git a/tests/class/hash.c b/tests/class/hash.c index 850713ee7..8fe64e466 100644 --- a/tests/class/hash.c +++ b/tests/class/hash.c @@ -597,9 +597,9 @@ int main(int argc, char *argv[]) } } - threads = calloc(sizeof(pthread_t), maxthreads); - params = calloc(sizeof(param_t), maxthreads+1); - keys = calloc(sizeof(uint64_t), nb_tests); + threads = calloc(maxthreads, sizeof(pthread_t)); + params = calloc(maxthreads+1, sizeof(param_t)); + keys = calloc(nb_tests, sizeof(uint64_t)); init_keys(keys, nb_tests, seed, structured_keys); for(md_tuning = md_tuning_min; md_tuning < md_tuning_max; md_tuning += md_tuning_inc) { diff --git a/tests/class/lifo.c b/tests/class/lifo.c index 907d215d7..544d679cf 100644 --- a/tests/class/lifo.c +++ b/tests/class/lifo.c @@ -254,8 +254,8 @@ int main(int argc, char *argv[]) } } - threads = (pthread_t*)calloc(sizeof(pthread_t), nbthreads); - times = (uint64_t*)calloc(sizeof(uint64_t), nbthreads); + threads = (pthread_t*)calloc(nbthreads, sizeof(pthread_t)); + times = (uint64_t*)calloc(nbthreads, sizeof(uint64_t)); PARSEC_OBJ_CONSTRUCT(&lifo1, parsec_lifo_t); PARSEC_OBJ_CONSTRUCT(&lifo2, parsec_lifo_t); diff --git a/tests/class/list.c b/tests/class/list.c index d85a728bb..6451eb5ee 100644 --- a/tests/class/list.c +++ b/tests/class/list.c @@ -299,8 +299,8 @@ int main(int 
argc, char *argv[]) } } - threads = (pthread_t*)calloc(sizeof(pthread_t), nbthreads); - times = (uint64_t*)calloc(sizeof(uint64_t), nbthreads); + threads = (pthread_t*)calloc(nbthreads, sizeof(pthread_t)); + times = (uint64_t*)calloc(nbthreads, sizeof(uint64_t)); PARSEC_OBJ_CONSTRUCT( &l1, parsec_list_t ); PARSEC_OBJ_CONSTRUCT( &l2, parsec_list_t ); diff --git a/tests/class/rwlock.c b/tests/class/rwlock.c index d16a221ec..d0a06f431 100644 --- a/tests/class/rwlock.c +++ b/tests/class/rwlock.c @@ -138,7 +138,7 @@ int main(int argc, char *argv[]) parsec_atomic_rwlock_init(&rwlock); - threads = (pthread_t*)calloc(sizeof(pthread_t), maxthreads); + threads = (pthread_t*)calloc(maxthreads, sizeof(pthread_t)); for( nbthreads = minthreads; nbthreads < maxthreads; nbthreads++) { parsec_barrier_init(&barrier, NULL, nbthreads+1); diff --git a/tests/collections/redistribute/redistribute_check2.jdf b/tests/collections/redistribute/redistribute_check2.jdf index fcb4c0b52..0693f5b08 100644 --- a/tests/collections/redistribute/redistribute_check2.jdf +++ b/tests/collections/redistribute/redistribute_check2.jdf @@ -143,7 +143,7 @@ int parsec_redistribute_check2(parsec_context_t *parsec, int nb_threads = parsec->virtual_processes[0]->nb_cores; /* Used for error accumulation */ - long long int *info = (long long int *)calloc(sizeof(long long int), nb_threads); + long long int *info = (long long int *)calloc(nb_threads, sizeof(long long int)); parsec_redistribute_check2 = parsec_redistribute_check2_New( (parsec_tiled_matrix_t *)dcY, diff --git a/tests/collections/reshape/common.c b/tests/collections/reshape/common.c index 04b2cd825..3ade805d6 100644 --- a/tests/collections/reshape/common.c +++ b/tests/collections/reshape/common.c @@ -158,7 +158,7 @@ int reshape_set_matrix_value_position_swap(parsec_execution_stream_t *es, int check_matrix_equal(parsec_matrix_block_cyclic_t dcA, parsec_matrix_block_cyclic_t dcA_check){ int ret = 0; - for(int i=0; i < dcA_check.super.nb_local_tiles * dcA_check.super.bsiz; i++){ + for(size_t i = 0; i < (dcA_check.super.nb_local_tiles * dcA_check.super.bsiz); i++) { if( ((int*)dcA.mat)[i] != ((int*)dcA_check.mat)[i]){ ret = 1; break; diff --git a/tests/dsl/dtd/dtd_test_new_tile.c b/tests/dsl/dtd/dtd_test_new_tile.c index 014b8ed81..9bf463495 100644 --- a/tests/dsl/dtd/dtd_test_new_tile.c +++ b/tests/dsl/dtd/dtd_test_new_tile.c @@ -378,7 +378,7 @@ int main(int argc, char **argv) parsec_dtd_task_class_add_chore(dtd_tp, fourth_tc, PARSEC_DEV_CPU, cpu_reduce); parsec_dtd_tile_t **new_tiles; - new_tiles = (parsec_dtd_tile_t**)calloc(sizeof(parsec_dtd_tile_t *), NCASE*world); + new_tiles = (parsec_dtd_tile_t**)calloc(NCASE*world, sizeof(parsec_dtd_tile_t *)); acc = 0; diff --git a/tests/dsl/ptg/branching/branching_data.c b/tests/dsl/ptg/branching/branching_data.c index e362cb5c8..4aeffc9c9 100644 --- a/tests/dsl/ptg/branching/branching_data.c +++ b/tests/dsl/ptg/branching/branching_data.c @@ -98,7 +98,7 @@ parsec_data_collection_t *create_and_distribute_data(int rank, int world, int si #endif parsec_type_create_contiguous(size, parsec_datatype_int32_t, &d->default_dtt); - m->data = calloc(sizeof(parsec_data_t*), nb); + m->data = calloc(nb, sizeof(parsec_data_t*)); m->nt = nb; m->size = size; m->ptr = (int32_t*)malloc(nb * size * sizeof(int32_t)); diff --git a/tests/dsl/ptg/choice/main.c b/tests/dsl/ptg/choice/main.c index 29988fd1d..a8066a9df 100644 --- a/tests/dsl/ptg/choice/main.c +++ b/tests/dsl/ptg/choice/main.c @@ -68,7 +68,7 @@ int main(int argc, char *argv[]) dcA = 
create_and_distribute_data(rank, world, size); parsec_data_collection_set_key(dcA, "A"); - decision = (int*)calloc(sizeof(int), nb+1); + decision = (int*)calloc(nb+1, sizeof(int)); choice = choice_new(dcA, size, decision, nb, world); rc = parsec_context_add_taskpool(parsec, choice); diff --git a/tests/profiling/async.jdf b/tests/profiling/async.jdf index 487c2ea21..de24694c8 100644 --- a/tests/profiling/async.jdf +++ b/tests/profiling/async.jdf @@ -211,7 +211,7 @@ int main( int argc, char** argv ) profiling_save_iinfo("NB", NB); #endif - taskqueue = (parsec_task_t**)calloc(sizeof(parsec_task_t*), (NB+1)); + taskqueue = (parsec_task_t**)calloc(NB+1, sizeof(parsec_task_t*)); /** * Build the data and the arena to hold it up. */ diff --git a/tools/profiling/dbpreader.c b/tools/profiling/dbpreader.c index 00eb00acd..941351031 100644 --- a/tools/profiling/dbpreader.c +++ b/tools/profiling/dbpreader.c @@ -1071,7 +1071,7 @@ static dbp_multifile_reader_t *open_files(int nbfiles, char **filenames) dbp->last_error = SUCCESS; dbp->dico_size = 0; dbp->dico_allocated = 8; - dbp->dico_keys = calloc(sizeof(dbp_dictionary_t), dbp->dico_allocated); + dbp->dico_keys = calloc(dbp->dico_allocated, sizeof(dbp_dictionary_t)); n = 0; for(i = 0; i < nbfiles; i++) { From 88bf42e1c1841de40f6f811c47896d2119a8d6f5 Mon Sep 17 00:00:00 2001 From: George Bosilca Date: Sun, 28 Jul 2024 13:07:47 -0700 Subject: [PATCH 06/14] Allow selection of a particular GPU (via the mask). Signed-off-by: George Bosilca --- .../mca/device/cuda/device_cuda_component.c | 35 +++++++------------ 1 file changed, 13 insertions(+), 22 deletions(-) diff --git a/parsec/mca/device/cuda/device_cuda_component.c b/parsec/mca/device/cuda/device_cuda_component.c index c06ba416b..2e3582a4a 100644 --- a/parsec/mca/device/cuda/device_cuda_component.c +++ b/parsec/mca/device/cuda/device_cuda_component.c @@ -40,7 +40,8 @@ int parsec_cuda_max_streams = PARSEC_GPU_MAX_STREAMS; int parsec_cuda_memory_block_size, parsec_cuda_memory_percentage, parsec_cuda_memory_number_of_blocks; char* parsec_cuda_lib_path = NULL; -static int cuda_mask; +static int parsec_device_cuda_mask = 0xFF; +static int parsec_device_cuda_avail = 0; static int parsec_cuda_sort_pending; #if defined(PARSEC_PROF_TRACE) @@ -104,10 +105,10 @@ static int device_cuda_component_query(mca_base_module_t **module, int *priority else parsec_device_cuda_component.modules = NULL; - for( i = j = 0; i < parsec_device_cuda_enabled; i++ ) { + for( i = j = 0; (i < parsec_device_cuda_avail) && (j < parsec_device_cuda_enabled); i++ ) { /* Allow fine grain selection of the GPU's */ - if( !((1 << i) & cuda_mask) ) continue; + if( !((1 << i) & parsec_device_cuda_mask) ) continue; rc = parsec_cuda_module_init(i, &parsec_device_cuda_component.modules[j]); if( PARSEC_SUCCESS != rc ) { @@ -139,7 +140,7 @@ static int device_cuda_component_register(void) false, false, -1, &parsec_device_cuda_enabled); (void)parsec_mca_param_reg_int_name("device_cuda", "mask", "The bitwise mask of CUDA devices to be enabled (default all)", - false, false, 0xffffffff, &cuda_mask); + false, false, 0xffffffff, &parsec_device_cuda_mask); (void)parsec_mca_param_reg_int_name("device_cuda", "nvlink_mask", "What devices are allowed to use NVLINK if available (default all)", false, false, 0xffffffff, &parsec_cuda_nvlink_mask); @@ -185,15 +186,14 @@ static int device_cuda_component_register(void) static int device_cuda_component_open(void) { cudaError_t cudastatus; - int ndevices; if( 0 == parsec_device_cuda_enabled ) { return MCA_ERROR; /* 
Nothing to do around here */ } - cudastatus = cudaGetDeviceCount( &ndevices ); + cudastatus = cudaGetDeviceCount(&parsec_device_cuda_avail); if( cudaErrorNoDevice == (cudaError_t) cudastatus ) { - ndevices = 0; + parsec_device_cuda_avail = 0; /* This is normal on machines with no GPUs, let it flow * to do the normal checks vis-a-vis the number of requested * devices and issue a warning only when not fulfilling @@ -208,31 +208,22 @@ static int device_cuda_component_open(void) } ); } - if( ndevices > parsec_device_cuda_enabled ) { - if( 0 < parsec_device_cuda_enabled ) { - ndevices = parsec_device_cuda_enabled; - } - } else if (ndevices < parsec_device_cuda_enabled ) { + /* Update the number of GPU for the upper layer */ + if (parsec_device_cuda_avail < parsec_device_cuda_enabled ) { if( 0 < parsec_device_cuda_enabled ) { - if( 0 == ndevices ) { + if( 0 == parsec_device_cuda_avail ) { parsec_warning("User requested %d CUDA devices, but none are available on %s." " CUDA support will be therefore disabled.", parsec_device_cuda_enabled, parsec_hostname); } else { parsec_warning("User requested %d CUDA devices, but only %d are available on %s.", - parsec_device_cuda_enabled, ndevices, parsec_hostname); + parsec_device_cuda_enabled, parsec_device_cuda_avail, parsec_hostname); } - parsec_mca_param_set_int(parsec_device_cuda_enabled_index, ndevices); } + parsec_mca_param_set_int(parsec_device_cuda_enabled_index, parsec_device_cuda_avail); } - /* Update the number of GPU for the upper layer */ - parsec_device_cuda_enabled = ndevices; - if( 0 == ndevices ) { - return MCA_ERROR; - } - - return MCA_SUCCESS; + return (0 == parsec_device_cuda_avail) ? MCA_ERROR : MCA_SUCCESS; } /** From 733c05ed230caf16704d20162531b9eb420f875c Mon Sep 17 00:00:00 2001 From: George Bosilca Date: Mon, 29 Jul 2024 09:35:25 -0700 Subject: [PATCH 07/14] Correctly add the unselected tasks back into the runtime. Signed-off-by: George Bosilca --- tests/runtime/cuda/stage_custom.jdf | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/tests/runtime/cuda/stage_custom.jdf b/tests/runtime/cuda/stage_custom.jdf index 1a4b704ba..44d1d35fd 100644 --- a/tests/runtime/cuda/stage_custom.jdf +++ b/tests/runtime/cuda/stage_custom.jdf @@ -172,7 +172,7 @@ BODY [type=CUDA parsec_list_item_singleton(item); parsec_gpu_task_t* task = (parsec_gpu_task_t*)item; - if( gpu_task->ec->task_class == task->ec->task_class ) { + if( false && (gpu_task->ec->task_class == task->ec->task_class) ) { /* same task class as the current one, possible to batch */ (void)parsec_list_item_ring_push(&gpu_task->list_item, (parsec_list_item_t*)task); how_many++; /* one more into the batch */ @@ -194,11 +194,12 @@ BODY [type=CUDA * and the ring that has all the remaining items in the list (including the list's ghost_elem). * The remaining list is already stored in the gpu_stream->fifo_pending. 
*/ - gpu_task->complete_stage = complete_batched_callback; + if( how_many > 1 ) + gpu_task->complete_stage = complete_batched_callback; PARSEC_DEBUG_VERBOSE((10, parsec_debug_output, "submit multiple tasks into one %p on stream %s{%p}\n", gpu_task, gpu_stream->name, (void*)gpu_stream)); if( NULL != store_back ) { - parsec_list_push_back(gpu_stream->fifo_pending, store_back); + parsec_list_item_ring_merge(&gpu_stream->fifo_pending->ghost_element, store_back); } } double lalpha = 1.0; From a3aee186b727800d44e920da7660abe5bf56a7a1 Mon Sep 17 00:00:00 2001 From: George Bosilca Date: Wed, 31 Jul 2024 22:40:21 -0700 Subject: [PATCH 08/14] Example of passing info to the batch callback. Signed-off-by: George Bosilca --- tests/runtime/cuda/stage_custom.jdf | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/tests/runtime/cuda/stage_custom.jdf b/tests/runtime/cuda/stage_custom.jdf index 44d1d35fd..642aabe9b 100644 --- a/tests/runtime/cuda/stage_custom.jdf +++ b/tests/runtime/cuda/stage_custom.jdf @@ -117,6 +117,12 @@ typedef void (*cublas_dgemm_t) ( char TRANSA, char TRANSB, int m, int n, int k, /* Pre-declare function used as a property of some parameterized task */ static int64_t gemm_time_estimate(const parsec_task_t *task, parsec_device_module_t *dev); +typedef struct deallocation_helper_s { + parsec_assignment_t m; + parsec_assignment_t k; + void** my_storage[3]; + } free_helper_t; + static int complete_batched_callback(parsec_device_gpu_module_t *dev, parsec_gpu_task_t ** gpu_task, @@ -127,6 +133,8 @@ complete_batched_callback(parsec_device_gpu_module_t *dev, (void)dev; (void) gpu_task; (void)gpu_stream; parsec_list_item_t* output_stream_ghost = &dev->exec_stream[1]->fifo_pending->ghost_element; parsec_list_item_ring_merge(output_stream_ghost, &(*gpu_task)->list_item); + free_helper_t* helper = (free_helper_t*)(((*gpu_task)->ec)->locals); (void)helper; + //printf(" release %p and %p and %p\n", helper->my_storage[0], helper->my_storage[1], helper->my_storage[2]); (*gpu_task)->complete_stage = NULL; *gpu_task = NULL; return PARSEC_HOOK_RETURN_DONE; @@ -172,7 +180,7 @@ BODY [type=CUDA parsec_list_item_singleton(item); parsec_gpu_task_t* task = (parsec_gpu_task_t*)item; - if( false && (gpu_task->ec->task_class == task->ec->task_class) ) { + if( gpu_task->ec->task_class == task->ec->task_class ) { /* same task class as the current one, possible to batch */ (void)parsec_list_item_ring_push(&gpu_task->list_item, (parsec_list_item_t*)task); how_many++; /* one more into the batch */ @@ -211,6 +219,10 @@ BODY [type=CUDA cublasSetKernelStream( parsec_body.stream ); parsec_gpu_task_t* current_gpu_task = gpu_task; + free_helper_t* helper = (free_helper_t*)((gpu_task->ec)->locals); + helper->my_storage[0] = (void*)(uintptr_t)0xdeadbeef; + helper->my_storage[1] = (void*)(uintptr_t)0xabcdabcd; + helper->my_storage[2] = (void*)(uintptr_t)0xcdefcdef; do { __parsec_stage_custom_TASK_GPU_task_t *task = (__parsec_stage_custom_TASK_GPU_task_t*)current_gpu_task->ec; _f_A = task->data._f_A.data_out; From d6437505442f3681bdad93bffbe186d8e745c288 Mon Sep 17 00:00:00 2001 From: George Bosilca Date: Wed, 31 Jul 2024 22:43:26 -0700 Subject: [PATCH 09/14] Add a CUDA-based RTT test. This allows checking whether data can be sent and received directly to and from GPU buffers.
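For reference, the test makes the tiles device-resident up front through the existing data-advice API, so the PING chain exercises GPU buffers rather than host staging copies. A condensed sketch of the placement performed in matrix_init_ops() in rtt_main.c below (same calls as in the patch, error handling omitted):

    parsec_data_key_t key = descA->super.data_key((parsec_data_collection_t*)descA, m, n);
    parsec_data_t *data = descA->super.data_of_key((parsec_data_collection_t*)descA, key);
    /* Round-robin the tiles over the enabled CUDA devices so the communication
     * engine can send and receive directly from the preferred device copy. */
    parsec_advise_data_on_device(data,
                                 cuda_device_index[m % cuda_device_index_len],
                                 PARSEC_DEV_DATA_ADVICE_PREFERRED_DEVICE);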
Signed-off-by: George Bosilca --- tests/runtime/cuda/CMakeLists.txt | 7 +- tests/runtime/cuda/rtt.jdf | 26 ++++ tests/runtime/cuda/rtt_main.c | 225 ++++++++++++++++++++++++++++++ 3 files changed, 257 insertions(+), 1 deletion(-) create mode 100644 tests/runtime/cuda/rtt.jdf create mode 100644 tests/runtime/cuda/rtt_main.c diff --git a/tests/runtime/cuda/CMakeLists.txt b/tests/runtime/cuda/CMakeLists.txt index fbb5a5022..fdfbf9f89 100644 --- a/tests/runtime/cuda/CMakeLists.txt +++ b/tests/runtime/cuda/CMakeLists.txt @@ -22,7 +22,12 @@ if(PARSEC_HAVE_CUDA) endif( NOT TARGET CUDA::cublas ) # Testing for geting best device - parsec_addtest_executable(C testing_get_best_device SOURCES "testing_get_best_device.c") + parsec_addtest_executable(C testing_get_best_device SOURCES testing_get_best_device.c) target_include_directories(testing_get_best_device PRIVATE $<$:${CMAKE_CURRENT_SOURCE_DIR}>) target_ptg_sources(testing_get_best_device PRIVATE "get_best_device_check.jdf") + + # Testing for communications to and from GPU + parsec_addtest_executable(C cuda_rtt SOURCES rtt.c rtt_main.c) + target_include_directories(cuda_rtt PRIVATE $<$:${CMAKE_CURRENT_SOURCE_DIR}>) + target_ptg_sources(cuda_rtt PRIVATE "rtt.jdf") endif(PARSEC_HAVE_CUDA) diff --git a/tests/runtime/cuda/rtt.jdf b/tests/runtime/cuda/rtt.jdf new file mode 100644 index 000000000..28fe93176 --- /dev/null +++ b/tests/runtime/cuda/rtt.jdf @@ -0,0 +1,26 @@ +extern "C" %{ + /* This simple example does not need to include anything */ +%} + +%option no_taskpool_instance = true /* can be anything */ + +NT +WS + +PING(k) + +k = 0 .. NT-1 +: A(0, k % WS) + +RW T <- (k == 0) ? A(0, k % WS) : T PING(k-1) + -> (k < NT) ? T PING(k+1) : A(0, k % WS) + +; 0 + +BODY [type = CUDA] + printf("ping(%d)\n", k); + /* + int r; MPI_Comm_rank(MPI_COMM_WORLD, &r); + printf("%d: PING(%d)\n", r, k); + */ +END diff --git a/tests/runtime/cuda/rtt_main.c b/tests/runtime/cuda/rtt_main.c new file mode 100644 index 000000000..e3c212907 --- /dev/null +++ b/tests/runtime/cuda/rtt_main.c @@ -0,0 +1,225 @@ +/** + * Copyright (c) 2019-2024 The University of Tennessee and The University + * of Tennessee Research Foundation. All rights + * reserved. 
+ */ + +#include "parsec.h" +#include "parsec/data_distribution.h" +#include "parsec/data_dist/matrix/matrix.h" +#include "parsec/data_dist/matrix/two_dim_rectangle_cyclic.h" +#include "parsec/include/parsec/execution_stream.h" +#include "parsec/utils/mca_param.h" + +#include "rtt.h" + +#if defined(DISTRIBUTED) +#include +#endif + #include + +static int nb_gpus = 1, gpu_mask = 0xff; +static int cuda_device_index_len = 0, *cuda_device_index = NULL; + +/** + * @brief init operator + * + * @param [in] es: execution stream + * @param [in] descA: tiled matrix date descriptor + * @param [inout] A: inout data + * @param [in] uplo: matrix shape + * @param [in] m: tile row index + * @param [in] n: tile column index + * @param [in] args: NULL + */ +static int matrix_init_ops(parsec_execution_stream_t *es, + const parsec_tiled_matrix_t *descA, + void *_A, parsec_matrix_uplo_t uplo, + int m, int n, void *args) +{ + memset(_A, 1, m*n); + + /* Address warning when compile */ +#if 1 + parsec_data_key_t key = descA->super.data_key((parsec_data_collection_t*)descA, m, n); + parsec_data_t* data = descA->super.data_of_key((parsec_data_collection_t*)descA, key); + parsec_advise_data_on_device(data, + cuda_device_index[m % cuda_device_index_len], + PARSEC_DEV_DATA_ADVICE_PREFERRED_DEVICE); +#endif + (void)es; (void)uplo;(void)n;(void)m;(void)args; + return 0; +} + +static void +__parsec_rtt_destructor(parsec_rtt_taskpool_t *rtt_tp) +{ + parsec_type_free(&(rtt_tp->arenas_datatypes[PARSEC_rtt_DEFAULT_ADT_IDX].opaque_dtt)); +} + +PARSEC_OBJ_CLASS_INSTANCE(parsec_rtt_taskpool_t, parsec_taskpool_t, + NULL, __parsec_rtt_destructor); + +parsec_taskpool_t *rtt_New(parsec_context_t *ctx, size_t size, int roundtrips) +{ + parsec_rtt_taskpool_t *tp = NULL; + parsec_datatype_t block; + size_t mb = sqrt(size), nb = size / mb; + + if (mb <= 0) { + fprintf(stderr, "To work, RTT must do at least one round time trip of at least one byte\n"); + return (parsec_taskpool_t *)tp; + } + + parsec_matrix_block_cyclic_t* dcA = (parsec_matrix_block_cyclic_t *)calloc(1, sizeof(parsec_matrix_block_cyclic_t)); + parsec_matrix_block_cyclic_init(dcA, PARSEC_MATRIX_BYTE, PARSEC_MATRIX_TILE, + ctx->my_rank, + mb, nb, + mb, ctx->nb_nodes * nb, + 0, 0, + mb, ctx->nb_nodes * nb, + 1, ctx->nb_nodes, 1, 1, + 0, 0); + dcA->mat = parsec_data_allocate((size_t)dcA->super.nb_local_tiles * + (size_t)dcA->super.bsiz * + (size_t)parsec_datadist_getsizeoftype(dcA->super.mtype)); + parsec_data_collection_set_key((parsec_data_collection_t *)dcA, "A"); + + /* Initialize and place the dcA */ + parsec_apply(ctx, PARSEC_MATRIX_FULL, + (parsec_tiled_matrix_t *)dcA, + (parsec_tiled_matrix_unary_op_t)matrix_init_ops, NULL); + + tp = parsec_rtt_new((parsec_data_collection_t*)dcA, roundtrips, ctx->nb_nodes); + + ptrdiff_t lb, extent; + parsec_type_create_contiguous(mb*nb, parsec_datatype_uint8_t, &block); + parsec_type_extent(block, &lb, &extent); + + parsec_arena_datatype_construct(&tp->arenas_datatypes[PARSEC_rtt_DEFAULT_ADT_IDX], + extent, PARSEC_ARENA_ALIGNMENT_SSE, + block); + return (parsec_taskpool_t *)tp; +} + +int main(int argc, char *argv[]) +{ + parsec_context_t *parsec = NULL; + parsec_taskpool_t *tp; + int size = 1, rank = 0, loops = 100, frags = 1, nb_runs = 1, cores = 2, do_sleep = 0, ch, use_opt = 1; + struct timeval tstart, tend; + size_t msg_size = 8*1024; + double t, bw; + + while ((ch = getopt(argc, argv, "c:g:G:l:f:m:n:s:")) != -1) { + switch (ch) { + case 'c': cores = atoi(optarg); use_opt += 2; break; + case 'g': nb_gpus = atoi(optarg); use_opt 
+= 2; break; + case 'G': gpu_mask = atoi(optarg); use_opt += 2; break; + case 'l': loops = atoi(optarg); use_opt += 2; break; + case 'f': frags = atoi(optarg); use_opt += 2; break; + case 'm': msg_size = (size_t)atoi(optarg); use_opt += 2; break; + case 'n': nb_runs = atoi(optarg); use_opt += 2; break; + case 's': do_sleep = atoi(optarg); use_opt += 2; break; + default: + fprintf(stderr, + "-c : number of cores to use (default 2)\n" + "-g : number of GPU to use (default 1)\n" + "-G : GPU mask to use (-1 to modulo rank per node)\n" + "-l : loops of bandwidth(default: 100)\n" + "-f : frags, number of fragments (default: 1)\n" + "-m : size, size of message (default: 1024 * 8)\n" + "-n : number of runs (default: 1)\n" + "-s : number of seconds to sleep before running the tests\n" + "\n"); + exit(1); + } + } + /* Remove all options already acknowledged */ + if( NULL == argv[optind] ) { + argc = 1; + } else { + memcpy(&argv[1], &argv[use_opt+1], (argc - use_opt) * sizeof(char*)); + argc -= use_opt; + } + argv[argc] = NULL; +#if defined(DISTRIBUTED) +#if defined(PARSEC_HAVE_DEV_CUDA_SUPPORT) + extern char **environ; + char *value; + asprintf(&value, "%d", nb_gpus); + parsec_setenv_mca_param("device_cuda_enabled", value, &environ); + free(value); + value = NULL; + if (0xFF != gpu_mask) { + asprintf(&value, "%d", gpu_mask); + parsec_setenv_mca_param("device_cuda_mask", value, &environ); + free(value); + value = NULL; + } +#endif + { + int provided; + MPI_Init_thread(NULL, NULL, MPI_THREAD_SERIALIZED, &provided); + } + MPI_Comm_size(MPI_COMM_WORLD, &size); + MPI_Comm_rank(MPI_COMM_WORLD, &rank); +#endif /* DISTRIBUTED */ + if( 0 == rank ) { + printf("Running %d tests of %d steps RTT with a data of size %zu\n", + nb_runs, loops, msg_size); + } + parsec = parsec_init(cores, &argc, &argv); + + /* can the test run? 
*/ + nb_gpus = parsec_context_query(parsec, PARSEC_CONTEXT_QUERY_DEVICES, PARSEC_DEV_CUDA); + assert(nb_gpus >= 0); + if(nb_gpus == 0) { + parsec_warning("This test can only run if at least one GPU device is present"); + exit(-PARSEC_ERR_DEVICE); + } + if( do_sleep ) { + sleep(do_sleep); + } + cuda_device_index = (int *)malloc(parsec_nb_devices * sizeof(int)); + cuda_device_index_len = 0; + for (int dev = 0; dev < (int)parsec_nb_devices; dev++) { + parsec_device_module_t *device = parsec_mca_device_get(dev); + if (PARSEC_DEV_CUDA & device->type) { + cuda_device_index[cuda_device_index_len++] = device->device_index; + } + } + +#if defined(PARSEC_HAVE_MPI) + MPI_Barrier(MPI_COMM_WORLD); +#endif /* defined(PARSEC_HAVE_MPI) */ + gettimeofday(&tstart, NULL); + for( int test_id = 0; test_id < nb_runs; test_id++ ) { + tp = rtt_New(parsec, msg_size, loops); + if( NULL != tp ) { + parsec_context_add_taskpool(parsec, tp); + parsec_context_start(parsec); + parsec_context_wait(parsec); + parsec_taskpool_free(tp); + } + } +#if defined(PARSEC_HAVE_MPI) + MPI_Barrier(MPI_COMM_WORLD); +#endif /* defined(PARSEC_HAVE_MPI) */ + gettimeofday(&tend, NULL); + + if( 0 == rank ) { + t = ((tend.tv_sec - tstart.tv_sec) * 1000000.0 + (tend.tv_usec - tstart.tv_usec)) / 1000000.0; /* in seconds */ + double total_payload = (double)nb_runs * (double)loops * (double)msg_size / 1024.0 / 1024.0 / 1024.0; + bw = total_payload / t; + printf("%d\t%d\t%d\t%zu\t%08.4g s\t%4.8g GB/s\n", nb_runs, frags, loops, msg_size*sizeof(uint8_t), t, bw); + } + + free(cuda_device_index); cuda_device_index = NULL; + cuda_device_index_len = 0; + parsec_fini(&parsec); +#if defined(DISTRIBUTED) + MPI_Finalize(); +#endif /* DISTRIBUTED */ + return 0; +} From bd35c5dffebe0995c5bda7362b2b85d5995fa7df Mon Sep 17 00:00:00 2001 From: George Bosilca Date: Wed, 31 Jul 2024 22:44:27 -0700 Subject: [PATCH 10/14] Provide a way to do GPU masking. 
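The mask is a plain bitmask over the CUDA device indices visible on a node. The stress test derives one per rank from its local (on-node) rank, so co-located ranks do not end up sharing a device. A condensed sketch of what stress_main.c below does (shown for the simple case nb_gpus == 1, one device per rank):

    int gpu_mask = 0;
    for (int i = 0; i < nb_gpus; i++)
        gpu_mask |= (1 << local_rank) << i;   /* local rank 0 -> 0x1, local rank 1 -> 0x2, ... */
    char *value;
    asprintf(&value, "%d", gpu_mask);
    parsec_setenv_mca_param("device_cuda_mask", value, &environ);
    free(value);

testing_get_best_device.c gains a matching -G option to pass such a mask explicitly on the command line.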
Signed-off-by: George Bosilca --- tests/runtime/cuda/stress_main.c | 24 +++++++++++++- tests/runtime/cuda/testing_get_best_device.c | 33 ++++++++++---------- 2 files changed, 39 insertions(+), 18 deletions(-) diff --git a/tests/runtime/cuda/stress_main.c b/tests/runtime/cuda/stress_main.c index 31176f4a8..5f060bdfc 100644 --- a/tests/runtime/cuda/stress_main.c +++ b/tests/runtime/cuda/stress_main.c @@ -2,6 +2,7 @@ #include "parsec/data_distribution.h" #include "parsec/data_dist/matrix/matrix.h" #include "parsec/data_dist/matrix/two_dim_rectangle_cyclic.h" +#include "parsec/utils/mca_param.h" #include "stress.h" #include "stress_wrapper.h" @@ -15,7 +16,7 @@ int main(int argc, char *argv[]) parsec_context_t *parsec = NULL; parsec_taskpool_t *tp; int size = 1; - int rank = 0; + int rank = 0, nb_gpus = 1; #if defined(DISTRIBUTED) { @@ -24,6 +25,27 @@ int main(int argc, char *argv[]) } MPI_Comm_size(MPI_COMM_WORLD, &size); MPI_Comm_rank(MPI_COMM_WORLD, &rank); +#if defined(PARSEC_HAVE_DEV_CUDA_SUPPORT) + { + MPI_Comm local_comm; + int local_rank, local_size; + MPI_Comm_split_type(MPI_COMM_WORLD, MPI_COMM_TYPE_SHARED, 0, + MPI_INFO_NULL, &local_comm); + MPI_Comm_rank(local_comm, &local_rank); + MPI_Comm_size(local_comm, &local_size); + MPI_Comm_free(&local_comm); + int gpu_mask = 0; + for (int i = 0; i < nb_gpus; i++) + { + gpu_mask |= ((1 << local_rank) << i); + } + char *value; + asprintf(&value, "%d", gpu_mask); + parsec_setenv_mca_param("device_cuda_mask", value, &environ); + free(value); + value = NULL; + } +#endif /* defined(PARSEC_HAVE_DEV_CUDA_SUPPORT)*/ #endif /* DISTRIBUTED */ parsec = parsec_init(-1, &argc, &argv); diff --git a/tests/runtime/cuda/testing_get_best_device.c b/tests/runtime/cuda/testing_get_best_device.c index 4bfd43b5d..f42279534 100644 --- a/tests/runtime/cuda/testing_get_best_device.c +++ b/tests/runtime/cuda/testing_get_best_device.c @@ -45,17 +45,10 @@ int main(int argc, char *argv[]) char **pargv; /* Default */ - int m = 0; - int N = 8; - int NB = 4; - int P = 1; - int KP = 1; - int KQ = 1; - int cores = -1; - int nb_gpus = 0; - int info = 0; - - while ((ch = getopt(argc, argv, "m:N:t:s:S:P:c:g:h")) != -1) { + int m = 0, N = 8, NB = 4, P = 1, KP = 1, KQ = 1; + int cores = -1, nb_gpus = 0, nb_avail_gpu = 0, info = 0, gpu_mask = 0xFF; + + while ((ch = getopt(argc, argv, "m:N:t:s:S:P:c:g:G:h")) != -1) { switch (ch) { case 'm': m = atoi(optarg); break; case 'N': N = atoi(optarg); break; @@ -65,6 +58,7 @@ int main(int argc, char *argv[]) case 'P': P = atoi(optarg); break; case 'c': cores = atoi(optarg); break; case 'g': nb_gpus = atoi(optarg); break; + case 'G': gpu_mask = atoi(optarg); break; case '?': case 'h': default: fprintf(stderr, "-m : initialize MPI_THREAD_MULTIPLE (default: 0/no)\n" @@ -75,6 +69,7 @@ int main(int argc, char *argv[]) "-P : rows (P) in the PxQ process grid (default: 1)\n" "-c : number of cores used (default: -1)\n" "-g : number of GPUs used (default: 0)\n" + "-G : mask of the GPUs to be used (default: 0xff)" "-h : print this help message\n" "\n"); exit(1); @@ -102,16 +97,20 @@ int main(int argc, char *argv[]) break; } } - #if defined(PARSEC_HAVE_DEV_CUDA_SUPPORT) extern char **environ; char *value; if( nb_gpus < 1 && 0 == rank ) { - fprintf(stderr, "Warning: if run on GPUs, please set --gpus=value bigger than 0\n"); + fprintf(stderr, "Warning: if run on GPUs, please set -g value bigger than 0\n"); } asprintf(&value, "%d", nb_gpus); parsec_setenv_mca_param( "device_cuda_enabled", value, &environ ); - free(value); + free(value); value = NULL; + 
if( 0xFF != gpu_mask ) { + asprintf(&value, "%d", gpu_mask); + parsec_setenv_mca_param("device_cuda_mask", value, &environ); + free(value); value = NULL; + } #endif /* Initialize PaRSEC */ @@ -134,7 +133,7 @@ int main(int argc, char *argv[]) } cores = nb_total_comp_threads; } - + nb_avail_gpu = parsec_context_query(parsec, PARSEC_CONTEXT_QUERY_DEVICES, PARSEC_DEV_CUDA); /* initializing matrix structure */ parsec_matrix_block_cyclic_t dcA; parsec_matrix_block_cyclic_init(&dcA, PARSEC_MATRIX_DOUBLE, PARSEC_MATRIX_TILE, @@ -153,9 +152,9 @@ int main(int argc, char *argv[]) /* Main routines */ SYNC_TIME_START(); info = parsec_get_best_device_check(parsec, (parsec_tiled_matrix_t *)&dcA); - SYNC_TIME_PRINT(rank, ("Get_best_device" "\tN= %d NB= %d " + SYNC_TIME_PRINT(rank, ("Get_best_device\tN= %d NB= %d " "PxQ= %d %d KPxKQ= %d %d cores= %d nb_gpus= %d\n", - N, NB, P, nodes/P, KP, KQ, cores, parsec_nb_devices-2)); + N, NB, P, nodes / P, KP, KQ, cores, nb_avail_gpu)); /* Check result */ if( 0 == rank && info != 0 ) { From 9079ec69c3bca83a56c51dd2bf21d46d5a961455 Mon Sep 17 00:00:00 2001 From: George Bosilca Date: Sun, 4 Aug 2024 21:50:59 -0700 Subject: [PATCH 11/14] Allow JDF with no dependencies, no datatype and no arenas. Signed-off-by: George Bosilca --- parsec/interfaces/ptg/ptg-compiler/jdf2c.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/parsec/interfaces/ptg/ptg-compiler/jdf2c.c b/parsec/interfaces/ptg/ptg-compiler/jdf2c.c index 8d85fe6e5..997a9738f 100644 --- a/parsec/interfaces/ptg/ptg-compiler/jdf2c.c +++ b/parsec/interfaces/ptg/ptg-compiler/jdf2c.c @@ -4566,11 +4566,14 @@ static void jdf_generate_destructor( const jdf_t *jdf ) " free(__parsec_tp->super.super.task_classes_array); __parsec_tp->super.super.task_classes_array = NULL;\n" " __parsec_tp->super.super.nb_task_classes = 0;\n" "\n" + "#if PARSEC_%s_ADT_IDX_MAX > 0\n" " for(i = 0; i < (uint32_t)__parsec_tp->super.arenas_datatypes_size; i++) {\n" " if( NULL != __parsec_tp->super.arenas_datatypes[i].arena ) {\n" " PARSEC_OBJ_RELEASE(__parsec_tp->super.arenas_datatypes[i].arena);\n" " }\n" - " }\n"); + " }\n" + "#endif /* PARSEC_%s_ADT_IDX_MAX > 0 */", + jdf_basename, jdf_basename); coutput(" /* Destroy the data repositories for this object */\n"); for( f = jdf->functions; NULL != f; f = f->next ) { From 4373702ac2d4aa676e8a0a9c347924768ac88208 Mon Sep 17 00:00:00 2001 From: George Bosilca Date: Wed, 31 Jul 2024 22:45:18 -0700 Subject: [PATCH 12/14] Transfer data to and from GPU. This is a multi-part patch that allows the CPU to prepare a data copy mapped onto a device. 1. The first question is how is such a device selected ? The allocation of such a copy happen way before the scheduler is invoked for a task, in fact before the task is even ready. Thus, we need to decide on the location of this copy only based on some static information, such as the task affinity. Therefore, this approach only works for owner-compute type of tasks, where the task will be executed on the device that owns the data used for the task affinity. 2. 
Pass the correct data copy across the entire system, instead of falling back to data copy of the device 0 (CPU memory) Signed-off-by: George Bosilca --- parsec/CMakeLists.txt | 1 + parsec/arena.c | 114 +++++-- parsec/arena.h | 10 +- parsec/data.c | 107 +++++-- parsec/data.h | 10 +- parsec/data_dist/matrix/map_operator.c | 4 +- .../matrix/two_dim_rectangle_cyclic.h | 1 - parsec/data_internal.h | 42 ++- .../interfaces/dtd/insert_function_internal.h | 10 +- parsec/interfaces/ptg/ptg-compiler/jdf2c.c | 44 ++- parsec/mca/device/cuda/device_cuda_module.c | 6 +- parsec/mca/device/device_gpu.c | 289 +++++++++++++----- parsec/mca/device/device_gpu.h | 2 +- parsec/parsec.c | 14 +- parsec/parsec_mpi_funnelled.c | 19 ++ parsec/parsec_reshape.c | 24 +- parsec/remote_dep.c | 2 +- parsec/remote_dep.h | 5 + parsec/remote_dep_mpi.c | 92 ++++-- parsec/utils/zone_malloc.c | 8 +- parsec/utils/zone_malloc.h | 3 +- tests/apps/haar_tree/project_dyn.jdf | 2 +- tests/runtime/cuda/rtt.jdf | 2 +- tests/runtime/cuda/rtt_main.c | 101 +++--- tests/runtime/cuda/stage_custom.jdf | 12 +- 25 files changed, 652 insertions(+), 272 deletions(-) diff --git a/parsec/CMakeLists.txt b/parsec/CMakeLists.txt index eea8bd70a..237985bce 100644 --- a/parsec/CMakeLists.txt +++ b/parsec/CMakeLists.txt @@ -238,6 +238,7 @@ if( BUILD_PARSEC ) $<$:OTF2::OTF2> $<$:MPI::MPI_C> $<$:CUDA::cudart> + $<$:cuda> $<$:hip::host> ${EXTRA_LIBS} INTERFACE diff --git a/parsec/arena.c b/parsec/arena.c index 8d518d00a..648c7ed16 100644 --- a/parsec/arena.c +++ b/parsec/arena.c @@ -235,43 +235,109 @@ int parsec_arena_allocate_device_private(parsec_data_copy_t *copy, return PARSEC_SUCCESS; } -parsec_data_copy_t *parsec_arena_get_copy(parsec_arena_t *arena, - size_t count, int device, - parsec_datatype_t dtt) +#include "parsec/utils/zone_malloc.h" +#include "mca/device/device_gpu.h" + +static inline parsec_data_copy_t * +parsec_arena_internal_copy_new(parsec_arena_t *arena, + parsec_data_t *data, + size_t count, int device, + parsec_datatype_t dtt) { - parsec_data_t *data; - parsec_data_copy_t *copy; - int rc; - - - data = parsec_data_new(); + parsec_data_copy_t *copy = NULL; + parsec_data_t* ldata = data; if( NULL == data ) { + ldata = parsec_data_new(); + if( NULL == ldata ) { + return NULL; + } + } + if( 0 == device ) { + copy = parsec_data_copy_new(ldata, device, dtt, + PARSEC_DATA_FLAG_PARSEC_OWNED | PARSEC_DATA_FLAG_PARSEC_MANAGED | PARSEC_DATA_FLAG_ARENA); + if (NULL == copy) { + goto free_and_return; + } + int rc = parsec_arena_allocate_device_private(copy, arena, count, device, dtt); + if (PARSEC_SUCCESS != rc) { + goto free_and_return; + } + return copy; + } + /** + * This part is not really nice, it breaks the separation between devices, and how their memory is + * managed. But, it should give nice perfromance improvements if the communication layer is + * capable of sending or receiving data directly to and from the accelerator memory. The only drawback + * is that once the GPU memory is full, this will fail, so the soeftware will fall back to the + * prior behavior, going through the CPU memory. + * + * The zone deallocation is not symmetric, it will happen in the GPU management, when the data copies + * are released from the different LRU lists. 
+ */ + parsec_device_gpu_module_t *gpu_device = (parsec_device_gpu_module_t *)parsec_mca_device_get(device); + if (NULL == gpu_device) { return NULL; } + size_t size = count * arena->elem_size; + void* device_private = zone_malloc(gpu_device->memory, size); + if( NULL == device_private ) { + PARSEC_DEBUG_VERBOSE(10, parsec_debug_output, "Arena:\tallocate data copy on device %d of size %zu from zone %p failed (out of memory)\n", + device, size, (void *)copy->arena_chunk); + goto free_and_return; + } + copy = parsec_data_copy_new(ldata, device, dtt, + PARSEC_DATA_FLAG_PARSEC_OWNED | PARSEC_DATA_FLAG_PARSEC_MANAGED); + if (NULL == copy) { + PARSEC_DEBUG_VERBOSE(10, parsec_debug_output, "Arena:\tallocate data copy on device %d of size %zu from zone %p failed to allocate copy (out of memory)\n", + device, size, (void *)copy->arena_chunk); + zone_free(gpu_device->memory, device_private); + goto free_and_return; + } + copy->dtt = dtt; + copy->device_private = device_private; + copy->arena_chunk = (parsec_arena_chunk_t*)gpu_device->memory; + PARSEC_DEBUG_VERBOSE(10, parsec_debug_output, "Arena:\tallocate data copy on device %d of size %zu from zone %p, " + "data ptr %p", + device, size, (void*)copy->arena_chunk, (void*)copy->device_private); + copy->version = 0; + copy->coherency_state = PARSEC_DATA_COHERENCY_INVALID; + copy->original->owner_device = device; + copy->original->preferred_device = device; + return copy; + free_and_return: + if( NULL != copy ) + PARSEC_OBJ_RELEASE(copy); + if( NULL == data) + PARSEC_OBJ_RELEASE(ldata); /* release the locally allocated data */ + return NULL; +} - copy = parsec_data_copy_new( data, device, dtt, - PARSEC_DATA_FLAG_ARENA | - PARSEC_DATA_FLAG_PARSEC_OWNED | - PARSEC_DATA_FLAG_PARSEC_MANAGED); +parsec_data_copy_t * +parsec_arena_get_new_copy(parsec_arena_t *arena, + size_t count, int device, + parsec_datatype_t dtt) +{ + parsec_data_copy_t *dev0_copy, *copy; - if(NULL == copy) { - PARSEC_OBJ_RELEASE(data); + dev0_copy = parsec_arena_internal_copy_new(arena, NULL, count, 0 /* first allocate the copy on the device 0 */, dtt); + if( NULL == dev0_copy ) { return NULL; } + dev0_copy->coherency_state = PARSEC_DATA_COHERENCY_INVALID; + dev0_copy->version = 0; /* start from somewhere */ + if( 0 == device ) { + return dev0_copy; + } - rc = parsec_arena_allocate_device_private(copy, arena, count, device, dtt); - + copy = parsec_arena_internal_copy_new(arena, dev0_copy->original, count, device, dtt); + if( NULL == copy ) { + copy = dev0_copy; /* return the main memory data copy */ + } /* This data is going to be released once all copies are released * It does not exist without at least a copy, and we don't give the * pointer to the user, so we must remove our retain from it */ - PARSEC_OBJ_RELEASE(data); - - if( PARSEC_SUCCESS != rc ) { - PARSEC_OBJ_RELEASE(copy); - return NULL; - } - + PARSEC_OBJ_RELEASE(dev0_copy->original); return copy; } diff --git a/parsec/arena.h b/parsec/arena.h index a7c6160a1..51c38239b 100644 --- a/parsec/arena.h +++ b/parsec/arena.h @@ -133,15 +133,15 @@ int parsec_arena_construct_ex(parsec_arena_t* arena, * enough resource to allocate a new data copy of this type. */ -parsec_data_copy_t *parsec_arena_get_copy(parsec_arena_t *arena, - size_t count, int device, - parsec_datatype_t dtt); +parsec_data_copy_t *parsec_arena_get_new_copy(parsec_arena_t *arena, + size_t count, int device, + parsec_datatype_t dtt); /** * @brief Allocates memory for a given data copy. 
This is a function used by * DSLs to set the memory associated with a data copy they have created. - * It is also used by parsec_arena_get_copy. - * + * It is also used by parsec_arena_get_new_copy. + * * @param copy the (empty) data copy to allocate memory for. NB: the @p original * field of this data copy must be set. The operation overwrites the device * dtt and count of this data copy, as well as the device_private pointer. diff --git a/parsec/data.c b/parsec/data.c index a08c9d5d6..0512784ed 100644 --- a/parsec/data.c +++ b/parsec/data.c @@ -15,6 +15,7 @@ #include "parsec/sys/atomic.h" #include "parsec/remote_dep.h" #include "parsec/parsec_internal.h" +#include "parsec/utils/zone_malloc.h" static parsec_lifo_t parsec_data_lifo; static parsec_lifo_t parsec_data_copies_lifo; @@ -65,6 +66,7 @@ static void parsec_data_construct(parsec_data_t* obj ) obj->preferred_device = -1; obj->key = 0; obj->nb_elts = 0; + obj->nb_copies = 0; for( uint32_t i = 0; i < parsec_nb_devices; obj->device_copies[i] = NULL, i++ ); obj->dc = NULL; @@ -99,11 +101,12 @@ static void parsec_data_destruct(parsec_data_t* obj ) * GPU copies are normally stored in LRU lists, and must be * destroyed by the release list to free the memory on the device */ - PARSEC_OBJ_RELEASE( copy ); + PARSEC_DATA_COPY_RELEASE(copy); } } assert(NULL == obj->device_copies[i]); } + assert(0 == obj->nb_copies); } PARSEC_OBJ_CLASS_INSTANCE(parsec_data_t, parsec_object_t, @@ -161,8 +164,8 @@ void parsec_data_delete(parsec_data_t* data) inline int parsec_data_copy_attach(parsec_data_t* data, - parsec_data_copy_t* copy, - uint8_t device) + parsec_data_copy_t* copy, + uint8_t device) { assert(NULL == copy->original); assert(NULL == copy->older); @@ -175,6 +178,7 @@ parsec_data_copy_attach(parsec_data_t* data, copy->older = NULL; return PARSEC_ERROR; } + parsec_atomic_fetch_add_int32(&data->nb_copies, 1); PARSEC_OBJ_RETAIN(data); return PARSEC_SUCCESS; } @@ -192,6 +196,7 @@ int parsec_data_copy_detach(parsec_data_t* data, return PARSEC_ERR_NOT_FOUND; } data->device_copies[device] = copy->older; + parsec_atomic_fetch_add_int32(&data->nb_copies, -1); copy->original = NULL; copy->older = NULL; @@ -221,7 +226,7 @@ parsec_data_copy_t* parsec_data_copy_new(parsec_data_t* data, uint8_t device, } copy->flags = flags; if( PARSEC_SUCCESS != parsec_data_copy_attach(data, copy, device) ) { - PARSEC_OBJ_RELEASE(copy); + PARSEC_DATA_COPY_RELEASE(copy); return NULL; } copy->dtt = dtt; @@ -330,6 +335,12 @@ int parsec_data_start_transfer_ownership_to_copy(parsec_data_t* data, copy = data->device_copies[device]; assert( NULL != copy ); + if( valid_copy == device ) { + PARSEC_DEBUG_VERBOSE(10, parsec_debug_output, + "DEV[%d]: already has ownership of data %p to copy %p in mode %d", + device, data, copy, access_mode); + goto bookkeeping; + } PARSEC_DEBUG_VERBOSE(10, parsec_debug_output, "DEV[%d]: start transfer ownership of data %p to copy %p in mode %d", device, data, copy, access_mode); @@ -417,6 +428,7 @@ int parsec_data_start_transfer_ownership_to_copy(parsec_data_t* data, } } + bookkeeping: if( PARSEC_FLOW_ACCESS_READ & access_mode ) { copy->readers++; } @@ -435,40 +447,52 @@ int parsec_data_start_transfer_ownership_to_copy(parsec_data_t* data, return valid_copy; } -static char dump_coherency_codex(parsec_data_coherency_t state) -{ - if( PARSEC_DATA_COHERENCY_INVALID == state ) return 'I'; - if( PARSEC_DATA_COHERENCY_OWNED == state ) return 'O'; - if( PARSEC_DATA_COHERENCY_EXCLUSIVE == state ) return 'E'; - if( PARSEC_DATA_COHERENCY_SHARED == state ) return 
'S'; - return 'X'; -} - -void parsec_dump_data_copy(parsec_data_copy_t* copy) +void parsec_data_copy_dump(parsec_data_copy_t* copy) { - parsec_debug_verbose(0, 0, "- [%d]: copy %p state %c readers %d version %u\n", - (int)copy->device_index, copy, dump_coherency_codex(copy->coherency_state), copy->readers, copy->version); + char *tranfer = "---", flags[] = "----", *coherency = "undef"; + switch(copy->data_transfer_status) { + case PARSEC_DATA_STATUS_NOT_TRANSFER: tranfer = "no"; break; + case PARSEC_DATA_STATUS_UNDER_TRANSFER: tranfer = "yes"; break; + case PARSEC_DATA_STATUS_COMPLETE_TRANSFER: tranfer = "no"; break; + } + if (copy->flags & PARSEC_DATA_FLAG_ARENA) flags[0] = 'A'; + if (copy->flags & PARSEC_DATA_FLAG_TRANSIT) flags[1] = 'T'; + if (copy->flags & PARSEC_DATA_FLAG_PARSEC_MANAGED) flags[2] = 'M'; + if (copy->flags & PARSEC_DATA_FLAG_PARSEC_OWNED) flags[3] = 'O'; + + if( PARSEC_DATA_COHERENCY_INVALID == copy->coherency_state ) coherency = "invalid"; + if( PARSEC_DATA_COHERENCY_OWNED == copy->coherency_state ) coherency = "owned"; + if( PARSEC_DATA_COHERENCY_EXCLUSIVE == copy->coherency_state ) coherency = "exclusive"; + if( PARSEC_DATA_COHERENCY_SHARED == copy->coherency_state ) coherency = "shared"; + + parsec_debug_verbose(0, 0, "%s [%d]: copy %p [ref %d] coherency %s readers %d version %u transit %s flags %s\n" + " older %p orig %p arena %p dev_priv %p\n", + ((NULL != copy->original) && (copy->original->owner_device == copy->device_index)) ? "*" : " ", + (int)copy->device_index, copy, copy->super.super.obj_reference_count, coherency, copy->readers, copy->version, tranfer, flags, + (void *)copy->older, (void *)copy->original, (void *)copy->arena_chunk, copy->device_private); } -void parsec_dump_data(parsec_data_t* data) +void parsec_data_dump(parsec_data_t* data) { - parsec_debug_verbose(0, 0, "data %p key %lu owner %d\n", data, data->key, data->owner_device); + parsec_debug_verbose(0, 0, "data %p [ref %d] key %lu owner dev %d pref dev %d copies %d dc %p [# elems %zu]\n", + data, data->super.obj_reference_count, data->key, data->owner_device, data->preferred_device, data->nb_copies, + (void*)data->dc, data->nb_elts); for( uint32_t i = 0; i < parsec_nb_devices; i++ ) { if( NULL != data->device_copies[i]) - parsec_dump_data_copy(data->device_copies[i]); + parsec_data_copy_dump(data->device_copies[i]); } } parsec_data_copy_t* parsec_data_get_copy(parsec_data_t* data, uint32_t device) { - return PARSEC_DATA_GET_COPY(data, device); + return PARSEC_DATA_GET_COPY(data, device); } void parsec_data_copy_release(parsec_data_copy_t* copy) { - /* TODO: Move the copy back to the CPU before destroying it */ + /* TODO: Move the copy back to the CPU before destroying it */ PARSEC_DATA_COPY_RELEASE(copy); } @@ -509,7 +533,7 @@ parsec_data_create( parsec_data_t **holder, if( !parsec_atomic_cas_ptr(holder, NULL, data) ) { parsec_data_copy_detach(data, data_copy, 0); - PARSEC_OBJ_RELEASE(data_copy); + PARSEC_DATA_COPY_RELEASE(data_copy); data = *holder; } } else { @@ -560,3 +584,42 @@ parsec_data_destroy( parsec_data_t *data ) #endif PARSEC_OBJ_RELEASE(data); } + +#include "parsec/utils/debug.h" + +int parsec_data_release_self_contained_data(parsec_data_t *data) +{ + if (data->super.obj_reference_count != data->nb_copies) return 0; + parsec_data_copy_t *copy; + PARSEC_DEBUG_VERBOSE(1, parsec_debug_output, "Examine the status of data %p with %d copies and refcounts at %s:%d\n", + data, data->nb_copies, __FILE__, __LINE__); + /* this data is only referenced by it's own copies. 
If these copies are also only referenced by + * data, then we can release them all. + */ + for( uint32_t i = 0; i < parsec_nb_devices; i++) { + if (NULL == (copy = data->device_copies[i])) continue; + if( copy->super.super.obj_reference_count > 1 ) + return 0; + } + PARSEC_DEBUG_VERBOSE(1, parsec_debug_output, "Force the release of data %p at %s:%d", + copy, __FILE__, __LINE__); + for( uint32_t i = 0; i < parsec_nb_devices; i++) { + if (NULL == (copy = data->device_copies[i])) continue; + assert(1 == copy->super.super.obj_reference_count); + if( 0 == copy->device_index ) { + PARSEC_OBJ_RELEASE(copy); + assert(NULL == copy); + } else { + /* Do not release data copies that do not belong to the CPU or really bad things will happen. + * Only the device manager can release these copies, the best we can do here is to detach them + * from the data and eventually release their memory. + */ + parsec_data_copy_detach(data, copy, copy->device_index); + zone_free((zone_malloc_t *)copy->arena_chunk, copy->device_private); + copy->device_private = NULL; + copy->arena_chunk = NULL; + } + } + return 1; +} + diff --git a/parsec/data.h b/parsec/data.h index f21c2918b..eafb8c504 100644 --- a/parsec/data.h +++ b/parsec/data.h @@ -31,9 +31,9 @@ typedef uint8_t parsec_data_coherency_t; #define PARSEC_DATA_COHERENCY_SHARED ((parsec_data_coherency_t)0x4) typedef uint8_t parsec_data_status_t; -#define PARSEC_DATA_STATUS_NOT_TRANSFER ((parsec_data_coherency_t)0x0) -#define PARSEC_DATA_STATUS_UNDER_TRANSFER ((parsec_data_coherency_t)0x1) -#define PARSEC_DATA_STATUS_COMPLETE_TRANSFER ((parsec_data_coherency_t)0x2) +#define PARSEC_DATA_STATUS_NOT_TRANSFER ((parsec_data_status_t)0x0) +#define PARSEC_DATA_STATUS_UNDER_TRANSFER ((parsec_data_status_t)0x1) +#define PARSEC_DATA_STATUS_COMPLETE_TRANSFER ((parsec_data_status_t)0x2) /** * Data copies have three levels of 'ownership': * - a data copy can be owned and managed by PaRSEC. 
@@ -124,8 +124,8 @@ PARSEC_DECLSPEC void parsec_data_end_transfer_ownership_to_copy(parsec_data_t* data, uint8_t device, uint8_t access_mode); -PARSEC_DECLSPEC void parsec_dump_data_copy(parsec_data_copy_t* copy); -PARSEC_DECLSPEC void parsec_dump_data(parsec_data_t* copy); +PARSEC_DECLSPEC void parsec_data_copy_dump(parsec_data_copy_t *copy); +PARSEC_DECLSPEC void parsec_data_dump(parsec_data_t* copy); PARSEC_DECLSPEC parsec_data_t * parsec_data_create( parsec_data_t **holder, diff --git a/parsec/data_dist/matrix/map_operator.c b/parsec/data_dist/matrix/map_operator.c index 280bb614c..e6ba9a2e4 100644 --- a/parsec/data_dist/matrix/map_operator.c +++ b/parsec/data_dist/matrix/map_operator.c @@ -296,13 +296,13 @@ static int data_lookup(parsec_execution_stream_t *es, this_task->data[0].data_in = parsec_data_get_copy(src(m,n), 0); this_task->data[0].source_repo_entry = NULL; this_task->data[0].data_out = NULL; - PARSEC_OBJ_RETAIN(this_task->data[0].data_in); + PARSEC_DATA_COPY_RETAIN(this_task->data[0].data_in); } if( NULL != __tp->dest ) { this_task->data[1].data_in = parsec_data_get_copy(dest(m,n), 0); this_task->data[1].source_repo_entry = NULL; this_task->data[1].data_out = this_task->data[1].data_in; - PARSEC_OBJ_RETAIN(this_task->data[1].data_in); + PARSEC_DATA_COPY_RETAIN(this_task->data[1].data_in); } return PARSEC_HOOK_RETURN_DONE; } diff --git a/parsec/data_dist/matrix/two_dim_rectangle_cyclic.h b/parsec/data_dist/matrix/two_dim_rectangle_cyclic.h index 9d766bb34..6c1587eb8 100644 --- a/parsec/data_dist/matrix/two_dim_rectangle_cyclic.h +++ b/parsec/data_dist/matrix/two_dim_rectangle_cyclic.h @@ -45,7 +45,6 @@ typedef struct parsec_matrix_block_cyclic { * @param dc matrix description structure, already allocated, that will be initialize * @param mtype type of data used for this matrix * @param storage type of storage of data - * @param nodes number of nodes * @param myrank rank of the local node (as of mpi rank) * @param mb number of row in a tile * @param nb number of column in a tile diff --git a/parsec/data_internal.h b/parsec/data_internal.h index 49b3a3c7f..c15af1b0c 100644 --- a/parsec/data_internal.h +++ b/parsec/data_internal.h @@ -19,6 +19,7 @@ #include "parsec/arena.h" #include "parsec/data.h" #include "parsec/class/parsec_future.h" +#include "parsec/utils/debug.h" /** * This structure is the keeper of all the information regarding @@ -30,11 +31,12 @@ struct parsec_data_s { parsec_atomic_lock_t lock; - parsec_data_key_t key; int8_t owner_device; int8_t preferred_device; /* Hint set from the MEMADVICE device API to define on * which device this data should be modified RW when there * are multiple choices. -1 means no preference. */ + int32_t nb_copies; /* How many valid copies are attached to this data */ + parsec_data_key_t key; struct parsec_data_collection_s* dc; size_t nb_elts; /* size in bytes of the memory layout */ struct parsec_data_copy_s *device_copies[]; /* this array allocated according to the number of devices @@ -85,22 +87,50 @@ PARSEC_DECLSPEC PARSEC_OBJ_CLASS_DECLARATION(parsec_data_copy_t); #define PARSEC_DATA_GET_COPY(DATA, DEVID) \ ((DATA)->device_copies[(DEVID)]) + +int parsec_data_release_self_contained_data(parsec_data_t* data); /** * Decrease the refcount of this copy of the data. If the refcount reach * 0 the upper level is in charge of cleaning up and releasing all content * of the copy. 
*/ -#define PARSEC_DATA_COPY_RELEASE(DATA) \ +#if 0 +#define PARSEC_DATA_COPY_RELEASE(COPY) \ do { \ - PARSEC_DEBUG_VERBOSE(20, parsec_debug_output, "Release data copy %p at %s:%d", (DATA), __FILE__, __LINE__); \ - PARSEC_OBJ_RELEASE((DATA)); \ + PARSEC_DEBUG_VERBOSE(20, parsec_debug_output, "Release data copy %p at %s:%d", (COPY), __FILE__, __LINE__); \ + PARSEC_OBJ_RELEASE((COPY)); \ + if( (NULL != (COPY)) && (NULL != ((COPY)->original)) ) parsec_data_release_self_contained_data((COPY)->original); \ } while(0) +#define PARSEC_DATA_COPY_RETAIN(COPY) \ + do { \ + PARSEC_DEBUG_VERBOSE(20, parsec_debug_output, "Retain data copy %p at %s:%d", (COPY), __FILE__, __LINE__); \ + PARSEC_OBJ_RETAIN((COPY)); \ + } while(0) +#else +static inline void __parsec_data_copy_release(parsec_data_copy_t** copy) +{ + PARSEC_DEBUG_VERBOSE(20, parsec_debug_output, "Release data copy %p at %s:%d", *copy, __FILE__, __LINE__); + PARSEC_OBJ_RELEASE(*copy); + if ((NULL != *copy) && (NULL != (*copy)->original) && (1 == (*copy)->super.super.obj_reference_count)) + parsec_data_release_self_contained_data((*copy)->original); +} +#define PARSEC_DATA_COPY_RELEASE(COPY) \ + __parsec_data_copy_release(&(COPY)) + +static inline void __parsec_data_copy_retain(parsec_data_copy_t* copy) +{ + PARSEC_DEBUG_VERBOSE(20, parsec_debug_output, "Retain data copy %p at %s:%d", copy, __FILE__, __LINE__); + PARSEC_OBJ_RETAIN(copy); +} +#define PARSEC_DATA_COPY_RETAIN(COPY) \ + __parsec_data_copy_retain((COPY)) +#endif /* 0 */ /** * Return the device private pointer for a datacopy. */ -#define PARSEC_DATA_COPY_GET_PTR(DATA) \ - ((DATA) ? (DATA)->device_private : NULL) +#define PARSEC_DATA_COPY_GET_PTR(COPY) \ + ((COPY) ? (COPY)->device_private : NULL) /** @} */ diff --git a/parsec/interfaces/dtd/insert_function_internal.h b/parsec/interfaces/dtd/insert_function_internal.h index adfc7e54b..640923ebc 100644 --- a/parsec/interfaces/dtd/insert_function_internal.h +++ b/parsec/interfaces/dtd/insert_function_internal.h @@ -417,16 +417,16 @@ void parsec_dtd_fini(); static inline void -parsec_dtd_retain_data_copy( parsec_data_copy_t *data ) +parsec_dtd_retain_data_copy( parsec_data_copy_t *copy ) { - assert( data->super.super.obj_reference_count >= 1 ); - PARSEC_OBJ_RETAIN(data); + assert( copy->super.super.obj_reference_count >= 1 ); + PARSEC_DATA_COPY_RETAIN(copy); } static inline void -parsec_dtd_release_data_copy( parsec_data_copy_t *data ) +parsec_dtd_release_data_copy(parsec_data_copy_t *copy) { - PARSEC_OBJ_RELEASE(data); + PARSEC_DATA_COPY_RELEASE(copy); } diff --git a/parsec/interfaces/ptg/ptg-compiler/jdf2c.c b/parsec/interfaces/ptg/ptg-compiler/jdf2c.c index 997a9738f..2f37075da 100644 --- a/parsec/interfaces/ptg/ptg-compiler/jdf2c.c +++ b/parsec/interfaces/ptg/ptg-compiler/jdf2c.c @@ -672,6 +672,17 @@ static char* dump_local_assignments( void** elem, void* arg ) if( dos > 0 ) { string_arena_init(info->sa); string_arena_add_string(info->sa, "const int %s = %s%s.value;", def->name, info->holder, def->name); +#if 0 + jdf_expr_t* type_str = jdf_find_property( def->properties, "type", NULL ); + if( NULL == type_str ) { + string_arena_add_string(info->sa, "const int %s = %s%s.value;", def->name, info->holder, def->name); + } else { + expr_info_t expr_info = {.sa = info->sa, .prefix = "", .suffix = "", .assignments = "locals"}; + string_arena_add_string(info->sa, "const %s %s = %s%s.value;", + dump_expr((void**)type_str, &expr_info), + def->name, info->holder, def->name); + } +#endif if( dos > 1 ) string_arena_add_string(info->sa, " 
(void)%s;", def->name); return string_arena_get_string(info->sa); @@ -5595,7 +5606,7 @@ jdf_generate_code_call_initialization(const jdf_t *jdf, const jdf_call_t *call, jdf_generate_code_reshape_input_from_desc(jdf, f, flow, dl, spaces); coutput("%s this_task->data._f_%s.data_out = chunk;\n" - "%s PARSEC_OBJ_RETAIN(chunk);\n", + "%s PARSEC_DATA_COPY_RETAIN(chunk);\n", spaces, flow->varname, spaces); @@ -5621,7 +5632,7 @@ jdf_generate_code_call_initialization(const jdf_t *jdf, const jdf_call_t *call, assert( dl->datatype_local.count != NULL ); string_arena_add_string(sa2, "%s", dump_expr((void**)dl->datatype_local.count, &info)); - coutput("%s chunk = parsec_arena_get_copy(%s->arena, %s, target_device, %s->opaque_dtt);\n" + coutput("%s chunk = parsec_arena_get_new_copy(%s->arena, %s, target_device, %s->opaque_dtt);\n" "%s chunk->original->owner_device = target_device;\n" "%s this_task->data._f_%s.data_out = chunk;\n", spaces, string_arena_get_string(sa), string_arena_get_string(sa2), string_arena_get_string(sa), @@ -5652,12 +5663,19 @@ jdf_generate_code_call_initialization(const jdf_t *jdf, const jdf_call_t *call, /* Code to create & fulfill a reshape promise locally in case this input dependency is typed */ jdf_generate_code_reshape_input_from_dep(jdf, f, flow, dl, spaces); - coutput("%s this_task->data._f_%s.data_out = parsec_data_get_copy(chunk->original, target_device);\n" - "#if defined(PARSEC_PROF_GRAPHER) && defined(PARSEC_PROF_TRACE)\n" + /* TODO: Setting the data_out here is kind of random, especially as some copy of the input flow. The only thing + * that would make sense here is to set the data_out to the dep outputs back into the user memory (output + * dep with a target into a data collection), to give the opportunity to the accelerator components to + * do a pushout to the desired location (instead of the current approach that will do a pushout to the + * data_copy on device 0 followed by a memcpy into the desired location). + */ + //coutput("%s this_task->data._f_%s.data_out = parsec_data_get_copy(chunk->original, target_device);\n", + // spaces, flow->varname); + + coutput("#if defined(PARSEC_PROF_GRAPHER) && defined(PARSEC_PROF_TRACE)\n" "%s parsec_prof_grapher_data_input(chunk->original, (parsec_task_t*)this_task, &%s, 0);\n" "#endif\n" "%s }\n", - spaces, flow->varname, spaces, JDF_OBJECT_ONAME( flow ), spaces); } @@ -5731,7 +5749,7 @@ static void jdf_generate_code_call_init_output(const jdf_t *jdf, const jdf_call_ spaces, flow->varname, spaces); - coutput("%s chunk = parsec_arena_get_copy(%s->arena, %s, target_device, %s);\n" + coutput("%s chunk = parsec_arena_get_new_copy(%s->arena, %s, target_device, %s);\n" "%s chunk->original->owner_device = target_device;\n", spaces, string_arena_get_string(sa_arena), string_arena_get_string(sa_count), string_arena_get_string(sa_datatype), spaces); @@ -6515,10 +6533,10 @@ jdf_generate_code_data_lookup(const jdf_t *jdf, * This way, it's only retained once during release_deps. 
*/ coutput(" if( NULL == this_task->repo_entry ){\n" - " this_task->repo_entry = data_repo_lookup_entry_and_create(es, %s_repo, " + " this_task->repo_entry = data_repo_lookup_entry_and_create(es, %s_repo, \n" " %s((const parsec_taskpool_t*)__parsec_tp, (const parsec_assignment_t*)&this_task->locals));\n" - " data_repo_entry_addto_usage_limit(%s_repo, this_task->repo_entry->ht_item.key, 1);" - " this_task->repo_entry ->generator = (void*)this_task; /* for AYU */\n" + " data_repo_entry_addto_usage_limit(%s_repo, this_task->repo_entry->ht_item.key, 1);\n" + " this_task->repo_entry->generator = (void*)this_task; /* for AYU */\n" "#if defined(PARSEC_SIM)\n" " assert(this_task->repo_entry ->sim_exec_date == 0);\n" " this_task->repo_entry ->sim_exec_date = this_task->sim_exec_date;\n" @@ -6528,7 +6546,7 @@ jdf_generate_code_data_lookup(const jdf_t *jdf, jdf_property_get_string(f->properties, JDF_PROP_UD_MAKE_KEY_FN_NAME, NULL), f->fname); - coutput(" /* The reshape repo is the current task repo. */" + coutput(" /* The reshape repo is the current task repo. */\n" " reshape_repo = %s_repo;\n" " reshape_entry_key = %s((const parsec_taskpool_t*)__parsec_tp, (const parsec_assignment_t*)&this_task->locals) ;\n" " reshape_entry = this_task->repo_entry;\n", @@ -7036,6 +7054,12 @@ static void jdf_generate_code_hook(const jdf_t *jdf, output = UTIL_DUMP_LIST(sa, f->dataflow, next, dump_data_initialization_from_data_array, &ai2, "", "", "", ""); if( 0 != strlen(output) ) { + coutput("/* Make sure we have the data_out set to the data_in */\n"); + for( fl = f->dataflow; fl != NULL; fl = fl->next) { + if( fl->flow_flags & JDF_FLOW_TYPE_CTL ) continue; + coutput(" this_task->data._f_%s.data_out = this_task->data._f_%s.data_in;\n", + fl->varname, fl->varname); + } coutput(" /** Declare the variables that will hold the data, and all the accounting for each */\n" "%s\n", output); diff --git a/parsec/mca/device/cuda/device_cuda_module.c b/parsec/mca/device/cuda/device_cuda_module.c index 312775719..d88a01ee5 100644 --- a/parsec/mca/device/cuda/device_cuda_module.c +++ b/parsec/mca/device/cuda/device_cuda_module.c @@ -573,9 +573,9 @@ parsec_cuda_module_init( int dev_id, parsec_device_module_t** module ) gpu_device->find_incarnation = parsec_cuda_find_incarnation; if( PARSEC_SUCCESS != parsec_device_memory_reserve(gpu_device, - parsec_cuda_memory_percentage, - parsec_cuda_memory_number_of_blocks, - parsec_cuda_memory_block_size) ) { + parsec_cuda_memory_percentage, + parsec_cuda_memory_number_of_blocks, + parsec_cuda_memory_block_size) ) { goto release_device; } diff --git a/parsec/mca/device/device_gpu.c b/parsec/mca/device/device_gpu.c index d44a3ed2b..c25a242e8 100644 --- a/parsec/mca/device/device_gpu.c +++ b/parsec/mca/device/device_gpu.c @@ -341,7 +341,7 @@ void parsec_device_dump_gpu_state(parsec_device_gpu_module_t* gpu_device) parsec_gpu_data_copy_t* gpu_copy = (parsec_gpu_data_copy_t*)item; parsec_output(parsec_gpu_output_stream, " %d. elem %p flags 0x%x GPU mem %p\n", i, gpu_copy, gpu_copy->flags, gpu_copy->device_private); - parsec_dump_data_copy(gpu_copy); + parsec_data_copy_dump(gpu_copy); i++; }); } @@ -353,7 +353,7 @@ void parsec_device_dump_gpu_state(parsec_device_gpu_module_t* gpu_device) parsec_gpu_data_copy_t* gpu_copy = (parsec_gpu_data_copy_t*)item; parsec_output(parsec_gpu_output_stream, " %d. 
elem %p flags 0x%x GPU mem %p\n", i, gpu_copy, gpu_copy->flags, gpu_copy->device_private); - parsec_dump_data_copy(gpu_copy); + parsec_data_copy_dump(gpu_copy); i++; }); } @@ -478,7 +478,7 @@ parsec_device_data_advise(parsec_device_module_t *dev, parsec_data_t *data, int PARSEC_DEBUG_VERBOSE(20, parsec_debug_output, "Retain data copy %p [ref_count %d]", data->device_copies[ data->owner_device ], data->device_copies[ data->owner_device ]->super.super.obj_reference_count); - PARSEC_OBJ_RETAIN(data->device_copies[ data->owner_device ]); + PARSEC_DATA_COPY_RETAIN(data->device_copies[ data->owner_device ]); gpu_task->ec->data[0].data_in = data->device_copies[ data->owner_device ]; gpu_task->ec->data[0].data_out = NULL; gpu_task->ec->data[0].source_repo_entry = NULL; @@ -614,9 +614,8 @@ parsec_device_memory_reserve( parsec_device_gpu_module_t* gpu_device, parsec_warning("GPU[%d:%s] Invalid argument: requesting 0 bytes of memory", gpu_device->super.device_index, gpu_device->super.name); return PARSEC_ERROR; - } else { - alloc_size = number_blocks * eltsize; } + alloc_size = number_blocks * eltsize; } else { /* number_blocks == -1 means memory_percentage is used */ alloc_size = (memory_percentage * initial_free_mem) / 100; @@ -668,7 +667,7 @@ parsec_device_memory_reserve( parsec_device_gpu_module_t* gpu_device, gpu_elem->flags |= PARSEC_DATA_FLAG_PARSEC_OWNED; gpu_elem->device_index = gpu_device->super.device_index; mem_elem_per_gpu++; - PARSEC_OBJ_RETAIN(gpu_elem); + PARSEC_DATA_COPY_RETAIN(gpu_elem); PARSEC_DEBUG_VERBOSE(20, parsec_gpu_output_stream, "GPU[%d:%s] Retain and insert GPU copy %p [ref_count %d] in LRU", gpu_device->super.device_index, gpu_device->super.name, gpu_elem, gpu_elem->super.obj_reference_count); @@ -731,12 +730,22 @@ static void parsec_device_memory_release_list(parsec_device_gpu_module_t* gpu_de gpu_device->super.device_index, gpu_device->super.name, gpu_copy, gpu_copy->device_private, gpu_copy->super.super.obj_reference_count, original, (NULL != original ? original->dc : NULL)); assert( gpu_copy->device_index == gpu_device->super.device_index ); - - if( PARSEC_DATA_COHERENCY_OWNED == gpu_copy->coherency_state ) { - parsec_warning("GPU[%d:%s] still OWNS the master memory copy for data %d and it is discarding it!", - gpu_device->super.device_index, gpu_device->super.name, original->key); + if( NULL == gpu_copy->device_private ) { + PARSEC_DEBUG_VERBOSE(35, parsec_gpu_output_stream, + "GPU[%d:%s] copy %p is dangling without private data. 
This is OK.", + gpu_device->super.device_index, gpu_device->super.name, (void*)gpu_copy); + goto release_and_continue; + } + if( NULL == gpu_copy->original ) { + PARSEC_DEBUG_VERBOSE(35, parsec_gpu_output_stream, + "GPU[%d:%s] copy %p detached from a data but not yet reclaimed!", + gpu_device->super.device_index, gpu_device->super.name, (void*)gpu_copy); + } + if (PARSEC_DATA_COHERENCY_OWNED == gpu_copy->coherency_state) { + parsec_warning("GPU[%d:%s] still OWNS the master memory copy for data %d (%p) and it is discarding it!", + gpu_device->super.device_index, gpu_device->super.name, original->key, (void*)gpu_copy->device_private); } - assert(0 != (gpu_copy->flags & PARSEC_DATA_FLAG_PARSEC_OWNED) ); + assert(0 != (gpu_copy->flags & PARSEC_DATA_FLAG_PARSEC_OWNED)); #if defined(PARSEC_GPU_ALLOC_PER_TILE) gpu_device->memory_free( gpu_copy->device_private ); @@ -760,11 +769,12 @@ static void parsec_device_memory_release_list(parsec_device_gpu_module_t* gpu_de #endif gpu_copy->device_private = NULL; + release_and_continue: /* At this point the data copies should have no attachment to a data_t. Thus, * before we get here (aka below parsec_fini), the destructor of the data * collection must have been called, releasing all the copies. */ - PARSEC_OBJ_RELEASE(gpu_copy); assert(NULL == gpu_copy); + PARSEC_DATA_COPY_RELEASE(gpu_copy); assert(NULL == gpu_copy); } } @@ -824,6 +834,87 @@ parsec_device_memory_release( parsec_device_gpu_module_t* gpu_device ) return PARSEC_SUCCESS; } +int +parsec_device_get_copy( parsec_device_gpu_module_t* gpu_device, parsec_data_copy_t** dc ) +{ + char task_name[] = "unknown"; + parsec_gpu_data_copy_t *gpu_mem_lru_cycling = NULL, *lru_gpu_elem; + /* Get the head of the LRU, assuming it has no readers and mark it as used, using the same mechanism as + * the GPU to GPU tranfers. Once the communication into this copy completes, the task will get into + * the GPU queues, and the data will be reattributed accordingly to this GPU. + */ + find_another_data: + lru_gpu_elem = (parsec_gpu_data_copy_t*)parsec_list_pop_front(&gpu_device->gpu_mem_lru); + if( NULL == lru_gpu_elem ) { + /* nothing available on the GPU. Let the upper level know about this */ + *dc = NULL; + return PARSEC_ERR_OUT_OF_RESOURCE; + } + if( 0 != lru_gpu_elem->readers ) { + PARSEC_DEBUG_VERBOSE(20, parsec_gpu_output_stream, + "GPU[%d:%s]:%s: Drop LRU-retrieved GPU copy %p [readers %d, ref_count %d] original %p", + gpu_device->super.device_index, gpu_device->super.name, task_name, + lru_gpu_elem, lru_gpu_elem->readers, lru_gpu_elem->super.super.obj_reference_count, lru_gpu_elem->original); + /* We do not add the copy back into the LRU. This means that for now this copy is not + * tracked via the LRU (despite being only used in read mode) and instead is dangling + * on other tasks. Thus, it will eventually need to be added back into the LRU when + * current task using it completes. + */ + goto find_another_data; + } + /* It's also possible that the ref_count of that element is bigger than 1 + * In that case, it's because some task completion did not execute yet, and + * we need to keep it in the list until it reaches 1. + */ + if( lru_gpu_elem->super.super.obj_reference_count > 1 ) { + /* It's also possible (although unlikely) that we livelock here: + * if gpu_mem_lru has *only* elements with readers == 0 but + * ref_count > 1, then we might pop/push forever. We save the + * earliest element found and if we see it again it means we + * run over the entire list without finding a suitable replacement. 
+ * We need to make progress on something else. This remains safe for as long as the + * LRU is only modified by a single thread (in this case the current thread). + */ + PARSEC_DEBUG_VERBOSE(20, parsec_gpu_output_stream, + "GPU[%d:%s]:%s: Push back LRU-retrieved GPU copy %p [readers %d, ref_count %d] original %p", + gpu_device->super.device_index, gpu_device->super.name, task_name, + lru_gpu_elem, lru_gpu_elem->readers, lru_gpu_elem->super.super.obj_reference_count, lru_gpu_elem->original); + assert(0 != (lru_gpu_elem->flags & PARSEC_DATA_FLAG_PARSEC_OWNED) ); + parsec_list_push_back(&gpu_device->gpu_mem_lru, &lru_gpu_elem->super); + goto find_another_data; + } + if( gpu_mem_lru_cycling == lru_gpu_elem ) { + PARSEC_DEBUG_VERBOSE(2, parsec_gpu_output_stream, + "GPU[%d:%s]: Cycle detected on allocating memory for %s", + gpu_device->super.device_index, gpu_device->super.name, task_name); + *dc = NULL; /* did our best but failed to find a data. Return and allocate it onto another device. */ + return PARSEC_ERR_OUT_OF_RESOURCE; + } + /* detect cycles to have an opportunity to stop */ + gpu_mem_lru_cycling = (NULL == gpu_mem_lru_cycling) ? lru_gpu_elem : gpu_mem_lru_cycling; /* update the cycle detector */ + + parsec_data_t* master = lru_gpu_elem->original; + if (NULL == master ) { + /* This copy has been detached by the CPU once it has been consumed (by the communication engine), + * there is no device memory associated with it, we can safely release the CPU copy. + */ + assert(1 == lru_gpu_elem->super.super.obj_reference_count); + PARSEC_OBJ_RELEASE(lru_gpu_elem); + goto find_another_data; + } + parsec_atomic_lock(&master->lock); + if ( lru_gpu_elem->data_transfer_status == PARSEC_DATA_STATUS_UNDER_TRANSFER ) { + /* can't reuse, it is drained right now by another device */ + parsec_atomic_unlock(&master->lock); + goto find_another_data; + } + parsec_data_copy_detach(master, lru_gpu_elem, gpu_device->super.device_index); + parsec_atomic_wmb(); + *dc = lru_gpu_elem; + parsec_atomic_unlock(&master->lock); + return PARSEC_SUCCESS; +} + /** * Try to find memory space to move all data on the GPU. We attach a device_elem to * a memory_elem as soon as a device_elem is available. If we fail to find enough @@ -862,7 +953,10 @@ parsec_device_data_reserve_space( parsec_device_gpu_module_t* gpu_device, assert( flow && (flow->flow_index == i) ); /* Skip CTL flows only */ - if(PARSEC_FLOW_ACCESS_NONE == (PARSEC_FLOW_ACCESS_MASK & flow->flow_flags)) continue; + if(PARSEC_FLOW_ACCESS_NONE == (PARSEC_FLOW_ACCESS_MASK & flow->flow_flags)) { + gpu_task->flow_nb_elts[i] = 0; /* assume there is nothing to transfer to the GPU */ + continue; + } PARSEC_DEBUG_VERBOSE(20, parsec_gpu_output_stream, "GPU[%d:%s]:%s: Investigating flow %s:%d", @@ -870,7 +964,16 @@ parsec_device_data_reserve_space( parsec_device_gpu_module_t* gpu_device, temp_loc[i] = NULL; if (this_task->data[i].data_in == NULL) continue; - + /* if the input data is already on this device there is nothing else to do */ + if( gpu_device->super.device_index == this_task->data[i].data_in->device_index ) { + PARSEC_DEBUG_VERBOSE(20, parsec_gpu_output_stream, + "GPU[%d:%s]:%s: Flow %s:%i was already on the device %p%s", + gpu_device->super.device_index, gpu_device->super.name, task_name, + flow->name, i, gpu_elem, + this_task->data[i].data_in->data_transfer_status == PARSEC_DATA_STATUS_UNDER_TRANSFER ? 
" [in transfer]" : ""); + this_task->data[i].data_out = this_task->data[i].data_in; + continue; + } master = this_task->data[i].data_in->original; parsec_atomic_lock(&master->lock); gpu_elem = PARSEC_DATA_GET_COPY(master, gpu_device->super.device_index); @@ -905,6 +1008,7 @@ parsec_device_data_reserve_space( parsec_device_gpu_module_t* gpu_device, copy_readers_update = 0; assert(0 != (gpu_elem->flags & PARSEC_DATA_FLAG_PARSEC_OWNED) ); gpu_elem->device_private = zone_malloc(gpu_device->memory, gpu_task->flow_nb_elts[i]); + gpu_elem->arena_chunk = (parsec_arena_chunk_t *)gpu_device->memory; if( NULL == gpu_elem->device_private ) { #endif @@ -926,6 +1030,7 @@ parsec_device_data_reserve_space( parsec_device_gpu_module_t* gpu_device, for( j = 0; j <= i; j++ ) { /* This flow could be a control flow */ if( NULL == temp_loc[j] ) continue; + this_task->data[j].data_out = gpu_elem; /* reset the data out */ /* This flow could be non-parsec-owned, in which case we can't reclaim it */ if( 0 == (temp_loc[j]->flags & PARSEC_DATA_FLAG_PARSEC_OWNED) ) continue; PARSEC_DEBUG_VERBOSE(20, parsec_gpu_output_stream, @@ -936,7 +1041,7 @@ parsec_device_data_reserve_space( parsec_device_gpu_module_t* gpu_device, parsec_list_push_front(&gpu_device->gpu_mem_lru, (parsec_list_item_t*)temp_loc[j]); } #if !defined(PARSEC_GPU_ALLOC_PER_TILE) - PARSEC_OBJ_RELEASE(gpu_elem); + PARSEC_DATA_COPY_RELEASE(gpu_elem); #endif parsec_atomic_unlock(&master->lock); return PARSEC_HOOK_RETURN_AGAIN; @@ -1093,7 +1198,7 @@ parsec_device_data_reserve_space( parsec_device_gpu_module_t* gpu_device, "GPU[%d:%s]:%s: Release LRU-retrieved GPU copy %p [ref_count %d: must be 1]", gpu_device->super.device_index, gpu_device->super.name, task_name, lru_gpu_elem, lru_gpu_elem->super.super.obj_reference_count); - PARSEC_OBJ_RELEASE(lru_gpu_elem); + PARSEC_DATA_COPY_RELEASE(lru_gpu_elem); assert( NULL == lru_gpu_elem ); goto malloc_data; } @@ -1165,8 +1270,8 @@ parsec_default_gpu_stage_in(parsec_gpu_task_t *gtask, parsec_gpu_exec_stream_t *gpu_stream) { int ret; - parsec_data_copy_t * source; - parsec_data_copy_t * dest; + parsec_data_copy_t * src_copy; + parsec_data_copy_t * dst_copy; parsec_device_gpu_module_t *src_dev; parsec_device_gpu_module_t *dst_dev; parsec_task_t *task = gtask->ec; @@ -1175,10 +1280,10 @@ parsec_default_gpu_stage_in(parsec_gpu_task_t *gtask, for(int i = 0; i < task->task_class->nb_flows; i++) { if( !(flow_mask & (1U << i)) ) continue; - source = gtask->sources[i]; - dest = task->data[i].data_out; - src_dev = (parsec_device_gpu_module_t*)parsec_mca_device_get(source->device_index); - dst_dev = (parsec_device_gpu_module_t*)parsec_mca_device_get(dest->device_index); + src_copy = gtask->sources[i]; + dst_copy = task->data[i].data_out; + src_dev = (parsec_device_gpu_module_t *)parsec_mca_device_get(src_copy->device_index); + dst_dev = (parsec_device_gpu_module_t *)parsec_mca_device_get(dst_copy->device_index); if((src_dev->super.type & PARSEC_DEV_ANY_TYPE) == (dst_dev->super.type & PARSEC_DEV_ANY_TYPE)) { assert( src_dev->peer_access_mask & (1 << dst_dev->super.device_index) ); @@ -1187,13 +1292,12 @@ parsec_default_gpu_stage_in(parsec_gpu_task_t *gtask, dir = parsec_device_gpu_transfer_direction_h2d; } - count = (source->original->nb_elts <= dest->original->nb_elts) ? - source->original->nb_elts : dest->original->nb_elts; - ret = dst_dev->memcpy_async( dst_dev, gpu_stream, - dest->device_private, - source->device_private, - count, - dir ); + count = (src_copy->original->nb_elts <= dst_copy->original->nb_elts) ? 
src_copy->original->nb_elts : dst_copy->original->nb_elts; + ret = dst_dev->memcpy_async(dst_dev, gpu_stream, + dst_copy->device_private, + src_copy->device_private, + count, + dir); if(PARSEC_SUCCESS != ret) return PARSEC_HOOK_RETURN_ERROR; } @@ -1215,8 +1319,8 @@ parsec_default_gpu_stage_out(parsec_gpu_task_t *gtask, parsec_gpu_exec_stream_t *gpu_stream) { int ret; - parsec_data_copy_t * source; - parsec_data_copy_t * dest; + parsec_data_copy_t * src_copy; + parsec_data_copy_t * dst_copy; parsec_device_gpu_module_t *dst_dev, *src_dev; parsec_task_t *task = gtask->ec; size_t count; @@ -1224,13 +1328,13 @@ parsec_default_gpu_stage_out(parsec_gpu_task_t *gtask, int i; for(i = 0; i < task->task_class->nb_flows; i++){ if(flow_mask & (1U << i)){ - source = task->data[i].data_out; - dest = source->original->device_copies[0]; - dst_dev = (parsec_device_gpu_module_t*)parsec_mca_device_get(dest->device_index); - src_dev = (parsec_device_gpu_module_t*)parsec_mca_device_get(source->device_index); - - count = (source->original->nb_elts <= dest->original->nb_elts) ? source->original->nb_elts : - dest->original->nb_elts; + src_copy = task->data[i].data_out; + dst_copy = src_copy->original->device_copies[0]; + dst_dev = (parsec_device_gpu_module_t*)parsec_mca_device_get(dst_copy->device_index); + src_dev = (parsec_device_gpu_module_t*)parsec_mca_device_get(src_copy->device_index); + assert(dst_copy->data_transfer_status != PARSEC_DATA_STATUS_UNDER_TRANSFER); + dst_copy->data_transfer_status = PARSEC_DATA_STATUS_UNDER_TRANSFER; + count = (src_copy->original->nb_elts <= dst_copy->original->nb_elts) ? src_copy->original->nb_elts : dst_copy->original->nb_elts; if( (src_dev->super.type & PARSEC_DEV_ANY_TYPE) == (dst_dev->super.type & PARSEC_DEV_ANY_TYPE) ) { assert( src_dev->peer_access_mask & (1 << dst_dev->super.device_index) ); dir = parsec_device_gpu_transfer_direction_d2d; @@ -1238,8 +1342,8 @@ parsec_default_gpu_stage_out(parsec_gpu_task_t *gtask, dir = parsec_device_gpu_transfer_direction_d2h; } ret = src_dev->memcpy_async( src_dev, gpu_stream, - dest->device_private, - source->device_private, + dst_copy->device_private, + src_copy->device_private, count, dir ); if(PARSEC_SUCCESS != ret) { @@ -1277,7 +1381,24 @@ parsec_device_data_stage_in( parsec_device_gpu_module_t* gpu_device, "GPU[%d:%s]: Prefetch task %p is staging in", gpu_device->super.device_index, gpu_device->super.name, gpu_task); } - + if( NULL == gpu_elem ) { + if( candidate->device_index == gpu_device->super.device_index ) { + /* the candidate is already located on the GPU, no transfer should be necessary but let's do the bookkeeping */ + if( (PARSEC_FLOW_ACCESS_WRITE & type) && (gpu_task->task_type != PARSEC_GPU_TASK_TYPE_PREFETCH) ) { + candidate->version++; + parsec_list_item_ring_chop((parsec_list_item_t *)candidate); + PARSEC_LIST_ITEM_SINGLETON(candidate); + } + if( PARSEC_FLOW_ACCESS_READ & type ) { + parsec_atomic_fetch_add_int32(&candidate->readers, 1); + } + return PARSEC_HOOK_RETURN_DONE; + } + parsec_warning("GPU[%d:%s]:\t device_data_stage_in without a proper data_out on the device " + "and with a data_in (%p) located on another device %d", + gpu_device->super.device_index, gpu_device->super.name, + candidate, candidate->device_index); + } parsec_atomic_lock( &original->lock ); gpu_task->sources[flow->flow_index] = candidate; /* default source for the transfer */ @@ -1648,6 +1769,7 @@ parsec_device_callback_complete_push(parsec_device_gpu_module_t *gpu_device, /* We also don't push back non-parsec-owned copies */ if(NULL != 
task->data[i].data_out && 0 == (task->data[i].data_out->flags & PARSEC_DATA_FLAG_PARSEC_OWNED)) continue; + if( gpu_device->super.device_index == task->data[i].data_in->device_index ) continue; flow = gtask->flow[i]; assert( flow ); @@ -2115,7 +2237,7 @@ parsec_device_kernel_pop( parsec_device_gpu_module_t *gpu_device, /* If the gpu copy is not owned by parsec, we don't manage it at all */ if( 0 == (gpu_copy->flags & PARSEC_DATA_FLAG_PARSEC_OWNED) ) continue; original = gpu_copy->original; - rc = gpu_task->stage_out? gpu_task->stage_out(gpu_task, (1U << i), gpu_stream): PARSEC_SUCCESS; + rc = gpu_task->stage_out ? gpu_task->stage_out(gpu_task, (1U << i), gpu_stream): PARSEC_SUCCESS; if(PARSEC_SUCCESS != rc) { parsec_warning( "GPU[%d:%s]: gpu_task->stage_out from device rc=%d @%s:%d\n" "\tdata %s <<%p>> -> <<%p>>\n", @@ -2153,6 +2275,7 @@ parsec_device_kernel_pop( parsec_device_gpu_module_t *gpu_device, assert( this_task->data[i].data_in == NULL || original == this_task->data[i].data_in->original ); +#if 0 if( (gpu_task->task_type != PARSEC_GPU_TASK_TYPE_D2D_COMPLETE) && !(flow->flow_flags & PARSEC_FLOW_ACCESS_WRITE) ) { /* Do not propagate GPU copies to successors (temporary solution) */ this_task->data[i].data_out = original->device_copies[0]; @@ -2164,6 +2287,7 @@ parsec_device_kernel_pop( parsec_device_gpu_module_t *gpu_device, this_task->data[i].data_out, this_task->data[i].data_out->super.super.obj_reference_count, original); } +#endif parsec_atomic_lock(&original->lock); if( flow->flow_flags & PARSEC_FLOW_ACCESS_READ ) { int current_readers = parsec_atomic_fetch_sub_int32(&gpu_copy->readers, 1) - 1; @@ -2237,7 +2361,7 @@ parsec_device_kernel_pop( parsec_device_gpu_module_t *gpu_device, #endif /* Move the data back into main memory */ rc = gpu_task->stage_out? gpu_task->stage_out(gpu_task, (1U << flow->flow_index), gpu_stream): PARSEC_SUCCESS; - if(PARSEC_SUCCESS != rc) { + if( PARSEC_SUCCESS != rc ) { parsec_warning( "GPU[%d:%s]: gpu_task->stage_out from device rc=%d @%s:%d\n" "\tdata %s <<%p>> -> <<%p>>\n", gpu_device->super.device_index, gpu_device->super.name, rc, __func__, __LINE__, @@ -2276,8 +2400,7 @@ parsec_device_kernel_epilog( parsec_device_gpu_module_t *gpu_device, parsec_gpu_task_t *gpu_task ) { parsec_task_t *this_task = gpu_task->ec; - parsec_gpu_data_copy_t *gpu_copy, *cpu_copy; - parsec_data_t *original; + parsec_gpu_data_copy_t *gpu_copy; int i; #if defined(PARSEC_DEBUG_NOISIER) @@ -2302,45 +2425,57 @@ parsec_device_kernel_epilog( parsec_device_gpu_module_t *gpu_device, } gpu_copy = this_task->data[i].data_out; - original = gpu_copy->original; - cpu_copy = original->device_copies[0]; /* If it is a copy managed by the user, don't bother either */ if( 0 == (gpu_copy->flags & PARSEC_DATA_FLAG_PARSEC_OWNED) ) continue; +#if 0 + parsec_data_t *original = gpu_copy->original; + parsec_gpu_data_copy_t *cpu_copy = original->device_copies[0]; + if( this_task->data[i].data_in == this_task->data[i].data_out ) { + /** + * There might be a race condition here. We can't assume the first CPU + * version is the corresponding CPU copy, as a new CPU-bound data + * might have been created meanwhile. + * + * WARNING: For now we always forward the cpu_copy to the next task, to + * do that, we lie to the engine by updating the CPU copy to the same + * status than the GPU copy without updating the data itself. Thus, the + * cpu copy is really invalid. 
this is related to Issue #88, and the + * fact that: + * - we don't forward the gpu copy as output + * - we always take a cpu copy as input, so it has to be in the + * same state as the GPU to prevent an extra data movement. + */ + assert( PARSEC_DATA_COHERENCY_OWNED == gpu_copy->coherency_state ); + gpu_copy->coherency_state = PARSEC_DATA_COHERENCY_SHARED; + cpu_copy->coherency_state = PARSEC_DATA_COHERENCY_SHARED; - /** - * There might be a race condition here. We can't assume the first CPU - * version is the corresponding CPU copy, as a new CPU-bound data - * might have been created meanwhile. - * - * WARNING: For now we always forward the cpu_copy to the next task, to - * do that, we lie to the engine by updating the CPU copy to the same - * status than the GPU copy without updating the data itself. Thus, the - * cpu copy is really invalid. this is related to Issue #88, and the - * fact that: - * - we don't forward the gpu copy as output - * - we always take a cpu copy as input, so it has to be in the - * same state as the GPU to prevent an extra data movement. - */ - assert( PARSEC_DATA_COHERENCY_OWNED == gpu_copy->coherency_state ); - gpu_copy->coherency_state = PARSEC_DATA_COHERENCY_SHARED; - cpu_copy->coherency_state = PARSEC_DATA_COHERENCY_SHARED; - - cpu_copy->version = gpu_copy->version; - PARSEC_DEBUG_VERBOSE(10, parsec_gpu_output_stream, - "GPU[%d:%s]: CPU copy %p [ref_count %d] gets the same version %d as GPU copy %p [ref_count %d]", - gpu_device->super.device_index, gpu_device->super.name, - cpu_copy, cpu_copy->super.super.obj_reference_count, cpu_copy->version, gpu_copy, gpu_copy->super.super.obj_reference_count); - - /** - * Let's lie to the engine by reporting that working version of this - * data is now on the CPU. - */ - this_task->data[i].data_out = cpu_copy; + cpu_copy->version = gpu_copy->version; + PARSEC_DEBUG_VERBOSE(10, parsec_gpu_output_stream, + "GPU[%d:%s]: CPU copy %p [ref_count %d] gets the same version %d as GPU copy %p [ref_count %d]", + gpu_device->super.device_index, gpu_device->super.name, + cpu_copy, cpu_copy->super.super.obj_reference_count, cpu_copy->version, gpu_copy, gpu_copy->super.super.obj_reference_count); - assert( 0 <= gpu_copy->readers ); + /** + * Let's lie to the engine by reporting that working version of this + * data is now on the CPU. + */ + this_task->data[i].data_out = cpu_copy; + } +#endif + assert(0 <= gpu_copy->readers); if( gpu_task->pushout & (1 << i) ) { + parsec_data_t *original = gpu_copy->original; + parsec_gpu_data_copy_t *cpu_copy = original->device_copies[0]; + /* Update the CPU copy to reflect the current status */ + assert(cpu_copy->version < gpu_copy->version); + cpu_copy->version = gpu_copy->version; + cpu_copy->coherency_state = PARSEC_DATA_COHERENCY_SHARED; + gpu_copy->coherency_state = PARSEC_DATA_COHERENCY_SHARED; + assert(PARSEC_DATA_STATUS_UNDER_TRANSFER == cpu_copy->data_transfer_status); + cpu_copy->data_transfer_status = PARSEC_DATA_STATUS_COMPLETE_TRANSFER; + PARSEC_DEBUG_VERBOSE(20, parsec_gpu_output_stream, "GPU copy %p [ref_count %d] moved to the read LRU in %s", gpu_copy, gpu_copy->super.super.obj_reference_count, __func__); diff --git a/parsec/mca/device/device_gpu.h b/parsec/mca/device/device_gpu.h index d931b37ff..70c6a9abe 100644 --- a/parsec/mca/device/device_gpu.h +++ b/parsec/mca/device/device_gpu.h @@ -65,7 +65,7 @@ typedef int (parsec_stage_in_function_t)(parsec_gpu_task_t *gtask, /* Function type to transfer data from the GPU device. 
* Transfer transfer the contiguous bytes from - * task->data[i].data_in to task->data[i].data_out. + * task->data[i].data_out to the copy on device 0. * * @param[in] task parsec_task_t containing task->data[i].data_in, task->data[i].data_out. * @param[in] flow_mask indicating task flows for which to transfer. diff --git a/parsec/parsec.c b/parsec/parsec.c index 095fcad27..abf36f809 100644 --- a/parsec/parsec.c +++ b/parsec/parsec.c @@ -1798,7 +1798,7 @@ parsec_release_dep_fct(parsec_execution_stream_t *es, * Check that we don't forward a NULL data to someone else. This * can be done only on the src node, since the dst node can * check for datatypes without knowing the data yet. - * By checking now, we allow for the data to be created any time bfore we + * By checking now, we allow for the data to be created any time before we * actually try to transfer it. */ if( PARSEC_UNLIKELY((data->data == NULL) && @@ -1839,20 +1839,20 @@ parsec_release_dep_fct(parsec_execution_stream_t *es, #ifdef PARSEC_RESHAPE_BEFORE_SEND_TO_REMOTE /* Now everything is a reshaping entry */ /* Check if we need to reshape before sending */ - if(parsec_is_CTL_dep(output->data)){ /* CTL DEP */ + if(parsec_is_CTL_dep(output->data)) { /* CTL DEP */ output->data.data_future = NULL; output->data.repo = NULL; output->data.repo_key = -1; - }else{ + } else { /* Get reshape from whatever repo it has been set up into */ output->data.data_future = (parsec_datacopy_future_t*)target_dc; output->data.repo = target_repo; output->data.repo_key = target_repo_entry->ht_item.key; PARSEC_DEBUG_VERBOSE(4, parsec_debug_output, - "th%d RESHAPE_PROMISE SETUP FOR REMOTE DEPS [%p:%p] for INLINE REMOTE %s fut %p", - es->th_id, output->data.data, (output->data.data)->dtt, - (target_repo == successor_repo? "UNFULFILLED" : "FULFILLED"), - output->data.data_future); + "th%d RESHAPE_PROMISE SETUP FOR REMOTE DEPS [%p:%p] for INLINE REMOTE %s fut %p", + es->th_id, output->data.data, (output->data.data)->dtt, + (target_repo == successor_repo? "UNFULFILLED" : "FULFILLED"), + output->data.data_future); } #endif } else { diff --git a/parsec/parsec_mpi_funnelled.c b/parsec/parsec_mpi_funnelled.c index 7e158f11f..c0a95dc17 100644 --- a/parsec/parsec_mpi_funnelled.c +++ b/parsec/parsec_mpi_funnelled.c @@ -1319,6 +1319,10 @@ parsec_check_overlapping_binding(parsec_context_t *context) #endif } +#if defined(PARSEC_HAVE_DEV_CUDA_SUPPORT) +#include +#endif /* defined(PARSEC_HAVE_DEV_CUDA_SUPPORT) */ + int mpi_no_thread_enable(parsec_comm_engine_t *ce) { @@ -1330,6 +1334,21 @@ mpi_no_thread_enable(parsec_comm_engine_t *ce) if(parsec_ce_mpi_comm == (MPI_Comm)context->comm_ctx) { return PARSEC_SUCCESS; } +#if defined(PARSEC_HAVE_DEV_CUDA_SUPPORT) + /* The communication thread need to have a CUDA context in order to be able to use + * CUDA managed memory. It should be enough to just create a context onto the first + * active device. 
+ */ + for (int dev = 0; dev < (int)parsec_nb_devices; dev++) { + parsec_device_module_t *device = parsec_mca_device_get(dev); + if (PARSEC_DEV_CUDA & device->type) { + parsec_device_gpu_module_t *gpu_dev = (parsec_device_gpu_module_t*)device; + gpu_dev->set_device(gpu_dev); + } + } +#endif /* defined(PARSEC_HAVE_DEV_CUDA_SUPPORT) */ + + /* Finish the initialization of the communication engine */ parsec_ce.mem_register = mpi_no_thread_mem_register; parsec_ce.mem_unregister = mpi_no_thread_mem_unregister; diff --git a/parsec/parsec_reshape.c b/parsec/parsec_reshape.c index 121a8570b..eb48e1d46 100644 --- a/parsec/parsec_reshape.c +++ b/parsec/parsec_reshape.c @@ -50,8 +50,8 @@ void parsec_cleanup_reshape_promise(parsec_base_future_t *future) free(match_data); } if(d_fut->super.tracked_data != NULL){ - parsec_data_copy_t * data = (parsec_data_copy_t*) d_fut->super.tracked_data; - PARSEC_DATA_COPY_RELEASE(data); + parsec_data_copy_t * copy = (parsec_data_copy_t*) d_fut->super.tracked_data; + PARSEC_DATA_COPY_RELEASE(copy); } } @@ -141,7 +141,7 @@ parsec_new_reshape_promise(parsec_dep_data_description_t* data, * going to consume the original data->data in order to reshape it, and * all other successors will use directly the reshaped data instead. */ - PARSEC_OBJ_RETAIN( future_in_data->data ); + PARSEC_DATA_COPY_RETAIN( future_in_data->data ); return data_future; } @@ -331,7 +331,7 @@ parsec_create_reshape_promise(parsec_execution_stream_t *es, } #endif - /* retain the future if it's being reuse. */ + /* retain the future if it's being reused. */ if ( !new_future ) PARSEC_OBJ_RETAIN(data->data_future); /* Set up the reshape promise. */ @@ -439,7 +439,7 @@ parsec_set_up_reshape_promise(parsec_execution_stream_t *es, /* Data has been received with the expected remote type of the * successor contained on data->data->dtt. */ data->local.dst_datatype = data->local.src_datatype = data->data->dtt; - }else{ + } else { /* Packed data because multiple unpacking alternatives at reception. 
*/ const parsec_task_class_t* fct = newcontext->task_class; uint32_t flow_mask = (1U << dep->flow->flow_index) | 0x80000000; /* in flow */ @@ -579,7 +579,7 @@ parsec_get_copy_reshape_inline(parsec_execution_stream_t *es, data->data_future = (parsec_datacopy_future_t*)reshape_repo_entry->data[dep_flow_index]; } - if(data->data_future == NULL){ + if(data->data_future == NULL) { parsec_create_reshape_promise(es, data, 0, -1, -1, /* no src dst rank */ @@ -595,9 +595,6 @@ parsec_get_copy_reshape_inline(parsec_execution_stream_t *es, char task_name[MAX_TASK_STRLEN]; parsec_task_snprintf(task_name, MAX_TASK_STRLEN, task); - char type_string[MAX_TASK_STRLEN]="UNFULFILLED"; - char orig_string[MAX_TASK_STRLEN]="LOCAL INLINE"; - char type_name_src[MAX_TASK_STRLEN] = "NULL"; char type_name_dst[MAX_TASK_STRLEN] = "NULL"; char type_name_data[MAX_TASK_STRLEN] = "NULL"; @@ -608,12 +605,11 @@ parsec_get_copy_reshape_inline(parsec_execution_stream_t *es, PARSEC_DEBUG_VERBOSE(12, parsec_debug_output, "th%d RESHAPE_PROMISE CREATE %s %s [%s:%p:%p -> %p] flow_idx %u fut %p on %s(%p) k%d dtt %s -> %s [data %s]", - es->th_id, type_string, orig_string, task_name, data->data, data->data->dtt, + es->th_id, "UNFULFILLED", "LOCAL INLINE", task_name, data->data, data->data->dtt, data->local.dst_datatype, dep_flow_index, data->data_future, "CURR_REPO", setup_repo, setup_repo_key, type_name_src, type_name_dst, type_name_data); - #endif } @@ -629,7 +625,7 @@ parsec_get_copy_reshape_inline(parsec_execution_stream_t *es, es->th_id, *reshape, (*reshape)->dtt, data->data_future); /* reshape completed */ - PARSEC_OBJ_RETAIN(*reshape); + PARSEC_DATA_COPY_RETAIN(*reshape); PARSEC_OBJ_RELEASE(data->data_future); /* Clean up the old stuff on the repo used temporarily to hold * the inline reshape promise. @@ -642,7 +638,7 @@ parsec_get_copy_reshape_inline(parsec_execution_stream_t *es, /** * * Routine to obtain a reshaped copy matching the specifications when reading - * a tile from the datacollection. + * a tile from the data collection. * If a reshape needs to be performed, it is done using an inline reshape * promise, i.e., creating and fulfilling a local future promise (only * the current task instance is involved). 
Each thread accessing the same @@ -769,7 +765,7 @@ parsec_get_copy_reshape_from_dep(parsec_execution_stream_t *es, "th%d RESHAPE_PROMISE OBTAINED [%p:%p] for %s fut %p", es->th_id, *reshape, (*reshape)->dtt, task_string, data->data_future); - PARSEC_OBJ_RETAIN(*reshape); + PARSEC_DATA_COPY_RETAIN(*reshape); PARSEC_OBJ_RELEASE(data->data_future); return PARSEC_HOOK_RETURN_RESHAPE_DONE; diff --git a/parsec/remote_dep.c b/parsec/remote_dep.c index 1f6920cac..fe4b6c5ec 100644 --- a/parsec/remote_dep.c +++ b/parsec/remote_dep.c @@ -492,7 +492,7 @@ int parsec_remote_dep_activate(parsec_execution_stream_t* es, /* This assert is not correct anymore, we don't need an arena to send to a remote * assert(NULL != output->data.remote.arena);*/ assert( !parsec_is_CTL_dep(&output->data) ); - PARSEC_OBJ_RETAIN(output->data.data); + PARSEC_DATA_COPY_RETAIN(output->data.data); } for( array_index = count = 0; count < remote_deps->output[i].count_bits; array_index++ ) { diff --git a/parsec/remote_dep.h b/parsec/remote_dep.h index 17fa00fa4..358be17b8 100644 --- a/parsec/remote_dep.h +++ b/parsec/remote_dep.h @@ -87,6 +87,11 @@ struct parsec_dep_data_description_s { */ parsec_datacopy_future_t *data_future; + /* If we can extract a preferred location for the incoming data set it + * here, otherwise the memory for the incoming data will be allocated + * on the main memory (device 0). + */ + int32_t preferred_device; #ifdef PARSEC_RESHAPE_BEFORE_SEND_TO_REMOTE /* Keeping current repo & key to be able to consume when * the "remote" successors (aka the communication engine) diff --git a/parsec/remote_dep_mpi.c b/parsec/remote_dep_mpi.c index 71a61f675..9a34d56a9 100644 --- a/parsec/remote_dep_mpi.c +++ b/parsec/remote_dep_mpi.c @@ -142,10 +142,6 @@ parsec_execution_stream_t parsec_comm_es = { static void remote_dep_mpi_put_start(parsec_execution_stream_t* es, dep_cmd_item_t* item); static void remote_dep_mpi_get_start(parsec_execution_stream_t* es, parsec_remote_deps_t* deps); -static void remote_dep_mpi_get_end(parsec_execution_stream_t* es, - int idx, - parsec_remote_deps_t* deps); - static int remote_dep_mpi_get_end_cb(parsec_comm_engine_t *ce, parsec_ce_tag_t tag, @@ -565,25 +561,34 @@ void parsec_remote_dep_memcpy(parsec_execution_stream_t* es, item->cmd.memcpy.destination = dst; item->cmd.memcpy.layout = data->local; - PARSEC_OBJ_RETAIN(src); + PARSEC_DATA_COPY_RETAIN(src); remote_dep_inc_flying_messages(tp); parsec_dequeue_push_back(&dep_cmd_queue, (parsec_list_item_t*) item); } static inline parsec_data_copy_t* -remote_dep_copy_allocate(parsec_dep_type_description_t* data) +remote_dep_copy_allocate(parsec_dep_type_description_t* data, int preferred_device) { parsec_data_copy_t* dc; if( NULL == data->arena ) { assert(0 == data->dst_count); return NULL; } - dc = parsec_arena_get_copy(data->arena, data->dst_count, 0, data->dst_datatype); - - dc->coherency_state = PARSEC_DATA_COHERENCY_EXCLUSIVE; - PARSEC_DEBUG_VERBOSE(20, parsec_comm_output_stream, "MPI:\tMalloc new remote tile %p size %" PRIu64 " count = %" PRIu64 " displ = %" PRIi64 " %p", - dc, data->arena->elem_size, data->dst_count, data->dst_displ, data->arena); + /* Go and allocate on the preferred device. If that fails, fall back and allocate a + * copy on the main memory (device 0), and parsec will transfer the data as needed + * for all tasks executing on acclerators. 
+     */
+    dc = parsec_arena_get_new_copy(data->arena, data->dst_count, preferred_device, data->dst_datatype);
+    PARSEC_DATA_COPY_RETAIN(dc);
+    /* don't use preferred_device, it might not be the location where the data copy resides */
+    parsec_data_start_transfer_ownership_to_copy(dc->original, dc->device_index, PARSEC_FLOW_ACCESS_WRITE);
+    if (dc->device_index != preferred_device) {
+        PARSEC_DEBUG_VERBOSE(5, parsec_comm_output_stream, "MPI:\tFailed to allocate tile on device %d, allocating on device %d instead\n",
+                             preferred_device, dc->device_index);
+    }
+    PARSEC_DEBUG_VERBOSE(5, parsec_comm_output_stream, "MPI:\tMalloc new temporary tile [dev %d] copy %p size %" PRIu64 " count = %" PRIu64 " displ = %" PRIi64 " %p",
+                         dc->device_index, dc, data->arena->elem_size, data->dst_count, data->dst_displ, data->arena);
     return dc;
 }
 
@@ -598,7 +603,7 @@ static inline parsec_data_copy_t*
 reshape_copy_allocate(parsec_dep_type_description_t* data)
 {
     parsec_data_copy_t* dc;
-    dc = remote_dep_copy_allocate(data);
+    dc = remote_dep_copy_allocate(data, 0 /* default device */);
 
     parsec_data_start_transfer_ownership_to_copy(dc->original,
                                                  0,
@@ -608,7 +613,7 @@ reshape_copy_allocate(parsec_dep_type_description_t* data)
 
 /**
  *
- * Routine to fulfilled a reshape promise by the current thread
+ * Fulfill a reshape promise by the current thread
 * (when MPI_THREAD_MULTIPLE) or delegate the reshaping to the communication
 * thread.
 * Routine set as callback when initializing a future.
@@ -657,9 +662,9 @@ void parsec_local_reshape_cb(parsec_base_future_t *future, ... )
     }
     if(src_pack_size != dst_pack_size){
         parsec_warning("parsec_local_reshape: reshape requested between dtt with different packed size fut %p dtt [%p:%s = sz(%d) -> %p:%s= sz(%d)]",
-            future,
-            dt->local->src_datatype, type_name_src, src_pack_size,
-            dt->local->dst_datatype, type_name_dst, dst_pack_size);
+                       future,
+                       dt->local->src_datatype, type_name_src, src_pack_size,
+                       dt->local->dst_datatype, type_name_dst, dst_pack_size);
     }
 #endif
 
@@ -779,13 +784,19 @@ remote_dep_mpi_retrieve_datatype(parsec_execution_stream_t *eu,
             return PARSEC_ITERATE_STOP;
         }
         if(old_dtt != PARSEC_DATATYPE_NULL) {
+            /* multiple input deps from the same predecessor exist. Be careful about the format
+             * in which to receive the data. It would not make sense to receive different amounts, but
+             * it is legal to receive them with different type signatures. In this case, ignore
+             * the datatype, and instead fall back to a packed format (aka bytes) and the
+             * entire length of the incoming data.
+             */
             if(old_dtt != output->data.remote.dst_datatype) {
 #if defined(PARSEC_DEBUG_NOISIER)
                 char type_name_src[MAX_TASK_STRLEN] = "NULL";
                 char type_name_dst[MAX_TASK_STRLEN] = "NULL";
                 int len;
-                if(old_dtt!=PARSEC_DATATYPE_NULL) MPI_Type_get_name(old_dtt, type_name_src, &len);
-                if(output->data.remote.dst_datatype!=PARSEC_DATATYPE_NULL) MPI_Type_get_name(output->data.remote.dst_datatype, type_name_dst, &len);
+                if(old_dtt != PARSEC_DATATYPE_NULL) MPI_Type_get_name(old_dtt, type_name_src, &len);
+                if(output->data.remote.dst_datatype != PARSEC_DATATYPE_NULL) MPI_Type_get_name(output->data.remote.dst_datatype, type_name_dst, &len);
                 PARSEC_DEBUG_VERBOSE(30, parsec_comm_output_stream, "MPI: retrieve dtt for %s [dep_datatype_index %x] DTT: old %s new %s (%p) --> PACKED",
                                      newcontext->task_class->name, dep->dep_datatype_index, type_name_src, type_name_dst, output->data.remote.dst_datatype);
 #endif
@@ -793,7 +804,6 @@ remote_dep_mpi_retrieve_datatype(parsec_execution_stream_t *eu,
                 parsec_ce.pack_size(&parsec_ce, output->data.remote.dst_count, output->data.remote.dst_datatype, &dsize);
                 output->data.remote.src_count = output->data.remote.dst_count = dsize;
                 output->data.remote.src_datatype = output->data.remote.dst_datatype = PARSEC_DATATYPE_PACKED;
-                return PARSEC_ITERATE_STOP;
             }
         }
@@ -808,6 +818,18 @@ remote_dep_mpi_retrieve_datatype(parsec_execution_stream_t *eu,
                              newcontext->task_class->name, dep->dep_datatype_index, type_name_src, type_name_dst, output->data.remote.dst_datatype);
     }
 #endif
+    /* Predict where the incoming temporary should be located by using the data_affinity.
+     * This only works if the task affinity is linked to the output location of the task, which
+     * is mostly true for owner-compute type of algorithms.
+     */
+    output->data.preferred_device = 0;  /* the default is CPU memory (aka. device 0) */
+    if (NULL != fct->data_affinity ) {
+        parsec_data_ref_t dref;
+        fct->data_affinity(newcontext, &dref);
+        parsec_data_t* data = dref.dc->data_of_key(dref.dc, dref.key);
+        output->data.preferred_device = (-1 != data->preferred_device) ?
+                                        data->preferred_device : data->owner_device;
+    }
     return PARSEC_ITERATE_CONTINUE;
 }
 
@@ -989,6 +1011,12 @@ remote_dep_release_incoming(parsec_execution_stream_t* es,
         task.data[target->flow_index].source_repo_entry = NULL;
         task.data[target->flow_index].data_in = origin->output[i].data.data;
         task.data[target->flow_index].data_out = origin->output[i].data.data;
+        if( NULL != origin->output[i].data.data ) {  /* nothing for control flows */
+            /* The data has been fully received, mark the copy accordingly */
+            task.data[target->flow_index].data_in->coherency_state = PARSEC_DATA_COHERENCY_OWNED;
+            task.data[target->flow_index].data_in->flags &= ~PARSEC_DATA_FLAG_TRANSIT;  /* not in transit anymore */
+            task.data[target->flow_index].data_in->data_transfer_status = PARSEC_DATA_STATUS_COMPLETE_TRANSFER;
+        }
     }
 
 #ifdef PARSEC_DIST_COLLECTIVES
@@ -1436,7 +1464,7 @@ static int local_dep_nothread_reshape(parsec_execution_stream_t* es,
      * once all successors have consumed the future, in case it is needed
      * as an input for nested futures.
*/ - PARSEC_OBJ_RETAIN(cmd->memcpy.source); + PARSEC_DATA_COPY_RETAIN(cmd->memcpy.source); int rc = remote_dep_nothread_memcpy(es, item); assert(MPI_SUCCESS == rc); @@ -1649,7 +1677,7 @@ remote_dep_mpi_put_start(parsec_execution_stream_t* es, deps->output[k].data.data = reshape_data; - PARSEC_OBJ_RETAIN(reshape_data); + PARSEC_DATA_COPY_RETAIN(reshape_data); PARSEC_DATA_COPY_RELEASE(old_data);/*old data has been retained for remote communication*/ PARSEC_OBJ_RELEASE(deps->output[k].data.data_future); @@ -1854,7 +1882,8 @@ static void remote_dep_mpi_recv_activate(parsec_execution_stream_t* es, if((length - (*position)) >= (int)data_sizes[ds_idx]) { assert(NULL == data_desc->data); /* we do not support in-place tiles now, make sure it doesn't happen yet */ if(NULL == data_desc->data) { - data_desc->data = remote_dep_copy_allocate(type_desc); + /* if we have to unpack the data onto this new copy we might want to allocated it on the CPU */ + data_desc->data = remote_dep_copy_allocate(type_desc, 0); } #ifndef PARSEC_PROF_DRY_DEP PARSEC_DEBUG_VERBOSE(10, parsec_comm_output_stream, @@ -2080,9 +2109,13 @@ static void remote_dep_mpi_get_start(parsec_execution_stream_t* es, /* prepare the local receiving data */ assert(NULL == deps->output[k].data.data); /* we do not support in-place tiles now, make sure it doesn't happen yet */ if(NULL == deps->output[k].data.data) { - deps->output[k].data.data = remote_dep_copy_allocate(&deps->output[k].data.remote); + deps->output[k].data.data = remote_dep_copy_allocate(&deps->output[k].data.remote, + deps->output[k].data.preferred_device); } - dtt = deps->output[k].data.remote.dst_datatype; + /* Mark the data under tranfer */ + deps->output[k].data.data->data_transfer_status = PARSEC_DATA_STATUS_UNDER_TRANSFER; + deps->output[k].data.data->flags |= PARSEC_DATA_FLAG_TRANSIT; + dtt = deps->output[k].data.remote.dst_datatype; nbdtt = deps->output[k].data.remote.dst_count; /* We have the remote mem_handle. @@ -2159,14 +2192,6 @@ static void remote_dep_mpi_get_start(parsec_execution_stream_t* es, } } -static void remote_dep_mpi_get_end(parsec_execution_stream_t* es, - int idx, - parsec_remote_deps_t* deps) -{ - /* The ref on the data will be released below */ - remote_dep_release_incoming(es, deps, (1U<es_profile, MPI_Data_pldr_ek, callback_data->event_id); #endif /* PARSEC_PROF_TRACE */ - remote_dep_mpi_get_end(es, callback_data->k, deps); + remote_dep_release_incoming(es, deps, (1U << callback_data->k)); parsec_ce.mem_unregister(&callback_data->memory_handle); parsec_thread_mempool_free(parsec_remote_dep_cb_data_mempool->thread_mempools, callback_data); @@ -2239,6 +2264,7 @@ int remote_dep_ce_reconfigure(parsec_context_t* context) * execution stream to parsec_comm_es. 
*/ parsec_set_my_execution_stream(&parsec_comm_es); } + return PARSEC_SUCCESS; } diff --git a/parsec/utils/zone_malloc.c b/parsec/utils/zone_malloc.c index fc6d6f238..54e01b55c 100644 --- a/parsec/utils/zone_malloc.c +++ b/parsec/utils/zone_malloc.c @@ -38,7 +38,7 @@ zone_malloc_t* zone_malloc_init(void* base_ptr, int _max_segment, size_t _unit_s gdata->base = base_ptr; gdata->unit_size = _unit_size; gdata->max_segment = _max_segment; - + gdata->lock = PARSEC_ATOMIC_UNLOCKED; gdata->next_tid = 0; gdata->segments = (segment_t *)malloc(sizeof(segment_t) * _max_segment); #if defined(PARSEC_DEBUG) @@ -74,6 +74,7 @@ void *zone_malloc(zone_malloc_t *gdata, size_t size) int next_tid, current_tid, new_tid; int cycled_through = 0, nb_units; + parsec_atomic_lock(&gdata->lock); /* Let's start with the last remembered free slot */ current_tid = gdata->next_tid; nb_units = (size + gdata->unit_size - 1) / gdata->unit_size; @@ -87,6 +88,7 @@ void *zone_malloc(zone_malloc_t *gdata, size_t size) cycled_through = 1; current_segment = SEGMENT_AT_TID(gdata, current_tid); } else { + parsec_atomic_unlock(&gdata->lock); return NULL; } } @@ -111,12 +113,14 @@ void *zone_malloc(zone_malloc_t *gdata, size_t size) current_segment->nb_units = nb_units; } - return (void*)(gdata->base + (current_tid * gdata->unit_size)); + parsec_atomic_unlock(&gdata->lock); + return (void *)(gdata->base + (current_tid * gdata->unit_size)); } current_tid += current_segment->nb_units; } while( current_tid != gdata->next_tid ); + parsec_atomic_unlock(&gdata->lock); return NULL; } diff --git a/parsec/utils/zone_malloc.h b/parsec/utils/zone_malloc.h index a775c6372..c79a67227 100644 --- a/parsec/utils/zone_malloc.h +++ b/parsec/utils/zone_malloc.h @@ -8,7 +8,7 @@ #define _ZONE_MALLOC_H_ #include "parsec/parsec_config.h" - +#include "parsec/include/parsec/sys/atomic.h" #include #include @@ -30,6 +30,7 @@ typedef struct zone_malloc_s { size_t unit_size; /* Basic Unit */ int max_segment; /* Maximum number of segment */ int next_tid; /* Next TID to look at for a malloc */ + parsec_atomic_lock_t lock; } zone_malloc_t; diff --git a/tests/apps/haar_tree/project_dyn.jdf b/tests/apps/haar_tree/project_dyn.jdf index a34a9cf80..878b74489 100644 --- a/tests/apps/haar_tree/project_dyn.jdf +++ b/tests/apps/haar_tree/project_dyn.jdf @@ -135,7 +135,7 @@ static int my_project_dyn_startup(parsec_execution_stream_t * es, __parsec_proje new_task->data._f_RL.data_out = NULL; new_task->data._f_NODE.source_repo = NULL; new_task->data._f_NODE.source_repo_entry = NULL; - chunk = parsec_arena_get_copy(__tp->super.arenas_datatypes[PARSEC_project_dyn_DEFAULT_ADT_IDX].arena, 1, 0, __tp->super.arenas_datatypes[PARSEC_project_dyn_DEFAULT_ADT_IDX].opaque_dtt); + chunk = parsec_arena_get_new_copy(__tp->super.arenas_datatypes[PARSEC_project_dyn_DEFAULT_ADT_IDX].arena, 1, 0, __tp->super.arenas_datatypes[PARSEC_project_dyn_DEFAULT_ADT_IDX].opaque_dtt); chunk->original->owner_device = 0; new_task->data._f_NODE.data_out = chunk; new_task->data._f_NODE.data_in = chunk; diff --git a/tests/runtime/cuda/rtt.jdf b/tests/runtime/cuda/rtt.jdf index 28fe93176..a7b3a6cdd 100644 --- a/tests/runtime/cuda/rtt.jdf +++ b/tests/runtime/cuda/rtt.jdf @@ -13,7 +13,7 @@ k = 0 .. NT-1 : A(0, k % WS) RW T <- (k == 0) ? A(0, k % WS) : T PING(k-1) - -> (k < NT) ? T PING(k+1) : A(0, k % WS) + -> (k < NT) ? 
T PING(k+1) ; 0 diff --git a/tests/runtime/cuda/rtt_main.c b/tests/runtime/cuda/rtt_main.c index e3c212907..0d2f5ae41 100644 --- a/tests/runtime/cuda/rtt_main.c +++ b/tests/runtime/cuda/rtt_main.c @@ -60,40 +60,15 @@ __parsec_rtt_destructor(parsec_rtt_taskpool_t *rtt_tp) PARSEC_OBJ_CLASS_INSTANCE(parsec_rtt_taskpool_t, parsec_taskpool_t, NULL, __parsec_rtt_destructor); -parsec_taskpool_t *rtt_New(parsec_context_t *ctx, size_t size, int roundtrips) +parsec_taskpool_t *rtt_New(parsec_context_t *ctx, parsec_matrix_block_cyclic_t *dcA, + parsec_datatype_t block, int roundtrips) { parsec_rtt_taskpool_t *tp = NULL; - parsec_datatype_t block; - size_t mb = sqrt(size), nb = size / mb; - - if (mb <= 0) { - fprintf(stderr, "To work, RTT must do at least one round time trip of at least one byte\n"); - return (parsec_taskpool_t *)tp; - } - - parsec_matrix_block_cyclic_t* dcA = (parsec_matrix_block_cyclic_t *)calloc(1, sizeof(parsec_matrix_block_cyclic_t)); - parsec_matrix_block_cyclic_init(dcA, PARSEC_MATRIX_BYTE, PARSEC_MATRIX_TILE, - ctx->my_rank, - mb, nb, - mb, ctx->nb_nodes * nb, - 0, 0, - mb, ctx->nb_nodes * nb, - 1, ctx->nb_nodes, 1, 1, - 0, 0); - dcA->mat = parsec_data_allocate((size_t)dcA->super.nb_local_tiles * - (size_t)dcA->super.bsiz * - (size_t)parsec_datadist_getsizeoftype(dcA->super.mtype)); - parsec_data_collection_set_key((parsec_data_collection_t *)dcA, "A"); - - /* Initialize and place the dcA */ - parsec_apply(ctx, PARSEC_MATRIX_FULL, - (parsec_tiled_matrix_t *)dcA, - (parsec_tiled_matrix_unary_op_t)matrix_init_ops, NULL); tp = parsec_rtt_new((parsec_data_collection_t*)dcA, roundtrips, ctx->nb_nodes); + tp->arenas_datatypes[PARSEC_rtt_DEFAULT_ADT_IDX].opaque_dtt = block; ptrdiff_t lb, extent; - parsec_type_create_contiguous(mb*nb, parsec_datatype_uint8_t, &block); parsec_type_extent(block, &lb, &extent); parsec_arena_datatype_construct(&tp->arenas_datatypes[PARSEC_rtt_DEFAULT_ADT_IDX], @@ -144,26 +119,33 @@ int main(int argc, char *argv[]) } argv[argc] = NULL; #if defined(DISTRIBUTED) + { + int provided; + MPI_Init_thread(NULL, NULL, MPI_THREAD_SERIALIZED, &provided); + } + MPI_Comm_size(MPI_COMM_WORLD, &size); + MPI_Comm_rank(MPI_COMM_WORLD, &rank); #if defined(PARSEC_HAVE_DEV_CUDA_SUPPORT) - extern char **environ; - char *value; - asprintf(&value, "%d", nb_gpus); - parsec_setenv_mca_param("device_cuda_enabled", value, &environ); - free(value); - value = NULL; - if (0xFF != gpu_mask) { + if (0xFF == gpu_mask) { + extern char **environ; + MPI_Comm local_comm; + int local_rank, local_size; + MPI_Comm_split_type(MPI_COMM_WORLD, MPI_COMM_TYPE_SHARED, 0, + MPI_INFO_NULL, &local_comm); + MPI_Comm_rank(local_comm, &local_rank); + MPI_Comm_size(local_comm, &local_size); + MPI_Comm_free(&local_comm); + int gpu_mask = 0; + for (int i = 0; i < nb_gpus; i++) { + gpu_mask |= ((1 << local_rank) << i); + } + char *value; asprintf(&value, "%d", gpu_mask); parsec_setenv_mca_param("device_cuda_mask", value, &environ); free(value); value = NULL; } #endif - { - int provided; - MPI_Init_thread(NULL, NULL, MPI_THREAD_SERIALIZED, &provided); - } - MPI_Comm_size(MPI_COMM_WORLD, &size); - MPI_Comm_rank(MPI_COMM_WORLD, &rank); #endif /* DISTRIBUTED */ if( 0 == rank ) { printf("Running %d tests of %d steps RTT with a data of size %zu\n", @@ -178,9 +160,6 @@ int main(int argc, char *argv[]) parsec_warning("This test can only run if at least one GPU device is present"); exit(-PARSEC_ERR_DEVICE); } - if( do_sleep ) { - sleep(do_sleep); - } cuda_device_index = (int *)malloc(parsec_nb_devices * sizeof(int)); 
cuda_device_index_len = 0; for (int dev = 0; dev < (int)parsec_nb_devices; dev++) { @@ -190,12 +169,44 @@ int main(int argc, char *argv[]) } } + parsec_datatype_t block; + size_t mb = sqrt(msg_size), nb = msg_size / mb; + if (mb <= 0) { + fprintf(stderr, "To work, RTT must do at least one round time trip of at least one byte\n"); + exit(-1); + } + + parsec_matrix_block_cyclic_t* dcA = (parsec_matrix_block_cyclic_t *)calloc(1, sizeof(parsec_matrix_block_cyclic_t)); + parsec_matrix_block_cyclic_init(dcA, PARSEC_MATRIX_BYTE, PARSEC_MATRIX_TILE, + parsec->my_rank, + mb, nb, + mb, parsec->nb_nodes * nb, + 0, 0, + mb, parsec->nb_nodes * nb, + 1, parsec->nb_nodes, 1, 1, + 0, 0); + dcA->mat = parsec_data_allocate((size_t)dcA->super.nb_local_tiles * + (size_t)dcA->super.bsiz * + (size_t)parsec_datadist_getsizeoftype(dcA->super.mtype)); + parsec_data_collection_set_key((parsec_data_collection_t *)dcA, "A"); + + parsec_type_create_contiguous(mb * nb, parsec_datatype_uint8_t, &block); + + /* Initialize and place the dcA */ + parsec_apply(parsec, PARSEC_MATRIX_FULL, + (parsec_tiled_matrix_t *)dcA, + (parsec_tiled_matrix_unary_op_t)matrix_init_ops, NULL); + + + if( do_sleep ) { + sleep(do_sleep); + } #if defined(PARSEC_HAVE_MPI) MPI_Barrier(MPI_COMM_WORLD); #endif /* defined(PARSEC_HAVE_MPI) */ gettimeofday(&tstart, NULL); for( int test_id = 0; test_id < nb_runs; test_id++ ) { - tp = rtt_New(parsec, msg_size, loops); + tp = rtt_New(parsec, dcA, block, loops); if( NULL != tp ) { parsec_context_add_taskpool(parsec, tp); parsec_context_start(parsec); diff --git a/tests/runtime/cuda/stage_custom.jdf b/tests/runtime/cuda/stage_custom.jdf index 642aabe9b..986c443db 100644 --- a/tests/runtime/cuda/stage_custom.jdf +++ b/tests/runtime/cuda/stage_custom.jdf @@ -128,8 +128,8 @@ complete_batched_callback(parsec_device_gpu_module_t *dev, parsec_gpu_task_t ** gpu_task, parsec_gpu_exec_stream_t *gpu_stream) { - PARSEC_DEBUG_VERBOSE((10, parsec_debug_output, "complete_batched_callback for batched task %p on stream %s{%p}\n", - gpu_task, gpu_stream->name, (void*)gpu_stream)); + PARSEC_DEBUG_VERBOSE(10, parsec_debug_output, "complete_batched_callback for batched task %p on stream %s{%p}\n", + gpu_task, gpu_stream->name, (void*)gpu_stream); (void)dev; (void) gpu_task; (void)gpu_stream; parsec_list_item_t* output_stream_ghost = &dev->exec_stream[1]->fifo_pending->ghost_element; parsec_list_item_ring_merge(output_stream_ghost, &(*gpu_task)->list_item); @@ -184,8 +184,8 @@ BODY [type=CUDA /* same task class as the current one, possible to batch */ (void)parsec_list_item_ring_push(&gpu_task->list_item, (parsec_list_item_t*)task); how_many++; /* one more into the batch */ - PARSEC_DEBUG_VERBOSE((10, parsec_debug_output, "Add task %p to the %p batch on stream %s{%p}\n", - task, gpu_task, gpu_stream->name, (void*)gpu_stream)); + PARSEC_DEBUG_VERBOSE(10, parsec_debug_output, "Add task %p to the %p batch on stream %s{%p}\n", + task, gpu_task, gpu_stream->name, (void*)gpu_stream); if( 5 == how_many ) { /* let's stop here for now */ break; @@ -204,8 +204,8 @@ BODY [type=CUDA */ if( how_many > 1 ) gpu_task->complete_stage = complete_batched_callback; - PARSEC_DEBUG_VERBOSE((10, parsec_debug_output, "submit multiple tasks into one %p on stream %s{%p}\n", - gpu_task, gpu_stream->name, (void*)gpu_stream)); + PARSEC_DEBUG_VERBOSE(10, parsec_debug_output, "submit multiple tasks into one %p on stream %s{%p}\n", + gpu_task, gpu_stream->name, (void*)gpu_stream); if( NULL != store_back ) { 
         parsec_list_item_ring_merge(&gpu_stream->fifo_pending->ghost_element, store_back);
     }

From 10ca380373f25a911fbca5d6c6d38b6c1d2664ff Mon Sep 17 00:00:00 2001
From: George Bosilca
Date: Wed, 7 Aug 2024 19:56:30 -0700
Subject: [PATCH 13/14] Mostly improvements to the debugging output.

Name the data_t allocated for temporaries, allowing developers to track
them through the execution.
Add the keys to all outputs (tasks and copies).

Signed-off-by: George Bosilca
---
 parsec/arena.c                 |  9 +++++++++
 parsec/class/info.c            |  2 ++
 parsec/data.c                  |  5 +++--
 parsec/mca/device/device_gpu.c | 26 ++++++++++++++------------
 parsec/parsec.c                | 14 ++++++++++++--
 parsec/remote_dep_mpi.c        |  1 +
 parsec/utils/debug.h           |  2 +-
 parsec/utils/zone_malloc.c     | 12 ++++--------
 8 files changed, 46 insertions(+), 25 deletions(-)

diff --git a/parsec/arena.c b/parsec/arena.c
index 648c7ed16..908c313c7 100644
--- a/parsec/arena.c
+++ b/parsec/arena.c
@@ -238,6 +238,10 @@ int parsec_arena_allocate_device_private(parsec_data_copy_t *copy,
 #include "parsec/utils/zone_malloc.h"
 #include "mca/device/device_gpu.h"
 
+#if defined(PARSEC_DEBUG)
+static int64_t parsec_countable_incoming_message = 0xF000000000000000;
+#endif /* defined(PARSEC_DEBUG) */
+
 static inline parsec_data_copy_t *
 parsec_arena_internal_copy_new(parsec_arena_t *arena,
                                parsec_data_t *data,
@@ -251,6 +255,11 @@ parsec_arena_internal_copy_new(parsec_arena_t *arena,
         if( NULL == ldata ) {
             return NULL;
         }
+#if defined(PARSEC_DEBUG)
+        /* Name the data with a default key to facilitate debugging */
+        ldata->key = (uint64_t)parsec_atomic_fetch_inc_int64(&parsec_countable_incoming_message);
+        ldata->key |= ((uint64_t)device) << 56;
+#endif /* defined(PARSEC_DEBUG) */
     }
     if( 0 == device ) {
         copy = parsec_data_copy_new(ldata, device, dtt,
diff --git a/parsec/class/info.c b/parsec/class/info.c
index 640c9d591..aa5b2b901 100644
--- a/parsec/class/info.c
+++ b/parsec/class/info.c
@@ -312,6 +312,8 @@ void *parsec_info_get(parsec_info_object_array_t *oa, parsec_info_id_t iid)
     if(NULL == ie->constructor)
         return ret;
     nio = ie->constructor(oa->cons_obj, ie->cons_data);
+    if( NULL == nio )
+        return ret;
     ret = parsec_info_test_and_set(oa, iid, nio, NULL);
     if(ret != nio && NULL != ie->destructor) {
         ie->destructor(nio, ie->des_data);
diff --git a/parsec/data.c b/parsec/data.c
index 0512784ed..6f72c0211 100644
--- a/parsec/data.c
+++ b/parsec/data.c
@@ -466,10 +466,11 @@ void parsec_data_copy_dump(parsec_data_copy_t* copy)
     if( PARSEC_DATA_COHERENCY_SHARED == copy->coherency_state ) coherency = "shared";
     parsec_debug_verbose(0, 0, "%s [%d]: copy %p [ref %d] coherency %s readers %d version %u transit %s flags %s\n"
-                         " older %p orig %p arena %p dev_priv %p\n",
+                         " older %p orig %p [%llx] arena %p dev_priv %p\n",
                          ((NULL != copy->original) && (copy->original->owner_device == copy->device_index)) ? "*" : " ",
                          (int)copy->device_index, copy, copy->super.super.obj_reference_count, coherency, copy->readers, copy->version, tranfer, flags,
-                         (void *)copy->older, (void *)copy->original, (void *)copy->arena_chunk, copy->device_private);
+                         (void *)copy->older, (void *)copy->original,
+                         (NULL != copy->original) ?
(unsigned long)copy->original->key : (unsigned long)-1, (void *)copy->arena_chunk, copy->device_private); } void parsec_data_dump(parsec_data_t* data) diff --git a/parsec/mca/device/device_gpu.c b/parsec/mca/device/device_gpu.c index c25a242e8..8914363bb 100644 --- a/parsec/mca/device/device_gpu.c +++ b/parsec/mca/device/device_gpu.c @@ -300,7 +300,7 @@ void parsec_device_dump_exec_stream(parsec_gpu_exec_stream_t* exec_stream) int i; parsec_debug_verbose(0, parsec_gpu_output_stream, - "Dev: GPU stream %d{%p} [events = %d, start = %d, end = %d, executed = %d]", + "Dev: GPU stream %s{%p} [events = %d, start = %d, end = %d, executed = %d]", exec_stream->name, exec_stream, exec_stream->max_events, exec_stream->start, exec_stream->end, exec_stream->executed); for( i = 0; i < exec_stream->max_events; i++ ) { @@ -321,12 +321,12 @@ void parsec_device_dump_gpu_state(parsec_device_gpu_module_t* gpu_device) data_in_dev += gpu_device->super.data_in_from_device[i]; } - parsec_output(parsec_gpu_output_stream, "\n\n"); - parsec_output(parsec_gpu_output_stream, "Device %d:%d (%p) epoch\n", gpu_device->super.device_index, - gpu_device->super.device_index, gpu_device, gpu_device->data_avail_epoch); - parsec_output(parsec_gpu_output_stream, "\tpeer mask %x executed tasks with %llu streams %d\n", - gpu_device->peer_access_mask, (unsigned long long)gpu_device->super.executed_tasks, gpu_device->num_exec_streams); - parsec_output(parsec_gpu_output_stream, "\tstats transferred [in: %llu from host %llu from other device out: %llu] required [in: %llu out: %llu]\n", + parsec_output(parsec_gpu_output_stream, + "\n\nDevice %s:%d (%p) epoch %zu\n" + "\tpeer mask %x executed tasks %llu streams %d\n" + "\tstats transferred [in: %llu from host %llu from other device out: %llu] required [in: %llu out: %llu]\n", + gpu_device->super.name, gpu_device->super.device_index, gpu_device, gpu_device->data_avail_epoch, + gpu_device->peer_access_mask, (unsigned long long)gpu_device->super.executed_tasks, gpu_device->num_exec_streams, (unsigned long long)data_in_host, (unsigned long long)data_in_dev, (unsigned long long)gpu_device->super.data_out_to_host, (unsigned long long)gpu_device->super.required_data_in, (unsigned long long)gpu_device->super.required_data_out); @@ -1030,7 +1030,7 @@ parsec_device_data_reserve_space( parsec_device_gpu_module_t* gpu_device, for( j = 0; j <= i; j++ ) { /* This flow could be a control flow */ if( NULL == temp_loc[j] ) continue; - this_task->data[j].data_out = gpu_elem; /* reset the data out */ + this_task->data[j].data_out = NULL; /* reset the data out */ /* This flow could be non-parsec-owned, in which case we can't reclaim it */ if( 0 == (temp_loc[j]->flags & PARSEC_DATA_FLAG_PARSEC_OWNED) ) continue; PARSEC_DEBUG_VERBOSE(20, parsec_gpu_output_stream, @@ -1044,6 +1044,9 @@ parsec_device_data_reserve_space( parsec_device_gpu_module_t* gpu_device, PARSEC_DATA_COPY_RELEASE(gpu_elem); #endif parsec_atomic_unlock(&master->lock); + if( data_avail_epoch ) { /* update the memory epoch */ + gpu_device->data_avail_epoch++; + } return PARSEC_HOOK_RETURN_AGAIN; } @@ -1381,7 +1384,7 @@ parsec_device_data_stage_in( parsec_device_gpu_module_t* gpu_device, "GPU[%d:%s]: Prefetch task %p is staging in", gpu_device->super.device_index, gpu_device->super.name, gpu_task); } - if( NULL == gpu_elem ) { + if( gpu_elem == candidate ) { /* data already located in the right place */ if( candidate->device_index == gpu_device->super.device_index ) { /* the candidate is already located on the GPU, no transfer should be 
necessary but let's do the bookkeeping */ if( (PARSEC_FLOW_ACCESS_WRITE & type) && (gpu_task->task_type != PARSEC_GPU_TASK_TYPE_PREFETCH) ) { @@ -2116,7 +2119,7 @@ parsec_device_kernel_push( parsec_device_gpu_module_t *gpu_device, gpu_task->last_data_check_epoch = gpu_device->data_avail_epoch; return ret; } - + gpu_task->last_status = 0; /* mark the task as clean */ for( i = 0; i < this_task->task_class->nb_flows; i++ ) { flow = gpu_task->flow[i]; @@ -2144,11 +2147,10 @@ parsec_device_kernel_push( parsec_device_gpu_module_t *gpu_device, return ret; } } - PARSEC_DEBUG_VERBOSE(10, parsec_gpu_output_stream, "GPU[%d:%s]: Push task %s DONE", gpu_device->super.device_index, gpu_device->super.name, - parsec_task_snprintf(tmp, MAX_TASK_STRLEN, this_task) ); + parsec_task_snprintf(tmp, MAX_TASK_STRLEN, this_task)); gpu_task->complete_stage = parsec_device_callback_complete_push; #if defined(PARSEC_PROF_TRACE) gpu_task->prof_key_end = -1; /* We do not log that event as the completion of this task */ diff --git a/parsec/parsec.c b/parsec/parsec.c index abf36f809..c8a32093f 100644 --- a/parsec/parsec.c +++ b/parsec/parsec.c @@ -1942,10 +1942,20 @@ parsec_task_snprintf( char* str, size_t size, task->locals[i].value ); if( index >= size ) return str; } - index += snprintf(str + index, size - index, "]<%d>", task->priority ); + index += snprintf(str + index, size - index, "]<%d> keys = {", task->priority ); + if( index >= size ) return str; + for( i = 0; i < tc->nb_flows; i++ ) { + char *prefix = (i == 0) ? "" : ", "; + if ((NULL == task->data[i].data_in) || (NULL == task->data[i].data_in->original)) + index += snprintf(str + index, size - index, "%s*", prefix); + else + index += snprintf(str + index, size - index, "%s%lx", prefix, task->data[i].data_in->original->key); + if( index >= size ) return str; + } + index += snprintf(str + index, size - index, "}" ); if( index >= size ) return str; if( NULL != task->taskpool ) { - index += snprintf(str + index, size - index, "{%u}", task->taskpool->taskpool_id ); + index += snprintf(str + index, size - index, " {tp: %u}", task->taskpool->taskpool_id ); if( index >= size ) return str; } return str; diff --git a/parsec/remote_dep_mpi.c b/parsec/remote_dep_mpi.c index 9a34d56a9..53423f293 100644 --- a/parsec/remote_dep_mpi.c +++ b/parsec/remote_dep_mpi.c @@ -80,6 +80,7 @@ remote_dep_cmd_to_string(remote_dep_wire_activate_t* origin, if( NULL == task.task_class ) return snprintf(str, len, "UNKNOWN_of_TASKCLASS_%d", origin->task_class_id), str; memcpy(&task.locals, origin->locals, sizeof(parsec_assignment_t) * task.task_class->nb_locals); task.priority = 0xFFFFFFFF; + for(int i = 0; i < task.task_class->nb_flows; task.data[i++].data_in = NULL); return parsec_task_snprintf(str, len, &task); } diff --git a/parsec/utils/debug.h b/parsec/utils/debug.h index ddda2352b..13038c79e 100644 --- a/parsec/utils/debug.h +++ b/parsec/utils/debug.h @@ -160,7 +160,7 @@ extern void (*parsec_weaksym_exit)(int status); #else /* defined(PARSEC_DEBUG_NOISIER) */ #define PARSEC_DEBUG_VERBOSE(...) do{} while(0) -#endif /* defined(PARSEC_DEBUG_VERBOSE) */ +#endif /* defined(PARSEC_DEBUG_NOISIER) */ /** $brief To check if any parsec function returned error. 
*/ diff --git a/parsec/utils/zone_malloc.c b/parsec/utils/zone_malloc.c index 54e01b55c..45099deac 100644 --- a/parsec/utils/zone_malloc.c +++ b/parsec/utils/zone_malloc.c @@ -83,14 +83,10 @@ void *zone_malloc(zone_malloc_t *gdata, size_t size) current_segment = SEGMENT_AT_TID(gdata, current_tid); if( NULL == current_segment ) { /* Maybe there is a free slot in the beginning. Let's cycle at least once before we bail out */ - if( cycled_through == 0 ) { - current_tid = 0; - cycled_through = 1; - current_segment = SEGMENT_AT_TID(gdata, current_tid); - } else { - parsec_atomic_unlock(&gdata->lock); - return NULL; - } + if( 0 != cycled_through ) break; + current_tid = 0; + cycled_through = 1; + current_segment = SEGMENT_AT_TID(gdata, current_tid); } if( current_segment->status == SEGMENT_EMPTY && current_segment->nb_units >= nb_units ) { From b99876463b9b43d6360b1a54d11ab76ab847e94f Mon Sep 17 00:00:00 2001 From: George Bosilca Date: Wed, 7 Aug 2024 21:02:29 -0700 Subject: [PATCH 14/14] Add a configure option to enable GPU-aware communications. Signed-off-by: George Bosilca --- CMakeLists.txt | 3 +++ parsec/include/parsec/parsec_options.h.in | 1 + parsec/mca/device/device_gpu.c | 11 +++++++---- parsec/parsec_internal.h | 9 +++++++++ parsec/parsec_mpi_funnelled.c | 10 ++++++++++ parsec/remote_dep_mpi.c | 4 ++-- 6 files changed, 32 insertions(+), 6 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 35cbb152b..aaa1e44a3 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -165,6 +165,9 @@ option(PARSEC_DIST_WITH_MPI if(PARSEC_DIST_WITH_MPI AND 0) message(FATAL_ERROR "PARSEC_DIST_WITH_MPI and PARSEC_DIST_WITH_OTHER are mutually exclusive, please select only one") endif() +option(PARSEC_MPI_IS_GPU_AWARE + "Build PaRSEC assuming the MPI library is GPU-aware, aka. can move data directly to and from GPU memory.\ + As of today (mid 2024) while most MPI support such an option, they require a single process per GPU" ON) option(PARSEC_DIST_THREAD "Use an extra thread to progress the data movements" ON) option(PARSEC_DIST_PRIORITIES diff --git a/parsec/include/parsec/parsec_options.h.in b/parsec/include/parsec/parsec_options.h.in index a5a143e12..30521103e 100644 --- a/parsec/include/parsec/parsec_options.h.in +++ b/parsec/include/parsec/parsec_options.h.in @@ -71,6 +71,7 @@ /* Communication engine */ #cmakedefine PARSEC_DIST_WITH_MPI +#cmakedefine PARSEC_MPI_IS_GPU_AWARE #cmakedefine PARSEC_DIST_THREAD #cmakedefine PARSEC_DIST_PRIORITIES #cmakedefine PARSEC_DIST_COLLECTIVES diff --git a/parsec/mca/device/device_gpu.c b/parsec/mca/device/device_gpu.c index 8914363bb..27c043542 100644 --- a/parsec/mca/device/device_gpu.c +++ b/parsec/mca/device/device_gpu.c @@ -954,9 +954,9 @@ parsec_device_data_reserve_space( parsec_device_gpu_module_t* gpu_device, /* Skip CTL flows only */ if(PARSEC_FLOW_ACCESS_NONE == (PARSEC_FLOW_ACCESS_MASK & flow->flow_flags)) { - gpu_task->flow_nb_elts[i] = 0; /* assume there is nothing to transfer to the GPU */ + gpu_task->flow_nb_elts[i] = 0; /* assume there is nothing to transfer to the GPU */ continue; - } + } PARSEC_DEBUG_VERBOSE(20, parsec_gpu_output_stream, "GPU[%d:%s]:%s: Investigating flow %s:%d", @@ -971,7 +971,7 @@ parsec_device_data_reserve_space( parsec_device_gpu_module_t* gpu_device, gpu_device->super.device_index, gpu_device->super.name, task_name, flow->name, i, gpu_elem, this_task->data[i].data_in->data_transfer_status == PARSEC_DATA_STATUS_UNDER_TRANSFER ? 
" [in transfer]" : ""); - this_task->data[i].data_out = this_task->data[i].data_in; + this_task->data[i].data_out = this_task->data[i].data_in; continue; } master = this_task->data[i].data_in->original; @@ -2477,7 +2477,10 @@ parsec_device_kernel_epilog( parsec_device_gpu_module_t *gpu_device, gpu_copy->coherency_state = PARSEC_DATA_COHERENCY_SHARED; assert(PARSEC_DATA_STATUS_UNDER_TRANSFER == cpu_copy->data_transfer_status); cpu_copy->data_transfer_status = PARSEC_DATA_STATUS_COMPLETE_TRANSFER; - + if( 0 == (parsec_mpi_allow_gpu_memory_communications & PARSEC_RUNTIME_SEND_FROM_GPU_MEMORY) ) { + /* Report the CPU copy as the output of the task. */ + this_task->data[i].data_out = cpu_copy; + } PARSEC_DEBUG_VERBOSE(20, parsec_gpu_output_stream, "GPU copy %p [ref_count %d] moved to the read LRU in %s", gpu_copy, gpu_copy->super.super.obj_reference_count, __func__); diff --git a/parsec/parsec_internal.h b/parsec/parsec_internal.h index 90bcafbca..2ae80a1d9 100644 --- a/parsec/parsec_internal.h +++ b/parsec/parsec_internal.h @@ -200,6 +200,9 @@ PARSEC_DECLSPEC PARSEC_OBJ_CLASS_DECLARATION(parsec_taskpool_t); #define PARSEC_DEPENDENCIES_STARTUP_TASK ((parsec_dependency_t)(1<<29)) #define PARSEC_DEPENDENCIES_BITMASK (~(PARSEC_DEPENDENCIES_TASK_DONE|PARSEC_DEPENDENCIES_IN_DONE|PARSEC_DEPENDENCIES_STARTUP_TASK)) +#define PARSEC_RUNTIME_SEND_FROM_GPU_MEMORY 0x00000002 +#define PARSEC_RUNTIME_RECV_FROM_GPU_MEMORY 0x00000001 + /** * This structure is used internally by the parsec_dependencies_t structures */ @@ -491,6 +494,12 @@ PARSEC_DECLSPEC extern int parsec_slow_bind_warning; * the scheduler, but can provide a better cache reuse. */ PARSEC_DECLSPEC extern int parsec_runtime_keep_highest_priority_task; +/** + * Global configuration mask allowing or not for the data to be sent or received, + * from or to, GPU memory. It can be an OR between PARSEC_RUNTIME_SEND_FROM_GPU_MEMORY + * and PARSEC_RUNTIME_RECV_FROM_GPU_MEMORY. + */ +PARSEC_DECLSPEC extern int parsec_mpi_allow_gpu_memory_communications; /** * Description of the state of the task. It indicates what will be the next diff --git a/parsec/parsec_mpi_funnelled.c b/parsec/parsec_mpi_funnelled.c index c0a95dc17..338416713 100644 --- a/parsec/parsec_mpi_funnelled.c +++ b/parsec/parsec_mpi_funnelled.c @@ -201,6 +201,8 @@ parsec_list_t mpi_funnelled_dynamic_sendreq_fifo; /* ordered non threaded fifo * parsec_list_t mpi_funnelled_dynamic_recvreq_fifo; /* ordered non threaded fifo */ parsec_mempool_t *mpi_funnelled_dynamic_req_mempool = NULL; +int parsec_mpi_allow_gpu_memory_communications = 3; + /* This structure is used to save all the information necessary to * invoke a callback after a MPI_Request is satisfied */ @@ -506,6 +508,14 @@ static int mpi_funneled_init_once(parsec_context_t* context) MAX_MPI_TAG, (unsigned int)MAX_MPI_TAG, MAX_MPI_TAG / MAX_DEP_OUT_COUNT); } +#if !defined(PARSEC_MPI_IS_GPU_AWARE) + parsec_mpi_allow_gpu_memory_communications = 0; +#endif + parsec_mca_param_reg_int_name("mpi", "gpu_aware", + "Enabled if PaRSEC should allow MPI to move data directly from or to GPU memory. Otherwise, all data" + " movements will transit through CPU memory, and will always have a backup copy there. 
Accepted values " + "are ORed between 1 for receiving into GPU memory and 2 for sending from GPU memory", + false, false, parsec_mpi_allow_gpu_memory_communications, &parsec_mpi_allow_gpu_memory_communications); (void)context; return 0; } diff --git a/parsec/remote_dep_mpi.c b/parsec/remote_dep_mpi.c index 53423f293..9d5a7c641 100644 --- a/parsec/remote_dep_mpi.c +++ b/parsec/remote_dep_mpi.c @@ -2110,8 +2110,8 @@ static void remote_dep_mpi_get_start(parsec_execution_stream_t* es, /* prepare the local receiving data */ assert(NULL == deps->output[k].data.data); /* we do not support in-place tiles now, make sure it doesn't happen yet */ if(NULL == deps->output[k].data.data) { - deps->output[k].data.data = remote_dep_copy_allocate(&deps->output[k].data.remote, - deps->output[k].data.preferred_device); + int best_device = (parsec_mpi_allow_gpu_memory_communications & PARSEC_RUNTIME_RECV_FROM_GPU_MEMORY) ? deps->output[k].data.preferred_device : 0; + deps->output[k].data.data = remote_dep_copy_allocate(&deps->output[k].data.remote, best_device); } /* Mark the data under tranfer */ deps->output[k].data.data->data_transfer_status = PARSEC_DATA_STATUS_UNDER_TRANSFER;
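Note on the default keys introduced in parsec_arena_internal_copy_new() above: the PARSEC_DEBUG-only naming ORs the index of the allocating device into the top byte of the key, with a per-allocation sequence number below it. The following standalone sketch (illustration only, not PaRSEC code; the device index 2 is hypothetical) shows how a key printed in the debug output can be decoded:

#include <stdint.h>
#include <stdio.h>

int main(void)
{
    /* Same starting value as parsec_countable_incoming_message in the patch. */
    uint64_t counter = 0xF000000000000000ULL;
    int device = 2;  /* hypothetical device index */

    uint64_t key = counter;            /* per-allocation sequence number */
    key |= ((uint64_t)device) << 56;   /* tag the allocating device in the top byte */

    printf("key %#llx -> device bits %#llx, sequence %#llx\n",
           (unsigned long long)key,
           (unsigned long long)(key >> 56),
           (unsigned long long)(key & 0x00FFFFFFFFFFFFFFULL));
    return 0;
}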
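For the GPU-aware communication control added above, behavior is driven by the parsec_mpi_allow_gpu_memory_communications mask: bit 0x1 (PARSEC_RUNTIME_RECV_FROM_GPU_MEMORY) lets receives land directly in GPU memory, bit 0x2 (PARSEC_RUNTIME_SEND_FROM_GPU_MEMORY) lets sends originate from GPU memory, and the default of 3 enables both when PARSEC_MPI_IS_GPU_AWARE is defined. A minimal standalone sketch of the mask semantics (illustration only, not PaRSEC code):

#include <stdio.h>

/* Same bit values as the defines added to parsec_internal.h. */
#define PARSEC_RUNTIME_RECV_FROM_GPU_MEMORY 0x00000001
#define PARSEC_RUNTIME_SEND_FROM_GPU_MEMORY 0x00000002

int main(void)
{
    int mask = 3;  /* built-in default; the "mpi_gpu_aware" MCA parameter can override it */

    printf("receive directly into GPU memory: %s\n",
           (mask & PARSEC_RUNTIME_RECV_FROM_GPU_MEMORY) ? "yes" : "no (staged through a CPU copy)");
    printf("send directly from GPU memory:    %s\n",
           (mask & PARSEC_RUNTIME_SEND_FROM_GPU_MEMORY) ? "yes" : "no (the CPU copy is reported as data_out)");
    return 0;
}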