Transfer data to and from GPU.

This is a multi-part patch that allows the CPU to prepare a data copy mapped onto a device. 1. The first question is how such a device is selected. The allocation of such a copy happens well before the scheduler is invoked for a task, in fact before the task is even ready. Thus, we need to decide on the location of this copy based only on static information, such as the task affinity. Therefore, this approach only works for owner-compute types of tasks, where the task will be executed on the device that owns the data used for the task affinity. 2. Pass the correct data copy across the entire system, instead of falling back to the data copy of device 0 (CPU memory). Signed-off-by: George Bosilca <gbosilca@nvidia.com>
bosilca · Aug 1, 2024 · aedff44 · aedff44
1 parent 7b137ba
commit aedff44
Show file tree

Hide file tree

Showing 14 changed files with 345 additions and 95 deletions.
diff --git a/parsec/CMakeLists.txt b/parsec/CMakeLists.txt
@@ -238,6 +238,7 @@ if( BUILD_PARSEC )
     $<$<BOOL:${PARSEC_HAVE_OTF2}>:OTF2::OTF2>
     $<$<BOOL:${MPI_C_FOUND}>:MPI::MPI_C>
     $<$<BOOL:${PARSEC_HAVE_CUDA}>:CUDA::cudart>
+    $<$<BOOL:${PARSEC_HAVE_CUDA}>:cuda>
     $<$<BOOL:${PARSEC_HAVE_HIP}>:hip::host>
     ${EXTRA_LIBS}
     INTERFACE

diff --git a/parsec/arena.c b/parsec/arena.c
@@ -235,43 +235,99 @@ int  parsec_arena_allocate_device_private(parsec_data_copy_t *copy,
     return PARSEC_SUCCESS;
 }
 
-parsec_data_copy_t *parsec_arena_get_copy(parsec_arena_t *arena,
-                                          size_t count, int device,
-                                          parsec_datatype_t dtt)
+#include "parsec/utils/zone_malloc.h"
+#include "mca/device/device_gpu.h"
+
+static inline parsec_data_copy_t *
+parsec_arena_internal_copy_new(parsec_arena_t *arena,
+                               parsec_data_t *data,
+                               size_t count, int device,
+                               parsec_datatype_t dtt)
 {
-    parsec_data_t *data;
     parsec_data_copy_t *copy;
-    int rc;
-
-
-    data = parsec_data_new();
     if( NULL == data ) {
+        data = parsec_data_new();
+        if( NULL == data ) {
+            return NULL;
+        }
+    }
+    copy = parsec_data_copy_new(data, device, dtt,
+                                PARSEC_DATA_FLAG_PARSEC_OWNED | PARSEC_DATA_FLAG_PARSEC_MANAGED);
+
+    if (NULL == copy) {
+        PARSEC_OBJ_RELEASE(data);
         return NULL;
     }
+    if( 0 == device ) {
+        copy->flags |= PARSEC_DATA_FLAG_ARENA; /* allocated from an arena */
+        int rc = parsec_arena_allocate_device_private(copy, arena, count, device, dtt);
+        if (PARSEC_SUCCESS != rc) {
+            PARSEC_OBJ_RELEASE(copy);
+            return NULL;
+        }
+        return copy;
+    }
+    /**
+     * This part is not really nice: it breaks the separation between devices and how their memory is
+     * managed. However, it should give nice performance improvements if the communication layer is
+     * capable of sending or receiving data directly to and from the accelerator memory. The only drawback
+     * is that once the GPU memory is full, this will fail, so the software will fall back to the
+     * prior behavior, going through the CPU memory.
+     *
+     * The zone deallocation is not symmetric, it will happen in the GPU management, when the data copies
+     * are released from the different LRU lists.
+     */
+    parsec_device_gpu_module_t *gpu_device = (parsec_device_gpu_module_t*)parsec_mca_device_get(device);
+    if( NULL == gpu_device ) {
+        PARSEC_OBJ_RELEASE(copy);
+        return NULL;
+    }
+    size_t size = count * arena->elem_size;
 
-    copy = parsec_data_copy_new( data, device, dtt,
-                                 PARSEC_DATA_FLAG_ARENA |
-                                 PARSEC_DATA_FLAG_PARSEC_OWNED |
-                                 PARSEC_DATA_FLAG_PARSEC_MANAGED);
+    copy->dtt = dtt;
+    copy->device_private = zone_malloc(gpu_device->memory, size);
+    copy->arena_chunk = (parsec_arena_chunk_t*)gpu_device->memory;
+    PARSEC_DEBUG_VERBOSE(10, parsec_debug_output, "Arena:\tallocate data copy on device %d of size %zu from zone %p, "
+                                                  "data ptr %p",
+                         device, size, (void*)copy->arena_chunk, (void*)copy->device_private);
 
-    if(NULL == copy) {
-        PARSEC_OBJ_RELEASE(data);
+    if( NULL == copy->device_private ) {
+        PARSEC_OBJ_RELEASE(copy);
         return NULL;
     }
+    copy->version = 0;
+    copy->coherency_state = PARSEC_DATA_COHERENCY_INVALID;
+    copy->original->owner_device = device;
+    copy->original->preferred_device = device;
+    return copy;
+}
 
-    rc = parsec_arena_allocate_device_private(copy, arena, count, device, dtt);
+parsec_data_copy_t *parsec_arena_get_copy(parsec_arena_t *arena,
+                                          size_t count, int device,
+                                          parsec_datatype_t dtt)
+{
+    parsec_data_copy_t *dev0_copy, *copy;
 
+    dev0_copy = parsec_arena_internal_copy_new(arena, NULL, count, 0 /* first allocate the copy on the device 0 */, dtt);
+    if( NULL == dev0_copy ) {
+        return NULL;
+    }
+    dev0_copy->coherency_state = PARSEC_DATA_COHERENCY_INVALID;
+    dev0_copy->version = 0;  /* start from somewhere */
+    if( 0 == device ) {
+        return dev0_copy;
+    }
+
+    copy = parsec_arena_internal_copy_new(arena, dev0_copy->original, count, device, dtt);
+    if( NULL == copy ) {
+        PARSEC_OBJ_RELEASE(dev0_copy);
+        return NULL;
+    }
     /* This data is going to be released once all copies are released
      * It does not exist without at least a copy, and we don't give the
      * pointer to the user, so we must remove our retain from it
      */
-    PARSEC_OBJ_RELEASE(data);
-
-    if( PARSEC_SUCCESS != rc ) {
-        PARSEC_OBJ_RELEASE(copy);
-        return NULL;
-    }
-
+    PARSEC_OBJ_RELEASE(dev0_copy->original);
     return copy;
 }
 

diff --git a/parsec/data.c b/parsec/data.c
@@ -330,6 +330,12 @@ int parsec_data_start_transfer_ownership_to_copy(parsec_data_t* data,
     copy = data->device_copies[device];
     assert( NULL != copy );
 
+    if( valid_copy == device ) {
+        PARSEC_DEBUG_VERBOSE(10, parsec_debug_output,
+                             "DEV[%d]: already has ownership of data %p to copy %p in mode %d",
+                             device, data, copy, access_mode);
+        goto bookkeeping;
+    }
     PARSEC_DEBUG_VERBOSE(10, parsec_debug_output,
                          "DEV[%d]: start transfer ownership of data %p to copy %p in mode %d",
                          device, data, copy, access_mode);
@@ -417,6 +423,7 @@ int parsec_data_start_transfer_ownership_to_copy(parsec_data_t* data,
         }
     }
 
+  bookkeeping:
     if( PARSEC_FLOW_ACCESS_READ & access_mode ) {
         copy->readers++;
     }

diff --git a/parsec/data.h b/parsec/data.h
@@ -31,9 +31,9 @@ typedef uint8_t parsec_data_coherency_t;
 #define    PARSEC_DATA_COHERENCY_SHARED    ((parsec_data_coherency_t)0x4)
 
 typedef uint8_t parsec_data_status_t;
-#define    PARSEC_DATA_STATUS_NOT_TRANSFER          ((parsec_data_coherency_t)0x0)
-#define    PARSEC_DATA_STATUS_UNDER_TRANSFER        ((parsec_data_coherency_t)0x1)
-#define    PARSEC_DATA_STATUS_COMPLETE_TRANSFER     ((parsec_data_coherency_t)0x2)
+#define    PARSEC_DATA_STATUS_NOT_TRANSFER          ((parsec_data_status_t)0x0)
+#define    PARSEC_DATA_STATUS_UNDER_TRANSFER        ((parsec_data_status_t)0x1)
+#define    PARSEC_DATA_STATUS_COMPLETE_TRANSFER     ((parsec_data_status_t)0x2)
 /**
  * Data copies have three levels of 'ownership':
  * - a data copy can be owned and managed by PaRSEC.

diff --git a/parsec/data_dist/matrix/two_dim_rectangle_cyclic.h b/parsec/data_dist/matrix/two_dim_rectangle_cyclic.h
@@ -45,7 +45,6 @@ typedef struct parsec_matrix_block_cyclic {
  * @param dc matrix description structure, already allocated, that will be initialize
  * @param mtype type of data used for this matrix
  * @param storage type of storage of data
- * @param nodes number of nodes
  * @param myrank rank of the local node (as of mpi rank)
  * @param mb number of row in a tile
  * @param nb number of column in a tile

diff --git a/parsec/interfaces/ptg/ptg-compiler/jdf2c.c b/parsec/interfaces/ptg/ptg-compiler/jdf2c.c
@@ -672,6 +672,17 @@ static char* dump_local_assignments( void** elem, void* arg )
     if( dos > 0 ) {
         string_arena_init(info->sa);
         string_arena_add_string(info->sa, "const int %s = %s%s.value;", def->name, info->holder, def->name);
+#if 0
+        jdf_expr_t* type_str = jdf_find_property( def->properties, "type", NULL );
+        if( NULL == type_str ) {
+            string_arena_add_string(info->sa, "const int %s = %s%s.value;", def->name, info->holder, def->name);
+        } else {
+            expr_info_t expr_info = {.sa = info->sa, .prefix = "", .suffix = "", .assignments = "locals"};
+            string_arena_add_string(info->sa, "const %s %s = %s%s.value;",
+                                    dump_expr((void**)type_str, &expr_info),
+                                    def->name, info->holder, def->name);
+        }
+#endif
         if( dos > 1 )
             string_arena_add_string(info->sa, " (void)%s;", def->name);
         return string_arena_get_string(info->sa);
@@ -5649,12 +5660,19 @@ jdf_generate_code_call_initialization(const jdf_t *jdf, const jdf_call_t *call,
 
         /* Code to create & fulfill a reshape promise locally in case this input dependency is typed */
         jdf_generate_code_reshape_input_from_dep(jdf, f, flow, dl, spaces);
-        coutput("%s    this_task->data._f_%s.data_out = parsec_data_get_copy(chunk->original, target_device);\n"
-                "#if defined(PARSEC_PROF_GRAPHER) && defined(PARSEC_PROF_TRACE)\n"
+        /* TODO: Setting the data_out here is kind of random, especially as some copy of the input flow. The only thing
+         *       that would make sense here is to set the data_out to the dep outputs back into the user memory (output
+         *       dep with a target into a data collection), to give the opportunity to the accelerator components to
+         *       do a pushout to the desired location (instead of the current approach that will do a pushout to the
+         *       data_copy on device 0 followed by a memcpy into the desired location).
+         */
+        //coutput("%s    this_task->data._f_%s.data_out = parsec_data_get_copy(chunk->original, target_device);\n",
+        //        spaces, flow->varname);
+
+        coutput("#if defined(PARSEC_PROF_GRAPHER) && defined(PARSEC_PROF_TRACE)\n"
                 "%s    parsec_prof_grapher_data_input(chunk->original, (parsec_task_t*)this_task, &%s, 0);\n"
                 "#endif\n"
                 "%s  }\n",
-                spaces, flow->varname,
                 spaces, JDF_OBJECT_ONAME( flow ),
                 spaces);
     }
@@ -6512,10 +6530,10 @@ jdf_generate_code_data_lookup(const jdf_t *jdf,
      * This way, it's only retained once during release_deps.
      */
     coutput("  if( NULL == this_task->repo_entry ){\n"
-            "    this_task->repo_entry = data_repo_lookup_entry_and_create(es, %s_repo, "
+            "    this_task->repo_entry = data_repo_lookup_entry_and_create(es, %s_repo, \n"
             "                                      %s((const parsec_taskpool_t*)__parsec_tp, (const parsec_assignment_t*)&this_task->locals));\n"
-            "    data_repo_entry_addto_usage_limit(%s_repo, this_task->repo_entry->ht_item.key, 1);"
-            "    this_task->repo_entry ->generator = (void*)this_task;  /* for AYU */\n"
+            "    data_repo_entry_addto_usage_limit(%s_repo, this_task->repo_entry->ht_item.key, 1);\n"
+            "    this_task->repo_entry->generator = (void*)this_task;  /* for AYU */\n"
             "#if defined(PARSEC_SIM)\n"
             "    assert(this_task->repo_entry ->sim_exec_date == 0);\n"
             "    this_task->repo_entry ->sim_exec_date = this_task->sim_exec_date;\n"
@@ -6525,7 +6543,7 @@ jdf_generate_code_data_lookup(const jdf_t *jdf,
             jdf_property_get_string(f->properties, JDF_PROP_UD_MAKE_KEY_FN_NAME, NULL),
             f->fname);
 
-    coutput("  /* The reshape repo is the current task repo. */"
+    coutput("  /* The reshape repo is the current task repo. */\n"
             "  reshape_repo = %s_repo;\n"
             "  reshape_entry_key = %s((const parsec_taskpool_t*)__parsec_tp, (const parsec_assignment_t*)&this_task->locals) ;\n"
             "  reshape_entry = this_task->repo_entry;\n",
@@ -7033,6 +7051,12 @@ static void jdf_generate_code_hook(const jdf_t *jdf,
     output = UTIL_DUMP_LIST(sa, f->dataflow, next,
                             dump_data_initialization_from_data_array, &ai2, "", "", "", "");
     if( 0 != strlen(output) ) {
+        coutput("/* Make sure we have the data_out set to the data_in */\n");
+        for( fl = f->dataflow; fl != NULL; fl = fl->next) {
+            if( fl->flow_flags & JDF_FLOW_TYPE_CTL ) continue;
+            coutput("  this_task->data._f_%s.data_out = this_task->data._f_%s.data_in;\n",
+                    fl->varname, fl->varname);
+        }
         coutput("  /** Declare the variables that will hold the data, and all the accounting for each */\n"
                 "%s\n",
                 output);