diff --git a/CMakeLists.txt b/CMakeLists.txt
index 339913987..aaa1e44a3 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -165,6 +165,9 @@ option(PARSEC_DIST_WITH_MPI
 if(PARSEC_DIST_WITH_MPI AND 0)
   message(FATAL_ERROR "PARSEC_DIST_WITH_MPI and PARSEC_DIST_WITH_OTHER are mutually exclusive, please select only one")
 endif()
+option(PARSEC_MPI_IS_GPU_AWARE
+       "Build PaRSEC assuming the MPI library is GPU-aware, i.e. it can move data directly to and from GPU memory.\
+       As of today (mid 2024) most MPI implementations support this, but they require a single process per GPU" ON)
 option(PARSEC_DIST_THREAD
   "Use an extra thread to progress the data movements" ON)
 option(PARSEC_DIST_PRIORITIES
@@ -181,6 +184,8 @@ option(PARSEC_GPU_ALLOC_PER_TILE
 mark_as_advanced(PARSEC_GPU_ALLOC_PER_TILE)
 option(PARSEC_GPU_WITH_CUDA
   "Enable GPU support using CUDA kernels" ON)
+option(PARSEC_GPU_WITH_CUDA_BATCH
+  "Enable the runtime support for batched kernels" ON)
 option(PARSEC_GPU_WITH_HIP
   "Enable GPU support using HIP kernels" ON)
 option(PARSEC_GPU_WITH_LEVEL_ZERO
@@ -729,6 +734,12 @@ int main(int argc, char *argv[]) {
     endif (CUDAToolkit_FOUND)
     set(PARSEC_HAVE_CU_COMPILER ${CMAKE_CUDA_COMPILER} CACHE BOOL "True if PaRSEC provide support for compiling .cu files")
   endif( PARSEC_GPU_WITH_CUDA )
+  if( PARSEC_GPU_WITH_CUDA_BATCH )
+    if( NOT PARSEC_HAVE_CUDA)
+      message(FATAL_ERROR "PARSEC_GPU_WITH_CUDA_BATCH requires PARSEC_GPU_WITH_CUDA. Enable both or none")
+    endif( NOT PARSEC_HAVE_CUDA)
+    set(PARSEC_HAVE_CUDA_BATCH True CACHE BOOL "True if support for batched CUDA has been enabled")
+  endif( PARSEC_GPU_WITH_CUDA_BATCH )
   if( PARSEC_GPU_WITH_HIP )
     # This is kinda ugly but the PATH and HINTS don't get transmitted to sub-dependents
diff --git a/cmake_modules/PaRSECConfig.cmake.in b/cmake_modules/PaRSECConfig.cmake.in
index 5d7827bee..3dd2bc459 100644
--- a/cmake_modules/PaRSECConfig.cmake.in
+++ b/cmake_modules/PaRSECConfig.cmake.in
@@ -65,6 +65,10 @@ endif(@PARSEC_DIST_WITH_MPI@)
 if(@PARSEC_HAVE_CUDA@)
   find_package(CUDAToolkit REQUIRED)
   set(PARSEC_HAVE_CUDA TRUE)
+
+  if(@PARSEC_HAVE_CUDA_BATCH@)
+    set(PARSEC_HAVE_CUDA_BATCH TRUE)
+  endif(@PARSEC_HAVE_CUDA_BATCH@)
 endif(@PARSEC_HAVE_CUDA@)

 if(@PARSEC_HAVE_HIP@)
diff --git a/parsec/CMakeLists.txt b/parsec/CMakeLists.txt
index eea8bd70a..237985bce 100644
--- a/parsec/CMakeLists.txt
+++ b/parsec/CMakeLists.txt
@@ -238,6 +238,7 @@ if( BUILD_PARSEC )
       $<$:OTF2::OTF2>
       $<$:MPI::MPI_C>
       $<$:CUDA::cudart>
+      $<$:cuda>
       $<$:hip::host>
       ${EXTRA_LIBS}
     INTERFACE
diff --git a/parsec/arena.c b/parsec/arena.c
index 8d518d00a..908c313c7 100644
--- a/parsec/arena.c
+++ b/parsec/arena.c
@@ -235,43 +235,118 @@ int parsec_arena_allocate_device_private(parsec_data_copy_t *copy,
     return PARSEC_SUCCESS;
 }

-parsec_data_copy_t *parsec_arena_get_copy(parsec_arena_t *arena,
-                                          size_t count, int device,
-                                          parsec_datatype_t dtt)
+#include "parsec/utils/zone_malloc.h"
+#include "mca/device/device_gpu.h"
+
+#if defined(PARSEC_DEBUG)
+static int64_t parsec_countable_incoming_message = 0xF000000000000000;
+#endif /* defined(PARSEC_DEBUG) */
+
+static inline parsec_data_copy_t *
+parsec_arena_internal_copy_new(parsec_arena_t *arena,
+                               parsec_data_t *data,
+                               size_t count, int device,
+                               parsec_datatype_t dtt)
 {
-    parsec_data_t *data;
-    parsec_data_copy_t *copy;
-    int rc;
-
-
-    data = parsec_data_new();
+    parsec_data_copy_t *copy = NULL;
+    parsec_data_t* ldata = data;
     if( NULL == data ) {
+        ldata = parsec_data_new();
+        if( NULL == ldata ) {
+            return NULL;
+        }
+#if defined(PARSEC_DEBUG)
+        /* Name the data with a default
+           key to facilitate debugging */
+        ldata->key = (uint64_t)parsec_atomic_fetch_inc_int64(&parsec_countable_incoming_message);
+        ldata->key |= ((uint64_t)device) << 56;
+#endif /* defined(PARSEC_DEBUG) */
+    }
+    if( 0 == device ) {
+        copy = parsec_data_copy_new(ldata, device, dtt,
+                                    PARSEC_DATA_FLAG_PARSEC_OWNED | PARSEC_DATA_FLAG_PARSEC_MANAGED | PARSEC_DATA_FLAG_ARENA);
+        if (NULL == copy) {
+            goto free_and_return;
+        }
+        int rc = parsec_arena_allocate_device_private(copy, arena, count, device, dtt);
+        if (PARSEC_SUCCESS != rc) {
+            goto free_and_return;
+        }
+        return copy;
+    }
+    /**
+     * This part is not really nice: it breaks the separation between the devices and the way their
+     * memory is managed. However, it should give a nice performance improvement when the communication
+     * layer is capable of sending or receiving data directly to and from the accelerator memory. The
+     * only drawback is that once the GPU memory is full this allocation fails, and the software falls
+     * back to the prior behavior, going through the CPU memory.
+     *
+     * The zone deallocation is not symmetric, it will happen in the GPU management, when the data copies
+     * are released from the different LRU lists.
+     */
+    parsec_device_gpu_module_t *gpu_device = (parsec_device_gpu_module_t *)parsec_mca_device_get(device);
+    if (NULL == gpu_device) { return NULL; }
+    size_t size = count * arena->elem_size;
+    void* device_private = zone_malloc(gpu_device->memory, size);
+    if( NULL == device_private ) {
+        PARSEC_DEBUG_VERBOSE(10, parsec_debug_output, "Arena:\tallocate data copy on device %d of size %zu from zone %p failed (out of memory)\n",
+                             device, size, (void *)gpu_device->memory);
+        goto free_and_return;
+    }
+    copy = parsec_data_copy_new(ldata, device, dtt,
+                                PARSEC_DATA_FLAG_PARSEC_OWNED | PARSEC_DATA_FLAG_PARSEC_MANAGED);
+    if (NULL == copy) {
+        PARSEC_DEBUG_VERBOSE(10, parsec_debug_output, "Arena:\tallocate data copy on device %d of size %zu from zone %p failed to allocate copy (out of memory)\n",
+                             device, size, (void *)gpu_device->memory);
+        zone_free(gpu_device->memory, device_private);
+        goto free_and_return;
+    }
+    copy->dtt = dtt;
+    copy->device_private = device_private;
+    copy->arena_chunk = (parsec_arena_chunk_t*)gpu_device->memory;
+    PARSEC_DEBUG_VERBOSE(10, parsec_debug_output, "Arena:\tallocate data copy on device %d of size %zu from zone %p, "
+                         "data ptr %p",
+                         device, size, (void*)copy->arena_chunk, (void*)copy->device_private);
+    copy->version = 0;
+    copy->coherency_state = PARSEC_DATA_COHERENCY_INVALID;
+    copy->original->owner_device = device;
+    copy->original->preferred_device = device;
+    return copy;
+  free_and_return:
+    if( NULL != copy )
+        PARSEC_OBJ_RELEASE(copy);
+    if( NULL == data)
+        PARSEC_OBJ_RELEASE(ldata);  /* release the locally allocated data */
+    return NULL;
+}

-    copy = parsec_data_copy_new( data, device, dtt,
-                                 PARSEC_DATA_FLAG_ARENA |
-                                 PARSEC_DATA_FLAG_PARSEC_OWNED |
-                                 PARSEC_DATA_FLAG_PARSEC_MANAGED);
+parsec_data_copy_t *
+parsec_arena_get_new_copy(parsec_arena_t *arena,
+                          size_t count, int device,
+                          parsec_datatype_t dtt)
+{
+    parsec_data_copy_t *dev0_copy, *copy;

-    if(NULL == copy) {
-        PARSEC_OBJ_RELEASE(data);
+    dev0_copy = parsec_arena_internal_copy_new(arena, NULL, count, 0 /* first allocate the copy on the device 0 */, dtt);
+    if( NULL == dev0_copy ) {
         return NULL;
     }
+    dev0_copy->coherency_state = PARSEC_DATA_COHERENCY_INVALID;
+    dev0_copy->version = 0;  /* start from somewhere */
+    if( 0 == device ) {
+        return dev0_copy;
+    }

-    rc = parsec_arena_allocate_device_private(copy, arena, count, device,
dtt); - + copy = parsec_arena_internal_copy_new(arena, dev0_copy->original, count, device, dtt); + if( NULL == copy ) { + copy = dev0_copy; /* return the main memory data copy */ + } /* This data is going to be released once all copies are released * It does not exist without at least a copy, and we don't give the * pointer to the user, so we must remove our retain from it */ - PARSEC_OBJ_RELEASE(data); - - if( PARSEC_SUCCESS != rc ) { - PARSEC_OBJ_RELEASE(copy); - return NULL; - } - + PARSEC_OBJ_RELEASE(dev0_copy->original); return copy; } diff --git a/parsec/arena.h b/parsec/arena.h index a7c6160a1..51c38239b 100644 --- a/parsec/arena.h +++ b/parsec/arena.h @@ -133,15 +133,15 @@ int parsec_arena_construct_ex(parsec_arena_t* arena, * enough resource to allocate a new data copy of this type. */ -parsec_data_copy_t *parsec_arena_get_copy(parsec_arena_t *arena, - size_t count, int device, - parsec_datatype_t dtt); +parsec_data_copy_t *parsec_arena_get_new_copy(parsec_arena_t *arena, + size_t count, int device, + parsec_datatype_t dtt); /** * @brief Allocates memory for a given data copy. This is a function used by * DSLs to set the memory associated with a data copy they have created. - * It is also used by parsec_arena_get_copy. - * + * It is also used by parsec_arena_get_new_copy. + * * @param copy the (empty) data copy to allocate memory for. NB: the @p original * field of this data copy must be set. The operation overwrites the device * dtt and count of this data copy, as well as the device_private pointer. diff --git a/parsec/class/info.c b/parsec/class/info.c index e9c45cb51..aa5b2b901 100644 --- a/parsec/class/info.c +++ b/parsec/class/info.c @@ -217,7 +217,7 @@ void parsec_info_object_array_init(parsec_info_object_array_t *oa, parsec_info_t if(oa->known_infos == 0) oa->info_objects = NULL; else - oa->info_objects = calloc(sizeof(void*), oa->known_infos); + oa->info_objects = calloc(oa->known_infos, sizeof(void*)); oa->infos = nfo; oa->cons_obj = cons_obj; } @@ -265,7 +265,7 @@ static void parsec_ioa_resize_and_rdlock(parsec_info_object_array_t *oa, parsec_ oa->info_objects = realloc(oa->info_objects, sizeof(void *) * ns); memset(&oa->info_objects[oa->known_infos - 1], 0, ns - oa->known_infos); } else { - oa->info_objects = calloc(sizeof(void*), ns); + oa->info_objects = calloc(ns, sizeof(void*)); } oa->known_infos = ns; } @@ -312,6 +312,8 @@ void *parsec_info_get(parsec_info_object_array_t *oa, parsec_info_id_t iid) if(NULL == ie->constructor) return ret; nio = ie->constructor(oa->cons_obj, ie->cons_data); + if( NULL == nio ) + return ret; ret = parsec_info_test_and_set(oa, iid, nio, NULL); if(ret != nio && NULL != ie->destructor) { ie->destructor(nio, ie->des_data); diff --git a/parsec/data.c b/parsec/data.c index a08c9d5d6..6f72c0211 100644 --- a/parsec/data.c +++ b/parsec/data.c @@ -15,6 +15,7 @@ #include "parsec/sys/atomic.h" #include "parsec/remote_dep.h" #include "parsec/parsec_internal.h" +#include "parsec/utils/zone_malloc.h" static parsec_lifo_t parsec_data_lifo; static parsec_lifo_t parsec_data_copies_lifo; @@ -65,6 +66,7 @@ static void parsec_data_construct(parsec_data_t* obj ) obj->preferred_device = -1; obj->key = 0; obj->nb_elts = 0; + obj->nb_copies = 0; for( uint32_t i = 0; i < parsec_nb_devices; obj->device_copies[i] = NULL, i++ ); obj->dc = NULL; @@ -99,11 +101,12 @@ static void parsec_data_destruct(parsec_data_t* obj ) * GPU copies are normally stored in LRU lists, and must be * destroyed by the release list to free the memory on the device */ - PARSEC_OBJ_RELEASE( 
copy ); + PARSEC_DATA_COPY_RELEASE(copy); } } assert(NULL == obj->device_copies[i]); } + assert(0 == obj->nb_copies); } PARSEC_OBJ_CLASS_INSTANCE(parsec_data_t, parsec_object_t, @@ -161,8 +164,8 @@ void parsec_data_delete(parsec_data_t* data) inline int parsec_data_copy_attach(parsec_data_t* data, - parsec_data_copy_t* copy, - uint8_t device) + parsec_data_copy_t* copy, + uint8_t device) { assert(NULL == copy->original); assert(NULL == copy->older); @@ -175,6 +178,7 @@ parsec_data_copy_attach(parsec_data_t* data, copy->older = NULL; return PARSEC_ERROR; } + parsec_atomic_fetch_add_int32(&data->nb_copies, 1); PARSEC_OBJ_RETAIN(data); return PARSEC_SUCCESS; } @@ -192,6 +196,7 @@ int parsec_data_copy_detach(parsec_data_t* data, return PARSEC_ERR_NOT_FOUND; } data->device_copies[device] = copy->older; + parsec_atomic_fetch_add_int32(&data->nb_copies, -1); copy->original = NULL; copy->older = NULL; @@ -221,7 +226,7 @@ parsec_data_copy_t* parsec_data_copy_new(parsec_data_t* data, uint8_t device, } copy->flags = flags; if( PARSEC_SUCCESS != parsec_data_copy_attach(data, copy, device) ) { - PARSEC_OBJ_RELEASE(copy); + PARSEC_DATA_COPY_RELEASE(copy); return NULL; } copy->dtt = dtt; @@ -330,6 +335,12 @@ int parsec_data_start_transfer_ownership_to_copy(parsec_data_t* data, copy = data->device_copies[device]; assert( NULL != copy ); + if( valid_copy == device ) { + PARSEC_DEBUG_VERBOSE(10, parsec_debug_output, + "DEV[%d]: already has ownership of data %p to copy %p in mode %d", + device, data, copy, access_mode); + goto bookkeeping; + } PARSEC_DEBUG_VERBOSE(10, parsec_debug_output, "DEV[%d]: start transfer ownership of data %p to copy %p in mode %d", device, data, copy, access_mode); @@ -417,6 +428,7 @@ int parsec_data_start_transfer_ownership_to_copy(parsec_data_t* data, } } + bookkeeping: if( PARSEC_FLOW_ACCESS_READ & access_mode ) { copy->readers++; } @@ -435,40 +447,53 @@ int parsec_data_start_transfer_ownership_to_copy(parsec_data_t* data, return valid_copy; } -static char dump_coherency_codex(parsec_data_coherency_t state) -{ - if( PARSEC_DATA_COHERENCY_INVALID == state ) return 'I'; - if( PARSEC_DATA_COHERENCY_OWNED == state ) return 'O'; - if( PARSEC_DATA_COHERENCY_EXCLUSIVE == state ) return 'E'; - if( PARSEC_DATA_COHERENCY_SHARED == state ) return 'S'; - return 'X'; -} - -void parsec_dump_data_copy(parsec_data_copy_t* copy) +void parsec_data_copy_dump(parsec_data_copy_t* copy) { - parsec_debug_verbose(0, 0, "- [%d]: copy %p state %c readers %d version %u\n", - (int)copy->device_index, copy, dump_coherency_codex(copy->coherency_state), copy->readers, copy->version); + char *tranfer = "---", flags[] = "----", *coherency = "undef"; + switch(copy->data_transfer_status) { + case PARSEC_DATA_STATUS_NOT_TRANSFER: tranfer = "no"; break; + case PARSEC_DATA_STATUS_UNDER_TRANSFER: tranfer = "yes"; break; + case PARSEC_DATA_STATUS_COMPLETE_TRANSFER: tranfer = "no"; break; + } + if (copy->flags & PARSEC_DATA_FLAG_ARENA) flags[0] = 'A'; + if (copy->flags & PARSEC_DATA_FLAG_TRANSIT) flags[1] = 'T'; + if (copy->flags & PARSEC_DATA_FLAG_PARSEC_MANAGED) flags[2] = 'M'; + if (copy->flags & PARSEC_DATA_FLAG_PARSEC_OWNED) flags[3] = 'O'; + + if( PARSEC_DATA_COHERENCY_INVALID == copy->coherency_state ) coherency = "invalid"; + if( PARSEC_DATA_COHERENCY_OWNED == copy->coherency_state ) coherency = "owned"; + if( PARSEC_DATA_COHERENCY_EXCLUSIVE == copy->coherency_state ) coherency = "exclusive"; + if( PARSEC_DATA_COHERENCY_SHARED == copy->coherency_state ) coherency = "shared"; + + parsec_debug_verbose(0, 0, "%s 
[%d]: copy %p [ref %d] coherency %s readers %d version %u transit %s flags %s\n"
+                         "    older %p orig %p [%llx] arena %p dev_priv %p\n",
+                         ((NULL != copy->original) && (copy->original->owner_device == copy->device_index)) ? "*" : " ",
+                         (int)copy->device_index, copy, copy->super.super.obj_reference_count, coherency, copy->readers, copy->version, tranfer, flags,
+                         (void *)copy->older, (void *)copy->original,
+                         (NULL != copy->original) ? (unsigned long long)copy->original->key : (unsigned long long)-1, (void *)copy->arena_chunk, copy->device_private);
 }

-void parsec_dump_data(parsec_data_t* data)
+void parsec_data_dump(parsec_data_t* data)
 {
-    parsec_debug_verbose(0, 0, "data %p key %lu owner %d\n", data, data->key, data->owner_device);
+    parsec_debug_verbose(0, 0, "data %p [ref %d] key %lu owner dev %d pref dev %d copies %d dc %p [# elems %zu]\n",
+                         data, data->super.obj_reference_count, data->key, data->owner_device, data->preferred_device, data->nb_copies,
+                         (void*)data->dc, data->nb_elts);
     for( uint32_t i = 0; i < parsec_nb_devices; i++ ) {
         if( NULL != data->device_copies[i])
-            parsec_dump_data_copy(data->device_copies[i]);
+            parsec_data_copy_dump(data->device_copies[i]);
     }
 }

 parsec_data_copy_t* parsec_data_get_copy(parsec_data_t* data, uint32_t device)
 {
-   return PARSEC_DATA_GET_COPY(data, device);
+    return PARSEC_DATA_GET_COPY(data, device);
 }

 void parsec_data_copy_release(parsec_data_copy_t* copy)
 {
-   /* TODO: Move the copy back to the CPU before destroying it */
+    /* TODO: Move the copy back to the CPU before destroying it */
     PARSEC_DATA_COPY_RELEASE(copy);
 }

@@ -509,7 +534,7 @@ parsec_data_create( parsec_data_t **holder,

         if( !parsec_atomic_cas_ptr(holder, NULL, data) ) {
             parsec_data_copy_detach(data, data_copy, 0);
-            PARSEC_OBJ_RELEASE(data_copy);
+            PARSEC_DATA_COPY_RELEASE(data_copy);
             data = *holder;
         }
     } else {
@@ -560,3 +585,42 @@ parsec_data_destroy( parsec_data_t *data )
 #endif
     PARSEC_OBJ_RELEASE(data);
 }
+
+#include "parsec/utils/debug.h"
+
+int parsec_data_release_self_contained_data(parsec_data_t *data)
+{
+    if (data->super.obj_reference_count != data->nb_copies) return 0;
+    parsec_data_copy_t *copy;
+    PARSEC_DEBUG_VERBOSE(1, parsec_debug_output, "Examine the status of data %p with %d copies and refcounts at %s:%d\n",
+                         data, data->nb_copies, __FILE__, __LINE__);
+    /* This data is only referenced by its own copies. If these copies are also only referenced by
+     * the data, then we can release them all.
+     */
+    for( uint32_t i = 0; i < parsec_nb_devices; i++) {
+        if (NULL == (copy = data->device_copies[i])) continue;
+        if( copy->super.super.obj_reference_count > 1 )
+            return 0;
+    }
+    PARSEC_DEBUG_VERBOSE(1, parsec_debug_output, "Force the release of data %p at %s:%d",
+                         data, __FILE__, __LINE__);
+    for( uint32_t i = 0; i < parsec_nb_devices; i++) {
+        if (NULL == (copy = data->device_copies[i])) continue;
+        assert(1 == copy->super.super.obj_reference_count);
+        if( 0 == copy->device_index ) {
+            PARSEC_OBJ_RELEASE(copy);
+            assert(NULL == copy);
+        } else {
+            /* Do not release data copies that do not belong to the CPU or really bad things will happen.
+             * Only the device manager can release these copies; the best we can do here is to detach them
+             * from the data and eventually release their memory.
+ */ + parsec_data_copy_detach(data, copy, copy->device_index); + zone_free((zone_malloc_t *)copy->arena_chunk, copy->device_private); + copy->device_private = NULL; + copy->arena_chunk = NULL; + } + } + return 1; +} + diff --git a/parsec/data.h b/parsec/data.h index f21c2918b..eafb8c504 100644 --- a/parsec/data.h +++ b/parsec/data.h @@ -31,9 +31,9 @@ typedef uint8_t parsec_data_coherency_t; #define PARSEC_DATA_COHERENCY_SHARED ((parsec_data_coherency_t)0x4) typedef uint8_t parsec_data_status_t; -#define PARSEC_DATA_STATUS_NOT_TRANSFER ((parsec_data_coherency_t)0x0) -#define PARSEC_DATA_STATUS_UNDER_TRANSFER ((parsec_data_coherency_t)0x1) -#define PARSEC_DATA_STATUS_COMPLETE_TRANSFER ((parsec_data_coherency_t)0x2) +#define PARSEC_DATA_STATUS_NOT_TRANSFER ((parsec_data_status_t)0x0) +#define PARSEC_DATA_STATUS_UNDER_TRANSFER ((parsec_data_status_t)0x1) +#define PARSEC_DATA_STATUS_COMPLETE_TRANSFER ((parsec_data_status_t)0x2) /** * Data copies have three levels of 'ownership': * - a data copy can be owned and managed by PaRSEC. @@ -124,8 +124,8 @@ PARSEC_DECLSPEC void parsec_data_end_transfer_ownership_to_copy(parsec_data_t* data, uint8_t device, uint8_t access_mode); -PARSEC_DECLSPEC void parsec_dump_data_copy(parsec_data_copy_t* copy); -PARSEC_DECLSPEC void parsec_dump_data(parsec_data_t* copy); +PARSEC_DECLSPEC void parsec_data_copy_dump(parsec_data_copy_t *copy); +PARSEC_DECLSPEC void parsec_data_dump(parsec_data_t* copy); PARSEC_DECLSPEC parsec_data_t * parsec_data_create( parsec_data_t **holder, diff --git a/parsec/data_dist/matrix/map_operator.c b/parsec/data_dist/matrix/map_operator.c index 1d6af9196..e6ba9a2e4 100644 --- a/parsec/data_dist/matrix/map_operator.c +++ b/parsec/data_dist/matrix/map_operator.c @@ -106,7 +106,7 @@ static const parsec_symbol_t symb_column = { .flags = PARSEC_SYMBOL_IS_STANDALONE }; -static inline int affinity_of_map_operator(parsec_task_t *this_task, +static inline int affinity_of_map_operator(const parsec_task_t *this_task, parsec_data_ref_t *ref) { const parsec_map_operator_taskpool_t *__tp = (const parsec_map_operator_taskpool_t*)this_task->taskpool; @@ -117,7 +117,7 @@ static inline int affinity_of_map_operator(parsec_task_t *this_task, return 1; } -static inline int initial_data_of_map_operator(parsec_task_t *this_task, +static inline int initial_data_of_map_operator(const parsec_task_t *this_task, parsec_data_ref_t *refs) { int __flow_nb = 0; @@ -134,7 +134,7 @@ static inline int initial_data_of_map_operator(parsec_task_t *this_task, return __flow_nb; } -static inline int final_data_of_map_operator(parsec_task_t *this_task, +static inline int final_data_of_map_operator(const parsec_task_t *this_task, parsec_data_ref_t *data_refs) { int __flow_nb = 0; @@ -296,13 +296,13 @@ static int data_lookup(parsec_execution_stream_t *es, this_task->data[0].data_in = parsec_data_get_copy(src(m,n), 0); this_task->data[0].source_repo_entry = NULL; this_task->data[0].data_out = NULL; - PARSEC_OBJ_RETAIN(this_task->data[0].data_in); + PARSEC_DATA_COPY_RETAIN(this_task->data[0].data_in); } if( NULL != __tp->dest ) { this_task->data[1].data_in = parsec_data_get_copy(dest(m,n), 0); this_task->data[1].source_repo_entry = NULL; this_task->data[1].data_out = this_task->data[1].data_in; - PARSEC_OBJ_RETAIN(this_task->data[1].data_in); + PARSEC_DATA_COPY_RETAIN(this_task->data[1].data_in); } return PARSEC_HOOK_RETURN_DONE; } diff --git a/parsec/data_dist/matrix/two_dim_rectangle_cyclic.h b/parsec/data_dist/matrix/two_dim_rectangle_cyclic.h index 9d766bb34..6c1587eb8 100644 
--- a/parsec/data_dist/matrix/two_dim_rectangle_cyclic.h +++ b/parsec/data_dist/matrix/two_dim_rectangle_cyclic.h @@ -45,7 +45,6 @@ typedef struct parsec_matrix_block_cyclic { * @param dc matrix description structure, already allocated, that will be initialize * @param mtype type of data used for this matrix * @param storage type of storage of data - * @param nodes number of nodes * @param myrank rank of the local node (as of mpi rank) * @param mb number of row in a tile * @param nb number of column in a tile diff --git a/parsec/data_dist/matrix/vector_two_dim_cyclic.c b/parsec/data_dist/matrix/vector_two_dim_cyclic.c index 1c0b70a74..838f02f25 100644 --- a/parsec/data_dist/matrix/vector_two_dim_cyclic.c +++ b/parsec/data_dist/matrix/vector_two_dim_cyclic.c @@ -251,7 +251,7 @@ static parsec_data_t* vector_twoDBC_data_of(parsec_data_collection_t *desc, ...) #endif /* Compute the local tile row */ - assert( dc->super.bsiz == dc->super.mb ); + assert( dc->super.bsiz == (size_t)dc->super.mb ); local_m = m / dc->lcm; diff --git a/parsec/data_internal.h b/parsec/data_internal.h index 49b3a3c7f..c15af1b0c 100644 --- a/parsec/data_internal.h +++ b/parsec/data_internal.h @@ -19,6 +19,7 @@ #include "parsec/arena.h" #include "parsec/data.h" #include "parsec/class/parsec_future.h" +#include "parsec/utils/debug.h" /** * This structure is the keeper of all the information regarding @@ -30,11 +31,12 @@ struct parsec_data_s { parsec_atomic_lock_t lock; - parsec_data_key_t key; int8_t owner_device; int8_t preferred_device; /* Hint set from the MEMADVICE device API to define on * which device this data should be modified RW when there * are multiple choices. -1 means no preference. */ + int32_t nb_copies; /* How many valid copies are attached to this data */ + parsec_data_key_t key; struct parsec_data_collection_s* dc; size_t nb_elts; /* size in bytes of the memory layout */ struct parsec_data_copy_s *device_copies[]; /* this array allocated according to the number of devices @@ -85,22 +87,50 @@ PARSEC_DECLSPEC PARSEC_OBJ_CLASS_DECLARATION(parsec_data_copy_t); #define PARSEC_DATA_GET_COPY(DATA, DEVID) \ ((DATA)->device_copies[(DEVID)]) + +int parsec_data_release_self_contained_data(parsec_data_t* data); /** * Decrease the refcount of this copy of the data. If the refcount reach * 0 the upper level is in charge of cleaning up and releasing all content * of the copy. 
*/ -#define PARSEC_DATA_COPY_RELEASE(DATA) \ +#if 0 +#define PARSEC_DATA_COPY_RELEASE(COPY) \ do { \ - PARSEC_DEBUG_VERBOSE(20, parsec_debug_output, "Release data copy %p at %s:%d", (DATA), __FILE__, __LINE__); \ - PARSEC_OBJ_RELEASE((DATA)); \ + PARSEC_DEBUG_VERBOSE(20, parsec_debug_output, "Release data copy %p at %s:%d", (COPY), __FILE__, __LINE__); \ + PARSEC_OBJ_RELEASE((COPY)); \ + if( (NULL != (COPY)) && (NULL != ((COPY)->original)) ) parsec_data_release_self_contained_data((COPY)->original); \ } while(0) +#define PARSEC_DATA_COPY_RETAIN(COPY) \ + do { \ + PARSEC_DEBUG_VERBOSE(20, parsec_debug_output, "Retain data copy %p at %s:%d", (COPY), __FILE__, __LINE__); \ + PARSEC_OBJ_RETAIN((COPY)); \ + } while(0) +#else +static inline void __parsec_data_copy_release(parsec_data_copy_t** copy) +{ + PARSEC_DEBUG_VERBOSE(20, parsec_debug_output, "Release data copy %p at %s:%d", *copy, __FILE__, __LINE__); + PARSEC_OBJ_RELEASE(*copy); + if ((NULL != *copy) && (NULL != (*copy)->original) && (1 == (*copy)->super.super.obj_reference_count)) + parsec_data_release_self_contained_data((*copy)->original); +} +#define PARSEC_DATA_COPY_RELEASE(COPY) \ + __parsec_data_copy_release(&(COPY)) + +static inline void __parsec_data_copy_retain(parsec_data_copy_t* copy) +{ + PARSEC_DEBUG_VERBOSE(20, parsec_debug_output, "Retain data copy %p at %s:%d", copy, __FILE__, __LINE__); + PARSEC_OBJ_RETAIN(copy); +} +#define PARSEC_DATA_COPY_RETAIN(COPY) \ + __parsec_data_copy_retain((COPY)) +#endif /* 0 */ /** * Return the device private pointer for a datacopy. */ -#define PARSEC_DATA_COPY_GET_PTR(DATA) \ - ((DATA) ? (DATA)->device_private : NULL) +#define PARSEC_DATA_COPY_GET_PTR(COPY) \ + ((COPY) ? (COPY)->device_private : NULL) /** @} */ diff --git a/parsec/include/parsec/parsec_options.h.in b/parsec/include/parsec/parsec_options.h.in index 85af2db8e..30521103e 100644 --- a/parsec/include/parsec/parsec_options.h.in +++ b/parsec/include/parsec/parsec_options.h.in @@ -71,6 +71,7 @@ /* Communication engine */ #cmakedefine PARSEC_DIST_WITH_MPI +#cmakedefine PARSEC_MPI_IS_GPU_AWARE #cmakedefine PARSEC_DIST_THREAD #cmakedefine PARSEC_DIST_PRIORITIES #cmakedefine PARSEC_DIST_COLLECTIVES @@ -130,6 +131,7 @@ #cmakedefine PARSEC_HAVE_DEV_CPU_SUPPORT #cmakedefine PARSEC_HAVE_DEV_RECURSIVE_SUPPORT #cmakedefine PARSEC_HAVE_DEV_CUDA_SUPPORT +#cmakedefine PARSEC_HAVE_DEV_CUDA_BATCH_SUPPORT #cmakedefine PARSEC_HAVE_DEV_HIP_SUPPORT #cmakedefine PARSEC_HAVE_DEV_LEVEL_ZERO_SUPPORT #cmakedefine PARSEC_HAVE_DEV_OPENCL_SUPPORT diff --git a/parsec/interfaces/dtd/insert_function.c b/parsec/interfaces/dtd/insert_function.c index d1eac58a8..04ca48c3d 100644 --- a/parsec/interfaces/dtd/insert_function.c +++ b/parsec/interfaces/dtd/insert_function.c @@ -410,7 +410,7 @@ parsec_dtd_taskpool_destructor(parsec_dtd_taskpool_t *tp) free((void *)tp->super.profiling_array); #endif /* defined(PARSEC_PROF_TRACE) */ - if( tp->super.taskpool_name != NULL ) { + if( NULL != tp->super.taskpool_name) { free(tp->super.taskpool_name); tp->super.taskpool_name = NULL; } @@ -1477,7 +1477,7 @@ parsec_dtd_startup(parsec_context_t *context, if( !(tp->devices_index_mask & (1 << device->device_index))) continue; /* not supported */ // If CUDA is enabled, let the CUDA device activated for this // taskpool. 
- if( PARSEC_DEV_CUDA == device->type ) continue; + if( PARSEC_DEV_CUDA & device->type ) continue; if( NULL != device->taskpool_register ) if( PARSEC_SUCCESS != device->taskpool_register(device, (parsec_taskpool_t *)tp)) { @@ -2303,6 +2303,7 @@ static parsec_hook_return_t parsec_dtd_gpu_task_submit(parsec_execution_stream_t return device->kernel_scheduler(device, es, gpu_task); #else parsec_warning("DTD: Selected best device is a GPU, but no GPU is supported at compile time. Falling back to CPU"); + (void)this_task; return PARSEC_HOOK_RETURN_NEXT; #endif } @@ -2355,8 +2356,8 @@ int parsec_dtd_task_class_add_chore(parsec_taskpool_t *tp, /* We assume that incarnations is big enough, because it has been pre-allocated * with PARSEC_DEV_MAX_NB_TYPE+1 chores, as this is a DTD task class */ incarnations = (__parsec_chore_t*)dtd_tc->super.incarnations; - for(i = 0; i < PARSEC_DEV_MAX_NB_TYPE && incarnations[i].type != PARSEC_DEV_NONE; i++) { - if( incarnations[i].type == device_type ) { + for(i = 0; i < PARSEC_DEV_MAX_NB_TYPE && (incarnations[i].type & PARSEC_DEV_ANY_TYPE) != PARSEC_DEV_NONE; i++) { + if( incarnations[i].type & PARSEC_DEV_ANY_TYPE & device_type ) { parsec_warning("A chore for this device type has already been added to task class '%s'\n", tc->name); return PARSEC_ERROR; @@ -2369,7 +2370,7 @@ int parsec_dtd_task_class_add_chore(parsec_taskpool_t *tp, } incarnations[i].type = device_type; - if(PARSEC_DEV_CUDA == device_type) { + if(PARSEC_DEV_CUDA & device_type) { incarnations[i].hook = parsec_dtd_gpu_task_submit; dtd_tc->gpu_func_ptr = (parsec_advance_task_function_t)function; } @@ -3258,19 +3259,20 @@ __parsec_dtd_taskpool_create_task(parsec_taskpool_t *tp, dtd_tc = parsec_dtd_create_task_classv(name_of_kernel, nb_params, params); tc = &dtd_tc->super; - __parsec_chore_t **incarnations = (__parsec_chore_t **)&tc->incarnations; - (*incarnations)[0].type = device_type; - if( device_type == PARSEC_DEV_CUDA ) { + __parsec_chore_t *incarnations = (__parsec_chore_t *)tc->incarnations; + incarnations[0].type = device_type; + if( device_type & PARSEC_DEV_CUDA ) { /* Special case for CUDA: we need an intermediate */ - (*incarnations)[0].hook = parsec_dtd_gpu_task_submit; + incarnations[0].hook = parsec_dtd_gpu_task_submit; dtd_tc->gpu_func_ptr = (parsec_advance_task_function_t)fpointer; } else { /* Default case: the user-provided function is directly the hook to call */ - (*incarnations)[0].hook = fpointer; // We can directly call the CPU hook + incarnations[0].hook = fpointer; // We can directly call the CPU hook dtd_tc->cpu_func_ptr = fpointer; } - (*incarnations)[1].type = PARSEC_DEV_NONE; + incarnations[1].type = PARSEC_DEV_NONE; + incarnations[1].hook = NULL; /* Bookkeeping of the task class */ parsec_dtd_register_task_class(&dtd_tp->super, fkey, tc); @@ -3463,7 +3465,7 @@ parsec_arena_datatype_t *parsec_dtd_create_arena_datatype(parsec_context_t *ctx, if(NULL != new_adt) return NULL; #endif - new_adt = calloc(sizeof(parsec_arena_datatype_t), 1); + new_adt = calloc(1, sizeof(parsec_arena_datatype_t)); if(NULL == new_adt) return NULL; new_adt->ht_item.key = my_id; diff --git a/parsec/interfaces/dtd/insert_function_internal.h b/parsec/interfaces/dtd/insert_function_internal.h index adfc7e54b..640923ebc 100644 --- a/parsec/interfaces/dtd/insert_function_internal.h +++ b/parsec/interfaces/dtd/insert_function_internal.h @@ -417,16 +417,16 @@ void parsec_dtd_fini(); static inline void -parsec_dtd_retain_data_copy( parsec_data_copy_t *data ) +parsec_dtd_retain_data_copy( parsec_data_copy_t 
*copy ) { - assert( data->super.super.obj_reference_count >= 1 ); - PARSEC_OBJ_RETAIN(data); + assert( copy->super.super.obj_reference_count >= 1 ); + PARSEC_DATA_COPY_RETAIN(copy); } static inline void -parsec_dtd_release_data_copy( parsec_data_copy_t *data ) +parsec_dtd_release_data_copy(parsec_data_copy_t *copy) { - PARSEC_OBJ_RELEASE(data); + PARSEC_DATA_COPY_RELEASE(copy); } diff --git a/parsec/interfaces/ptg/ptg-compiler/jdf2c.c b/parsec/interfaces/ptg/ptg-compiler/jdf2c.c index ec53abeff..2f37075da 100644 --- a/parsec/interfaces/ptg/ptg-compiler/jdf2c.c +++ b/parsec/interfaces/ptg/ptg-compiler/jdf2c.c @@ -672,6 +672,17 @@ static char* dump_local_assignments( void** elem, void* arg ) if( dos > 0 ) { string_arena_init(info->sa); string_arena_add_string(info->sa, "const int %s = %s%s.value;", def->name, info->holder, def->name); +#if 0 + jdf_expr_t* type_str = jdf_find_property( def->properties, "type", NULL ); + if( NULL == type_str ) { + string_arena_add_string(info->sa, "const int %s = %s%s.value;", def->name, info->holder, def->name); + } else { + expr_info_t expr_info = {.sa = info->sa, .prefix = "", .suffix = "", .assignments = "locals"}; + string_arena_add_string(info->sa, "const %s %s = %s%s.value;", + dump_expr((void**)type_str, &expr_info), + def->name, info->holder, def->name); + } +#endif if( dos > 1 ) string_arena_add_string(info->sa, " (void)%s;", def->name); return string_arena_get_string(info->sa); @@ -2044,7 +2055,7 @@ static void jdf_generate_affinity( const jdf_t *jdf, const jdf_function_entry_t assert( NULL == data_affinity->var ); } - coutput("static inline int %s(%s *this_task,\n" + coutput("static inline int %s(const %s *this_task,\n" " parsec_data_ref_t *ref)\n" "{\n" " const __parsec_%s_internal_taskpool_t *__parsec_tp = (const __parsec_%s_internal_taskpool_t*)this_task->taskpool;\n", @@ -3938,6 +3949,7 @@ jdf_generate_function_incarnation_list( const jdf_t *jdf, jdf_def_list_t* dyld_property; jdf_def_list_t* evaluate_property = NULL; jdf_def_list_t* device_property = NULL; + jdf_def_list_t* batch_property = NULL; (void)jdf; string_arena_add_string(sa, "static const __parsec_chore_t __%s_chores[] ={\n", base_name); @@ -3945,18 +3957,32 @@ jdf_generate_function_incarnation_list( const jdf_t *jdf, jdf_find_property(body->properties, "type", &type_property); jdf_find_property(body->properties, "dyld", &dyld_property); jdf_find_property(body->properties, JDF_BODY_PROP_EVALUATE, &evaluate_property); - if( NULL == type_property) { + jdf_find_property(body->properties, "batch", &batch_property); + if (NULL == type_property) + { string_arena_add_string(sa, "#if defined(PARSEC_HAVE_DEV_CPU_SUPPORT)\n"); string_arena_add_string(sa, " { .type = PARSEC_DEV_CPU,\n"); string_arena_add_string(sa, " .evaluate = (parsec_evaluate_function_t*)%s,\n", (NULL == evaluate_property) ? 
"NULL" : evaluate_property->expr->jdf_c_code.fname); string_arena_add_string(sa, " .hook = (parsec_hook_t*)hook_of_%s },\n", base_name); string_arena_add_string(sa, "#endif /* defined(PARSEC_HAVE_DEV_CPU_SUPPORT) */\n"); - } else { + if( NULL != batch_property ) { + fprintf(stderr, + "Error: batched property (%s) not allowed for devices other than accelerators in body of task %s at line %d\n", + batch_property->expr->jdf_var, f->fname, JDF_OBJECT_LINENO(body)); + assert( NULL != batch_property ); + } + } + else + { char* dev_upper = strdup_upper(type_property->expr->jdf_var); string_arena_add_string(sa, "#if defined(PARSEC_HAVE_DEV_%s_SUPPORT)\n", dev_upper); - string_arena_add_string(sa, " { .type = PARSEC_DEV_%s,\n", dev_upper); + string_arena_add_string(sa, " { .type = PARSEC_DEV_%s", dev_upper); + if( NULL != batch_property) { + string_arena_add_string(sa, " | PARSEC_DEV_CHORE_ALLOW_BATCH"); + } + string_arena_add_string(sa, ",\n"); if( NULL == dyld_property ) { string_arena_add_string(sa, " .dyld = NULL,\n"); } else { @@ -4491,7 +4517,7 @@ static void jdf_generate_startup_hook( const jdf_t *jdf ) " parsec_task_class_t* tc = (parsec_task_class_t*)__parsec_tp->super.super.task_classes_array[i];\n" " __parsec_chore_t* chores = (__parsec_chore_t*)tc->incarnations;\n" " uint32_t idx = 0, j;\n" - " for( j = 0; PARSEC_DEV_NONE != chores[j].type; j++ ) {\n" + " for( j = 0; PARSEC_DEV_NONE != (chores[j].type & PARSEC_DEV_ANY_TYPE); j++ ) {\n" " if( !(supported_dev & chores[j].type) ) continue;\n" " if( j != idx ) {\n" " chores[idx] = chores[j];\n" @@ -4551,11 +4577,14 @@ static void jdf_generate_destructor( const jdf_t *jdf ) " free(__parsec_tp->super.super.task_classes_array); __parsec_tp->super.super.task_classes_array = NULL;\n" " __parsec_tp->super.super.nb_task_classes = 0;\n" "\n" + "#if PARSEC_%s_ADT_IDX_MAX > 0\n" " for(i = 0; i < (uint32_t)__parsec_tp->super.arenas_datatypes_size; i++) {\n" " if( NULL != __parsec_tp->super.arenas_datatypes[i].arena ) {\n" " PARSEC_OBJ_RELEASE(__parsec_tp->super.arenas_datatypes[i].arena);\n" " }\n" - " }\n"); + " }\n" + "#endif /* PARSEC_%s_ADT_IDX_MAX > 0 */", + jdf_basename, jdf_basename); coutput(" /* Destroy the data repositories for this object */\n"); for( f = jdf->functions; NULL != f; f = f->next ) { @@ -4680,7 +4709,7 @@ static void jdf_generate_constructor( const jdf_t* jdf ) coutput(" for( i = 0; i < __parsec_tp->super.super.nb_task_classes; i++ ) {\n" " __parsec_tp->super.super.task_classes_array[i] = tc = malloc(sizeof(parsec_task_class_t));\n" " memcpy(tc, %s_task_classes[i], sizeof(parsec_task_class_t));\n" - " for( j = 0; PARSEC_DEV_NONE != tc->incarnations[j].type; j++); /* compute the number of incarnations */\n" + " for( j = 0; PARSEC_DEV_NONE != (tc->incarnations[j].type & PARSEC_DEV_ANY_TYPE); j++); /* compute the number of incarnations */\n" " tc->incarnations = (__parsec_chore_t*)malloc((j+1) * sizeof(__parsec_chore_t));\n " " memcpy((__parsec_chore_t*)tc->incarnations, %s_task_classes[i]->incarnations, (j+1) * sizeof(__parsec_chore_t));\n\n" " /* Add a placeholder for initialization and startup task */\n" @@ -5577,7 +5606,7 @@ jdf_generate_code_call_initialization(const jdf_t *jdf, const jdf_call_t *call, jdf_generate_code_reshape_input_from_desc(jdf, f, flow, dl, spaces); coutput("%s this_task->data._f_%s.data_out = chunk;\n" - "%s PARSEC_OBJ_RETAIN(chunk);\n", + "%s PARSEC_DATA_COPY_RETAIN(chunk);\n", spaces, flow->varname, spaces); @@ -5603,7 +5632,7 @@ jdf_generate_code_call_initialization(const jdf_t *jdf, const jdf_call_t 
*call, assert( dl->datatype_local.count != NULL ); string_arena_add_string(sa2, "%s", dump_expr((void**)dl->datatype_local.count, &info)); - coutput("%s chunk = parsec_arena_get_copy(%s->arena, %s, target_device, %s->opaque_dtt);\n" + coutput("%s chunk = parsec_arena_get_new_copy(%s->arena, %s, target_device, %s->opaque_dtt);\n" "%s chunk->original->owner_device = target_device;\n" "%s this_task->data._f_%s.data_out = chunk;\n", spaces, string_arena_get_string(sa), string_arena_get_string(sa2), string_arena_get_string(sa), @@ -5634,12 +5663,19 @@ jdf_generate_code_call_initialization(const jdf_t *jdf, const jdf_call_t *call, /* Code to create & fulfill a reshape promise locally in case this input dependency is typed */ jdf_generate_code_reshape_input_from_dep(jdf, f, flow, dl, spaces); - coutput("%s this_task->data._f_%s.data_out = parsec_data_get_copy(chunk->original, target_device);\n" - "#if defined(PARSEC_PROF_GRAPHER) && defined(PARSEC_PROF_TRACE)\n" + /* TODO: Setting the data_out here is kind of random, especially as some copy of the input flow. The only thing + * that would make sense here is to set the data_out to the dep outputs back into the user memory (output + * dep with a target into a data collection), to give the opportunity to the accelerator components to + * do a pushout to the desired location (instead of the current approach that will do a pushout to the + * data_copy on device 0 followed by a memcpy into the desired location). + */ + //coutput("%s this_task->data._f_%s.data_out = parsec_data_get_copy(chunk->original, target_device);\n", + // spaces, flow->varname); + + coutput("#if defined(PARSEC_PROF_GRAPHER) && defined(PARSEC_PROF_TRACE)\n" "%s parsec_prof_grapher_data_input(chunk->original, (parsec_task_t*)this_task, &%s, 0);\n" "#endif\n" "%s }\n", - spaces, flow->varname, spaces, JDF_OBJECT_ONAME( flow ), spaces); } @@ -5713,7 +5749,7 @@ static void jdf_generate_code_call_init_output(const jdf_t *jdf, const jdf_call_ spaces, flow->varname, spaces); - coutput("%s chunk = parsec_arena_get_copy(%s->arena, %s, target_device, %s);\n" + coutput("%s chunk = parsec_arena_get_new_copy(%s->arena, %s, target_device, %s);\n" "%s chunk->original->owner_device = target_device;\n", spaces, string_arena_get_string(sa_arena), string_arena_get_string(sa_count), string_arena_get_string(sa_datatype), spaces); @@ -6497,10 +6533,10 @@ jdf_generate_code_data_lookup(const jdf_t *jdf, * This way, it's only retained once during release_deps. */ coutput(" if( NULL == this_task->repo_entry ){\n" - " this_task->repo_entry = data_repo_lookup_entry_and_create(es, %s_repo, " + " this_task->repo_entry = data_repo_lookup_entry_and_create(es, %s_repo, \n" " %s((const parsec_taskpool_t*)__parsec_tp, (const parsec_assignment_t*)&this_task->locals));\n" - " data_repo_entry_addto_usage_limit(%s_repo, this_task->repo_entry->ht_item.key, 1);" - " this_task->repo_entry ->generator = (void*)this_task; /* for AYU */\n" + " data_repo_entry_addto_usage_limit(%s_repo, this_task->repo_entry->ht_item.key, 1);\n" + " this_task->repo_entry->generator = (void*)this_task; /* for AYU */\n" "#if defined(PARSEC_SIM)\n" " assert(this_task->repo_entry ->sim_exec_date == 0);\n" " this_task->repo_entry ->sim_exec_date = this_task->sim_exec_date;\n" @@ -6510,7 +6546,7 @@ jdf_generate_code_data_lookup(const jdf_t *jdf, jdf_property_get_string(f->properties, JDF_PROP_UD_MAKE_KEY_FN_NAME, NULL), f->fname); - coutput(" /* The reshape repo is the current task repo. */" + coutput(" /* The reshape repo is the current task repo. 
*/\n" " reshape_repo = %s_repo;\n" " reshape_entry_key = %s((const parsec_taskpool_t*)__parsec_tp, (const parsec_assignment_t*)&this_task->locals) ;\n" " reshape_entry = this_task->repo_entry;\n", @@ -6731,8 +6767,8 @@ static void jdf_generate_code_hook_gpu(const jdf_t *jdf, coutput(" /* Pointer to dynamic gpu function */\n" " {\n" " int chore_idx = 0;\n" - " for ( ; PARSEC_DEV_NONE != this_task->task_class->incarnations[chore_idx].type; ++chore_idx) {\n" - " if (this_task->task_class->incarnations[chore_idx].type == PARSEC_DEV_%s) break;\n" + " for ( ; PARSEC_DEV_NONE != (this_task->task_class->incarnations[chore_idx].type & PARSEC_DEV_ANY_TYPE); ++chore_idx) {\n" + " if (this_task->task_class->incarnations[chore_idx].type & PARSEC_DEV_%s) break;\n" " }\n" " /* The void* cast prevents the compiler from complaining about the type change */\n" " parsec_body.dyld_fn = (%s)(void*)this_task->task_class->incarnations[chore_idx].dyld_fn;\n" @@ -6983,7 +7019,7 @@ static void jdf_generate_code_hook(const jdf_t *jdf, coutput("#if defined(PARSEC_HAVE_DEV_%s_SUPPORT)\n", type_upper); if( NULL != type_property) { - if (!strcasecmp(type_property->expr->jdf_var, "cuda") + if (!strncasecmp(type_property->expr->jdf_var, "cuda", 4) /* for batched */ || !strcasecmp(type_property->expr->jdf_var, "hip")) { jdf_generate_code_hook_gpu(jdf, f, body, name); goto hook_end_block; @@ -7018,6 +7054,12 @@ static void jdf_generate_code_hook(const jdf_t *jdf, output = UTIL_DUMP_LIST(sa, f->dataflow, next, dump_data_initialization_from_data_array, &ai2, "", "", "", ""); if( 0 != strlen(output) ) { + coutput("/* Make sure we have the data_out set to the data_in */\n"); + for( fl = f->dataflow; fl != NULL; fl = fl->next) { + if( fl->flow_flags & JDF_FLOW_TYPE_CTL ) continue; + coutput(" this_task->data._f_%s.data_out = this_task->data._f_%s.data_in;\n", + fl->varname, fl->varname); + } coutput(" /** Declare the variables that will hold the data, and all the accounting for each */\n" "%s\n", output); @@ -8413,14 +8455,14 @@ int jdf_force_termdet_dynamic(jdf_t* jdf) return rc; } - termdet_expr = (jdf_expr_t *)calloc(sizeof(jdf_expr_t), 1); + termdet_expr = (jdf_expr_t *)calloc(1, sizeof(jdf_expr_t)); termdet_expr->op = JDF_STRING; termdet_expr->local_variables = NULL; termdet_expr->scope = -1; termdet_expr->alias = NULL; termdet_expr->jdf_var = strdup(JDF_PROP_TERMDET_DYNAMIC); - property = (jdf_def_list_t*)calloc(sizeof(jdf_def_list_t), 1); + property = (jdf_def_list_t*)calloc(1, sizeof(jdf_def_list_t)); property->expr = termdet_expr; property->name = strdup(JDF_PROP_TERMDET_NAME); property->next = jdf->global_properties; diff --git a/parsec/maxheap.c b/parsec/maxheap.c index 46d2cf14a..d6dc7b3b4 100644 --- a/parsec/maxheap.c +++ b/parsec/maxheap.c @@ -25,7 +25,7 @@ static inline int hiBit(unsigned int n) parsec_heap_t* heap_create(void) { - parsec_heap_t* heap = calloc(sizeof(parsec_heap_t), 1); + parsec_heap_t* heap = calloc(1, sizeof(parsec_heap_t)); /* Point back to the parent structure */ heap->list_item.list_next = (parsec_list_item_t*)heap; heap->list_item.list_prev = (parsec_list_item_t*)heap; @@ -67,7 +67,7 @@ void heap_insert(parsec_heap_t * heap, parsec_task_t * elem) } parents_size = level_counter; - parsec_task_t ** parents = calloc(sizeof(parsec_task_t *), level_counter); + parsec_task_t ** parents = calloc(level_counter, sizeof(parsec_task_t *)); // now the bitmask is two places farther than we want it, so back down bitmask = bitmask >> 2; diff --git a/parsec/mca/device/CMakeLists.txt 
b/parsec/mca/device/CMakeLists.txt
index 85d23ba0d..0fef7a88a 100644
--- a/parsec/mca/device/CMakeLists.txt
+++ b/parsec/mca/device/CMakeLists.txt
@@ -14,6 +14,9 @@ set(PARSEC_HAVE_DEV_RECURSIVE_SUPPORT 0 CACHE BOOL "PaRSEC has support for Recu
 if(PARSEC_HAVE_CUDA)
   set(PARSEC_HAVE_DEV_CUDA_SUPPORT 1 CACHE BOOL "PaRSEC support for CUDA")
 endif(PARSEC_HAVE_CUDA)
+if(PARSEC_HAVE_CUDA_BATCH)
+  set(PARSEC_HAVE_DEV_CUDA_BATCH_SUPPORT 1 CACHE BOOL "PaRSEC support for batched CUDA")
+endif(PARSEC_HAVE_CUDA_BATCH)
 if(PARSEC_HAVE_HIP)
   set(PARSEC_HAVE_DEV_HIP_SUPPORT 1 CACHE BOOL "PaRSEC support for HIP")
 endif(PARSEC_HAVE_HIP)
diff --git a/parsec/mca/device/cuda/device_cuda_component.c b/parsec/mca/device/cuda/device_cuda_component.c
index 4dc516e05..2e3582a4a 100644
--- a/parsec/mca/device/cuda/device_cuda_component.c
+++ b/parsec/mca/device/cuda/device_cuda_component.c
@@ -40,7 +40,8 @@ int parsec_cuda_max_streams = PARSEC_GPU_MAX_STREAMS;
 int parsec_cuda_memory_block_size, parsec_cuda_memory_percentage, parsec_cuda_memory_number_of_blocks;
 char* parsec_cuda_lib_path = NULL;

-static int cuda_mask;
+static int parsec_device_cuda_mask = 0xFF;
+static int parsec_device_cuda_avail = 0;
 static int parsec_cuda_sort_pending;

 #if defined(PARSEC_PROF_TRACE)
@@ -104,10 +105,10 @@ static int device_cuda_component_query(mca_base_module_t **module, int *priority
     else
         parsec_device_cuda_component.modules = NULL;

-    for( i = j = 0; i < parsec_device_cuda_enabled; i++ ) {
+    for( i = j = 0; (i < parsec_device_cuda_avail) && (j < parsec_device_cuda_enabled); i++ ) {

         /* Allow fine grain selection of the GPU's */
-        if( !((1 << i) & cuda_mask) ) continue;
+        if( !((1 << i) & parsec_device_cuda_mask) ) continue;

         rc = parsec_cuda_module_init(i, &parsec_device_cuda_component.modules[j]);
         if( PARSEC_SUCCESS != rc ) {
@@ -139,7 +140,7 @@ static int device_cuda_component_register(void)
                                         false, false, -1, &parsec_device_cuda_enabled);
     (void)parsec_mca_param_reg_int_name("device_cuda", "mask",
                                         "The bitwise mask of CUDA devices to be enabled (default all)",
-                                        false, false, 0xffffffff, &cuda_mask);
+                                        false, false, 0xffffffff, &parsec_device_cuda_mask);
     (void)parsec_mca_param_reg_int_name("device_cuda", "nvlink_mask",
                                         "What devices are allowed to use NVLINK if available (default all)",
                                         false, false, 0xffffffff, &parsec_cuda_nvlink_mask);
@@ -185,15 +186,14 @@ static int device_cuda_component_register(void)
 static int device_cuda_component_open(void)
 {
     cudaError_t cudastatus;
-    int ndevices;

     if( 0 == parsec_device_cuda_enabled ) {
         return MCA_ERROR;  /* Nothing to do around here */
     }

-    cudastatus = cudaGetDeviceCount( &ndevices );
+    cudastatus = cudaGetDeviceCount(&parsec_device_cuda_avail);
     if( cudaErrorNoDevice == (cudaError_t) cudastatus ) {
-        ndevices = 0;
+        parsec_device_cuda_avail = 0;
         /* This is normal on machines with no GPUs, let it flow
          * to do the normal checks vis-a-vis the number of requested
          * devices and issue a warning only when not fulfilling
@@ -208,31 +208,22 @@ static int device_cuda_component_open(void)
             } );
     }

-    if( ndevices > parsec_device_cuda_enabled ) {
-        if( 0 < parsec_device_cuda_enabled ) {
-            ndevices = parsec_device_cuda_enabled;
-        }
-    } else if (ndevices < parsec_device_cuda_enabled ) {
+    /* Update the number of GPU for the upper layer */
+    if (parsec_device_cuda_avail < parsec_device_cuda_enabled ) {
         if( 0 < parsec_device_cuda_enabled ) {
-            if( 0 == ndevices ) {
+            if( 0 == parsec_device_cuda_avail ) {
                 parsec_warning("User requested %d CUDA devices, but none are available on %s."
                                " CUDA support will be therefore disabled.",
                                parsec_device_cuda_enabled, parsec_hostname);
             } else {
                 parsec_warning("User requested %d CUDA devices, but only %d are available on %s.",
-                               parsec_device_cuda_enabled, ndevices, parsec_hostname);
+                               parsec_device_cuda_enabled, parsec_device_cuda_avail, parsec_hostname);
             }
-            parsec_mca_param_set_int(parsec_device_cuda_enabled_index, ndevices);
         }
+        parsec_mca_param_set_int(parsec_device_cuda_enabled_index, parsec_device_cuda_avail);
     }

-    /* Update the number of GPU for the upper layer */
-    parsec_device_cuda_enabled = ndevices;
-    if( 0 == ndevices ) {
-        return MCA_ERROR;
-    }
-
-    return MCA_SUCCESS;
+    return (0 == parsec_device_cuda_avail) ? MCA_ERROR : MCA_SUCCESS;
 }

 /**
@@ -276,7 +267,7 @@ static int device_cuda_component_close(void)
     /* Check that no CUDA devices are still registered with PaRSEC */
     for(i = 0; i < parsec_mca_device_enabled(); i++) {
         if( NULL == (cdev = (parsec_device_cuda_module_t*)parsec_mca_device_get(i)) ) continue;
-        if(PARSEC_DEV_CUDA != cdev->super.super.type) continue;
+        if(!(PARSEC_DEV_CUDA & cdev->super.super.type)) continue;

         PARSEC_DEBUG_VERBOSE(0, parsec_gpu_output_stream,
                              "GPU[%d:%s] CUDA device %d still registered with PaRSEC at the end of CUDA finalize.\n"
diff --git a/parsec/mca/device/cuda/device_cuda_module.c b/parsec/mca/device/cuda/device_cuda_module.c
index 312775719..d88a01ee5 100644
--- a/parsec/mca/device/cuda/device_cuda_module.c
+++ b/parsec/mca/device/cuda/device_cuda_module.c
@@ -573,9 +573,9 @@ parsec_cuda_module_init( int dev_id, parsec_device_module_t** module )
     gpu_device->find_incarnation = parsec_cuda_find_incarnation;

     if( PARSEC_SUCCESS != parsec_device_memory_reserve(gpu_device,
-                                                        parsec_cuda_memory_percentage,
-                                                        parsec_cuda_memory_number_of_blocks,
-                                                        parsec_cuda_memory_block_size) ) {
+                                                       parsec_cuda_memory_percentage,
+                                                       parsec_cuda_memory_number_of_blocks,
+                                                       parsec_cuda_memory_block_size) ) {
         goto release_device;
     }

diff --git a/parsec/mca/device/device.c b/parsec/mca/device/device.c
index d7ea17a99..25436f26e 100644
--- a/parsec/mca/device/device.c
+++ b/parsec/mca/device/device.c
@@ -107,7 +107,7 @@ int parsec_select_best_device( parsec_task_t* this_task ) {
     /* Run the evaluates for the incarnation types to determine if they can
      * execute this task */
-    for(chore_id = 0; PARSEC_DEV_NONE != tc->incarnations[chore_id].type; chore_id++) {
+    for(chore_id = 0; PARSEC_DEV_NONE != (tc->incarnations[chore_id].type & PARSEC_DEV_ANY_TYPE); chore_id++) {
         if( 0 == (this_task->chore_mask & (1<<chore_id)) ) continue;
         if( NULL == tc->incarnations[chore_id].hook ) continue;  /* dyld hook not found during initialization */
@@ -116,7 +116,7 @@ int parsec_select_best_device( parsec_task_t* this_task ) {
             if( PARSEC_HOOK_RETURN_DONE != rc ) {
                 if( PARSEC_HOOK_RETURN_NEXT != rc ) {
                     PARSEC_DEBUG_VERBOSE(5, parsec_device_output, "Failed to evaluate %s[%d] chore %d",
-                                         tmp, tc->incarnations[chore_id].type,
+                                         tmp, tc->incarnations[chore_id].type & PARSEC_DEV_ANY_TYPE,
                                          chore_id);
                 }
                 /* Mark this chore as tested */
@@ -124,7 +124,7 @@ int parsec_select_best_device( parsec_task_t* this_task ) {
                 continue;
             }
         }
-        valid_types |= tc->incarnations[chore_id].type;  /* the eval accepted the type, but no device specified yet */
+        valid_types |= (tc->incarnations[chore_id].type & PARSEC_DEV_ANY_TYPE);  /* the eval accepted the type, but no device specified yet */
         /* Evaluate may have picked a device, abide by it */
         if( NULL != this_task->selected_device ) {
             assert( this_task->selected_device->type & valid_types );
@@ -140,7 +140,7 @@ int parsec_select_best_device( parsec_task_t*
this_task ) { if (PARSEC_DEV_CPU == valid_types) { /* shortcut for CPU only tasks */ this_task->selected_device = dev = parsec_mca_device_get(0); this_task->load = 0; - for(chore_id = 0; tc->incarnations[chore_id].type != PARSEC_DEV_CPU; chore_id++); + for(chore_id = 0; !(tc->incarnations[chore_id].type & PARSEC_DEV_CPU); chore_id++); this_task->selected_chore = chore_id; PARSEC_DEBUG_VERBOSE(80, parsec_device_output, "%s: Task %s cpu-only task set selected_device %d:%s", __func__, tmp, dev->device_index, dev->name); @@ -226,7 +226,7 @@ int parsec_select_best_device( parsec_task_t* this_task ) { /* Skip the device if no incarnations for its type */ if(!(dev->type & valid_types)) continue; /* Skip recursive devices: time estimates are computed on the associated CPU device */ - if(dev->type == PARSEC_DEV_RECURSIVE) continue; + if(dev->type & PARSEC_DEV_RECURSIVE) continue; eta = dev->device_load + time_estimate(this_task, dev); if( best_eta > eta ) { @@ -244,14 +244,14 @@ int parsec_select_best_device( parsec_task_t* this_task ) { goto no_valid_device; this_task->selected_device = parsec_mca_device_get(best_index); - assert( this_task->selected_device->type != PARSEC_DEV_RECURSIVE ); + assert( !(this_task->selected_device->type & PARSEC_DEV_RECURSIVE) ); } device_selected: dev = this_task->selected_device; assert( NULL != dev ); assert( tp->devices_index_mask & (1 << dev->device_index) ); - for(chore_id = 0; tc->incarnations[chore_id].type != dev->type; chore_id++) + for(chore_id = 0; !(tc->incarnations[chore_id].type & dev->type); chore_id++) assert(PARSEC_DEV_NONE != tc->incarnations[chore_id].type /* we have selected this device, so there *must* be an incarnation that matches */); this_task->selected_chore = chore_id; this_task->load = time_estimate(this_task, dev); @@ -408,7 +408,7 @@ void parsec_compute_best_unit( uint64_t length, float* updated_value, char** bes void parsec_devices_save_statistics(uint64_t **pstats) { if(NULL == *pstats) { - *pstats = (uint64_t*)calloc(sizeof(uint64_t), parsec_nb_devices * 6 /* see below for the number of arrays */); + *pstats = (uint64_t*)calloc(parsec_nb_devices * 6 /* see below for the number of arrays */, sizeof(uint64_t)); } else { memset(*pstats, 0, parsec_nb_devices * sizeof(uint64_t) * 6); @@ -740,8 +740,8 @@ int parsec_mca_device_registration_complete(parsec_context_t* context) for( uint32_t i = 0; i < parsec_nb_devices; i++ ) { parsec_device_module_t* device = parsec_devices[i]; if( NULL == device ) continue; - if( PARSEC_DEV_RECURSIVE == device->type ) continue; - if( PARSEC_DEV_CPU == device->type ) { + if( PARSEC_DEV_RECURSIVE & device->type ) continue; + if( PARSEC_DEV_CPU & device->type ) { c = 0; for(int p = 0; p < context->nb_vp; p++) c += context->virtual_processes[p]->nb_cores; @@ -760,7 +760,7 @@ int parsec_mca_device_registration_complete(parsec_context_t* context) for( uint32_t i = 0; i < parsec_nb_devices; i++ ) { parsec_device_module_t* device = parsec_devices[i]; if( NULL == device ) continue; - if( PARSEC_DEV_RECURSIVE == device->type ) continue; + if( PARSEC_DEV_RECURSIVE & device->type ) continue; device->time_estimate_default = total_gflops_fp64/(double)device->gflops_fp64; parsec_debug_verbose(6, parsec_device_output, " Dev[%d] default-time-estimate %-4"PRId64" <- double %-8"PRId64" single %-8"PRId64" tensor %-8"PRId64" half %-8"PRId64" %s", i, device->time_estimate_default, device->gflops_fp64, device->gflops_fp32, device->gflops_tf32, device->gflops_fp16, device->gflops_guess? 
"GUESSED": ""); @@ -925,7 +925,7 @@ device_taskpool_register_static(parsec_device_module_t* device, parsec_taskpool_ continue; __parsec_chore_t* chores = (__parsec_chore_t*)tc->incarnations; for( j = 0; NULL != chores[j].hook; j++ ) { - if( chores[j].type != device->type ) + if( !(chores[j].type & device->type) ) continue; if( NULL != chores[j].dyld_fn ) { continue; /* the function has been set for another device of the same type */ diff --git a/parsec/mca/device/device.h b/parsec/mca/device/device.h index bbd26cf00..d471eff9b 100644 --- a/parsec/mca/device/device.h +++ b/parsec/mca/device/device.h @@ -65,10 +65,13 @@ typedef struct parsec_device_base_component_2_0_0 parsec_device_base_component_t #define PARSEC_DEV_CUDA ((uint8_t)(1 << 2)) #define PARSEC_DEV_HIP ((uint8_t)(1 << 3)) #define PARSEC_DEV_LEVEL_ZERO ((uint8_t)(1 << 4)) +#define PARSEC_DEV_CUDA_BATCH ((uint8_t)(1 << 5)) #define PARSEC_DEV_TEMPLATE ((uint8_t)(1 << 7)) #define PARSEC_DEV_ANY_TYPE ((uint8_t) 0x3f) #define PARSEC_DEV_ALL ((uint8_t) 0x3f) #define PARSEC_DEV_MAX_NB_TYPE (7) +/* The following flags are extensions to the device type */ +#define PARSEC_DEV_CHORE_ALLOW_BATCH ((uint32_t)0x00000100) #define PARSEC_DEV_GPU_MASK (PARSEC_DEV_CUDA|PARSEC_DEV_HIP|PARSEC_DEV_LEVEL_ZERO) #define PARSEC_DEV_IS_GPU(t) (0 != ((t) & PARSEC_DEV_GPU_MASK)) diff --git a/parsec/mca/device/device_gpu.c b/parsec/mca/device/device_gpu.c index 24e40d5fd..27c043542 100644 --- a/parsec/mca/device/device_gpu.c +++ b/parsec/mca/device/device_gpu.c @@ -300,7 +300,7 @@ void parsec_device_dump_exec_stream(parsec_gpu_exec_stream_t* exec_stream) int i; parsec_debug_verbose(0, parsec_gpu_output_stream, - "Dev: GPU stream %d{%p} [events = %d, start = %d, end = %d, executed = %d]", + "Dev: GPU stream %s{%p} [events = %d, start = %d, end = %d, executed = %d]", exec_stream->name, exec_stream, exec_stream->max_events, exec_stream->start, exec_stream->end, exec_stream->executed); for( i = 0; i < exec_stream->max_events; i++ ) { @@ -321,12 +321,12 @@ void parsec_device_dump_gpu_state(parsec_device_gpu_module_t* gpu_device) data_in_dev += gpu_device->super.data_in_from_device[i]; } - parsec_output(parsec_gpu_output_stream, "\n\n"); - parsec_output(parsec_gpu_output_stream, "Device %d:%d (%p) epoch\n", gpu_device->super.device_index, - gpu_device->super.device_index, gpu_device, gpu_device->data_avail_epoch); - parsec_output(parsec_gpu_output_stream, "\tpeer mask %x executed tasks with %llu streams %d\n", - gpu_device->peer_access_mask, (unsigned long long)gpu_device->super.executed_tasks, gpu_device->num_exec_streams); - parsec_output(parsec_gpu_output_stream, "\tstats transferred [in: %llu from host %llu from other device out: %llu] required [in: %llu out: %llu]\n", + parsec_output(parsec_gpu_output_stream, + "\n\nDevice %s:%d (%p) epoch %zu\n" + "\tpeer mask %x executed tasks %llu streams %d\n" + "\tstats transferred [in: %llu from host %llu from other device out: %llu] required [in: %llu out: %llu]\n", + gpu_device->super.name, gpu_device->super.device_index, gpu_device, gpu_device->data_avail_epoch, + gpu_device->peer_access_mask, (unsigned long long)gpu_device->super.executed_tasks, gpu_device->num_exec_streams, (unsigned long long)data_in_host, (unsigned long long)data_in_dev, (unsigned long long)gpu_device->super.data_out_to_host, (unsigned long long)gpu_device->super.required_data_in, (unsigned long long)gpu_device->super.required_data_out); @@ -341,7 +341,7 @@ void parsec_device_dump_gpu_state(parsec_device_gpu_module_t* gpu_device) 
parsec_gpu_data_copy_t* gpu_copy = (parsec_gpu_data_copy_t*)item; parsec_output(parsec_gpu_output_stream, " %d. elem %p flags 0x%x GPU mem %p\n", i, gpu_copy, gpu_copy->flags, gpu_copy->device_private); - parsec_dump_data_copy(gpu_copy); + parsec_data_copy_dump(gpu_copy); i++; }); } @@ -353,7 +353,7 @@ void parsec_device_dump_gpu_state(parsec_device_gpu_module_t* gpu_device) parsec_gpu_data_copy_t* gpu_copy = (parsec_gpu_data_copy_t*)item; parsec_output(parsec_gpu_output_stream, " %d. elem %p flags 0x%x GPU mem %p\n", i, gpu_copy, gpu_copy->flags, gpu_copy->device_private); - parsec_dump_data_copy(gpu_copy); + parsec_data_copy_dump(gpu_copy); i++; }); } @@ -478,7 +478,7 @@ parsec_device_data_advise(parsec_device_module_t *dev, parsec_data_t *data, int PARSEC_DEBUG_VERBOSE(20, parsec_debug_output, "Retain data copy %p [ref_count %d]", data->device_copies[ data->owner_device ], data->device_copies[ data->owner_device ]->super.super.obj_reference_count); - PARSEC_OBJ_RETAIN(data->device_copies[ data->owner_device ]); + PARSEC_DATA_COPY_RETAIN(data->device_copies[ data->owner_device ]); gpu_task->ec->data[0].data_in = data->device_copies[ data->owner_device ]; gpu_task->ec->data[0].data_out = NULL; gpu_task->ec->data[0].source_repo_entry = NULL; @@ -526,7 +526,7 @@ parsec_device_taskpool_register(parsec_device_module_t* device, const parsec_task_class_t* tc = tp->task_classes_array[i]; __parsec_chore_t* chores = (__parsec_chore_t*)tc->incarnations; for( j = 0; NULL != chores[j].hook; j++ ) { - if( chores[j].type != device->type ) + if( !(chores[j].type & device->type) ) continue; if( NULL != chores[j].dyld_fn ) { /* the function has been set for another device of the same type */ @@ -614,9 +614,8 @@ parsec_device_memory_reserve( parsec_device_gpu_module_t* gpu_device, parsec_warning("GPU[%d:%s] Invalid argument: requesting 0 bytes of memory", gpu_device->super.device_index, gpu_device->super.name); return PARSEC_ERROR; - } else { - alloc_size = number_blocks * eltsize; } + alloc_size = number_blocks * eltsize; } else { /* number_blocks == -1 means memory_percentage is used */ alloc_size = (memory_percentage * initial_free_mem) / 100; @@ -668,7 +667,7 @@ parsec_device_memory_reserve( parsec_device_gpu_module_t* gpu_device, gpu_elem->flags |= PARSEC_DATA_FLAG_PARSEC_OWNED; gpu_elem->device_index = gpu_device->super.device_index; mem_elem_per_gpu++; - PARSEC_OBJ_RETAIN(gpu_elem); + PARSEC_DATA_COPY_RETAIN(gpu_elem); PARSEC_DEBUG_VERBOSE(20, parsec_gpu_output_stream, "GPU[%d:%s] Retain and insert GPU copy %p [ref_count %d] in LRU", gpu_device->super.device_index, gpu_device->super.name, gpu_elem, gpu_elem->super.obj_reference_count); @@ -731,12 +730,22 @@ static void parsec_device_memory_release_list(parsec_device_gpu_module_t* gpu_de gpu_device->super.device_index, gpu_device->super.name, gpu_copy, gpu_copy->device_private, gpu_copy->super.super.obj_reference_count, original, (NULL != original ? original->dc : NULL)); assert( gpu_copy->device_index == gpu_device->super.device_index ); - - if( PARSEC_DATA_COHERENCY_OWNED == gpu_copy->coherency_state ) { - parsec_warning("GPU[%d:%s] still OWNS the master memory copy for data %d and it is discarding it!", - gpu_device->super.device_index, gpu_device->super.name, original->key); + if( NULL == gpu_copy->device_private ) { + PARSEC_DEBUG_VERBOSE(35, parsec_gpu_output_stream, + "GPU[%d:%s] copy %p is dangling without private data. 
This is OK.", + gpu_device->super.device_index, gpu_device->super.name, (void*)gpu_copy); + goto release_and_continue; + } + if( NULL == gpu_copy->original ) { + PARSEC_DEBUG_VERBOSE(35, parsec_gpu_output_stream, + "GPU[%d:%s] copy %p detached from a data but not yet reclaimed!", + gpu_device->super.device_index, gpu_device->super.name, (void*)gpu_copy); + } + if (PARSEC_DATA_COHERENCY_OWNED == gpu_copy->coherency_state) { + parsec_warning("GPU[%d:%s] still OWNS the master memory copy for data %d (%p) and it is discarding it!", + gpu_device->super.device_index, gpu_device->super.name, original->key, (void*)gpu_copy->device_private); } - assert(0 != (gpu_copy->flags & PARSEC_DATA_FLAG_PARSEC_OWNED) ); + assert(0 != (gpu_copy->flags & PARSEC_DATA_FLAG_PARSEC_OWNED)); #if defined(PARSEC_GPU_ALLOC_PER_TILE) gpu_device->memory_free( gpu_copy->device_private ); @@ -760,11 +769,12 @@ static void parsec_device_memory_release_list(parsec_device_gpu_module_t* gpu_de #endif gpu_copy->device_private = NULL; + release_and_continue: /* At this point the data copies should have no attachment to a data_t. Thus, * before we get here (aka below parsec_fini), the destructor of the data * collection must have been called, releasing all the copies. */ - PARSEC_OBJ_RELEASE(gpu_copy); assert(NULL == gpu_copy); + PARSEC_DATA_COPY_RELEASE(gpu_copy); assert(NULL == gpu_copy); } } @@ -824,6 +834,87 @@ parsec_device_memory_release( parsec_device_gpu_module_t* gpu_device ) return PARSEC_SUCCESS; } +int +parsec_device_get_copy( parsec_device_gpu_module_t* gpu_device, parsec_data_copy_t** dc ) +{ + char task_name[] = "unknown"; + parsec_gpu_data_copy_t *gpu_mem_lru_cycling = NULL, *lru_gpu_elem; + /* Get the head of the LRU, assuming it has no readers and mark it as used, using the same mechanism as + * the GPU to GPU tranfers. Once the communication into this copy completes, the task will get into + * the GPU queues, and the data will be reattributed accordingly to this GPU. + */ + find_another_data: + lru_gpu_elem = (parsec_gpu_data_copy_t*)parsec_list_pop_front(&gpu_device->gpu_mem_lru); + if( NULL == lru_gpu_elem ) { + /* nothing available on the GPU. Let the upper level know about this */ + *dc = NULL; + return PARSEC_ERR_OUT_OF_RESOURCE; + } + if( 0 != lru_gpu_elem->readers ) { + PARSEC_DEBUG_VERBOSE(20, parsec_gpu_output_stream, + "GPU[%d:%s]:%s: Drop LRU-retrieved GPU copy %p [readers %d, ref_count %d] original %p", + gpu_device->super.device_index, gpu_device->super.name, task_name, + lru_gpu_elem, lru_gpu_elem->readers, lru_gpu_elem->super.super.obj_reference_count, lru_gpu_elem->original); + /* We do not add the copy back into the LRU. This means that for now this copy is not + * tracked via the LRU (despite being only used in read mode) and instead is dangling + * on other tasks. Thus, it will eventually need to be added back into the LRU when + * current task using it completes. + */ + goto find_another_data; + } + /* It's also possible that the ref_count of that element is bigger than 1 + * In that case, it's because some task completion did not execute yet, and + * we need to keep it in the list until it reaches 1. + */ + if( lru_gpu_elem->super.super.obj_reference_count > 1 ) { + /* It's also possible (although unlikely) that we livelock here: + * if gpu_mem_lru has *only* elements with readers == 0 but + * ref_count > 1, then we might pop/push forever. We save the + * earliest element found and if we see it again it means we + * run over the entire list without finding a suitable replacement. 
+ * We need to make progress on something else. This remains safe for as long as the + * LRU is only modified by a single thread (in this case the current thread). + */ + PARSEC_DEBUG_VERBOSE(20, parsec_gpu_output_stream, + "GPU[%d:%s]:%s: Push back LRU-retrieved GPU copy %p [readers %d, ref_count %d] original %p", + gpu_device->super.device_index, gpu_device->super.name, task_name, + lru_gpu_elem, lru_gpu_elem->readers, lru_gpu_elem->super.super.obj_reference_count, lru_gpu_elem->original); + assert(0 != (lru_gpu_elem->flags & PARSEC_DATA_FLAG_PARSEC_OWNED) ); + parsec_list_push_back(&gpu_device->gpu_mem_lru, &lru_gpu_elem->super); + goto find_another_data; + } + if( gpu_mem_lru_cycling == lru_gpu_elem ) { + PARSEC_DEBUG_VERBOSE(2, parsec_gpu_output_stream, + "GPU[%d:%s]: Cycle detected on allocating memory for %s", + gpu_device->super.device_index, gpu_device->super.name, task_name); + *dc = NULL; /* did our best but failed to find a data. Return and allocate it onto another device. */ + return PARSEC_ERR_OUT_OF_RESOURCE; + } + /* detect cycles to have an opportunity to stop */ + gpu_mem_lru_cycling = (NULL == gpu_mem_lru_cycling) ? lru_gpu_elem : gpu_mem_lru_cycling; /* update the cycle detector */ + + parsec_data_t* master = lru_gpu_elem->original; + if (NULL == master ) { + /* This copy has been detached by the CPU once it has been consumed (by the communication engine), + * there is no device memory associated with it, we can safely release the CPU copy. + */ + assert(1 == lru_gpu_elem->super.super.obj_reference_count); + PARSEC_OBJ_RELEASE(lru_gpu_elem); + goto find_another_data; + } + parsec_atomic_lock(&master->lock); + if ( lru_gpu_elem->data_transfer_status == PARSEC_DATA_STATUS_UNDER_TRANSFER ) { + /* can't reuse, it is drained right now by another device */ + parsec_atomic_unlock(&master->lock); + goto find_another_data; + } + parsec_data_copy_detach(master, lru_gpu_elem, gpu_device->super.device_index); + parsec_atomic_wmb(); + *dc = lru_gpu_elem; + parsec_atomic_unlock(&master->lock); + return PARSEC_SUCCESS; +} + /** * Try to find memory space to move all data on the GPU. We attach a device_elem to * a memory_elem as soon as a device_elem is available. If we fail to find enough @@ -862,7 +953,10 @@ parsec_device_data_reserve_space( parsec_device_gpu_module_t* gpu_device, assert( flow && (flow->flow_index == i) ); /* Skip CTL flows only */ - if(PARSEC_FLOW_ACCESS_NONE == (PARSEC_FLOW_ACCESS_MASK & flow->flow_flags)) continue; + if(PARSEC_FLOW_ACCESS_NONE == (PARSEC_FLOW_ACCESS_MASK & flow->flow_flags)) { + gpu_task->flow_nb_elts[i] = 0; /* assume there is nothing to transfer to the GPU */ + continue; + } PARSEC_DEBUG_VERBOSE(20, parsec_gpu_output_stream, "GPU[%d:%s]:%s: Investigating flow %s:%d", @@ -870,7 +964,16 @@ parsec_device_data_reserve_space( parsec_device_gpu_module_t* gpu_device, temp_loc[i] = NULL; if (this_task->data[i].data_in == NULL) continue; - + /* if the input data is already on this device there is nothing else to do */ + if( gpu_device->super.device_index == this_task->data[i].data_in->device_index ) { + PARSEC_DEBUG_VERBOSE(20, parsec_gpu_output_stream, + "GPU[%d:%s]:%s: Flow %s:%i was already on the device %p%s", + gpu_device->super.device_index, gpu_device->super.name, task_name, + flow->name, i, gpu_elem, + this_task->data[i].data_in->data_transfer_status == PARSEC_DATA_STATUS_UNDER_TRANSFER ? 
" [in transfer]" : ""); + this_task->data[i].data_out = this_task->data[i].data_in; + continue; + } master = this_task->data[i].data_in->original; parsec_atomic_lock(&master->lock); gpu_elem = PARSEC_DATA_GET_COPY(master, gpu_device->super.device_index); @@ -905,6 +1008,7 @@ parsec_device_data_reserve_space( parsec_device_gpu_module_t* gpu_device, copy_readers_update = 0; assert(0 != (gpu_elem->flags & PARSEC_DATA_FLAG_PARSEC_OWNED) ); gpu_elem->device_private = zone_malloc(gpu_device->memory, gpu_task->flow_nb_elts[i]); + gpu_elem->arena_chunk = (parsec_arena_chunk_t *)gpu_device->memory; if( NULL == gpu_elem->device_private ) { #endif @@ -926,6 +1030,7 @@ parsec_device_data_reserve_space( parsec_device_gpu_module_t* gpu_device, for( j = 0; j <= i; j++ ) { /* This flow could be a control flow */ if( NULL == temp_loc[j] ) continue; + this_task->data[j].data_out = NULL; /* reset the data out */ /* This flow could be non-parsec-owned, in which case we can't reclaim it */ if( 0 == (temp_loc[j]->flags & PARSEC_DATA_FLAG_PARSEC_OWNED) ) continue; PARSEC_DEBUG_VERBOSE(20, parsec_gpu_output_stream, @@ -936,9 +1041,12 @@ parsec_device_data_reserve_space( parsec_device_gpu_module_t* gpu_device, parsec_list_push_front(&gpu_device->gpu_mem_lru, (parsec_list_item_t*)temp_loc[j]); } #if !defined(PARSEC_GPU_ALLOC_PER_TILE) - PARSEC_OBJ_RELEASE(gpu_elem); + PARSEC_DATA_COPY_RELEASE(gpu_elem); #endif parsec_atomic_unlock(&master->lock); + if( data_avail_epoch ) { /* update the memory epoch */ + gpu_device->data_avail_epoch++; + } return PARSEC_HOOK_RETURN_AGAIN; } @@ -1093,7 +1201,7 @@ parsec_device_data_reserve_space( parsec_device_gpu_module_t* gpu_device, "GPU[%d:%s]:%s: Release LRU-retrieved GPU copy %p [ref_count %d: must be 1]", gpu_device->super.device_index, gpu_device->super.name, task_name, lru_gpu_elem, lru_gpu_elem->super.super.obj_reference_count); - PARSEC_OBJ_RELEASE(lru_gpu_elem); + PARSEC_DATA_COPY_RELEASE(lru_gpu_elem); assert( NULL == lru_gpu_elem ); goto malloc_data; } @@ -1165,8 +1273,8 @@ parsec_default_gpu_stage_in(parsec_gpu_task_t *gtask, parsec_gpu_exec_stream_t *gpu_stream) { int ret; - parsec_data_copy_t * source; - parsec_data_copy_t * dest; + parsec_data_copy_t * src_copy; + parsec_data_copy_t * dst_copy; parsec_device_gpu_module_t *src_dev; parsec_device_gpu_module_t *dst_dev; parsec_task_t *task = gtask->ec; @@ -1175,25 +1283,24 @@ parsec_default_gpu_stage_in(parsec_gpu_task_t *gtask, for(int i = 0; i < task->task_class->nb_flows; i++) { if( !(flow_mask & (1U << i)) ) continue; - source = gtask->sources[i]; - dest = task->data[i].data_out; - src_dev = (parsec_device_gpu_module_t*)parsec_mca_device_get(source->device_index); - dst_dev = (parsec_device_gpu_module_t*)parsec_mca_device_get(dest->device_index); + src_copy = gtask->sources[i]; + dst_copy = task->data[i].data_out; + src_dev = (parsec_device_gpu_module_t *)parsec_mca_device_get(src_copy->device_index); + dst_dev = (parsec_device_gpu_module_t *)parsec_mca_device_get(dst_copy->device_index); - if(src_dev->super.type == dst_dev->super.type) { + if((src_dev->super.type & PARSEC_DEV_ANY_TYPE) == (dst_dev->super.type & PARSEC_DEV_ANY_TYPE)) { assert( src_dev->peer_access_mask & (1 << dst_dev->super.device_index) ); dir = parsec_device_gpu_transfer_direction_d2d; } else { dir = parsec_device_gpu_transfer_direction_h2d; } - count = (source->original->nb_elts <= dest->original->nb_elts) ? 
- source->original->nb_elts : dest->original->nb_elts; - ret = dst_dev->memcpy_async( dst_dev, gpu_stream, - dest->device_private, - source->device_private, - count, - dir ); + count = (src_copy->original->nb_elts <= dst_copy->original->nb_elts) ? src_copy->original->nb_elts : dst_copy->original->nb_elts; + ret = dst_dev->memcpy_async(dst_dev, gpu_stream, + dst_copy->device_private, + src_copy->device_private, + count, + dir); if(PARSEC_SUCCESS != ret) return PARSEC_HOOK_RETURN_ERROR; } @@ -1215,8 +1322,8 @@ parsec_default_gpu_stage_out(parsec_gpu_task_t *gtask, parsec_gpu_exec_stream_t *gpu_stream) { int ret; - parsec_data_copy_t * source; - parsec_data_copy_t * dest; + parsec_data_copy_t * src_copy; + parsec_data_copy_t * dst_copy; parsec_device_gpu_module_t *dst_dev, *src_dev; parsec_task_t *task = gtask->ec; size_t count; @@ -1224,22 +1331,22 @@ parsec_default_gpu_stage_out(parsec_gpu_task_t *gtask, int i; for(i = 0; i < task->task_class->nb_flows; i++){ if(flow_mask & (1U << i)){ - source = task->data[i].data_out; - dest = source->original->device_copies[0]; - dst_dev = (parsec_device_gpu_module_t*)parsec_mca_device_get(dest->device_index); - src_dev = (parsec_device_gpu_module_t*)parsec_mca_device_get(source->device_index); - - count = (source->original->nb_elts <= dest->original->nb_elts) ? source->original->nb_elts : - dest->original->nb_elts; - if( src_dev->super.type == dst_dev->super.type ) { + src_copy = task->data[i].data_out; + dst_copy = src_copy->original->device_copies[0]; + dst_dev = (parsec_device_gpu_module_t*)parsec_mca_device_get(dst_copy->device_index); + src_dev = (parsec_device_gpu_module_t*)parsec_mca_device_get(src_copy->device_index); + assert(dst_copy->data_transfer_status != PARSEC_DATA_STATUS_UNDER_TRANSFER); + dst_copy->data_transfer_status = PARSEC_DATA_STATUS_UNDER_TRANSFER; + count = (src_copy->original->nb_elts <= dst_copy->original->nb_elts) ? 
src_copy->original->nb_elts : dst_copy->original->nb_elts; + if( (src_dev->super.type & PARSEC_DEV_ANY_TYPE) == (dst_dev->super.type & PARSEC_DEV_ANY_TYPE) ) { assert( src_dev->peer_access_mask & (1 << dst_dev->super.device_index) ); dir = parsec_device_gpu_transfer_direction_d2d; } else { dir = parsec_device_gpu_transfer_direction_d2h; } ret = src_dev->memcpy_async( src_dev, gpu_stream, - dest->device_private, - source->device_private, + dst_copy->device_private, + src_copy->device_private, count, dir ); if(PARSEC_SUCCESS != ret) { @@ -1277,7 +1384,24 @@ parsec_device_data_stage_in( parsec_device_gpu_module_t* gpu_device, "GPU[%d:%s]: Prefetch task %p is staging in", gpu_device->super.device_index, gpu_device->super.name, gpu_task); } - + if( gpu_elem == candidate ) { /* data already located in the right place */ + if( candidate->device_index == gpu_device->super.device_index ) { + /* the candidate is already located on the GPU, no transfer should be necessary but let's do the bookkeeping */ + if( (PARSEC_FLOW_ACCESS_WRITE & type) && (gpu_task->task_type != PARSEC_GPU_TASK_TYPE_PREFETCH) ) { + candidate->version++; + parsec_list_item_ring_chop((parsec_list_item_t *)candidate); + PARSEC_LIST_ITEM_SINGLETON(candidate); + } + if( PARSEC_FLOW_ACCESS_READ & type ) { + parsec_atomic_fetch_add_int32(&candidate->readers, 1); + } + return PARSEC_HOOK_RETURN_DONE; + } + parsec_warning("GPU[%d:%s]:\t device_data_stage_in without a proper data_out on the device " + "and with a data_in (%p) located on another device %d", + gpu_device->super.device_index, gpu_device->super.name, + candidate, candidate->device_index); + } parsec_atomic_lock( &original->lock ); gpu_task->sources[flow->flow_index] = candidate; /* default source for the transfer */ @@ -1352,7 +1476,7 @@ parsec_device_data_stage_in( parsec_device_gpu_module_t* gpu_device, PARSEC_DEBUG_VERBOSE(30, parsec_gpu_output_stream, "GPU[%d:%s]:\tSelecting candidate data copy %p [ref_count %d] on data %p", gpu_device->super.device_index, gpu_device->super.name, task_data->data_in, task_data->data_in->super.super.obj_reference_count, original); - if( gpu_device->super.type == candidate_dev->super.type ) { + if( (gpu_device->super.type & PARSEC_DEV_ANY_TYPE) == (candidate_dev->super.type & PARSEC_DEV_ANY_TYPE) ) { if( gpu_device->peer_access_mask & (1 << candidate_dev->super.device_index) ) { /* We can directly do D2D, so let's skip the selection */ PARSEC_DEBUG_VERBOSE(30, parsec_gpu_output_stream, @@ -1502,7 +1626,8 @@ parsec_device_data_stage_in( parsec_device_gpu_module_t* gpu_device, gpu_device->super.device_index, gpu_device->super.name, rc, __func__, __LINE__, candidate->device_private, candidate_dev->super.device_index, candidate_dev->super.name, gpu_elem->device_private, gpu_device->super.device_index, gpu_device->super.name, - nb_elts, (candidate_dev->super.type != gpu_device->super.type)? "H2D": "D2D"); + nb_elts, + (candidate_dev->super.type & gpu_device->super.type & PARSEC_DEV_ANY_TYPE)? 
"D2D": "H2D"); parsec_atomic_unlock( &original->lock ); assert(0); return PARSEC_HOOK_RETURN_ERROR; @@ -1647,6 +1772,7 @@ parsec_device_callback_complete_push(parsec_device_gpu_module_t *gpu_device, /* We also don't push back non-parsec-owned copies */ if(NULL != task->data[i].data_out && 0 == (task->data[i].data_out->flags & PARSEC_DATA_FLAG_PARSEC_OWNED)) continue; + if( gpu_device->super.device_index == task->data[i].data_in->device_index ) continue; flow = gtask->flow[i]; assert( flow ); @@ -1868,15 +1994,26 @@ parsec_device_progress_stream( parsec_device_gpu_module_t* gpu_device, } } - grab_a_task: + grab_a_task: if( NULL == stream->tasks[stream->start] ) { /* there is room on the stream */ task = (parsec_gpu_task_t*)parsec_list_pop_front(stream->fifo_pending); /* get the best task */ } if( NULL == task ) { /* No tasks, we're done */ return PARSEC_HOOK_RETURN_DONE; } - PARSEC_LIST_ITEM_SINGLETON((parsec_list_item_t*)task); + /* Should we allow the tasks to be batched */ + if (PARSEC_GPU_TASK_TYPE_KERNEL == task->task_type ) { + if( PARSEC_DEV_CHORE_ALLOW_BATCH & task->ec->task_class->incarnations[0].type ) { + /* Don't singleton the task, allowing the kernel to extract the tasks it wants + * from the task ring, and singleton it or replace it with the aggregated tasks + * as necessary. + */ + goto move_forward_with_this_task; + } + } + PARSEC_LIST_ITEM_SINGLETON((parsec_list_item_t *)task); + move_forward_with_this_task: assert( NULL == stream->tasks[stream->start] ); schedule_task: @@ -1982,7 +2119,7 @@ parsec_device_kernel_push( parsec_device_gpu_module_t *gpu_device, gpu_task->last_data_check_epoch = gpu_device->data_avail_epoch; return ret; } - + gpu_task->last_status = 0; /* mark the task as clean */ for( i = 0; i < this_task->task_class->nb_flows; i++ ) { flow = gpu_task->flow[i]; @@ -2010,11 +2147,10 @@ parsec_device_kernel_push( parsec_device_gpu_module_t *gpu_device, return ret; } } - PARSEC_DEBUG_VERBOSE(10, parsec_gpu_output_stream, "GPU[%d:%s]: Push task %s DONE", gpu_device->super.device_index, gpu_device->super.name, - parsec_task_snprintf(tmp, MAX_TASK_STRLEN, this_task) ); + parsec_task_snprintf(tmp, MAX_TASK_STRLEN, this_task)); gpu_task->complete_stage = parsec_device_callback_complete_push; #if defined(PARSEC_PROF_TRACE) gpu_task->prof_key_end = -1; /* We do not log that event as the completion of this task */ @@ -2103,7 +2239,7 @@ parsec_device_kernel_pop( parsec_device_gpu_module_t *gpu_device, /* If the gpu copy is not owned by parsec, we don't manage it at all */ if( 0 == (gpu_copy->flags & PARSEC_DATA_FLAG_PARSEC_OWNED) ) continue; original = gpu_copy->original; - rc = gpu_task->stage_out? gpu_task->stage_out(gpu_task, (1U << i), gpu_stream): PARSEC_SUCCESS; + rc = gpu_task->stage_out ? 
gpu_task->stage_out(gpu_task, (1U << i), gpu_stream): PARSEC_SUCCESS; if(PARSEC_SUCCESS != rc) { parsec_warning( "GPU[%d:%s]: gpu_task->stage_out from device rc=%d @%s:%d\n" "\tdata %s <<%p>> -> <<%p>>\n", @@ -2141,6 +2277,7 @@ parsec_device_kernel_pop( parsec_device_gpu_module_t *gpu_device, assert( this_task->data[i].data_in == NULL || original == this_task->data[i].data_in->original ); +#if 0 if( (gpu_task->task_type != PARSEC_GPU_TASK_TYPE_D2D_COMPLETE) && !(flow->flow_flags & PARSEC_FLOW_ACCESS_WRITE) ) { /* Do not propagate GPU copies to successors (temporary solution) */ this_task->data[i].data_out = original->device_copies[0]; @@ -2152,6 +2289,7 @@ parsec_device_kernel_pop( parsec_device_gpu_module_t *gpu_device, this_task->data[i].data_out, this_task->data[i].data_out->super.super.obj_reference_count, original); } +#endif parsec_atomic_lock(&original->lock); if( flow->flow_flags & PARSEC_FLOW_ACCESS_READ ) { int current_readers = parsec_atomic_fetch_sub_int32(&gpu_copy->readers, 1) - 1; @@ -2225,7 +2363,7 @@ parsec_device_kernel_pop( parsec_device_gpu_module_t *gpu_device, #endif /* Move the data back into main memory */ rc = gpu_task->stage_out? gpu_task->stage_out(gpu_task, (1U << flow->flow_index), gpu_stream): PARSEC_SUCCESS; - if(PARSEC_SUCCESS != rc) { + if( PARSEC_SUCCESS != rc ) { parsec_warning( "GPU[%d:%s]: gpu_task->stage_out from device rc=%d @%s:%d\n" "\tdata %s <<%p>> -> <<%p>>\n", gpu_device->super.device_index, gpu_device->super.name, rc, __func__, __LINE__, @@ -2264,8 +2402,7 @@ parsec_device_kernel_epilog( parsec_device_gpu_module_t *gpu_device, parsec_gpu_task_t *gpu_task ) { parsec_task_t *this_task = gpu_task->ec; - parsec_gpu_data_copy_t *gpu_copy, *cpu_copy; - parsec_data_t *original; + parsec_gpu_data_copy_t *gpu_copy; int i; #if defined(PARSEC_DEBUG_NOISIER) @@ -2290,45 +2427,60 @@ parsec_device_kernel_epilog( parsec_device_gpu_module_t *gpu_device, } gpu_copy = this_task->data[i].data_out; - original = gpu_copy->original; - cpu_copy = original->device_copies[0]; /* If it is a copy managed by the user, don't bother either */ if( 0 == (gpu_copy->flags & PARSEC_DATA_FLAG_PARSEC_OWNED) ) continue; +#if 0 + parsec_data_t *original = gpu_copy->original; + parsec_gpu_data_copy_t *cpu_copy = original->device_copies[0]; + if( this_task->data[i].data_in == this_task->data[i].data_out ) { + /** + * There might be a race condition here. We can't assume the first CPU + * version is the corresponding CPU copy, as a new CPU-bound data + * might have been created meanwhile. + * + * WARNING: For now we always forward the cpu_copy to the next task, to + * do that, we lie to the engine by updating the CPU copy to the same + * status than the GPU copy without updating the data itself. Thus, the + * cpu copy is really invalid. this is related to Issue #88, and the + * fact that: + * - we don't forward the gpu copy as output + * - we always take a cpu copy as input, so it has to be in the + * same state as the GPU to prevent an extra data movement. + */ + assert( PARSEC_DATA_COHERENCY_OWNED == gpu_copy->coherency_state ); + gpu_copy->coherency_state = PARSEC_DATA_COHERENCY_SHARED; + cpu_copy->coherency_state = PARSEC_DATA_COHERENCY_SHARED; - /** - * There might be a race condition here. We can't assume the first CPU - * version is the corresponding CPU copy, as a new CPU-bound data - * might have been created meanwhile. 
- * - * WARNING: For now we always forward the cpu_copy to the next task, to - * do that, we lie to the engine by updating the CPU copy to the same - * status than the GPU copy without updating the data itself. Thus, the - * cpu copy is really invalid. this is related to Issue #88, and the - * fact that: - * - we don't forward the gpu copy as output - * - we always take a cpu copy as input, so it has to be in the - * same state as the GPU to prevent an extra data movement. - */ - assert( PARSEC_DATA_COHERENCY_OWNED == gpu_copy->coherency_state ); - gpu_copy->coherency_state = PARSEC_DATA_COHERENCY_SHARED; - cpu_copy->coherency_state = PARSEC_DATA_COHERENCY_SHARED; - - cpu_copy->version = gpu_copy->version; - PARSEC_DEBUG_VERBOSE(10, parsec_gpu_output_stream, - "GPU[%d:%s]: CPU copy %p [ref_count %d] gets the same version %d as GPU copy %p [ref_count %d]", - gpu_device->super.device_index, gpu_device->super.name, - cpu_copy, cpu_copy->super.super.obj_reference_count, cpu_copy->version, gpu_copy, gpu_copy->super.super.obj_reference_count); - - /** - * Let's lie to the engine by reporting that working version of this - * data is now on the CPU. - */ - this_task->data[i].data_out = cpu_copy; + cpu_copy->version = gpu_copy->version; + PARSEC_DEBUG_VERBOSE(10, parsec_gpu_output_stream, + "GPU[%d:%s]: CPU copy %p [ref_count %d] gets the same version %d as GPU copy %p [ref_count %d]", + gpu_device->super.device_index, gpu_device->super.name, + cpu_copy, cpu_copy->super.super.obj_reference_count, cpu_copy->version, gpu_copy, gpu_copy->super.super.obj_reference_count); - assert( 0 <= gpu_copy->readers ); + /** + * Let's lie to the engine by reporting that working version of this + * data is now on the CPU. + */ + this_task->data[i].data_out = cpu_copy; + } +#endif + assert(0 <= gpu_copy->readers); if( gpu_task->pushout & (1 << i) ) { + parsec_data_t *original = gpu_copy->original; + parsec_gpu_data_copy_t *cpu_copy = original->device_copies[0]; + /* Update the CPU copy to reflect the current status */ + assert(cpu_copy->version < gpu_copy->version); + cpu_copy->version = gpu_copy->version; + cpu_copy->coherency_state = PARSEC_DATA_COHERENCY_SHARED; + gpu_copy->coherency_state = PARSEC_DATA_COHERENCY_SHARED; + assert(PARSEC_DATA_STATUS_UNDER_TRANSFER == cpu_copy->data_transfer_status); + cpu_copy->data_transfer_status = PARSEC_DATA_STATUS_COMPLETE_TRANSFER; + if( 0 == (parsec_mpi_allow_gpu_memory_communications & PARSEC_RUNTIME_SEND_FROM_GPU_MEMORY) ) { + /* Report the CPU copy as the output of the task. */ + this_task->data[i].data_out = cpu_copy; + } PARSEC_DEBUG_VERBOSE(20, parsec_gpu_output_stream, "GPU copy %p [ref_count %d] moved to the read LRU in %s", gpu_copy, gpu_copy->super.super.obj_reference_count, __func__); diff --git a/parsec/mca/device/device_gpu.h b/parsec/mca/device/device_gpu.h index d931b37ff..70c6a9abe 100644 --- a/parsec/mca/device/device_gpu.h +++ b/parsec/mca/device/device_gpu.h @@ -65,7 +65,7 @@ typedef int (parsec_stage_in_function_t)(parsec_gpu_task_t *gtask, /* Function type to transfer data from the GPU device. * Transfer transfer the contiguous bytes from - * task->data[i].data_in to task->data[i].data_out. + * task->data[i].data_out to the copy on device 0. * * @param[in] task parsec_task_t containing task->data[i].data_in, task->data[i].data_out. * @param[in] flow_mask indicating task flows for which to transfer. 
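For reference, a minimal standalone sketch of the pushout decision introduced in parsec_device_kernel_epilog above: successor tasks keep seeing the GPU copy only when the runtime mask allows sending straight from GPU memory; otherwise data_out is redirected to the staged-out copy on device 0. The flag values and the default mask mirror what this patch adds to parsec/parsec_internal.h and parsec/parsec_mpi_funnelled.c; the helper and main below are illustrative only, not part of the PaRSEC sources.

#include <stdio.h>

/* Values mirror the flags this patch adds to parsec/parsec_internal.h. */
#define PARSEC_RUNTIME_RECV_FROM_GPU_MEMORY 0x00000001
#define PARSEC_RUNTIME_SEND_FROM_GPU_MEMORY 0x00000002

/* The patch defaults the mask to 3 (send and recv allowed) in
 * parsec_mpi_funnelled.c, and forces it to 0 when PARSEC_MPI_IS_GPU_AWARE
 * is not set at configure time. */
static int parsec_mpi_allow_gpu_memory_communications = 3;

/* Returns non-zero if the communication engine may send directly from GPU
 * memory, in which case the epilog leaves data_out pointing at the GPU copy;
 * otherwise the CPU copy (device 0) is reported to successors. */
static int expose_gpu_copy_to_successors(void)
{
    return 0 != (parsec_mpi_allow_gpu_memory_communications &
                 PARSEC_RUNTIME_SEND_FROM_GPU_MEMORY);
}

int main(void)
{
    printf("successors see the %s copy\n",
           expose_gpu_copy_to_successors() ? "GPU" : "CPU (device 0)");
    return 0;
}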
diff --git a/parsec/mca/device/level_zero/device_level_zero_component.c b/parsec/mca/device/level_zero/device_level_zero_component.c index 7fe35b6a9..a1b30285e 100644 --- a/parsec/mca/device/level_zero/device_level_zero_component.c +++ b/parsec/mca/device/level_zero/device_level_zero_component.c @@ -410,7 +410,7 @@ static int device_level_zero_component_close(void) /* Check that no LEVEL_ZERO devices are still registered with PaRSEC */ for(i = 0; i < parsec_mca_device_enabled(); i++) { if( NULL == (cdev = (parsec_device_level_zero_module_t*)parsec_mca_device_get(i)) ) continue; - if(PARSEC_DEV_LEVEL_ZERO != cdev->super.super.type) continue; + if( !(PARSEC_DEV_LEVEL_ZERO & cdev->super.super.type) ) continue; PARSEC_DEBUG_VERBOSE(0, parsec_gpu_output_stream, "GPU[%d] LEVEL_ZERO device still registered with PaRSEC at the end of LEVEL_ZERO finalize.\n" diff --git a/parsec/mca/device/template/device_template_module.c b/parsec/mca/device/template/device_template_module.c index 3ad780551..7d88529f4 100644 --- a/parsec/mca/device/template/device_template_module.c +++ b/parsec/mca/device/template/device_template_module.c @@ -78,7 +78,7 @@ parsec_template_taskpool_register(parsec_device_module_t* device, const parsec_task_class_t* tc = tp->task_classes_array[i]; __parsec_chore_t* chores = (__parsec_chore_t*)tc->incarnations; for( j = 0; NULL != chores[j].hook; j++ ) { - if( chores[j].type != device->type ) + if( !(chores[j].type & device->type) ) continue; if( NULL != chores[j].dyld_fn ) { /* the function has been set for another device of the same type */ diff --git a/parsec/mca/pins/ptg_to_dtd/pins_ptg_to_dtd_module.c b/parsec/mca/pins/ptg_to_dtd/pins_ptg_to_dtd_module.c index 8e53cb653..e40afcfe2 100644 --- a/parsec/mca/pins/ptg_to_dtd/pins_ptg_to_dtd_module.c +++ b/parsec/mca/pins/ptg_to_dtd/pins_ptg_to_dtd_module.c @@ -86,7 +86,7 @@ copy_chores(parsec_taskpool_t *tp, parsec_dtd_taskpool_t *dtd_tp) parsec_hook_t **hook_not_const = (parsec_hook_t **)&(tp->task_classes_array[i]->incarnations[j].hook); /* saving the CPU hook only */ - if (tp->task_classes_array[i]->incarnations[j].type == PARSEC_DEV_CPU) { + if (tp->task_classes_array[i]->incarnations[j].type & PARSEC_DEV_CPU) { dtd_tp->actual_hook[i].hook = tp->task_classes_array[i]->incarnations[j].hook; } /* copying the fake hook in all the hooks (CPU, GPU etc) */ @@ -312,8 +312,8 @@ parsec_dtd_taskpool_insert_task_ptg_to_dtd( parsec_dtd_taskpool_t *dtd_tp, free(params); __parsec_chore_t *incarnations = (__parsec_chore_t *)tc->incarnations; - for(int i = 0; PARSEC_DEV_NONE != incarnations[i].type; i++ ) { - if( PARSEC_DEV_CPU == incarnations[i].type ) { + for(int i = 0; PARSEC_DEV_NONE != (incarnations[i].type & PARSEC_DEV_ANY_TYPE); i++ ) { + if( PARSEC_DEV_CPU & incarnations[i].type ) { incarnations[i] = dtd_chore_for_testing; } } diff --git a/parsec/mca/sched/lfq/sched_lfq_module.c b/parsec/mca/sched/lfq/sched_lfq_module.c index 721146ec2..ce20ed9dc 100644 --- a/parsec/mca/sched/lfq/sched_lfq_module.c +++ b/parsec/mca/sched/lfq/sched_lfq_module.c @@ -64,7 +64,7 @@ static int flow_lfq_init(parsec_execution_stream_t* es, struct parsec_barrier_t* vp = es->virtual_process; /* Every flow creates its own local object */ - sched_obj = (parsec_mca_sched_local_queues_scheduler_object_t*)calloc(sizeof(parsec_mca_sched_local_queues_scheduler_object_t), 1); + sched_obj = (parsec_mca_sched_local_queues_scheduler_object_t*)calloc(1, sizeof(parsec_mca_sched_local_queues_scheduler_object_t)); es->scheduler_object = sched_obj; if( 0 == es->th_id ) { /* And flow 
0 creates the system_queue */ sched_obj->system_queue = PARSEC_OBJ_NEW(parsec_dequeue_t); diff --git a/parsec/mca/sched/lhq/sched_lhq_module.c b/parsec/mca/sched/lhq/sched_lhq_module.c index 6d9ec2424..e21bd81bd 100644 --- a/parsec/mca/sched/lhq/sched_lhq_module.c +++ b/parsec/mca/sched/lhq/sched_lhq_module.c @@ -70,7 +70,7 @@ static int flow_lhq_init(parsec_execution_stream_t* ces, struct parsec_barrier_t /* First of all, we allocate the scheduling object memory for all threads */ es = vp->execution_streams[t]; - sched_obj = (parsec_mca_sched_local_queues_scheduler_object_t*)calloc(sizeof(parsec_mca_sched_local_queues_scheduler_object_t), 1); + sched_obj = (parsec_mca_sched_local_queues_scheduler_object_t*)calloc(1, sizeof(parsec_mca_sched_local_queues_scheduler_object_t)); es->scheduler_object = sched_obj; if( es->th_id == 0 ) { diff --git a/parsec/mca/sched/ltq/sched_ltq_module.c b/parsec/mca/sched/ltq/sched_ltq_module.c index f1bf081b5..81c009e76 100644 --- a/parsec/mca/sched/ltq/sched_ltq_module.c +++ b/parsec/mca/sched/ltq/sched_ltq_module.c @@ -63,7 +63,7 @@ static int flow_ltq_init(parsec_execution_stream_t* es, struct parsec_barrier_t* uint32_t queue_size; parsec_vp_t * vp = es->virtual_process; - sched_obj = (parsec_mca_sched_local_queues_scheduler_object_t*)calloc(sizeof(parsec_mca_sched_local_queues_scheduler_object_t), 1); + sched_obj = (parsec_mca_sched_local_queues_scheduler_object_t*)calloc(1, sizeof(parsec_mca_sched_local_queues_scheduler_object_t)); es->scheduler_object = sched_obj; if( es->th_id == 0 ) { diff --git a/parsec/mca/sched/pbq/sched_pbq_module.c b/parsec/mca/sched/pbq/sched_pbq_module.c index e604e8dc0..2c3b94865 100644 --- a/parsec/mca/sched/pbq/sched_pbq_module.c +++ b/parsec/mca/sched/pbq/sched_pbq_module.c @@ -61,7 +61,7 @@ static int flow_pbq_init(parsec_execution_stream_t* es, struct parsec_barrier_t* uint32_t queue_size = 0; /* Every flow creates its own local object */ - sched_obj = (parsec_mca_sched_local_queues_scheduler_object_t*)calloc(sizeof(parsec_mca_sched_local_queues_scheduler_object_t), 1); + sched_obj = (parsec_mca_sched_local_queues_scheduler_object_t*)calloc(1, sizeof(parsec_mca_sched_local_queues_scheduler_object_t)); es->scheduler_object = sched_obj; if( es->th_id == 0 ) { /* And flow 0 creates the system_queue */ sched_obj->system_queue = PARSEC_OBJ_NEW(parsec_dequeue_t); diff --git a/parsec/parsec.c b/parsec/parsec.c index 095fcad27..c8a32093f 100644 --- a/parsec/parsec.c +++ b/parsec/parsec.c @@ -1798,7 +1798,7 @@ parsec_release_dep_fct(parsec_execution_stream_t *es, * Check that we don't forward a NULL data to someone else. This * can be done only on the src node, since the dst node can * check for datatypes without knowing the data yet. - * By checking now, we allow for the data to be created any time bfore we + * By checking now, we allow for the data to be created any time before we * actually try to transfer it. 
*/ if( PARSEC_UNLIKELY((data->data == NULL) && @@ -1839,20 +1839,20 @@ parsec_release_dep_fct(parsec_execution_stream_t *es, #ifdef PARSEC_RESHAPE_BEFORE_SEND_TO_REMOTE /* Now everything is a reshaping entry */ /* Check if we need to reshape before sending */ - if(parsec_is_CTL_dep(output->data)){ /* CTL DEP */ + if(parsec_is_CTL_dep(output->data)) { /* CTL DEP */ output->data.data_future = NULL; output->data.repo = NULL; output->data.repo_key = -1; - }else{ + } else { /* Get reshape from whatever repo it has been set up into */ output->data.data_future = (parsec_datacopy_future_t*)target_dc; output->data.repo = target_repo; output->data.repo_key = target_repo_entry->ht_item.key; PARSEC_DEBUG_VERBOSE(4, parsec_debug_output, - "th%d RESHAPE_PROMISE SETUP FOR REMOTE DEPS [%p:%p] for INLINE REMOTE %s fut %p", - es->th_id, output->data.data, (output->data.data)->dtt, - (target_repo == successor_repo? "UNFULFILLED" : "FULFILLED"), - output->data.data_future); + "th%d RESHAPE_PROMISE SETUP FOR REMOTE DEPS [%p:%p] for INLINE REMOTE %s fut %p", + es->th_id, output->data.data, (output->data.data)->dtt, + (target_repo == successor_repo? "UNFULFILLED" : "FULFILLED"), + output->data.data_future); } #endif } else { @@ -1942,10 +1942,20 @@ parsec_task_snprintf( char* str, size_t size, task->locals[i].value ); if( index >= size ) return str; } - index += snprintf(str + index, size - index, "]<%d>", task->priority ); + index += snprintf(str + index, size - index, "]<%d> keys = {", task->priority ); + if( index >= size ) return str; + for( i = 0; i < tc->nb_flows; i++ ) { + char *prefix = (i == 0) ? "" : ", "; + if ((NULL == task->data[i].data_in) || (NULL == task->data[i].data_in->original)) + index += snprintf(str + index, size - index, "%s*", prefix); + else + index += snprintf(str + index, size - index, "%s%lx", prefix, task->data[i].data_in->original->key); + if( index >= size ) return str; + } + index += snprintf(str + index, size - index, "}" ); if( index >= size ) return str; if( NULL != task->taskpool ) { - index += snprintf(str + index, size - index, "{%u}", task->taskpool->taskpool_id ); + index += snprintf(str + index, size - index, " {tp: %u}", task->taskpool->taskpool_id ); if( index >= size ) return str; } return str; diff --git a/parsec/parsec_internal.h b/parsec/parsec_internal.h index 970587260..2ae80a1d9 100644 --- a/parsec/parsec_internal.h +++ b/parsec/parsec_internal.h @@ -200,6 +200,9 @@ PARSEC_DECLSPEC PARSEC_OBJ_CLASS_DECLARATION(parsec_taskpool_t); #define PARSEC_DEPENDENCIES_STARTUP_TASK ((parsec_dependency_t)(1<<29)) #define PARSEC_DEPENDENCIES_BITMASK (~(PARSEC_DEPENDENCIES_TASK_DONE|PARSEC_DEPENDENCIES_IN_DONE|PARSEC_DEPENDENCIES_STARTUP_TASK)) +#define PARSEC_RUNTIME_SEND_FROM_GPU_MEMORY 0x00000002 +#define PARSEC_RUNTIME_RECV_FROM_GPU_MEMORY 0x00000001 + /** * This structure is used internally by the parsec_dependencies_t structures */ @@ -337,8 +340,7 @@ typedef struct parsec_data_ref_s { parsec_data_key_t key; } parsec_data_ref_t; -typedef int (parsec_data_ref_fn_t)(parsec_task_t *task, - parsec_data_ref_t *ref); +typedef int (parsec_data_ref_fn_t)(const parsec_task_t *task, parsec_data_ref_t *ref); #define PARSEC_HAS_IN_IN_DEPENDENCIES 0x0001 #define PARSEC_HAS_OUT_OUT_DEPENDENCIES 0x0002 @@ -492,6 +494,12 @@ PARSEC_DECLSPEC extern int parsec_slow_bind_warning; * the scheduler, but can provide a better cache reuse. 
*/ PARSEC_DECLSPEC extern int parsec_runtime_keep_highest_priority_task; +/** + * Global configuration mask allowing or not for the data to be sent or received, + * from or to, GPU memory. It can be an OR between PARSEC_RUNTIME_SEND_FROM_GPU_MEMORY + * and PARSEC_RUNTIME_RECV_FROM_GPU_MEMORY. + */ +PARSEC_DECLSPEC extern int parsec_mpi_allow_gpu_memory_communications; /** * Description of the state of the task. It indicates what will be the next diff --git a/parsec/parsec_mpi_funnelled.c b/parsec/parsec_mpi_funnelled.c index 7e158f11f..338416713 100644 --- a/parsec/parsec_mpi_funnelled.c +++ b/parsec/parsec_mpi_funnelled.c @@ -201,6 +201,8 @@ parsec_list_t mpi_funnelled_dynamic_sendreq_fifo; /* ordered non threaded fifo * parsec_list_t mpi_funnelled_dynamic_recvreq_fifo; /* ordered non threaded fifo */ parsec_mempool_t *mpi_funnelled_dynamic_req_mempool = NULL; +int parsec_mpi_allow_gpu_memory_communications = 3; + /* This structure is used to save all the information necessary to * invoke a callback after a MPI_Request is satisfied */ @@ -506,6 +508,14 @@ static int mpi_funneled_init_once(parsec_context_t* context) MAX_MPI_TAG, (unsigned int)MAX_MPI_TAG, MAX_MPI_TAG / MAX_DEP_OUT_COUNT); } +#if !defined(PARSEC_MPI_IS_GPU_AWARE) + parsec_mpi_allow_gpu_memory_communications = 0; +#endif + parsec_mca_param_reg_int_name("mpi", "gpu_aware", + "Enabled if PaRSEC should allow MPI to move data directly from or to GPU memory. Otherwise, all data" + " movements will transit through CPU memory, and will always have a backup copy there. Accepted values " + "are ORed between 1 for receiving into GPU memory and 2 for sending from GPU memory", + false, false, parsec_mpi_allow_gpu_memory_communications, &parsec_mpi_allow_gpu_memory_communications); (void)context; return 0; } @@ -1319,6 +1329,10 @@ parsec_check_overlapping_binding(parsec_context_t *context) #endif } +#if defined(PARSEC_HAVE_DEV_CUDA_SUPPORT) +#include +#endif /* defined(PARSEC_HAVE_DEV_CUDA_SUPPORT) */ + int mpi_no_thread_enable(parsec_comm_engine_t *ce) { @@ -1330,6 +1344,21 @@ mpi_no_thread_enable(parsec_comm_engine_t *ce) if(parsec_ce_mpi_comm == (MPI_Comm)context->comm_ctx) { return PARSEC_SUCCESS; } +#if defined(PARSEC_HAVE_DEV_CUDA_SUPPORT) + /* The communication thread need to have a CUDA context in order to be able to use + * CUDA managed memory. It should be enough to just create a context onto the first + * active device. 
+ */ + for (int dev = 0; dev < (int)parsec_nb_devices; dev++) { + parsec_device_module_t *device = parsec_mca_device_get(dev); + if (PARSEC_DEV_CUDA & device->type) { + parsec_device_gpu_module_t *gpu_dev = (parsec_device_gpu_module_t*)device; + gpu_dev->set_device(gpu_dev); + } + } +#endif /* defined(PARSEC_HAVE_DEV_CUDA_SUPPORT) */ + + /* Finish the initialization of the communication engine */ parsec_ce.mem_register = mpi_no_thread_mem_register; parsec_ce.mem_unregister = mpi_no_thread_mem_unregister; diff --git a/parsec/parsec_reshape.c b/parsec/parsec_reshape.c index 121a8570b..eb48e1d46 100644 --- a/parsec/parsec_reshape.c +++ b/parsec/parsec_reshape.c @@ -50,8 +50,8 @@ void parsec_cleanup_reshape_promise(parsec_base_future_t *future) free(match_data); } if(d_fut->super.tracked_data != NULL){ - parsec_data_copy_t * data = (parsec_data_copy_t*) d_fut->super.tracked_data; - PARSEC_DATA_COPY_RELEASE(data); + parsec_data_copy_t * copy = (parsec_data_copy_t*) d_fut->super.tracked_data; + PARSEC_DATA_COPY_RELEASE(copy); } } @@ -141,7 +141,7 @@ parsec_new_reshape_promise(parsec_dep_data_description_t* data, * going to consume the original data->data in order to reshape it, and * all other successors will use directly the reshaped data instead. */ - PARSEC_OBJ_RETAIN( future_in_data->data ); + PARSEC_DATA_COPY_RETAIN( future_in_data->data ); return data_future; } @@ -331,7 +331,7 @@ parsec_create_reshape_promise(parsec_execution_stream_t *es, } #endif - /* retain the future if it's being reuse. */ + /* retain the future if it's being reused. */ if ( !new_future ) PARSEC_OBJ_RETAIN(data->data_future); /* Set up the reshape promise. */ @@ -439,7 +439,7 @@ parsec_set_up_reshape_promise(parsec_execution_stream_t *es, /* Data has been received with the expected remote type of the * successor contained on data->data->dtt. */ data->local.dst_datatype = data->local.src_datatype = data->data->dtt; - }else{ + } else { /* Packed data because multiple unpacking alternatives at reception. 
*/ const parsec_task_class_t* fct = newcontext->task_class; uint32_t flow_mask = (1U << dep->flow->flow_index) | 0x80000000; /* in flow */ @@ -579,7 +579,7 @@ parsec_get_copy_reshape_inline(parsec_execution_stream_t *es, data->data_future = (parsec_datacopy_future_t*)reshape_repo_entry->data[dep_flow_index]; } - if(data->data_future == NULL){ + if(data->data_future == NULL) { parsec_create_reshape_promise(es, data, 0, -1, -1, /* no src dst rank */ @@ -595,9 +595,6 @@ parsec_get_copy_reshape_inline(parsec_execution_stream_t *es, char task_name[MAX_TASK_STRLEN]; parsec_task_snprintf(task_name, MAX_TASK_STRLEN, task); - char type_string[MAX_TASK_STRLEN]="UNFULFILLED"; - char orig_string[MAX_TASK_STRLEN]="LOCAL INLINE"; - char type_name_src[MAX_TASK_STRLEN] = "NULL"; char type_name_dst[MAX_TASK_STRLEN] = "NULL"; char type_name_data[MAX_TASK_STRLEN] = "NULL"; @@ -608,12 +605,11 @@ parsec_get_copy_reshape_inline(parsec_execution_stream_t *es, PARSEC_DEBUG_VERBOSE(12, parsec_debug_output, "th%d RESHAPE_PROMISE CREATE %s %s [%s:%p:%p -> %p] flow_idx %u fut %p on %s(%p) k%d dtt %s -> %s [data %s]", - es->th_id, type_string, orig_string, task_name, data->data, data->data->dtt, + es->th_id, "UNFULFILLED", "LOCAL INLINE", task_name, data->data, data->data->dtt, data->local.dst_datatype, dep_flow_index, data->data_future, "CURR_REPO", setup_repo, setup_repo_key, type_name_src, type_name_dst, type_name_data); - #endif } @@ -629,7 +625,7 @@ parsec_get_copy_reshape_inline(parsec_execution_stream_t *es, es->th_id, *reshape, (*reshape)->dtt, data->data_future); /* reshape completed */ - PARSEC_OBJ_RETAIN(*reshape); + PARSEC_DATA_COPY_RETAIN(*reshape); PARSEC_OBJ_RELEASE(data->data_future); /* Clean up the old stuff on the repo used temporarily to hold * the inline reshape promise. @@ -642,7 +638,7 @@ parsec_get_copy_reshape_inline(parsec_execution_stream_t *es, /** * * Routine to obtain a reshaped copy matching the specifications when reading - * a tile from the datacollection. + * a tile from the data collection. * If a reshape needs to be performed, it is done using an inline reshape * promise, i.e., creating and fulfilling a local future promise (only * the current task instance is involved). 
Each thread accessing the same @@ -769,7 +765,7 @@ parsec_get_copy_reshape_from_dep(parsec_execution_stream_t *es, "th%d RESHAPE_PROMISE OBTAINED [%p:%p] for %s fut %p", es->th_id, *reshape, (*reshape)->dtt, task_string, data->data_future); - PARSEC_OBJ_RETAIN(*reshape); + PARSEC_DATA_COPY_RETAIN(*reshape); PARSEC_OBJ_RELEASE(data->data_future); return PARSEC_HOOK_RETURN_RESHAPE_DONE; diff --git a/parsec/remote_dep.c b/parsec/remote_dep.c index 1f6920cac..fe4b6c5ec 100644 --- a/parsec/remote_dep.c +++ b/parsec/remote_dep.c @@ -492,7 +492,7 @@ int parsec_remote_dep_activate(parsec_execution_stream_t* es, /* This assert is not correct anymore, we don't need an arena to send to a remote * assert(NULL != output->data.remote.arena);*/ assert( !parsec_is_CTL_dep(&output->data) ); - PARSEC_OBJ_RETAIN(output->data.data); + PARSEC_DATA_COPY_RETAIN(output->data.data); } for( array_index = count = 0; count < remote_deps->output[i].count_bits; array_index++ ) { diff --git a/parsec/remote_dep.h b/parsec/remote_dep.h index 17fa00fa4..358be17b8 100644 --- a/parsec/remote_dep.h +++ b/parsec/remote_dep.h @@ -87,6 +87,11 @@ struct parsec_dep_data_description_s { */ parsec_datacopy_future_t *data_future; + /* If we can extract a preferred location for the incoming data set it + * here, otherwise the memory for the incoming data will be allocated + * on the main memory (device 0). + */ + int32_t preferred_device; #ifdef PARSEC_RESHAPE_BEFORE_SEND_TO_REMOTE /* Keeping current repo & key to be able to consume when * the "remote" successors (aka the communication engine) diff --git a/parsec/remote_dep_mpi.c b/parsec/remote_dep_mpi.c index 71a61f675..9d5a7c641 100644 --- a/parsec/remote_dep_mpi.c +++ b/parsec/remote_dep_mpi.c @@ -80,6 +80,7 @@ remote_dep_cmd_to_string(remote_dep_wire_activate_t* origin, if( NULL == task.task_class ) return snprintf(str, len, "UNKNOWN_of_TASKCLASS_%d", origin->task_class_id), str; memcpy(&task.locals, origin->locals, sizeof(parsec_assignment_t) * task.task_class->nb_locals); task.priority = 0xFFFFFFFF; + for(int i = 0; i < task.task_class->nb_flows; task.data[i++].data_in = NULL); return parsec_task_snprintf(str, len, &task); } @@ -142,10 +143,6 @@ parsec_execution_stream_t parsec_comm_es = { static void remote_dep_mpi_put_start(parsec_execution_stream_t* es, dep_cmd_item_t* item); static void remote_dep_mpi_get_start(parsec_execution_stream_t* es, parsec_remote_deps_t* deps); -static void remote_dep_mpi_get_end(parsec_execution_stream_t* es, - int idx, - parsec_remote_deps_t* deps); - static int remote_dep_mpi_get_end_cb(parsec_comm_engine_t *ce, parsec_ce_tag_t tag, @@ -565,25 +562,34 @@ void parsec_remote_dep_memcpy(parsec_execution_stream_t* es, item->cmd.memcpy.destination = dst; item->cmd.memcpy.layout = data->local; - PARSEC_OBJ_RETAIN(src); + PARSEC_DATA_COPY_RETAIN(src); remote_dep_inc_flying_messages(tp); parsec_dequeue_push_back(&dep_cmd_queue, (parsec_list_item_t*) item); } static inline parsec_data_copy_t* -remote_dep_copy_allocate(parsec_dep_type_description_t* data) +remote_dep_copy_allocate(parsec_dep_type_description_t* data, int preferred_device) { parsec_data_copy_t* dc; if( NULL == data->arena ) { assert(0 == data->dst_count); return NULL; } - dc = parsec_arena_get_copy(data->arena, data->dst_count, 0, data->dst_datatype); - - dc->coherency_state = PARSEC_DATA_COHERENCY_EXCLUSIVE; - PARSEC_DEBUG_VERBOSE(20, parsec_comm_output_stream, "MPI:\tMalloc new remote tile %p size %" PRIu64 " count = %" PRIu64 " displ = %" PRIi64 " %p", - dc, data->arena->elem_size, 
data->dst_count, data->dst_displ, data->arena); + /* Go and allocate on the preferred device. If that fails, fall back and allocate a + * copy on the main memory (device 0), and parsec will transfer the data as needed + * for all tasks executing on acclerators. + */ + dc = parsec_arena_get_new_copy(data->arena, data->dst_count, preferred_device, data->dst_datatype); + PARSEC_DATA_COPY_RETAIN(dc); + /* don't use preferred_device, it might not be the location where the data copy resides */ + parsec_data_start_transfer_ownership_to_copy(dc->original, dc->device_index, PARSEC_FLOW_ACCESS_WRITE); + if (dc->device_index != preferred_device) { + PARSEC_DEBUG_VERBOSE(5, parsec_comm_output_stream, "MPI:\tFail to allocate tile on device %d and instead allocate on device %d\n", + preferred_device, dc->device_index); + } + PARSEC_DEBUG_VERBOSE(5, parsec_comm_output_stream, "MPI:\tMalloc new temporary tile [dev %d] copy %p size %" PRIu64 " count = %" PRIu64 " displ = %" PRIi64 " %p", + dc->device_index, dc, data->arena->elem_size, data->dst_count, data->dst_displ, data->arena); return dc; } @@ -598,7 +604,7 @@ static inline parsec_data_copy_t* reshape_copy_allocate(parsec_dep_type_description_t* data) { parsec_data_copy_t* dc; - dc = remote_dep_copy_allocate(data); + dc = remote_dep_copy_allocate(data, 0 /* default device */); parsec_data_start_transfer_ownership_to_copy(dc->original, 0, @@ -608,7 +614,7 @@ reshape_copy_allocate(parsec_dep_type_description_t* data) /** * - * Routine to fulfilled a reshape promise by the current thread + * Fulfill a reshape promise by the current thread * (when MPI_THREAD_MULTIPLE) or delegate the reshaping to the communication * thread. * Routine set as callback when initializing a future. @@ -657,9 +663,9 @@ void parsec_local_reshape_cb(parsec_base_future_t *future, ... ) } if(src_pack_size != dst_pack_size){ parsec_warning("parsec_local_reshape: reshape requested between dtt with different packed size fut %p dtt [%p:%s = sz(%d) -> %p:%s= sz(%d)]", - future, - dt->local->src_datatype, type_name_src, src_pack_size, - dt->local->dst_datatype, type_name_dst, dst_pack_size); + future, + dt->local->src_datatype, type_name_src, src_pack_size, + dt->local->dst_datatype, type_name_dst, dst_pack_size); } #endif @@ -779,13 +785,19 @@ remote_dep_mpi_retrieve_datatype(parsec_execution_stream_t *eu, return PARSEC_ITERATE_STOP; } if(old_dtt != PARSEC_DATATYPE_NULL) { + /* multiple input deps from the same predecessor exists. Be careful on what format + * to receive the data. It would not make sense to receive different amount, but + * it is legal to receive them with different type signatures. In this case, ignore + * the datatype, and instead fall back into a packed format (aka bytes) and the + * entire length of the incomming data. 
+ */ + if(old_dtt != output->data.remote.dst_datatype) { #if defined(PARSEC_DEBUG_NOISIER) char type_name_src[MAX_TASK_STRLEN] = "NULL"; char type_name_dst[MAX_TASK_STRLEN] = "NULL"; int len; - if(old_dtt!=PARSEC_DATATYPE_NULL) MPI_Type_get_name(old_dtt, type_name_src, &len); - if(output->data.remote.dst_datatype!=PARSEC_DATATYPE_NULL) MPI_Type_get_name(output->data.remote.dst_datatype, type_name_dst, &len); + if(old_dtt != PARSEC_DATATYPE_NULL) MPI_Type_get_name(old_dtt, type_name_src, &len); + if(output->data.remote.dst_datatype != PARSEC_DATATYPE_NULL) MPI_Type_get_name(output->data.remote.dst_datatype, type_name_dst, &len); PARSEC_DEBUG_VERBOSE(30, parsec_comm_output_stream, "MPI: retrieve dtt for %s [dep_datatype_index %x] DTT: old %s new %s (%p) --> PACKED", newcontext->task_class->name, dep->dep_datatype_index, type_name_src, type_name_dst, output->data.remote.dst_datatype); #endif @@ -793,7 +805,6 @@ remote_dep_mpi_retrieve_datatype(parsec_execution_stream_t *eu, parsec_ce.pack_size(&parsec_ce, output->data.remote.dst_count, output->data.remote.dst_datatype, &dsize); output->data.remote.src_count = output->data.remote.dst_count = dsize; output->data.remote.src_datatype = output->data.remote.dst_datatype = PARSEC_DATATYPE_PACKED; - return PARSEC_ITERATE_STOP; } } @@ -808,6 +819,18 @@ remote_dep_mpi_retrieve_datatype(parsec_execution_stream_t *eu, newcontext->task_class->name, dep->dep_datatype_index, type_name_src, type_name_dst, output->data.remote.dst_datatype); } #endif + /* Predict where the incoming temporary should be located, by using the data_affinity. + * This only works if the task affinity is linked to the output location of the task, which + * is mostly true for owner-compute type of algorithms. + */ + output->data.preferred_device = 0; /* the default is CPU memory (aka. device 0) */ + if (NULL != fct->data_affinity ) { + parsec_data_ref_t dref; + fct->data_affinity(newcontext, &dref); + parsec_data_t* data = dref.dc->data_of_key(dref.dc, dref.key); + output->data.preferred_device = (-1 != data->preferred_device) ? + data->preferred_device : data->owner_device; + } return PARSEC_ITERATE_CONTINUE; } @@ -989,6 +1012,12 @@ remote_dep_release_incoming(parsec_execution_stream_t* es, task.data[target->flow_index].source_repo_entry = NULL; task.data[target->flow_index].data_in = origin->output[i].data.data; task.data[target->flow_index].data_out = origin->output[i].data.data; + if( NULL != origin->output[i].data.data ) { /* nothing for control flows */ + /* The data has been fully received, mark the copy accordingly */ + task.data[target->flow_index].data_in->coherency_state = PARSEC_DATA_COHERENCY_OWNED; + task.data[target->flow_index].data_in->flags &= ~PARSEC_DATA_FLAG_TRANSIT; /* not in transit anymore */ + task.data[target->flow_index].data_in->data_transfer_status = PARSEC_DATA_STATUS_COMPLETE_TRANSFER; + } } #ifdef PARSEC_DIST_COLLECTIVES @@ -1436,7 +1465,7 @@ static int local_dep_nothread_reshape(parsec_execution_stream_t* es, * once all successors have consumed the future, in case it is needed * as an input for nested futures. 
*/ - PARSEC_OBJ_RETAIN(cmd->memcpy.source); + PARSEC_DATA_COPY_RETAIN(cmd->memcpy.source); int rc = remote_dep_nothread_memcpy(es, item); assert(MPI_SUCCESS == rc); @@ -1649,7 +1678,7 @@ remote_dep_mpi_put_start(parsec_execution_stream_t* es, deps->output[k].data.data = reshape_data; - PARSEC_OBJ_RETAIN(reshape_data); + PARSEC_DATA_COPY_RETAIN(reshape_data); PARSEC_DATA_COPY_RELEASE(old_data);/*old data has been retained for remote communication*/ PARSEC_OBJ_RELEASE(deps->output[k].data.data_future); @@ -1854,7 +1883,8 @@ static void remote_dep_mpi_recv_activate(parsec_execution_stream_t* es, if((length - (*position)) >= (int)data_sizes[ds_idx]) { assert(NULL == data_desc->data); /* we do not support in-place tiles now, make sure it doesn't happen yet */ if(NULL == data_desc->data) { - data_desc->data = remote_dep_copy_allocate(type_desc); + /* if we have to unpack the data onto this new copy we might want to allocated it on the CPU */ + data_desc->data = remote_dep_copy_allocate(type_desc, 0); } #ifndef PARSEC_PROF_DRY_DEP PARSEC_DEBUG_VERBOSE(10, parsec_comm_output_stream, @@ -2080,9 +2110,13 @@ static void remote_dep_mpi_get_start(parsec_execution_stream_t* es, /* prepare the local receiving data */ assert(NULL == deps->output[k].data.data); /* we do not support in-place tiles now, make sure it doesn't happen yet */ if(NULL == deps->output[k].data.data) { - deps->output[k].data.data = remote_dep_copy_allocate(&deps->output[k].data.remote); + int best_device = (parsec_mpi_allow_gpu_memory_communications & PARSEC_RUNTIME_RECV_FROM_GPU_MEMORY) ? deps->output[k].data.preferred_device : 0; + deps->output[k].data.data = remote_dep_copy_allocate(&deps->output[k].data.remote, best_device); } - dtt = deps->output[k].data.remote.dst_datatype; + /* Mark the data under tranfer */ + deps->output[k].data.data->data_transfer_status = PARSEC_DATA_STATUS_UNDER_TRANSFER; + deps->output[k].data.data->flags |= PARSEC_DATA_FLAG_TRANSIT; + dtt = deps->output[k].data.remote.dst_datatype; nbdtt = deps->output[k].data.remote.dst_count; /* We have the remote mem_handle. @@ -2159,14 +2193,6 @@ static void remote_dep_mpi_get_start(parsec_execution_stream_t* es, } } -static void remote_dep_mpi_get_end(parsec_execution_stream_t* es, - int idx, - parsec_remote_deps_t* deps) -{ - /* The ref on the data will be released below */ - remote_dep_release_incoming(es, deps, (1U<es_profile, MPI_Data_pldr_ek, callback_data->event_id); #endif /* PARSEC_PROF_TRACE */ - remote_dep_mpi_get_end(es, callback_data->k, deps); + remote_dep_release_incoming(es, deps, (1U << callback_data->k)); parsec_ce.mem_unregister(&callback_data->memory_handle); parsec_thread_mempool_free(parsec_remote_dep_cb_data_mempool->thread_mempools, callback_data); @@ -2239,6 +2265,7 @@ int remote_dep_ce_reconfigure(parsec_context_t* context) * execution stream to parsec_comm_es. */ parsec_set_my_execution_stream(&parsec_comm_es); } + return PARSEC_SUCCESS; } diff --git a/parsec/utils/debug.h b/parsec/utils/debug.h index ddda2352b..13038c79e 100644 --- a/parsec/utils/debug.h +++ b/parsec/utils/debug.h @@ -160,7 +160,7 @@ extern void (*parsec_weaksym_exit)(int status); #else /* defined(PARSEC_DEBUG_NOISIER) */ #define PARSEC_DEBUG_VERBOSE(...) do{} while(0) -#endif /* defined(PARSEC_DEBUG_VERBOSE) */ +#endif /* defined(PARSEC_DEBUG_NOISIER) */ /** $brief To check if any parsec function returned error. 
*/ diff --git a/parsec/utils/zone_malloc.c b/parsec/utils/zone_malloc.c index fc6d6f238..45099deac 100644 --- a/parsec/utils/zone_malloc.c +++ b/parsec/utils/zone_malloc.c @@ -38,7 +38,7 @@ zone_malloc_t* zone_malloc_init(void* base_ptr, int _max_segment, size_t _unit_s gdata->base = base_ptr; gdata->unit_size = _unit_size; gdata->max_segment = _max_segment; - + gdata->lock = PARSEC_ATOMIC_UNLOCKED; gdata->next_tid = 0; gdata->segments = (segment_t *)malloc(sizeof(segment_t) * _max_segment); #if defined(PARSEC_DEBUG) @@ -74,6 +74,7 @@ void *zone_malloc(zone_malloc_t *gdata, size_t size) int next_tid, current_tid, new_tid; int cycled_through = 0, nb_units; + parsec_atomic_lock(&gdata->lock); /* Let's start with the last remembered free slot */ current_tid = gdata->next_tid; nb_units = (size + gdata->unit_size - 1) / gdata->unit_size; @@ -82,13 +83,10 @@ void *zone_malloc(zone_malloc_t *gdata, size_t size) current_segment = SEGMENT_AT_TID(gdata, current_tid); if( NULL == current_segment ) { /* Maybe there is a free slot in the beginning. Let's cycle at least once before we bail out */ - if( cycled_through == 0 ) { - current_tid = 0; - cycled_through = 1; - current_segment = SEGMENT_AT_TID(gdata, current_tid); - } else { - return NULL; - } + if( 0 != cycled_through ) break; + current_tid = 0; + cycled_through = 1; + current_segment = SEGMENT_AT_TID(gdata, current_tid); } if( current_segment->status == SEGMENT_EMPTY && current_segment->nb_units >= nb_units ) { @@ -111,12 +109,14 @@ void *zone_malloc(zone_malloc_t *gdata, size_t size) current_segment->nb_units = nb_units; } - return (void*)(gdata->base + (current_tid * gdata->unit_size)); + parsec_atomic_unlock(&gdata->lock); + return (void *)(gdata->base + (current_tid * gdata->unit_size)); } current_tid += current_segment->nb_units; } while( current_tid != gdata->next_tid ); + parsec_atomic_unlock(&gdata->lock); return NULL; } diff --git a/parsec/utils/zone_malloc.h b/parsec/utils/zone_malloc.h index a775c6372..c79a67227 100644 --- a/parsec/utils/zone_malloc.h +++ b/parsec/utils/zone_malloc.h @@ -8,7 +8,7 @@ #define _ZONE_MALLOC_H_ #include "parsec/parsec_config.h" - +#include "parsec/include/parsec/sys/atomic.h" #include #include @@ -30,6 +30,7 @@ typedef struct zone_malloc_s { size_t unit_size; /* Basic Unit */ int max_segment; /* Maximum number of segment */ int next_tid; /* Next TID to look at for a malloc */ + parsec_atomic_lock_t lock; } zone_malloc_t; diff --git a/tests/apps/haar_tree/project_dyn.jdf b/tests/apps/haar_tree/project_dyn.jdf index a34a9cf80..878b74489 100644 --- a/tests/apps/haar_tree/project_dyn.jdf +++ b/tests/apps/haar_tree/project_dyn.jdf @@ -135,7 +135,7 @@ static int my_project_dyn_startup(parsec_execution_stream_t * es, __parsec_proje new_task->data._f_RL.data_out = NULL; new_task->data._f_NODE.source_repo = NULL; new_task->data._f_NODE.source_repo_entry = NULL; - chunk = parsec_arena_get_copy(__tp->super.arenas_datatypes[PARSEC_project_dyn_DEFAULT_ADT_IDX].arena, 1, 0, __tp->super.arenas_datatypes[PARSEC_project_dyn_DEFAULT_ADT_IDX].opaque_dtt); + chunk = parsec_arena_get_new_copy(__tp->super.arenas_datatypes[PARSEC_project_dyn_DEFAULT_ADT_IDX].arena, 1, 0, __tp->super.arenas_datatypes[PARSEC_project_dyn_DEFAULT_ADT_IDX].opaque_dtt); chunk->original->owner_device = 0; new_task->data._f_NODE.data_out = chunk; new_task->data._f_NODE.data_in = chunk; diff --git a/tests/class/atomics.c b/tests/class/atomics.c index fc7f0ecbb..74c7e0695 100644 --- a/tests/class/atomics.c +++ b/tests/class/atomics.c @@ -242,9 +242,9 @@ 
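The new parsec_atomic_lock_t in zone_malloc_t is what allows different runtime threads to allocate from the same zone concurrently. The sketch below is a minimal smoke test of that property, assuming a PaRSEC build tree for the header; it is not part of the patch, and since only zone_malloc() is shown taking the lock in this hunk, the sketch exercises allocation only and omits cleanup.

/* Sketch only: concurrent allocations from one zone, which is only safe now
 * that zone_malloc() serializes on gdata->lock. zone_free() is not shown
 * taking the lock in this hunk, so the sketch sticks to allocation. */
#include "parsec/utils/zone_malloc.h"
#include <pthread.h>
#include <stdlib.h>

#define UNIT 256
#define NSEG 1024

static zone_malloc_t *zone;

static void *alloc_worker(void *arg)
{
    for (int i = 0; i < 64; i++)
        (void)zone_malloc(zone, UNIT);   /* would race on next_tid/segments without the lock */
    return arg;
}

int main(void)
{
    void *base = malloc((size_t)UNIT * NSEG);
    zone = zone_malloc_init(base, NSEG, UNIT);   /* base, max_segment, unit_size */
    pthread_t t[4];
    for (int i = 0; i < 4; i++) pthread_create(&t[i], NULL, alloc_worker, NULL);
    for (int i = 0; i < 4; i++) pthread_join(t[i], NULL);
    free(base);   /* the zone descriptor itself is leaked in this sketch */
    return 0;
}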
int main(int argc, char *argv[]) } } - threads = calloc(sizeof(pthread_t), nb_tests); - params = calloc(sizeof(param_t), nb_tests); - values = calloc(sizeof(values_t), nb_tests+2); + threads = calloc(nb_tests, sizeof(pthread_t)); + params = calloc(nb_tests, sizeof(param_t)); + values = calloc(nb_tests+2, sizeof(values_t)); gettimeofday(&now, NULL); srand48(now.tv_usec ^ getpid()); diff --git a/tests/class/future.c b/tests/class/future.c index ff3ee4f05..887ca9e14 100644 --- a/tests/class/future.c +++ b/tests/class/future.c @@ -119,7 +119,7 @@ int main(int argc, char* argv[]) c_fut = PARSEC_OBJ_NEW(parsec_countable_future_t); parsec_future_init(c_fut, NULL, cores); printf("running with %d cores and %d copies\n", cores, ncopy); - threads = calloc(sizeof(pthread_t), cores); + threads = calloc(cores, sizeof(pthread_t)); fut_array = malloc(cores*ncopy*sizeof(parsec_base_future_t*)); data = malloc(cores*ncopy*sizeof(int)); diff --git a/tests/class/future_datacopy.c b/tests/class/future_datacopy.c index effad1fe7..28af1262d 100644 --- a/tests/class/future_datacopy.c +++ b/tests/class/future_datacopy.c @@ -162,7 +162,7 @@ int main(int argc, char* argv[]) } printf("running with %d cores and %d copies\n", cores, ncopy); - threads = calloc(sizeof(pthread_t), cores); + threads = calloc(cores, sizeof(pthread_t)); fut_array = malloc(ncopy*sizeof(parsec_datacopy_future_t*)); data = malloc(cores*ncopy*sizeof(int)); diff --git a/tests/class/hash.c b/tests/class/hash.c index 850713ee7..8fe64e466 100644 --- a/tests/class/hash.c +++ b/tests/class/hash.c @@ -597,9 +597,9 @@ int main(int argc, char *argv[]) } } - threads = calloc(sizeof(pthread_t), maxthreads); - params = calloc(sizeof(param_t), maxthreads+1); - keys = calloc(sizeof(uint64_t), nb_tests); + threads = calloc(maxthreads, sizeof(pthread_t)); + params = calloc(maxthreads+1, sizeof(param_t)); + keys = calloc(nb_tests, sizeof(uint64_t)); init_keys(keys, nb_tests, seed, structured_keys); for(md_tuning = md_tuning_min; md_tuning < md_tuning_max; md_tuning += md_tuning_inc) { diff --git a/tests/class/lifo.c b/tests/class/lifo.c index 907d215d7..544d679cf 100644 --- a/tests/class/lifo.c +++ b/tests/class/lifo.c @@ -254,8 +254,8 @@ int main(int argc, char *argv[]) } } - threads = (pthread_t*)calloc(sizeof(pthread_t), nbthreads); - times = (uint64_t*)calloc(sizeof(uint64_t), nbthreads); + threads = (pthread_t*)calloc(nbthreads, sizeof(pthread_t)); + times = (uint64_t*)calloc(nbthreads, sizeof(uint64_t)); PARSEC_OBJ_CONSTRUCT(&lifo1, parsec_lifo_t); PARSEC_OBJ_CONSTRUCT(&lifo2, parsec_lifo_t); diff --git a/tests/class/list.c b/tests/class/list.c index d85a728bb..6451eb5ee 100644 --- a/tests/class/list.c +++ b/tests/class/list.c @@ -299,8 +299,8 @@ int main(int argc, char *argv[]) } } - threads = (pthread_t*)calloc(sizeof(pthread_t), nbthreads); - times = (uint64_t*)calloc(sizeof(uint64_t), nbthreads); + threads = (pthread_t*)calloc(nbthreads, sizeof(pthread_t)); + times = (uint64_t*)calloc(nbthreads, sizeof(uint64_t)); PARSEC_OBJ_CONSTRUCT( &l1, parsec_list_t ); PARSEC_OBJ_CONSTRUCT( &l2, parsec_list_t ); diff --git a/tests/class/rwlock.c b/tests/class/rwlock.c index d16a221ec..d0a06f431 100644 --- a/tests/class/rwlock.c +++ b/tests/class/rwlock.c @@ -138,7 +138,7 @@ int main(int argc, char *argv[]) parsec_atomic_rwlock_init(&rwlock); - threads = (pthread_t*)calloc(sizeof(pthread_t), maxthreads); + threads = (pthread_t*)calloc(maxthreads, sizeof(pthread_t)); for( nbthreads = minthreads; nbthreads < maxthreads; nbthreads++) { parsec_barrier_init(&barrier, 
NULL, nbthreads+1); diff --git a/tests/collections/redistribute/redistribute_check2.jdf b/tests/collections/redistribute/redistribute_check2.jdf index fcb4c0b52..0693f5b08 100644 --- a/tests/collections/redistribute/redistribute_check2.jdf +++ b/tests/collections/redistribute/redistribute_check2.jdf @@ -143,7 +143,7 @@ int parsec_redistribute_check2(parsec_context_t *parsec, int nb_threads = parsec->virtual_processes[0]->nb_cores; /* Used for error accumulation */ - long long int *info = (long long int *)calloc(sizeof(long long int), nb_threads); + long long int *info = (long long int *)calloc(nb_threads, sizeof(long long int)); parsec_redistribute_check2 = parsec_redistribute_check2_New( (parsec_tiled_matrix_t *)dcY, diff --git a/tests/collections/reshape/common.c b/tests/collections/reshape/common.c index 04b2cd825..3ade805d6 100644 --- a/tests/collections/reshape/common.c +++ b/tests/collections/reshape/common.c @@ -158,7 +158,7 @@ int reshape_set_matrix_value_position_swap(parsec_execution_stream_t *es, int check_matrix_equal(parsec_matrix_block_cyclic_t dcA, parsec_matrix_block_cyclic_t dcA_check){ int ret = 0; - for(int i=0; i < dcA_check.super.nb_local_tiles * dcA_check.super.bsiz; i++){ + for(size_t i = 0; i < (dcA_check.super.nb_local_tiles * dcA_check.super.bsiz); i++) { if( ((int*)dcA.mat)[i] != ((int*)dcA_check.mat)[i]){ ret = 1; break; diff --git a/tests/dsl/dtd/dtd_test_cuda_task_insert.c b/tests/dsl/dtd/dtd_test_cuda_task_insert.c index 9b689db73..9b7d9783d 100644 --- a/tests/dsl/dtd/dtd_test_cuda_task_insert.c +++ b/tests/dsl/dtd/dtd_test_cuda_task_insert.c @@ -622,7 +622,7 @@ int get_nb_cuda_devices() for( int dev = 0; dev < (int)parsec_nb_devices; dev++ ) { parsec_device_module_t *device = parsec_mca_device_get(dev); - if( PARSEC_DEV_CUDA == device->type ) { + if( PARSEC_DEV_CUDA & device->type ) { nb++; } } @@ -638,7 +638,7 @@ int *get_cuda_device_index() int i = 0; for( int dev = 0; dev < (int)parsec_nb_devices; dev++ ) { parsec_device_module_t *device = parsec_mca_device_get(dev); - if( PARSEC_DEV_CUDA == device->type ) { + if( PARSEC_DEV_CUDA & device->type ) { dev_index[i++] = device->device_index; } } diff --git a/tests/dsl/dtd/dtd_test_new_tile.c b/tests/dsl/dtd/dtd_test_new_tile.c index 36f6b99da..9bf463495 100644 --- a/tests/dsl/dtd/dtd_test_new_tile.c +++ b/tests/dsl/dtd/dtd_test_new_tile.c @@ -277,7 +277,7 @@ int main(int argc, char **argv) #if defined(PARSEC_HAVE_DEV_CUDA_SUPPORT) && defined(PARSEC_HAVE_CU_COMPILER) for(unsigned int i = 0; i < parsec_nb_devices; i++) { parsec_device_module_t *dev = parsec_mca_device_get(i); - if( dev->type == PARSEC_DEV_CUDA ) + if( dev->type & PARSEC_DEV_CUDA ) nb_gpus++; } if(nb_gpus > 0) { @@ -286,7 +286,7 @@ int main(int argc, char **argv) nb_gpus = 0; for(unsigned int i = 0; i < parsec_nb_devices; i++) { parsec_device_module_t *dev = parsec_mca_device_get(i); - if( dev->type == PARSEC_DEV_CUDA) { + if( dev->type & PARSEC_DEV_CUDA) { cudaError_t status; parsec_device_cuda_module_t *gpu_device = (parsec_device_cuda_module_t *)dev; status = cudaSetDevice( gpu_device->cuda_index ); @@ -378,7 +378,7 @@ int main(int argc, char **argv) parsec_dtd_task_class_add_chore(dtd_tp, fourth_tc, PARSEC_DEV_CPU, cpu_reduce); parsec_dtd_tile_t **new_tiles; - new_tiles = (parsec_dtd_tile_t**)calloc(sizeof(parsec_dtd_tile_t *), NCASE*world); + new_tiles = (parsec_dtd_tile_t**)calloc(NCASE*world, sizeof(parsec_dtd_tile_t *)); acc = 0; diff --git a/tests/dsl/dtd/dtd_test_simple_gemm.c b/tests/dsl/dtd/dtd_test_simple_gemm.c index 
9ac71a079..f15aba062 100644 --- a/tests/dsl/dtd/dtd_test_simple_gemm.c +++ b/tests/dsl/dtd/dtd_test_simple_gemm.c @@ -327,7 +327,7 @@ int get_nb_gpu_devices() for( int dev = 0; dev < (int)parsec_nb_devices; dev++ ) { parsec_device_module_t *device = parsec_mca_device_get(dev); - if( PARSEC_DEV_CUDA == device->type ) { + if( PARSEC_DEV_CUDA & device->type ) { nb++; } } @@ -343,7 +343,7 @@ int *get_gpu_device_index() int i = 0; for( int dev = 0; dev < (int)parsec_nb_devices; dev++ ) { parsec_device_module_t *device = parsec_mca_device_get(dev); - if( PARSEC_DEV_CUDA == device->type ) { + if( PARSEC_DEV_CUDA & device->type ) { dev_index[i++] = device->device_index; } } diff --git a/tests/dsl/ptg/branching/branching_data.c b/tests/dsl/ptg/branching/branching_data.c index e362cb5c8..4aeffc9c9 100644 --- a/tests/dsl/ptg/branching/branching_data.c +++ b/tests/dsl/ptg/branching/branching_data.c @@ -98,7 +98,7 @@ parsec_data_collection_t *create_and_distribute_data(int rank, int world, int si #endif parsec_type_create_contiguous(size, parsec_datatype_int32_t, &d->default_dtt); - m->data = calloc(sizeof(parsec_data_t*), nb); + m->data = calloc(nb, sizeof(parsec_data_t*)); m->nt = nb; m->size = size; m->ptr = (int32_t*)malloc(nb * size * sizeof(int32_t)); diff --git a/tests/dsl/ptg/choice/main.c b/tests/dsl/ptg/choice/main.c index 29988fd1d..a8066a9df 100644 --- a/tests/dsl/ptg/choice/main.c +++ b/tests/dsl/ptg/choice/main.c @@ -68,7 +68,7 @@ int main(int argc, char *argv[]) dcA = create_and_distribute_data(rank, world, size); parsec_data_collection_set_key(dcA, "A"); - decision = (int*)calloc(sizeof(int), nb+1); + decision = (int*)calloc(nb+1, sizeof(int)); choice = choice_new(dcA, size, decision, nb, world); rc = parsec_context_add_taskpool(parsec, choice); diff --git a/tests/profiling/async.jdf b/tests/profiling/async.jdf index 487c2ea21..de24694c8 100644 --- a/tests/profiling/async.jdf +++ b/tests/profiling/async.jdf @@ -211,7 +211,7 @@ int main( int argc, char** argv ) profiling_save_iinfo("NB", NB); #endif - taskqueue = (parsec_task_t**)calloc(sizeof(parsec_task_t*), (NB+1)); + taskqueue = (parsec_task_t**)calloc(NB+1, sizeof(parsec_task_t*)); /** * Build the data and the arena to hold it up. 
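The calloc() argument swaps in these test files do not change the number of bytes allocated; they restore the standard calloc(nmemb, size) order, which recent GCC releases check with -Wcalloc-transposed-args and which reads naturally as "n objects of this size". A one-line reminder of the convention, not taken from the patch:

#include <stdlib.h>
#include <pthread.h>

int main(void)
{
    size_t nbthreads = 8;
    /* element count first, element size second: matches the C prototype and
     * stays quiet under GCC's -Wcalloc-transposed-args; the byte count is
     * the same either way. */
    pthread_t *threads = calloc(nbthreads, sizeof(pthread_t));
    free(threads);
    return 0;
}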
*/ diff --git a/tests/runtime/cuda/CMakeLists.txt b/tests/runtime/cuda/CMakeLists.txt index fbb5a5022..fdfbf9f89 100644 --- a/tests/runtime/cuda/CMakeLists.txt +++ b/tests/runtime/cuda/CMakeLists.txt @@ -22,7 +22,12 @@ if(PARSEC_HAVE_CUDA) endif( NOT TARGET CUDA::cublas ) # Testing for geting best device - parsec_addtest_executable(C testing_get_best_device SOURCES "testing_get_best_device.c") + parsec_addtest_executable(C testing_get_best_device SOURCES testing_get_best_device.c) target_include_directories(testing_get_best_device PRIVATE $<$:${CMAKE_CURRENT_SOURCE_DIR}>) target_ptg_sources(testing_get_best_device PRIVATE "get_best_device_check.jdf") + + # Testing for communications to and from GPU + parsec_addtest_executable(C cuda_rtt SOURCES rtt.c rtt_main.c) + target_include_directories(cuda_rtt PRIVATE $<$:${CMAKE_CURRENT_SOURCE_DIR}>) + target_ptg_sources(cuda_rtt PRIVATE "rtt.jdf") endif(PARSEC_HAVE_CUDA) diff --git a/tests/runtime/cuda/get_best_device_check.jdf b/tests/runtime/cuda/get_best_device_check.jdf index 65025d8e1..b3167c451 100644 --- a/tests/runtime/cuda/get_best_device_check.jdf +++ b/tests/runtime/cuda/get_best_device_check.jdf @@ -148,7 +148,7 @@ parsec_get_best_device_check_New(parsec_tiled_matrix_t *dcA, int *info) int nb = 0; for(int i = 0; i < (int)parsec_nb_devices; i++) { parsec_device_module_t *device = parsec_mca_device_get(i); - if( PARSEC_DEV_CUDA == device->type ) { + if( PARSEC_DEV_CUDA & device->type ) { nb++; } } @@ -160,7 +160,7 @@ parsec_get_best_device_check_New(parsec_tiled_matrix_t *dcA, int *info) nb = 0; for(int i = 0; i < (int)parsec_nb_devices; i++) { parsec_device_module_t *device = parsec_mca_device_get(i); - if( PARSEC_DEV_CUDA == device->type ) { + if( PARSEC_DEV_CUDA & device->type ) { dev_index[nb++] = device->device_index; } } diff --git a/tests/runtime/cuda/nvlink_wrapper.c b/tests/runtime/cuda/nvlink_wrapper.c index abc4b19c9..3b95a4d80 100644 --- a/tests/runtime/cuda/nvlink_wrapper.c +++ b/tests/runtime/cuda/nvlink_wrapper.c @@ -72,7 +72,7 @@ __parsec_nvlink_destructor( parsec_nvlink_taskpool_t* nvlink_taskpool) userM = nvlink_taskpool->_g_userM; for(g = 0, dev = 0; dev < (int)parsec_nb_devices; dev++) { parsec_device_cuda_module_t *cuda_device = (parsec_device_cuda_module_t*)parsec_mca_device_get(dev); - if( PARSEC_DEV_CUDA == cuda_device->super.super.type ) { + if( PARSEC_DEV_CUDA & cuda_device->super.super.type ) { parsec_data_t *dta = ((parsec_dc_t*)userM)->data_of((parsec_dc_t*)userM, g, userM->super.super.myrank); parsec_data_copy_t *gpu_copy = parsec_data_get_copy(dta, cuda_device->super.super.device_index); cudaError_t status = cudaSetDevice( cuda_device->cuda_index ); @@ -108,7 +108,7 @@ parsec_taskpool_t* testing_nvlink_New( parsec_context_t *ctx, int depth, int mb nb = 0; for(dev = 0; dev < (int)parsec_nb_devices; dev++) { parsec_device_module_t *device = parsec_mca_device_get(dev); - if( PARSEC_DEV_CUDA == device->type ) { + if( PARSEC_DEV_CUDA & device->type ) { dev_index[nb++] = device->device_index; } } @@ -168,7 +168,7 @@ parsec_taskpool_t* testing_nvlink_New( parsec_context_t *ctx, int depth, int mb * in the JDF, this also pins the task on the GPU that we chose to host the tile */ for(int g = 0, dev = 0; dev < (int)parsec_nb_devices; dev++) { parsec_device_cuda_module_t *cuda_device = (parsec_device_cuda_module_t*)parsec_mca_device_get(dev); - if( PARSEC_DEV_CUDA == cuda_device->super.super.type ) { + if( PARSEC_DEV_CUDA & cuda_device->super.super.type ) { /* We get the data from the data collection */ parsec_data_t 
*dta = ((parsec_dc_t*)userM)->data_of((parsec_dc_t*)userM, g, ctx->my_rank); /* The corresponding data copy on CPU RAM */ diff --git a/tests/runtime/cuda/rtt.jdf b/tests/runtime/cuda/rtt.jdf new file mode 100644 index 000000000..a7b3a6cdd --- /dev/null +++ b/tests/runtime/cuda/rtt.jdf @@ -0,0 +1,26 @@ +extern "C" %{ + /* This simple example does not need to include anything */ +%} + +%option no_taskpool_instance = true /* can be anything */ + +NT +WS + +PING(k) + +k = 0 .. NT-1 +: A(0, k % WS) + +RW T <- (k == 0) ? A(0, k % WS) : T PING(k-1) + -> (k < NT) ? T PING(k+1) + +; 0 + +BODY [type = CUDA] + printf("ping(%d)\n", k); + /* + int r; MPI_Comm_rank(MPI_COMM_WORLD, &r); + printf("%d: PING(%d)\n", r, k); + */ +END diff --git a/tests/runtime/cuda/rtt_main.c b/tests/runtime/cuda/rtt_main.c new file mode 100644 index 000000000..0d2f5ae41 --- /dev/null +++ b/tests/runtime/cuda/rtt_main.c @@ -0,0 +1,236 @@ +/** + * Copyright (c) 2019-2024 The University of Tennessee and The University + * of Tennessee Research Foundation. All rights + * reserved. + */ + +#include "parsec.h" +#include "parsec/data_distribution.h" +#include "parsec/data_dist/matrix/matrix.h" +#include "parsec/data_dist/matrix/two_dim_rectangle_cyclic.h" +#include "parsec/include/parsec/execution_stream.h" +#include "parsec/utils/mca_param.h" + +#include "rtt.h" + +#if defined(DISTRIBUTED) +#include +#endif + #include + +static int nb_gpus = 1, gpu_mask = 0xff; +static int cuda_device_index_len = 0, *cuda_device_index = NULL; + +/** + * @brief init operator + * + * @param [in] es: execution stream + * @param [in] descA: tiled matrix date descriptor + * @param [inout] A: inout data + * @param [in] uplo: matrix shape + * @param [in] m: tile row index + * @param [in] n: tile column index + * @param [in] args: NULL + */ +static int matrix_init_ops(parsec_execution_stream_t *es, + const parsec_tiled_matrix_t *descA, + void *_A, parsec_matrix_uplo_t uplo, + int m, int n, void *args) +{ + memset(_A, 1, m*n); + + /* Address warning when compile */ +#if 1 + parsec_data_key_t key = descA->super.data_key((parsec_data_collection_t*)descA, m, n); + parsec_data_t* data = descA->super.data_of_key((parsec_data_collection_t*)descA, key); + parsec_advise_data_on_device(data, + cuda_device_index[m % cuda_device_index_len], + PARSEC_DEV_DATA_ADVICE_PREFERRED_DEVICE); +#endif + (void)es; (void)uplo;(void)n;(void)m;(void)args; + return 0; +} + +static void +__parsec_rtt_destructor(parsec_rtt_taskpool_t *rtt_tp) +{ + parsec_type_free(&(rtt_tp->arenas_datatypes[PARSEC_rtt_DEFAULT_ADT_IDX].opaque_dtt)); +} + +PARSEC_OBJ_CLASS_INSTANCE(parsec_rtt_taskpool_t, parsec_taskpool_t, + NULL, __parsec_rtt_destructor); + +parsec_taskpool_t *rtt_New(parsec_context_t *ctx, parsec_matrix_block_cyclic_t *dcA, + parsec_datatype_t block, int roundtrips) +{ + parsec_rtt_taskpool_t *tp = NULL; + + tp = parsec_rtt_new((parsec_data_collection_t*)dcA, roundtrips, ctx->nb_nodes); + tp->arenas_datatypes[PARSEC_rtt_DEFAULT_ADT_IDX].opaque_dtt = block; + + ptrdiff_t lb, extent; + parsec_type_extent(block, &lb, &extent); + + parsec_arena_datatype_construct(&tp->arenas_datatypes[PARSEC_rtt_DEFAULT_ADT_IDX], + extent, PARSEC_ARENA_ALIGNMENT_SSE, + block); + return (parsec_taskpool_t *)tp; +} + +int main(int argc, char *argv[]) +{ + parsec_context_t *parsec = NULL; + parsec_taskpool_t *tp; + int size = 1, rank = 0, loops = 100, frags = 1, nb_runs = 1, cores = 2, do_sleep = 0, ch, use_opt = 1; + struct timeval tstart, tend; + size_t msg_size = 8*1024; + double t, bw; + + while ((ch = 
getopt(argc, argv, "c:g:G:l:f:m:n:s:")) != -1) { + switch (ch) { + case 'c': cores = atoi(optarg); use_opt += 2; break; + case 'g': nb_gpus = atoi(optarg); use_opt += 2; break; + case 'G': gpu_mask = atoi(optarg); use_opt += 2; break; + case 'l': loops = atoi(optarg); use_opt += 2; break; + case 'f': frags = atoi(optarg); use_opt += 2; break; + case 'm': msg_size = (size_t)atoi(optarg); use_opt += 2; break; + case 'n': nb_runs = atoi(optarg); use_opt += 2; break; + case 's': do_sleep = atoi(optarg); use_opt += 2; break; + default: + fprintf(stderr, + "-c : number of cores to use (default 2)\n" + "-g : number of GPU to use (default 1)\n" + "-G : GPU mask to use (-1 to modulo rank per node)\n" + "-l : loops of bandwidth(default: 100)\n" + "-f : frags, number of fragments (default: 1)\n" + "-m : size, size of message (default: 1024 * 8)\n" + "-n : number of runs (default: 1)\n" + "-s : number of seconds to sleep before running the tests\n" + "\n"); + exit(1); + } + } + /* Remove all options already acknowledged */ + if( NULL == argv[optind] ) { + argc = 1; + } else { + memcpy(&argv[1], &argv[use_opt+1], (argc - use_opt) * sizeof(char*)); + argc -= use_opt; + } + argv[argc] = NULL; +#if defined(DISTRIBUTED) + { + int provided; + MPI_Init_thread(NULL, NULL, MPI_THREAD_SERIALIZED, &provided); + } + MPI_Comm_size(MPI_COMM_WORLD, &size); + MPI_Comm_rank(MPI_COMM_WORLD, &rank); +#if defined(PARSEC_HAVE_DEV_CUDA_SUPPORT) + if (0xFF == gpu_mask) { + extern char **environ; + MPI_Comm local_comm; + int local_rank, local_size; + MPI_Comm_split_type(MPI_COMM_WORLD, MPI_COMM_TYPE_SHARED, 0, + MPI_INFO_NULL, &local_comm); + MPI_Comm_rank(local_comm, &local_rank); + MPI_Comm_size(local_comm, &local_size); + MPI_Comm_free(&local_comm); + int gpu_mask = 0; + for (int i = 0; i < nb_gpus; i++) { + gpu_mask |= ((1 << local_rank) << i); + } + char *value; + asprintf(&value, "%d", gpu_mask); + parsec_setenv_mca_param("device_cuda_mask", value, &environ); + free(value); + value = NULL; + } +#endif +#endif /* DISTRIBUTED */ + if( 0 == rank ) { + printf("Running %d tests of %d steps RTT with a data of size %zu\n", + nb_runs, loops, msg_size); + } + parsec = parsec_init(cores, &argc, &argv); + + /* can the test run? 
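Both this test and stress_main.c further down derive a per-process device_cuda_mask from the node-local rank obtained through MPI_Comm_split_type. The sketch below isolates that step; it is illustrative only. Note, as a plain arithmetic consequence of ((1 << local_rank) << i), the masks of neighboring local ranks overlap as soon as nb_gpus > 1 (with nb_gpus = 2, local ranks 0 and 1 both select device 1), so the per-rank masks are only disjoint when each process drives a single GPU.

#include <mpi.h>

/* Sketch only: compute the node-local rank used above to build the CUDA
 * device mask. With nb_gpus = 1, node-local rank r maps to device r. */
static int node_local_rank(MPI_Comm comm)
{
    MPI_Comm local;
    int lrank;
    MPI_Comm_split_type(comm, MPI_COMM_TYPE_SHARED, 0, MPI_INFO_NULL, &local);
    MPI_Comm_rank(local, &lrank);
    MPI_Comm_free(&local);
    return lrank;
}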
*/ + nb_gpus = parsec_context_query(parsec, PARSEC_CONTEXT_QUERY_DEVICES, PARSEC_DEV_CUDA); + assert(nb_gpus >= 0); + if(nb_gpus == 0) { + parsec_warning("This test can only run if at least one GPU device is present"); + exit(-PARSEC_ERR_DEVICE); + } + cuda_device_index = (int *)malloc(parsec_nb_devices * sizeof(int)); + cuda_device_index_len = 0; + for (int dev = 0; dev < (int)parsec_nb_devices; dev++) { + parsec_device_module_t *device = parsec_mca_device_get(dev); + if (PARSEC_DEV_CUDA & device->type) { + cuda_device_index[cuda_device_index_len++] = device->device_index; + } + } + + parsec_datatype_t block; + size_t mb = sqrt(msg_size), nb = msg_size / mb; + if (mb <= 0) { + fprintf(stderr, "To work, RTT must do at least one round time trip of at least one byte\n"); + exit(-1); + } + + parsec_matrix_block_cyclic_t* dcA = (parsec_matrix_block_cyclic_t *)calloc(1, sizeof(parsec_matrix_block_cyclic_t)); + parsec_matrix_block_cyclic_init(dcA, PARSEC_MATRIX_BYTE, PARSEC_MATRIX_TILE, + parsec->my_rank, + mb, nb, + mb, parsec->nb_nodes * nb, + 0, 0, + mb, parsec->nb_nodes * nb, + 1, parsec->nb_nodes, 1, 1, + 0, 0); + dcA->mat = parsec_data_allocate((size_t)dcA->super.nb_local_tiles * + (size_t)dcA->super.bsiz * + (size_t)parsec_datadist_getsizeoftype(dcA->super.mtype)); + parsec_data_collection_set_key((parsec_data_collection_t *)dcA, "A"); + + parsec_type_create_contiguous(mb * nb, parsec_datatype_uint8_t, &block); + + /* Initialize and place the dcA */ + parsec_apply(parsec, PARSEC_MATRIX_FULL, + (parsec_tiled_matrix_t *)dcA, + (parsec_tiled_matrix_unary_op_t)matrix_init_ops, NULL); + + + if( do_sleep ) { + sleep(do_sleep); + } +#if defined(PARSEC_HAVE_MPI) + MPI_Barrier(MPI_COMM_WORLD); +#endif /* defined(PARSEC_HAVE_MPI) */ + gettimeofday(&tstart, NULL); + for( int test_id = 0; test_id < nb_runs; test_id++ ) { + tp = rtt_New(parsec, dcA, block, loops); + if( NULL != tp ) { + parsec_context_add_taskpool(parsec, tp); + parsec_context_start(parsec); + parsec_context_wait(parsec); + parsec_taskpool_free(tp); + } + } +#if defined(PARSEC_HAVE_MPI) + MPI_Barrier(MPI_COMM_WORLD); +#endif /* defined(PARSEC_HAVE_MPI) */ + gettimeofday(&tend, NULL); + + if( 0 == rank ) { + t = ((tend.tv_sec - tstart.tv_sec) * 1000000.0 + (tend.tv_usec - tstart.tv_usec)) / 1000000.0; /* in seconds */ + double total_payload = (double)nb_runs * (double)loops * (double)msg_size / 1024.0 / 1024.0 / 1024.0; + bw = total_payload / t; + printf("%d\t%d\t%d\t%zu\t%08.4g s\t%4.8g GB/s\n", nb_runs, frags, loops, msg_size*sizeof(uint8_t), t, bw); + } + + free(cuda_device_index); cuda_device_index = NULL; + cuda_device_index_len = 0; + parsec_fini(&parsec); +#if defined(DISTRIBUTED) + MPI_Finalize(); +#endif /* DISTRIBUTED */ + return 0; +} diff --git a/tests/runtime/cuda/stage_custom.jdf b/tests/runtime/cuda/stage_custom.jdf index 7df99800f..986c443db 100644 --- a/tests/runtime/cuda/stage_custom.jdf +++ b/tests/runtime/cuda/stage_custom.jdf @@ -45,7 +45,7 @@ stage_stride_in(parsec_gpu_task_t *gtask, dc = (parsec_tiled_matrix_t*)gtask->flow_dc[i]; elem_sz = parsec_datadist_getsizeoftype(dc->mtype); in_elem_dev = (parsec_device_gpu_module_t*)parsec_mca_device_get( copy_in->device_index); - if(in_elem_dev->super.type != PARSEC_DEV_CUDA ){ + if( !(in_elem_dev->super.type & PARSEC_DEV_CUDA) ) { /* copy width bytes heigth times, skipping pitch - width bytes every time */ size_t dpitch = dc->mb * elem_sz; size_t spitch = dc->llm * elem_sz; @@ -117,6 +117,28 @@ typedef void (*cublas_dgemm_t) ( char TRANSA, char TRANSB, int m, int n, int 
k, /* Pre-declare function used as a property of some parameterized task */ static int64_t gemm_time_estimate(const parsec_task_t *task, parsec_device_module_t *dev); +typedef struct deallocation_helper_s { + parsec_assignment_t m; + parsec_assignment_t k; + void** my_storage[3]; + } free_helper_t; + +static int +complete_batched_callback(parsec_device_gpu_module_t *dev, + parsec_gpu_task_t ** gpu_task, + parsec_gpu_exec_stream_t *gpu_stream) +{ + PARSEC_DEBUG_VERBOSE(10, parsec_debug_output, "complete_batched_callback for batched task %p on stream %s{%p}\n", + gpu_task, gpu_stream->name, (void*)gpu_stream); + (void)dev; (void) gpu_task; (void)gpu_stream; + parsec_list_item_t* output_stream_ghost = &dev->exec_stream[1]->fifo_pending->ghost_element; + parsec_list_item_ring_merge(output_stream_ghost, &(*gpu_task)->list_item); + free_helper_t* helper = (free_helper_t*)(((*gpu_task)->ec)->locals); (void)helper; + //printf(" release %p and %p and %p\n", helper->my_storage[0], helper->my_storage[1], helper->my_storage[2]); + (*gpu_task)->complete_stage = NULL; + *gpu_task = NULL; + return PARSEC_HOOK_RETURN_DONE; +} %} %option no_taskpool_instance = true /* can be anything */ @@ -146,8 +168,48 @@ RW A <- descA(m, k) -> A TASK_CHECK(m,k) BODY [type=CUDA + batch = true dyld=cublasDgemm dyldtype=cublas_dgemm_t] { + parsec_list_item_singleton(&gpu_task->list_item); + parsec_list_item_t* store_back = NULL; + if( !parsec_list_nolock_is_empty(gpu_stream->fifo_pending) ) { /* more than one gpu_task in the ring */ + int how_many = 1; /* we start with the current gpu_task */ + do { + parsec_list_item_t* item = parsec_list_pop_front(gpu_stream->fifo_pending); + parsec_list_item_singleton(item); + parsec_gpu_task_t* task = (parsec_gpu_task_t*)item; + + if( gpu_task->ec->task_class == task->ec->task_class ) { + /* same task class as the current one, possible to batch */ + (void)parsec_list_item_ring_push(&gpu_task->list_item, (parsec_list_item_t*)task); + how_many++; /* one more into the batch */ + PARSEC_DEBUG_VERBOSE(10, parsec_debug_output, "Add task %p to the %p batch on stream %s{%p}\n", + task, gpu_task, gpu_stream->name, (void*)gpu_stream); + if( 5 == how_many ) { + /* let's stop here for now */ + break; + } + } else { + if( NULL == store_back ) { + store_back = item; + } else { + parsec_list_item_ring_push(store_back, item); /* build the list of un-batcheable tasks */ + } + } + } while( !parsec_list_nolock_is_empty(gpu_stream->fifo_pending) ); + /* we now have two separated task rings: the gpu_task with all the tasks that will be batched + * and the ring that has all the remaining items in the list (including the list's ghost_elem). + * The remaining list is already stored in the gpu_stream->fifo_pending. 
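The batching logic in the BODY below leans on three parsec_list_item helpers; the fragment after this note recaps their effect outside of the BODY, since the semantics (singleton ring, circular chaining, walking the ring once) are what make the drain-and-requeue sequence correct. This is a recap written for illustration, not code from the patch, and it assumes the usual class headers are available.

#include "parsec/class/list_item.h"

/* Illustrative recap of the ring idioms the batched BODY relies on. */
static void ring_idioms_demo(void)
{
    parsec_list_item_t a, b;
    PARSEC_OBJ_CONSTRUCT(&a, parsec_list_item_t);
    PARSEC_OBJ_CONSTRUCT(&b, parsec_list_item_t);
    parsec_list_item_singleton(&a);        /* a alone forms a ring of one  */
    parsec_list_item_singleton(&b);
    parsec_list_item_ring_push(&a, &b);    /* circular ring: a -> b -> a   */
    parsec_list_item_t *it = &a;
    do {                                   /* each element visited exactly once,
                                            * as the kernel loop does with gpu_task */
        it = (parsec_list_item_t *)it->list_next;
    } while (it != &a);
}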
+ */ + if( how_many > 1 ) + gpu_task->complete_stage = complete_batched_callback; + PARSEC_DEBUG_VERBOSE(10, parsec_debug_output, "submit multiple tasks into one %p on stream %s{%p}\n", + gpu_task, gpu_stream->name, (void*)gpu_stream); + if( NULL != store_back ) { + parsec_list_item_ring_merge(&gpu_stream->fifo_pending->ghost_element, store_back); + } + } double lalpha = 1.0; double lbeta = 2.0; int tempmm = descA->mb; @@ -155,14 +217,26 @@ BODY [type=CUDA cublasStatus_t status; cublasSetKernelStream( parsec_body.stream ); - parsec_body.dyld_fn( 'N', 'N', - tempmm, tempmm, tempmm, - lalpha, (double*)A, ldam, - (double*)A, ldam, - lbeta, (double*)A, ldam ); - status = cublasGetError(); - PARSEC_CUDA_CHECK_ERROR( "cublasDgemm", status, - {return PARSEC_HOOK_RETURN_ERROR;} ); + + parsec_gpu_task_t* current_gpu_task = gpu_task; + free_helper_t* helper = (free_helper_t*)((gpu_task->ec)->locals); + helper->my_storage[0] = (void*)(uintptr_t)0xdeadbeef; + helper->my_storage[1] = (void*)(uintptr_t)0xabcdabcd; + helper->my_storage[2] = (void*)(uintptr_t)0xcdefcdef; + do { + __parsec_stage_custom_TASK_GPU_task_t *task = (__parsec_stage_custom_TASK_GPU_task_t*)current_gpu_task->ec; + _f_A = task->data._f_A.data_out; + A = PARSEC_DATA_COPY_GET_PTR(_f_A); + parsec_body.dyld_fn( 'N', 'N', + tempmm, tempmm, tempmm, + lalpha, (double*)A, ldam, + (double*)A, ldam, + lbeta, (double*)A, ldam ); + status = cublasGetError(); + PARSEC_CUDA_CHECK_ERROR( "cublasDgemm", status, + {return PARSEC_HOOK_RETURN_ERROR;} ); + current_gpu_task = (parsec_gpu_task_t*)current_gpu_task->list_item.list_next; + } while( current_gpu_task != gpu_task ); } END diff --git a/tests/runtime/cuda/stress_main.c b/tests/runtime/cuda/stress_main.c index 31176f4a8..5f060bdfc 100644 --- a/tests/runtime/cuda/stress_main.c +++ b/tests/runtime/cuda/stress_main.c @@ -2,6 +2,7 @@ #include "parsec/data_distribution.h" #include "parsec/data_dist/matrix/matrix.h" #include "parsec/data_dist/matrix/two_dim_rectangle_cyclic.h" +#include "parsec/utils/mca_param.h" #include "stress.h" #include "stress_wrapper.h" @@ -15,7 +16,7 @@ int main(int argc, char *argv[]) parsec_context_t *parsec = NULL; parsec_taskpool_t *tp; int size = 1; - int rank = 0; + int rank = 0, nb_gpus = 1; #if defined(DISTRIBUTED) { @@ -24,6 +25,27 @@ int main(int argc, char *argv[]) } MPI_Comm_size(MPI_COMM_WORLD, &size); MPI_Comm_rank(MPI_COMM_WORLD, &rank); +#if defined(PARSEC_HAVE_DEV_CUDA_SUPPORT) + { + MPI_Comm local_comm; + int local_rank, local_size; + MPI_Comm_split_type(MPI_COMM_WORLD, MPI_COMM_TYPE_SHARED, 0, + MPI_INFO_NULL, &local_comm); + MPI_Comm_rank(local_comm, &local_rank); + MPI_Comm_size(local_comm, &local_size); + MPI_Comm_free(&local_comm); + int gpu_mask = 0; + for (int i = 0; i < nb_gpus; i++) + { + gpu_mask |= ((1 << local_rank) << i); + } + char *value; + asprintf(&value, "%d", gpu_mask); + parsec_setenv_mca_param("device_cuda_mask", value, &environ); + free(value); + value = NULL; + } +#endif /* defined(PARSEC_HAVE_DEV_CUDA_SUPPORT)*/ #endif /* DISTRIBUTED */ parsec = parsec_init(-1, &argc, &argv); diff --git a/tests/runtime/cuda/stress_wrapper.c b/tests/runtime/cuda/stress_wrapper.c index ee40903e0..c688fcda4 100644 --- a/tests/runtime/cuda/stress_wrapper.c +++ b/tests/runtime/cuda/stress_wrapper.c @@ -41,7 +41,7 @@ parsec_taskpool_t* testing_stress_New( parsec_context_t *ctx, int depth, int mb nb = 0; for(dev = 0; dev < (int)parsec_nb_devices; dev++) { parsec_device_module_t *device = parsec_mca_device_get(dev); - if( PARSEC_DEV_CUDA == device->type ) { + 
if( PARSEC_DEV_CUDA & device->type ) { dev_index[nb++] = device->device_index; } } diff --git a/tests/runtime/cuda/testing_get_best_device.c b/tests/runtime/cuda/testing_get_best_device.c index 4bfd43b5d..f42279534 100644 --- a/tests/runtime/cuda/testing_get_best_device.c +++ b/tests/runtime/cuda/testing_get_best_device.c @@ -45,17 +45,10 @@ int main(int argc, char *argv[]) char **pargv; /* Default */ - int m = 0; - int N = 8; - int NB = 4; - int P = 1; - int KP = 1; - int KQ = 1; - int cores = -1; - int nb_gpus = 0; - int info = 0; - - while ((ch = getopt(argc, argv, "m:N:t:s:S:P:c:g:h")) != -1) { + int m = 0, N = 8, NB = 4, P = 1, KP = 1, KQ = 1; + int cores = -1, nb_gpus = 0, nb_avail_gpu = 0, info = 0, gpu_mask = 0xFF; + + while ((ch = getopt(argc, argv, "m:N:t:s:S:P:c:g:G:h")) != -1) { switch (ch) { case 'm': m = atoi(optarg); break; case 'N': N = atoi(optarg); break; @@ -65,6 +58,7 @@ int main(int argc, char *argv[]) case 'P': P = atoi(optarg); break; case 'c': cores = atoi(optarg); break; case 'g': nb_gpus = atoi(optarg); break; + case 'G': gpu_mask = atoi(optarg); break; case '?': case 'h': default: fprintf(stderr, "-m : initialize MPI_THREAD_MULTIPLE (default: 0/no)\n" @@ -75,6 +69,7 @@ int main(int argc, char *argv[]) "-P : rows (P) in the PxQ process grid (default: 1)\n" "-c : number of cores used (default: -1)\n" "-g : number of GPUs used (default: 0)\n" + "-G : mask of the GPUs to be used (default: 0xff)" "-h : print this help message\n" "\n"); exit(1); @@ -102,16 +97,20 @@ int main(int argc, char *argv[]) break; } } - #if defined(PARSEC_HAVE_DEV_CUDA_SUPPORT) extern char **environ; char *value; if( nb_gpus < 1 && 0 == rank ) { - fprintf(stderr, "Warning: if run on GPUs, please set --gpus=value bigger than 0\n"); + fprintf(stderr, "Warning: if run on GPUs, please set -g value bigger than 0\n"); } asprintf(&value, "%d", nb_gpus); parsec_setenv_mca_param( "device_cuda_enabled", value, &environ ); - free(value); + free(value); value = NULL; + if( 0xFF != gpu_mask ) { + asprintf(&value, "%d", gpu_mask); + parsec_setenv_mca_param("device_cuda_mask", value, &environ); + free(value); value = NULL; + } #endif /* Initialize PaRSEC */ @@ -134,7 +133,7 @@ int main(int argc, char *argv[]) } cores = nb_total_comp_threads; } - + nb_avail_gpu = parsec_context_query(parsec, PARSEC_CONTEXT_QUERY_DEVICES, PARSEC_DEV_CUDA); /* initializing matrix structure */ parsec_matrix_block_cyclic_t dcA; parsec_matrix_block_cyclic_init(&dcA, PARSEC_MATRIX_DOUBLE, PARSEC_MATRIX_TILE, @@ -153,9 +152,9 @@ int main(int argc, char *argv[]) /* Main routines */ SYNC_TIME_START(); info = parsec_get_best_device_check(parsec, (parsec_tiled_matrix_t *)&dcA); - SYNC_TIME_PRINT(rank, ("Get_best_device" "\tN= %d NB= %d " + SYNC_TIME_PRINT(rank, ("Get_best_device\tN= %d NB= %d " "PxQ= %d %d KPxKQ= %d %d cores= %d nb_gpus= %d\n", - N, NB, P, nodes/P, KP, KQ, cores, parsec_nb_devices-2)); + N, NB, P, nodes / P, KP, KQ, cores, nb_avail_gpu)); /* Check result */ if( 0 == rank && info != 0 ) { diff --git a/tools/profiling/dbpreader.c b/tools/profiling/dbpreader.c index 00eb00acd..941351031 100644 --- a/tools/profiling/dbpreader.c +++ b/tools/profiling/dbpreader.c @@ -1071,7 +1071,7 @@ static dbp_multifile_reader_t *open_files(int nbfiles, char **filenames) dbp->last_error = SUCCESS; dbp->dico_size = 0; dbp->dico_allocated = 8; - dbp->dico_keys = calloc(sizeof(dbp_dictionary_t), dbp->dico_allocated); + dbp->dico_keys = calloc(dbp->dico_allocated, sizeof(dbp_dictionary_t)); n = 0; for(i = 0; i < nbfiles; i++) {
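A recurring change in the test suite above is replacing the equality test on device->type with a bit test, since the type field is a mask and a CUDA device may advertise more than the single PARSEC_DEV_CUDA bit. A small helper in the same spirit, factored out purely for illustration; the function name is made up and the header paths follow the ones these tests already use.

#include "parsec.h"
#include "parsec/mca/device/device.h"

/* Illustrative helper, not part of the patch: count the devices whose type
 * matches a mask, using the bit-test idiom the tests were switched to.
 * An equality test would miss devices whose type carries extra bits. */
static int count_devices_matching(int type_mask)
{
    int nb = 0;
    for (int dev = 0; dev < (int)parsec_nb_devices; dev++) {
        parsec_device_module_t *device = parsec_mca_device_get(dev);
        if (type_mask & device->type)
            nb++;
    }
    return nb;
}
/* e.g. count_devices_matching(PARSEC_DEV_CUDA) replaces the open-coded loops above */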