From 97dbb09a435e68ed92205f9ac1469300a13a708a Mon Sep 17 00:00:00 2001 From: Joseph Schuchart Date: Tue, 20 Feb 2024 16:22:51 -0500 Subject: [PATCH] Add stream operations to accelerator components - Stream-based alloc and free - Stream-based memmove - Wait for stream to complete Also, enable querying for number of devices and memory bandwidth. These operations are needed for operation device offloading. Co-authored-by: Phuong Nguyen Signed-off-by: Joseph Schuchart --- opal/mca/accelerator/accelerator.h | 106 +++++++++- opal/mca/accelerator/cuda/accelerator_cuda.c | 195 ++++++++++++++---- opal/mca/accelerator/cuda/accelerator_cuda.h | 10 +- .../cuda/accelerator_cuda_component.c | 89 +++++++- .../null/accelerator_null_component.c | 64 +++++- opal/mca/accelerator/rocm/accelerator_rocm.h | 8 +- .../rocm/accelerator_rocm_component.c | 49 ++++- .../rocm/accelerator_rocm_module.c | 162 +++++++++++++-- .../accelerator/ze/accelerator_ze_module.c | 132 +++++++++--- 9 files changed, 716 insertions(+), 99 deletions(-) diff --git a/opal/mca/accelerator/accelerator.h b/opal/mca/accelerator/accelerator.h index 0d660725acc..1163e72fa7a 100644 --- a/opal/mca/accelerator/accelerator.h +++ b/opal/mca/accelerator/accelerator.h @@ -5,6 +5,9 @@ * Copyright (c) Amazon.com, Inc. or its affiliates. * All Rights reserved. * Copyright (c) 2023 Advanced Micro Devices, Inc. All Rights reserved. + * Copyright (c) 2024 The University of Tennessee and The University + * of Tennessee Research Foundation. All rights + * reserved. * * $COPYRIGHT$ * @@ -193,7 +196,7 @@ typedef int (*opal_accelerator_base_module_create_stream_fn_t)( * @param[IN] dev_id Associated device for the event or * MCA_ACCELERATOR_NO_DEVICE_ID * @param[OUT] event Event to create - * @param[IN] enable_ipc support inter-process tracking of the event + * @param[IN] enable_ipc support inter-process tracking of the event * * @return OPAL_SUCCESS or error status on failure. */ @@ -310,6 +313,31 @@ typedef int (*opal_accelerator_base_module_memmove_fn_t)( int dest_dev_id, int src_dev_id, void *dest, const void *src, size_t size, opal_accelerator_transfer_type_t type); + +/** + * Copies memory asynchronously from src to dest. Memory of dest and src + * may overlap. Optionally can specify the transfer type to + * avoid pointer detection for performance. The operations will be enqueued + * into the provided stream but are not guaranteed to be complete upon return. + * + * @param[IN] dest_dev_id Associated device to copy to or + * MCA_ACCELERATOR_NO_DEVICE_ID + * @param[IN] src_dev_id Associated device to copy from or + * MCA_ACCELERATOR_NO_DEVICE_ID + * @param[IN] dest Destination to copy memory to + * @param[IN] src Source to copy memory from + * @param[IN] size Size of memory to copy + * @param[IN] stream Stream to perform asynchronous move on + * @param[IN] type Transfer type field for performance + * Can be set to MCA_ACCELERATOR_TRANSFER_UNSPEC + * if caller is unsure of the transfer direction. + * + * @return OPAL_SUCCESS or error status on failure + */ +typedef int (*opal_accelerator_base_module_memmove_async_fn_t)( + int dest_dev_id, int src_dev_id, void *dest, const void *src, size_t size, + opal_accelerator_stream_t *stream, opal_accelerator_transfer_type_t type); + /** * Allocates size bytes memory from the device and sets ptr to the * pointer of the allocated memory. The memory is not initialized. 
@@ -340,6 +368,46 @@ typedef int (*opal_accelerator_base_module_mem_alloc_fn_t)( typedef int (*opal_accelerator_base_module_mem_release_fn_t)( int dev_id, void *ptr); + +/** + * Allocates size bytes memory from the device and sets ptr to the + * pointer of the allocated memory. The memory is not initialized. + * The allocation request is placed into the stream object. + * Any use of the memory must follow the completion of this + * operation on the stream. + * + * @param[IN] dev_id Associated device for the allocation or + * MCA_ACCELERATOR_NO_DEVICE_ID + * @param[OUT] ptr Returns pointer to allocated memory + * @param[IN] size Size of memory to allocate + * @param[IN] stream Stream into which to insert the allocation request + * + * @return OPAL_SUCCESS or error status on failure + */ +typedef int (*opal_accelerator_base_module_mem_alloc_stream_fn_t)( + int dev_id, void **ptr, size_t size, opal_accelerator_stream_t *stream); + +/** + * Frees the memory space pointed to by ptr which has been returned by + * a previous call to an opal_accelerator_base_module_mem_alloc_stream_fn_t(). + * If the function is called on a ptr that has already been freed, + * undefined behavior occurs. If ptr is NULL, no operation is performed, + * and the function returns OPAL_SUCCESS. + * The release of the memory is inserted into the stream and occurs after + * all previously inserted operations have completed. + * + * @param[IN] dev_id Associated device for the allocation or + * MCA_ACCELERATOR_NO_DEVICE_ID + * @param[IN] ptr Pointer to free + * @param[IN] stream Stream into which to insert the free operation + * + * @return OPAL_SUCCESS or error status on failure + */ +typedef int (*opal_accelerator_base_module_mem_release_stream_fn_t)( + int dev_id, void *ptr, opal_accelerator_stream_t *stream); + + + /** * Retrieves the base address and/or size of a memory allocation of the * device. @@ -557,6 +625,35 @@ typedef int (*opal_accelerator_base_module_device_can_access_peer_fn_t)( typedef int (*opal_accelerator_base_module_get_buffer_id_fn_t)( int dev_id, const void *addr, opal_accelerator_buffer_id_t *buf_id); +/** + * Wait for the completion of all operations inserted into the stream. + * + * @param[IN] stream The stream to wait for. + * + * @return OPAL_SUCCESS or error status on failure + */ +typedef int (*opal_accelerator_base_module_wait_stream_fn_t)(opal_accelerator_stream_t *stream); + +/** + * Get the number of devices available. + * + * @param[OUT] num_devices Number of devices. + * + * @return OPAL_SUCCESS or error status on failure + */ +typedef int (*opal_accelerator_base_module_get_num_devices_fn_t)(int *num_devices); + +/** + * Get the memory bandwidth of the device. + * + * @param[IN] device The device to query. + * @param[OUT] bw The returned bandwidth for the device.
+ * + * @return OPAL_SUCCESS or error status on failure + */ +typedef int (*opal_accelerator_base_module_get_mem_bw_fn_t)(int device, float *bw); + + /* * the standard public API data structure */ @@ -572,10 +669,13 @@ typedef struct { opal_accelerator_base_module_memcpy_async_fn_t mem_copy_async; opal_accelerator_base_module_memcpy_fn_t mem_copy; + opal_accelerator_base_module_memmove_async_fn_t mem_move_async; opal_accelerator_base_module_memmove_fn_t mem_move; opal_accelerator_base_module_mem_alloc_fn_t mem_alloc; opal_accelerator_base_module_mem_release_fn_t mem_release; + opal_accelerator_base_module_mem_alloc_stream_fn_t mem_alloc_stream; + opal_accelerator_base_module_mem_release_stream_fn_t mem_release_stream; opal_accelerator_base_module_get_address_range_fn_t get_address_range; opal_accelerator_base_module_is_ipc_enabled_fn_t is_ipc_enabled; @@ -595,6 +695,10 @@ typedef struct { opal_accelerator_base_module_device_can_access_peer_fn_t device_can_access_peer; opal_accelerator_base_module_get_buffer_id_fn_t get_buffer_id; + + opal_accelerator_base_module_wait_stream_fn_t wait_stream; + opal_accelerator_base_module_get_num_devices_fn_t num_devices; + opal_accelerator_base_module_get_mem_bw_fn_t get_mem_bw; } opal_accelerator_base_module_t; /** diff --git a/opal/mca/accelerator/cuda/accelerator_cuda.c b/opal/mca/accelerator/cuda/accelerator_cuda.c index edabb864c3d..a55b5d490b8 100644 --- a/opal/mca/accelerator/cuda/accelerator_cuda.c +++ b/opal/mca/accelerator/cuda/accelerator_cuda.c @@ -6,6 +6,9 @@ * All rights reserved. * Copyright (c) Amazon.com, Inc. or its affiliates. * All Rights reserved. + * Copyright (c) 2024 The University of Tennessee and The University + * of Tennessee Research Foundation. All rights + * reserved. * $COPYRIGHT$ * * Additional copyrights may follow @@ -35,10 +38,16 @@ static int accelerator_cuda_memcpy_async(int dest_dev_id, int src_dev_id, void * opal_accelerator_stream_t *stream, opal_accelerator_transfer_type_t type); static int accelerator_cuda_memcpy(int dest_dev_id, int src_dev_id, void *dest, const void *src, size_t size, opal_accelerator_transfer_type_t type); +static int accelerator_cuda_memmove_async(int dest_dev_id, int src_dev_id, void *dest, const void *src, + size_t size, opal_accelerator_stream_t *stream, + opal_accelerator_transfer_type_t type); static int accelerator_cuda_memmove(int dest_dev_id, int src_dev_id, void *dest, const void *src, size_t size, opal_accelerator_transfer_type_t type); static int accelerator_cuda_mem_alloc(int dev_id, void **ptr, size_t size); static int accelerator_cuda_mem_release(int dev_id, void *ptr); +static int accelerator_cuda_mem_alloc_stream(int dev_id, void **ptr, size_t size, + opal_accelerator_stream_t *stream); +static int accelerator_cuda_mem_release_stream(int dev_id, void *ptr, opal_accelerator_stream_t *stream); static int accelerator_cuda_get_address_range(int dev_id, const void *ptr, void **base, size_t *size); @@ -67,6 +76,11 @@ static int accelerator_cuda_device_can_access_peer( int *access, int dev1, int d static int accelerator_cuda_get_buffer_id(int dev_id, const void *addr, opal_accelerator_buffer_id_t *buf_id); +static int accelerator_cuda_wait_stream(opal_accelerator_stream_t *stream); +static int accelerator_cuda_get_num_devices(int *num_devices); +static int accelerator_cuda_get_mem_bw(int device, float *bw); + + #define GET_STREAM(_stream) (_stream == MCA_ACCELERATOR_STREAM_DEFAULT ? 
0 : *((CUstream *)_stream->stream)) opal_accelerator_base_module_t opal_accelerator_cuda_module = @@ -82,9 +96,12 @@ opal_accelerator_base_module_t opal_accelerator_cuda_module = accelerator_cuda_memcpy_async, accelerator_cuda_memcpy, + accelerator_cuda_memmove_async, accelerator_cuda_memmove, accelerator_cuda_mem_alloc, accelerator_cuda_mem_release, + accelerator_cuda_mem_alloc_stream, + accelerator_cuda_mem_release_stream, accelerator_cuda_get_address_range, accelerator_cuda_is_ipc_enabled, @@ -103,9 +120,31 @@ opal_accelerator_base_module_t opal_accelerator_cuda_module = accelerator_cuda_get_device_pci_attr, accelerator_cuda_device_can_access_peer, - accelerator_cuda_get_buffer_id + accelerator_cuda_get_buffer_id, + + accelerator_cuda_wait_stream, + accelerator_cuda_get_num_devices, + accelerator_cuda_get_mem_bw }; +static int accelerator_cuda_get_device_id(CUcontext mem_ctx) { + /* query the device from the context */ + int dev_id = -1; + CUdevice ptr_dev; + cuCtxPushCurrent(mem_ctx); + cuCtxGetDevice(&ptr_dev); + for (int i = 0; i < opal_accelerator_cuda_num_devices; ++i) { + CUdevice dev; + cuDeviceGet(&dev, i); + if (dev == ptr_dev) { + dev_id = i; + break; + } + } + cuCtxPopCurrent(&mem_ctx); + return dev_id; +} + static int accelerator_cuda_check_addr(const void *addr, int *dev_id, uint64_t *flags) { CUresult result; @@ -154,6 +193,9 @@ static int accelerator_cuda_check_addr(const void *addr, int *dev_id, uint64_t * } else if (0 == mem_type) { /* This can happen when CUDA is initialized but dbuf is not valid CUDA pointer */ return 0; + } else { + /* query the device from the context */ + *dev_id = accelerator_cuda_get_device_id(mem_ctx); } /* Must be a device pointer */ assert(CU_MEMORYTYPE_DEVICE == mem_type); @@ -169,6 +211,10 @@ static int accelerator_cuda_check_addr(const void *addr, int *dev_id, uint64_t * } else if (CU_MEMORYTYPE_HOST == mem_type) { /* Host memory, nothing to do here */ return 0; + } else { + result = cuPointerGetAttribute(&mem_ctx, CU_POINTER_ATTRIBUTE_CONTEXT, dbuf); + /* query the device from the context */ + *dev_id = accelerator_cuda_get_device_id(mem_ctx); } /* Must be a device pointer */ assert(CU_MEMORYTYPE_DEVICE == mem_type); @@ -216,7 +262,7 @@ static int accelerator_cuda_check_addr(const void *addr, int *dev_id, uint64_t * } } - /* WORKAROUND - They are times when the above code determines a pice of memory + /* WORKAROUND - There are times when the above code determines a pice of memory * is GPU memory, but it actually is not. That has been seen on multi-GPU systems * with 6 or 8 GPUs on them. Therefore, we will do this extra check. Note if we * made it this far, then the assumption at this point is we have GPU memory. @@ -435,34 +481,23 @@ static int accelerator_cuda_memcpy(int dest_dev_id, int src_dev_id, void *dest, return OPAL_SUCCESS; } - /* Async copy then synchronize is the default behavior as some applications - * cannot utilize synchronous copies. In addition, host memory does not need - * to be page-locked if an Async memory copy is done (It just makes it synchronous - * which is what we want anyway): - * https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#concurrent-execution-host-device - * Additionally, cuMemcpy is not necessarily always synchronous. 
See: - * https://docs.nvidia.com/cuda/cuda-driver-api/api-sync-behavior.html - * TODO: Add optimizations for type field */ - result = cuMemcpyAsync((CUdeviceptr) dest, (CUdeviceptr) src, size, opal_accelerator_cuda_memcpy_stream); - if (OPAL_UNLIKELY(CUDA_SUCCESS != result)) { - opal_show_help("help-accelerator-cuda.txt", "cuMemcpyAsync failed", true, dest, src, - size, result); - return OPAL_ERROR; - } - result = cuStreamSynchronize(opal_accelerator_cuda_memcpy_stream); + result = cuMemcpy((CUdeviceptr) dest, (CUdeviceptr) src, size); if (OPAL_UNLIKELY(CUDA_SUCCESS != result)) { - opal_show_help("help-accelerator-cuda.txt", "cuStreamSynchronize failed", true, + opal_show_help("help-accelerator-cuda.txt", "cuMemcpy failed", true, OPAL_PROC_MY_HOSTNAME, result); return OPAL_ERROR; } return OPAL_SUCCESS; } -static int accelerator_cuda_memmove(int dest_dev_id, int src_dev_id, void *dest, const void *src, size_t size, - opal_accelerator_transfer_type_t type) +static int accelerator_cuda_memmove_async(int dest_dev_id, int src_dev_id, void *dest, + const void *src, size_t size, + opal_accelerator_stream_t *stream, + opal_accelerator_transfer_type_t type) { CUdeviceptr tmp; CUresult result; + void *ptr; int delayed_init = opal_accelerator_cuda_delayed_init(); if (OPAL_UNLIKELY(0 != delayed_init)) { @@ -473,29 +508,41 @@ static int accelerator_cuda_memmove(int dest_dev_id, int src_dev_id, void *dest, return OPAL_ERR_BAD_PARAM; } - result = cuMemAlloc(&tmp, size); - if (OPAL_UNLIKELY(CUDA_SUCCESS != result)) { + result = accelerator_cuda_mem_alloc_stream(src_dev_id, &ptr, size, stream); + if (OPAL_UNLIKELY(OPAL_SUCCESS != result)) { return OPAL_ERROR; } - result = cuMemcpyAsync(tmp, (CUdeviceptr) src, size, opal_accelerator_cuda_memcpy_stream); + tmp = (CUdeviceptr)ptr; + result = cuMemcpyAsync(tmp, (CUdeviceptr) src, size, *(CUstream*)stream->stream); if (OPAL_UNLIKELY(CUDA_SUCCESS != result)) { opal_show_help("help-accelerator-cuda.txt", "cuMemcpyAsync failed", true, tmp, src, size, result); return OPAL_ERROR; } - result = cuMemcpyAsync((CUdeviceptr) dest, tmp, size, opal_accelerator_cuda_memcpy_stream); + result = cuMemcpyAsync((CUdeviceptr) dest, tmp, size, *(CUstream*)stream->stream); if (OPAL_UNLIKELY(CUDA_SUCCESS != result)) { opal_show_help("help-accelerator-cuda.txt", "cuMemcpyAsync failed", true, dest, tmp, size, result); return OPAL_ERROR; } - result = cuStreamSynchronize(opal_accelerator_cuda_memcpy_stream); - if (OPAL_UNLIKELY(CUDA_SUCCESS != result)) { + return accelerator_cuda_mem_release_stream(src_dev_id, ptr, stream); +} + +static int accelerator_cuda_memmove(int dest_dev_id, int src_dev_id, void *dest, const void *src, size_t size, + opal_accelerator_transfer_type_t type) +{ + int ret; + + ret = accelerator_cuda_memmove_async(dest_dev_id, src_dev_id, dest, src, size, &opal_accelerator_cuda_memcpy_stream.base, type); + if (OPAL_SUCCESS != ret) { + return OPAL_ERROR; + } + ret = accelerator_cuda_wait_stream(&opal_accelerator_cuda_memcpy_stream.base); + if (OPAL_UNLIKELY(OPAL_SUCCESS != ret)) { opal_show_help("help-accelerator-cuda.txt", "cuStreamSynchronize failed", true, OPAL_PROC_MY_HOSTNAME, ret); return OPAL_ERROR; } - cuMemFree(tmp); return OPAL_SUCCESS; } @@ -512,15 +559,35 @@ static int accelerator_cuda_mem_alloc(int dev_id, void **ptr, size_t size) return OPAL_ERR_BAD_PARAM; } - if (size > 0) { - result = cuMemAlloc((CUdeviceptr *) ptr, size); - if (OPAL_UNLIKELY(CUDA_SUCCESS != result)) { - opal_show_help("help-accelerator-cuda.txt", "cuMemAlloc failed", true, -
OPAL_PROC_MY_HOSTNAME, result); - return OPAL_ERROR; - } + result = cuMemAlloc((CUdeviceptr *) ptr, size); + if (OPAL_UNLIKELY(CUDA_SUCCESS != result)) { + opal_show_help("help-accelerator-cuda.txt", "cuMemAlloc failed", true, + OPAL_PROC_MY_HOSTNAME, result); + return OPAL_ERROR; } - return 0; + return OPAL_SUCCESS; +} + + + +static int accelerator_cuda_mem_alloc_stream(int dev_id, void **addr, size_t size, + opal_accelerator_stream_t *stream) +{ + + int delayed_init = opal_accelerator_cuda_delayed_init(); + if (OPAL_UNLIKELY(0 != delayed_init)) { + return delayed_init; + } + + /* fall-back to regular stream allocation */ + + CUresult result = cuMemAllocAsync((CUdeviceptr*)addr, size, *(CUstream*)stream->stream); + if (OPAL_UNLIKELY(CUDA_SUCCESS != result)) { + opal_show_help("help-accelerator-cuda.txt", "cuMemAlloc failed", true, + OPAL_PROC_MY_HOSTNAME, result); + return OPAL_ERROR; + } + return OPAL_SUCCESS; } static int accelerator_cuda_mem_release(int dev_id, void *ptr) @@ -537,6 +604,38 @@ static int accelerator_cuda_mem_release(int dev_id, void *ptr) return 0; } +static int accelerator_cuda_mem_release_stream(int dev_id, void *addr, + opal_accelerator_stream_t *stream) +{ + CUresult result; + + if (NULL == stream || NULL == addr) { + return OPAL_ERR_BAD_PARAM; + } + + result = cuMemFreeAsync((CUdeviceptr)addr, *(CUstream*)stream->stream); + if (OPAL_UNLIKELY(CUDA_SUCCESS != result)) { + opal_show_help("help-accelerator-cuda.txt", "cuMemFree failed", true, + OPAL_PROC_MY_HOSTNAME, result); + return OPAL_ERROR; + } + return OPAL_SUCCESS; +} + + +static int accelerator_cuda_wait_stream(opal_accelerator_stream_t *stream) +{ + CUresult result; + result = cuStreamSynchronize(*(CUstream*)stream->stream); + if (OPAL_UNLIKELY(CUDA_SUCCESS != result)) { + opal_show_help("help-accelerator-cuda.txt", "cuStreamSynchronize failed", true, + OPAL_PROC_MY_HOSTNAME, result); + return OPAL_ERROR; + } + return OPAL_SUCCESS; +} + + static int accelerator_cuda_get_address_range(int dev_id, const void *ptr, void **base, size_t *size) { @@ -764,3 +863,29 @@ static int accelerator_cuda_get_buffer_id(int dev_id, const void *addr, opal_acc } return OPAL_SUCCESS; } + + + +static int accelerator_cuda_get_num_devices(int *num_devices) +{ + + int delayed_init = opal_accelerator_cuda_delayed_init(); + if (OPAL_UNLIKELY(0 != delayed_init)) { + return delayed_init; + } + + *num_devices = opal_accelerator_cuda_num_devices; + return OPAL_SUCCESS; +} + +static int accelerator_cuda_get_mem_bw(int device, float *bw) +{ + int delayed_init = opal_accelerator_cuda_delayed_init(); + if (OPAL_UNLIKELY(0 != delayed_init)) { + return delayed_init; + } + assert(opal_accelerator_cuda_mem_bw != NULL); + + *bw = opal_accelerator_cuda_mem_bw[device]; + return OPAL_SUCCESS; +} \ No newline at end of file diff --git a/opal/mca/accelerator/cuda/accelerator_cuda.h b/opal/mca/accelerator/cuda/accelerator_cuda.h index 694a4192231..831d3419881 100644 --- a/opal/mca/accelerator/cuda/accelerator_cuda.h +++ b/opal/mca/accelerator/cuda/accelerator_cuda.h @@ -2,6 +2,9 @@ * Copyright (c) 2014 Intel, Inc. All rights reserved. * Copyright (c) 2017-2022 Amazon.com, Inc. or its affiliates. * All Rights reserved. + * Copyright (c) 2024 The University of Tennessee and The University + * of Tennessee Research Foundation. All rights + * reserved. 
* $COPYRIGHT$ * * Additional copyrights may follow @@ -38,13 +41,16 @@ typedef struct opal_accelerator_cuda_event_t opal_accelerator_cuda_event_t; OBJ_CLASS_DECLARATION(opal_accelerator_cuda_event_t); /* Declare extern variables, defined in accelerator_cuda_component.c */ -OPAL_DECLSPEC extern CUstream opal_accelerator_cuda_memcpy_stream; -OPAL_DECLSPEC extern opal_mutex_t opal_accelerator_cuda_stream_lock; +OPAL_DECLSPEC extern opal_accelerator_cuda_stream_t opal_accelerator_cuda_memcpy_stream; OPAL_DECLSPEC extern opal_accelerator_cuda_component_t mca_accelerator_cuda_component; OPAL_DECLSPEC extern opal_accelerator_base_module_t opal_accelerator_cuda_module; +OPAL_DECLSPEC extern int opal_accelerator_cuda_num_devices; + +OPAL_DECLSPEC extern float *opal_accelerator_cuda_mem_bw; + OPAL_DECLSPEC extern int opal_accelerator_cuda_delayed_init(void); END_C_DECLS diff --git a/opal/mca/accelerator/cuda/accelerator_cuda_component.c b/opal/mca/accelerator/cuda/accelerator_cuda_component.c index d48e29c9f65..0d39e1f4eef 100644 --- a/opal/mca/accelerator/cuda/accelerator_cuda_component.c +++ b/opal/mca/accelerator/cuda/accelerator_cuda_component.c @@ -6,6 +6,9 @@ * reserved. * Copyright (c) 2017-2022 Amazon.com, Inc. or its affiliates. * All Rights reserved. + * Copyright (c) 2024 The University of Tennessee and The University + * of Tennessee Research Foundation. All rights + * reserved. * $COPYRIGHT$ * * Additional copyrights may follow @@ -34,13 +37,15 @@ #include "opal/sys/atomic.h" /* Define global variables, used in accelerator_cuda.c */ -CUstream opal_accelerator_cuda_memcpy_stream = NULL; -opal_mutex_t opal_accelerator_cuda_stream_lock = {0}; +opal_accelerator_cuda_stream_t opal_accelerator_cuda_memcpy_stream = {0}; +int opal_accelerator_cuda_num_devices = 0; /* Initialization lock for delayed cuda initialization */ static opal_mutex_t accelerator_cuda_init_lock; static bool accelerator_cuda_init_complete = false; +float *opal_accelerator_cuda_mem_bw = NULL; + #define STRINGIFY2(x) #x #define STRINGIFY(x) STRINGIFY2(x) @@ -122,6 +127,7 @@ static int accelerator_cuda_component_register(void) int opal_accelerator_cuda_delayed_init() { int result = OPAL_SUCCESS; + int prio_lo, prio_hi; CUcontext cuContext; /* Double checked locking to avoid having to @@ -137,6 +143,8 @@ int opal_accelerator_cuda_delayed_init() goto out; } + cuDeviceGetCount(&opal_accelerator_cuda_num_devices); + /* Check to see if this process is running in a CUDA context. If * so, all is good. If not, then disable registration of memory. 
*/ result = cuCtxGetCurrent(&cuContext); @@ -145,19 +153,51 @@ int opal_accelerator_cuda_delayed_init() goto out; } else if ((CUDA_SUCCESS == result) && (NULL == cuContext)) { opal_output_verbose(20, opal_accelerator_base_framework.framework_output, "CUDA: cuCtxGetCurrent returned NULL context"); - result = OPAL_ERROR; - goto out; + + /* create a context for each device */ + for (int i = 0; i < opal_accelerator_cuda_num_devices; ++i) { + CUdevice dev; + result = cuDeviceGet(&dev, i); + if (CUDA_SUCCESS != result) { + opal_output_verbose(20, opal_accelerator_base_framework.framework_output, + "CUDA: cuDeviceGet failed"); + result = OPAL_ERROR; + goto out; + } + result = cuDevicePrimaryCtxRetain(&cuContext, dev); + if (CUDA_SUCCESS != result) { + opal_output_verbose(20, opal_accelerator_base_framework.framework_output, + "CUDA: cuDevicePrimaryCtxRetain failed"); + result = OPAL_ERROR; + goto out; + } + if (0 == i) { + result = cuCtxPushCurrent(cuContext); + if (CUDA_SUCCESS != result) { + opal_output_verbose(20, opal_accelerator_base_framework.framework_output, + "CUDA: cuCtxPushCurrent failed"); + result = OPAL_ERROR; + goto out; + } + } + } + + } else { opal_output_verbose(20, opal_accelerator_base_framework.framework_output, "CUDA: cuCtxGetCurrent succeeded"); } /* Create stream for use in cuMemcpyAsync synchronous copies */ - result = cuStreamCreate(&opal_accelerator_cuda_memcpy_stream, 0); + CUstream memcpy_stream; + result = cuStreamCreate(&memcpy_stream, 0); if (OPAL_UNLIKELY(result != CUDA_SUCCESS)) { opal_show_help("help-accelerator-cuda.txt", "cuStreamCreate failed", true, OPAL_PROC_MY_HOSTNAME, result); goto out; } + OBJ_CONSTRUCT(&opal_accelerator_cuda_memcpy_stream, opal_accelerator_cuda_stream_t); + opal_accelerator_cuda_memcpy_stream.base.stream = malloc(sizeof(CUstream)); + *(CUstream*)opal_accelerator_cuda_memcpy_stream.base.stream = memcpy_stream; result = cuMemHostRegister(&checkmem, sizeof(int), 0); if (result != CUDA_SUCCESS) { @@ -165,11 +205,36 @@ int opal_accelerator_cuda_delayed_init() * This is not a fatal error. 
*/ opal_show_help("help-accelerator-cuda.txt", "cuMemHostRegister during init failed", true, &checkmem, sizeof(int), OPAL_PROC_MY_HOSTNAME, result, "checkmem"); - } else { opal_output_verbose(20, opal_accelerator_base_framework.framework_output, "CUDA: cuMemHostRegister OK on test region"); } + + /* determine the memory bandwidth */ + opal_accelerator_cuda_mem_bw = malloc(sizeof(float)*opal_accelerator_cuda_num_devices); + for (int i = 0; i < opal_accelerator_cuda_num_devices; ++i) { + CUdevice dev; + result = cuDeviceGet(&dev, i); + if (CUDA_SUCCESS != result) { + opal_output_verbose(20, opal_accelerator_base_framework.framework_output, + "CUDA: cuDeviceGet failed"); + goto out; + } + int mem_clock_rate; // kHz + result = cuDeviceGetAttribute(&mem_clock_rate, + CU_DEVICE_ATTRIBUTE_MEMORY_CLOCK_RATE, + dev); + int bus_width; // bit + result = cuDeviceGetAttribute(&bus_width, + CU_DEVICE_ATTRIBUTE_GLOBAL_MEMORY_BUS_WIDTH, + dev); + /* bw = clock_rate * bus width * 2bit multiplier + * See https://forums.developer.nvidia.com/t/memory-clock-rate/107940 + */ + float bw = ((float)mem_clock_rate*(float)bus_width*2.0) / 1024 / 1024 / 8; + opal_accelerator_cuda_mem_bw[i] = bw; + } + result = OPAL_SUCCESS; opal_atomic_wmb(); accelerator_cuda_init_complete = true; @@ -180,8 +245,9 @@ int opal_accelerator_cuda_delayed_init() static opal_accelerator_base_module_t* accelerator_cuda_init(void) { - OBJ_CONSTRUCT(&opal_accelerator_cuda_stream_lock, opal_mutex_t); OBJ_CONSTRUCT(&accelerator_cuda_init_lock, opal_mutex_t); + OBJ_CONSTRUCT(&opal_accelerator_cuda_memcpy_stream, opal_accelerator_stream_t); + /* First check if the support is enabled. In the case that the user has * turned it off, we do not need to continue with any CUDA specific * initialization. Do this after MCA parameter registration. */ @@ -205,11 +271,14 @@ static void accelerator_cuda_finalize(opal_accelerator_base_module_t* module) if (CUDA_SUCCESS != result) { ctx_ok = 0; } - if ((NULL != opal_accelerator_cuda_memcpy_stream) && ctx_ok) { - cuStreamDestroy(opal_accelerator_cuda_memcpy_stream); + + if ((NULL != opal_accelerator_cuda_memcpy_stream.base.stream) && ctx_ok) { + OBJ_DESTRUCT(&opal_accelerator_cuda_memcpy_stream); } - OBJ_DESTRUCT(&opal_accelerator_cuda_stream_lock); + free(opal_accelerator_cuda_mem_bw); + opal_accelerator_cuda_mem_bw = NULL; + OBJ_DESTRUCT(&accelerator_cuda_init_lock); return; } diff --git a/opal/mca/accelerator/null/accelerator_null_component.c b/opal/mca/accelerator/null/accelerator_null_component.c index 1bd6e0e2811..ce36d79f164 100644 --- a/opal/mca/accelerator/null/accelerator_null_component.c +++ b/opal/mca/accelerator/null/accelerator_null_component.c @@ -9,6 +9,9 @@ * Copyright (c) Amazon.com, Inc. or its affiliates. * All Rights reserved. * Copyright (c) 2023 Advanced Micro Devices, Inc. All rights reserved. + * Copyright (c) 2024 The University of Tennessee and The University + * of Tennessee Research Foundation. All rights + * reserved. 
* $COPYRIGHT$ * * Additional copyrights may follow @@ -50,11 +53,15 @@ static int accelerator_null_memcpy_async(int dest_dev_id, int src_dev_id, void * opal_accelerator_stream_t *stream, opal_accelerator_transfer_type_t type); static int accelerator_null_memcpy(int dest_dev_id, int src_dev_id, void *dest, const void *src, size_t size, opal_accelerator_transfer_type_t type); +static int accelerator_null_memmove_async(int dest_dev_id, int src_dev_id, void *dest, const void *src, size_t size, + opal_accelerator_stream_t *stream, opal_accelerator_transfer_type_t type); static int accelerator_null_memmove(int dest_dev_id, int src_dev_id, void *dest, const void *src, size_t size, opal_accelerator_transfer_type_t type); static int accelerator_null_mem_alloc(int dev_id, void **ptr, size_t size); static int accelerator_null_mem_release(int dev_id, void *ptr); +static int accelerator_null_mem_alloc_stream(int dev_id, void **ptr, size_t size, opal_accelerator_stream_t* stream); +static int accelerator_null_mem_release_stream(int dev_id, void *ptr, opal_accelerator_stream_t *stream); static int accelerator_null_get_address_range(int dev_id, const void *ptr, void **base, size_t *size); static bool accelerator_null_is_ipc_enabled(void); @@ -82,6 +89,12 @@ static int accelerator_null_device_can_access_peer(int *access, int dev1, int de static int accelerator_null_get_buffer_id(int dev_id, const void *addr, opal_accelerator_buffer_id_t *buf_id); +static int accelerator_null_wait_stream(opal_accelerator_stream_t *stream); + +static int accelerator_null_get_num_devices(int *num_devices); + +static int accelerator_null_get_mem_bw(int device, float *bw); + /* * Instantiate the public struct with all of our public information * and pointers to our public functions in it @@ -133,9 +146,12 @@ opal_accelerator_base_module_t opal_accelerator_null_module = accelerator_null_memcpy_async, accelerator_null_memcpy, + accelerator_null_memmove_async, accelerator_null_memmove, accelerator_null_mem_alloc, accelerator_null_mem_release, + accelerator_null_mem_alloc_stream, + accelerator_null_mem_release_stream, accelerator_null_get_address_range, accelerator_null_is_ipc_enabled, @@ -154,7 +170,11 @@ opal_accelerator_base_module_t opal_accelerator_null_module = accelerator_null_get_device_pci_attr, accelerator_null_device_can_access_peer, - accelerator_null_get_buffer_id + accelerator_null_get_buffer_id, + + accelerator_null_wait_stream, + accelerator_null_get_num_devices, + accelerator_null_get_mem_bw }; static int accelerator_null_open(void) @@ -237,6 +257,13 @@ static int accelerator_null_memmove(int dest_dev_id, int src_dev_id, void *dest, return OPAL_SUCCESS; } +static int accelerator_null_memmove_async(int dest_dev_id, int src_dev_id, void *dest, const void *src, size_t size, + opal_accelerator_stream_t *stream, opal_accelerator_transfer_type_t type) +{ + memmove(dest, src, size); + return OPAL_SUCCESS; +} + static int accelerator_null_mem_alloc(int dev_id, void **ptr, size_t size) { *ptr = malloc(size); @@ -249,6 +276,23 @@ static int accelerator_null_mem_release(int dev_id, void *ptr) return OPAL_SUCCESS; } + +static int accelerator_null_mem_alloc_stream(int dev_id, void **ptr, size_t size, + opal_accelerator_stream_t *stream) +{ + (void)stream; + *ptr = malloc(size); + return OPAL_SUCCESS; +} + +static int accelerator_null_mem_release_stream(int dev_id, void *ptr, + opal_accelerator_stream_t *stream) +{ + (void)stream; + free(ptr); + return OPAL_SUCCESS; +} + static int accelerator_null_get_address_range(int dev_id, 
const void *ptr, void **base, size_t *size) { @@ -331,3 +375,21 @@ static int accelerator_null_get_buffer_id(int dev_id, const void *addr, opal_acc { return OPAL_ERR_NOT_IMPLEMENTED; } + +static int accelerator_null_wait_stream(opal_accelerator_stream_t *stream) +{ + return OPAL_SUCCESS; +} + +static int accelerator_null_get_num_devices(int *num_devices) +{ + *num_devices = 0; + return OPAL_SUCCESS; +} + + +static int accelerator_null_get_mem_bw(int device, float *bw) +{ + *bw = 1.0; // return something that is not 0 + return OPAL_SUCCESS; +} diff --git a/opal/mca/accelerator/rocm/accelerator_rocm.h b/opal/mca/accelerator/rocm/accelerator_rocm.h index 38409778ad4..22e78d91a0e 100644 --- a/opal/mca/accelerator/rocm/accelerator_rocm.h +++ b/opal/mca/accelerator/rocm/accelerator_rocm.h @@ -1,5 +1,8 @@ /* * Copyright (c) 2022-2023 Advanced Micro Devices, Inc. All rights reserved. + * Copyright (c) 2024 The University of Tennessee and The University + * of Tennessee Research Foundation. All rights + * reserved. * * $COPYRIGHT$ * @@ -67,12 +70,15 @@ struct opal_accelerator_rocm_ipc_event_handle_t { typedef struct opal_accelerator_rocm_ipc_event_handle_t opal_accelerator_rocm_ipc_event_handle_t; OBJ_CLASS_DECLARATION(opal_accelerator_rocm_ipc_event_handle_t); -OPAL_DECLSPEC extern hipStream_t opal_accelerator_rocm_MemcpyStream; +OPAL_DECLSPEC extern hipStream_t *opal_accelerator_rocm_MemcpyStream; OPAL_DECLSPEC extern int opal_accelerator_rocm_memcpy_async; OPAL_DECLSPEC extern int opal_accelerator_rocm_verbose; OPAL_DECLSPEC extern size_t opal_accelerator_rocm_memcpyH2D_limit; OPAL_DECLSPEC extern size_t opal_accelerator_rocm_memcpyD2H_limit; +OPAL_DECLSPEC extern int opal_accelerator_rocm_num_devices; +OPAL_DECLSPEC extern float *opal_accelerator_rocm_mem_bw; + OPAL_DECLSPEC extern int opal_accelerator_rocm_lazy_init(void); #endif diff --git a/opal/mca/accelerator/rocm/accelerator_rocm_component.c b/opal/mca/accelerator/rocm/accelerator_rocm_component.c index 8f1bbbb53a5..c999909d7df 100644 --- a/opal/mca/accelerator/rocm/accelerator_rocm_component.c +++ b/opal/mca/accelerator/rocm/accelerator_rocm_component.c @@ -7,6 +7,9 @@ * Copyright (c) 2017-2022 Amazon.com, Inc. or its affiliates. * All Rights reserved. * Copyright (c) 2022-2024 Advanced Micro Devices, Inc. All Rights reserved. + * Copyright (c) 2024 The University of Tennessee and The University + * of Tennessee Research Foundation. All rights + * reserved. 
* $COPYRIGHT$ * * Additional copyrights may follow @@ -19,7 +22,9 @@ #include #include "opal/mca/dl/base/base.h" +#include "opal/mca/accelerator/base/base.h" #include "opal/runtime/opal_params.h" +#include "opal/util/proc.h" #include "accelerator_rocm.h" int opal_accelerator_rocm_memcpy_async = 0; @@ -31,7 +36,10 @@ size_t opal_accelerator_rocm_memcpyH2D_limit=1048576; static opal_mutex_t accelerator_rocm_init_lock; static bool accelerator_rocm_init_complete = false; -hipStream_t opal_accelerator_rocm_MemcpyStream = NULL; +/* Define global variables, used in accelerator_rocm.c */ +int opal_accelerator_rocm_num_devices = 0; +float *opal_accelerator_rocm_mem_bw = NULL; +hipStream_t *opal_accelerator_rocm_MemcpyStream = NULL; /* * Public string showing the accelerator rocm component version number @@ -159,6 +167,7 @@ static int accelerator_rocm_component_register(void) int opal_accelerator_rocm_lazy_init() { + int prio_hi, prio_lo; int err = OPAL_SUCCESS; /* Double checked locking to avoid having to @@ -174,13 +183,35 @@ int opal_accelerator_rocm_lazy_init() goto out; } - err = hipStreamCreate(&opal_accelerator_rocm_MemcpyStream); - if (hipSuccess != err) { + hipGetDeviceCount(&opal_accelerator_rocm_num_devices); + + /* Create stream for use in hipMemcpyAsync synchronous copies */ + hipStream_t memcpy_stream; + err = hipStreamCreate(&memcpy_stream); + if (OPAL_UNLIKELY(err != hipSuccess)) { opal_output(0, "Could not create hipStream, err=%d %s\n", err, hipGetErrorString(err)); goto out; } - + opal_accelerator_rocm_MemcpyStream = malloc(sizeof(hipStream_t)); + *(hipStream_t*)opal_accelerator_rocm_MemcpyStream = memcpy_stream; + + opal_accelerator_rocm_mem_bw = malloc(sizeof(float)*opal_accelerator_rocm_num_devices); + for (int i = 0; i < opal_accelerator_rocm_num_devices; ++i) { + int mem_clock_rate; // kHz + err = hipDeviceGetAttribute(&mem_clock_rate, + hipDeviceAttributeMemoryClockRate, + i); + int bus_width; // bit + err = hipDeviceGetAttribute(&bus_width, + hipDeviceAttributeMemoryBusWidth, + i); + /* bw = clock_rate * bus width * 2bit multiplier + * See https://forums.developer.nvidia.com/t/memory-clock-rate/107940 + */ + float bw = ((float)mem_clock_rate*(float)bus_width*2.0) / 1024 / 1024 / 8; + opal_accelerator_rocm_mem_bw[i] = bw; + } err = OPAL_SUCCESS; opal_atomic_wmb(); accelerator_rocm_init_complete = true; @@ -192,7 +223,7 @@ int opal_accelerator_rocm_lazy_init() static opal_accelerator_base_module_t* accelerator_rocm_init(void) { OBJ_CONSTRUCT(&accelerator_rocm_init_lock, opal_mutex_t); - + hipError_t err; if (opal_rocm_runtime_initialized) { @@ -214,12 +245,16 @@ static opal_accelerator_base_module_t* accelerator_rocm_init(void) static void accelerator_rocm_finalize(opal_accelerator_base_module_t* module) { - if (NULL != (void*)opal_accelerator_rocm_MemcpyStream) { - hipError_t err = hipStreamDestroy(opal_accelerator_rocm_MemcpyStream); + if (NULL != opal_accelerator_rocm_MemcpyStream) { + hipError_t err = hipStreamDestroy(*opal_accelerator_rocm_MemcpyStream); if (hipSuccess != err) { opal_output_verbose(10, 0, "hip_dl_finalize: error while destroying the hipStream\n"); } + free(opal_accelerator_rocm_MemcpyStream); opal_accelerator_rocm_MemcpyStream = NULL; + + free(opal_accelerator_rocm_mem_bw); + opal_accelerator_rocm_mem_bw = NULL; } OBJ_DESTRUCT(&accelerator_rocm_init_lock); diff --git a/opal/mca/accelerator/rocm/accelerator_rocm_module.c b/opal/mca/accelerator/rocm/accelerator_rocm_module.c index 6db5e0d4927..dfebf09def8 100644 --- 
a/opal/mca/accelerator/rocm/accelerator_rocm_module.c +++ b/opal/mca/accelerator/rocm/accelerator_rocm_module.c @@ -1,6 +1,9 @@ /* * Copyright (c) 2022-2023 Advanced Micro Devices, Inc. All Rights reserved. * $COPYRIGHT$ + * Copyright (c) 2024 The University of Tennessee and The University + * of Tennessee Research Foundation. All rights + * reserved. * * Additional copyrights may follow * @@ -27,10 +30,17 @@ static int mca_accelerator_rocm_memcpy_async(int dest_dev_id, int src_dev_id, vo opal_accelerator_stream_t *stream, opal_accelerator_transfer_type_t type); static int mca_accelerator_rocm_memcpy(int dest_dev_id, int src_dev_id, void *dest, const void *src, size_t size, opal_accelerator_transfer_type_t type); +static int mca_accelerator_rocm_memmove_async(int dest_dev_id, int src_dev_id, void *dest, + const void *src, size_t size, + opal_accelerator_stream_t *stream, + opal_accelerator_transfer_type_t type); static int mca_accelerator_rocm_memmove(int dest_dev_id, int src_dev_id, void *dest, const void *src, size_t size, opal_accelerator_transfer_type_t type); static int mca_accelerator_rocm_mem_alloc(int dev_id, void **ptr, size_t size); static int mca_accelerator_rocm_mem_release(int dev_id, void *ptr); +static int mca_accelerator_rocm_mem_alloc_stream(int dev_id, void **ptr, size_t size, + opal_accelerator_stream_t *stream); +static int mca_accelerator_rocm_mem_release_stream(int dev_id, void *ptr, opal_accelerator_stream_t *stream); static int mca_accelerator_rocm_get_address_range(int dev_id, const void *ptr, void **base, size_t *size); @@ -59,6 +69,11 @@ static int mca_accelerator_rocm_device_can_access_peer( int *access, int dev1, i static int mca_accelerator_rocm_get_buffer_id(int dev_id, const void *addr, opal_accelerator_buffer_id_t *buf_id); +static int mca_accelerator_rocm_wait_stream(opal_accelerator_stream_t *stream); + +static int mca_accelerator_rocm_get_num_devices(int *num_devices); + +static int mca_accelerator_rocm_get_mem_bw(int device, float *bw); #define GET_STREAM(_stream) (_stream == MCA_ACCELERATOR_STREAM_DEFAULT ? 
0 : *((hipStream_t *)_stream->stream)) @@ -75,9 +90,12 @@ opal_accelerator_base_module_t opal_accelerator_rocm_module = mca_accelerator_rocm_memcpy_async, mca_accelerator_rocm_memcpy, + mca_accelerator_rocm_memmove_async, mca_accelerator_rocm_memmove, mca_accelerator_rocm_mem_alloc, mca_accelerator_rocm_mem_release, + mca_accelerator_rocm_mem_alloc_stream, + mca_accelerator_rocm_mem_release_stream, mca_accelerator_rocm_get_address_range, mca_accelerator_rocm_is_ipc_enabled, @@ -96,7 +114,11 @@ opal_accelerator_base_module_t opal_accelerator_rocm_module = mca_accelerator_rocm_get_device_pci_attr, mca_accelerator_rocm_device_can_access_peer, - mca_accelerator_rocm_get_buffer_id + mca_accelerator_rocm_get_buffer_id, + + mca_accelerator_rocm_wait_stream, + mca_accelerator_rocm_get_num_devices, + mca_accelerator_rocm_get_mem_bw }; @@ -233,7 +255,7 @@ OBJ_CLASS_INSTANCE( opal_accelerator_event_t, NULL, mca_accelerator_rocm_event_destruct); - + static int mca_accelerator_rocm_record_event(int dev_id, opal_accelerator_event_t *event, opal_accelerator_stream_t *stream) { @@ -348,14 +370,14 @@ static int mca_accelerator_rocm_memcpy(int dest_dev_id, int src_dev_id, void *de if (opal_accelerator_rocm_memcpy_async) { err = hipMemcpyAsync(dest, src, size, hipMemcpyDefault, - opal_accelerator_rocm_MemcpyStream); + *opal_accelerator_rocm_MemcpyStream); if (hipSuccess != err ) { opal_output_verbose(10, opal_accelerator_base_framework.framework_output, "error starting async copy\n"); return OPAL_ERROR; } - err = hipStreamSynchronize(opal_accelerator_rocm_MemcpyStream); + err = hipStreamSynchronize(*opal_accelerator_rocm_MemcpyStream); if (hipSuccess != err ) { opal_output_verbose(10, opal_accelerator_base_framework.framework_output, "error synchronizing stream after async copy\n"); @@ -373,6 +395,44 @@ static int mca_accelerator_rocm_memcpy(int dest_dev_id, int src_dev_id, void *de return OPAL_SUCCESS; } +static int mca_accelerator_rocm_memmove_async(int dest_dev_id, int src_dev_id, void *dest, const void *src, + size_t size, opal_accelerator_stream_t *stream, + opal_accelerator_transfer_type_t type) +{ + hipDeviceptr_t tmp; + hipError_t result; + int ret; + void *ptr; + + int delayed_init = opal_accelerator_rocm_lazy_init(); + if (OPAL_UNLIKELY(0 != delayed_init)) { + return delayed_init; + } + + if (NULL == dest || NULL == src || size <= 0) { + return OPAL_ERR_BAD_PARAM; + } + + ret = mca_accelerator_rocm_mem_alloc_stream(src_dev_id, &ptr, size, stream); + if (OPAL_UNLIKELY(OPAL_SUCCESS != ret)) { + return OPAL_ERROR; + } + tmp = (hipDeviceptr_t)ptr; + result = hipMemcpyAsync(tmp, (hipDeviceptr_t) src, size, hipMemcpyDefault, *(hipStream_t*)stream->stream); + if (OPAL_UNLIKELY(hipSuccess != result)) { + opal_output_verbose(10, opal_accelerator_base_framework.framework_output, + "error during synchronous copy\n"); + return OPAL_ERROR; + } + result = hipMemcpyAsync((hipDeviceptr_t) dest, tmp, size, hipMemcpyDefault, *(hipStream_t*)stream->stream); + if (OPAL_UNLIKELY(hipSuccess != result)) { + opal_output_verbose(10, opal_accelerator_base_framework.framework_output, + "error during synchronous copy\n"); + return OPAL_ERROR; + } + return mca_accelerator_rocm_mem_release_stream(src_dev_id, ptr, stream); +} + static int mca_accelerator_rocm_memmove(int dest_dev_id, int src_dev_id, void *dest, const void *src, size_t size, opal_accelerator_transfer_type_t type) @@ -393,7 +453,7 @@ static int mca_accelerator_rocm_memmove(int dest_dev_id, int src_dev_id, void *d if (opal_accelerator_rocm_memcpy_async) { err = 
hipMemcpyAsync(tmp, src, size, hipMemcpyDefault, - opal_accelerator_rocm_MemcpyStream); + *opal_accelerator_rocm_MemcpyStream); if (hipSuccess != err ) { opal_output_verbose(10, opal_accelerator_base_framework.framework_output, "error in async memcpy for memmove\n"); @@ -401,14 +461,14 @@ static int mca_accelerator_rocm_memmove(int dest_dev_id, int src_dev_id, void *d } err = hipMemcpyAsync(dest, tmp, size, hipMemcpyDefault, - opal_accelerator_rocm_MemcpyStream); + *opal_accelerator_rocm_MemcpyStream); if (hipSuccess != err ) { opal_output_verbose(10, opal_accelerator_base_framework.framework_output, "error in async memcpy for memmove\n"); return OPAL_ERROR; } - err = hipStreamSynchronize(opal_accelerator_rocm_MemcpyStream); + err = hipStreamSynchronize(*opal_accelerator_rocm_MemcpyStream); if (hipSuccess != err ) { opal_output_verbose(10, opal_accelerator_base_framework.framework_output, "error synchronizing stream for memmove\n"); @@ -535,7 +595,7 @@ static int mca_accelerator_rocm_get_ipc_handle(int dev_id, void *dev_ptr, "Error in hipIpcGetMemHandle dev_ptr %p", dev_ptr); OBJ_DESTRUCT(rocm_handle); return OPAL_ERROR; - } + } memcpy(rocm_handle->base.handle, &rocm_ipc_handle, IPC_MAX_HANDLE_SIZE); return OPAL_SUCCESS; @@ -597,7 +657,7 @@ static int mca_accelerator_rocm_compare_ipc_handles(uint8_t handle_1[IPC_MAX_HAN static void mca_accelerator_rocm_ipc_event_handle_destruct(opal_accelerator_rocm_ipc_handle_t *handle) { - // Just a place holder, there is no hipIpcCloseEventHandle. + // Just a place holder, there is no hipIpcCloseEventHandle. } OBJ_CLASS_INSTANCE( @@ -617,7 +677,7 @@ static int mca_accelerator_rocm_get_ipc_event_handle(opal_accelerator_event_t *e hipIpcEventHandle_t rocm_ipc_handle; opal_accelerator_rocm_ipc_event_handle_t *rocm_handle = (opal_accelerator_rocm_ipc_event_handle_t *) handle; OBJ_CONSTRUCT(rocm_handle, opal_accelerator_rocm_ipc_event_handle_t); - + memset(rocm_ipc_handle.reserved, 0, HIP_IPC_HANDLE_SIZE); hipError_t err = hipIpcGetEventHandle(&rocm_ipc_handle, *((hipEvent_t *)event->event)); @@ -626,7 +686,7 @@ static int mca_accelerator_rocm_get_ipc_event_handle(opal_accelerator_event_t *e "error in hipIpcGetEventHandle"); OBJ_DESTRUCT(rocm_handle); return OPAL_ERROR; - } + } memcpy(rocm_handle->base.handle, &rocm_ipc_handle, IPC_MAX_HANDLE_SIZE); return OPAL_SUCCESS; @@ -664,7 +724,7 @@ static int mca_accelerator_rocm_open_ipc_event_handle(opal_accelerator_ipc_event opal_output_verbose(10, opal_accelerator_base_framework.framework_output, "error in hipIpcOpenEventHandle"); return OPAL_ERROR; - } + } return OPAL_SUCCESS; } @@ -802,3 +862,81 @@ static int mca_accelerator_rocm_get_buffer_id(int dev_id, const void *addr, opal #endif return OPAL_SUCCESS; } + + +static int mca_accelerator_rocm_mem_alloc_stream( + int dev_id, + void **addr, + size_t size, + opal_accelerator_stream_t *stream) +{ + hipError_t result; + + int delayed_init = opal_accelerator_rocm_lazy_init(); + if (OPAL_UNLIKELY(0 != delayed_init)) { + return delayed_init; + } + + if (NULL == stream || NULL == addr || 0 == size) { + return OPAL_ERR_BAD_PARAM; + } + + result = hipMallocAsync(addr, size, *(hipStream_t*)stream->stream); + if (OPAL_UNLIKELY(hipSuccess != result)) { + opal_output_verbose(10, opal_accelerator_base_framework.framework_output, + "error allocating memory\n"); + return OPAL_ERROR; + } + return OPAL_SUCCESS; +} + +static int mca_accelerator_rocm_mem_release_stream( + int dev_id, + void *addr, + opal_accelerator_stream_t *stream) +{ + hipError_t result; + + if (NULL == stream 
|| NULL == addr) { + return OPAL_ERR_BAD_PARAM; + } + + result = hipFreeAsync(addr, *(hipStream_t*)stream->stream); + if (OPAL_UNLIKELY(hipSuccess != result)) { + opal_output_verbose(10, opal_accelerator_base_framework.framework_output, + "error freeing memory\n"); + return OPAL_ERROR; + } + return OPAL_SUCCESS; +} + +static int mca_accelerator_rocm_wait_stream(opal_accelerator_stream_t *stream) +{ + hipError_t result; + result = hipStreamSynchronize(*(hipStream_t*)stream->stream); + if (OPAL_UNLIKELY(hipSuccess != result)) { + opal_output_verbose(10, opal_accelerator_base_framework.framework_output, + "error synchronizing stream\n"); + return OPAL_ERROR; + } + return OPAL_SUCCESS; +} + + +static int mca_accelerator_rocm_get_num_devices(int *num_devices) +{ + *num_devices = opal_accelerator_rocm_num_devices; + return OPAL_SUCCESS; +} + +static int mca_accelerator_rocm_get_mem_bw(int device, float *bw) +{ + int delayed_init = opal_accelerator_rocm_lazy_init(); + if (OPAL_UNLIKELY(0 != delayed_init)) { + return delayed_init; + } + assert(opal_accelerator_rocm_mem_bw != NULL); + + *bw = opal_accelerator_rocm_mem_bw[device]; + return OPAL_SUCCESS; +} \ No newline at end of file diff --git a/opal/mca/accelerator/ze/accelerator_ze_module.c b/opal/mca/accelerator/ze/accelerator_ze_module.c index 38b49cf4290..ca4b4bb7fdc 100644 --- a/opal/mca/accelerator/ze/accelerator_ze_module.c +++ b/opal/mca/accelerator/ze/accelerator_ze_module.c @@ -32,10 +32,17 @@ static int mca_accelerator_ze_memcpy_async(int dest_dev_id, int src_dev_id, void opal_accelerator_stream_t *stream, opal_accelerator_transfer_type_t type); static int mca_accelerator_ze_memcpy(int dest_dev_id, int src_dev_id, void *dest, const void *src, size_t size, opal_accelerator_transfer_type_t type); +static int mca_accelerator_ze_memmove_async(int dest_dev_id, int src_dev_id, void *dest, + const void *src, size_t size, + opal_accelerator_stream_t *stream, + opal_accelerator_transfer_type_t type); static int mca_accelerator_ze_memmove(int dest_dev_id, int src_dev_id, void *dest, const void *src, size_t size, opal_accelerator_transfer_type_t type); static int mca_accelerator_ze_mem_alloc(int dev_id, void **ptr, size_t size); static int mca_accelerator_ze_mem_release(int dev_id, void *ptr); +static int mca_accelerator_ze_mem_alloc_stream(int dev_id, void **ptr, size_t size, + opal_accelerator_stream_t *stream); +static int mca_accelerator_ze_mem_release_stream(int dev_id, void *ptr, opal_accelerator_stream_t *stream); static int mca_accelerator_ze_get_address_range(int dev_id, const void *ptr, void **base, size_t *size); @@ -65,6 +72,12 @@ static int mca_accelerator_ze_get_device_pci_attr(int dev_id, opal_accelerator_p static int mca_accelerator_ze_get_buffer_id(int dev_id, const void *addr, opal_accelerator_buffer_id_t *buf_id); +static int mca_accelerator_ze_wait_stream(opal_accelerator_stream_t *stream); + +static int mca_accelerator_ze_get_num_devices(int *num_devices); + +static int mca_accelerator_ze_get_mem_bw(int device, float *bw); + opal_accelerator_base_module_t opal_accelerator_ze_module = { .check_addr = mca_accelerator_ze_check_addr, @@ -77,10 +90,13 @@ opal_accelerator_base_module_t opal_accelerator_ze_module = .mem_copy_async = mca_accelerator_ze_memcpy_async, .mem_copy = mca_accelerator_ze_memcpy, + .mem_move_async = mca_accelerator_ze_memmove_async, .mem_move = mca_accelerator_ze_memmove, .mem_alloc = mca_accelerator_ze_mem_alloc, .mem_release = mca_accelerator_ze_mem_release, + .mem_alloc_stream = 
mca_accelerator_ze_mem_alloc_stream, + .mem_release_stream = mca_accelerator_ze_mem_release_stream, .get_address_range = mca_accelerator_ze_get_address_range, .is_ipc_enabled = mca_accelerator_ze_is_ipc_enabled, @@ -99,7 +115,10 @@ opal_accelerator_base_module_t opal_accelerator_ze_module = .get_device_pci_attr = mca_accelerator_ze_get_device_pci_attr, .device_can_access_peer = mca_accelerator_ze_device_can_access_peer, - .get_buffer_id = mca_accelerator_ze_get_buffer_id + .get_buffer_id = mca_accelerator_ze_get_buffer_id, + .wait_stream = mca_accelerator_ze_wait_stream, + .num_devices = mca_accelerator_ze_get_num_devices, + .get_mem_bw = mca_accelerator_ze_get_mem_bw }; static int accelerator_ze_dev_handle_to_dev_id(ze_device_handle_t hDevice) @@ -137,7 +156,7 @@ static int mca_accelerator_ze_check_addr (const void *addr, int *dev_id, uint64_ memset(&attr, 0, sizeof(ze_memory_allocation_properties_t)); - zret = zeMemGetAllocProperties(opal_accelerator_ze_context, + zret = zeMemGetAllocProperties(opal_accelerator_ze_context, addr, &attr, &hDevice); @@ -200,7 +219,7 @@ static int mca_accelerator_ze_create_stream(int dev_id, opal_accelerator_stream_ OBJ_RELEASE(*stream); return OPAL_ERR_OUT_OF_RESOURCE; } - + if (MCA_ACCELERATOR_NO_DEVICE_ID == dev_id) { hDevice = opal_accelerator_ze_devices_handle[0]; } else { @@ -208,9 +227,9 @@ static int mca_accelerator_ze_create_stream(int dev_id, opal_accelerator_stream_ } ze_stream->dev_id = dev_id; - zret = zeCommandQueueCreate(opal_accelerator_ze_context, + zret = zeCommandQueueCreate(opal_accelerator_ze_context, hDevice, - &cmdQueueDesc, + &cmdQueueDesc, &ze_stream->hCommandQueue); if (ZE_RESULT_SUCCESS != zret) { opal_output_verbose(10, opal_accelerator_base_framework.framework_output, @@ -226,12 +245,12 @@ static int mca_accelerator_ze_create_stream(int dev_id, opal_accelerator_stream_ .stype = ZE_STRUCTURE_TYPE_COMMAND_LIST_DESC, .pNext = NULL, .commandQueueGroupOrdinal = 0, - .flags = 0, + .flags = 0, }; - zret = zeCommandListCreate(opal_accelerator_ze_context, - opal_accelerator_ze_devices_handle[0], - &commandListDesc, + zret = zeCommandListCreate(opal_accelerator_ze_context, + opal_accelerator_ze_devices_handle[0], + &commandListDesc, &ze_stream->hCommandList); if (ZE_RESULT_SUCCESS != zret) { opal_output_verbose(10, opal_accelerator_base_framework.framework_output, @@ -359,7 +378,7 @@ static int mca_accelerator_ze_record_event(int dev_id, opal_accelerator_event_t "zeCommandListClose returned %d", zret); return OPAL_ERROR; } - + zret = zeCommandQueueExecuteCommandLists(ze_stream->hCommandQueue, 1, &ze_stream->hCommandList, @@ -469,7 +488,7 @@ static int mca_accelerator_ze_memcpy(int dest_dev_id, int src_dev_id, void *dest if (NULL == src || NULL == dest || size <0) { return OPAL_ERR_BAD_PARAM; - } + } if (0 == size) { return OPAL_SUCCESS; } @@ -486,7 +505,7 @@ static int mca_accelerator_ze_memcpy(int dest_dev_id, int src_dev_id, void *dest if (OPAL_SUCCESS != ret) { return ret; } - } + } ze_stream = (opal_accelerator_ze_stream_t *)opal_accelerator_ze_MemcpyStream[dev_id]->stream; zret = zeCommandListAppendMemoryCopy(ze_stream->hCommandList, @@ -509,8 +528,8 @@ static int mca_accelerator_ze_memcpy(int dest_dev_id, int src_dev_id, void *dest return OPAL_ERROR; } - zret = zeCommandQueueExecuteCommandLists(ze_stream->hCommandQueue, - 1, + zret = zeCommandQueueExecuteCommandLists(ze_stream->hCommandQueue, + 1, &ze_stream->hCommandList, NULL); if (ZE_RESULT_SUCCESS != zret) { @@ -548,12 +567,23 @@ static int mca_accelerator_ze_memmove(int 
dest_dev_id, int src_dev_id, void *des return OPAL_ERR_NOT_IMPLEMENTED; } +static int mca_accelerator_ze_memmove_async(int dest_dev_id, int src_dev_id, void *dest, + const void *src, size_t size, + opal_accelerator_stream_t *stream, + opal_accelerator_transfer_type_t type) +{ + /* + * TODO + */ + return OPAL_ERR_NOT_IMPLEMENTED; +} + static int mca_accelerator_ze_mem_alloc(int dev_id, void **ptr, size_t size) { ze_result_t zret; size_t mem_alignment; ze_device_handle_t hDevice; - + ze_device_mem_alloc_desc_t device_desc = { .stype = ZE_STRUCTURE_TYPE_DEVICE_MEM_ALLOC_DESC, .pNext = NULL, @@ -570,10 +600,10 @@ static int mca_accelerator_ze_mem_alloc(int dev_id, void **ptr, size_t size) /* Currently ZE ignores this argument and uses an internal alignment * value. However, this behavior can change in the future. */ mem_alignment = 1; - zret = zeMemAllocDevice(opal_accelerator_ze_context, - &device_desc, - size, - mem_alignment, + zret = zeMemAllocDevice(opal_accelerator_ze_context, + &device_desc, + size, + mem_alignment, hDevice, ptr); if (ZE_RESULT_SUCCESS != zret) { @@ -603,6 +633,23 @@ static int mca_accelerator_ze_mem_release(int dev_id, void *ptr) return OPAL_ERROR; } +static int mca_accelerator_ze_mem_alloc_stream(int dev_id, void **ptr, size_t size, + opal_accelerator_stream_t *stream) +{ + /* + * TODO + */ + return OPAL_ERR_NOT_IMPLEMENTED; +} + +static int mca_accelerator_ze_mem_release_stream(int dev_id, void *ptr, opal_accelerator_stream_t *stream) +{ + /* + * TODO + */ + return OPAL_ERR_NOT_IMPLEMENTED; +} + static int mca_accelerator_ze_get_address_range(int dev_id, const void *ptr, void **base, size_t *size) { @@ -615,7 +662,7 @@ static int mca_accelerator_ze_get_address_range(int dev_id, const void *ptr, voi } zret = zeMemGetAddressRange(opal_accelerator_ze_context, - ptr, + ptr, &pBase, &pSize); if (ZE_RESULT_SUCCESS != zret) { @@ -694,7 +741,7 @@ static int mca_accelerator_ze_host_unregister(int dev_id, void *ptr) static int mca_accelerator_ze_get_device(int *dev_id) { /* - * this method does not map to the Zero Level API, just return 0. + * this method does not map to the Zero Level API, just return 0. * This may just work if the runtime is use the ZE_AFFINITY_MASK * environment variable to control the visible PV(s) for a given process. */ @@ -709,15 +756,15 @@ static int mca_accelerator_ze_get_device(int *dev_id) } static int mca_accelerator_ze_get_device_pci_attr(int dev_id, opal_accelerator_pci_attr_t *pci_attr) -{ +{ ze_result_t zret; ze_device_handle_t hDevice; ze_pci_ext_properties_t pPciProperties; - + if (NULL == pci_attr) { return OPAL_ERR_BAD_PARAM; } - + if (MCA_ACCELERATOR_NO_DEVICE_ID == dev_id) { hDevice = opal_accelerator_ze_devices_handle[0]; } else { @@ -730,15 +777,15 @@ static int mca_accelerator_ze_get_device_pci_attr(int dev_id, opal_accelerator_p "zeDevicePciGetPropertiesExt returned %d", zret); return OPAL_ERROR; } - + pci_attr->domain_id = (uint16_t)pPciProperties.address.domain; pci_attr->bus_id = (uint8_t) pPciProperties.address.bus; pci_attr->device_id = (uint8_t)pPciProperties.address.device; pci_attr->function_id = (uint8_t)pPciProperties.address.function; return OPAL_SUCCESS; -} - +} + /* * could zeDeviceGetP2PProperties be used instead here? 
@@ -756,7 +803,7 @@ static int mca_accelerator_ze_device_can_access_peer(int *access, int dev1, int hDevice = opal_accelerator_ze_devices_handle[dev1]; hPeerDevice = opal_accelerator_ze_devices_handle[dev2]; - + zret = zeDeviceCanAccessPeer(hDevice, hPeerDevice, &value); @@ -781,7 +828,7 @@ static int mca_accelerator_ze_get_buffer_id(int dev_id, const void *addr, opal_a return OPAL_ERR_BAD_PARAM; } - if (MCA_ACCELERATOR_NO_DEVICE_ID == dev_id) { + if (MCA_ACCELERATOR_NO_DEVICE_ID == dev_id) { hDevice = opal_accelerator_ze_devices_handle[0]; } else { hDevice = opal_accelerator_ze_devices_handle[dev_id]; @@ -798,6 +845,31 @@ static int mca_accelerator_ze_get_buffer_id(int dev_id, const void *addr, opal_a } *buf_id = pMemAllocProperties.id; - + return OPAL_SUCCESS; } + + +static int mca_accelerator_ze_wait_stream(opal_accelerator_stream_t *stream) +{ + /* + * TODO + */ + return OPAL_ERR_NOT_IMPLEMENTED; +} + +static int mca_accelerator_ze_get_num_devices(int *num_devices) +{ + /* + * TODO + */ + return OPAL_ERR_NOT_IMPLEMENTED; +} + +static int mca_accelerator_ze_get_mem_bw(int device, float *bw) +{ + /* + * TODO + */ + return OPAL_ERR_NOT_IMPLEMENTED; +} \ No newline at end of file
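
For reviewers, a minimal usage sketch (not part of the patch) of how a caller might exercise the new stream-ordered entry points end to end. It assumes the framework-selected module is reachable through the usual opal_accelerator global, that streams created by create_stream are released with OBJ_RELEASE, and that the helper name and buffer names are purely illustrative:

/*
 * Illustrative sketch only: push host data into a stream-ordered device
 * buffer using mem_alloc_stream / mem_move_async / mem_release_stream /
 * wait_stream as declared in accelerator.h.  Error handling is abbreviated.
 */
#include "opal_config.h"
#include "opal/mca/accelerator/accelerator.h"
#include "opal/mca/accelerator/base/base.h"

static int stage_to_device(int dev_id, const void *host_src, size_t size)
{
    opal_accelerator_stream_t *stream = NULL;
    void *dev_buf = NULL;
    int rc;

    rc = opal_accelerator.create_stream(dev_id, &stream);
    if (OPAL_SUCCESS != rc) {
        return rc;
    }

    /* stream-ordered allocation: only work enqueued on this stream after
     * this call may touch dev_buf */
    rc = opal_accelerator.mem_alloc_stream(dev_id, &dev_buf, size, stream);
    if (OPAL_SUCCESS != rc) {
        OBJ_RELEASE(stream);
        return rc;
    }

    /* enqueue the move; it may not be complete when this call returns */
    rc = opal_accelerator.mem_move_async(dev_id, MCA_ACCELERATOR_NO_DEVICE_ID,
                                         dev_buf, host_src, size, stream,
                                         MCA_ACCELERATOR_TRANSFER_UNSPEC);

    /* ... enqueue further copies or kernels that consume dev_buf here ... */

    /* stream-ordered release, then block until everything has drained */
    if (OPAL_SUCCESS == rc) {
        rc = opal_accelerator.mem_release_stream(dev_id, dev_buf, stream);
    }
    if (OPAL_SUCCESS == rc) {
        rc = opal_accelerator.wait_stream(stream);
    }

    OBJ_RELEASE(stream);
    return rc;
}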
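
A second sketch, also illustrative rather than part of the patch, showing how the new device-count and memory-bandwidth queries might be consumed, e.g. when costing device offload. The helper name is made up, and the "roughly GB/s" reading of the bandwidth value is an assumption based on the clock-rate times bus-width formula used in the CUDA and ROCm components:

/*
 * Illustrative sketch only: report the per-device memory-bandwidth estimate
 * exposed through the num_devices / get_mem_bw module entries.  The value is
 * derived from hardware attributes, so treat it as a rough figure suitable
 * for cost models, not a measurement.
 */
#include "opal_config.h"
#include "opal/mca/accelerator/accelerator.h"
#include "opal/mca/accelerator/base/base.h"
#include "opal/util/output.h"

static void report_accelerator_devices(void)
{
    int num_devices = 0;

    if (OPAL_SUCCESS != opal_accelerator.num_devices(&num_devices)) {
        return;
    }
    for (int i = 0; i < num_devices; ++i) {
        float bw = 0.0f;
        if (OPAL_SUCCESS == opal_accelerator.get_mem_bw(i, &bw)) {
            opal_output(0, "accelerator device %d: ~%.1f GB/s peak memory bandwidth",
                        i, (double) bw);
        }
    }
}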