From 97dbb09a435e68ed92205f9ac1469300a13a708a Mon Sep 17 00:00:00 2001 From: Joseph Schuchart Date: Tue, 20 Feb 2024 16:22:51 -0500 Subject: [PATCH] Add stream operations to accelerator components - Stream-based alloc and free - Stream-based memmove - Wait for stream to complete Also, enable querying for number of devices and memory bandwidth. These operations are needed for operation device offloading. Co-authored-by: Phuong Nguyen Signed-off-by: Joseph Schuchart --- opal/mca/accelerator/accelerator.h | 106 +++++++++- opal/mca/accelerator/cuda/accelerator_cuda.c | 195 ++++++++++++++---- opal/mca/accelerator/cuda/accelerator_cuda.h | 10 +- .../cuda/accelerator_cuda_component.c | 89 +++++++- .../null/accelerator_null_component.c | 64 +++++- opal/mca/accelerator/rocm/accelerator_rocm.h | 8 +- .../rocm/accelerator_rocm_component.c | 49 ++++- .../rocm/accelerator_rocm_module.c | 162 +++++++++++++-- .../accelerator/ze/accelerator_ze_module.c | 132 +++++++++--- 9 files changed, 716 insertions(+), 99 deletions(-) diff --git a/opal/mca/accelerator/accelerator.h b/opal/mca/accelerator/accelerator.h index 0d660725acc..1163e72fa7a 100644 --- a/opal/mca/accelerator/accelerator.h +++ b/opal/mca/accelerator/accelerator.h @@ -5,6 +5,9 @@ * Copyright (c) Amazon.com, Inc. or its affiliates. * All Rights reserved. * Copyright (c) 2023 Advanced Micro Devices, Inc. All Rights reserved. + * Copyright (c) 2024 The University of Tennessee and The University + * of Tennessee Research Foundation. All rights + * reserved. * * $COPYRIGHT$ * @@ -193,7 +196,7 @@ typedef int (*opal_accelerator_base_module_create_stream_fn_t)( * @param[IN] dev_id Associated device for the event or * MCA_ACCELERATOR_NO_DEVICE_ID * @param[OUT] event Event to create - * @param[IN] enable_ipc support inter-process tracking of the event + * @param[IN] enable_ipc support inter-process tracking of the event * * @return OPAL_SUCCESS or error status on failure. */ @@ -310,6 +313,31 @@ typedef int (*opal_accelerator_base_module_memmove_fn_t)( int dest_dev_id, int src_dev_id, void *dest, const void *src, size_t size, opal_accelerator_transfer_type_t type); + +/** + * Copies memory asynchronously from src to dest. Memory of dest and src + * may overlap. Optionally can specify the transfer type to + * avoid pointer detection for performance. The operations will be enqueued + * into the provided stream but are not guaranteed to be complete upon return. + * + * @param[IN] dest_dev_id Associated device to copy to or + * MCA_ACCELERATOR_NO_DEVICE_ID + * @param[IN] src_dev_id Associated device to copy from or + * MCA_ACCELERATOR_NO_DEVICE_ID + * @param[IN] dest Destination to copy memory to + * @param[IN] src Source to copy memory from + * @param[IN] size Size of memory to copy + * @param[IN] stream Stream to perform asynchronous move on + * @param[IN] type Transfer type field for performance + * Can be set to MCA_ACCELERATOR_TRANSFER_UNSPEC + * if caller is unsure of the transfer direction. + * + * @return OPAL_SUCCESS or error status on failure + */ +typedef int (*opal_accelerator_base_module_memmove_async_fn_t)( + int dest_dev_id, int src_dev_id, void *dest, const void *src, size_t size, + opal_accelerator_stream_t *stream, opal_accelerator_transfer_type_t type); + /** * Allocates size bytes memory from the device and sets ptr to the * pointer of the allocated memory. The memory is not initialized. 
@@ -340,6 +368,46 @@ typedef int (*opal_accelerator_base_module_mem_alloc_fn_t)( typedef int (*opal_accelerator_base_module_mem_release_fn_t)( int dev_id, void *ptr); + +/** + * Allocates size bytes memory from the device and sets ptr to the + * pointer of the allocated memory. The memory is not initialized. + * The allocation request is placed into the stream object. + * Any use of the memory must follow the completion of this + * operation on the stream. + * + * @param[IN] dev_id Associated device for the allocation or + * MCA_ACCELERATOR_NO_DEVICE_ID + * @param[OUT] ptr Returns pointer to allocated memory + * @param[IN] size Size of memory to allocate + * @param[IN] stream Stream into which to insert the allocation request + * + * @return OPAL_SUCCESS or error status on failure + */ +typedef int (*opal_accelerator_base_module_mem_alloc_stream_fn_t)( + int dev_id, void **ptr, size_t size, opal_accelerator_stream_t *stream); + +/** + * Frees the memory space pointed to by ptr which has been returned by + * a previous call to an opal_accelerator_base_module_mem_alloc_stream_fn_t(). + * If the function is called on a ptr that has already been freed, + * undefined behavior occurs. If ptr is NULL, no operation is performed, + * and the function returns OPAL_SUCCESS. + * The release of the memory is inserted into the stream and occurs after + * all previously inserted operations have completed. + * + * @param[IN] dev_id Associated device for the allocation or + * MCA_ACCELERATOR_NO_DEVICE_ID + * @param[IN] ptr Pointer to free + * @param[IN] stream Stream into which to insert the free operation + * + * @return OPAL_SUCCESS or error status on failure + */ +typedef int (*opal_accelerator_base_module_mem_release_stream_fn_t)( + int dev_id, void *ptr, opal_accelerator_stream_t *stream); + + + /** * Retrieves the base address and/or size of a memory allocation of the * device. @@ -557,6 +625,35 @@ typedef int (*opal_accelerator_base_module_device_can_access_peer_fn_t)( typedef int (*opal_accelerator_base_module_get_buffer_id_fn_t)( int dev_id, const void *addr, opal_accelerator_buffer_id_t *buf_id); +/** + * Wait for the completion of all operations inserted into the stream. + * + * @param[IN] stream The stream to wait for. + * + * @return OPAL_SUCCESS or error status on failure + */ +typedef int (*opal_accelerator_base_module_wait_stream_fn_t)(opal_accelerator_stream_t *stream); + +/** + * Get the number of devices available. + * + * @param[OUT] num_devices Number of devices. + * + * @return OPAL_SUCCESS or error status on failure + */ +typedef int (*opal_accelerator_base_module_get_num_devices_fn_t)(int *num_devices); + +/** + * Get the memory bandwidth of the device. + * + * @param[IN] device The device to query. + * @param[OUT] bw The returned bandwidth for the device.
+ * + * @return OPAL_SUCCESS or error status on failure + */ +typedef int (*opal_accelerator_base_module_get_mem_bw_fn_t)(int device, float *bw); + + /* * the standard public API data structure */ @@ -572,10 +669,13 @@ typedef struct { opal_accelerator_base_module_memcpy_async_fn_t mem_copy_async; opal_accelerator_base_module_memcpy_fn_t mem_copy; + opal_accelerator_base_module_memmove_async_fn_t mem_move_async; opal_accelerator_base_module_memmove_fn_t mem_move; opal_accelerator_base_module_mem_alloc_fn_t mem_alloc; opal_accelerator_base_module_mem_release_fn_t mem_release; + opal_accelerator_base_module_mem_alloc_stream_fn_t mem_alloc_stream; + opal_accelerator_base_module_mem_release_stream_fn_t mem_release_stream; opal_accelerator_base_module_get_address_range_fn_t get_address_range; opal_accelerator_base_module_is_ipc_enabled_fn_t is_ipc_enabled; @@ -595,6 +695,10 @@ typedef struct { opal_accelerator_base_module_device_can_access_peer_fn_t device_can_access_peer; opal_accelerator_base_module_get_buffer_id_fn_t get_buffer_id; + + opal_accelerator_base_module_wait_stream_fn_t wait_stream; + opal_accelerator_base_module_get_num_devices_fn_t num_devices; + opal_accelerator_base_module_get_mem_bw_fn_t get_mem_bw; } opal_accelerator_base_module_t; /** diff --git a/opal/mca/accelerator/cuda/accelerator_cuda.c b/opal/mca/accelerator/cuda/accelerator_cuda.c index edabb864c3d..a55b5d490b8 100644 --- a/opal/mca/accelerator/cuda/accelerator_cuda.c +++ b/opal/mca/accelerator/cuda/accelerator_cuda.c @@ -6,6 +6,9 @@ * All rights reserved. * Copyright (c) Amazon.com, Inc. or its affiliates. * All Rights reserved. + * Copyright (c) 2024 The University of Tennessee and The University + * of Tennessee Research Foundation. All rights + * reserved. * $COPYRIGHT$ * * Additional copyrights may follow @@ -35,10 +38,16 @@ static int accelerator_cuda_memcpy_async(int dest_dev_id, int src_dev_id, void * opal_accelerator_stream_t *stream, opal_accelerator_transfer_type_t type); static int accelerator_cuda_memcpy(int dest_dev_id, int src_dev_id, void *dest, const void *src, size_t size, opal_accelerator_transfer_type_t type); +static int accelerator_cuda_memmove_async(int dest_dev_id, int src_dev_id, void *dest, const void *src, + size_t size, opal_accelerator_stream_t *stream, + opal_accelerator_transfer_type_t type); static int accelerator_cuda_memmove(int dest_dev_id, int src_dev_id, void *dest, const void *src, size_t size, opal_accelerator_transfer_type_t type); static int accelerator_cuda_mem_alloc(int dev_id, void **ptr, size_t size); static int accelerator_cuda_mem_release(int dev_id, void *ptr); +static int accelerator_cuda_mem_alloc_stream(int dev_id, void **ptr, size_t size, + opal_accelerator_stream_t *stream); +static int accelerator_cuda_mem_release_stream(int dev_id, void *ptr, opal_accelerator_stream_t *stream); static int accelerator_cuda_get_address_range(int dev_id, const void *ptr, void **base, size_t *size); @@ -67,6 +76,11 @@ static int accelerator_cuda_device_can_access_peer( int *access, int dev1, int d static int accelerator_cuda_get_buffer_id(int dev_id, const void *addr, opal_accelerator_buffer_id_t *buf_id); +static int accelerator_cuda_wait_stream(opal_accelerator_stream_t *stream); +static int accelerator_cuda_get_num_devices(int *num_devices); +static int accelerator_cuda_get_mem_bw(int device, float *bw); + + #define GET_STREAM(_stream) (_stream == MCA_ACCELERATOR_STREAM_DEFAULT ? 
0 : *((CUstream *)_stream->stream)) opal_accelerator_base_module_t opal_accelerator_cuda_module = @@ -82,9 +96,12 @@ opal_accelerator_base_module_t opal_accelerator_cuda_module = accelerator_cuda_memcpy_async, accelerator_cuda_memcpy, + accelerator_cuda_memmove_async, accelerator_cuda_memmove, accelerator_cuda_mem_alloc, accelerator_cuda_mem_release, + accelerator_cuda_mem_alloc_stream, + accelerator_cuda_mem_release_stream, accelerator_cuda_get_address_range, accelerator_cuda_is_ipc_enabled, @@ -103,9 +120,31 @@ opal_accelerator_base_module_t opal_accelerator_cuda_module = accelerator_cuda_get_device_pci_attr, accelerator_cuda_device_can_access_peer, - accelerator_cuda_get_buffer_id + accelerator_cuda_get_buffer_id, + + accelerator_cuda_wait_stream, + accelerator_cuda_get_num_devices, + accelerator_cuda_get_mem_bw }; +static int accelerator_cuda_get_device_id(CUcontext mem_ctx) { + /* query the device from the context */ + int dev_id = -1; + CUdevice ptr_dev; + cuCtxPushCurrent(mem_ctx); + cuCtxGetDevice(&ptr_dev); + for (int i = 0; i < opal_accelerator_cuda_num_devices; ++i) { + CUdevice dev; + cuDeviceGet(&dev, i); + if (dev == ptr_dev) { + dev_id = i; + break; + } + } + cuCtxPopCurrent(&mem_ctx); + return dev_id; +} + static int accelerator_cuda_check_addr(const void *addr, int *dev_id, uint64_t *flags) { CUresult result; @@ -154,6 +193,9 @@ static int accelerator_cuda_check_addr(const void *addr, int *dev_id, uint64_t * } else if (0 == mem_type) { /* This can happen when CUDA is initialized but dbuf is not valid CUDA pointer */ return 0; + } else { + /* query the device from the context */ + *dev_id = accelerator_cuda_get_device_id(mem_ctx); } /* Must be a device pointer */ assert(CU_MEMORYTYPE_DEVICE == mem_type); @@ -169,6 +211,10 @@ static int accelerator_cuda_check_addr(const void *addr, int *dev_id, uint64_t * } else if (CU_MEMORYTYPE_HOST == mem_type) { /* Host memory, nothing to do here */ return 0; + } else { + result = cuPointerGetAttribute(&mem_ctx, CU_POINTER_ATTRIBUTE_CONTEXT, dbuf); + /* query the device from the context */ + *dev_id = accelerator_cuda_get_device_id(mem_ctx); } /* Must be a device pointer */ assert(CU_MEMORYTYPE_DEVICE == mem_type); @@ -216,7 +262,7 @@ static int accelerator_cuda_check_addr(const void *addr, int *dev_id, uint64_t * } } - /* WORKAROUND - They are times when the above code determines a pice of memory + /* WORKAROUND - There are times when the above code determines a pice of memory * is GPU memory, but it actually is not. That has been seen on multi-GPU systems * with 6 or 8 GPUs on them. Therefore, we will do this extra check. Note if we * made it this far, then the assumption at this point is we have GPU memory. @@ -435,34 +481,23 @@ static int accelerator_cuda_memcpy(int dest_dev_id, int src_dev_id, void *dest, return OPAL_SUCCESS; } - /* Async copy then synchronize is the default behavior as some applications - * cannot utilize synchronous copies. In addition, host memory does not need - * to be page-locked if an Async memory copy is done (It just makes it synchronous - * which is what we want anyway): - * https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#concurrent-execution-host-device - * Additionally, cuMemcpy is not necessarily always synchronous. 
See: - * https://docs.nvidia.com/cuda/cuda-driver-api/api-sync-behavior.html - * TODO: Add optimizations for type field */ - result = cuMemcpyAsync((CUdeviceptr) dest, (CUdeviceptr) src, size, opal_accelerator_cuda_memcpy_stream); - if (OPAL_UNLIKELY(CUDA_SUCCESS != result)) { - opal_show_help("help-accelerator-cuda.txt", "cuMemcpyAsync failed", true, dest, src, - size, result); - return OPAL_ERROR; - } - result = cuStreamSynchronize(opal_accelerator_cuda_memcpy_stream); + result = cuMemcpy((CUdeviceptr) dest, (CUdeviceptr) src, size); if (OPAL_UNLIKELY(CUDA_SUCCESS != result)) { - opal_show_help("help-accelerator-cuda.txt", "cuStreamSynchronize failed", true, + opal_show_help("help-accelerator-cuda.txt", "cuMemcpy failed", true, OPAL_PROC_MY_HOSTNAME, result); return OPAL_ERROR; } return OPAL_SUCCESS; } -static int accelerator_cuda_memmove(int dest_dev_id, int src_dev_id, void *dest, const void *src, size_t size, - opal_accelerator_transfer_type_t type) +static int accelerator_cuda_memmove_async(int dest_dev_id, int src_dev_id, void *dest, + const void *src, size_t size, + opal_accelerator_stream_t *stream, + opal_accelerator_transfer_type_t type) { CUdeviceptr tmp; CUresult result; + void *ptr; int delayed_init = opal_accelerator_cuda_delayed_init(); if (OPAL_UNLIKELY(0 != delayed_init)) { @@ -473,29 +508,41 @@ static int accelerator_cuda_memmove(int dest_dev_id, int src_dev_id, void *dest, return OPAL_ERR_BAD_PARAM; } - result = cuMemAlloc(&tmp, size); - if (OPAL_UNLIKELY(CUDA_SUCCESS != result)) { + result = accelerator_cuda_mem_alloc_stream(src_dev_id, &ptr, size, stream); + if (OPAL_UNLIKELY(OPAL_SUCCESS != result)) { return OPAL_ERROR; } - result = cuMemcpyAsync(tmp, (CUdeviceptr) src, size, opal_accelerator_cuda_memcpy_stream); + tmp = (CUdeviceptr)ptr; + result = cuMemcpyAsync(tmp, (CUdeviceptr) src, size, *(CUstream*)stream->stream); if (OPAL_UNLIKELY(CUDA_SUCCESS != result)) { opal_show_help("help-accelerator-cuda.txt", "cuMemcpyAsync failed", true, tmp, src, size, result); return OPAL_ERROR; } - result = cuMemcpyAsync((CUdeviceptr) dest, tmp, size, opal_accelerator_cuda_memcpy_stream); + result = cuMemcpyAsync((CUdeviceptr) dest, tmp, size, *(CUstream*)stream->stream); if (OPAL_UNLIKELY(CUDA_SUCCESS != result)) { opal_show_help("help-accelerator-cuda.txt", "cuMemcpyAsync failed", true, dest, tmp, size, result); return OPAL_ERROR; } - result = cuStreamSynchronize(opal_accelerator_cuda_memcpy_stream); - if (OPAL_UNLIKELY(CUDA_SUCCESS != result)) { + return accelerator_cuda_mem_release_stream(src_dev_id, ptr, stream); +} + +static int accelerator_cuda_memmove(int dest_dev_id, int src_dev_id, void *dest, const void *src, size_t size, + opal_accelerator_transfer_type_t type) +{ + int ret; + + ret = accelerator_cuda_memmove_async(dest_dev_id, src_dev_id, dest, src, size, &opal_accelerator_cuda_memcpy_stream.base, type); + if (OPAL_SUCCESS != ret) { + return OPAL_ERROR; + } + ret = accelerator_cuda_wait_stream(&opal_accelerator_cuda_memcpy_stream.base); + if (OPAL_UNLIKELY(OPAL_SUCCESS != ret)) { opal_show_help("help-accelerator-cuda.txt", "cuStreamSynchronize failed", true, OPAL_PROC_MY_HOSTNAME, ret); return OPAL_ERROR; } - cuMemFree(tmp); return OPAL_SUCCESS; } @@ -512,15 +559,35 @@ static int accelerator_cuda_mem_alloc(int dev_id, void **ptr, size_t size) return OPAL_ERR_BAD_PARAM; } - if (size > 0) { - result = cuMemAlloc((CUdeviceptr *) ptr, size); - if (OPAL_UNLIKELY(CUDA_SUCCESS != result)) { - opal_show_help("help-accelerator-cuda.txt", "cuMemAlloc failed", true, -
OPAL_PROC_MY_HOSTNAME, result); - return OPAL_ERROR; - } + result = cuMemAlloc((CUdeviceptr *) ptr, size); + if (OPAL_UNLIKELY(CUDA_SUCCESS != result)) { + opal_show_help("help-accelerator-cuda.txt", "cuMemAlloc failed", true, + OPAL_PROC_MY_HOSTNAME, result); + return OPAL_ERROR; } - return 0; + return OPAL_SUCCESS; +} + + + +static int accelerator_cuda_mem_alloc_stream(int dev_id, void **addr, size_t size, + opal_accelerator_stream_t *stream) +{ + + int delayed_init = opal_accelerator_cuda_delayed_init(); + if (OPAL_UNLIKELY(0 != delayed_init)) { + return delayed_init; + } + + /* fall-back to regular stream allocation */ + + CUresult result = cuMemAllocAsync((CUdeviceptr*)addr, size, *(CUstream*)stream->stream); + if (OPAL_UNLIKELY(CUDA_SUCCESS != result)) { + opal_show_help("help-accelerator-cuda.txt", "cuMemAlloc failed", true, + OPAL_PROC_MY_HOSTNAME, result); + return OPAL_ERROR; + } + return OPAL_SUCCESS; } static int accelerator_cuda_mem_release(int dev_id, void *ptr) @@ -537,6 +604,38 @@ static int accelerator_cuda_mem_release(int dev_id, void *ptr) return 0; } +static int accelerator_cuda_mem_release_stream(int dev_id, void *addr, + opal_accelerator_stream_t *stream) +{ + CUresult result; + + if (NULL == stream || NULL == addr) { + return OPAL_ERR_BAD_PARAM; + } + + result = cuMemFreeAsync((CUdeviceptr)addr, *(CUstream*)stream->stream); + if (OPAL_UNLIKELY(CUDA_SUCCESS != result)) { + opal_show_help("help-accelerator-cuda.txt", "cuMemFree failed", true, + OPAL_PROC_MY_HOSTNAME, result); + return OPAL_ERROR; + } + return OPAL_SUCCESS; +} + + +static int accelerator_cuda_wait_stream(opal_accelerator_stream_t *stream) +{ + CUresult result; + result = cuStreamSynchronize(*(CUstream*)stream->stream); + if (OPAL_UNLIKELY(CUDA_SUCCESS != result)) { + opal_show_help("help-accelerator-cuda.txt", "cuStreamSynchronize failed", true, + OPAL_PROC_MY_HOSTNAME, result); + return OPAL_ERROR; + } + return OPAL_SUCCESS; +} + + static int accelerator_cuda_get_address_range(int dev_id, const void *ptr, void **base, size_t *size) { @@ -764,3 +863,29 @@ static int accelerator_cuda_get_buffer_id(int dev_id, const void *addr, opal_acc } return OPAL_SUCCESS; } + + + +static int accelerator_cuda_get_num_devices(int *num_devices) +{ + + int delayed_init = opal_accelerator_cuda_delayed_init(); + if (OPAL_UNLIKELY(0 != delayed_init)) { + return delayed_init; + } + + *num_devices = opal_accelerator_cuda_num_devices; + return OPAL_SUCCESS; +} + +static int accelerator_cuda_get_mem_bw(int device, float *bw) +{ + int delayed_init = opal_accelerator_cuda_delayed_init(); + if (OPAL_UNLIKELY(0 != delayed_init)) { + return delayed_init; + } + assert(opal_accelerator_cuda_mem_bw != NULL); + + *bw = opal_accelerator_cuda_mem_bw[device]; + return OPAL_SUCCESS; +} \ No newline at end of file diff --git a/opal/mca/accelerator/cuda/accelerator_cuda.h b/opal/mca/accelerator/cuda/accelerator_cuda.h index 694a4192231..831d3419881 100644 --- a/opal/mca/accelerator/cuda/accelerator_cuda.h +++ b/opal/mca/accelerator/cuda/accelerator_cuda.h @@ -2,6 +2,9 @@ * Copyright (c) 2014 Intel, Inc. All rights reserved. * Copyright (c) 2017-2022 Amazon.com, Inc. or its affiliates. * All Rights reserved. + * Copyright (c) 2024 The University of Tennessee and The University + * of Tennessee Research Foundation. All rights + * reserved. 
* $COPYRIGHT$ * * Additional copyrights may follow @@ -38,13 +41,16 @@ typedef struct opal_accelerator_cuda_event_t opal_accelerator_cuda_event_t; OBJ_CLASS_DECLARATION(opal_accelerator_cuda_event_t); /* Declare extern variables, defined in accelerator_cuda_component.c */ -OPAL_DECLSPEC extern CUstream opal_accelerator_cuda_memcpy_stream; -OPAL_DECLSPEC extern opal_mutex_t opal_accelerator_cuda_stream_lock; +OPAL_DECLSPEC extern opal_accelerator_cuda_stream_t opal_accelerator_cuda_memcpy_stream; OPAL_DECLSPEC extern opal_accelerator_cuda_component_t mca_accelerator_cuda_component; OPAL_DECLSPEC extern opal_accelerator_base_module_t opal_accelerator_cuda_module; +OPAL_DECLSPEC extern int opal_accelerator_cuda_num_devices; + +OPAL_DECLSPEC extern float *opal_accelerator_cuda_mem_bw; + OPAL_DECLSPEC extern int opal_accelerator_cuda_delayed_init(void); END_C_DECLS diff --git a/opal/mca/accelerator/cuda/accelerator_cuda_component.c b/opal/mca/accelerator/cuda/accelerator_cuda_component.c index d48e29c9f65..0d39e1f4eef 100644 --- a/opal/mca/accelerator/cuda/accelerator_cuda_component.c +++ b/opal/mca/accelerator/cuda/accelerator_cuda_component.c @@ -6,6 +6,9 @@ * reserved. * Copyright (c) 2017-2022 Amazon.com, Inc. or its affiliates. * All Rights reserved. + * Copyright (c) 2024 The University of Tennessee and The University + * of Tennessee Research Foundation. All rights + * reserved. * $COPYRIGHT$ * * Additional copyrights may follow @@ -34,13 +37,15 @@ #include "opal/sys/atomic.h" /* Define global variables, used in accelerator_cuda.c */ -CUstream opal_accelerator_cuda_memcpy_stream = NULL; -opal_mutex_t opal_accelerator_cuda_stream_lock = {0}; +opal_accelerator_cuda_stream_t opal_accelerator_cuda_memcpy_stream = {0}; +int opal_accelerator_cuda_num_devices = 0; /* Initialization lock for delayed cuda initialization */ static opal_mutex_t accelerator_cuda_init_lock; static bool accelerator_cuda_init_complete = false; +float *opal_accelerator_cuda_mem_bw = NULL; + #define STRINGIFY2(x) #x #define STRINGIFY(x) STRINGIFY2(x) @@ -122,6 +127,7 @@ static int accelerator_cuda_component_register(void) int opal_accelerator_cuda_delayed_init() { int result = OPAL_SUCCESS; + int prio_lo, prio_hi; CUcontext cuContext; /* Double checked locking to avoid having to @@ -137,6 +143,8 @@ int opal_accelerator_cuda_delayed_init() goto out; } + cuDeviceGetCount(&opal_accelerator_cuda_num_devices); + /* Check to see if this process is running in a CUDA context. If * so, all is good. If not, then disable registration of memory. 
*/ result = cuCtxGetCurrent(&cuContext); @@ -145,19 +153,51 @@ int opal_accelerator_cuda_delayed_init() goto out; } else if ((CUDA_SUCCESS == result) && (NULL == cuContext)) { opal_output_verbose(20, opal_accelerator_base_framework.framework_output, "CUDA: cuCtxGetCurrent returned NULL context"); - result = OPAL_ERROR; - goto out; + + /* create a context for each device */ + for (int i = 0; i < opal_accelerator_cuda_num_devices; ++i) { + CUdevice dev; + result = cuDeviceGet(&dev, i); + if (CUDA_SUCCESS != result) { + opal_output_verbose(20, opal_accelerator_base_framework.framework_output, + "CUDA: cuDeviceGet failed"); + result = OPAL_ERROR; + goto out; + } + result = cuDevicePrimaryCtxRetain(&cuContext, dev); + if (CUDA_SUCCESS != result) { + opal_output_verbose(20, opal_accelerator_base_framework.framework_output, + "CUDA: cuDevicePrimaryCtxRetain failed"); + result = OPAL_ERROR; + goto out; + } + if (0 == i) { + result = cuCtxPushCurrent(cuContext); + if (CUDA_SUCCESS != result) { + opal_output_verbose(20, opal_accelerator_base_framework.framework_output, + "CUDA: cuCtxPushCurrent failed"); + result = OPAL_ERROR; + goto out; + } + } + } + + } else { opal_output_verbose(20, opal_accelerator_base_framework.framework_output, "CUDA: cuCtxGetCurrent succeeded"); } /* Create stream for use in cuMemcpyAsync synchronous copies */ - result = cuStreamCreate(&opal_accelerator_cuda_memcpy_stream, 0); + CUstream memcpy_stream; + result = cuStreamCreate(&memcpy_stream, 0); if (OPAL_UNLIKELY(result != CUDA_SUCCESS)) { opal_show_help("help-accelerator-cuda.txt", "cuStreamCreate failed", true, OPAL_PROC_MY_HOSTNAME, result); goto out; } + OBJ_CONSTRUCT(&opal_accelerator_cuda_memcpy_stream, opal_accelerator_cuda_stream_t); + opal_accelerator_cuda_memcpy_stream.base.stream = malloc(sizeof(CUstream)); + *(CUstream*)opal_accelerator_cuda_memcpy_stream.base.stream = memcpy_stream; result = cuMemHostRegister(&checkmem, sizeof(int), 0); if (result != CUDA_SUCCESS) { @@ -165,11 +205,36 @@ int opal_accelerator_cuda_delayed_init() * This is not a fatal error. 
*/ opal_show_help("help-accelerator-cuda.txt", "cuMemHostRegister during init failed", true, &checkmem, sizeof(int), OPAL_PROC_MY_HOSTNAME, result, "checkmem"); - } else { opal_output_verbose(20, opal_accelerator_base_framework.framework_output, "CUDA: cuMemHostRegister OK on test region"); } + + /* determine the memory bandwidth */ + opal_accelerator_cuda_mem_bw = malloc(sizeof(float)*opal_accelerator_cuda_num_devices); + for (int i = 0; i < opal_accelerator_cuda_num_devices; ++i) { + CUdevice dev; + result = cuDeviceGet(&dev, i); + if (CUDA_SUCCESS != result) { + opal_output_verbose(20, opal_accelerator_base_framework.framework_output, + "CUDA: cuDeviceGet failed"); + goto out; + } + int mem_clock_rate; // kHz + result = cuDeviceGetAttribute(&mem_clock_rate, + CU_DEVICE_ATTRIBUTE_MEMORY_CLOCK_RATE, + dev); + int bus_width; // bit + result = cuDeviceGetAttribute(&bus_width, + CU_DEVICE_ATTRIBUTE_GLOBAL_MEMORY_BUS_WIDTH, + dev); + /* bw = clock_rate * bus width * 2bit multiplier + * See https://forums.developer.nvidia.com/t/memory-clock-rate/107940 + */ + float bw = ((float)mem_clock_rate*(float)bus_width*2.0) / 1024 / 1024 / 8; + opal_accelerator_cuda_mem_bw[i] = bw; + } + result = OPAL_SUCCESS; opal_atomic_wmb(); accelerator_cuda_init_complete = true; @@ -180,8 +245,9 @@ int opal_accelerator_cuda_delayed_init() static opal_accelerator_base_module_t* accelerator_cuda_init(void) { - OBJ_CONSTRUCT(&opal_accelerator_cuda_stream_lock, opal_mutex_t); OBJ_CONSTRUCT(&accelerator_cuda_init_lock, opal_mutex_t); + OBJ_CONSTRUCT(&opal_accelerator_cuda_memcpy_stream, opal_accelerator_stream_t); + /* First check if the support is enabled. In the case that the user has * turned it off, we do not need to continue with any CUDA specific * initialization. Do this after MCA parameter registration. */ @@ -205,11 +271,14 @@ static void accelerator_cuda_finalize(opal_accelerator_base_module_t* module) if (CUDA_SUCCESS != result) { ctx_ok = 0; } - if ((NULL != opal_accelerator_cuda_memcpy_stream) && ctx_ok) { - cuStreamDestroy(opal_accelerator_cuda_memcpy_stream); + + if ((NULL != opal_accelerator_cuda_memcpy_stream.base.stream) && ctx_ok) { + OBJ_DESTRUCT(&opal_accelerator_cuda_memcpy_stream); } - OBJ_DESTRUCT(&opal_accelerator_cuda_stream_lock); + free(opal_accelerator_cuda_mem_bw); + opal_accelerator_cuda_mem_bw = NULL; + OBJ_DESTRUCT(&accelerator_cuda_init_lock); return; } diff --git a/opal/mca/accelerator/null/accelerator_null_component.c b/opal/mca/accelerator/null/accelerator_null_component.c index 1bd6e0e2811..ce36d79f164 100644 --- a/opal/mca/accelerator/null/accelerator_null_component.c +++ b/opal/mca/accelerator/null/accelerator_null_component.c @@ -9,6 +9,9 @@ * Copyright (c) Amazon.com, Inc. or its affiliates. * All Rights reserved. * Copyright (c) 2023 Advanced Micro Devices, Inc. All rights reserved. + * Copyright (c) 2024 The University of Tennessee and The University + * of Tennessee Research Foundation. All rights + * reserved. 
* $COPYRIGHT$ * * Additional copyrights may follow @@ -50,11 +53,15 @@ static int accelerator_null_memcpy_async(int dest_dev_id, int src_dev_id, void * opal_accelerator_stream_t *stream, opal_accelerator_transfer_type_t type); static int accelerator_null_memcpy(int dest_dev_id, int src_dev_id, void *dest, const void *src, size_t size, opal_accelerator_transfer_type_t type); +static int accelerator_null_memmove_async(int dest_dev_id, int src_dev_id, void *dest, const void *src, size_t size, + opal_accelerator_stream_t *stream, opal_accelerator_transfer_type_t type); static int accelerator_null_memmove(int dest_dev_id, int src_dev_id, void *dest, const void *src, size_t size, opal_accelerator_transfer_type_t type); static int accelerator_null_mem_alloc(int dev_id, void **ptr, size_t size); static int accelerator_null_mem_release(int dev_id, void *ptr); +static int accelerator_null_mem_alloc_stream(int dev_id, void **ptr, size_t size, opal_accelerator_stream_t* stream); +static int accelerator_null_mem_release_stream(int dev_id, void *ptr, opal_accelerator_stream_t *stream); static int accelerator_null_get_address_range(int dev_id, const void *ptr, void **base, size_t *size); static bool accelerator_null_is_ipc_enabled(void); @@ -82,6 +89,12 @@ static int accelerator_null_device_can_access_peer(int *access, int dev1, int de static int accelerator_null_get_buffer_id(int dev_id, const void *addr, opal_accelerator_buffer_id_t *buf_id); +static int accelerator_null_wait_stream(opal_accelerator_stream_t *stream); + +static int accelerator_null_get_num_devices(int *num_devices); + +static int accelerator_null_get_mem_bw(int device, float *bw); + /* * Instantiate the public struct with all of our public information * and pointers to our public functions in it @@ -133,9 +146,12 @@ opal_accelerator_base_module_t opal_accelerator_null_module = accelerator_null_memcpy_async, accelerator_null_memcpy, + accelerator_null_memmove_async, accelerator_null_memmove, accelerator_null_mem_alloc, accelerator_null_mem_release, + accelerator_null_mem_alloc_stream, + accelerator_null_mem_release_stream, accelerator_null_get_address_range, accelerator_null_is_ipc_enabled, @@ -154,7 +170,11 @@ opal_accelerator_base_module_t opal_accelerator_null_module = accelerator_null_get_device_pci_attr, accelerator_null_device_can_access_peer, - accelerator_null_get_buffer_id + accelerator_null_get_buffer_id, + + accelerator_null_wait_stream, + accelerator_null_get_num_devices, + accelerator_null_get_mem_bw }; static int accelerator_null_open(void) @@ -237,6 +257,13 @@ static int accelerator_null_memmove(int dest_dev_id, int src_dev_id, void *dest, return OPAL_SUCCESS; } +static int accelerator_null_memmove_async(int dest_dev_id, int src_dev_id, void *dest, const void *src, size_t size, + opal_accelerator_stream_t *stream, opal_accelerator_transfer_type_t type) +{ + memmove(dest, src, size); + return OPAL_SUCCESS; +} + static int accelerator_null_mem_alloc(int dev_id, void **ptr, size_t size) { *ptr = malloc(size); @@ -249,6 +276,23 @@ static int accelerator_null_mem_release(int dev_id, void *ptr) return OPAL_SUCCESS; } + +static int accelerator_null_mem_alloc_stream(int dev_id, void **ptr, size_t size, + opal_accelerator_stream_t *stream) +{ + (void)stream; + *ptr = malloc(size); + return OPAL_SUCCESS; +} + +static int accelerator_null_mem_release_stream(int dev_id, void *ptr, + opal_accelerator_stream_t *stream) +{ + (void)stream; + free(ptr); + return OPAL_SUCCESS; +} + static int accelerator_null_get_address_range(int dev_id, 
const void *ptr, void **base, size_t *size) { @@ -331,3 +375,21 @@ static int accelerator_null_get_buffer_id(int dev_id, const void *addr, opal_acc { return OPAL_ERR_NOT_IMPLEMENTED; } + +static int accelerator_null_wait_stream(opal_accelerator_stream_t *stream) +{ + return OPAL_SUCCESS; +} + +static int accelerator_null_get_num_devices(int *num_devices) +{ + *num_devices = 0; + return OPAL_SUCCESS; +} + + +static int accelerator_null_get_mem_bw(int device, float *bw) +{ + *bw = 1.0; // return something that is not 0 + return OPAL_SUCCESS; +} diff --git a/opal/mca/accelerator/rocm/accelerator_rocm.h b/opal/mca/accelerator/rocm/accelerator_rocm.h index 38409778ad4..22e78d91a0e 100644 --- a/opal/mca/accelerator/rocm/accelerator_rocm.h +++ b/opal/mca/accelerator/rocm/accelerator_rocm.h @@ -1,5 +1,8 @@ /* * Copyright (c) 2022-2023 Advanced Micro Devices, Inc. All rights reserved. + * Copyright (c) 2024 The University of Tennessee and The University + * of Tennessee Research Foundation. All rights + * reserved. * * $COPYRIGHT$ * @@ -67,12 +70,15 @@ struct opal_accelerator_rocm_ipc_event_handle_t { typedef struct opal_accelerator_rocm_ipc_event_handle_t opal_accelerator_rocm_ipc_event_handle_t; OBJ_CLASS_DECLARATION(opal_accelerator_rocm_ipc_event_handle_t); -OPAL_DECLSPEC extern hipStream_t opal_accelerator_rocm_MemcpyStream; +OPAL_DECLSPEC extern hipStream_t *opal_accelerator_rocm_MemcpyStream; OPAL_DECLSPEC extern int opal_accelerator_rocm_memcpy_async; OPAL_DECLSPEC extern int opal_accelerator_rocm_verbose; OPAL_DECLSPEC extern size_t opal_accelerator_rocm_memcpyH2D_limit; OPAL_DECLSPEC extern size_t opal_accelerator_rocm_memcpyD2H_limit; +OPAL_DECLSPEC extern int opal_accelerator_rocm_num_devices; +OPAL_DECLSPEC extern float *opal_accelerator_rocm_mem_bw; + OPAL_DECLSPEC extern int opal_accelerator_rocm_lazy_init(void); #endif diff --git a/opal/mca/accelerator/rocm/accelerator_rocm_component.c b/opal/mca/accelerator/rocm/accelerator_rocm_component.c index 8f1bbbb53a5..c999909d7df 100644 --- a/opal/mca/accelerator/rocm/accelerator_rocm_component.c +++ b/opal/mca/accelerator/rocm/accelerator_rocm_component.c @@ -7,6 +7,9 @@ * Copyright (c) 2017-2022 Amazon.com, Inc. or its affiliates. * All Rights reserved. * Copyright (c) 2022-2024 Advanced Micro Devices, Inc. All Rights reserved. + * Copyright (c) 2024 The University of Tennessee and The University + * of Tennessee Research Foundation. All rights + * reserved. 
* $COPYRIGHT$ * * Additional copyrights may follow @@ -19,7 +22,9 @@ #include #include "opal/mca/dl/base/base.h" +#include "opal/mca/accelerator/base/base.h" #include "opal/runtime/opal_params.h" +#include "opal/util/proc.h" #include "accelerator_rocm.h" int opal_accelerator_rocm_memcpy_async = 0; @@ -31,7 +36,10 @@ size_t opal_accelerator_rocm_memcpyH2D_limit=1048576; static opal_mutex_t accelerator_rocm_init_lock; static bool accelerator_rocm_init_complete = false; -hipStream_t opal_accelerator_rocm_MemcpyStream = NULL; +/* Define global variables, used in accelerator_rocm.c */ +int opal_accelerator_rocm_num_devices = 0; +float *opal_accelerator_rocm_mem_bw = NULL; +hipStream_t *opal_accelerator_rocm_MemcpyStream = NULL; /* * Public string showing the accelerator rocm component version number @@ -159,6 +167,7 @@ static int accelerator_rocm_component_register(void) int opal_accelerator_rocm_lazy_init() { + int prio_hi, prio_lo; int err = OPAL_SUCCESS; /* Double checked locking to avoid having to @@ -174,13 +183,35 @@ int opal_accelerator_rocm_lazy_init() goto out; } - err = hipStreamCreate(&opal_accelerator_rocm_MemcpyStream); - if (hipSuccess != err) { + hipGetDeviceCount(&opal_accelerator_rocm_num_devices); + + /* Create stream for use in hipMemcpyAsync synchronous copies */ + hipStream_t memcpy_stream; + err = hipStreamCreate(&memcpy_stream); + if (OPAL_UNLIKELY(err != hipSuccess)) { opal_output(0, "Could not create hipStream, err=%d %s\n", err, hipGetErrorString(err)); goto out; } - + opal_accelerator_rocm_MemcpyStream = malloc(sizeof(hipStream_t)); + *(hipStream_t*)opal_accelerator_rocm_MemcpyStream = memcpy_stream; + + opal_accelerator_rocm_mem_bw = malloc(sizeof(float)*opal_accelerator_rocm_num_devices); + for (int i = 0; i < opal_accelerator_rocm_num_devices; ++i) { + int mem_clock_rate; // kHz + err = hipDeviceGetAttribute(&mem_clock_rate, + hipDeviceAttributeMemoryClockRate, + i); + int bus_width; // bit + err = hipDeviceGetAttribute(&bus_width, + hipDeviceAttributeMemoryBusWidth, + i); + /* bw = clock_rate * bus width * 2bit multiplier + * See https://forums.developer.nvidia.com/t/memory-clock-rate/107940 + */ + float bw = ((float)mem_clock_rate*(float)bus_width*2.0) / 1024 / 1024 / 8; + opal_accelerator_rocm_mem_bw[i] = bw; + } err = OPAL_SUCCESS; opal_atomic_wmb(); accelerator_rocm_init_complete = true; @@ -192,7 +223,7 @@ int opal_accelerator_rocm_lazy_init() static opal_accelerator_base_module_t* accelerator_rocm_init(void) { OBJ_CONSTRUCT(&accelerator_rocm_init_lock, opal_mutex_t); - + hipError_t err; if (opal_rocm_runtime_initialized) { @@ -214,12 +245,16 @@ static opal_accelerator_base_module_t* accelerator_rocm_init(void) static void accelerator_rocm_finalize(opal_accelerator_base_module_t* module) { - if (NULL != (void*)opal_accelerator_rocm_MemcpyStream) { - hipError_t err = hipStreamDestroy(opal_accelerator_rocm_MemcpyStream); + if (NULL != opal_accelerator_rocm_MemcpyStream) { + hipError_t err = hipStreamDestroy(*opal_accelerator_rocm_MemcpyStream); if (hipSuccess != err) { opal_output_verbose(10, 0, "hip_dl_finalize: error while destroying the hipStream\n"); } + free(opal_accelerator_rocm_MemcpyStream); opal_accelerator_rocm_MemcpyStream = NULL; + + free(opal_accelerator_rocm_mem_bw); + opal_accelerator_rocm_mem_bw = NULL; } OBJ_DESTRUCT(&accelerator_rocm_init_lock); diff --git a/opal/mca/accelerator/rocm/accelerator_rocm_module.c b/opal/mca/accelerator/rocm/accelerator_rocm_module.c index 6db5e0d4927..dfebf09def8 100644 --- 
a/opal/mca/accelerator/rocm/accelerator_rocm_module.c +++ b/opal/mca/accelerator/rocm/accelerator_rocm_module.c @@ -1,6 +1,9 @@ /* * Copyright (c) 2022-2023 Advanced Micro Devices, Inc. All Rights reserved. * $COPYRIGHT$ + * Copyright (c) 2024 The University of Tennessee and The University + * of Tennessee Research Foundation. All rights + * reserved. * * Additional copyrights may follow * @@ -27,10 +30,17 @@ static int mca_accelerator_rocm_memcpy_async(int dest_dev_id, int src_dev_id, vo opal_accelerator_stream_t *stream, opal_accelerator_transfer_type_t type); static int mca_accelerator_rocm_memcpy(int dest_dev_id, int src_dev_id, void *dest, const void *src, size_t size, opal_accelerator_transfer_type_t type); +static int mca_accelerator_rocm_memmove_async(int dest_dev_id, int src_dev_id, void *dest, + const void *src, size_t size, + opal_accelerator_stream_t *stream, + opal_accelerator_transfer_type_t type); static int mca_accelerator_rocm_memmove(int dest_dev_id, int src_dev_id, void *dest, const void *src, size_t size, opal_accelerator_transfer_type_t type); static int mca_accelerator_rocm_mem_alloc(int dev_id, void **ptr, size_t size); static int mca_accelerator_rocm_mem_release(int dev_id, void *ptr); +static int mca_accelerator_rocm_mem_alloc_stream(int dev_id, void **ptr, size_t size, + opal_accelerator_stream_t *stream); +static int mca_accelerator_rocm_mem_release_stream(int dev_id, void *ptr, opal_accelerator_stream_t *stream); static int mca_accelerator_rocm_get_address_range(int dev_id, const void *ptr, void **base, size_t *size); @@ -59,6 +69,11 @@ static int mca_accelerator_rocm_device_can_access_peer( int *access, int dev1, i static int mca_accelerator_rocm_get_buffer_id(int dev_id, const void *addr, opal_accelerator_buffer_id_t *buf_id); +static int mca_accelerator_rocm_wait_stream(opal_accelerator_stream_t *stream); + +static int mca_accelerator_rocm_get_num_devices(int *num_devices); + +static int mca_accelerator_rocm_get_mem_bw(int device, float *bw); #define GET_STREAM(_stream) (_stream == MCA_ACCELERATOR_STREAM_DEFAULT ? 
0 : *((hipStream_t *)_stream->stream)) @@ -75,9 +90,12 @@ opal_accelerator_base_module_t opal_accelerator_rocm_module = mca_accelerator_rocm_memcpy_async, mca_accelerator_rocm_memcpy, + mca_accelerator_rocm_memmove_async, mca_accelerator_rocm_memmove, mca_accelerator_rocm_mem_alloc, mca_accelerator_rocm_mem_release, + mca_accelerator_rocm_mem_alloc_stream, + mca_accelerator_rocm_mem_release_stream, mca_accelerator_rocm_get_address_range, mca_accelerator_rocm_is_ipc_enabled, @@ -96,7 +114,11 @@ opal_accelerator_base_module_t opal_accelerator_rocm_module = mca_accelerator_rocm_get_device_pci_attr, mca_accelerator_rocm_device_can_access_peer, - mca_accelerator_rocm_get_buffer_id + mca_accelerator_rocm_get_buffer_id, + + mca_accelerator_rocm_wait_stream, + mca_accelerator_rocm_get_num_devices, + mca_accelerator_rocm_get_mem_bw }; @@ -233,7 +255,7 @@ OBJ_CLASS_INSTANCE( opal_accelerator_event_t, NULL, mca_accelerator_rocm_event_destruct); - + static int mca_accelerator_rocm_record_event(int dev_id, opal_accelerator_event_t *event, opal_accelerator_stream_t *stream) { @@ -348,14 +370,14 @@ static int mca_accelerator_rocm_memcpy(int dest_dev_id, int src_dev_id, void *de if (opal_accelerator_rocm_memcpy_async) { err = hipMemcpyAsync(dest, src, size, hipMemcpyDefault, - opal_accelerator_rocm_MemcpyStream); + *opal_accelerator_rocm_MemcpyStream); if (hipSuccess != err ) { opal_output_verbose(10, opal_accelerator_base_framework.framework_output, "error starting async copy\n"); return OPAL_ERROR; } - err = hipStreamSynchronize(opal_accelerator_rocm_MemcpyStream); + err = hipStreamSynchronize(*opal_accelerator_rocm_MemcpyStream); if (hipSuccess != err ) { opal_output_verbose(10, opal_accelerator_base_framework.framework_output, "error synchronizing stream after async copy\n"); @@ -373,6 +395,44 @@ static int mca_accelerator_rocm_memcpy(int dest_dev_id, int src_dev_id, void *de return OPAL_SUCCESS; } +static int mca_accelerator_rocm_memmove_async(int dest_dev_id, int src_dev_id, void *dest, const void *src, + size_t size, opal_accelerator_stream_t *stream, + opal_accelerator_transfer_type_t type) +{ + hipDeviceptr_t tmp; + hipError_t result; + int ret; + void *ptr; + + int delayed_init = opal_accelerator_rocm_lazy_init(); + if (OPAL_UNLIKELY(0 != delayed_init)) { + return delayed_init; + } + + if (NULL == dest || NULL == src || size <= 0) { + return OPAL_ERR_BAD_PARAM; + } + + ret = mca_accelerator_rocm_mem_alloc_stream(src_dev_id, &ptr, size, stream); + if (OPAL_UNLIKELY(OPAL_SUCCESS != ret)) { + return OPAL_ERROR; + } + tmp = (hipDeviceptr_t)ptr; + result = hipMemcpyAsync(tmp, (hipDeviceptr_t) src, size, hipMemcpyDefault, *(hipStream_t*)stream->stream); + if (OPAL_UNLIKELY(hipSuccess != result)) { + opal_output_verbose(10, opal_accelerator_base_framework.framework_output, + "error during synchronous copy\n"); + return OPAL_ERROR; + } + result = hipMemcpyAsync((hipDeviceptr_t) dest, tmp, size, hipMemcpyDefault, *(hipStream_t*)stream->stream); + if (OPAL_UNLIKELY(hipSuccess != result)) { + opal_output_verbose(10, opal_accelerator_base_framework.framework_output, + "error during synchronous copy\n"); + return OPAL_ERROR; + } + return mca_accelerator_rocm_mem_release_stream(src_dev_id, ptr, stream); +} + static int mca_accelerator_rocm_memmove(int dest_dev_id, int src_dev_id, void *dest, const void *src, size_t size, opal_accelerator_transfer_type_t type) @@ -393,7 +453,7 @@ static int mca_accelerator_rocm_memmove(int dest_dev_id, int src_dev_id, void *d if (opal_accelerator_rocm_memcpy_async) { err = 
hipMemcpyAsync(tmp, src, size, hipMemcpyDefault, - opal_accelerator_rocm_MemcpyStream); + *opal_accelerator_rocm_MemcpyStream); if (hipSuccess != err ) { opal_output_verbose(10, opal_accelerator_base_framework.framework_output, "error in async memcpy for memmove\n"); @@ -401,14 +461,14 @@ static int mca_accelerator_rocm_memmove(int dest_dev_id, int src_dev_id, void *d } err = hipMemcpyAsync(dest, tmp, size, hipMemcpyDefault, - opal_accelerator_rocm_MemcpyStream); + *opal_accelerator_rocm_MemcpyStream); if (hipSuccess != err ) { opal_output_verbose(10, opal_accelerator_base_framework.framework_output, "error in async memcpy for memmove\n"); return OPAL_ERROR; } - err = hipStreamSynchronize(opal_accelerator_rocm_MemcpyStream); + err = hipStreamSynchronize(*opal_accelerator_rocm_MemcpyStream); if (hipSuccess != err ) { opal_output_verbose(10, opal_accelerator_base_framework.framework_output, "error synchronizing stream for memmove\n"); @@ -535,7 +595,7 @@ static int mca_accelerator_rocm_get_ipc_handle(int dev_id, void *dev_ptr, "Error in hipIpcGetMemHandle dev_ptr %p", dev_ptr); OBJ_DESTRUCT(rocm_handle); return OPAL_ERROR; - } + } memcpy(rocm_handle->base.handle, &rocm_ipc_handle, IPC_MAX_HANDLE_SIZE); return OPAL_SUCCESS; @@ -597,7 +657,7 @@ static int mca_accelerator_rocm_compare_ipc_handles(uint8_t handle_1[IPC_MAX_HAN static void mca_accelerator_rocm_ipc_event_handle_destruct(opal_accelerator_rocm_ipc_handle_t *handle) { - // Just a place holder, there is no hipIpcCloseEventHandle. + // Just a place holder, there is no hipIpcCloseEventHandle. } OBJ_CLASS_INSTANCE( @@ -617,7 +677,7 @@ static int mca_accelerator_rocm_get_ipc_event_handle(opal_accelerator_event_t *e hipIpcEventHandle_t rocm_ipc_handle; opal_accelerator_rocm_ipc_event_handle_t *rocm_handle = (opal_accelerator_rocm_ipc_event_handle_t *) handle; OBJ_CONSTRUCT(rocm_handle, opal_accelerator_rocm_ipc_event_handle_t); - + memset(rocm_ipc_handle.reserved, 0, HIP_IPC_HANDLE_SIZE); hipError_t err = hipIpcGetEventHandle(&rocm_ipc_handle, *((hipEvent_t *)event->event)); @@ -626,7 +686,7 @@ static int mca_accelerator_rocm_get_ipc_event_handle(opal_accelerator_event_t *e "error in hipIpcGetEventHandle"); OBJ_DESTRUCT(rocm_handle); return OPAL_ERROR; - } + } memcpy(rocm_handle->base.handle, &rocm_ipc_handle, IPC_MAX_HANDLE_SIZE); return OPAL_SUCCESS; @@ -664,7 +724,7 @@ static int mca_accelerator_rocm_open_ipc_event_handle(opal_accelerator_ipc_event opal_output_verbose(10, opal_accelerator_base_framework.framework_output, "error in hipIpcOpenEventHandle"); return OPAL_ERROR; - } + } return OPAL_SUCCESS; } @@ -802,3 +862,81 @@ static int mca_accelerator_rocm_get_buffer_id(int dev_id, const void *addr, opal #endif return OPAL_SUCCESS; } + + +static int mca_accelerator_rocm_mem_alloc_stream( + int dev_id, + void **addr, + size_t size, + opal_accelerator_stream_t *stream) +{ + hipError_t result; + + int delayed_init = opal_accelerator_rocm_lazy_init(); + if (OPAL_UNLIKELY(0 != delayed_init)) { + return delayed_init; + } + + if (NULL == stream || NULL == addr || 0 == size) { + return OPAL_ERR_BAD_PARAM; + } + + result = hipMallocAsync(addr, size, *(hipStream_t*)stream->stream); + if (OPAL_UNLIKELY(hipSuccess != result)) { + opal_output_verbose(10, opal_accelerator_base_framework.framework_output, + "error allocating memory\n"); + return OPAL_ERROR; + } + return OPAL_SUCCESS; +} + +static int mca_accelerator_rocm_mem_release_stream( + int dev_id, + void *addr, + opal_accelerator_stream_t *stream) +{ + hipError_t result; + + if (NULL == stream 
|| NULL == addr) { + return OPAL_ERR_BAD_PARAM; + } + + result = hipFreeAsync(addr, *(hipStream_t*)stream->stream); + if (OPAL_UNLIKELY(hipSuccess != result)) { + opal_output_verbose(10, opal_accelerator_base_framework.framework_output, + "error freeing memory\n"); + return OPAL_ERROR; + } + return OPAL_SUCCESS; +} + +static int mca_accelerator_rocm_wait_stream(opal_accelerator_stream_t *stream) +{ + hipError_t result; + result = hipStreamSynchronize(*(hipStream_t*)stream->stream); + if (OPAL_UNLIKELY(hipSuccess != result)) { + opal_output_verbose(10, opal_accelerator_base_framework.framework_output, + "error synchronizing stream\n"); + return OPAL_ERROR; + } + return OPAL_SUCCESS; +} + + +static int mca_accelerator_rocm_get_num_devices(int *num_devices) +{ + *num_devices = opal_accelerator_rocm_num_devices; + return OPAL_SUCCESS; +} + +static int mca_accelerator_rocm_get_mem_bw(int device, float *bw) +{ + int delayed_init = opal_accelerator_rocm_lazy_init(); + if (OPAL_UNLIKELY(0 != delayed_init)) { + return delayed_init; + } + assert(opal_accelerator_rocm_mem_bw != NULL); + + *bw = opal_accelerator_rocm_mem_bw[device]; + return OPAL_SUCCESS; +} \ No newline at end of file diff --git a/opal/mca/accelerator/ze/accelerator_ze_module.c b/opal/mca/accelerator/ze/accelerator_ze_module.c index 38b49cf4290..ca4b4bb7fdc 100644 --- a/opal/mca/accelerator/ze/accelerator_ze_module.c +++ b/opal/mca/accelerator/ze/accelerator_ze_module.c @@ -32,10 +32,17 @@ static int mca_accelerator_ze_memcpy_async(int dest_dev_id, int src_dev_id, void opal_accelerator_stream_t *stream, opal_accelerator_transfer_type_t type); static int mca_accelerator_ze_memcpy(int dest_dev_id, int src_dev_id, void *dest, const void *src, size_t size, opal_accelerator_transfer_type_t type); +static int mca_accelerator_ze_memmove_async(int dest_dev_id, int src_dev_id, void *dest, + const void *src, size_t size, + opal_accelerator_stream_t *stream, + opal_accelerator_transfer_type_t type); static int mca_accelerator_ze_memmove(int dest_dev_id, int src_dev_id, void *dest, const void *src, size_t size, opal_accelerator_transfer_type_t type); static int mca_accelerator_ze_mem_alloc(int dev_id, void **ptr, size_t size); static int mca_accelerator_ze_mem_release(int dev_id, void *ptr); +static int mca_accelerator_ze_mem_alloc_stream(int dev_id, void **ptr, size_t size, + opal_accelerator_stream_t *stream); +static int mca_accelerator_ze_mem_release_stream(int dev_id, void *ptr, opal_accelerator_stream_t *stream); static int mca_accelerator_ze_get_address_range(int dev_id, const void *ptr, void **base, size_t *size); @@ -65,6 +72,12 @@ static int mca_accelerator_ze_get_device_pci_attr(int dev_id, opal_accelerator_p static int mca_accelerator_ze_get_buffer_id(int dev_id, const void *addr, opal_accelerator_buffer_id_t *buf_id); +static int mca_accelerator_ze_wait_stream(opal_accelerator_stream_t *stream); + +static int mca_accelerator_ze_get_num_devices(int *num_devices); + +static int mca_accelerator_ze_get_mem_bw(int device, float *bw); + opal_accelerator_base_module_t opal_accelerator_ze_module = { .check_addr = mca_accelerator_ze_check_addr, @@ -77,10 +90,13 @@ opal_accelerator_base_module_t opal_accelerator_ze_module = .mem_copy_async = mca_accelerator_ze_memcpy_async, .mem_copy = mca_accelerator_ze_memcpy, + .mem_move_async = mca_accelerator_ze_memmove_async, .mem_move = mca_accelerator_ze_memmove, .mem_alloc = mca_accelerator_ze_mem_alloc, .mem_release = mca_accelerator_ze_mem_release, + .mem_alloc_stream = 
mca_accelerator_ze_mem_alloc_stream, + .mem_release_stream = mca_accelerator_ze_mem_release_stream, .get_address_range = mca_accelerator_ze_get_address_range, .is_ipc_enabled = mca_accelerator_ze_is_ipc_enabled, @@ -99,7 +115,10 @@ opal_accelerator_base_module_t opal_accelerator_ze_module = .get_device_pci_attr = mca_accelerator_ze_get_device_pci_attr, .device_can_access_peer = mca_accelerator_ze_device_can_access_peer, - .get_buffer_id = mca_accelerator_ze_get_buffer_id + .get_buffer_id = mca_accelerator_ze_get_buffer_id, + .wait_stream = mca_accelerator_ze_wait_stream, + .num_devices = mca_accelerator_ze_get_num_devices, + .get_mem_bw = mca_accelerator_ze_get_mem_bw }; static int accelerator_ze_dev_handle_to_dev_id(ze_device_handle_t hDevice) @@ -137,7 +156,7 @@ static int mca_accelerator_ze_check_addr (const void *addr, int *dev_id, uint64_ memset(&attr, 0, sizeof(ze_memory_allocation_properties_t)); - zret = zeMemGetAllocProperties(opal_accelerator_ze_context, + zret = zeMemGetAllocProperties(opal_accelerator_ze_context, addr, &attr, &hDevice); @@ -200,7 +219,7 @@ static int mca_accelerator_ze_create_stream(int dev_id, opal_accelerator_stream_ OBJ_RELEASE(*stream); return OPAL_ERR_OUT_OF_RESOURCE; } - + if (MCA_ACCELERATOR_NO_DEVICE_ID == dev_id) { hDevice = opal_accelerator_ze_devices_handle[0]; } else { @@ -208,9 +227,9 @@ static int mca_accelerator_ze_create_stream(int dev_id, opal_accelerator_stream_ } ze_stream->dev_id = dev_id; - zret = zeCommandQueueCreate(opal_accelerator_ze_context, + zret = zeCommandQueueCreate(opal_accelerator_ze_context, hDevice, - &cmdQueueDesc, + &cmdQueueDesc, &ze_stream->hCommandQueue); if (ZE_RESULT_SUCCESS != zret) { opal_output_verbose(10, opal_accelerator_base_framework.framework_output, @@ -226,12 +245,12 @@ static int mca_accelerator_ze_create_stream(int dev_id, opal_accelerator_stream_ .stype = ZE_STRUCTURE_TYPE_COMMAND_LIST_DESC, .pNext = NULL, .commandQueueGroupOrdinal = 0, - .flags = 0, + .flags = 0, }; - zret = zeCommandListCreate(opal_accelerator_ze_context, - opal_accelerator_ze_devices_handle[0], - &commandListDesc, + zret = zeCommandListCreate(opal_accelerator_ze_context, + opal_accelerator_ze_devices_handle[0], + &commandListDesc, &ze_stream->hCommandList); if (ZE_RESULT_SUCCESS != zret) { opal_output_verbose(10, opal_accelerator_base_framework.framework_output, @@ -359,7 +378,7 @@ static int mca_accelerator_ze_record_event(int dev_id, opal_accelerator_event_t "zeCommandListClose returned %d", zret); return OPAL_ERROR; } - + zret = zeCommandQueueExecuteCommandLists(ze_stream->hCommandQueue, 1, &ze_stream->hCommandList, @@ -469,7 +488,7 @@ static int mca_accelerator_ze_memcpy(int dest_dev_id, int src_dev_id, void *dest if (NULL == src || NULL == dest || size <0) { return OPAL_ERR_BAD_PARAM; - } + } if (0 == size) { return OPAL_SUCCESS; } @@ -486,7 +505,7 @@ static int mca_accelerator_ze_memcpy(int dest_dev_id, int src_dev_id, void *dest if (OPAL_SUCCESS != ret) { return ret; } - } + } ze_stream = (opal_accelerator_ze_stream_t *)opal_accelerator_ze_MemcpyStream[dev_id]->stream; zret = zeCommandListAppendMemoryCopy(ze_stream->hCommandList, @@ -509,8 +528,8 @@ static int mca_accelerator_ze_memcpy(int dest_dev_id, int src_dev_id, void *dest return OPAL_ERROR; } - zret = zeCommandQueueExecuteCommandLists(ze_stream->hCommandQueue, - 1, + zret = zeCommandQueueExecuteCommandLists(ze_stream->hCommandQueue, + 1, &ze_stream->hCommandList, NULL); if (ZE_RESULT_SUCCESS != zret) { @@ -548,12 +567,23 @@ static int mca_accelerator_ze_memmove(int 
dest_dev_id, int src_dev_id, void *des return OPAL_ERR_NOT_IMPLEMENTED; } +static int mca_accelerator_ze_memmove_async(int dest_dev_id, int src_dev_id, void *dest, + const void *src, size_t size, + opal_accelerator_stream_t *stream, + opal_accelerator_transfer_type_t type) +{ + /* + * TODO + */ + return OPAL_ERR_NOT_IMPLEMENTED; +} + static int mca_accelerator_ze_mem_alloc(int dev_id, void **ptr, size_t size) { ze_result_t zret; size_t mem_alignment; ze_device_handle_t hDevice; - + ze_device_mem_alloc_desc_t device_desc = { .stype = ZE_STRUCTURE_TYPE_DEVICE_MEM_ALLOC_DESC, .pNext = NULL, @@ -570,10 +600,10 @@ static int mca_accelerator_ze_mem_alloc(int dev_id, void **ptr, size_t size) /* Currently ZE ignores this argument and uses an internal alignment * value. However, this behavior can change in the future. */ mem_alignment = 1; - zret = zeMemAllocDevice(opal_accelerator_ze_context, - &device_desc, - size, - mem_alignment, + zret = zeMemAllocDevice(opal_accelerator_ze_context, + &device_desc, + size, + mem_alignment, hDevice, ptr); if (ZE_RESULT_SUCCESS != zret) { @@ -603,6 +633,23 @@ static int mca_accelerator_ze_mem_release(int dev_id, void *ptr) return OPAL_ERROR; } +static int mca_accelerator_ze_mem_alloc_stream(int dev_id, void **ptr, size_t size, + opal_accelerator_stream_t *stream) +{ + /* + * TODO + */ + return OPAL_ERR_NOT_IMPLEMENTED; +} + +static int mca_accelerator_ze_mem_release_stream(int dev_id, void *ptr, opal_accelerator_stream_t *stream) +{ + /* + * TODO + */ + return OPAL_ERR_NOT_IMPLEMENTED; +} + static int mca_accelerator_ze_get_address_range(int dev_id, const void *ptr, void **base, size_t *size) { @@ -615,7 +662,7 @@ static int mca_accelerator_ze_get_address_range(int dev_id, const void *ptr, voi } zret = zeMemGetAddressRange(opal_accelerator_ze_context, - ptr, + ptr, &pBase, &pSize); if (ZE_RESULT_SUCCESS != zret) { @@ -694,7 +741,7 @@ static int mca_accelerator_ze_host_unregister(int dev_id, void *ptr) static int mca_accelerator_ze_get_device(int *dev_id) { /* - * this method does not map to the Zero Level API, just return 0. + * this method does not map to the Zero Level API, just return 0. * This may just work if the runtime is use the ZE_AFFINITY_MASK * environment variable to control the visible PV(s) for a given process. */ @@ -709,15 +756,15 @@ static int mca_accelerator_ze_get_device(int *dev_id) } static int mca_accelerator_ze_get_device_pci_attr(int dev_id, opal_accelerator_pci_attr_t *pci_attr) -{ +{ ze_result_t zret; ze_device_handle_t hDevice; ze_pci_ext_properties_t pPciProperties; - + if (NULL == pci_attr) { return OPAL_ERR_BAD_PARAM; } - + if (MCA_ACCELERATOR_NO_DEVICE_ID == dev_id) { hDevice = opal_accelerator_ze_devices_handle[0]; } else { @@ -730,15 +777,15 @@ static int mca_accelerator_ze_get_device_pci_attr(int dev_id, opal_accelerator_p "zeDevicePciGetPropertiesExt returned %d", zret); return OPAL_ERROR; } - + pci_attr->domain_id = (uint16_t)pPciProperties.address.domain; pci_attr->bus_id = (uint8_t) pPciProperties.address.bus; pci_attr->device_id = (uint8_t)pPciProperties.address.device; pci_attr->function_id = (uint8_t)pPciProperties.address.function; return OPAL_SUCCESS; -} - +} + /* * could zeDeviceGetP2PProperties be used instead here? 
@@ -756,7 +803,7 @@ static int mca_accelerator_ze_device_can_access_peer(int *access, int dev1, int hDevice = opal_accelerator_ze_devices_handle[dev1]; hPeerDevice = opal_accelerator_ze_devices_handle[dev2]; - + zret = zeDeviceCanAccessPeer(hDevice, hPeerDevice, &value); @@ -781,7 +828,7 @@ static int mca_accelerator_ze_get_buffer_id(int dev_id, const void *addr, opal_a return OPAL_ERR_BAD_PARAM; } - if (MCA_ACCELERATOR_NO_DEVICE_ID == dev_id) { + if (MCA_ACCELERATOR_NO_DEVICE_ID == dev_id) { hDevice = opal_accelerator_ze_devices_handle[0]; } else { hDevice = opal_accelerator_ze_devices_handle[dev_id]; @@ -798,6 +845,31 @@ static int mca_accelerator_ze_get_buffer_id(int dev_id, const void *addr, opal_a } *buf_id = pMemAllocProperties.id; - + return OPAL_SUCCESS; } + + +static int mca_accelerator_ze_wait_stream(opal_accelerator_stream_t *stream) +{ + /* + * TODO + */ + return OPAL_ERR_NOT_IMPLEMENTED; +} + +static int mca_accelerator_ze_get_num_devices(int *num_devices) +{ + /* + * TODO + */ + return OPAL_ERR_NOT_IMPLEMENTED; +} + +static int mca_accelerator_ze_get_mem_bw(int device, float *bw) +{ + /* + * TODO + */ + return OPAL_ERR_NOT_IMPLEMENTED; +} \ No newline at end of file
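
For reviewers, a minimal usage sketch (not part of the patch) of how a caller might exercise the new stream-ordered entry points end to end. It assumes the framework-selected module is reachable through the usual opal_accelerator global, that streams created by create_stream are released with OBJ_RELEASE, and that the helper name and buffer names are purely illustrative:

/*
 * Illustrative sketch only: push host data into a stream-ordered device
 * buffer using mem_alloc_stream / mem_move_async / mem_release_stream /
 * wait_stream as declared in accelerator.h.  Error handling is abbreviated.
 */
#include "opal_config.h"
#include "opal/mca/accelerator/accelerator.h"
#include "opal/mca/accelerator/base/base.h"

static int stage_to_device(int dev_id, const void *host_src, size_t size)
{
    opal_accelerator_stream_t *stream = NULL;
    void *dev_buf = NULL;
    int rc;

    rc = opal_accelerator.create_stream(dev_id, &stream);
    if (OPAL_SUCCESS != rc) {
        return rc;
    }

    /* stream-ordered allocation: only work enqueued on this stream after
     * this call may touch dev_buf */
    rc = opal_accelerator.mem_alloc_stream(dev_id, &dev_buf, size, stream);
    if (OPAL_SUCCESS != rc) {
        OBJ_RELEASE(stream);
        return rc;
    }

    /* enqueue the move; it may not be complete when this call returns */
    rc = opal_accelerator.mem_move_async(dev_id, MCA_ACCELERATOR_NO_DEVICE_ID,
                                         dev_buf, host_src, size, stream,
                                         MCA_ACCELERATOR_TRANSFER_UNSPEC);

    /* ... enqueue further copies or kernels that consume dev_buf here ... */

    /* stream-ordered release, then block until everything has drained */
    if (OPAL_SUCCESS == rc) {
        rc = opal_accelerator.mem_release_stream(dev_id, dev_buf, stream);
    }
    if (OPAL_SUCCESS == rc) {
        rc = opal_accelerator.wait_stream(stream);
    }

    OBJ_RELEASE(stream);
    return rc;
}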
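
A second sketch, also illustrative rather than part of the patch, showing how the new device-count and memory-bandwidth queries might be consumed, e.g. when costing device offload. The helper name is made up, and the "roughly GB/s" reading of the bandwidth value is an assumption based on the clock-rate times bus-width formula used in the CUDA and ROCm components:

/*
 * Illustrative sketch only: report the per-device memory-bandwidth estimate
 * exposed through the num_devices / get_mem_bw module entries.  The value is
 * derived from hardware attributes, so treat it as a rough figure suitable
 * for cost models, not a measurement.
 */
#include "opal_config.h"
#include "opal/mca/accelerator/accelerator.h"
#include "opal/mca/accelerator/base/base.h"
#include "opal/util/output.h"

static void report_accelerator_devices(void)
{
    int num_devices = 0;

    if (OPAL_SUCCESS != opal_accelerator.num_devices(&num_devices)) {
        return;
    }
    for (int i = 0; i < num_devices; ++i) {
        float bw = 0.0f;
        if (OPAL_SUCCESS == opal_accelerator.get_mem_bw(i, &bw)) {
            opal_output(0, "accelerator device %d: ~%.1f GB/s peak memory bandwidth",
                        i, (double) bw);
        }
    }
}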