From 26ac9df86d7d853c9e77187d10bfe6e90e5cb68d Mon Sep 17 00:00:00 2001 From: Burlen Loring Date: Thu, 24 Aug 2023 09:14:05 -0700 Subject: [PATCH] hip/cuda allocators short circuit copy construct use hip/cuda memory management API when copy constructing POD arrays of the same type, rather than allocate-move-copy construct. --- hamr_cuda_malloc_allocator_impl.h | 100 +++++++++++++---------- hamr_cuda_malloc_async_allocator_impl.h | 102 +++++++++++++---------- hamr_hip_malloc_allocator_impl.h | 104 ++++++++++++++---------- 3 files changed, 180 insertions(+), 126 deletions(-) diff --git a/hamr_cuda_malloc_allocator_impl.h b/hamr_cuda_malloc_allocator_impl.h index 0a078fd..e37e59a 100644 --- a/hamr_cuda_malloc_allocator_impl.h +++ b/hamr_cuda_malloc_allocator_impl.h @@ -521,61 +521,79 @@ cuda_malloc_allocator::value>:: return nullptr; } - // move the existing array to the GPU - U *tmp = nullptr; - if (!cudaVals) + if (std::is_same::value) { - size_t n_bytes_vals = n_elem*sizeof(U); - - if ((ierr = cudaMalloc(&tmp, n_bytes_vals)) != cudaSuccess) + // if the source and dest are the same type, and both are POD, we can + // short circuit the copy constructor and directly copy the data + cudaMemcpyKind dir = cudaVals ? cudaMemcpyDeviceToDevice : cudaMemcpyHostToDevice; + if ((ierr = cudaMemcpy(ptr, vals, n_bytes, dir)) != cudaSuccess) { std::cerr << "[" << __FILE__ << ":" << __LINE__ << "] ERROR:" - " Failed to cudaMalloc " << n_elem << " of " - << typeid(T).name() << " total " << n_bytes_vals << "bytes. " + " Failed to cudaMemcpy a " << (cudaVals ? "device" : "host") + << " array of " << n_elem << " of " << typeid(T).name() + << " total " << n_bytes << "bytes. " << cudaGetErrorString(ierr) << std::endl; return nullptr; } + } + else + { + // move the existing array to the GPU + U *tmp = nullptr; + if (!cudaVals) + { + size_t n_bytes_vals = n_elem*sizeof(U); + + if ((ierr = cudaMalloc(&tmp, n_bytes_vals)) != cudaSuccess) + { + std::cerr << "[" << __FILE__ << ":" << __LINE__ << "] ERROR:" + " Failed to cudaMalloc " << n_elem << " of " + << typeid(T).name() << " total " << n_bytes_vals << "bytes. " + << cudaGetErrorString(ierr) << std::endl; + return nullptr; + } + + if ((ierr = cudaMemcpy(tmp, vals, n_bytes_vals, cudaMemcpyHostToDevice)) != cudaSuccess) + { + std::cerr << "[" << __FILE__ << ":" << __LINE__ << "] ERROR:" + " Failed to cudaMemcpy array of " << n_elem + << " of " << typeid(T).name() << " total " << n_bytes_vals << "bytes. " + << cudaGetErrorString(ierr) << std::endl; + return nullptr; + } + + vals = tmp; + } - if ((ierr = cudaMemcpy(tmp, vals, n_bytes_vals, cudaMemcpyHostToDevice)) != cudaSuccess) + // get launch parameters + int device_id = -1; + dim3 block_grid; + int n_blocks = 0; + dim3 thread_grid = 0; + if (hamr::partition_thread_blocks(device_id, n_elem, 8, block_grid, + n_blocks, thread_grid)) { std::cerr << "[" << __FILE__ << ":" << __LINE__ << "] ERROR:" - " Failed to cudaMemcpy array of " << n_elem - << " of " << typeid(T).name() << " total " << n_bytes_vals << "bytes. " + " Failed to determine launch properties. " << cudaGetErrorString(ierr) << std::endl; return nullptr; } - vals = tmp; - } - - // get launch parameters - int device_id = -1; - dim3 block_grid; - int n_blocks = 0; - dim3 thread_grid = 0; - if (hamr::partition_thread_blocks(device_id, n_elem, 8, block_grid, - n_blocks, thread_grid)) - { - std::cerr << "[" << __FILE__ << ":" << __LINE__ << "] ERROR:" - " Failed to determine launch properties. " - << cudaGetErrorString(ierr) << std::endl; - return nullptr; - } - - // construct - cuda_kernels::fill<<>>(ptr, n_elem, vals); - if ((ierr = cudaGetLastError()) != cudaSuccess) - { - std::cerr << "[" << __FILE__ << ":" << __LINE__ << "] ERROR:" - " Failed to launch the construct kernel. " - << cudaGetErrorString(ierr) << std::endl; - return nullptr; - } + // construct + cuda_kernels::fill<<>>(ptr, n_elem, vals); + if ((ierr = cudaGetLastError()) != cudaSuccess) + { + std::cerr << "[" << __FILE__ << ":" << __LINE__ << "] ERROR:" + " Failed to launch the construct kernel. " + << cudaGetErrorString(ierr) << std::endl; + return nullptr; + } - // free up temporary buffers - if (!cudaVals) - { - cudaFree(tmp); + // free up temporary buffers + if (!cudaVals) + { + cudaFree(tmp); + } } #if defined(HAMR_VERBOSE) diff --git a/hamr_cuda_malloc_async_allocator_impl.h b/hamr_cuda_malloc_async_allocator_impl.h index 2adc9e8..ddad3ab 100644 --- a/hamr_cuda_malloc_async_allocator_impl.h +++ b/hamr_cuda_malloc_async_allocator_impl.h @@ -501,62 +501,80 @@ cuda_malloc_async_allocator::va return nullptr; } - // move the existing array to the GPU - U *tmp = nullptr; - if (!cudaVals) + if (std::is_same::value) { - size_t n_bytes_vals = n_elem*sizeof(U); - - if ((ierr = cudaMallocAsync(&tmp, n_bytes_vals, str)) != cudaSuccess) + // if the source and dest are the same type, and both are POD, we can + // short circuit the copy constructor and directly copy the data + cudaMemcpyKind dir = cudaVals ? cudaMemcpyDeviceToDevice : cudaMemcpyHostToDevice; + if ((ierr = cudaMemcpyAsync(ptr, vals, n_bytes, dir, str)) != cudaSuccess) { std::cerr << "[" << __FILE__ << ":" << __LINE__ << "] ERROR:" - " Failed to cudaMalloc " << n_elem << " of " - << typeid(T).name() << " total " << n_bytes_vals << "bytes. " + " Failed to cudaMemcpy a " << (cudaVals ? "device" : "host") + << " array of " << n_elem << " of " << typeid(T).name() + << " total " << n_bytes << "bytes. " << cudaGetErrorString(ierr) << std::endl; return nullptr; } + } + else + { + // move the existing array to the GPU + U *tmp = nullptr; + if (!cudaVals) + { + size_t n_bytes_vals = n_elem*sizeof(U); + + if ((ierr = cudaMallocAsync(&tmp, n_bytes_vals, str)) != cudaSuccess) + { + std::cerr << "[" << __FILE__ << ":" << __LINE__ << "] ERROR:" + " Failed to cudaMalloc " << n_elem << " of " + << typeid(T).name() << " total " << n_bytes_vals << "bytes. " + << cudaGetErrorString(ierr) << std::endl; + return nullptr; + } + + if ((ierr = cudaMemcpyAsync(tmp, vals, n_bytes_vals, + cudaMemcpyHostToDevice, str)) != cudaSuccess) + { + std::cerr << "[" << __FILE__ << ":" << __LINE__ << "] ERROR:" + " Failed to cudaMemcpy array of " << n_elem + << " of " << typeid(T).name() << " total " << n_bytes_vals << "bytes. " + << cudaGetErrorString(ierr) << std::endl; + return nullptr; + } + + vals = tmp; + } - if ((ierr = cudaMemcpyAsync(tmp, vals, n_bytes_vals, - cudaMemcpyHostToDevice, str)) != cudaSuccess) + // get launch parameters + int device_id = -1; + dim3 block_grid; + int n_blocks = 0; + dim3 thread_grid = 0; + if (hamr::partition_thread_blocks(device_id, n_elem, 8, block_grid, + n_blocks, thread_grid)) { std::cerr << "[" << __FILE__ << ":" << __LINE__ << "] ERROR:" - " Failed to cudaMemcpy array of " << n_elem - << " of " << typeid(T).name() << " total " << n_bytes_vals << "bytes. " + " Failed to determine launch properties. " << cudaGetErrorString(ierr) << std::endl; return nullptr; } - vals = tmp; - } - - // get launch parameters - int device_id = -1; - dim3 block_grid; - int n_blocks = 0; - dim3 thread_grid = 0; - if (hamr::partition_thread_blocks(device_id, n_elem, 8, block_grid, - n_blocks, thread_grid)) - { - std::cerr << "[" << __FILE__ << ":" << __LINE__ << "] ERROR:" - " Failed to determine launch properties. " - << cudaGetErrorString(ierr) << std::endl; - return nullptr; - } - - // construct - cuda_kernels::fill<<>>(ptr, n_elem, vals); - if ((ierr = cudaGetLastError()) != cudaSuccess) - { - std::cerr << "[" << __FILE__ << ":" << __LINE__ << "] ERROR:" - " Failed to launch the construct kernel. " - << cudaGetErrorString(ierr) << std::endl; - return nullptr; - } + // construct + cuda_kernels::fill<<>>(ptr, n_elem, vals); + if ((ierr = cudaGetLastError()) != cudaSuccess) + { + std::cerr << "[" << __FILE__ << ":" << __LINE__ << "] ERROR:" + " Failed to launch the construct kernel. " + << cudaGetErrorString(ierr) << std::endl; + return nullptr; + } - // free up temporary buffers - if (!cudaVals) - { - cudaFreeAsync(tmp, str); + // free up temporary buffers + if (!cudaVals) + { + cudaFreeAsync(tmp, str); + } } #if defined(HAMR_VERBOSE) diff --git a/hamr_hip_malloc_allocator_impl.h b/hamr_hip_malloc_allocator_impl.h index 92a61d2..a78dbc9 100644 --- a/hamr_hip_malloc_allocator_impl.h +++ b/hamr_hip_malloc_allocator_impl.h @@ -494,63 +494,81 @@ hip_malloc_allocator::value>::t return nullptr; } - // move the existing array to the GPU - U *tmp = nullptr; - if (!hipVals) + if (std::is_same::value) { - size_t n_bytes_vals = n_elem*sizeof(U); - - if ((ierr = hipMalloc(&tmp, n_bytes_vals)) != hipSuccess) + // if the source and dest are the same type, and both are POD, we can + // short circuit the copy constructor and directly copy the data + hipMemcpyKind dir = hipVals ? hipMemcpyDeviceToDevice : hipMemcpyHostToDevice; + if ((ierr = hipMemcpy(ptr, vals, n_bytes, dir)) != hipSuccess) { std::cerr << "[" << __FILE__ << ":" << __LINE__ << "] ERROR:" - " Failed to hipMalloc " << n_elem << " of " - << typeid(T).name() << " total " << n_bytes_vals << "bytes. " + " Failed to hipMemcpy a " (hipVals ? "device" : "host") + << " array of " << n_elem << " of " << typeid(T).name() + << " total " << n_bytes << "bytes. " << hipGetErrorString(ierr) << std::endl; return nullptr; } + } + else + { + // move the existing array to the GPU + U *tmp = nullptr; + if (!hipVals) + { + size_t n_bytes_vals = n_elem*sizeof(U); + + if ((ierr = hipMalloc(&tmp, n_bytes_vals)) != hipSuccess) + { + std::cerr << "[" << __FILE__ << ":" << __LINE__ << "] ERROR:" + " Failed to hipMalloc " << n_elem << " of " + << typeid(T).name() << " total " << n_bytes_vals << "bytes. " + << hipGetErrorString(ierr) << std::endl; + return nullptr; + } + + if ((ierr = hipMemcpy(tmp, vals, n_bytes_vals, + hipMemcpyHostToDevice)) != hipSuccess) + { + std::cerr << "[" << __FILE__ << ":" << __LINE__ << "] ERROR:" + " Failed to hipMemcpy array of " << n_elem + << " of " << typeid(T).name() << " total " << n_bytes_vals << "bytes. " + << hipGetErrorString(ierr) << std::endl; + return nullptr; + } + + vals = tmp; + } - if ((ierr = hipMemcpy(tmp, vals, n_bytes_vals, - hipMemcpyHostToDevice)) != hipSuccess) + // get launch parameters + int device_id = -1; + dim3 block_grid; + int n_blocks = 0; + dim3 thread_grid = 0; + if (hamr::partition_thread_blocks(device_id, n_elem, 8, block_grid, + n_blocks, thread_grid)) { std::cerr << "[" << __FILE__ << ":" << __LINE__ << "] ERROR:" - " Failed to hipMemcpy array of " << n_elem - << " of " << typeid(T).name() << " total " << n_bytes_vals << "bytes. " + " Failed to determine launch properties. " << hipGetErrorString(ierr) << std::endl; return nullptr; } - vals = tmp; - } - - // get launch parameters - int device_id = -1; - dim3 block_grid; - int n_blocks = 0; - dim3 thread_grid = 0; - if (hamr::partition_thread_blocks(device_id, n_elem, 8, block_grid, - n_blocks, thread_grid)) - { - std::cerr << "[" << __FILE__ << ":" << __LINE__ << "] ERROR:" - " Failed to determine launch properties. " - << hipGetErrorString(ierr) << std::endl; - return nullptr; - } - - // construct - hip_kernels::fill<<>>(ptr, n_elem, vals); - if ((ierr = hipGetLastError()) != hipSuccess) - { - std::cerr << "[" << __FILE__ << ":" << __LINE__ << "] ERROR:" - " Failed to launch the construct kernel. " - << hipGetErrorString(ierr) << std::endl; - return nullptr; - } + // construct + hip_kernels::fill<<>>(ptr, n_elem, vals); + if ((ierr = hipGetLastError()) != hipSuccess) + { + std::cerr << "[" << __FILE__ << ":" << __LINE__ << "] ERROR:" + " Failed to launch the construct kernel. " + << hipGetErrorString(ierr) << std::endl; + return nullptr; + } - // free up temporary buffers - if (!hipVals) - { - ierr = hipFree(tmp); - (void) ierr; + // free up temporary buffers + if (!hipVals) + { + ierr = hipFree(tmp); + (void) ierr; + } } #if defined(HAMR_VERBOSE)