From 26ac9df86d7d853c9e77187d10bfe6e90e5cb68d Mon Sep 17 00:00:00 2001
From: Burlen Loring <bloring@lbl.gov>
Date: Thu, 24 Aug 2023 09:14:05 -0700
Subject: [PATCH] hip/cuda allocators short circuit copy construct

use hip/cuda memory management API when copy constructing POD arrays of
the same type, rather than allocate-move-copy construct.
---
 hamr_cuda_malloc_allocator_impl.h       | 100 +++++++++++++----------
 hamr_cuda_malloc_async_allocator_impl.h | 102 +++++++++++++----------
 hamr_hip_malloc_allocator_impl.h        | 104 ++++++++++++++----------
 3 files changed, 180 insertions(+), 126 deletions(-)
diff --git a/hamr_cuda_malloc_allocator_impl.h b/hamr_cuda_malloc_allocator_impl.h
index 0a078fd..e37e59a 100644
--- a/hamr_cuda_malloc_allocator_impl.h
+++ b/hamr_cuda_malloc_allocator_impl.h
@@ -521,61 +521,79 @@ cuda_malloc_allocator<T, typename std::enable_if<std::is_arithmetic<T>::value>::
         return nullptr;
     }
 
-    // move the existing array to the GPU
-    U *tmp = nullptr;
-    if (!cudaVals)
+    if (std::is_same<T,U>::value)
     {
-        size_t n_bytes_vals = n_elem*sizeof(U);
-
-        if ((ierr = cudaMalloc(&tmp, n_bytes_vals)) != cudaSuccess)
+        // if the source and dest are the same type, and both are POD, we can
+        // short circuit the copy constructor and directly copy the data
+        cudaMemcpyKind dir = cudaVals ? cudaMemcpyDeviceToDevice : cudaMemcpyHostToDevice;
+        if ((ierr = cudaMemcpy(ptr, vals, n_bytes, dir)) != cudaSuccess)
         {
             std::cerr << "[" << __FILE__ << ":" << __LINE__ << "] ERROR:"
-                " Failed to cudaMalloc " << n_elem << " of "
-                << typeid(T).name() << " total " << n_bytes_vals  << "bytes. "
+                " Failed to cudaMemcpy a " << (cudaVals ? "device" : "host")
+                << " array of " << n_elem << " of " << typeid(T).name()
+                << " total " << n_bytes  << "bytes. "
                 << cudaGetErrorString(ierr) << std::endl;
             return nullptr;
         }
+    }
+    else
+    {
+        // move the existing array to the GPU
+        U *tmp = nullptr;
+        if (!cudaVals)
+        {
+            size_t n_bytes_vals = n_elem*sizeof(U);
+
+            if ((ierr = cudaMalloc(&tmp, n_bytes_vals)) != cudaSuccess)
+            {
+                std::cerr << "[" << __FILE__ << ":" << __LINE__ << "] ERROR:"
+                    " Failed to cudaMalloc " << n_elem << " of "
+                    << typeid(T).name() << " total " << n_bytes_vals  << "bytes. "
+                    << cudaGetErrorString(ierr) << std::endl;
+                return nullptr;
+            }
+
+            if ((ierr = cudaMemcpy(tmp, vals, n_bytes_vals, cudaMemcpyHostToDevice)) != cudaSuccess)
+            {
+                std::cerr << "[" << __FILE__ << ":" << __LINE__ << "] ERROR:"
+                    " Failed to cudaMemcpy array of " << n_elem
+                    << " of " << typeid(T).name() << " total " << n_bytes_vals  << "bytes. "
+                    << cudaGetErrorString(ierr) << std::endl;
+                return nullptr;
+            }
+
+            vals = tmp;
+        }
 
-        if ((ierr = cudaMemcpy(tmp, vals, n_bytes_vals, cudaMemcpyHostToDevice)) != cudaSuccess)
+        // get launch parameters
+        int device_id = -1;
+        dim3 block_grid;
+        int n_blocks = 0;
+        dim3 thread_grid = 0;
+        if (hamr::partition_thread_blocks(device_id, n_elem, 8, block_grid,
+            n_blocks, thread_grid))
         {
             std::cerr << "[" << __FILE__ << ":" << __LINE__ << "] ERROR:"
-                " Failed to cudaMemcpy array of " << n_elem
-                << " of " << typeid(T).name() << " total " << n_bytes_vals  << "bytes. "
+                " Failed to determine launch properties. "
                 << cudaGetErrorString(ierr) << std::endl;
             return nullptr;
         }
 
-        vals = tmp;
-    }
-
-    // get launch parameters
-    int device_id = -1;
-    dim3 block_grid;
-    int n_blocks = 0;
-    dim3 thread_grid = 0;
-    if (hamr::partition_thread_blocks(device_id, n_elem, 8, block_grid,
-        n_blocks, thread_grid))
-    {
-        std::cerr << "[" << __FILE__ << ":" << __LINE__ << "] ERROR:"
-            " Failed to determine launch properties. "
-            << cudaGetErrorString(ierr) << std::endl;
-        return nullptr;
-    }
-
-    // construct
-    cuda_kernels::fill<T><<<block_grid, thread_grid>>>(ptr, n_elem, vals);
-    if ((ierr = cudaGetLastError()) != cudaSuccess)
-    {
-        std::cerr << "[" << __FILE__ << ":" << __LINE__ << "] ERROR:"
-            " Failed to launch the construct kernel. "
-            << cudaGetErrorString(ierr) << std::endl;
-        return nullptr;
-    }
+        // construct
+        cuda_kernels::fill<T><<<block_grid, thread_grid>>>(ptr, n_elem, vals);
+        if ((ierr = cudaGetLastError()) != cudaSuccess)
+        {
+            std::cerr << "[" << __FILE__ << ":" << __LINE__ << "] ERROR:"
+                " Failed to launch the construct kernel. "
+                << cudaGetErrorString(ierr) << std::endl;
+            return nullptr;
+        }
 
-    // free up temporary buffers
-    if (!cudaVals)
-    {
-        cudaFree(tmp);
+        // free up temporary buffers
+        if (!cudaVals)
+        {
+            cudaFree(tmp);
+        }
     }
 
 #if defined(HAMR_VERBOSE)
diff --git a/hamr_cuda_malloc_async_allocator_impl.h b/hamr_cuda_malloc_async_allocator_impl.h
index 2adc9e8..ddad3ab 100644
--- a/hamr_cuda_malloc_async_allocator_impl.h
+++ b/hamr_cuda_malloc_async_allocator_impl.h
@@ -501,62 +501,80 @@ cuda_malloc_async_allocator<T, typename std::enable_if<std::is_arithmetic<T>::va
         return nullptr;
     }
 
-    // move the existing array to the GPU
-    U *tmp = nullptr;
-    if (!cudaVals)
+    if (std::is_same<T,U>::value)
     {
-        size_t n_bytes_vals = n_elem*sizeof(U);
-
-        if ((ierr = cudaMallocAsync(&tmp, n_bytes_vals, str)) != cudaSuccess)
+        // if the source and dest are the same type, and both are POD, we can
+        // short circuit the copy constructor and directly copy the data
+        cudaMemcpyKind dir = cudaVals ? cudaMemcpyDeviceToDevice : cudaMemcpyHostToDevice;
+        if ((ierr = cudaMemcpyAsync(ptr, vals, n_bytes, dir, str)) != cudaSuccess)
         {
             std::cerr << "[" << __FILE__ << ":" << __LINE__ << "] ERROR:"
-                " Failed to cudaMalloc " << n_elem << " of "
-                << typeid(T).name() << " total " << n_bytes_vals  << "bytes. "
+                " Failed to cudaMemcpy a " << (cudaVals ? "device" : "host")
+                << " array of " << n_elem << " of " << typeid(T).name()
+                << " total " << n_bytes  << "bytes. "
                 << cudaGetErrorString(ierr) << std::endl;
             return nullptr;
         }
+    }
+    else
+    {
+        // move the existing array to the GPU
+        U *tmp = nullptr;
+        if (!cudaVals)
+        {
+            size_t n_bytes_vals = n_elem*sizeof(U);
+
+            if ((ierr = cudaMallocAsync(&tmp, n_bytes_vals, str)) != cudaSuccess)
+            {
+                std::cerr << "[" << __FILE__ << ":" << __LINE__ << "] ERROR:"
+                    " Failed to cudaMalloc " << n_elem << " of "
+                    << typeid(T).name() << " total " << n_bytes_vals  << "bytes. "
+                    << cudaGetErrorString(ierr) << std::endl;
+                return nullptr;
+            }
+
+            if ((ierr = cudaMemcpyAsync(tmp, vals, n_bytes_vals,
+                cudaMemcpyHostToDevice, str)) != cudaSuccess)
+            {
+                std::cerr << "[" << __FILE__ << ":" << __LINE__ << "] ERROR:"
+                    " Failed to cudaMemcpy array of " << n_elem
+                    << " of " << typeid(T).name() << " total " << n_bytes_vals  << "bytes. "
+                    << cudaGetErrorString(ierr) << std::endl;
+                return nullptr;
+            }
+
+            vals = tmp;
+        }
 
-        if ((ierr = cudaMemcpyAsync(tmp, vals, n_bytes_vals,
-            cudaMemcpyHostToDevice, str)) != cudaSuccess)
+        // get launch parameters
+        int device_id = -1;
+        dim3 block_grid;
+        int n_blocks = 0;
+        dim3 thread_grid = 0;
+        if (hamr::partition_thread_blocks(device_id, n_elem, 8, block_grid,
+            n_blocks, thread_grid))
         {
             std::cerr << "[" << __FILE__ << ":" << __LINE__ << "] ERROR:"
-                " Failed to cudaMemcpy array of " << n_elem
-                << " of " << typeid(T).name() << " total " << n_bytes_vals  << "bytes. "
+                " Failed to determine launch properties. "
                 << cudaGetErrorString(ierr) << std::endl;
             return nullptr;
         }
 
-        vals = tmp;
-    }
-
-    // get launch parameters
-    int device_id = -1;
-    dim3 block_grid;
-    int n_blocks = 0;
-    dim3 thread_grid = 0;
-    if (hamr::partition_thread_blocks(device_id, n_elem, 8, block_grid,
-        n_blocks, thread_grid))
-    {
-        std::cerr << "[" << __FILE__ << ":" << __LINE__ << "] ERROR:"
-            " Failed to determine launch properties. "
-            << cudaGetErrorString(ierr) << std::endl;
-        return nullptr;
-    }
-
-    // construct
-    cuda_kernels::fill<T><<<block_grid, thread_grid, 0, str>>>(ptr, n_elem, vals);
-    if ((ierr = cudaGetLastError()) != cudaSuccess)
-    {
-        std::cerr << "[" << __FILE__ << ":" << __LINE__ << "] ERROR:"
-            " Failed to launch the construct kernel. "
-            << cudaGetErrorString(ierr) << std::endl;
-        return nullptr;
-    }
+        // construct
+        cuda_kernels::fill<T><<<block_grid, thread_grid, 0, str>>>(ptr, n_elem, vals);
+        if ((ierr = cudaGetLastError()) != cudaSuccess)
+        {
+            std::cerr << "[" << __FILE__ << ":" << __LINE__ << "] ERROR:"
+                " Failed to launch the construct kernel. "
+                << cudaGetErrorString(ierr) << std::endl;
+            return nullptr;
+        }
 
-    // free up temporary buffers
-    if (!cudaVals)
-    {
-        cudaFreeAsync(tmp, str);
+        // free up temporary buffers
+        if (!cudaVals)
+        {
+            cudaFreeAsync(tmp, str);
+        }
     }
 
 #if defined(HAMR_VERBOSE)
diff --git a/hamr_hip_malloc_allocator_impl.h b/hamr_hip_malloc_allocator_impl.h
index 92a61d2..a78dbc9 100644
--- a/hamr_hip_malloc_allocator_impl.h
+++ b/hamr_hip_malloc_allocator_impl.h
@@ -494,63 +494,81 @@ hip_malloc_allocator<T, typename std::enable_if<std::is_arithmetic<T>::value>::t
         return nullptr;
     }
 
-    // move the existing array to the GPU
-    U *tmp = nullptr;
-    if (!hipVals)
+    if (std::is_same<T,U>::value)
     {
-        size_t n_bytes_vals = n_elem*sizeof(U);
-
-        if ((ierr = hipMalloc(&tmp, n_bytes_vals)) != hipSuccess)
+        // if the source and dest are the same type, and both are POD, we can
+        // short circuit the copy constructor and directly copy the data
+        hipMemcpyKind dir = hipVals ? hipMemcpyDeviceToDevice : hipMemcpyHostToDevice;
+        if ((ierr = hipMemcpy(ptr, vals, n_bytes, dir)) != hipSuccess)
         {
             std::cerr << "[" << __FILE__ << ":" << __LINE__ << "] ERROR:"
-                " Failed to hipMalloc " << n_elem << " of "
-                << typeid(T).name() << " total " << n_bytes_vals  << "bytes. "
+                " Failed to hipMemcpy a " (hipVals ? "device" : "host")
+                << " array of " << n_elem << " of " << typeid(T).name()
+                << " total " << n_bytes  << "bytes. "
                 << hipGetErrorString(ierr) << std::endl;
             return nullptr;
         }
+    }
+    else
+    {
+        // move the existing array to the GPU
+        U *tmp = nullptr;
+        if (!hipVals)
+        {
+            size_t n_bytes_vals = n_elem*sizeof(U);
+
+            if ((ierr = hipMalloc(&tmp, n_bytes_vals)) != hipSuccess)
+            {
+                std::cerr << "[" << __FILE__ << ":" << __LINE__ << "] ERROR:"
+                    " Failed to hipMalloc " << n_elem << " of "
+                    << typeid(T).name() << " total " << n_bytes_vals  << "bytes. "
+                    << hipGetErrorString(ierr) << std::endl;
+                return nullptr;
+            }
+
+            if ((ierr = hipMemcpy(tmp, vals, n_bytes_vals,
+                hipMemcpyHostToDevice)) != hipSuccess)
+            {
+                std::cerr << "[" << __FILE__ << ":" << __LINE__ << "] ERROR:"
+                    " Failed to hipMemcpy array of " << n_elem
+                    << " of " << typeid(T).name() << " total " << n_bytes_vals  << "bytes. "
+                    << hipGetErrorString(ierr) << std::endl;
+                return nullptr;
+            }
+
+            vals = tmp;
+        }
 
-        if ((ierr = hipMemcpy(tmp, vals, n_bytes_vals,
-            hipMemcpyHostToDevice)) != hipSuccess)
+        // get launch parameters
+        int device_id = -1;
+        dim3 block_grid;
+        int n_blocks = 0;
+        dim3 thread_grid = 0;
+        if (hamr::partition_thread_blocks(device_id, n_elem, 8, block_grid,
+            n_blocks, thread_grid))
         {
             std::cerr << "[" << __FILE__ << ":" << __LINE__ << "] ERROR:"
-                " Failed to hipMemcpy array of " << n_elem
-                << " of " << typeid(T).name() << " total " << n_bytes_vals  << "bytes. "
+                " Failed to determine launch properties. "
                 << hipGetErrorString(ierr) << std::endl;
             return nullptr;
         }
 
-        vals = tmp;
-    }
-
-    // get launch parameters
-    int device_id = -1;
-    dim3 block_grid;
-    int n_blocks = 0;
-    dim3 thread_grid = 0;
-    if (hamr::partition_thread_blocks(device_id, n_elem, 8, block_grid,
-        n_blocks, thread_grid))
-    {
-        std::cerr << "[" << __FILE__ << ":" << __LINE__ << "] ERROR:"
-            " Failed to determine launch properties. "
-            << hipGetErrorString(ierr) << std::endl;
-        return nullptr;
-    }
-
-    // construct
-    hip_kernels::fill<T><<<block_grid, thread_grid>>>(ptr, n_elem, vals);
-    if ((ierr = hipGetLastError()) != hipSuccess)
-    {
-        std::cerr << "[" << __FILE__ << ":" << __LINE__ << "] ERROR:"
-            " Failed to launch the construct kernel. "
-            << hipGetErrorString(ierr) << std::endl;
-        return nullptr;
-    }
+        // construct
+        hip_kernels::fill<T><<<block_grid, thread_grid>>>(ptr, n_elem, vals);
+        if ((ierr = hipGetLastError()) != hipSuccess)
+        {
+            std::cerr << "[" << __FILE__ << ":" << __LINE__ << "] ERROR:"
+                " Failed to launch the construct kernel. "
+                << hipGetErrorString(ierr) << std::endl;
+            return nullptr;
+        }
 
-    // free up temporary buffers
-    if (!hipVals)
-    {
-        ierr = hipFree(tmp);
-        (void) ierr;
+        // free up temporary buffers
+        if (!hipVals)
+        {
+            ierr = hipFree(tmp);
+            (void) ierr;
+        }
     }
 
 #if defined(HAMR_VERBOSE)