From bfdca5b6b666b4f08f2f7d8039af11a15cc3f831 Mon Sep 17 00:00:00 2001
From: Simon Rit <simon.rit@creatis.insa-lyon.fr>
Date: Sun, 12 May 2024 10:42:14 +0200
Subject: [PATCH] ENH: Avoid single component intermediate CUDA array for
 scalar images

---
 src/rtkCudaUtilities.cu | 30 ++++++++++++++++++++----------
 1 file changed, 20 insertions(+), 10 deletions(-)

diff --git a/src/rtkCudaUtilities.cu b/src/rtkCudaUtilities.cu
index bab5d4bff..1bb84a8b8 100644
--- a/src/rtkCudaUtilities.cu
+++ b/src/rtkCudaUtilities.cu
@@ -168,8 +168,11 @@ prepareVectorTextureObject(int                                size[3],
   // Allocate an intermediate memory space to extract the components of the input volume
   float * singleComponent;
   size_t  numel = size[0] * size[1] * size[2];
-  cudaMalloc(&singleComponent, numel * sizeof(float));
-  CUDA_CHECK_ERROR;
+  if (nComponents > 1)
+  {
+    cudaMalloc(&singleComponent, numel * sizeof(float));
+    CUDA_CHECK_ERROR;
+  }
   float one = 1.0;
 
   // Copy image data to arrays. The tricky part is the make_cudaPitchedPtr.
@@ -177,16 +180,19 @@ prepareVectorTextureObject(int                                size[3],
   // https://stackoverflow.com/questions/16119943/how-and-when-should-i-use-pitched-pointer-with-the-cuda-api
   for (unsigned int component = 0; component < nComponents; component++)
   {
-    // Reset the intermediate memory
-    cudaMemset((void *)singleComponent, 0, numel * sizeof(float));
+    if (nComponents > 1)
+    {
+      // Reset the intermediate memory
+      cudaMemset((void *)singleComponent, 0, numel * sizeof(float));
 
-    // Fill it with the current component
-    const float * pComponent = dev_ptr + component;
+      // Fill it with the current component
+      const float * pComponent = dev_ptr + component;
 #if CUDA_VERSION < 12000
-    cublasSaxpy(handle, (int)numel, &one, pComponent, nComponents, singleComponent, 1);
+      cublasSaxpy(handle, (int)numel, &one, pComponent, nComponents, singleComponent, 1);
 #else
-    cublasSaxpy_64(handle, numel, &one, pComponent, nComponents, singleComponent, 1);
+      cublasSaxpy_64(handle, numel, &one, pComponent, nComponents, singleComponent, 1);
 #endif
+    }
 
     // Allocate the cudaArray. Projections use layered arrays, volumes use default 3D arrays
     if (isProjections)
@@ -197,7 +203,10 @@ prepareVectorTextureObject(int                                size[3],
 
     // Fill it with the current singleComponent
     cudaMemcpy3DParms CopyParams = cudaMemcpy3DParms();
-    CopyParams.srcPtr = make_cudaPitchedPtr(singleComponent, size[0] * sizeof(float), size[0], size[1]);
+    if (nComponents > 1)
+      CopyParams.srcPtr = make_cudaPitchedPtr(singleComponent, size[0] * sizeof(float), size[0], size[1]);
+    else
+      CopyParams.srcPtr = make_cudaPitchedPtr((void *)dev_ptr, size[0] * sizeof(float), size[0], size[1]);
     CUDA_CHECK_ERROR;
     CopyParams.dstArray = componentArrays[component];
     CopyParams.extent = volExtent;
@@ -212,7 +221,8 @@ prepareVectorTextureObject(int                                size[3],
   }
 
   // Intermediate memory is no longer needed
-  cudaFree(singleComponent);
+  if (nComponents > 1)
+    cudaFree(singleComponent);
 
   // Destroy CUBLAS context
   cublasDestroy(handle);