RTKConsortium · SimonRit · Apr 21, 2024 · Apr 17, 2024 · Apr 17, 2024 · Apr 18, 2024
diff --git a/include/rtkCudaDisplacedDetectorImageFilter.hcu b/include/rtkCudaDisplacedDetectorImageFilter.hcu
@@ -28,7 +28,7 @@ CUDA_displaced_weight(int     proj_idx_in[3],
                       int     proj_dim_out_buf[2],
                       float * dev_proj_in,
                       float * dev_proj_out,
-                      float * geometries,
+                      float * geometry,
                       float   theta,
                       bool    isPositiveCase,
                       float   proj_orig,

diff --git a/include/rtkCudaFDKWeightProjectionFilter.hcu b/include/rtkCudaFDKWeightProjectionFilter.hcu
@@ -26,7 +26,7 @@ CUDA_weight_projection(int     proj_idx[2],
                        int     proj_dim_buf_out[2],
                        float * dev_proj_in,
                        float * dev_proj_out,
-                       float * geometries,
+                       float * geometry,
                        float   proj_orig[2],
                        float   proj_row[2],
                        float   proj_col[2]);

diff --git a/include/rtkCudaParkerShortScanImageFilter.hcu b/include/rtkCudaParkerShortScanImageFilter.hcu
@@ -26,7 +26,7 @@ CUDA_parker_weight(int     proj_idx[2],
                    int     proj_dim_buf_out[2],
                    float * dev_proj_in,
                    float * dev_proj_out,
-                   float * geometries,
+                   float * geometry,
                    float   delta,
                    float   firstAngle,
                    float   proj_orig,

diff --git a/include/rtkCudaUtilities.hcu b/include/rtkCudaUtilities.hcu
@@ -126,13 +126,31 @@ inline __host__ __device__ float
 {
   return u.x * v.x + u.y * v.y + u.z * v.z;
 }
+
+__host__ void
+prepareScalarTextureObject(int                          size[3],
+                           float *                      dev_ptr,
+                           cudaArray *&                 threeDArray,
+                           cudaTextureObject_t &        tex,
+                           const bool                   isProjections,
+                           const bool                   isLinear = true,
+                           const cudaTextureAddressMode texAddressMode = cudaAddressModeBorder);
+
+__host__ void
+prepareVectorTextureObject(int                                size[3],
+                           const float *                      dev_ptr,
+                           std::vector<cudaArray *> &         componentArrays,
+                           const unsigned int                 nComponents,
+                           std::vector<cudaTextureObject_t> & tex,
+                           const bool                         isProjections,
+                           const cudaTextureAddressMode       texAddressMode = cudaAddressModeBorder);
+
 __host__ void
-prepareTextureObject(int                   size[3],
-                     float *               dev_ptr,
-                     cudaArray **&         componentArrays,
-                     unsigned int          nComponents,
-                     cudaTextureObject_t * tex,
-                     bool                  isProjections);
+prepareGeometryTextureObject(int                   size,
+                             const float *         geometry,
+                             float *&              dev_geom,
+                             cudaTextureObject_t & tex_geom,
+                             const unsigned int    nParam);
 
 inline __device__ void
 matrix_matrix_multiply(float * A, float * B, float * C, unsigned int rowsA, unsigned int colsB, unsigned int colsArowsB)

diff --git a/itk-module-init.cmake b/itk-module-init.cmake
@@ -44,8 +44,6 @@ if(RTK_USE_CUDA)
     find_package(CUDAToolkit REQUIRED 8.0)
   endif()
 
-  set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -Wno-deprecated-gpu-targets")
-  set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -Wno-deprecated-declarations")
   set(RTK_CUDA_PROJECTIONS_SLAB_SIZE "16" CACHE STRING "Number of projections processed simultaneously in CUDA forward and back projections")
 elseif(RTK_CUDA_VERSION)
   message(FATAL_ERROR "RTK_CUDA_VERSION is set but the CUDA toolkit has not been found.")

diff --git a/src/rtkCudaBackProjectionImageFilter.cu b/src/rtkCudaBackProjectionImageFilter.cu
@@ -51,7 +51,7 @@ __constant__ int3 c_volSize;
 //_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_( S T A R T )_
 //_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_
 
-template <unsigned int vectorLength, bool isCylindrical>
+template <unsigned int VVectorLength, bool VIsCylindrical>
 __global__ void
 kernel_backProject(float * dev_vol_in, float * dev_vol_out, float radius, cudaTextureObject_t * dev_tex_proj)
 {
@@ -68,13 +68,13 @@ kernel_backProject(float * dev_vol_in, float * dev_vol_out, float radius, cudaTe
   itk::SizeValueType vol_idx = i + (j + k * c_volSize.y) * (c_volSize.x);
 
   float3 ip, pp;
-  float  voxel_data[vectorLength];
-  for (unsigned int c = 0; c < vectorLength; c++)
+  float  voxel_data[VVectorLength];
+  for (unsigned int c = 0; c < VVectorLength; c++)
     voxel_data[c] = 0.0f;
 
   for (unsigned int proj = 0; proj < c_projSize.z; proj++)
   {
-    if (isCylindrical)
+    if (VIsCylindrical)
     {
       // matrix multiply
       pp = matrix_multiply(make_float3(i, j, k), &(c_volIndexToProjPP[12 * proj]));
@@ -105,13 +105,13 @@ kernel_backProject(float * dev_vol_in, float * dev_vol_out, float radius, cudaTe
     }
 
     // Get texture point, clip left to GPU, and accumulate in voxel_data
-    for (unsigned int c = 0; c < vectorLength; c++)
+    for (unsigned int c = 0; c < VVectorLength; c++)
       voxel_data[c] += tex2DLayered<float>(dev_tex_proj[c], ip.x, ip.y, proj);
   }
 
   // Place it into the volume
-  for (unsigned int c = 0; c < vectorLength; c++)
-    dev_vol_out[vol_idx * vectorLength + c] = dev_vol_in[vol_idx * vectorLength + c] + voxel_data[c];
+  for (unsigned int c = 0; c < VVectorLength; c++)
+    dev_vol_out[vol_idx * VVectorLength + c] = dev_vol_in[vol_idx * VVectorLength + c] + voxel_data[c];
 }
 
 
@@ -134,14 +134,6 @@ CUDA_back_project(int          projSize[3],
                   double       radiusCylindricalDetector,
                   unsigned int vectorLength)
 {
-  // 2D layered texture requires CudaComputeCapability >= 2.0
-  int device;
-  cudaGetDevice(&device);
-  if (GetCudaComputeCapability(device).first <= 1)
-  {
-    itkGenericExceptionMacro(<< "RTK no longer supports GPUs with CudaComputeCapability < 2.0");
-  }
-
   // Copy the size of inputs into constant memory
   cudaMemcpyToSymbol(c_projSize, projSize, sizeof(int3));
   cudaMemcpyToSymbol(c_volSize, volSize, sizeof(int3));
@@ -169,18 +161,15 @@ CUDA_back_project(int          projSize[3],
   dim3 dimBlock = dim3(tBlock_x, tBlock_y, tBlock_z);
   CUDA_CHECK_ERROR;
 
-  cudaArray ** projComponentArrays = new cudaArray *[vectorLength];
-
-  // Create an array of textures
-  cudaTextureObject_t * tex_proj = new cudaTextureObject_t[vectorLength];
-
   // Prepare texture objects (needs an array of cudaTextureObjects on the host as "tex_proj" argument)
-  prepareTextureObject(projSize, dev_proj, projComponentArrays, vectorLength, tex_proj, true);
+  std::vector<cudaArray *>         projComponentArrays;
+  std::vector<cudaTextureObject_t> tex_proj;
+  prepareVectorTextureObject(projSize, dev_proj, projComponentArrays, vectorLength, tex_proj, true);
 
   // Copy them to a device pointer, since it will have to be de-referenced in the kernels
   cudaTextureObject_t * dev_tex_proj;
   cudaMalloc(&dev_tex_proj, vectorLength * sizeof(cudaTextureObject_t));
-  cudaMemcpy(dev_tex_proj, tex_proj, vectorLength * sizeof(cudaTextureObject_t), cudaMemcpyHostToDevice);
+  cudaMemcpy(dev_tex_proj, tex_proj.data(), vectorLength * sizeof(cudaTextureObject_t), cudaMemcpyHostToDevice);
 
   // Run the kernel. Since "vectorLength" is passed as a function argument, not as a template argument,
   // the compiler can't assume it's constant, and a dirty trick has to be used.
@@ -245,11 +234,11 @@ CUDA_back_project(int          projSize[3],
   // Cleanup
   for (unsigned int c = 0; c < vectorLength; c++)
   {
-    cudaFreeArray((cudaArray *)projComponentArrays[c]);
+    cudaFreeArray(projComponentArrays[c]);
+    CUDA_CHECK_ERROR;
     cudaDestroyTextureObject(tex_proj[c]);
+    CUDA_CHECK_ERROR;
   }
   cudaFree(dev_tex_proj);
-  delete[] tex_proj;
-  delete[] projComponentArrays;
   CUDA_CHECK_ERROR;
 }
diff --git a/src/rtkCudaDisplacedDetectorImageFilter.cu b/src/rtkCudaDisplacedDetectorImageFilter.cu
@@ -22,8 +22,6 @@
 #include <cuda_runtime.h>
 #include <math_constants.h>
 
-texture<float, 1, cudaReadModeElementType> tex_geometry; // geometry texture
-
 inline __device__ float
 TransformIndexToPhysicalPoint(int2 idx, float origin, float row, float column)
 {
@@ -45,19 +43,20 @@ ToUntiltedCoordinateAtIsocenter(float tiltedCoord, float sdd, float sid, float s
 }
 
 __global__ void
-kernel_displaced_weight(int3    proj_idx_in,
-                        int3    proj_size_in,
-                        int2    proj_size_in_buf,
-                        int3    proj_idx_out,
-                        int3    proj_size_out,
-                        int2    proj_size_out_buf,
-                        float * dev_proj_in,
-                        float * dev_proj_out,
-                        float   theta,
-                        bool    isPositiveCase,
-                        float   proj_orig, // projection origin
-                        float   proj_row,  // projection row direction & spacing
-                        float   proj_col   // projection col direction & spacing
+kernel_displaced_weight(int3                proj_idx_in,
+                        int3                proj_size_in,
+                        int2                proj_size_in_buf,
+                        int3                proj_idx_out,
+                        int3                proj_size_out,
+                        int2                proj_size_out_buf,
+                        float *             dev_proj_in,
+                        float *             dev_proj_out,
+                        float               theta,
+                        bool                isPositiveCase,
+                        float               proj_orig, // projection origin
+                        float               proj_row,  // projection row direction & spacing
+                        float               proj_col,  // projection col direction & spacing
+                        cudaTextureObject_t tex_geom   // geometry texture object
 )
 {
   // compute thread index
@@ -90,10 +89,10 @@ kernel_displaced_weight(int3    proj_idx_in,
   {
     float pPoint = TransformIndexToPhysicalPoint(make_int2(pIdx.x, pIdx.y), proj_orig, proj_row, proj_col);
 
-    float sdd = tex1Dfetch(tex_geometry, tIdx.z * 4 + 0);
-    float sx = tex1Dfetch(tex_geometry, tIdx.z * 4 + 1);
-    float px = tex1Dfetch(tex_geometry, tIdx.z * 4 + 2);
-    float sid = tex1Dfetch(tex_geometry, tIdx.z * 4 + 3);
+    float sdd = tex1Dfetch<float>(tex_geom, tIdx.z * 4 + 0);
+    float sx = tex1Dfetch<float>(tex_geom, tIdx.z * 4 + 1);
+    float px = tex1Dfetch<float>(tex_geom, tIdx.z * 4 + 2);
+    float sid = tex1Dfetch<float>(tex_geom, tIdx.z * 4 + 3);
 
     float hyp = sqrtf(sid * sid + sx * sx); // to untilted situation
     float l = ToUntiltedCoordinateAtIsocenter(pPoint, sdd, sid, sx, px, hyp);
@@ -139,21 +138,17 @@ CUDA_displaced_weight(int     proj_idx_in[3],      // overlapping input region i
                       int     proj_dim_out_buf[2], // output size of buffered region
                       float * dev_proj_in,
                       float * dev_proj_out,
-                      float * geometries,
+                      float * geometry,
                       float   theta,
                       bool    isPositiveCase,
                       float   proj_orig,
                       float   proj_row,
                       float   proj_col)
 {
   // copy geometry matrix to device, bind the matrix to the texture
-  float * dev_geom;
-  cudaMalloc((void **)&dev_geom, proj_dim_out[2] * 4 * sizeof(float));
-  CUDA_CHECK_ERROR;
-  cudaMemcpy(dev_geom, geometries, proj_dim_out[2] * 4 * sizeof(float), cudaMemcpyHostToDevice);
-  CUDA_CHECK_ERROR;
-  cudaBindTexture(0, tex_geometry, dev_geom, proj_dim_out[2] * 4 * sizeof(float));
-  CUDA_CHECK_ERROR;
+  float *             dev_geom;
+  cudaTextureObject_t tex_geom;
+  prepareGeometryTextureObject(proj_dim_out[2], geometry, dev_geom, tex_geom, 4);
 
   // Thread Block Dimensions
   int tBlock_x = 16;
@@ -179,10 +174,11 @@ CUDA_displaced_weight(int     proj_idx_in[3],      // overlapping input region i
                                                  isPositiveCase,
                                                  proj_orig,
                                                  proj_row,
-                                                 proj_col);
+                                                 proj_col,
+                                                 tex_geom);
 
   // Unbind matrix texture
-  cudaUnbindTexture(tex_geometry);
+  cudaDestroyTextureObject(tex_geom);
   CUDA_CHECK_ERROR;
   cudaFree(dev_geom);
   CUDA_CHECK_ERROR;