ENH: Replace texture references by objects for CUDA 12 compatibility

RTKConsortium · Apr 21, 2024 · 5361f69 · 5361f69
1 parent f109868
commit 5361f69
Show file tree

Hide file tree

Showing 15 changed files with 374 additions and 741 deletions.
diff --git a/include/rtkCudaDisplacedDetectorImageFilter.hcu b/include/rtkCudaDisplacedDetectorImageFilter.hcu
@@ -28,7 +28,7 @@ CUDA_displaced_weight(int     proj_idx_in[3],
                       int     proj_dim_out_buf[2],
                       float * dev_proj_in,
                       float * dev_proj_out,
-                      float * geometries,
+                      float * geometry,
                       float   theta,
                       bool    isPositiveCase,
                       float   proj_orig,

diff --git a/include/rtkCudaFDKWeightProjectionFilter.hcu b/include/rtkCudaFDKWeightProjectionFilter.hcu
@@ -26,7 +26,7 @@ CUDA_weight_projection(int     proj_idx[2],
                        int     proj_dim_buf_out[2],
                        float * dev_proj_in,
                        float * dev_proj_out,
-                       float * geometries,
+                       float * geometry,
                        float   proj_orig[2],
                        float   proj_row[2],
                        float   proj_col[2]);

diff --git a/include/rtkCudaParkerShortScanImageFilter.hcu b/include/rtkCudaParkerShortScanImageFilter.hcu
@@ -26,7 +26,7 @@ CUDA_parker_weight(int     proj_idx[2],
                    int     proj_dim_buf_out[2],
                    float * dev_proj_in,
                    float * dev_proj_out,
-                   float * geometries,
+                   float * geometry,
                    float   delta,
                    float   firstAngle,
                    float   proj_orig,

diff --git a/include/rtkCudaUtilities.hcu b/include/rtkCudaUtilities.hcu
@@ -127,14 +127,30 @@ inline __host__ __device__ float
   return u.x * v.x + u.y * v.y + u.z * v.z;
 }
 
+__host__ void
+prepareScalarTextureObject(int                          size[3],
+                           float *                      dev_ptr,
+                           cudaArray *&                 threeDArray,
+                           cudaTextureObject_t &        tex,
+                           const bool                   isProjections,
+                           const bool                   isLinear = true,
+                           const cudaTextureAddressMode texAddressMode = cudaAddressModeBorder);
+
 __host__ void
 prepareVectorTextureObject(int                                size[3],
                            const float *                      dev_ptr,
                            std::vector<cudaArray *> &         componentArrays,
                            const unsigned int                 nComponents,
                            std::vector<cudaTextureObject_t> & tex,
-                           bool                               isProjections);
+                           const bool                         isProjections,
+                           const cudaTextureAddressMode       texAddressMode = cudaAddressModeBorder);
 
+__host__ void
+prepareGeometryTextureObject(int                   size,
+                             const float *         geometry,
+                             float *&              dev_geom,
+                             cudaTextureObject_t & tex_geom,
+                             const unsigned int    nParam);
 
 inline __device__ void
 matrix_matrix_multiply(float * A, float * B, float * C, unsigned int rowsA, unsigned int colsB, unsigned int colsArowsB)

diff --git a/src/rtkCudaDisplacedDetectorImageFilter.cu b/src/rtkCudaDisplacedDetectorImageFilter.cu
@@ -22,8 +22,6 @@
 #include <cuda_runtime.h>
 #include <math_constants.h>
 
-texture<float, 1, cudaReadModeElementType> tex_geometry; // geometry texture
-
 inline __device__ float
 TransformIndexToPhysicalPoint(int2 idx, float origin, float row, float column)
 {
@@ -45,19 +43,20 @@ ToUntiltedCoordinateAtIsocenter(float tiltedCoord, float sdd, float sid, float s
 }
 
 __global__ void
-kernel_displaced_weight(int3    proj_idx_in,
-                        int3    proj_size_in,
-                        int2    proj_size_in_buf,
-                        int3    proj_idx_out,
-                        int3    proj_size_out,
-                        int2    proj_size_out_buf,
-                        float * dev_proj_in,
-                        float * dev_proj_out,
-                        float   theta,
-                        bool    isPositiveCase,
-                        float   proj_orig, // projection origin
-                        float   proj_row,  // projection row direction & spacing
-                        float   proj_col   // projection col direction & spacing
+kernel_displaced_weight(int3                proj_idx_in,
+                        int3                proj_size_in,
+                        int2                proj_size_in_buf,
+                        int3                proj_idx_out,
+                        int3                proj_size_out,
+                        int2                proj_size_out_buf,
+                        float *             dev_proj_in,
+                        float *             dev_proj_out,
+                        float               theta,
+                        bool                isPositiveCase,
+                        float               proj_orig, // projection origin
+                        float               proj_row,  // projection row direction & spacing
+                        float               proj_col,  // projection col direction & spacing
+                        cudaTextureObject_t tex_geom   // geometry texture object
 )
 {
   // compute thread index
@@ -90,10 +89,10 @@ kernel_displaced_weight(int3    proj_idx_in,
   {
     float pPoint = TransformIndexToPhysicalPoint(make_int2(pIdx.x, pIdx.y), proj_orig, proj_row, proj_col);
 
-    float sdd = tex1Dfetch(tex_geometry, tIdx.z * 4 + 0);
-    float sx = tex1Dfetch(tex_geometry, tIdx.z * 4 + 1);
-    float px = tex1Dfetch(tex_geometry, tIdx.z * 4 + 2);
-    float sid = tex1Dfetch(tex_geometry, tIdx.z * 4 + 3);
+    float sdd = tex1Dfetch<float>(tex_geom, tIdx.z * 4 + 0);
+    float sx = tex1Dfetch<float>(tex_geom, tIdx.z * 4 + 1);
+    float px = tex1Dfetch<float>(tex_geom, tIdx.z * 4 + 2);
+    float sid = tex1Dfetch<float>(tex_geom, tIdx.z * 4 + 3);
 
     float hyp = sqrtf(sid * sid + sx * sx); // to untilted situation
     float l = ToUntiltedCoordinateAtIsocenter(pPoint, sdd, sid, sx, px, hyp);
@@ -139,21 +138,17 @@ CUDA_displaced_weight(int     proj_idx_in[3],      // overlapping input region i
                       int     proj_dim_out_buf[2], // output size of buffered region
                       float * dev_proj_in,
                       float * dev_proj_out,
-                      float * geometries,
+                      float * geometry,
                       float   theta,
                       bool    isPositiveCase,
                       float   proj_orig,
                       float   proj_row,
                       float   proj_col)
 {
   // copy geometry matrix to device, bind the matrix to the texture
-  float * dev_geom;
-  cudaMalloc((void **)&dev_geom, proj_dim_out[2] * 4 * sizeof(float));
-  CUDA_CHECK_ERROR;
-  cudaMemcpy(dev_geom, geometries, proj_dim_out[2] * 4 * sizeof(float), cudaMemcpyHostToDevice);
-  CUDA_CHECK_ERROR;
-  cudaBindTexture(0, tex_geometry, dev_geom, proj_dim_out[2] * 4 * sizeof(float));
-  CUDA_CHECK_ERROR;
+  float *             dev_geom;
+  cudaTextureObject_t tex_geom;
+  prepareGeometryTextureObject(proj_dim_out[2], geometry, dev_geom, tex_geom, 4);
 
   // Thread Block Dimensions
   int tBlock_x = 16;
@@ -179,10 +174,11 @@ CUDA_displaced_weight(int     proj_idx_in[3],      // overlapping input region i
                                                  isPositiveCase,
                                                  proj_orig,
                                                  proj_row,
-                                                 proj_col);
+                                                 proj_col,
+                                                 tex_geom);
 
   // Unbind matrix texture
-  cudaUnbindTexture(tex_geometry);
+  cudaDestroyTextureObject(tex_geom);
   CUDA_CHECK_ERROR;
   cudaFree(dev_geom);
   CUDA_CHECK_ERROR;

diff --git a/src/rtkCudaFDKBackProjectionImageFilter.cu b/src/rtkCudaFDKBackProjectionImageFilter.cu
@@ -39,9 +39,6 @@
  *****************/
 #include <cuda.h>
 
-// T E X T U R E S ////////////////////////////////////////////////////////
-texture<float, cudaTextureType2DLayered> tex_proj;
-
 // Constant memory
 __constant__ float c_matrices[SLAB_SIZE * 12]; // Can process stacks of at most SLAB_SIZE projections
 __constant__ int3 c_projSize;
@@ -53,7 +50,7 @@ __constant__ int3 c_vol_size;
 //_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_
 
 __global__ void
-kernel_fdk_3Dgrid(float * dev_vol_in, float * dev_vol_out)
+kernel_fdk_3Dgrid(float * dev_vol_in, float * dev_vol_out, cudaTextureObject_t tex_proj)
 {
   itk::SizeValueType i = __umul24(blockIdx.x, blockDim.x) + threadIdx.x;
   itk::SizeValueType j = __umul24(blockIdx.y, blockDim.y) + threadIdx.y;
@@ -81,7 +78,7 @@ kernel_fdk_3Dgrid(float * dev_vol_in, float * dev_vol_out)
     ip.y = ip.y * ip.z;
 
     // Get texture point, clip left to GPU, and accumulate in voxel_data
-    voxel_data += tex2DLayered(tex_proj, ip.x, ip.y, proj) * ip.z * ip.z;
+    voxel_data += tex2DLayered<float>(tex_proj, ip.x, ip.y, proj) * ip.z * ip.z;
   }
 
   // Place it into the volume
@@ -110,33 +107,6 @@ CUDA_reconstruct_conebeam(int     proj_size[3],
   // Copy the projection matrices into constant memory
   cudaMemcpyToSymbol(c_matrices, &(matrices[0]), 12 * sizeof(float) * proj_size[2]);
 
-  // set texture parameters
-  tex_proj.addressMode[0] = cudaAddressModeBorder;
-  tex_proj.addressMode[1] = cudaAddressModeBorder;
-  tex_proj.addressMode[2] = cudaAddressModeBorder;
-  tex_proj.filterMode = cudaFilterModeLinear;
-  tex_proj.normalized = false; // don't access with normalized texture coords
-
-  // Copy projection data to array, bind the array to the texture
-  cudaExtent                   projExtent = make_cudaExtent(proj_size[0], proj_size[1], proj_size[2]);
-  cudaArray *                  array_proj;
-  static cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc<float>();
-  CUDA_CHECK_ERROR;
-
-  // Allocate array for input projections, in order to bind them to
-  // a 2D layered texture
-  cudaMalloc3DArray((cudaArray **)&array_proj, &channelDesc, projExtent, cudaArrayLayered);
-  CUDA_CHECK_ERROR;
-
-  // Copy data to 3D array
-  cudaMemcpy3DParms copyParams = cudaMemcpy3DParms();
-  copyParams.srcPtr = make_cudaPitchedPtr(dev_proj, proj_size[0] * sizeof(float), proj_size[0], proj_size[1]);
-  copyParams.dstArray = (cudaArray *)array_proj;
-  copyParams.extent = projExtent;
-  copyParams.kind = cudaMemcpyDeviceToDevice;
-  cudaMemcpy3D(&copyParams);
-  CUDA_CHECK_ERROR;
-
   // Thread Block Dimensions
   constexpr int tBlock_x = 16;
   constexpr int tBlock_y = 4;
@@ -153,19 +123,15 @@ CUDA_reconstruct_conebeam(int     proj_size[3],
   // Compute block and grid sizes
   dim3 dimGrid = dim3(blocksInX, blocksInY, blocksInZ);
   dim3 dimBlock = dim3(tBlock_x, tBlock_y, tBlock_z);
-  CUDA_CHECK_ERROR;
-
-  // Bind the array of projections to a 2D layered texture
-  cudaBindTextureToArray(tex_proj, (cudaArray *)array_proj, channelDesc);
-  CUDA_CHECK_ERROR;
 
-  kernel_fdk_3Dgrid<<<dimGrid, dimBlock>>>(dev_vol_in, dev_vol_out);
-
-  // Unbind the image and projection matrix textures
-  cudaUnbindTexture(tex_proj);
-  CUDA_CHECK_ERROR;
+  cudaArray *         array_proj;
+  cudaTextureObject_t tex_proj;
+  prepareScalarTextureObject(proj_size, dev_proj, array_proj, tex_proj, true);
+  kernel_fdk_3Dgrid<<<dimGrid, dimBlock>>>(dev_vol_in, dev_vol_out, tex_proj);
 
   // Cleanup
   cudaFreeArray((cudaArray *)array_proj);
   CUDA_CHECK_ERROR;
+  cudaDestroyTextureObject(tex_proj);
+  CUDA_CHECK_ERROR;
 }
diff --git a/src/rtkCudaFDKWeightProjectionFilter.cu b/src/rtkCudaFDKWeightProjectionFilter.cu
@@ -21,24 +21,23 @@
 #include <cuda.h>
 #include <cuda_runtime.h>
 
-texture<float, 1, cudaReadModeElementType> tex_geometry; // geometry texture
-
 inline __device__ float2
                   TransformIndexToPhysicalPoint(int2 idx, float2 origin, float2 row, float2 column)
 {
   return make_float2(origin.x + row.x * idx.x + column.x * idx.y, origin.y + row.y * idx.x + column.y * idx.y);
 }
 
 __global__ void
-kernel_weight_projection(int2    proj_idx,
-                         int3    proj_size,
-                         int2    proj_size_buf_in,
-                         int2    proj_size_buf_out,
-                         float * dev_proj_in,
-                         float * dev_proj_out,
-                         float2  proj_orig, // projection origin
-                         float2  proj_row,  // projection row direction & spacing
-                         float2  proj_col   // projection col direction & spacing
+kernel_weight_projection(int2                proj_idx,
+                         int3                proj_size,
+                         int2                proj_size_buf_in,
+                         int2                proj_size_buf_out,
+                         float *             dev_proj_in,
+                         float *             dev_proj_out,
+                         float2              proj_orig, // projection origin
+                         float2              proj_row,  // projection row direction & spacing
+                         float2              proj_col,  // projection col direction & spacing
+                         cudaTextureObject_t tex_geom   // geometry texture object
 )
 {
   // compute projection index (== thread index)
@@ -53,19 +52,19 @@ kernel_weight_projection(int2    proj_idx,
   if (pIdx.x >= proj_size.x || pIdx.y >= proj_size.y || pIdx.z >= proj_size.z)
     return;
 
-  const float sdd = tex1Dfetch(tex_geometry, pIdx.z * 7 + 0);
-  const float sid = tex1Dfetch(tex_geometry, pIdx.z * 7 + 1);
-  const float wFac = tex1Dfetch(tex_geometry, pIdx.z * 7 + 5);
+  const float sdd = tex1Dfetch<float>(tex_geom, pIdx.z * 7);
+  const float sid = tex1Dfetch<float>(tex_geom, pIdx.z * 7 + 1);
+  const float wFac = tex1Dfetch<float>(tex_geom, pIdx.z * 7 + 5);
   if (sdd == 0) // parallel
   {
     dev_proj_out[pIdx_comp_out] = dev_proj_in[pIdx_comp_in] * wFac;
   }
   else // divergent
   {
-    const float pOffX = tex1Dfetch(tex_geometry, pIdx.z * 7 + 2);
-    const float pOffY = tex1Dfetch(tex_geometry, pIdx.z * 7 + 3);
-    const float sOffY = tex1Dfetch(tex_geometry, pIdx.z * 7 + 4);
-    const float tAngle = tex1Dfetch(tex_geometry, pIdx.z * 7 + 6);
+    const float pOffX = tex1Dfetch<float>(tex_geom, pIdx.z * 7 + 2);
+    const float pOffY = tex1Dfetch<float>(tex_geom, pIdx.z * 7 + 3);
+    const float sOffY = tex1Dfetch<float>(tex_geom, pIdx.z * 7 + 4);
+    const float tAngle = tex1Dfetch<float>(tex_geom, pIdx.z * 7 + 6);
     const float sina = sin(tAngle);
     const float cosa = cos(tAngle);
     const float tana = tan(tAngle);
@@ -90,19 +89,14 @@ CUDA_weight_projection(int     proj_idx[2],
                        int     proj_dim_buf_out[2],
                        float * dev_proj_in,
                        float * dev_proj_out,
-                       float * geometries,
+                       float * geometry,
                        float   proj_orig[2],
                        float   proj_row[2],
                        float   proj_col[2])
 {
-  // copy geometry matrix to device, bind the matrix to the texture
-  float * dev_geom;
-  cudaMalloc((void **)&dev_geom, proj_dim[2] * 7 * sizeof(float));
-  CUDA_CHECK_ERROR;
-  cudaMemcpy(dev_geom, geometries, proj_dim[2] * 7 * sizeof(float), cudaMemcpyHostToDevice);
-  CUDA_CHECK_ERROR;
-  cudaBindTexture(0, tex_geometry, dev_geom, proj_dim[2] * 7 * sizeof(float));
-  CUDA_CHECK_ERROR;
+  float *             dev_geom;
+  cudaTextureObject_t tex_geom;
+  prepareGeometryTextureObject(proj_dim[2], geometry, dev_geom, tex_geom, 7);
 
   // Thread Block Dimensions
   int tBlock_x = 16;
@@ -124,10 +118,11 @@ CUDA_weight_projection(int     proj_idx[2],
                                                   dev_proj_out,
                                                   make_float2(proj_orig[0], proj_orig[1]),
                                                   make_float2(proj_row[0], proj_row[1]),
-                                                  make_float2(proj_col[0], proj_col[1]));
+                                                  make_float2(proj_col[0], proj_col[1]),
+                                                  tex_geom);
 
-  // Unbind matrix texture
-  cudaUnbindTexture(tex_geometry);
+  // destroy texture object
+  cudaDestroyTextureObject(tex_geom);
   CUDA_CHECK_ERROR;
   cudaFree(dev_geom);
   CUDA_CHECK_ERROR;

diff --git a/src/rtkCudaForwardProjectionImageFilter.cu b/src/rtkCudaForwardProjectionImageFilter.cu
@@ -36,7 +36,6 @@
  * CUDA #includes *
  *****************/
 #include <cuda.h>
-#include <cublas_v2.h>
 #include <cuda_runtime.h>
 
 // CONSTANTS //
@@ -201,7 +200,7 @@ CUDA_forward_project(int          projSize[3],
   // Prepare texture objects
   std::vector<cudaArray *>         volComponentArrays;
   std::vector<cudaTextureObject_t> tex_vol;
-  prepareVectorTextureObject(volSize, dev_vol, volComponentArrays, vectorLength, tex_vol, false);
+  prepareVectorTextureObject(volSize, dev_vol, volComponentArrays, vectorLength, tex_vol, false, cudaAddressModeClamp);
 
   // Copy them to a device pointer, since it will have to be de-referenced in the kernels
   cudaTextureObject_t * dev_tex_vol;