ENH: Remove CUDA code for compute capability 1

RTK does not support compute capability 1 since 1e75fbc.
RTKConsortium · SimonRit · Apr 21, 2024 · Apr 17, 2024 · Apr 17, 2024 · Apr 18, 2024
commit 06b137bd705959cf430c7a612c9a43b83f4b3272
diff --git a/src/rtkCudaBackProjectionImageFilter.cu b/src/rtkCudaBackProjectionImageFilter.cu
@@ -51,7 +51,7 @@ __constant__ int3 c_volSize;
 //_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_( S T A R T )_
 //_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_
 
-template <unsigned int vectorLength, bool isCylindrical>
+template <unsigned int VVectorLength, bool VIsCylindrical>
 __global__ void
 kernel_backProject(float * dev_vol_in, float * dev_vol_out, float radius, cudaTextureObject_t * dev_tex_proj)
 {
@@ -68,13 +68,13 @@ kernel_backProject(float * dev_vol_in, float * dev_vol_out, float radius, cudaTe
   itk::SizeValueType vol_idx = i + (j + k * c_volSize.y) * (c_volSize.x);
 
   float3 ip, pp;
-  float  voxel_data[vectorLength];
-  for (unsigned int c = 0; c < vectorLength; c++)
+  float  voxel_data[VVectorLength];
+  for (unsigned int c = 0; c < VVectorLength; c++)
     voxel_data[c] = 0.0f;
 
   for (unsigned int proj = 0; proj < c_projSize.z; proj++)
   {
-    if (isCylindrical)
+    if (VIsCylindrical)
     {
       // matrix multiply
       pp = matrix_multiply(make_float3(i, j, k), &(c_volIndexToProjPP[12 * proj]));
@@ -105,13 +105,13 @@ kernel_backProject(float * dev_vol_in, float * dev_vol_out, float radius, cudaTe
     }
 
     // Get texture point, clip left to GPU, and accumulate in voxel_data
-    for (unsigned int c = 0; c < vectorLength; c++)
+    for (unsigned int c = 0; c < VVectorLength; c++)
       voxel_data[c] += tex2DLayered<float>(dev_tex_proj[c], ip.x, ip.y, proj);
   }
 
   // Place it into the volume
-  for (unsigned int c = 0; c < vectorLength; c++)
-    dev_vol_out[vol_idx * vectorLength + c] = dev_vol_in[vol_idx * vectorLength + c] + voxel_data[c];
+  for (unsigned int c = 0; c < VVectorLength; c++)
+    dev_vol_out[vol_idx * VVectorLength + c] = dev_vol_in[vol_idx * VVectorLength + c] + voxel_data[c];
 }
 
 
@@ -134,14 +134,6 @@ CUDA_back_project(int          projSize[3],
                   double       radiusCylindricalDetector,
                   unsigned int vectorLength)
 {
-  // 2D layered texture requires CudaComputeCapability >= 2.0
-  int device;
-  cudaGetDevice(&device);
-  if (GetCudaComputeCapability(device).first <= 1)
-  {
-    itkGenericExceptionMacro(<< "RTK no longer supports GPUs with CudaComputeCapability < 2.0");
-  }
-
   // Copy the size of inputs into constant memory
   cudaMemcpyToSymbol(c_projSize, projSize, sizeof(int3));
   cudaMemcpyToSymbol(c_volSize, volSize, sizeof(int3));

diff --git a/src/rtkCudaFDKBackProjectionImageFilter.cu b/src/rtkCudaFDKBackProjectionImageFilter.cu
@@ -40,8 +40,7 @@
 #include <cuda.h>
 
 // T E X T U R E S ////////////////////////////////////////////////////////
-texture<float, cudaTextureType2DLayered>   tex_proj;
-texture<float, 3, cudaReadModeElementType> tex_proj_3D;
+texture<float, cudaTextureType2DLayered> tex_proj;
 
 // Constant memory
 __constant__ float c_matrices[SLAB_SIZE * 12]; // Can process stacks of at most SLAB_SIZE projections
@@ -53,48 +52,6 @@ __constant__ int3 c_vol_size;
 //_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_( S T A R T )_
 //_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_
 
-__global__ void
-kernel_fdk(float * dev_vol_in, float * dev_vol_out, unsigned int Blocks_Y)
-{
-  // CUDA 2.0 does not allow for a 3D grid, which severely
-  // limits the manipulation of large 3D arrays of data.  The
-  // following code is a hack to bypass this implementation
-  // limitation.
-  unsigned int       blockIdx_z = blockIdx.y / Blocks_Y;
-  unsigned int       blockIdx_y = blockIdx.y - __umul24(blockIdx_z, Blocks_Y);
-  itk::SizeValueType i = __umul24(blockIdx.x, blockDim.x) + threadIdx.x;
-  itk::SizeValueType j = __umul24(blockIdx_y, blockDim.y) + threadIdx.y;
-  itk::SizeValueType k = __umul24(blockIdx_z, blockDim.z) + threadIdx.z;
-
-  if (i >= c_vol_size.x || j >= c_vol_size.y || k >= c_vol_size.z)
-  {
-    return;
-  }
-
-  // Index row major into the volume
-  itk::SizeValueType vol_idx = i + (j + k * c_vol_size.y) * (c_vol_size.x);
-
-  float3 ip;
-  float  voxel_data = 0;
-
-  for (unsigned int proj = 0; proj < c_projSize.z; proj++)
-  {
-    // matrix multiply
-    ip = matrix_multiply(make_float3(i, j, k), &(c_matrices[12 * proj]));
-
-    // Change coordinate systems
-    ip.z = 1 / ip.z;
-    ip.x = ip.x * ip.z;
-    ip.y = ip.y * ip.z;
-
-    // Get texture point, clip left to GPU
-    voxel_data += tex3D(tex_proj_3D, ip.x, ip.y, proj + 0.5) * ip.z * ip.z;
-  }
-
-  // Place it into the volume
-  dev_vol_out[vol_idx] = dev_vol_in[vol_idx] + voxel_data;
-}
-
 __global__ void
 kernel_fdk_3Dgrid(float * dev_vol_in, float * dev_vol_out)
 {
@@ -146,9 +103,6 @@ CUDA_reconstruct_conebeam(int     proj_size[3],
                           float * dev_vol_out,
                           float * dev_proj)
 {
-  int device;
-  cudaGetDevice(&device);
-
   // Copy the size of inputs into constant memory
   cudaMemcpyToSymbol(c_projSize, proj_size, sizeof(int3));
   cudaMemcpyToSymbol(c_vol_size, vol_size, sizeof(int3));
@@ -163,25 +117,15 @@ CUDA_reconstruct_conebeam(int     proj_size[3],
   tex_proj.filterMode = cudaFilterModeLinear;
   tex_proj.normalized = false; // don't access with normalized texture coords
 
-  tex_proj_3D.addressMode[0] = cudaAddressModeBorder;
-  tex_proj_3D.addressMode[1] = cudaAddressModeBorder;
-  tex_proj_3D.addressMode[2] = cudaAddressModeBorder;
-  tex_proj_3D.filterMode = cudaFilterModeLinear;
-  tex_proj_3D.normalized = false; // don't access with normalized texture coords
-
   // Copy projection data to array, bind the array to the texture
   cudaExtent                   projExtent = make_cudaExtent(proj_size[0], proj_size[1], proj_size[2]);
   cudaArray *                  array_proj;
   static cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc<float>();
   CUDA_CHECK_ERROR;
 
   // Allocate array for input projections, in order to bind them to
-  // either a 2D layered texture (requires GetCudaComputeCapability >= 2.0) or
-  // a 3D texture
-  if (GetCudaComputeCapability(device).first <= 1)
-    cudaMalloc3DArray((cudaArray **)&array_proj, &channelDesc, projExtent);
-  else
-    cudaMalloc3DArray((cudaArray **)&array_proj, &channelDesc, projExtent, cudaArrayLayered);
+  // a 2D layered texture
+  cudaMalloc3DArray((cudaArray **)&array_proj, &channelDesc, projExtent, cudaArrayLayered);
   CUDA_CHECK_ERROR;
 
   // Copy data to 3D array
@@ -205,39 +149,21 @@ CUDA_reconstruct_conebeam(int     proj_size[3],
 
   // Run kernels. Note: Projection data is passed via texture memory,
   // transform matrix is passed via constant memory
-  if (GetCudaComputeCapability(device).first <= 1)
-  {
-    // Compute block and grid sizes
-    dim3 dimGrid = dim3(blocksInX, blocksInY * blocksInZ);
-    dim3 dimBlock = dim3(tBlock_x, tBlock_y, tBlock_z);
-
-    // Bind the array of projections to a 3D texture
-    cudaBindTextureToArray(tex_proj_3D, (cudaArray *)array_proj, channelDesc);
-    CUDA_CHECK_ERROR;
 
-    kernel_fdk<<<dimGrid, dimBlock>>>(dev_vol_in, dev_vol_out, blocksInY);
-
-    // Unbind the image and projection matrix textures
-    cudaUnbindTexture(tex_proj_3D);
-    CUDA_CHECK_ERROR;
-  }
-  else
-  {
-    // Compute block and grid sizes
-    dim3 dimGrid = dim3(blocksInX, blocksInY, blocksInZ);
-    dim3 dimBlock = dim3(tBlock_x, tBlock_y, tBlock_z);
-    CUDA_CHECK_ERROR;
+  // Compute block and grid sizes
+  dim3 dimGrid = dim3(blocksInX, blocksInY, blocksInZ);
+  dim3 dimBlock = dim3(tBlock_x, tBlock_y, tBlock_z);
+  CUDA_CHECK_ERROR;
 
-    // Bind the array of projections to a 2D layered texture
-    cudaBindTextureToArray(tex_proj, (cudaArray *)array_proj, channelDesc);
-    CUDA_CHECK_ERROR;
+  // Bind the array of projections to a 2D layered texture
+  cudaBindTextureToArray(tex_proj, (cudaArray *)array_proj, channelDesc);
+  CUDA_CHECK_ERROR;
 
-    kernel_fdk_3Dgrid<<<dimGrid, dimBlock>>>(dev_vol_in, dev_vol_out);
+  kernel_fdk_3Dgrid<<<dimGrid, dimBlock>>>(dev_vol_in, dev_vol_out);
 
-    // Unbind the image and projection matrix textures
-    cudaUnbindTexture(tex_proj);
-    CUDA_CHECK_ERROR;
-  }
+  // Unbind the image and projection matrix textures
+  cudaUnbindTexture(tex_proj);
+  CUDA_CHECK_ERROR;
 
   // Cleanup
   cudaFreeArray((cudaArray *)array_proj);

diff --git a/src/rtkCudaWarpBackProjectionImageFilter.cu b/src/rtkCudaWarpBackProjectionImageFilter.cu
@@ -42,8 +42,7 @@
 #include <cuda_runtime.h>
 
 // T E X T U R E S ////////////////////////////////////////////////////////
-texture<float, cudaTextureType2DLayered>   tex_proj;
-texture<float, 3, cudaReadModeElementType> tex_proj_3D;
+texture<float, cudaTextureType2DLayered> tex_proj;
 
 texture<float, 3, cudaReadModeElementType> tex_xdvf;
 texture<float, 3, cudaReadModeElementType> tex_ydvf;
@@ -66,131 +65,6 @@ __constant__ float c_IndexInputToPPInputMatrix[12];
 //_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_( S T A R T )_
 //_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_
 
-__global__ void
-kernel_warp_back_project(float * dev_vol_in, float * dev_vol_out, unsigned int Blocks_Y)
-{
-  // CUDA 2.0 does not allow for a 3D grid, which severely
-  // limits the manipulation of large 3D arrays of data.  The
-  // following code is a hack to bypass this implementation
-  // limitation.
-  unsigned int blockIdx_z = blockIdx.y / Blocks_Y;
-  unsigned int blockIdx_y = blockIdx.y - __umul24(blockIdx_z, Blocks_Y);
-  unsigned int i = __umul24(blockIdx.x, blockDim.x) + threadIdx.x;
-  unsigned int j = __umul24(blockIdx_y, blockDim.y) + threadIdx.y;
-  unsigned int k = __umul24(blockIdx_z, blockDim.z) + threadIdx.z;
-
-  if (i >= c_volSize.x || j >= c_volSize.y || k >= c_volSize.z)
-  {
-    return;
-  }
-
-  // Index row major into the volume
-  long int vol_idx = i + (j + k * c_volSize.y) * (c_volSize.x);
-
-  float3 IndexInDVF, Displacement, PP, IndexInInput, ip;
-  float  voxel_data = 0;
-
-  for (unsigned int proj = 0; proj < c_projSize.z; proj++)
-  {
-    // Compute the index in the DVF
-    IndexInDVF = matrix_multiply(make_float3(i, j, k), c_IndexInputToIndexDVFMatrix);
-
-    // Get each component of the displacement vector by interpolation in the DVF
-    Displacement.x = tex3D(tex_xdvf, IndexInDVF.x + 0.5f, IndexInDVF.y + 0.5f, IndexInDVF.z + 0.5f);
-    Displacement.y = tex3D(tex_ydvf, IndexInDVF.x + 0.5f, IndexInDVF.y + 0.5f, IndexInDVF.z + 0.5f);
-    Displacement.z = tex3D(tex_zdvf, IndexInDVF.x + 0.5f, IndexInDVF.y + 0.5f, IndexInDVF.z + 0.5f);
-
-    // Compute the physical point in input + the displacement vector
-    PP = matrix_multiply(make_float3(i, j, k), c_IndexInputToPPInputMatrix) + Displacement;
-
-    // Convert it to a continuous index
-    IndexInInput = matrix_multiply(PP, c_PPInputToIndexInputMatrix);
-
-    // Project the voxel onto the detector to find out which value to add to it
-    ip = matrix_multiply(IndexInInput, &(c_matrices[12 * proj]));
-    ;
-
-    // Change coordinate systems
-    ip.z = 1 / ip.z;
-    ip.x = ip.x * ip.z;
-    ip.y = ip.y * ip.z;
-
-    // Get texture point, clip left to GPU
-    voxel_data += tex3D(tex_proj_3D, ip.x, ip.y, proj + 0.5);
-  }
-
-  // Place it into the volume
-  dev_vol_out[vol_idx] = dev_vol_in[vol_idx] + voxel_data;
-}
-
-__global__ void
-kernel_warp_back_project_cylindrical_detector(float *      dev_vol_in,
-                                              float *      dev_vol_out,
-                                              unsigned int Blocks_Y,
-                                              float        radius)
-{
-  // CUDA 2.0 does not allow for a 3D grid, which severely
-  // limits the manipulation of large 3D arrays of data.  The
-  // following code is a hack to bypass this implementation
-  // limitation.
-  unsigned int blockIdx_z = blockIdx.y / Blocks_Y;
-  unsigned int blockIdx_y = blockIdx.y - __umul24(blockIdx_z, Blocks_Y);
-  unsigned int i = __umul24(blockIdx.x, blockDim.x) + threadIdx.x;
-  unsigned int j = __umul24(blockIdx_y, blockDim.y) + threadIdx.y;
-  unsigned int k = __umul24(blockIdx_z, blockDim.z) + threadIdx.z;
-
-  if (i >= c_volSize.x || j >= c_volSize.y || k >= c_volSize.z)
-  {
-    return;
-  }
-
-  // Index row major into the volume
-  long int vol_idx = i + (j + k * c_volSize.y) * (c_volSize.x);
-
-  float3 IndexInDVF, Displacement, PP, IndexInInput, ip, pp;
-  float  voxel_data = 0;
-
-  for (unsigned int proj = 0; proj < c_projSize.z; proj++)
-  {
-    // Compute the index in the DVF
-    IndexInDVF = matrix_multiply(make_float3(i, j, k), c_IndexInputToIndexDVFMatrix);
-
-    // Get each component of the displacement vector by interpolation in the DVF
-    Displacement.x = tex3D(tex_xdvf, IndexInDVF.x + 0.5f, IndexInDVF.y + 0.5f, IndexInDVF.z + 0.5f);
-    Displacement.y = tex3D(tex_ydvf, IndexInDVF.x + 0.5f, IndexInDVF.y + 0.5f, IndexInDVF.z + 0.5f);
-    Displacement.z = tex3D(tex_zdvf, IndexInDVF.x + 0.5f, IndexInDVF.y + 0.5f, IndexInDVF.z + 0.5f);
-
-    // Compute the physical point in input + the displacement vector
-    PP = matrix_multiply(make_float3(i, j, k), c_IndexInputToPPInputMatrix) + Displacement;
-
-    // Convert it to a continuous index
-    IndexInInput = matrix_multiply(PP, c_PPInputToIndexInputMatrix);
-
-    // Project the voxel onto the detector to find out which value to add to it
-    pp = matrix_multiply(IndexInInput, &(c_volIndexToProjPP[12 * proj]));
-
-    // Change coordinate systems
-    pp.z = 1 / pp.z;
-    pp.x = pp.x * pp.z;
-    pp.y = pp.y * pp.z;
-
-    // Apply correction for cylindrical detector
-    const float u = pp.x;
-    pp.x = radius * atan2(u, radius);
-    pp.y = pp.y * radius / sqrt(radius * radius + u * u);
-
-    // Get projection index
-    ip.x = c_projPPToProjIndex[0] * pp.x + c_projPPToProjIndex[1] * pp.y + c_projPPToProjIndex[2];
-    ip.y = c_projPPToProjIndex[3] * pp.x + c_projPPToProjIndex[4] * pp.y + c_projPPToProjIndex[5];
-
-    // Get texture point, clip left to GPU
-    voxel_data += tex3D(tex_proj_3D, ip.x, ip.y, proj + 0.5);
-  }
-
-  // Place it into the volume
-  dev_vol_out[vol_idx] = dev_vol_in[vol_idx] + voxel_data;
-}
-
 __global__ void
 kernel_warp_back_project_3Dgrid(float * dev_vol_in, float * dev_vol_out)
 {
@@ -227,7 +101,6 @@ kernel_warp_back_project_3Dgrid(float * dev_vol_in, float * dev_vol_out)
 
     // Project the voxel onto the detector to find out which value to add to it
     ip = matrix_multiply(IndexInInput, &(c_matrices[12 * proj]));
-    ;
 
     // Change coordinate systems
     ip.z = 1 / ip.z;
@@ -346,25 +219,15 @@ CUDA_warp_back_project(int     projSize[3],
   tex_proj.filterMode = cudaFilterModeLinear;
   tex_proj.normalized = false; // don't access with normalized texture coords
 
-  tex_proj_3D.addressMode[0] = cudaAddressModeBorder;
-  tex_proj_3D.addressMode[1] = cudaAddressModeBorder;
-  tex_proj_3D.addressMode[2] = cudaAddressModeBorder;
-  tex_proj_3D.filterMode = cudaFilterModeLinear;
-  tex_proj_3D.normalized = false; // don't access with normalized texture coords
-
   // Copy projection data to array, bind the array to the texture
   cudaExtent                   projExtent = make_cudaExtent(projSize[0], projSize[1], projSize[2]);
   cudaArray *                  array_proj;
   static cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc<float>();
   CUDA_CHECK_ERROR;
 
   // Allocate array for input projections, in order to bind them to
-  // either a 2D layered texture (requires GetCudaComputeCapability >= 2.0) or
-  // a 3D texture
-  if (GetCudaComputeCapability(device).first <= 1)
-    cudaMalloc3DArray((cudaArray **)&array_proj, &channelDesc, projExtent);
-  else
-    cudaMalloc3DArray((cudaArray **)&array_proj, &channelDesc, projExtent, cudaArrayLayered);
+  // a 2D layered texture
+  cudaMalloc3DArray((cudaArray **)&array_proj, &channelDesc, projExtent, cudaArrayLayered);
   CUDA_CHECK_ERROR;
 
   // Copy data to 3D array
@@ -460,49 +323,27 @@ CUDA_warp_back_project(int     projSize[3],
 
   // Run kernels. Note: Projection data is passed via texture memory,
   // transform matrix is passed via constant memory
-  if (GetCudaComputeCapability(device).first <= 1)
-  {
-    // Compute block and grid sizes
-    dim3 dimGrid = dim3(blocksInX, blocksInY * blocksInZ);
-    dim3 dimBlock = dim3(tBlock_x, tBlock_y, tBlock_z);
 
-    // Bind the array of projections to a 3D texture
-    cudaBindTextureToArray(tex_proj_3D, (cudaArray *)array_proj, channelDesc);
-    CUDA_CHECK_ERROR;
+  // Compute block and grid sizes
+  dim3 dimGrid = dim3(blocksInX, blocksInY, blocksInZ);
+  dim3 dimBlock = dim3(tBlock_x, tBlock_y, tBlock_z);
+  CUDA_CHECK_ERROR;
 
-    if (radiusCylindricalDetector == 0)
-      kernel_warp_back_project<<<dimGrid, dimBlock>>>(dev_vol_in, dev_vol_out, blocksInY);
-    else
-      kernel_warp_back_project_cylindrical_detector<<<dimGrid, dimBlock>>>(
-        dev_vol_in, dev_vol_out, blocksInY, (float)radiusCylindricalDetector);
+  // Bind the array of projections to a 2D layered texture
+  cudaBindTextureToArray(tex_proj, (cudaArray *)array_proj, channelDesc);
+  CUDA_CHECK_ERROR;
 
-    // Unbind the image and projection matrix textures
-    cudaUnbindTexture(tex_proj_3D);
-    CUDA_CHECK_ERROR;
-  }
+  // Note: cbi->img is passed via texture memory
+  // Matrices are passed via constant memory
+  //-------------------------------------
+  if (radiusCylindricalDetector == 0)
+    kernel_warp_back_project_3Dgrid<<<dimGrid, dimBlock>>>(dev_vol_in, dev_vol_out);
   else
-  {
-    // Compute block and grid sizes
-    dim3 dimGrid = dim3(blocksInX, blocksInY, blocksInZ);
-    dim3 dimBlock = dim3(tBlock_x, tBlock_y, tBlock_z);
-    CUDA_CHECK_ERROR;
-
-    // Bind the array of projections to a 2D layered texture
-    cudaBindTextureToArray(tex_proj, (cudaArray *)array_proj, channelDesc);
-    CUDA_CHECK_ERROR;
-
-    // Note: cbi->img is passed via texture memory
-    // Matrices are passed via constant memory
-    //-------------------------------------
-    if (radiusCylindricalDetector == 0)
-      kernel_warp_back_project_3Dgrid<<<dimGrid, dimBlock>>>(dev_vol_in, dev_vol_out);
-    else
-      kernel_warp_back_project_3Dgrid_cylindrical_detector<<<dimGrid, dimBlock>>>(
-        dev_vol_in, dev_vol_out, (float)radiusCylindricalDetector);
-    // Unbind the image and projection matrix textures
-    cudaUnbindTexture(tex_proj);
-    CUDA_CHECK_ERROR;
-  }
+    kernel_warp_back_project_3Dgrid_cylindrical_detector<<<dimGrid, dimBlock>>>(
+      dev_vol_in, dev_vol_out, (float)radiusCylindricalDetector);
+  // Unbind the image and projection matrix textures
+  cudaUnbindTexture(tex_proj);
+  CUDA_CHECK_ERROR;
 
   // Unbind the image and projection matrix textures
   cudaUnbindTexture(tex_xdvf);

diff --git a/src/rtkCudaWarpImageFilter.cu b/src/rtkCudaWarpImageFilter.cu
@@ -63,51 +63,6 @@ __constant__ float c_PPInputToIndexInputMatrix[12];
 //_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_( S T A R T )_
 //_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_
 
-__global__ void
-kernel(float * dev_vol_out, int3 vol_dim, unsigned int Blocks_Y)
-{
-  // CUDA 2.0 does not allow for a 3D grid, which severely
-  // limits the manipulation of large 3D arrays of data.  The
-  // following code is a hack to bypass this implementation
-  // limitation.
-  unsigned int blockIdx_z = blockIdx.y / Blocks_Y;
-  unsigned int blockIdx_y = blockIdx.y - __umul24(blockIdx_z, Blocks_Y);
-  unsigned int i = __umul24(blockIdx.x, blockDim.x) + threadIdx.x;
-  unsigned int j = __umul24(blockIdx_y, blockDim.y) + threadIdx.y;
-  unsigned int k = __umul24(blockIdx_z, blockDim.z) + threadIdx.z;
-
-  if (i >= vol_dim.x || j >= vol_dim.y || k >= vol_dim.z)
-  {
-    return;
-  }
-
-  // Index row major into the volume
-  long int vol_idx = i + (j + k * vol_dim.y) * (vol_dim.x);
-
-  // Matrix multiply to get the index in the DVF texture of the current point in the output volume
-  float3 IndexInDVF = matrix_multiply(make_float3(i, j, k), c_IndexOutputToIndexDVFMatrix);
-
-  // Get each component of the displacement vector by
-  // interpolation in the dvf
-  float3 Displacement;
-  Displacement.x = tex3D(tex_xdvf, IndexInDVF.x + 0.5f, IndexInDVF.y + 0.5f, IndexInDVF.z + 0.5f);
-  Displacement.y = tex3D(tex_ydvf, IndexInDVF.x + 0.5f, IndexInDVF.y + 0.5f, IndexInDVF.z + 0.5f);
-  Displacement.z = tex3D(tex_zdvf, IndexInDVF.x + 0.5f, IndexInDVF.y + 0.5f, IndexInDVF.z + 0.5f);
-
-  // Matrix multiply to get the physical coordinates of the current point in the output volume
-  float3 PPinOutput = matrix_multiply(make_float3(i, j, k), c_IndexOutputToPPOutputMatrix);
-
-  // Get the index corresponding to the current physical point in output displaced by the displacement vector
-  float3 PPDisplaced;
-  PPDisplaced.x = PPinOutput.x + Displacement.x;
-  PPDisplaced.y = PPinOutput.y + Displacement.y;
-  PPDisplaced.z = PPinOutput.z + Displacement.z;
-  float3 IndexInInput = matrix_multiply(PPDisplaced, c_PPInputToIndexInputMatrix);
-
-  // Interpolate in the input and copy into the output
-  dev_vol_out[vol_idx] = tex3D(tex_input_vol, IndexInInput.x + 0.5f, IndexInInput.y + 0.5f, IndexInInput.z + 0.5f);
-}
-
 __global__ void
 kernel_3Dgrid(float * dev_vol_out, int3 vol_dim)
 {
@@ -165,7 +120,6 @@ CUDA_warp(int     input_vol_dim[3],
           float * dev_DVF,
           bool    isLinear)
 {
-
   // Prepare channel description for arrays
   static cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc<float>();
 
@@ -301,26 +255,13 @@ CUDA_warp(int     input_vol_dim[3],
   unsigned int blocksInY = (output_vol_dim[1] - 1) / tBlock_y + 1;
   unsigned int blocksInZ = (output_vol_dim[2] - 1) / tBlock_z + 1;
 
-  if (GetCudaComputeCapability(device).first <= 1)
-  {
-    dim3 dimGrid = dim3(blocksInX, blocksInY * blocksInZ);
-    dim3 dimBlock = dim3(tBlock_x, tBlock_y, tBlock_z);
+  dim3 dimGrid = dim3(blocksInX, blocksInY, blocksInZ);
+  dim3 dimBlock = dim3(tBlock_x, tBlock_y, tBlock_z);
 
-    // Note: the DVF and input image are passed via texture memory
-    //-------------------------------------
-    kernel<<<dimGrid, dimBlock>>>(
-      dev_output_vol, make_int3(output_vol_dim[0], output_vol_dim[1], output_vol_dim[2]), blocksInY);
-  }
-  else
-  {
-    dim3 dimGrid = dim3(blocksInX, blocksInY, blocksInZ);
-    dim3 dimBlock = dim3(tBlock_x, tBlock_y, tBlock_z);
-
-    // Note: the DVF and input image are passed via texture memory
-    //-------------------------------------
-    kernel_3Dgrid<<<dimGrid, dimBlock>>>(dev_output_vol,
-                                         make_int3(output_vol_dim[0], output_vol_dim[1], output_vol_dim[2]));
-  }
+  // Note: the DVF and input image are passed via texture memory
+  //-------------------------------------
+  kernel_3Dgrid<<<dimGrid, dimBlock>>>(dev_output_vol,
+                                       make_int3(output_vol_dim[0], output_vol_dim[1], output_vol_dim[2]));
 
   CUDA_CHECK_ERROR;