Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Cuda12 #589

Merged
merged 5 commits into from
Apr 21, 2024
Merged
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Prev Previous commit
Next Next commit
ENH: Remove CUDA code for compute capability 1
RTK does not support compute capability 1 since
1e75fbc.
Simon Rit committed Apr 19, 2024
commit 06b137bd705959cf430c7a612c9a43b83f4b3272
22 changes: 7 additions & 15 deletions src/rtkCudaBackProjectionImageFilter.cu
Original file line number Diff line number Diff line change
@@ -51,7 +51,7 @@ __constant__ int3 c_volSize;
//_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_( S T A R T )_
//_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_

template <unsigned int vectorLength, bool isCylindrical>
template <unsigned int VVectorLength, bool VIsCylindrical>
__global__ void
kernel_backProject(float * dev_vol_in, float * dev_vol_out, float radius, cudaTextureObject_t * dev_tex_proj)
{
@@ -68,13 +68,13 @@ kernel_backProject(float * dev_vol_in, float * dev_vol_out, float radius, cudaTe
itk::SizeValueType vol_idx = i + (j + k * c_volSize.y) * (c_volSize.x);

float3 ip, pp;
float voxel_data[vectorLength];
for (unsigned int c = 0; c < vectorLength; c++)
float voxel_data[VVectorLength];
for (unsigned int c = 0; c < VVectorLength; c++)
voxel_data[c] = 0.0f;

for (unsigned int proj = 0; proj < c_projSize.z; proj++)
{
if (isCylindrical)
if (VIsCylindrical)
{
// matrix multiply
pp = matrix_multiply(make_float3(i, j, k), &(c_volIndexToProjPP[12 * proj]));
@@ -105,13 +105,13 @@ kernel_backProject(float * dev_vol_in, float * dev_vol_out, float radius, cudaTe
}

// Get texture point, clip left to GPU, and accumulate in voxel_data
for (unsigned int c = 0; c < vectorLength; c++)
for (unsigned int c = 0; c < VVectorLength; c++)
voxel_data[c] += tex2DLayered<float>(dev_tex_proj[c], ip.x, ip.y, proj);
}

// Place it into the volume
for (unsigned int c = 0; c < vectorLength; c++)
dev_vol_out[vol_idx * vectorLength + c] = dev_vol_in[vol_idx * vectorLength + c] + voxel_data[c];
for (unsigned int c = 0; c < VVectorLength; c++)
dev_vol_out[vol_idx * VVectorLength + c] = dev_vol_in[vol_idx * VVectorLength + c] + voxel_data[c];
}


@@ -134,14 +134,6 @@ CUDA_back_project(int projSize[3],
double radiusCylindricalDetector,
unsigned int vectorLength)
{
// 2D layered texture requires CudaComputeCapability >= 2.0
int device;
cudaGetDevice(&device);
if (GetCudaComputeCapability(device).first <= 1)
{
itkGenericExceptionMacro(<< "RTK no longer supports GPUs with CudaComputeCapability < 2.0");
}

// Copy the size of inputs into constant memory
cudaMemcpyToSymbol(c_projSize, projSize, sizeof(int3));
cudaMemcpyToSymbol(c_volSize, volSize, sizeof(int3));
102 changes: 14 additions & 88 deletions src/rtkCudaFDKBackProjectionImageFilter.cu
Original file line number Diff line number Diff line change
@@ -40,8 +40,7 @@
#include <cuda.h>

// T E X T U R E S ////////////////////////////////////////////////////////
texture<float, cudaTextureType2DLayered> tex_proj;
texture<float, 3, cudaReadModeElementType> tex_proj_3D;
texture<float, cudaTextureType2DLayered> tex_proj;

// Constant memory
__constant__ float c_matrices[SLAB_SIZE * 12]; // Can process stacks of at most SLAB_SIZE projections
@@ -53,48 +52,6 @@ __constant__ int3 c_vol_size;
//_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_( S T A R T )_
//_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_

__global__ void
kernel_fdk(float * dev_vol_in, float * dev_vol_out, unsigned int Blocks_Y)
{
// CUDA 2.0 does not allow for a 3D grid, which severely
// limits the manipulation of large 3D arrays of data. The
// following code is a hack to bypass this implementation
// limitation.
unsigned int blockIdx_z = blockIdx.y / Blocks_Y;
unsigned int blockIdx_y = blockIdx.y - __umul24(blockIdx_z, Blocks_Y);
itk::SizeValueType i = __umul24(blockIdx.x, blockDim.x) + threadIdx.x;
itk::SizeValueType j = __umul24(blockIdx_y, blockDim.y) + threadIdx.y;
itk::SizeValueType k = __umul24(blockIdx_z, blockDim.z) + threadIdx.z;

if (i >= c_vol_size.x || j >= c_vol_size.y || k >= c_vol_size.z)
{
return;
}

// Index row major into the volume
itk::SizeValueType vol_idx = i + (j + k * c_vol_size.y) * (c_vol_size.x);

float3 ip;
float voxel_data = 0;

for (unsigned int proj = 0; proj < c_projSize.z; proj++)
{
// matrix multiply
ip = matrix_multiply(make_float3(i, j, k), &(c_matrices[12 * proj]));

// Change coordinate systems
ip.z = 1 / ip.z;
ip.x = ip.x * ip.z;
ip.y = ip.y * ip.z;

// Get texture point, clip left to GPU
voxel_data += tex3D(tex_proj_3D, ip.x, ip.y, proj + 0.5) * ip.z * ip.z;
}

// Place it into the volume
dev_vol_out[vol_idx] = dev_vol_in[vol_idx] + voxel_data;
}

__global__ void
kernel_fdk_3Dgrid(float * dev_vol_in, float * dev_vol_out)
{
@@ -146,9 +103,6 @@ CUDA_reconstruct_conebeam(int proj_size[3],
float * dev_vol_out,
float * dev_proj)
{
int device;
cudaGetDevice(&device);

// Copy the size of inputs into constant memory
cudaMemcpyToSymbol(c_projSize, proj_size, sizeof(int3));
cudaMemcpyToSymbol(c_vol_size, vol_size, sizeof(int3));
@@ -163,25 +117,15 @@ CUDA_reconstruct_conebeam(int proj_size[3],
tex_proj.filterMode = cudaFilterModeLinear;
tex_proj.normalized = false; // don't access with normalized texture coords

tex_proj_3D.addressMode[0] = cudaAddressModeBorder;
tex_proj_3D.addressMode[1] = cudaAddressModeBorder;
tex_proj_3D.addressMode[2] = cudaAddressModeBorder;
tex_proj_3D.filterMode = cudaFilterModeLinear;
tex_proj_3D.normalized = false; // don't access with normalized texture coords

// Copy projection data to array, bind the array to the texture
cudaExtent projExtent = make_cudaExtent(proj_size[0], proj_size[1], proj_size[2]);
cudaArray * array_proj;
static cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc<float>();
CUDA_CHECK_ERROR;

// Allocate array for input projections, in order to bind them to
// either a 2D layered texture (requires GetCudaComputeCapability >= 2.0) or
// a 3D texture
if (GetCudaComputeCapability(device).first <= 1)
cudaMalloc3DArray((cudaArray **)&array_proj, &channelDesc, projExtent);
else
cudaMalloc3DArray((cudaArray **)&array_proj, &channelDesc, projExtent, cudaArrayLayered);
// a 2D layered texture
cudaMalloc3DArray((cudaArray **)&array_proj, &channelDesc, projExtent, cudaArrayLayered);
CUDA_CHECK_ERROR;

// Copy data to 3D array
@@ -205,39 +149,21 @@ CUDA_reconstruct_conebeam(int proj_size[3],

// Run kernels. Note: Projection data is passed via texture memory,
// transform matrix is passed via constant memory
if (GetCudaComputeCapability(device).first <= 1)
{
// Compute block and grid sizes
dim3 dimGrid = dim3(blocksInX, blocksInY * blocksInZ);
dim3 dimBlock = dim3(tBlock_x, tBlock_y, tBlock_z);

// Bind the array of projections to a 3D texture
cudaBindTextureToArray(tex_proj_3D, (cudaArray *)array_proj, channelDesc);
CUDA_CHECK_ERROR;

kernel_fdk<<<dimGrid, dimBlock>>>(dev_vol_in, dev_vol_out, blocksInY);

// Unbind the image and projection matrix textures
cudaUnbindTexture(tex_proj_3D);
CUDA_CHECK_ERROR;
}
else
{
// Compute block and grid sizes
dim3 dimGrid = dim3(blocksInX, blocksInY, blocksInZ);
dim3 dimBlock = dim3(tBlock_x, tBlock_y, tBlock_z);
CUDA_CHECK_ERROR;
// Compute block and grid sizes
dim3 dimGrid = dim3(blocksInX, blocksInY, blocksInZ);
dim3 dimBlock = dim3(tBlock_x, tBlock_y, tBlock_z);
CUDA_CHECK_ERROR;

// Bind the array of projections to a 2D layered texture
cudaBindTextureToArray(tex_proj, (cudaArray *)array_proj, channelDesc);
CUDA_CHECK_ERROR;
// Bind the array of projections to a 2D layered texture
cudaBindTextureToArray(tex_proj, (cudaArray *)array_proj, channelDesc);
CUDA_CHECK_ERROR;

kernel_fdk_3Dgrid<<<dimGrid, dimBlock>>>(dev_vol_in, dev_vol_out);
kernel_fdk_3Dgrid<<<dimGrid, dimBlock>>>(dev_vol_in, dev_vol_out);

// Unbind the image and projection matrix textures
cudaUnbindTexture(tex_proj);
CUDA_CHECK_ERROR;
}
// Unbind the image and projection matrix textures
cudaUnbindTexture(tex_proj);
CUDA_CHECK_ERROR;

// Cleanup
cudaFreeArray((cudaArray *)array_proj);
199 changes: 20 additions & 179 deletions src/rtkCudaWarpBackProjectionImageFilter.cu
Original file line number Diff line number Diff line change
@@ -42,8 +42,7 @@
#include <cuda_runtime.h>

// T E X T U R E S ////////////////////////////////////////////////////////
texture<float, cudaTextureType2DLayered> tex_proj;
texture<float, 3, cudaReadModeElementType> tex_proj_3D;
texture<float, cudaTextureType2DLayered> tex_proj;

texture<float, 3, cudaReadModeElementType> tex_xdvf;
texture<float, 3, cudaReadModeElementType> tex_ydvf;
@@ -66,131 +65,6 @@ __constant__ float c_IndexInputToPPInputMatrix[12];
//_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_( S T A R T )_
//_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_

__global__ void
kernel_warp_back_project(float * dev_vol_in, float * dev_vol_out, unsigned int Blocks_Y)
{
// CUDA 2.0 does not allow for a 3D grid, which severely
// limits the manipulation of large 3D arrays of data. The
// following code is a hack to bypass this implementation
// limitation.
unsigned int blockIdx_z = blockIdx.y / Blocks_Y;
unsigned int blockIdx_y = blockIdx.y - __umul24(blockIdx_z, Blocks_Y);
unsigned int i = __umul24(blockIdx.x, blockDim.x) + threadIdx.x;
unsigned int j = __umul24(blockIdx_y, blockDim.y) + threadIdx.y;
unsigned int k = __umul24(blockIdx_z, blockDim.z) + threadIdx.z;

if (i >= c_volSize.x || j >= c_volSize.y || k >= c_volSize.z)
{
return;
}

// Index row major into the volume
long int vol_idx = i + (j + k * c_volSize.y) * (c_volSize.x);

float3 IndexInDVF, Displacement, PP, IndexInInput, ip;
float voxel_data = 0;

for (unsigned int proj = 0; proj < c_projSize.z; proj++)
{
// Compute the index in the DVF
IndexInDVF = matrix_multiply(make_float3(i, j, k), c_IndexInputToIndexDVFMatrix);

// Get each component of the displacement vector by interpolation in the DVF
Displacement.x = tex3D(tex_xdvf, IndexInDVF.x + 0.5f, IndexInDVF.y + 0.5f, IndexInDVF.z + 0.5f);
Displacement.y = tex3D(tex_ydvf, IndexInDVF.x + 0.5f, IndexInDVF.y + 0.5f, IndexInDVF.z + 0.5f);
Displacement.z = tex3D(tex_zdvf, IndexInDVF.x + 0.5f, IndexInDVF.y + 0.5f, IndexInDVF.z + 0.5f);

// Compute the physical point in input + the displacement vector
PP = matrix_multiply(make_float3(i, j, k), c_IndexInputToPPInputMatrix) + Displacement;

// Convert it to a continuous index
IndexInInput = matrix_multiply(PP, c_PPInputToIndexInputMatrix);

// Project the voxel onto the detector to find out which value to add to it
ip = matrix_multiply(IndexInInput, &(c_matrices[12 * proj]));
;

// Change coordinate systems
ip.z = 1 / ip.z;
ip.x = ip.x * ip.z;
ip.y = ip.y * ip.z;

// Get texture point, clip left to GPU
voxel_data += tex3D(tex_proj_3D, ip.x, ip.y, proj + 0.5);
}

// Place it into the volume
dev_vol_out[vol_idx] = dev_vol_in[vol_idx] + voxel_data;
}

__global__ void
kernel_warp_back_project_cylindrical_detector(float * dev_vol_in,
float * dev_vol_out,
unsigned int Blocks_Y,
float radius)
{
// CUDA 2.0 does not allow for a 3D grid, which severely
// limits the manipulation of large 3D arrays of data. The
// following code is a hack to bypass this implementation
// limitation.
unsigned int blockIdx_z = blockIdx.y / Blocks_Y;
unsigned int blockIdx_y = blockIdx.y - __umul24(blockIdx_z, Blocks_Y);
unsigned int i = __umul24(blockIdx.x, blockDim.x) + threadIdx.x;
unsigned int j = __umul24(blockIdx_y, blockDim.y) + threadIdx.y;
unsigned int k = __umul24(blockIdx_z, blockDim.z) + threadIdx.z;

if (i >= c_volSize.x || j >= c_volSize.y || k >= c_volSize.z)
{
return;
}

// Index row major into the volume
long int vol_idx = i + (j + k * c_volSize.y) * (c_volSize.x);

float3 IndexInDVF, Displacement, PP, IndexInInput, ip, pp;
float voxel_data = 0;

for (unsigned int proj = 0; proj < c_projSize.z; proj++)
{
// Compute the index in the DVF
IndexInDVF = matrix_multiply(make_float3(i, j, k), c_IndexInputToIndexDVFMatrix);

// Get each component of the displacement vector by interpolation in the DVF
Displacement.x = tex3D(tex_xdvf, IndexInDVF.x + 0.5f, IndexInDVF.y + 0.5f, IndexInDVF.z + 0.5f);
Displacement.y = tex3D(tex_ydvf, IndexInDVF.x + 0.5f, IndexInDVF.y + 0.5f, IndexInDVF.z + 0.5f);
Displacement.z = tex3D(tex_zdvf, IndexInDVF.x + 0.5f, IndexInDVF.y + 0.5f, IndexInDVF.z + 0.5f);

// Compute the physical point in input + the displacement vector
PP = matrix_multiply(make_float3(i, j, k), c_IndexInputToPPInputMatrix) + Displacement;

// Convert it to a continuous index
IndexInInput = matrix_multiply(PP, c_PPInputToIndexInputMatrix);

// Project the voxel onto the detector to find out which value to add to it
pp = matrix_multiply(IndexInInput, &(c_volIndexToProjPP[12 * proj]));

// Change coordinate systems
pp.z = 1 / pp.z;
pp.x = pp.x * pp.z;
pp.y = pp.y * pp.z;

// Apply correction for cylindrical detector
const float u = pp.x;
pp.x = radius * atan2(u, radius);
pp.y = pp.y * radius / sqrt(radius * radius + u * u);

// Get projection index
ip.x = c_projPPToProjIndex[0] * pp.x + c_projPPToProjIndex[1] * pp.y + c_projPPToProjIndex[2];
ip.y = c_projPPToProjIndex[3] * pp.x + c_projPPToProjIndex[4] * pp.y + c_projPPToProjIndex[5];

// Get texture point, clip left to GPU
voxel_data += tex3D(tex_proj_3D, ip.x, ip.y, proj + 0.5);
}

// Place it into the volume
dev_vol_out[vol_idx] = dev_vol_in[vol_idx] + voxel_data;
}

__global__ void
kernel_warp_back_project_3Dgrid(float * dev_vol_in, float * dev_vol_out)
{
@@ -227,7 +101,6 @@ kernel_warp_back_project_3Dgrid(float * dev_vol_in, float * dev_vol_out)

// Project the voxel onto the detector to find out which value to add to it
ip = matrix_multiply(IndexInInput, &(c_matrices[12 * proj]));
;

// Change coordinate systems
ip.z = 1 / ip.z;
@@ -346,25 +219,15 @@ CUDA_warp_back_project(int projSize[3],
tex_proj.filterMode = cudaFilterModeLinear;
tex_proj.normalized = false; // don't access with normalized texture coords

tex_proj_3D.addressMode[0] = cudaAddressModeBorder;
tex_proj_3D.addressMode[1] = cudaAddressModeBorder;
tex_proj_3D.addressMode[2] = cudaAddressModeBorder;
tex_proj_3D.filterMode = cudaFilterModeLinear;
tex_proj_3D.normalized = false; // don't access with normalized texture coords

// Copy projection data to array, bind the array to the texture
cudaExtent projExtent = make_cudaExtent(projSize[0], projSize[1], projSize[2]);
cudaArray * array_proj;
static cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc<float>();
CUDA_CHECK_ERROR;

// Allocate array for input projections, in order to bind them to
// either a 2D layered texture (requires GetCudaComputeCapability >= 2.0) or
// a 3D texture
if (GetCudaComputeCapability(device).first <= 1)
cudaMalloc3DArray((cudaArray **)&array_proj, &channelDesc, projExtent);
else
cudaMalloc3DArray((cudaArray **)&array_proj, &channelDesc, projExtent, cudaArrayLayered);
// a 2D layered texture
cudaMalloc3DArray((cudaArray **)&array_proj, &channelDesc, projExtent, cudaArrayLayered);
CUDA_CHECK_ERROR;

// Copy data to 3D array
@@ -460,49 +323,27 @@ CUDA_warp_back_project(int projSize[3],

// Run kernels. Note: Projection data is passed via texture memory,
// transform matrix is passed via constant memory
if (GetCudaComputeCapability(device).first <= 1)
{
// Compute block and grid sizes
dim3 dimGrid = dim3(blocksInX, blocksInY * blocksInZ);
dim3 dimBlock = dim3(tBlock_x, tBlock_y, tBlock_z);

// Bind the array of projections to a 3D texture
cudaBindTextureToArray(tex_proj_3D, (cudaArray *)array_proj, channelDesc);
CUDA_CHECK_ERROR;
// Compute block and grid sizes
dim3 dimGrid = dim3(blocksInX, blocksInY, blocksInZ);
dim3 dimBlock = dim3(tBlock_x, tBlock_y, tBlock_z);
CUDA_CHECK_ERROR;

if (radiusCylindricalDetector == 0)
kernel_warp_back_project<<<dimGrid, dimBlock>>>(dev_vol_in, dev_vol_out, blocksInY);
else
kernel_warp_back_project_cylindrical_detector<<<dimGrid, dimBlock>>>(
dev_vol_in, dev_vol_out, blocksInY, (float)radiusCylindricalDetector);
// Bind the array of projections to a 2D layered texture
cudaBindTextureToArray(tex_proj, (cudaArray *)array_proj, channelDesc);
CUDA_CHECK_ERROR;

// Unbind the image and projection matrix textures
cudaUnbindTexture(tex_proj_3D);
CUDA_CHECK_ERROR;
}
// Note: cbi->img is passed via texture memory
// Matrices are passed via constant memory
//-------------------------------------
if (radiusCylindricalDetector == 0)
kernel_warp_back_project_3Dgrid<<<dimGrid, dimBlock>>>(dev_vol_in, dev_vol_out);
else
{
// Compute block and grid sizes
dim3 dimGrid = dim3(blocksInX, blocksInY, blocksInZ);
dim3 dimBlock = dim3(tBlock_x, tBlock_y, tBlock_z);
CUDA_CHECK_ERROR;

// Bind the array of projections to a 2D layered texture
cudaBindTextureToArray(tex_proj, (cudaArray *)array_proj, channelDesc);
CUDA_CHECK_ERROR;

// Note: cbi->img is passed via texture memory
// Matrices are passed via constant memory
//-------------------------------------
if (radiusCylindricalDetector == 0)
kernel_warp_back_project_3Dgrid<<<dimGrid, dimBlock>>>(dev_vol_in, dev_vol_out);
else
kernel_warp_back_project_3Dgrid_cylindrical_detector<<<dimGrid, dimBlock>>>(
dev_vol_in, dev_vol_out, (float)radiusCylindricalDetector);
// Unbind the image and projection matrix textures
cudaUnbindTexture(tex_proj);
CUDA_CHECK_ERROR;
}
kernel_warp_back_project_3Dgrid_cylindrical_detector<<<dimGrid, dimBlock>>>(
dev_vol_in, dev_vol_out, (float)radiusCylindricalDetector);
// Unbind the image and projection matrix textures
cudaUnbindTexture(tex_proj);
CUDA_CHECK_ERROR;

// Unbind the image and projection matrix textures
cudaUnbindTexture(tex_xdvf);
71 changes: 6 additions & 65 deletions src/rtkCudaWarpImageFilter.cu
Original file line number Diff line number Diff line change
@@ -63,51 +63,6 @@ __constant__ float c_PPInputToIndexInputMatrix[12];
//_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_( S T A R T )_
//_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_

__global__ void
kernel(float * dev_vol_out, int3 vol_dim, unsigned int Blocks_Y)
{
// CUDA 2.0 does not allow for a 3D grid, which severely
// limits the manipulation of large 3D arrays of data. The
// following code is a hack to bypass this implementation
// limitation.
unsigned int blockIdx_z = blockIdx.y / Blocks_Y;
unsigned int blockIdx_y = blockIdx.y - __umul24(blockIdx_z, Blocks_Y);
unsigned int i = __umul24(blockIdx.x, blockDim.x) + threadIdx.x;
unsigned int j = __umul24(blockIdx_y, blockDim.y) + threadIdx.y;
unsigned int k = __umul24(blockIdx_z, blockDim.z) + threadIdx.z;

if (i >= vol_dim.x || j >= vol_dim.y || k >= vol_dim.z)
{
return;
}

// Index row major into the volume
long int vol_idx = i + (j + k * vol_dim.y) * (vol_dim.x);

// Matrix multiply to get the index in the DVF texture of the current point in the output volume
float3 IndexInDVF = matrix_multiply(make_float3(i, j, k), c_IndexOutputToIndexDVFMatrix);

// Get each component of the displacement vector by
// interpolation in the dvf
float3 Displacement;
Displacement.x = tex3D(tex_xdvf, IndexInDVF.x + 0.5f, IndexInDVF.y + 0.5f, IndexInDVF.z + 0.5f);
Displacement.y = tex3D(tex_ydvf, IndexInDVF.x + 0.5f, IndexInDVF.y + 0.5f, IndexInDVF.z + 0.5f);
Displacement.z = tex3D(tex_zdvf, IndexInDVF.x + 0.5f, IndexInDVF.y + 0.5f, IndexInDVF.z + 0.5f);

// Matrix multiply to get the physical coordinates of the current point in the output volume
float3 PPinOutput = matrix_multiply(make_float3(i, j, k), c_IndexOutputToPPOutputMatrix);

// Get the index corresponding to the current physical point in output displaced by the displacement vector
float3 PPDisplaced;
PPDisplaced.x = PPinOutput.x + Displacement.x;
PPDisplaced.y = PPinOutput.y + Displacement.y;
PPDisplaced.z = PPinOutput.z + Displacement.z;
float3 IndexInInput = matrix_multiply(PPDisplaced, c_PPInputToIndexInputMatrix);

// Interpolate in the input and copy into the output
dev_vol_out[vol_idx] = tex3D(tex_input_vol, IndexInInput.x + 0.5f, IndexInInput.y + 0.5f, IndexInInput.z + 0.5f);
}

__global__ void
kernel_3Dgrid(float * dev_vol_out, int3 vol_dim)
{
@@ -165,7 +120,6 @@ CUDA_warp(int input_vol_dim[3],
float * dev_DVF,
bool isLinear)
{

// Prepare channel description for arrays
static cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc<float>();

@@ -301,26 +255,13 @@ CUDA_warp(int input_vol_dim[3],
unsigned int blocksInY = (output_vol_dim[1] - 1) / tBlock_y + 1;
unsigned int blocksInZ = (output_vol_dim[2] - 1) / tBlock_z + 1;

if (GetCudaComputeCapability(device).first <= 1)
{
dim3 dimGrid = dim3(blocksInX, blocksInY * blocksInZ);
dim3 dimBlock = dim3(tBlock_x, tBlock_y, tBlock_z);
dim3 dimGrid = dim3(blocksInX, blocksInY, blocksInZ);
dim3 dimBlock = dim3(tBlock_x, tBlock_y, tBlock_z);

// Note: the DVF and input image are passed via texture memory
//-------------------------------------
kernel<<<dimGrid, dimBlock>>>(
dev_output_vol, make_int3(output_vol_dim[0], output_vol_dim[1], output_vol_dim[2]), blocksInY);
}
else
{
dim3 dimGrid = dim3(blocksInX, blocksInY, blocksInZ);
dim3 dimBlock = dim3(tBlock_x, tBlock_y, tBlock_z);

// Note: the DVF and input image are passed via texture memory
//-------------------------------------
kernel_3Dgrid<<<dimGrid, dimBlock>>>(dev_output_vol,
make_int3(output_vol_dim[0], output_vol_dim[1], output_vol_dim[2]));
}
// Note: the DVF and input image are passed via texture memory
//-------------------------------------
kernel_3Dgrid<<<dimGrid, dimBlock>>>(dev_output_vol,
make_int3(output_vol_dim[0], output_vol_dim[1], output_vol_dim[2]));

CUDA_CHECK_ERROR;