Skip to content

Commit

Permalink
ENH: Replace texture references by objects for CUDA 12 compatibility
Browse files Browse the repository at this point in the history
  • Loading branch information
Simon Rit authored and SimonRit committed Apr 21, 2024
1 parent f109868 commit 5361f69
Show file tree
Hide file tree
Showing 15 changed files with 374 additions and 741 deletions.
2 changes: 1 addition & 1 deletion include/rtkCudaDisplacedDetectorImageFilter.hcu
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@ CUDA_displaced_weight(int proj_idx_in[3],
int proj_dim_out_buf[2],
float * dev_proj_in,
float * dev_proj_out,
float * geometries,
float * geometry,
float theta,
bool isPositiveCase,
float proj_orig,
Expand Down
2 changes: 1 addition & 1 deletion include/rtkCudaFDKWeightProjectionFilter.hcu
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@ CUDA_weight_projection(int proj_idx[2],
int proj_dim_buf_out[2],
float * dev_proj_in,
float * dev_proj_out,
float * geometries,
float * geometry,
float proj_orig[2],
float proj_row[2],
float proj_col[2]);
Expand Down
2 changes: 1 addition & 1 deletion include/rtkCudaParkerShortScanImageFilter.hcu
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@ CUDA_parker_weight(int proj_idx[2],
int proj_dim_buf_out[2],
float * dev_proj_in,
float * dev_proj_out,
float * geometries,
float * geometry,
float delta,
float firstAngle,
float proj_orig,
Expand Down
18 changes: 17 additions & 1 deletion include/rtkCudaUtilities.hcu
Original file line number Diff line number Diff line change
Expand Up @@ -127,14 +127,30 @@ inline __host__ __device__ float
return u.x * v.x + u.y * v.y + u.z * v.z;
}

__host__ void
prepareScalarTextureObject(int size[3],
float * dev_ptr,
cudaArray *& threeDArray,
cudaTextureObject_t & tex,
const bool isProjections,
const bool isLinear = true,
const cudaTextureAddressMode texAddressMode = cudaAddressModeBorder);

__host__ void
prepareVectorTextureObject(int size[3],
const float * dev_ptr,
std::vector<cudaArray *> & componentArrays,
const unsigned int nComponents,
std::vector<cudaTextureObject_t> & tex,
bool isProjections);
const bool isProjections,
const cudaTextureAddressMode texAddressMode = cudaAddressModeBorder);

__host__ void
prepareGeometryTextureObject(int size,
const float * geometry,
float *& dev_geom,
cudaTextureObject_t & tex_geom,
const unsigned int nParam);

inline __device__ void
matrix_matrix_multiply(float * A, float * B, float * C, unsigned int rowsA, unsigned int colsB, unsigned int colsArowsB)
Expand Down
54 changes: 25 additions & 29 deletions src/rtkCudaDisplacedDetectorImageFilter.cu
Original file line number Diff line number Diff line change
Expand Up @@ -22,8 +22,6 @@
#include <cuda_runtime.h>
#include <math_constants.h>

texture<float, 1, cudaReadModeElementType> tex_geometry; // geometry texture

inline __device__ float
TransformIndexToPhysicalPoint(int2 idx, float origin, float row, float column)
{
Expand All @@ -45,19 +43,20 @@ ToUntiltedCoordinateAtIsocenter(float tiltedCoord, float sdd, float sid, float s
}

__global__ void
kernel_displaced_weight(int3 proj_idx_in,
int3 proj_size_in,
int2 proj_size_in_buf,
int3 proj_idx_out,
int3 proj_size_out,
int2 proj_size_out_buf,
float * dev_proj_in,
float * dev_proj_out,
float theta,
bool isPositiveCase,
float proj_orig, // projection origin
float proj_row, // projection row direction & spacing
float proj_col // projection col direction & spacing
kernel_displaced_weight(int3 proj_idx_in,
int3 proj_size_in,
int2 proj_size_in_buf,
int3 proj_idx_out,
int3 proj_size_out,
int2 proj_size_out_buf,
float * dev_proj_in,
float * dev_proj_out,
float theta,
bool isPositiveCase,
float proj_orig, // projection origin
float proj_row, // projection row direction & spacing
float proj_col, // projection col direction & spacing
cudaTextureObject_t tex_geom // geometry texture object
)
{
// compute thread index
Expand Down Expand Up @@ -90,10 +89,10 @@ kernel_displaced_weight(int3 proj_idx_in,
{
float pPoint = TransformIndexToPhysicalPoint(make_int2(pIdx.x, pIdx.y), proj_orig, proj_row, proj_col);

float sdd = tex1Dfetch(tex_geometry, tIdx.z * 4 + 0);
float sx = tex1Dfetch(tex_geometry, tIdx.z * 4 + 1);
float px = tex1Dfetch(tex_geometry, tIdx.z * 4 + 2);
float sid = tex1Dfetch(tex_geometry, tIdx.z * 4 + 3);
float sdd = tex1Dfetch<float>(tex_geom, tIdx.z * 4 + 0);
float sx = tex1Dfetch<float>(tex_geom, tIdx.z * 4 + 1);
float px = tex1Dfetch<float>(tex_geom, tIdx.z * 4 + 2);
float sid = tex1Dfetch<float>(tex_geom, tIdx.z * 4 + 3);

float hyp = sqrtf(sid * sid + sx * sx); // to untilted situation
float l = ToUntiltedCoordinateAtIsocenter(pPoint, sdd, sid, sx, px, hyp);
Expand Down Expand Up @@ -139,21 +138,17 @@ CUDA_displaced_weight(int proj_idx_in[3], // overlapping input region i
int proj_dim_out_buf[2], // output size of buffered region
float * dev_proj_in,
float * dev_proj_out,
float * geometries,
float * geometry,
float theta,
bool isPositiveCase,
float proj_orig,
float proj_row,
float proj_col)
{
// copy geometry matrix to device, bind the matrix to the texture
float * dev_geom;
cudaMalloc((void **)&dev_geom, proj_dim_out[2] * 4 * sizeof(float));
CUDA_CHECK_ERROR;
cudaMemcpy(dev_geom, geometries, proj_dim_out[2] * 4 * sizeof(float), cudaMemcpyHostToDevice);
CUDA_CHECK_ERROR;
cudaBindTexture(0, tex_geometry, dev_geom, proj_dim_out[2] * 4 * sizeof(float));
CUDA_CHECK_ERROR;
float * dev_geom;
cudaTextureObject_t tex_geom;
prepareGeometryTextureObject(proj_dim_out[2], geometry, dev_geom, tex_geom, 4);

// Thread Block Dimensions
int tBlock_x = 16;
Expand All @@ -179,10 +174,11 @@ CUDA_displaced_weight(int proj_idx_in[3], // overlapping input region i
isPositiveCase,
proj_orig,
proj_row,
proj_col);
proj_col,
tex_geom);

// Unbind matrix texture
cudaUnbindTexture(tex_geometry);
cudaDestroyTextureObject(tex_geom);
CUDA_CHECK_ERROR;
cudaFree(dev_geom);
CUDA_CHECK_ERROR;
Expand Down
50 changes: 8 additions & 42 deletions src/rtkCudaFDKBackProjectionImageFilter.cu
Original file line number Diff line number Diff line change
Expand Up @@ -39,9 +39,6 @@
*****************/
#include <cuda.h>

// T E X T U R E S ////////////////////////////////////////////////////////
texture<float, cudaTextureType2DLayered> tex_proj;

// Constant memory
__constant__ float c_matrices[SLAB_SIZE * 12]; // Can process stacks of at most SLAB_SIZE projections
__constant__ int3 c_projSize;
Expand All @@ -53,7 +50,7 @@ __constant__ int3 c_vol_size;
//_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_

__global__ void
kernel_fdk_3Dgrid(float * dev_vol_in, float * dev_vol_out)
kernel_fdk_3Dgrid(float * dev_vol_in, float * dev_vol_out, cudaTextureObject_t tex_proj)
{
itk::SizeValueType i = __umul24(blockIdx.x, blockDim.x) + threadIdx.x;
itk::SizeValueType j = __umul24(blockIdx.y, blockDim.y) + threadIdx.y;
Expand Down Expand Up @@ -81,7 +78,7 @@ kernel_fdk_3Dgrid(float * dev_vol_in, float * dev_vol_out)
ip.y = ip.y * ip.z;

// Get texture point, clip left to GPU, and accumulate in voxel_data
voxel_data += tex2DLayered(tex_proj, ip.x, ip.y, proj) * ip.z * ip.z;
voxel_data += tex2DLayered<float>(tex_proj, ip.x, ip.y, proj) * ip.z * ip.z;
}

// Place it into the volume
Expand Down Expand Up @@ -110,33 +107,6 @@ CUDA_reconstruct_conebeam(int proj_size[3],
// Copy the projection matrices into constant memory
cudaMemcpyToSymbol(c_matrices, &(matrices[0]), 12 * sizeof(float) * proj_size[2]);

// set texture parameters
tex_proj.addressMode[0] = cudaAddressModeBorder;
tex_proj.addressMode[1] = cudaAddressModeBorder;
tex_proj.addressMode[2] = cudaAddressModeBorder;
tex_proj.filterMode = cudaFilterModeLinear;
tex_proj.normalized = false; // don't access with normalized texture coords

// Copy projection data to array, bind the array to the texture
cudaExtent projExtent = make_cudaExtent(proj_size[0], proj_size[1], proj_size[2]);
cudaArray * array_proj;
static cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc<float>();
CUDA_CHECK_ERROR;

// Allocate array for input projections, in order to bind them to
// a 2D layered texture
cudaMalloc3DArray((cudaArray **)&array_proj, &channelDesc, projExtent, cudaArrayLayered);
CUDA_CHECK_ERROR;

// Copy data to 3D array
cudaMemcpy3DParms copyParams = cudaMemcpy3DParms();
copyParams.srcPtr = make_cudaPitchedPtr(dev_proj, proj_size[0] * sizeof(float), proj_size[0], proj_size[1]);
copyParams.dstArray = (cudaArray *)array_proj;
copyParams.extent = projExtent;
copyParams.kind = cudaMemcpyDeviceToDevice;
cudaMemcpy3D(&copyParams);
CUDA_CHECK_ERROR;

// Thread Block Dimensions
constexpr int tBlock_x = 16;
constexpr int tBlock_y = 4;
Expand All @@ -153,19 +123,15 @@ CUDA_reconstruct_conebeam(int proj_size[3],
// Compute block and grid sizes
dim3 dimGrid = dim3(blocksInX, blocksInY, blocksInZ);
dim3 dimBlock = dim3(tBlock_x, tBlock_y, tBlock_z);
CUDA_CHECK_ERROR;

// Bind the array of projections to a 2D layered texture
cudaBindTextureToArray(tex_proj, (cudaArray *)array_proj, channelDesc);
CUDA_CHECK_ERROR;

kernel_fdk_3Dgrid<<<dimGrid, dimBlock>>>(dev_vol_in, dev_vol_out);

// Unbind the image and projection matrix textures
cudaUnbindTexture(tex_proj);
CUDA_CHECK_ERROR;
cudaArray * array_proj;
cudaTextureObject_t tex_proj;
prepareScalarTextureObject(proj_size, dev_proj, array_proj, tex_proj, true);
kernel_fdk_3Dgrid<<<dimGrid, dimBlock>>>(dev_vol_in, dev_vol_out, tex_proj);

// Cleanup
cudaFreeArray((cudaArray *)array_proj);
CUDA_CHECK_ERROR;
cudaDestroyTextureObject(tex_proj);
CUDA_CHECK_ERROR;
}
55 changes: 25 additions & 30 deletions src/rtkCudaFDKWeightProjectionFilter.cu
Original file line number Diff line number Diff line change
Expand Up @@ -21,24 +21,23 @@
#include <cuda.h>
#include <cuda_runtime.h>

texture<float, 1, cudaReadModeElementType> tex_geometry; // geometry texture

inline __device__ float2
TransformIndexToPhysicalPoint(int2 idx, float2 origin, float2 row, float2 column)
{
return make_float2(origin.x + row.x * idx.x + column.x * idx.y, origin.y + row.y * idx.x + column.y * idx.y);
}

__global__ void
kernel_weight_projection(int2 proj_idx,
int3 proj_size,
int2 proj_size_buf_in,
int2 proj_size_buf_out,
float * dev_proj_in,
float * dev_proj_out,
float2 proj_orig, // projection origin
float2 proj_row, // projection row direction & spacing
float2 proj_col // projection col direction & spacing
kernel_weight_projection(int2 proj_idx,
int3 proj_size,
int2 proj_size_buf_in,
int2 proj_size_buf_out,
float * dev_proj_in,
float * dev_proj_out,
float2 proj_orig, // projection origin
float2 proj_row, // projection row direction & spacing
float2 proj_col, // projection col direction & spacing
cudaTextureObject_t tex_geom // geometry texture object
)
{
// compute projection index (== thread index)
Expand All @@ -53,19 +52,19 @@ kernel_weight_projection(int2 proj_idx,
if (pIdx.x >= proj_size.x || pIdx.y >= proj_size.y || pIdx.z >= proj_size.z)
return;

const float sdd = tex1Dfetch(tex_geometry, pIdx.z * 7 + 0);
const float sid = tex1Dfetch(tex_geometry, pIdx.z * 7 + 1);
const float wFac = tex1Dfetch(tex_geometry, pIdx.z * 7 + 5);
const float sdd = tex1Dfetch<float>(tex_geom, pIdx.z * 7);
const float sid = tex1Dfetch<float>(tex_geom, pIdx.z * 7 + 1);
const float wFac = tex1Dfetch<float>(tex_geom, pIdx.z * 7 + 5);
if (sdd == 0) // parallel
{
dev_proj_out[pIdx_comp_out] = dev_proj_in[pIdx_comp_in] * wFac;
}
else // divergent
{
const float pOffX = tex1Dfetch(tex_geometry, pIdx.z * 7 + 2);
const float pOffY = tex1Dfetch(tex_geometry, pIdx.z * 7 + 3);
const float sOffY = tex1Dfetch(tex_geometry, pIdx.z * 7 + 4);
const float tAngle = tex1Dfetch(tex_geometry, pIdx.z * 7 + 6);
const float pOffX = tex1Dfetch<float>(tex_geom, pIdx.z * 7 + 2);
const float pOffY = tex1Dfetch<float>(tex_geom, pIdx.z * 7 + 3);
const float sOffY = tex1Dfetch<float>(tex_geom, pIdx.z * 7 + 4);
const float tAngle = tex1Dfetch<float>(tex_geom, pIdx.z * 7 + 6);
const float sina = sin(tAngle);
const float cosa = cos(tAngle);
const float tana = tan(tAngle);
Expand All @@ -90,19 +89,14 @@ CUDA_weight_projection(int proj_idx[2],
int proj_dim_buf_out[2],
float * dev_proj_in,
float * dev_proj_out,
float * geometries,
float * geometry,
float proj_orig[2],
float proj_row[2],
float proj_col[2])
{
// copy geometry matrix to device, bind the matrix to the texture
float * dev_geom;
cudaMalloc((void **)&dev_geom, proj_dim[2] * 7 * sizeof(float));
CUDA_CHECK_ERROR;
cudaMemcpy(dev_geom, geometries, proj_dim[2] * 7 * sizeof(float), cudaMemcpyHostToDevice);
CUDA_CHECK_ERROR;
cudaBindTexture(0, tex_geometry, dev_geom, proj_dim[2] * 7 * sizeof(float));
CUDA_CHECK_ERROR;
float * dev_geom;
cudaTextureObject_t tex_geom;
prepareGeometryTextureObject(proj_dim[2], geometry, dev_geom, tex_geom, 7);

// Thread Block Dimensions
int tBlock_x = 16;
Expand All @@ -124,10 +118,11 @@ CUDA_weight_projection(int proj_idx[2],
dev_proj_out,
make_float2(proj_orig[0], proj_orig[1]),
make_float2(proj_row[0], proj_row[1]),
make_float2(proj_col[0], proj_col[1]));
make_float2(proj_col[0], proj_col[1]),
tex_geom);

// Unbind matrix texture
cudaUnbindTexture(tex_geometry);
// destroy texture object
cudaDestroyTextureObject(tex_geom);
CUDA_CHECK_ERROR;
cudaFree(dev_geom);
CUDA_CHECK_ERROR;
Expand Down
3 changes: 1 addition & 2 deletions src/rtkCudaForwardProjectionImageFilter.cu
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,6 @@
* CUDA #includes *
*****************/
#include <cuda.h>
#include <cublas_v2.h>
#include <cuda_runtime.h>

// CONSTANTS //
Expand Down Expand Up @@ -201,7 +200,7 @@ CUDA_forward_project(int projSize[3],
// Prepare texture objects
std::vector<cudaArray *> volComponentArrays;
std::vector<cudaTextureObject_t> tex_vol;
prepareVectorTextureObject(volSize, dev_vol, volComponentArrays, vectorLength, tex_vol, false);
prepareVectorTextureObject(volSize, dev_vol, volComponentArrays, vectorLength, tex_vol, false, cudaAddressModeClamp);

// Copy them to a device pointer, since it will have to be de-referenced in the kernels
cudaTextureObject_t * dev_tex_vol;
Expand Down
Loading

0 comments on commit 5361f69

Please sign in to comment.