Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Cuda12 #589

Merged
merged 5 commits into from
Apr 21, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion include/rtkCudaDisplacedDetectorImageFilter.hcu
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@ CUDA_displaced_weight(int proj_idx_in[3],
int proj_dim_out_buf[2],
float * dev_proj_in,
float * dev_proj_out,
float * geometries,
float * geometry,
float theta,
bool isPositiveCase,
float proj_orig,
Expand Down
2 changes: 1 addition & 1 deletion include/rtkCudaFDKWeightProjectionFilter.hcu
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@ CUDA_weight_projection(int proj_idx[2],
int proj_dim_buf_out[2],
float * dev_proj_in,
float * dev_proj_out,
float * geometries,
float * geometry,
float proj_orig[2],
float proj_row[2],
float proj_col[2]);
Expand Down
2 changes: 1 addition & 1 deletion include/rtkCudaParkerShortScanImageFilter.hcu
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@ CUDA_parker_weight(int proj_idx[2],
int proj_dim_buf_out[2],
float * dev_proj_in,
float * dev_proj_out,
float * geometries,
float * geometry,
float delta,
float firstAngle,
float proj_orig,
Expand Down
30 changes: 24 additions & 6 deletions include/rtkCudaUtilities.hcu
Original file line number Diff line number Diff line change
Expand Up @@ -126,13 +126,31 @@ inline __host__ __device__ float
{
return u.x * v.x + u.y * v.y + u.z * v.z;
}

__host__ void
prepareScalarTextureObject(int size[3],
float * dev_ptr,
cudaArray *& threeDArray,
cudaTextureObject_t & tex,
const bool isProjections,
const bool isLinear = true,
const cudaTextureAddressMode texAddressMode = cudaAddressModeBorder);

__host__ void
prepareVectorTextureObject(int size[3],
const float * dev_ptr,
std::vector<cudaArray *> & componentArrays,
const unsigned int nComponents,
std::vector<cudaTextureObject_t> & tex,
const bool isProjections,
const cudaTextureAddressMode texAddressMode = cudaAddressModeBorder);

__host__ void
prepareTextureObject(int size[3],
float * dev_ptr,
cudaArray **& componentArrays,
unsigned int nComponents,
cudaTextureObject_t * tex,
bool isProjections);
prepareGeometryTextureObject(int size,
const float * geometry,
float *& dev_geom,
cudaTextureObject_t & tex_geom,
const unsigned int nParam);

inline __device__ void
matrix_matrix_multiply(float * A, float * B, float * C, unsigned int rowsA, unsigned int colsB, unsigned int colsArowsB)
Expand Down
2 changes: 0 additions & 2 deletions itk-module-init.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -44,8 +44,6 @@ if(RTK_USE_CUDA)
find_package(CUDAToolkit REQUIRED 8.0)
endif()

set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -Wno-deprecated-gpu-targets")
set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -Wno-deprecated-declarations")
set(RTK_CUDA_PROJECTIONS_SLAB_SIZE "16" CACHE STRING "Number of projections processed simultaneously in CUDA forward and back projections")
elseif(RTK_CUDA_VERSION)
message(FATAL_ERROR "RTK_CUDA_VERSION is set but the CUDA toolkit has not been found.")
Expand Down
39 changes: 14 additions & 25 deletions src/rtkCudaBackProjectionImageFilter.cu
Original file line number Diff line number Diff line change
Expand Up @@ -51,7 +51,7 @@ __constant__ int3 c_volSize;
//_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_( S T A R T )_
//_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_

template <unsigned int vectorLength, bool isCylindrical>
template <unsigned int VVectorLength, bool VIsCylindrical>
__global__ void
kernel_backProject(float * dev_vol_in, float * dev_vol_out, float radius, cudaTextureObject_t * dev_tex_proj)
{
Expand All @@ -68,13 +68,13 @@ kernel_backProject(float * dev_vol_in, float * dev_vol_out, float radius, cudaTe
itk::SizeValueType vol_idx = i + (j + k * c_volSize.y) * (c_volSize.x);

float3 ip, pp;
float voxel_data[vectorLength];
for (unsigned int c = 0; c < vectorLength; c++)
float voxel_data[VVectorLength];
for (unsigned int c = 0; c < VVectorLength; c++)
voxel_data[c] = 0.0f;

for (unsigned int proj = 0; proj < c_projSize.z; proj++)
{
if (isCylindrical)
if (VIsCylindrical)
{
// matrix multiply
pp = matrix_multiply(make_float3(i, j, k), &(c_volIndexToProjPP[12 * proj]));
Expand Down Expand Up @@ -105,13 +105,13 @@ kernel_backProject(float * dev_vol_in, float * dev_vol_out, float radius, cudaTe
}

// Get texture point, clip left to GPU, and accumulate in voxel_data
for (unsigned int c = 0; c < vectorLength; c++)
for (unsigned int c = 0; c < VVectorLength; c++)
voxel_data[c] += tex2DLayered<float>(dev_tex_proj[c], ip.x, ip.y, proj);
}

// Place it into the volume
for (unsigned int c = 0; c < vectorLength; c++)
dev_vol_out[vol_idx * vectorLength + c] = dev_vol_in[vol_idx * vectorLength + c] + voxel_data[c];
for (unsigned int c = 0; c < VVectorLength; c++)
dev_vol_out[vol_idx * VVectorLength + c] = dev_vol_in[vol_idx * VVectorLength + c] + voxel_data[c];
}


Expand All @@ -134,14 +134,6 @@ CUDA_back_project(int projSize[3],
double radiusCylindricalDetector,
unsigned int vectorLength)
{
// 2D layered texture requires CudaComputeCapability >= 2.0
int device;
cudaGetDevice(&device);
if (GetCudaComputeCapability(device).first <= 1)
{
itkGenericExceptionMacro(<< "RTK no longer supports GPUs with CudaComputeCapability < 2.0");
}

// Copy the size of inputs into constant memory
cudaMemcpyToSymbol(c_projSize, projSize, sizeof(int3));
cudaMemcpyToSymbol(c_volSize, volSize, sizeof(int3));
Expand Down Expand Up @@ -169,18 +161,15 @@ CUDA_back_project(int projSize[3],
dim3 dimBlock = dim3(tBlock_x, tBlock_y, tBlock_z);
CUDA_CHECK_ERROR;

cudaArray ** projComponentArrays = new cudaArray *[vectorLength];

// Create an array of textures
cudaTextureObject_t * tex_proj = new cudaTextureObject_t[vectorLength];

// Prepare texture objects (needs an array of cudaTextureObjects on the host as "tex_proj" argument)
prepareTextureObject(projSize, dev_proj, projComponentArrays, vectorLength, tex_proj, true);
std::vector<cudaArray *> projComponentArrays;
std::vector<cudaTextureObject_t> tex_proj;
prepareVectorTextureObject(projSize, dev_proj, projComponentArrays, vectorLength, tex_proj, true);

// Copy them to a device pointer, since it will have to be de-referenced in the kernels
cudaTextureObject_t * dev_tex_proj;
cudaMalloc(&dev_tex_proj, vectorLength * sizeof(cudaTextureObject_t));
cudaMemcpy(dev_tex_proj, tex_proj, vectorLength * sizeof(cudaTextureObject_t), cudaMemcpyHostToDevice);
cudaMemcpy(dev_tex_proj, tex_proj.data(), vectorLength * sizeof(cudaTextureObject_t), cudaMemcpyHostToDevice);

// Run the kernel. Since "vectorLength" is passed as a function argument, not as a template argument,
// the compiler can't assume it's constant, and a dirty trick has to be used.
Expand Down Expand Up @@ -245,11 +234,11 @@ CUDA_back_project(int projSize[3],
// Cleanup
for (unsigned int c = 0; c < vectorLength; c++)
{
cudaFreeArray((cudaArray *)projComponentArrays[c]);
cudaFreeArray(projComponentArrays[c]);
CUDA_CHECK_ERROR;
cudaDestroyTextureObject(tex_proj[c]);
CUDA_CHECK_ERROR;
}
cudaFree(dev_tex_proj);
delete[] tex_proj;
delete[] projComponentArrays;
CUDA_CHECK_ERROR;
}
54 changes: 25 additions & 29 deletions src/rtkCudaDisplacedDetectorImageFilter.cu
Original file line number Diff line number Diff line change
Expand Up @@ -22,8 +22,6 @@
#include <cuda_runtime.h>
#include <math_constants.h>

texture<float, 1, cudaReadModeElementType> tex_geometry; // geometry texture

inline __device__ float
TransformIndexToPhysicalPoint(int2 idx, float origin, float row, float column)
{
Expand All @@ -45,19 +43,20 @@ ToUntiltedCoordinateAtIsocenter(float tiltedCoord, float sdd, float sid, float s
}

__global__ void
kernel_displaced_weight(int3 proj_idx_in,
int3 proj_size_in,
int2 proj_size_in_buf,
int3 proj_idx_out,
int3 proj_size_out,
int2 proj_size_out_buf,
float * dev_proj_in,
float * dev_proj_out,
float theta,
bool isPositiveCase,
float proj_orig, // projection origin
float proj_row, // projection row direction & spacing
float proj_col // projection col direction & spacing
kernel_displaced_weight(int3 proj_idx_in,
int3 proj_size_in,
int2 proj_size_in_buf,
int3 proj_idx_out,
int3 proj_size_out,
int2 proj_size_out_buf,
float * dev_proj_in,
float * dev_proj_out,
float theta,
bool isPositiveCase,
float proj_orig, // projection origin
float proj_row, // projection row direction & spacing
float proj_col, // projection col direction & spacing
cudaTextureObject_t tex_geom // geometry texture object
)
{
// compute thread index
Expand Down Expand Up @@ -90,10 +89,10 @@ kernel_displaced_weight(int3 proj_idx_in,
{
float pPoint = TransformIndexToPhysicalPoint(make_int2(pIdx.x, pIdx.y), proj_orig, proj_row, proj_col);

float sdd = tex1Dfetch(tex_geometry, tIdx.z * 4 + 0);
float sx = tex1Dfetch(tex_geometry, tIdx.z * 4 + 1);
float px = tex1Dfetch(tex_geometry, tIdx.z * 4 + 2);
float sid = tex1Dfetch(tex_geometry, tIdx.z * 4 + 3);
float sdd = tex1Dfetch<float>(tex_geom, tIdx.z * 4 + 0);
float sx = tex1Dfetch<float>(tex_geom, tIdx.z * 4 + 1);
float px = tex1Dfetch<float>(tex_geom, tIdx.z * 4 + 2);
float sid = tex1Dfetch<float>(tex_geom, tIdx.z * 4 + 3);

float hyp = sqrtf(sid * sid + sx * sx); // to untilted situation
float l = ToUntiltedCoordinateAtIsocenter(pPoint, sdd, sid, sx, px, hyp);
Expand Down Expand Up @@ -139,21 +138,17 @@ CUDA_displaced_weight(int proj_idx_in[3], // overlapping input region i
int proj_dim_out_buf[2], // output size of buffered region
float * dev_proj_in,
float * dev_proj_out,
float * geometries,
float * geometry,
float theta,
bool isPositiveCase,
float proj_orig,
float proj_row,
float proj_col)
{
// copy geometry matrix to device, bind the matrix to the texture
float * dev_geom;
cudaMalloc((void **)&dev_geom, proj_dim_out[2] * 4 * sizeof(float));
CUDA_CHECK_ERROR;
cudaMemcpy(dev_geom, geometries, proj_dim_out[2] * 4 * sizeof(float), cudaMemcpyHostToDevice);
CUDA_CHECK_ERROR;
cudaBindTexture(0, tex_geometry, dev_geom, proj_dim_out[2] * 4 * sizeof(float));
CUDA_CHECK_ERROR;
float * dev_geom;
cudaTextureObject_t tex_geom;
prepareGeometryTextureObject(proj_dim_out[2], geometry, dev_geom, tex_geom, 4);

// Thread Block Dimensions
int tBlock_x = 16;
Expand All @@ -179,10 +174,11 @@ CUDA_displaced_weight(int proj_idx_in[3], // overlapping input region i
isPositiveCase,
proj_orig,
proj_row,
proj_col);
proj_col,
tex_geom);

// Unbind matrix texture
cudaUnbindTexture(tex_geometry);
cudaDestroyTextureObject(tex_geom);
CUDA_CHECK_ERROR;
cudaFree(dev_geom);
CUDA_CHECK_ERROR;
Expand Down
Loading
Loading