diff --git a/include/rtkCudaDisplacedDetectorImageFilter.hcu b/include/rtkCudaDisplacedDetectorImageFilter.hcu index b115c9ac1..8c50a12ed 100644 --- a/include/rtkCudaDisplacedDetectorImageFilter.hcu +++ b/include/rtkCudaDisplacedDetectorImageFilter.hcu @@ -28,7 +28,7 @@ CUDA_displaced_weight(int proj_idx_in[3], int proj_dim_out_buf[2], float * dev_proj_in, float * dev_proj_out, - float * geometries, + float * geometry, float theta, bool isPositiveCase, float proj_orig, diff --git a/include/rtkCudaFDKWeightProjectionFilter.hcu b/include/rtkCudaFDKWeightProjectionFilter.hcu index f6284a255..353230ce1 100644 --- a/include/rtkCudaFDKWeightProjectionFilter.hcu +++ b/include/rtkCudaFDKWeightProjectionFilter.hcu @@ -26,7 +26,7 @@ CUDA_weight_projection(int proj_idx[2], int proj_dim_buf_out[2], float * dev_proj_in, float * dev_proj_out, - float * geometries, + float * geometry, float proj_orig[2], float proj_row[2], float proj_col[2]); diff --git a/include/rtkCudaParkerShortScanImageFilter.hcu b/include/rtkCudaParkerShortScanImageFilter.hcu index e39e8783f..aca4c24d1 100644 --- a/include/rtkCudaParkerShortScanImageFilter.hcu +++ b/include/rtkCudaParkerShortScanImageFilter.hcu @@ -26,7 +26,7 @@ CUDA_parker_weight(int proj_idx[2], int proj_dim_buf_out[2], float * dev_proj_in, float * dev_proj_out, - float * geometries, + float * geometry, float delta, float firstAngle, float proj_orig, diff --git a/include/rtkCudaUtilities.hcu b/include/rtkCudaUtilities.hcu index 612a8fda6..737da666e 100644 --- a/include/rtkCudaUtilities.hcu +++ b/include/rtkCudaUtilities.hcu @@ -127,14 +127,30 @@ inline __host__ __device__ float return u.x * v.x + u.y * v.y + u.z * v.z; } +__host__ void +prepareScalarTextureObject(int size[3], + float * dev_ptr, + cudaArray *& threeDArray, + cudaTextureObject_t & tex, + const bool isProjections, + const bool isLinear = true, + const cudaTextureAddressMode texAddressMode = cudaAddressModeBorder); + __host__ void prepareVectorTextureObject(int size[3], const float * dev_ptr, std::vector & componentArrays, const unsigned int nComponents, std::vector & tex, - bool isProjections); + const bool isProjections, + const cudaTextureAddressMode texAddressMode = cudaAddressModeBorder); +__host__ void +prepareGeometryTextureObject(int size, + const float * geometry, + float *& dev_geom, + cudaTextureObject_t & tex_geom, + const unsigned int nParam); inline __device__ void matrix_matrix_multiply(float * A, float * B, float * C, unsigned int rowsA, unsigned int colsB, unsigned int colsArowsB) diff --git a/src/rtkCudaDisplacedDetectorImageFilter.cu b/src/rtkCudaDisplacedDetectorImageFilter.cu index 0d579c79c..b888487b3 100644 --- a/src/rtkCudaDisplacedDetectorImageFilter.cu +++ b/src/rtkCudaDisplacedDetectorImageFilter.cu @@ -22,8 +22,6 @@ #include #include -texture tex_geometry; // geometry texture - inline __device__ float TransformIndexToPhysicalPoint(int2 idx, float origin, float row, float column) { @@ -45,19 +43,20 @@ ToUntiltedCoordinateAtIsocenter(float tiltedCoord, float sdd, float sid, float s } __global__ void -kernel_displaced_weight(int3 proj_idx_in, - int3 proj_size_in, - int2 proj_size_in_buf, - int3 proj_idx_out, - int3 proj_size_out, - int2 proj_size_out_buf, - float * dev_proj_in, - float * dev_proj_out, - float theta, - bool isPositiveCase, - float proj_orig, // projection origin - float proj_row, // projection row direction & spacing - float proj_col // projection col direction & spacing +kernel_displaced_weight(int3 proj_idx_in, + int3 proj_size_in, + int2 proj_size_in_buf, + int3 proj_idx_out, + int3 proj_size_out, + int2 proj_size_out_buf, + float * dev_proj_in, + float * dev_proj_out, + float theta, + bool isPositiveCase, + float proj_orig, // projection origin + float proj_row, // projection row direction & spacing + float proj_col, // projection col direction & spacing + cudaTextureObject_t tex_geom // geometry texture object ) { // compute thread index @@ -90,10 +89,10 @@ kernel_displaced_weight(int3 proj_idx_in, { float pPoint = TransformIndexToPhysicalPoint(make_int2(pIdx.x, pIdx.y), proj_orig, proj_row, proj_col); - float sdd = tex1Dfetch(tex_geometry, tIdx.z * 4 + 0); - float sx = tex1Dfetch(tex_geometry, tIdx.z * 4 + 1); - float px = tex1Dfetch(tex_geometry, tIdx.z * 4 + 2); - float sid = tex1Dfetch(tex_geometry, tIdx.z * 4 + 3); + float sdd = tex1Dfetch(tex_geom, tIdx.z * 4 + 0); + float sx = tex1Dfetch(tex_geom, tIdx.z * 4 + 1); + float px = tex1Dfetch(tex_geom, tIdx.z * 4 + 2); + float sid = tex1Dfetch(tex_geom, tIdx.z * 4 + 3); float hyp = sqrtf(sid * sid + sx * sx); // to untilted situation float l = ToUntiltedCoordinateAtIsocenter(pPoint, sdd, sid, sx, px, hyp); @@ -139,7 +138,7 @@ CUDA_displaced_weight(int proj_idx_in[3], // overlapping input region i int proj_dim_out_buf[2], // output size of buffered region float * dev_proj_in, float * dev_proj_out, - float * geometries, + float * geometry, float theta, bool isPositiveCase, float proj_orig, @@ -147,13 +146,9 @@ CUDA_displaced_weight(int proj_idx_in[3], // overlapping input region i float proj_col) { // copy geometry matrix to device, bind the matrix to the texture - float * dev_geom; - cudaMalloc((void **)&dev_geom, proj_dim_out[2] * 4 * sizeof(float)); - CUDA_CHECK_ERROR; - cudaMemcpy(dev_geom, geometries, proj_dim_out[2] * 4 * sizeof(float), cudaMemcpyHostToDevice); - CUDA_CHECK_ERROR; - cudaBindTexture(0, tex_geometry, dev_geom, proj_dim_out[2] * 4 * sizeof(float)); - CUDA_CHECK_ERROR; + float * dev_geom; + cudaTextureObject_t tex_geom; + prepareGeometryTextureObject(proj_dim_out[2], geometry, dev_geom, tex_geom, 4); // Thread Block Dimensions int tBlock_x = 16; @@ -179,10 +174,11 @@ CUDA_displaced_weight(int proj_idx_in[3], // overlapping input region i isPositiveCase, proj_orig, proj_row, - proj_col); + proj_col, + tex_geom); // Unbind matrix texture - cudaUnbindTexture(tex_geometry); + cudaDestroyTextureObject(tex_geom); CUDA_CHECK_ERROR; cudaFree(dev_geom); CUDA_CHECK_ERROR; diff --git a/src/rtkCudaFDKBackProjectionImageFilter.cu b/src/rtkCudaFDKBackProjectionImageFilter.cu index 4a9d45601..417e7544f 100644 --- a/src/rtkCudaFDKBackProjectionImageFilter.cu +++ b/src/rtkCudaFDKBackProjectionImageFilter.cu @@ -39,9 +39,6 @@ *****************/ #include -// T E X T U R E S //////////////////////////////////////////////////////// -texture tex_proj; - // Constant memory __constant__ float c_matrices[SLAB_SIZE * 12]; // Can process stacks of at most SLAB_SIZE projections __constant__ int3 c_projSize; @@ -53,7 +50,7 @@ __constant__ int3 c_vol_size; //_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_ __global__ void -kernel_fdk_3Dgrid(float * dev_vol_in, float * dev_vol_out) +kernel_fdk_3Dgrid(float * dev_vol_in, float * dev_vol_out, cudaTextureObject_t tex_proj) { itk::SizeValueType i = __umul24(blockIdx.x, blockDim.x) + threadIdx.x; itk::SizeValueType j = __umul24(blockIdx.y, blockDim.y) + threadIdx.y; @@ -81,7 +78,7 @@ kernel_fdk_3Dgrid(float * dev_vol_in, float * dev_vol_out) ip.y = ip.y * ip.z; // Get texture point, clip left to GPU, and accumulate in voxel_data - voxel_data += tex2DLayered(tex_proj, ip.x, ip.y, proj) * ip.z * ip.z; + voxel_data += tex2DLayered(tex_proj, ip.x, ip.y, proj) * ip.z * ip.z; } // Place it into the volume @@ -110,33 +107,6 @@ CUDA_reconstruct_conebeam(int proj_size[3], // Copy the projection matrices into constant memory cudaMemcpyToSymbol(c_matrices, &(matrices[0]), 12 * sizeof(float) * proj_size[2]); - // set texture parameters - tex_proj.addressMode[0] = cudaAddressModeBorder; - tex_proj.addressMode[1] = cudaAddressModeBorder; - tex_proj.addressMode[2] = cudaAddressModeBorder; - tex_proj.filterMode = cudaFilterModeLinear; - tex_proj.normalized = false; // don't access with normalized texture coords - - // Copy projection data to array, bind the array to the texture - cudaExtent projExtent = make_cudaExtent(proj_size[0], proj_size[1], proj_size[2]); - cudaArray * array_proj; - static cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc(); - CUDA_CHECK_ERROR; - - // Allocate array for input projections, in order to bind them to - // a 2D layered texture - cudaMalloc3DArray((cudaArray **)&array_proj, &channelDesc, projExtent, cudaArrayLayered); - CUDA_CHECK_ERROR; - - // Copy data to 3D array - cudaMemcpy3DParms copyParams = cudaMemcpy3DParms(); - copyParams.srcPtr = make_cudaPitchedPtr(dev_proj, proj_size[0] * sizeof(float), proj_size[0], proj_size[1]); - copyParams.dstArray = (cudaArray *)array_proj; - copyParams.extent = projExtent; - copyParams.kind = cudaMemcpyDeviceToDevice; - cudaMemcpy3D(©Params); - CUDA_CHECK_ERROR; - // Thread Block Dimensions constexpr int tBlock_x = 16; constexpr int tBlock_y = 4; @@ -153,19 +123,15 @@ CUDA_reconstruct_conebeam(int proj_size[3], // Compute block and grid sizes dim3 dimGrid = dim3(blocksInX, blocksInY, blocksInZ); dim3 dimBlock = dim3(tBlock_x, tBlock_y, tBlock_z); - CUDA_CHECK_ERROR; - - // Bind the array of projections to a 2D layered texture - cudaBindTextureToArray(tex_proj, (cudaArray *)array_proj, channelDesc); - CUDA_CHECK_ERROR; - kernel_fdk_3Dgrid<<>>(dev_vol_in, dev_vol_out); - - // Unbind the image and projection matrix textures - cudaUnbindTexture(tex_proj); - CUDA_CHECK_ERROR; + cudaArray * array_proj; + cudaTextureObject_t tex_proj; + prepareScalarTextureObject(proj_size, dev_proj, array_proj, tex_proj, true); + kernel_fdk_3Dgrid<<>>(dev_vol_in, dev_vol_out, tex_proj); // Cleanup cudaFreeArray((cudaArray *)array_proj); CUDA_CHECK_ERROR; + cudaDestroyTextureObject(tex_proj); + CUDA_CHECK_ERROR; } diff --git a/src/rtkCudaFDKWeightProjectionFilter.cu b/src/rtkCudaFDKWeightProjectionFilter.cu index e4f4a45be..26e2e3b38 100644 --- a/src/rtkCudaFDKWeightProjectionFilter.cu +++ b/src/rtkCudaFDKWeightProjectionFilter.cu @@ -21,8 +21,6 @@ #include #include -texture tex_geometry; // geometry texture - inline __device__ float2 TransformIndexToPhysicalPoint(int2 idx, float2 origin, float2 row, float2 column) { @@ -30,15 +28,16 @@ inline __device__ float2 } __global__ void -kernel_weight_projection(int2 proj_idx, - int3 proj_size, - int2 proj_size_buf_in, - int2 proj_size_buf_out, - float * dev_proj_in, - float * dev_proj_out, - float2 proj_orig, // projection origin - float2 proj_row, // projection row direction & spacing - float2 proj_col // projection col direction & spacing +kernel_weight_projection(int2 proj_idx, + int3 proj_size, + int2 proj_size_buf_in, + int2 proj_size_buf_out, + float * dev_proj_in, + float * dev_proj_out, + float2 proj_orig, // projection origin + float2 proj_row, // projection row direction & spacing + float2 proj_col, // projection col direction & spacing + cudaTextureObject_t tex_geom // geometry texture object ) { // compute projection index (== thread index) @@ -53,19 +52,19 @@ kernel_weight_projection(int2 proj_idx, if (pIdx.x >= proj_size.x || pIdx.y >= proj_size.y || pIdx.z >= proj_size.z) return; - const float sdd = tex1Dfetch(tex_geometry, pIdx.z * 7 + 0); - const float sid = tex1Dfetch(tex_geometry, pIdx.z * 7 + 1); - const float wFac = tex1Dfetch(tex_geometry, pIdx.z * 7 + 5); + const float sdd = tex1Dfetch(tex_geom, pIdx.z * 7); + const float sid = tex1Dfetch(tex_geom, pIdx.z * 7 + 1); + const float wFac = tex1Dfetch(tex_geom, pIdx.z * 7 + 5); if (sdd == 0) // parallel { dev_proj_out[pIdx_comp_out] = dev_proj_in[pIdx_comp_in] * wFac; } else // divergent { - const float pOffX = tex1Dfetch(tex_geometry, pIdx.z * 7 + 2); - const float pOffY = tex1Dfetch(tex_geometry, pIdx.z * 7 + 3); - const float sOffY = tex1Dfetch(tex_geometry, pIdx.z * 7 + 4); - const float tAngle = tex1Dfetch(tex_geometry, pIdx.z * 7 + 6); + const float pOffX = tex1Dfetch(tex_geom, pIdx.z * 7 + 2); + const float pOffY = tex1Dfetch(tex_geom, pIdx.z * 7 + 3); + const float sOffY = tex1Dfetch(tex_geom, pIdx.z * 7 + 4); + const float tAngle = tex1Dfetch(tex_geom, pIdx.z * 7 + 6); const float sina = sin(tAngle); const float cosa = cos(tAngle); const float tana = tan(tAngle); @@ -90,19 +89,14 @@ CUDA_weight_projection(int proj_idx[2], int proj_dim_buf_out[2], float * dev_proj_in, float * dev_proj_out, - float * geometries, + float * geometry, float proj_orig[2], float proj_row[2], float proj_col[2]) { - // copy geometry matrix to device, bind the matrix to the texture - float * dev_geom; - cudaMalloc((void **)&dev_geom, proj_dim[2] * 7 * sizeof(float)); - CUDA_CHECK_ERROR; - cudaMemcpy(dev_geom, geometries, proj_dim[2] * 7 * sizeof(float), cudaMemcpyHostToDevice); - CUDA_CHECK_ERROR; - cudaBindTexture(0, tex_geometry, dev_geom, proj_dim[2] * 7 * sizeof(float)); - CUDA_CHECK_ERROR; + float * dev_geom; + cudaTextureObject_t tex_geom; + prepareGeometryTextureObject(proj_dim[2], geometry, dev_geom, tex_geom, 7); // Thread Block Dimensions int tBlock_x = 16; @@ -124,10 +118,11 @@ CUDA_weight_projection(int proj_idx[2], dev_proj_out, make_float2(proj_orig[0], proj_orig[1]), make_float2(proj_row[0], proj_row[1]), - make_float2(proj_col[0], proj_col[1])); + make_float2(proj_col[0], proj_col[1]), + tex_geom); - // Unbind matrix texture - cudaUnbindTexture(tex_geometry); + // destroy texture object + cudaDestroyTextureObject(tex_geom); CUDA_CHECK_ERROR; cudaFree(dev_geom); CUDA_CHECK_ERROR; diff --git a/src/rtkCudaForwardProjectionImageFilter.cu b/src/rtkCudaForwardProjectionImageFilter.cu index 3940d0f0d..869d263f9 100644 --- a/src/rtkCudaForwardProjectionImageFilter.cu +++ b/src/rtkCudaForwardProjectionImageFilter.cu @@ -36,7 +36,6 @@ * CUDA #includes * *****************/ #include -#include #include // CONSTANTS // @@ -201,7 +200,7 @@ CUDA_forward_project(int projSize[3], // Prepare texture objects std::vector volComponentArrays; std::vector tex_vol; - prepareVectorTextureObject(volSize, dev_vol, volComponentArrays, vectorLength, tex_vol, false); + prepareVectorTextureObject(volSize, dev_vol, volComponentArrays, vectorLength, tex_vol, false, cudaAddressModeClamp); // Copy them to a device pointer, since it will have to be de-referenced in the kernels cudaTextureObject_t * dev_tex_vol; diff --git a/src/rtkCudaForwardWarpImageFilter.cu b/src/rtkCudaForwardWarpImageFilter.cu index 1f9a04956..530dc33b6 100644 --- a/src/rtkCudaForwardWarpImageFilter.cu +++ b/src/rtkCudaForwardWarpImageFilter.cu @@ -39,15 +39,11 @@ *****************/ #include -// T E X T U R E S //////////////////////////////////////////////////////// -texture tex_IndexInputToPPInputMatrix; -texture tex_IndexInputToIndexDVFMatrix; -texture tex_PPOutputToIndexOutputMatrix; - -texture tex_xdvf; -texture tex_ydvf; -texture tex_zdvf; -/////////////////////////////////////////////////////////////////////////// +// CONSTANTS ////////////////////////////////////////////////////////////// +__constant__ float c_IndexInputToPPInputMatrix[12]; +__constant__ float c_IndexInputToIndexDVFMatrix[12]; +__constant__ float c_PPOutputToIndexOutputMatrix[12]; +//////////////////////////////////////////////////////////////////////////// //_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_ // K E R N E L S -_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_ @@ -129,7 +125,14 @@ normalize_3Dgrid(float * dev_vol_out, float * dev_accumulate_weights, int3 out_d } __global__ void -linearSplat_3Dgrid(float * dev_vol_in, float * dev_vol_out, float * dev_accumulate_weights, int3 in_dim, int3 out_dim) +linearSplat_3Dgrid(float * dev_vol_in, + float * dev_vol_out, + float * dev_accumulate_weights, + int3 in_dim, + int3 out_dim, + cudaTextureObject_t tex_xdvf, + cudaTextureObject_t tex_ydvf, + cudaTextureObject_t tex_zdvf) { unsigned int i = __umul24(blockIdx.x, blockDim.x) + threadIdx.x; unsigned int j = __umul24(blockIdx.y, blockDim.y) + threadIdx.y; @@ -141,32 +144,21 @@ linearSplat_3Dgrid(float * dev_vol_in, float * dev_vol_out, float * dev_accumula } // Index row major into the volume + float3 idx = make_float3(i, j, k); long int in_idx = i + (j + k * in_dim.y) * (in_dim.x); // Matrix multiply to get the index in the DVF texture of the current point in the output volume - float3 IndexInDVF; - IndexInDVF.x = tex1Dfetch(tex_IndexInputToIndexDVFMatrix, 0) * i + tex1Dfetch(tex_IndexInputToIndexDVFMatrix, 1) * j + - tex1Dfetch(tex_IndexInputToIndexDVFMatrix, 2) * k + tex1Dfetch(tex_IndexInputToIndexDVFMatrix, 3); - IndexInDVF.y = tex1Dfetch(tex_IndexInputToIndexDVFMatrix, 4) * i + tex1Dfetch(tex_IndexInputToIndexDVFMatrix, 5) * j + - tex1Dfetch(tex_IndexInputToIndexDVFMatrix, 6) * k + tex1Dfetch(tex_IndexInputToIndexDVFMatrix, 7); - IndexInDVF.z = tex1Dfetch(tex_IndexInputToIndexDVFMatrix, 8) * i + tex1Dfetch(tex_IndexInputToIndexDVFMatrix, 9) * j + - tex1Dfetch(tex_IndexInputToIndexDVFMatrix, 10) * k + tex1Dfetch(tex_IndexInputToIndexDVFMatrix, 11); + float3 IndexInDVF = matrix_multiply(idx, c_IndexInputToIndexDVFMatrix); // Get each component of the displacement vector by // interpolation in the dvf float3 Displacement; - Displacement.x = tex3D(tex_xdvf, IndexInDVF.x + 0.5f, IndexInDVF.y + 0.5f, IndexInDVF.z + 0.5f); - Displacement.y = tex3D(tex_ydvf, IndexInDVF.x + 0.5f, IndexInDVF.y + 0.5f, IndexInDVF.z + 0.5f); - Displacement.z = tex3D(tex_zdvf, IndexInDVF.x + 0.5f, IndexInDVF.y + 0.5f, IndexInDVF.z + 0.5f); + Displacement.x = tex3D(tex_xdvf, IndexInDVF.x + 0.5f, IndexInDVF.y + 0.5f, IndexInDVF.z + 0.5f); + Displacement.y = tex3D(tex_ydvf, IndexInDVF.x + 0.5f, IndexInDVF.y + 0.5f, IndexInDVF.z + 0.5f); + Displacement.z = tex3D(tex_zdvf, IndexInDVF.x + 0.5f, IndexInDVF.y + 0.5f, IndexInDVF.z + 0.5f); // Matrix multiply to get the physical coordinates of the current point in the input volume - float3 PPinInput; - PPinInput.x = tex1Dfetch(tex_IndexInputToPPInputMatrix, 0) * i + tex1Dfetch(tex_IndexInputToPPInputMatrix, 1) * j + - tex1Dfetch(tex_IndexInputToPPInputMatrix, 2) * k + tex1Dfetch(tex_IndexInputToPPInputMatrix, 3); - PPinInput.y = tex1Dfetch(tex_IndexInputToPPInputMatrix, 4) * i + tex1Dfetch(tex_IndexInputToPPInputMatrix, 5) * j + - tex1Dfetch(tex_IndexInputToPPInputMatrix, 6) * k + tex1Dfetch(tex_IndexInputToPPInputMatrix, 7); - PPinInput.z = tex1Dfetch(tex_IndexInputToPPInputMatrix, 8) * i + tex1Dfetch(tex_IndexInputToPPInputMatrix, 9) * j + - tex1Dfetch(tex_IndexInputToPPInputMatrix, 10) * k + tex1Dfetch(tex_IndexInputToPPInputMatrix, 11); + float3 PPinInput = matrix_multiply(idx, c_IndexInputToPPInputMatrix); // Get the index corresponding to the current physical point in output displaced by the displacement vector float3 PPDisplaced; @@ -174,19 +166,7 @@ linearSplat_3Dgrid(float * dev_vol_in, float * dev_vol_out, float * dev_accumula PPDisplaced.y = PPinInput.y + Displacement.y; PPDisplaced.z = PPinInput.z + Displacement.z; - float3 IndexInOutput; - IndexInOutput.x = tex1Dfetch(tex_PPOutputToIndexOutputMatrix, 0) * PPDisplaced.x + - tex1Dfetch(tex_PPOutputToIndexOutputMatrix, 1) * PPDisplaced.y + - tex1Dfetch(tex_PPOutputToIndexOutputMatrix, 2) * PPDisplaced.z + - tex1Dfetch(tex_PPOutputToIndexOutputMatrix, 3); - IndexInOutput.y = tex1Dfetch(tex_PPOutputToIndexOutputMatrix, 4) * PPDisplaced.x + - tex1Dfetch(tex_PPOutputToIndexOutputMatrix, 5) * PPDisplaced.y + - tex1Dfetch(tex_PPOutputToIndexOutputMatrix, 6) * PPDisplaced.z + - tex1Dfetch(tex_PPOutputToIndexOutputMatrix, 7); - IndexInOutput.z = tex1Dfetch(tex_PPOutputToIndexOutputMatrix, 8) * PPDisplaced.x + - tex1Dfetch(tex_PPOutputToIndexOutputMatrix, 9) * PPDisplaced.y + - tex1Dfetch(tex_PPOutputToIndexOutputMatrix, 10) * PPDisplaced.z + - tex1Dfetch(tex_PPOutputToIndexOutputMatrix, 11); + float3 IndexInOutput = matrix_multiply(PPDisplaced, c_PPOutputToIndexOutputMatrix); // Compute the splat weights int3 BaseIndexInOutput; @@ -303,11 +283,14 @@ linearSplat_3Dgrid(float * dev_vol_in, float * dev_vol_out, float * dev_accumula } __global__ void -nearestNeighborSplat_3Dgrid(float * dev_vol_in, - float * dev_vol_out, - float * dev_accumulate_weights, - int3 in_dim, - int3 out_dim) +nearestNeighborSplat_3Dgrid(float * dev_vol_in, + float * dev_vol_out, + float * dev_accumulate_weights, + int3 in_dim, + int3 out_dim, + cudaTextureObject_t tex_xdvf, + cudaTextureObject_t tex_ydvf, + cudaTextureObject_t tex_zdvf) { unsigned int i = __umul24(blockIdx.x, blockDim.x) + threadIdx.x; unsigned int j = __umul24(blockIdx.y, blockDim.y) + threadIdx.y; @@ -322,29 +305,18 @@ nearestNeighborSplat_3Dgrid(float * dev_vol_in, long int in_idx = i + (j + k * in_dim.y) * (in_dim.x); // Matrix multiply to get the index in the DVF texture of the current point in the output volume - float3 IndexInDVF; - IndexInDVF.x = tex1Dfetch(tex_IndexInputToIndexDVFMatrix, 0) * i + tex1Dfetch(tex_IndexInputToIndexDVFMatrix, 1) * j + - tex1Dfetch(tex_IndexInputToIndexDVFMatrix, 2) * k + tex1Dfetch(tex_IndexInputToIndexDVFMatrix, 3); - IndexInDVF.y = tex1Dfetch(tex_IndexInputToIndexDVFMatrix, 4) * i + tex1Dfetch(tex_IndexInputToIndexDVFMatrix, 5) * j + - tex1Dfetch(tex_IndexInputToIndexDVFMatrix, 6) * k + tex1Dfetch(tex_IndexInputToIndexDVFMatrix, 7); - IndexInDVF.z = tex1Dfetch(tex_IndexInputToIndexDVFMatrix, 8) * i + tex1Dfetch(tex_IndexInputToIndexDVFMatrix, 9) * j + - tex1Dfetch(tex_IndexInputToIndexDVFMatrix, 10) * k + tex1Dfetch(tex_IndexInputToIndexDVFMatrix, 11); + float3 idx = make_float3(i, j, k); + float3 IndexInDVF = matrix_multiply(idx, c_IndexInputToIndexDVFMatrix); // Get each component of the displacement vector by // interpolation in the dvf float3 Displacement; - Displacement.x = tex3D(tex_xdvf, IndexInDVF.x + 0.5f, IndexInDVF.y + 0.5f, IndexInDVF.z + 0.5f); - Displacement.y = tex3D(tex_ydvf, IndexInDVF.x + 0.5f, IndexInDVF.y + 0.5f, IndexInDVF.z + 0.5f); - Displacement.z = tex3D(tex_zdvf, IndexInDVF.x + 0.5f, IndexInDVF.y + 0.5f, IndexInDVF.z + 0.5f); + Displacement.x = tex3D(tex_xdvf, IndexInDVF.x + 0.5f, IndexInDVF.y + 0.5f, IndexInDVF.z + 0.5f); + Displacement.y = tex3D(tex_ydvf, IndexInDVF.x + 0.5f, IndexInDVF.y + 0.5f, IndexInDVF.z + 0.5f); + Displacement.z = tex3D(tex_zdvf, IndexInDVF.x + 0.5f, IndexInDVF.y + 0.5f, IndexInDVF.z + 0.5f); // Matrix multiply to get the physical coordinates of the current point in the input volume - float3 PPinInput; - PPinInput.x = tex1Dfetch(tex_IndexInputToPPInputMatrix, 0) * i + tex1Dfetch(tex_IndexInputToPPInputMatrix, 1) * j + - tex1Dfetch(tex_IndexInputToPPInputMatrix, 2) * k + tex1Dfetch(tex_IndexInputToPPInputMatrix, 3); - PPinInput.y = tex1Dfetch(tex_IndexInputToPPInputMatrix, 4) * i + tex1Dfetch(tex_IndexInputToPPInputMatrix, 5) * j + - tex1Dfetch(tex_IndexInputToPPInputMatrix, 6) * k + tex1Dfetch(tex_IndexInputToPPInputMatrix, 7); - PPinInput.z = tex1Dfetch(tex_IndexInputToPPInputMatrix, 8) * i + tex1Dfetch(tex_IndexInputToPPInputMatrix, 9) * j + - tex1Dfetch(tex_IndexInputToPPInputMatrix, 10) * k + tex1Dfetch(tex_IndexInputToPPInputMatrix, 11); + float3 PPinInput = matrix_multiply(idx, c_IndexInputToPPInputMatrix); // Get the index corresponding to the current physical point in output displaced by the displacement vector float3 PPDisplaced; @@ -352,19 +324,10 @@ nearestNeighborSplat_3Dgrid(float * dev_vol_in, PPDisplaced.y = PPinInput.y + Displacement.y; PPDisplaced.z = PPinInput.z + Displacement.z; - float3 IndexInOutput; - IndexInOutput.x = floor(tex1Dfetch(tex_PPOutputToIndexOutputMatrix, 0) * PPDisplaced.x + - tex1Dfetch(tex_PPOutputToIndexOutputMatrix, 1) * PPDisplaced.y + - tex1Dfetch(tex_PPOutputToIndexOutputMatrix, 2) * PPDisplaced.z + - tex1Dfetch(tex_PPOutputToIndexOutputMatrix, 3) + 0.5); - IndexInOutput.y = floor(tex1Dfetch(tex_PPOutputToIndexOutputMatrix, 4) * PPDisplaced.x + - tex1Dfetch(tex_PPOutputToIndexOutputMatrix, 5) * PPDisplaced.y + - tex1Dfetch(tex_PPOutputToIndexOutputMatrix, 6) * PPDisplaced.z + - tex1Dfetch(tex_PPOutputToIndexOutputMatrix, 7) + 0.5); - IndexInOutput.z = floor(tex1Dfetch(tex_PPOutputToIndexOutputMatrix, 8) * PPDisplaced.x + - tex1Dfetch(tex_PPOutputToIndexOutputMatrix, 9) * PPDisplaced.y + - tex1Dfetch(tex_PPOutputToIndexOutputMatrix, 10) * PPDisplaced.z + - tex1Dfetch(tex_PPOutputToIndexOutputMatrix, 11) + 0.5); + float3 IndexInOutput = matrix_multiply(PPDisplaced, c_PPOutputToIndexOutputMatrix); + IndexInOutput.x = floor(IndexInOutput.x + 0.5); + IndexInOutput.y = floor(IndexInOutput.y + 0.5); + IndexInOutput.z = floor(IndexInOutput.z + 0.5); bool isInVolume = (IndexInOutput.x >= 0) && (IndexInOutput.x < out_dim.x) && (IndexInOutput.y >= 0) && (IndexInOutput.y < out_dim.y) && (IndexInOutput.z >= 0) && (IndexInOutput.z < out_dim.z); @@ -400,101 +363,29 @@ CUDA_ForwardWarp(int input_vol_dim[3], float * dev_output_vol, bool isLinear) { - // Prepare channel description for arrays static cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc(); /////////////////////////////////// - // For each component of the dvf, perform a strided copy (pick every third - // float from dev_input_dvf) into a 3D array, and bind the array to a 3D texture - // Extent stuff, will be used for each component extraction - cudaExtent dvfExtent = make_cudaExtent(input_dvf_dim[0], input_dvf_dim[1], input_dvf_dim[2]); - - // Set texture parameters - tex_xdvf.addressMode[0] = cudaAddressModeBorder; - tex_xdvf.addressMode[1] = cudaAddressModeBorder; - tex_xdvf.addressMode[2] = cudaAddressModeBorder; - tex_xdvf.filterMode = cudaFilterModeLinear; - tex_xdvf.normalized = false; // don't access with normalized texture coords - - tex_ydvf.addressMode[0] = cudaAddressModeBorder; - tex_ydvf.addressMode[1] = cudaAddressModeBorder; - tex_ydvf.addressMode[2] = cudaAddressModeBorder; - tex_ydvf.filterMode = cudaFilterModeLinear; - tex_ydvf.normalized = false; - - tex_zdvf.addressMode[0] = cudaAddressModeBorder; - tex_zdvf.addressMode[1] = cudaAddressModeBorder; - tex_zdvf.addressMode[2] = cudaAddressModeBorder; - tex_zdvf.filterMode = cudaFilterModeLinear; - tex_zdvf.normalized = false; - - // Allocate the arrays - cudaArray * array_xdvf; - cudaArray * array_ydvf; - cudaArray * array_zdvf; - cudaMalloc3DArray((cudaArray **)&array_xdvf, &channelDesc, dvfExtent); - cudaMalloc3DArray((cudaArray **)&array_ydvf, &channelDesc, dvfExtent); - cudaMalloc3DArray((cudaArray **)&array_zdvf, &channelDesc, dvfExtent); - CUDA_CHECK_ERROR; - - // Copy image data to arrays. For a nice explanation on make_cudaPitchedPtr, checkout - // https://stackoverflow.com/questions/16119943/how-and-when-should-i-use-pitched-pointer-with-the-cuda-api - cudaMemcpy3DParms xCopyParams = cudaMemcpy3DParms(); - xCopyParams.srcPtr = - make_cudaPitchedPtr(dev_input_xdvf, input_dvf_dim[0] * sizeof(float), input_dvf_dim[0], input_dvf_dim[1]); - xCopyParams.dstArray = (cudaArray *)array_xdvf; - xCopyParams.extent = dvfExtent; - xCopyParams.kind = cudaMemcpyDeviceToDevice; - cudaMemcpy3D(&xCopyParams); - CUDA_CHECK_ERROR; - - cudaMemcpy3DParms yCopyParams = cudaMemcpy3DParms(); - yCopyParams.srcPtr = - make_cudaPitchedPtr(dev_input_ydvf, input_dvf_dim[0] * sizeof(float), input_dvf_dim[0], input_dvf_dim[1]); - yCopyParams.dstArray = (cudaArray *)array_ydvf; - yCopyParams.extent = dvfExtent; - yCopyParams.kind = cudaMemcpyDeviceToDevice; - cudaMemcpy3D(&yCopyParams); - CUDA_CHECK_ERROR; - - cudaMemcpy3DParms zCopyParams = cudaMemcpy3DParms(); - zCopyParams.srcPtr = - make_cudaPitchedPtr(dev_input_zdvf, input_dvf_dim[0] * sizeof(float), input_dvf_dim[0], input_dvf_dim[1]); - zCopyParams.dstArray = (cudaArray *)array_zdvf; - zCopyParams.extent = dvfExtent; - zCopyParams.kind = cudaMemcpyDeviceToDevice; - cudaMemcpy3D(&zCopyParams); - CUDA_CHECK_ERROR; - - // Bind 3D arrays to 3D textures - cudaBindTextureToArray(tex_xdvf, (cudaArray *)array_xdvf, channelDesc); - cudaBindTextureToArray(tex_ydvf, (cudaArray *)array_ydvf, channelDesc); - cudaBindTextureToArray(tex_zdvf, (cudaArray *)array_zdvf, channelDesc); - CUDA_CHECK_ERROR; + cudaArray * array_xdvf, *array_ydvf, *array_zdvf; + cudaTextureObject_t tex_xdvf, tex_ydvf, tex_zdvf; + prepareScalarTextureObject(input_dvf_dim, dev_input_xdvf, array_xdvf, tex_xdvf, false); + prepareScalarTextureObject(input_dvf_dim, dev_input_ydvf, array_ydvf, tex_ydvf, false); + prepareScalarTextureObject(input_dvf_dim, dev_input_zdvf, array_zdvf, tex_zdvf, false); /////////////////////////////////////// - // Copy matrices, bind them to textures - - float * dev_IndexInputToPPInput; - cudaMalloc((void **)&dev_IndexInputToPPInput, 12 * sizeof(float)); - cudaMemcpy(dev_IndexInputToPPInput, IndexInputToPPInputMatrix, 12 * sizeof(float), cudaMemcpyHostToDevice); - cudaBindTexture(0, tex_IndexInputToPPInputMatrix, dev_IndexInputToPPInput, 12 * sizeof(float)); - - float * dev_IndexInputToIndexDVF; - cudaMalloc((void **)&dev_IndexInputToIndexDVF, 12 * sizeof(float)); - cudaMemcpy(dev_IndexInputToIndexDVF, IndexInputToIndexDVFMatrix, 12 * sizeof(float), cudaMemcpyHostToDevice); - cudaBindTexture(0, tex_IndexInputToIndexDVFMatrix, dev_IndexInputToIndexDVF, 12 * sizeof(float)); - - float * dev_PPOutputToIndexOutput; - cudaMalloc((void **)&dev_PPOutputToIndexOutput, 12 * sizeof(float)); - cudaMemcpy(dev_PPOutputToIndexOutput, PPOutputToIndexOutputMatrix, 12 * sizeof(float), cudaMemcpyHostToDevice); - cudaBindTexture(0, tex_PPOutputToIndexOutputMatrix, dev_PPOutputToIndexOutput, 12 * sizeof(float)); + // Copy matrices into constant memory + cudaMemcpyToSymbol( + c_IndexInputToPPInputMatrix, IndexInputToPPInputMatrix, 12 * sizeof(float), 0, cudaMemcpyHostToDevice); + cudaMemcpyToSymbol( + c_IndexInputToIndexDVFMatrix, IndexInputToIndexDVFMatrix, 12 * sizeof(float), 0, cudaMemcpyHostToDevice); + cudaMemcpyToSymbol( + c_PPOutputToIndexOutputMatrix, PPOutputToIndexOutputMatrix, 12 * sizeof(float), 0, cudaMemcpyHostToDevice); + CUDA_CHECK_ERROR; /////////////////////////////////////// /// Initialize the output - cudaMemset((void *)dev_output_vol, 0, sizeof(float) * output_vol_dim[0] * output_vol_dim[1] * output_vol_dim[2]); ////////////////////////////////////// @@ -507,12 +398,6 @@ CUDA_ForwardWarp(int input_vol_dim[3], cudaMemset( (void *)dev_accumulate_weights, 0, sizeof(float) * output_vol_dim[0] * output_vol_dim[1] * output_vol_dim[2]); - ////////////////////////////////////// - /// Run - - int device; - cudaGetDevice(&device); - // Thread Block Dimensions constexpr int tBlock_x = 16; constexpr int tBlock_y = 4; @@ -533,14 +418,20 @@ CUDA_ForwardWarp(int input_vol_dim[3], dev_output_vol, dev_accumulate_weights, make_int3(input_vol_dim[0], input_vol_dim[1], input_vol_dim[2]), - make_int3(output_vol_dim[0], output_vol_dim[1], output_vol_dim[2])); + make_int3(output_vol_dim[0], output_vol_dim[1], output_vol_dim[2]), + tex_xdvf, + tex_ydvf, + tex_zdvf); else nearestNeighborSplat_3Dgrid<<>>( dev_input_vol, dev_output_vol, dev_accumulate_weights, make_int3(input_vol_dim[0], input_vol_dim[1], input_vol_dim[2]), - make_int3(output_vol_dim[0], output_vol_dim[1], output_vol_dim[2])); + make_int3(output_vol_dim[0], output_vol_dim[1], output_vol_dim[2]), + tex_xdvf, + tex_ydvf, + tex_zdvf); CUDA_CHECK_ERROR; @@ -552,25 +443,19 @@ CUDA_ForwardWarp(int input_vol_dim[3], dev_output_vol, dev_accumulate_weights, make_int3(output_vol_dim[0], output_vol_dim[1], output_vol_dim[2])); CUDA_CHECK_ERROR; - // Unbind the image and projection matrix textures - cudaUnbindTexture(tex_xdvf); - cudaUnbindTexture(tex_ydvf); - cudaUnbindTexture(tex_zdvf); + // Cleanup + cudaFreeArray(array_xdvf); CUDA_CHECK_ERROR; - cudaUnbindTexture(tex_IndexInputToPPInputMatrix); - cudaUnbindTexture(tex_PPOutputToIndexOutputMatrix); - cudaUnbindTexture(tex_IndexInputToIndexDVFMatrix); + cudaFreeArray(array_ydvf); CUDA_CHECK_ERROR; - - // Cleanup - cudaFreeArray((cudaArray *)array_xdvf); - cudaFreeArray((cudaArray *)array_ydvf); - cudaFreeArray((cudaArray *)array_zdvf); + cudaFreeArray(array_zdvf); CUDA_CHECK_ERROR; cudaFree(dev_accumulate_weights); CUDA_CHECK_ERROR; - cudaFree(dev_IndexInputToPPInput); - cudaFree(dev_PPOutputToIndexOutput); - cudaFree(dev_IndexInputToIndexDVF); + cudaDestroyTextureObject(tex_xdvf); + CUDA_CHECK_ERROR; + cudaDestroyTextureObject(tex_ydvf); + CUDA_CHECK_ERROR; + cudaDestroyTextureObject(tex_zdvf); CUDA_CHECK_ERROR; } diff --git a/src/rtkCudaParkerShortScanImageFilter.cu b/src/rtkCudaParkerShortScanImageFilter.cu index 8e3c6f734..be4656371 100644 --- a/src/rtkCudaParkerShortScanImageFilter.cu +++ b/src/rtkCudaParkerShortScanImageFilter.cu @@ -22,8 +22,6 @@ #include #include -texture tex_geometry; // geometry texture - inline __device__ float TransformIndexToPhysicalPoint(int2 idx, float origin, float row, float column) { @@ -45,17 +43,18 @@ ToUntiltedCoordinateAtIsocenter(float tiltedCoord, float sdd, float sid, float s } __global__ void -kernel_parker_weight(int2 proj_idx, - int3 proj_size, - int2 proj_size_buf_in, - int2 proj_size_buf_out, - float * dev_proj_in, - float * dev_proj_out, - float delta, - float firstAngle, - float proj_orig, // projection origin - float proj_row, // projection row direction & spacing - float proj_col // projection col direction & spacing +kernel_parker_weight(int2 proj_idx, + int3 proj_size, + int2 proj_size_buf_in, + int2 proj_size_buf_out, + float * dev_proj_in, + float * dev_proj_out, + float delta, + float firstAngle, + float proj_orig, // projection origin + float proj_row, // projection row direction & spacing + float proj_col, // projection col direction & spacing + cudaTextureObject_t tex_geom // geometry texture object ) { // compute projection index (== thread index) @@ -70,10 +69,10 @@ kernel_parker_weight(int2 proj_idx, if (pIdx.x >= proj_size.x || pIdx.y >= proj_size.y || pIdx.z >= proj_size.z) return; - float sdd = tex1Dfetch(tex_geometry, pIdx.z * 5 + 0); - float sx = tex1Dfetch(tex_geometry, pIdx.z * 5 + 1); - float px = tex1Dfetch(tex_geometry, pIdx.z * 5 + 2); - float sid = tex1Dfetch(tex_geometry, pIdx.z * 5 + 3); + float sdd = tex1Dfetch(tex_geom, pIdx.z * 5 + 0); + float sx = tex1Dfetch(tex_geom, pIdx.z * 5 + 1); + float px = tex1Dfetch(tex_geom, pIdx.z * 5 + 2); + float sid = tex1Dfetch(tex_geom, pIdx.z * 5 + 3); // convert actual index to point float pPoint = @@ -86,7 +85,7 @@ kernel_parker_weight(int2 proj_idx, float alpha = atan(-1 * l * invsid); // beta projection angle: Parker's article assumes that the scan starts at 0 - float beta = tex1Dfetch(tex_geometry, pIdx.z * 5 + 4); + float beta = tex1Dfetch(tex_geom, pIdx.z * 5 + 4); beta -= firstAngle; if (beta < 0) beta += (2.f * CUDART_PI_F); @@ -111,21 +110,16 @@ CUDA_parker_weight(int proj_idx[2], int proj_dim_buf_out[2], float * dev_proj_in, float * dev_proj_out, - float * geometries, + float * geometry, float delta, float firstAngle, float proj_orig, float proj_row, float proj_col) { - // copy geometry matrix to device, bind the matrix to the texture - float * dev_geom; - cudaMalloc((void **)&dev_geom, proj_dim[2] * 5 * sizeof(float)); - CUDA_CHECK_ERROR; - cudaMemcpy(dev_geom, geometries, proj_dim[2] * 5 * sizeof(float), cudaMemcpyHostToDevice); - CUDA_CHECK_ERROR; - cudaBindTexture(0, tex_geometry, dev_geom, proj_dim[2] * 5 * sizeof(float)); - CUDA_CHECK_ERROR; + float * dev_geom; + cudaTextureObject_t tex_geom; + prepareGeometryTextureObject(proj_dim[2], geometry, dev_geom, tex_geom, 5); // Thread Block Dimensions int tBlock_x = 16; @@ -139,6 +133,7 @@ CUDA_parker_weight(int proj_idx[2], dim3 dimGrid = dim3(blocksInX, blocksInY, blocksInZ); dim3 dimBlock = dim3(tBlock_x, tBlock_y, tBlock_z); + kernel_parker_weight<<>>(make_int2(proj_idx[0], proj_idx[1]), make_int3(proj_dim[0], proj_dim[1], proj_dim[2]), make_int2(proj_dim_buf_in[0], proj_dim_buf_in[1]), @@ -149,10 +144,11 @@ CUDA_parker_weight(int proj_idx[2], firstAngle, proj_orig, proj_row, - proj_col); + proj_col, + tex_geom); - // Unbind matrix texture - cudaUnbindTexture(tex_geometry); + // destroy texture object + cudaDestroyTextureObject(tex_geom); CUDA_CHECK_ERROR; cudaFree(dev_geom); CUDA_CHECK_ERROR; diff --git a/src/rtkCudaUtilities.cu b/src/rtkCudaUtilities.cu index 4f3938d05..020fac6cc 100644 --- a/src/rtkCudaUtilities.cu +++ b/src/rtkCudaUtilities.cu @@ -17,6 +17,7 @@ *=========================================================================*/ #include "rtkCudaUtilities.hcu" +#include std::vector GetListOfCudaDevices() @@ -76,13 +77,73 @@ GetFreeGPUGlobalMemory(int device) return free; } +__host__ void +prepareScalarTextureObject(int size[3], + float * dev_ptr, + cudaArray *& threeDArray, + cudaTextureObject_t & tex, + const bool isProjections, + const bool isLinear, + const cudaTextureAddressMode texAddressMode) +{ + // create texture object + cudaResourceDesc resDesc = {}; + resDesc.resType = cudaResourceTypeArray; + + cudaTextureDesc texDesc = {}; + texDesc.readMode = cudaReadModeElementType; + + for (int component = 0; component < 3; component++) + texDesc.addressMode[component] = texAddressMode; + if (isLinear) + texDesc.filterMode = cudaFilterModeLinear; + else + texDesc.filterMode = cudaFilterModePoint; + + static cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc(32, 0, 0, 0, cudaChannelFormatKindFloat); + cudaExtent volExtent = make_cudaExtent(size[0], size[1], size[2]); + + // Allocate an intermediate memory space to extract the components of the input volume + float * singleComponent; + int numel = size[0] * size[1] * size[2]; + cudaMalloc(&singleComponent, numel * sizeof(float)); + CUDA_CHECK_ERROR; + + // Copy image data to arrays. The tricky part is the make_cudaPitchedPtr. + // The best way to understand it is to read + // https://stackoverflow.com/questions/16119943/how-and-when-should-i-use-pitched-pointer-with-the-cuda-api + + // Allocate the cudaArray. Projections use layered arrays, volumes use default 3D arrays + if (isProjections) + cudaMalloc3DArray(&threeDArray, &channelDesc, volExtent, cudaArrayLayered); + else + cudaMalloc3DArray(&threeDArray, &channelDesc, volExtent); + CUDA_CHECK_ERROR; + + // Fill it with the current singleComponent + cudaMemcpy3DParms CopyParams = {}; + CopyParams.srcPtr = make_cudaPitchedPtr(dev_ptr, size[0] * sizeof(float), size[0], size[1]); + CUDA_CHECK_ERROR; + CopyParams.dstArray = threeDArray; + CopyParams.extent = volExtent; + CopyParams.kind = cudaMemcpyDeviceToDevice; + cudaMemcpy3D(&CopyParams); + CUDA_CHECK_ERROR; + + // Fill in the texture object with all this information + resDesc.res.array.array = threeDArray; + cudaCreateTextureObject(&tex, &resDesc, &texDesc, NULL); + CUDA_CHECK_ERROR; +} + __host__ void prepareVectorTextureObject(int size[3], const float * dev_ptr, std::vector & componentArrays, const unsigned int nComponents, std::vector & tex, - bool isProjections) + const bool isProjections, + const cudaTextureAddressMode texAddressMode) { componentArrays.resize(nComponents); tex.resize(nComponents); @@ -92,20 +153,13 @@ prepareVectorTextureObject(int size[3], cublasCreate(&handle); // create texture object - cudaResourceDesc resDesc; - memset(&resDesc, 0, sizeof(resDesc)); + cudaResourceDesc resDesc = {}; resDesc.resType = cudaResourceTypeArray; - cudaTextureDesc texDesc; - memset(&texDesc, 0, sizeof(texDesc)); + cudaTextureDesc texDesc = {}; texDesc.readMode = cudaReadModeElementType; for (int component = 0; component < 3; component++) - { - if (isProjections) - texDesc.addressMode[component] = cudaAddressModeBorder; - else - texDesc.addressMode[component] = cudaAddressModeClamp; - } + texDesc.addressMode[component] = texAddressMode; texDesc.filterMode = cudaFilterModeLinear; static cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc(32, 0, 0, 0, cudaChannelFormatKindFloat); @@ -127,7 +181,7 @@ prepareVectorTextureObject(int size[3], cudaMemset((void *)singleComponent, 0, numel * sizeof(float)); // Fill it with the current component - float * pComponent = dev_ptr + component; + const float * pComponent = dev_ptr + component; cublasSaxpy(handle, numel, &one, pComponent, nComponents, singleComponent, 1); // Allocate the cudaArray. Projections use layered arrays, volumes use default 3D arrays @@ -140,6 +194,7 @@ prepareVectorTextureObject(int size[3], // Fill it with the current singleComponent cudaMemcpy3DParms CopyParams = cudaMemcpy3DParms(); CopyParams.srcPtr = make_cudaPitchedPtr(singleComponent, size[0] * sizeof(float), size[0], size[1]); + CUDA_CHECK_ERROR; CopyParams.dstArray = componentArrays[component]; CopyParams.extent = volExtent; CopyParams.kind = cudaMemcpyDeviceToDevice; @@ -158,3 +213,31 @@ prepareVectorTextureObject(int size[3], // Destroy CUBLAS context cublasDestroy(handle); } + +__host__ void +prepareGeometryTextureObject(int length, + const float * geometry, + float *& dev_geom, + cudaTextureObject_t & tex_geom, + const unsigned int nParam) +{ + // copy geometry matrix to device, bind the matrix to the texture + cudaMalloc((void **)&dev_geom, length * nParam * sizeof(float)); + CUDA_CHECK_ERROR; + cudaMemcpy(dev_geom, geometry, length * nParam * sizeof(float), cudaMemcpyHostToDevice); + CUDA_CHECK_ERROR; + + // create texture object + cudaResourceDesc resDesc = {}; + resDesc.resType = cudaResourceTypeLinear; + resDesc.res.linear.devPtr = dev_geom; + resDesc.res.linear.desc.f = cudaChannelFormatKindFloat; + resDesc.res.linear.desc.x = 32; // bits per channel + resDesc.res.linear.sizeInBytes = length * nParam * sizeof(float); + + cudaTextureDesc texDesc = {}; + texDesc.readMode = cudaReadModeElementType; + + cudaCreateTextureObject(&tex_geom, &resDesc, &texDesc, NULL); + CUDA_CHECK_ERROR; +} diff --git a/src/rtkCudaWarpBackProjectionImageFilter.cu b/src/rtkCudaWarpBackProjectionImageFilter.cu index 333b6dfba..a26d57f5f 100644 --- a/src/rtkCudaWarpBackProjectionImageFilter.cu +++ b/src/rtkCudaWarpBackProjectionImageFilter.cu @@ -38,17 +38,8 @@ * CUDA #includes * *****************/ #include -#include #include -// T E X T U R E S //////////////////////////////////////////////////////// -texture tex_proj; - -texture tex_xdvf; -texture tex_ydvf; -texture tex_zdvf; -/////////////////////////////////////////////////////////////////////////// - // CONSTANTS ////////////////////////////////////////////////////////////// __constant__ float c_matrices[SLAB_SIZE * 12]; // Can process stacks of at most SLAB_SIZE projections __constant__ float c_volIndexToProjPP[SLAB_SIZE * 12]; @@ -66,7 +57,12 @@ __constant__ float c_IndexInputToPPInputMatrix[12]; //_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_ __global__ void -kernel_warp_back_project_3Dgrid(float * dev_vol_in, float * dev_vol_out) +kernel_warp_back_project_3Dgrid(float * dev_vol_in, + float * dev_vol_out, + cudaTextureObject_t tex_xdvf, + cudaTextureObject_t tex_ydvf, + cudaTextureObject_t tex_zdvf, + cudaTextureObject_t tex_proj) { unsigned int i = __umul24(blockIdx.x, blockDim.x) + threadIdx.x; unsigned int j = __umul24(blockIdx.y, blockDim.y) + threadIdx.y; @@ -89,9 +85,9 @@ kernel_warp_back_project_3Dgrid(float * dev_vol_in, float * dev_vol_out) IndexInDVF = matrix_multiply(make_float3(i, j, k), c_IndexInputToIndexDVFMatrix); // Get each component of the displacement vector by interpolation in the DVF - Displacement.x = tex3D(tex_xdvf, IndexInDVF.x + 0.5f, IndexInDVF.y + 0.5f, IndexInDVF.z + 0.5f); - Displacement.y = tex3D(tex_ydvf, IndexInDVF.x + 0.5f, IndexInDVF.y + 0.5f, IndexInDVF.z + 0.5f); - Displacement.z = tex3D(tex_zdvf, IndexInDVF.x + 0.5f, IndexInDVF.y + 0.5f, IndexInDVF.z + 0.5f); + Displacement.x = tex3D(tex_xdvf, IndexInDVF.x + 0.5f, IndexInDVF.y + 0.5f, IndexInDVF.z + 0.5f); + Displacement.y = tex3D(tex_ydvf, IndexInDVF.x + 0.5f, IndexInDVF.y + 0.5f, IndexInDVF.z + 0.5f); + Displacement.z = tex3D(tex_zdvf, IndexInDVF.x + 0.5f, IndexInDVF.y + 0.5f, IndexInDVF.z + 0.5f); // Compute the physical point in input + the displacement vector PP = matrix_multiply(make_float3(i, j, k), c_IndexInputToPPInputMatrix) + Displacement; @@ -108,7 +104,7 @@ kernel_warp_back_project_3Dgrid(float * dev_vol_in, float * dev_vol_out) ip.y = ip.y * ip.z; // Get texture point, clip left to GPU - voxel_data += tex2DLayered(tex_proj, ip.x, ip.y, proj); + voxel_data += tex2DLayered(tex_proj, ip.x, ip.y, proj); } // Place it into the volume @@ -116,7 +112,13 @@ kernel_warp_back_project_3Dgrid(float * dev_vol_in, float * dev_vol_out) } __global__ void -kernel_warp_back_project_3Dgrid_cylindrical_detector(float * dev_vol_in, float * dev_vol_out, float radius) +kernel_warp_back_project_3Dgrid_cylindrical_detector(float * dev_vol_in, + float * dev_vol_out, + float radius, + cudaTextureObject_t tex_xdvf, + cudaTextureObject_t tex_ydvf, + cudaTextureObject_t tex_zdvf, + cudaTextureObject_t tex_proj) { unsigned int i = __umul24(blockIdx.x, blockDim.x) + threadIdx.x; unsigned int j = __umul24(blockIdx.y, blockDim.y) + threadIdx.y; @@ -139,9 +141,9 @@ kernel_warp_back_project_3Dgrid_cylindrical_detector(float * dev_vol_in, float * IndexInDVF = matrix_multiply(make_float3(i, j, k), c_IndexInputToIndexDVFMatrix); // Get each component of the displacement vector by interpolation in the DVF - Displacement.x = tex3D(tex_xdvf, IndexInDVF.x + 0.5f, IndexInDVF.y + 0.5f, IndexInDVF.z + 0.5f); - Displacement.y = tex3D(tex_ydvf, IndexInDVF.x + 0.5f, IndexInDVF.y + 0.5f, IndexInDVF.z + 0.5f); - Displacement.z = tex3D(tex_zdvf, IndexInDVF.x + 0.5f, IndexInDVF.y + 0.5f, IndexInDVF.z + 0.5f); + Displacement.x = tex3D(tex_xdvf, IndexInDVF.x + 0.5f, IndexInDVF.y + 0.5f, IndexInDVF.z + 0.5f); + Displacement.y = tex3D(tex_ydvf, IndexInDVF.x + 0.5f, IndexInDVF.y + 0.5f, IndexInDVF.z + 0.5f); + Displacement.z = tex3D(tex_zdvf, IndexInDVF.x + 0.5f, IndexInDVF.y + 0.5f, IndexInDVF.z + 0.5f); // Compute the physical point in input + the displacement vector PP = matrix_multiply(make_float3(i, j, k), c_IndexInputToPPInputMatrix) + Displacement; @@ -167,7 +169,7 @@ kernel_warp_back_project_3Dgrid_cylindrical_detector(float * dev_vol_in, float * ip.y = c_projPPToProjIndex[3] * pp.x + c_projPPToProjIndex[4] * pp.y + c_projPPToProjIndex[5]; // Get texture point, clip left to GPU - voxel_data += tex2DLayered(tex_proj, ip.x, ip.y, proj); + voxel_data += tex2DLayered(tex_proj, ip.x, ip.y, proj); } // Place it into the volume @@ -196,13 +198,6 @@ CUDA_warp_back_project(int projSize[3], float IndexInputToPPInputMatrix[12], double radiusCylindricalDetector) { - // Create CUBLAS context - cublasHandle_t handle; - cublasCreate(&handle); - - int device; - cudaGetDevice(&device); - // Copy the size of inputs into constant memory cudaMemcpyToSymbol(c_projSize, projSize, sizeof(int3)); cudaMemcpyToSymbol(c_volSize, volSize, sizeof(int3)); @@ -212,96 +207,15 @@ CUDA_warp_back_project(int projSize[3], cudaMemcpyToSymbol(c_volIndexToProjPP, &(volIndexToProjPPs[0]), 12 * sizeof(float) * projSize[2]); cudaMemcpyToSymbol(c_projPPToProjIndex, &(projPPToProjIndex[0]), 9 * sizeof(float)); - // set texture parameters - tex_proj.addressMode[0] = cudaAddressModeBorder; - tex_proj.addressMode[1] = cudaAddressModeBorder; - tex_proj.addressMode[2] = cudaAddressModeBorder; - tex_proj.filterMode = cudaFilterModeLinear; - tex_proj.normalized = false; // don't access with normalized texture coords - - // Copy projection data to array, bind the array to the texture - cudaExtent projExtent = make_cudaExtent(projSize[0], projSize[1], projSize[2]); - cudaArray * array_proj; - static cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc(); - CUDA_CHECK_ERROR; - - // Allocate array for input projections, in order to bind them to - // a 2D layered texture - cudaMalloc3DArray((cudaArray **)&array_proj, &channelDesc, projExtent, cudaArrayLayered); - CUDA_CHECK_ERROR; - - // Copy data to 3D array - cudaMemcpy3DParms copyParams = cudaMemcpy3DParms(); - copyParams.srcPtr = make_cudaPitchedPtr(dev_proj, projSize[0] * sizeof(float), projSize[0], projSize[1]); - copyParams.dstArray = (cudaArray *)array_proj; - copyParams.extent = projExtent; - copyParams.kind = cudaMemcpyDeviceToDevice; - cudaMemcpy3D(©Params); - CUDA_CHECK_ERROR; + // Prepare proj texture + cudaArray * array_proj; + cudaTextureObject_t tex_proj; + prepareScalarTextureObject(projSize, dev_proj, array_proj, tex_proj, true); - // Extent stuff, will be used for each component extraction - cudaExtent dvfExtent = make_cudaExtent(dvf_size[0], dvf_size[1], dvf_size[2]); - - // Set texture parameters - tex_xdvf.addressMode[0] = cudaAddressModeBorder; - tex_xdvf.addressMode[1] = cudaAddressModeBorder; - tex_xdvf.addressMode[2] = cudaAddressModeBorder; - tex_xdvf.filterMode = cudaFilterModeLinear; - tex_xdvf.normalized = false; // don't access with normalized texture coords - - tex_ydvf.addressMode[0] = cudaAddressModeBorder; - tex_ydvf.addressMode[1] = cudaAddressModeBorder; - tex_ydvf.addressMode[2] = cudaAddressModeBorder; - tex_ydvf.filterMode = cudaFilterModeLinear; - tex_ydvf.normalized = false; - - tex_zdvf.addressMode[0] = cudaAddressModeBorder; - tex_zdvf.addressMode[1] = cudaAddressModeBorder; - tex_zdvf.addressMode[2] = cudaAddressModeBorder; - tex_zdvf.filterMode = cudaFilterModeLinear; - tex_zdvf.normalized = false; - - // Allocate an intermediate memory space to extract x, y and z components of the DVF - float * DVFcomponent; - int numel = dvf_size[0] * dvf_size[1] * dvf_size[2]; - cudaMalloc(&DVFcomponent, numel * sizeof(float)); - float one = 1.0; - - // Allocate the arrays used for textures - cudaArray ** DVFcomponentArrays = new cudaArray *[3]; - CUDA_CHECK_ERROR; - - // Copy image data to arrays. The tricky part is the make_cudaPitchedPtr. - // The best way to understand it is to read - // https://stackoverflow.com/questions/16119943/how-and-when-should-i-use-pitched-pointer-with-the-cuda-api - for (unsigned int component = 0; component < 3; component++) - { - // Reset the intermediate memory - cudaMemset((void *)DVFcomponent, 0, numel * sizeof(float)); - - // Fill it with the current component - float * pComponent = dev_input_dvf + component; - cublasSaxpy(handle, numel, &one, pComponent, 3, DVFcomponent, 1); - - // Allocate the cudaArray and fill it with the current DVFcomponent - cudaMalloc3DArray((cudaArray **)&DVFcomponentArrays[component], &channelDesc, dvfExtent); - cudaMemcpy3DParms CopyParams = cudaMemcpy3DParms(); - CopyParams.srcPtr = make_cudaPitchedPtr(DVFcomponent, dvf_size[0] * sizeof(float), dvf_size[0], dvf_size[1]); - CopyParams.dstArray = (cudaArray *)DVFcomponentArrays[component]; - CopyParams.extent = dvfExtent; - CopyParams.kind = cudaMemcpyDeviceToDevice; - cudaMemcpy3D(&CopyParams); - CUDA_CHECK_ERROR; - } - - // Intermediate memory is no longer needed - cudaFree(DVFcomponent); - - // Bind 3D arrays to 3D textures - cudaBindTextureToArray(tex_xdvf, (cudaArray *)DVFcomponentArrays[0], channelDesc); - cudaBindTextureToArray(tex_ydvf, (cudaArray *)DVFcomponentArrays[1], channelDesc); - cudaBindTextureToArray(tex_zdvf, (cudaArray *)DVFcomponentArrays[2], channelDesc); - CUDA_CHECK_ERROR; + // Prepare DVF textures + std::vector DVFComponentArrays; + std::vector tex_dvf; + prepareVectorTextureObject(dvf_size, dev_input_dvf, DVFComponentArrays, 3, tex_dvf, false); // Copy matrices into constant memory cudaMemcpyToSymbol( @@ -329,35 +243,27 @@ CUDA_warp_back_project(int projSize[3], dim3 dimBlock = dim3(tBlock_x, tBlock_y, tBlock_z); CUDA_CHECK_ERROR; - // Bind the array of projections to a 2D layered texture - cudaBindTextureToArray(tex_proj, (cudaArray *)array_proj, channelDesc); - CUDA_CHECK_ERROR; - // Note: cbi->img is passed via texture memory // Matrices are passed via constant memory //------------------------------------- if (radiusCylindricalDetector == 0) - kernel_warp_back_project_3Dgrid<<>>(dev_vol_in, dev_vol_out); + kernel_warp_back_project_3Dgrid<<>>( + dev_vol_in, dev_vol_out, tex_dvf[0], tex_dvf[1], tex_dvf[2], tex_proj); else kernel_warp_back_project_3Dgrid_cylindrical_detector<<>>( - dev_vol_in, dev_vol_out, (float)radiusCylindricalDetector); - // Unbind the image and projection matrix textures - cudaUnbindTexture(tex_proj); + dev_vol_in, dev_vol_out, (float)radiusCylindricalDetector, tex_dvf[0], tex_dvf[1], tex_dvf[2], tex_proj); CUDA_CHECK_ERROR; - // Unbind the image and projection matrix textures - cudaUnbindTexture(tex_xdvf); - cudaUnbindTexture(tex_ydvf); - cudaUnbindTexture(tex_zdvf); - // Cleanup - cudaFreeArray((cudaArray *)DVFcomponentArrays[0]); - cudaFreeArray((cudaArray *)DVFcomponentArrays[1]); - cudaFreeArray((cudaArray *)DVFcomponentArrays[2]); - delete[] DVFcomponentArrays; + for (unsigned int c = 0; c < 3; c++) + { + cudaFreeArray(DVFComponentArrays[c]); + CUDA_CHECK_ERROR; + cudaDestroyTextureObject(tex_dvf[c]); + CUDA_CHECK_ERROR; + } cudaFreeArray((cudaArray *)array_proj); CUDA_CHECK_ERROR; - - // Destroy CUBLAS context - cublasDestroy(handle); + cudaDestroyTextureObject(tex_proj); + CUDA_CHECK_ERROR; } diff --git a/src/rtkCudaWarpForwardProjectionImageFilter.cu b/src/rtkCudaWarpForwardProjectionImageFilter.cu index 0b7578726..094cc1f42 100644 --- a/src/rtkCudaWarpForwardProjectionImageFilter.cu +++ b/src/rtkCudaWarpForwardProjectionImageFilter.cu @@ -36,15 +36,9 @@ * CUDA #includes * *****************/ #include -#include #include -// TEXTURES AND CONSTANTS // -texture tex_xdvf; -texture tex_ydvf; -texture tex_zdvf; -texture tex_vol; - +// CONSTANTS __constant__ int3 c_projSize; __constant__ float3 c_boxMin; __constant__ float3 c_boxMax; @@ -65,7 +59,12 @@ __constant__ float c_PPInputToIndexInputMatrix[12]; // KERNEL kernel_forwardProject __global__ void -kernel_warped_forwardProject(float * dev_proj_in, float * dev_proj_out) +kernel_warped_forwardProject(float * dev_proj_in, + float * dev_proj_out, + cudaTextureObject_t tex_xdvf, + cudaTextureObject_t tex_ydvf, + cudaTextureObject_t tex_zdvf, + cudaTextureObject_t tex_vol) { unsigned int i = blockIdx.x * blockDim.x + threadIdx.x; unsigned int j = blockIdx.y * blockDim.y + threadIdx.y; @@ -122,9 +121,9 @@ kernel_warped_forwardProject(float * dev_proj_in, float * dev_proj_out) // Get each component of the displacement vector by // interpolation in the dvf - Displacement.x = tex3D(tex_xdvf, IndexInDVF.x + 0.5f, IndexInDVF.y + 0.5f, IndexInDVF.z + 0.5f); - Displacement.y = tex3D(tex_ydvf, IndexInDVF.x + 0.5f, IndexInDVF.y + 0.5f, IndexInDVF.z + 0.5f); - Displacement.z = tex3D(tex_zdvf, IndexInDVF.x + 0.5f, IndexInDVF.y + 0.5f, IndexInDVF.z + 0.5f); + Displacement.x = tex3D(tex_xdvf, IndexInDVF.x + 0.5f, IndexInDVF.y + 0.5f, IndexInDVF.z + 0.5f); + Displacement.y = tex3D(tex_ydvf, IndexInDVF.x + 0.5f, IndexInDVF.y + 0.5f, IndexInDVF.z + 0.5f); + Displacement.z = tex3D(tex_zdvf, IndexInDVF.x + 0.5f, IndexInDVF.y + 0.5f, IndexInDVF.z + 0.5f); // Matrix multiply to get the physical coordinates of the current point in the output volume // + the displacement @@ -134,7 +133,7 @@ kernel_warped_forwardProject(float * dev_proj_in, float * dev_proj_out) IndexInInput = matrix_multiply(PP, c_PPInputToIndexInputMatrix); // Read from 3D texture from volume - sample = tex3D(tex_vol, IndexInInput.x, IndexInInput.y, IndexInInput.z); + sample = tex3D(tex_vol, IndexInInput.x, IndexInInput.y, IndexInInput.z); // Accumulate, and move forward along the ray sum += sample; @@ -172,10 +171,6 @@ CUDA_warp_forward_project(int projSize[3], float PPInputToIndexInputMatrix[12], float IndexInputToPPInputMatrix[12]) { - // Create CUBLAS context - cublasHandle_t handle; - cublasCreate(&handle); - // constant memory cudaMemcpyToSymbol(c_projSize, projSize, sizeof(int3)); cudaMemcpyToSymbol(c_boxMin, box_min, sizeof(float3)); @@ -190,94 +185,15 @@ CUDA_warp_forward_project(int projSize[3], // Copy the projection matrices into constant memory cudaMemcpyToSymbol(c_matrices, &(matrices[0]), 12 * sizeof(float) * projSize[2]); - // Prepare channel description for arrays - static cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc(); - - // Extent stuff, will be used for each component extraction - cudaExtent dvfExtent = make_cudaExtent(dvfSize[0], dvfSize[1], dvfSize[2]); - - // Set texture parameters for the input volume - tex_vol.addressMode[0] = cudaAddressModeClamp; // clamp texture coordinates - tex_vol.addressMode[1] = cudaAddressModeClamp; - tex_vol.addressMode[2] = cudaAddressModeClamp; - tex_vol.normalized = false; // access with normalized texture coordinates - tex_vol.filterMode = cudaFilterModeLinear; // linear interpolation - - // Copy volume data to array, bind the array to the texture - cudaExtent volExtent = make_cudaExtent(volSize[0], volSize[1], volSize[2]); - cudaArray * array_vol; - cudaMalloc3DArray((cudaArray **)&array_vol, &channelDesc, volExtent); - CUDA_CHECK_ERROR; - - // Copy data to 3D array - cudaMemcpy3DParms copyParams = cudaMemcpy3DParms(); - copyParams.srcPtr = make_cudaPitchedPtr(dev_vol, volSize[0] * sizeof(float), volSize[0], volSize[1]); - copyParams.dstArray = (cudaArray *)array_vol; - copyParams.extent = volExtent; - copyParams.kind = cudaMemcpyDeviceToDevice; - cudaMemcpy3D(©Params); - CUDA_CHECK_ERROR; - - // Set texture parameters - tex_xdvf.addressMode[0] = cudaAddressModeBorder; - tex_xdvf.addressMode[1] = cudaAddressModeBorder; - tex_xdvf.addressMode[2] = cudaAddressModeBorder; - tex_xdvf.filterMode = cudaFilterModeLinear; - tex_xdvf.normalized = false; // don't access with normalized texture coords - - tex_ydvf.addressMode[0] = cudaAddressModeBorder; - tex_ydvf.addressMode[1] = cudaAddressModeBorder; - tex_ydvf.addressMode[2] = cudaAddressModeBorder; - tex_ydvf.filterMode = cudaFilterModeLinear; - tex_ydvf.normalized = false; - - tex_zdvf.addressMode[0] = cudaAddressModeBorder; - tex_zdvf.addressMode[1] = cudaAddressModeBorder; - tex_zdvf.addressMode[2] = cudaAddressModeBorder; - tex_zdvf.filterMode = cudaFilterModeLinear; - tex_zdvf.normalized = false; - - // Allocate an intermediate memory space to extract x, y and z components of the DVF - float * DVFcomponent; - int numel = dvfSize[0] * dvfSize[1] * dvfSize[2]; - cudaMalloc(&DVFcomponent, numel * sizeof(float)); - float one = 1.0; - - // Allocate the arrays used for textures - cudaArray ** DVFcomponentArrays = new cudaArray *[3]; - CUDA_CHECK_ERROR; - - // Copy image data to arrays. The tricky part is the make_cudaPitchedPtr. - // The best way to understand it is to read - // https://stackoverflow.com/questions/16119943/how-and-when-should-i-use-pitched-pointer-with-the-cuda-api - for (unsigned int component = 0; component < 3; component++) - { - // Reset the intermediate memory - cudaMemset((void *)DVFcomponent, 0, numel * sizeof(float)); - - // Fill it with the current component - float * pComponent = dev_input_dvf + component; - cublasSaxpy(handle, numel, &one, pComponent, 3, DVFcomponent, 1); - - // Allocate the cudaArray and fill it with the current DVFcomponent - cudaMalloc3DArray((cudaArray **)&DVFcomponentArrays[component], &channelDesc, dvfExtent); - cudaMemcpy3DParms CopyParams = cudaMemcpy3DParms(); - CopyParams.srcPtr = make_cudaPitchedPtr(DVFcomponent, dvfSize[0] * sizeof(float), dvfSize[0], dvfSize[1]); - CopyParams.dstArray = (cudaArray *)DVFcomponentArrays[component]; - CopyParams.extent = dvfExtent; - CopyParams.kind = cudaMemcpyDeviceToDevice; - cudaMemcpy3D(&CopyParams); - CUDA_CHECK_ERROR; - } - - // Intermediate memory is no longer needed - cudaFree(DVFcomponent); + // Prepare volume texture + cudaArray * array_vol; + cudaTextureObject_t tex_vol; + prepareScalarTextureObject(volSize, dev_vol, array_vol, tex_vol, false, true, cudaAddressModeClamp); - // Bind 3D arrays to 3D textures - cudaBindTextureToArray(tex_xdvf, (cudaArray *)DVFcomponentArrays[0], channelDesc); - cudaBindTextureToArray(tex_ydvf, (cudaArray *)DVFcomponentArrays[1], channelDesc); - cudaBindTextureToArray(tex_zdvf, (cudaArray *)DVFcomponentArrays[2], channelDesc); - CUDA_CHECK_ERROR; + // Prepare DVF textures + std::vector DVFComponentArrays; + std::vector tex_dvf; + prepareVectorTextureObject(dvfSize, dev_input_dvf, DVFComponentArrays, 3, tex_dvf, false); // Copy matrices into constant memory cudaMemcpyToSymbol( @@ -292,25 +208,19 @@ CUDA_warp_forward_project(int projSize[3], dim3 dimBlock = dim3(16, 16, 1); dim3 dimGrid = dim3(iDivUp(projSize[0], dimBlock.x), iDivUp(projSize[1], dimBlock.y)); - // Bind 3D array to 3D texture - cudaBindTextureToArray(tex_vol, (cudaArray *)array_vol, channelDesc); - CUDA_CHECK_ERROR; - - kernel_warped_forwardProject<<>>(dev_proj_in, dev_proj_out); - - cudaUnbindTexture(tex_xdvf); - cudaUnbindTexture(tex_ydvf); - cudaUnbindTexture(tex_zdvf); - cudaUnbindTexture(tex_vol); - CUDA_CHECK_ERROR; + kernel_warped_forwardProject<<>>( + dev_proj_in, dev_proj_out, tex_dvf[0], tex_dvf[1], tex_dvf[2], tex_vol); - cudaFreeArray((cudaArray *)DVFcomponentArrays[0]); - cudaFreeArray((cudaArray *)DVFcomponentArrays[1]); - cudaFreeArray((cudaArray *)DVFcomponentArrays[2]); - delete[] DVFcomponentArrays; + // Cleanup + for (unsigned int c = 0; c < 3; c++) + { + cudaFreeArray(DVFComponentArrays[c]); + CUDA_CHECK_ERROR; + cudaDestroyTextureObject(tex_dvf[c]); + CUDA_CHECK_ERROR; + } cudaFreeArray((cudaArray *)array_vol); CUDA_CHECK_ERROR; - - // Destroy CUBLAS context - cublasDestroy(handle); + cudaDestroyTextureObject(tex_vol); + CUDA_CHECK_ERROR; } diff --git a/src/rtkCudaWarpImageFilter.cu b/src/rtkCudaWarpImageFilter.cu index 132cbe9cb..0038ec72d 100644 --- a/src/rtkCudaWarpImageFilter.cu +++ b/src/rtkCudaWarpImageFilter.cu @@ -38,19 +38,6 @@ * CUDA #includes * *****************/ #include -#include -#include - -// T E X T U R E S //////////////////////////////////////////////////////// -texture tex_IndexOutputToPPOutputMatrix; -texture tex_IndexOutputToIndexDVFMatrix; -texture tex_PPInputToIndexInputMatrix; - -texture tex_xdvf; -texture tex_ydvf; -texture tex_zdvf; -texture tex_input_vol; -/////////////////////////////////////////////////////////////////////////// // CONSTANTS ////////////////////////////////////////////////////////////// __constant__ float c_IndexOutputToPPOutputMatrix[12]; @@ -64,7 +51,12 @@ __constant__ float c_PPInputToIndexInputMatrix[12]; //_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_ __global__ void -kernel_3Dgrid(float * dev_vol_out, int3 vol_dim) +kernel_3Dgrid(float * dev_vol_out, + int3 vol_dim, + cudaTextureObject_t tex_xdvf, + cudaTextureObject_t tex_ydvf, + cudaTextureObject_t tex_zdvf, + cudaTextureObject_t tex_input_vol) { unsigned int i = __umul24(blockIdx.x, blockDim.x) + threadIdx.x; unsigned int j = __umul24(blockIdx.y, blockDim.y) + threadIdx.y; @@ -84,9 +76,9 @@ kernel_3Dgrid(float * dev_vol_out, int3 vol_dim) // Get each component of the displacement vector by // interpolation in the dvf float3 Displacement; - Displacement.x = tex3D(tex_xdvf, IndexInDVF.x + 0.5f, IndexInDVF.y + 0.5f, IndexInDVF.z + 0.5f); - Displacement.y = tex3D(tex_ydvf, IndexInDVF.x + 0.5f, IndexInDVF.y + 0.5f, IndexInDVF.z + 0.5f); - Displacement.z = tex3D(tex_zdvf, IndexInDVF.x + 0.5f, IndexInDVF.y + 0.5f, IndexInDVF.z + 0.5f); + Displacement.x = tex3D(tex_xdvf, IndexInDVF.x + 0.5f, IndexInDVF.y + 0.5f, IndexInDVF.z + 0.5f); + Displacement.y = tex3D(tex_ydvf, IndexInDVF.x + 0.5f, IndexInDVF.y + 0.5f, IndexInDVF.z + 0.5f); + Displacement.z = tex3D(tex_zdvf, IndexInDVF.x + 0.5f, IndexInDVF.y + 0.5f, IndexInDVF.z + 0.5f); // Matrix multiply to get the physical coordinates of the current point in the output volume float3 PP = matrix_multiply(make_float3(i, j, k), c_IndexOutputToPPOutputMatrix); @@ -98,7 +90,8 @@ kernel_3Dgrid(float * dev_vol_out, int3 vol_dim) float3 IndexInInput = matrix_multiply(PP, c_PPInputToIndexInputMatrix); // Interpolate in the input and copy into the output - dev_vol_out[vol_idx] = tex3D(tex_input_vol, IndexInInput.x + 0.5f, IndexInInput.y + 0.5f, IndexInInput.z + 0.5f); + dev_vol_out[vol_idx] = + tex3D(tex_input_vol, IndexInInput.x + 0.5f, IndexInInput.y + 0.5f, IndexInInput.z + 0.5f); } //_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_ @@ -120,116 +113,15 @@ CUDA_warp(int input_vol_dim[3], float * dev_DVF, bool isLinear) { - // Prepare channel description for arrays - static cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc(); - - // Create CUBLAS context - cublasHandle_t handle; - cublasCreate(&handle); - - /////////////////////////////////// - // For each component of the dvf, perform a strided copy (pick every third - // float from dev_input_dvf) into a 3D array, and bind the array to a 3D texture - - // Extent stuff, will be used for each component extraction - cudaExtent dvfExtent = make_cudaExtent(input_dvf_dim[0], input_dvf_dim[1], input_dvf_dim[2]); - - // Set texture parameters - tex_xdvf.addressMode[0] = cudaAddressModeBorder; - tex_xdvf.addressMode[1] = cudaAddressModeBorder; - tex_xdvf.addressMode[2] = cudaAddressModeBorder; - tex_xdvf.filterMode = cudaFilterModeLinear; - tex_xdvf.normalized = false; // don't access with normalized texture coords - - tex_ydvf.addressMode[0] = cudaAddressModeBorder; - tex_ydvf.addressMode[1] = cudaAddressModeBorder; - tex_ydvf.addressMode[2] = cudaAddressModeBorder; - tex_ydvf.filterMode = cudaFilterModeLinear; - tex_ydvf.normalized = false; - - tex_zdvf.addressMode[0] = cudaAddressModeBorder; - tex_zdvf.addressMode[1] = cudaAddressModeBorder; - tex_zdvf.addressMode[2] = cudaAddressModeBorder; - tex_zdvf.filterMode = cudaFilterModeLinear; - tex_zdvf.normalized = false; - - // Allocate an intermediate memory space to extract x, y and z components of the DVF - float * DVFcomponent; - int numel = input_dvf_dim[0] * input_dvf_dim[1] * input_dvf_dim[2]; - cudaMalloc(&DVFcomponent, numel * sizeof(float)); - float one = 1.0; - - // Allocate the arrays used for textures - cudaArray ** DVFcomponentArrays = new cudaArray *[3]; - CUDA_CHECK_ERROR; - - // Copy image data to arrays. The tricky part is the make_cudaPitchedPtr. - // The best way to understand it is to read - // https://stackoverflow.com/questions/16119943/how-and-when-should-i-use-pitched-pointer-with-the-cuda-api - for (unsigned int component = 0; component < 3; component++) - { - // Reset the intermediate memory - cudaMemset((void *)DVFcomponent, 0, numel * sizeof(float)); - - // Fill it with the current component - float * pComponent = dev_DVF + component; - cublasSaxpy(handle, numel, &one, pComponent, 3, DVFcomponent, 1); - - // Allocate the cudaArray and fill it with the current DVFcomponent - cudaMalloc3DArray((cudaArray **)&DVFcomponentArrays[component], &channelDesc, dvfExtent); - cudaMemcpy3DParms CopyParams = cudaMemcpy3DParms(); - CopyParams.srcPtr = - make_cudaPitchedPtr(DVFcomponent, input_dvf_dim[0] * sizeof(float), input_dvf_dim[0], input_dvf_dim[1]); - CopyParams.dstArray = (cudaArray *)DVFcomponentArrays[component]; - CopyParams.extent = dvfExtent; - CopyParams.kind = cudaMemcpyDeviceToDevice; - cudaMemcpy3D(&CopyParams); - CUDA_CHECK_ERROR; - } - - // Intermediate memory is no longer needed - cudaFree(DVFcomponent); - - // Bind 3D arrays to 3D textures - cudaBindTextureToArray(tex_xdvf, (cudaArray *)DVFcomponentArrays[0], channelDesc); - cudaBindTextureToArray(tex_ydvf, (cudaArray *)DVFcomponentArrays[1], channelDesc); - cudaBindTextureToArray(tex_zdvf, (cudaArray *)DVFcomponentArrays[2], channelDesc); - CUDA_CHECK_ERROR; - - /////////////////////////////////// - // Do the same for the input volume - - // Extent stuff - cudaExtent volExtent = make_cudaExtent(input_vol_dim[0], input_vol_dim[1], input_vol_dim[2]); - - // Set texture parameters - tex_input_vol.addressMode[0] = cudaAddressModeBorder; - tex_input_vol.addressMode[1] = cudaAddressModeBorder; - tex_input_vol.addressMode[2] = cudaAddressModeBorder; - tex_input_vol.normalized = false; // don't access with normalized texture coords - if (isLinear) - tex_input_vol.filterMode = cudaFilterModeLinear; - else - tex_input_vol.filterMode = cudaFilterModePoint; - - // Allocate the array - cudaArray * array_input_vol; - cudaMalloc3DArray((cudaArray **)&array_input_vol, &channelDesc, volExtent); - CUDA_CHECK_ERROR; + // Prepare DVF textures + std::vector DVFComponentArrays; + std::vector tex_dvf; + prepareVectorTextureObject(input_dvf_dim, dev_DVF, DVFComponentArrays, 3, tex_dvf, false); - // Copy image data to array - cudaMemcpy3DParms inputCopyParams = cudaMemcpy3DParms(); - inputCopyParams.srcPtr = - make_cudaPitchedPtr(dev_input_vol, input_vol_dim[0] * sizeof(float), input_vol_dim[0], input_vol_dim[1]); - inputCopyParams.dstArray = (cudaArray *)array_input_vol; - inputCopyParams.extent = volExtent; - inputCopyParams.kind = cudaMemcpyDeviceToDevice; - cudaMemcpy3D(&inputCopyParams); - CUDA_CHECK_ERROR; - - // Bind 3D arrays to 3D textures - cudaBindTextureToArray(tex_input_vol, (cudaArray *)array_input_vol, channelDesc); - CUDA_CHECK_ERROR; + // Prepare volume texture + cudaArray * array_input_vol; + cudaTextureObject_t tex_input_vol; + prepareScalarTextureObject(output_vol_dim, dev_input_vol, array_input_vol, tex_input_vol, false, isLinear); // Copy matrices into constant memory cudaMemcpyToSymbol( @@ -238,12 +130,7 @@ CUDA_warp(int input_vol_dim[3], c_IndexOutputToIndexDVFMatrix, IndexOutputToIndexDVFMatrix, 12 * sizeof(float), 0, cudaMemcpyHostToDevice); cudaMemcpyToSymbol( c_PPInputToIndexInputMatrix, PPInputToIndexInputMatrix, 12 * sizeof(float), 0, cudaMemcpyHostToDevice); - - ////////////////////////////////////// - /// Run - - int device; - cudaGetDevice(&device); + CUDA_CHECK_ERROR; // Thread Block Dimensions constexpr int tBlock_x = 16; @@ -258,29 +145,24 @@ CUDA_warp(int input_vol_dim[3], dim3 dimGrid = dim3(blocksInX, blocksInY, blocksInZ); dim3 dimBlock = dim3(tBlock_x, tBlock_y, tBlock_z); - // Note: the DVF and input image are passed via texture memory - //------------------------------------- kernel_3Dgrid<<>>(dev_output_vol, - make_int3(output_vol_dim[0], output_vol_dim[1], output_vol_dim[2])); - - CUDA_CHECK_ERROR; - - // Unbind the image and projection matrix textures - cudaUnbindTexture(tex_xdvf); - cudaUnbindTexture(tex_ydvf); - cudaUnbindTexture(tex_zdvf); - cudaUnbindTexture(tex_input_vol); + make_int3(output_vol_dim[0], output_vol_dim[1], output_vol_dim[2]), + tex_dvf[0], + tex_dvf[1], + tex_dvf[2], + tex_input_vol); CUDA_CHECK_ERROR; // Cleanup + for (unsigned int c = 0; c < 3; c++) + { + cudaFreeArray(DVFComponentArrays[c]); + CUDA_CHECK_ERROR; + cudaDestroyTextureObject(tex_dvf[c]); + CUDA_CHECK_ERROR; + } + cudaFreeArray(array_input_vol); CUDA_CHECK_ERROR; - cudaFreeArray((cudaArray *)DVFcomponentArrays[0]); - cudaFreeArray((cudaArray *)DVFcomponentArrays[1]); - cudaFreeArray((cudaArray *)DVFcomponentArrays[2]); - cudaFreeArray((cudaArray *)array_input_vol); - delete[] DVFcomponentArrays; + cudaDestroyTextureObject(tex_input_vol); CUDA_CHECK_ERROR; - - // Destroy CUBLAS context - cublasDestroy(handle); } diff --git a/src/rtkCudaWeidingerForwardModelImageFilter.cu b/src/rtkCudaWeidingerForwardModelImageFilter.cu index cf3522a63..e03c8dc8e 100644 --- a/src/rtkCudaWeidingerForwardModelImageFilter.cu +++ b/src/rtkCudaWeidingerForwardModelImageFilter.cu @@ -38,7 +38,6 @@ * CUDA #includes * *****************/ #include -#include #include #define IDX2D(r, c, cols) ((r) * (cols) + (c))