From c59bf8db32147c999ae84e623ad6218087fc0b4c Mon Sep 17 00:00:00 2001 From: Simon Rit <simon.rit@creatis.insa-lyon.fr> Date: Wed, 17 Apr 2024 07:40:02 +0200 Subject: [PATCH 1/5] COMP: Display deprecation CUDA messages --- itk-module-init.cmake | 2 -- 1 file changed, 2 deletions(-) diff --git a/itk-module-init.cmake b/itk-module-init.cmake index ee9172caa..8d978c6fb 100644 --- a/itk-module-init.cmake +++ b/itk-module-init.cmake @@ -44,8 +44,6 @@ if(RTK_USE_CUDA) find_package(CUDAToolkit REQUIRED 8.0) endif() - set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -Wno-deprecated-gpu-targets") - set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -Wno-deprecated-declarations") set(RTK_CUDA_PROJECTIONS_SLAB_SIZE "16" CACHE STRING "Number of projections processed simultaneously in CUDA forward and back projections") elseif(RTK_CUDA_VERSION) message(FATAL_ERROR "RTK_CUDA_VERSION is set but the CUDA toolkit has not been found.") From 06b137bd705959cf430c7a612c9a43b83f4b3272 Mon Sep 17 00:00:00 2001 From: Simon Rit <simon.rit@creatis.insa-lyon.fr> Date: Wed, 17 Apr 2024 07:59:28 +0200 Subject: [PATCH 2/5] ENH: Remove CUDA code for compute capability 1 RTK does not support compute capability 1 since 1e75fbce682ff9a697165345ca9c53edeb5fb05a. --- src/rtkCudaBackProjectionImageFilter.cu | 22 +-- src/rtkCudaFDKBackProjectionImageFilter.cu | 102 ++-------- src/rtkCudaWarpBackProjectionImageFilter.cu | 199 ++------------------ src/rtkCudaWarpImageFilter.cu | 71 +------ 4 files changed, 47 insertions(+), 347 deletions(-) diff --git a/src/rtkCudaBackProjectionImageFilter.cu b/src/rtkCudaBackProjectionImageFilter.cu index 6b1d260e4..577a533c4 100644 --- a/src/rtkCudaBackProjectionImageFilter.cu +++ b/src/rtkCudaBackProjectionImageFilter.cu @@ -51,7 +51,7 @@ __constant__ int3 c_volSize; //_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_( S T A R T )_ //_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_ -template <unsigned int vectorLength, bool isCylindrical> +template <unsigned int VVectorLength, bool VIsCylindrical> __global__ void kernel_backProject(float * dev_vol_in, float * dev_vol_out, float radius, cudaTextureObject_t * dev_tex_proj) { @@ -68,13 +68,13 @@ kernel_backProject(float * dev_vol_in, float * dev_vol_out, float radius, cudaTe itk::SizeValueType vol_idx = i + (j + k * c_volSize.y) * (c_volSize.x); float3 ip, pp; - float voxel_data[vectorLength]; - for (unsigned int c = 0; c < vectorLength; c++) + float voxel_data[VVectorLength]; + for (unsigned int c = 0; c < VVectorLength; c++) voxel_data[c] = 0.0f; for (unsigned int proj = 0; proj < c_projSize.z; proj++) { - if (isCylindrical) + if (VIsCylindrical) { // matrix multiply pp = matrix_multiply(make_float3(i, j, k), &(c_volIndexToProjPP[12 * proj])); @@ -105,13 +105,13 @@ kernel_backProject(float * dev_vol_in, float * dev_vol_out, float radius, cudaTe } // Get texture point, clip left to GPU, and accumulate in voxel_data - for (unsigned int c = 0; c < vectorLength; c++) + for (unsigned int c = 0; c < VVectorLength; c++) voxel_data[c] += tex2DLayered<float>(dev_tex_proj[c], ip.x, ip.y, proj); } // Place it into the volume - for (unsigned int c = 0; c < vectorLength; c++) - dev_vol_out[vol_idx * vectorLength + c] = dev_vol_in[vol_idx * vectorLength + c] + voxel_data[c]; + for (unsigned int c = 0; c < VVectorLength; c++) + dev_vol_out[vol_idx * VVectorLength + c] = dev_vol_in[vol_idx * VVectorLength + c] + voxel_data[c]; } @@ -134,14 +134,6 @@ CUDA_back_project(int projSize[3], double radiusCylindricalDetector, unsigned int vectorLength) { - // 2D layered texture requires CudaComputeCapability >= 2.0 - int device; - cudaGetDevice(&device); - if (GetCudaComputeCapability(device).first <= 1) - { - itkGenericExceptionMacro(<< "RTK no longer supports GPUs with CudaComputeCapability < 2.0"); - } - // Copy the size of inputs into constant memory cudaMemcpyToSymbol(c_projSize, projSize, sizeof(int3)); cudaMemcpyToSymbol(c_volSize, volSize, sizeof(int3)); diff --git a/src/rtkCudaFDKBackProjectionImageFilter.cu b/src/rtkCudaFDKBackProjectionImageFilter.cu index 5cef0e040..4a9d45601 100644 --- a/src/rtkCudaFDKBackProjectionImageFilter.cu +++ b/src/rtkCudaFDKBackProjectionImageFilter.cu @@ -40,8 +40,7 @@ #include <cuda.h> // T E X T U R E S //////////////////////////////////////////////////////// -texture<float, cudaTextureType2DLayered> tex_proj; -texture<float, 3, cudaReadModeElementType> tex_proj_3D; +texture<float, cudaTextureType2DLayered> tex_proj; // Constant memory __constant__ float c_matrices[SLAB_SIZE * 12]; // Can process stacks of at most SLAB_SIZE projections @@ -53,48 +52,6 @@ __constant__ int3 c_vol_size; //_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_( S T A R T )_ //_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_ -__global__ void -kernel_fdk(float * dev_vol_in, float * dev_vol_out, unsigned int Blocks_Y) -{ - // CUDA 2.0 does not allow for a 3D grid, which severely - // limits the manipulation of large 3D arrays of data. The - // following code is a hack to bypass this implementation - // limitation. - unsigned int blockIdx_z = blockIdx.y / Blocks_Y; - unsigned int blockIdx_y = blockIdx.y - __umul24(blockIdx_z, Blocks_Y); - itk::SizeValueType i = __umul24(blockIdx.x, blockDim.x) + threadIdx.x; - itk::SizeValueType j = __umul24(blockIdx_y, blockDim.y) + threadIdx.y; - itk::SizeValueType k = __umul24(blockIdx_z, blockDim.z) + threadIdx.z; - - if (i >= c_vol_size.x || j >= c_vol_size.y || k >= c_vol_size.z) - { - return; - } - - // Index row major into the volume - itk::SizeValueType vol_idx = i + (j + k * c_vol_size.y) * (c_vol_size.x); - - float3 ip; - float voxel_data = 0; - - for (unsigned int proj = 0; proj < c_projSize.z; proj++) - { - // matrix multiply - ip = matrix_multiply(make_float3(i, j, k), &(c_matrices[12 * proj])); - - // Change coordinate systems - ip.z = 1 / ip.z; - ip.x = ip.x * ip.z; - ip.y = ip.y * ip.z; - - // Get texture point, clip left to GPU - voxel_data += tex3D(tex_proj_3D, ip.x, ip.y, proj + 0.5) * ip.z * ip.z; - } - - // Place it into the volume - dev_vol_out[vol_idx] = dev_vol_in[vol_idx] + voxel_data; -} - __global__ void kernel_fdk_3Dgrid(float * dev_vol_in, float * dev_vol_out) { @@ -146,9 +103,6 @@ CUDA_reconstruct_conebeam(int proj_size[3], float * dev_vol_out, float * dev_proj) { - int device; - cudaGetDevice(&device); - // Copy the size of inputs into constant memory cudaMemcpyToSymbol(c_projSize, proj_size, sizeof(int3)); cudaMemcpyToSymbol(c_vol_size, vol_size, sizeof(int3)); @@ -163,12 +117,6 @@ CUDA_reconstruct_conebeam(int proj_size[3], tex_proj.filterMode = cudaFilterModeLinear; tex_proj.normalized = false; // don't access with normalized texture coords - tex_proj_3D.addressMode[0] = cudaAddressModeBorder; - tex_proj_3D.addressMode[1] = cudaAddressModeBorder; - tex_proj_3D.addressMode[2] = cudaAddressModeBorder; - tex_proj_3D.filterMode = cudaFilterModeLinear; - tex_proj_3D.normalized = false; // don't access with normalized texture coords - // Copy projection data to array, bind the array to the texture cudaExtent projExtent = make_cudaExtent(proj_size[0], proj_size[1], proj_size[2]); cudaArray * array_proj; @@ -176,12 +124,8 @@ CUDA_reconstruct_conebeam(int proj_size[3], CUDA_CHECK_ERROR; // Allocate array for input projections, in order to bind them to - // either a 2D layered texture (requires GetCudaComputeCapability >= 2.0) or - // a 3D texture - if (GetCudaComputeCapability(device).first <= 1) - cudaMalloc3DArray((cudaArray **)&array_proj, &channelDesc, projExtent); - else - cudaMalloc3DArray((cudaArray **)&array_proj, &channelDesc, projExtent, cudaArrayLayered); + // a 2D layered texture + cudaMalloc3DArray((cudaArray **)&array_proj, &channelDesc, projExtent, cudaArrayLayered); CUDA_CHECK_ERROR; // Copy data to 3D array @@ -205,39 +149,21 @@ CUDA_reconstruct_conebeam(int proj_size[3], // Run kernels. Note: Projection data is passed via texture memory, // transform matrix is passed via constant memory - if (GetCudaComputeCapability(device).first <= 1) - { - // Compute block and grid sizes - dim3 dimGrid = dim3(blocksInX, blocksInY * blocksInZ); - dim3 dimBlock = dim3(tBlock_x, tBlock_y, tBlock_z); - - // Bind the array of projections to a 3D texture - cudaBindTextureToArray(tex_proj_3D, (cudaArray *)array_proj, channelDesc); - CUDA_CHECK_ERROR; - kernel_fdk<<<dimGrid, dimBlock>>>(dev_vol_in, dev_vol_out, blocksInY); - - // Unbind the image and projection matrix textures - cudaUnbindTexture(tex_proj_3D); - CUDA_CHECK_ERROR; - } - else - { - // Compute block and grid sizes - dim3 dimGrid = dim3(blocksInX, blocksInY, blocksInZ); - dim3 dimBlock = dim3(tBlock_x, tBlock_y, tBlock_z); - CUDA_CHECK_ERROR; + // Compute block and grid sizes + dim3 dimGrid = dim3(blocksInX, blocksInY, blocksInZ); + dim3 dimBlock = dim3(tBlock_x, tBlock_y, tBlock_z); + CUDA_CHECK_ERROR; - // Bind the array of projections to a 2D layered texture - cudaBindTextureToArray(tex_proj, (cudaArray *)array_proj, channelDesc); - CUDA_CHECK_ERROR; + // Bind the array of projections to a 2D layered texture + cudaBindTextureToArray(tex_proj, (cudaArray *)array_proj, channelDesc); + CUDA_CHECK_ERROR; - kernel_fdk_3Dgrid<<<dimGrid, dimBlock>>>(dev_vol_in, dev_vol_out); + kernel_fdk_3Dgrid<<<dimGrid, dimBlock>>>(dev_vol_in, dev_vol_out); - // Unbind the image and projection matrix textures - cudaUnbindTexture(tex_proj); - CUDA_CHECK_ERROR; - } + // Unbind the image and projection matrix textures + cudaUnbindTexture(tex_proj); + CUDA_CHECK_ERROR; // Cleanup cudaFreeArray((cudaArray *)array_proj); diff --git a/src/rtkCudaWarpBackProjectionImageFilter.cu b/src/rtkCudaWarpBackProjectionImageFilter.cu index d2158de42..333b6dfba 100644 --- a/src/rtkCudaWarpBackProjectionImageFilter.cu +++ b/src/rtkCudaWarpBackProjectionImageFilter.cu @@ -42,8 +42,7 @@ #include <cuda_runtime.h> // T E X T U R E S //////////////////////////////////////////////////////// -texture<float, cudaTextureType2DLayered> tex_proj; -texture<float, 3, cudaReadModeElementType> tex_proj_3D; +texture<float, cudaTextureType2DLayered> tex_proj; texture<float, 3, cudaReadModeElementType> tex_xdvf; texture<float, 3, cudaReadModeElementType> tex_ydvf; @@ -66,131 +65,6 @@ __constant__ float c_IndexInputToPPInputMatrix[12]; //_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_( S T A R T )_ //_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_ -__global__ void -kernel_warp_back_project(float * dev_vol_in, float * dev_vol_out, unsigned int Blocks_Y) -{ - // CUDA 2.0 does not allow for a 3D grid, which severely - // limits the manipulation of large 3D arrays of data. The - // following code is a hack to bypass this implementation - // limitation. - unsigned int blockIdx_z = blockIdx.y / Blocks_Y; - unsigned int blockIdx_y = blockIdx.y - __umul24(blockIdx_z, Blocks_Y); - unsigned int i = __umul24(blockIdx.x, blockDim.x) + threadIdx.x; - unsigned int j = __umul24(blockIdx_y, blockDim.y) + threadIdx.y; - unsigned int k = __umul24(blockIdx_z, blockDim.z) + threadIdx.z; - - if (i >= c_volSize.x || j >= c_volSize.y || k >= c_volSize.z) - { - return; - } - - // Index row major into the volume - long int vol_idx = i + (j + k * c_volSize.y) * (c_volSize.x); - - float3 IndexInDVF, Displacement, PP, IndexInInput, ip; - float voxel_data = 0; - - for (unsigned int proj = 0; proj < c_projSize.z; proj++) - { - // Compute the index in the DVF - IndexInDVF = matrix_multiply(make_float3(i, j, k), c_IndexInputToIndexDVFMatrix); - - // Get each component of the displacement vector by interpolation in the DVF - Displacement.x = tex3D(tex_xdvf, IndexInDVF.x + 0.5f, IndexInDVF.y + 0.5f, IndexInDVF.z + 0.5f); - Displacement.y = tex3D(tex_ydvf, IndexInDVF.x + 0.5f, IndexInDVF.y + 0.5f, IndexInDVF.z + 0.5f); - Displacement.z = tex3D(tex_zdvf, IndexInDVF.x + 0.5f, IndexInDVF.y + 0.5f, IndexInDVF.z + 0.5f); - - // Compute the physical point in input + the displacement vector - PP = matrix_multiply(make_float3(i, j, k), c_IndexInputToPPInputMatrix) + Displacement; - - // Convert it to a continuous index - IndexInInput = matrix_multiply(PP, c_PPInputToIndexInputMatrix); - - // Project the voxel onto the detector to find out which value to add to it - ip = matrix_multiply(IndexInInput, &(c_matrices[12 * proj])); - ; - - // Change coordinate systems - ip.z = 1 / ip.z; - ip.x = ip.x * ip.z; - ip.y = ip.y * ip.z; - - // Get texture point, clip left to GPU - voxel_data += tex3D(tex_proj_3D, ip.x, ip.y, proj + 0.5); - } - - // Place it into the volume - dev_vol_out[vol_idx] = dev_vol_in[vol_idx] + voxel_data; -} - -__global__ void -kernel_warp_back_project_cylindrical_detector(float * dev_vol_in, - float * dev_vol_out, - unsigned int Blocks_Y, - float radius) -{ - // CUDA 2.0 does not allow for a 3D grid, which severely - // limits the manipulation of large 3D arrays of data. The - // following code is a hack to bypass this implementation - // limitation. - unsigned int blockIdx_z = blockIdx.y / Blocks_Y; - unsigned int blockIdx_y = blockIdx.y - __umul24(blockIdx_z, Blocks_Y); - unsigned int i = __umul24(blockIdx.x, blockDim.x) + threadIdx.x; - unsigned int j = __umul24(blockIdx_y, blockDim.y) + threadIdx.y; - unsigned int k = __umul24(blockIdx_z, blockDim.z) + threadIdx.z; - - if (i >= c_volSize.x || j >= c_volSize.y || k >= c_volSize.z) - { - return; - } - - // Index row major into the volume - long int vol_idx = i + (j + k * c_volSize.y) * (c_volSize.x); - - float3 IndexInDVF, Displacement, PP, IndexInInput, ip, pp; - float voxel_data = 0; - - for (unsigned int proj = 0; proj < c_projSize.z; proj++) - { - // Compute the index in the DVF - IndexInDVF = matrix_multiply(make_float3(i, j, k), c_IndexInputToIndexDVFMatrix); - - // Get each component of the displacement vector by interpolation in the DVF - Displacement.x = tex3D(tex_xdvf, IndexInDVF.x + 0.5f, IndexInDVF.y + 0.5f, IndexInDVF.z + 0.5f); - Displacement.y = tex3D(tex_ydvf, IndexInDVF.x + 0.5f, IndexInDVF.y + 0.5f, IndexInDVF.z + 0.5f); - Displacement.z = tex3D(tex_zdvf, IndexInDVF.x + 0.5f, IndexInDVF.y + 0.5f, IndexInDVF.z + 0.5f); - - // Compute the physical point in input + the displacement vector - PP = matrix_multiply(make_float3(i, j, k), c_IndexInputToPPInputMatrix) + Displacement; - - // Convert it to a continuous index - IndexInInput = matrix_multiply(PP, c_PPInputToIndexInputMatrix); - - // Project the voxel onto the detector to find out which value to add to it - pp = matrix_multiply(IndexInInput, &(c_volIndexToProjPP[12 * proj])); - - // Change coordinate systems - pp.z = 1 / pp.z; - pp.x = pp.x * pp.z; - pp.y = pp.y * pp.z; - - // Apply correction for cylindrical detector - const float u = pp.x; - pp.x = radius * atan2(u, radius); - pp.y = pp.y * radius / sqrt(radius * radius + u * u); - - // Get projection index - ip.x = c_projPPToProjIndex[0] * pp.x + c_projPPToProjIndex[1] * pp.y + c_projPPToProjIndex[2]; - ip.y = c_projPPToProjIndex[3] * pp.x + c_projPPToProjIndex[4] * pp.y + c_projPPToProjIndex[5]; - - // Get texture point, clip left to GPU - voxel_data += tex3D(tex_proj_3D, ip.x, ip.y, proj + 0.5); - } - - // Place it into the volume - dev_vol_out[vol_idx] = dev_vol_in[vol_idx] + voxel_data; -} - __global__ void kernel_warp_back_project_3Dgrid(float * dev_vol_in, float * dev_vol_out) { @@ -227,7 +101,6 @@ kernel_warp_back_project_3Dgrid(float * dev_vol_in, float * dev_vol_out) // Project the voxel onto the detector to find out which value to add to it ip = matrix_multiply(IndexInInput, &(c_matrices[12 * proj])); - ; // Change coordinate systems ip.z = 1 / ip.z; @@ -346,12 +219,6 @@ CUDA_warp_back_project(int projSize[3], tex_proj.filterMode = cudaFilterModeLinear; tex_proj.normalized = false; // don't access with normalized texture coords - tex_proj_3D.addressMode[0] = cudaAddressModeBorder; - tex_proj_3D.addressMode[1] = cudaAddressModeBorder; - tex_proj_3D.addressMode[2] = cudaAddressModeBorder; - tex_proj_3D.filterMode = cudaFilterModeLinear; - tex_proj_3D.normalized = false; // don't access with normalized texture coords - // Copy projection data to array, bind the array to the texture cudaExtent projExtent = make_cudaExtent(projSize[0], projSize[1], projSize[2]); cudaArray * array_proj; @@ -359,12 +226,8 @@ CUDA_warp_back_project(int projSize[3], CUDA_CHECK_ERROR; // Allocate array for input projections, in order to bind them to - // either a 2D layered texture (requires GetCudaComputeCapability >= 2.0) or - // a 3D texture - if (GetCudaComputeCapability(device).first <= 1) - cudaMalloc3DArray((cudaArray **)&array_proj, &channelDesc, projExtent); - else - cudaMalloc3DArray((cudaArray **)&array_proj, &channelDesc, projExtent, cudaArrayLayered); + // a 2D layered texture + cudaMalloc3DArray((cudaArray **)&array_proj, &channelDesc, projExtent, cudaArrayLayered); CUDA_CHECK_ERROR; // Copy data to 3D array @@ -460,49 +323,27 @@ CUDA_warp_back_project(int projSize[3], // Run kernels. Note: Projection data is passed via texture memory, // transform matrix is passed via constant memory - if (GetCudaComputeCapability(device).first <= 1) - { - // Compute block and grid sizes - dim3 dimGrid = dim3(blocksInX, blocksInY * blocksInZ); - dim3 dimBlock = dim3(tBlock_x, tBlock_y, tBlock_z); - // Bind the array of projections to a 3D texture - cudaBindTextureToArray(tex_proj_3D, (cudaArray *)array_proj, channelDesc); - CUDA_CHECK_ERROR; + // Compute block and grid sizes + dim3 dimGrid = dim3(blocksInX, blocksInY, blocksInZ); + dim3 dimBlock = dim3(tBlock_x, tBlock_y, tBlock_z); + CUDA_CHECK_ERROR; - if (radiusCylindricalDetector == 0) - kernel_warp_back_project<<<dimGrid, dimBlock>>>(dev_vol_in, dev_vol_out, blocksInY); - else - kernel_warp_back_project_cylindrical_detector<<<dimGrid, dimBlock>>>( - dev_vol_in, dev_vol_out, blocksInY, (float)radiusCylindricalDetector); + // Bind the array of projections to a 2D layered texture + cudaBindTextureToArray(tex_proj, (cudaArray *)array_proj, channelDesc); + CUDA_CHECK_ERROR; - // Unbind the image and projection matrix textures - cudaUnbindTexture(tex_proj_3D); - CUDA_CHECK_ERROR; - } + // Note: cbi->img is passed via texture memory + // Matrices are passed via constant memory + //------------------------------------- + if (radiusCylindricalDetector == 0) + kernel_warp_back_project_3Dgrid<<<dimGrid, dimBlock>>>(dev_vol_in, dev_vol_out); else - { - // Compute block and grid sizes - dim3 dimGrid = dim3(blocksInX, blocksInY, blocksInZ); - dim3 dimBlock = dim3(tBlock_x, tBlock_y, tBlock_z); - CUDA_CHECK_ERROR; - - // Bind the array of projections to a 2D layered texture - cudaBindTextureToArray(tex_proj, (cudaArray *)array_proj, channelDesc); - CUDA_CHECK_ERROR; - - // Note: cbi->img is passed via texture memory - // Matrices are passed via constant memory - //------------------------------------- - if (radiusCylindricalDetector == 0) - kernel_warp_back_project_3Dgrid<<<dimGrid, dimBlock>>>(dev_vol_in, dev_vol_out); - else - kernel_warp_back_project_3Dgrid_cylindrical_detector<<<dimGrid, dimBlock>>>( - dev_vol_in, dev_vol_out, (float)radiusCylindricalDetector); - // Unbind the image and projection matrix textures - cudaUnbindTexture(tex_proj); - CUDA_CHECK_ERROR; - } + kernel_warp_back_project_3Dgrid_cylindrical_detector<<<dimGrid, dimBlock>>>( + dev_vol_in, dev_vol_out, (float)radiusCylindricalDetector); + // Unbind the image and projection matrix textures + cudaUnbindTexture(tex_proj); + CUDA_CHECK_ERROR; // Unbind the image and projection matrix textures cudaUnbindTexture(tex_xdvf); diff --git a/src/rtkCudaWarpImageFilter.cu b/src/rtkCudaWarpImageFilter.cu index 8ce5c2c0d..132cbe9cb 100644 --- a/src/rtkCudaWarpImageFilter.cu +++ b/src/rtkCudaWarpImageFilter.cu @@ -63,51 +63,6 @@ __constant__ float c_PPInputToIndexInputMatrix[12]; //_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_( S T A R T )_ //_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_ -__global__ void -kernel(float * dev_vol_out, int3 vol_dim, unsigned int Blocks_Y) -{ - // CUDA 2.0 does not allow for a 3D grid, which severely - // limits the manipulation of large 3D arrays of data. The - // following code is a hack to bypass this implementation - // limitation. - unsigned int blockIdx_z = blockIdx.y / Blocks_Y; - unsigned int blockIdx_y = blockIdx.y - __umul24(blockIdx_z, Blocks_Y); - unsigned int i = __umul24(blockIdx.x, blockDim.x) + threadIdx.x; - unsigned int j = __umul24(blockIdx_y, blockDim.y) + threadIdx.y; - unsigned int k = __umul24(blockIdx_z, blockDim.z) + threadIdx.z; - - if (i >= vol_dim.x || j >= vol_dim.y || k >= vol_dim.z) - { - return; - } - - // Index row major into the volume - long int vol_idx = i + (j + k * vol_dim.y) * (vol_dim.x); - - // Matrix multiply to get the index in the DVF texture of the current point in the output volume - float3 IndexInDVF = matrix_multiply(make_float3(i, j, k), c_IndexOutputToIndexDVFMatrix); - - // Get each component of the displacement vector by - // interpolation in the dvf - float3 Displacement; - Displacement.x = tex3D(tex_xdvf, IndexInDVF.x + 0.5f, IndexInDVF.y + 0.5f, IndexInDVF.z + 0.5f); - Displacement.y = tex3D(tex_ydvf, IndexInDVF.x + 0.5f, IndexInDVF.y + 0.5f, IndexInDVF.z + 0.5f); - Displacement.z = tex3D(tex_zdvf, IndexInDVF.x + 0.5f, IndexInDVF.y + 0.5f, IndexInDVF.z + 0.5f); - - // Matrix multiply to get the physical coordinates of the current point in the output volume - float3 PPinOutput = matrix_multiply(make_float3(i, j, k), c_IndexOutputToPPOutputMatrix); - - // Get the index corresponding to the current physical point in output displaced by the displacement vector - float3 PPDisplaced; - PPDisplaced.x = PPinOutput.x + Displacement.x; - PPDisplaced.y = PPinOutput.y + Displacement.y; - PPDisplaced.z = PPinOutput.z + Displacement.z; - float3 IndexInInput = matrix_multiply(PPDisplaced, c_PPInputToIndexInputMatrix); - - // Interpolate in the input and copy into the output - dev_vol_out[vol_idx] = tex3D(tex_input_vol, IndexInInput.x + 0.5f, IndexInInput.y + 0.5f, IndexInInput.z + 0.5f); -} - __global__ void kernel_3Dgrid(float * dev_vol_out, int3 vol_dim) { @@ -165,7 +120,6 @@ CUDA_warp(int input_vol_dim[3], float * dev_DVF, bool isLinear) { - // Prepare channel description for arrays static cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc<float>(); @@ -301,26 +255,13 @@ CUDA_warp(int input_vol_dim[3], unsigned int blocksInY = (output_vol_dim[1] - 1) / tBlock_y + 1; unsigned int blocksInZ = (output_vol_dim[2] - 1) / tBlock_z + 1; - if (GetCudaComputeCapability(device).first <= 1) - { - dim3 dimGrid = dim3(blocksInX, blocksInY * blocksInZ); - dim3 dimBlock = dim3(tBlock_x, tBlock_y, tBlock_z); + dim3 dimGrid = dim3(blocksInX, blocksInY, blocksInZ); + dim3 dimBlock = dim3(tBlock_x, tBlock_y, tBlock_z); - // Note: the DVF and input image are passed via texture memory - //------------------------------------- - kernel<<<dimGrid, dimBlock>>>( - dev_output_vol, make_int3(output_vol_dim[0], output_vol_dim[1], output_vol_dim[2]), blocksInY); - } - else - { - dim3 dimGrid = dim3(blocksInX, blocksInY, blocksInZ); - dim3 dimBlock = dim3(tBlock_x, tBlock_y, tBlock_z); - - // Note: the DVF and input image are passed via texture memory - //------------------------------------- - kernel_3Dgrid<<<dimGrid, dimBlock>>>(dev_output_vol, - make_int3(output_vol_dim[0], output_vol_dim[1], output_vol_dim[2])); - } + // Note: the DVF and input image are passed via texture memory + //------------------------------------- + kernel_3Dgrid<<<dimGrid, dimBlock>>>(dev_output_vol, + make_int3(output_vol_dim[0], output_vol_dim[1], output_vol_dim[2])); CUDA_CHECK_ERROR; From fab2e3fe43f09780665ef89b852ee50a1063fc04 Mon Sep 17 00:00:00 2001 From: Simon Rit <simon.rit@creatis.insa-lyon.fr> Date: Thu, 18 Apr 2024 16:29:59 +0200 Subject: [PATCH 3/5] STYLE: Refactor function for preparing texture objects for vector images --- include/rtkCudaUtilities.hcu | 14 ++++---- src/rtkCudaBackProjectionImageFilter.cu | 17 ++++------ src/rtkCudaForwardProjectionImageFilter.cu | 39 ++++++++++------------ src/rtkCudaUtilities.cu | 21 +++++++----- 4 files changed, 45 insertions(+), 46 deletions(-) diff --git a/include/rtkCudaUtilities.hcu b/include/rtkCudaUtilities.hcu index 6cf2b6090..612a8fda6 100644 --- a/include/rtkCudaUtilities.hcu +++ b/include/rtkCudaUtilities.hcu @@ -126,13 +126,15 @@ inline __host__ __device__ float { return u.x * v.x + u.y * v.y + u.z * v.z; } + __host__ void -prepareTextureObject(int size[3], - float * dev_ptr, - cudaArray **& componentArrays, - unsigned int nComponents, - cudaTextureObject_t * tex, - bool isProjections); +prepareVectorTextureObject(int size[3], + const float * dev_ptr, + std::vector<cudaArray *> & componentArrays, + const unsigned int nComponents, + std::vector<cudaTextureObject_t> & tex, + bool isProjections); + inline __device__ void matrix_matrix_multiply(float * A, float * B, float * C, unsigned int rowsA, unsigned int colsB, unsigned int colsArowsB) diff --git a/src/rtkCudaBackProjectionImageFilter.cu b/src/rtkCudaBackProjectionImageFilter.cu index 577a533c4..5ffd4faac 100644 --- a/src/rtkCudaBackProjectionImageFilter.cu +++ b/src/rtkCudaBackProjectionImageFilter.cu @@ -161,18 +161,15 @@ CUDA_back_project(int projSize[3], dim3 dimBlock = dim3(tBlock_x, tBlock_y, tBlock_z); CUDA_CHECK_ERROR; - cudaArray ** projComponentArrays = new cudaArray *[vectorLength]; - - // Create an array of textures - cudaTextureObject_t * tex_proj = new cudaTextureObject_t[vectorLength]; - // Prepare texture objects (needs an array of cudaTextureObjects on the host as "tex_proj" argument) - prepareTextureObject(projSize, dev_proj, projComponentArrays, vectorLength, tex_proj, true); + std::vector<cudaArray *> projComponentArrays; + std::vector<cudaTextureObject_t> tex_proj; + prepareVectorTextureObject(projSize, dev_proj, projComponentArrays, vectorLength, tex_proj, true); // Copy them to a device pointer, since it will have to be de-referenced in the kernels cudaTextureObject_t * dev_tex_proj; cudaMalloc(&dev_tex_proj, vectorLength * sizeof(cudaTextureObject_t)); - cudaMemcpy(dev_tex_proj, tex_proj, vectorLength * sizeof(cudaTextureObject_t), cudaMemcpyHostToDevice); + cudaMemcpy(dev_tex_proj, tex_proj.data(), vectorLength * sizeof(cudaTextureObject_t), cudaMemcpyHostToDevice); // Run the kernel. Since "vectorLength" is passed as a function argument, not as a template argument, // the compiler can't assume it's constant, and a dirty trick has to be used. @@ -237,11 +234,11 @@ CUDA_back_project(int projSize[3], // Cleanup for (unsigned int c = 0; c < vectorLength; c++) { - cudaFreeArray((cudaArray *)projComponentArrays[c]); + cudaFreeArray(projComponentArrays[c]); + CUDA_CHECK_ERROR; cudaDestroyTextureObject(tex_proj[c]); + CUDA_CHECK_ERROR; } cudaFree(dev_tex_proj); - delete[] tex_proj; - delete[] projComponentArrays; CUDA_CHECK_ERROR; } diff --git a/src/rtkCudaForwardProjectionImageFilter.cu b/src/rtkCudaForwardProjectionImageFilter.cu index 9b14365c7..3940d0f0d 100644 --- a/src/rtkCudaForwardProjectionImageFilter.cu +++ b/src/rtkCudaForwardProjectionImageFilter.cu @@ -60,7 +60,7 @@ __constant__ float c_sourcePos[SLAB_SIZE * 3]; // Can process stacks of at most //_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_ // KERNEL kernel_forwardProject -template <unsigned int vectorLength> +template <unsigned int VVectorLength> __global__ void kernel_forwardProject(float * dev_proj_in, float * dev_proj_out, cudaTextureObject_t * dev_tex_vol) { @@ -103,8 +103,8 @@ kernel_forwardProject(float * dev_proj_in, float * dev_proj_out, cudaTextureObje // Detect intersection with box if (!intersectBox(ray, &tnear, &tfar, c_boxMin, c_boxMax) || tfar < 0.f) { - for (unsigned int c = 0; c < vectorLength; c++) - dev_proj_out[projOffset * vectorLength + c] = dev_proj_in[projOffset * vectorLength + c]; + for (unsigned int c = 0; c < VVectorLength; c++) + dev_proj_out[projOffset * VVectorLength + c] = dev_proj_in[projOffset * VVectorLength + c]; } else { @@ -123,9 +123,9 @@ kernel_forwardProject(float * dev_proj_in, float * dev_proj_out, cudaTextureObje pos = ray.o + tnear * ray.d; float t; - float sample[vectorLength]; - float sum[vectorLength]; - for (unsigned int c = 0; c < vectorLength; c++) + float sample[VVectorLength]; + float sum[VVectorLength]; + for (unsigned int c = 0; c < VVectorLength; c++) { sample[c] = 0.0f; sum[c] = 0.0f; @@ -134,11 +134,11 @@ kernel_forwardProject(float * dev_proj_in, float * dev_proj_out, cudaTextureObje for (t = tnear; t <= tfar; t += vStep) { // Read from 3D texture from volume(s) - for (unsigned int c = 0; c < vectorLength; c++) + for (unsigned int c = 0; c < VVectorLength; c++) sample[c] = tex3D<float>(dev_tex_vol[c], pos.x, pos.y, pos.z); // Accumulate - for (unsigned int c = 0; c < vectorLength; c++) + for (unsigned int c = 0; c < VVectorLength; c++) sum[c] += sample[c]; // Step forward @@ -146,9 +146,9 @@ kernel_forwardProject(float * dev_proj_in, float * dev_proj_out, cudaTextureObje } // Update the output projection pixels - for (unsigned int c = 0; c < vectorLength; c++) - dev_proj_out[projOffset * vectorLength + c] = - dev_proj_in[projOffset * vectorLength + c] + (sum[c] + (tfar - t + halfVStep) / vStep * sample[c]) * c_tStep; + for (unsigned int c = 0; c < VVectorLength; c++) + dev_proj_out[projOffset * VVectorLength + c] = + dev_proj_in[projOffset * VVectorLength + c] + (sum[c] + (tfar - t + halfVStep) / vStep * sample[c]) * c_tStep; } } } @@ -198,17 +198,15 @@ CUDA_forward_project(int projSize[3], cudaMemcpyToSymbol( c_translatedVolumeTransformMatrices, &(translatedVolumeTransformMatrices[0]), 12 * sizeof(float) * projSize[2]); - // Create an array of textures - cudaTextureObject_t * tex_vol = new cudaTextureObject_t[vectorLength]; - cudaArray ** volComponentArrays = new cudaArray *[vectorLength]; - - // Prepare texture objects (needs an array of cudaTextureObjects on the host as "tex_vol" argument) - prepareTextureObject(volSize, dev_vol, volComponentArrays, vectorLength, tex_vol, false); + // Prepare texture objects + std::vector<cudaArray *> volComponentArrays; + std::vector<cudaTextureObject_t> tex_vol; + prepareVectorTextureObject(volSize, dev_vol, volComponentArrays, vectorLength, tex_vol, false); // Copy them to a device pointer, since it will have to be de-referenced in the kernels cudaTextureObject_t * dev_tex_vol; cudaMalloc(&dev_tex_vol, vectorLength * sizeof(cudaTextureObject_t)); - cudaMemcpy(dev_tex_vol, tex_vol, vectorLength * sizeof(cudaTextureObject_t), cudaMemcpyHostToDevice); + cudaMemcpy(dev_tex_vol, tex_vol.data(), vectorLength * sizeof(cudaTextureObject_t), cudaMemcpyHostToDevice); // Run the kernel. Since "vectorLength" is passed as a function argument, not as a template argument, // the compiler can't assume it's constant, and a dirty trick has to be used. @@ -239,11 +237,10 @@ CUDA_forward_project(int projSize[3], for (unsigned int c = 0; c < vectorLength; c++) { cudaFreeArray((cudaArray *)volComponentArrays[c]); + CUDA_CHECK_ERROR; cudaDestroyTextureObject(tex_vol[c]); + CUDA_CHECK_ERROR; } cudaFree(dev_tex_vol); - delete[] volComponentArrays; CUDA_CHECK_ERROR; - - delete[] tex_vol; } diff --git a/src/rtkCudaUtilities.cu b/src/rtkCudaUtilities.cu index 4f11e45db..4f3938d05 100644 --- a/src/rtkCudaUtilities.cu +++ b/src/rtkCudaUtilities.cu @@ -77,13 +77,16 @@ GetFreeGPUGlobalMemory(int device) } __host__ void -prepareTextureObject(int size[3], - float * dev_ptr, - cudaArray **& componentArrays, - unsigned int nComponents, - cudaTextureObject_t * tex, - bool isProjections) +prepareVectorTextureObject(int size[3], + const float * dev_ptr, + std::vector<cudaArray *> & componentArrays, + const unsigned int nComponents, + std::vector<cudaTextureObject_t> & tex, + bool isProjections) { + componentArrays.resize(nComponents); + tex.resize(nComponents); + // Create CUBLAS context cublasHandle_t handle; cublasCreate(&handle); @@ -129,15 +132,15 @@ prepareTextureObject(int size[3], // Allocate the cudaArray. Projections use layered arrays, volumes use default 3D arrays if (isProjections) - cudaMalloc3DArray((cudaArray **)&componentArrays[component], &channelDesc, volExtent, cudaArrayLayered); + cudaMalloc3DArray(&componentArrays[component], &channelDesc, volExtent, cudaArrayLayered); else - cudaMalloc3DArray((cudaArray **)&componentArrays[component], &channelDesc, volExtent); + cudaMalloc3DArray(&componentArrays[component], &channelDesc, volExtent); CUDA_CHECK_ERROR; // Fill it with the current singleComponent cudaMemcpy3DParms CopyParams = cudaMemcpy3DParms(); CopyParams.srcPtr = make_cudaPitchedPtr(singleComponent, size[0] * sizeof(float), size[0], size[1]); - CopyParams.dstArray = (cudaArray *)componentArrays[component]; + CopyParams.dstArray = componentArrays[component]; CopyParams.extent = volExtent; CopyParams.kind = cudaMemcpyDeviceToDevice; cudaMemcpy3D(&CopyParams); From 0d88a07bab57a87f3ebb1afe027571c148ef51f4 Mon Sep 17 00:00:00 2001 From: Simon Rit <simon.rit@creatis.insa-lyon.fr> Date: Fri, 19 Apr 2024 21:19:05 +0200 Subject: [PATCH 4/5] ENH: Replace texture references by objects for CUDA 12 compatibility --- .../rtkCudaDisplacedDetectorImageFilter.hcu | 2 +- include/rtkCudaFDKWeightProjectionFilter.hcu | 2 +- include/rtkCudaParkerShortScanImageFilter.hcu | 2 +- include/rtkCudaUtilities.hcu | 18 +- src/rtkCudaDisplacedDetectorImageFilter.cu | 54 ++-- src/rtkCudaFDKBackProjectionImageFilter.cu | 50 +--- src/rtkCudaFDKWeightProjectionFilter.cu | 55 ++-- src/rtkCudaForwardProjectionImageFilter.cu | 3 +- src/rtkCudaForwardWarpImageFilter.cu | 251 +++++------------- src/rtkCudaParkerShortScanImageFilter.cu | 56 ++-- src/rtkCudaUtilities.cu | 107 +++++++- src/rtkCudaWarpBackProjectionImageFilter.cu | 176 +++--------- ...rtkCudaWarpForwardProjectionImageFilter.cu | 152 +++-------- src/rtkCudaWarpImageFilter.cu | 186 +++---------- ...rtkCudaWeidingerForwardModelImageFilter.cu | 1 - 15 files changed, 374 insertions(+), 741 deletions(-) diff --git a/include/rtkCudaDisplacedDetectorImageFilter.hcu b/include/rtkCudaDisplacedDetectorImageFilter.hcu index b115c9ac1..8c50a12ed 100644 --- a/include/rtkCudaDisplacedDetectorImageFilter.hcu +++ b/include/rtkCudaDisplacedDetectorImageFilter.hcu @@ -28,7 +28,7 @@ CUDA_displaced_weight(int proj_idx_in[3], int proj_dim_out_buf[2], float * dev_proj_in, float * dev_proj_out, - float * geometries, + float * geometry, float theta, bool isPositiveCase, float proj_orig, diff --git a/include/rtkCudaFDKWeightProjectionFilter.hcu b/include/rtkCudaFDKWeightProjectionFilter.hcu index f6284a255..353230ce1 100644 --- a/include/rtkCudaFDKWeightProjectionFilter.hcu +++ b/include/rtkCudaFDKWeightProjectionFilter.hcu @@ -26,7 +26,7 @@ CUDA_weight_projection(int proj_idx[2], int proj_dim_buf_out[2], float * dev_proj_in, float * dev_proj_out, - float * geometries, + float * geometry, float proj_orig[2], float proj_row[2], float proj_col[2]); diff --git a/include/rtkCudaParkerShortScanImageFilter.hcu b/include/rtkCudaParkerShortScanImageFilter.hcu index e39e8783f..aca4c24d1 100644 --- a/include/rtkCudaParkerShortScanImageFilter.hcu +++ b/include/rtkCudaParkerShortScanImageFilter.hcu @@ -26,7 +26,7 @@ CUDA_parker_weight(int proj_idx[2], int proj_dim_buf_out[2], float * dev_proj_in, float * dev_proj_out, - float * geometries, + float * geometry, float delta, float firstAngle, float proj_orig, diff --git a/include/rtkCudaUtilities.hcu b/include/rtkCudaUtilities.hcu index 612a8fda6..737da666e 100644 --- a/include/rtkCudaUtilities.hcu +++ b/include/rtkCudaUtilities.hcu @@ -127,14 +127,30 @@ inline __host__ __device__ float return u.x * v.x + u.y * v.y + u.z * v.z; } +__host__ void +prepareScalarTextureObject(int size[3], + float * dev_ptr, + cudaArray *& threeDArray, + cudaTextureObject_t & tex, + const bool isProjections, + const bool isLinear = true, + const cudaTextureAddressMode texAddressMode = cudaAddressModeBorder); + __host__ void prepareVectorTextureObject(int size[3], const float * dev_ptr, std::vector<cudaArray *> & componentArrays, const unsigned int nComponents, std::vector<cudaTextureObject_t> & tex, - bool isProjections); + const bool isProjections, + const cudaTextureAddressMode texAddressMode = cudaAddressModeBorder); +__host__ void +prepareGeometryTextureObject(int size, + const float * geometry, + float *& dev_geom, + cudaTextureObject_t & tex_geom, + const unsigned int nParam); inline __device__ void matrix_matrix_multiply(float * A, float * B, float * C, unsigned int rowsA, unsigned int colsB, unsigned int colsArowsB) diff --git a/src/rtkCudaDisplacedDetectorImageFilter.cu b/src/rtkCudaDisplacedDetectorImageFilter.cu index 0d579c79c..b888487b3 100644 --- a/src/rtkCudaDisplacedDetectorImageFilter.cu +++ b/src/rtkCudaDisplacedDetectorImageFilter.cu @@ -22,8 +22,6 @@ #include <cuda_runtime.h> #include <math_constants.h> -texture<float, 1, cudaReadModeElementType> tex_geometry; // geometry texture - inline __device__ float TransformIndexToPhysicalPoint(int2 idx, float origin, float row, float column) { @@ -45,19 +43,20 @@ ToUntiltedCoordinateAtIsocenter(float tiltedCoord, float sdd, float sid, float s } __global__ void -kernel_displaced_weight(int3 proj_idx_in, - int3 proj_size_in, - int2 proj_size_in_buf, - int3 proj_idx_out, - int3 proj_size_out, - int2 proj_size_out_buf, - float * dev_proj_in, - float * dev_proj_out, - float theta, - bool isPositiveCase, - float proj_orig, // projection origin - float proj_row, // projection row direction & spacing - float proj_col // projection col direction & spacing +kernel_displaced_weight(int3 proj_idx_in, + int3 proj_size_in, + int2 proj_size_in_buf, + int3 proj_idx_out, + int3 proj_size_out, + int2 proj_size_out_buf, + float * dev_proj_in, + float * dev_proj_out, + float theta, + bool isPositiveCase, + float proj_orig, // projection origin + float proj_row, // projection row direction & spacing + float proj_col, // projection col direction & spacing + cudaTextureObject_t tex_geom // geometry texture object ) { // compute thread index @@ -90,10 +89,10 @@ kernel_displaced_weight(int3 proj_idx_in, { float pPoint = TransformIndexToPhysicalPoint(make_int2(pIdx.x, pIdx.y), proj_orig, proj_row, proj_col); - float sdd = tex1Dfetch(tex_geometry, tIdx.z * 4 + 0); - float sx = tex1Dfetch(tex_geometry, tIdx.z * 4 + 1); - float px = tex1Dfetch(tex_geometry, tIdx.z * 4 + 2); - float sid = tex1Dfetch(tex_geometry, tIdx.z * 4 + 3); + float sdd = tex1Dfetch<float>(tex_geom, tIdx.z * 4 + 0); + float sx = tex1Dfetch<float>(tex_geom, tIdx.z * 4 + 1); + float px = tex1Dfetch<float>(tex_geom, tIdx.z * 4 + 2); + float sid = tex1Dfetch<float>(tex_geom, tIdx.z * 4 + 3); float hyp = sqrtf(sid * sid + sx * sx); // to untilted situation float l = ToUntiltedCoordinateAtIsocenter(pPoint, sdd, sid, sx, px, hyp); @@ -139,7 +138,7 @@ CUDA_displaced_weight(int proj_idx_in[3], // overlapping input region i int proj_dim_out_buf[2], // output size of buffered region float * dev_proj_in, float * dev_proj_out, - float * geometries, + float * geometry, float theta, bool isPositiveCase, float proj_orig, @@ -147,13 +146,9 @@ CUDA_displaced_weight(int proj_idx_in[3], // overlapping input region i float proj_col) { // copy geometry matrix to device, bind the matrix to the texture - float * dev_geom; - cudaMalloc((void **)&dev_geom, proj_dim_out[2] * 4 * sizeof(float)); - CUDA_CHECK_ERROR; - cudaMemcpy(dev_geom, geometries, proj_dim_out[2] * 4 * sizeof(float), cudaMemcpyHostToDevice); - CUDA_CHECK_ERROR; - cudaBindTexture(0, tex_geometry, dev_geom, proj_dim_out[2] * 4 * sizeof(float)); - CUDA_CHECK_ERROR; + float * dev_geom; + cudaTextureObject_t tex_geom; + prepareGeometryTextureObject(proj_dim_out[2], geometry, dev_geom, tex_geom, 4); // Thread Block Dimensions int tBlock_x = 16; @@ -179,10 +174,11 @@ CUDA_displaced_weight(int proj_idx_in[3], // overlapping input region i isPositiveCase, proj_orig, proj_row, - proj_col); + proj_col, + tex_geom); // Unbind matrix texture - cudaUnbindTexture(tex_geometry); + cudaDestroyTextureObject(tex_geom); CUDA_CHECK_ERROR; cudaFree(dev_geom); CUDA_CHECK_ERROR; diff --git a/src/rtkCudaFDKBackProjectionImageFilter.cu b/src/rtkCudaFDKBackProjectionImageFilter.cu index 4a9d45601..417e7544f 100644 --- a/src/rtkCudaFDKBackProjectionImageFilter.cu +++ b/src/rtkCudaFDKBackProjectionImageFilter.cu @@ -39,9 +39,6 @@ *****************/ #include <cuda.h> -// T E X T U R E S //////////////////////////////////////////////////////// -texture<float, cudaTextureType2DLayered> tex_proj; - // Constant memory __constant__ float c_matrices[SLAB_SIZE * 12]; // Can process stacks of at most SLAB_SIZE projections __constant__ int3 c_projSize; @@ -53,7 +50,7 @@ __constant__ int3 c_vol_size; //_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_ __global__ void -kernel_fdk_3Dgrid(float * dev_vol_in, float * dev_vol_out) +kernel_fdk_3Dgrid(float * dev_vol_in, float * dev_vol_out, cudaTextureObject_t tex_proj) { itk::SizeValueType i = __umul24(blockIdx.x, blockDim.x) + threadIdx.x; itk::SizeValueType j = __umul24(blockIdx.y, blockDim.y) + threadIdx.y; @@ -81,7 +78,7 @@ kernel_fdk_3Dgrid(float * dev_vol_in, float * dev_vol_out) ip.y = ip.y * ip.z; // Get texture point, clip left to GPU, and accumulate in voxel_data - voxel_data += tex2DLayered(tex_proj, ip.x, ip.y, proj) * ip.z * ip.z; + voxel_data += tex2DLayered<float>(tex_proj, ip.x, ip.y, proj) * ip.z * ip.z; } // Place it into the volume @@ -110,33 +107,6 @@ CUDA_reconstruct_conebeam(int proj_size[3], // Copy the projection matrices into constant memory cudaMemcpyToSymbol(c_matrices, &(matrices[0]), 12 * sizeof(float) * proj_size[2]); - // set texture parameters - tex_proj.addressMode[0] = cudaAddressModeBorder; - tex_proj.addressMode[1] = cudaAddressModeBorder; - tex_proj.addressMode[2] = cudaAddressModeBorder; - tex_proj.filterMode = cudaFilterModeLinear; - tex_proj.normalized = false; // don't access with normalized texture coords - - // Copy projection data to array, bind the array to the texture - cudaExtent projExtent = make_cudaExtent(proj_size[0], proj_size[1], proj_size[2]); - cudaArray * array_proj; - static cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc<float>(); - CUDA_CHECK_ERROR; - - // Allocate array for input projections, in order to bind them to - // a 2D layered texture - cudaMalloc3DArray((cudaArray **)&array_proj, &channelDesc, projExtent, cudaArrayLayered); - CUDA_CHECK_ERROR; - - // Copy data to 3D array - cudaMemcpy3DParms copyParams = cudaMemcpy3DParms(); - copyParams.srcPtr = make_cudaPitchedPtr(dev_proj, proj_size[0] * sizeof(float), proj_size[0], proj_size[1]); - copyParams.dstArray = (cudaArray *)array_proj; - copyParams.extent = projExtent; - copyParams.kind = cudaMemcpyDeviceToDevice; - cudaMemcpy3D(©Params); - CUDA_CHECK_ERROR; - // Thread Block Dimensions constexpr int tBlock_x = 16; constexpr int tBlock_y = 4; @@ -153,19 +123,15 @@ CUDA_reconstruct_conebeam(int proj_size[3], // Compute block and grid sizes dim3 dimGrid = dim3(blocksInX, blocksInY, blocksInZ); dim3 dimBlock = dim3(tBlock_x, tBlock_y, tBlock_z); - CUDA_CHECK_ERROR; - - // Bind the array of projections to a 2D layered texture - cudaBindTextureToArray(tex_proj, (cudaArray *)array_proj, channelDesc); - CUDA_CHECK_ERROR; - kernel_fdk_3Dgrid<<<dimGrid, dimBlock>>>(dev_vol_in, dev_vol_out); - - // Unbind the image and projection matrix textures - cudaUnbindTexture(tex_proj); - CUDA_CHECK_ERROR; + cudaArray * array_proj; + cudaTextureObject_t tex_proj; + prepareScalarTextureObject(proj_size, dev_proj, array_proj, tex_proj, true); + kernel_fdk_3Dgrid<<<dimGrid, dimBlock>>>(dev_vol_in, dev_vol_out, tex_proj); // Cleanup cudaFreeArray((cudaArray *)array_proj); CUDA_CHECK_ERROR; + cudaDestroyTextureObject(tex_proj); + CUDA_CHECK_ERROR; } diff --git a/src/rtkCudaFDKWeightProjectionFilter.cu b/src/rtkCudaFDKWeightProjectionFilter.cu index e4f4a45be..26e2e3b38 100644 --- a/src/rtkCudaFDKWeightProjectionFilter.cu +++ b/src/rtkCudaFDKWeightProjectionFilter.cu @@ -21,8 +21,6 @@ #include <cuda.h> #include <cuda_runtime.h> -texture<float, 1, cudaReadModeElementType> tex_geometry; // geometry texture - inline __device__ float2 TransformIndexToPhysicalPoint(int2 idx, float2 origin, float2 row, float2 column) { @@ -30,15 +28,16 @@ inline __device__ float2 } __global__ void -kernel_weight_projection(int2 proj_idx, - int3 proj_size, - int2 proj_size_buf_in, - int2 proj_size_buf_out, - float * dev_proj_in, - float * dev_proj_out, - float2 proj_orig, // projection origin - float2 proj_row, // projection row direction & spacing - float2 proj_col // projection col direction & spacing +kernel_weight_projection(int2 proj_idx, + int3 proj_size, + int2 proj_size_buf_in, + int2 proj_size_buf_out, + float * dev_proj_in, + float * dev_proj_out, + float2 proj_orig, // projection origin + float2 proj_row, // projection row direction & spacing + float2 proj_col, // projection col direction & spacing + cudaTextureObject_t tex_geom // geometry texture object ) { // compute projection index (== thread index) @@ -53,19 +52,19 @@ kernel_weight_projection(int2 proj_idx, if (pIdx.x >= proj_size.x || pIdx.y >= proj_size.y || pIdx.z >= proj_size.z) return; - const float sdd = tex1Dfetch(tex_geometry, pIdx.z * 7 + 0); - const float sid = tex1Dfetch(tex_geometry, pIdx.z * 7 + 1); - const float wFac = tex1Dfetch(tex_geometry, pIdx.z * 7 + 5); + const float sdd = tex1Dfetch<float>(tex_geom, pIdx.z * 7); + const float sid = tex1Dfetch<float>(tex_geom, pIdx.z * 7 + 1); + const float wFac = tex1Dfetch<float>(tex_geom, pIdx.z * 7 + 5); if (sdd == 0) // parallel { dev_proj_out[pIdx_comp_out] = dev_proj_in[pIdx_comp_in] * wFac; } else // divergent { - const float pOffX = tex1Dfetch(tex_geometry, pIdx.z * 7 + 2); - const float pOffY = tex1Dfetch(tex_geometry, pIdx.z * 7 + 3); - const float sOffY = tex1Dfetch(tex_geometry, pIdx.z * 7 + 4); - const float tAngle = tex1Dfetch(tex_geometry, pIdx.z * 7 + 6); + const float pOffX = tex1Dfetch<float>(tex_geom, pIdx.z * 7 + 2); + const float pOffY = tex1Dfetch<float>(tex_geom, pIdx.z * 7 + 3); + const float sOffY = tex1Dfetch<float>(tex_geom, pIdx.z * 7 + 4); + const float tAngle = tex1Dfetch<float>(tex_geom, pIdx.z * 7 + 6); const float sina = sin(tAngle); const float cosa = cos(tAngle); const float tana = tan(tAngle); @@ -90,19 +89,14 @@ CUDA_weight_projection(int proj_idx[2], int proj_dim_buf_out[2], float * dev_proj_in, float * dev_proj_out, - float * geometries, + float * geometry, float proj_orig[2], float proj_row[2], float proj_col[2]) { - // copy geometry matrix to device, bind the matrix to the texture - float * dev_geom; - cudaMalloc((void **)&dev_geom, proj_dim[2] * 7 * sizeof(float)); - CUDA_CHECK_ERROR; - cudaMemcpy(dev_geom, geometries, proj_dim[2] * 7 * sizeof(float), cudaMemcpyHostToDevice); - CUDA_CHECK_ERROR; - cudaBindTexture(0, tex_geometry, dev_geom, proj_dim[2] * 7 * sizeof(float)); - CUDA_CHECK_ERROR; + float * dev_geom; + cudaTextureObject_t tex_geom; + prepareGeometryTextureObject(proj_dim[2], geometry, dev_geom, tex_geom, 7); // Thread Block Dimensions int tBlock_x = 16; @@ -124,10 +118,11 @@ CUDA_weight_projection(int proj_idx[2], dev_proj_out, make_float2(proj_orig[0], proj_orig[1]), make_float2(proj_row[0], proj_row[1]), - make_float2(proj_col[0], proj_col[1])); + make_float2(proj_col[0], proj_col[1]), + tex_geom); - // Unbind matrix texture - cudaUnbindTexture(tex_geometry); + // destroy texture object + cudaDestroyTextureObject(tex_geom); CUDA_CHECK_ERROR; cudaFree(dev_geom); CUDA_CHECK_ERROR; diff --git a/src/rtkCudaForwardProjectionImageFilter.cu b/src/rtkCudaForwardProjectionImageFilter.cu index 3940d0f0d..869d263f9 100644 --- a/src/rtkCudaForwardProjectionImageFilter.cu +++ b/src/rtkCudaForwardProjectionImageFilter.cu @@ -36,7 +36,6 @@ * CUDA #includes * *****************/ #include <cuda.h> -#include <cublas_v2.h> #include <cuda_runtime.h> // CONSTANTS // @@ -201,7 +200,7 @@ CUDA_forward_project(int projSize[3], // Prepare texture objects std::vector<cudaArray *> volComponentArrays; std::vector<cudaTextureObject_t> tex_vol; - prepareVectorTextureObject(volSize, dev_vol, volComponentArrays, vectorLength, tex_vol, false); + prepareVectorTextureObject(volSize, dev_vol, volComponentArrays, vectorLength, tex_vol, false, cudaAddressModeClamp); // Copy them to a device pointer, since it will have to be de-referenced in the kernels cudaTextureObject_t * dev_tex_vol; diff --git a/src/rtkCudaForwardWarpImageFilter.cu b/src/rtkCudaForwardWarpImageFilter.cu index 1f9a04956..530dc33b6 100644 --- a/src/rtkCudaForwardWarpImageFilter.cu +++ b/src/rtkCudaForwardWarpImageFilter.cu @@ -39,15 +39,11 @@ *****************/ #include <cuda.h> -// T E X T U R E S //////////////////////////////////////////////////////// -texture<float, 1, cudaReadModeElementType> tex_IndexInputToPPInputMatrix; -texture<float, 1, cudaReadModeElementType> tex_IndexInputToIndexDVFMatrix; -texture<float, 1, cudaReadModeElementType> tex_PPOutputToIndexOutputMatrix; - -texture<float, 3, cudaReadModeElementType> tex_xdvf; -texture<float, 3, cudaReadModeElementType> tex_ydvf; -texture<float, 3, cudaReadModeElementType> tex_zdvf; -/////////////////////////////////////////////////////////////////////////// +// CONSTANTS ////////////////////////////////////////////////////////////// +__constant__ float c_IndexInputToPPInputMatrix[12]; +__constant__ float c_IndexInputToIndexDVFMatrix[12]; +__constant__ float c_PPOutputToIndexOutputMatrix[12]; +//////////////////////////////////////////////////////////////////////////// //_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_ // K E R N E L S -_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_ @@ -129,7 +125,14 @@ normalize_3Dgrid(float * dev_vol_out, float * dev_accumulate_weights, int3 out_d } __global__ void -linearSplat_3Dgrid(float * dev_vol_in, float * dev_vol_out, float * dev_accumulate_weights, int3 in_dim, int3 out_dim) +linearSplat_3Dgrid(float * dev_vol_in, + float * dev_vol_out, + float * dev_accumulate_weights, + int3 in_dim, + int3 out_dim, + cudaTextureObject_t tex_xdvf, + cudaTextureObject_t tex_ydvf, + cudaTextureObject_t tex_zdvf) { unsigned int i = __umul24(blockIdx.x, blockDim.x) + threadIdx.x; unsigned int j = __umul24(blockIdx.y, blockDim.y) + threadIdx.y; @@ -141,32 +144,21 @@ linearSplat_3Dgrid(float * dev_vol_in, float * dev_vol_out, float * dev_accumula } // Index row major into the volume + float3 idx = make_float3(i, j, k); long int in_idx = i + (j + k * in_dim.y) * (in_dim.x); // Matrix multiply to get the index in the DVF texture of the current point in the output volume - float3 IndexInDVF; - IndexInDVF.x = tex1Dfetch(tex_IndexInputToIndexDVFMatrix, 0) * i + tex1Dfetch(tex_IndexInputToIndexDVFMatrix, 1) * j + - tex1Dfetch(tex_IndexInputToIndexDVFMatrix, 2) * k + tex1Dfetch(tex_IndexInputToIndexDVFMatrix, 3); - IndexInDVF.y = tex1Dfetch(tex_IndexInputToIndexDVFMatrix, 4) * i + tex1Dfetch(tex_IndexInputToIndexDVFMatrix, 5) * j + - tex1Dfetch(tex_IndexInputToIndexDVFMatrix, 6) * k + tex1Dfetch(tex_IndexInputToIndexDVFMatrix, 7); - IndexInDVF.z = tex1Dfetch(tex_IndexInputToIndexDVFMatrix, 8) * i + tex1Dfetch(tex_IndexInputToIndexDVFMatrix, 9) * j + - tex1Dfetch(tex_IndexInputToIndexDVFMatrix, 10) * k + tex1Dfetch(tex_IndexInputToIndexDVFMatrix, 11); + float3 IndexInDVF = matrix_multiply(idx, c_IndexInputToIndexDVFMatrix); // Get each component of the displacement vector by // interpolation in the dvf float3 Displacement; - Displacement.x = tex3D(tex_xdvf, IndexInDVF.x + 0.5f, IndexInDVF.y + 0.5f, IndexInDVF.z + 0.5f); - Displacement.y = tex3D(tex_ydvf, IndexInDVF.x + 0.5f, IndexInDVF.y + 0.5f, IndexInDVF.z + 0.5f); - Displacement.z = tex3D(tex_zdvf, IndexInDVF.x + 0.5f, IndexInDVF.y + 0.5f, IndexInDVF.z + 0.5f); + Displacement.x = tex3D<float>(tex_xdvf, IndexInDVF.x + 0.5f, IndexInDVF.y + 0.5f, IndexInDVF.z + 0.5f); + Displacement.y = tex3D<float>(tex_ydvf, IndexInDVF.x + 0.5f, IndexInDVF.y + 0.5f, IndexInDVF.z + 0.5f); + Displacement.z = tex3D<float>(tex_zdvf, IndexInDVF.x + 0.5f, IndexInDVF.y + 0.5f, IndexInDVF.z + 0.5f); // Matrix multiply to get the physical coordinates of the current point in the input volume - float3 PPinInput; - PPinInput.x = tex1Dfetch(tex_IndexInputToPPInputMatrix, 0) * i + tex1Dfetch(tex_IndexInputToPPInputMatrix, 1) * j + - tex1Dfetch(tex_IndexInputToPPInputMatrix, 2) * k + tex1Dfetch(tex_IndexInputToPPInputMatrix, 3); - PPinInput.y = tex1Dfetch(tex_IndexInputToPPInputMatrix, 4) * i + tex1Dfetch(tex_IndexInputToPPInputMatrix, 5) * j + - tex1Dfetch(tex_IndexInputToPPInputMatrix, 6) * k + tex1Dfetch(tex_IndexInputToPPInputMatrix, 7); - PPinInput.z = tex1Dfetch(tex_IndexInputToPPInputMatrix, 8) * i + tex1Dfetch(tex_IndexInputToPPInputMatrix, 9) * j + - tex1Dfetch(tex_IndexInputToPPInputMatrix, 10) * k + tex1Dfetch(tex_IndexInputToPPInputMatrix, 11); + float3 PPinInput = matrix_multiply(idx, c_IndexInputToPPInputMatrix); // Get the index corresponding to the current physical point in output displaced by the displacement vector float3 PPDisplaced; @@ -174,19 +166,7 @@ linearSplat_3Dgrid(float * dev_vol_in, float * dev_vol_out, float * dev_accumula PPDisplaced.y = PPinInput.y + Displacement.y; PPDisplaced.z = PPinInput.z + Displacement.z; - float3 IndexInOutput; - IndexInOutput.x = tex1Dfetch(tex_PPOutputToIndexOutputMatrix, 0) * PPDisplaced.x + - tex1Dfetch(tex_PPOutputToIndexOutputMatrix, 1) * PPDisplaced.y + - tex1Dfetch(tex_PPOutputToIndexOutputMatrix, 2) * PPDisplaced.z + - tex1Dfetch(tex_PPOutputToIndexOutputMatrix, 3); - IndexInOutput.y = tex1Dfetch(tex_PPOutputToIndexOutputMatrix, 4) * PPDisplaced.x + - tex1Dfetch(tex_PPOutputToIndexOutputMatrix, 5) * PPDisplaced.y + - tex1Dfetch(tex_PPOutputToIndexOutputMatrix, 6) * PPDisplaced.z + - tex1Dfetch(tex_PPOutputToIndexOutputMatrix, 7); - IndexInOutput.z = tex1Dfetch(tex_PPOutputToIndexOutputMatrix, 8) * PPDisplaced.x + - tex1Dfetch(tex_PPOutputToIndexOutputMatrix, 9) * PPDisplaced.y + - tex1Dfetch(tex_PPOutputToIndexOutputMatrix, 10) * PPDisplaced.z + - tex1Dfetch(tex_PPOutputToIndexOutputMatrix, 11); + float3 IndexInOutput = matrix_multiply(PPDisplaced, c_PPOutputToIndexOutputMatrix); // Compute the splat weights int3 BaseIndexInOutput; @@ -303,11 +283,14 @@ linearSplat_3Dgrid(float * dev_vol_in, float * dev_vol_out, float * dev_accumula } __global__ void -nearestNeighborSplat_3Dgrid(float * dev_vol_in, - float * dev_vol_out, - float * dev_accumulate_weights, - int3 in_dim, - int3 out_dim) +nearestNeighborSplat_3Dgrid(float * dev_vol_in, + float * dev_vol_out, + float * dev_accumulate_weights, + int3 in_dim, + int3 out_dim, + cudaTextureObject_t tex_xdvf, + cudaTextureObject_t tex_ydvf, + cudaTextureObject_t tex_zdvf) { unsigned int i = __umul24(blockIdx.x, blockDim.x) + threadIdx.x; unsigned int j = __umul24(blockIdx.y, blockDim.y) + threadIdx.y; @@ -322,29 +305,18 @@ nearestNeighborSplat_3Dgrid(float * dev_vol_in, long int in_idx = i + (j + k * in_dim.y) * (in_dim.x); // Matrix multiply to get the index in the DVF texture of the current point in the output volume - float3 IndexInDVF; - IndexInDVF.x = tex1Dfetch(tex_IndexInputToIndexDVFMatrix, 0) * i + tex1Dfetch(tex_IndexInputToIndexDVFMatrix, 1) * j + - tex1Dfetch(tex_IndexInputToIndexDVFMatrix, 2) * k + tex1Dfetch(tex_IndexInputToIndexDVFMatrix, 3); - IndexInDVF.y = tex1Dfetch(tex_IndexInputToIndexDVFMatrix, 4) * i + tex1Dfetch(tex_IndexInputToIndexDVFMatrix, 5) * j + - tex1Dfetch(tex_IndexInputToIndexDVFMatrix, 6) * k + tex1Dfetch(tex_IndexInputToIndexDVFMatrix, 7); - IndexInDVF.z = tex1Dfetch(tex_IndexInputToIndexDVFMatrix, 8) * i + tex1Dfetch(tex_IndexInputToIndexDVFMatrix, 9) * j + - tex1Dfetch(tex_IndexInputToIndexDVFMatrix, 10) * k + tex1Dfetch(tex_IndexInputToIndexDVFMatrix, 11); + float3 idx = make_float3(i, j, k); + float3 IndexInDVF = matrix_multiply(idx, c_IndexInputToIndexDVFMatrix); // Get each component of the displacement vector by // interpolation in the dvf float3 Displacement; - Displacement.x = tex3D(tex_xdvf, IndexInDVF.x + 0.5f, IndexInDVF.y + 0.5f, IndexInDVF.z + 0.5f); - Displacement.y = tex3D(tex_ydvf, IndexInDVF.x + 0.5f, IndexInDVF.y + 0.5f, IndexInDVF.z + 0.5f); - Displacement.z = tex3D(tex_zdvf, IndexInDVF.x + 0.5f, IndexInDVF.y + 0.5f, IndexInDVF.z + 0.5f); + Displacement.x = tex3D<float>(tex_xdvf, IndexInDVF.x + 0.5f, IndexInDVF.y + 0.5f, IndexInDVF.z + 0.5f); + Displacement.y = tex3D<float>(tex_ydvf, IndexInDVF.x + 0.5f, IndexInDVF.y + 0.5f, IndexInDVF.z + 0.5f); + Displacement.z = tex3D<float>(tex_zdvf, IndexInDVF.x + 0.5f, IndexInDVF.y + 0.5f, IndexInDVF.z + 0.5f); // Matrix multiply to get the physical coordinates of the current point in the input volume - float3 PPinInput; - PPinInput.x = tex1Dfetch(tex_IndexInputToPPInputMatrix, 0) * i + tex1Dfetch(tex_IndexInputToPPInputMatrix, 1) * j + - tex1Dfetch(tex_IndexInputToPPInputMatrix, 2) * k + tex1Dfetch(tex_IndexInputToPPInputMatrix, 3); - PPinInput.y = tex1Dfetch(tex_IndexInputToPPInputMatrix, 4) * i + tex1Dfetch(tex_IndexInputToPPInputMatrix, 5) * j + - tex1Dfetch(tex_IndexInputToPPInputMatrix, 6) * k + tex1Dfetch(tex_IndexInputToPPInputMatrix, 7); - PPinInput.z = tex1Dfetch(tex_IndexInputToPPInputMatrix, 8) * i + tex1Dfetch(tex_IndexInputToPPInputMatrix, 9) * j + - tex1Dfetch(tex_IndexInputToPPInputMatrix, 10) * k + tex1Dfetch(tex_IndexInputToPPInputMatrix, 11); + float3 PPinInput = matrix_multiply(idx, c_IndexInputToPPInputMatrix); // Get the index corresponding to the current physical point in output displaced by the displacement vector float3 PPDisplaced; @@ -352,19 +324,10 @@ nearestNeighborSplat_3Dgrid(float * dev_vol_in, PPDisplaced.y = PPinInput.y + Displacement.y; PPDisplaced.z = PPinInput.z + Displacement.z; - float3 IndexInOutput; - IndexInOutput.x = floor(tex1Dfetch(tex_PPOutputToIndexOutputMatrix, 0) * PPDisplaced.x + - tex1Dfetch(tex_PPOutputToIndexOutputMatrix, 1) * PPDisplaced.y + - tex1Dfetch(tex_PPOutputToIndexOutputMatrix, 2) * PPDisplaced.z + - tex1Dfetch(tex_PPOutputToIndexOutputMatrix, 3) + 0.5); - IndexInOutput.y = floor(tex1Dfetch(tex_PPOutputToIndexOutputMatrix, 4) * PPDisplaced.x + - tex1Dfetch(tex_PPOutputToIndexOutputMatrix, 5) * PPDisplaced.y + - tex1Dfetch(tex_PPOutputToIndexOutputMatrix, 6) * PPDisplaced.z + - tex1Dfetch(tex_PPOutputToIndexOutputMatrix, 7) + 0.5); - IndexInOutput.z = floor(tex1Dfetch(tex_PPOutputToIndexOutputMatrix, 8) * PPDisplaced.x + - tex1Dfetch(tex_PPOutputToIndexOutputMatrix, 9) * PPDisplaced.y + - tex1Dfetch(tex_PPOutputToIndexOutputMatrix, 10) * PPDisplaced.z + - tex1Dfetch(tex_PPOutputToIndexOutputMatrix, 11) + 0.5); + float3 IndexInOutput = matrix_multiply(PPDisplaced, c_PPOutputToIndexOutputMatrix); + IndexInOutput.x = floor(IndexInOutput.x + 0.5); + IndexInOutput.y = floor(IndexInOutput.y + 0.5); + IndexInOutput.z = floor(IndexInOutput.z + 0.5); bool isInVolume = (IndexInOutput.x >= 0) && (IndexInOutput.x < out_dim.x) && (IndexInOutput.y >= 0) && (IndexInOutput.y < out_dim.y) && (IndexInOutput.z >= 0) && (IndexInOutput.z < out_dim.z); @@ -400,101 +363,29 @@ CUDA_ForwardWarp(int input_vol_dim[3], float * dev_output_vol, bool isLinear) { - // Prepare channel description for arrays static cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc<float>(); /////////////////////////////////// - // For each component of the dvf, perform a strided copy (pick every third - // float from dev_input_dvf) into a 3D array, and bind the array to a 3D texture - // Extent stuff, will be used for each component extraction - cudaExtent dvfExtent = make_cudaExtent(input_dvf_dim[0], input_dvf_dim[1], input_dvf_dim[2]); - - // Set texture parameters - tex_xdvf.addressMode[0] = cudaAddressModeBorder; - tex_xdvf.addressMode[1] = cudaAddressModeBorder; - tex_xdvf.addressMode[2] = cudaAddressModeBorder; - tex_xdvf.filterMode = cudaFilterModeLinear; - tex_xdvf.normalized = false; // don't access with normalized texture coords - - tex_ydvf.addressMode[0] = cudaAddressModeBorder; - tex_ydvf.addressMode[1] = cudaAddressModeBorder; - tex_ydvf.addressMode[2] = cudaAddressModeBorder; - tex_ydvf.filterMode = cudaFilterModeLinear; - tex_ydvf.normalized = false; - - tex_zdvf.addressMode[0] = cudaAddressModeBorder; - tex_zdvf.addressMode[1] = cudaAddressModeBorder; - tex_zdvf.addressMode[2] = cudaAddressModeBorder; - tex_zdvf.filterMode = cudaFilterModeLinear; - tex_zdvf.normalized = false; - - // Allocate the arrays - cudaArray * array_xdvf; - cudaArray * array_ydvf; - cudaArray * array_zdvf; - cudaMalloc3DArray((cudaArray **)&array_xdvf, &channelDesc, dvfExtent); - cudaMalloc3DArray((cudaArray **)&array_ydvf, &channelDesc, dvfExtent); - cudaMalloc3DArray((cudaArray **)&array_zdvf, &channelDesc, dvfExtent); - CUDA_CHECK_ERROR; - - // Copy image data to arrays. For a nice explanation on make_cudaPitchedPtr, checkout - // https://stackoverflow.com/questions/16119943/how-and-when-should-i-use-pitched-pointer-with-the-cuda-api - cudaMemcpy3DParms xCopyParams = cudaMemcpy3DParms(); - xCopyParams.srcPtr = - make_cudaPitchedPtr(dev_input_xdvf, input_dvf_dim[0] * sizeof(float), input_dvf_dim[0], input_dvf_dim[1]); - xCopyParams.dstArray = (cudaArray *)array_xdvf; - xCopyParams.extent = dvfExtent; - xCopyParams.kind = cudaMemcpyDeviceToDevice; - cudaMemcpy3D(&xCopyParams); - CUDA_CHECK_ERROR; - - cudaMemcpy3DParms yCopyParams = cudaMemcpy3DParms(); - yCopyParams.srcPtr = - make_cudaPitchedPtr(dev_input_ydvf, input_dvf_dim[0] * sizeof(float), input_dvf_dim[0], input_dvf_dim[1]); - yCopyParams.dstArray = (cudaArray *)array_ydvf; - yCopyParams.extent = dvfExtent; - yCopyParams.kind = cudaMemcpyDeviceToDevice; - cudaMemcpy3D(&yCopyParams); - CUDA_CHECK_ERROR; - - cudaMemcpy3DParms zCopyParams = cudaMemcpy3DParms(); - zCopyParams.srcPtr = - make_cudaPitchedPtr(dev_input_zdvf, input_dvf_dim[0] * sizeof(float), input_dvf_dim[0], input_dvf_dim[1]); - zCopyParams.dstArray = (cudaArray *)array_zdvf; - zCopyParams.extent = dvfExtent; - zCopyParams.kind = cudaMemcpyDeviceToDevice; - cudaMemcpy3D(&zCopyParams); - CUDA_CHECK_ERROR; - - // Bind 3D arrays to 3D textures - cudaBindTextureToArray(tex_xdvf, (cudaArray *)array_xdvf, channelDesc); - cudaBindTextureToArray(tex_ydvf, (cudaArray *)array_ydvf, channelDesc); - cudaBindTextureToArray(tex_zdvf, (cudaArray *)array_zdvf, channelDesc); - CUDA_CHECK_ERROR; + cudaArray * array_xdvf, *array_ydvf, *array_zdvf; + cudaTextureObject_t tex_xdvf, tex_ydvf, tex_zdvf; + prepareScalarTextureObject(input_dvf_dim, dev_input_xdvf, array_xdvf, tex_xdvf, false); + prepareScalarTextureObject(input_dvf_dim, dev_input_ydvf, array_ydvf, tex_ydvf, false); + prepareScalarTextureObject(input_dvf_dim, dev_input_zdvf, array_zdvf, tex_zdvf, false); /////////////////////////////////////// - // Copy matrices, bind them to textures - - float * dev_IndexInputToPPInput; - cudaMalloc((void **)&dev_IndexInputToPPInput, 12 * sizeof(float)); - cudaMemcpy(dev_IndexInputToPPInput, IndexInputToPPInputMatrix, 12 * sizeof(float), cudaMemcpyHostToDevice); - cudaBindTexture(0, tex_IndexInputToPPInputMatrix, dev_IndexInputToPPInput, 12 * sizeof(float)); - - float * dev_IndexInputToIndexDVF; - cudaMalloc((void **)&dev_IndexInputToIndexDVF, 12 * sizeof(float)); - cudaMemcpy(dev_IndexInputToIndexDVF, IndexInputToIndexDVFMatrix, 12 * sizeof(float), cudaMemcpyHostToDevice); - cudaBindTexture(0, tex_IndexInputToIndexDVFMatrix, dev_IndexInputToIndexDVF, 12 * sizeof(float)); - - float * dev_PPOutputToIndexOutput; - cudaMalloc((void **)&dev_PPOutputToIndexOutput, 12 * sizeof(float)); - cudaMemcpy(dev_PPOutputToIndexOutput, PPOutputToIndexOutputMatrix, 12 * sizeof(float), cudaMemcpyHostToDevice); - cudaBindTexture(0, tex_PPOutputToIndexOutputMatrix, dev_PPOutputToIndexOutput, 12 * sizeof(float)); + // Copy matrices into constant memory + cudaMemcpyToSymbol( + c_IndexInputToPPInputMatrix, IndexInputToPPInputMatrix, 12 * sizeof(float), 0, cudaMemcpyHostToDevice); + cudaMemcpyToSymbol( + c_IndexInputToIndexDVFMatrix, IndexInputToIndexDVFMatrix, 12 * sizeof(float), 0, cudaMemcpyHostToDevice); + cudaMemcpyToSymbol( + c_PPOutputToIndexOutputMatrix, PPOutputToIndexOutputMatrix, 12 * sizeof(float), 0, cudaMemcpyHostToDevice); + CUDA_CHECK_ERROR; /////////////////////////////////////// /// Initialize the output - cudaMemset((void *)dev_output_vol, 0, sizeof(float) * output_vol_dim[0] * output_vol_dim[1] * output_vol_dim[2]); ////////////////////////////////////// @@ -507,12 +398,6 @@ CUDA_ForwardWarp(int input_vol_dim[3], cudaMemset( (void *)dev_accumulate_weights, 0, sizeof(float) * output_vol_dim[0] * output_vol_dim[1] * output_vol_dim[2]); - ////////////////////////////////////// - /// Run - - int device; - cudaGetDevice(&device); - // Thread Block Dimensions constexpr int tBlock_x = 16; constexpr int tBlock_y = 4; @@ -533,14 +418,20 @@ CUDA_ForwardWarp(int input_vol_dim[3], dev_output_vol, dev_accumulate_weights, make_int3(input_vol_dim[0], input_vol_dim[1], input_vol_dim[2]), - make_int3(output_vol_dim[0], output_vol_dim[1], output_vol_dim[2])); + make_int3(output_vol_dim[0], output_vol_dim[1], output_vol_dim[2]), + tex_xdvf, + tex_ydvf, + tex_zdvf); else nearestNeighborSplat_3Dgrid<<<dimGrid, dimBlock>>>( dev_input_vol, dev_output_vol, dev_accumulate_weights, make_int3(input_vol_dim[0], input_vol_dim[1], input_vol_dim[2]), - make_int3(output_vol_dim[0], output_vol_dim[1], output_vol_dim[2])); + make_int3(output_vol_dim[0], output_vol_dim[1], output_vol_dim[2]), + tex_xdvf, + tex_ydvf, + tex_zdvf); CUDA_CHECK_ERROR; @@ -552,25 +443,19 @@ CUDA_ForwardWarp(int input_vol_dim[3], dev_output_vol, dev_accumulate_weights, make_int3(output_vol_dim[0], output_vol_dim[1], output_vol_dim[2])); CUDA_CHECK_ERROR; - // Unbind the image and projection matrix textures - cudaUnbindTexture(tex_xdvf); - cudaUnbindTexture(tex_ydvf); - cudaUnbindTexture(tex_zdvf); + // Cleanup + cudaFreeArray(array_xdvf); CUDA_CHECK_ERROR; - cudaUnbindTexture(tex_IndexInputToPPInputMatrix); - cudaUnbindTexture(tex_PPOutputToIndexOutputMatrix); - cudaUnbindTexture(tex_IndexInputToIndexDVFMatrix); + cudaFreeArray(array_ydvf); CUDA_CHECK_ERROR; - - // Cleanup - cudaFreeArray((cudaArray *)array_xdvf); - cudaFreeArray((cudaArray *)array_ydvf); - cudaFreeArray((cudaArray *)array_zdvf); + cudaFreeArray(array_zdvf); CUDA_CHECK_ERROR; cudaFree(dev_accumulate_weights); CUDA_CHECK_ERROR; - cudaFree(dev_IndexInputToPPInput); - cudaFree(dev_PPOutputToIndexOutput); - cudaFree(dev_IndexInputToIndexDVF); + cudaDestroyTextureObject(tex_xdvf); + CUDA_CHECK_ERROR; + cudaDestroyTextureObject(tex_ydvf); + CUDA_CHECK_ERROR; + cudaDestroyTextureObject(tex_zdvf); CUDA_CHECK_ERROR; } diff --git a/src/rtkCudaParkerShortScanImageFilter.cu b/src/rtkCudaParkerShortScanImageFilter.cu index 8e3c6f734..be4656371 100644 --- a/src/rtkCudaParkerShortScanImageFilter.cu +++ b/src/rtkCudaParkerShortScanImageFilter.cu @@ -22,8 +22,6 @@ #include <cuda_runtime.h> #include <math_constants.h> -texture<float, 1, cudaReadModeElementType> tex_geometry; // geometry texture - inline __device__ float TransformIndexToPhysicalPoint(int2 idx, float origin, float row, float column) { @@ -45,17 +43,18 @@ ToUntiltedCoordinateAtIsocenter(float tiltedCoord, float sdd, float sid, float s } __global__ void -kernel_parker_weight(int2 proj_idx, - int3 proj_size, - int2 proj_size_buf_in, - int2 proj_size_buf_out, - float * dev_proj_in, - float * dev_proj_out, - float delta, - float firstAngle, - float proj_orig, // projection origin - float proj_row, // projection row direction & spacing - float proj_col // projection col direction & spacing +kernel_parker_weight(int2 proj_idx, + int3 proj_size, + int2 proj_size_buf_in, + int2 proj_size_buf_out, + float * dev_proj_in, + float * dev_proj_out, + float delta, + float firstAngle, + float proj_orig, // projection origin + float proj_row, // projection row direction & spacing + float proj_col, // projection col direction & spacing + cudaTextureObject_t tex_geom // geometry texture object ) { // compute projection index (== thread index) @@ -70,10 +69,10 @@ kernel_parker_weight(int2 proj_idx, if (pIdx.x >= proj_size.x || pIdx.y >= proj_size.y || pIdx.z >= proj_size.z) return; - float sdd = tex1Dfetch(tex_geometry, pIdx.z * 5 + 0); - float sx = tex1Dfetch(tex_geometry, pIdx.z * 5 + 1); - float px = tex1Dfetch(tex_geometry, pIdx.z * 5 + 2); - float sid = tex1Dfetch(tex_geometry, pIdx.z * 5 + 3); + float sdd = tex1Dfetch<float>(tex_geom, pIdx.z * 5 + 0); + float sx = tex1Dfetch<float>(tex_geom, pIdx.z * 5 + 1); + float px = tex1Dfetch<float>(tex_geom, pIdx.z * 5 + 2); + float sid = tex1Dfetch<float>(tex_geom, pIdx.z * 5 + 3); // convert actual index to point float pPoint = @@ -86,7 +85,7 @@ kernel_parker_weight(int2 proj_idx, float alpha = atan(-1 * l * invsid); // beta projection angle: Parker's article assumes that the scan starts at 0 - float beta = tex1Dfetch(tex_geometry, pIdx.z * 5 + 4); + float beta = tex1Dfetch<float>(tex_geom, pIdx.z * 5 + 4); beta -= firstAngle; if (beta < 0) beta += (2.f * CUDART_PI_F); @@ -111,21 +110,16 @@ CUDA_parker_weight(int proj_idx[2], int proj_dim_buf_out[2], float * dev_proj_in, float * dev_proj_out, - float * geometries, + float * geometry, float delta, float firstAngle, float proj_orig, float proj_row, float proj_col) { - // copy geometry matrix to device, bind the matrix to the texture - float * dev_geom; - cudaMalloc((void **)&dev_geom, proj_dim[2] * 5 * sizeof(float)); - CUDA_CHECK_ERROR; - cudaMemcpy(dev_geom, geometries, proj_dim[2] * 5 * sizeof(float), cudaMemcpyHostToDevice); - CUDA_CHECK_ERROR; - cudaBindTexture(0, tex_geometry, dev_geom, proj_dim[2] * 5 * sizeof(float)); - CUDA_CHECK_ERROR; + float * dev_geom; + cudaTextureObject_t tex_geom; + prepareGeometryTextureObject(proj_dim[2], geometry, dev_geom, tex_geom, 5); // Thread Block Dimensions int tBlock_x = 16; @@ -139,6 +133,7 @@ CUDA_parker_weight(int proj_idx[2], dim3 dimGrid = dim3(blocksInX, blocksInY, blocksInZ); dim3 dimBlock = dim3(tBlock_x, tBlock_y, tBlock_z); + kernel_parker_weight<<<dimGrid, dimBlock>>>(make_int2(proj_idx[0], proj_idx[1]), make_int3(proj_dim[0], proj_dim[1], proj_dim[2]), make_int2(proj_dim_buf_in[0], proj_dim_buf_in[1]), @@ -149,10 +144,11 @@ CUDA_parker_weight(int proj_idx[2], firstAngle, proj_orig, proj_row, - proj_col); + proj_col, + tex_geom); - // Unbind matrix texture - cudaUnbindTexture(tex_geometry); + // destroy texture object + cudaDestroyTextureObject(tex_geom); CUDA_CHECK_ERROR; cudaFree(dev_geom); CUDA_CHECK_ERROR; diff --git a/src/rtkCudaUtilities.cu b/src/rtkCudaUtilities.cu index 4f3938d05..020fac6cc 100644 --- a/src/rtkCudaUtilities.cu +++ b/src/rtkCudaUtilities.cu @@ -17,6 +17,7 @@ *=========================================================================*/ #include "rtkCudaUtilities.hcu" +#include <cublas_v2.h> std::vector<int> GetListOfCudaDevices() @@ -76,13 +77,73 @@ GetFreeGPUGlobalMemory(int device) return free; } +__host__ void +prepareScalarTextureObject(int size[3], + float * dev_ptr, + cudaArray *& threeDArray, + cudaTextureObject_t & tex, + const bool isProjections, + const bool isLinear, + const cudaTextureAddressMode texAddressMode) +{ + // create texture object + cudaResourceDesc resDesc = {}; + resDesc.resType = cudaResourceTypeArray; + + cudaTextureDesc texDesc = {}; + texDesc.readMode = cudaReadModeElementType; + + for (int component = 0; component < 3; component++) + texDesc.addressMode[component] = texAddressMode; + if (isLinear) + texDesc.filterMode = cudaFilterModeLinear; + else + texDesc.filterMode = cudaFilterModePoint; + + static cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc(32, 0, 0, 0, cudaChannelFormatKindFloat); + cudaExtent volExtent = make_cudaExtent(size[0], size[1], size[2]); + + // Allocate an intermediate memory space to extract the components of the input volume + float * singleComponent; + int numel = size[0] * size[1] * size[2]; + cudaMalloc(&singleComponent, numel * sizeof(float)); + CUDA_CHECK_ERROR; + + // Copy image data to arrays. The tricky part is the make_cudaPitchedPtr. + // The best way to understand it is to read + // https://stackoverflow.com/questions/16119943/how-and-when-should-i-use-pitched-pointer-with-the-cuda-api + + // Allocate the cudaArray. Projections use layered arrays, volumes use default 3D arrays + if (isProjections) + cudaMalloc3DArray(&threeDArray, &channelDesc, volExtent, cudaArrayLayered); + else + cudaMalloc3DArray(&threeDArray, &channelDesc, volExtent); + CUDA_CHECK_ERROR; + + // Fill it with the current singleComponent + cudaMemcpy3DParms CopyParams = {}; + CopyParams.srcPtr = make_cudaPitchedPtr(dev_ptr, size[0] * sizeof(float), size[0], size[1]); + CUDA_CHECK_ERROR; + CopyParams.dstArray = threeDArray; + CopyParams.extent = volExtent; + CopyParams.kind = cudaMemcpyDeviceToDevice; + cudaMemcpy3D(&CopyParams); + CUDA_CHECK_ERROR; + + // Fill in the texture object with all this information + resDesc.res.array.array = threeDArray; + cudaCreateTextureObject(&tex, &resDesc, &texDesc, NULL); + CUDA_CHECK_ERROR; +} + __host__ void prepareVectorTextureObject(int size[3], const float * dev_ptr, std::vector<cudaArray *> & componentArrays, const unsigned int nComponents, std::vector<cudaTextureObject_t> & tex, - bool isProjections) + const bool isProjections, + const cudaTextureAddressMode texAddressMode) { componentArrays.resize(nComponents); tex.resize(nComponents); @@ -92,20 +153,13 @@ prepareVectorTextureObject(int size[3], cublasCreate(&handle); // create texture object - cudaResourceDesc resDesc; - memset(&resDesc, 0, sizeof(resDesc)); + cudaResourceDesc resDesc = {}; resDesc.resType = cudaResourceTypeArray; - cudaTextureDesc texDesc; - memset(&texDesc, 0, sizeof(texDesc)); + cudaTextureDesc texDesc = {}; texDesc.readMode = cudaReadModeElementType; for (int component = 0; component < 3; component++) - { - if (isProjections) - texDesc.addressMode[component] = cudaAddressModeBorder; - else - texDesc.addressMode[component] = cudaAddressModeClamp; - } + texDesc.addressMode[component] = texAddressMode; texDesc.filterMode = cudaFilterModeLinear; static cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc(32, 0, 0, 0, cudaChannelFormatKindFloat); @@ -127,7 +181,7 @@ prepareVectorTextureObject(int size[3], cudaMemset((void *)singleComponent, 0, numel * sizeof(float)); // Fill it with the current component - float * pComponent = dev_ptr + component; + const float * pComponent = dev_ptr + component; cublasSaxpy(handle, numel, &one, pComponent, nComponents, singleComponent, 1); // Allocate the cudaArray. Projections use layered arrays, volumes use default 3D arrays @@ -140,6 +194,7 @@ prepareVectorTextureObject(int size[3], // Fill it with the current singleComponent cudaMemcpy3DParms CopyParams = cudaMemcpy3DParms(); CopyParams.srcPtr = make_cudaPitchedPtr(singleComponent, size[0] * sizeof(float), size[0], size[1]); + CUDA_CHECK_ERROR; CopyParams.dstArray = componentArrays[component]; CopyParams.extent = volExtent; CopyParams.kind = cudaMemcpyDeviceToDevice; @@ -158,3 +213,31 @@ prepareVectorTextureObject(int size[3], // Destroy CUBLAS context cublasDestroy(handle); } + +__host__ void +prepareGeometryTextureObject(int length, + const float * geometry, + float *& dev_geom, + cudaTextureObject_t & tex_geom, + const unsigned int nParam) +{ + // copy geometry matrix to device, bind the matrix to the texture + cudaMalloc((void **)&dev_geom, length * nParam * sizeof(float)); + CUDA_CHECK_ERROR; + cudaMemcpy(dev_geom, geometry, length * nParam * sizeof(float), cudaMemcpyHostToDevice); + CUDA_CHECK_ERROR; + + // create texture object + cudaResourceDesc resDesc = {}; + resDesc.resType = cudaResourceTypeLinear; + resDesc.res.linear.devPtr = dev_geom; + resDesc.res.linear.desc.f = cudaChannelFormatKindFloat; + resDesc.res.linear.desc.x = 32; // bits per channel + resDesc.res.linear.sizeInBytes = length * nParam * sizeof(float); + + cudaTextureDesc texDesc = {}; + texDesc.readMode = cudaReadModeElementType; + + cudaCreateTextureObject(&tex_geom, &resDesc, &texDesc, NULL); + CUDA_CHECK_ERROR; +} diff --git a/src/rtkCudaWarpBackProjectionImageFilter.cu b/src/rtkCudaWarpBackProjectionImageFilter.cu index 333b6dfba..a26d57f5f 100644 --- a/src/rtkCudaWarpBackProjectionImageFilter.cu +++ b/src/rtkCudaWarpBackProjectionImageFilter.cu @@ -38,17 +38,8 @@ * CUDA #includes * *****************/ #include <cuda.h> -#include <cublas_v2.h> #include <cuda_runtime.h> -// T E X T U R E S //////////////////////////////////////////////////////// -texture<float, cudaTextureType2DLayered> tex_proj; - -texture<float, 3, cudaReadModeElementType> tex_xdvf; -texture<float, 3, cudaReadModeElementType> tex_ydvf; -texture<float, 3, cudaReadModeElementType> tex_zdvf; -/////////////////////////////////////////////////////////////////////////// - // CONSTANTS ////////////////////////////////////////////////////////////// __constant__ float c_matrices[SLAB_SIZE * 12]; // Can process stacks of at most SLAB_SIZE projections __constant__ float c_volIndexToProjPP[SLAB_SIZE * 12]; @@ -66,7 +57,12 @@ __constant__ float c_IndexInputToPPInputMatrix[12]; //_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_ __global__ void -kernel_warp_back_project_3Dgrid(float * dev_vol_in, float * dev_vol_out) +kernel_warp_back_project_3Dgrid(float * dev_vol_in, + float * dev_vol_out, + cudaTextureObject_t tex_xdvf, + cudaTextureObject_t tex_ydvf, + cudaTextureObject_t tex_zdvf, + cudaTextureObject_t tex_proj) { unsigned int i = __umul24(blockIdx.x, blockDim.x) + threadIdx.x; unsigned int j = __umul24(blockIdx.y, blockDim.y) + threadIdx.y; @@ -89,9 +85,9 @@ kernel_warp_back_project_3Dgrid(float * dev_vol_in, float * dev_vol_out) IndexInDVF = matrix_multiply(make_float3(i, j, k), c_IndexInputToIndexDVFMatrix); // Get each component of the displacement vector by interpolation in the DVF - Displacement.x = tex3D(tex_xdvf, IndexInDVF.x + 0.5f, IndexInDVF.y + 0.5f, IndexInDVF.z + 0.5f); - Displacement.y = tex3D(tex_ydvf, IndexInDVF.x + 0.5f, IndexInDVF.y + 0.5f, IndexInDVF.z + 0.5f); - Displacement.z = tex3D(tex_zdvf, IndexInDVF.x + 0.5f, IndexInDVF.y + 0.5f, IndexInDVF.z + 0.5f); + Displacement.x = tex3D<float>(tex_xdvf, IndexInDVF.x + 0.5f, IndexInDVF.y + 0.5f, IndexInDVF.z + 0.5f); + Displacement.y = tex3D<float>(tex_ydvf, IndexInDVF.x + 0.5f, IndexInDVF.y + 0.5f, IndexInDVF.z + 0.5f); + Displacement.z = tex3D<float>(tex_zdvf, IndexInDVF.x + 0.5f, IndexInDVF.y + 0.5f, IndexInDVF.z + 0.5f); // Compute the physical point in input + the displacement vector PP = matrix_multiply(make_float3(i, j, k), c_IndexInputToPPInputMatrix) + Displacement; @@ -108,7 +104,7 @@ kernel_warp_back_project_3Dgrid(float * dev_vol_in, float * dev_vol_out) ip.y = ip.y * ip.z; // Get texture point, clip left to GPU - voxel_data += tex2DLayered(tex_proj, ip.x, ip.y, proj); + voxel_data += tex2DLayered<float>(tex_proj, ip.x, ip.y, proj); } // Place it into the volume @@ -116,7 +112,13 @@ kernel_warp_back_project_3Dgrid(float * dev_vol_in, float * dev_vol_out) } __global__ void -kernel_warp_back_project_3Dgrid_cylindrical_detector(float * dev_vol_in, float * dev_vol_out, float radius) +kernel_warp_back_project_3Dgrid_cylindrical_detector(float * dev_vol_in, + float * dev_vol_out, + float radius, + cudaTextureObject_t tex_xdvf, + cudaTextureObject_t tex_ydvf, + cudaTextureObject_t tex_zdvf, + cudaTextureObject_t tex_proj) { unsigned int i = __umul24(blockIdx.x, blockDim.x) + threadIdx.x; unsigned int j = __umul24(blockIdx.y, blockDim.y) + threadIdx.y; @@ -139,9 +141,9 @@ kernel_warp_back_project_3Dgrid_cylindrical_detector(float * dev_vol_in, float * IndexInDVF = matrix_multiply(make_float3(i, j, k), c_IndexInputToIndexDVFMatrix); // Get each component of the displacement vector by interpolation in the DVF - Displacement.x = tex3D(tex_xdvf, IndexInDVF.x + 0.5f, IndexInDVF.y + 0.5f, IndexInDVF.z + 0.5f); - Displacement.y = tex3D(tex_ydvf, IndexInDVF.x + 0.5f, IndexInDVF.y + 0.5f, IndexInDVF.z + 0.5f); - Displacement.z = tex3D(tex_zdvf, IndexInDVF.x + 0.5f, IndexInDVF.y + 0.5f, IndexInDVF.z + 0.5f); + Displacement.x = tex3D<float>(tex_xdvf, IndexInDVF.x + 0.5f, IndexInDVF.y + 0.5f, IndexInDVF.z + 0.5f); + Displacement.y = tex3D<float>(tex_ydvf, IndexInDVF.x + 0.5f, IndexInDVF.y + 0.5f, IndexInDVF.z + 0.5f); + Displacement.z = tex3D<float>(tex_zdvf, IndexInDVF.x + 0.5f, IndexInDVF.y + 0.5f, IndexInDVF.z + 0.5f); // Compute the physical point in input + the displacement vector PP = matrix_multiply(make_float3(i, j, k), c_IndexInputToPPInputMatrix) + Displacement; @@ -167,7 +169,7 @@ kernel_warp_back_project_3Dgrid_cylindrical_detector(float * dev_vol_in, float * ip.y = c_projPPToProjIndex[3] * pp.x + c_projPPToProjIndex[4] * pp.y + c_projPPToProjIndex[5]; // Get texture point, clip left to GPU - voxel_data += tex2DLayered(tex_proj, ip.x, ip.y, proj); + voxel_data += tex2DLayered<float>(tex_proj, ip.x, ip.y, proj); } // Place it into the volume @@ -196,13 +198,6 @@ CUDA_warp_back_project(int projSize[3], float IndexInputToPPInputMatrix[12], double radiusCylindricalDetector) { - // Create CUBLAS context - cublasHandle_t handle; - cublasCreate(&handle); - - int device; - cudaGetDevice(&device); - // Copy the size of inputs into constant memory cudaMemcpyToSymbol(c_projSize, projSize, sizeof(int3)); cudaMemcpyToSymbol(c_volSize, volSize, sizeof(int3)); @@ -212,96 +207,15 @@ CUDA_warp_back_project(int projSize[3], cudaMemcpyToSymbol(c_volIndexToProjPP, &(volIndexToProjPPs[0]), 12 * sizeof(float) * projSize[2]); cudaMemcpyToSymbol(c_projPPToProjIndex, &(projPPToProjIndex[0]), 9 * sizeof(float)); - // set texture parameters - tex_proj.addressMode[0] = cudaAddressModeBorder; - tex_proj.addressMode[1] = cudaAddressModeBorder; - tex_proj.addressMode[2] = cudaAddressModeBorder; - tex_proj.filterMode = cudaFilterModeLinear; - tex_proj.normalized = false; // don't access with normalized texture coords - - // Copy projection data to array, bind the array to the texture - cudaExtent projExtent = make_cudaExtent(projSize[0], projSize[1], projSize[2]); - cudaArray * array_proj; - static cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc<float>(); - CUDA_CHECK_ERROR; - - // Allocate array for input projections, in order to bind them to - // a 2D layered texture - cudaMalloc3DArray((cudaArray **)&array_proj, &channelDesc, projExtent, cudaArrayLayered); - CUDA_CHECK_ERROR; - - // Copy data to 3D array - cudaMemcpy3DParms copyParams = cudaMemcpy3DParms(); - copyParams.srcPtr = make_cudaPitchedPtr(dev_proj, projSize[0] * sizeof(float), projSize[0], projSize[1]); - copyParams.dstArray = (cudaArray *)array_proj; - copyParams.extent = projExtent; - copyParams.kind = cudaMemcpyDeviceToDevice; - cudaMemcpy3D(©Params); - CUDA_CHECK_ERROR; + // Prepare proj texture + cudaArray * array_proj; + cudaTextureObject_t tex_proj; + prepareScalarTextureObject(projSize, dev_proj, array_proj, tex_proj, true); - // Extent stuff, will be used for each component extraction - cudaExtent dvfExtent = make_cudaExtent(dvf_size[0], dvf_size[1], dvf_size[2]); - - // Set texture parameters - tex_xdvf.addressMode[0] = cudaAddressModeBorder; - tex_xdvf.addressMode[1] = cudaAddressModeBorder; - tex_xdvf.addressMode[2] = cudaAddressModeBorder; - tex_xdvf.filterMode = cudaFilterModeLinear; - tex_xdvf.normalized = false; // don't access with normalized texture coords - - tex_ydvf.addressMode[0] = cudaAddressModeBorder; - tex_ydvf.addressMode[1] = cudaAddressModeBorder; - tex_ydvf.addressMode[2] = cudaAddressModeBorder; - tex_ydvf.filterMode = cudaFilterModeLinear; - tex_ydvf.normalized = false; - - tex_zdvf.addressMode[0] = cudaAddressModeBorder; - tex_zdvf.addressMode[1] = cudaAddressModeBorder; - tex_zdvf.addressMode[2] = cudaAddressModeBorder; - tex_zdvf.filterMode = cudaFilterModeLinear; - tex_zdvf.normalized = false; - - // Allocate an intermediate memory space to extract x, y and z components of the DVF - float * DVFcomponent; - int numel = dvf_size[0] * dvf_size[1] * dvf_size[2]; - cudaMalloc(&DVFcomponent, numel * sizeof(float)); - float one = 1.0; - - // Allocate the arrays used for textures - cudaArray ** DVFcomponentArrays = new cudaArray *[3]; - CUDA_CHECK_ERROR; - - // Copy image data to arrays. The tricky part is the make_cudaPitchedPtr. - // The best way to understand it is to read - // https://stackoverflow.com/questions/16119943/how-and-when-should-i-use-pitched-pointer-with-the-cuda-api - for (unsigned int component = 0; component < 3; component++) - { - // Reset the intermediate memory - cudaMemset((void *)DVFcomponent, 0, numel * sizeof(float)); - - // Fill it with the current component - float * pComponent = dev_input_dvf + component; - cublasSaxpy(handle, numel, &one, pComponent, 3, DVFcomponent, 1); - - // Allocate the cudaArray and fill it with the current DVFcomponent - cudaMalloc3DArray((cudaArray **)&DVFcomponentArrays[component], &channelDesc, dvfExtent); - cudaMemcpy3DParms CopyParams = cudaMemcpy3DParms(); - CopyParams.srcPtr = make_cudaPitchedPtr(DVFcomponent, dvf_size[0] * sizeof(float), dvf_size[0], dvf_size[1]); - CopyParams.dstArray = (cudaArray *)DVFcomponentArrays[component]; - CopyParams.extent = dvfExtent; - CopyParams.kind = cudaMemcpyDeviceToDevice; - cudaMemcpy3D(&CopyParams); - CUDA_CHECK_ERROR; - } - - // Intermediate memory is no longer needed - cudaFree(DVFcomponent); - - // Bind 3D arrays to 3D textures - cudaBindTextureToArray(tex_xdvf, (cudaArray *)DVFcomponentArrays[0], channelDesc); - cudaBindTextureToArray(tex_ydvf, (cudaArray *)DVFcomponentArrays[1], channelDesc); - cudaBindTextureToArray(tex_zdvf, (cudaArray *)DVFcomponentArrays[2], channelDesc); - CUDA_CHECK_ERROR; + // Prepare DVF textures + std::vector<cudaArray *> DVFComponentArrays; + std::vector<cudaTextureObject_t> tex_dvf; + prepareVectorTextureObject(dvf_size, dev_input_dvf, DVFComponentArrays, 3, tex_dvf, false); // Copy matrices into constant memory cudaMemcpyToSymbol( @@ -329,35 +243,27 @@ CUDA_warp_back_project(int projSize[3], dim3 dimBlock = dim3(tBlock_x, tBlock_y, tBlock_z); CUDA_CHECK_ERROR; - // Bind the array of projections to a 2D layered texture - cudaBindTextureToArray(tex_proj, (cudaArray *)array_proj, channelDesc); - CUDA_CHECK_ERROR; - // Note: cbi->img is passed via texture memory // Matrices are passed via constant memory //------------------------------------- if (radiusCylindricalDetector == 0) - kernel_warp_back_project_3Dgrid<<<dimGrid, dimBlock>>>(dev_vol_in, dev_vol_out); + kernel_warp_back_project_3Dgrid<<<dimGrid, dimBlock>>>( + dev_vol_in, dev_vol_out, tex_dvf[0], tex_dvf[1], tex_dvf[2], tex_proj); else kernel_warp_back_project_3Dgrid_cylindrical_detector<<<dimGrid, dimBlock>>>( - dev_vol_in, dev_vol_out, (float)radiusCylindricalDetector); - // Unbind the image and projection matrix textures - cudaUnbindTexture(tex_proj); + dev_vol_in, dev_vol_out, (float)radiusCylindricalDetector, tex_dvf[0], tex_dvf[1], tex_dvf[2], tex_proj); CUDA_CHECK_ERROR; - // Unbind the image and projection matrix textures - cudaUnbindTexture(tex_xdvf); - cudaUnbindTexture(tex_ydvf); - cudaUnbindTexture(tex_zdvf); - // Cleanup - cudaFreeArray((cudaArray *)DVFcomponentArrays[0]); - cudaFreeArray((cudaArray *)DVFcomponentArrays[1]); - cudaFreeArray((cudaArray *)DVFcomponentArrays[2]); - delete[] DVFcomponentArrays; + for (unsigned int c = 0; c < 3; c++) + { + cudaFreeArray(DVFComponentArrays[c]); + CUDA_CHECK_ERROR; + cudaDestroyTextureObject(tex_dvf[c]); + CUDA_CHECK_ERROR; + } cudaFreeArray((cudaArray *)array_proj); CUDA_CHECK_ERROR; - - // Destroy CUBLAS context - cublasDestroy(handle); + cudaDestroyTextureObject(tex_proj); + CUDA_CHECK_ERROR; } diff --git a/src/rtkCudaWarpForwardProjectionImageFilter.cu b/src/rtkCudaWarpForwardProjectionImageFilter.cu index 0b7578726..094cc1f42 100644 --- a/src/rtkCudaWarpForwardProjectionImageFilter.cu +++ b/src/rtkCudaWarpForwardProjectionImageFilter.cu @@ -36,15 +36,9 @@ * CUDA #includes * *****************/ #include <cuda.h> -#include <cublas_v2.h> #include <cuda_runtime.h> -// TEXTURES AND CONSTANTS // -texture<float, 3, cudaReadModeElementType> tex_xdvf; -texture<float, 3, cudaReadModeElementType> tex_ydvf; -texture<float, 3, cudaReadModeElementType> tex_zdvf; -texture<float, 3, cudaReadModeElementType> tex_vol; - +// CONSTANTS __constant__ int3 c_projSize; __constant__ float3 c_boxMin; __constant__ float3 c_boxMax; @@ -65,7 +59,12 @@ __constant__ float c_PPInputToIndexInputMatrix[12]; // KERNEL kernel_forwardProject __global__ void -kernel_warped_forwardProject(float * dev_proj_in, float * dev_proj_out) +kernel_warped_forwardProject(float * dev_proj_in, + float * dev_proj_out, + cudaTextureObject_t tex_xdvf, + cudaTextureObject_t tex_ydvf, + cudaTextureObject_t tex_zdvf, + cudaTextureObject_t tex_vol) { unsigned int i = blockIdx.x * blockDim.x + threadIdx.x; unsigned int j = blockIdx.y * blockDim.y + threadIdx.y; @@ -122,9 +121,9 @@ kernel_warped_forwardProject(float * dev_proj_in, float * dev_proj_out) // Get each component of the displacement vector by // interpolation in the dvf - Displacement.x = tex3D(tex_xdvf, IndexInDVF.x + 0.5f, IndexInDVF.y + 0.5f, IndexInDVF.z + 0.5f); - Displacement.y = tex3D(tex_ydvf, IndexInDVF.x + 0.5f, IndexInDVF.y + 0.5f, IndexInDVF.z + 0.5f); - Displacement.z = tex3D(tex_zdvf, IndexInDVF.x + 0.5f, IndexInDVF.y + 0.5f, IndexInDVF.z + 0.5f); + Displacement.x = tex3D<float>(tex_xdvf, IndexInDVF.x + 0.5f, IndexInDVF.y + 0.5f, IndexInDVF.z + 0.5f); + Displacement.y = tex3D<float>(tex_ydvf, IndexInDVF.x + 0.5f, IndexInDVF.y + 0.5f, IndexInDVF.z + 0.5f); + Displacement.z = tex3D<float>(tex_zdvf, IndexInDVF.x + 0.5f, IndexInDVF.y + 0.5f, IndexInDVF.z + 0.5f); // Matrix multiply to get the physical coordinates of the current point in the output volume // + the displacement @@ -134,7 +133,7 @@ kernel_warped_forwardProject(float * dev_proj_in, float * dev_proj_out) IndexInInput = matrix_multiply(PP, c_PPInputToIndexInputMatrix); // Read from 3D texture from volume - sample = tex3D(tex_vol, IndexInInput.x, IndexInInput.y, IndexInInput.z); + sample = tex3D<float>(tex_vol, IndexInInput.x, IndexInInput.y, IndexInInput.z); // Accumulate, and move forward along the ray sum += sample; @@ -172,10 +171,6 @@ CUDA_warp_forward_project(int projSize[3], float PPInputToIndexInputMatrix[12], float IndexInputToPPInputMatrix[12]) { - // Create CUBLAS context - cublasHandle_t handle; - cublasCreate(&handle); - // constant memory cudaMemcpyToSymbol(c_projSize, projSize, sizeof(int3)); cudaMemcpyToSymbol(c_boxMin, box_min, sizeof(float3)); @@ -190,94 +185,15 @@ CUDA_warp_forward_project(int projSize[3], // Copy the projection matrices into constant memory cudaMemcpyToSymbol(c_matrices, &(matrices[0]), 12 * sizeof(float) * projSize[2]); - // Prepare channel description for arrays - static cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc<float>(); - - // Extent stuff, will be used for each component extraction - cudaExtent dvfExtent = make_cudaExtent(dvfSize[0], dvfSize[1], dvfSize[2]); - - // Set texture parameters for the input volume - tex_vol.addressMode[0] = cudaAddressModeClamp; // clamp texture coordinates - tex_vol.addressMode[1] = cudaAddressModeClamp; - tex_vol.addressMode[2] = cudaAddressModeClamp; - tex_vol.normalized = false; // access with normalized texture coordinates - tex_vol.filterMode = cudaFilterModeLinear; // linear interpolation - - // Copy volume data to array, bind the array to the texture - cudaExtent volExtent = make_cudaExtent(volSize[0], volSize[1], volSize[2]); - cudaArray * array_vol; - cudaMalloc3DArray((cudaArray **)&array_vol, &channelDesc, volExtent); - CUDA_CHECK_ERROR; - - // Copy data to 3D array - cudaMemcpy3DParms copyParams = cudaMemcpy3DParms(); - copyParams.srcPtr = make_cudaPitchedPtr(dev_vol, volSize[0] * sizeof(float), volSize[0], volSize[1]); - copyParams.dstArray = (cudaArray *)array_vol; - copyParams.extent = volExtent; - copyParams.kind = cudaMemcpyDeviceToDevice; - cudaMemcpy3D(©Params); - CUDA_CHECK_ERROR; - - // Set texture parameters - tex_xdvf.addressMode[0] = cudaAddressModeBorder; - tex_xdvf.addressMode[1] = cudaAddressModeBorder; - tex_xdvf.addressMode[2] = cudaAddressModeBorder; - tex_xdvf.filterMode = cudaFilterModeLinear; - tex_xdvf.normalized = false; // don't access with normalized texture coords - - tex_ydvf.addressMode[0] = cudaAddressModeBorder; - tex_ydvf.addressMode[1] = cudaAddressModeBorder; - tex_ydvf.addressMode[2] = cudaAddressModeBorder; - tex_ydvf.filterMode = cudaFilterModeLinear; - tex_ydvf.normalized = false; - - tex_zdvf.addressMode[0] = cudaAddressModeBorder; - tex_zdvf.addressMode[1] = cudaAddressModeBorder; - tex_zdvf.addressMode[2] = cudaAddressModeBorder; - tex_zdvf.filterMode = cudaFilterModeLinear; - tex_zdvf.normalized = false; - - // Allocate an intermediate memory space to extract x, y and z components of the DVF - float * DVFcomponent; - int numel = dvfSize[0] * dvfSize[1] * dvfSize[2]; - cudaMalloc(&DVFcomponent, numel * sizeof(float)); - float one = 1.0; - - // Allocate the arrays used for textures - cudaArray ** DVFcomponentArrays = new cudaArray *[3]; - CUDA_CHECK_ERROR; - - // Copy image data to arrays. The tricky part is the make_cudaPitchedPtr. - // The best way to understand it is to read - // https://stackoverflow.com/questions/16119943/how-and-when-should-i-use-pitched-pointer-with-the-cuda-api - for (unsigned int component = 0; component < 3; component++) - { - // Reset the intermediate memory - cudaMemset((void *)DVFcomponent, 0, numel * sizeof(float)); - - // Fill it with the current component - float * pComponent = dev_input_dvf + component; - cublasSaxpy(handle, numel, &one, pComponent, 3, DVFcomponent, 1); - - // Allocate the cudaArray and fill it with the current DVFcomponent - cudaMalloc3DArray((cudaArray **)&DVFcomponentArrays[component], &channelDesc, dvfExtent); - cudaMemcpy3DParms CopyParams = cudaMemcpy3DParms(); - CopyParams.srcPtr = make_cudaPitchedPtr(DVFcomponent, dvfSize[0] * sizeof(float), dvfSize[0], dvfSize[1]); - CopyParams.dstArray = (cudaArray *)DVFcomponentArrays[component]; - CopyParams.extent = dvfExtent; - CopyParams.kind = cudaMemcpyDeviceToDevice; - cudaMemcpy3D(&CopyParams); - CUDA_CHECK_ERROR; - } - - // Intermediate memory is no longer needed - cudaFree(DVFcomponent); + // Prepare volume texture + cudaArray * array_vol; + cudaTextureObject_t tex_vol; + prepareScalarTextureObject(volSize, dev_vol, array_vol, tex_vol, false, true, cudaAddressModeClamp); - // Bind 3D arrays to 3D textures - cudaBindTextureToArray(tex_xdvf, (cudaArray *)DVFcomponentArrays[0], channelDesc); - cudaBindTextureToArray(tex_ydvf, (cudaArray *)DVFcomponentArrays[1], channelDesc); - cudaBindTextureToArray(tex_zdvf, (cudaArray *)DVFcomponentArrays[2], channelDesc); - CUDA_CHECK_ERROR; + // Prepare DVF textures + std::vector<cudaArray *> DVFComponentArrays; + std::vector<cudaTextureObject_t> tex_dvf; + prepareVectorTextureObject(dvfSize, dev_input_dvf, DVFComponentArrays, 3, tex_dvf, false); // Copy matrices into constant memory cudaMemcpyToSymbol( @@ -292,25 +208,19 @@ CUDA_warp_forward_project(int projSize[3], dim3 dimBlock = dim3(16, 16, 1); dim3 dimGrid = dim3(iDivUp(projSize[0], dimBlock.x), iDivUp(projSize[1], dimBlock.y)); - // Bind 3D array to 3D texture - cudaBindTextureToArray(tex_vol, (cudaArray *)array_vol, channelDesc); - CUDA_CHECK_ERROR; - - kernel_warped_forwardProject<<<dimGrid, dimBlock>>>(dev_proj_in, dev_proj_out); - - cudaUnbindTexture(tex_xdvf); - cudaUnbindTexture(tex_ydvf); - cudaUnbindTexture(tex_zdvf); - cudaUnbindTexture(tex_vol); - CUDA_CHECK_ERROR; + kernel_warped_forwardProject<<<dimGrid, dimBlock>>>( + dev_proj_in, dev_proj_out, tex_dvf[0], tex_dvf[1], tex_dvf[2], tex_vol); - cudaFreeArray((cudaArray *)DVFcomponentArrays[0]); - cudaFreeArray((cudaArray *)DVFcomponentArrays[1]); - cudaFreeArray((cudaArray *)DVFcomponentArrays[2]); - delete[] DVFcomponentArrays; + // Cleanup + for (unsigned int c = 0; c < 3; c++) + { + cudaFreeArray(DVFComponentArrays[c]); + CUDA_CHECK_ERROR; + cudaDestroyTextureObject(tex_dvf[c]); + CUDA_CHECK_ERROR; + } cudaFreeArray((cudaArray *)array_vol); CUDA_CHECK_ERROR; - - // Destroy CUBLAS context - cublasDestroy(handle); + cudaDestroyTextureObject(tex_vol); + CUDA_CHECK_ERROR; } diff --git a/src/rtkCudaWarpImageFilter.cu b/src/rtkCudaWarpImageFilter.cu index 132cbe9cb..0038ec72d 100644 --- a/src/rtkCudaWarpImageFilter.cu +++ b/src/rtkCudaWarpImageFilter.cu @@ -38,19 +38,6 @@ * CUDA #includes * *****************/ #include <cuda.h> -#include <cublas_v2.h> -#include <cuda_runtime.h> - -// T E X T U R E S //////////////////////////////////////////////////////// -texture<float, 1, cudaReadModeElementType> tex_IndexOutputToPPOutputMatrix; -texture<float, 1, cudaReadModeElementType> tex_IndexOutputToIndexDVFMatrix; -texture<float, 1, cudaReadModeElementType> tex_PPInputToIndexInputMatrix; - -texture<float, 3, cudaReadModeElementType> tex_xdvf; -texture<float, 3, cudaReadModeElementType> tex_ydvf; -texture<float, 3, cudaReadModeElementType> tex_zdvf; -texture<float, 3, cudaReadModeElementType> tex_input_vol; -/////////////////////////////////////////////////////////////////////////// // CONSTANTS ////////////////////////////////////////////////////////////// __constant__ float c_IndexOutputToPPOutputMatrix[12]; @@ -64,7 +51,12 @@ __constant__ float c_PPInputToIndexInputMatrix[12]; //_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_ __global__ void -kernel_3Dgrid(float * dev_vol_out, int3 vol_dim) +kernel_3Dgrid(float * dev_vol_out, + int3 vol_dim, + cudaTextureObject_t tex_xdvf, + cudaTextureObject_t tex_ydvf, + cudaTextureObject_t tex_zdvf, + cudaTextureObject_t tex_input_vol) { unsigned int i = __umul24(blockIdx.x, blockDim.x) + threadIdx.x; unsigned int j = __umul24(blockIdx.y, blockDim.y) + threadIdx.y; @@ -84,9 +76,9 @@ kernel_3Dgrid(float * dev_vol_out, int3 vol_dim) // Get each component of the displacement vector by // interpolation in the dvf float3 Displacement; - Displacement.x = tex3D(tex_xdvf, IndexInDVF.x + 0.5f, IndexInDVF.y + 0.5f, IndexInDVF.z + 0.5f); - Displacement.y = tex3D(tex_ydvf, IndexInDVF.x + 0.5f, IndexInDVF.y + 0.5f, IndexInDVF.z + 0.5f); - Displacement.z = tex3D(tex_zdvf, IndexInDVF.x + 0.5f, IndexInDVF.y + 0.5f, IndexInDVF.z + 0.5f); + Displacement.x = tex3D<float>(tex_xdvf, IndexInDVF.x + 0.5f, IndexInDVF.y + 0.5f, IndexInDVF.z + 0.5f); + Displacement.y = tex3D<float>(tex_ydvf, IndexInDVF.x + 0.5f, IndexInDVF.y + 0.5f, IndexInDVF.z + 0.5f); + Displacement.z = tex3D<float>(tex_zdvf, IndexInDVF.x + 0.5f, IndexInDVF.y + 0.5f, IndexInDVF.z + 0.5f); // Matrix multiply to get the physical coordinates of the current point in the output volume float3 PP = matrix_multiply(make_float3(i, j, k), c_IndexOutputToPPOutputMatrix); @@ -98,7 +90,8 @@ kernel_3Dgrid(float * dev_vol_out, int3 vol_dim) float3 IndexInInput = matrix_multiply(PP, c_PPInputToIndexInputMatrix); // Interpolate in the input and copy into the output - dev_vol_out[vol_idx] = tex3D(tex_input_vol, IndexInInput.x + 0.5f, IndexInInput.y + 0.5f, IndexInInput.z + 0.5f); + dev_vol_out[vol_idx] = + tex3D<float>(tex_input_vol, IndexInInput.x + 0.5f, IndexInInput.y + 0.5f, IndexInInput.z + 0.5f); } //_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_ @@ -120,116 +113,15 @@ CUDA_warp(int input_vol_dim[3], float * dev_DVF, bool isLinear) { - // Prepare channel description for arrays - static cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc<float>(); - - // Create CUBLAS context - cublasHandle_t handle; - cublasCreate(&handle); - - /////////////////////////////////// - // For each component of the dvf, perform a strided copy (pick every third - // float from dev_input_dvf) into a 3D array, and bind the array to a 3D texture - - // Extent stuff, will be used for each component extraction - cudaExtent dvfExtent = make_cudaExtent(input_dvf_dim[0], input_dvf_dim[1], input_dvf_dim[2]); - - // Set texture parameters - tex_xdvf.addressMode[0] = cudaAddressModeBorder; - tex_xdvf.addressMode[1] = cudaAddressModeBorder; - tex_xdvf.addressMode[2] = cudaAddressModeBorder; - tex_xdvf.filterMode = cudaFilterModeLinear; - tex_xdvf.normalized = false; // don't access with normalized texture coords - - tex_ydvf.addressMode[0] = cudaAddressModeBorder; - tex_ydvf.addressMode[1] = cudaAddressModeBorder; - tex_ydvf.addressMode[2] = cudaAddressModeBorder; - tex_ydvf.filterMode = cudaFilterModeLinear; - tex_ydvf.normalized = false; - - tex_zdvf.addressMode[0] = cudaAddressModeBorder; - tex_zdvf.addressMode[1] = cudaAddressModeBorder; - tex_zdvf.addressMode[2] = cudaAddressModeBorder; - tex_zdvf.filterMode = cudaFilterModeLinear; - tex_zdvf.normalized = false; - - // Allocate an intermediate memory space to extract x, y and z components of the DVF - float * DVFcomponent; - int numel = input_dvf_dim[0] * input_dvf_dim[1] * input_dvf_dim[2]; - cudaMalloc(&DVFcomponent, numel * sizeof(float)); - float one = 1.0; - - // Allocate the arrays used for textures - cudaArray ** DVFcomponentArrays = new cudaArray *[3]; - CUDA_CHECK_ERROR; - - // Copy image data to arrays. The tricky part is the make_cudaPitchedPtr. - // The best way to understand it is to read - // https://stackoverflow.com/questions/16119943/how-and-when-should-i-use-pitched-pointer-with-the-cuda-api - for (unsigned int component = 0; component < 3; component++) - { - // Reset the intermediate memory - cudaMemset((void *)DVFcomponent, 0, numel * sizeof(float)); - - // Fill it with the current component - float * pComponent = dev_DVF + component; - cublasSaxpy(handle, numel, &one, pComponent, 3, DVFcomponent, 1); - - // Allocate the cudaArray and fill it with the current DVFcomponent - cudaMalloc3DArray((cudaArray **)&DVFcomponentArrays[component], &channelDesc, dvfExtent); - cudaMemcpy3DParms CopyParams = cudaMemcpy3DParms(); - CopyParams.srcPtr = - make_cudaPitchedPtr(DVFcomponent, input_dvf_dim[0] * sizeof(float), input_dvf_dim[0], input_dvf_dim[1]); - CopyParams.dstArray = (cudaArray *)DVFcomponentArrays[component]; - CopyParams.extent = dvfExtent; - CopyParams.kind = cudaMemcpyDeviceToDevice; - cudaMemcpy3D(&CopyParams); - CUDA_CHECK_ERROR; - } - - // Intermediate memory is no longer needed - cudaFree(DVFcomponent); - - // Bind 3D arrays to 3D textures - cudaBindTextureToArray(tex_xdvf, (cudaArray *)DVFcomponentArrays[0], channelDesc); - cudaBindTextureToArray(tex_ydvf, (cudaArray *)DVFcomponentArrays[1], channelDesc); - cudaBindTextureToArray(tex_zdvf, (cudaArray *)DVFcomponentArrays[2], channelDesc); - CUDA_CHECK_ERROR; - - /////////////////////////////////// - // Do the same for the input volume - - // Extent stuff - cudaExtent volExtent = make_cudaExtent(input_vol_dim[0], input_vol_dim[1], input_vol_dim[2]); - - // Set texture parameters - tex_input_vol.addressMode[0] = cudaAddressModeBorder; - tex_input_vol.addressMode[1] = cudaAddressModeBorder; - tex_input_vol.addressMode[2] = cudaAddressModeBorder; - tex_input_vol.normalized = false; // don't access with normalized texture coords - if (isLinear) - tex_input_vol.filterMode = cudaFilterModeLinear; - else - tex_input_vol.filterMode = cudaFilterModePoint; - - // Allocate the array - cudaArray * array_input_vol; - cudaMalloc3DArray((cudaArray **)&array_input_vol, &channelDesc, volExtent); - CUDA_CHECK_ERROR; + // Prepare DVF textures + std::vector<cudaArray *> DVFComponentArrays; + std::vector<cudaTextureObject_t> tex_dvf; + prepareVectorTextureObject(input_dvf_dim, dev_DVF, DVFComponentArrays, 3, tex_dvf, false); - // Copy image data to array - cudaMemcpy3DParms inputCopyParams = cudaMemcpy3DParms(); - inputCopyParams.srcPtr = - make_cudaPitchedPtr(dev_input_vol, input_vol_dim[0] * sizeof(float), input_vol_dim[0], input_vol_dim[1]); - inputCopyParams.dstArray = (cudaArray *)array_input_vol; - inputCopyParams.extent = volExtent; - inputCopyParams.kind = cudaMemcpyDeviceToDevice; - cudaMemcpy3D(&inputCopyParams); - CUDA_CHECK_ERROR; - - // Bind 3D arrays to 3D textures - cudaBindTextureToArray(tex_input_vol, (cudaArray *)array_input_vol, channelDesc); - CUDA_CHECK_ERROR; + // Prepare volume texture + cudaArray * array_input_vol; + cudaTextureObject_t tex_input_vol; + prepareScalarTextureObject(output_vol_dim, dev_input_vol, array_input_vol, tex_input_vol, false, isLinear); // Copy matrices into constant memory cudaMemcpyToSymbol( @@ -238,12 +130,7 @@ CUDA_warp(int input_vol_dim[3], c_IndexOutputToIndexDVFMatrix, IndexOutputToIndexDVFMatrix, 12 * sizeof(float), 0, cudaMemcpyHostToDevice); cudaMemcpyToSymbol( c_PPInputToIndexInputMatrix, PPInputToIndexInputMatrix, 12 * sizeof(float), 0, cudaMemcpyHostToDevice); - - ////////////////////////////////////// - /// Run - - int device; - cudaGetDevice(&device); + CUDA_CHECK_ERROR; // Thread Block Dimensions constexpr int tBlock_x = 16; @@ -258,29 +145,24 @@ CUDA_warp(int input_vol_dim[3], dim3 dimGrid = dim3(blocksInX, blocksInY, blocksInZ); dim3 dimBlock = dim3(tBlock_x, tBlock_y, tBlock_z); - // Note: the DVF and input image are passed via texture memory - //------------------------------------- kernel_3Dgrid<<<dimGrid, dimBlock>>>(dev_output_vol, - make_int3(output_vol_dim[0], output_vol_dim[1], output_vol_dim[2])); - - CUDA_CHECK_ERROR; - - // Unbind the image and projection matrix textures - cudaUnbindTexture(tex_xdvf); - cudaUnbindTexture(tex_ydvf); - cudaUnbindTexture(tex_zdvf); - cudaUnbindTexture(tex_input_vol); + make_int3(output_vol_dim[0], output_vol_dim[1], output_vol_dim[2]), + tex_dvf[0], + tex_dvf[1], + tex_dvf[2], + tex_input_vol); CUDA_CHECK_ERROR; // Cleanup + for (unsigned int c = 0; c < 3; c++) + { + cudaFreeArray(DVFComponentArrays[c]); + CUDA_CHECK_ERROR; + cudaDestroyTextureObject(tex_dvf[c]); + CUDA_CHECK_ERROR; + } + cudaFreeArray(array_input_vol); CUDA_CHECK_ERROR; - cudaFreeArray((cudaArray *)DVFcomponentArrays[0]); - cudaFreeArray((cudaArray *)DVFcomponentArrays[1]); - cudaFreeArray((cudaArray *)DVFcomponentArrays[2]); - cudaFreeArray((cudaArray *)array_input_vol); - delete[] DVFcomponentArrays; + cudaDestroyTextureObject(tex_input_vol); CUDA_CHECK_ERROR; - - // Destroy CUBLAS context - cublasDestroy(handle); } diff --git a/src/rtkCudaWeidingerForwardModelImageFilter.cu b/src/rtkCudaWeidingerForwardModelImageFilter.cu index cf3522a63..e03c8dc8e 100644 --- a/src/rtkCudaWeidingerForwardModelImageFilter.cu +++ b/src/rtkCudaWeidingerForwardModelImageFilter.cu @@ -38,7 +38,6 @@ * CUDA #includes * *****************/ #include <cuda.h> -#include <cublas_v2.h> #include <cuda_runtime.h> #define IDX2D(r, c, cols) ((r) * (cols) + (c)) From 0b4b736c7b73b68730ffba6a3a3b7343122ecaf8 Mon Sep 17 00:00:00 2001 From: Simon Rit <simon.rit@creatis.insa-lyon.fr> Date: Fri, 19 Apr 2024 21:20:42 +0200 Subject: [PATCH 5/5] STYLE: Cleanup useless code after CUDA function calls --- src/rtkCudaForwardWarpImageFilter.cxx | 3 --- src/rtkCudaWarpImageFilter.cxx | 3 --- 2 files changed, 6 deletions(-) diff --git a/src/rtkCudaForwardWarpImageFilter.cxx b/src/rtkCudaForwardWarpImageFilter.cxx index 6d410fb4d..9dc828839 100644 --- a/src/rtkCudaForwardWarpImageFilter.cxx +++ b/src/rtkCudaForwardWarpImageFilter.cxx @@ -144,9 +144,6 @@ CudaForwardWarpImageFilter ::GPUGenerateData() xCompDVF = nullptr; yCompDVF = nullptr; zCompDVF = nullptr; - - // The filter is inPlace - // pinVol = poutVol; } } // end namespace rtk diff --git a/src/rtkCudaWarpImageFilter.cxx b/src/rtkCudaWarpImageFilter.cxx index 0b09c1739..de47ed54c 100644 --- a/src/rtkCudaWarpImageFilter.cxx +++ b/src/rtkCudaWarpImageFilter.cxx @@ -109,9 +109,6 @@ CudaWarpImageFilter ::GPUGenerateData() poutVol, pDVF, isLinear); - - // The filter is inPlace - pinVol = poutVol; } } // end namespace rtk