Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix CUDA and cublas memory management for large arrays #598

Merged
merged 3 commits into from
May 12, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 6 additions & 6 deletions include/rtkCudaConjugateGradientImageFilter.hcu
Original file line number Diff line number Diff line change
#include "RTKExport.h"

// Device-to-device copy of numberOfElements values from in to out.
// Counts are size_t so arrays with more than 2^31 elements are supported.
void RTK_EXPORT
CUDA_copy(size_t numberOfElements, float * in, float * out);

void RTK_EXPORT
CUDA_copy(size_t numberOfElements, double * in, double * out);

// In-place element-wise subtraction on the device: out -= toBeSubtracted.
void RTK_EXPORT
CUDA_subtract(size_t numberOfElements, float * out, float * toBeSubtracted);

void RTK_EXPORT
CUDA_subtract(size_t numberOfElements, double * out, double * toBeSubtracted);

// One conjugate-gradient update step on device arrays: updates Xk, Rk and Pk
// in place from the current A*Pk product (APk).
void RTK_EXPORT
CUDA_conjugate_gradient(size_t numberOfElements, float * Xk, float * Rk, float * Pk, float * APk);

void RTK_EXPORT
CUDA_conjugate_gradient(size_t numberOfElements, double * Xk, double * Rk, double * Pk, double * APk);
#endif
4 changes: 2 additions & 2 deletions include/rtkCudaConjugateGradientImageFilter.hxx
Original file line number Diff line number Diff line change
Expand Up @@ -42,8 +42,8 @@ void
CudaConjugateGradientImageFilter<TImage>::GPUGenerateData()
{
using DataType = typename itk::PixelTraits<typename TImage::PixelType>::ValueType;
long int numberOfElements = this->GetOutput()->GetLargestPossibleRegion().GetNumberOfPixels() *
itk::PixelTraits<typename TImage::PixelType>::Dimension;
size_t numberOfElements = this->GetOutput()->GetLargestPossibleRegion().GetNumberOfPixels() *
itk::PixelTraits<typename TImage::PixelType>::Dimension;

// Create and allocate images
typename TImage::Pointer P_k = TImage::New();
Expand Down
2 changes: 1 addition & 1 deletion include/rtkCudaLagCorrectionImageFilter.hcu
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,7 @@ CUDA_lag_correction(int proj_idx_in[3],
unsigned short * dev_proj_in,
unsigned short * dev_proj_out,
float * h_state,
int state_size,
size_t state_size,
float * coefficients);

#endif // rtkCudaLagCorrectionImageFilter_hcu
2 changes: 1 addition & 1 deletion include/rtkCudaPolynomialGainCorrectionImageFilter.hcu
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,7 @@ CUDA_gain_correction(int proj_idx_in[3],
unsigned short * dev_dark_in,
float * dev_gain_in,
float * h_powerlut,
int lut_size,
size_t lut_size,
float * coefficients);

#endif // rtkCudaPolynomialGainCorrectionImageFilter_hcu
1 change: 0 additions & 1 deletion include/rtkCudaUtilities.hcu
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,6 @@
#include <itkMacro.h>
#undef ITK_STATIC
#include <cuda.h>
#include <cublas_v2.h>

#define CUDA_CHECK_ERROR \
{ \
Expand Down
52 changes: 28 additions & 24 deletions src/rtkCudaConjugateGradientImageFilter.cu
Original file line number Diff line number Diff line change
Expand Up @@ -28,29 +28,33 @@
// TEXTURES AND CONSTANTS //

void
CUDA_copy(size_t numberOfElements, float * in, float * out)
{
  // Copy input volume to output, entirely on the device.
  // size_t keeps the byte count from overflowing for buffers over 2 GiB.
  size_t memorySizeOutput = numberOfElements * sizeof(float);
  cudaMemcpy(out, in, memorySizeOutput, cudaMemcpyDeviceToDevice);
}

void
CUDA_copy(size_t numberOfElements, double * in, double * out)
{
  // Copy input volume to output, entirely on the device.
  // size_t keeps the byte count from overflowing for buffers over 2 GiB.
  size_t memorySizeOutput = numberOfElements * sizeof(double);
  cudaMemcpy(out, in, memorySizeOutput, cudaMemcpyDeviceToDevice);
}

void
CUDA_subtract(size_t numberOfElements, float * out, float * toBeSubtracted)
{
  cublasHandle_t handle;
  cublasCreate(&handle);

  const float alpha = -1.0;
#if CUDA_VERSION < 12000
  // Legacy 32-bit cuBLAS interface: counts above INT_MAX cannot be expressed.
  cublasSaxpy(handle, (int)numberOfElements, &alpha, toBeSubtracted, 1, out, 1);
#else
  // CUDA 12 64-bit cuBLAS interface handles arrays larger than 2^31 elements.
  cublasSaxpy_64(handle, numberOfElements, &alpha, toBeSubtracted, 1, out, 1);
#endif

  // Destroy Cublas context
  cublasDestroy(handle);

  CUDA_CHECK_ERROR;
}

void
CUDA_subtract(size_t numberOfElements, double * out, double * toBeSubtracted)
{
  cublasHandle_t handle;
  cublasCreate(&handle);

  const double alpha = -1.0;
#if CUDA_VERSION < 12000
  // Legacy 32-bit cuBLAS interface: counts above INT_MAX cannot be expressed.
  cublasDaxpy(handle, (int)numberOfElements, &alpha, toBeSubtracted, 1, out, 1);
#else
  // Match the float overload: use the CUDA 12 64-bit cuBLAS API so element
  // counts above INT_MAX are not silently truncated by the (int) cast.
  cublasDaxpy_64(handle, numberOfElements, &alpha, toBeSubtracted, 1, out, 1);
#endif

  // Destroy Cublas context
  cublasDestroy(handle);

  CUDA_CHECK_ERROR;
}

void
CUDA_conjugate_gradient(size_t numberOfElements, float * Xk, float * Rk, float * Pk, float * APk)
{
  cublasHandle_t handle;
  cublasCreate(&handle);

  // Small constant guarding the divisions below against a zero denominator.
  // NOTE(review): its exact value is hidden in a collapsed diff hunk — confirm
  // against the original source file.
  const float eps = 1e-30;

#if CUDA_VERSION < 12000
  // Legacy 32-bit cuBLAS interface: the (int) cast truncates counts > INT_MAX.
  const int n = (int)numberOfElements;
#  define RTK_SDOT cublasSdot
#  define RTK_SAXPY cublasSaxpy
#  define RTK_SSCAL cublasSscal
#else
  // CUDA 12 64-bit cuBLAS interface supports arrays larger than 2^31 elements,
  // consistent with the float CUDA_subtract overload above.
  const size_t n = numberOfElements;
#  define RTK_SDOT cublasSdot_64
#  define RTK_SAXPY cublasSaxpy_64
#  define RTK_SSCAL cublasSscal_64
#endif

  // Compute Rk_square = sum(Rk(:).^2) by cublas
  float Rk_square = 0;
  RTK_SDOT(handle, n, Rk, 1, Rk, 1, &Rk_square);

  // Compute alpha_k = Rk_square / sum(Pk(:) .* APk(:))
  float Pk_APk = 0;
  RTK_SDOT(handle, n, Pk, 1, APk, 1, &Pk_APk);

  const float alpha_k = Rk_square / (Pk_APk + eps);
  const float minus_alpha_k = -alpha_k;

  // Compute Xk+1 = Xk + alpha_k * Pk
  RTK_SAXPY(handle, n, &alpha_k, Pk, 1, Xk, 1);

  // Compute Rk+1 = Rk - alpha_k * APk
  RTK_SAXPY(handle, n, &minus_alpha_k, APk, 1, Rk, 1);

  // Compute beta_k = sum(Rk+1(:).^2) / Rk_square
  float Rkplusone_square = 0;
  RTK_SDOT(handle, n, Rk, 1, Rk, 1, &Rkplusone_square);

  float beta_k = Rkplusone_square / (Rk_square + eps);
  float one = 1.0;

  // Compute Pk+1 = Rk+1 + beta_k * Pk
  // This requires two cublas functions,
  // since axpy would store the result in the wrong array
  RTK_SSCAL(handle, n, &beta_k, Pk, 1);
  RTK_SAXPY(handle, n, &one, Rk, 1, Pk, 1);

#undef RTK_SDOT
#undef RTK_SAXPY
#undef RTK_SSCAL

  // Destroy Cublas context
  cublasDestroy(handle);

  CUDA_CHECK_ERROR;
}

void
CUDA_conjugate_gradient(size_t numberOfElements, double * Xk, double * Rk, double * Pk, double * APk)
{
  cublasHandle_t handle;
  cublasCreate(&handle);

  // Small constant guarding the divisions below against a zero denominator.
  // NOTE(review): its exact value is hidden in a collapsed diff hunk — confirm
  // against the original source file.
  const double eps = 1e-30;

#if CUDA_VERSION < 12000
  // Legacy 32-bit cuBLAS interface: the (int) cast truncates counts > INT_MAX.
  const int n = (int)numberOfElements;
#  define RTK_DDOT cublasDdot
#  define RTK_DAXPY cublasDaxpy
#  define RTK_DSCAL cublasDscal
#else
  // CUDA 12 64-bit cuBLAS interface supports arrays larger than 2^31 elements,
  // consistent with the CUDA_subtract overloads above.
  const size_t n = numberOfElements;
#  define RTK_DDOT cublasDdot_64
#  define RTK_DAXPY cublasDaxpy_64
#  define RTK_DSCAL cublasDscal_64
#endif

  // Compute Rk_square = sum(Rk(:).^2) by cublas
  double Rk_square = 0;
  RTK_DDOT(handle, n, Rk, 1, Rk, 1, &Rk_square);

  // Compute alpha_k = Rk_square / sum(Pk(:) .* APk(:))
  double Pk_APk = 0;
  RTK_DDOT(handle, n, Pk, 1, APk, 1, &Pk_APk);

  const double alpha_k = Rk_square / (Pk_APk + eps);
  const double minus_alpha_k = -alpha_k;

  // Compute Xk+1 = Xk + alpha_k * Pk
  RTK_DAXPY(handle, n, &alpha_k, Pk, 1, Xk, 1);

  // Compute Rk+1 = Rk - alpha_k * APk
  RTK_DAXPY(handle, n, &minus_alpha_k, APk, 1, Rk, 1);

  // Compute beta_k = sum(Rk+1(:).^2) / Rk_square
  double Rkplusone_square = 0;
  RTK_DDOT(handle, n, Rk, 1, Rk, 1, &Rkplusone_square);

  double beta_k = Rkplusone_square / (Rk_square + eps);
  double one = 1.0;

  // Compute Pk+1 = Rk+1 + beta_k * Pk
  // This requires two cublas functions,
  // since axpy would store the result in the wrong array
  RTK_DSCAL(handle, n, &beta_k, Pk, 1);
  RTK_DAXPY(handle, n, &one, Rk, 1, Pk, 1);

#undef RTK_DDOT
#undef RTK_DAXPY
#undef RTK_DSCAL

  // Destroy Cublas context
  cublasDestroy(handle);

  CUDA_CHECK_ERROR;
}
Expand Down
2 changes: 1 addition & 1 deletion src/rtkCudaConstantVolumeSeriesSource.cu
Original file line number Diff line number Diff line change
Expand Up @@ -74,7 +74,7 @@ CUDA_generate_constant_volume_series(int size[4], float * dev_out, float constan
// run a kernel to replace the zeros with constantValue.

// Reset output volume
long int memorySizeOutput = size[0] * size[1] * size[2] * size[3] * sizeof(float);
size_t memorySizeOutput = size[0] * size[1] * size[2] * size[3] * sizeof(float);
cudaMemset((void *)dev_out, 0, memorySizeOutput);

if (!(constantValue == 0))
Expand Down
2 changes: 1 addition & 1 deletion src/rtkCudaConstantVolumeSource.cu
Original file line number Diff line number Diff line change
Expand Up @@ -74,7 +74,7 @@ CUDA_generate_constant_volume(int size[3], float * dev_out, float constantValue)
// run a kernel to replace the zeros with constantValue.

// Reset output volume
long int memorySizeOutput = size[0] * size[1] * size[2] * sizeof(float);
size_t memorySizeOutput = size[0] * size[1] * size[2] * sizeof(float);
cudaMemset((void *)dev_out, 0, memorySizeOutput);

if (!(constantValue == 0))
Expand Down
14 changes: 11 additions & 3 deletions src/rtkCudaCyclicDeformationImageFilter.cu
Original file line number Diff line number Diff line change
Expand Up @@ -51,17 +51,25 @@ CUDA_linear_interpolate_along_fourth_dimension(unsigned int inputSize[4],
float wInf = (float)weightInf;
float wSup = (float)weightSup;

int numel = inputSize[0] * inputSize[1] * inputSize[2] * 3;
size_t numel = inputSize[0] * inputSize[1] * inputSize[2] * 3;

cudaMemset((void *)output, 0, numel * sizeof(float));

// Add it weightInf times the frameInf to the output
float * pinf = input + frameInf * numel;
cublasSaxpy(handle, numel, &wInf, pinf, 1, output, 1);
#if CUDA_VERSION < 12000
cublasSaxpy(handle, (int)numel, &wInf, pinf, 1, output, 1);
#else
cublasSaxpy_64(handle, numel, &wInf, pinf, 1, output, 1);
#endif

// Add it weightSup times the frameSup to the output
float * psup = input + frameSup * numel;
cublasSaxpy(handle, numel, &wSup, psup, 1, output, 1);
#if CUDA_VERSION < 12000
cublasSaxpy(handle, (int)numel, &wSup, psup, 1, output, 1);
#else
cublasSaxpy_64(handle, numel, &wSup, psup, 1, output, 1);
#endif

// Destroy Cublas context
cublasDestroy(handle);
Expand Down
2 changes: 1 addition & 1 deletion src/rtkCudaFFTProjectionsConvolutionImageFilter.cu
Original file line number Diff line number Diff line change
Expand Up @@ -78,7 +78,7 @@ CUDA_fft_convolution(const int3 & inputDimension,
int3 fftDimension = inputDimension;
fftDimension.x = inputDimension.x / 2 + 1;

int memorySizeProjectionFFT = fftDimension.x * fftDimension.y * fftDimension.z * sizeof(cufftComplex);
size_t memorySizeProjectionFFT = sizeof(cufftComplex) * fftDimension.x * fftDimension.y * fftDimension.z;
cudaMalloc((void **)&deviceProjectionFFT, memorySizeProjectionFFT);
CUDA_CHECK_ERROR;

Expand Down
9 changes: 4 additions & 5 deletions src/rtkCudaForwardWarpImageFilter.cu
Original file line number Diff line number Diff line change
Expand Up @@ -386,17 +386,16 @@ CUDA_ForwardWarp(int input_vol_dim[3],

///////////////////////////////////////
/// Initialize the output
cudaMemset((void *)dev_output_vol, 0, sizeof(float) * output_vol_dim[0] * output_vol_dim[1] * output_vol_dim[2]);
size_t memorySizeOutput = sizeof(float) * output_vol_dim[0] * output_vol_dim[1] * output_vol_dim[2];
cudaMemset((void *)dev_output_vol, 0, memorySizeOutput);

//////////////////////////////////////
/// Create an image to store the splat weights
/// (in order to normalize)

float * dev_accumulate_weights;
cudaMalloc((void **)&dev_accumulate_weights,
sizeof(float) * output_vol_dim[0] * output_vol_dim[1] * output_vol_dim[2]);
cudaMemset(
(void *)dev_accumulate_weights, 0, sizeof(float) * output_vol_dim[0] * output_vol_dim[1] * output_vol_dim[2]);
cudaMalloc((void **)&dev_accumulate_weights, memorySizeOutput);
cudaMemset((void *)dev_accumulate_weights, 0, memorySizeOutput);

// Thread Block Dimensions
constexpr int tBlock_x = 16;
Expand Down
10 changes: 7 additions & 3 deletions src/rtkCudaInterpolateImageFilter.cu
Original file line number Diff line number Diff line change
Expand Up @@ -34,8 +34,8 @@ CUDA_interpolation(const int4 & inputSize, float * input, float * output, int pr
cublasCreate(&handle);

// CUDA device pointers
int nVoxelsOutput = inputSize.x * inputSize.y * inputSize.z;
int memorySizeOutput = nVoxelsOutput * sizeof(float);
size_t nVoxelsOutput = inputSize.x * inputSize.y * inputSize.z;
size_t memorySizeOutput = nVoxelsOutput * sizeof(float);

// Reset output volume
cudaMemset((void *)output, 0, memorySizeOutput);
Expand All @@ -49,7 +49,11 @@ CUDA_interpolation(const int4 & inputSize, float * input, float * output, int pr
float * p = input + phase * nVoxelsOutput;

// Add "weight" times the "phase"-th volume in the input to the output
cublasSaxpy(handle, nVoxelsOutput, &weight, p, 1, output, 1);
#if CUDA_VERSION < 12000
cublasSaxpy(handle, (int)nVoxelsOutput, &weight, p, 1, output, 1);
#else
cublasSaxpy_64(handle, nVoxelsOutput, &weight, p, 1, output, 1);
#endif
}
}

Expand Down
2 changes: 1 addition & 1 deletion src/rtkCudaLagCorrectionImageFilter.cu
Original file line number Diff line number Diff line change
Expand Up @@ -97,7 +97,7 @@ CUDA_lag_correction(int proj_idx_in[3], // overlapping input r
unsigned short * dev_proj_in,
unsigned short * dev_proj_out,
float * h_state,
int state_size,
size_t state_size,
float * coefficients)
{
// Thread Block Dimensions
Expand Down
2 changes: 1 addition & 1 deletion src/rtkCudaLagCorrectionImageFilter.cxx
Original file line number Diff line number Diff line change
Expand Up @@ -71,7 +71,7 @@ CudaLagCorrectionImageFilter ::GPUGenerateData()

float coefficients[9] = { m_B[0], m_B[1], m_B[2], m_B[3], m_ExpmA[0], m_ExpmA[1], m_ExpmA[2], m_ExpmA[3], m_SumB };

int S_size = sizeof(float) * m_S.size();
size_t S_size = sizeof(float) * m_S.size();
CUDA_lag_correction(proj_idx_in,
proj_size_in,
proj_size_in_buf,
Expand Down
2 changes: 1 addition & 1 deletion src/rtkCudaLaplacianImageFilter.cu
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,7 @@ CUDA_laplacian(int size[3], float spacing[3], float * dev_in, float * dev_out)
float3 dev_Spacing = make_float3(spacing[0], spacing[1], spacing[2]);

// Reset output volume
long int memorySizeOutput = size[0] * size[1] * size[2] * sizeof(float);
size_t memorySizeOutput = sizeof(float) * size[0] * size[1] * size[2];
cudaMemset((void *)dev_out, 0, memorySizeOutput);

// Initialize volumes to store the gradient components
Expand Down
2 changes: 1 addition & 1 deletion src/rtkCudaPolynomialGainCorrectionImageFilter.cu
Original file line number Diff line number Diff line change
Expand Up @@ -94,7 +94,7 @@ CUDA_gain_correction(int proj_idx_in[3], // overlapping input
unsigned short * dev_dark_in,
float * dev_gain_in,
float * h_powerlut,
int lut_size,
size_t lut_size,
float * coefficients)
{
// Thread Block Dimensions
Expand Down
2 changes: 1 addition & 1 deletion src/rtkCudaPolynomialGainCorrectionImageFilter.cxx
Original file line number Diff line number Diff line change
Expand Up @@ -74,7 +74,7 @@ CudaPolynomialGainCorrectionImageFilter ::GPUGenerateData()

float coefficients[2] = { static_cast<float>(m_VModelOrder), m_K };

int LUT_size = sizeof(float) * m_PowerLut.size();
size_t LUT_size = sizeof(float) * m_PowerLut.size();
CUDA_gain_correction(proj_idx_in,
proj_size_in,
proj_size_in_buf,
Expand Down
9 changes: 5 additions & 4 deletions src/rtkCudaRayCastBackProjectionImageFilter.cu
Original file line number Diff line number Diff line change
Expand Up @@ -285,13 +285,14 @@ CUDA_ray_cast_back_project(int projSize[3],
// normalization by the splat weights would affect not only the backprojection
// of the current projection, but also the initial value of the output
float * dev_accumulate_values;
cudaMalloc((void **)&dev_accumulate_values, sizeof(float) * volSize[0] * volSize[1] * volSize[2]);
cudaMemset((void *)dev_accumulate_values, 0, sizeof(float) * volSize[0] * volSize[1] * volSize[2]);
size_t volMemSize = sizeof(float) * volSize[0] * volSize[1] * volSize[2];
cudaMalloc((void **)&dev_accumulate_values, volMemSize);
cudaMemset((void *)dev_accumulate_values, 0, volMemSize);

// Create an image to store the splat weights (in order to normalize)
float * dev_accumulate_weights;
cudaMalloc((void **)&dev_accumulate_weights, sizeof(float) * volSize[0] * volSize[1] * volSize[2]);
cudaMemset((void *)dev_accumulate_weights, 0, sizeof(float) * volSize[0] * volSize[1] * volSize[2]);
cudaMalloc((void **)&dev_accumulate_weights, volMemSize);
cudaMemset((void *)dev_accumulate_weights, 0, volMemSize);

// Calling kernels
dim3 dimBlock = dim3(8, 8, 4);
Expand Down
Loading
Loading