Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix CUDA and cublas memory management for large arrays #598

Merged
merged 3 commits into from
May 12, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 6 additions & 6 deletions include/rtkCudaConjugateGradientImageFilter.hcu
Original file line number Diff line number Diff line change
#include "RTKExport.h"

// Device-to-device copy of numberOfElements values from in to out.
// Counts are size_t so arrays with more than 2^31 elements are supported.
void RTK_EXPORT
CUDA_copy(size_t numberOfElements, float * in, float * out);

void RTK_EXPORT
CUDA_copy(size_t numberOfElements, double * in, double * out);

// In-place element-wise subtraction on the device: out -= toBeSubtracted.
void RTK_EXPORT
CUDA_subtract(size_t numberOfElements, float * out, float * toBeSubtracted);

void RTK_EXPORT
CUDA_subtract(size_t numberOfElements, double * out, double * toBeSubtracted);

// One conjugate-gradient update step on device arrays: updates Xk, Rk and Pk
// in place from the current A*Pk product (APk).
void RTK_EXPORT
CUDA_conjugate_gradient(size_t numberOfElements, float * Xk, float * Rk, float * Pk, float * APk);

void RTK_EXPORT
CUDA_conjugate_gradient(size_t numberOfElements, double * Xk, double * Rk, double * Pk, double * APk);
#endif
4 changes: 2 additions & 2 deletions include/rtkCudaConjugateGradientImageFilter.hxx
Original file line number Diff line number Diff line change
Expand Up @@ -42,8 +42,8 @@ void
CudaConjugateGradientImageFilter<TImage>::GPUGenerateData()
{
using DataType = typename itk::PixelTraits<typename TImage::PixelType>::ValueType;
long int numberOfElements = this->GetOutput()->GetLargestPossibleRegion().GetNumberOfPixels() *
itk::PixelTraits<typename TImage::PixelType>::Dimension;
size_t numberOfElements = this->GetOutput()->GetLargestPossibleRegion().GetNumberOfPixels() *
itk::PixelTraits<typename TImage::PixelType>::Dimension;

// Create and allocate images
typename TImage::Pointer P_k = TImage::New();
Expand Down
2 changes: 1 addition & 1 deletion include/rtkCudaLagCorrectionImageFilter.hcu
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,7 @@ CUDA_lag_correction(int proj_idx_in[3],
unsigned short * dev_proj_in,
unsigned short * dev_proj_out,
float * h_state,
int state_size,
size_t state_size,
float * coefficients);

#endif // rtkCudaLagCorrectionImageFilter_hcu
2 changes: 1 addition & 1 deletion include/rtkCudaPolynomialGainCorrectionImageFilter.hcu
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,7 @@ CUDA_gain_correction(int proj_idx_in[3],
unsigned short * dev_dark_in,
float * dev_gain_in,
float * h_powerlut,
int lut_size,
size_t lut_size,
float * coefficients);

#endif // rtkCudaPolynomialGainCorrectionImageFilter_hcu
1 change: 0 additions & 1 deletion include/rtkCudaUtilities.hcu
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,6 @@
#include <itkMacro.h>
#undef ITK_STATIC
#include <cuda.h>
#include <cublas_v2.h>

#define CUDA_CHECK_ERROR \
{ \
Expand Down
52 changes: 28 additions & 24 deletions src/rtkCudaConjugateGradientImageFilter.cu
Original file line number Diff line number Diff line change
Expand Up @@ -28,29 +28,33 @@
// TEXTURES AND CONSTANTS //

void
CUDA_copy(size_t numberOfElements, float * in, float * out)
{
  // Copy input volume to output, entirely on the device.
  // size_t keeps the byte count from overflowing for buffers over 2 GiB.
  size_t memorySizeOutput = numberOfElements * sizeof(float);
  cudaMemcpy(out, in, memorySizeOutput, cudaMemcpyDeviceToDevice);
}

void
CUDA_copy(size_t numberOfElements, double * in, double * out)
{
  // Copy input volume to output, entirely on the device.
  // size_t keeps the byte count from overflowing for buffers over 2 GiB.
  size_t memorySizeOutput = numberOfElements * sizeof(double);
  cudaMemcpy(out, in, memorySizeOutput, cudaMemcpyDeviceToDevice);
}

void
CUDA_subtract(size_t numberOfElements, float * out, float * toBeSubtracted)
{
  cublasHandle_t handle;
  cublasCreate(&handle);

  const float alpha = -1.0;
#if CUDA_VERSION < 12000
  // Legacy 32-bit cuBLAS interface: counts above INT_MAX cannot be expressed.
  cublasSaxpy(handle, (int)numberOfElements, &alpha, toBeSubtracted, 1, out, 1);
#else
  // CUDA 12 64-bit cuBLAS interface handles arrays larger than 2^31 elements.
  cublasSaxpy_64(handle, numberOfElements, &alpha, toBeSubtracted, 1, out, 1);
#endif

  // Destroy Cublas context
  cublasDestroy(handle);

  CUDA_CHECK_ERROR;
}

void
CUDA_subtract(size_t numberOfElements, double * out, double * toBeSubtracted)
{
  cublasHandle_t handle;
  cublasCreate(&handle);

  const double alpha = -1.0;
#if CUDA_VERSION < 12000
  // Legacy 32-bit cuBLAS interface: counts above INT_MAX cannot be expressed.
  cublasDaxpy(handle, (int)numberOfElements, &alpha, toBeSubtracted, 1, out, 1);
#else
  // Match the float overload: use the CUDA 12 64-bit cuBLAS API so element
  // counts above INT_MAX are not silently truncated by the (int) cast.
  cublasDaxpy_64(handle, numberOfElements, &alpha, toBeSubtracted, 1, out, 1);
#endif

  // Destroy Cublas context
  cublasDestroy(handle);

  CUDA_CHECK_ERROR;
}

void
CUDA_conjugate_gradient(size_t numberOfElements, float * Xk, float * Rk, float * Pk, float * APk)
{
  cublasHandle_t handle;
  cublasCreate(&handle);

  // Small constant guarding the divisions below against a zero denominator.
  // NOTE(review): its exact value is hidden in a collapsed diff hunk — confirm
  // against the original source file.
  const float eps = 1e-30;

#if CUDA_VERSION < 12000
  // Legacy 32-bit cuBLAS interface: the (int) cast truncates counts > INT_MAX.
  const int n = (int)numberOfElements;
#  define RTK_SDOT cublasSdot
#  define RTK_SAXPY cublasSaxpy
#  define RTK_SSCAL cublasSscal
#else
  // CUDA 12 64-bit cuBLAS interface supports arrays larger than 2^31 elements,
  // consistent with the float CUDA_subtract overload above.
  const size_t n = numberOfElements;
#  define RTK_SDOT cublasSdot_64
#  define RTK_SAXPY cublasSaxpy_64
#  define RTK_SSCAL cublasSscal_64
#endif

  // Compute Rk_square = sum(Rk(:).^2) by cublas
  float Rk_square = 0;
  RTK_SDOT(handle, n, Rk, 1, Rk, 1, &Rk_square);

  // Compute alpha_k = Rk_square / sum(Pk(:) .* APk(:))
  float Pk_APk = 0;
  RTK_SDOT(handle, n, Pk, 1, APk, 1, &Pk_APk);

  const float alpha_k = Rk_square / (Pk_APk + eps);
  const float minus_alpha_k = -alpha_k;

  // Compute Xk+1 = Xk + alpha_k * Pk
  RTK_SAXPY(handle, n, &alpha_k, Pk, 1, Xk, 1);

  // Compute Rk+1 = Rk - alpha_k * APk
  RTK_SAXPY(handle, n, &minus_alpha_k, APk, 1, Rk, 1);

  // Compute beta_k = sum(Rk+1(:).^2) / Rk_square
  float Rkplusone_square = 0;
  RTK_SDOT(handle, n, Rk, 1, Rk, 1, &Rkplusone_square);

  float beta_k = Rkplusone_square / (Rk_square + eps);
  float one = 1.0;

  // Compute Pk+1 = Rk+1 + beta_k * Pk
  // This requires two cublas functions,
  // since axpy would store the result in the wrong array
  RTK_SSCAL(handle, n, &beta_k, Pk, 1);
  RTK_SAXPY(handle, n, &one, Rk, 1, Pk, 1);

#undef RTK_SDOT
#undef RTK_SAXPY
#undef RTK_SSCAL

  // Destroy Cublas context
  cublasDestroy(handle);

  CUDA_CHECK_ERROR;
}

void
CUDA_conjugate_gradient(size_t numberOfElements, double * Xk, double * Rk, double * Pk, double * APk)
{
  cublasHandle_t handle;
  cublasCreate(&handle);

  // Small constant guarding the divisions below against a zero denominator.
  // NOTE(review): its exact value is hidden in a collapsed diff hunk — confirm
  // against the original source file.
  const double eps = 1e-30;

#if CUDA_VERSION < 12000
  // Legacy 32-bit cuBLAS interface: the (int) cast truncates counts > INT_MAX.
  const int n = (int)numberOfElements;
#  define RTK_DDOT cublasDdot
#  define RTK_DAXPY cublasDaxpy
#  define RTK_DSCAL cublasDscal
#else
  // CUDA 12 64-bit cuBLAS interface supports arrays larger than 2^31 elements,
  // consistent with the CUDA_subtract overloads above.
  const size_t n = numberOfElements;
#  define RTK_DDOT cublasDdot_64
#  define RTK_DAXPY cublasDaxpy_64
#  define RTK_DSCAL cublasDscal_64
#endif

  // Compute Rk_square = sum(Rk(:).^2) by cublas
  double Rk_square = 0;
  RTK_DDOT(handle, n, Rk, 1, Rk, 1, &Rk_square);

  // Compute alpha_k = Rk_square / sum(Pk(:) .* APk(:))
  double Pk_APk = 0;
  RTK_DDOT(handle, n, Pk, 1, APk, 1, &Pk_APk);

  const double alpha_k = Rk_square / (Pk_APk + eps);
  const double minus_alpha_k = -alpha_k;

  // Compute Xk+1 = Xk + alpha_k * Pk
  RTK_DAXPY(handle, n, &alpha_k, Pk, 1, Xk, 1);

  // Compute Rk+1 = Rk - alpha_k * APk
  RTK_DAXPY(handle, n, &minus_alpha_k, APk, 1, Rk, 1);

  // Compute beta_k = sum(Rk+1(:).^2) / Rk_square
  double Rkplusone_square = 0;
  RTK_DDOT(handle, n, Rk, 1, Rk, 1, &Rkplusone_square);

  double beta_k = Rkplusone_square / (Rk_square + eps);
  double one = 1.0;

  // Compute Pk+1 = Rk+1 + beta_k * Pk
  // This requires two cublas functions,
  // since axpy would store the result in the wrong array
  RTK_DSCAL(handle, n, &beta_k, Pk, 1);
  RTK_DAXPY(handle, n, &one, Rk, 1, Pk, 1);

#undef RTK_DDOT
#undef RTK_DAXPY
#undef RTK_DSCAL

  // Destroy Cublas context
  cublasDestroy(handle);

  CUDA_CHECK_ERROR;
}
Expand Down
2 changes: 1 addition & 1 deletion src/rtkCudaConstantVolumeSeriesSource.cu
Original file line number Diff line number Diff line change
Expand Up @@ -74,7 +74,7 @@ CUDA_generate_constant_volume_series(int size[4], float * dev_out, float constan
// run a kernel to replace the zeros with constantValue.

// Reset output volume
long int memorySizeOutput = size[0] * size[1] * size[2] * size[3] * sizeof(float);
size_t memorySizeOutput = size[0] * size[1] * size[2] * size[3] * sizeof(float);
cudaMemset((void *)dev_out, 0, memorySizeOutput);

if (!(constantValue == 0))
Expand Down
2 changes: 1 addition & 1 deletion src/rtkCudaConstantVolumeSource.cu
Original file line number Diff line number Diff line change
Expand Up @@ -74,7 +74,7 @@ CUDA_generate_constant_volume(int size[3], float * dev_out, float constantValue)
// run a kernel to replace the zeros with constantValue.

// Reset output volume
long int memorySizeOutput = size[0] * size[1] * size[2] * sizeof(float);
size_t memorySizeOutput = size[0] * size[1] * size[2] * sizeof(float);
cudaMemset((void *)dev_out, 0, memorySizeOutput);

if (!(constantValue == 0))
Expand Down
14 changes: 11 additions & 3 deletions src/rtkCudaCyclicDeformationImageFilter.cu
Original file line number Diff line number Diff line change
Expand Up @@ -51,17 +51,25 @@ CUDA_linear_interpolate_along_fourth_dimension(unsigned int inputSize[4],
float wInf = (float)weightInf;
float wSup = (float)weightSup;

int numel = inputSize[0] * inputSize[1] * inputSize[2] * 3;
size_t numel = inputSize[0] * inputSize[1] * inputSize[2] * 3;

cudaMemset((void *)output, 0, numel * sizeof(float));

// Add it weightInf times the frameInf to the output
float * pinf = input + frameInf * numel;
cublasSaxpy(handle, numel, &wInf, pinf, 1, output, 1);
#if CUDA_VERSION < 12000
cublasSaxpy(handle, (int)numel, &wInf, pinf, 1, output, 1);
#else
cublasSaxpy_64(handle, numel, &wInf, pinf, 1, output, 1);
#endif

// Add it weightSup times the frameSup to the output
float * psup = input + frameSup * numel;
cublasSaxpy(handle, numel, &wSup, psup, 1, output, 1);
#if CUDA_VERSION < 12000
cublasSaxpy(handle, (int)numel, &wSup, psup, 1, output, 1);
#else
cublasSaxpy_64(handle, numel, &wSup, psup, 1, output, 1);
#endif

// Destroy Cublas context
cublasDestroy(handle);
Expand Down
2 changes: 1 addition & 1 deletion src/rtkCudaFFTProjectionsConvolutionImageFilter.cu
Original file line number Diff line number Diff line change
Expand Up @@ -78,7 +78,7 @@ CUDA_fft_convolution(const int3 & inputDimension,
int3 fftDimension = inputDimension;
fftDimension.x = inputDimension.x / 2 + 1;

int memorySizeProjectionFFT = fftDimension.x * fftDimension.y * fftDimension.z * sizeof(cufftComplex);
size_t memorySizeProjectionFFT = sizeof(cufftComplex) * fftDimension.x * fftDimension.y * fftDimension.z;
cudaMalloc((void **)&deviceProjectionFFT, memorySizeProjectionFFT);
CUDA_CHECK_ERROR;

Expand Down
9 changes: 4 additions & 5 deletions src/rtkCudaForwardWarpImageFilter.cu
Original file line number Diff line number Diff line change
Expand Up @@ -386,17 +386,16 @@ CUDA_ForwardWarp(int input_vol_dim[3],

///////////////////////////////////////
/// Initialize the output
cudaMemset((void *)dev_output_vol, 0, sizeof(float) * output_vol_dim[0] * output_vol_dim[1] * output_vol_dim[2]);
size_t memorySizeOutput = sizeof(float) * output_vol_dim[0] * output_vol_dim[1] * output_vol_dim[2];
cudaMemset((void *)dev_output_vol, 0, memorySizeOutput);

//////////////////////////////////////
/// Create an image to store the splat weights
/// (in order to normalize)

float * dev_accumulate_weights;
cudaMalloc((void **)&dev_accumulate_weights,
sizeof(float) * output_vol_dim[0] * output_vol_dim[1] * output_vol_dim[2]);
cudaMemset(
(void *)dev_accumulate_weights, 0, sizeof(float) * output_vol_dim[0] * output_vol_dim[1] * output_vol_dim[2]);
cudaMalloc((void **)&dev_accumulate_weights, memorySizeOutput);
cudaMemset((void *)dev_accumulate_weights, 0, memorySizeOutput);

// Thread Block Dimensions
constexpr int tBlock_x = 16;
Expand Down
10 changes: 7 additions & 3 deletions src/rtkCudaInterpolateImageFilter.cu
Original file line number Diff line number Diff line change
Expand Up @@ -34,8 +34,8 @@ CUDA_interpolation(const int4 & inputSize, float * input, float * output, int pr
cublasCreate(&handle);

// CUDA device pointers
int nVoxelsOutput = inputSize.x * inputSize.y * inputSize.z;
int memorySizeOutput = nVoxelsOutput * sizeof(float);
size_t nVoxelsOutput = inputSize.x * inputSize.y * inputSize.z;
size_t memorySizeOutput = nVoxelsOutput * sizeof(float);

// Reset output volume
cudaMemset((void *)output, 0, memorySizeOutput);
Expand All @@ -49,7 +49,11 @@ CUDA_interpolation(const int4 & inputSize, float * input, float * output, int pr
float * p = input + phase * nVoxelsOutput;

// Add "weight" times the "phase"-th volume in the input to the output
cublasSaxpy(handle, nVoxelsOutput, &weight, p, 1, output, 1);
#if CUDA_VERSION < 12000
cublasSaxpy(handle, (int)nVoxelsOutput, &weight, p, 1, output, 1);
#else
cublasSaxpy_64(handle, nVoxelsOutput, &weight, p, 1, output, 1);
#endif
}
}

Expand Down
2 changes: 1 addition & 1 deletion src/rtkCudaLagCorrectionImageFilter.cu
Original file line number Diff line number Diff line change
Expand Up @@ -97,7 +97,7 @@ CUDA_lag_correction(int proj_idx_in[3], // overlapping input r
unsigned short * dev_proj_in,
unsigned short * dev_proj_out,
float * h_state,
int state_size,
size_t state_size,
float * coefficients)
{
// Thread Block Dimensions
Expand Down
2 changes: 1 addition & 1 deletion src/rtkCudaLagCorrectionImageFilter.cxx
Original file line number Diff line number Diff line change
Expand Up @@ -71,7 +71,7 @@ CudaLagCorrectionImageFilter ::GPUGenerateData()

float coefficients[9] = { m_B[0], m_B[1], m_B[2], m_B[3], m_ExpmA[0], m_ExpmA[1], m_ExpmA[2], m_ExpmA[3], m_SumB };

int S_size = sizeof(float) * m_S.size();
size_t S_size = sizeof(float) * m_S.size();
CUDA_lag_correction(proj_idx_in,
proj_size_in,
proj_size_in_buf,
Expand Down
2 changes: 1 addition & 1 deletion src/rtkCudaLaplacianImageFilter.cu
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,7 @@ CUDA_laplacian(int size[3], float spacing[3], float * dev_in, float * dev_out)
float3 dev_Spacing = make_float3(spacing[0], spacing[1], spacing[2]);

// Reset output volume
long int memorySizeOutput = size[0] * size[1] * size[2] * sizeof(float);
size_t memorySizeOutput = sizeof(float) * size[0] * size[1] * size[2];
cudaMemset((void *)dev_out, 0, memorySizeOutput);

// Initialize volumes to store the gradient components
Expand Down
2 changes: 1 addition & 1 deletion src/rtkCudaPolynomialGainCorrectionImageFilter.cu
Original file line number Diff line number Diff line change
Expand Up @@ -94,7 +94,7 @@ CUDA_gain_correction(int proj_idx_in[3], // overlapping input
unsigned short * dev_dark_in,
float * dev_gain_in,
float * h_powerlut,
int lut_size,
size_t lut_size,
float * coefficients)
{
// Thread Block Dimensions
Expand Down
2 changes: 1 addition & 1 deletion src/rtkCudaPolynomialGainCorrectionImageFilter.cxx
Original file line number Diff line number Diff line change
Expand Up @@ -74,7 +74,7 @@ CudaPolynomialGainCorrectionImageFilter ::GPUGenerateData()

float coefficients[2] = { static_cast<float>(m_VModelOrder), m_K };

int LUT_size = sizeof(float) * m_PowerLut.size();
size_t LUT_size = sizeof(float) * m_PowerLut.size();
CUDA_gain_correction(proj_idx_in,
proj_size_in,
proj_size_in_buf,
Expand Down
9 changes: 5 additions & 4 deletions src/rtkCudaRayCastBackProjectionImageFilter.cu
Original file line number Diff line number Diff line change
Expand Up @@ -285,13 +285,14 @@ CUDA_ray_cast_back_project(int projSize[3],
// normalization by the splat weights would affect not only the backprojection
// of the current projection, but also the initial value of the output
float * dev_accumulate_values;
cudaMalloc((void **)&dev_accumulate_values, sizeof(float) * volSize[0] * volSize[1] * volSize[2]);
cudaMemset((void *)dev_accumulate_values, 0, sizeof(float) * volSize[0] * volSize[1] * volSize[2]);
size_t volMemSize = sizeof(float) * volSize[0] * volSize[1] * volSize[2];
cudaMalloc((void **)&dev_accumulate_values, volMemSize);
cudaMemset((void *)dev_accumulate_values, 0, volMemSize);

// Create an image to store the splat weights (in order to normalize)
float * dev_accumulate_weights;
cudaMalloc((void **)&dev_accumulate_weights, sizeof(float) * volSize[0] * volSize[1] * volSize[2]);
cudaMemset((void *)dev_accumulate_weights, 0, sizeof(float) * volSize[0] * volSize[1] * volSize[2]);
cudaMalloc((void **)&dev_accumulate_weights, volMemSize);
cudaMemset((void *)dev_accumulate_weights, 0, volMemSize);

// Calling kernels
dim3 dimBlock = dim3(8, 8, 4);
Expand Down
Loading
Loading