Skip to content

Commit

Permalink
FastFFT: Part 14
Browse files Browse the repository at this point in the history
match_template.cpp
    - Add command line flags to modify TM behavior
        - --histogram-sampling=float (percentage of pixels to sample for histogram)
        - --stats-sampling=float (percentage of pixels to sample for stats)
        - --disable-gpu-prj (really just for benchmarking and troubleshooting)

DeviceManager.cu
    - Remove unused argument to DeviceManager::SetGpu to avoid confusion
    - Delete spurious comments

GpuImage.cu
    - Remove unused methods/kernel definitions for Mip related

TemplateMatchingCore.cu
    - Remove declaration of unused FastFFT::my_functor
    - Move use_gpu_prj from constexpr to member variable set on init

template_matching_empirical_distribution
    - Remove un-needed histogram defining vars, and use those in constants.h directly
  • Loading branch information
bHimes committed Oct 8, 2024
1 parent e2d5969 commit eb24070
Show file tree
Hide file tree
Showing 11 changed files with 146 additions and 267 deletions.
2 changes: 1 addition & 1 deletion src/constants/constants.h
Original file line number Diff line number Diff line change
Expand Up @@ -42,7 +42,7 @@ constexpr float histogram_first_bin_midpoint = histogram_min + (histogram_step /
constexpr int number_of_output_images = 8; //mip, psi, theta, phi, pixel, defocus, sums, sqsums (scaled mip is not sent out)
constexpr int number_of_meta_data_values = 10; // img_x, img_y, number cccs, histogram values, pixel_size
constexpr int MAX_ALLOWED_NUMBER_OF_PEAKS = 1000; // An error will be thrown and job aborted if this number of peaks is exceeded in the make template results block
constexpr float MAX_BINNING_FACTOR = histogram_min; // The maximum binning factor allowed for the template matching search image
constexpr float MAX_BINNING_FACTOR = 10.f; // The maximum binning factor allowed for the template matching search image
constexpr float MIN_VALUE_TO_MIP = 1.0f; // The minimum value to be considered for the MIP

/**
Expand Down
41 changes: 1 addition & 40 deletions src/gpu/DeviceManager.cu
Original file line number Diff line number Diff line change
Expand Up @@ -38,45 +38,19 @@ void DeviceManager::Init(int wanted_number_of_gpus) {
float GPU_score;
float GPU_score_max = 0.0f;

// memset(&prop, 0, sizeof(cudaDeviceProp));
// prop.memoryBusWidth = 8192;
// prop.memoryClockRate = 4010000;
// prop.multiProcessorCount = 160;
// prop.totalGlobalMem = 68174610432; // 64 Gb
//
// cudaErr(cudaChooseDevice(&selected_GPU, &prop));
// wxPrintf("Chosen device dumber: %d\n", selected_GPU);

// // sleep for a short random bit between 0.5 and 5 seconds. The goal is so mutli-card units have work sent to the next card.
// wxMilliSleep((global_random_number_generator.GetUniformRandom() + 1.5) * 2000);
for ( int iGPU = 0; iGPU < gpu_check; iGPU++ ) {
cudaDeviceProp prop;
cudaErr(cudaGetDeviceProperties(&prop, iGPU));
// wxPrintf("Device Number: %d\n", iGPU);
// wxPrintf(" Device name: %s\n", prop.name);
// wxPrintf(" Memory Clock Rate (KHz): %d\n", prop.memoryClockRate);
// wxPrintf(" Memory Bus Width (bits): %d\n", prop.memoryBusWidth);
// wxPrintf(" Peak Memory Bandwidth (GB/s): %f\n", 2.0*prop.memoryClockRate*(prop.memoryBusWidth/8)/1.0e6);
// wxPrintf(" Number of multiprocessors: %d\n", prop.multiProcessorCount);
// wxPrintf(" Threads per multiprocessor: %d\n", prop.maxThreadsPerMultiProcessor);
// wxPrintf(" Memory per multiprocessor: %li\n", long(prop.sharedMemPerMultiprocessor));
// wxPrintf(" Memory on device: %li\n", long(prop.totalGlobalMem));
// wxPrintf("\n");

cudaErr(cudaDeviceGetAttribute(&cuda_compute_mode, cudaDevAttrComputeMode, iGPU));

cudaErr(cudaSetDevice(iGPU));

// cudaErr(cudaMemGetInfo(&free_mem,&total_mem));
// wxPrintf("Found %zd free mem out of %zd on iGPU %d\n", free_mem, total_mem, iGPU);
// wxPrintf("cuda_compute_mode, cudaDevAttrComputeMode, cudaComputeModeProhibited = %i %i %i\n", cuda_compute_mode, cudaDevAttrComputeMode, cudaComputeModeProhibited);

if ( cuda_compute_mode != cudaComputeModeProhibited && prop.totalGlobalMem > min_memory_available ) {
if ( cuda_compute_mode != cudaComputeModeDefault ) {
wxPrintf("\n\n\tWarning : Your device id %d is not set to Default compute mode.\n\n", iGPU);
}
cudaErr(cudaMemGetInfo(&free_mem, &total_mem));
// wxPrintf("Found %zd free mem out of %zd on iGPU %d\n", free_mem, total_mem, iGPU);
if ( total_mem < min_memory_total ) {
// Don't use this device. This might not be the best way to do this.
cudaErr(cudaDeviceReset( ));
Expand All @@ -85,14 +59,8 @@ void DeviceManager::Init(int wanted_number_of_gpus) {
else if ( free_mem > max_mem && free_mem > min_memory_available ) {
max_mem = free_mem;
}
// else if (free_mem > max_mem && free_mem > min_memory_available) { max_mem = free_mem; selected_GPU = iGPU; }

GPU_score = (float)free_mem * (float)prop.memoryClockRate * (float)prop.memoryBusWidth * (float)prop.multiProcessorCount * (float)prop.maxThreadsPerMultiProcessor;
// wxPrintf(" Memory Clock Rate (KHz): %d\n", prop.memoryClockRate);
// wxPrintf(" Memory Bus Width (bits): %d\n", prop.memoryBusWidth);
// wxPrintf(" Number of multiprocessors: %d\n", prop.multiProcessorCount);
// wxPrintf(" Threads per multiprocessor: %d\n", prop.maxThreadsPerMultiProcessor);
// wxPrintf("GPU score = %g\n", GPU_score);
if ( GPU_score > GPU_score_max ) {
GPU_score_max = GPU_score;
selected_GPU = iGPU;
Expand All @@ -110,15 +78,8 @@ void DeviceManager::Init(int wanted_number_of_gpus) {
is_manager_initialized = true;
};

void DeviceManager::SetGpu(int cpu_thread_idx) {

// // Select the current device
// this->gpuIDX = -1;
// cudaErr(cudaSetDevice(cpu_thread_idx % this->nGPUs)); // "% num_gpus" allows more CPU threads than GPU devices
void DeviceManager::SetGpu( ) {
cudaErr(cudaSetDevice(this->gpuIDX));
// cudaErr(cudaGetDevice(&gpuIDX));

// wxPrintf("For thread %d of nGpus %d assigned %d\n",cpu_thread_idx, nGPUs, gpuIDX);
};

void DeviceManager::ResetGpu( ) {
Expand Down
2 changes: 1 addition & 1 deletion src/gpu/DeviceManager.h
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@ class DeviceManager {
virtual ~DeviceManager( );

void Init(int wanted_number_of_gpus);
void SetGpu(int cpu_thread_idx);
void SetGpu( );
void ResetGpu( );
void ListDevices( );

Expand Down
96 changes: 0 additions & 96 deletions src/gpu/GpuImage.cu
Original file line number Diff line number Diff line change
Expand Up @@ -16,15 +16,6 @@
// #define USE_BLOCK_REDUCE
#define USE_FP16_FOR_WHITENPS

__global__ void
MipPixelWiseKernel(cufftReal* mip, const cufftReal* correlation_output, const int4 dims);
__global__ void
MipPixelWiseKernel(cufftReal* mip, cufftReal* other_image, cufftReal* psi, cufftReal* phi, cufftReal* theta,
int4 dims, float c_psi, float c_phi, float c_theta);
__global__ void
MipPixelWiseKernel(cufftReal* mip, cufftReal* other_image, cufftReal* psi, cufftReal* phi, cufftReal* theta, cufftReal* defocus, cufftReal* pixel, const int4 dims,
float c_psi, float c_phi, float c_theta, float c_defocus, float c_pixel);

// Inline declarations
__device__ __forceinline__ int
d_ReturnFourierLogicalCoordGivenPhysicalCoord_X(int physical_index,
Expand Down Expand Up @@ -527,7 +518,6 @@ MultiplyPixelWiseComplexConjugateKernel(const StorageType* __restrict__ img_comp
int address = x + (dims.w / 2) * y;
int stride = (dims.w / 2) * dims.y;

constexpr float epsilon = 1e-6f;
// We have one referece image, and this is broadcasted to all images in the stack
if constexpr ( std::is_same<StorageType, __half2>::value ) {
const Complex ref_val = (Complex)__half22float2(ref_complex_values[address]);
Expand Down Expand Up @@ -1273,92 +1263,6 @@ float GpuImage::ReturnSumSquareModulusComplexValues( ) {
return float(tmpValComplex[tmp_val_idx::ReturnSumSquareModulusComplexValues] * tmpValComplex[tmp_val_idx::ReturnSumSquareModulusComplexValues]);
}

void GpuImage::MipPixelWise(GpuImage& other_image) {

MyDebugAssertTrue(HasSameDimensionsAs(&other_image), "Images have different dimension.");
precheck;
ReturnLaunchParameters(dims, true);
MipPixelWiseKernel<<<gridDims, threadsPerBlock, 0, cudaStreamPerThread>>>(real_values, other_image.real_values, this->dims);
postcheck;
}

__global__ void
MipPixelWiseKernel(cufftReal* mip, const cufftReal* correlation_output, const int4 dims) {

int3 coords = make_int3(blockIdx.x * blockDim.x + threadIdx.x,
blockIdx.y * blockDim.y + threadIdx.y,
blockIdx.z);

if ( coords.x < dims.x && coords.y < dims.y && coords.z < dims.z ) {
int address = d_ReturnReal1DAddressFromPhysicalCoord(coords, dims);
mip[address] = MAX(mip[address], correlation_output[address]);
}
}

void GpuImage::MipPixelWise(GpuImage& other_image, GpuImage& psi, GpuImage& phi, GpuImage& theta, GpuImage& defocus, GpuImage& pixel,
float c_psi, float c_phi, float c_theta, float c_defocus, float c_pixel) {

MyDebugAssertTrue(HasSameDimensionsAs(&other_image), "Images have different dimension.");
precheck;
ReturnLaunchParameters(dims, true);
MipPixelWiseKernel<<<gridDims, threadsPerBlock, 0, cudaStreamPerThread>>>(real_values, other_image.real_values,
psi.real_values, phi.real_values, theta.real_values, defocus.real_values, pixel.real_values,
this->dims, c_psi, c_phi, c_theta, c_defocus, c_pixel);
postcheck;
}

__global__ void
MipPixelWiseKernel(cufftReal* mip, cufftReal* correlation_output, cufftReal* psi, cufftReal* phi, cufftReal* theta, cufftReal* defocus, cufftReal* pixel, const int4 dims,
float c_psi, float c_phi, float c_theta, float c_defocus, float c_pixel) {

int3 coords = make_int3(blockIdx.x * blockDim.x + threadIdx.x,
blockIdx.y * blockDim.y + threadIdx.y,
blockIdx.z);

if ( coords.x < dims.x && coords.y < dims.y && coords.z < dims.z ) {
int address = d_ReturnReal1DAddressFromPhysicalCoord(coords, dims);
if ( correlation_output[address] > mip[address] ) {
mip[address] = correlation_output[address];
psi[address] = c_psi;
phi[address] = c_phi;
theta[address] = c_theta;
defocus[address] = c_defocus;
pixel[address] = c_pixel;
}
}
}

void GpuImage::MipPixelWise(GpuImage& other_image, GpuImage& psi, GpuImage& phi, GpuImage& theta,
float c_psi, float c_phi, float c_theta) {

MyDebugAssertTrue(HasSameDimensionsAs(&other_image), "Images have different dimension.");
precheck;
ReturnLaunchParameters(dims, true);
MipPixelWiseKernel<<<gridDims, threadsPerBlock, 0, cudaStreamPerThread>>>(real_values, other_image.real_values,
psi.real_values, phi.real_values, theta.real_values,
this->dims, c_psi, c_phi, c_theta);
postcheck;
}

__global__ void
MipPixelWiseKernel(cufftReal* mip, cufftReal* correlation_output, cufftReal* psi, cufftReal* phi, cufftReal* theta, const int4 dims,
float c_psi, float c_phi, float c_theta) {

int3 coords = make_int3(blockIdx.x * blockDim.x + threadIdx.x,
blockIdx.y * blockDim.y + threadIdx.y,
blockIdx.z);

if ( coords.x < dims.x && coords.y < dims.y && coords.z < dims.z ) {
int address = d_ReturnReal1DAddressFromPhysicalCoord(coords, dims);
if ( correlation_output[address] > mip[address] ) {
mip[address] = correlation_output[address];
psi[address] = c_psi;
phi[address] = c_phi;
theta[address] = c_theta;
}
}
}

template <typename StorageType>
__global__ void
ApplyBFactorKernel(StorageType* d_input,
Expand Down
7 changes: 0 additions & 7 deletions src/gpu/GpuImage.h
Original file line number Diff line number Diff line change
Expand Up @@ -30,8 +30,6 @@ class GpuImage {
GpuImage& operator=(const GpuImage& t);
GpuImage& operator=(const GpuImage* t);

// FIXME: move constructor?

/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
// START MEMBER VARIABLES FROM THE cpu IMAGE CLASS

Expand Down Expand Up @@ -297,11 +295,6 @@ class GpuImage {
void WaitBlocking( );
void RecordAndWait( );
// Maximum intensity projection
void MipPixelWise(GpuImage& other_image);
void MipPixelWise(GpuImage& other_image, GpuImage& psi, GpuImage& phi, GpuImage& theta,
float c_psi, float c_phi, float c_theta);
void MipPixelWise(GpuImage& other_image, GpuImage& psi, GpuImage& phi, GpuImage& theta, GpuImage& defocus, GpuImage& pixel,
float c_psi, float c_phi, float c_theta, float c_defocus, float c_pixel);

// FIXME: These are added for the unblur refinement but are untested.
template <typename StorageTypeBase = float>
Expand Down
42 changes: 15 additions & 27 deletions src/gpu/TemplateMatchingCore.cu
Original file line number Diff line number Diff line change
Expand Up @@ -11,35 +11,26 @@

using namespace cistem_timer;

constexpr bool use_gpu_prj = true;

void TemplateMatchingCore::Init(MyApp* parent_pointer,
std::shared_ptr<GpuImage> wanted_template_reconstruction,
Image& wanted_input_image,
Image& current_projection,
float pixel_size_search_range,
float pixel_size_step,
float pixel_size,
float defocus_search_range,
float defocus_step,
float defocus1,
float defocus2,
float psi_max,
float psi_start,
float psi_step,
AnglesAndShifts& angles,
EulerSearch& global_euler_search,
float histogram_min_scaled,
float histogram_step_scaled,
int histogram_number_of_bins,
const int2 pre_padding,
const int2 roi,
float histogram_sampling,
float stats_sampling,
int first_search_position,
int last_search_position,
ProgressBar* my_progress,
long total_correlation_positions,
bool is_running_locally,
bool use_fast_fft,
bool use_gpu_prj,
int number_of_global_search_images_to_save)

{
Expand All @@ -49,6 +40,10 @@ void TemplateMatchingCore::Init(MyApp* parent_pointer,
MyDebugAssertTrue(wanted_input_image.is_in_memory, "Input image must be in memory");
object_initialized_ = true;

this->use_gpu_prj = use_gpu_prj;
histogram_sampling_ = histogram_sampling;
stats_sampling_ = stats_sampling;

this->first_search_position = first_search_position;
this->last_search_position = last_search_position;
this->angles = angles;
Expand Down Expand Up @@ -97,10 +92,8 @@ void TemplateMatchingCore::Init(MyApp* parent_pointer,
d_statistical_buffers_ptrs[i]->Allocate(d_input_image.dims.x, d_input_image.dims.y, number_of_global_search_images_to_save, true);
}

this->pre_padding = pre_padding;
this->roi = roi;
this->histogram_min_scaled = histogram_min_scaled;
this->histogram_step_scaled = histogram_step_scaled;
this->pre_padding = pre_padding;
this->roi = roi;

this->my_progress = my_progress;
this->total_correlation_positions = total_correlation_positions;
Expand All @@ -115,10 +108,8 @@ void TemplateMatchingCore::Init(MyApp* parent_pointer,
// Transfer the input image_memory_should_not_be_deallocated
};

void TemplateMatchingCore::RunInnerLoop(Image& projection_filter, float c_pixel, float c_defocus, int threadIDX, long& current_correlation_position) {
void TemplateMatchingCore::RunInnerLoop(Image& projection_filter, int threadIDX, long& current_correlation_position) {

this->c_defocus = c_defocus;
this->c_pixel = c_pixel;
total_number_of_cccs_calculated = 0;
total_number_of_histogram_samples = 0;
total_number_of_stats_samples = 0;
Expand Down Expand Up @@ -151,12 +142,10 @@ void TemplateMatchingCore::RunInnerLoop(Image& projection_filter, float c_pixel,
}
#endif

const float wanted_histogram_sampling = 1.0f; // 100% until tested
const float wanted_stats_sampling = 1.0f; // 100% until tested
if ( this_is_the_first_run_on_inner_loop ) {
d_input_image.CopyFP32toFP16buffer(false);
d_padded_reference.CopyFP32toFP16buffer(false);
my_dist = std::make_unique<TM_EmpiricalDistribution<__half, __half2>>(d_input_image, histogram_min_scaled, histogram_step_scaled, pre_padding, roi, wanted_histogram_sampling, wanted_stats_sampling);
my_dist = std::make_unique<TM_EmpiricalDistribution<__half, __half2>>(d_input_image, pre_padding, roi, histogram_sampling_, stats_sampling_);
}
else {
my_dist->ZeroHistogram( );
Expand All @@ -177,15 +166,14 @@ void TemplateMatchingCore::RunInnerLoop(Image& projection_filter, float c_pixel,

cudaErr(cudaEventCreateWithFlags(&mip_is_done_Event, cudaEventBlockingSync));
#ifdef cisTEM_USING_FastFFT
FastFFT::FourierTransformer<float, __half, __half2, 2> FT;
FastFFT::KernelFunction::my_functor<float, 0, FastFFT::KernelFunction::NOOP> noop;
FastFFT::KernelFunction::my_functor<float, 4, FastFFT::KernelFunction::CONJ_MUL> conj_mul;
FastFFT::FourierTransformer<float, __half, __half2, 2> FT;

// float scale_factor = powf((float)d_current_projection[0].number_of_real_space_pixels, -2.0);
// float scale_factor = 1.f;
float scale_factor = sqrtf(1.0f / float(d_input_image.number_of_real_space_pixels));

FastFFT::KernelFunction::my_functor<float, 4, FastFFT::KernelFunction::CONJ_MUL_THEN_SCALE> conj_mul_then_scale(scale_factor);
FastFFT::KernelFunction::my_functor<float, 0, FastFFT::KernelFunction::NOOP> noop;

if ( use_fast_fft ) {
// TODO: overload that takes and short4's int4's instead of the individual values
Expand Down Expand Up @@ -269,8 +257,8 @@ void TemplateMatchingCore::RunInnerLoop(Image& projection_filter, float c_pixel,
else {
// Make sure the previous copy from host -> device has completed before we start to make another projection.
// Event is created as non-blocking so this is a busy-wait.

template_reconstruction.ExtractSlice(current_projection[current_projection_idx], angles, 1.0f, false);
MyDebugAssertFalse(cpu_template == nullptr, "Template reconstruction is not set with SetCpuTemplate");
cpu_template->ExtractSlice(current_projection[current_projection_idx], angles, 1.0f, false);

current_projection[current_projection_idx].SwapRealSpaceQuadrants( );
current_projection[current_projection_idx].MultiplyPixelWise(projection_filter);
Expand Down
Loading

0 comments on commit eb24070

Please sign in to comment.