FastFFT: Part 14

match_template.cpp - Add command line flags to modify TM behavior - --histogram-sampling=float (percentage of pixels to sample for histogram) - --stats-sampling=float (percentage of pixels to sample for stats) - --disable-gpu-prj (really just for benchmarking and troubleshooting) DeviceManager.cu - Remove unused argument to DeviceManager::SetGpu to avoid confusion - Delete spurious comments GpuImage.cu - Remove unused methods/kernel definitions for Mip related TemplateMatchingCore.cu - Remove declaration of unused FastFFT::my_functor - Move use_gpu_prj from constexpr to member variable set on init template_matching_empirical_distribution - Remove un-needed histogram defining vars, and use those in constants.h directly
StochasticAnalytics · Oct 8, 2024 · eb24070 · eb24070
1 parent e2d5969
commit eb24070
Show file tree

Hide file tree

Showing 11 changed files with 146 additions and 267 deletions.
diff --git a/src/constants/constants.h b/src/constants/constants.h
@@ -42,7 +42,7 @@ constexpr float histogram_first_bin_midpoint = histogram_min + (histogram_step /
 constexpr int   number_of_output_images     = 8; //mip, psi, theta, phi, pixel, defocus, sums, sqsums (scaled mip is not sent out)
 constexpr int   number_of_meta_data_values  = 10; // img_x, img_y, number cccs, histogram values, pixel_size
 constexpr int   MAX_ALLOWED_NUMBER_OF_PEAKS = 1000; // An error will be thrown and job aborted if this number of peaks is exceeded in the make template results block
-constexpr float MAX_BINNING_FACTOR          = histogram_min; // The maximum binning factor allowed for the template matching search image
+constexpr float MAX_BINNING_FACTOR          = 10.f; // The maximum binning factor allowed for the template matching search image
 constexpr float MIN_VALUE_TO_MIP            = 1.0f; // The minimum value to be considered for the MIP
 
 /**

diff --git a/src/gpu/DeviceManager.cu b/src/gpu/DeviceManager.cu
@@ -38,45 +38,19 @@ void DeviceManager::Init(int wanted_number_of_gpus) {
     float  GPU_score;
     float  GPU_score_max = 0.0f;
 
-    //  memset(&prop, 0, sizeof(cudaDeviceProp));
-    //  prop.memoryBusWidth = 8192;
-    //  prop.memoryClockRate = 4010000;
-    //  prop.multiProcessorCount = 160;
-    //  prop.totalGlobalMem = 68174610432; // 64 Gb
-    //
-    //  cudaErr(cudaChooseDevice(&selected_GPU, &prop));
-    //  wxPrintf("Chosen device dumber: %d\n", selected_GPU);
-
-    //  // sleep for a short random bit between 0.5 and 5 seconds. The goal is so mutli-card units have work sent to the next card.
-    //  wxMilliSleep((global_random_number_generator.GetUniformRandom() + 1.5) * 2000);
     for ( int iGPU = 0; iGPU < gpu_check; iGPU++ ) {
         cudaDeviceProp prop;
         cudaErr(cudaGetDeviceProperties(&prop, iGPU));
-        //	  wxPrintf("Device Number: %d\n", iGPU);
-        //	  wxPrintf("  Device name: %s\n", prop.name);
-        //	  wxPrintf("  Memory Clock Rate (KHz): %d\n", prop.memoryClockRate);
-        //	  wxPrintf("  Memory Bus Width (bits): %d\n", prop.memoryBusWidth);
-        //	  wxPrintf("  Peak Memory Bandwidth (GB/s): %f\n", 2.0*prop.memoryClockRate*(prop.memoryBusWidth/8)/1.0e6);
-        //	  wxPrintf("  Number of multiprocessors: %d\n", prop.multiProcessorCount);
-        //	  wxPrintf("  Threads per multiprocessor: %d\n", prop.maxThreadsPerMultiProcessor);
-        //	  wxPrintf("  Memory per multiprocessor: %li\n", long(prop.sharedMemPerMultiprocessor));
-        //	  wxPrintf("  Memory on device: %li\n", long(prop.totalGlobalMem));
-        //	  wxPrintf("\n");
 
         cudaErr(cudaDeviceGetAttribute(&cuda_compute_mode, cudaDevAttrComputeMode, iGPU));
 
         cudaErr(cudaSetDevice(iGPU));
 
-        //	  cudaErr(cudaMemGetInfo(&free_mem,&total_mem));
-        //	  wxPrintf("Found %zd free mem out of %zd on iGPU %d\n", free_mem, total_mem, iGPU);
-        //	  wxPrintf("cuda_compute_mode, cudaDevAttrComputeMode, cudaComputeModeProhibited = %i %i %i\n", cuda_compute_mode, cudaDevAttrComputeMode, cudaComputeModeProhibited);
-
         if ( cuda_compute_mode != cudaComputeModeProhibited && prop.totalGlobalMem > min_memory_available ) {
             if ( cuda_compute_mode != cudaComputeModeDefault ) {
                 wxPrintf("\n\n\tWarning : Your device id %d is not set to Default compute mode.\n\n", iGPU);
             }
             cudaErr(cudaMemGetInfo(&free_mem, &total_mem));
-            //		  wxPrintf("Found %zd free mem out of %zd on iGPU %d\n", free_mem, total_mem, iGPU);
             if ( total_mem < min_memory_total ) {
                 // Don't use this device. This might not be the best way to do this.
                 cudaErr(cudaDeviceReset( ));
@@ -85,14 +59,8 @@ void DeviceManager::Init(int wanted_number_of_gpus) {
             else if ( free_mem > max_mem && free_mem > min_memory_available ) {
                 max_mem = free_mem;
             }
-            //		  else if (free_mem > max_mem && free_mem > min_memory_available) { max_mem = free_mem; selected_GPU = iGPU; }
 
             GPU_score = (float)free_mem * (float)prop.memoryClockRate * (float)prop.memoryBusWidth * (float)prop.multiProcessorCount * (float)prop.maxThreadsPerMultiProcessor;
-            //		  wxPrintf("  Memory Clock Rate (KHz): %d\n", prop.memoryClockRate);
-            //		  wxPrintf("  Memory Bus Width (bits): %d\n", prop.memoryBusWidth);
-            //		  wxPrintf("  Number of multiprocessors: %d\n", prop.multiProcessorCount);
-            //		  wxPrintf("  Threads per multiprocessor: %d\n", prop.maxThreadsPerMultiProcessor);
-            //		  wxPrintf("GPU score = %g\n", GPU_score);
             if ( GPU_score > GPU_score_max ) {
                 GPU_score_max = GPU_score;
                 selected_GPU  = iGPU;
@@ -110,15 +78,8 @@ void DeviceManager::Init(int wanted_number_of_gpus) {
     is_manager_initialized = true;
 };
 
-void DeviceManager::SetGpu(int cpu_thread_idx) {
-
-    //	// Select the current device
-    //	this->gpuIDX = -1;
-    //	cudaErr(cudaSetDevice(cpu_thread_idx % this->nGPUs));   // "% num_gpus" allows more CPU threads than GPU devices
+void DeviceManager::SetGpu( ) {
     cudaErr(cudaSetDevice(this->gpuIDX));
-    //	cudaErr(cudaGetDevice(&gpuIDX));
-
-    //	wxPrintf("For thread %d of nGpus %d assigned %d\n",cpu_thread_idx, nGPUs, gpuIDX);
 };
 
 void DeviceManager::ResetGpu( ) {

diff --git a/src/gpu/DeviceManager.h b/src/gpu/DeviceManager.h
@@ -13,7 +13,7 @@ class DeviceManager {
     virtual ~DeviceManager( );
 
     void Init(int wanted_number_of_gpus);
-    void SetGpu(int cpu_thread_idx);
+    void SetGpu( );
     void ResetGpu( );
     void ListDevices( );
 

diff --git a/src/gpu/GpuImage.cu b/src/gpu/GpuImage.cu
@@ -16,15 +16,6 @@
 // #define USE_BLOCK_REDUCE
 #define USE_FP16_FOR_WHITENPS
 
-__global__ void
-MipPixelWiseKernel(cufftReal* mip, const cufftReal* correlation_output, const int4 dims);
-__global__ void
-MipPixelWiseKernel(cufftReal* mip, cufftReal* other_image, cufftReal* psi, cufftReal* phi, cufftReal* theta,
-                   int4 dims, float c_psi, float c_phi, float c_theta);
-__global__ void
-MipPixelWiseKernel(cufftReal* mip, cufftReal* other_image, cufftReal* psi, cufftReal* phi, cufftReal* theta, cufftReal* defocus, cufftReal* pixel, const int4 dims,
-                   float c_psi, float c_phi, float c_theta, float c_defocus, float c_pixel);
-
 // Inline declarations
 __device__ __forceinline__ int
 d_ReturnFourierLogicalCoordGivenPhysicalCoord_X(int physical_index,
@@ -527,7 +518,6 @@ MultiplyPixelWiseComplexConjugateKernel(const StorageType* __restrict__ img_comp
     int address = x + (dims.w / 2) * y;
     int stride  = (dims.w / 2) * dims.y;
 
-    constexpr float epsilon = 1e-6f;
     // We have one referece image, and this is broadcasted to all images in the stack
     if constexpr ( std::is_same<StorageType, __half2>::value ) {
         const Complex ref_val = (Complex)__half22float2(ref_complex_values[address]);
@@ -1273,92 +1263,6 @@ float GpuImage::ReturnSumSquareModulusComplexValues( ) {
     return float(tmpValComplex[tmp_val_idx::ReturnSumSquareModulusComplexValues] * tmpValComplex[tmp_val_idx::ReturnSumSquareModulusComplexValues]);
 }
 
-void GpuImage::MipPixelWise(GpuImage& other_image) {
-
-    MyDebugAssertTrue(HasSameDimensionsAs(&other_image), "Images have different dimension.");
-    precheck;
-    ReturnLaunchParameters(dims, true);
-    MipPixelWiseKernel<<<gridDims, threadsPerBlock, 0, cudaStreamPerThread>>>(real_values, other_image.real_values, this->dims);
-    postcheck;
-}
-
-__global__ void
-MipPixelWiseKernel(cufftReal* mip, const cufftReal* correlation_output, const int4 dims) {
-
-    int3 coords = make_int3(blockIdx.x * blockDim.x + threadIdx.x,
-                            blockIdx.y * blockDim.y + threadIdx.y,
-                            blockIdx.z);
-
-    if ( coords.x < dims.x && coords.y < dims.y && coords.z < dims.z ) {
-        int address  = d_ReturnReal1DAddressFromPhysicalCoord(coords, dims);
-        mip[address] = MAX(mip[address], correlation_output[address]);
-    }
-}
-
-void GpuImage::MipPixelWise(GpuImage& other_image, GpuImage& psi, GpuImage& phi, GpuImage& theta, GpuImage& defocus, GpuImage& pixel,
-                            float c_psi, float c_phi, float c_theta, float c_defocus, float c_pixel) {
-
-    MyDebugAssertTrue(HasSameDimensionsAs(&other_image), "Images have different dimension.");
-    precheck;
-    ReturnLaunchParameters(dims, true);
-    MipPixelWiseKernel<<<gridDims, threadsPerBlock, 0, cudaStreamPerThread>>>(real_values, other_image.real_values,
-                                                                              psi.real_values, phi.real_values, theta.real_values, defocus.real_values, pixel.real_values,
-                                                                              this->dims, c_psi, c_phi, c_theta, c_defocus, c_pixel);
-    postcheck;
-}
-
-__global__ void
-MipPixelWiseKernel(cufftReal* mip, cufftReal* correlation_output, cufftReal* psi, cufftReal* phi, cufftReal* theta, cufftReal* defocus, cufftReal* pixel, const int4 dims,
-                   float c_psi, float c_phi, float c_theta, float c_defocus, float c_pixel) {
-
-    int3 coords = make_int3(blockIdx.x * blockDim.x + threadIdx.x,
-                            blockIdx.y * blockDim.y + threadIdx.y,
-                            blockIdx.z);
-
-    if ( coords.x < dims.x && coords.y < dims.y && coords.z < dims.z ) {
-        int address = d_ReturnReal1DAddressFromPhysicalCoord(coords, dims);
-        if ( correlation_output[address] > mip[address] ) {
-            mip[address]     = correlation_output[address];
-            psi[address]     = c_psi;
-            phi[address]     = c_phi;
-            theta[address]   = c_theta;
-            defocus[address] = c_defocus;
-            pixel[address]   = c_pixel;
-        }
-    }
-}
-
-void GpuImage::MipPixelWise(GpuImage& other_image, GpuImage& psi, GpuImage& phi, GpuImage& theta,
-                            float c_psi, float c_phi, float c_theta) {
-
-    MyDebugAssertTrue(HasSameDimensionsAs(&other_image), "Images have different dimension.");
-    precheck;
-    ReturnLaunchParameters(dims, true);
-    MipPixelWiseKernel<<<gridDims, threadsPerBlock, 0, cudaStreamPerThread>>>(real_values, other_image.real_values,
-                                                                              psi.real_values, phi.real_values, theta.real_values,
-                                                                              this->dims, c_psi, c_phi, c_theta);
-    postcheck;
-}
-
-__global__ void
-MipPixelWiseKernel(cufftReal* mip, cufftReal* correlation_output, cufftReal* psi, cufftReal* phi, cufftReal* theta, const int4 dims,
-                   float c_psi, float c_phi, float c_theta) {
-
-    int3 coords = make_int3(blockIdx.x * blockDim.x + threadIdx.x,
-                            blockIdx.y * blockDim.y + threadIdx.y,
-                            blockIdx.z);
-
-    if ( coords.x < dims.x && coords.y < dims.y && coords.z < dims.z ) {
-        int address = d_ReturnReal1DAddressFromPhysicalCoord(coords, dims);
-        if ( correlation_output[address] > mip[address] ) {
-            mip[address]   = correlation_output[address];
-            psi[address]   = c_psi;
-            phi[address]   = c_phi;
-            theta[address] = c_theta;
-        }
-    }
-}
-
 template <typename StorageType>
 __global__ void
 ApplyBFactorKernel(StorageType* d_input,

diff --git a/src/gpu/GpuImage.h b/src/gpu/GpuImage.h
@@ -30,8 +30,6 @@ class GpuImage {
     GpuImage& operator=(const GpuImage& t);
     GpuImage& operator=(const GpuImage* t);
 
-    // FIXME: move constructor?
-
     /////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
     // START MEMBER VARIABLES FROM THE cpu IMAGE CLASS
 
@@ -297,11 +295,6 @@ class GpuImage {
     void WaitBlocking( );
     void RecordAndWait( );
     // Maximum intensity projection
-    void MipPixelWise(GpuImage& other_image);
-    void MipPixelWise(GpuImage& other_image, GpuImage& psi, GpuImage& phi, GpuImage& theta,
-                      float c_psi, float c_phi, float c_theta);
-    void MipPixelWise(GpuImage& other_image, GpuImage& psi, GpuImage& phi, GpuImage& theta, GpuImage& defocus, GpuImage& pixel,
-                      float c_psi, float c_phi, float c_theta, float c_defocus, float c_pixel);
 
     // FIXME: These are added for the unblur refinement but are untested.
     template <typename StorageTypeBase = float>

diff --git a/src/gpu/TemplateMatchingCore.cu b/src/gpu/TemplateMatchingCore.cu
@@ -11,35 +11,26 @@
 
 using namespace cistem_timer;
 
-constexpr bool use_gpu_prj = true;
-
 void TemplateMatchingCore::Init(MyApp*                    parent_pointer,
                                 std::shared_ptr<GpuImage> wanted_template_reconstruction,
                                 Image&                    wanted_input_image,
                                 Image&                    current_projection,
-                                float                     pixel_size_search_range,
-                                float                     pixel_size_step,
-                                float                     pixel_size,
-                                float                     defocus_search_range,
-                                float                     defocus_step,
-                                float                     defocus1,
-                                float                     defocus2,
                                 float                     psi_max,
                                 float                     psi_start,
                                 float                     psi_step,
                                 AnglesAndShifts&          angles,
                                 EulerSearch&              global_euler_search,
-                                float                     histogram_min_scaled,
-                                float                     histogram_step_scaled,
-                                int                       histogram_number_of_bins,
                                 const int2                pre_padding,
                                 const int2                roi,
+                                float                     histogram_sampling,
+                                float                     stats_sampling,
                                 int                       first_search_position,
                                 int                       last_search_position,
                                 ProgressBar*              my_progress,
                                 long                      total_correlation_positions,
                                 bool                      is_running_locally,
                                 bool                      use_fast_fft,
+                                bool                      use_gpu_prj,
                                 int                       number_of_global_search_images_to_save)
 
 {
@@ -49,6 +40,10 @@ void TemplateMatchingCore::Init(MyApp*                    parent_pointer,
     MyDebugAssertTrue(wanted_input_image.is_in_memory, "Input image must be in memory");
     object_initialized_ = true;
 
+    this->use_gpu_prj   = use_gpu_prj;
+    histogram_sampling_ = histogram_sampling;
+    stats_sampling_     = stats_sampling;
+
     this->first_search_position          = first_search_position;
     this->last_search_position           = last_search_position;
     this->angles                         = angles;
@@ -97,10 +92,8 @@ void TemplateMatchingCore::Init(MyApp*                    parent_pointer,
         d_statistical_buffers_ptrs[i]->Allocate(d_input_image.dims.x, d_input_image.dims.y, number_of_global_search_images_to_save, true);
     }
 
-    this->pre_padding           = pre_padding;
-    this->roi                   = roi;
-    this->histogram_min_scaled  = histogram_min_scaled;
-    this->histogram_step_scaled = histogram_step_scaled;
+    this->pre_padding = pre_padding;
+    this->roi         = roi;
 
     this->my_progress                 = my_progress;
     this->total_correlation_positions = total_correlation_positions;
@@ -115,10 +108,8 @@ void TemplateMatchingCore::Init(MyApp*                    parent_pointer,
     // Transfer the input image_memory_should_not_be_deallocated
 };
 
-void TemplateMatchingCore::RunInnerLoop(Image& projection_filter, float c_pixel, float c_defocus, int threadIDX, long& current_correlation_position) {
+void TemplateMatchingCore::RunInnerLoop(Image& projection_filter, int threadIDX, long& current_correlation_position) {
 
-    this->c_defocus                   = c_defocus;
-    this->c_pixel                     = c_pixel;
     total_number_of_cccs_calculated   = 0;
     total_number_of_histogram_samples = 0;
     total_number_of_stats_samples     = 0;
@@ -151,12 +142,10 @@ void TemplateMatchingCore::RunInnerLoop(Image& projection_filter, float c_pixel,
     }
 #endif
 
-    const float wanted_histogram_sampling = 1.0f; // 100% until tested
-    const float wanted_stats_sampling     = 1.0f; // 100% until tested
     if ( this_is_the_first_run_on_inner_loop ) {
         d_input_image.CopyFP32toFP16buffer(false);
         d_padded_reference.CopyFP32toFP16buffer(false);
-        my_dist = std::make_unique<TM_EmpiricalDistribution<__half, __half2>>(d_input_image, histogram_min_scaled, histogram_step_scaled, pre_padding, roi, wanted_histogram_sampling, wanted_stats_sampling);
+        my_dist = std::make_unique<TM_EmpiricalDistribution<__half, __half2>>(d_input_image, pre_padding, roi, histogram_sampling_, stats_sampling_);
     }
     else {
         my_dist->ZeroHistogram( );
@@ -177,15 +166,14 @@ void TemplateMatchingCore::RunInnerLoop(Image& projection_filter, float c_pixel,
 
     cudaErr(cudaEventCreateWithFlags(&mip_is_done_Event, cudaEventBlockingSync));
 #ifdef cisTEM_USING_FastFFT
-    FastFFT::FourierTransformer<float, __half, __half2, 2>                           FT;
-    FastFFT::KernelFunction::my_functor<float, 0, FastFFT::KernelFunction::NOOP>     noop;
-    FastFFT::KernelFunction::my_functor<float, 4, FastFFT::KernelFunction::CONJ_MUL> conj_mul;
+    FastFFT::FourierTransformer<float, __half, __half2, 2> FT;
 
     // float scale_factor = powf((float)d_current_projection[0].number_of_real_space_pixels, -2.0);
     // float scale_factor = 1.f;
     float scale_factor = sqrtf(1.0f / float(d_input_image.number_of_real_space_pixels));
 
     FastFFT::KernelFunction::my_functor<float, 4, FastFFT::KernelFunction::CONJ_MUL_THEN_SCALE> conj_mul_then_scale(scale_factor);
+    FastFFT::KernelFunction::my_functor<float, 0, FastFFT::KernelFunction::NOOP>                noop;
 
     if ( use_fast_fft ) {
         // TODO: overload that takes and short4's int4's instead of the individual values
@@ -269,8 +257,8 @@ void TemplateMatchingCore::RunInnerLoop(Image& projection_filter, float c_pixel,
             else {
                 // Make sure the previous copy from host -> device has completed before we start to make another projection.
                 // Event is created as non-blocking so this is a busy-wait.
-
-                template_reconstruction.ExtractSlice(current_projection[current_projection_idx], angles, 1.0f, false);
+                MyDebugAssertFalse(cpu_template == nullptr, "Template reconstruction is not set with SetCpuTemplate");
+                cpu_template->ExtractSlice(current_projection[current_projection_idx], angles, 1.0f, false);
 
                 current_projection[current_projection_idx].SwapRealSpaceQuadrants( );
                 current_projection[current_projection_idx].MultiplyPixelWise(projection_filter);