diff --git a/src/constants/constants.h b/src/constants/constants.h index c01d8498..652525f9 100644 --- a/src/constants/constants.h +++ b/src/constants/constants.h @@ -42,7 +42,7 @@ constexpr float histogram_first_bin_midpoint = histogram_min + (histogram_step / constexpr int number_of_output_images = 8; //mip, psi, theta, phi, pixel, defocus, sums, sqsums (scaled mip is not sent out) constexpr int number_of_meta_data_values = 10; // img_x, img_y, number cccs, histogram values, pixel_size constexpr int MAX_ALLOWED_NUMBER_OF_PEAKS = 1000; // An error will be thrown and job aborted if this number of peaks is exceeded in the make template results block -constexpr float MAX_BINNING_FACTOR = histogram_min; // The maximum binning factor allowed for the template matching search image +constexpr float MAX_BINNING_FACTOR = 10.f; // The maximum binning factor allowed for the template matching search image constexpr float MIN_VALUE_TO_MIP = 1.0f; // The minimum value to be considered for the MIP /** diff --git a/src/gpu/DeviceManager.cu b/src/gpu/DeviceManager.cu index 994fd83f..119424f7 100644 --- a/src/gpu/DeviceManager.cu +++ b/src/gpu/DeviceManager.cu @@ -38,45 +38,19 @@ void DeviceManager::Init(int wanted_number_of_gpus) { float GPU_score; float GPU_score_max = 0.0f; - // memset(&prop, 0, sizeof(cudaDeviceProp)); - // prop.memoryBusWidth = 8192; - // prop.memoryClockRate = 4010000; - // prop.multiProcessorCount = 160; - // prop.totalGlobalMem = 68174610432; // 64 Gb - // - // cudaErr(cudaChooseDevice(&selected_GPU, &prop)); - // wxPrintf("Chosen device dumber: %d\n", selected_GPU); - - // // sleep for a short random bit between 0.5 and 5 seconds. The goal is so mutli-card units have work sent to the next card. - // wxMilliSleep((global_random_number_generator.GetUniformRandom() + 1.5) * 2000); for ( int iGPU = 0; iGPU < gpu_check; iGPU++ ) { cudaDeviceProp prop; cudaErr(cudaGetDeviceProperties(&prop, iGPU)); - // wxPrintf("Device Number: %d\n", iGPU); - // wxPrintf(" Device name: %s\n", prop.name); - // wxPrintf(" Memory Clock Rate (KHz): %d\n", prop.memoryClockRate); - // wxPrintf(" Memory Bus Width (bits): %d\n", prop.memoryBusWidth); - // wxPrintf(" Peak Memory Bandwidth (GB/s): %f\n", 2.0*prop.memoryClockRate*(prop.memoryBusWidth/8)/1.0e6); - // wxPrintf(" Number of multiprocessors: %d\n", prop.multiProcessorCount); - // wxPrintf(" Threads per multiprocessor: %d\n", prop.maxThreadsPerMultiProcessor); - // wxPrintf(" Memory per multiprocessor: %li\n", long(prop.sharedMemPerMultiprocessor)); - // wxPrintf(" Memory on device: %li\n", long(prop.totalGlobalMem)); - // wxPrintf("\n"); cudaErr(cudaDeviceGetAttribute(&cuda_compute_mode, cudaDevAttrComputeMode, iGPU)); cudaErr(cudaSetDevice(iGPU)); - // cudaErr(cudaMemGetInfo(&free_mem,&total_mem)); - // wxPrintf("Found %zd free mem out of %zd on iGPU %d\n", free_mem, total_mem, iGPU); - // wxPrintf("cuda_compute_mode, cudaDevAttrComputeMode, cudaComputeModeProhibited = %i %i %i\n", cuda_compute_mode, cudaDevAttrComputeMode, cudaComputeModeProhibited); - if ( cuda_compute_mode != cudaComputeModeProhibited && prop.totalGlobalMem > min_memory_available ) { if ( cuda_compute_mode != cudaComputeModeDefault ) { wxPrintf("\n\n\tWarning : Your device id %d is not set to Default compute mode.\n\n", iGPU); } cudaErr(cudaMemGetInfo(&free_mem, &total_mem)); - // wxPrintf("Found %zd free mem out of %zd on iGPU %d\n", free_mem, total_mem, iGPU); if ( total_mem < min_memory_total ) { // Don't use this device. This might not be the best way to do this. cudaErr(cudaDeviceReset( )); @@ -85,14 +59,8 @@ void DeviceManager::Init(int wanted_number_of_gpus) { else if ( free_mem > max_mem && free_mem > min_memory_available ) { max_mem = free_mem; } - // else if (free_mem > max_mem && free_mem > min_memory_available) { max_mem = free_mem; selected_GPU = iGPU; } GPU_score = (float)free_mem * (float)prop.memoryClockRate * (float)prop.memoryBusWidth * (float)prop.multiProcessorCount * (float)prop.maxThreadsPerMultiProcessor; - // wxPrintf(" Memory Clock Rate (KHz): %d\n", prop.memoryClockRate); - // wxPrintf(" Memory Bus Width (bits): %d\n", prop.memoryBusWidth); - // wxPrintf(" Number of multiprocessors: %d\n", prop.multiProcessorCount); - // wxPrintf(" Threads per multiprocessor: %d\n", prop.maxThreadsPerMultiProcessor); - // wxPrintf("GPU score = %g\n", GPU_score); if ( GPU_score > GPU_score_max ) { GPU_score_max = GPU_score; selected_GPU = iGPU; @@ -110,15 +78,8 @@ void DeviceManager::Init(int wanted_number_of_gpus) { is_manager_initialized = true; }; -void DeviceManager::SetGpu(int cpu_thread_idx) { - - // // Select the current device - // this->gpuIDX = -1; - // cudaErr(cudaSetDevice(cpu_thread_idx % this->nGPUs)); // "% num_gpus" allows more CPU threads than GPU devices +void DeviceManager::SetGpu( ) { cudaErr(cudaSetDevice(this->gpuIDX)); - // cudaErr(cudaGetDevice(&gpuIDX)); - - // wxPrintf("For thread %d of nGpus %d assigned %d\n",cpu_thread_idx, nGPUs, gpuIDX); }; void DeviceManager::ResetGpu( ) { diff --git a/src/gpu/DeviceManager.h b/src/gpu/DeviceManager.h index f0d5dfae..fd888ec5 100644 --- a/src/gpu/DeviceManager.h +++ b/src/gpu/DeviceManager.h @@ -13,7 +13,7 @@ class DeviceManager { virtual ~DeviceManager( ); void Init(int wanted_number_of_gpus); - void SetGpu(int cpu_thread_idx); + void SetGpu( ); void ResetGpu( ); void ListDevices( ); diff --git a/src/gpu/GpuImage.cu b/src/gpu/GpuImage.cu index 8af55522..6731c2ef 100644 --- a/src/gpu/GpuImage.cu +++ b/src/gpu/GpuImage.cu @@ -16,15 +16,6 @@ // #define USE_BLOCK_REDUCE #define USE_FP16_FOR_WHITENPS -__global__ void -MipPixelWiseKernel(cufftReal* mip, const cufftReal* correlation_output, const int4 dims); -__global__ void -MipPixelWiseKernel(cufftReal* mip, cufftReal* other_image, cufftReal* psi, cufftReal* phi, cufftReal* theta, - int4 dims, float c_psi, float c_phi, float c_theta); -__global__ void -MipPixelWiseKernel(cufftReal* mip, cufftReal* other_image, cufftReal* psi, cufftReal* phi, cufftReal* theta, cufftReal* defocus, cufftReal* pixel, const int4 dims, - float c_psi, float c_phi, float c_theta, float c_defocus, float c_pixel); - // Inline declarations __device__ __forceinline__ int d_ReturnFourierLogicalCoordGivenPhysicalCoord_X(int physical_index, @@ -527,7 +518,6 @@ MultiplyPixelWiseComplexConjugateKernel(const StorageType* __restrict__ img_comp int address = x + (dims.w / 2) * y; int stride = (dims.w / 2) * dims.y; - constexpr float epsilon = 1e-6f; // We have one referece image, and this is broadcasted to all images in the stack if constexpr ( std::is_same::value ) { const Complex ref_val = (Complex)__half22float2(ref_complex_values[address]); @@ -1273,92 +1263,6 @@ float GpuImage::ReturnSumSquareModulusComplexValues( ) { return float(tmpValComplex[tmp_val_idx::ReturnSumSquareModulusComplexValues] * tmpValComplex[tmp_val_idx::ReturnSumSquareModulusComplexValues]); } -void GpuImage::MipPixelWise(GpuImage& other_image) { - - MyDebugAssertTrue(HasSameDimensionsAs(&other_image), "Images have different dimension."); - precheck; - ReturnLaunchParameters(dims, true); - MipPixelWiseKernel<<>>(real_values, other_image.real_values, this->dims); - postcheck; -} - -__global__ void -MipPixelWiseKernel(cufftReal* mip, const cufftReal* correlation_output, const int4 dims) { - - int3 coords = make_int3(blockIdx.x * blockDim.x + threadIdx.x, - blockIdx.y * blockDim.y + threadIdx.y, - blockIdx.z); - - if ( coords.x < dims.x && coords.y < dims.y && coords.z < dims.z ) { - int address = d_ReturnReal1DAddressFromPhysicalCoord(coords, dims); - mip[address] = MAX(mip[address], correlation_output[address]); - } -} - -void GpuImage::MipPixelWise(GpuImage& other_image, GpuImage& psi, GpuImage& phi, GpuImage& theta, GpuImage& defocus, GpuImage& pixel, - float c_psi, float c_phi, float c_theta, float c_defocus, float c_pixel) { - - MyDebugAssertTrue(HasSameDimensionsAs(&other_image), "Images have different dimension."); - precheck; - ReturnLaunchParameters(dims, true); - MipPixelWiseKernel<<>>(real_values, other_image.real_values, - psi.real_values, phi.real_values, theta.real_values, defocus.real_values, pixel.real_values, - this->dims, c_psi, c_phi, c_theta, c_defocus, c_pixel); - postcheck; -} - -__global__ void -MipPixelWiseKernel(cufftReal* mip, cufftReal* correlation_output, cufftReal* psi, cufftReal* phi, cufftReal* theta, cufftReal* defocus, cufftReal* pixel, const int4 dims, - float c_psi, float c_phi, float c_theta, float c_defocus, float c_pixel) { - - int3 coords = make_int3(blockIdx.x * blockDim.x + threadIdx.x, - blockIdx.y * blockDim.y + threadIdx.y, - blockIdx.z); - - if ( coords.x < dims.x && coords.y < dims.y && coords.z < dims.z ) { - int address = d_ReturnReal1DAddressFromPhysicalCoord(coords, dims); - if ( correlation_output[address] > mip[address] ) { - mip[address] = correlation_output[address]; - psi[address] = c_psi; - phi[address] = c_phi; - theta[address] = c_theta; - defocus[address] = c_defocus; - pixel[address] = c_pixel; - } - } -} - -void GpuImage::MipPixelWise(GpuImage& other_image, GpuImage& psi, GpuImage& phi, GpuImage& theta, - float c_psi, float c_phi, float c_theta) { - - MyDebugAssertTrue(HasSameDimensionsAs(&other_image), "Images have different dimension."); - precheck; - ReturnLaunchParameters(dims, true); - MipPixelWiseKernel<<>>(real_values, other_image.real_values, - psi.real_values, phi.real_values, theta.real_values, - this->dims, c_psi, c_phi, c_theta); - postcheck; -} - -__global__ void -MipPixelWiseKernel(cufftReal* mip, cufftReal* correlation_output, cufftReal* psi, cufftReal* phi, cufftReal* theta, const int4 dims, - float c_psi, float c_phi, float c_theta) { - - int3 coords = make_int3(blockIdx.x * blockDim.x + threadIdx.x, - blockIdx.y * blockDim.y + threadIdx.y, - blockIdx.z); - - if ( coords.x < dims.x && coords.y < dims.y && coords.z < dims.z ) { - int address = d_ReturnReal1DAddressFromPhysicalCoord(coords, dims); - if ( correlation_output[address] > mip[address] ) { - mip[address] = correlation_output[address]; - psi[address] = c_psi; - phi[address] = c_phi; - theta[address] = c_theta; - } - } -} - template __global__ void ApplyBFactorKernel(StorageType* d_input, diff --git a/src/gpu/GpuImage.h b/src/gpu/GpuImage.h index 1b52fa56..a0986d11 100644 --- a/src/gpu/GpuImage.h +++ b/src/gpu/GpuImage.h @@ -30,8 +30,6 @@ class GpuImage { GpuImage& operator=(const GpuImage& t); GpuImage& operator=(const GpuImage* t); - // FIXME: move constructor? - ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// // START MEMBER VARIABLES FROM THE cpu IMAGE CLASS @@ -297,11 +295,6 @@ class GpuImage { void WaitBlocking( ); void RecordAndWait( ); // Maximum intensity projection - void MipPixelWise(GpuImage& other_image); - void MipPixelWise(GpuImage& other_image, GpuImage& psi, GpuImage& phi, GpuImage& theta, - float c_psi, float c_phi, float c_theta); - void MipPixelWise(GpuImage& other_image, GpuImage& psi, GpuImage& phi, GpuImage& theta, GpuImage& defocus, GpuImage& pixel, - float c_psi, float c_phi, float c_theta, float c_defocus, float c_pixel); // FIXME: These are added for the unblur refinement but are untested. template diff --git a/src/gpu/TemplateMatchingCore.cu b/src/gpu/TemplateMatchingCore.cu index aac6d156..9fd1659e 100644 --- a/src/gpu/TemplateMatchingCore.cu +++ b/src/gpu/TemplateMatchingCore.cu @@ -11,35 +11,26 @@ using namespace cistem_timer; -constexpr bool use_gpu_prj = true; - void TemplateMatchingCore::Init(MyApp* parent_pointer, std::shared_ptr wanted_template_reconstruction, Image& wanted_input_image, Image& current_projection, - float pixel_size_search_range, - float pixel_size_step, - float pixel_size, - float defocus_search_range, - float defocus_step, - float defocus1, - float defocus2, float psi_max, float psi_start, float psi_step, AnglesAndShifts& angles, EulerSearch& global_euler_search, - float histogram_min_scaled, - float histogram_step_scaled, - int histogram_number_of_bins, const int2 pre_padding, const int2 roi, + float histogram_sampling, + float stats_sampling, int first_search_position, int last_search_position, ProgressBar* my_progress, long total_correlation_positions, bool is_running_locally, bool use_fast_fft, + bool use_gpu_prj, int number_of_global_search_images_to_save) { @@ -49,6 +40,10 @@ void TemplateMatchingCore::Init(MyApp* parent_pointer, MyDebugAssertTrue(wanted_input_image.is_in_memory, "Input image must be in memory"); object_initialized_ = true; + this->use_gpu_prj = use_gpu_prj; + histogram_sampling_ = histogram_sampling; + stats_sampling_ = stats_sampling; + this->first_search_position = first_search_position; this->last_search_position = last_search_position; this->angles = angles; @@ -97,10 +92,8 @@ void TemplateMatchingCore::Init(MyApp* parent_pointer, d_statistical_buffers_ptrs[i]->Allocate(d_input_image.dims.x, d_input_image.dims.y, number_of_global_search_images_to_save, true); } - this->pre_padding = pre_padding; - this->roi = roi; - this->histogram_min_scaled = histogram_min_scaled; - this->histogram_step_scaled = histogram_step_scaled; + this->pre_padding = pre_padding; + this->roi = roi; this->my_progress = my_progress; this->total_correlation_positions = total_correlation_positions; @@ -115,10 +108,8 @@ void TemplateMatchingCore::Init(MyApp* parent_pointer, // Transfer the input image_memory_should_not_be_deallocated }; -void TemplateMatchingCore::RunInnerLoop(Image& projection_filter, float c_pixel, float c_defocus, int threadIDX, long& current_correlation_position) { +void TemplateMatchingCore::RunInnerLoop(Image& projection_filter, int threadIDX, long& current_correlation_position) { - this->c_defocus = c_defocus; - this->c_pixel = c_pixel; total_number_of_cccs_calculated = 0; total_number_of_histogram_samples = 0; total_number_of_stats_samples = 0; @@ -151,12 +142,10 @@ void TemplateMatchingCore::RunInnerLoop(Image& projection_filter, float c_pixel, } #endif - const float wanted_histogram_sampling = 1.0f; // 100% until tested - const float wanted_stats_sampling = 1.0f; // 100% until tested if ( this_is_the_first_run_on_inner_loop ) { d_input_image.CopyFP32toFP16buffer(false); d_padded_reference.CopyFP32toFP16buffer(false); - my_dist = std::make_unique>(d_input_image, histogram_min_scaled, histogram_step_scaled, pre_padding, roi, wanted_histogram_sampling, wanted_stats_sampling); + my_dist = std::make_unique>(d_input_image, pre_padding, roi, histogram_sampling_, stats_sampling_); } else { my_dist->ZeroHistogram( ); @@ -177,15 +166,14 @@ void TemplateMatchingCore::RunInnerLoop(Image& projection_filter, float c_pixel, cudaErr(cudaEventCreateWithFlags(&mip_is_done_Event, cudaEventBlockingSync)); #ifdef cisTEM_USING_FastFFT - FastFFT::FourierTransformer FT; - FastFFT::KernelFunction::my_functor noop; - FastFFT::KernelFunction::my_functor conj_mul; + FastFFT::FourierTransformer FT; // float scale_factor = powf((float)d_current_projection[0].number_of_real_space_pixels, -2.0); // float scale_factor = 1.f; float scale_factor = sqrtf(1.0f / float(d_input_image.number_of_real_space_pixels)); FastFFT::KernelFunction::my_functor conj_mul_then_scale(scale_factor); + FastFFT::KernelFunction::my_functor noop; if ( use_fast_fft ) { // TODO: overload that takes and short4's int4's instead of the individual values @@ -269,8 +257,8 @@ void TemplateMatchingCore::RunInnerLoop(Image& projection_filter, float c_pixel, else { // Make sure the previous copy from host -> device has completed before we start to make another projection. // Event is created as non-blocking so this is a busy-wait. - - template_reconstruction.ExtractSlice(current_projection[current_projection_idx], angles, 1.0f, false); + MyDebugAssertFalse(cpu_template == nullptr, "Template reconstruction is not set with SetCpuTemplate"); + cpu_template->ExtractSlice(current_projection[current_projection_idx], angles, 1.0f, false); current_projection[current_projection_idx].SwapRealSpaceQuadrants( ); current_projection[current_projection_idx].MultiplyPixelWise(projection_filter); diff --git a/src/gpu/TemplateMatchingCore.h b/src/gpu/TemplateMatchingCore.h index fb91ea9c..753f1731 100644 --- a/src/gpu/TemplateMatchingCore.h +++ b/src/gpu/TemplateMatchingCore.h @@ -5,13 +5,15 @@ #include "GpuImage.h" #include "DeviceManager.h" -#include "Histogram.h" #include "template_matching_empirical_distribution.h" class TemplateMatchingCore { private: - bool object_initialized_; + bool object_initialized_; + bool use_gpu_prj; + float histogram_sampling_; + float stats_sampling_; public: TemplateMatchingCore( ) : object_initialized_{false} { }; @@ -24,14 +26,9 @@ class TemplateMatchingCore { void Init(int number_of_jobs); - DeviceManager gpuDev; - - int nGPUs; - int nThreads; int number_of_jobs_per_image_in_gui; // CPU images to be passed in - - Image template_reconstruction; std::shared_ptr template_gpu_shared; Image input_image; // These will be modified on the host from withing Template Matching Core so Allocate locally @@ -39,6 +36,8 @@ class TemplateMatchingCore { float binning_factor = 1.f; std::vector current_projection; + // Generally not used, except for --disable-gpu-prj + Image* cpu_template = nullptr; // These are assumed to be empty containers at the outset, so xfer host-->device is skipped GpuImage d_max_intensity_projection; @@ -62,20 +61,9 @@ class TemplateMatchingCore { GpuImage d_padded_reference; // Search range parameters - float pixel_size_search_range; - float pixel_size_step; - float pixel_size; - float defocus_search_range; - float defocus_step; - float defocus1; - float defocus2; float psi_max; float psi_start; float psi_step; - float minimum_threshold = 20.0f; // Optionally override this to limit what is considered for refinement - - float c_defocus; - float c_pixel; int current_search_position; int first_search_position; @@ -87,17 +75,13 @@ class TemplateMatchingCore { int n_global_search_images_to_save; - bool is_running_locally; - bool is_gpu_3d_swapped; - bool use_fast_fft; - Histogram histogram; + bool is_running_locally; + bool use_fast_fft; std::unique_ptr> my_dist; - float histogram_min_scaled; - float histogram_step_scaled; - int2 pre_padding; - int2 roi; + int2 pre_padding; + int2 roi; // Search objects AnglesAndShifts angles; @@ -122,38 +106,33 @@ class TemplateMatchingCore { void UpdateSecondaryPeaks( ); - void SetMinimumThreshold(float wanted_threshold) { minimum_threshold = wanted_threshold; } + void SetCpuTemplate(Image* cpu_template) { + this->cpu_template = cpu_template; + } void Init(MyApp* parent_pointer, std::shared_ptr template_reconstruction, Image& input_image, Image& current_projection, - float pixel_size_search_range, - float pixel_size_step, - float pixel_size, - float defocus_search_range, - float defocus_step, - float defocus1, - float defocus2, float psi_max, float psi_start, float psi_step, AnglesAndShifts& angles, EulerSearch& global_euler_search, - float histogram_min_scaled, - float histogram_step_scaled, - int histogram_number_of_bins, const int2 pre_padding, const int2 roi, + float histogram_sampling, + float stats_sampling, int first_search_position, int last_search_position, ProgressBar* my_progress, long total_correlation_positions, bool is_running_locally, bool use_fast_fft, + bool use_gpu_prj, int number_of_global_search_images_to_save = 1); - void RunInnerLoop(Image& projection_filter, float pixel_i, float defocus_i, int threadIDX, long& current_correlation_position); + void RunInnerLoop(Image& projection_filter, int threadIDX, long& current_correlation_position); }; #endif diff --git a/src/gpu/template_matching_empirical_distribution.cu b/src/gpu/template_matching_empirical_distribution.cu index 327e8dc0..8e04e1f9 100644 --- a/src/gpu/template_matching_empirical_distribution.cu +++ b/src/gpu/template_matching_empirical_distribution.cu @@ -34,13 +34,11 @@ inline __device__ __host__ bool test_gt_zero(T value) { */ template -TM_EmpiricalDistribution::TM_EmpiricalDistribution(GpuImage& reference_image, - histogram_storage_t histogram_min, - histogram_storage_t histogram_step, - int2 pre_padding, - int2 roi, - const float histogram_sampling_rate, - const float stats_sampling_rate) : pre_padding_{pre_padding}, +TM_EmpiricalDistribution::TM_EmpiricalDistribution(GpuImage& reference_image, + int2 pre_padding, + int2 roi, + const float histogram_sampling_rate, + const float stats_sampling_rate) : pre_padding_{pre_padding}, roi_{roi}, higher_order_moments_{false}, image_plane_mem_allocated_{reference_image.real_memory_allocated}, @@ -65,16 +63,16 @@ TM_EmpiricalDistribution::TM_EmpiricalDistribution(GpuImage& // If anything, given that the output of the matched filter is ~ Gaussian, all the numbers closer to zero are less // likely to be flushed to zero when denormal, so in that respect, bflaot16 may actually maintain higher precision. if constexpr ( std::is_same_v ) { - histogram_min_ = __float2half_rn(histogram_min); - histogram_step_ = __float2half_rn(histogram_step); + histogram_min_ = __float2half_rn(TM::histogram_min); + histogram_step_ = __float2half_rn(TM::histogram_step); } else if constexpr ( std::is_same_v ) { - histogram_min_ = __float2bfloat16_rn(histogram_min); - histogram_step_ = __float2bfloat16_rn(histogram_step); + histogram_min_ = __float2bfloat16_rn(TM::histogram_min); + histogram_step_ = __float2bfloat16_rn(TM::histogram_step); } else if constexpr ( std::is_same_v ) { - histogram_min_ = histogram_min; - histogram_step_ = histogram_step; + histogram_min_ = TM::histogram_min; + histogram_step_ = TM::histogram_step; } else { MyDebugAssertTrue(false, "input_type must be either __half __nv_bfloat16, or histogram_storage_t"); @@ -113,6 +111,7 @@ TM_EmpiricalDistribution::TM_EmpiricalDistribution(GpuImage& ZeroSamplingCounters( ); AllocateAndZeroStatisticalArrays( ); + object_initialized_ = true; }; template @@ -156,6 +155,12 @@ void TM_EmpiricalDistribution::AllocateAndZeroStatisticalArray template TM_EmpiricalDistribution::~TM_EmpiricalDistribution( ) { + if ( object_initialized_ ) + Delete( ); +}; + +template +void TM_EmpiricalDistribution::Delete( ) { MyDebugAssertFalse(cudaStreamQuery(calc_stream_[0]) == cudaErrorInvalidResourceHandle, "The cuda stream is invalid"); cudaErr(cudaFreeAsync(histogram_, calc_stream_[0])); @@ -178,7 +183,9 @@ TM_EmpiricalDistribution::~TM_EmpiricalDistribution( ) { cudaErr(cudaStreamDestroy(calc_stream_[0])); cudaErr(cudaEventDestroy(mip_stack_is_ready_event_[0])); -}; + + object_initialized_ = false; +} template void TM_EmpiricalDistribution::ZeroHistogram( ) { @@ -261,7 +268,7 @@ inline __device__ void write_mip_and_stats(float* sum_array, if constexpr ( std::is_same_v ) { // I though short circuit logic would be equivalent, but maybe the cuda driver is pre-fetching values? The nested conditional is ~3% faster on total run time // indicating we are skipping unnecessary reads. - if ( max_val > ccfType{cistem::match_template::MIN_VALUE_TO_MIP} ) { + if ( max_val > ccfType{TM::MIN_VALUE_TO_MIP} ) { if ( max_val > __low2half(mip_psi[address]) ) { mip_psi[address] = __halves2half2(max_val, psi[max_idx]); theta_phi[address] = __halves2half2(theta[max_idx], phi[max_idx]); @@ -269,7 +276,7 @@ inline __device__ void write_mip_and_stats(float* sum_array, } } else if constexpr ( std::is_same_v ) { - if ( max_val > ccfType{cistem::match_template::MIN_VALUE_TO_MIP} ) { + if ( max_val > ccfType{TM::MIN_VALUE_TO_MIP} ) { if ( max_val > __low2bfloat16(mip_psi[address]) ) { mip_psi[address] = __halves2bfloat162(max_val, psi[max_idx]); theta_phi[address] = __halves2bfloat162(theta[max_idx], phi[max_idx]); @@ -277,7 +284,7 @@ inline __device__ void write_mip_and_stats(float* sum_array, } } else if constexpr ( std::is_same_v ) { - if ( max_val > ccfType{cistem::match_template::MIN_VALUE_TO_MIP} ) { + if ( max_val > ccfType{TM::MIN_VALUE_TO_MIP} ) { if ( max_val > mip_psi[address].x ) { mip_psi[address].x = max_val; mip_psi[address].y = psi[max_idx]; @@ -344,7 +351,7 @@ __global__ void __launch_bounds__(TM::histogram_number_of_points) for ( int i = pre_padding.x + physical_X( ); i < pre_padding.x + roi.x; i += GridStride_2dGrid_X( ) ) { // address = ((z * NY + y) * pitch_in_pixels) + x; address = j * pitch_in_pixels_img + i; - ccfType max_val = ccfType{cistem::match_template::histogram_min}; + ccfType max_val = ccfType{TM::histogram_min}; int max_idx = 0; float sum{0.f}, sum_sq{0.f}; for ( int k = 0; k < n_slices_to_process; k++ ) { diff --git a/src/gpu/template_matching_empirical_distribution.h b/src/gpu/template_matching_empirical_distribution.h index ffed5c6b..c2a6b6e2 100644 --- a/src/gpu/template_matching_empirical_distribution.h +++ b/src/gpu/template_matching_empirical_distribution.h @@ -27,6 +27,7 @@ class TM_EmpiricalDistribution { private: bool higher_order_moments_; + bool object_initialized_{ }; int current_image_index_; ccfType histogram_min_; ccfType histogram_step_; @@ -82,15 +83,14 @@ class TM_EmpiricalDistribution { * @param n_images_to_accumulate_concurrently - the number of images to accumulate concurrently * */ - TM_EmpiricalDistribution(GpuImage& reference_image, - histogram_storage_t histogram_min, - histogram_storage_t histogram_step, - int2 pre_padding, - int2 roi, - const float histogram_sampling_rate, - const float stats_sampling_rate); + TM_EmpiricalDistribution(GpuImage& reference_image, + int2 pre_padding, + int2 roi, + const float histogram_sampling_rate, + const float stats_sampling_rate); ~TM_EmpiricalDistribution( ); + void Delete( ); // delete the copy and move constructors as these are not handled or needed. TM_EmpiricalDistribution(const TM_EmpiricalDistribution&) = delete; diff --git a/src/programs/match_template/match_template.cpp b/src/programs/match_template/match_template.cpp index 978c7aa7..b31f307e 100644 --- a/src/programs/match_template/match_template.cpp +++ b/src/programs/match_template/match_template.cpp @@ -70,6 +70,7 @@ class float GetMaxJobWaitTimeInSeconds( ) { return 360.0f; } private: + void AddCommandLineOptions( ); template void CalcGlobalCCCScalingFactor(double& global_ccc_mean, double& global_ccc_std_dev, @@ -99,6 +100,13 @@ IMPLEMENT_APP(MatchTemplateApp) void MatchTemplateApp::ProgramSpecificInit( ) { } +// Optional command-line stuff +void MatchTemplateApp::AddCommandLineOptions( ) { + command_line_parser.AddOption("", "histogram-sampling", "Random sampling of the histogram, fraction sampled. 0.05 - 1.0 [1.0 default]", wxCMD_LINE_VAL_DOUBLE); + command_line_parser.AddOption("", "stats-sampling", "Random sampling of the statistical arrays, fraction sampled. 0.05 - 1.0 [1.0 default]", wxCMD_LINE_VAL_DOUBLE); + command_line_parser.AddLongSwitch("disable-gpu-prj", "Disable projection using the gpu. Default false"); +} + // override the DoInteractiveUserInput void MatchTemplateApp::DoInteractiveUserInput( ) { @@ -255,6 +263,23 @@ bool MatchTemplateApp::DoCalculation( ) { wxDateTime start_time = wxDateTime::Now( ); + double temp_double; + float histogram_sampling = 1.0f; + float stats_sampling = 1.0f; + bool use_gpu_prj = true; + if ( command_line_parser.Found("histogram-sampling", &temp_double) ) { + std::cerr << "histogram-sampling: " << temp_double << std::endl; + histogram_sampling = float(temp_double); + } + if ( command_line_parser.Found("stats-sampling", &temp_double) ) { + std::cerr << "stats-sampling: " << temp_double << std::endl; + stats_sampling = float(temp_double); + } + if ( command_line_parser.FoundSwitch("disable-gpu-prj") ) { + SendInfo("Disabling GPU projection\n"); + use_gpu_prj = false; + } + wxString input_search_images_filename = my_current_job.arguments[0].ReturnStringArgument( ); wxString input_reconstruction_filename = my_current_job.arguments[1].ReturnStringArgument( ); float input_pixel_size = my_current_job.arguments[2].ReturnFloatArgument( ); @@ -282,7 +307,7 @@ bool MatchTemplateApp::DoCalculation( ) { wxString best_phi_output_file = my_current_job.arguments[24].ReturnStringArgument( ); wxString best_defocus_output_file = my_current_job.arguments[25].ReturnStringArgument( ); wxString best_pixel_size_output_file = my_current_job.arguments[26].ReturnStringArgument( ); - wxString scaled_mip_output_mrcfile = my_current_job.arguments[27].ReturnStringArgument( ); + wxString scaled_mip_output_file = my_current_job.arguments[27].ReturnStringArgument( ); wxString correlation_avg_output_file = my_current_job.arguments[28].ReturnStringArgument( ); wxString my_symmetry = my_current_job.arguments[29].ReturnStringArgument( ); float in_plane_angular_step = my_current_job.arguments[30].ReturnFloatArgument( ); @@ -312,6 +337,11 @@ bool MatchTemplateApp::DoCalculation( ) { max_threads = 1; } + if ( ! use_gpu && (FloatsAreAlmostTheSame(histogram_sampling, 1.0f) == false || FloatsAreAlmostTheSame(stats_sampling, 1.0f) == false) ) { + // only GPU can handle this + SendError("sub-sampling of the histogram and statistical arrays is only supported on the GPU implementation\n"); + } + profile_timing.start("Init"); ParameterMap parameter_map; // needed for euler search init //for (int i = 0; i < 5; i++) {parameter_map[i] = true;} @@ -334,7 +364,6 @@ bool MatchTemplateApp::DoCalculation( ) { float temp_float; float variance; - double temp_double; double temp_double_array[5]; int number_of_rotations; @@ -496,7 +525,6 @@ bool MatchTemplateApp::DoCalculation( ) { //psi_step = 5; // search grid - // TODO: when checking the impact of limiting the resolution, it may be worthwile to NOT limit the number of search positions global_euler_search.InitGrid(my_symmetry, angular_step, 0.0f, 0.0f, psi_max, psi_step, psi_start, data_sizer.GetSearchPixelSize( ) / high_resolution_limit_search, parameter_map, best_parameters_to_keep); @@ -632,43 +660,63 @@ bool MatchTemplateApp::DoCalculation( ) { if ( use_gpu ) { #ifdef ENABLEGPU - // FIXME: move this (and the above CPU steps) into a method to prepare the 3d reference. - profile_timing.start("Swap Fourier Quadrants"); - template_reconstruction.BackwardFFT( ); - template_reconstruction.SwapFourierSpaceQuadrants(false); - profile_timing.lap("Swap Fourier Quadrants"); - // We only want to have one copy of the 3d template in texture memory that each thread can then reference. - // First allocate a shared pointer and construct the GpuImage based on the CPU template - // TODO: Initially, i had this set to use - // GpuImage::InitializeBasedOnCpuImage(tmp_vol, false, true); where the memory is instructed not to be pinned. - // This should be fine now, but . - profile_timing.start("CopyHostToDeviceTextureComplex3d"); - - std::shared_ptr template_reconstruction_gpu = std::make_shared(template_reconstruction); - template_reconstruction_gpu->CopyHostToDeviceTextureComplex3d(template_reconstruction); - profile_timing.lap("CopyHostToDeviceTextureComplex3d"); + std::shared_ptr template_reconstruction_gpu; + if ( use_gpu_prj ) { + + // FIXME: move this (and the above CPU steps) into a method to prepare the 3d reference. + profile_timing.start("Swap Fourier Quadrants"); + template_reconstruction.BackwardFFT( ); + template_reconstruction.SwapFourierSpaceQuadrants(false); + profile_timing.lap("Swap Fourier Quadrants"); + // We only want to have one copy of the 3d template in texture memory that each thread can then reference. + // First allocate a shared pointer and construct the GpuImage based on the CPU template + // TODO: Initially, i had this set to use + // GpuImage::InitializeBasedOnCpuImage(tmp_vol, false, true); where the memory is instructed not to be pinned. + // This should be fine now, but . + profile_timing.start("CopyHostToDeviceTextureComplex3d"); + + template_reconstruction_gpu = std::make_shared(template_reconstruction); + template_reconstruction_gpu->CopyHostToDeviceTextureComplex3d(template_reconstruction); + + profile_timing.lap("CopyHostToDeviceTextureComplex3d"); + } + #pragma omp parallel num_threads(max_threads) { int tIDX = ReturnThreadNumberOfCurrentThread( ); - gpuDev.SetGpu(tIDX); + gpuDev.SetGpu( ); if ( first_gpu_loop ) { int t_first_search_position = first_search_position + (tIDX * incPos); int t_last_search_position = first_search_position + (incPos - 1) + (tIDX * incPos); - if ( tIDX == (max_threads - 1) ) t_last_search_position = maxPos; profile_timing.start("Init GPU"); - GPU[tIDX].Init(this, template_reconstruction_gpu, input_image, current_projection, - pixel_size_search_range, pixel_size_step, data_sizer.GetSearchPixelSize( ), - defocus_search_range, defocus_step, defocus1, defocus2, - psi_max, psi_start, psi_step, - angles, global_euler_search, - histogram_min, histogram_step, histogram_number_of_points, - data_sizer.GetPrePadding( ), data_sizer.GetRoi( ), t_first_search_position, t_last_search_position, - my_progress, total_correlation_positions_per_thread, is_running_locally, use_fast_fft); + GPU[tIDX].Init(this, + template_reconstruction_gpu, + input_image, + current_projection, + psi_max, + psi_start, + psi_step, + angles, + global_euler_search, + data_sizer.GetPrePadding( ), + data_sizer.GetRoi( ), + histogram_sampling, + stats_sampling, + t_first_search_position, + t_last_search_position, + my_progress, + total_correlation_positions_per_thread, + is_running_locally, + use_fast_fft, + use_gpu_prj); profile_timing.lap("Init GPU"); + if ( ! use_gpu_prj ) { + GPU[tIDX].SetCpuTemplate(&template_reconstruction); + } #ifdef USE_LERP_NOT_FOURIER_RESIZING std::cerr << "\n\nUsing LERP\n\n"; @@ -706,9 +754,9 @@ bool MatchTemplateApp::DoCalculation( ) { #pragma omp parallel num_threads(max_threads) { int tIDX = ReturnThreadNumberOfCurrentThread( ); - gpuDev.SetGpu(tIDX); + gpuDev.SetGpu( ); profile_timing.start("RunInnerLoop"); - GPU[tIDX].RunInnerLoop(projection_filter, size_i, defocus_i, tIDX, current_correlation_position); + GPU[tIDX].RunInnerLoop(projection_filter, tIDX, current_correlation_position); profile_timing.lap("RunInnerLoop"); #pragma omp critical @@ -920,7 +968,7 @@ bool MatchTemplateApp::DoCalculation( ) { #endif temp_image.WriteSlice(&mip_out, 1); - MRCFile scaled_mip_output_mrcfile(mip_output_file.ToStdString( ), true); + MRCFile scaled_mip_output_mrcfile(scaled_mip_output_file.ToStdString( ), true); scaled_mip_output_mrcfile.SetPixelSize(data_sizer.GetPixelSize( )); #ifdef USE_FP16_PARTICLE_STACKS scaled_mip_output_mrcfile.SetOutputToFP16( ); diff --git a/src/programs/match_template/template_matching_data_sizer.cpp b/src/programs/match_template/template_matching_data_sizer.cpp index 528c0eed..3fd1c40c 100644 --- a/src/programs/match_template/template_matching_data_sizer.cpp +++ b/src/programs/match_template/template_matching_data_sizer.cpp @@ -897,7 +897,6 @@ void TemplateMatchingDataSizer::FillInNearestNeighbors(Image& output_image, Imag else size_neighborhood += 2; } - // We could try to dilate out each neighborhood, but this will be slower given the bad memory access. Better to do a little extra. int offset_max = size_neighborhood / 2;