diff --git a/src/constants/constants.h b/src/constants/constants.h
index c01d8498..652525f9 100644
--- a/src/constants/constants.h
+++ b/src/constants/constants.h
@@ -42,7 +42,7 @@ constexpr float histogram_first_bin_midpoint = histogram_min + (histogram_step /
 constexpr int   number_of_output_images     = 8; //mip, psi, theta, phi, pixel, defocus, sums, sqsums (scaled mip is not sent out)
 constexpr int   number_of_meta_data_values  = 10; // img_x, img_y, number cccs, histogram values, pixel_size
 constexpr int   MAX_ALLOWED_NUMBER_OF_PEAKS = 1000; // An error will be thrown and job aborted if this number of peaks is exceeded in the make template results block
-constexpr float MAX_BINNING_FACTOR          = histogram_min; // The maximum binning factor allowed for the template matching search image
+constexpr float MAX_BINNING_FACTOR          = 10.f; // The maximum binning factor allowed for the template matching search image
 constexpr float MIN_VALUE_TO_MIP            = 1.0f; // The minimum value to be considered for the MIP
 
 /**
diff --git a/src/gpu/DeviceManager.cu b/src/gpu/DeviceManager.cu
index 994fd83f..119424f7 100644
--- a/src/gpu/DeviceManager.cu
+++ b/src/gpu/DeviceManager.cu
@@ -38,45 +38,19 @@ void DeviceManager::Init(int wanted_number_of_gpus) {
     float  GPU_score;
     float  GPU_score_max = 0.0f;
 
-    //  memset(&prop, 0, sizeof(cudaDeviceProp));
-    //  prop.memoryBusWidth = 8192;
-    //  prop.memoryClockRate = 4010000;
-    //  prop.multiProcessorCount = 160;
-    //  prop.totalGlobalMem = 68174610432; // 64 Gb
-    //
-    //  cudaErr(cudaChooseDevice(&selected_GPU, &prop));
-    //  wxPrintf("Chosen device dumber: %d\n", selected_GPU);
-
-    //  // sleep for a short random bit between 0.5 and 5 seconds. The goal is so mutli-card units have work sent to the next card.
-    //  wxMilliSleep((global_random_number_generator.GetUniformRandom() + 1.5) * 2000);
     for ( int iGPU = 0; iGPU < gpu_check; iGPU++ ) {
         cudaDeviceProp prop;
         cudaErr(cudaGetDeviceProperties(&prop, iGPU));
-        //	  wxPrintf("Device Number: %d\n", iGPU);
-        //	  wxPrintf("  Device name: %s\n", prop.name);
-        //	  wxPrintf("  Memory Clock Rate (KHz): %d\n", prop.memoryClockRate);
-        //	  wxPrintf("  Memory Bus Width (bits): %d\n", prop.memoryBusWidth);
-        //	  wxPrintf("  Peak Memory Bandwidth (GB/s): %f\n", 2.0*prop.memoryClockRate*(prop.memoryBusWidth/8)/1.0e6);
-        //	  wxPrintf("  Number of multiprocessors: %d\n", prop.multiProcessorCount);
-        //	  wxPrintf("  Threads per multiprocessor: %d\n", prop.maxThreadsPerMultiProcessor);
-        //	  wxPrintf("  Memory per multiprocessor: %li\n", long(prop.sharedMemPerMultiprocessor));
-        //	  wxPrintf("  Memory on device: %li\n", long(prop.totalGlobalMem));
-        //	  wxPrintf("\n");
 
         cudaErr(cudaDeviceGetAttribute(&cuda_compute_mode, cudaDevAttrComputeMode, iGPU));
 
         cudaErr(cudaSetDevice(iGPU));
 
-        //	  cudaErr(cudaMemGetInfo(&free_mem,&total_mem));
-        //	  wxPrintf("Found %zd free mem out of %zd on iGPU %d\n", free_mem, total_mem, iGPU);
-        //	  wxPrintf("cuda_compute_mode, cudaDevAttrComputeMode, cudaComputeModeProhibited = %i %i %i\n", cuda_compute_mode, cudaDevAttrComputeMode, cudaComputeModeProhibited);
-
         if ( cuda_compute_mode != cudaComputeModeProhibited && prop.totalGlobalMem > min_memory_available ) {
             if ( cuda_compute_mode != cudaComputeModeDefault ) {
                 wxPrintf("\n\n\tWarning : Your device id %d is not set to Default compute mode.\n\n", iGPU);
             }
             cudaErr(cudaMemGetInfo(&free_mem, &total_mem));
-            //		  wxPrintf("Found %zd free mem out of %zd on iGPU %d\n", free_mem, total_mem, iGPU);
             if ( total_mem < min_memory_total ) {
                 // Don't use this device. This might not be the best way to do this.
                 cudaErr(cudaDeviceReset( ));
@@ -85,14 +59,8 @@ void DeviceManager::Init(int wanted_number_of_gpus) {
             else if ( free_mem > max_mem && free_mem > min_memory_available ) {
                 max_mem = free_mem;
             }
-            //		  else if (free_mem > max_mem && free_mem > min_memory_available) { max_mem = free_mem; selected_GPU = iGPU; }
 
             GPU_score = (float)free_mem * (float)prop.memoryClockRate * (float)prop.memoryBusWidth * (float)prop.multiProcessorCount * (float)prop.maxThreadsPerMultiProcessor;
-            //		  wxPrintf("  Memory Clock Rate (KHz): %d\n", prop.memoryClockRate);
-            //		  wxPrintf("  Memory Bus Width (bits): %d\n", prop.memoryBusWidth);
-            //		  wxPrintf("  Number of multiprocessors: %d\n", prop.multiProcessorCount);
-            //		  wxPrintf("  Threads per multiprocessor: %d\n", prop.maxThreadsPerMultiProcessor);
-            //		  wxPrintf("GPU score = %g\n", GPU_score);
             if ( GPU_score > GPU_score_max ) {
                 GPU_score_max = GPU_score;
                 selected_GPU  = iGPU;
@@ -110,15 +78,8 @@ void DeviceManager::Init(int wanted_number_of_gpus) {
     is_manager_initialized = true;
 };
 
-void DeviceManager::SetGpu(int cpu_thread_idx) {
-
-    //	// Select the current device
-    //	this->gpuIDX = -1;
-    //	cudaErr(cudaSetDevice(cpu_thread_idx % this->nGPUs));   // "% num_gpus" allows more CPU threads than GPU devices
+void DeviceManager::SetGpu( ) {
     cudaErr(cudaSetDevice(this->gpuIDX));
-    //	cudaErr(cudaGetDevice(&gpuIDX));
-
-    //	wxPrintf("For thread %d of nGpus %d assigned %d\n",cpu_thread_idx, nGPUs, gpuIDX);
 };
 
 void DeviceManager::ResetGpu( ) {
diff --git a/src/gpu/DeviceManager.h b/src/gpu/DeviceManager.h
index f0d5dfae..fd888ec5 100644
--- a/src/gpu/DeviceManager.h
+++ b/src/gpu/DeviceManager.h
@@ -13,7 +13,7 @@ class DeviceManager {
     virtual ~DeviceManager( );
 
     void Init(int wanted_number_of_gpus);
-    void SetGpu(int cpu_thread_idx);
+    void SetGpu( );
     void ResetGpu( );
     void ListDevices( );
 
diff --git a/src/gpu/GpuImage.cu b/src/gpu/GpuImage.cu
index 8af55522..6731c2ef 100644
--- a/src/gpu/GpuImage.cu
+++ b/src/gpu/GpuImage.cu
@@ -16,15 +16,6 @@
 // #define USE_BLOCK_REDUCE
 #define USE_FP16_FOR_WHITENPS
 
-__global__ void
-MipPixelWiseKernel(cufftReal* mip, const cufftReal* correlation_output, const int4 dims);
-__global__ void
-MipPixelWiseKernel(cufftReal* mip, cufftReal* other_image, cufftReal* psi, cufftReal* phi, cufftReal* theta,
-                   int4 dims, float c_psi, float c_phi, float c_theta);
-__global__ void
-MipPixelWiseKernel(cufftReal* mip, cufftReal* other_image, cufftReal* psi, cufftReal* phi, cufftReal* theta, cufftReal* defocus, cufftReal* pixel, const int4 dims,
-                   float c_psi, float c_phi, float c_theta, float c_defocus, float c_pixel);
-
 // Inline declarations
 __device__ __forceinline__ int
 d_ReturnFourierLogicalCoordGivenPhysicalCoord_X(int physical_index,
@@ -527,7 +518,6 @@ MultiplyPixelWiseComplexConjugateKernel(const StorageType* __restrict__ img_comp
     int address = x + (dims.w / 2) * y;
     int stride  = (dims.w / 2) * dims.y;
 
-    constexpr float epsilon = 1e-6f;
     // We have one referece image, and this is broadcasted to all images in the stack
     if constexpr ( std::is_same<StorageType, __half2>::value ) {
         const Complex ref_val = (Complex)__half22float2(ref_complex_values[address]);
@@ -1273,92 +1263,6 @@ float GpuImage::ReturnSumSquareModulusComplexValues( ) {
     return float(tmpValComplex[tmp_val_idx::ReturnSumSquareModulusComplexValues] * tmpValComplex[tmp_val_idx::ReturnSumSquareModulusComplexValues]);
 }
 
-void GpuImage::MipPixelWise(GpuImage& other_image) {
-
-    MyDebugAssertTrue(HasSameDimensionsAs(&other_image), "Images have different dimension.");
-    precheck;
-    ReturnLaunchParameters(dims, true);
-    MipPixelWiseKernel<<<gridDims, threadsPerBlock, 0, cudaStreamPerThread>>>(real_values, other_image.real_values, this->dims);
-    postcheck;
-}
-
-__global__ void
-MipPixelWiseKernel(cufftReal* mip, const cufftReal* correlation_output, const int4 dims) {
-
-    int3 coords = make_int3(blockIdx.x * blockDim.x + threadIdx.x,
-                            blockIdx.y * blockDim.y + threadIdx.y,
-                            blockIdx.z);
-
-    if ( coords.x < dims.x && coords.y < dims.y && coords.z < dims.z ) {
-        int address  = d_ReturnReal1DAddressFromPhysicalCoord(coords, dims);
-        mip[address] = MAX(mip[address], correlation_output[address]);
-    }
-}
-
-void GpuImage::MipPixelWise(GpuImage& other_image, GpuImage& psi, GpuImage& phi, GpuImage& theta, GpuImage& defocus, GpuImage& pixel,
-                            float c_psi, float c_phi, float c_theta, float c_defocus, float c_pixel) {
-
-    MyDebugAssertTrue(HasSameDimensionsAs(&other_image), "Images have different dimension.");
-    precheck;
-    ReturnLaunchParameters(dims, true);
-    MipPixelWiseKernel<<<gridDims, threadsPerBlock, 0, cudaStreamPerThread>>>(real_values, other_image.real_values,
-                                                                              psi.real_values, phi.real_values, theta.real_values, defocus.real_values, pixel.real_values,
-                                                                              this->dims, c_psi, c_phi, c_theta, c_defocus, c_pixel);
-    postcheck;
-}
-
-__global__ void
-MipPixelWiseKernel(cufftReal* mip, cufftReal* correlation_output, cufftReal* psi, cufftReal* phi, cufftReal* theta, cufftReal* defocus, cufftReal* pixel, const int4 dims,
-                   float c_psi, float c_phi, float c_theta, float c_defocus, float c_pixel) {
-
-    int3 coords = make_int3(blockIdx.x * blockDim.x + threadIdx.x,
-                            blockIdx.y * blockDim.y + threadIdx.y,
-                            blockIdx.z);
-
-    if ( coords.x < dims.x && coords.y < dims.y && coords.z < dims.z ) {
-        int address = d_ReturnReal1DAddressFromPhysicalCoord(coords, dims);
-        if ( correlation_output[address] > mip[address] ) {
-            mip[address]     = correlation_output[address];
-            psi[address]     = c_psi;
-            phi[address]     = c_phi;
-            theta[address]   = c_theta;
-            defocus[address] = c_defocus;
-            pixel[address]   = c_pixel;
-        }
-    }
-}
-
-void GpuImage::MipPixelWise(GpuImage& other_image, GpuImage& psi, GpuImage& phi, GpuImage& theta,
-                            float c_psi, float c_phi, float c_theta) {
-
-    MyDebugAssertTrue(HasSameDimensionsAs(&other_image), "Images have different dimension.");
-    precheck;
-    ReturnLaunchParameters(dims, true);
-    MipPixelWiseKernel<<<gridDims, threadsPerBlock, 0, cudaStreamPerThread>>>(real_values, other_image.real_values,
-                                                                              psi.real_values, phi.real_values, theta.real_values,
-                                                                              this->dims, c_psi, c_phi, c_theta);
-    postcheck;
-}
-
-__global__ void
-MipPixelWiseKernel(cufftReal* mip, cufftReal* correlation_output, cufftReal* psi, cufftReal* phi, cufftReal* theta, const int4 dims,
-                   float c_psi, float c_phi, float c_theta) {
-
-    int3 coords = make_int3(blockIdx.x * blockDim.x + threadIdx.x,
-                            blockIdx.y * blockDim.y + threadIdx.y,
-                            blockIdx.z);
-
-    if ( coords.x < dims.x && coords.y < dims.y && coords.z < dims.z ) {
-        int address = d_ReturnReal1DAddressFromPhysicalCoord(coords, dims);
-        if ( correlation_output[address] > mip[address] ) {
-            mip[address]   = correlation_output[address];
-            psi[address]   = c_psi;
-            phi[address]   = c_phi;
-            theta[address] = c_theta;
-        }
-    }
-}
-
 template <typename StorageType>
 __global__ void
 ApplyBFactorKernel(StorageType* d_input,
diff --git a/src/gpu/GpuImage.h b/src/gpu/GpuImage.h
index 1b52fa56..a0986d11 100644
--- a/src/gpu/GpuImage.h
+++ b/src/gpu/GpuImage.h
@@ -30,8 +30,6 @@ class GpuImage {
     GpuImage& operator=(const GpuImage& t);
     GpuImage& operator=(const GpuImage* t);
 
-    // FIXME: move constructor?
-
     /////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
     // START MEMBER VARIABLES FROM THE cpu IMAGE CLASS
 
@@ -297,11 +295,6 @@ class GpuImage {
     void WaitBlocking( );
     void RecordAndWait( );
     // Maximum intensity projection
-    void MipPixelWise(GpuImage& other_image);
-    void MipPixelWise(GpuImage& other_image, GpuImage& psi, GpuImage& phi, GpuImage& theta,
-                      float c_psi, float c_phi, float c_theta);
-    void MipPixelWise(GpuImage& other_image, GpuImage& psi, GpuImage& phi, GpuImage& theta, GpuImage& defocus, GpuImage& pixel,
-                      float c_psi, float c_phi, float c_theta, float c_defocus, float c_pixel);
 
     // FIXME: These are added for the unblur refinement but are untested.
     template <typename StorageTypeBase = float>
diff --git a/src/gpu/TemplateMatchingCore.cu b/src/gpu/TemplateMatchingCore.cu
index aac6d156..9fd1659e 100644
--- a/src/gpu/TemplateMatchingCore.cu
+++ b/src/gpu/TemplateMatchingCore.cu
@@ -11,35 +11,26 @@
 
 using namespace cistem_timer;
 
-constexpr bool use_gpu_prj = true;
-
 void TemplateMatchingCore::Init(MyApp*                    parent_pointer,
                                 std::shared_ptr<GpuImage> wanted_template_reconstruction,
                                 Image&                    wanted_input_image,
                                 Image&                    current_projection,
-                                float                     pixel_size_search_range,
-                                float                     pixel_size_step,
-                                float                     pixel_size,
-                                float                     defocus_search_range,
-                                float                     defocus_step,
-                                float                     defocus1,
-                                float                     defocus2,
                                 float                     psi_max,
                                 float                     psi_start,
                                 float                     psi_step,
                                 AnglesAndShifts&          angles,
                                 EulerSearch&              global_euler_search,
-                                float                     histogram_min_scaled,
-                                float                     histogram_step_scaled,
-                                int                       histogram_number_of_bins,
                                 const int2                pre_padding,
                                 const int2                roi,
+                                float                     histogram_sampling,
+                                float                     stats_sampling,
                                 int                       first_search_position,
                                 int                       last_search_position,
                                 ProgressBar*              my_progress,
                                 long                      total_correlation_positions,
                                 bool                      is_running_locally,
                                 bool                      use_fast_fft,
+                                bool                      use_gpu_prj,
                                 int                       number_of_global_search_images_to_save)
 
 {
@@ -49,6 +40,10 @@ void TemplateMatchingCore::Init(MyApp*                    parent_pointer,
     MyDebugAssertTrue(wanted_input_image.is_in_memory, "Input image must be in memory");
     object_initialized_ = true;
 
+    this->use_gpu_prj   = use_gpu_prj;
+    histogram_sampling_ = histogram_sampling;
+    stats_sampling_     = stats_sampling;
+
     this->first_search_position          = first_search_position;
     this->last_search_position           = last_search_position;
     this->angles                         = angles;
@@ -97,10 +92,8 @@ void TemplateMatchingCore::Init(MyApp*                    parent_pointer,
         d_statistical_buffers_ptrs[i]->Allocate(d_input_image.dims.x, d_input_image.dims.y, number_of_global_search_images_to_save, true);
     }
 
-    this->pre_padding           = pre_padding;
-    this->roi                   = roi;
-    this->histogram_min_scaled  = histogram_min_scaled;
-    this->histogram_step_scaled = histogram_step_scaled;
+    this->pre_padding = pre_padding;
+    this->roi         = roi;
 
     this->my_progress                 = my_progress;
     this->total_correlation_positions = total_correlation_positions;
@@ -115,10 +108,8 @@ void TemplateMatchingCore::Init(MyApp*                    parent_pointer,
     // Transfer the input image_memory_should_not_be_deallocated
 };
 
-void TemplateMatchingCore::RunInnerLoop(Image& projection_filter, float c_pixel, float c_defocus, int threadIDX, long& current_correlation_position) {
+void TemplateMatchingCore::RunInnerLoop(Image& projection_filter, int threadIDX, long& current_correlation_position) {
 
-    this->c_defocus                   = c_defocus;
-    this->c_pixel                     = c_pixel;
     total_number_of_cccs_calculated   = 0;
     total_number_of_histogram_samples = 0;
     total_number_of_stats_samples     = 0;
@@ -151,12 +142,10 @@ void TemplateMatchingCore::RunInnerLoop(Image& projection_filter, float c_pixel,
     }
 #endif
 
-    const float wanted_histogram_sampling = 1.0f; // 100% until tested
-    const float wanted_stats_sampling     = 1.0f; // 100% until tested
     if ( this_is_the_first_run_on_inner_loop ) {
         d_input_image.CopyFP32toFP16buffer(false);
         d_padded_reference.CopyFP32toFP16buffer(false);
-        my_dist = std::make_unique<TM_EmpiricalDistribution<__half, __half2>>(d_input_image, histogram_min_scaled, histogram_step_scaled, pre_padding, roi, wanted_histogram_sampling, wanted_stats_sampling);
+        my_dist = std::make_unique<TM_EmpiricalDistribution<__half, __half2>>(d_input_image, pre_padding, roi, histogram_sampling_, stats_sampling_);
     }
     else {
         my_dist->ZeroHistogram( );
@@ -177,15 +166,14 @@ void TemplateMatchingCore::RunInnerLoop(Image& projection_filter, float c_pixel,
 
     cudaErr(cudaEventCreateWithFlags(&mip_is_done_Event, cudaEventBlockingSync));
 #ifdef cisTEM_USING_FastFFT
-    FastFFT::FourierTransformer<float, __half, __half2, 2>                           FT;
-    FastFFT::KernelFunction::my_functor<float, 0, FastFFT::KernelFunction::NOOP>     noop;
-    FastFFT::KernelFunction::my_functor<float, 4, FastFFT::KernelFunction::CONJ_MUL> conj_mul;
+    FastFFT::FourierTransformer<float, __half, __half2, 2> FT;
 
     // float scale_factor = powf((float)d_current_projection[0].number_of_real_space_pixels, -2.0);
     // float scale_factor = 1.f;
     float scale_factor = sqrtf(1.0f / float(d_input_image.number_of_real_space_pixels));
 
     FastFFT::KernelFunction::my_functor<float, 4, FastFFT::KernelFunction::CONJ_MUL_THEN_SCALE> conj_mul_then_scale(scale_factor);
+    FastFFT::KernelFunction::my_functor<float, 0, FastFFT::KernelFunction::NOOP>                noop;
 
     if ( use_fast_fft ) {
         // TODO: overload that takes and short4's int4's instead of the individual values
@@ -269,8 +257,8 @@ void TemplateMatchingCore::RunInnerLoop(Image& projection_filter, float c_pixel,
             else {
                 // Make sure the previous copy from host -> device has completed before we start to make another projection.
                 // Event is created as non-blocking so this is a busy-wait.
-
-                template_reconstruction.ExtractSlice(current_projection[current_projection_idx], angles, 1.0f, false);
+                MyDebugAssertFalse(cpu_template == nullptr, "Template reconstruction is not set with SetCpuTemplate");
+                cpu_template->ExtractSlice(current_projection[current_projection_idx], angles, 1.0f, false);
 
                 current_projection[current_projection_idx].SwapRealSpaceQuadrants( );
                 current_projection[current_projection_idx].MultiplyPixelWise(projection_filter);
diff --git a/src/gpu/TemplateMatchingCore.h b/src/gpu/TemplateMatchingCore.h
index fb91ea9c..753f1731 100644
--- a/src/gpu/TemplateMatchingCore.h
+++ b/src/gpu/TemplateMatchingCore.h
@@ -5,13 +5,15 @@
 
 #include "GpuImage.h"
 #include "DeviceManager.h"
-#include "Histogram.h"
 #include "template_matching_empirical_distribution.h"
 
 class TemplateMatchingCore {
 
   private:
-    bool object_initialized_;
+    bool  object_initialized_;
+    bool  use_gpu_prj;
+    float histogram_sampling_;
+    float stats_sampling_;
 
   public:
     TemplateMatchingCore( ) : object_initialized_{false} { };
@@ -24,14 +26,9 @@ class TemplateMatchingCore {
 
     void Init(int number_of_jobs);
 
-    DeviceManager gpuDev;
-
-    int nGPUs;
-    int nThreads;
     int number_of_jobs_per_image_in_gui;
 
     // CPU images to be passed in -
-    Image                     template_reconstruction;
     std::shared_ptr<GpuImage> template_gpu_shared;
     Image                     input_image; // These will be modified on the host from withing Template Matching Core so Allocate locally
 
@@ -39,6 +36,8 @@ class TemplateMatchingCore {
     float binning_factor = 1.f;
 
     std::vector<Image> current_projection;
+    // Generally not used, except for --disable-gpu-prj
+    Image* cpu_template = nullptr;
 
     // These are assumed to be empty containers at the outset, so xfer host-->device is skipped
     GpuImage d_max_intensity_projection;
@@ -62,20 +61,9 @@ class TemplateMatchingCore {
     GpuImage d_padded_reference;
 
     // Search range parameters
-    float pixel_size_search_range;
-    float pixel_size_step;
-    float pixel_size;
-    float defocus_search_range;
-    float defocus_step;
-    float defocus1;
-    float defocus2;
     float psi_max;
     float psi_start;
     float psi_step;
-    float minimum_threshold = 20.0f; //  Optionally override this to limit what is considered for refinement
-
-    float c_defocus;
-    float c_pixel;
 
     int  current_search_position;
     int  first_search_position;
@@ -87,17 +75,13 @@ class TemplateMatchingCore {
 
     int n_global_search_images_to_save;
 
-    bool      is_running_locally;
-    bool      is_gpu_3d_swapped;
-    bool      use_fast_fft;
-    Histogram histogram;
+    bool is_running_locally;
+    bool use_fast_fft;
 
     std::unique_ptr<TM_EmpiricalDistribution<__half, __half2>> my_dist;
 
-    float histogram_min_scaled;
-    float histogram_step_scaled;
-    int2  pre_padding;
-    int2  roi;
+    int2 pre_padding;
+    int2 roi;
 
     // Search objects
     AnglesAndShifts angles;
@@ -122,38 +106,33 @@ class TemplateMatchingCore {
 
     void UpdateSecondaryPeaks( );
 
-    void SetMinimumThreshold(float wanted_threshold) { minimum_threshold = wanted_threshold; }
+    void SetCpuTemplate(Image* cpu_template) {
+        this->cpu_template = cpu_template;
+    }
 
     void Init(MyApp*                    parent_pointer,
               std::shared_ptr<GpuImage> template_reconstruction,
               Image&                    input_image,
               Image&                    current_projection,
-              float                     pixel_size_search_range,
-              float                     pixel_size_step,
-              float                     pixel_size,
-              float                     defocus_search_range,
-              float                     defocus_step,
-              float                     defocus1,
-              float                     defocus2,
               float                     psi_max,
               float                     psi_start,
               float                     psi_step,
               AnglesAndShifts&          angles,
               EulerSearch&              global_euler_search,
-              float                     histogram_min_scaled,
-              float                     histogram_step_scaled,
-              int                       histogram_number_of_bins,
               const int2                pre_padding,
               const int2                roi,
+              float                     histogram_sampling,
+              float                     stats_sampling,
               int                       first_search_position,
               int                       last_search_position,
               ProgressBar*              my_progress,
               long                      total_correlation_positions,
               bool                      is_running_locally,
               bool                      use_fast_fft,
+              bool                      use_gpu_prj,
               int                       number_of_global_search_images_to_save = 1);
 
-    void RunInnerLoop(Image& projection_filter, float pixel_i, float defocus_i, int threadIDX, long& current_correlation_position);
+    void RunInnerLoop(Image& projection_filter, int threadIDX, long& current_correlation_position);
 };
 
 #endif
diff --git a/src/gpu/template_matching_empirical_distribution.cu b/src/gpu/template_matching_empirical_distribution.cu
index 327e8dc0..8e04e1f9 100644
--- a/src/gpu/template_matching_empirical_distribution.cu
+++ b/src/gpu/template_matching_empirical_distribution.cu
@@ -34,13 +34,11 @@ inline __device__ __host__ bool test_gt_zero(T value) {
  */
 
 template <typename ccfType, typename mipType>
-TM_EmpiricalDistribution<ccfType, mipType>::TM_EmpiricalDistribution(GpuImage&           reference_image,
-                                                                     histogram_storage_t histogram_min,
-                                                                     histogram_storage_t histogram_step,
-                                                                     int2                pre_padding,
-                                                                     int2                roi,
-                                                                     const float         histogram_sampling_rate,
-                                                                     const float         stats_sampling_rate) : pre_padding_{pre_padding},
+TM_EmpiricalDistribution<ccfType, mipType>::TM_EmpiricalDistribution(GpuImage&   reference_image,
+                                                                     int2        pre_padding,
+                                                                     int2        roi,
+                                                                     const float histogram_sampling_rate,
+                                                                     const float stats_sampling_rate) : pre_padding_{pre_padding},
                                                                                                         roi_{roi},
                                                                                                         higher_order_moments_{false},
                                                                                                         image_plane_mem_allocated_{reference_image.real_memory_allocated},
@@ -65,16 +63,16 @@ TM_EmpiricalDistribution<ccfType, mipType>::TM_EmpiricalDistribution(GpuImage&
     // If anything, given that the output of the matched filter is ~ Gaussian, all the numbers closer to zero are less
     // likely to be flushed to zero when denormal, so in that respect, bflaot16 may actually maintain higher precision.
     if constexpr ( std::is_same_v<ccfType, __half> ) {
-        histogram_min_  = __float2half_rn(histogram_min);
-        histogram_step_ = __float2half_rn(histogram_step);
+        histogram_min_  = __float2half_rn(TM::histogram_min);
+        histogram_step_ = __float2half_rn(TM::histogram_step);
     }
     else if constexpr ( std::is_same_v<ccfType, __nv_bfloat16> ) {
-        histogram_min_  = __float2bfloat16_rn(histogram_min);
-        histogram_step_ = __float2bfloat16_rn(histogram_step);
+        histogram_min_  = __float2bfloat16_rn(TM::histogram_min);
+        histogram_step_ = __float2bfloat16_rn(TM::histogram_step);
     }
     else if constexpr ( std::is_same_v<ccfType, histogram_storage_t> ) {
-        histogram_min_  = histogram_min;
-        histogram_step_ = histogram_step;
+        histogram_min_  = TM::histogram_min;
+        histogram_step_ = TM::histogram_step;
     }
     else {
         MyDebugAssertTrue(false, "input_type must be either __half __nv_bfloat16, or histogram_storage_t");
@@ -113,6 +111,7 @@ TM_EmpiricalDistribution<ccfType, mipType>::TM_EmpiricalDistribution(GpuImage&
     ZeroSamplingCounters( );
 
     AllocateAndZeroStatisticalArrays( );
+    object_initialized_ = true;
 };
 
 template <typename ccfType, typename mipType>
@@ -156,6 +155,12 @@ void TM_EmpiricalDistribution<ccfType, mipType>::AllocateAndZeroStatisticalArray
 
 template <typename ccfType, typename mipType>
 TM_EmpiricalDistribution<ccfType, mipType>::~TM_EmpiricalDistribution( ) {
+    if ( object_initialized_ )
+        Delete( );
+};
+
+template <typename ccfType, typename mipType>
+void TM_EmpiricalDistribution<ccfType, mipType>::Delete( ) {
     MyDebugAssertFalse(cudaStreamQuery(calc_stream_[0]) == cudaErrorInvalidResourceHandle, "The cuda stream is invalid");
 
     cudaErr(cudaFreeAsync(histogram_, calc_stream_[0]));
@@ -178,7 +183,9 @@ TM_EmpiricalDistribution<ccfType, mipType>::~TM_EmpiricalDistribution( ) {
 
     cudaErr(cudaStreamDestroy(calc_stream_[0]));
     cudaErr(cudaEventDestroy(mip_stack_is_ready_event_[0]));
-};
+
+    object_initialized_ = false;
+}
 
 template <typename ccfType, typename mipType>
 void TM_EmpiricalDistribution<ccfType, mipType>::ZeroHistogram( ) {
@@ -261,7 +268,7 @@ inline __device__ void write_mip_and_stats(float*   sum_array,
     if constexpr ( std::is_same_v<ccfType, __half> ) {
         // I though short circuit logic would be equivalent, but maybe the cuda driver is pre-fetching values? The nested conditional is ~3% faster on total run time
         // indicating we are skipping unnecessary reads.
-        if ( max_val > ccfType{cistem::match_template::MIN_VALUE_TO_MIP} ) {
+        if ( max_val > ccfType{TM::MIN_VALUE_TO_MIP} ) {
             if ( max_val > __low2half(mip_psi[address]) ) {
                 mip_psi[address]   = __halves2half2(max_val, psi[max_idx]);
                 theta_phi[address] = __halves2half2(theta[max_idx], phi[max_idx]);
@@ -269,7 +276,7 @@ inline __device__ void write_mip_and_stats(float*   sum_array,
         }
     }
     else if constexpr ( std::is_same_v<ccfType, __nv_bfloat16> ) {
-        if ( max_val > ccfType{cistem::match_template::MIN_VALUE_TO_MIP} ) {
+        if ( max_val > ccfType{TM::MIN_VALUE_TO_MIP} ) {
             if ( max_val > __low2bfloat16(mip_psi[address]) ) {
                 mip_psi[address]   = __halves2bfloat162(max_val, psi[max_idx]);
                 theta_phi[address] = __halves2bfloat162(theta[max_idx], phi[max_idx]);
@@ -277,7 +284,7 @@ inline __device__ void write_mip_and_stats(float*   sum_array,
         }
     }
     else if constexpr ( std::is_same_v<ccfType, histogram_storage_t> ) {
-        if ( max_val > ccfType{cistem::match_template::MIN_VALUE_TO_MIP} ) {
+        if ( max_val > ccfType{TM::MIN_VALUE_TO_MIP} ) {
             if ( max_val > mip_psi[address].x ) {
                 mip_psi[address].x   = max_val;
                 mip_psi[address].y   = psi[max_idx];
@@ -344,7 +351,7 @@ __global__ void __launch_bounds__(TM::histogram_number_of_points)
         for ( int i = pre_padding.x + physical_X( ); i < pre_padding.x + roi.x; i += GridStride_2dGrid_X( ) ) {
             // address = ((z * NY + y) * pitch_in_pixels) + x;
             address         = j * pitch_in_pixels_img + i;
-            ccfType max_val = ccfType{cistem::match_template::histogram_min};
+            ccfType max_val = ccfType{TM::histogram_min};
             int     max_idx = 0;
             float   sum{0.f}, sum_sq{0.f};
             for ( int k = 0; k < n_slices_to_process; k++ ) {
diff --git a/src/gpu/template_matching_empirical_distribution.h b/src/gpu/template_matching_empirical_distribution.h
index ffed5c6b..c2a6b6e2 100644
--- a/src/gpu/template_matching_empirical_distribution.h
+++ b/src/gpu/template_matching_empirical_distribution.h
@@ -27,6 +27,7 @@ class TM_EmpiricalDistribution {
 
   private:
     bool        higher_order_moments_;
+    bool        object_initialized_{ };
     int         current_image_index_;
     ccfType     histogram_min_;
     ccfType     histogram_step_;
@@ -82,15 +83,14 @@ class TM_EmpiricalDistribution {
  * @param n_images_to_accumulate_concurrently - the number of images to accumulate concurrently
  * 
  */
-    TM_EmpiricalDistribution(GpuImage&           reference_image,
-                             histogram_storage_t histogram_min,
-                             histogram_storage_t histogram_step,
-                             int2                pre_padding,
-                             int2                roi,
-                             const float         histogram_sampling_rate,
-                             const float         stats_sampling_rate);
+    TM_EmpiricalDistribution(GpuImage&   reference_image,
+                             int2        pre_padding,
+                             int2        roi,
+                             const float histogram_sampling_rate,
+                             const float stats_sampling_rate);
 
     ~TM_EmpiricalDistribution( );
+    void Delete( );
 
     // delete the copy and move  constructors as these are not handled or needed.
     TM_EmpiricalDistribution(const TM_EmpiricalDistribution&)            = delete;
diff --git a/src/programs/match_template/match_template.cpp b/src/programs/match_template/match_template.cpp
index 978c7aa7..b31f307e 100644
--- a/src/programs/match_template/match_template.cpp
+++ b/src/programs/match_template/match_template.cpp
@@ -70,6 +70,7 @@ class
     float GetMaxJobWaitTimeInSeconds( ) { return 360.0f; }
 
   private:
+    void AddCommandLineOptions( );
     template <typename StatsType>
     void CalcGlobalCCCScalingFactor(double&    global_ccc_mean,
                                     double&    global_ccc_std_dev,
@@ -99,6 +100,13 @@ IMPLEMENT_APP(MatchTemplateApp)
 void MatchTemplateApp::ProgramSpecificInit( ) {
 }
 
+// Optional command-line stuff
+void MatchTemplateApp::AddCommandLineOptions( ) {
+    command_line_parser.AddOption("", "histogram-sampling", "Random sampling of the histogram, fraction sampled. 0.05 - 1.0 [1.0 default]", wxCMD_LINE_VAL_DOUBLE);
+    command_line_parser.AddOption("", "stats-sampling", "Random sampling of the statistical arrays, fraction sampled. 0.05 - 1.0 [1.0 default]", wxCMD_LINE_VAL_DOUBLE);
+    command_line_parser.AddLongSwitch("disable-gpu-prj", "Disable projection using the gpu. Default false");
+}
+
 // override the DoInteractiveUserInput
 
 void MatchTemplateApp::DoInteractiveUserInput( ) {
@@ -255,6 +263,23 @@ bool MatchTemplateApp::DoCalculation( ) {
 
     wxDateTime start_time = wxDateTime::Now( );
 
+    double temp_double;
+    float  histogram_sampling = 1.0f;
+    float  stats_sampling     = 1.0f;
+    bool   use_gpu_prj        = true;
+    if ( command_line_parser.Found("histogram-sampling", &temp_double) ) {
+        std::cerr << "histogram-sampling: " << temp_double << std::endl;
+        histogram_sampling = float(temp_double);
+    }
+    if ( command_line_parser.Found("stats-sampling", &temp_double) ) {
+        std::cerr << "stats-sampling: " << temp_double << std::endl;
+        stats_sampling = float(temp_double);
+    }
+    if ( command_line_parser.FoundSwitch("disable-gpu-prj") ) {
+        SendInfo("Disabling GPU projection\n");
+        use_gpu_prj = false;
+    }
+
     wxString input_search_images_filename    = my_current_job.arguments[0].ReturnStringArgument( );
     wxString input_reconstruction_filename   = my_current_job.arguments[1].ReturnStringArgument( );
     float    input_pixel_size                = my_current_job.arguments[2].ReturnFloatArgument( );
@@ -282,7 +307,7 @@ bool MatchTemplateApp::DoCalculation( ) {
     wxString best_phi_output_file            = my_current_job.arguments[24].ReturnStringArgument( );
     wxString best_defocus_output_file        = my_current_job.arguments[25].ReturnStringArgument( );
     wxString best_pixel_size_output_file     = my_current_job.arguments[26].ReturnStringArgument( );
-    wxString scaled_mip_output_mrcfile       = my_current_job.arguments[27].ReturnStringArgument( );
+    wxString scaled_mip_output_file          = my_current_job.arguments[27].ReturnStringArgument( );
     wxString correlation_avg_output_file     = my_current_job.arguments[28].ReturnStringArgument( );
     wxString my_symmetry                     = my_current_job.arguments[29].ReturnStringArgument( );
     float    in_plane_angular_step           = my_current_job.arguments[30].ReturnFloatArgument( );
@@ -312,6 +337,11 @@ bool MatchTemplateApp::DoCalculation( ) {
         max_threads = 1;
     }
 
+    if ( ! use_gpu && (FloatsAreAlmostTheSame(histogram_sampling, 1.0f) == false || FloatsAreAlmostTheSame(stats_sampling, 1.0f) == false) ) {
+        // only GPU can handle this
+        SendError("sub-sampling of the histogram and statistical arrays is only supported on the GPU implementation\n");
+    }
+
     profile_timing.start("Init");
     ParameterMap parameter_map; // needed for euler search init
     //for (int i = 0; i < 5; i++) {parameter_map[i] = true;}
@@ -334,7 +364,6 @@ bool MatchTemplateApp::DoCalculation( ) {
 
     float  temp_float;
     float  variance;
-    double temp_double;
     double temp_double_array[5];
 
     int  number_of_rotations;
@@ -496,7 +525,6 @@ bool MatchTemplateApp::DoCalculation( ) {
     //psi_step = 5;
 
     // search grid
-
     // TODO: when checking the impact of limiting the resolution, it may be worthwile to NOT limit the number of search positions
     global_euler_search.InitGrid(my_symmetry, angular_step, 0.0f, 0.0f, psi_max, psi_step, psi_start, data_sizer.GetSearchPixelSize( ) / high_resolution_limit_search, parameter_map, best_parameters_to_keep);
 
@@ -632,43 +660,63 @@ bool MatchTemplateApp::DoCalculation( ) {
 
         if ( use_gpu ) {
 #ifdef ENABLEGPU
-            // FIXME: move this (and the above CPU steps) into a method to prepare the 3d reference.
-            profile_timing.start("Swap Fourier Quadrants");
-            template_reconstruction.BackwardFFT( );
-            template_reconstruction.SwapFourierSpaceQuadrants(false);
-            profile_timing.lap("Swap Fourier Quadrants");
-            // We only want to have one copy of the 3d template in texture memory that each thread can then reference.
-            // First allocate a shared pointer and construct the GpuImage based on the CPU template
-            // TODO: Initially, i had this set to use
-            // GpuImage::InitializeBasedOnCpuImage(tmp_vol, false, true); where the memory is instructed not to be pinned.
-            // This should be fine now, but .
-            profile_timing.start("CopyHostToDeviceTextureComplex3d");
-
-            std::shared_ptr<GpuImage> template_reconstruction_gpu = std::make_shared<GpuImage>(template_reconstruction);
-            template_reconstruction_gpu->CopyHostToDeviceTextureComplex3d(template_reconstruction);
-            profile_timing.lap("CopyHostToDeviceTextureComplex3d");
+            std::shared_ptr<GpuImage> template_reconstruction_gpu;
+            if ( use_gpu_prj ) {
+
+                // FIXME: move this (and the above CPU steps) into a method to prepare the 3d reference.
+                profile_timing.start("Swap Fourier Quadrants");
+                template_reconstruction.BackwardFFT( );
+                template_reconstruction.SwapFourierSpaceQuadrants(false);
+                profile_timing.lap("Swap Fourier Quadrants");
+                // We only want to have one copy of the 3d template in texture memory that each thread can then reference.
+                // First allocate a shared pointer and construct the GpuImage based on the CPU template
+                // TODO: Initially, i had this set to use
+                // GpuImage::InitializeBasedOnCpuImage(tmp_vol, false, true); where the memory is instructed not to be pinned.
+                // This should be fine now, but .
+                profile_timing.start("CopyHostToDeviceTextureComplex3d");
+
+                template_reconstruction_gpu = std::make_shared<GpuImage>(template_reconstruction);
+                template_reconstruction_gpu->CopyHostToDeviceTextureComplex3d(template_reconstruction);
+
+                profile_timing.lap("CopyHostToDeviceTextureComplex3d");
+            }
+
 #pragma omp parallel num_threads(max_threads)
             {
                 int tIDX = ReturnThreadNumberOfCurrentThread( );
-                gpuDev.SetGpu(tIDX);
+                gpuDev.SetGpu( );
 
                 if ( first_gpu_loop ) {
 
                     int t_first_search_position = first_search_position + (tIDX * incPos);
                     int t_last_search_position  = first_search_position + (incPos - 1) + (tIDX * incPos);
-
                     if ( tIDX == (max_threads - 1) )
                         t_last_search_position = maxPos;
                     profile_timing.start("Init GPU");
-                    GPU[tIDX].Init(this, template_reconstruction_gpu, input_image, current_projection,
-                                   pixel_size_search_range, pixel_size_step, data_sizer.GetSearchPixelSize( ),
-                                   defocus_search_range, defocus_step, defocus1, defocus2,
-                                   psi_max, psi_start, psi_step,
-                                   angles, global_euler_search,
-                                   histogram_min, histogram_step, histogram_number_of_points,
-                                   data_sizer.GetPrePadding( ), data_sizer.GetRoi( ), t_first_search_position, t_last_search_position,
-                                   my_progress, total_correlation_positions_per_thread, is_running_locally, use_fast_fft);
+                    GPU[tIDX].Init(this,
+                                   template_reconstruction_gpu,
+                                   input_image,
+                                   current_projection,
+                                   psi_max,
+                                   psi_start,
+                                   psi_step,
+                                   angles,
+                                   global_euler_search,
+                                   data_sizer.GetPrePadding( ),
+                                   data_sizer.GetRoi( ),
+                                   histogram_sampling,
+                                   stats_sampling,
+                                   t_first_search_position,
+                                   t_last_search_position,
+                                   my_progress,
+                                   total_correlation_positions_per_thread,
+                                   is_running_locally,
+                                   use_fast_fft,
+                                   use_gpu_prj);
                     profile_timing.lap("Init GPU");
+                    if ( ! use_gpu_prj ) {
+                        GPU[tIDX].SetCpuTemplate(&template_reconstruction);
+                    }
 
 #ifdef USE_LERP_NOT_FOURIER_RESIZING
                     std::cerr << "\n\nUsing LERP\n\n";
@@ -706,9 +754,9 @@ bool MatchTemplateApp::DoCalculation( ) {
 #pragma omp parallel num_threads(max_threads)
                 {
                     int tIDX = ReturnThreadNumberOfCurrentThread( );
-                    gpuDev.SetGpu(tIDX);
+                    gpuDev.SetGpu( );
                     profile_timing.start("RunInnerLoop");
-                    GPU[tIDX].RunInnerLoop(projection_filter, size_i, defocus_i, tIDX, current_correlation_position);
+                    GPU[tIDX].RunInnerLoop(projection_filter, tIDX, current_correlation_position);
                     profile_timing.lap("RunInnerLoop");
 
 #pragma omp critical
@@ -920,7 +968,7 @@ bool MatchTemplateApp::DoCalculation( ) {
 #endif
         temp_image.WriteSlice(&mip_out, 1);
 
-        MRCFile scaled_mip_output_mrcfile(mip_output_file.ToStdString( ), true);
+        MRCFile scaled_mip_output_mrcfile(scaled_mip_output_file.ToStdString( ), true);
         scaled_mip_output_mrcfile.SetPixelSize(data_sizer.GetPixelSize( ));
 #ifdef USE_FP16_PARTICLE_STACKS
         scaled_mip_output_mrcfile.SetOutputToFP16( );
diff --git a/src/programs/match_template/template_matching_data_sizer.cpp b/src/programs/match_template/template_matching_data_sizer.cpp
index 528c0eed..3fd1c40c 100644
--- a/src/programs/match_template/template_matching_data_sizer.cpp
+++ b/src/programs/match_template/template_matching_data_sizer.cpp
@@ -897,7 +897,6 @@ void TemplateMatchingDataSizer::FillInNearestNeighbors(Image& output_image, Imag
         else
             size_neighborhood += 2;
     }
-
     // We could try to dilate out each neighborhood, but this will be slower given the bad memory access. Better to do a little extra.
     int offset_max = size_neighborhood / 2;