diff --git a/app_cabanapic/configs/cabana.param b/app_cabanapic/configs/cabana.param
index f0d6b441..8f75e2a7 100644
--- a/app_cabanapic/configs/cabana.param
+++ b/app_cabanapic/configs/cabana.param
@@ -18,7 +18,7 @@ STRING cluster = block
 # System parameters
 BOOL opp_auto_sort          = false
 REAL opp_allocation_multiple = 1.1
-INT opp_threads_per_block   = 256
+INT opp_threads_per_block   = 512
 BOOL opp_profile_all        = false
 BOOL print_final   = true
 INT opp_gpu_direct = 0
@@ -28,7 +28,8 @@ STRING opp_fill = HoleFill_All
 INT opp_fill_period = 4
 
 BOOL opp_global_move = false
-BOOL use_reg_red = false
+BOOL opp_segmented_red = false
+INT gpu_reduction_arrays = 16
 
 STRING hdf_filename = /home/zl/phd/neso_test/OP-PIC_Configs/cabana/mesh_files/cab_48000.hdf5
 INT domain_expansion = 1
diff --git a/app_cabanapic_cg/cabana_opp.cpp b/app_cabanapic_cg/cabana_opp.cpp
index 60287172..e8e7aba0 100644
--- a/app_cabanapic_cg/cabana_opp.cpp
+++ b/app_cabanapic_cg/cabana_opp.cpp
@@ -1,5 +1,5 @@
 
-// Auto-generated at 2025-01-21 14:53:57.354254 by opp-translator
+// Auto-generated at 2025-02-03 16:39:08.296851 by opp-translator
 /* 
 BSD 3-Clause License
 
diff --git a/app_cabanapic_cg/cuda/interpolate_mesh_fields_kernel_loop.hpp b/app_cabanapic_cg/cuda/interpolate_mesh_fields_kernel_loop.hpp
index 471c0783..f680bd28 100644
--- a/app_cabanapic_cg/cuda/interpolate_mesh_fields_kernel_loop.hpp
+++ b/app_cabanapic_cg/cuda/interpolate_mesh_fields_kernel_loop.hpp
@@ -18,6 +18,12 @@ __constant__ OPP_INT opp_k1_map0_stride_d;
 
 
 namespace opp_k1 {
+enum Dim {
+    x = 0,
+    y = 1,
+    z = 2,
+};
+
 enum CellInterp {
     ex = 0,
     dexdy,
@@ -39,12 +45,6 @@ enum CellInterp {
     dcbzdz,
 };
 
-enum Dim {
-    x = 0,
-    y = 1,
-    z = 2,
-};
-
 __device__ inline void interpolate_mesh_fields_kernel(
     const double* cell0_e,
     const double* cell0_b,
diff --git a/app_cabanapic_cg/hip/compute_energy_kernel_loop.hpp b/app_cabanapic_cg/hip/compute_energy_kernel_loop.hpp
index 2bc9ebb6..e431a9a9 100644
--- a/app_cabanapic_cg/hip/compute_energy_kernel_loop.hpp
+++ b/app_cabanapic_cg/hip/compute_energy_kernel_loop.hpp
@@ -142,10 +142,10 @@ void opp_par_loop_all__compute_energy_kernel(opp_set set,
         for (int d = 0; d < 1; ++d)
             arg2_host_data[d] += ((OPP_REAL *)args[2].data)[b * 1 + d];
     }
-
     args[2].data = (char *)arg2_host_data;
     opp_mpi_reduce(&args[2], arg2_host_data);
 
+
     opp_set_dirtybit_grouped(nargs, args, Device_GPU);
     OPP_DEVICE_SYNCHRONIZE();   
  
diff --git a/app_cabanapic_cg/hip/get_max_x_values_kernel_loop.hpp b/app_cabanapic_cg/hip/get_max_x_values_kernel_loop.hpp
index f43888c1..6527a896 100644
--- a/app_cabanapic_cg/hip/get_max_x_values_kernel_loop.hpp
+++ b/app_cabanapic_cg/hip/get_max_x_values_kernel_loop.hpp
@@ -202,7 +202,6 @@ void opp_par_loop_all__get_max_x_values_kernel(opp_set set,
         for (int d = 0; d < 1; ++d)
             arg5_host_data[d] = MAX(arg5_host_data[d], ((OPP_REAL *)args[5].data)[b * 1 + d]);
     }
-
     args[1].data = (char *)arg1_host_data;
     opp_mpi_reduce(&args[1], arg1_host_data);
 
@@ -212,6 +211,7 @@ void opp_par_loop_all__get_max_x_values_kernel(opp_set set,
     args[5].data = (char *)arg5_host_data;
     opp_mpi_reduce(&args[5], arg5_host_data);
 
+
     opp_set_dirtybit_grouped(nargs, args, Device_GPU);
     OPP_DEVICE_SYNCHRONIZE();   
  
diff --git a/app_cabanapic_cg/hip/move_deposit_kernel_loop.hpp b/app_cabanapic_cg/hip/move_deposit_kernel_loop.hpp
index 34d0be95..e7b05381 100644
--- a/app_cabanapic_cg/hip/move_deposit_kernel_loop.hpp
+++ b/app_cabanapic_cg/hip/move_deposit_kernel_loop.hpp
@@ -38,33 +38,6 @@ enum CellAcc {
     jfz = 2 * 4,
 };
 
-enum CellInterp {
-    ex = 0,
-    dexdy,
-    dexdz,
-    d2exdydz,
-    ey,
-    deydz,
-    deydx,
-    d2eydzdx,
-    ez,
-    dezdx,
-    dezdy,
-    d2ezdxdy,
-    cbx,
-    dcbxdx,
-    cby,
-    dcbydy,
-    cbz,
-    dcbzdz,
-};
-
-enum Dim {
-    x = 0,
-    y = 1,
-    z = 2,
-};
-
 __device__ inline void weight_current_to_accumulator_kernel(
         double* cell_acc,
         const double* q,
@@ -94,6 +67,33 @@ __device__ inline void weight_current_to_accumulator_kernel(
     cell_acc[CellAcc::jfz + 3] += v3;
 }
 
+enum Dim {
+    x = 0,
+    y = 1,
+    z = 2,
+};
+
+enum CellInterp {
+    ex = 0,
+    dexdy,
+    dexdz,
+    d2exdydz,
+    ey,
+    deydz,
+    deydx,
+    d2eydzdx,
+    ez,
+    dezdx,
+    dezdy,
+    d2ezdxdy,
+    cbx,
+    dcbxdx,
+    cby,
+    dcbydy,
+    cbz,
+    dcbzdz,
+};
+
 __device__ inline void move_deposit_kernel(
     char& opp_move_status_flag, const bool opp_move_hop_iter_one_flag, // Added by code-gen
     const OPP_INT* opp_c2c, OPP_INT* opp_p2c, // Added by code-gen
@@ -481,12 +481,17 @@ void opp_particle_move__move_deposit_kernel(opp_set set, opp_map c2c_map, opp_ma
 
     int num_blocks = 200;
 
+    opp_init_particle_move(set, nargs, args);
+
     const int array_count = opp_params->get<OPP_INT>("gpu_reduction_arrays");
     if (!opp_use_segmented_reductions) {
         opp_create_thread_level_data<OPP_REAL>(args[5]);
     }
 
-    opp_init_particle_move(set, nargs, args);
+
+    opp_profiler->start("Mv_AllMv0");
+    // ----------------------------------------------------------------------------
+    // Multi-hop move particles within current MPI rank and if not mark for neighbour comm
 
     opp_mem::dev_copy_to_symbol<OPP_INT>(OPP_comm_iteration_d, &OPP_comm_iteration, 1);
     num_blocks = (OPP_iter_end - OPP_iter_start - 1) / block_size + 1;
@@ -498,9 +503,6 @@ void opp_particle_move__move_deposit_kernel(opp_set set, opp_map c2c_map, opp_ma
     opp_mem::dev_copy_to_symbol<OPP_INT>(opp_k2_dat4_stride_d, &opp_k2_dat4_stride, &(args[4].dat->set->set_capacity), 1);
     opp_mem::dev_copy_to_symbol<OPP_INT>(opp_k2_dat5_stride_d, &opp_k2_dat5_stride, &(args[5].dat->set->set_capacity), 1);
 
-    opp_profiler->start("Mv_AllMv0");
-    // ----------------------------------------------------------------------------
-    // Multi-hop move particles within current MPI rank and if not mark for neighbour comm
     if (!opp_use_segmented_reductions) // Do atomics ----------       
     {
         opp_profiler->start("move_kernel_only");
@@ -524,7 +526,6 @@ void opp_particle_move__move_deposit_kernel(opp_set set, opp_map c2c_map, opp_ma
         OPP_DEVICE_SYNCHRONIZE(); 
         opp_profiler->end("move_kernel_only");
     }
-    
     else // Do segmented reductions ----------       
     {
         opp_mem::dev_copy_to_symbol<OPP_INT>(opp_k2_sr_set_stride_d, &opp_k2_sr_set_stride, &set->size, 1);
@@ -575,6 +576,11 @@ void opp_particle_move__move_deposit_kernel(opp_set set, opp_map c2c_map, opp_ma
     // then iterate over the newly added particles
     while (opp_finalize_particle_move(set)) {
 
+        opp_init_particle_move(set, nargs, args);
+
+        opp_mem::dev_copy_to_symbol<OPP_INT>(OPP_comm_iteration_d, &OPP_comm_iteration, 1);
+        num_blocks = (OPP_iter_end - OPP_iter_start - 1) / block_size + 1;
+        
         opp_mem::dev_copy_to_symbol<OPP_INT>(opp_k2_dat0_stride_d, &opp_k2_dat0_stride, &(args[0].dat->set->set_capacity), 1);
         opp_mem::dev_copy_to_symbol<OPP_INT>(opp_k2_dat1_stride_d, &opp_k2_dat1_stride, &(args[1].dat->set->set_capacity), 1);
         opp_mem::dev_copy_to_symbol<OPP_INT>(opp_k2_dat2_stride_d, &opp_k2_dat2_stride, &(args[2].dat->set->set_capacity), 1);
@@ -582,10 +588,6 @@ void opp_particle_move__move_deposit_kernel(opp_set set, opp_map c2c_map, opp_ma
         opp_mem::dev_copy_to_symbol<OPP_INT>(opp_k2_dat4_stride_d, &opp_k2_dat4_stride, &(args[4].dat->set->set_capacity), 1);
         opp_mem::dev_copy_to_symbol<OPP_INT>(opp_k2_dat5_stride_d, &opp_k2_dat5_stride, &(args[5].dat->set->set_capacity), 1);
 
-        opp_init_particle_move(set, nargs, args);
-        opp_mem::dev_copy_to_symbol<OPP_INT>(OPP_comm_iteration_d, &OPP_comm_iteration, 1);
-        
-        num_blocks = (OPP_iter_end - OPP_iter_start - 1) / block_size + 1;
         if (!opp_use_segmented_reductions) // Do atomics ----------       
         {
             opp_profiler->start("move_kernel_only");
@@ -609,7 +611,6 @@ void opp_particle_move__move_deposit_kernel(opp_set set, opp_map c2c_map, opp_ma
             OPP_DEVICE_SYNCHRONIZE(); 
             opp_profiler->end("move_kernel_only");
         }
-        
         else // Do segmented reductions ----------       
         {
             // opp_mem::dev_copy_to_symbol<OPP_INT>(opp_k2_sr_set_stride_d, &opp_k2_sr_set_stride, &set->size, 1);
@@ -656,6 +657,7 @@ void opp_particle_move__move_deposit_kernel(opp_set set, opp_map c2c_map, opp_ma
 
     if (!opp_use_segmented_reductions) {
         opp_reduce_thread_level_data<OPP_REAL>(args[5]);
+
     }
     else {
         opp_sr::clear_arrays<OPP_REAL>(sr_dat5_keys_dv, sr_dat5_values_dv, sr_dat5_keys_dv2, sr_dat5_values_dv2);
diff --git a/app_cabanapic_cg/hip/opp_kernels.cpp b/app_cabanapic_cg/hip/opp_kernels.cpp
index 06ba6849..cd5a0a5d 100644
--- a/app_cabanapic_cg/hip/opp_kernels.cpp
+++ b/app_cabanapic_cg/hip/opp_kernels.cpp
@@ -37,6 +37,14 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #include "opp_hip.h"
 #include "device_kernels/hip_inline_kernels.h"
 
+OPP_REAL CONST_dt[1];
+OPP_REAL CONST_qsp[1];
+OPP_REAL CONST_cdt_d[3];
+OPP_REAL CONST_p[3];
+OPP_REAL CONST_qdt_2mc[1];
+OPP_REAL CONST_dt_eps0[1];
+OPP_REAL CONST_acc_coef[3];
+
 __constant__ OPP_REAL CONST_dt_d[1];
 __constant__ OPP_REAL CONST_qsp_d[1];
 __constant__ OPP_REAL CONST_cdt_d_d[3];
@@ -52,30 +60,37 @@ void opp_decl_const_impl(int dim, int size, char* data, const char* name) {
 
     if (!strcmp(name, "CONST_dt")) {
         cutilSafeCall(hipMemcpyToSymbol(HIP_SYMBOL(CONST_dt_d), data, dim * size));
+        std::memcpy(&CONST_dt, data, (size*dim));
         return;
     }
     if (!strcmp(name, "CONST_qsp")) {
         cutilSafeCall(hipMemcpyToSymbol(HIP_SYMBOL(CONST_qsp_d), data, dim * size));
+        std::memcpy(&CONST_qsp, data, (size*dim));
         return;
     }
     if (!strcmp(name, "CONST_cdt_d")) {
         cutilSafeCall(hipMemcpyToSymbol(HIP_SYMBOL(CONST_cdt_d_d), data, dim * size));
+        std::memcpy(&CONST_cdt_d, data, (size*dim));
         return;
     }
     if (!strcmp(name, "CONST_p")) {
         cutilSafeCall(hipMemcpyToSymbol(HIP_SYMBOL(CONST_p_d), data, dim * size));
+        std::memcpy(&CONST_p, data, (size*dim));
         return;
     }
     if (!strcmp(name, "CONST_qdt_2mc")) {
         cutilSafeCall(hipMemcpyToSymbol(HIP_SYMBOL(CONST_qdt_2mc_d), data, dim * size));
+        std::memcpy(&CONST_qdt_2mc, data, (size*dim));
         return;
     }
     if (!strcmp(name, "CONST_dt_eps0")) {
         cutilSafeCall(hipMemcpyToSymbol(HIP_SYMBOL(CONST_dt_eps0_d), data, dim * size));
+        std::memcpy(&CONST_dt_eps0, data, (size*dim));
         return;
     }
     if (!strcmp(name, "CONST_acc_coef")) {
         cutilSafeCall(hipMemcpyToSymbol(HIP_SYMBOL(CONST_acc_coef_d), data, dim * size));
+        std::memcpy(&CONST_acc_coef, data, (size*dim));
         return;
     }
 
diff --git a/app_cabanapic_cg/hip/update_ghosts_kernel_loop.hpp b/app_cabanapic_cg/hip/update_ghosts_kernel_loop.hpp
index d47a54ed..3895a44c 100644
--- a/app_cabanapic_cg/hip/update_ghosts_kernel_loop.hpp
+++ b/app_cabanapic_cg/hip/update_ghosts_kernel_loop.hpp
@@ -118,9 +118,9 @@ __global__ void opp_dev_sr_update_ghosts_kernel( // Used for Segmented Reduction
         );
 
         for (int d = 0; d < 3; ++d) {
-            sr_dat1_values[n + opp_k6_sr_set_stride_d * d] = arg2_0_local[d]; 
+            sr_dat1_values[1 * n + 0 + opp_k6_sr_set_stride_d * d] = arg2_0_local[d]; 
         }    
-        sr_dat1_keys[n] = map0[opp_k6_map0_stride_d * 0 + n];
+        sr_dat1_keys[1 * n + 0] = map0[opp_k6_map0_stride_d * 0 + n];
     }
     
 }
@@ -212,19 +212,21 @@ void opp_par_loop_all__update_ghosts_kernel(opp_set set,
      
         else // Do segmented reductions ----------       
         {
-            opp_mem::dev_copy_to_symbol<OPP_INT>(opp_k6_sr_set_stride_d, &opp_k6_sr_set_stride, &(iter_size), 1);
-
             size_t operating_size_dat1 = 0, resize_size_dat1 = 0;
 
             operating_size_dat1 += (size_t)(args[2].dat->dim);
             resize_size_dat1 += (size_t)(args[2].dat->dim);
 
-            operating_size_dat1 *= (size_t)(iter_size); // this might need to be + set->exec_size ..., or simply iter_size | even for opp_k6_sr_set_stride
+            operating_size_dat1 *= (size_t)(iter_size);
             resize_size_dat1 *= (size_t)(set->set_capacity);
 
-            opp_sr::init_arrays<OPP_REAL>(args[1].dat->dim, operating_size_dat1, resize_size_dat1,
-                        sr_dat1_keys_dv, sr_dat1_values_dv, sr_dat1_keys_dv2, sr_dat1_values_dv2);
+            int k6_stride = 0;
+            k6_stride += iter_size * 1;
+            opp_mem::dev_copy_to_symbol<OPP_INT>(opp_k6_sr_set_stride_d, &opp_k6_sr_set_stride, &(k6_stride), 1);
 
+            opp_sr::init_arrays<OPP_REAL>(args[2].dat->dim, operating_size_dat1, resize_size_dat1,
+                        sr_dat1_keys_dv, sr_dat1_values_dv, sr_dat1_keys_dv2, sr_dat1_values_dv2);
+        
             // Create key/value pairs
             opp_profiler->start("SR_CrKeyVal");
             opp_dev_sr_update_ghosts_kernel<<<num_blocks, block_size>>>( 
@@ -241,8 +243,8 @@ void opp_par_loop_all__update_ghosts_kernel(opp_set set,
             OPP_DEVICE_SYNCHRONIZE();
             opp_profiler->end("SR_CrKeyVal");
 
-            opp_sr::do_segmented_reductions<OPP_REAL>(args[2], iter_size,
-                        sr_dat1_keys_dv, sr_dat1_values_dv, sr_dat1_keys_dv2, sr_dat1_values_dv2);       
+            opp_sr::do_segmented_reductions<OPP_REAL>(args[2], k6_stride,
+                        sr_dat1_keys_dv, sr_dat1_values_dv, sr_dat1_keys_dv2, sr_dat1_values_dv2);
         }
     }
     args[3].data = (char *)arg3_host_data;
diff --git a/app_fempic/configs/coarse.param b/app_fempic/configs/coarse.param
index 88fc3075..7701f8e1 100644
--- a/app_fempic/configs/coarse.param
+++ b/app_fempic/configs/coarse.param
@@ -17,7 +17,7 @@ STRING global_mesh  = /home/zl/phd/box_mesh_gen/48000/mesh.dat
 STRING inlet_mesh   = /home/zl/phd/box_mesh_gen/48000/inlet.dat
 STRING wall_mesh    = /home/zl/phd/box_mesh_gen/48000/wall.dat
 # OR
-STRING hdf_filename = /home/zl/phd/neso_test/OP-PIC_Configs/fempic/mesh_files/box_48000.hdf5
+STRING hdf_filename = /home/zl/phd/box_mesh_gen/hdf5/box_48000.hdf5
 
 STRING rand_file    = /home/zl/phd/box_mesh_gen/random_100k.dat
 
@@ -25,20 +25,22 @@ BOOL invert_normals = false
 
 BOOL opp_auto_sort          = false
 REAL opp_allocation_multiple = 11
-INT opp_threads_per_block   = 256
+INT opp_threads_per_block   = 512
 
 BOOL opp_profile_all        = false
 INT opp_gpu_direct = 0
-
-REAL grid_spacing  = 25e-6
 BOOL print_final   = true
 
 BOOL opp_global_move = false
+REAL grid_spacing  = 25e-6
+BOOL opp_dh_data_generate = false
+# Use opp_dh_data_generate = true and OPP_DH_DATA_DUMP=1 in args to dump the structured mesh
 
 # block k-means mpi-block
 STRING cluster = block
-BOOL use_reg_red = false
+BOOL opp_segmented_red = false
+INT gpu_reduction_arrays = 16
 
 # HoleFill_All Sort_All Shuffle_All Sort_Periodic Shuffle_Periodic
 STRING opp_fill = Shuffle_Periodic
-INT opp_fill_period = 20
+INT opp_fill_period = 4
\ No newline at end of file
diff --git a/app_fempic_cg/configs/coarse.param b/app_fempic_cg/configs/coarse.param
index c6e86143..745e8aaf 100644
--- a/app_fempic_cg/configs/coarse.param
+++ b/app_fempic_cg/configs/coarse.param
@@ -17,7 +17,7 @@ STRING global_mesh  = /home/zl/phd/box_mesh_gen/48000/mesh.dat
 STRING inlet_mesh   = /home/zl/phd/box_mesh_gen/48000/inlet.dat
 STRING wall_mesh    = /home/zl/phd/box_mesh_gen/48000/wall.dat
 # OR
-STRING hdf_filename = /home/zl/phd/neso_test/OP-PIC_Configs/fempic/mesh_files/box_48000.hdf5
+STRING hdf_filename = /home/zl/phd/box_mesh_gen/hdf5/box_96000.hdf5
 
 STRING rand_file    = /home/zl/phd/box_mesh_gen/random_100k.dat
 
@@ -25,7 +25,7 @@ BOOL invert_normals = false
 
 BOOL opp_auto_sort          = false
 REAL opp_allocation_multiple = 11
-INT opp_threads_per_block   = 256
+INT opp_threads_per_block   = 512
 
 BOOL opp_profile_all        = false
 INT opp_gpu_direct = 0
@@ -38,7 +38,7 @@ BOOL opp_dh_data_generate = false
 
 # block k-means mpi-block
 STRING cluster = block
-BOOL use_reg_red = false
+BOOL opp_segmented_red = false
 INT gpu_reduction_arrays = 16
 
 # HoleFill_All Sort_All Shuffle_All Sort_Periodic Shuffle_Periodic
diff --git a/app_fempic_cg/fempic_opp.cpp b/app_fempic_cg/fempic_opp.cpp
index 4f3d550c..618c2776 100644
--- a/app_fempic_cg/fempic_opp.cpp
+++ b/app_fempic_cg/fempic_opp.cpp
@@ -1,5 +1,5 @@
 
-// Auto-generated at 2025-01-21 14:55:58.979812 by opp-translator
+// Auto-generated at 2025-02-03 16:29:35.442468 by opp-translator
 /* 
 BSD 3-Clause License
 
diff --git a/app_fempic_cg/hip/deposit_charge_on_nodes_kernel_loop.hpp b/app_fempic_cg/hip/deposit_charge_on_nodes_kernel_loop.hpp
index fa2d50c2..0877e0c0 100644
--- a/app_fempic_cg/hip/deposit_charge_on_nodes_kernel_loop.hpp
+++ b/app_fempic_cg/hip/deposit_charge_on_nodes_kernel_loop.hpp
@@ -17,6 +17,9 @@ __constant__ OPP_INT opp_k5_sr_set_stride_d;
 thrust::device_vector<OPP_INT> sr_dat1_keys_dv;
 thrust::device_vector<OPP_REAL> sr_dat1_values_dv;
 
+thrust::device_vector<OPP_INT> sr_dat1_keys_dv2;
+thrust::device_vector<OPP_REAL> sr_dat1_values_dv2;
+
 namespace opp_k5 {
 __device__ inline void deposit_charge_on_nodes_kernel(
     const double *part_lc,
@@ -152,23 +155,22 @@ __global__ void opp_dev_sr_deposit_charge_on_nodes_kernel( // Used for Segmented
             arg4_3_local // n_charge_den 
         );
 
-        int offset = 0;
-        for (int d = 0; d < 1; ++d, ++offset) {
-            sr_dat1_values[n + opp_k5_sr_set_stride_d * offset] = arg1_0_local[d];
-            sr_dat1_keys[n + opp_k5_sr_set_stride_d * offset] = map0[opp_k5_map0_stride_d * 0 + opp_p2c[0]] + (d * opp_k5_dat1_stride_d);
-        }
-        for (int d = 0; d < 1; ++d, ++offset) {
-            sr_dat1_values[n + opp_k5_sr_set_stride_d * offset] = arg2_1_local[d];
-            sr_dat1_keys[n + opp_k5_sr_set_stride_d * offset] = map0[opp_k5_map0_stride_d * 1 + opp_p2c[0]] + (d * opp_k5_dat1_stride_d);
-        }
-        for (int d = 0; d < 1; ++d, ++offset) {
-            sr_dat1_values[n + opp_k5_sr_set_stride_d * offset] = arg3_2_local[d];
-            sr_dat1_keys[n + opp_k5_sr_set_stride_d * offset] = map0[opp_k5_map0_stride_d * 2 + opp_p2c[0]] + (d * opp_k5_dat1_stride_d);
-        }
-        for (int d = 0; d < 1; ++d, ++offset) {
-            sr_dat1_values[n + opp_k5_sr_set_stride_d * offset] = arg4_3_local[d];
-            sr_dat1_keys[n + opp_k5_sr_set_stride_d * offset] = map0[opp_k5_map0_stride_d * 3 + opp_p2c[0]] + (d * opp_k5_dat1_stride_d);
-        }
+        for (int d = 0; d < 1; ++d) {
+            sr_dat1_values[4 * n + 0 + opp_k5_sr_set_stride_d * d] = arg1_0_local[d]; 
+        }    
+        sr_dat1_keys[4 * n + 0] = map0[opp_k5_map0_stride_d * 0 + opp_p2c[0]];
+        for (int d = 0; d < 1; ++d) {
+            sr_dat1_values[4 * n + 1 + opp_k5_sr_set_stride_d * d] = arg2_1_local[d]; 
+        }    
+        sr_dat1_keys[4 * n + 1] = map0[opp_k5_map0_stride_d * 1 + opp_p2c[0]];
+        for (int d = 0; d < 1; ++d) {
+            sr_dat1_values[4 * n + 2 + opp_k5_sr_set_stride_d * d] = arg3_2_local[d]; 
+        }    
+        sr_dat1_keys[4 * n + 2] = map0[opp_k5_map0_stride_d * 2 + opp_p2c[0]];
+        for (int d = 0; d < 1; ++d) {
+            sr_dat1_values[4 * n + 3 + opp_k5_sr_set_stride_d * d] = arg4_3_local[d]; 
+        }    
+        sr_dat1_keys[4 * n + 3] = map0[opp_k5_map0_stride_d * 3 + opp_p2c[0]];
     }
     
 }
@@ -220,7 +222,7 @@ void opp_par_loop_all__deposit_charge_on_nodes_kernel(opp_set set,
         const OPP_INT end = iter_size;
         num_blocks = (end - start - 1) / block_size + 1;
 
-        if (!opp_params->get<OPP_BOOL>("use_reg_red")) // Do atomics ----------       
+        if (!opp_use_segmented_reductions) // Do atomics ----------       
         {
             const int array_count = opp_params->get<OPP_INT>("gpu_reduction_arrays");
             OPP_REAL** arg1_dat_thread_data_d = opp_create_thread_level_data<OPP_REAL>(args[1]);
@@ -238,8 +240,6 @@ void opp_par_loop_all__deposit_charge_on_nodes_kernel(opp_set set,
      
         else // Do segmented reductions ----------       
         {
-            opp_mem::dev_copy_to_symbol<OPP_INT>(opp_k5_sr_set_stride_d, &opp_k5_sr_set_stride, &(set->size), 1);
-
             size_t operating_size_dat1 = 0, resize_size_dat1 = 0;
 
             operating_size_dat1 += (size_t)(args[1].dat->dim);
@@ -251,13 +251,15 @@ void opp_par_loop_all__deposit_charge_on_nodes_kernel(opp_set set,
             operating_size_dat1 += (size_t)(args[4].dat->dim);
             resize_size_dat1 += (size_t)(args[4].dat->dim);
 
-            operating_size_dat1 *= (size_t)(set->size);
+            operating_size_dat1 *= (size_t)(iter_size);
             resize_size_dat1 *= (size_t)(set->set_capacity);
 
-            if (resize_size_dat1 > sr_dat1_keys_dv.size()) { // resize only if current vector is small        
-                sr_dat1_keys_dv.resize(resize_size_dat1, 0);
-                sr_dat1_values_dv.resize(resize_size_dat1, 0);
-            }
+            int k5_stride = 0;
+            k5_stride += iter_size * 4;
+            opp_mem::dev_copy_to_symbol<OPP_INT>(opp_k5_sr_set_stride_d, &opp_k5_sr_set_stride, &(k5_stride), 1);
+
+            opp_sr::init_arrays<OPP_REAL>(args[1].dat->dim, operating_size_dat1, resize_size_dat1,
+                        sr_dat1_keys_dv, sr_dat1_values_dv, sr_dat1_keys_dv2, sr_dat1_values_dv2);
         
             // Create key/value pairs
             opp_profiler->start("SR_CrKeyVal");
@@ -273,41 +275,11 @@ void opp_par_loop_all__deposit_charge_on_nodes_kernel(opp_set set,
             OPP_DEVICE_SYNCHRONIZE();
             opp_profiler->end("SR_CrKeyVal");
 
-            // Sort by keys to bring the identical keys together
-            opp_profiler->start("SR_SortByKey");
-            thrust::sort_by_key(sr_dat1_keys_dv.begin(), sr_dat1_keys_dv.begin() + operating_size_dat1, 
-                sr_dat1_values_dv.begin());
-            opp_profiler->end("SR_SortByKey");
-
-            // Compute the unique keys and their corresponding values
-            opp_profiler->start("SR_RedByKey");
-            auto new_end = thrust::reduce_by_key(
-                sr_dat1_keys_dv.begin(), sr_dat1_keys_dv.begin() + operating_size_dat1,
-                sr_dat1_values_dv.begin(),
-                sr_dat1_keys_dv.begin(),
-                sr_dat1_values_dv.begin());        
-            opp_profiler->end("SR_RedByKey");
-
-            const size_t reduced_size = (new_end.first - sr_dat1_keys_dv.begin());
-            
-            // Assign reduced values to the nodes using keys/values
-            opp_profiler->start("SR_Assign");                
-            opp_k5::assign_values<<<num_blocks, block_size>>> ( // TODO : check whether num_blocks is correct
-                opp_get_dev_raw_ptr<OPP_INT>(sr_dat1_keys_dv),
-                opp_get_dev_raw_ptr<OPP_REAL>(sr_dat1_values_dv),
-                (OPP_REAL *) args[1].data_d,
-                0, reduced_size);
-            OPP_DEVICE_SYNCHRONIZE();
-            opp_profiler->end("SR_Assign");
-
-            // Last: clear the thrust vectors if this is the last iteration (avoid crash)
-            if (opp_params->get<OPP_INT>("num_steps") == (OPP_main_loop_iter + 1)) {
-                OPP_DEVICE_SYNCHRONIZE();
-                sr_dat1_values_dv.clear(); sr_dat1_values_dv.shrink_to_fit();
-                sr_dat1_keys_dv.clear(); sr_dat1_keys_dv.shrink_to_fit();
-            }        
+            opp_sr::do_segmented_reductions<OPP_REAL>(args[1], k5_stride,
+                        sr_dat1_keys_dv, sr_dat1_values_dv, sr_dat1_keys_dv2, sr_dat1_values_dv2);
         }
     }
+    opp_sr::clear_arrays<OPP_REAL>(sr_dat1_keys_dv, sr_dat1_values_dv, sr_dat1_keys_dv2, sr_dat1_values_dv2);
 
     opp_set_dirtybit_grouped(nargs, args, Device_GPU);
     OPP_DEVICE_SYNCHRONIZE();   
diff --git a/app_fempic_cg/hip/get_final_max_values_kernel_loop.hpp b/app_fempic_cg/hip/get_final_max_values_kernel_loop.hpp
index c696362f..e2d65070 100644
--- a/app_fempic_cg/hip/get_final_max_values_kernel_loop.hpp
+++ b/app_fempic_cg/hip/get_final_max_values_kernel_loop.hpp
@@ -161,13 +161,13 @@ void opp_par_loop_all__get_final_max_values_kernel(opp_set set,
         for (int d = 0; d < 1; ++d)
             arg3_host_data[d] = MAX(arg3_host_data[d], ((OPP_REAL *)args[3].data)[b * 1 + d]);
     }
-
     args[1].data = (char *)arg1_host_data;
     opp_mpi_reduce(&args[1], arg1_host_data);
 
     args[3].data = (char *)arg3_host_data;
     opp_mpi_reduce(&args[3], arg3_host_data);
 
+
     opp_set_dirtybit_grouped(nargs, args, Device_GPU);
     OPP_DEVICE_SYNCHRONIZE();   
  
diff --git a/app_fempic_cg/hip/get_max_cef_kernel_loop.hpp b/app_fempic_cg/hip/get_max_cef_kernel_loop.hpp
index cb0b776f..227ee156 100644
--- a/app_fempic_cg/hip/get_max_cef_kernel_loop.hpp
+++ b/app_fempic_cg/hip/get_max_cef_kernel_loop.hpp
@@ -124,10 +124,10 @@ void opp_par_loop_all__get_max_cef_kernel(opp_set set,
         for (int d = 0; d < 1; ++d)
             arg1_host_data[d] = MAX(arg1_host_data[d], ((OPP_REAL *)args[1].data)[b * 1 + d]);
     }
-
     args[1].data = (char *)arg1_host_data;
     opp_mpi_reduce(&args[1], arg1_host_data);
 
+
     opp_set_dirtybit_grouped(nargs, args, Device_GPU);
     OPP_DEVICE_SYNCHRONIZE();   
  
diff --git a/app_fempic_cg/hip/move_kernel_loop.hpp b/app_fempic_cg/hip/move_kernel_loop.hpp
index aaa4a284..8baaed01 100644
--- a/app_fempic_cg/hip/move_kernel_loop.hpp
+++ b/app_fempic_cg/hip/move_kernel_loop.hpp
@@ -15,6 +15,8 @@ __constant__ OPP_INT opp_k4_dat2_stride_d;
 __constant__ OPP_INT opp_k4_dat3_stride_d;
 __constant__ OPP_INT opp_k4_c2c_map_stride_d;
 
+
+
 namespace opp_k4 {
 
 namespace host {
@@ -114,6 +116,7 @@ __device__ inline void move_kernel(
 }
 }
 
+//--------------------------------------------------------------
 __global__ void opp_dev_move_kernel(
     const OPP_REAL *__restrict__ dat0,     // p_pos
     OPP_REAL *__restrict__ dat1,     // p_lc
@@ -130,11 +133,9 @@ __global__ void opp_dev_move_kernel(
     const OPP_INT end
 ) 
 {
-    const int thread_id = threadIdx.x + blockIdx.x * blockDim.x;
-
-    if (thread_id + start < end) {
+    const int n = OPP_DEVICE_GLOBAL_LINEAR_ID + start;
 
-        const int n = thread_id + start;
+    if (n < end) {
 
         OPP_INT *opp_p2c = (p2c_map + n);
         if (opp_p2c[0] == MAX_CELL_INDEX) {
@@ -144,8 +145,7 @@ __global__ void opp_dev_move_kernel(
         char move_flag = OPP_NEED_MOVE;
         bool iter_one_flag = (OPP_comm_iteration_d > 0) ? false : true;
 
-        do
-        {
+        do {
             const OPP_INT p2c = opp_p2c[0]; // get the value here, since the kernel might change it
             const OPP_INT* opp_c2c = c2c_map + p2c;           
 
@@ -162,8 +162,10 @@ __global__ void opp_dev_move_kernel(
             *particle_remove_count, particle_remove_indices, move_particle_indices, 
             move_cell_indices, move_count));        
     }
+    
 }
 
+
 void opp_particle_move__move_kernel(opp_set set, opp_map c2c_map, opp_map p2c_map,
     opp_arg arg0,   // p_pos | OPP_READ
     opp_arg arg1,   // p_lc | OPP_WRITE
@@ -202,10 +204,6 @@ void opp_particle_move__move_kernel(opp_set set, opp_map c2c_map, opp_map p2c_ma
     int num_blocks = 200;
 
     opp_init_particle_move(set, nargs, args);
-
-    opp_mem::dev_copy_to_symbol<OPP_INT>(OPP_comm_iteration_d, &OPP_comm_iteration, 1);
-    num_blocks = (OPP_iter_end - OPP_iter_start - 1) / block_size + 1;
-
     if (useGlobalMove) {
            
 #ifdef USE_MPI         
@@ -227,6 +225,7 @@ void opp_particle_move__move_kernel(opp_set set, opp_map c2c_map, opp_map p2c_ma
         }
 
         // check whether particles need to be moved via the global move routine
+        num_blocks = (OPP_iter_end - OPP_iter_start - 1) / block_size + 1;
         opp_dev_checkForGlobalMove3D_kernel<<<num_blocks, block_size>>>(
             (OPP_REAL*)args[0].data_d,    // p_pos 
             (OPP_INT *)args[4].data_d,    // p2c_map
@@ -254,16 +253,22 @@ void opp_particle_move__move_kernel(opp_set set, opp_map c2c_map, opp_map p2c_ma
 #endif
     }
 
-    opp_mem::dev_copy_to_symbol<OPP_INT>(opp_k4_dat0_stride_d, &opp_k4_dat0_stride, &(args[0].dat->set->set_capacity), 1);
-    opp_mem::dev_copy_to_symbol<OPP_INT>(opp_k4_dat1_stride_d, &opp_k4_dat1_stride, &(args[1].dat->set->set_capacity), 1);
-    opp_mem::dev_copy_to_symbol<OPP_INT>(opp_k4_dat2_stride_d, &opp_k4_dat2_stride, &(args[2].dat->set->set_capacity), 1);
-    opp_mem::dev_copy_to_symbol<OPP_INT>(opp_k4_dat3_stride_d, &opp_k4_dat3_stride, &(args[3].dat->set->set_capacity), 1);
+
 
 
     opp_profiler->start("Mv_AllMv0");
     // ----------------------------------------------------------------------------
     // check whether all particles not marked for global comm is within cell, 
     // and if not mark to move between cells within the MPI rank, mark for neighbour comm
+
+    opp_mem::dev_copy_to_symbol<OPP_INT>(OPP_comm_iteration_d, &OPP_comm_iteration, 1);
+    num_blocks = (OPP_iter_end - OPP_iter_start - 1) / block_size + 1;
+    
+    opp_mem::dev_copy_to_symbol<OPP_INT>(opp_k4_dat0_stride_d, &opp_k4_dat0_stride, &(args[0].dat->set->set_capacity), 1);
+    opp_mem::dev_copy_to_symbol<OPP_INT>(opp_k4_dat1_stride_d, &opp_k4_dat1_stride, &(args[1].dat->set->set_capacity), 1);
+    opp_mem::dev_copy_to_symbol<OPP_INT>(opp_k4_dat2_stride_d, &opp_k4_dat2_stride, &(args[2].dat->set->set_capacity), 1);
+    opp_mem::dev_copy_to_symbol<OPP_INT>(opp_k4_dat3_stride_d, &opp_k4_dat3_stride, &(args[3].dat->set->set_capacity), 1);
+
     {
         opp_profiler->start("move_kernel_only");
         opp_dev_move_kernel<<<num_blocks, block_size>>>(
@@ -281,11 +286,13 @@ void opp_particle_move__move_kernel(opp_set set, opp_map c2c_map, opp_map p2c_ma
             OPP_iter_start,
             OPP_iter_end
         );
-        OPP_DEVICE_SYNCHRONIZE();   
+        OPP_DEVICE_SYNCHRONIZE(); 
         opp_profiler->end("move_kernel_only");
     }
-    opp_profiler->end("Mv_AllMv0");
 
+
+    opp_profiler->end("Mv_AllMv0");
+    
 #ifdef USE_MPI 
     // ----------------------------------------------------------------------------
     // finalize the global move routine and iterate over newly added particles and check whether they need neighbour comm
@@ -308,6 +315,7 @@ void opp_particle_move__move_kernel(opp_set set, opp_map c2c_map, opp_map p2c_ma
             const int start2 = (set->size - set->diff);
             const int end2 = set->size;
             num_blocks = (end2 - start2 - 1) / block_size + 1;
+
             opp_mem::dev_copy_to_symbol<OPP_INT>(opp_k4_dat0_stride_d, &opp_k4_dat0_stride, &(args[0].dat->set->set_capacity), 1);
             opp_mem::dev_copy_to_symbol<OPP_INT>(opp_k4_dat1_stride_d, &opp_k4_dat1_stride, &(args[1].dat->set->set_capacity), 1);
             opp_mem::dev_copy_to_symbol<OPP_INT>(opp_k4_dat2_stride_d, &opp_k4_dat2_stride, &(args[2].dat->set->set_capacity), 1);
@@ -342,15 +350,16 @@ void opp_particle_move__move_kernel(opp_set set, opp_map c2c_map, opp_map p2c_ma
     // then iterate over the newly added particles
     while (opp_finalize_particle_move(set)) {
 
+        opp_init_particle_move(set, nargs, args);
+
+        opp_mem::dev_copy_to_symbol<OPP_INT>(OPP_comm_iteration_d, &OPP_comm_iteration, 1);
+        num_blocks = (OPP_iter_end - OPP_iter_start - 1) / block_size + 1;
+        
         opp_mem::dev_copy_to_symbol<OPP_INT>(opp_k4_dat0_stride_d, &opp_k4_dat0_stride, &(args[0].dat->set->set_capacity), 1);
         opp_mem::dev_copy_to_symbol<OPP_INT>(opp_k4_dat1_stride_d, &opp_k4_dat1_stride, &(args[1].dat->set->set_capacity), 1);
         opp_mem::dev_copy_to_symbol<OPP_INT>(opp_k4_dat2_stride_d, &opp_k4_dat2_stride, &(args[2].dat->set->set_capacity), 1);
         opp_mem::dev_copy_to_symbol<OPP_INT>(opp_k4_dat3_stride_d, &opp_k4_dat3_stride, &(args[3].dat->set->set_capacity), 1);
 
-        opp_init_particle_move(set, nargs, args);
-        opp_mem::dev_copy_to_symbol<OPP_INT>(OPP_comm_iteration_d, &OPP_comm_iteration, 1);
-
-        num_blocks = (OPP_iter_end - OPP_iter_start - 1) / block_size + 1;
         {
             opp_profiler->start("move_kernel_only");
             opp_dev_move_kernel<<<num_blocks, block_size>>>(
@@ -368,9 +377,10 @@ void opp_particle_move__move_kernel(opp_set set, opp_map c2c_map, opp_map p2c_ma
                 OPP_iter_start,
                 OPP_iter_end
             );
-            OPP_DEVICE_SYNCHRONIZE();   
+            OPP_DEVICE_SYNCHRONIZE(); 
             opp_profiler->end("move_kernel_only");
         }
+
     }
 
     opp_set_dirtybit_grouped(nargs, args, Device_GPU);
diff --git a/app_neso_advection/configs/advec.param b/app_neso_advection/configs/advec.param
index e262c518..11cefa36 100644
--- a/app_neso_advection/configs/advec.param
+++ b/app_neso_advection/configs/advec.param
@@ -13,8 +13,14 @@ REAL opp_allocation_multiple = 1.1
 INT opp_threads_per_block   = 256
 BOOL opp_profile_all        = false
 INT opp_gpu_direct = 0
+
 BOOL opp_global_move = false
 REAL grid_spacing    = 0.5
+BOOL opp_dh_data_generate = false
+# Use opp_dh_data_generate = true and OPP_DH_DATA_DUMP=1 in args to dump the structured mesh
+
+BOOL opp_segmented_red = false
+INT gpu_reduction_arrays = 16
 
 # HoleFill_All Sort_All Shuffle_All Sort_Periodic Shuffle_Periodic
 STRING opp_fill = HoleFill_All
diff --git a/app_neso_advection_cg/advec_opp.cpp b/app_neso_advection_cg/advec_opp.cpp
index 3cc754d6..87e32855 100644
--- a/app_neso_advection_cg/advec_opp.cpp
+++ b/app_neso_advection_cg/advec_opp.cpp
@@ -1,5 +1,5 @@
 
-// Auto-generated at 2025-01-21 15:02:40.426657 by opp-translator
+// Auto-generated at 2025-02-03 16:37:08.638646 by opp-translator
 /* 
 BSD 3-Clause License
 
diff --git a/app_neso_advection_cg/configs/advec.param b/app_neso_advection_cg/configs/advec.param
index 4d9f2554..11cefa36 100644
--- a/app_neso_advection_cg/configs/advec.param
+++ b/app_neso_advection_cg/configs/advec.param
@@ -19,6 +19,9 @@ REAL grid_spacing    = 0.5
 BOOL opp_dh_data_generate = false
 # Use opp_dh_data_generate = true and OPP_DH_DATA_DUMP=1 in args to dump the structured mesh
 
+BOOL opp_segmented_red = false
+INT gpu_reduction_arrays = 16
+
 # HoleFill_All Sort_All Shuffle_All Sort_Periodic Shuffle_Periodic
 STRING opp_fill = HoleFill_All
 INT opp_fill_period = 4
\ No newline at end of file
diff --git a/app_neso_advection_cg/hip/move_kernel_loop.hpp b/app_neso_advection_cg/hip/move_kernel_loop.hpp
index 99397b3d..d224480e 100644
--- a/app_neso_advection_cg/hip/move_kernel_loop.hpp
+++ b/app_neso_advection_cg/hip/move_kernel_loop.hpp
@@ -14,11 +14,48 @@ __constant__ OPP_INT opp_k2_c2c_map_stride_d;
 
 
 namespace opp_k2 {
+
+namespace host {
+enum CellMap {
+    xd_y = 0,
+    xu_y,
+    x_yd,
+    x_yu
+};
+
 enum Dim {
     x = 0,
     y = 1,
 };
 
+inline void move_kernel(const double* p_pos, const double* c_pos_ll)
+{
+    // check for x direction movement
+    const double p_pos_x = p_pos[Dim::x];
+    if (p_pos_x < c_pos_ll[Dim::x]) {
+        opp_p2c[0] = opp_c2c[CellMap::xd_y];
+        { opp_move_status_flag = OPP_NEED_MOVE; }; return;
+    }
+    if (p_pos_x > (c_pos_ll[Dim::x] + CONST_cell_width[0])) {
+        opp_p2c[0] = opp_c2c[CellMap::xu_y];
+        { opp_move_status_flag = OPP_NEED_MOVE; }; return;
+    }
+
+    // check for y direction movement
+    const double p_pos_y = p_pos[Dim::y];
+    if (p_pos_y < c_pos_ll[Dim::y]) {
+        opp_p2c[0] = opp_c2c[CellMap::x_yd];
+        { opp_move_status_flag = OPP_NEED_MOVE; }; return;
+    }
+    if (p_pos_y > (c_pos_ll[Dim::y] + CONST_cell_width[0])) {
+        opp_p2c[0] = opp_c2c[CellMap::x_yu];
+        { opp_move_status_flag = OPP_NEED_MOVE; }; return;
+    }
+
+    { opp_move_status_flag = OPP_MOVE_DONE; };
+}
+}
+
 enum CellMap {
     xd_y = 0,
     xu_y,
@@ -26,6 +63,11 @@ enum CellMap {
     x_yu
 };
 
+enum Dim {
+    x = 0,
+    y = 1,
+};
+
 __device__ inline void move_kernel(char& opp_move_status_flag, const bool opp_move_hop_iter_one_flag, // Added by code-gen
     const OPP_INT* opp_c2c, OPP_INT* opp_p2c, // Added by code-gen
     const double* p_pos, const double* c_pos_ll)
@@ -71,18 +113,19 @@ __global__ void opp_dev_move_kernel(
     const OPP_INT end
 ) 
 {
-    const int thread_id = threadIdx.x + blockIdx.x * blockDim.x;
+    const int n = OPP_DEVICE_GLOBAL_LINEAR_ID + start;
 
-    if (thread_id + start < end) {
-
-        const int n = thread_id + start;
+    if (n < end) {
 
         OPP_INT *opp_p2c = (p2c_map + n);
+        if (opp_p2c[0] == MAX_CELL_INDEX) {
+            return;
+        }
+
         char move_flag = OPP_NEED_MOVE;
         bool iter_one_flag = (OPP_comm_iteration_d > 0) ? false : true;
 
-        do
-        {
+        do {
             const OPP_INT p2c = opp_p2c[0]; // get the value here, since the kernel might change it
             const OPP_INT* opp_c2c = c2c_map + p2c;           
 
@@ -100,7 +143,6 @@ __global__ void opp_dev_move_kernel(
     
 }
 
-//--------------------------------------------------------------
 
 void opp_particle_move__move_kernel(opp_set set, opp_map c2c_map, opp_map p2c_map,
     opp_arg arg0,   // p_pos | OPP_READ
@@ -118,7 +160,7 @@ void opp_particle_move__move_kernel(opp_set set, opp_map c2c_map, opp_map p2c_ma
     args[1] = arg1;
     args[2] = opp_arg_dat(p2c_map->p2c_dat, OPP_RW); // required to make dirty or should manually make it dirty
 
-    const int iter_size = opp_mpi_halo_exchanges_grouped(set, nargs, args, Device_GPU);
+    opp_mpi_halo_exchanges_grouped(set, nargs, args, Device_GPU);
 
     const OPP_INT c2c_stride = c2c_map->from->size + c2c_map->from->exec_size + c2c_map->from->nonexec_size;
 
@@ -135,17 +177,155 @@ void opp_particle_move__move_kernel(opp_set set, opp_map c2c_map, opp_map p2c_ma
 
     int num_blocks = 200;
 
-    do 
+    opp_init_particle_move(set, nargs, args);
+    if (useGlobalMove) {
+           
+#ifdef USE_MPI         
+        globalMover->initGlobalMove();
+        opp_init_dh_device(set);
+#endif
+        opp_profiler->start("GblMv_Move");
+
+        opp_mem::dev_copy_to_symbol<OPP_INT>(cellMapper_pos_stride_d, &cellMapper_pos_stride, &(args[0].dat->set->set_capacity), 1);
+        opp_mem::dev_copy_to_symbol<OPP_INT>(OPP_rank_d, &OPP_rank, 1);
+
+        hipError_t err1 = hipMemcpyToSymbol(HIP_SYMBOL(opp_minSavedDHGrid_d), opp_minSavedDHGrid, 3 * sizeof(size_t));
+        if (err1 != hipSuccess) {
+            throw std::runtime_error(std::string("dev_copy_to_symbol: ") + hipGetErrorString(err1));
+        }
+        hipError_t err2 = hipMemcpyToSymbol(HIP_SYMBOL(opp_maxSavedDHGrid_d), opp_maxSavedDHGrid, 3 * sizeof(size_t));
+        if (err2 != hipSuccess) {
+            throw std::runtime_error(std::string("dev_copy_to_symbol: ") + hipGetErrorString(err2));
+        }
+
+        // check whether particles need to be moved via the global move routine
+        num_blocks = (OPP_iter_end - OPP_iter_start - 1) / block_size + 1;
+        opp_dev_checkForGlobalMove2D_kernel<<<num_blocks, block_size>>>(
+            (OPP_REAL*)args[0].data_d,    // p_pos 
+            (OPP_INT *)args[2].data_d,    // p2c_map
+            cellMapper->structMeshToCellMapping_d, 
+            cellMapper->structMeshToRankMapping_d,
+            cellMapper->oneOverGridSpacing_d, 
+            cellMapper->minGlbCoordinate_d, 
+            cellMapper->globalGridDims_d, 
+            cellMapper->globalGridSize_d,
+            set->particle_remove_count_d, 
+            OPP_remove_particle_indices_d, 
+            dh_indices_d.part_indices, 
+            dh_indices_d.cell_indices, 
+            dh_indices_d.rank_indices, 
+            dh_indices_d.move_count,
+            OPP_iter_start, OPP_iter_end
+        );
+        OPP_DEVICE_SYNCHRONIZE();
+
+        opp_profiler->end("GblMv_Move");
+
+#ifdef USE_MPI 
+        opp_gather_dh_move_indices(set);
+        globalMover->communicate(set);
+#endif
+    }
+
+
+
+
+    opp_profiler->start("Mv_AllMv0");
+    // ----------------------------------------------------------------------------
+    // check whether all particles not marked for global comm is within cell, 
+    // and if not mark to move between cells within the MPI rank, mark for neighbour comm
+
+    opp_mem::dev_copy_to_symbol<OPP_INT>(OPP_comm_iteration_d, &OPP_comm_iteration, 1);
+    num_blocks = (OPP_iter_end - OPP_iter_start - 1) / block_size + 1;
+    
+    opp_mem::dev_copy_to_symbol<OPP_INT>(opp_k2_dat0_stride_d, &opp_k2_dat0_stride, &(args[0].dat->set->set_capacity), 1);
+    opp_mem::dev_copy_to_symbol<OPP_INT>(opp_k2_dat1_stride_d, &opp_k2_dat1_stride, &(args[1].dat->set->set_capacity), 1);
+
     {
-        opp_mem::dev_copy_to_symbol<OPP_INT>(opp_k2_dat0_stride_d, &opp_k2_dat0_stride, &(args[0].dat->set->set_capacity), 1);
-        opp_mem::dev_copy_to_symbol<OPP_INT>(opp_k2_dat1_stride_d, &opp_k2_dat1_stride, &(args[1].dat->set->set_capacity), 1);
+        opp_profiler->start("move_kernel_only");
+        opp_dev_move_kernel<<<num_blocks, block_size>>>(
+            (OPP_REAL *)args[0].data_d,    // p_pos
+            (OPP_REAL *)args[1].data_d,    // c_pos_ll
+            (OPP_INT *)args[2].data_d,    // p2c_map
+            (OPP_INT *)c2c_map->map_d,    // c2c_map
+            (OPP_INT *)set->particle_remove_count_d,
+            (OPP_INT *)OPP_remove_particle_indices_d,
+            (OPP_INT *)OPP_move_particle_indices_d,
+            (OPP_INT *)OPP_move_cell_indices_d,
+            (OPP_INT *)OPP_move_count_d,
+            OPP_iter_start,
+            OPP_iter_end
+        );
+        OPP_DEVICE_SYNCHRONIZE(); 
+        opp_profiler->end("move_kernel_only");
+    }
+
+
+    opp_profiler->end("Mv_AllMv0");
+    
+#ifdef USE_MPI 
+    // ----------------------------------------------------------------------------
+    // finalize the global move routine and iterate over newly added particles and check whether they need neighbour comm
+    if (useGlobalMove) { 
+        
+        opp_profiler->start("GblMv_finalize");
+        const int finalized = globalMover->finalize(set);
+        opp_profiler->end("GblMv_finalize");
+
+        if (finalized > 0) {
+            opp_profiler->start("GblMv_AllMv");
+
+            // need to change arg data since particle resize in globalMover::finalize could change the pointer in dat->data 
+            for (int i = 0; i < nargs; i++)
+                if (args[i].argtype == OPP_ARG_DAT && args[i].dat->set->is_particle)
+                    args[i].data_d = args[i].dat->data_d;
+
+            // check whether the new particle is within cell, and if not move between cells within the MPI rank, 
+            // mark for neighbour comm. Do only for the globally moved particles 
+            const int start2 = (set->size - set->diff);
+            const int end2 = set->size;
+            num_blocks = (end2 - start2 - 1) / block_size + 1;
+
+            opp_mem::dev_copy_to_symbol<OPP_INT>(opp_k2_dat0_stride_d, &opp_k2_dat0_stride, &(args[0].dat->set->set_capacity), 1);
+            opp_mem::dev_copy_to_symbol<OPP_INT>(opp_k2_dat1_stride_d, &opp_k2_dat1_stride, &(args[1].dat->set->set_capacity), 1);
+
+            opp_profiler->start("move_kernel_only");      
+            opp_dev_move_kernel<<<num_blocks, block_size>>>(
+                (OPP_REAL *)args[0].data_d,    // p_pos
+                (OPP_REAL *)args[1].data_d,    // c_pos_ll
+                (OPP_INT *)args[2].data_d,    // p2c_map
+                (OPP_INT *)c2c_map->map_d,    // c2c_map
+                (OPP_INT *)set->particle_remove_count_d,
+                (OPP_INT *)OPP_remove_particle_indices_d,
+                (OPP_INT *)OPP_move_particle_indices_d,
+                (OPP_INT *)OPP_move_cell_indices_d,
+                (OPP_INT *)OPP_move_count_d,
+                start2,
+                end2
+            );
+            OPP_DEVICE_SYNCHRONIZE(); 
+            opp_profiler->end("move_kernel_only");
+
+            opp_profiler->end("GblMv_AllMv");
+        }
+    }
+#endif
+
+    // ----------------------------------------------------------------------------
+    // Do neighbour communication and if atleast one particle is received by the currect rank, 
+    // then iterate over the newly added particles
+    while (opp_finalize_particle_move(set)) {
 
         opp_init_particle_move(set, nargs, args);
-        opp_mem::dev_copy_to_symbol<OPP_INT>(OPP_comm_iteration_d, &OPP_comm_iteration, 1);
 
+        opp_mem::dev_copy_to_symbol<OPP_INT>(OPP_comm_iteration_d, &OPP_comm_iteration, 1);
         num_blocks = (OPP_iter_end - OPP_iter_start - 1) / block_size + 1;
+        
+        opp_mem::dev_copy_to_symbol<OPP_INT>(opp_k2_dat0_stride_d, &opp_k2_dat0_stride, &(args[0].dat->set->set_capacity), 1);
+        opp_mem::dev_copy_to_symbol<OPP_INT>(opp_k2_dat1_stride_d, &opp_k2_dat1_stride, &(args[1].dat->set->set_capacity), 1);
 
         {
+            opp_profiler->start("move_kernel_only");
             opp_dev_move_kernel<<<num_blocks, block_size>>>(
                 (OPP_REAL *)args[0].data_d,    // p_pos
                 (OPP_REAL *)args[1].data_d,    // c_pos_ll
@@ -159,21 +339,83 @@ void opp_particle_move__move_kernel(opp_set set, opp_map c2c_map, opp_map p2c_ma
                 OPP_iter_start,
                 OPP_iter_end
             );
+            OPP_DEVICE_SYNCHRONIZE(); 
+            opp_profiler->end("move_kernel_only");
         }
 
-    } while (opp_finalize_particle_move(set)); 
+    }
 
     opp_set_dirtybit_grouped(nargs, args, Device_GPU);
     OPP_DEVICE_SYNCHRONIZE();   
  
     opp_profiler->end("move_kernel");
 }
+
 void opp_init_direct_hop_cg(double grid_spacing, const opp_dat c_gbl_id, const opp::BoundingBox& b_box, 
     opp_map c2c_map, opp_map p2c_map,
     opp_arg arg0, // p_pos | OPP_READ
     opp_arg arg1 // c_pos_ll | OPP_READ
 ) {
     opp_profiler->start("Setup_Mover");
-    
+
+    useGlobalMove = opp_params->get<OPP_BOOL>("opp_global_move");
+
+    if (OPP_DBG) opp_printf("opp_init_direct_hop_cg", "START useGlobalMove=%s", useGlobalMove ? "YES" : "NO");
+
+    if (useGlobalMove) {
+
+        const int nargs = 3;
+        opp_arg args[nargs];
+
+        args[0] = arg0;
+        args[1] = arg1;
+        args[2] = opp_arg_dat(p2c_map->p2c_dat, OPP_RW); // required to make dirty or should manually make it dirty
+
+#ifdef USE_MPI
+        opp_mpi_halo_exchanges_grouped(c_gbl_id->set, nargs, args, Device_CPU);
+
+        comm = std::make_shared<opp::Comm>(MPI_COMM_WORLD);
+        globalMover = std::make_unique<opp::GlobalParticleMover>(comm->comm_parent);
+
+        opp_mpi_halo_wait_all(nargs, args);
+#endif
+
+        boundingBox = std::make_shared<opp::BoundingBox>(b_box);
+        cellMapper = std::make_shared<opp::CellMapper>(boundingBox, grid_spacing, comm);
+        
+        const int c_set_size = c_gbl_id->set->size;
+
+        // lambda function for dh mesh search loop
+        auto all_cell_checker = [&](const opp_point& point, int& cid) {          
+
+            for (int ci = 0; ci < c_set_size; ++ci) {
+                opp_move_status_flag = OPP_NEED_MOVE;  
+                opp_move_hop_iter_one_flag = true;
+                
+                int temp_ci = ci; // we dont want to get iterating ci changed within the kernel, hence get a copy
+                
+                opp_p2c = &(temp_ci);           
+                opp_c2c = &((c2c_map->map)[temp_ci * 4]);
+
+                opp_k2::host::move_kernel(
+                    (const OPP_REAL*)&point,
+                    (const OPP_REAL *)args[1].data + (temp_ci * 2) // c_pos_ll| OPP_READ
+                );
+                if (opp_move_status_flag == OPP_MOVE_DONE) {       
+                    cid = temp_ci;
+                    break;
+                }
+            }
+        };
+
+        if (opp_params->get<OPP_BOOL>("opp_dh_data_generate")) {
+            cellMapper->generateStructuredMesh(c_gbl_id->set, c_gbl_id, all_cell_checker);
+        }
+        else {
+            cellMapper->generateStructuredMeshFromFile(c_gbl_id->set, c_gbl_id);  
+        }
+    }
+
     opp_profiler->end("Setup_Mover");
 }
+
diff --git a/app_neso_advection_cg/hip/opp_kernels.cpp b/app_neso_advection_cg/hip/opp_kernels.cpp
index e59e7640..c745e8ac 100644
--- a/app_neso_advection_cg/hip/opp_kernels.cpp
+++ b/app_neso_advection_cg/hip/opp_kernels.cpp
@@ -37,6 +37,11 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #include "opp_hip.h"
 #include "device_kernels/hip_inline_kernels.h"
 
+OPP_REAL CONST_extents[2];
+OPP_REAL CONST_dt[1];
+OPP_REAL CONST_cell_width[1];
+OPP_INT CONST_ndimcells[2];
+
 __constant__ OPP_REAL CONST_extents_d[2];
 __constant__ OPP_REAL CONST_dt_d[1];
 __constant__ OPP_REAL CONST_cell_width_d[1];
@@ -49,18 +54,22 @@ void opp_decl_const_impl(int dim, int size, char* data, const char* name) {
 
     if (!strcmp(name, "CONST_extents")) {
         cutilSafeCall(hipMemcpyToSymbol(HIP_SYMBOL(CONST_extents_d), data, dim * size));
+        std::memcpy(&CONST_extents, data, (size*dim));
         return;
     }
     if (!strcmp(name, "CONST_dt")) {
         cutilSafeCall(hipMemcpyToSymbol(HIP_SYMBOL(CONST_dt_d), data, dim * size));
+        std::memcpy(&CONST_dt, data, (size*dim));
         return;
     }
     if (!strcmp(name, "CONST_cell_width")) {
         cutilSafeCall(hipMemcpyToSymbol(HIP_SYMBOL(CONST_cell_width_d), data, dim * size));
+        std::memcpy(&CONST_cell_width, data, (size*dim));
         return;
     }
     if (!strcmp(name, "CONST_ndimcells")) {
         cutilSafeCall(hipMemcpyToSymbol(HIP_SYMBOL(CONST_ndimcells_d), data, dim * size));
+        std::memcpy(&CONST_ndimcells, data, (size*dim));
         return;
     }
 
diff --git a/app_neso_advection_cg/hip/verify_kernel_loop.hpp b/app_neso_advection_cg/hip/verify_kernel_loop.hpp
index 7479826d..98b21c40 100644
--- a/app_neso_advection_cg/hip/verify_kernel_loop.hpp
+++ b/app_neso_advection_cg/hip/verify_kernel_loop.hpp
@@ -170,10 +170,10 @@ void opp_par_loop_all__verify_kernel(opp_set set,
         for (int d = 0; d < 1; ++d)
             arg2_host_data[d] += ((OPP_INT *)args[2].data)[b * 1 + d];
     }
-
     args[2].data = (char *)arg2_host_data;
     opp_mpi_reduce(&args[2], arg2_host_data);
 
+
     opp_set_dirtybit_grouped(nargs, args, Device_GPU);
     OPP_DEVICE_SYNCHRONIZE();   
  
diff --git a/app_neso_advection_cg/mpi/move_kernel_loop.hpp b/app_neso_advection_cg/mpi/move_kernel_loop.hpp
index 2d3d3584..3f689927 100644
--- a/app_neso_advection_cg/mpi/move_kernel_loop.hpp
+++ b/app_neso_advection_cg/mpi/move_kernel_loop.hpp
@@ -257,7 +257,7 @@ void opp_init_direct_hop_cg(double grid_spacing, const opp_dat c_gbl_id, const o
         else {
             cellMapper->generateStructuredMeshFromFile(c_gbl_id->set, c_gbl_id);  
         }
-
+        
         opp_profiler->reg("GlbToLocal");
         opp_profiler->reg("GblMv_Move");
         opp_profiler->reg("GblMv_AllMv");
diff --git a/app_neso_advection_cg/seq/move_kernel_loop.hpp b/app_neso_advection_cg/seq/move_kernel_loop.hpp
index 662e1529..ea921f5a 100644
--- a/app_neso_advection_cg/seq/move_kernel_loop.hpp
+++ b/app_neso_advection_cg/seq/move_kernel_loop.hpp
@@ -190,7 +190,7 @@ void opp_init_direct_hop_cg(double grid_spacing, const opp_dat c_gbl_id, const o
         else {
             cellMapper->generateStructuredMeshFromFile(c_gbl_id->set, c_gbl_id);  
         }
-
+        
         opp_profiler->reg("GlbToLocal");
         opp_profiler->reg("GblMv_Move");
         opp_profiler->reg("GblMv_AllMv");
diff --git a/app_simpic/configs/system.param b/app_simpic/configs/system.param
index 86424391..804f88ca 100644
--- a/app_simpic/configs/system.param
+++ b/app_simpic/configs/system.param
@@ -23,7 +23,7 @@ BOOL opp_profile_all        = false
 INT opp_gpu_direct = 0
 BOOL opp_global_move = false
 REAL grid_spacing    = 0.5
-BOOL use_reg_red = false
+BOOL opp_segmented_red = false
 
 # HoleFill_All Sort_All Shuffle_All Sort_Periodic Shuffle_Periodic
 STRING opp_fill = Shuffle_Periodic
diff --git a/app_simpic_cg/configs/system.param b/app_simpic_cg/configs/system.param
index 86424391..804f88ca 100644
--- a/app_simpic_cg/configs/system.param
+++ b/app_simpic_cg/configs/system.param
@@ -23,7 +23,7 @@ BOOL opp_profile_all        = false
 INT opp_gpu_direct = 0
 BOOL opp_global_move = false
 REAL grid_spacing    = 0.5
-BOOL use_reg_red = false
+BOOL opp_segmented_red = false
 
 # HoleFill_All Sort_All Shuffle_All Sort_Periodic Shuffle_Periodic
 STRING opp_fill = Shuffle_Periodic
diff --git a/opp_translator/resources/templates/cpp/hip/loop_host.hpp.jinja b/opp_translator/resources/templates/cpp/hip/loop_host.hpp.jinja
index 38b4279d..7ff1fee1 100644
--- a/opp_translator/resources/templates/cpp/hip/loop_host.hpp.jinja
+++ b/opp_translator/resources/templates/cpp/hip/loop_host.hpp.jinja
@@ -148,6 +148,9 @@ __constant__ OPP_INT opp_k{{kernel_idx}}_sr_set_stride_d;
     {% for arg in lh.args|dat|indirect|reduction|not_already_mapped(lh) if config.seg_red %}
 thrust::device_vector<OPP_INT> sr_dat{{lh.dat(arg).id}}_keys_dv;
 thrust::device_vector<{{lh.dat(arg).typ}}> sr_dat{{lh.dat(arg).id}}_values_dv;
+
+thrust::device_vector<OPP_INT> sr_dat{{lh.dat(arg).id}}_keys_dv2;
+thrust::device_vector<{{lh.dat(arg).typ}}> sr_dat{{lh.dat(arg).id}}_values_dv2;
     {% endfor %}
 {% endblock %}
 
@@ -322,14 +325,21 @@ __global__ void opp_dev_sr_{{lh.kernel}}( // Used for Segmented Reductions
             {% endfor %}
         );
 
-        int offset = 0;
+        {% set counters = namespace(data={}) %}
+        {% set occurrences = namespace(data={}) %}
+        {% for arg in lh.args|dat|indirect|reduction|not_already_mapped(lh) %}
+            {% set _ = counters.data.update({arg.dat_id: 0}) %}
+            {% set x_ = occurrences.data.update({arg.dat_id: 0}) %}
+        {% endfor %}
+        {% for arg in lh.args|dat|indirect|reduction %}
+            {% set _ = counters.data.update({arg.dat_id: counters.data[arg.dat_id] + 1}) %}
+        {% endfor %}
         {% for arg in lh.args_expanded|indirect|reduction if config.atomics %}
-        for (int d = 0; {{opt_hip_cond_comp(arg)}}d < {{lh.dat(arg).dim}}; ++d, ++offset) {
-            {#% for datx in lh.dats|same_iter_set_dat(lh) %#}
-            sr_dat{{arg.dat_id}}_values[n + opp_k{{kernel_idx}}_sr_set_stride_d * offset] = arg{{arg.id}}_{{arg.map_idx}}_local[d];
-            sr_dat{{arg.dat_id}}_keys[n + opp_k{{kernel_idx}}_sr_set_stride_d * offset] = {{map_lookup(arg, kernel_idx)}} + (d{{stride_hip(arg)}});
-            {#% endfor %#}
-        }
+        for (int d = 0; {{opt_hip_cond_comp(arg)}}d < {{lh.dat(arg).dim}}; ++d) {
+            sr_dat{{arg.dat_id}}_values[{{ counters.data[arg.dat_id] }} * n + {{ occurrences.data[arg.dat_id] }} + opp_k{{kernel_idx}}_sr_set_stride_d * d] = arg{{arg.id}}_{{arg.map_idx}}_local[d]; 
+        }    
+        sr_dat{{arg.dat_id}}_keys[{{ counters.data[arg.dat_id] }} * n + {{ occurrences.data[arg.dat_id] }}] = {{map_lookup(arg, kernel_idx)}};
+        {% set y_ = occurrences.data.update({arg.dat_id: occurrences.data[arg.dat_id] + 1}) %}
         {% endfor %}
     }
     {% for arg in lh.args|gbl|reduction %}
@@ -567,7 +577,7 @@ opp_dev_sr_{{lh.kernel}}<<<num_blocks, block_size{{-(", %s" % shared_size) if lh
 
         {% endif %}
     {% if lh is indirect and lh.args|dat|indirect|reduction|length > 0 and config.atomics and config.seg_red %}
-        if (!opp_params->get<OPP_BOOL>("use_reg_red")) // Do atomics ----------       
+        if (!opp_use_segmented_reductions) // Do atomics ----------       
     {% endif %}
     {% if lh is direct or config.atomics %}
         {
@@ -596,8 +606,6 @@ opp_dev_sr_{{lh.kernel}}<<<num_blocks, block_size{{-(", %s" % shared_size) if lh
     {% endif %}
     {% if lh is indirect and lh.args|dat|indirect|reduction|length > 0 and config.seg_red %}
         {
-            opp_mem::dev_copy_to_symbol<OPP_INT>(opp_k{{kernel_idx}}_sr_set_stride_d, &opp_k{{kernel_idx}}_sr_set_stride, &(set->size), 1);
-
         {% for arg in lh.args|dat|indirect|reduction|not_already_mapped(lh) %}
             size_t operating_size_dat{{arg.dat_id}} = 0, resize_size_dat{{arg.dat_id}} = 0;
         {% endfor %}
@@ -608,15 +616,26 @@ opp_dev_sr_{{lh.kernel}}<<<num_blocks, block_size{{-(", %s" % shared_size) if lh
         {% endfor %}
 
         {% for arg in lh.args|dat|indirect|reduction|not_already_mapped(lh) %}
-            operating_size_dat{{arg.dat_id}} *= (size_t)(set->size);
+            operating_size_dat{{arg.dat_id}} *= (size_t)(iter_size);
             resize_size_dat{{arg.dat_id}} *= (size_t)(set->set_capacity);
         {% endfor %}
 
+        {% set counters = namespace(data={}) %}
         {% for arg in lh.args|dat|indirect|reduction|not_already_mapped(lh) %}
-            if (resize_size_dat{{arg.dat_id}} > sr_dat{{arg.dat_id}}_keys_dv.size()) { // resize only if current vector is small        
-                sr_dat{{arg.dat_id}}_keys_dv.resize(resize_size_dat{{arg.dat_id}}, 0);
-                sr_dat{{arg.dat_id}}_values_dv.resize(resize_size_dat{{arg.dat_id}}, 0);
-            }
+            {% set _ = counters.data.update({arg.dat_id: 0}) %}
+        {% endfor %}
+        {% for arg in lh.args|dat|indirect|reduction %}
+            {% set _ = counters.data.update({arg.dat_id: counters.data[arg.dat_id] + 1}) %}
+        {% endfor %}
+            int k{{kernel_idx}}_stride = 0;
+        {% for arg in lh.args|dat|indirect|reduction|not_already_mapped(lh) %}
+            k{{kernel_idx}}_stride += iter_size * {{ counters.data[arg.dat_id] }};
+        {% endfor %}
+            opp_mem::dev_copy_to_symbol<OPP_INT>(opp_k{{kernel_idx}}_sr_set_stride_d, &opp_k{{kernel_idx}}_sr_set_stride, &(k{{kernel_idx}}_stride), 1);
+
+        {% for arg in lh.args|dat|indirect|reduction|not_already_mapped(lh) %}
+            opp_sr::init_arrays<{{lh.dat(arg).typ}}>(args[{{arg.id}}].dat->dim, operating_size_dat{{arg.dat_id}}, resize_size_dat{{arg.dat_id}},
+                        sr_dat{{arg.dat_id}}_keys_dv, sr_dat{{arg.dat_id}}_values_dv, sr_dat{{arg.dat_id}}_keys_dv2, sr_dat{{arg.dat_id}}_values_dv2);
         {% endfor %}
         
             // Create key/value pairs
@@ -627,39 +646,8 @@ opp_dev_sr_{{lh.kernel}}<<<num_blocks, block_size{{-(", %s" % shared_size) if lh
             opp_profiler->end("SR_CrKeyVal");
 
         {% for arg in lh.args|dat|indirect|reduction|not_already_mapped(lh) %}
-            // Sort by keys to bring the identical keys together
-            opp_profiler->start("SR_SortByKey");
-            thrust::sort_by_key(sr_dat{{arg.dat_id}}_keys_dv.begin(), sr_dat{{arg.dat_id}}_keys_dv.begin() + operating_size_dat{{arg.dat_id}}, 
-                sr_dat{{arg.dat_id}}_values_dv.begin());
-            opp_profiler->end("SR_SortByKey");
-
-            // Compute the unique keys and their corresponding values
-            opp_profiler->start("SR_RedByKey");
-            auto new_end = thrust::reduce_by_key(
-                sr_dat{{arg.dat_id}}_keys_dv.begin(), sr_dat{{arg.dat_id}}_keys_dv.begin() + operating_size_dat{{arg.dat_id}},
-                sr_dat{{arg.dat_id}}_values_dv.begin(),
-                sr_dat{{arg.dat_id}}_keys_dv.begin(),
-                sr_dat{{arg.dat_id}}_values_dv.begin());        
-            opp_profiler->end("SR_RedByKey");
-
-            const size_t reduced_size = (new_end.first - sr_dat{{arg.dat_id}}_keys_dv.begin());
-            
-            // Assign reduced values to the nodes using keys/values
-            opp_profiler->start("SR_Assign");                
-            opp_k{{kernel_idx}}::assign_values<<<num_blocks, block_size>>> ( // TODO : check whether num_blocks is correct
-                opp_get_dev_raw_ptr<OPP_INT>(sr_dat{{arg.dat_id}}_keys_dv),
-                opp_get_dev_raw_ptr<OPP_REAL>(sr_dat{{arg.dat_id}}_values_dv),
-                (OPP_REAL *) args[{{arg.dat_id}}].data_d,
-                0, reduced_size);
-            OPP_DEVICE_SYNCHRONIZE();
-            opp_profiler->end("SR_Assign");
-
-            // Last: clear the thrust vectors if this is the last iteration (avoid crash)
-            if (opp_params->get<OPP_INT>("num_steps") == (OPP_main_loop_iter + 1)) {
-                OPP_DEVICE_SYNCHRONIZE();
-                sr_dat{{arg.dat_id}}_values_dv.clear(); sr_dat{{arg.dat_id}}_values_dv.shrink_to_fit();
-                sr_dat{{arg.dat_id}}_keys_dv.clear(); sr_dat{{arg.dat_id}}_keys_dv.shrink_to_fit();
-            }        
+            opp_sr::do_segmented_reductions<{{lh.dat(arg).typ}}>(args[{{arg.id}}], k{{kernel_idx}}_stride,
+                        sr_dat{{arg.dat_id}}_keys_dv, sr_dat{{arg.dat_id}}_values_dv, sr_dat{{arg.dat_id}}_keys_dv2, sr_dat{{arg.dat_id}}_values_dv2);
         {% endfor %}
         }
     {% endif %}
@@ -694,7 +682,6 @@ opp_dev_sr_{{lh.kernel}}<<<num_blocks, block_size{{-(", %s" % shared_size) if lh
         {% endif %}
     }
     {% endfor %}
-
     {% for arg in lh.args|gbl|reduction %}
         {% call opt_if(arg) %}
     args[{{arg.id}}].data = (char *)arg{{arg.id}}_host_data;
@@ -702,6 +689,10 @@ opp_dev_sr_{{lh.kernel}}<<<num_blocks, block_size{{-(", %s" % shared_size) if lh
         {% endcall %}
 
     {% endfor %}
+    {% for arg in lh.args|dat|indirect|reduction|not_already_mapped(lh) %}
+    opp_sr::clear_arrays<{{lh.dat(arg).typ}}>(sr_dat{{arg.dat_id}}_keys_dv, sr_dat{{arg.dat_id}}_values_dv, sr_dat{{arg.dat_id}}_keys_dv2, sr_dat{{arg.dat_id}}_values_dv2);
+    {% endfor %}
+
     opp_set_dirtybit_grouped(nargs, args, Device_GPU);
     OPP_DEVICE_SYNCHRONIZE();   
     {% if lh is double_indirect_reduc %}
diff --git a/opp_translator/resources/templates/cpp/hip/master_kernel.cpp.jinja b/opp_translator/resources/templates/cpp/hip/master_kernel.cpp.jinja
index 33342e0d..17732786 100644
--- a/opp_translator/resources/templates/cpp/hip/master_kernel.cpp.jinja
+++ b/opp_translator/resources/templates/cpp/hip/master_kernel.cpp.jinja
@@ -10,6 +10,10 @@
 {% endblock %}
 
 {% block const_decls %}
+    {% for const in app.consts() %}
+{{const.typ}} {{const.ptr}}[{{const.dim}}];
+    {% endfor %}
+
     {% for const in app.consts() %}
 __constant__ {{const.typ}} {{const.ptr}}_d[{{const.dim}}];
     {% endfor %}
@@ -18,5 +22,6 @@ __constant__ {{const.typ}} {{const.ptr}}_d[{{const.dim}}];
 {% block const_decl_func %}
     {% call(const) const_decl_func("") %}
 cutilSafeCall(hipMemcpyToSymbol(HIP_SYMBOL({{const.ptr}}_d), data, dim * size));
+        std::memcpy(&{{const.ptr}}, data, (size*dim));
     {%- endcall %}
 {% endblock %}
\ No newline at end of file
diff --git a/opp_translator/resources/templates/cpp/hip/move_loop_host.hpp.jinja b/opp_translator/resources/templates/cpp/hip/move_loop_host.hpp.jinja
index 4d7b99d2..2d6ad1ce 100644
--- a/opp_translator/resources/templates/cpp/hip/move_loop_host.hpp.jinja
+++ b/opp_translator/resources/templates/cpp/hip/move_loop_host.hpp.jinja
@@ -3,98 +3,19 @@
 {% block kernel %}
 
 namespace opp_k{{kernel_idx}} {
-{{kernel_func}}
-{% if lh is indirect and lh.args|dat|indirect|reduction|length > 0 and config.seg_red %}
-// Segmented Reductions Routines 
-// --------------------------------------------------------------
-__global__ void assign_values( 
-    const OPP_INT *__restrict keys,
-    const OPP_REAL *__restrict values,
-    OPP_REAL *__restrict dat,
-    const int start,
-    const int end) 
-{
-    const int tid = threadIdx.x + blockIdx.x * blockDim.x;
-
-    if (tid + start < end) 
-    {
-        const int n = tid + start;
-        const int mapping = keys[n];  
-        dat[mapping] += values[n];
-    }
-}
-
-//--------------------------------------------------------------
-__global__ void sequence_OPP_INT_values( 
-    OPP_INT *__restrict values,
-    const int start,
-    const int end) 
-{
-    const int tid = threadIdx.x + blockIdx.x * blockDim.x;
-
-    if (tid + start < end) 
-    {
-        const int n = tid + start;
-        values[n] = n; 
-    }
-}
-
-//--------------------------------------------------------------
-__global__ void reset_OPP_INT_values( 
-    OPP_INT *__restrict values,
-    const int start,
-    const int end) 
-{
-    const int tid = threadIdx.x + blockIdx.x * blockDim.x;
-
-    if (tid + start < end) 
-    {
-        const int n = tid + start;
-        values[n] = 0; 
-    }
-}
-
-//--------------------------------------------------------------
-__global__ void reset_OPP_REAL_values( 
-    OPP_REAL *__restrict values,
-    const int start,
-    const int end) 
-{
-    const int tid = threadIdx.x + blockIdx.x * blockDim.x;
-
-    if (tid + start < end) 
-    {
-        const int n = tid + start;
-        values[n] = 0.0; 
-    }
-}
-
-//--------------------------------------------------------------
-__global__ void assign_values_by_key( 
-    const OPP_INT *__restrict indices,
-    const OPP_REAL *__restrict values_in,
-    OPP_REAL *__restrict values_out,
-    const int start, const int end, const int dim) 
-{
-    const int tid = threadIdx.x + blockIdx.x * blockDim.x;
-
-    if (tid + start < end) 
-    {
-        const int n = tid + start;
-        const int idx = indices[n];
+{% if lh.dh_loop_required %}
 
-        for (int d = 0; d < dim; d++) {
-            values_out[n + d * opp_k2_sr_set_stride_d] = values_in[idx + d * opp_k2_sr_set_stride_d];
-        }
-    }
+namespace host {
+{{host_kernel_func}}
 }
 
 {% endif %}
+{{kernel_func}}
 }
 
 {% endblock %}
 
-{% macro stride_hip(arg) -%}
+{% macro stride_device(arg) -%}
 {{-" * opp_k%s_dat%s_stride_d" % (kernel_idx, arg.dat_id) if lh.dat(arg) is soa-}}
 {%- endmacro %}
 
@@ -120,21 +41,21 @@ __global__ void assign_values_by_key(
     {% endif %}
 {% endmacro %}
 
-{% macro opt_hip_cond(arg) %}
+{% macro opt_device_cond(arg) %}
     {%- if arg is opt -%}optflags & 1 << {{lh.optIdx(arg)}}{%- endif -%}
 {% endmacro %}
 
-{% macro opt_hip_cond_comp(arg) %}
-    {%- if arg is opt -%}{{opt_hip_cond(arg)}} && {% endif -%}
+{% macro opt_device_cond_comp(arg) %}
+    {%- if arg is opt -%}{{opt_device_cond(arg)}} && {% endif -%}
 {% endmacro %}
 
-{% macro opt_hip_tern(arg, alt = "NULL") %}
-    {%- if arg is opt -%}{{opt_hip_cond(arg)}} ? {{caller()}} : {{alt}}{%- else -%}{{caller()}}{%- endif -%}
+{% macro opt_device_tern(arg, alt = "NULL") %}
+    {%- if arg is opt -%}{{opt_device_cond(arg)}} ? {{caller()}} : {{alt}}{%- else -%}{{caller()}}{%- endif -%}
 {% endmacro %}
 
-{% macro opt_hip_if(arg) %}
+{% macro opt_device_if(arg) %}
     {% if arg is opt %}
-    if ({{opt_hip_cond(arg)}}) {
+    if ({{opt_device_cond(arg)}}) {
     {{caller()|indent-}}
     {{"}"|indent(first = true)}}
     {% else %}
@@ -155,7 +76,7 @@ map{{arg.map_id}}[opp_k{{kernel_idx}}_map{{lh.map(arg).id}}_stride_d * {{arg.map
 {%- endif -%}    
 {%- endmacro %}
 
-{% macro arg_to_pointer_hip(arg) -%}
+{% macro arg_to_pointer_device(arg) -%}
 
     {%- if arg is gbl %}
         {% if arg is reduction %}
@@ -195,6 +116,31 @@ gbl{{arg.id}}
     {%- endif -%}
 {%- endmacro %}
 
+{%- macro arg_to_pointer_dh(arg, lh) %}
+    {%- if arg.id == 0 -%}
+            (const OPP_REAL*)&point,
+    {%- elif arg is gbl %}
+        {%- set cast = arg.typ -%}
+        ({{cast}} *)args[{{arg.id}}].data{%-if arg.id+1 != lh.args|length-%},{%-endif-%}
+    {%- else -%}
+
+        {%- if arg is direct -%}
+            {%- set offset = "" -%}
+        {%- elif arg is double_indirect -%}
+            {%- set offset = " + (map%s[%d] * %d)" % (arg.map_id, arg.map_idx, lh.dat(arg).dim) -%}
+        {%- elif arg is indirect -%}
+            {%- set offset = " + (temp_ci * %d)" % (lh.dat(arg).dim) -%}
+        {%- endif -%}
+
+        {# 0 is OP.AccessType.READ #}
+        {%- if arg.access_type.value == 0 -%} 
+            (const {{lh.dat(arg).typ}} *)args[{{arg.id}}].data{{offset}}{%-if arg.id+1 != lh.args|length-%},{%-endif%} // {% if arg is dat -%}{{lh.dat(arg).ptr}} {%- endif -%} | OPP_{{arg.access_type.name}}
+        {%- else -%}
+            arg{{arg.id}}_temp{%-if arg.id+1 != lh.args|length-%},{%-endif%} // {% if arg is dat -%}{{lh.dat(arg).ptr}} {%- endif -%} | OPP_{{arg.access_type.name}}
+        {%- endif -%}
+    {%- endif -%}
+{%- endmacro -%}
+
 {% block prologue %}
 {{super()}}
     {% for dat in lh.dats|soa %}
@@ -234,7 +180,11 @@ thrust::device_vector<{{lh.dat(arg).typ}}> sr_dat{{lh.dat(arg).id}}_values_dv2;
 __global__ void opp_dev_{{lh.kernel}}(
     {{-"\n    const unsigned optflags," if lh.args|opt|length > 0}}
     {% for dat in lh.dats %}
+        {% if dat is indirect_reduction(lh) and lh is particle_loop %}
+    {{dat.typ}} **__restrict__ dat{{dat.id}}, const OPP_INT dat{{dat.id}}_arr_count,     // {{dat.ptr}}
+        {% else %}
     {{"const " if dat is read_in(lh)}}{{dat.typ}} *__restrict__ dat{{dat.id}},     // {{dat.ptr}}
+        {% endif %}
     {% endfor %}
     OPP_INT *__restrict__ p2c_map,
     const OPP_INT *__restrict__ c2c_map,
@@ -244,6 +194,9 @@ __global__ void opp_dev_{{lh.kernel}}(
     {% for arg in lh.args|gbl %}
     {{"const " if arg.access_type == OP.AccessType.Read}}{{arg.typ}} *gbl{{arg.id}},
     {% endfor %}
+    {% if lh is injected_loop %}
+    const OPP_INT inj_start,
+    {% endif %}
     OPP_INT *__restrict__ particle_remove_count,
     OPP_INT *__restrict__ particle_remove_indices,
     OPP_INT *__restrict__ move_particle_indices,
@@ -255,7 +208,7 @@ __global__ void opp_dev_{{lh.kernel}}(
 {
     {% for arg in lh.args|gbl|reduction %}
     {{arg.typ}} gbl{{arg.id}}_local[{{arg.dim}}];
-    for (int d = 0; {{opt_hip_cond_comp(arg)}}d < {{arg.dim}}; ++d)
+    for (int d = 0; {{opt_device_cond_comp(arg)}}d < {{arg.dim}}; ++d)
         gbl{{arg.id}}_local[d] = {% if arg is inc -%}
             {{arg.typ}}_ZERO
         {%- else -%}
@@ -263,13 +216,17 @@ __global__ void opp_dev_{{lh.kernel}}(
         {%- endif -%};
 
     {% endfor %}
-    const int thread_id = threadIdx.x + blockIdx.x * blockDim.x;
-
-    if (thread_id + start < end) {
+    const int n = OPP_DEVICE_GLOBAL_LINEAR_ID + start;
 
-        const int n = {{"thread_id + start" if config.atomics else "col_reord[thread_id + start]"}};
+    if (n < end) {
 
         OPP_INT *opp_p2c = (p2c_map + n);
+    {% if lh.dh_loop_required %}
+        if (opp_p2c[0] == MAX_CELL_INDEX) {
+            return;
+        }
+    {% endif %}
+
         char move_flag = OPP_NEED_MOVE;
         bool iter_one_flag = (OPP_comm_iteration_d > 0) ? false : true;
 
@@ -281,13 +238,18 @@ __global__ void opp_dev_{{lh.kernel}}(
             {% endif %}
 
         {% endfor %}
-        do
-        {
+        {% for dat in lh.dats if config.atomics and lh is particle_loop %}
+            {% if dat is indirect_reduction(lh) %}
+        {{dat.typ}}* tmp{{dat.id}} = dat{{dat.id}}[threadIdx.x % dat{{dat.id}}_arr_count];
+
+            {% endif %}
+        {% endfor %}
+        do {
             const OPP_INT p2c = opp_p2c[0]; // get the value here, since the kernel might change it
             const OPP_INT* opp_c2c = c2c_map + p2c;           
 
         {% for arg in lh.args_expanded|dat|indirect|reduction if config.atomics %}
-            for (int d = 0; {{opt_hip_cond_comp(arg)}}d < {{lh.dat(arg).dim}}; ++d)
+            for (int d = 0; {{opt_device_cond_comp(arg)}}d < {{lh.dat(arg).dim}}; ++d)
             {% if arg is p2c_mapped and not double_indirect %}
                 arg{{arg.id}}_p2c_local[d] = {{lh.dat(arg).typ}}_ZERO;
             {% else %}
@@ -298,16 +260,21 @@ __global__ void opp_dev_{{lh.kernel}}(
             opp_k{{kernel_idx}}::{{lh.kernel}}(
                 move_flag, iter_one_flag, opp_c2c, opp_p2c,
                 {% for arg in lh.args %}
-                {%+ call opt_hip_tern(arg) %}{{arg_to_pointer_hip(arg)}}{% endcall %}{{"," if not loop.last}} // {% if arg is dat %}{{lh.dat(arg).ptr}} {% endif +%}
+                {%+ call opt_device_tern(arg) %}{{arg_to_pointer_device(arg)}}{% endcall %}{{"," if not loop.last}} // {% if arg is dat %}{{lh.dat(arg).ptr}} {% endif +%}
                 {% endfor %}          
             );
 
+        {% if lh is particle_loop and config.atomics %}
+        {% set dat_name = "tmp" -%}
+        {% else %}
+        {% set dat_name = "dat" -%}
+        {% endif %}
         {% for arg in lh.args_expanded|dat|indirect|reduction if config.atomics %}
-            for (int d = 0; {{opt_hip_cond_comp(arg)}}d < {{lh.dat(arg).dim}}; ++d)
+            for (int d = 0; {{opt_device_cond_comp(arg)}}d < {{lh.dat(arg).dim}}; ++d)
             {% if arg is p2c_mapped and not double_indirect %}
-                atomicAdd(dat{{arg.dat_id}} + {{map_lookup(arg, kernel_idx)}} + (d{{stride_hip(arg)}}), arg{{arg.id}}_p2c_local[d]);
+                atomicAdd({{dat_name}}{{arg.dat_id}} + {{map_lookup(arg, kernel_idx)}} + (d{{stride_device(arg)}}), arg{{arg.id}}_p2c_local[d]);
             {% else %}
-                atomicAdd(dat{{arg.dat_id}} + {{map_lookup(arg, kernel_idx)}} + (d{{stride_hip(arg)}}), arg{{arg.id}}_{{arg.map_idx}}_local[d]); // TODO: this looks incorrect
+                atomicAdd({{dat_name}}{{arg.dat_id}} + {{map_lookup(arg, kernel_idx)}} + (d{{stride_device(arg)}}), arg{{arg.id}}_{{arg.map_idx}}_local[d]); // TODO: this looks incorrect
             {% endif %}        
         {% endfor %}
         } while (opp_part_check_status_device(move_flag, iter_one_flag, opp_p2c, n, 
@@ -316,7 +283,7 @@ __global__ void opp_dev_{{lh.kernel}}(
     }
     {% for arg in lh.args|gbl|reduction %}
 
-    for (int d = 0; {{opt_hip_cond_comp(arg)}}d < {{arg.dim}}; ++d)
+    for (int d = 0; {{opt_device_cond_comp(arg)}}d < {{arg.dim}}; ++d)
         opp_reduction<OPP_{{arg.access_type.name}}>(gbl{{arg.id}} + blockIdx.x * {{arg.dim}} + d, gbl{{arg.id}}_local[d]);
     {% endfor %}
     {{ caller() }}
@@ -351,16 +318,13 @@ __global__ void opp_dev_sr_{{lh.kernel}}( // Used for Segmented Reductions
     const OPP_INT end
 ) 
 {
-    const int thread_id = threadIdx.x + blockIdx.x * blockDim.x;
-
-    if (thread_id + start < end) {
+    const int n = OPP_DEVICE_GLOBAL_LINEAR_ID + start;
 
-        const int n = thread_id + start;
+    if (n < end) {
 
         OPP_INT *opp_p2c = (p2c_map + n);
         char move_flag = OPP_NEED_MOVE;
         bool iter_one_flag = (OPP_comm_iteration_d > 0) ? false : true;
-        bool on_old_cell = true;
 
         {% for arg in lh.args_expanded|dat|indirect|reduction if config.seg_red %}
             {% if arg is p2c_mapped and not double_indirect %}
@@ -370,13 +334,12 @@ __global__ void opp_dev_sr_{{lh.kernel}}( // Used for Segmented Reductions
             {% endif %}
 
         {% endfor %}
-        do
-        {
+        do {
             const OPP_INT p2c = opp_p2c[0]; // get the value here, since the kernel might change it
             const OPP_INT* opp_c2c = c2c_map + p2c;           
 
         {% for arg in lh.args_expanded|dat|indirect|reduction if config.seg_red %}
-            for (int d = 0; {{opt_hip_cond_comp(arg)}}d < {{lh.dat(arg).dim}}; ++d)
+            for (int d = 0; {{opt_device_cond_comp(arg)}}d < {{lh.dat(arg).dim}}; ++d)
             {% if arg is p2c_mapped and not double_indirect %}
                 arg{{arg.id}}_p2c_local[d] = {{lh.dat(arg).typ}}_ZERO;
             {% else %}
@@ -387,30 +350,25 @@ __global__ void opp_dev_sr_{{lh.kernel}}( // Used for Segmented Reductions
             opp_k{{kernel_idx}}::{{lh.kernel}}(
                 move_flag, iter_one_flag, opp_c2c, opp_p2c,
                 {% for arg in lh.args %}
-                {%+ call opt_hip_tern(arg) %}{{arg_to_pointer_hip(arg)}}{% endcall %}{{"," if not loop.last}} // {% if arg is dat %}{{lh.dat(arg).ptr}} {% endif +%}
+                {%+ call opt_device_tern(arg) %}{{arg_to_pointer_device(arg)}}{% endcall %}{{"," if not loop.last}} // {% if arg is dat %}{{lh.dat(arg).ptr}} {% endif +%}
                 {% endfor %}          
             );
 
         {% for arg in lh.args_expanded|dat|indirect|reduction if config.seg_red %}
-            if (on_old_cell)
-            {
-                int offset = 0;
-                for (int d = 0; d < {{lh.dat(arg).dim}}; ++d, ++offset) {
-                    sr_dat{{arg.dat_id}}_values[n + opp_k{{kernel_idx}}_sr_set_stride_d * offset] = arg{{arg.dat_id}}_p2c_local[d];              
+            if (iter_one_flag) {
+                for (int d = 0; d < {{lh.dat(arg).dim}}; ++d) {
+                    sr_dat{{arg.dat_id}}_values[n + opp_k{{kernel_idx}}_sr_set_stride_d * d] = arg{{arg.dat_id}}_p2c_local[d];              
                 }
                 sr_dat{{arg.dat_id}}_keys[n] = p2c; // TODO : Generate for double indirections too!
             }
-            else
-            {
-                for (int d = 0; {{opt_hip_cond_comp(arg)}}d < {{lh.dat(arg).dim}}; ++d)
+            else {
+                for (int d = 0; {{opt_device_cond_comp(arg)}}d < {{lh.dat(arg).dim}}; ++d)
             {% if arg is p2c_mapped and not double_indirect %}
-                    atomicAdd(dat{{arg.dat_id}} + {{map_lookup(arg, kernel_idx)}} + (d{{stride_hip(arg)}}), arg{{arg.id}}_p2c_local[d]);
+                    atomicAdd(dat{{arg.dat_id}} + {{map_lookup(arg, kernel_idx)}} + (d{{stride_device(arg)}}), arg{{arg.id}}_p2c_local[d]);
             {% else %}
-                    atomicAdd(dat{{arg.dat_id}} + {{map_lookup(arg, kernel_idx)}} + (d{{stride_hip(arg)}}), arg{{arg.id}}_{{arg.map_idx}}_local[d]); // TODO: this looks incorrect
+                    atomicAdd(dat{{arg.dat_id}} + {{map_lookup(arg, kernel_idx)}} + (d{{stride_device(arg)}}), arg{{arg.id}}_{{arg.map_idx}}_local[d]); // TODO: this looks incorrect
             {% endif %}
             }
-
-            on_old_cell = false;
         
         {% endfor %}
         } while (opp_part_check_status_device(move_flag, iter_one_flag, opp_p2c, n, 
@@ -428,8 +386,8 @@ __global__ void opp_dev_sr_{{lh.kernel}}( // Used for Segmented Reductions
         {% endcall %}
     {% endif %}
 
-//--------------------------------------------------------------
     {% if lh is indirect and lh.args|dat|indirect|reduction|length > 0 and config.seg_red %}
+//--------------------------------------------------------------
         {% call seg_red_kernel_wrapper() %}
         {% endcall %}
     {% endif %}
@@ -460,7 +418,7 @@ void opp_particle_move__{{lh.kernel}}(opp_set set, opp_map c2c_map, opp_map p2c_
     {% endfor %}
     args[{{lh.args|length}}] = opp_arg_dat(p2c_map->p2c_dat, OPP_RW); // required to make dirty or should manually make it dirty
 
-    const int iter_size = opp_mpi_halo_exchanges_grouped(set, nargs, args, Device_GPU);
+    opp_mpi_halo_exchanges_grouped(set, nargs, args, Device_GPU);
     {% if lh is double_indirect_reduc %}
 
 #ifdef USE_MPI
@@ -474,16 +432,6 @@ void opp_particle_move__{{lh.kernel}}(opp_set set, opp_map c2c_map, opp_map p2c_
     opp_mem::dev_copy_to_symbol<OPP_INT>(opp_k{{kernel_idx}}_c2c_map_stride_d, &opp_k{{kernel_idx}}_c2c_map_stride, &c2c_stride, 1);
 
     opp_mpi_halo_wait_all(nargs, args);
-    {% if lh.args|opt|length > 0 %}
-    unsigned optflags = 0;
-
-    {% for arg in lh.args|opt %}
-        {% call opt_if(arg) %}
-    optflags |= 1 << {{lh.optIdx(arg)}};
-        {% endcall %}
-
-    {% endfor %}
-    {% endif %}
     {% for arg in lh.args|gbl %}
     {{arg.typ}} *arg{{arg.id}}_host_data = ({{arg.typ}} *)args[{{arg.id}}].data;{{"\n" if loop.last}}
     {% endfor %}
@@ -523,69 +471,18 @@ void opp_particle_move__{{lh.kernel}}(opp_set set, opp_map c2c_map, opp_map p2c_
 
     int num_blocks = 200;
 
-    do 
-    {
-        {% for dat in lh.dats|soa %}
-        opp_mem::dev_copy_to_symbol<OPP_INT>(opp_k{{kernel_idx}}_dat{{dat.id}}_stride_d, &opp_k{{kernel_idx}}_dat{{dat.id}}_stride, &(args[{{dat.arg_id}}].dat->set->set_capacity), 1);
-        {% endfor %}
-        {% for map in lh.maps %}
-        opp_mem::dev_copy_to_symbol<OPP_INT>(opp_k{{kernel_idx}}_map{{map.id}}_stride_d, &opp_k{{kernel_idx}}_map{{map.id}}_stride, &(args[{{map.arg_id}}].size), 1);
-        {% endfor %}
-
-        opp_init_particle_move(set, nargs, args);
-        opp_mem::dev_copy_to_symbol<OPP_INT>(OPP_comm_iteration_d, &OPP_comm_iteration, 1);
-
-    {% if lh.args|gbl|reduction|length > 0 %}
-        {% if lh is direct %}
-    int max_blocks = num_blocks;
-        {% elif config.atomics %}
-    int max_blocks = (MAX(set->core_size, set->size + set->exec_size - set->core_size) - 1) / block_size + 1;
-        {% else %}
-    int max_blocks = 0;
-    for (int col = 0; col < plan->ncolors; ++col)
-        max_blocks = MAX(max_blocks, plan->ncolblk[col]);
-        {% endif %}
-
-    int reduction_bytes = 0;
-    int reduction_size = 0;
-
-        {% for arg in lh.args|gbl|reduction %}
-            {% call opt_if(arg) %}
-    reduction_bytes += ROUND_UP(max_blocks * {{arg.dim}} * sizeof({{arg.typ}}));
-    reduction_size   = MAX(reduction_size, sizeof({{arg.typ}}));
-            {% endcall %}
-        {% endfor %}
-
-    opp_reallocReductArrays(reduction_bytes);
-    reduction_bytes = 0;
-
-        {% for arg in lh.args|gbl|reduction %}
-            {% call opt_if(arg) %}
-    args[{{arg.id}}].data   = OPP_reduct_h + reduction_bytes;
-    args[{{arg.id}}].data_d = OPP_reduct_d + reduction_bytes;
-
-    for (int b = 0; b < max_blocks; ++b) {
-        for (int d = 0; d < {{arg.dim}}; ++d)
-            (({{arg.typ}} *)args[{{arg.id}}].data)[b * {{arg.dim}} + d] = {% if arg.access_type == OP.AccessType.INC -%}
-                {{arg.typ}}_ZERO
-            {%- else -%}
-                arg{{arg.id}}_host_data[d]
-            {%- endif %};
-    }
-
-    reduction_bytes += ROUND_UP(max_blocks * {{arg.dim}} * sizeof({{arg.typ}}));
-            {% endcall %}
-        {% endfor %}
-
-    opp_mvReductArraysToDevice(reduction_bytes);
-    {% endif %}
+    opp_init_particle_move(set, nargs, args);
 {% endblock %}
 
 {% macro kernel_call(shared_size) %}
 opp_dev_{{lh.kernel}}<<<num_blocks, block_size
 {{-(", %s" % shared_size) if lh.args|gbl|reduction|length > 0}}>>>(
     {% for dat in lh.dats %}
+        {% if dat is indirect_reduction(lh) and lh is particle_loop %}
+    ({{dat.typ}}**)args[{{dat.arg_id}}].dat->thread_data_d, array_count,    // {{dat.ptr}}
+        {% else %}
     ({{dat.typ}} *)args[{{dat.arg_id}}].data_d,    // {{dat.ptr}}
+        {% endif %}
     {% endfor %}
     (OPP_INT *)args[{{lh.args|length}}].data_d,    // p2c_map
     (OPP_INT *)c2c_map->map_d,    // c2c_map
@@ -636,144 +533,145 @@ opp_dev_sr_{{lh.kernel}}<<<num_blocks, block_size{{-(", %s" % shared_size) if lh
 );
 {%- endmacro %}
 
-{% block host_loop %}
-    {% if lh.args|gbl|reduction|length <= 0 %}
+{%- macro dh_check_for_global_move() %}
+    if (useGlobalMove) {
+           
+#ifdef USE_MPI         
+        globalMover->initGlobalMove();
+        opp_init_dh_device(set);
+#endif
+        opp_profiler->start("GblMv_Move");
+
+        opp_mem::dev_copy_to_symbol<OPP_INT>(cellMapper_pos_stride_d, &cellMapper_pos_stride, &(args[0].dat->set->set_capacity), 1);
+        opp_mem::dev_copy_to_symbol<OPP_INT>(OPP_rank_d, &OPP_rank, 1);
+
+        hipError_t err1 = hipMemcpyToSymbol(HIP_SYMBOL(opp_minSavedDHGrid_d), opp_minSavedDHGrid, 3 * sizeof(size_t));
+        if (err1 != hipSuccess) {
+            throw std::runtime_error(std::string("dev_copy_to_symbol: ") + hipGetErrorString(err1));
+        }
+        hipError_t err2 = hipMemcpyToSymbol(HIP_SYMBOL(opp_maxSavedDHGrid_d), opp_maxSavedDHGrid, 3 * sizeof(size_t));
+        if (err2 != hipSuccess) {
+            throw std::runtime_error(std::string("dev_copy_to_symbol: ") + hipGetErrorString(err2));
+        }
+
+        // check whether particles need to be moved via the global move routine
         num_blocks = (OPP_iter_end - OPP_iter_start - 1) / block_size + 1;
-    {% endif %}
+        opp_dev_checkForGlobalMove{{lh.dat(lh.args[0]).dim}}D_kernel<<<num_blocks, block_size>>>(
+            (OPP_REAL*)args[0].data_d,    // {{lh.dat(lh.args[0]).ptr}} 
+            (OPP_INT *)args[{{lh.args|length}}].data_d,    // p2c_map
+            cellMapper->structMeshToCellMapping_d, 
+            cellMapper->structMeshToRankMapping_d,
+            cellMapper->oneOverGridSpacing_d, 
+            cellMapper->minGlbCoordinate_d, 
+            cellMapper->globalGridDims_d, 
+            cellMapper->globalGridSize_d,
+            set->particle_remove_count_d, 
+            OPP_remove_particle_indices_d, 
+            dh_indices_d.part_indices, 
+            dh_indices_d.cell_indices, 
+            dh_indices_d.rank_indices, 
+            dh_indices_d.move_count,
+            OPP_iter_start, OPP_iter_end
+        );
+        OPP_DEVICE_SYNCHRONIZE();
+
+        opp_profiler->end("GblMv_Move");
+
+#ifdef USE_MPI 
+        opp_gather_dh_move_indices(set);
+        globalMover->communicate(set);
+#endif
+    }
+{%- endmacro %}
 
-    {% if lh is indirect and lh.args|dat|indirect|reduction|length > 0 and config.atomics and config.seg_red %}
-        if (!opp_params->get<OPP_BOOL>("use_reg_red")) // Do atomics ----------       
-    {% endif %}
-    {% if lh is direct or config.atomics %}
-        {
+{%- macro dh_finalize() %}
+#ifdef USE_MPI 
+    // ----------------------------------------------------------------------------
+    // finalize the global move routine and iterate over newly added particles and check whether they need neighbour comm
+    if (useGlobalMove) { 
+        
+        opp_profiler->start("GblMv_finalize");
+        const int finalized = globalMover->finalize(set);
+        opp_profiler->end("GblMv_finalize");
+
+        if (finalized > 0) {
+            opp_profiler->start("GblMv_AllMv");
+
+            // need to change arg data since particle resize in globalMover::finalize could change the pointer in dat->data 
+            for (int i = 0; i < nargs; i++)
+                if (args[i].argtype == OPP_ARG_DAT && args[i].dat->set->is_particle)
+                    args[i].data_d = args[i].dat->data_d;
+
+            // check whether the new particle is within cell, and if not move between cells within the MPI rank, 
+            // mark for neighbour comm. Do only for the globally moved particles 
+            const int start2 = (set->size - set->diff);
+            const int end2 = set->size;
+            num_blocks = (end2 - start2 - 1) / block_size + 1;
+
+    {% for dat in lh.dats|soa %}
+            opp_mem::dev_copy_to_symbol<OPP_INT>(opp_k{{kernel_idx}}_dat{{dat.id}}_stride_d, &opp_k{{kernel_idx}}_dat{{dat.id}}_stride, &(args[{{dat.arg_id}}].dat->set->set_capacity), 1);
+    {% endfor %}
+    {% for map in lh.maps %}
+            opp_mem::dev_copy_to_symbol<OPP_INT>(opp_k{{kernel_idx}}_map{{map.id}}_stride_d, &opp_k{{kernel_idx}}_map{{map.id}}_stride, &(args[{{map.arg_id}}].size), 1);
+    {% endfor %}
+
+            opp_profiler->start("move_kernel_only");      
             {{kernel_call("(reduction_size * block_size)",
-                "OPP_iter_start", "OPP_iter_end")|indent(12)}}
+                    "start2", "end2")|indent(12)}}
+            OPP_DEVICE_SYNCHRONIZE(); 
+            opp_profiler->end("move_kernel_only");
+
+            opp_profiler->end("GblMv_AllMv");
         }
-    {% endif %}
-    {% if lh is indirect and lh.args|dat|indirect|reduction|length > 0 and config.atomics and config.seg_red %}     
-        else // Do segmented reductions ----------       
-    {% endif %}
-    {% if lh is indirect and lh.args|dat|indirect|reduction|length > 0 and config.seg_red %}
-        {
-            opp_mem::dev_copy_to_symbol<OPP_INT>(opp_k{{kernel_idx}}_sr_set_stride_d, &opp_k{{kernel_idx}}_sr_set_stride, &set->size, 1);
+    }
+#endif
+{%- endmacro %}
 
-        {% for arg in lh.args|dat|indirect|reduction|not_already_mapped(lh) %}
-            size_t operating_size_dat{{arg.dat_id}} = 0, resize_size_dat{{arg.dat_id}} = 0;
-        {% endfor %}
 
-        {% for arg in lh.args|dat|indirect|reduction %}
-            operating_size_dat{{arg.dat_id}} += (size_t)1;
-            resize_size_dat{{arg.dat_id}} += (size_t)1;
-        {% endfor %}
+{%- macro generate_storage_for_global_reductions() -%}
+    {% if lh.args|gbl|reduction|length > 0 %}
+        {% if lh is direct %}
+    int max_blocks = num_blocks;
+        {% elif config.atomics %}
+    int max_blocks = (MAX(set->core_size, set->size + set->exec_size - set->core_size) - 1) / block_size + 1;
+        {% endif %}
 
-        {% for arg in lh.args|dat|indirect|reduction|not_already_mapped(lh) %}
-            operating_size_dat{{arg.dat_id}} *= (size_t)(set->size);
-            resize_size_dat{{arg.dat_id}} *= (size_t)(set->set_capacity);
-        {% endfor %}
+    int reduction_bytes = 0;
+    int reduction_size = 0;
 
-            // Resize the key/value device arrays only if current vector is small
-            opp_profiler->start("SRM_Resize");
-        {% for arg in lh.args|dat|indirect|reduction|not_already_mapped(lh) %}
-            if (resize_size_dat{{arg.dat_id}} > sr_dat{{arg.dat_id}}_keys_dv.size()) {
-                sr_dat{{arg.dat_id}}_keys_dv.resize(resize_size_dat{{arg.dat_id}}, 0);
-                sr_dat{{arg.dat_id}}_keys_dv2.resize(resize_size_dat{{arg.dat_id}}, 0);
-                sr_dat{{arg.dat_id}}_values_dv.resize(resize_size_dat{{arg.dat_id}} * (args[{{arg.id}}].dat->dim), 0);
-                sr_dat{{arg.dat_id}}_values_dv2.resize(resize_size_dat{{arg.dat_id}} * (args[{{arg.id}}].dat->dim), 0);
-            }
+        {% for arg in lh.args|gbl|reduction %}
+            {% call opt_if(arg) %}
+    reduction_bytes += ROUND_UP(max_blocks * {{arg.dim}} * sizeof({{arg.typ}}));
+    reduction_size   = MAX(reduction_size, sizeof({{arg.typ}}));
+            {% endcall %}
         {% endfor %}
-            opp_profiler->end("SRM_Resize");
 
-            // Reset the key/value device arrays
-            opp_profiler->start("SRM_Init");
-        {% for arg in lh.args|dat|indirect|reduction|not_already_mapped(lh) %}
-            opp_k{{kernel_idx}}::reset_OPP_INT_values<<<num_blocks, block_size>>>(
-                opp_get_dev_raw_ptr<OPP_INT>(sr_dat{{arg.dat_id}}_keys_dv), 0, sr_dat{{arg.dat_id}}_keys_dv.size());
-            opp_k{{kernel_idx}}::sequence_OPP_INT_values<<<num_blocks, block_size>>>(
-                opp_get_dev_raw_ptr<OPP_INT>(sr_dat{{arg.dat_id}}_keys_dv2), 0, sr_dat{{arg.dat_id}}_keys_dv2.size());
-            
-            const int num_blocks2 = (sr_dat{{arg.dat_id}}_values_dv.size() - 1) / block_size + 1;
-            opp_k{{kernel_idx}}::reset_{{lh.dat(arg).typ}}_values<<<num_blocks2, block_size>>>(
-                opp_get_dev_raw_ptr<{{lh.dat(arg).typ}}>(sr_dat{{arg.dat_id}}_values_dv), 0, sr_dat{{arg.dat_id}}_values_dv.size());
-            // opp_k{{kernel_idx}}::reset_{{lh.dat(arg).typ}}_values<<<num_blocks2, block_size>>>(
-            //     opp_get_dev_raw_ptr<{{lh.dat(arg).typ}}>(sr_dat{{arg.dat_id}}_values_dv2), 0, sr_dat{{arg.dat_id}}_values_dv2.size());
-            OPP_DEVICE_SYNCHRONIZE();
-        {% endfor %}
-            opp_profiler->end("SRM_Init");
+    opp_reallocReductArrays(reduction_bytes);
+    reduction_bytes = 0;
 
-            // Create key/value pairs
-            opp_profiler->start("SRM_CrKeyVal");
-            {{seg_red_kernel_call("(reduction_size * block_size)",
-                "OPP_iter_start", "OPP_iter_end")|indent(12)}}
-            OPP_DEVICE_SYNCHRONIZE();
-            opp_profiler->end("SRM_CrKeyVal");
+        {% for arg in lh.args|gbl|reduction %}
+            {% call opt_if(arg) %}
+    args[{{arg.id}}].data   = OPP_reduct_h + reduction_bytes;
+    args[{{arg.id}}].data_d = OPP_reduct_d + reduction_bytes;
 
-        {% for arg in lh.args|dat|indirect|reduction|not_already_mapped(lh) %}
-            // Sort by keys to bring the identical keys together and store the order in sr_dat{{arg.dat_id}}_keys_dv2
-            opp_profiler->start("SRM_SortByKey");
-            thrust::sort_by_key(thrust::device,
-                sr_dat{{arg.dat_id}}_keys_dv.begin(), sr_dat{{arg.dat_id}}_keys_dv.begin() + operating_size_dat{{arg.dat_id}}, 
-                sr_dat{{arg.dat_id}}_keys_dv2.begin());
-            opp_profiler->end("SRM_SortByKey"); 
-
-            // Sort values according to sr_dat{{arg.dat_id}}_keys_dv2
-            opp_profiler->start("SRM_AssignByKey"); 
-            opp_k{{kernel_idx}}::assign_values_by_key<<<num_blocks, block_size>>>(
-                opp_get_dev_raw_ptr<OPP_INT>(sr_dat{{arg.dat_id}}_keys_dv2),
-                opp_get_dev_raw_ptr<{{lh.dat(arg).typ}}>(sr_dat{{arg.dat_id}}_values_dv),
-                opp_get_dev_raw_ptr<{{lh.dat(arg).typ}}>(sr_dat{{arg.dat_id}}_values_dv2),
-                0, operating_size_dat{{arg.dat_id}}, {{lh.dat(arg).dim}});
-            OPP_DEVICE_SYNCHRONIZE();
-            opp_profiler->end("SRM_AssignByKey"); 
-
-            // Compute the unique keys and their corresponding values
-            opp_profiler->start("SRM_RedByKey");
-            auto new_end = thrust::reduce_by_key(thrust::device,
-                sr_dat{{arg.dat_id}}_keys_dv.begin(), sr_dat{{arg.dat_id}}_keys_dv.begin() + operating_size_dat{{arg.dat_id}},
-                sr_dat{{arg.dat_id}}_values_dv2.begin(),
-                sr_dat{{arg.dat_id}}_keys_dv2.begin(),
-                sr_dat{{arg.dat_id}}_values_dv.begin());  
-            const size_t reduced_size = (new_end.first - sr_dat{{arg.dat_id}}_keys_dv2.begin());
-
-            {% if lh.dat(arg).dim > 1 %}
-            for (int d = 1; d < {{lh.dat(arg).dim}}; ++d) {
-                auto new_end = thrust::reduce_by_key(thrust::device,
-                    sr_dat{{arg.dat_id}}_keys_dv.begin(), sr_dat{{arg.dat_id}}_keys_dv.begin() + operating_size_dat{{arg.dat_id}},
-                    sr_dat{{arg.dat_id}}_values_dv2.begin() + d * opp_k{{kernel_idx}}_sr_set_stride,
-                    thrust::make_discard_iterator(), sr_dat{{arg.dat_id}}_values_dv.begin() + d * opp_k{{kernel_idx}}_sr_set_stride);     
-            }      
-            {% endif %}
-            opp_profiler->end("SRM_RedByKey");
-
-            // Assign reduced values to the nodes using keys/values
-            opp_profiler->start("SRM_Assign");
-            num_blocks = reduced_size / block_size + 1;
-            for (int d = 0; d < {{lh.dat(arg).dim}}; ++d) { // Could invoke the kernel once and have all dims updated with that
-                opp_k{{kernel_idx}}::assign_values<<<num_blocks, block_size>>> ( 
-                    opp_get_dev_raw_ptr<OPP_INT>(sr_dat{{arg.dat_id}}_keys_dv2),
-                    (opp_get_dev_raw_ptr<{{lh.dat(arg).typ}}>(sr_dat{{arg.dat_id}}_values_dv) + d * opp_k{{kernel_idx}}_sr_set_stride),
-                    (({{lh.dat(arg).typ}} *) args[{{arg.dat_id}}].data_d) + d * opp_k{{kernel_idx}}_dat{{arg.dat_id}}_stride,
-                    0, reduced_size);
-            }
-            OPP_DEVICE_SYNCHRONIZE();
-            opp_profiler->end("SRM_Assign");
-
-            // Last: clear the thrust vectors if this is the last iteration (avoid crash)
-            opp_profiler->start("SRM_Clear");
-            if (opp_params->get<OPP_INT>("num_steps") == (OPP_main_loop_iter + 1)) {
-                OPP_DEVICE_SYNCHRONIZE();
-                sr_dat{{arg.dat_id}}_values_dv.clear(); sr_dat{{arg.dat_id}}_values_dv.shrink_to_fit();
-                sr_dat{{arg.dat_id}}_keys_dv.clear(); sr_dat{{arg.dat_id}}_keys_dv.shrink_to_fit();
-            } 
-            opp_profiler->end("SRM_Clear");
-        {% endfor %}
-        }    
-    {% endif %}
+    for (int b = 0; b < max_blocks; ++b) {
+        for (int d = 0; d < {{arg.dim}}; ++d)
+            (({{arg.typ}} *)args[{{arg.id}}].data)[b * {{arg.dim}} + d] = {% if arg.access_type == OP.AccessType.INC -%}
+                {{arg.typ}}_ZERO
+            {%- else -%}
+                arg{{arg.id}}_host_data[d]
+            {%- endif %};
+    }
 
-    } while (opp_finalize_particle_move(set)); 
+    reduction_bytes += ROUND_UP(max_blocks * {{arg.dim}} * sizeof({{arg.typ}}));
+            {% endcall %}
+        {% endfor %}
 
-{% endblock %}
+    opp_mvReductArraysToDevice(reduction_bytes);
+    {% endif %}
+{%- endmacro %}
 
-{% block host_epilogue %}
+{%- macro reduce_global_reductions() -%}
     {% if lh.args|gbl|read_write|length > 0 or lh.args|gbl|write|length > 0 %}
     mvConstArraysToHost(const_bytes);
     
@@ -804,6 +702,145 @@ opp_dev_sr_{{lh.kernel}}<<<num_blocks, block_size{{-(", %s" % shared_size) if lh
         {% endcall %}
 
     {% endfor %}
+{%- endmacro %}
+
+{%- macro create_arrays_for_particle_indirect_reductions() -%}
+    {% if lh.args|dat|indirect|reduction|length > 0 and lh is particle_loop %}
+    const int array_count = opp_params->get<OPP_INT>("gpu_reduction_arrays");
+    {% endif %}
+    {% if lh is indirect and lh.args|dat|indirect|reduction|length > 0 and config.atomics %}
+    if (!opp_use_segmented_reductions) {
+        {% for dat in lh.dats %}
+            {% if dat is indirect_reduction(lh) and lh is particle_loop %}
+        opp_create_thread_level_data<{{dat.typ}}>(args[{{dat.arg_id}}]);
+            {% endif %}
+        {% endfor %}
+    }
+    {% endif %}
+{%- endmacro %}
+
+{%- macro reduce_particle_indirect_reduction_arrays() -%}
+    {% for dat in lh.dats %}
+        {% if dat is indirect_reduction(lh) and lh is particle_loop %}
+    opp_reduce_thread_level_data<{{dat.typ}}>(args[{{dat.id}}]);
+        {% endif %}
+    {% endfor %}
+{%- endmacro %}
+
+{% macro multihop_move(pre) %}
+    opp_mem::dev_copy_to_symbol<OPP_INT>(OPP_comm_iteration_d, &OPP_comm_iteration, 1);
+    num_blocks = (OPP_iter_end - OPP_iter_start - 1) / block_size + 1;
+    
+    {% for dat in lh.dats|soa %}
+    opp_mem::dev_copy_to_symbol<OPP_INT>(opp_k{{kernel_idx}}_dat{{dat.id}}_stride_d, &opp_k{{kernel_idx}}_dat{{dat.id}}_stride, &(args[{{dat.arg_id}}].dat->set->set_capacity), 1);
+    {% endfor %}
+    {% for map in lh.maps %}
+    opp_mem::dev_copy_to_symbol<OPP_INT>(opp_k{{kernel_idx}}_map{{map.id}}_stride_d, &opp_k{{kernel_idx}}_map{{map.id}}_stride, &(args[{{map.arg_id}}].size), 1);
+    {% endfor %}
+
+    {% if lh is indirect and lh.args|dat|indirect|reduction|length > 0 and config.atomics and config.seg_red %}
+    if (!opp_use_segmented_reductions) // Do atomics ----------       
+    {% endif %}
+    {% if lh is direct or config.atomics %}
+    {
+        opp_profiler->start("move_kernel_only");
+        {{kernel_call("(reduction_size * block_size)",
+            "OPP_iter_start", "OPP_iter_end")|indent(8)}}
+        OPP_DEVICE_SYNCHRONIZE(); 
+        opp_profiler->end("move_kernel_only");
+    }
+    {% endif %}
+    {% if lh is indirect and lh.args|dat|indirect|reduction|length > 0 and config.atomics and config.seg_red %}
+    else // Do segmented reductions ----------       
+    {% endif %}
+    {% if lh is indirect and lh.args|dat|indirect|reduction|length > 0 and config.seg_red %}
+    {
+        {{pre}}opp_mem::dev_copy_to_symbol<OPP_INT>(opp_k{{kernel_idx}}_sr_set_stride_d, &opp_k{{kernel_idx}}_sr_set_stride, &set->size, 1);
+
+        {% for arg in lh.args|dat|indirect|reduction|not_already_mapped(lh) %}
+        {{pre}}size_t operating_size_dat{{arg.dat_id}} = 0, resize_size_dat{{arg.dat_id}} = 0;
+        {% endfor %}
+
+        {% for arg in lh.args|dat|indirect|reduction %}
+        {{pre}}operating_size_dat{{arg.dat_id}} += (size_t)1;
+        {{pre}}resize_size_dat{{arg.dat_id}} += (size_t)1;
+        {% endfor %}
+
+        {% for arg in lh.args|dat|indirect|reduction|not_already_mapped(lh) %}
+        {{pre}}operating_size_dat{{arg.dat_id}} *= (size_t)(OPP_iter_end - OPP_iter_start);
+        {{pre}}resize_size_dat{{arg.dat_id}} *= (size_t)(set->set_capacity);
+        {% endfor %}
+
+        {% for arg in lh.args|dat|indirect|reduction|not_already_mapped(lh) %}
+        {{pre}}opp_sr::init_arrays<{{lh.dat(arg).typ}}>(args[{{arg.id}}].dat->dim, operating_size_dat{{arg.dat_id}}, resize_size_dat{{arg.dat_id}},
+        {{pre}}            sr_dat{{arg.dat_id}}_keys_dv, sr_dat{{arg.dat_id}}_values_dv, sr_dat{{arg.dat_id}}_keys_dv2, sr_dat{{arg.dat_id}}_values_dv2);
+        {% endfor %}
+
+        // Create key/value pairs
+        opp_profiler->start("SRM_CrKeyVal");
+        {{seg_red_kernel_call("(reduction_size * block_size)",
+            "OPP_iter_start", "OPP_iter_end")|indent(8)}}
+        OPP_DEVICE_SYNCHRONIZE();
+        opp_profiler->end("SRM_CrKeyVal");
+
+        {% for arg in lh.args|dat|indirect|reduction|not_already_mapped(lh) %}
+        {{pre}}opp_sr::do_segmented_reductions<{{lh.dat(arg).typ}}>(args[{{arg.dat_id}}], (OPP_iter_end - OPP_iter_start), 
+        {{pre}}            sr_dat{{arg.dat_id}}_keys_dv, sr_dat{{arg.dat_id}}_values_dv, sr_dat{{arg.dat_id}}_keys_dv2, sr_dat{{arg.dat_id}}_values_dv2);
+        {% endfor %}
+    }
+    {%- endif %}
+{% endmacro %}
+
+{% block host_loop %}
+    {% if lh.dh_loop_required %}
+{{ dh_check_for_global_move() }}
+
+    {% endif%}
+{{ generate_storage_for_global_reductions() }}
+{{ create_arrays_for_particle_indirect_reductions() }}
+
+    opp_profiler->start("Mv_AllMv0");
+    {% if lh.dh_loop_required %}
+    // ----------------------------------------------------------------------------
+    // check whether all particles not marked for global comm is within cell, 
+    // and if not mark to move between cells within the MPI rank, mark for neighbour comm
+    {% else %}
+    // ----------------------------------------------------------------------------
+    // Multi-hop move particles within current MPI rank and if not mark for neighbour comm
+    {% endif %}
+
+{{ multihop_move("") }}
+
+    opp_profiler->end("Mv_AllMv0");
+    {% if lh.dh_loop_required %}    
+{{ dh_finalize() }}
+    {% endif%}
+
+    // ----------------------------------------------------------------------------
+    // Do neighbour communication and if atleast one particle is received by the currect rank, 
+    // then iterate over the newly added particles
+    while (opp_finalize_particle_move(set)) {
+
+        opp_init_particle_move(set, nargs, args);
+
+    {{ multihop_move("// ")|indent(4) }}
+    }
+
+{{ reduce_global_reductions() }}
+    {%- if lh is indirect and lh.args|dat|indirect|reduction|length > 0 %}
+    if (!opp_use_segmented_reductions) {
+    {{ reduce_particle_indirect_reduction_arrays() }}
+    }
+    else {
+        {% for arg in lh.args|dat|indirect|reduction|not_already_mapped(lh) if config.seg_red %}
+        opp_sr::clear_arrays<{{lh.dat(arg).typ}}>(sr_dat{{arg.dat_id}}_keys_dv, sr_dat{{arg.dat_id}}_values_dv, sr_dat{{arg.dat_id}}_keys_dv2, sr_dat{{arg.dat_id}}_values_dv2);
+        {% endfor %}
+    }
+
+    {% endif %}
+{% endblock %}
+
+{% block host_epilogue %}
     opp_set_dirtybit_grouped(nargs, args, Device_GPU);
     OPP_DEVICE_SYNCHRONIZE();   
     {% if lh is double_indirect_reduc %}
@@ -818,6 +855,7 @@ opp_dev_sr_{{lh.kernel}}<<<num_blocks, block_size{{-(", %s" % shared_size) if lh
 
 {% block dh_init_wrapper %}
 {% if lh.dh_loop_required %}
+
 void opp_init_direct_hop_cg(double grid_spacing, const opp_dat c_gbl_id, const opp::BoundingBox& b_box, 
     opp_map c2c_map, opp_map p2c_map,
     {% for arg in lh.args %}
@@ -825,8 +863,91 @@ void opp_init_direct_hop_cg(double grid_spacing, const opp_dat c_gbl_id, const o
     {% endfor %}
 ) {
     opp_profiler->start("Setup_Mover");
-    
+
+    useGlobalMove = opp_params->get<OPP_BOOL>("opp_global_move");
+
+    if (OPP_DBG) opp_printf("opp_init_direct_hop_cg", "START useGlobalMove=%s", useGlobalMove ? "YES" : "NO");
+
+    if (useGlobalMove) {
+
+        const int nargs = {{lh.args|length + 1}};
+        opp_arg args[nargs];
+
+    {% for arg in lh.args %}
+        args[{{loop.index0}}] = {{arg_dat_redef(arg) if lh.args[arg.id] is vec else "arg%d" % arg.id}};
+    {% endfor %}
+        args[{{lh.args|length}}] = opp_arg_dat(p2c_map->p2c_dat, OPP_RW); // required to make dirty or should manually make it dirty
+
+#ifdef USE_MPI
+        opp_mpi_halo_exchanges_grouped(c_gbl_id->set, nargs, args, Device_CPU);
+
+        comm = std::make_shared<opp::Comm>(MPI_COMM_WORLD);
+        globalMover = std::make_unique<opp::GlobalParticleMover>(comm->comm_parent);
+
+        opp_mpi_halo_wait_all(nargs, args);
+#endif
+
+        boundingBox = std::make_shared<opp::BoundingBox>(b_box);
+        cellMapper = std::make_shared<opp::CellMapper>(boundingBox, grid_spacing, comm);
+        
+        const int c_set_size = c_gbl_id->set->size;
+
+        // lambda function for dh mesh search loop
+        auto all_cell_checker = [&](const opp_point& point, int& cid) {          
+    {% for arg in lh.args|dat %}
+        {% if arg.access_type.value in [1, 3, 4, 5, 2] %} 
+            // we dont want to change the original arrays during dh mesh generation, hence duplicate except OPP_READ
+            {{lh.dat(arg).typ}} arg{{arg.id}}_temp[{{lh.dat(arg).dim}}];
+        {%- endif %}
+    {%- endfor +%}
+            for (int ci = 0; ci < c_set_size; ++ci) {
+                opp_move_status_flag = OPP_NEED_MOVE;  
+                opp_move_hop_iter_one_flag = true;
+                
+                int temp_ci = ci; // we dont want to get iterating ci changed within the kernel, hence get a copy
+                
+                opp_p2c = &(temp_ci);           
+                opp_c2c = &((c2c_map->map)[temp_ci * {{lh.c2c_map.dim}}]);
+    {% if lh is indirect %}
+        {% for map in lh.maps %}
+                const OPP_INT *map{{map.id}} = args[{{map.arg_id}}].map_data + (temp_ci * {{map.dim}});
+        {%- endfor %}   
+    {%- endif %}    
+    {%- for arg in lh.args|dat %}
+        {% if arg.access_type.value == 2 %} 
+
+            {%- if arg is double_indirect -%}
+            {%- set offset = " + (map%s[%d] * %d)" % (arg.map_id, arg.map_idx, lh.dat(arg).dim) -%}
+            {%- elif arg is indirect -%}
+            {%- set offset = " + (temp_ci * %d)" % (lh.dat(arg).dim) -%}
+            {%- endif %}
+                // arg{{arg.id}} is OPP_RW, hence get a copy just incase
+                std::memcpy(&arg{{arg.id}}_temp, ({{lh.dat(arg).typ}} *)args[{{arg.id}}].data{{offset}}, (sizeof({{lh.dat(arg).typ}}) * {{lh.dat(arg).dim}}));
+        {% endif %}
+    {%- endfor %}
+
+                opp_k{{kernel_idx}}::host::{{lh.kernel}}(
+    {% for arg in lh.args %}
+                    {{arg_to_pointer_dh(arg,lh)}}
+    {% endfor %}
+                );
+                if (opp_move_status_flag == OPP_MOVE_DONE) {       
+                    cid = temp_ci;
+                    break;
+                }
+            }
+        };
+
+        if (opp_params->get<OPP_BOOL>("opp_dh_data_generate")) {
+            cellMapper->generateStructuredMesh(c_gbl_id->set, c_gbl_id, all_cell_checker);
+        }
+        else {
+            cellMapper->generateStructuredMeshFromFile(c_gbl_id->set, c_gbl_id);  
+        }
+    }
+
     opp_profiler->end("Setup_Mover");
 }
+
 {% endif %}
 {% endblock %}
\ No newline at end of file
diff --git a/scripts/batch/advection/archer2/config.param b/scripts/batch/advection/archer2/config.param
index 75c74925..b5fd20cf 100644
--- a/scripts/batch/advection/archer2/config.param
+++ b/scripts/batch/advection/archer2/config.param
@@ -15,6 +15,7 @@ BOOL opp_profile_all        = false
 INT opp_gpu_direct = 0
 BOOL opp_global_move = true
 REAL grid_spacing           = 0.5
+BOOL opp_segmented_red = false
 
 # HoleFill_All Sort_All Shuffle_All Sort_Periodic Shuffle_Periodic
 STRING opp_fill = Shuffle_Periodic
diff --git a/scripts/batch/advection/avon/config.param b/scripts/batch/advection/avon/config.param
index a7c1de43..b5fd20cf 100644
--- a/scripts/batch/advection/avon/config.param
+++ b/scripts/batch/advection/avon/config.param
@@ -1,15 +1,22 @@
 # Simulation parameters
 INT max_iter = 250 
-INT nx = 32
-INT ny = 32
-REAL dt = 0.1
+INT nx = 1024
+INT ny = 1024
+REAL dt = 0.5
 REAL cell_width = 0.5
-INT n_particles = 6000000
-
+INT npart_per_cell = 100
+BOOL verify_particles = false
 
 # System parameters
 BOOL opp_auto_sort          = false
-INT opp_allocation_multiple = 1
+REAL opp_allocation_multiple = 1.1
 INT opp_threads_per_block   = 512
 BOOL opp_profile_all        = false
-INT 
\ No newline at end of file
+INT opp_gpu_direct = 0
+BOOL opp_global_move = true
+REAL grid_spacing           = 0.5
+BOOL opp_segmented_red = false
+
+# HoleFill_All Sort_All Shuffle_All Sort_Periodic Shuffle_Periodic
+STRING opp_fill = Shuffle_Periodic
+INT opp_fill_period = 4
\ No newline at end of file
diff --git a/scripts/batch/advection/pan/config.param b/scripts/batch/advection/pan/config.param
index 75c74925..b5fd20cf 100644
--- a/scripts/batch/advection/pan/config.param
+++ b/scripts/batch/advection/pan/config.param
@@ -15,6 +15,7 @@ BOOL opp_profile_all        = false
 INT opp_gpu_direct = 0
 BOOL opp_global_move = true
 REAL grid_spacing           = 0.5
+BOOL opp_segmented_red = false
 
 # HoleFill_All Sort_All Shuffle_All Sort_Periodic Shuffle_Periodic
 STRING opp_fill = Shuffle_Periodic
diff --git a/scripts/batch/cabana/archer2/cabana.param b/scripts/batch/cabana/archer2/cabana.param
index 183020ca..5dcf904d 100644
--- a/scripts/batch/cabana/archer2/cabana.param
+++ b/scripts/batch/cabana/archer2/cabana.param
@@ -28,7 +28,7 @@ STRING opp_fill = HoleFill_All
 INT opp_fill_period = 4
 
 BOOL opp_global_move = false
-BOOL use_reg_red = false
+BOOL opp_segmented_red = false
 
 STRING hdf_filename = /home/zl/phd/neso_test/OP-PIC_Configs/cabana/mesh_files/cab_48000.hdf5
 INT domain_expansion = 1
diff --git a/scripts/batch/cabana/bede/cabana.param b/scripts/batch/cabana/bede/cabana.param
index 62770eb2..7c9b3a97 100644
--- a/scripts/batch/cabana/bede/cabana.param
+++ b/scripts/batch/cabana/bede/cabana.param
@@ -28,7 +28,7 @@ STRING opp_fill = HoleFill_All
 INT opp_fill_period = 4
 
 BOOL opp_global_move = false
-BOOL use_reg_red = false
+BOOL opp_segmented_red = false
 
 STRING hdf_filename = /home/zl/phd/neso_test/OP-PIC_Configs/cabana/mesh_files/cab_48000.hdf5
 INT domain_expansion = 1
diff --git a/scripts/batch/cabana/cosma/cabana.param b/scripts/batch/cabana/cosma/cabana.param
index 0bfe6b27..61c43487 100644
--- a/scripts/batch/cabana/cosma/cabana.param
+++ b/scripts/batch/cabana/cosma/cabana.param
@@ -28,7 +28,7 @@ STRING opp_fill = HoleFill_All
 INT opp_fill_period = 4
 
 BOOL opp_global_move = false
-BOOL use_reg_red = false
+BOOL opp_segmented_red = false
 INT gpu_reduction_arrays = 16
 
 STRING hdf_filename = /home/zl/phd/neso_test/OP-PIC_Configs/cabana/mesh_files/cab_48000.hdf5
diff --git a/scripts/batch/cabana/lumi/cabana.param b/scripts/batch/cabana/lumi/cabana.param
index 45463fb2..5f116dcb 100644
--- a/scripts/batch/cabana/lumi/cabana.param
+++ b/scripts/batch/cabana/lumi/cabana.param
@@ -28,7 +28,7 @@ STRING opp_fill = HoleFill_All
 INT opp_fill_period = 4
 
 BOOL opp_global_move = false
-BOOL use_reg_red = false
+BOOL opp_segmented_red = false
 
 INT domain_expansion = -1
 
diff --git a/scripts/batch/fempic/archer2/box_archer.param b/scripts/batch/fempic/archer2/box_archer.param
index 696fc390..68d71249 100644
--- a/scripts/batch/fempic/archer2/box_archer.param
+++ b/scripts/batch/fempic/archer2/box_archer.param
@@ -36,7 +36,7 @@ BOOL opp_global_move = false
 
 # block k-means mpi-block
 STRING cluster = block
-BOOL use_reg_red = false
+BOOL opp_segmented_red = false
 
 # HoleFill_All Sort_All Shuffle_All Sort_Periodic Shuffle_Periodic
 STRING opp_fill = Shuffle_Periodic
diff --git a/scripts/batch/fempic/avon/box_avon.param b/scripts/batch/fempic/avon/box_avon.param
index 495822b0..c55e9976 100644
--- a/scripts/batch/fempic/avon/box_avon.param
+++ b/scripts/batch/fempic/avon/box_avon.param
@@ -38,7 +38,7 @@ BOOL opp_global_move = false
 # block k-means mpi-block
 STRING cluster = block
 
-BOOL use_reg_red = true
+BOOL opp_segmented_red = true
 INT sed_red_block           = 100000000
 
 # HoleFill_All Sort_All Shuffle_All Sort_Periodic Shuffle_Periodic
diff --git a/scripts/batch/fempic/bede/box_bede.param b/scripts/batch/fempic/bede/box_bede.param
index bba91f81..226dd17c 100644
--- a/scripts/batch/fempic/bede/box_bede.param
+++ b/scripts/batch/fempic/bede/box_bede.param
@@ -32,7 +32,7 @@ INT opp_gpu_direct = 0
 
 # block k-means
 STRING cluster = block
-BOOL use_reg_red = false
+BOOL opp_segmented_red = false
 
 # HoleFill_All Sort_All Shuffle_All Sort_Periodic Shuffle_Periodic
 STRING opp_fill = Shuffle_Periodic
diff --git a/scripts/batch/fempic/cosma/coarse.param b/scripts/batch/fempic/cosma/coarse.param
index f4335908..fa6940f3 100644
--- a/scripts/batch/fempic/cosma/coarse.param
+++ b/scripts/batch/fempic/cosma/coarse.param
@@ -38,7 +38,7 @@ BOOL opp_dh_data_generate = false
 
 # block k-means mpi-block
 STRING cluster = block
-BOOL use_reg_red = false
+BOOL opp_segmented_red = false
 INT gpu_reduction_arrays = 16
 
 # HoleFill_All Sort_All Shuffle_All Sort_Periodic Shuffle_Periodic
diff --git a/scripts/batch/fempic/cosma/fempic_mpi_IntelMAX1100_hdf5.sh b/scripts/batch/fempic/cosma/fempic_mpi_IntelMAX1100_hdf5.sh
index b5ba0a8b..130b67f1 100755
--- a/scripts/batch/fempic/cosma/fempic_mpi_IntelMAX1100_hdf5.sh
+++ b/scripts/batch/fempic/cosma/fempic_mpi_IntelMAX1100_hdf5.sh
@@ -49,7 +49,7 @@ for gpus in 2; do
                 sed -i "s|STRING hdf_filename = <path_to_hdf5_mesh_files>/box_48000.hdf5|STRING hdf_filename = /cosma/home/do018/dc-lant1/phd/Artifacts/mesh_files/box_${actual_config}.hdf5|" ${currentfilename}
 
                 if [ "$use_seg_red" -eq 1 ]; then
-                    sed -i "s/BOOL use_reg_red = false/BOOL use_reg_red = true/" ${currentfilename}
+                    sed -i "s/BOOL opp_segmented_red = false/BOOL opp_segmented_red = true/" ${currentfilename}
                 fi
                 if [ "$use_hole_fill" -eq 1 ]; then
                     sed -i "s/STRING opp_fill = Shuffle_Periodic/STRING opp_fill = HoleFill_All/" ${currentfilename}
diff --git a/scripts/batch/fempic/cosma/fempic_mpi_MI300X_dat.sh b/scripts/batch/fempic/cosma/fempic_mpi_MI300X_dat.sh
index 9f0f13b3..d7b73a76 100755
--- a/scripts/batch/fempic/cosma/fempic_mpi_MI300X_dat.sh
+++ b/scripts/batch/fempic/cosma/fempic_mpi_MI300X_dat.sh
@@ -56,7 +56,7 @@ for config in 48000 96000 192000; do
                 sed -i "s|STRING wall_mesh    = <path_to_mesh_files>/wall.dat|STRING wall_mesh    = /cosma/home/do018/dc-lant1/phd/box_mesh_gen/${actual_config}/wall.dat|" ${currentfilename}
 
                 if [ "$use_seg_red" -eq 1 ]; then
-                    sed -i "s/BOOL use_reg_red = false/BOOL use_reg_red = true/" ${currentfilename}
+                    sed -i "s/BOOL opp_segmented_red = false/BOOL opp_segmented_red = true/" ${currentfilename}
                 fi
                 if [ "$use_hole_fill" -eq 1 ]; then
                     sed -i "s/STRING opp_fill = Shuffle_Periodic/STRING opp_fill = HoleFill_All/" ${currentfilename}
diff --git a/scripts/batch/fempic/cosma/fempic_mpi_MI300X_hdf5.sh b/scripts/batch/fempic/cosma/fempic_mpi_MI300X_hdf5.sh
index 49efa125..b5af8f6c 100755
--- a/scripts/batch/fempic/cosma/fempic_mpi_MI300X_hdf5.sh
+++ b/scripts/batch/fempic/cosma/fempic_mpi_MI300X_hdf5.sh
@@ -49,7 +49,7 @@ for gpus in 8 4 2 1; do
                 sed -i "s|STRING hdf_filename = <path_to_hdf5_mesh_files>/box_48000.hdf5|STRING hdf_filename = /cosma/home/do018/dc-lant1/phd/Artifacts/mesh_files/box_${actual_config}.hdf5|" ${currentfilename}
 
                 if [ "$use_seg_red" -eq 1 ]; then
-                    sed -i "s/BOOL use_reg_red = false/BOOL use_reg_red = true/" ${currentfilename}
+                    sed -i "s/BOOL opp_segmented_red = false/BOOL opp_segmented_red = true/" ${currentfilename}
                 fi
                 if [ "$use_hole_fill" -eq 1 ]; then
                     sed -i "s/STRING opp_fill = Shuffle_Periodic/STRING opp_fill = HoleFill_All/" ${currentfilename}
diff --git a/scripts/batch/fempic/lumi/box_fempic.param b/scripts/batch/fempic/lumi/box_fempic.param
index 9aa203e6..4c0ad587 100644
--- a/scripts/batch/fempic/lumi/box_fempic.param
+++ b/scripts/batch/fempic/lumi/box_fempic.param
@@ -32,7 +32,7 @@ BOOL opp_global_move = false
 
 # block k-means mpi-block
 STRING cluster = block
-BOOL use_reg_red = false
+BOOL opp_segmented_red = false
 
 # HoleFill_All Sort_All Shuffle_All Sort_Periodic Shuffle_Periodic
 STRING opp_fill = Shuffle_Periodic
diff --git a/scripts/batch/fempic/lumi/fempic_energy10.sh b/scripts/batch/fempic/lumi/fempic_energy10.sh
index cd555b01..fdbec606 100755
--- a/scripts/batch/fempic/lumi/fempic_energy10.sh
+++ b/scripts/batch/fempic/lumi/fempic_energy10.sh
@@ -76,7 +76,7 @@ for run in 1 2 3; do
         echo "RUNNING -> 1e18 On "$totalGPUs" GPUs with unsafe atomics"
         srun --cpu-bind=${CPU_BIND} ${binary} ${currentfilename} | tee $folder/log_N${num_nodes}_G${totalGPUs}_C${config}_D10_UA_R${run}.log;
 
-        sed -i "s/BOOL use_reg_red = false/BOOL use_reg_red = true/" ${currentfilename}
+        sed -i "s/BOOL opp_segmented_red = false/BOOL opp_segmented_red = true/" ${currentfilename}
         echo "RUNNING -> 1e18 On "$totalGPUs" GPUs with seg red"
         srun --cpu-bind=${CPU_BIND} ${binary} ${currentfilename} | tee $folder/log_N${num_nodes}_G${totalGPUs}_C${config}_D10_SR_R${run}.log;
         # ---------------------
@@ -116,7 +116,7 @@ for run in 1 2 3; do
         echo "RUNNING -> 1e18 On "$totalGPUs" GPUs with unsafe atomics"
         srun --cpu-bind=${CPU_BIND} ${binary} ${currentfilename} | tee $folder/log_N${num_nodes}_G${totalGPUs}_C${config}_D13_UA_R${run}.log;
 
-        sed -i "s/BOOL use_reg_red = false/BOOL use_reg_red = true/" ${currentfilename}
+        sed -i "s/BOOL opp_segmented_red = false/BOOL opp_segmented_red = true/" ${currentfilename}
         echo "RUNNING -> 1e18 On "$totalGPUs" GPUs with seg red"
         srun --cpu-bind=${CPU_BIND} ${binary} ${currentfilename} | tee $folder/log_N${num_nodes}_G${totalGPUs}_C${config}_D13_SR_R${run}.log;
         # ---------------------
diff --git a/scripts/batch/fempic/lumi/fempic_energy5.sh b/scripts/batch/fempic/lumi/fempic_energy5.sh
index 39eed55d..26f4c5a4 100755
--- a/scripts/batch/fempic/lumi/fempic_energy5.sh
+++ b/scripts/batch/fempic/lumi/fempic_energy5.sh
@@ -76,7 +76,7 @@ for run in 1 2 3; do
         echo "RUNNING -> 1e18 On "$totalGPUs" GPUs with unsafe atomics"
         srun --cpu-bind=${CPU_BIND} ${binary} ${currentfilename} | tee $folder/log_N${num_nodes}_G${totalGPUs}_C${config}_D10_UA_R${run}.log;
 
-        sed -i "s/BOOL use_reg_red = false/BOOL use_reg_red = true/" ${currentfilename}
+        sed -i "s/BOOL opp_segmented_red = false/BOOL opp_segmented_red = true/" ${currentfilename}
         echo "RUNNING -> 1e18 On "$totalGPUs" GPUs with seg red"
         srun --cpu-bind=${CPU_BIND} ${binary} ${currentfilename} | tee $folder/log_N${num_nodes}_G${totalGPUs}_C${config}_D10_SR_R${run}.log;
         # ---------------------
@@ -116,7 +116,7 @@ for run in 1 2 3; do
         echo "RUNNING -> 1e18 On "$totalGPUs" GPUs with unsafe atomics"
         srun --cpu-bind=${CPU_BIND} ${binary} ${currentfilename} | tee $folder/log_N${num_nodes}_G${totalGPUs}_C${config}_D13_UA_R${run}.log;
 
-        sed -i "s/BOOL use_reg_red = false/BOOL use_reg_red = true/" ${currentfilename}
+        sed -i "s/BOOL opp_segmented_red = false/BOOL opp_segmented_red = true/" ${currentfilename}
         echo "RUNNING -> 1e18 On "$totalGPUs" GPUs with seg red"
         srun --cpu-bind=${CPU_BIND} ${binary} ${currentfilename} | tee $folder/log_N${num_nodes}_G${totalGPUs}_C${config}_D13_SR_R${run}.log;
         # ---------------------
diff --git a/scripts/batch/fempic/lumi/fempic_mpi_1.sh b/scripts/batch/fempic/lumi/fempic_mpi_1.sh
index a9bd4f43..6e44151c 100755
--- a/scripts/batch/fempic/lumi/fempic_mpi_1.sh
+++ b/scripts/batch/fempic/lumi/fempic_mpi_1.sh
@@ -82,7 +82,7 @@ for run in 1 2 3; do
             sed -i "s/STRING hdf_filename = <path_to_hdf5_mesh_file>/STRING hdf_filename = ${escaped_folder}\/box_${config}.hdf5/" ${currentfilename}
             sed -i "s/STRING rand_file    = <path_to_hdf5_mesh_file>/STRING rand_file    = ${escaped_folder}\/random_100k.dat/" ${currentfilename}
             if [ "$use_seg_red" -eq 1 ]; then
-                sed -i "s/BOOL use_reg_red = false/BOOL use_reg_red = true/" ${currentfilename}
+                sed -i "s/BOOL opp_segmented_red = false/BOOL opp_segmented_red = true/" ${currentfilename}
             fi
 
             # ---------------------       
@@ -126,7 +126,7 @@ for run in 1 2 3; do
             sed -i "s/STRING hdf_filename = <path_to_hdf5_mesh_file>/STRING hdf_filename = ${escaped_folder}\/box_${config}.hdf5/" ${currentfilename}
             sed -i "s/STRING rand_file    = <path_to_hdf5_mesh_file>/STRING rand_file    = ${escaped_folder}\/random_100k.dat/" ${currentfilename}
             if [ "$use_seg_red" -eq 1 ]; then
-                sed -i "s/BOOL use_reg_red = false/BOOL use_reg_red = true/" ${currentfilename}
+                sed -i "s/BOOL opp_segmented_red = false/BOOL opp_segmented_red = true/" ${currentfilename}
             fi
 
             # ---------------------       
@@ -170,7 +170,7 @@ for run in 1 2 3; do
             sed -i "s/STRING hdf_filename = <path_to_hdf5_mesh_file>/STRING hdf_filename = ${escaped_folder}\/box_${config}.hdf5/" ${currentfilename}
             sed -i "s/STRING rand_file    = <path_to_hdf5_mesh_file>/STRING rand_file    = ${escaped_folder}\/random_100k.dat/" ${currentfilename}
             if [ "$use_seg_red" -eq 1 ]; then
-                sed -i "s/BOOL use_reg_red = false/BOOL use_reg_red = true/" ${currentfilename}
+                sed -i "s/BOOL opp_segmented_red = false/BOOL opp_segmented_red = true/" ${currentfilename}
             fi
 
             # ---------------------       
@@ -214,7 +214,7 @@ for run in 1 2 3; do
             sed -i "s/STRING hdf_filename = <path_to_hdf5_mesh_file>/STRING hdf_filename = ${escaped_folder}\/box_${config}.hdf5/" ${currentfilename}
             sed -i "s/STRING rand_file    = <path_to_hdf5_mesh_file>/STRING rand_file    = ${escaped_folder}\/random_100k.dat/" ${currentfilename}
             if [ "$use_seg_red" -eq 1 ]; then
-                sed -i "s/BOOL use_reg_red = false/BOOL use_reg_red = true/" ${currentfilename}
+                sed -i "s/BOOL opp_segmented_red = false/BOOL opp_segmented_red = true/" ${currentfilename}
             fi
 
             # ---------------------       
diff --git a/scripts/batch/fempic/lumi/fempic_mpi_16.sh b/scripts/batch/fempic/lumi/fempic_mpi_16.sh
index a7cfba43..e06e51d1 100755
--- a/scripts/batch/fempic/lumi/fempic_mpi_16.sh
+++ b/scripts/batch/fempic/lumi/fempic_mpi_16.sh
@@ -90,7 +90,7 @@ for run in 1 2 3; do # 1 2 3
         sed -i "s/STRING hdf_filename = <path_to_hdf5_mesh_file>/STRING hdf_filename = ${escaped_folder}\/box_${config}.hdf5/" ${currentfilename}
         sed -i "s/STRING rand_file    = <path_to_hdf5_mesh_file>/STRING rand_file    = ${escaped_folder}\/random_100k.dat/" ${currentfilename}
         if [ "$use_seg_red" -eq 1 ]; then
-            sed -i "s/BOOL use_reg_red = false/BOOL use_reg_red = true/" ${currentfilename}
+            sed -i "s/BOOL opp_segmented_red = false/BOOL opp_segmented_red = true/" ${currentfilename}
         fi
         if [ "$config" -eq 12288000 ]; then
             sed -i "s/REAL plasma_den     = 1e18/REAL plasma_den     = 9.9e17/" ${currentfilename}
diff --git a/scripts/batch/fempic/lumi/fempic_mpi_2.sh b/scripts/batch/fempic/lumi/fempic_mpi_2.sh
index 4e372b60..0a91b86e 100755
--- a/scripts/batch/fempic/lumi/fempic_mpi_2.sh
+++ b/scripts/batch/fempic/lumi/fempic_mpi_2.sh
@@ -81,7 +81,7 @@ for run in 1 2 3; do
         sed -i "s/STRING hdf_filename = <path_to_hdf5_mesh_file>/STRING hdf_filename = ${escaped_folder}\/box_${config}.hdf5/" ${currentfilename}
         sed -i "s/STRING rand_file    = <path_to_hdf5_mesh_file>/STRING rand_file    = ${escaped_folder}\/random_100k.dat/" ${currentfilename}
         if [ "$use_seg_red" -eq 1 ]; then
-            sed -i "s/BOOL use_reg_red = false/BOOL use_reg_red = true/" ${currentfilename}
+            sed -i "s/BOOL opp_segmented_red = false/BOOL opp_segmented_red = true/" ${currentfilename}
         fi
 
         # ---------------------       
diff --git a/scripts/batch/fempic/lumi/fempic_mpi_32.sh b/scripts/batch/fempic/lumi/fempic_mpi_32.sh
index 28bf9c3f..5c6ede49 100755
--- a/scripts/batch/fempic/lumi/fempic_mpi_32.sh
+++ b/scripts/batch/fempic/lumi/fempic_mpi_32.sh
@@ -91,7 +91,7 @@ for run in 1 2 3; do
         sed -i "s/STRING hdf_filename = <path_to_hdf5_mesh_file>/STRING hdf_filename = ${escaped_folder}\/box_${config}.hdf5/" ${currentfilename}
         sed -i "s/STRING rand_file    = <path_to_hdf5_mesh_file>/STRING rand_file    = ${escaped_folder}\/random_100k.dat/" ${currentfilename}
         if [ "$use_seg_red" -eq 1 ]; then
-            sed -i "s/BOOL use_reg_red = false/BOOL use_reg_red = true/" ${currentfilename}
+            sed -i "s/BOOL opp_segmented_red = false/BOOL opp_segmented_red = true/" ${currentfilename}
         fi
         if [ "$config" -eq 12288000 ]; then
             sed -i "s/REAL plasma_den     = 1e18/REAL plasma_den     = 9.9e17/" ${currentfilename}
diff --git a/scripts/batch/fempic/lumi/fempic_mpi_4.sh b/scripts/batch/fempic/lumi/fempic_mpi_4.sh
index e8c4cc7b..260e42f2 100755
--- a/scripts/batch/fempic/lumi/fempic_mpi_4.sh
+++ b/scripts/batch/fempic/lumi/fempic_mpi_4.sh
@@ -80,7 +80,7 @@ for run in 1 2 3; do
         sed -i "s/STRING hdf_filename = <path_to_hdf5_mesh_file>/STRING hdf_filename = ${escaped_folder}\/box_${config}.hdf5/" ${currentfilename}
         sed -i "s/STRING rand_file    = <path_to_hdf5_mesh_file>/STRING rand_file    = ${escaped_folder}\/random_100k.dat/" ${currentfilename}
         if [ "$use_seg_red" -eq 1 ]; then
-            sed -i "s/BOOL use_reg_red = false/BOOL use_reg_red = true/" ${currentfilename}
+            sed -i "s/BOOL opp_segmented_red = false/BOOL opp_segmented_red = true/" ${currentfilename}
         fi
 
         # ---------------------       
diff --git a/scripts/batch/fempic/lumi/fempic_mpi_8.sh b/scripts/batch/fempic/lumi/fempic_mpi_8.sh
index 89107ed1..fa0ba26f 100755
--- a/scripts/batch/fempic/lumi/fempic_mpi_8.sh
+++ b/scripts/batch/fempic/lumi/fempic_mpi_8.sh
@@ -80,7 +80,7 @@ for run in 1 2 3; do
         sed -i "s/STRING hdf_filename = <path_to_hdf5_mesh_file>/STRING hdf_filename = ${escaped_folder}\/box_${config}.hdf5/" ${currentfilename}
         sed -i "s/STRING rand_file    = <path_to_hdf5_mesh_file>/STRING rand_file    = ${escaped_folder}\/random_100k.dat/" ${currentfilename}
         if [ "$use_seg_red" -eq 1 ]; then
-            sed -i "s/BOOL use_reg_red = false/BOOL use_reg_red = true/" ${currentfilename}
+            sed -i "s/BOOL opp_segmented_red = false/BOOL opp_segmented_red = true/" ${currentfilename}
         fi
 
         # ---------------------