diff --git a/README.md b/README.md index 1b57f83..04cc3ad 100644 --- a/README.md +++ b/README.md @@ -8,7 +8,10 @@ This repository contains the implementation of the code translation tools and ru * `app__cg`: Copy of the example applications that are already being code generated. * `scripts/source`: Some example source files used during implementation * `scripts/batch`: Some example slurm batch files used during implementation - * `handcoded`: Some hand-coded application code, written prior code generation + * `app_handcoded`: Some hand-coded application code, written prior to code generation + +## Documentation +Documentation is available on [Read the Docs](https://zamanlantra.github.io/git_docs_test/index.html). ## Dependencies OP-PIC has a variety of toolchain dependencies that you will likely be able to obtain from your package manager or programming environment: @@ -39,7 +42,7 @@ In addition, there are a few optional library dependencies that you will likely * Compile the required application version using the make command. (`make` followed by the required parallelization). For example, `cd app_cabanapic; python3 $OPP_TRANSLATOR -v -I$OPP_PATH/include/ --file_paths cabanapic.cpp; make cuda_mpi`. - A detail explanation can be found in the readme file of opp_translator folder and the required application folder. + A detailed explanation can be found in the readme file of the `opp_translator` folder and the required application folder. In addition, `app__cg` will additionally include code-generated files, ready to compile directly using make commands. diff --git a/app_cabanapic/Makefile b/app_cabanapic/Makefile index 52ddf2b..7cc652c 100644 --- a/app_cabanapic/Makefile +++ b/app_cabanapic/Makefile @@ -92,6 +92,14 @@ mpi: mklib $(OBJ)/opp_kernels_mpi.o \ $(ALL_INC) $(ALL_LIBS) -lopp_mpi +omp_mpi: mklib + $(MPICPP) $(CPPFLAGS) -DUSE_MPI -fopenmp -DUSE_OMP -c cabana_opp.cpp -o $(OBJ)/cabana_opp_omp_mpi.o $(ALL_INC) + $(MPICPP) $(CPPFLAGS) -DUSE_MPI -fopenmp -DUSE_OMP -c omp/opp_kernels.cpp -o $(OBJ)/opp_kernels_omp_mpi.o $(ALL_INC) + $(MPICPP) $(CPPFLAGS) -DUSE_MPI -fopenmp -o $(BIN)/omp_mpi \ + $(OBJ)/cabana_opp_omp_mpi.o \ + $(OBJ)/opp_kernels_omp_mpi.o \ + $(ALL_INC) $(ALL_LIBS) -lopp_omp_mpi + mpi_hdf5: mklib $(MPICPP) $(CPPFLAGS) -DUSE_MPI -c cabana_op_hdf5.cpp -o $(OBJ)/cabana_op_hdf5.o $(ALL_INC) $(MPICPP) $(CPPFLAGS) -DUSE_MPI -c mpi/mpikernels.cpp -o $(OBJ)/mpikernels.o $(ALL_INC) diff --git a/app_cabanapic_cg/Makefile b/app_cabanapic_cg/Makefile index 52ddf2b..7cc652c 100644 --- a/app_cabanapic_cg/Makefile +++ b/app_cabanapic_cg/Makefile @@ -92,6 +92,14 @@ mpi: mklib $(OBJ)/opp_kernels_mpi.o \ $(ALL_INC) $(ALL_LIBS) -lopp_mpi +omp_mpi: mklib + $(MPICPP) $(CPPFLAGS) -DUSE_MPI -fopenmp -DUSE_OMP -c cabana_opp.cpp -o $(OBJ)/cabana_opp_omp_mpi.o $(ALL_INC) + $(MPICPP) $(CPPFLAGS) -DUSE_MPI -fopenmp -DUSE_OMP -c omp/opp_kernels.cpp -o $(OBJ)/opp_kernels_omp_mpi.o $(ALL_INC) + $(MPICPP) $(CPPFLAGS) -DUSE_MPI -fopenmp -o $(BIN)/omp_mpi \ + $(OBJ)/cabana_opp_omp_mpi.o \ + $(OBJ)/opp_kernels_omp_mpi.o \ + $(ALL_INC) $(ALL_LIBS) -lopp_omp_mpi + mpi_hdf5: mklib $(MPICPP) $(CPPFLAGS) -DUSE_MPI -c cabana_op_hdf5.cpp -o $(OBJ)/cabana_op_hdf5.o $(ALL_INC) $(MPICPP) $(CPPFLAGS) -DUSE_MPI -c mpi/mpikernels.cpp -o $(OBJ)/mpikernels.o $(ALL_INC) diff --git a/app_cabanapic_cg/cuda/interpolate_mesh_fields_kernel_loop.hpp b/app_cabanapic_cg/cuda/interpolate_mesh_fields_kernel_loop.hpp index 99ad413..2e174c2 100644 --- a/app_cabanapic_cg/cuda/interpolate_mesh_fields_kernel_loop.hpp +++ 
b/app_cabanapic_cg/cuda/interpolate_mesh_fields_kernel_loop.hpp @@ -18,6 +18,12 @@ __constant__ OPP_INT opp_k1_map0_stride_d; namespace opp_k1 { +enum Dim { + x = 0, + y = 1, + z = 2, +}; + enum CellInterp { ex = 0, dexdy, @@ -39,12 +45,6 @@ enum CellInterp { dcbzdz, }; -enum Dim { - x = 0, - y = 1, - z = 2, -}; - __device__ inline void interpolate_mesh_fields_kernel( const double* cell0_e, const double* cell0_b, diff --git a/app_cabanapic_cg/cuda/move_deposit_kernel_loop.hpp b/app_cabanapic_cg/cuda/move_deposit_kernel_loop.hpp index 25c494b..6235f39 100644 --- a/app_cabanapic_cg/cuda/move_deposit_kernel_loop.hpp +++ b/app_cabanapic_cg/cuda/move_deposit_kernel_loop.hpp @@ -26,27 +26,6 @@ enum CellAcc { jfz = 2 * 4, }; -enum CellInterp { - ex = 0, - dexdy, - dexdz, - d2exdydz, - ey, - deydz, - deydx, - d2eydzdx, - ez, - dezdx, - dezdy, - d2ezdxdy, - cbx, - dcbxdx, - cby, - dcbydy, - cbz, - dcbzdz, -}; - __device__ inline void weight_current_to_accumulator_kernel( double* cell_acc, const double* q, @@ -82,6 +61,27 @@ enum Dim { z = 2, }; +enum CellInterp { + ex = 0, + dexdy, + dexdz, + d2exdydz, + ey, + deydz, + deydx, + d2eydzdx, + ez, + dezdx, + dezdy, + d2ezdxdy, + cbx, + dcbxdx, + cby, + dcbydy, + cbz, + dcbzdz, +}; + __device__ inline void move_deposit_kernel( char& opp_move_status_flag, const bool opp_move_hop_iter_one_flag, // Added by code-gen const OPP_INT* opp_c2c, OPP_INT* opp_p2c, // Added by code-gen diff --git a/app_cabanapic_cg/hip/interpolate_mesh_fields_kernel_loop.hpp b/app_cabanapic_cg/hip/interpolate_mesh_fields_kernel_loop.hpp index 1bcb348..2ca42ab 100644 --- a/app_cabanapic_cg/hip/interpolate_mesh_fields_kernel_loop.hpp +++ b/app_cabanapic_cg/hip/interpolate_mesh_fields_kernel_loop.hpp @@ -18,6 +18,12 @@ __constant__ OPP_INT opp_k1_map0_stride_d; namespace opp_k1 { +enum Dim { + x = 0, + y = 1, + z = 2, +}; + enum CellInterp { ex = 0, dexdy, @@ -39,12 +45,6 @@ enum CellInterp { dcbzdz, }; -enum Dim { - x = 0, - y = 1, - z = 2, -}; - __device__ inline void interpolate_mesh_fields_kernel( const double* cell0_e, const double* cell0_b, diff --git a/app_cabanapic_cg/hip/move_deposit_kernel_loop.hpp b/app_cabanapic_cg/hip/move_deposit_kernel_loop.hpp index 893543b..b90dfe2 100644 --- a/app_cabanapic_cg/hip/move_deposit_kernel_loop.hpp +++ b/app_cabanapic_cg/hip/move_deposit_kernel_loop.hpp @@ -26,27 +26,6 @@ enum CellAcc { jfz = 2 * 4, }; -enum CellInterp { - ex = 0, - dexdy, - dexdz, - d2exdydz, - ey, - deydz, - deydx, - d2eydzdx, - ez, - dezdx, - dezdy, - d2ezdxdy, - cbx, - dcbxdx, - cby, - dcbydy, - cbz, - dcbzdz, -}; - __device__ inline void weight_current_to_accumulator_kernel( double* cell_acc, const double* q, @@ -82,6 +61,27 @@ enum Dim { z = 2, }; +enum CellInterp { + ex = 0, + dexdy, + dexdz, + d2exdydz, + ey, + deydz, + deydx, + d2eydzdx, + ez, + dezdx, + dezdy, + d2ezdxdy, + cbx, + dcbxdx, + cby, + dcbydy, + cbz, + dcbzdz, +}; + __device__ inline void move_deposit_kernel( char& opp_move_status_flag, const bool opp_move_hop_iter_one_flag, // Added by code-gen const OPP_INT* opp_c2c, OPP_INT* opp_p2c, // Added by code-gen diff --git a/app_cabanapic_cg/mpi/interpolate_mesh_fields_kernel_loop.hpp b/app_cabanapic_cg/mpi/interpolate_mesh_fields_kernel_loop.hpp index 46ef72c..7b02d00 100644 --- a/app_cabanapic_cg/mpi/interpolate_mesh_fields_kernel_loop.hpp +++ b/app_cabanapic_cg/mpi/interpolate_mesh_fields_kernel_loop.hpp @@ -4,6 +4,12 @@ //********************************************* namespace opp_k1 { +enum Dim { + x = 0, + y = 1, + z = 2, +}; + enum CellInterp { ex 
= 0, dexdy, @@ -25,12 +31,6 @@ enum CellInterp { dcbzdz, }; -enum Dim { - x = 0, - y = 1, - z = 2, -}; - inline void interpolate_mesh_fields_kernel( const double* cell0_e, const double* cell0_b, diff --git a/app_cabanapic_cg/mpi/move_deposit_kernel_loop.hpp b/app_cabanapic_cg/mpi/move_deposit_kernel_loop.hpp index 428c171..a4e4c37 100644 --- a/app_cabanapic_cg/mpi/move_deposit_kernel_loop.hpp +++ b/app_cabanapic_cg/mpi/move_deposit_kernel_loop.hpp @@ -10,27 +10,6 @@ enum CellAcc { jfz = 2 * 4, }; -enum CellInterp { - ex = 0, - dexdy, - dexdz, - d2exdydz, - ey, - deydz, - deydx, - d2eydzdx, - ez, - dezdx, - dezdy, - d2ezdxdy, - cbx, - dcbxdx, - cby, - dcbydy, - cbz, - dcbzdz, -}; - inline void weight_current_to_accumulator_kernel( double* cell_acc, const double* q, @@ -66,6 +45,27 @@ enum Dim { z = 2, }; +enum CellInterp { + ex = 0, + dexdy, + dexdz, + d2exdydz, + ey, + deydz, + deydx, + d2eydzdx, + ez, + dezdx, + dezdy, + d2ezdxdy, + cbx, + dcbxdx, + cby, + dcbydy, + cbz, + dcbzdz, +}; + inline void move_deposit_kernel( double* part_vel, double* part_pos, @@ -341,7 +341,7 @@ void opp_particle_move__move_deposit_kernel(opp_set set, opp_map c2c_map, opp_ma }; // ---------------------------------------------------------------------------- - opp_init_particle_move(set, 0, nullptr); + opp_init_particle_move(set, nargs, args); opp_profiler->start("Mv_AllMv0"); @@ -369,7 +369,7 @@ void opp_particle_move__move_deposit_kernel(opp_set set, opp_map c2c_map, opp_ma const std::string profName = std::string("Mv_AllMv") + std::to_string(OPP_comm_iteration); opp_profiler->start(profName); - opp_init_particle_move(set, 0, nullptr); + opp_init_particle_move(set, nargs, args); // check whether particle is within cell, and if not move between cells within the MPI rank, mark for neighbour comm opp_profiler->start("move_deposit_kernel_only"); diff --git a/app_cabanapic_cg/omp/accumulate_current_to_cells_kernel_loop.hpp b/app_cabanapic_cg/omp/accumulate_current_to_cells_kernel_loop.hpp index fe6dda7..58e8e9b 100644 --- a/app_cabanapic_cg/omp/accumulate_current_to_cells_kernel_loop.hpp +++ b/app_cabanapic_cg/omp/accumulate_current_to_cells_kernel_loop.hpp @@ -88,8 +88,8 @@ void opp_par_loop_all__accumulate_current_to_cells_kernel(opp_set set, opp_itera #pragma omp parallel for for (int thr = 0; thr < nthreads; thr++) { - const size_t start = (iter_size * thr) / nthreads; - const size_t finish = (iter_size * (thr+1)) / nthreads; + const size_t start = ((size_t)iter_size * thr) / nthreads; + const size_t finish = ((size_t)iter_size * (thr+1)) / nthreads; for (size_t n = start; n < finish; n++) { diff --git a/app_cabanapic_cg/omp/advance_e_kernel_loop.hpp b/app_cabanapic_cg/omp/advance_e_kernel_loop.hpp index eca8d69..edf424d 100644 --- a/app_cabanapic_cg/omp/advance_e_kernel_loop.hpp +++ b/app_cabanapic_cg/omp/advance_e_kernel_loop.hpp @@ -73,8 +73,8 @@ void opp_par_loop_all__advance_e_kernel(opp_set set, opp_iterate_type, #pragma omp parallel for for (int thr = 0; thr < nthreads; thr++) { - const size_t start = (iter_size * thr) / nthreads; - const size_t finish = (iter_size * (thr+1)) / nthreads; + const size_t start = ((size_t)iter_size * thr) / nthreads; + const size_t finish = ((size_t)iter_size * (thr+1)) / nthreads; for (size_t n = start; n < finish; n++) { diff --git a/app_cabanapic_cg/omp/compute_energy_kernel_loop.hpp b/app_cabanapic_cg/omp/compute_energy_kernel_loop.hpp index 717919e..030095e 100644 --- a/app_cabanapic_cg/omp/compute_energy_kernel_loop.hpp +++ 
b/app_cabanapic_cg/omp/compute_energy_kernel_loop.hpp @@ -59,8 +59,8 @@ void opp_par_loop_all__compute_energy_kernel(opp_set set, opp_iterate_type, #pragma omp parallel for for (int thr = 0; thr < nthreads; thr++) { - const size_t start = (iter_size * thr) / nthreads; - const size_t finish = (iter_size * (thr+1)) / nthreads; + const size_t start = ((size_t)iter_size * thr) / nthreads; + const size_t finish = ((size_t)iter_size * (thr+1)) / nthreads; for (size_t n = start; n < finish; n++) { diff --git a/app_cabanapic_cg/omp/get_max_x_values_kernel_loop.hpp b/app_cabanapic_cg/omp/get_max_x_values_kernel_loop.hpp index 6f614a0..24f9e7e 100644 --- a/app_cabanapic_cg/omp/get_max_x_values_kernel_loop.hpp +++ b/app_cabanapic_cg/omp/get_max_x_values_kernel_loop.hpp @@ -73,8 +73,8 @@ void opp_par_loop_all__get_max_x_values_kernel(opp_set set, opp_iterate_type, #pragma omp parallel for for (int thr = 0; thr < nthreads; thr++) { - const size_t start = (iter_size * thr) / nthreads; - const size_t finish = (iter_size * (thr+1)) / nthreads; + const size_t start = ((size_t)iter_size * thr) / nthreads; + const size_t finish = ((size_t)iter_size * (thr+1)) / nthreads; for (size_t n = start; n < finish; n++) { diff --git a/app_cabanapic_cg/omp/half_advance_b_kernel_loop.hpp b/app_cabanapic_cg/omp/half_advance_b_kernel_loop.hpp index b726e0c..6169963 100644 --- a/app_cabanapic_cg/omp/half_advance_b_kernel_loop.hpp +++ b/app_cabanapic_cg/omp/half_advance_b_kernel_loop.hpp @@ -67,8 +67,8 @@ void opp_par_loop_all__half_advance_b_kernel(opp_set set, opp_iterate_type, #pragma omp parallel for for (int thr = 0; thr < nthreads; thr++) { - const size_t start = (iter_size * thr) / nthreads; - const size_t finish = (iter_size * (thr+1)) / nthreads; + const size_t start = ((size_t)iter_size * thr) / nthreads; + const size_t finish = ((size_t)iter_size * (thr+1)) / nthreads; for (size_t n = start; n < finish; n++) { diff --git a/app_cabanapic_cg/omp/interpolate_mesh_fields_kernel_loop.hpp b/app_cabanapic_cg/omp/interpolate_mesh_fields_kernel_loop.hpp index 77bf955..827aa5f 100644 --- a/app_cabanapic_cg/omp/interpolate_mesh_fields_kernel_loop.hpp +++ b/app_cabanapic_cg/omp/interpolate_mesh_fields_kernel_loop.hpp @@ -4,6 +4,12 @@ //********************************************* namespace opp_k1 { +enum Dim { + x = 0, + y = 1, + z = 2, +}; + enum CellInterp { ex = 0, dexdy, @@ -25,12 +31,6 @@ enum CellInterp { dcbzdz, }; -enum Dim { - x = 0, - y = 1, - z = 2, -}; - inline void interpolate_mesh_fields_kernel( const double* cell0_e, const double* cell0_b, @@ -153,8 +153,8 @@ void opp_par_loop_all__interpolate_mesh_fields_kernel(opp_set set, opp_iterate_t #pragma omp parallel for for (int thr = 0; thr < nthreads; thr++) { - const size_t start = (iter_size * thr) / nthreads; - const size_t finish = (iter_size * (thr+1)) / nthreads; + const size_t start = ((size_t)iter_size * thr) / nthreads; + const size_t finish = ((size_t)iter_size * (thr+1)) / nthreads; for (size_t n = start; n < finish; n++) { diff --git a/app_cabanapic_cg/omp/move_deposit_kernel_loop.hpp b/app_cabanapic_cg/omp/move_deposit_kernel_loop.hpp index 151d0fb..43fe182 100644 --- a/app_cabanapic_cg/omp/move_deposit_kernel_loop.hpp +++ b/app_cabanapic_cg/omp/move_deposit_kernel_loop.hpp @@ -10,27 +10,6 @@ enum CellAcc { jfz = 2 * 4, }; -enum CellInterp { - ex = 0, - dexdy, - dexdz, - d2exdydz, - ey, - deydz, - deydx, - d2eydzdx, - ez, - dezdx, - dezdy, - d2ezdxdy, - cbx, - dcbxdx, - cby, - dcbydy, - cbz, - dcbzdz, -}; - inline void 
weight_current_to_accumulator_kernel( double* cell_acc, const double* q, @@ -66,6 +45,27 @@ enum Dim { z = 2, }; +enum CellInterp { + ex = 0, + dexdy, + dexdz, + d2exdydz, + ey, + deydz, + deydx, + d2eydzdx, + ez, + dezdx, + dezdy, + d2ezdxdy, + cbx, + dcbxdx, + cby, + dcbydy, + cbz, + dcbzdz, +}; + inline void move_deposit_kernel( char& opp_move_status_flag, const bool opp_move_hop_iter_one_flag, // Added by code-gen const OPP_INT* opp_c2c, OPP_INT* opp_p2c, // Added by code-gen @@ -350,7 +350,8 @@ void opp_particle_move__move_deposit_kernel(opp_set set, opp_map c2c_map, opp_ma }; // ---------------------------------------------------------------------------- - opp_init_particle_move(set, 0, nullptr); + opp_init_particle_move(set, nargs, args); + const int total_count = OPP_iter_end - OPP_iter_start; opp_profiler->start("Mv_AllMv0"); @@ -359,7 +360,6 @@ void opp_particle_move__move_deposit_kernel(opp_set set, opp_map c2c_map, opp_ma // check whether all particles not marked for global comm is within cell, // and if not mark to move between cells within the MPI rank, mark for neighbour comm opp_profiler->start("move_deposit_kernel_only"); - const int total_count = OPP_iter_end - OPP_iter_start; #pragma omp parallel for for (int thr = 0; thr < nthreads; thr++) { @@ -386,7 +386,7 @@ void opp_particle_move__move_deposit_kernel(opp_set set, opp_map c2c_map, opp_ma const std::string profName = std::string("Mv_AllMv") + std::to_string(OPP_comm_iteration); opp_profiler->start(profName); - opp_init_particle_move(set, 0, nullptr); + opp_init_particle_move(set, nargs, args); // check whether particle is within cell, and if not move between cells within the MPI rank, mark for neighbour comm opp_profiler->start("move_deposit_kernel_only"); diff --git a/app_cabanapic_cg/omp/update_ghosts_B_kernel_loop.hpp b/app_cabanapic_cg/omp/update_ghosts_B_kernel_loop.hpp index 3aee572..f672235 100644 --- a/app_cabanapic_cg/omp/update_ghosts_B_kernel_loop.hpp +++ b/app_cabanapic_cg/omp/update_ghosts_B_kernel_loop.hpp @@ -56,8 +56,8 @@ void opp_par_loop_all__update_ghosts_B_kernel(opp_set set, opp_iterate_type, #pragma omp parallel for for (int thr = 0; thr < nthreads; thr++) { - const size_t start = (iter_size * thr) / nthreads; - const size_t finish = (iter_size * (thr+1)) / nthreads; + const size_t start = ((size_t)iter_size * thr) / nthreads; + const size_t finish = ((size_t)iter_size * (thr+1)) / nthreads; for (size_t n = start; n < finish; n++) { diff --git a/app_cabanapic_cg/omp/update_ghosts_kernel_loop.hpp b/app_cabanapic_cg/omp/update_ghosts_kernel_loop.hpp index 0a8eb1c..db77779 100644 --- a/app_cabanapic_cg/omp/update_ghosts_kernel_loop.hpp +++ b/app_cabanapic_cg/omp/update_ghosts_kernel_loop.hpp @@ -49,8 +49,8 @@ void opp_par_loop_all__update_ghosts_kernel(opp_set set, opp_iterate_type, #pragma omp parallel for for (int thr = 0; thr < nthreads; thr++) { - const size_t start = (iter_size * thr) / nthreads; - const size_t finish = (iter_size * (thr+1)) / nthreads; + const size_t start = ((size_t)iter_size * thr) / nthreads; + const size_t finish = ((size_t)iter_size * (thr+1)) / nthreads; for (size_t n = start; n < finish; n++) { diff --git a/app_cabanapic_cg/seq/interpolate_mesh_fields_kernel_loop.hpp b/app_cabanapic_cg/seq/interpolate_mesh_fields_kernel_loop.hpp index b74dbfa..bbbab2b 100644 --- a/app_cabanapic_cg/seq/interpolate_mesh_fields_kernel_loop.hpp +++ b/app_cabanapic_cg/seq/interpolate_mesh_fields_kernel_loop.hpp @@ -4,6 +4,12 @@ //********************************************* namespace opp_k1 
{ +enum Dim { + x = 0, + y = 1, + z = 2, +}; + enum CellInterp { ex = 0, dexdy, @@ -25,12 +31,6 @@ enum CellInterp { dcbzdz, }; -enum Dim { - x = 0, - y = 1, - z = 2, -}; - inline void interpolate_mesh_fields_kernel( const double* cell0_e, const double* cell0_b, diff --git a/app_cabanapic_cg/seq/move_deposit_kernel_loop.hpp b/app_cabanapic_cg/seq/move_deposit_kernel_loop.hpp index d1e72ca..b08d65a 100644 --- a/app_cabanapic_cg/seq/move_deposit_kernel_loop.hpp +++ b/app_cabanapic_cg/seq/move_deposit_kernel_loop.hpp @@ -10,27 +10,6 @@ enum CellAcc { jfz = 2 * 4, }; -enum CellInterp { - ex = 0, - dexdy, - dexdz, - d2exdydz, - ey, - deydz, - deydx, - d2eydzdx, - ez, - dezdx, - dezdy, - d2ezdxdy, - cbx, - dcbxdx, - cby, - dcbydy, - cbz, - dcbzdz, -}; - inline void weight_current_to_accumulator_kernel( double* cell_acc, const double* q, @@ -66,6 +45,27 @@ enum Dim { z = 2, }; +enum CellInterp { + ex = 0, + dexdy, + dexdz, + d2exdydz, + ey, + deydz, + deydx, + d2eydzdx, + ez, + dezdx, + dezdy, + d2ezdxdy, + cbx, + dcbxdx, + cby, + dcbydy, + cbz, + dcbzdz, +}; + inline void move_deposit_kernel( double* part_vel, double* part_pos, diff --git a/app_fempic/fempic.cpp b/app_fempic/fempic.cpp index 66ba838..c9cbd38 100644 --- a/app_fempic/fempic.cpp +++ b/app_fempic/fempic.cpp @@ -209,16 +209,19 @@ int main(int argc, char **argv) if (print_final_log) { - OPP_REAL max_n_chg_den = 0.0, max_n_pot = 0.0; + OPP_REAL max_n_chg_den = 0.0, max_n_pot = 0.0, max_c_ef = 0.0; + opp_par_loop(get_max_cef_kernel, "get_max_cef", cell_set, OPP_ITERATE_ALL, + opp_arg_dat(c_ef, OPP_READ), + opp_arg_gbl(&max_c_ef, 1, "double", OPP_MAX)); opp_par_loop(get_final_max_values_kernel, "get_final_max_values", node_set, OPP_ITERATE_ALL, opp_arg_dat(n_charge_den, OPP_READ), opp_arg_gbl(&max_n_chg_den, 1, "double", OPP_MAX), opp_arg_dat(n_potential, OPP_READ), opp_arg_gbl(&max_n_pot, 1, "double", OPP_MAX)); - log = get_global_level_log(max_n_chg_den, max_n_pot, particle_set->size, - inject_count, (old_nparts - particle_set->size)); + log = get_global_level_log(max_c_ef, max_n_pot, particle_set->size, inject_count, + (old_nparts - particle_set->size)); } total_part_iter += particle_set->size; diff --git a/app_fempic/fempic_hdf5.cpp b/app_fempic/fempic_hdf5.cpp index 6146f28..b8128d6 100644 --- a/app_fempic/fempic_hdf5.cpp +++ b/app_fempic/fempic_hdf5.cpp @@ -206,16 +206,19 @@ int main(int argc, char **argv) if (print_final_log) { - OPP_REAL max_n_chg_den = 0.0, max_n_pot = 0.0; + OPP_REAL max_n_chg_den = 0.0, max_n_pot = 0.0, max_c_ef = 0.0; + opp_par_loop(get_max_cef_kernel, "get_max_cef", cell_set, OPP_ITERATE_ALL, + opp_arg_dat(c_ef, OPP_READ), + opp_arg_gbl(&max_c_ef, 1, "double", OPP_MAX)); opp_par_loop(get_final_max_values_kernel, "get_final_max_values", node_set, OPP_ITERATE_ALL, opp_arg_dat(n_charge_den, OPP_READ), opp_arg_gbl(&max_n_chg_den, 1, "double", OPP_MAX), opp_arg_dat(n_potential, OPP_READ), opp_arg_gbl(&max_n_pot, 1, "double", OPP_MAX)); - log = get_global_level_log(max_n_chg_den, max_n_pot, particle_set->size, - inject_count, (old_nparts - particle_set->size)); + log = get_global_level_log(max_c_ef, max_n_pot, particle_set->size, inject_count, + (old_nparts - particle_set->size)); } total_part_iter += particle_set->size; diff --git a/app_fempic/fempic_misc.h b/app_fempic/fempic_misc.h index 6955b72..64b48bf 100644 --- a/app_fempic/fempic_misc.h +++ b/app_fempic/fempic_misc.h @@ -39,8 +39,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
* This function will enrich the particle distribution values into if_dist_dat * In addition, dummy_random dat will get enriched with random values in "rand_file" */ -inline int init_inject_distributions(opp_dat if_dist_dat, opp_dat if_area_dat, - opp_dat dummy_random) +inline int init_inject_distributions(opp_dat if_dist_dat, opp_dat if_area_dat, opp_dat dummy_random) { if (OPP_DBG) opp_printf("init_inject_distributions", "START"); @@ -52,8 +51,8 @@ inline int init_inject_distributions(opp_dat if_dist_dat, opp_dat if_area_dat, int total_inject_count = 0, max_inject_count_per_face = 0; double remainder = 0.0; - if (OPP_DBG) - opp_printf("init_inject_distributions", "plasma_den=%2.15lE dt=%2.25lE ion_velocity=%2.15lE spwt=%2.15lE", + OPP_RUN_ON_ROOT() + opp_printf("init_inject_distributions", "plasma_den=%2.15lE dt=%2.25lE ion_vel=%2.15lE spwt=%2.15lE", plasma_den, dt, ion_velocity, spwt); // find the number of particles to be injected through each inlet face and @@ -66,7 +65,7 @@ inline int init_inject_distributions(opp_dat if_dist_dat, opp_dat if_area_dat, double num_real = num_per_sec * dt; double fnum_mp = num_real / spwt + remainder; int num_mp = (int)fnum_mp; - remainder = fnum_mp - num_mp; + // remainder = fnum_mp - num_mp; total_inject_count += num_mp; @@ -196,14 +195,14 @@ inline void print_per_cell_particle_counts(opp_dat c_part_count, opp_dat part_me /************************************************************************************************* * This is a utility function create a global level log string using local values - * @param max_n_charge_density - this is already reduced glocally by the opp_par_loop + * @param max_c_ef - this is already reduced globally by the opp_par_loop * @param max_n_potential - this is already reduced glocally by the opp_par_loop * @param local_part_count - these are local values * @param local_parts_injected - these are local values * @param local_part_removed - these are local values * @return std::string */ -inline std::string get_global_level_log(double max_n_charge_density, double max_n_potential, +inline std::string get_global_level_log(double max_c_ef, double max_n_potential, int local_part_count, int local_parts_injected, int local_part_removed) { std::string log = ""; @@ -230,8 +229,8 @@ inline std::string get_global_level_log(double max_n_charge_density, double max_ log += std::string("\t np: ") + str(global_part_size, "%" PRId64); log += std::string(" (") + str(global_inj_size, "%" PRId64); log += std::string(" added, ") + str(global_removed, "%" PRId64); - log += std::string(" removed)\t max den: ") + str(max_n_charge_density, "%2.25lE"); - log += std::string(" max |phi|: ") + str(max_n_potential, "%2.10lE"); + log += std::string(" removed)\t max c_ef: ") + str(max_c_ef, "%2.15lE"); + log += std::string(" max n_pot: ") + str(max_n_potential, "%2.15lE"); log += std::string(" max_comm_iteration: ") + str(global_max_comm_iteration, "%d"); return log; } diff --git a/app_fempic/kernels.h b/app_fempic/kernels.h index 1a5efcd..3d17bf4 100644 --- a/app_fempic/kernels.h +++ b/app_fempic/kernels.h @@ -35,8 +35,6 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. // USER WRITTEN CODE //********************************************* -// #define OPP_KERNEL_LOOPP_UNROLL - #include "opp_lib.h" //**************************************** @@ -46,34 +44,6 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#define KERNEL_NEIGHB_C 4 #define KERNEL_DIM 3 -//************************************************************************************************* -inline void calculate_injection_distribution( - int* injected_total, - double* face_area, - int* particle_distribution, - double* remainder -) -{ - /*number of real ions per sec, given prescribed density and velocity*/ - double num_per_sec = CONST_plasma_den[0] * CONST_ion_velocity[0] * (*face_area); - - /*number of ions to generate in this time step*/ - double num_real = num_per_sec * CONST_dt[0]; - - /*fraction number of macroparticles*/ - double fnum_mp = num_real / CONST_spwt[0] + (*remainder); - - /*integer number of macroparticles*/ - int num_mp = (int)fnum_mp; - - /*update reminder*/ - (*remainder) = fnum_mp - num_mp; - - (*injected_total) += num_mp; - - (*particle_distribution) = (*injected_total); -} - //************************************************************************************************* inline void init_boundary_pot_kernel( const int *node_type, @@ -111,7 +81,7 @@ inline void inject_ions_kernel( { double a = dummy_part_random[0]; double b = dummy_part_random[1]; - if ((a + b) > 1) + if ((a + b) > 1) // TODO : Change the random dat to avoid this { a = (1 - a); b = (1 - b); @@ -142,161 +112,6 @@ inline void calculate_new_pos_vel_kernel( } //************************************************************************************************* -inline void deposit_charge_on_nodes_kernel( - const double *part_lc, - double *node_charge_den0, - double *node_charge_den1, - double *node_charge_den2, - double *node_charge_den3) { - - node_charge_den0[0] += part_lc[0]; - node_charge_den1[0] += part_lc[1]; - node_charge_den2[0] += part_lc[2]; - node_charge_den3[0] += part_lc[3]; -} - -//************************************************************************************************* -inline void move_all_particles_to_cell_kernel( - const double *cell_ef, - double *part_pos, - double *part_vel, - double *part_lc, - int* current_cell_index, - const double *current_cell_volume, - const double *current_cell_det, - const int *cell_connectivity, - double *node_charge_den0, - double *node_charge_den1, - double *node_charge_den2, - double *node_charge_den3 -) -{ - if (OPP_DO_ONCE) - { - const double coefficient1 = CONST_charge[0] / CONST_mass[0] * (CONST_dt[0]); - for (int i = 0; i < KERNEL_DIM; i++) - part_vel[i] += (coefficient1 * cell_ef[i]); - - for (int i = 0; i < KERNEL_DIM; i++) - part_pos[i] += part_vel[i] * (CONST_dt[0]); // v = u + at - } - - bool inside = true; - const double coefficient2 = KERNEL_ONE_OVER_SIX / (*current_cell_volume); - for (int i=0; i 1.0) - inside = false; - // m.inside_cell = false; - } - - if (inside) - { - OPP_PARTICLE_MOVE_DONE; - - (*node_charge_den0) += part_lc[0]; - (*node_charge_den1) += part_lc[1]; - (*node_charge_den2) += part_lc[2]; - (*node_charge_den3) += part_lc[3]; - - return; - } - - // outside the last known cell, find most negative weight and - // use that cell_index to reduce computations - int min_i = 0; - double min_lc = part_lc[0]; - - for (int i=1; i= 0) // is there a neighbor in this direction? 
- { - (*current_cell_index) = cell_connectivity[min_i]; - OPP_PARTICLE_NEED_MOVE; - } - else - { - OPP_PARTICLE_NEED_REMOVE; - } -} - -//************************************************************************************************* -inline void compute_node_charge_density_kernel( - double *node_charge_den, - const double *node_volume -) -{ - (*node_charge_den) *= (CONST_spwt[0] / node_volume[0]); -} - -//************************************************************************************************* -inline void compute_electric_field_kernel( - double *cell_electric_field, - const double *cell_shape_deriv, - const double *node_potential0, - const double *node_potential1, - const double *node_potential2, - const double *node_potential3 -) -{ - for (int dim = 0; dim < KERNEL_DIM; dim++) - { - const double c1 = (cell_shape_deriv[0 * KERNEL_DIM + dim] * (*node_potential0)); - const double c2 = (cell_shape_deriv[1 * KERNEL_DIM + dim] * (*node_potential1)); - const double c3 = (cell_shape_deriv[2 * KERNEL_DIM + dim] * (*node_potential2)); - const double c4 = (cell_shape_deriv[3 * KERNEL_DIM + dim] * (*node_potential3)); - - cell_electric_field[dim] -= (c1 + c2 + c3 + c4); - } -} - -//************************************************************************************************* - -// #define OPP_LOOPP_UNROLL -//******************************************************************************* -inline void isPointInCellKernel(bool& inside, const double *point_pos, double* point_lc, - const double *cell_volume, const double *cell_det) { - - inside = true; - const double coefficient2 = KERNEL_ONE_OVER_SIX / (*cell_volume); - - for (int i=0; i 1.0 || - point_lc[1] < 0.0 || - point_lc[1] > 1.0 || - point_lc[2] < 0.0 || - point_lc[2] > 1.0 || - point_lc[3] < 0.0 || - point_lc[3] > 1.0) { - - inside = false; - } -} - -//******************************************************************************* inline void move_kernel( const double *point_pos, double* point_lc, const double *cell_volume, const double *cell_det @@ -348,7 +163,51 @@ inline void move_kernel( } } -//******************************************************************************* +//************************************************************************************************* +inline void deposit_charge_on_nodes_kernel( + const double *part_lc, + double *node_charge_den0, + double *node_charge_den1, + double *node_charge_den2, + double *node_charge_den3) { + + node_charge_den0[0] += part_lc[0]; + node_charge_den1[0] += part_lc[1]; + node_charge_den2[0] += part_lc[2]; + node_charge_den3[0] += part_lc[3]; +} + +//************************************************************************************************* +inline void compute_node_charge_density_kernel( + double *node_charge_den, + const double *node_volume +) +{ + (*node_charge_den) *= (CONST_spwt[0] / node_volume[0]); +} + +//************************************************************************************************* +inline void compute_electric_field_kernel( + double *cell_electric_field, + const double *cell_shape_deriv, + const double *node_potential0, + const double *node_potential1, + const double *node_potential2, + const double *node_potential3 +) +{ + for (int dim = 0; dim < KERNEL_DIM; dim++) + { + const double c1 = (cell_shape_deriv[0 * KERNEL_DIM + dim] * (*node_potential0)); + const double c2 = (cell_shape_deriv[1 * KERNEL_DIM + dim] * (*node_potential1)); + const double c3 = (cell_shape_deriv[2 * KERNEL_DIM + dim] * (*node_potential2)); + const double c4 = 
(cell_shape_deriv[3 * KERNEL_DIM + dim] * (*node_potential3)); + + cell_electric_field[dim] -= (c1 + c2 + c3 + c4); + } +} + +//************************************************************************************************* inline void get_final_max_values_kernel( const OPP_REAL* n_charge_den, OPP_REAL* max_n_charge_den, @@ -358,4 +217,15 @@ inline void get_final_max_values_kernel( *max_n_charge_den = MAX(abs(*n_charge_den), *max_n_charge_den); *max_n_pot = MAX(*n_pot, *max_n_pot); -} \ No newline at end of file +} + +//******************************************************************************* +inline void get_max_cef_kernel( + const OPP_REAL* val, + OPP_REAL* max_val) +{ + for (int dim = 0; dim < KERNEL_DIM; ++dim) + { + *max_val = MAX(val[dim], *max_val); + } +} diff --git a/app_fempic_cg/Makefile b/app_fempic_cg/Makefile index e86ec68..c283c02 100644 --- a/app_fempic_cg/Makefile +++ b/app_fempic_cg/Makefile @@ -122,7 +122,7 @@ omp_mpi: mklib $(MPICPP) $(CPPFLAGS) -fopenmp -DUSE_OMP -DUSE_MPI -c fempic_opp.cpp -o $(OBJ)/fempic_opp_omp_mpi.o $(ALL_INC) $(MPICPP) $(CPPFLAGS) -fopenmp -DUSE_OMP -DUSE_MPI -c field_solver/cpu.cpp -o $(OBJ)/field_solver_cpu_omp_mpi.o $(ALL_INC) $(MPICPP) $(CPPFLAGS) -fopenmp -DUSE_OMP -DUSE_MPI -c omp/opp_kernels.cpp -o $(OBJ)/opp_kernels_omp_mpi.o $(ALL_INC) - $(MPICPP) $(CPPFLAGS) -fopenmp -o $(BIN)/omp_mpi_hdf5 \ + $(MPICPP) $(CPPFLAGS) -fopenmp -o $(BIN)/omp_mpi \ $(OBJ)/fempic_opp_omp_mpi.o \ $(OBJ)/opp_kernels_omp_mpi.o \ $(OBJ)/field_solver_cpu_omp_mpi.o \ diff --git a/app_fempic_cg/cuda/get_final_max_values_kernel_loop.hpp b/app_fempic_cg/cuda/get_final_max_values_kernel_loop.hpp index 978f5f7..1ae6e00 100644 --- a/app_fempic_cg/cuda/get_final_max_values_kernel_loop.hpp +++ b/app_fempic_cg/cuda/get_final_max_values_kernel_loop.hpp @@ -3,15 +3,15 @@ // AUTO GENERATED CODE //********************************************* -OPP_INT opp_k8_dat0_stride = -1; -OPP_INT opp_k8_dat1_stride = -1; +OPP_INT opp_k9_dat0_stride = -1; +OPP_INT opp_k9_dat1_stride = -1; -__constant__ OPP_INT opp_k8_dat0_stride_d; -__constant__ OPP_INT opp_k8_dat1_stride_d; +__constant__ OPP_INT opp_k9_dat0_stride_d; +__constant__ OPP_INT opp_k9_dat1_stride_d; -namespace opp_k8 { +namespace opp_k9 { __device__ inline void get_final_max_values_kernel( const double* n_charge_den, double* max_n_charge_den, @@ -47,7 +47,7 @@ __global__ void opp_dev_get_final_max_values_kernel( for (int n = thread_id; n < (end - start); n += blockDim.x * gridDim.x) { - opp_k8::get_final_max_values_kernel( + opp_k9::get_final_max_values_kernel( dat0 + n, // n_charge_den gbl1_local, // dat1 + n, // n_potential @@ -89,17 +89,17 @@ void opp_par_loop_all__get_final_max_values_kernel(opp_set set, opp_iterate_type OPP_REAL *arg1_host_data = (OPP_REAL *)args[1].data; OPP_REAL *arg3_host_data = (OPP_REAL *)args[3].data; - if (opp_k8_dat0_stride != args[0].dat->set->set_capacity) { - opp_k8_dat0_stride = args[0].dat->set->set_capacity; - cutilSafeCall(cudaMemcpyToSymbol(opp_k8_dat0_stride_d, &opp_k8_dat0_stride, sizeof(OPP_INT))); + if (opp_k9_dat0_stride != args[0].dat->set->set_capacity) { + opp_k9_dat0_stride = args[0].dat->set->set_capacity; + cutilSafeCall(cudaMemcpyToSymbol(opp_k9_dat0_stride_d, &opp_k9_dat0_stride, sizeof(OPP_INT))); } - if (opp_k8_dat1_stride != args[2].dat->set->set_capacity) { - opp_k8_dat1_stride = args[2].dat->set->set_capacity; - cutilSafeCall(cudaMemcpyToSymbol(opp_k8_dat1_stride_d, &opp_k8_dat1_stride, sizeof(OPP_INT))); + if (opp_k9_dat1_stride != args[2].dat->set->set_capacity) { 
+ opp_k9_dat1_stride = args[2].dat->set->set_capacity; + cutilSafeCall(cudaMemcpyToSymbol(opp_k9_dat1_stride_d, &opp_k9_dat1_stride, sizeof(OPP_INT))); } -#ifdef OPP_BLOCK_SIZE_8 - const int block_size = OPP_BLOCK_SIZE_8; +#ifdef OPP_BLOCK_SIZE_9 + const int block_size = OPP_BLOCK_SIZE_9; #else const int block_size = OPP_gpu_threads_per_block; #endif diff --git a/app_fempic_cg/cuda/get_max_cef_kernel_loop.hpp b/app_fempic_cg/cuda/get_max_cef_kernel_loop.hpp new file mode 100644 index 0000000..5578a4e --- /dev/null +++ b/app_fempic_cg/cuda/get_max_cef_kernel_loop.hpp @@ -0,0 +1,139 @@ + +//********************************************* +// AUTO GENERATED CODE +//********************************************* + +OPP_INT opp_k8_dat0_stride = -1; + +__constant__ OPP_INT opp_k8_dat0_stride_d; + + + +namespace opp_k8 { +__device__ inline void get_max_cef_kernel( + const double* val, + double* max_val) +{ + for (int dim = 0; dim < 3; ++dim) + { + *max_val = ((val[(dim) * opp_k8_dat0_stride_d] > *max_val) ? (val[(dim) * opp_k8_dat0_stride_d]) : (*max_val)); + } +} + +} + +//-------------------------------------------------------------- +__global__ void opp_dev_get_max_cef_kernel( + const OPP_REAL *__restrict__ dat0, // c_ef + OPP_REAL *gbl1, + const OPP_INT start, + const OPP_INT end +) +{ + OPP_REAL gbl1_local[1]; + for (int d = 0; d < 1; ++d) + gbl1_local[d] = gbl1[blockIdx.x * 1 + d]; + + const int thread_id = threadIdx.x + blockIdx.x * blockDim.x; + + for (int n = thread_id; n < (end - start); n += blockDim.x * gridDim.x) { + + opp_k8::get_max_cef_kernel( + dat0 + n, // c_ef + gbl1_local // + ); + } + + for (int d = 0; d < 1; ++d) + opp_reduction(gbl1 + blockIdx.x * 1 + d, gbl1_local[d]); + +} + +//-------------------------------------------------------------- +void opp_par_loop_all__get_max_cef_kernel(opp_set set, opp_iterate_type, + opp_arg arg0, // c_ef | OPP_READ + opp_arg arg1 // | OPP_MAX +) +{ + const int nargs = 2; + opp_arg args[nargs]; + + args[0] = arg0; + args[1] = arg1; + + opp_profiler->start("get_max_cef_kernel"); + + if (OPP_DBG) opp_printf("APP", "opp_par_loop_all__get_max_cef_kernel set_size %d", set->size); + + const int iter_size = opp_mpi_halo_exchanges_grouped(set, nargs, args, Device_GPU); + + + OPP_REAL *arg1_host_data = (OPP_REAL *)args[1].data; + + if (opp_k8_dat0_stride != args[0].dat->set->set_capacity) { + opp_k8_dat0_stride = args[0].dat->set->set_capacity; + cutilSafeCall(cudaMemcpyToSymbol(opp_k8_dat0_stride_d, &opp_k8_dat0_stride, sizeof(OPP_INT))); + } + +#ifdef OPP_BLOCK_SIZE_8 + const int block_size = OPP_BLOCK_SIZE_8; +#else + const int block_size = OPP_gpu_threads_per_block; +#endif + + opp_mpi_halo_wait_all(nargs, args); + + int num_blocks = 200; + + int max_blocks = num_blocks; + + int reduction_bytes = 0; + int reduction_size = 0; + + reduction_bytes += ROUND_UP(max_blocks * 1 * sizeof(OPP_REAL)); + reduction_size = MAX(reduction_size, sizeof(OPP_REAL)); + + opp_reallocReductArrays(reduction_bytes); + reduction_bytes = 0; + + args[1].data = OPP_reduct_h + reduction_bytes; + args[1].data_d = OPP_reduct_d + reduction_bytes; + + for (int b = 0; b < max_blocks; ++b) { + for (int d = 0; d < 1; ++d) + ((OPP_REAL *)args[1].data)[b * 1 + d] = arg1_host_data[d]; + } + + reduction_bytes += ROUND_UP(max_blocks * 1 * sizeof(OPP_REAL)); + + opp_mvReductArraysToDevice(reduction_bytes); + + if (iter_size > 0) + { + const OPP_INT start = 0; + const OPP_INT end = iter_size; + { + opp_dev_get_max_cef_kernel<<>>( + (OPP_REAL *)args[0].data_d, // c_ef + (OPP_REAL 
*)args[1].data_d, + start, + end + ); + } + } + + opp_mvReductArraysToHost(reduction_bytes); + + for (int b = 0; b < max_blocks; ++b) { + for (int d = 0; d < 1; ++d) + arg1_host_data[d] = MAX(arg1_host_data[d], ((OPP_REAL *)args[1].data)[b * 1 + d]); + } + + args[1].data = (char *)arg1_host_data; + opp_mpi_reduce(&args[1], arg1_host_data); + + opp_set_dirtybit_grouped(nargs, args, Device_GPU); + cutilSafeCall(cudaDeviceSynchronize()); + + opp_profiler->end("get_max_cef_kernel"); +} diff --git a/app_fempic_cg/cuda/inject_ions_kernel_loop.hpp b/app_fempic_cg/cuda/inject_ions_kernel_loop.hpp index 2c6645c..a945da6 100644 --- a/app_fempic_cg/cuda/inject_ions_kernel_loop.hpp +++ b/app_fempic_cg/cuda/inject_ions_kernel_loop.hpp @@ -45,7 +45,7 @@ __device__ inline void inject_ions_kernel( { double a = dummy_part_random[(0) * opp_k2_dat9_stride_d]; double b = dummy_part_random[(1) * opp_k2_dat9_stride_d]; - if ((a + b) > 1) + if ((a + b) > 1) // TODO : Change the random dat to avoid this { a = (1 - a); b = (1 - b); diff --git a/app_fempic_cg/cuda/opp_kernels.cu b/app_fempic_cg/cuda/opp_kernels.cu index fff3e4e..1755952 100644 --- a/app_fempic_cg/cuda/opp_kernels.cu +++ b/app_fempic_cg/cuda/opp_kernels.cu @@ -99,5 +99,7 @@ void opp_decl_const_impl(int dim, int size, char* data, const char* name) { #include "compute_electric_field_kernel_loop.hpp" +#include "get_max_cef_kernel_loop.hpp" + #include "get_final_max_values_kernel_loop.hpp" diff --git a/app_fempic_cg/fempic.cpp b/app_fempic_cg/fempic.cpp index 66ba838..c9cbd38 100644 --- a/app_fempic_cg/fempic.cpp +++ b/app_fempic_cg/fempic.cpp @@ -209,16 +209,19 @@ int main(int argc, char **argv) if (print_final_log) { - OPP_REAL max_n_chg_den = 0.0, max_n_pot = 0.0; + OPP_REAL max_n_chg_den = 0.0, max_n_pot = 0.0, max_c_ef = 0.0; + opp_par_loop(get_max_cef_kernel, "get_max_cef", cell_set, OPP_ITERATE_ALL, + opp_arg_dat(c_ef, OPP_READ), + opp_arg_gbl(&max_c_ef, 1, "double", OPP_MAX)); opp_par_loop(get_final_max_values_kernel, "get_final_max_values", node_set, OPP_ITERATE_ALL, opp_arg_dat(n_charge_den, OPP_READ), opp_arg_gbl(&max_n_chg_den, 1, "double", OPP_MAX), opp_arg_dat(n_potential, OPP_READ), opp_arg_gbl(&max_n_pot, 1, "double", OPP_MAX)); - log = get_global_level_log(max_n_chg_den, max_n_pot, particle_set->size, - inject_count, (old_nparts - particle_set->size)); + log = get_global_level_log(max_c_ef, max_n_pot, particle_set->size, inject_count, + (old_nparts - particle_set->size)); } total_part_iter += particle_set->size; diff --git a/app_fempic_cg/fempic_hdf5.cpp b/app_fempic_cg/fempic_hdf5.cpp index 6146f28..b8128d6 100644 --- a/app_fempic_cg/fempic_hdf5.cpp +++ b/app_fempic_cg/fempic_hdf5.cpp @@ -206,16 +206,19 @@ int main(int argc, char **argv) if (print_final_log) { - OPP_REAL max_n_chg_den = 0.0, max_n_pot = 0.0; + OPP_REAL max_n_chg_den = 0.0, max_n_pot = 0.0, max_c_ef = 0.0; + opp_par_loop(get_max_cef_kernel, "get_max_cef", cell_set, OPP_ITERATE_ALL, + opp_arg_dat(c_ef, OPP_READ), + opp_arg_gbl(&max_c_ef, 1, "double", OPP_MAX)); opp_par_loop(get_final_max_values_kernel, "get_final_max_values", node_set, OPP_ITERATE_ALL, opp_arg_dat(n_charge_den, OPP_READ), opp_arg_gbl(&max_n_chg_den, 1, "double", OPP_MAX), opp_arg_dat(n_potential, OPP_READ), opp_arg_gbl(&max_n_pot, 1, "double", OPP_MAX)); - log = get_global_level_log(max_n_chg_den, max_n_pot, particle_set->size, - inject_count, (old_nparts - particle_set->size)); + log = get_global_level_log(max_c_ef, max_n_pot, particle_set->size, inject_count, + (old_nparts - particle_set->size)); } 
total_part_iter += particle_set->size; diff --git a/app_fempic_cg/fempic_hdf5_opp.cpp b/app_fempic_cg/fempic_hdf5_opp.cpp index 21c2902..f0cfb7d 100644 --- a/app_fempic_cg/fempic_hdf5_opp.cpp +++ b/app_fempic_cg/fempic_hdf5_opp.cpp @@ -1,5 +1,5 @@ -// Auto-generated at 2024-05-20 11:43:24.481974 by opp-translator +// Auto-generated at 2024-05-27 11:54:26.023476 by opp-translator /* BSD 3-Clause License @@ -44,6 +44,7 @@ void opp_particle_move__move_kernel(opp_set,opp_map,opp_map,opp_arg,opp_arg,opp_ void opp_par_loop_all__deposit_charge_on_nodes_kernel(opp_set,opp_iterate_type,opp_arg,opp_arg,opp_arg,opp_arg,opp_arg); void opp_par_loop_all__compute_node_charge_density_kernel(opp_set,opp_iterate_type,opp_arg,opp_arg); void opp_par_loop_all__compute_electric_field_kernel(opp_set,opp_iterate_type,opp_arg,opp_arg,opp_arg,opp_arg,opp_arg,opp_arg); +void opp_par_loop_all__get_max_cef_kernel(opp_set,opp_iterate_type,opp_arg,opp_arg); void opp_par_loop_all__get_final_max_values_kernel(opp_set,opp_iterate_type,opp_arg,opp_arg,opp_arg,opp_arg); void opp_init_direct_hop_cg(double,int,const opp_dat,const opp::BoundingBox&,opp_map,opp_map,opp_arg,opp_arg,opp_arg,opp_arg); @@ -214,16 +215,19 @@ int main(int argc, char **argv) if (print_final_log) { - OPP_REAL max_n_chg_den = 0.0, max_n_pot = 0.0; + OPP_REAL max_n_chg_den = 0.0, max_n_pot = 0.0, max_c_ef = 0.0; + opp_par_loop_all__get_max_cef_kernel(cell_set, OPP_ITERATE_ALL, + opp_arg_dat(c_ef, OPP_READ), + opp_arg_gbl(&max_c_ef, 1, "double", OPP_MAX)); opp_par_loop_all__get_final_max_values_kernel(node_set, OPP_ITERATE_ALL, opp_arg_dat(n_charge_den, OPP_READ), opp_arg_gbl(&max_n_chg_den, 1, "double", OPP_MAX), opp_arg_dat(n_potential, OPP_READ), opp_arg_gbl(&max_n_pot, 1, "double", OPP_MAX)); - log = get_global_level_log(max_n_chg_den, max_n_pot, particle_set->size, - inject_count, (old_nparts - particle_set->size)); + log = get_global_level_log(max_c_ef, max_n_pot, particle_set->size, inject_count, + (old_nparts - particle_set->size)); } total_part_iter += particle_set->size; diff --git a/app_fempic_cg/fempic_misc.h b/app_fempic_cg/fempic_misc.h index cd00ccd..64b48bf 100644 --- a/app_fempic_cg/fempic_misc.h +++ b/app_fempic_cg/fempic_misc.h @@ -39,8 +39,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
* This function will enrich the particle distribution values into if_dist_dat * In addition, dummy_random dat will get enriched with random values in "rand_file" */ -inline int init_inject_distributions(opp_dat if_dist_dat, opp_dat if_area_dat, - opp_dat dummy_random) +inline int init_inject_distributions(opp_dat if_dist_dat, opp_dat if_area_dat, opp_dat dummy_random) { if (OPP_DBG) opp_printf("init_inject_distributions", "START"); @@ -52,8 +51,9 @@ inline int init_inject_distributions(opp_dat if_dist_dat, opp_dat if_area_dat, int total_inject_count = 0, max_inject_count_per_face = 0; double remainder = 0.0; - opp_printf("init_inject_distributions", "plasma_den=%2.15lE dt=%2.25lE ion_velocity=%2.15lE spwt=%2.15lE", - plasma_den, dt, ion_velocity, spwt); + OPP_RUN_ON_ROOT() + opp_printf("init_inject_distributions", "plasma_den=%2.15lE dt=%2.25lE ion_vel=%2.15lE spwt=%2.15lE", + plasma_den, dt, ion_velocity, spwt); // find the number of particles to be injected through each inlet face and // get the max injected particle count per face @@ -65,7 +65,7 @@ inline int init_inject_distributions(opp_dat if_dist_dat, opp_dat if_area_dat, double num_real = num_per_sec * dt; double fnum_mp = num_real / spwt + remainder; int num_mp = (int)fnum_mp; - remainder = fnum_mp - num_mp; + // remainder = fnum_mp - num_mp; total_inject_count += num_mp; @@ -195,14 +195,14 @@ inline void print_per_cell_particle_counts(opp_dat c_part_count, opp_dat part_me /************************************************************************************************* * This is a utility function create a global level log string using local values - * @param max_n_charge_density - this is already reduced glocally by the opp_par_loop + * @param max_c_ef - this is already reduced globally by the opp_par_loop * @param max_n_potential - this is already reduced glocally by the opp_par_loop * @param local_part_count - these are local values * @param local_parts_injected - these are local values * @param local_part_removed - these are local values * @return std::string */ -inline std::string get_global_level_log(double max_n_charge_density, double max_n_potential, +inline std::string get_global_level_log(double max_c_ef, double max_n_potential, int local_part_count, int local_parts_injected, int local_part_removed) { std::string log = ""; @@ -229,8 +229,8 @@ inline std::string get_global_level_log(double max_n_charge_density, double max_ log += std::string("\t np: ") + str(global_part_size, "%" PRId64); log += std::string(" (") + str(global_inj_size, "%" PRId64); log += std::string(" added, ") + str(global_removed, "%" PRId64); - log += std::string(" removed)\t max den: ") + str(max_n_charge_density, "%2.25lE"); - log += std::string(" max |phi|: ") + str(max_n_potential, "%2.10lE"); + log += std::string(" removed)\t max c_ef: ") + str(max_c_ef, "%2.15lE"); + log += std::string(" max n_pot: ") + str(max_n_potential, "%2.15lE"); log += std::string(" max_comm_iteration: ") + str(global_max_comm_iteration, "%d"); return log; } diff --git a/app_fempic_cg/fempic_opp.cpp b/app_fempic_cg/fempic_opp.cpp index d255c9c..d0d7478 100644 --- a/app_fempic_cg/fempic_opp.cpp +++ b/app_fempic_cg/fempic_opp.cpp @@ -1,5 +1,5 @@ -// Auto-generated at 2024-05-20 11:43:18.395285 by opp-translator +// Auto-generated at 2024-05-27 11:54:32.808673 by opp-translator /* BSD 3-Clause License @@ -44,6 +44,7 @@ void opp_particle_move__move_kernel(opp_set,opp_map,opp_map,opp_arg,opp_arg,opp_ void 
opp_par_loop_all__deposit_charge_on_nodes_kernel(opp_set,opp_iterate_type,opp_arg,opp_arg,opp_arg,opp_arg,opp_arg); void opp_par_loop_all__compute_node_charge_density_kernel(opp_set,opp_iterate_type,opp_arg,opp_arg); void opp_par_loop_all__compute_electric_field_kernel(opp_set,opp_iterate_type,opp_arg,opp_arg,opp_arg,opp_arg,opp_arg,opp_arg); +void opp_par_loop_all__get_max_cef_kernel(opp_set,opp_iterate_type,opp_arg,opp_arg); void opp_par_loop_all__get_final_max_values_kernel(opp_set,opp_iterate_type,opp_arg,opp_arg,opp_arg,opp_arg); void opp_init_direct_hop_cg(double,int,const opp_dat,const opp::BoundingBox&,opp_map,opp_map,opp_arg,opp_arg,opp_arg,opp_arg); @@ -217,16 +218,19 @@ int main(int argc, char **argv) if (print_final_log) { - OPP_REAL max_n_chg_den = 0.0, max_n_pot = 0.0; + OPP_REAL max_n_chg_den = 0.0, max_n_pot = 0.0, max_c_ef = 0.0; + opp_par_loop_all__get_max_cef_kernel(cell_set, OPP_ITERATE_ALL, + opp_arg_dat(c_ef, OPP_READ), + opp_arg_gbl(&max_c_ef, 1, "double", OPP_MAX)); opp_par_loop_all__get_final_max_values_kernel(node_set, OPP_ITERATE_ALL, opp_arg_dat(n_charge_den, OPP_READ), opp_arg_gbl(&max_n_chg_den, 1, "double", OPP_MAX), opp_arg_dat(n_potential, OPP_READ), opp_arg_gbl(&max_n_pot, 1, "double", OPP_MAX)); - log = get_global_level_log(max_n_chg_den, max_n_pot, particle_set->size, - inject_count, (old_nparts - particle_set->size)); + log = get_global_level_log(max_c_ef, max_n_pot, particle_set->size, inject_count, + (old_nparts - particle_set->size)); } total_part_iter += particle_set->size; diff --git a/app_fempic_cg/hip/get_final_max_values_kernel_loop.hpp b/app_fempic_cg/hip/get_final_max_values_kernel_loop.hpp index a6a0aad..4f70361 100644 --- a/app_fempic_cg/hip/get_final_max_values_kernel_loop.hpp +++ b/app_fempic_cg/hip/get_final_max_values_kernel_loop.hpp @@ -3,15 +3,15 @@ // AUTO GENERATED CODE //********************************************* -OPP_INT opp_k8_dat0_stride = -1; -OPP_INT opp_k8_dat1_stride = -1; +OPP_INT opp_k9_dat0_stride = -1; +OPP_INT opp_k9_dat1_stride = -1; -__constant__ OPP_INT opp_k8_dat0_stride_d; -__constant__ OPP_INT opp_k8_dat1_stride_d; +__constant__ OPP_INT opp_k9_dat0_stride_d; +__constant__ OPP_INT opp_k9_dat1_stride_d; -namespace opp_k8 { +namespace opp_k9 { __device__ inline void get_final_max_values_kernel( const double* n_charge_den, double* max_n_charge_den, @@ -47,7 +47,7 @@ __global__ void opp_dev_get_final_max_values_kernel( for (int n = thread_id; n < (end - start); n += blockDim.x * gridDim.x) { - opp_k8::get_final_max_values_kernel( + opp_k9::get_final_max_values_kernel( dat0 + n, // n_charge_den gbl1_local, // dat1 + n, // n_potential @@ -89,17 +89,17 @@ void opp_par_loop_all__get_final_max_values_kernel(opp_set set, opp_iterate_type OPP_REAL *arg1_host_data = (OPP_REAL *)args[1].data; OPP_REAL *arg3_host_data = (OPP_REAL *)args[3].data; - if (opp_k8_dat0_stride != args[0].dat->set->set_capacity) { - opp_k8_dat0_stride = args[0].dat->set->set_capacity; - cutilSafeCall(hipMemcpyToSymbol(HIP_SYMBOL(opp_k8_dat0_stride_d), &opp_k8_dat0_stride, sizeof(OPP_INT))); + if (opp_k9_dat0_stride != args[0].dat->set->set_capacity) { + opp_k9_dat0_stride = args[0].dat->set->set_capacity; + cutilSafeCall(hipMemcpyToSymbol(HIP_SYMBOL(opp_k9_dat0_stride_d), &opp_k9_dat0_stride, sizeof(OPP_INT))); } - if (opp_k8_dat1_stride != args[2].dat->set->set_capacity) { - opp_k8_dat1_stride = args[2].dat->set->set_capacity; - cutilSafeCall(hipMemcpyToSymbol(HIP_SYMBOL(opp_k8_dat1_stride_d), &opp_k8_dat1_stride, sizeof(OPP_INT))); + if 
(opp_k9_dat1_stride != args[2].dat->set->set_capacity) { + opp_k9_dat1_stride = args[2].dat->set->set_capacity; + cutilSafeCall(hipMemcpyToSymbol(HIP_SYMBOL(opp_k9_dat1_stride_d), &opp_k9_dat1_stride, sizeof(OPP_INT))); } -#ifdef OPP_BLOCK_SIZE_8 - const int block_size = OPP_BLOCK_SIZE_8; +#ifdef OPP_BLOCK_SIZE_9 + const int block_size = OPP_BLOCK_SIZE_9; #else const int block_size = OPP_gpu_threads_per_block; #endif diff --git a/app_fempic_cg/hip/get_max_cef_kernel_loop.hpp b/app_fempic_cg/hip/get_max_cef_kernel_loop.hpp new file mode 100644 index 0000000..624aa49 --- /dev/null +++ b/app_fempic_cg/hip/get_max_cef_kernel_loop.hpp @@ -0,0 +1,139 @@ + +//********************************************* +// AUTO GENERATED CODE +//********************************************* + +OPP_INT opp_k8_dat0_stride = -1; + +__constant__ OPP_INT opp_k8_dat0_stride_d; + + + +namespace opp_k8 { +__device__ inline void get_max_cef_kernel( + const double* val, + double* max_val) +{ + for (int dim = 0; dim < 3; ++dim) + { + *max_val = ((val[(dim) * opp_k8_dat0_stride_d] > *max_val) ? (val[(dim) * opp_k8_dat0_stride_d]) : (*max_val)); + } +} + +} + +//-------------------------------------------------------------- +__global__ void opp_dev_get_max_cef_kernel( + const OPP_REAL *__restrict__ dat0, // c_ef + OPP_REAL *gbl1, + const OPP_INT start, + const OPP_INT end +) +{ + OPP_REAL gbl1_local[1]; + for (int d = 0; d < 1; ++d) + gbl1_local[d] = gbl1[blockIdx.x * 1 + d]; + + const int thread_id = threadIdx.x + blockIdx.x * blockDim.x; + + for (int n = thread_id; n < (end - start); n += blockDim.x * gridDim.x) { + + opp_k8::get_max_cef_kernel( + dat0 + n, // c_ef + gbl1_local // + ); + } + + for (int d = 0; d < 1; ++d) + opp_reduction(gbl1 + blockIdx.x * 1 + d, gbl1_local[d]); + +} + +//-------------------------------------------------------------- +void opp_par_loop_all__get_max_cef_kernel(opp_set set, opp_iterate_type, + opp_arg arg0, // c_ef | OPP_READ + opp_arg arg1 // | OPP_MAX +) +{ + const int nargs = 2; + opp_arg args[nargs]; + + args[0] = arg0; + args[1] = arg1; + + opp_profiler->start("get_max_cef_kernel"); + + if (OPP_DBG) opp_printf("APP", "opp_par_loop_all__get_max_cef_kernel set_size %d", set->size); + + const int iter_size = opp_mpi_halo_exchanges_grouped(set, nargs, args, Device_GPU); + + + OPP_REAL *arg1_host_data = (OPP_REAL *)args[1].data; + + if (opp_k8_dat0_stride != args[0].dat->set->set_capacity) { + opp_k8_dat0_stride = args[0].dat->set->set_capacity; + cutilSafeCall(hipMemcpyToSymbol(HIP_SYMBOL(opp_k8_dat0_stride_d), &opp_k8_dat0_stride, sizeof(OPP_INT))); + } + +#ifdef OPP_BLOCK_SIZE_8 + const int block_size = OPP_BLOCK_SIZE_8; +#else + const int block_size = OPP_gpu_threads_per_block; +#endif + + opp_mpi_halo_wait_all(nargs, args); + + int num_blocks = 200; + + int max_blocks = num_blocks; + + int reduction_bytes = 0; + int reduction_size = 0; + + reduction_bytes += ROUND_UP(max_blocks * 1 * sizeof(OPP_REAL)); + reduction_size = MAX(reduction_size, sizeof(OPP_REAL)); + + opp_reallocReductArrays(reduction_bytes); + reduction_bytes = 0; + + args[1].data = OPP_reduct_h + reduction_bytes; + args[1].data_d = OPP_reduct_d + reduction_bytes; + + for (int b = 0; b < max_blocks; ++b) { + for (int d = 0; d < 1; ++d) + ((OPP_REAL *)args[1].data)[b * 1 + d] = arg1_host_data[d]; + } + + reduction_bytes += ROUND_UP(max_blocks * 1 * sizeof(OPP_REAL)); + + opp_mvReductArraysToDevice(reduction_bytes); + + if (iter_size > 0) + { + const OPP_INT start = 0; + const OPP_INT end = iter_size; + { + 
opp_dev_get_max_cef_kernel<<>>( + (OPP_REAL *)args[0].data_d, // c_ef + (OPP_REAL *)args[1].data_d, + start, + end + ); + } + } + + opp_mvReductArraysToHost(reduction_bytes); + + for (int b = 0; b < max_blocks; ++b) { + for (int d = 0; d < 1; ++d) + arg1_host_data[d] = MAX(arg1_host_data[d], ((OPP_REAL *)args[1].data)[b * 1 + d]); + } + + args[1].data = (char *)arg1_host_data; + opp_mpi_reduce(&args[1], arg1_host_data); + + opp_set_dirtybit_grouped(nargs, args, Device_GPU); + cutilSafeCall(hipDeviceSynchronize()); + + opp_profiler->end("get_max_cef_kernel"); +} diff --git a/app_fempic_cg/hip/inject_ions_kernel_loop.hpp b/app_fempic_cg/hip/inject_ions_kernel_loop.hpp index 2a5cfa6..b518aed 100644 --- a/app_fempic_cg/hip/inject_ions_kernel_loop.hpp +++ b/app_fempic_cg/hip/inject_ions_kernel_loop.hpp @@ -45,7 +45,7 @@ __device__ inline void inject_ions_kernel( { double a = dummy_part_random[(0) * opp_k2_dat9_stride_d]; double b = dummy_part_random[(1) * opp_k2_dat9_stride_d]; - if ((a + b) > 1) + if ((a + b) > 1) // TODO : Change the random dat to avoid this { a = (1 - a); b = (1 - b); diff --git a/app_fempic_cg/hip/opp_kernels.cpp b/app_fempic_cg/hip/opp_kernels.cpp index b231bf7..f7dfd07 100644 --- a/app_fempic_cg/hip/opp_kernels.cpp +++ b/app_fempic_cg/hip/opp_kernels.cpp @@ -99,5 +99,7 @@ void opp_decl_const_impl(int dim, int size, char* data, const char* name) { #include "compute_electric_field_kernel_loop.hpp" +#include "get_max_cef_kernel_loop.hpp" + #include "get_final_max_values_kernel_loop.hpp" diff --git a/app_fempic_cg/kernels.h b/app_fempic_cg/kernels.h index 1a5efcd..3d17bf4 100644 --- a/app_fempic_cg/kernels.h +++ b/app_fempic_cg/kernels.h @@ -35,8 +35,6 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. // USER WRITTEN CODE //********************************************* -// #define OPP_KERNEL_LOOPP_UNROLL - #include "opp_lib.h" //**************************************** @@ -46,34 +44,6 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#define KERNEL_NEIGHB_C 4 #define KERNEL_DIM 3 -//************************************************************************************************* -inline void calculate_injection_distribution( - int* injected_total, - double* face_area, - int* particle_distribution, - double* remainder -) -{ - /*number of real ions per sec, given prescribed density and velocity*/ - double num_per_sec = CONST_plasma_den[0] * CONST_ion_velocity[0] * (*face_area); - - /*number of ions to generate in this time step*/ - double num_real = num_per_sec * CONST_dt[0]; - - /*fraction number of macroparticles*/ - double fnum_mp = num_real / CONST_spwt[0] + (*remainder); - - /*integer number of macroparticles*/ - int num_mp = (int)fnum_mp; - - /*update reminder*/ - (*remainder) = fnum_mp - num_mp; - - (*injected_total) += num_mp; - - (*particle_distribution) = (*injected_total); -} - //************************************************************************************************* inline void init_boundary_pot_kernel( const int *node_type, @@ -111,7 +81,7 @@ inline void inject_ions_kernel( { double a = dummy_part_random[0]; double b = dummy_part_random[1]; - if ((a + b) > 1) + if ((a + b) > 1) // TODO : Change the random dat to avoid this { a = (1 - a); b = (1 - b); @@ -142,161 +112,6 @@ inline void calculate_new_pos_vel_kernel( } //************************************************************************************************* -inline void deposit_charge_on_nodes_kernel( - const double *part_lc, - double *node_charge_den0, - double *node_charge_den1, - double *node_charge_den2, - double *node_charge_den3) { - - node_charge_den0[0] += part_lc[0]; - node_charge_den1[0] += part_lc[1]; - node_charge_den2[0] += part_lc[2]; - node_charge_den3[0] += part_lc[3]; -} - -//************************************************************************************************* -inline void move_all_particles_to_cell_kernel( - const double *cell_ef, - double *part_pos, - double *part_vel, - double *part_lc, - int* current_cell_index, - const double *current_cell_volume, - const double *current_cell_det, - const int *cell_connectivity, - double *node_charge_den0, - double *node_charge_den1, - double *node_charge_den2, - double *node_charge_den3 -) -{ - if (OPP_DO_ONCE) - { - const double coefficient1 = CONST_charge[0] / CONST_mass[0] * (CONST_dt[0]); - for (int i = 0; i < KERNEL_DIM; i++) - part_vel[i] += (coefficient1 * cell_ef[i]); - - for (int i = 0; i < KERNEL_DIM; i++) - part_pos[i] += part_vel[i] * (CONST_dt[0]); // v = u + at - } - - bool inside = true; - const double coefficient2 = KERNEL_ONE_OVER_SIX / (*current_cell_volume); - for (int i=0; i 1.0) - inside = false; - // m.inside_cell = false; - } - - if (inside) - { - OPP_PARTICLE_MOVE_DONE; - - (*node_charge_den0) += part_lc[0]; - (*node_charge_den1) += part_lc[1]; - (*node_charge_den2) += part_lc[2]; - (*node_charge_den3) += part_lc[3]; - - return; - } - - // outside the last known cell, find most negative weight and - // use that cell_index to reduce computations - int min_i = 0; - double min_lc = part_lc[0]; - - for (int i=1; i= 0) // is there a neighbor in this direction? 
- { - (*current_cell_index) = cell_connectivity[min_i]; - OPP_PARTICLE_NEED_MOVE; - } - else - { - OPP_PARTICLE_NEED_REMOVE; - } -} - -//************************************************************************************************* -inline void compute_node_charge_density_kernel( - double *node_charge_den, - const double *node_volume -) -{ - (*node_charge_den) *= (CONST_spwt[0] / node_volume[0]); -} - -//************************************************************************************************* -inline void compute_electric_field_kernel( - double *cell_electric_field, - const double *cell_shape_deriv, - const double *node_potential0, - const double *node_potential1, - const double *node_potential2, - const double *node_potential3 -) -{ - for (int dim = 0; dim < KERNEL_DIM; dim++) - { - const double c1 = (cell_shape_deriv[0 * KERNEL_DIM + dim] * (*node_potential0)); - const double c2 = (cell_shape_deriv[1 * KERNEL_DIM + dim] * (*node_potential1)); - const double c3 = (cell_shape_deriv[2 * KERNEL_DIM + dim] * (*node_potential2)); - const double c4 = (cell_shape_deriv[3 * KERNEL_DIM + dim] * (*node_potential3)); - - cell_electric_field[dim] -= (c1 + c2 + c3 + c4); - } -} - -//************************************************************************************************* - -// #define OPP_LOOPP_UNROLL -//******************************************************************************* -inline void isPointInCellKernel(bool& inside, const double *point_pos, double* point_lc, - const double *cell_volume, const double *cell_det) { - - inside = true; - const double coefficient2 = KERNEL_ONE_OVER_SIX / (*cell_volume); - - for (int i=0; i 1.0 || - point_lc[1] < 0.0 || - point_lc[1] > 1.0 || - point_lc[2] < 0.0 || - point_lc[2] > 1.0 || - point_lc[3] < 0.0 || - point_lc[3] > 1.0) { - - inside = false; - } -} - -//******************************************************************************* inline void move_kernel( const double *point_pos, double* point_lc, const double *cell_volume, const double *cell_det @@ -348,7 +163,51 @@ inline void move_kernel( } } -//******************************************************************************* +//************************************************************************************************* +inline void deposit_charge_on_nodes_kernel( + const double *part_lc, + double *node_charge_den0, + double *node_charge_den1, + double *node_charge_den2, + double *node_charge_den3) { + + node_charge_den0[0] += part_lc[0]; + node_charge_den1[0] += part_lc[1]; + node_charge_den2[0] += part_lc[2]; + node_charge_den3[0] += part_lc[3]; +} + +//************************************************************************************************* +inline void compute_node_charge_density_kernel( + double *node_charge_den, + const double *node_volume +) +{ + (*node_charge_den) *= (CONST_spwt[0] / node_volume[0]); +} + +//************************************************************************************************* +inline void compute_electric_field_kernel( + double *cell_electric_field, + const double *cell_shape_deriv, + const double *node_potential0, + const double *node_potential1, + const double *node_potential2, + const double *node_potential3 +) +{ + for (int dim = 0; dim < KERNEL_DIM; dim++) + { + const double c1 = (cell_shape_deriv[0 * KERNEL_DIM + dim] * (*node_potential0)); + const double c2 = (cell_shape_deriv[1 * KERNEL_DIM + dim] * (*node_potential1)); + const double c3 = (cell_shape_deriv[2 * KERNEL_DIM + dim] * (*node_potential2)); + const double c4 = 
(cell_shape_deriv[3 * KERNEL_DIM + dim] * (*node_potential3)); + + cell_electric_field[dim] -= (c1 + c2 + c3 + c4); + } +} + +//************************************************************************************************* inline void get_final_max_values_kernel( const OPP_REAL* n_charge_den, OPP_REAL* max_n_charge_den, @@ -358,4 +217,15 @@ inline void get_final_max_values_kernel( *max_n_charge_den = MAX(abs(*n_charge_den), *max_n_charge_den); *max_n_pot = MAX(*n_pot, *max_n_pot); -} \ No newline at end of file +} + +//******************************************************************************* +inline void get_max_cef_kernel( + const OPP_REAL* val, + OPP_REAL* max_val) +{ + for (int dim = 0; dim < KERNEL_DIM; ++dim) + { + *max_val = MAX(val[dim], *max_val); + } +} diff --git a/app_fempic_cg/mpi/get_final_max_values_kernel_loop.hpp b/app_fempic_cg/mpi/get_final_max_values_kernel_loop.hpp index c471cd9..31917cd 100644 --- a/app_fempic_cg/mpi/get_final_max_values_kernel_loop.hpp +++ b/app_fempic_cg/mpi/get_final_max_values_kernel_loop.hpp @@ -3,7 +3,7 @@ // AUTO GENERATED CODE //********************************************* -namespace opp_k8 { +namespace opp_k9 { inline void get_final_max_values_kernel( const double* n_charge_den, double* max_n_charge_den, @@ -42,7 +42,7 @@ void opp_par_loop_all__get_final_max_values_kernel(opp_set set, opp_iterate_type if (n == set->core_size) opp_mpi_halo_wait_all(nargs, args); - opp_k8::get_final_max_values_kernel( + opp_k9::get_final_max_values_kernel( (const OPP_REAL *)args[0].data + (n * 1), (OPP_REAL *)args[1].data, (const OPP_REAL *)args[2].data + (n * 1), diff --git a/app_fempic_cg/mpi/get_max_cef_kernel_loop.hpp b/app_fempic_cg/mpi/get_max_cef_kernel_loop.hpp new file mode 100644 index 0000000..728b959 --- /dev/null +++ b/app_fempic_cg/mpi/get_max_cef_kernel_loop.hpp @@ -0,0 +1,52 @@ + +//********************************************* +// AUTO GENERATED CODE +//********************************************* + +namespace opp_k8 { +inline void get_max_cef_kernel( + const double* val, + double* max_val) +{ + for (int dim = 0; dim < 3; ++dim) + { + *max_val = ((val[dim] > *max_val) ? 
(val[dim]) : (*max_val)); + } +} +} + +void opp_par_loop_all__get_max_cef_kernel(opp_set set, opp_iterate_type, + opp_arg arg0, // c_ef | OPP_READ + opp_arg arg1 // | OPP_MAX +) +{ + const int nargs = 2; + opp_arg args[nargs]; + + args[0] = arg0; + args[1] = arg1; + + opp_profiler->start("get_max_cef_kernel"); + + if (OPP_DBG) opp_printf("APP", "opp_par_loop_all__get_max_cef_kernel set_size %d", set->size); + + const int iter_size = opp_mpi_halo_exchanges(set, nargs, args); + + for (int n = 0; n < iter_size; ++n) + { + if (n == set->core_size) + opp_mpi_halo_wait_all(nargs, args); + + opp_k8::get_max_cef_kernel( + (const OPP_REAL *)args[0].data + (n * 3), + (OPP_REAL *)args[1].data + ); + } + if (iter_size == 0 || iter_size == set->core_size) + opp_mpi_halo_wait_all(nargs, args); + opp_mpi_reduce(&args[1], (OPP_REAL *)args[1].data); + + opp_set_dirtybit(nargs, args); + + opp_profiler->end("get_max_cef_kernel"); +} diff --git a/app_fempic_cg/mpi/inject_ions_kernel_loop.hpp b/app_fempic_cg/mpi/inject_ions_kernel_loop.hpp index b8c9ada..a40d6cf 100644 --- a/app_fempic_cg/mpi/inject_ions_kernel_loop.hpp +++ b/app_fempic_cg/mpi/inject_ions_kernel_loop.hpp @@ -19,7 +19,7 @@ inline void inject_ions_kernel( { double a = dummy_part_random[0]; double b = dummy_part_random[1]; - if ((a + b) > 1) + if ((a + b) > 1) // TODO : Change the random dat to avoid this { a = (1 - a); b = (1 - b); diff --git a/app_fempic_cg/mpi/move_kernel_loop.hpp b/app_fempic_cg/mpi/move_kernel_loop.hpp index ab88389..402470d 100644 --- a/app_fempic_cg/mpi/move_kernel_loop.hpp +++ b/app_fempic_cg/mpi/move_kernel_loop.hpp @@ -108,7 +108,7 @@ void opp_particle_move__move_kernel(opp_set set, opp_map c2c_map, opp_map p2c_ma }; // ---------------------------------------------------------------------------- - opp_init_particle_move(set, 0, nullptr); + opp_init_particle_move(set, nargs, args); if (useGlobalMove) { @@ -187,7 +187,7 @@ void opp_particle_move__move_kernel(opp_set set, opp_map c2c_map, opp_map p2c_ma const std::string profName = std::string("Mv_AllMv") + std::to_string(OPP_comm_iteration); opp_profiler->start(profName); - opp_init_particle_move(set, 0, nullptr); + opp_init_particle_move(set, nargs, args); // check whether particle is within cell, and if not move between cells within the MPI rank, mark for neighbour comm opp_profiler->start("move_kernel_only"); @@ -349,8 +349,8 @@ inline void gen_dh_structured_mesh(opp_set set, const opp_dat c_gbl_id, opp_map cellMapper->lockWindows(); int avail_gbl_cid = cellMapper->structMeshToCellMapping[index]; if ((most_suitable_gbl_cid != MAX_CELL_INDEX) && (most_suitable_gbl_cid < avail_gbl_cid) && - (most_suitable_cid < set->size)) { - cellMapper->enrichStructuredMesh(index, most_suitable_gbl_cid, OPP_rank); + (most_suitable_cid < set->size)) { + cellMapper->enrichStructuredMesh(index, most_suitable_gbl_cid, OPP_rank); } cellMapper->unlockWindows(); } diff --git a/app_fempic_cg/mpi/opp_kernels.cpp b/app_fempic_cg/mpi/opp_kernels.cpp index fc21276..20f4ba5 100644 --- a/app_fempic_cg/mpi/opp_kernels.cpp +++ b/app_fempic_cg/mpi/opp_kernels.cpp @@ -93,5 +93,7 @@ void opp_decl_const_impl(int dim, int size, char* data, const char* name) { #include "compute_electric_field_kernel_loop.hpp" +#include "get_max_cef_kernel_loop.hpp" + #include "get_final_max_values_kernel_loop.hpp" diff --git a/app_fempic_cg/omp/calculate_new_pos_vel_kernel_loop.hpp b/app_fempic_cg/omp/calculate_new_pos_vel_kernel_loop.hpp index 2967a76..3018f9b 100644 --- 
a/app_fempic_cg/omp/calculate_new_pos_vel_kernel_loop.hpp +++ b/app_fempic_cg/omp/calculate_new_pos_vel_kernel_loop.hpp @@ -47,8 +47,8 @@ void opp_par_loop_all__calculate_new_pos_vel_kernel(opp_set set, opp_iterate_typ #pragma omp parallel for for (int thr = 0; thr < nthreads; thr++) { - const size_t start = (iter_size * thr) / nthreads; - const size_t finish = (iter_size * (thr+1)) / nthreads; + const size_t start = ((size_t)iter_size * thr) / nthreads; + const size_t finish = ((size_t)iter_size * (thr+1)) / nthreads; for (size_t n = start; n < finish; n++) { diff --git a/app_fempic_cg/omp/compute_electric_field_kernel_loop.hpp b/app_fempic_cg/omp/compute_electric_field_kernel_loop.hpp index 836e3cb..a4f13a4 100644 --- a/app_fempic_cg/omp/compute_electric_field_kernel_loop.hpp +++ b/app_fempic_cg/omp/compute_electric_field_kernel_loop.hpp @@ -60,8 +60,8 @@ void opp_par_loop_all__compute_electric_field_kernel(opp_set set, opp_iterate_ty #pragma omp parallel for for (int thr = 0; thr < nthreads; thr++) { - const size_t start = (iter_size * thr) / nthreads; - const size_t finish = (iter_size * (thr+1)) / nthreads; + const size_t start = ((size_t)iter_size * thr) / nthreads; + const size_t finish = ((size_t)iter_size * (thr+1)) / nthreads; for (size_t n = start; n < finish; n++) { diff --git a/app_fempic_cg/omp/deposit_charge_on_nodes_kernel_loop.hpp b/app_fempic_cg/omp/deposit_charge_on_nodes_kernel_loop.hpp index 9d61af4..297340e 100644 --- a/app_fempic_cg/omp/deposit_charge_on_nodes_kernel_loop.hpp +++ b/app_fempic_cg/omp/deposit_charge_on_nodes_kernel_loop.hpp @@ -57,8 +57,8 @@ void opp_par_loop_all__deposit_charge_on_nodes_kernel(opp_set set, opp_iterate_t #pragma omp parallel for for (int thr = 0; thr < nthreads; thr++) { - const size_t start = (iter_size * thr) / nthreads; - const size_t finish = (iter_size * (thr+1)) / nthreads; + const size_t start = ((size_t)iter_size * thr) / nthreads; + const size_t finish = ((size_t)iter_size * (thr+1)) / nthreads; OPP_REAL* arg1_dat_thread_data = (OPP_REAL *)((*(args[1].dat->thread_data))[thr]); diff --git a/app_fempic_cg/omp/get_final_max_values_kernel_loop.hpp b/app_fempic_cg/omp/get_final_max_values_kernel_loop.hpp index d19e49d..986b6fd 100644 --- a/app_fempic_cg/omp/get_final_max_values_kernel_loop.hpp +++ b/app_fempic_cg/omp/get_final_max_values_kernel_loop.hpp @@ -3,7 +3,7 @@ // AUTO GENERATED CODE //********************************************* -namespace opp_k8 { +namespace opp_k9 { inline void get_final_max_values_kernel( const double* n_charge_den, double* max_n_charge_den, @@ -59,12 +59,12 @@ void opp_par_loop_all__get_final_max_values_kernel(opp_set set, opp_iterate_type #pragma omp parallel for for (int thr = 0; thr < nthreads; thr++) { - const size_t start = (iter_size * thr) / nthreads; - const size_t finish = (iter_size * (thr+1)) / nthreads; + const size_t start = ((size_t)iter_size * thr) / nthreads; + const size_t finish = ((size_t)iter_size * (thr+1)) / nthreads; for (size_t n = start; n < finish; n++) { - opp_k8::get_final_max_values_kernel( + opp_k9::get_final_max_values_kernel( (const OPP_REAL *)args[0].data + (n * 1), arg1_l + (1 * thr), (const OPP_REAL *)args[2].data + (n * 1), diff --git a/app_fempic_cg/omp/get_max_cef_kernel_loop.hpp b/app_fempic_cg/omp/get_max_cef_kernel_loop.hpp new file mode 100644 index 0000000..6ba3eef --- /dev/null +++ b/app_fempic_cg/omp/get_max_cef_kernel_loop.hpp @@ -0,0 +1,73 @@ + +//********************************************* +// AUTO GENERATED CODE 
+//********************************************* + +namespace opp_k8 { +inline void get_max_cef_kernel( + const double* val, + double* max_val) +{ + for (int dim = 0; dim < 3; ++dim) + { + *max_val = ((val[dim] > *max_val) ? (val[dim]) : (*max_val)); + } +} +} + +void opp_par_loop_all__get_max_cef_kernel(opp_set set, opp_iterate_type, + opp_arg arg0, // c_ef | OPP_READ + opp_arg arg1 // | OPP_MAX +) +{ + const int nargs = 2; + opp_arg args[nargs]; + + args[0] = arg0; + args[1] = arg1; + + opp_profiler->start("get_max_cef_kernel"); + + if (OPP_DBG) opp_printf("APP", "opp_par_loop_all__get_max_cef_kernel set_size %d", set->size); + + const int nthreads = omp_get_max_threads(); + + const int iter_size = opp_mpi_halo_exchanges(set, nargs, args); + + opp_mpi_halo_wait_all(nargs, args); + + OPP_REAL arg1_l[nthreads * 1]; + for (int thr = 0; thr < nthreads; thr++) + for (int d = 0; d < 1; d++) + arg1_l[1 * thr + d] = OPP_REAL_ZERO; + + + + + + #pragma omp parallel for + for (int thr = 0; thr < nthreads; thr++) + { + const size_t start = ((size_t)iter_size * thr) / nthreads; + const size_t finish = ((size_t)iter_size * (thr+1)) / nthreads; + + for (size_t n = start; n < finish; n++) + { + opp_k8::get_max_cef_kernel( + (const OPP_REAL *)args[0].data + (n * 3), + arg1_l + (1 * thr) + ); + } + } + + for (int thr = 0; thr < nthreads; thr++) + for (int d = 0; d < 1; d++) + ((OPP_REAL*)args[1].data)[d] = MAX(((OPP_REAL *)args[1].data)[d], arg1_l[1 * thr + d]); +#ifdef USE_MPI + opp_mpi_reduce(&args[1], (OPP_REAL *)args[1].data); +#endif + + opp_set_dirtybit(nargs, args); + + opp_profiler->end("get_max_cef_kernel"); +} diff --git a/app_fempic_cg/omp/inject_ions_kernel_loop.hpp b/app_fempic_cg/omp/inject_ions_kernel_loop.hpp index 6545930..164559c 100644 --- a/app_fempic_cg/omp/inject_ions_kernel_loop.hpp +++ b/app_fempic_cg/omp/inject_ions_kernel_loop.hpp @@ -19,7 +19,7 @@ inline void inject_ions_kernel( { double a = dummy_part_random[0]; double b = dummy_part_random[1]; - if ((a + b) > 1) + if ((a + b) > 1) // TODO : Change the random dat to avoid this { a = (1 - a); b = (1 - b); @@ -84,8 +84,8 @@ void opp_par_loop_injected__inject_ions_kernel(opp_set set, opp_iterate_type, #pragma omp parallel for for (int thr = 0; thr < nthreads; thr++) { - const size_t start = (iter_size * thr) / nthreads; - const size_t finish = (iter_size * (thr+1)) / nthreads; + const size_t start = ((size_t)iter_size * thr) / nthreads; + const size_t finish = ((size_t)iter_size * (thr+1)) / nthreads; for (size_t n = start; n < finish; n++) { diff --git a/app_fempic_cg/omp/move_kernel_loop.hpp b/app_fempic_cg/omp/move_kernel_loop.hpp index 62ba8c5..67c4c5c 100644 --- a/app_fempic_cg/omp/move_kernel_loop.hpp +++ b/app_fempic_cg/omp/move_kernel_loop.hpp @@ -116,12 +116,14 @@ void opp_particle_move__move_kernel(opp_set set, opp_map c2c_map, opp_map p2c_ma }; // ---------------------------------------------------------------------------- - opp_init_particle_move(set, 0, nullptr); + opp_init_particle_move(set, nargs, args); + const int total_count = OPP_iter_end - OPP_iter_start; -#ifdef USE_MPI if (useGlobalMove) { // For now Global Move will not work with MPI - + +#ifdef USE_MPI globalMover->initGlobalMove(); +#endif opp_profiler->start("GblMv_Move"); @@ -129,8 +131,8 @@ void opp_particle_move__move_kernel(opp_set set, opp_map c2c_map, opp_map p2c_ma #pragma omp parallel for for (int thr = 0; thr < nthreads; thr++) { - const size_t start = ((size_t)size * thr) / nthreads; - const size_t finish = ((size_t)size * (thr+1)) / nthreads; + 
const size_t start = ((size_t)total_count * thr) / nthreads; + const size_t finish = ((size_t)total_count * (thr+1)) / nthreads; for (size_t i = start; i < finish; i++) { @@ -139,9 +141,9 @@ void opp_particle_move__move_kernel(opp_set set, opp_map c2c_map, opp_map p2c_ma const opp_point* point = (const opp_point*)&(((OPP_REAL*)args[0].data)[i * 3]); // check for global move, and if satisfy global move criteria, then remove the particle from current rank - if (opp_part_checkForGlobalMove3D(set, *point, i, opp_p2c[0])) { + if (opp_part_checkForGlobalMove3D(set, *point, i, opp_p2c[0], thr)) { - set->particle_remove_count++; + part_remove_count_per_thr[thr] += 1; continue; } } @@ -149,9 +151,11 @@ void opp_particle_move__move_kernel(opp_set set, opp_map c2c_map, opp_map p2c_ma opp_profiler->end("GblMv_Move"); +#ifdef USE_MPI + opp_gather_gbl_move_indices(); globalMover->communicate(set); - } #endif + } opp_profiler->start("Mv_AllMv0"); @@ -159,7 +163,6 @@ void opp_particle_move__move_kernel(opp_set set, opp_map c2c_map, opp_map p2c_ma // check whether all particles not marked for global comm is within cell, // and if not mark to move between cells within the MPI rank, mark for neighbour comm opp_profiler->start("move_kernel_only"); - const int total_count = OPP_iter_end - OPP_iter_start; #pragma omp parallel for for (int thr = 0; thr < nthreads; thr++) { @@ -217,7 +220,7 @@ void opp_particle_move__move_kernel(opp_set set, opp_map c2c_map, opp_map p2c_ma const std::string profName = std::string("Mv_AllMv") + std::to_string(OPP_comm_iteration); opp_profiler->start(profName); - opp_init_particle_move(set, 0, nullptr); + opp_init_particle_move(set, nargs, args); // check whether particle is within cell, and if not move between cells within the MPI rank, mark for neighbour comm opp_profiler->start("move_kernel_only"); @@ -300,7 +303,7 @@ inline void gen_dh_structured_mesh(opp_set set, const opp_dat c_gbl_id, opp_map opp_profiler->start("Setup_Mover_s1"); double x = 0.0, y = 0.0, z = 0.0; - #pragma omp parallel for + #pragma omp parallel for private(x, y, z) for (int dz = cellMapper->localGridStart.z; dz < cellMapper->localGridEnd.z; dz++) { z = min_glb_coords.z + dz * cellMapper->gridSpacing; for (int dy = cellMapper->localGridStart.y; dy < cellMapper->localGridEnd.y; dy++) { @@ -359,8 +362,18 @@ inline void gen_dh_structured_mesh(opp_set set, const opp_dat c_gbl_id, opp_map for (const auto& pair : removed_coords) removed_coords_keys.push_back(pair.first); + std::vector>> tmp_add_per_thr; + tmp_add_per_thr.resize(OPP_nthreads); + #pragma omp parallel for - for (size_t i = 0; i < removed_coords_keys.size(); ++i) { + for (int thr = 0; thr < OPP_nthreads; thr++) + { + const size_t start = (removed_coords_keys.size() * thr) / OPP_nthreads; + const size_t finish = (removed_coords_keys.size() * (thr+1)) / OPP_nthreads; + + for (size_t i = start; i < finish; i++) + { + const size_t index = removed_coords_keys[i]; opp_point& p = removed_coords[index]; double &x = p.x, &y = p.y, &z = p.z; @@ -394,14 +407,20 @@ inline void gen_dh_structured_mesh(opp_set set, const opp_dat c_gbl_id, opp_map } // Allow neighbours to write on-behalf of the current rank, to reduce issues - cellMapper->lockWindows(); int avail_gbl_cid = cellMapper->structMeshToCellMapping[index]; if ((most_suitable_gbl_cid != MAX_CELL_INDEX) && (most_suitable_gbl_cid < avail_gbl_cid) && - (most_suitable_cid < set->size)) { - cellMapper->enrichStructuredMesh(index, most_suitable_gbl_cid, OPP_rank); + (most_suitable_cid < set->size)) { 
tmp_add_per_thr[thr].push_back(std::make_pair(index, most_suitable_gbl_cid)); + } + } + } + + cellMapper->lockWindows(); + for (auto& thread_vec : tmp_add_per_thr) { + for (auto& thread_data : thread_vec) { + cellMapper->enrichStructuredMesh(thread_data.first, thread_data.second, OPP_rank); } - cellMapper->unlockWindows(); } + cellMapper->unlockWindows(); opp_profiler->end("Setup_Mover_s3"); // Step 4 : For MPI, get the inter-node values reduced to the structured mesh diff --git a/app_fempic_cg/omp/opp_kernels.cpp b/app_fempic_cg/omp/opp_kernels.cpp index c45c87d..5472b45 100644 --- a/app_fempic_cg/omp/opp_kernels.cpp +++ b/app_fempic_cg/omp/opp_kernels.cpp @@ -93,5 +93,7 @@ void opp_decl_const_impl(int dim, int size, char* data, const char* name) { #include "compute_electric_field_kernel_loop.hpp" +#include "get_max_cef_kernel_loop.hpp" + #include "get_final_max_values_kernel_loop.hpp" diff --git a/app_fempic_cg/seq/get_final_max_values_kernel_loop.hpp b/app_fempic_cg/seq/get_final_max_values_kernel_loop.hpp index 5752f47..5bc9f7d 100644 --- a/app_fempic_cg/seq/get_final_max_values_kernel_loop.hpp +++ b/app_fempic_cg/seq/get_final_max_values_kernel_loop.hpp @@ -3,7 +3,7 @@ // AUTO GENERATED CODE //********************************************* -namespace opp_k8 { +namespace opp_k9 { inline void get_final_max_values_kernel( const double* n_charge_den, double* max_n_charge_den, @@ -39,7 +39,7 @@ void opp_par_loop_all__get_final_max_values_kernel(opp_set set, opp_iterate_type for (int n = 0; n < iter_size; ++n) { - opp_k8::get_final_max_values_kernel( + opp_k9::get_final_max_values_kernel( (const OPP_REAL *)args[0].data + (n * 1), (OPP_REAL *)args[1].data, (const OPP_REAL *)args[2].data + (n * 1), diff --git a/app_fempic_cg/seq/get_max_cef_kernel_loop.hpp b/app_fempic_cg/seq/get_max_cef_kernel_loop.hpp new file mode 100644 index 0000000..2fea35a --- /dev/null +++ b/app_fempic_cg/seq/get_max_cef_kernel_loop.hpp @@ -0,0 +1,44 @@ + +//********************************************* +// AUTO GENERATED CODE +//********************************************* + +namespace opp_k8 { +inline void get_max_cef_kernel( + const double* val, + double* max_val) +{ + for (int dim = 0; dim < 3; ++dim) + { + *max_val = ((val[dim] > *max_val) ? 
(val[dim]) : (*max_val)); + } +} +} + +void opp_par_loop_all__get_max_cef_kernel(opp_set set, opp_iterate_type, + opp_arg arg0, // c_ef | OPP_READ + opp_arg arg1 // | OPP_MAX +) +{ + const int nargs = 2; + opp_arg args[nargs]; + + args[0] = arg0; + args[1] = arg1; + + opp_profiler->start("get_max_cef_kernel"); + + if (OPP_DBG) opp_printf("APP", "opp_par_loop_all__get_max_cef_kernel set_size %d", set->size); + + const int iter_size = set->size; + + for (int n = 0; n < iter_size; ++n) + { + opp_k8::get_max_cef_kernel( + (const OPP_REAL *)args[0].data + (n * 3), + (OPP_REAL *)args[1].data + ); + } + + opp_profiler->end("get_max_cef_kernel"); +} diff --git a/app_fempic_cg/seq/inject_ions_kernel_loop.hpp b/app_fempic_cg/seq/inject_ions_kernel_loop.hpp index d3b7312..77e9e9a 100644 --- a/app_fempic_cg/seq/inject_ions_kernel_loop.hpp +++ b/app_fempic_cg/seq/inject_ions_kernel_loop.hpp @@ -19,7 +19,7 @@ inline void inject_ions_kernel( { double a = dummy_part_random[0]; double b = dummy_part_random[1]; - if ((a + b) > 1) + if ((a + b) > 1) // TODO : Change the random dat to avoid this { a = (1 - a); b = (1 - b); diff --git a/app_fempic_cg/seq/move_kernel_loop.hpp b/app_fempic_cg/seq/move_kernel_loop.hpp index 9a037aa..4c677ec 100644 --- a/app_fempic_cg/seq/move_kernel_loop.hpp +++ b/app_fempic_cg/seq/move_kernel_loop.hpp @@ -289,8 +289,8 @@ inline void gen_dh_structured_mesh(opp_set set, const opp_dat c_gbl_id, opp_map cellMapper->lockWindows(); int avail_gbl_cid = cellMapper->structMeshToCellMapping[index]; if ((most_suitable_gbl_cid != MAX_CELL_INDEX) && (most_suitable_gbl_cid < avail_gbl_cid) && - (most_suitable_cid < set->size)) { - cellMapper->enrichStructuredMesh(index, most_suitable_gbl_cid, OPP_rank); + (most_suitable_cid < set->size)) { + cellMapper->enrichStructuredMesh(index, most_suitable_gbl_cid, OPP_rank); } cellMapper->unlockWindows(); } diff --git a/app_fempic_cg/seq/opp_kernels.cpp b/app_fempic_cg/seq/opp_kernels.cpp index 894d1f7..b5aaa83 100644 --- a/app_fempic_cg/seq/opp_kernels.cpp +++ b/app_fempic_cg/seq/opp_kernels.cpp @@ -93,5 +93,7 @@ void opp_decl_const_impl(int dim, int size, char* data, const char* name) { #include "compute_electric_field_kernel_loop.hpp" +#include "get_max_cef_kernel_loop.hpp" + #include "get_final_max_values_kernel_loop.hpp" diff --git a/app_neso_advection/advec.cpp b/app_neso_advection/advec.cpp index 65e7b4e..c178069 100644 --- a/app_neso_advection/advec.cpp +++ b/app_neso_advection/advec.cpp @@ -125,8 +125,10 @@ int main(int argc, char **argv) log += str(OPP_max_comm_iteration, "max_comm_iteration: %d"); } + const int64_t glb_parts = get_global_parts_iterated(part_set->size); OPP_RUN_ON_ROOT() - opp_printf("Main", "ts: %d parts: %d | %s ****", OPP_main_loop_iter, part_set->size, log.c_str()); + opp_printf("Main", "ts: %d parts: %d | %s ****", OPP_main_loop_iter, + glb_parts, log.c_str()); } opp_profiler->end("MainLoop"); diff --git a/app_neso_advection_cg/advec.cpp b/app_neso_advection_cg/advec.cpp index 65e7b4e..c178069 100644 --- a/app_neso_advection_cg/advec.cpp +++ b/app_neso_advection_cg/advec.cpp @@ -125,8 +125,10 @@ int main(int argc, char **argv) log += str(OPP_max_comm_iteration, "max_comm_iteration: %d"); } + const int64_t glb_parts = get_global_parts_iterated(part_set->size); OPP_RUN_ON_ROOT() - opp_printf("Main", "ts: %d parts: %d | %s ****", OPP_main_loop_iter, part_set->size, log.c_str()); + opp_printf("Main", "ts: %d parts: %d | %s ****", OPP_main_loop_iter, + glb_parts, log.c_str()); } opp_profiler->end("MainLoop"); diff --git 
a/app_neso_advection_cg/advec_opp.cpp b/app_neso_advection_cg/advec_opp.cpp index e39a227..98dc456 100644 --- a/app_neso_advection_cg/advec_opp.cpp +++ b/app_neso_advection_cg/advec_opp.cpp @@ -1,5 +1,5 @@ -// Auto-generated at 2024-05-20 13:43:42.797840 by opp-translator +// Auto-generated at 2024-05-27 13:53:22.778598 by opp-translator /* BSD 3-Clause License @@ -129,8 +129,10 @@ int main(int argc, char **argv) log += str(OPP_max_comm_iteration, "max_comm_iteration: %d"); } + const int64_t glb_parts = get_global_parts_iterated(part_set->size); OPP_RUN_ON_ROOT() - opp_printf("Main", "ts: %d parts: %d | %s ****", OPP_main_loop_iter, part_set->size, log.c_str()); + opp_printf("Main", "ts: %d parts: %d | %s ****", OPP_main_loop_iter, + glb_parts, log.c_str()); } opp_profiler->end("MainLoop"); diff --git a/app_neso_advection_cg/configs/advec.param b/app_neso_advection_cg/configs/advec.param index 5b99e99..4041e29 100644 --- a/app_neso_advection_cg/configs/advec.param +++ b/app_neso_advection_cg/configs/advec.param @@ -1,7 +1,7 @@ # Simulation parameters INT num_steps = 250 -INT nx = 128 -INT ny = 128 +INT nx = 256 +INT ny = 256 REAL dt = 0.5 REAL cell_width = 0.5 INT npart_per_cell = 1000 diff --git a/app_neso_advection_cg/cuda/move_kernel_loop.hpp b/app_neso_advection_cg/cuda/move_kernel_loop.hpp index 375e18b..d72e223 100644 --- a/app_neso_advection_cg/cuda/move_kernel_loop.hpp +++ b/app_neso_advection_cg/cuda/move_kernel_loop.hpp @@ -12,6 +12,11 @@ __constant__ OPP_INT opp_k2_dat1_stride_d; __constant__ OPP_INT opp_k2_c2c_map_stride_d; namespace opp_k2 { +enum Dim { + x = 0, + y = 1, +}; + enum CellMap { xd_y = 0, xu_y, @@ -19,11 +24,6 @@ enum CellMap { x_yu }; -enum Dim { - x = 0, - y = 1, -}; - __device__ inline void move_kernel(char& opp_move_status_flag, const bool opp_move_hop_iter_one_flag, // Added by code-gen const OPP_INT* opp_c2c, OPP_INT* opp_p2c, // Added by code-gen const double* part_pos, const double* cell_pos_ll) diff --git a/app_neso_advection_cg/hip/move_kernel_loop.hpp b/app_neso_advection_cg/hip/move_kernel_loop.hpp index 01bfe47..21dd9dc 100644 --- a/app_neso_advection_cg/hip/move_kernel_loop.hpp +++ b/app_neso_advection_cg/hip/move_kernel_loop.hpp @@ -12,6 +12,11 @@ __constant__ OPP_INT opp_k2_dat1_stride_d; __constant__ OPP_INT opp_k2_c2c_map_stride_d; namespace opp_k2 { +enum Dim { + x = 0, + y = 1, +}; + enum CellMap { xd_y = 0, xu_y, @@ -19,11 +24,6 @@ enum CellMap { x_yu }; -enum Dim { - x = 0, - y = 1, -}; - __device__ inline void move_kernel(char& opp_move_status_flag, const bool opp_move_hop_iter_one_flag, // Added by code-gen const OPP_INT* opp_c2c, OPP_INT* opp_p2c, // Added by code-gen const double* part_pos, const double* cell_pos_ll) diff --git a/app_neso_advection_cg/mpi/move_kernel_loop.hpp b/app_neso_advection_cg/mpi/move_kernel_loop.hpp index b82326d..fa667d7 100644 --- a/app_neso_advection_cg/mpi/move_kernel_loop.hpp +++ b/app_neso_advection_cg/mpi/move_kernel_loop.hpp @@ -4,6 +4,11 @@ //********************************************* namespace opp_k2 { +enum Dim { + x = 0, + y = 1, +}; + enum CellMap { xd_y = 0, xu_y, @@ -11,11 +16,6 @@ enum CellMap { x_yu }; -enum Dim { - x = 0, - y = 1, -}; - inline void move_kernel(const double* part_pos, const double* cell_pos_ll) { // check for x direction movement @@ -91,7 +91,7 @@ void opp_particle_move__move_kernel(opp_set set, opp_map c2c_map, opp_map p2c_ma }; // ---------------------------------------------------------------------------- - opp_init_particle_move(set, 0, nullptr); + 
opp_init_particle_move(set, nargs, args); if (useGlobalMove) { @@ -170,7 +170,7 @@ void opp_particle_move__move_kernel(opp_set set, opp_map c2c_map, opp_map p2c_ma const std::string profName = std::string("Mv_AllMv") + std::to_string(OPP_comm_iteration); opp_profiler->start(profName); - opp_init_particle_move(set, 0, nullptr); + opp_init_particle_move(set, nargs, args); // check whether particle is within cell, and if not move between cells within the MPI rank, mark for neighbour comm opp_profiler->start("move_kernel_only"); @@ -328,8 +328,8 @@ inline void gen_dh_structured_mesh(opp_set set, const opp_dat c_gbl_id, opp_map cellMapper->lockWindows(); int avail_gbl_cid = cellMapper->structMeshToCellMapping[index]; if ((most_suitable_gbl_cid != MAX_CELL_INDEX) && (most_suitable_gbl_cid < avail_gbl_cid) && - (most_suitable_cid < set->size)) { - cellMapper->enrichStructuredMesh(index, most_suitable_gbl_cid, OPP_rank); + (most_suitable_cid < set->size)) { + cellMapper->enrichStructuredMesh(index, most_suitable_gbl_cid, OPP_rank); } cellMapper->unlockWindows(); } diff --git a/app_neso_advection_cg/omp/move_kernel_loop.hpp b/app_neso_advection_cg/omp/move_kernel_loop.hpp index c59ee43..98973fb 100644 --- a/app_neso_advection_cg/omp/move_kernel_loop.hpp +++ b/app_neso_advection_cg/omp/move_kernel_loop.hpp @@ -4,6 +4,11 @@ //********************************************* namespace opp_k2 { +enum Dim { + x = 0, + y = 1, +}; + enum CellMap { xd_y = 0, xu_y, @@ -11,11 +16,6 @@ enum CellMap { x_yu }; -enum Dim { - x = 0, - y = 1, -}; - inline void move_kernel(char& opp_move_status_flag, const bool opp_move_hop_iter_one_flag, // Added by code-gen const OPP_INT* opp_c2c, OPP_INT* opp_p2c, // Added by code-gen const double* part_pos, const double* cell_pos_ll) @@ -99,12 +99,14 @@ void opp_particle_move__move_kernel(opp_set set, opp_map c2c_map, opp_map p2c_ma }; // ---------------------------------------------------------------------------- - opp_init_particle_move(set, 0, nullptr); + opp_init_particle_move(set, nargs, args); + const int total_count = OPP_iter_end - OPP_iter_start; -#ifdef USE_MPI if (useGlobalMove) { // For now Global Move will not work with MPI - + +#ifdef USE_MPI globalMover->initGlobalMove(); +#endif opp_profiler->start("GblMv_Move"); @@ -112,8 +114,8 @@ void opp_particle_move__move_kernel(opp_set set, opp_map c2c_map, opp_map p2c_ma #pragma omp parallel for for (int thr = 0; thr < nthreads; thr++) { - const size_t start = ((size_t)size * thr) / nthreads; - const size_t finish = ((size_t)size * (thr+1)) / nthreads; + const size_t start = ((size_t)total_count * thr) / nthreads; + const size_t finish = ((size_t)total_count * (thr+1)) / nthreads; for (size_t i = start; i < finish; i++) { @@ -122,9 +124,9 @@ void opp_particle_move__move_kernel(opp_set set, opp_map c2c_map, opp_map p2c_ma const opp_point* point = (const opp_point*)&(((OPP_REAL*)args[0].data)[i * 2]); // check for global move, and if satisfy global move criteria, then remove the particle from current rank - if (opp_part_checkForGlobalMove2D(set, *point, i, opp_p2c[0])) { + if (opp_part_checkForGlobalMove2D(set, *point, i, opp_p2c[0], thr)) { - set->particle_remove_count++; + part_remove_count_per_thr[thr] += 1; continue; } } @@ -132,9 +134,11 @@ void opp_particle_move__move_kernel(opp_set set, opp_map c2c_map, opp_map p2c_ma opp_profiler->end("GblMv_Move"); +#ifdef USE_MPI + opp_gather_gbl_move_indices(); globalMover->communicate(set); - } #endif + } opp_profiler->start("Mv_AllMv0"); @@ -142,7 +146,6 @@ void 
opp_particle_move__move_kernel(opp_set set, opp_map c2c_map, opp_map p2c_ma // check whether all particles not marked for global comm is within cell, // and if not mark to move between cells within the MPI rank, mark for neighbour comm opp_profiler->start("move_kernel_only"); - const int total_count = OPP_iter_end - OPP_iter_start; #pragma omp parallel for for (int thr = 0; thr < nthreads; thr++) { @@ -200,7 +203,7 @@ void opp_particle_move__move_kernel(opp_set set, opp_map c2c_map, opp_map p2c_ma const std::string profName = std::string("Mv_AllMv") + std::to_string(OPP_comm_iteration); opp_profiler->start(profName); - opp_init_particle_move(set, 0, nullptr); + opp_init_particle_move(set, nargs, args); // check whether particle is within cell, and if not move between cells within the MPI rank, mark for neighbour comm opp_profiler->start("move_kernel_only"); @@ -279,7 +282,7 @@ inline void gen_dh_structured_mesh(opp_set set, const opp_dat c_gbl_id, opp_map opp_profiler->start("Setup_Mover_s1"); double x = 0.0, y = 0.0, z = 0.0; - #pragma omp parallel for + #pragma omp parallel for private(x, y, z) for (int dz = cellMapper->localGridStart.z; dz < cellMapper->localGridEnd.z; dz++) { z = min_glb_coords.z + dz * cellMapper->gridSpacing; for (int dy = cellMapper->localGridStart.y; dy < cellMapper->localGridEnd.y; dy++) { @@ -338,8 +341,18 @@ inline void gen_dh_structured_mesh(opp_set set, const opp_dat c_gbl_id, opp_map for (const auto& pair : removed_coords) removed_coords_keys.push_back(pair.first); + std::vector>> tmp_add_per_thr; + tmp_add_per_thr.resize(OPP_nthreads); + #pragma omp parallel for - for (size_t i = 0; i < removed_coords_keys.size(); ++i) { + for (int thr = 0; thr < OPP_nthreads; thr++) + { + const size_t start = (removed_coords_keys.size() * thr) / OPP_nthreads; + const size_t finish = (removed_coords_keys.size() * (thr+1)) / OPP_nthreads; + + for (size_t i = start; i < finish; i++) + { + const size_t index = removed_coords_keys[i]; opp_point& p = removed_coords[index]; double &x = p.x, &y = p.y, &z = p.z; @@ -373,14 +386,20 @@ inline void gen_dh_structured_mesh(opp_set set, const opp_dat c_gbl_id, opp_map } // Allow neighbours to write on-behalf of the current rank, to reduce issues - cellMapper->lockWindows(); int avail_gbl_cid = cellMapper->structMeshToCellMapping[index]; if ((most_suitable_gbl_cid != MAX_CELL_INDEX) && (most_suitable_gbl_cid < avail_gbl_cid) && - (most_suitable_cid < set->size)) { - cellMapper->enrichStructuredMesh(index, most_suitable_gbl_cid, OPP_rank); + (most_suitable_cid < set->size)) { tmp_add_per_thr[thr].push_back(std::make_pair(index, most_suitable_gbl_cid)); + } + } + } + + cellMapper->lockWindows(); + for (auto& thread_vec : tmp_add_per_thr) { + for (auto& thread_data : thread_vec) { + cellMapper->enrichStructuredMesh(thread_data.first, thread_data.second, OPP_rank); } - cellMapper->unlockWindows(); } + cellMapper->unlockWindows(); opp_profiler->end("Setup_Mover_s3"); // Step 4 : For MPI, get the inter-node values reduced to the structured mesh diff --git a/app_neso_advection_cg/omp/verify_kernel_loop.hpp b/app_neso_advection_cg/omp/verify_kernel_loop.hpp index c95b23e..e161c8a 100644 --- a/app_neso_advection_cg/omp/verify_kernel_loop.hpp +++ b/app_neso_advection_cg/omp/verify_kernel_loop.hpp @@ -84,8 +84,8 @@ void opp_par_loop_all__verify_kernel(opp_set set, opp_iterate_type, #pragma omp parallel for for (int thr = 0; thr < nthreads; thr++) { - const size_t start = (iter_size * thr) / nthreads; - const size_t finish = (iter_size * (thr+1)) / 
nthreads; + const size_t start = ((size_t)iter_size * thr) / nthreads; + const size_t finish = ((size_t)iter_size * (thr+1)) / nthreads; for (size_t n = start; n < finish; n++) { diff --git a/app_neso_advection_cg/seq/move_kernel_loop.hpp b/app_neso_advection_cg/seq/move_kernel_loop.hpp index 881c18d..eccb1ea 100644 --- a/app_neso_advection_cg/seq/move_kernel_loop.hpp +++ b/app_neso_advection_cg/seq/move_kernel_loop.hpp @@ -4,6 +4,11 @@ //********************************************* namespace opp_k2 { +enum Dim { + x = 0, + y = 1, +}; + enum CellMap { xd_y = 0, xu_y, @@ -11,11 +16,6 @@ enum CellMap { x_yu }; -enum Dim { - x = 0, - y = 1, -}; - inline void move_kernel(const double* part_pos, const double* cell_pos_ll) { // check for x direction movement @@ -268,8 +268,8 @@ inline void gen_dh_structured_mesh(opp_set set, const opp_dat c_gbl_id, opp_map cellMapper->lockWindows(); int avail_gbl_cid = cellMapper->structMeshToCellMapping[index]; if ((most_suitable_gbl_cid != MAX_CELL_INDEX) && (most_suitable_gbl_cid < avail_gbl_cid) && - (most_suitable_cid < set->size)) { - cellMapper->enrichStructuredMesh(index, most_suitable_gbl_cid, OPP_rank); + (most_suitable_cid < set->size)) { + cellMapper->enrichStructuredMesh(index, most_suitable_gbl_cid, OPP_rank); } cellMapper->unlockWindows(); } diff --git a/app_simpic/README.md b/app_simpic/README.md index 22edc89..d9d5ae7 100644 --- a/app_simpic/README.md +++ b/app_simpic/README.md @@ -22,6 +22,11 @@ https://bitbucket.org/lecad-peg/simpic/src/ab6a92dea645ee39747aff214884a51a14802 *Both these issues will make the shared memory serial and distributed memory runs deviate even if we use the same number of particles per cell and the same total number of particles and mesh elements per simulation* +## TODO +If MPI versions of Simpic need to be developed with OP-PIC, the above issues should be addressed. + +In addition, the mesh/particles should be distributed, an appropriate partitioning routine should be called, and the field solver needs to be re-implemented, perhaps using PETSc. + ## Structure * `simpic.cpp` : The main file containing OP-PIC API calls. * `kernels.h` : The user written elemental kernel functions. diff --git a/app_simpic_cg/README.md b/app_simpic_cg/README.md index f3a00d1..6ce35da 100644 --- a/app_simpic_cg/README.md +++ b/app_simpic_cg/README.md @@ -36,6 +36,11 @@ https://bitbucket.org/lecad-peg/simpic/src/ab6a92dea645ee39747aff214884a51a14802 *Both these issues will make the shared memory serial and distributed memory runs deviate even if we use the same number of particles per cell and the same total number of particles and mesh elements per simulation* +## TODO +If MPI versions of Simpic need to be developed with OP-PIC, the above issues should be addressed. + +In addition, the mesh/particles should be distributed, an appropriate partitioning routine should be called, and the field solver needs to be re-implemented, perhaps using PETSc. + ## Structure * `simpic.cpp` : The main file containing OP-PIC API calls. * `kernels.h` : The user written elemental kernel functions.
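Note on the SimPIC TODO above: the suggested PETSc route would replace the per-rank Gaussian elimination of the tridiagonal system (currently hand-coded in `funcs_from_simpic.cpp`) with a library solve. Below is a minimal, self-contained sketch, not part of this patch, of assembling and solving the 1D Poisson system phi[j+1] - 2*phi[j] + phi[j-1] = p[j] with PETSc's KSP. The mesh size `n`, the constant right-hand side, and the runtime-selected solver options are illustrative assumptions only; an actual port would build the RHS from the node charge-density dat and apply SimPIC's boundary handling.

```cpp
// Illustrative only: a PETSc KSP solve of the tridiagonal 1D Poisson system
// phi[j+1] - 2*phi[j] + phi[j-1] = p[j]. Mesh size and RHS are placeholders.
#include <petscksp.h>

int main(int argc, char** argv)
{
    PetscInitialize(&argc, &argv, NULL, NULL);

    const PetscInt n = 128;                 // number of field nodes (assumed value)
    Mat A;  Vec b, x;  KSP ksp;

    // Assemble the (1, -2, 1) tridiagonal operator; each rank fills only its own rows
    MatCreate(PETSC_COMM_WORLD, &A);
    MatSetSizes(A, PETSC_DECIDE, PETSC_DECIDE, n, n);
    MatSetFromOptions(A);
    MatSetUp(A);
    PetscInt istart, iend;
    MatGetOwnershipRange(A, &istart, &iend);
    for (PetscInt i = istart; i < iend; ++i) {
        if (i > 0)     MatSetValue(A, i, i - 1,  1.0, INSERT_VALUES);
        MatSetValue(A, i, i, -2.0, INSERT_VALUES);
        if (i < n - 1) MatSetValue(A, i, i + 1,  1.0, INSERT_VALUES);
    }
    MatAssemblyBegin(A, MAT_FINAL_ASSEMBLY);
    MatAssemblyEnd(A, MAT_FINAL_ASSEMBLY);

    // Right-hand side p[j]; a real port would scatter the node charge density here
    VecCreate(PETSC_COMM_WORLD, &b);
    VecSetSizes(b, PETSC_DECIDE, n);
    VecSetFromOptions(b);
    VecSet(b, 1.0);
    VecDuplicate(b, &x);

    // Solver and preconditioner are chosen at run time (-ksp_type, -pc_type)
    KSPCreate(PETSC_COMM_WORLD, &ksp);
    KSPSetOperators(ksp, A, A);
    KSPSetFromOptions(ksp);
    KSPSolve(ksp, b, x);

    KSPDestroy(&ksp);
    MatDestroy(&A);
    VecDestroy(&b);
    VecDestroy(&x);
    PetscFinalize();
    return 0;
}
```

Since the KSP and PC types are runtime-selectable (for example `-ksp_type cg -pc_type jacobi`, or `-pc_type lu` for a direct solve), the distributed-memory build could reuse the same solver path instead of special-casing rank 0 and the last rank inside the elimination itself.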
diff --git a/app_simpic_cg/funcs_from_simpic.cpp b/app_simpic_cg/funcs_from_simpic.cpp index f958361..9cfdbcf 100644 --- a/app_simpic_cg/funcs_from_simpic.cpp +++ b/app_simpic_cg/funcs_from_simpic.cpp @@ -178,54 +178,46 @@ void seq_field_solve_poissons_equation(opp_set set, opp_arg arg0, opp_arg arg1) // modify density array double nlold, nrold; - if(rank == 0) - { + + if(rank == 0) { nlold = node0_field_J[0]; node0_field_J[0] = 0.; } - else - { + else { node0_field_J[0] *= 2; } - if(rank == last) - { + + if(rank == last) { nrold = node0_field_J[nc]; node0_field_J[nc] = 0.; } - else - { + else { node0_field_J[nc] *= 2; } - int nstrt = 0; - - // Tridiagonal matrix of Poisson equation 𝜙𝑗+1−2𝜙𝑗+𝜙𝑗−1=𝑝𝑗 is solved with Gaussian elimination - // by each processor + // Tridiagonal matrix of Poisson eq. 𝜙𝑗+1−2𝜙𝑗+𝜙𝑗−1=𝑝𝑗 is solved with Gaussian elimination by each processor { - int j; - double bet = tri_b[nstrt]; - node0_field_P[nstrt] = node0_field_J[nstrt]/bet; + double bet = tri_b[0]; + node0_field_P[0] = node0_field_J[0] / bet; - for(j = nstrt + 1; j < set_size; j++) + for(int j = 1; j < set_size; j++) { - gam[j] = tri_c[j-1]/bet; - bet = tri_b[j] - tri_a[j]*gam[j]; - node0_field_P[j] = (node0_field_J[j] - tri_a[j]*node0_field_P[j-1])/bet; + gam[j] = tri_c[j-1] / bet; + bet = tri_b[j] - tri_a[j] * gam[j]; + node0_field_P[j] = (node0_field_J[j] - tri_a[j] * node0_field_P[j-1]) / bet; } - for(j = nc - 1; j >= nstrt; j--) + for(int j = nc - 1; j >= 0; j--) { node0_field_P[j] -= (gam[j+1] * node0_field_P[j+1]); } } // restore density array - if(rank == 0) - { + if(rank == 0) { node0_field_J[0] = 2*nlold; } - if(rank == last) - { + if(rank == last) { node0_field_J[nc] = 2*nrold; } diff --git a/app_simpic_cg/mpi/move_kernel_loop.hpp b/app_simpic_cg/mpi/move_kernel_loop.hpp index d8bd052..7c145cc 100644 --- a/app_simpic_cg/mpi/move_kernel_loop.hpp +++ b/app_simpic_cg/mpi/move_kernel_loop.hpp @@ -92,7 +92,7 @@ void opp_particle_move__move_kernel(opp_set set, opp_map c2c_map, opp_map p2c_ma }; // ---------------------------------------------------------------------------- - opp_init_particle_move(set, 0, nullptr); + opp_init_particle_move(set, nargs, args); opp_profiler->start("Mv_AllMv0"); @@ -120,7 +120,7 @@ void opp_particle_move__move_kernel(opp_set set, opp_map c2c_map, opp_map p2c_ma const std::string profName = std::string("Mv_AllMv") + std::to_string(OPP_comm_iteration); opp_profiler->start(profName); - opp_init_particle_move(set, 0, nullptr); + opp_init_particle_move(set, nargs, args); // check whether particle is within cell, and if not move between cells within the MPI rank, mark for neighbour comm opp_profiler->start("move_kernel_only"); diff --git a/app_simpic_cg/omp/move_kernel_loop.hpp b/app_simpic_cg/omp/move_kernel_loop.hpp index 4bfb3c5..c8e7c77 100644 --- a/app_simpic_cg/omp/move_kernel_loop.hpp +++ b/app_simpic_cg/omp/move_kernel_loop.hpp @@ -100,7 +100,8 @@ void opp_particle_move__move_kernel(opp_set set, opp_map c2c_map, opp_map p2c_ma }; // ---------------------------------------------------------------------------- - opp_init_particle_move(set, 0, nullptr); + opp_init_particle_move(set, nargs, args); + const int total_count = OPP_iter_end - OPP_iter_start; opp_profiler->start("Mv_AllMv0"); @@ -109,7 +110,6 @@ void opp_particle_move__move_kernel(opp_set set, opp_map c2c_map, opp_map p2c_ma // check whether all particles not marked for global comm is within cell, // and if not mark to move between cells within the MPI rank, mark for neighbour comm 
opp_profiler->start("move_kernel_only"); - const int total_count = OPP_iter_end - OPP_iter_start; #pragma omp parallel for for (int thr = 0; thr < nthreads; thr++) { @@ -134,7 +134,7 @@ void opp_particle_move__move_kernel(opp_set set, opp_map c2c_map, opp_map p2c_ma const std::string profName = std::string("Mv_AllMv") + std::to_string(OPP_comm_iteration); opp_profiler->start(profName); - opp_init_particle_move(set, 0, nullptr); + opp_init_particle_move(set, nargs, args); // check whether particle is within cell, and if not move between cells within the MPI rank, mark for neighbour comm opp_profiler->start("move_kernel_only"); diff --git a/app_simpic_cg/omp/weight_f2p_kernel_loop.hpp b/app_simpic_cg/omp/weight_f2p_kernel_loop.hpp index e3b40d5..7c2a1bb 100644 --- a/app_simpic_cg/omp/weight_f2p_kernel_loop.hpp +++ b/app_simpic_cg/omp/weight_f2p_kernel_loop.hpp @@ -51,8 +51,8 @@ void opp_par_loop_all__weight_f2p_kernel(opp_set set, opp_iterate_type, #pragma omp parallel for for (int thr = 0; thr < nthreads; thr++) { - const size_t start = (iter_size * thr) / nthreads; - const size_t finish = (iter_size * (thr+1)) / nthreads; + const size_t start = ((size_t)iter_size * thr) / nthreads; + const size_t finish = ((size_t)iter_size * (thr+1)) / nthreads; for (size_t n = start; n < finish; n++) { diff --git a/app_simpic_cg/omp/weight_p2f_kernel_loop.hpp b/app_simpic_cg/omp/weight_p2f_kernel_loop.hpp index ddc9eb3..53c9a34 100644 --- a/app_simpic_cg/omp/weight_p2f_kernel_loop.hpp +++ b/app_simpic_cg/omp/weight_p2f_kernel_loop.hpp @@ -54,8 +54,8 @@ void opp_par_loop_all__weight_p2f_kernel(opp_set set, opp_iterate_type, #pragma omp parallel for for (int thr = 0; thr < nthreads; thr++) { - const size_t start = (iter_size * thr) / nthreads; - const size_t finish = (iter_size * (thr+1)) / nthreads; + const size_t start = ((size_t)iter_size * thr) / nthreads; + const size_t finish = ((size_t)iter_size * (thr+1)) / nthreads; OPP_REAL* arg0_dat_thread_data = (OPP_REAL *)((*(args[0].dat->thread_data))[thr]); diff --git a/opp_lib/Makefile b/opp_lib/Makefile index 6202cfe..fe51bbf 100644 --- a/opp_lib/Makefile +++ b/opp_lib/Makefile @@ -95,14 +95,22 @@ seq: mklib $(CPP) $(CPPFLAGS) -c $(SRC)/core/opp_util.cpp -o $(OBJ)/opp_util.o $(OPP_INC) $(PETSC_INC) $(CPP) $(CPPFLAGS) -c $(SRC)/core/opp_params.cpp -o $(OBJ)/opp_params.o $(OPP_INC) $(PETSC_INC) $(CPP) $(CPPFLAGS) -c $(SRC)/seq/opp_seq.cpp -o $(OBJ)/opp_seq.o $(OPP_INC) $(PETSC_INC) - ar -r $(LIB)/libopp_seq.a $(OBJ)/opp_lib_core.o $(OBJ)/opp_util.o $(OBJ)/opp_params.o $(OBJ)/opp_seq.o + ar -r $(LIB)/libopp_seq.a \ + $(OBJ)/opp_lib_core.o \ + $(OBJ)/opp_util.o \ + $(OBJ)/opp_params.o \ + $(OBJ)/opp_seq.o omp: mklib $(CPP) $(CPPFLAGS) -fopenmp -c $(SRC)/core/opp_lib_core.cpp -o $(OBJ)/opp_lib_core.o $(OPP_INC) $(PETSC_INC) $(CPP) $(CPPFLAGS) -fopenmp -c $(SRC)/core/opp_util.cpp -o $(OBJ)/opp_util.o $(OPP_INC) $(PETSC_INC) $(CPP) $(CPPFLAGS) -fopenmp -c $(SRC)/core/opp_params.cpp -o $(OBJ)/opp_params.o $(OPP_INC) $(PETSC_INC) $(CPP) $(CPPFLAGS) -fopenmp -c $(SRC)/omp/opp_omp.cpp -o $(OBJ)/opp_omp.o $(OPP_INC) $(PETSC_INC) - ar -r $(LIB)/libopp_omp.a $(OBJ)/opp_lib_core.o $(OBJ)/opp_util.o $(OBJ)/opp_params.o $(OBJ)/opp_omp.o + ar -r $(LIB)/libopp_omp.a \ + $(OBJ)/opp_lib_core.o \ + $(OBJ)/opp_util.o \ + $(OBJ)/opp_params.o \ + $(OBJ)/opp_omp.o cuda: mklib $(CPP) $(CPPFLAGS) -DUSE_THRUST -c $(SRC)/core/opp_lib_core.cpp -o $(OBJ)/opp_lib_core.o $(OPP_INC) $(PETSC_INC) $(CUDA_INC) @@ -225,7 +233,18 @@ omp_mpi: mklib $(MPICPP) $(CPPFLAGS) -fopenmp -DUSE_MPI -c 
$(SRC)/mpi/opp_mpi_particle_comm.cpp -o $(OBJ)/opp_mpi_particle_comm.o $(OPP_INC) $(PETSC_INC) $(MPICPP) $(CPPFLAGS) -fopenmp -DUSE_MPI -c $(SRC)/mpi/opp_mpi_double_ind_reducs.cpp -o $(OBJ)/opp_mpi_double_ind_reducs.o $(OPP_INC) $(PETSC_INC) $(MPICPP) $(CPPFLAGS) -fopenmp -DUSE_MPI -c $(SRC)/mpi/opp_mpi_utils.cpp -o $(OBJ)/opp_mpi_utils.o $(OPP_INC) $(PETSC_INC) - ar -r $(LIB)/libopp_omp_mpi.a $(OBJ)/opp_lib_core.o $(OBJ)/opp_util.o $(OBJ)/opp_params.o $(OBJ)/opp_omp.o $(OBJ)/opp_mpi_halo.o $(OBJ)/opp_mpi_halo_core.o $(OBJ)/opp_mpi_partition.o $(OBJ)/opp_mpi_core.o $(OBJ)/opp_mpi_particle_comm.o $(OBJ)/opp_mpi_double_ind_reducs.o $(OBJ)/opp_mpi_utils.o + ar -r $(LIB)/libopp_omp_mpi.a \ + $(OBJ)/opp_lib_core.o \ + $(OBJ)/opp_util.o \ + $(OBJ)/opp_params.o \ + $(OBJ)/opp_omp.o \ + $(OBJ)/opp_mpi_halo.o \ + $(OBJ)/opp_mpi_halo_core.o \ + $(OBJ)/opp_mpi_partition.o \ + $(OBJ)/opp_mpi_core.o \ + $(OBJ)/opp_mpi_particle_comm.o \ + $(OBJ)/opp_mpi_double_ind_reducs.o \ + $(OBJ)/opp_mpi_utils.o clean: rm -f *.o *.d *.a diff --git a/opp_lib/include/opp_omp.h b/opp_lib/include/opp_omp.h index fd4b000..ce4a657 100644 --- a/opp_lib/include/opp_omp.h +++ b/opp_lib/include/opp_omp.h @@ -39,44 +39,64 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include #endif +struct opp_tmp_gbl_move_info +{ + opp_set set = nullptr; + OPP_INT part_idx = -1; + int struct_cell_rank = MAX_CELL_INDEX; + OPP_INT gbl_cell_idx = MAX_CELL_INDEX; + + opp_tmp_gbl_move_info(opp_set _set, OPP_INT _part_idx, int _struct_cell_rank, OPP_INT _gbl_cell_idx) : + set(_set), part_idx(_part_idx), struct_cell_rank(_struct_cell_rank), gbl_cell_idx(_gbl_cell_idx) {} +}; + +extern int OPP_nthreads; extern std::vector part_remove_count_per_thr; +extern std::vector> move_part_indices_per_thr; +extern std::vector> gbl_move_indices_per_thr; // TODO : May need to write with array capcity and realloc always if partticle dats are used template void opp_create_thread_level_data(opp_arg arg, T init_value) { opp_dat dat = arg.dat; - int nthreads = omp_get_max_threads(); + opp_set set = dat->set; + const int nthreads = omp_get_max_threads(); if (dat->set->is_particle) { - std::cerr << "Cannot create thread level data for particle dat [" << dat->name << "] (dat in a dynamic set)" << std::endl; + std::cerr << "Cannot create thread level data for particle dat [" << dat->name << + "] (dat in a dynamic set)" << std::endl; exit(-1); } if (OPP_DBG) printf("opp_create_thread_level_data template[%d]\n", nthreads); + const size_t dat_set_size = (size_t)(set->size) + (size_t)(set->exec_size) + (size_t)(set->nonexec_size); + if (dat->thread_data->size() <= 0) { dat->thread_data->push_back(dat->data); for (int thr = 1; thr < nthreads; thr++) { - char* thr_data = (char *)opp_host_malloc((size_t)dat->size * (size_t)(dat->set->size) * sizeof(char));; + char* thr_data = (char *)opp_host_malloc((size_t)dat->size * dat_set_size * sizeof(char));; dat->thread_data->push_back(thr_data); } } if ((int)dat->thread_data->size() != nthreads) { - std::cerr << "opp_create_thread_level_data dat [" << dat->name << "] thread_data not properly created [(int)dat->thread_data.size():" << (int)dat->thread_data->size() << " nthreads:" << nthreads << std::endl; + std::cerr << "opp_create_thread_level_data dat [" << dat->name << + "] thread_data not properly created [(int)dat->thread_data.size():" << (int)dat->thread_data->size() << + " nthreads:" << nthreads << std::endl; return; } #pragma omp parallel for for (int thr = 1; thr < nthreads; thr++) { - 
std::fill_n((T*)(dat->thread_data->at(thr)), (dat->dim * dat->set->size), init_value); + std::fill_n((T*)(dat->thread_data->at(thr)), (dat->dim * dat_set_size), init_value); } } @@ -94,12 +114,14 @@ void opp_reduce_thread_level_data(opp_arg arg) if (set->size > 0) { + const size_t dat_set_size = (size_t)(set->size) + (size_t)(set->exec_size) + (size_t)(set->nonexec_size); + #pragma omp parallel for for (int thr = 0; thr < nthreads; thr++) { - int start = ((dat->dim * set->size)* thr)/nthreads; - int finish = ((dat->dim * set->size)*(thr+1))/nthreads; - + int start = ((dat->dim * dat_set_size)* thr)/nthreads; + int finish = ((dat->dim * dat_set_size)*(thr+1))/nthreads; + // if (OPP_DBG) printf("opp_reduce_thread_level_data THREAD [%d] %d %d\n", thr, start, finish); for (int n = start; n < finish; n++) { for (int array_num = 1; array_num < nthreads; array_num++) @@ -112,10 +134,12 @@ void opp_reduce_thread_level_data(opp_arg arg) ((T*)dat->data)[n] += td[n]; break; default: - std::cerr << "opp_reduce_thread_level_data dat [" << dat->name << "] acc [" << (int)arg.acc << "] not implemented" << std::endl; + std::cerr << "opp_reduce_thread_level_data dat [" << dat->name << "] acc [" << + (int)arg.acc << "] not implemented" << std::endl; } } } + // if (OPP_DBG) printf("opp_reduce_thread_level_data THREAD [%d] END\n", thr); } } } @@ -138,14 +162,22 @@ void opp_mpi_print_dat_to_txtfile(opp_dat dat, const char *file_name); inline void opp_mpi_reduce(opp_arg *args, double *data) { +#ifdef USE_MPI + opp_mpi_reduce_double(args, data); +#else (void)args; (void)data; +#endif } inline void opp_mpi_reduce(opp_arg *args, int *data) { +#ifdef USE_MPI + opp_mpi_reduce_int(args, data); +#else (void)args; (void)data; +#endif } enum oppx_move_status : char @@ -192,12 +224,12 @@ inline bool opp_check_part_move_status(const char move_flag, bool& iter_one_flag //******************************************************************************* // returns true, if the current particle needs to be removed from the rank -inline bool opp_part_checkForGlobalMove_util(opp_set set, const opp_point& point, const int partIndex, int& cellIdx, - const size_t structCellIdx) { +inline bool opp_part_checkForGlobalMove_util(opp_set set, const opp_point& point, const int partIndex, + int& cellIdx, const size_t structCellIdx, const int thread) { if (structCellIdx == MAX_CELL_INDEX) { // This happens when point is out of the unstructured mesh - if (OPP_DBG) - opp_printf("move", + if (false && OPP_DBG) + opp_printf("opp_part_checkForGlobalMove", "Remove %d [Struct cell index invalid - strCellIdx:%zu] [%2.16lE, %2.16lE, %2.16lE]", partIndex, structCellIdx, point.x, point.y, point.z); @@ -205,24 +237,93 @@ inline bool opp_part_checkForGlobalMove_util(opp_set set, const opp_point& point return true; } - cellIdx = cellMapper->findClosestCellIndex(structCellIdx); - if (cellIdx == MAX_CELL_INDEX) { // Particle is outside the mesh, need to remove +#ifdef USE_MPI + const int structCellRank = cellMapper->findClosestCellRank(structCellIdx); + + // Check whether the paticles need global moving, if yes start global moving process, + // if no, move to the closest local cell + if (structCellRank != OPP_rank) { + + if (structCellRank == MAX_CELL_INDEX) { + if (false && OPP_DBG) + opp_printf("opp_part_checkForGlobalMove", + "Remove %d [Rank invalid - strCellRank:%d loclCellIdx:%zu strCellIdx:%zu] [%2.16lE, %2.16lE, %2.16lE]", + partIndex, structCellRank, cellMapper->findClosestCellIndex(structCellIdx), structCellIdx, + point.x, point.y, point.z); + 
cellIdx = MAX_CELL_INDEX; + return true; + } + + // Due to renumbering local cell indices will be different to global, hence do global comm with global indices + const size_t globalCellIndex = cellMapper->findClosestCellIndex(structCellIdx); + + if (globalCellIndex == MAX_CELL_INDEX) { + if (false && OPP_DBG) + opp_printf("opp_part_checkForGlobalMove", + "Remove %d [CellIdx invalid - strCellRank:%d loclCellIdx:%zu strCellIdx:%zu] [%2.16lE, %2.16lE, %2.16lE]", + partIndex, structCellRank, globalCellIndex, structCellIdx, point.x, point.y, point.z); + cellIdx = MAX_CELL_INDEX; + return true; + } + + // if the new rank is not the current rank, mark the particle to be sent via global comm + // globalMover->markParticleToMove(set, partIndex, structCellRank, globalCellIndex); + gbl_move_indices_per_thr[thread].push_back( + opp_tmp_gbl_move_info(set, partIndex, structCellRank, globalCellIndex)); + // if (OPP_DBG) + // opp_printf("opp_part_checkForGlobalMove", "Mark part %d [Move to rank %d gblCellIdx %d]", + // partIndex, structCellRank, globalCellIndex); + cellIdx = MAX_CELL_INDEX; return true; } + else +#endif + { + + // Due to renumbering local cell indices will be different to global, hence do global comm with global indices + cellIdx = cellMapper->findClosestCellIndex(structCellIdx); + + if (false && OPP_DBG && (cellIdx < 0 || cellIdx >= set->cells_set->size)) { + opp_printf("opp_part_checkForGlobalMove", + "Error... Particle %d assigned to current rank but invalid cell index %d [strCellIdx:%zu]", + partIndex, cellIdx, structCellIdx); + opp_abort("opp_part_checkForGlobalMove Error... Particle assigned to current rank but invalid cell index"); + } + if (cellIdx == MAX_CELL_INDEX) { // Particle is outside the mesh, need to remove + return true; + } + } + return false; } //******************************************************************************* // returns true, if the current particle needs to be removed from the rank -inline bool opp_part_checkForGlobalMove2D(opp_set set, const opp_point& point, const int partIndex, int& cellIdx) { +inline bool opp_part_checkForGlobalMove2D(opp_set set, const opp_point& point, const int partIndex, + int& cellIdx, const int thread) { const size_t structCellIdx = cellMapper->findStructuredCellIndex2D(point); - return opp_part_checkForGlobalMove_util(set, point, partIndex, cellIdx, structCellIdx); + return opp_part_checkForGlobalMove_util(set, point, partIndex, cellIdx, structCellIdx, thread); } //******************************************************************************* // returns true, if the current particle needs to be removed from the rank -inline bool opp_part_checkForGlobalMove3D(opp_set set, const opp_point& point, const int partIndex, int& cellIdx) { +inline bool opp_part_checkForGlobalMove3D(opp_set set, const opp_point& point, const int partIndex, + int& cellIdx, const int thread) { const size_t structCellIdx = cellMapper->findStructuredCellIndex3D(point); - return opp_part_checkForGlobalMove_util(set, point, partIndex, cellIdx, structCellIdx); -} \ No newline at end of file + return opp_part_checkForGlobalMove_util(set, point, partIndex, cellIdx, structCellIdx, thread); +} + +//******************************************************************************* +// gathers all per thread global move information into the global mover for communication +inline void opp_gather_gbl_move_indices() { + +#ifdef USE_MPI + for (auto& per_thread_info : gbl_move_indices_per_thr) { + for (auto& p_info : per_thread_info) { + 
globalMover->markParticleToMove(p_info.set, p_info.part_idx, + p_info.struct_cell_rank, p_info.gbl_cell_idx); + } + } +#endif +} diff --git a/opp_lib/src/mpi/opp_mpi_particle_comm.cpp b/opp_lib/src/mpi/opp_mpi_particle_comm.cpp index f214e57..08241fb 100644 --- a/opp_lib/src/mpi/opp_mpi_particle_comm.cpp +++ b/opp_lib/src/mpi/opp_mpi_particle_comm.cpp @@ -1075,7 +1075,7 @@ void GlobalParticleMover::communicate(opp_set set) { int64_t recvSize = (this->h_recv_rank_npart[rankx] * set->particle_size); // opp_printf("Communicate", "Expected to recv %d bytes from rank %d, buffer size %zu", - // sendSize, recvRank, partRecvBufferPerRank.size()); + // recvSize, recvRank, partRecvBufferPerRank.size()); CHECK(MPI_Irecv(&(partRecvBufferPerRank[0]), recvSize, MPI_CHAR, recvRank, 43, this->comm, &this->h_recv_requests[rankx])); diff --git a/opp_lib/src/omp/opp_omp.cpp b/opp_lib/src/omp/opp_omp.cpp index 7ce232c..24389a9 100644 --- a/opp_lib/src/omp/opp_omp.cpp +++ b/opp_lib/src/omp/opp_omp.cpp @@ -35,6 +35,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. int OPP_nthreads = 1; std::vector part_remove_count_per_thr; std::vector> move_part_indices_per_thr; +std::vector> gbl_move_indices_per_thr; std::vector move_var_per_thr; void opp_part_pack(opp_set set); @@ -88,6 +89,13 @@ void opp_init(int argc, char **argv) //**************************************** void opp_exit() { + if (OPP_DBG) opp_printf("opp_exit", ""); + + globalMover.reset(); + cellMapper.reset(); + boundingBox.reset(); + comm.reset(); + #ifdef USE_MPI opp_halo_destroy(); // free memory allocated to halos and mpi_buffers opp_partition_destroy(); // free memory used for holding partition information @@ -401,9 +409,22 @@ void opp_init_particle_move(opp_set set, int nargs, opp_arg *args) #ifdef USE_MPI opp_move_part_indices.clear(); + opp_move_part_indices.reserve(20000); move_part_indices_per_thr.resize(OPP_nthreads); - std::fill(move_part_indices_per_thr.begin(), move_part_indices_per_thr.end(), std::vector()); + for (int t = 0; t < OPP_nthreads; t++) + { + move_part_indices_per_thr[t].clear(); + move_part_indices_per_thr[t].reserve(1000); + } + // std::fill(move_part_indices_per_thr.begin(), move_part_indices_per_thr.end(), std::vector()); + + gbl_move_indices_per_thr.resize(OPP_nthreads); + for (int t = 0; t < OPP_nthreads; t++) + { + gbl_move_indices_per_thr[t].clear(); + gbl_move_indices_per_thr[t].reserve(1000); + } #endif if (OPP_comm_iteration == 0) @@ -786,7 +807,7 @@ void opp_mpi_halo_wait_all(int nargs, opp_arg *args) { // Nothing to execute here } -#endif +#endif // if USE_MPI is defined, the functions in opp_mpi_particle_comm.cpp and opp_mpi_halo.cpp are used //**************************************** opp_move_var opp_get_move_var(int thread) diff --git a/opp_translator/generated_ast/README.md b/opp_translator/generated_ast/README.md index af3ee6d..9e7afc0 100644 --- a/opp_translator/generated_ast/README.md +++ b/opp_translator/generated_ast/README.md @@ -1,3 +1,3 @@ These are for example purposes, and these files are not required for code-generation. -The code-generator will create its own AST depending on the available application file once invoked. \ No newline at end of file +The code-generator will create its own AST (using clang) depending on the available application file once invoked.
\ No newline at end of file diff --git a/opp_translator/resources/templates/cpp/move_loop_dh_init_host.hpp.jinja b/opp_translator/resources/templates/cpp/move_loop_dh_init_host.hpp.jinja index 1239613..8811b9d 100644 --- a/opp_translator/resources/templates/cpp/move_loop_dh_init_host.hpp.jinja +++ b/opp_translator/resources/templates/cpp/move_loop_dh_init_host.hpp.jinja @@ -108,7 +108,7 @@ inline void gen_dh_structured_mesh(opp_set set, const opp_dat c_gbl_id, opp_map double x = 0.0, y = 0.0, z = 0.0; {% if is_omp %} - #pragma omp parallel for + #pragma omp parallel for private(x, y, z) {% endif %} for (int dz = cellMapper->localGridStart.z; dz < cellMapper->localGridEnd.z; dz++) { z = min_glb_coords.z + dz * cellMapper->gridSpacing; @@ -173,8 +173,18 @@ inline void gen_dh_structured_mesh(opp_set set, const opp_dat c_gbl_id, opp_map for (const auto& pair : removed_coords) removed_coords_keys.push_back(pair.first); + std::vector>> tmp_add_per_thr; + tmp_add_per_thr.resize(OPP_nthreads); + #pragma omp parallel for - for (size_t i = 0; i < removed_coords_keys.size(); ++i) { + for (int thr = 0; thr < OPP_nthreads; thr++) + { + const size_t start = (removed_coords_keys.size() * thr) / OPP_nthreads; + const size_t finish = (removed_coords_keys.size() * (thr+1)) / OPP_nthreads; + + for (size_t i = start; i < finish; i++) + { + const size_t index = removed_coords_keys[i]; opp_point& p = removed_coords[index]; double &x = p.x, &y = p.y, &z = p.z; @@ -214,14 +224,34 @@ inline void gen_dh_structured_mesh(opp_set set, const opp_dat c_gbl_id, opp_map } // Allow neighbours to write on-behalf of the current rank, to reduce issues + {% if not is_omp %} cellMapper->lockWindows(); + {% endif %} int avail_gbl_cid = cellMapper->structMeshToCellMapping[index]; if ((most_suitable_gbl_cid != MAX_CELL_INDEX) && (most_suitable_gbl_cid < avail_gbl_cid) && - (most_suitable_cid < set->size)) { - cellMapper->enrichStructuredMesh(index, most_suitable_gbl_cid, OPP_rank); + (most_suitable_cid < set->size)) { + {%- if is_omp %} + tmp_add_per_thr[thr].push_back(std::make_pair(index, most_suitable_gbl_cid)); + {%- else %} + cellMapper->enrichStructuredMesh(index, most_suitable_gbl_cid, OPP_rank); + {%- endif %} } + {% if not is_omp %} cellMapper->unlockWindows(); + {% else %} + } + {% endif %} + } + {% if is_omp %} + + cellMapper->lockWindows(); + for (auto& thread_vec : tmp_add_per_thr) { + for (auto& thread_data : thread_vec) { + cellMapper->enrichStructuredMesh(thread_data.first, thread_data.second, OPP_rank); + } } + cellMapper->unlockWindows(); + {% endif %} opp_profiler->end("Setup_Mover_s3"); // Step 4 : For MPI, get the inter-node values reduced to the structured mesh diff --git a/opp_translator/resources/templates/cpp/mpi/move_loop_host.hpp.jinja b/opp_translator/resources/templates/cpp/mpi/move_loop_host.hpp.jinja index c6da2d4..7d98eec 100644 --- a/opp_translator/resources/templates/cpp/mpi/move_loop_host.hpp.jinja +++ b/opp_translator/resources/templates/cpp/mpi/move_loop_host.hpp.jinja @@ -99,7 +99,7 @@ void opp_particle_move__{{lh.kernel}}(opp_set set, opp_map c2c_map, opp_map p2c_ }; // ---------------------------------------------------------------------------- - opp_init_particle_move(set, 0, nullptr); + opp_init_particle_move(set, nargs, args); {% if lh.dh_loop_required %} if (useGlobalMove) { @@ -182,7 +182,7 @@ void opp_particle_move__{{lh.kernel}}(opp_set set, opp_map c2c_map, opp_map p2c_ const std::string profName = std::string("Mv_AllMv") + std::to_string(OPP_comm_iteration); 
opp_profiler->start(profName); - opp_init_particle_move(set, 0, nullptr); + opp_init_particle_move(set, nargs, args); // check whether particle is within cell, and if not move between cells within the MPI rank, mark for neighbour comm opp_profiler->start("{{lh.kernel}}_only"); diff --git a/opp_translator/resources/templates/cpp/omp/loop_host.hpp.jinja b/opp_translator/resources/templates/cpp/omp/loop_host.hpp.jinja index 65a6cbe..1089a5f 100644 --- a/opp_translator/resources/templates/cpp/omp/loop_host.hpp.jinja +++ b/opp_translator/resources/templates/cpp/omp/loop_host.hpp.jinja @@ -112,8 +112,8 @@ void opp_par_loop_{{lh.iterator_type}}__{{lh.kernel}}(opp_set set, opp_iterate_t #pragma omp parallel for for (int thr = 0; thr < nthreads; thr++) { - const size_t start = (iter_size * thr) / nthreads; - const size_t finish = (iter_size * (thr+1)) / nthreads; + const size_t start = ((size_t)iter_size * thr) / nthreads; + const size_t finish = ((size_t)iter_size * (thr+1)) / nthreads; {% for arg in lh.args|dat|reduction|not_already_mapped(lh)|not_same_iter_set_dat(lh) if lh is indirect %} {{lh.dat(arg).typ}}* arg{{arg.id}}_dat_thread_data = ({{lh.dat(arg).typ}} *)((*(args[{{arg.id}}].dat->thread_data))[thr]); diff --git a/opp_translator/resources/templates/cpp/omp/move_loop_host.hpp.jinja b/opp_translator/resources/templates/cpp/omp/move_loop_host.hpp.jinja index f25bb12..511ed55 100644 --- a/opp_translator/resources/templates/cpp/omp/move_loop_host.hpp.jinja +++ b/opp_translator/resources/templates/cpp/omp/move_loop_host.hpp.jinja @@ -108,13 +108,15 @@ void opp_particle_move__{{lh.kernel}}(opp_set set, opp_map c2c_map, opp_map p2c_ }; // ---------------------------------------------------------------------------- - opp_init_particle_move(set, 0, nullptr); + opp_init_particle_move(set, nargs, args); + const int total_count = OPP_iter_end - OPP_iter_start; {% if lh.dh_loop_required %} -#ifdef USE_MPI if (useGlobalMove) { // For now Global Move will not work with MPI - + +#ifdef USE_MPI globalMover->initGlobalMove(); +#endif opp_profiler->start("GblMv_Move"); @@ -122,8 +124,8 @@ void opp_particle_move__{{lh.kernel}}(opp_set set, opp_map c2c_map, opp_map p2c_ #pragma omp parallel for for (int thr = 0; thr < nthreads; thr++) { - const size_t start = ((size_t)size * thr) / nthreads; - const size_t finish = ((size_t)size * (thr+1)) / nthreads; + const size_t start = ((size_t)total_count * thr) / nthreads; + const size_t finish = ((size_t)total_count * (thr+1)) / nthreads; for (size_t i = start; i < finish; i++) { @@ -132,9 +134,9 @@ void opp_particle_move__{{lh.kernel}}(opp_set set, opp_map c2c_map, opp_map p2c_ const opp_point* point = (const opp_point*)&(((OPP_REAL*)args[0].data)[i * {{lh.dat(lh.args[0]).dim}}]); // check for global move, and if satisfy global move criteria, then remove the particle from current rank - if (opp_part_checkForGlobalMove{{lh.dat(lh.args[0]).dim}}D(set, *point, i, opp_p2c[0])) { + if (opp_part_checkForGlobalMove{{lh.dat(lh.args[0]).dim}}D(set, *point, i, opp_p2c[0], thr)) { - set->particle_remove_count++; + part_remove_count_per_thr[thr] += 1; continue; } } @@ -142,9 +144,11 @@ void opp_particle_move__{{lh.kernel}}(opp_set set, opp_map c2c_map, opp_map p2c_ opp_profiler->end("GblMv_Move"); +#ifdef USE_MPI + opp_gather_gbl_move_indices(); globalMover->communicate(set); - } #endif + } {% endif %} opp_profiler->start("Mv_AllMv0"); @@ -153,7 +157,6 @@ void opp_particle_move__{{lh.kernel}}(opp_set set, opp_map c2c_map, opp_map p2c_ // check whether all particles not marked 
for global comm is within cell, // and if not mark to move between cells within the MPI rank, mark for neighbour comm opp_profiler->start("{{lh.kernel}}_only"); - const int total_count = OPP_iter_end - OPP_iter_start; #pragma omp parallel for for (int thr = 0; thr < nthreads; thr++) { @@ -227,7 +230,7 @@ void opp_particle_move__{{lh.kernel}}(opp_set set, opp_map c2c_map, opp_map p2c_ const std::string profName = std::string("Mv_AllMv") + std::to_string(OPP_comm_iteration); opp_profiler->start(profName); - opp_init_particle_move(set, 0, nullptr); + opp_init_particle_move(set, nargs, args); // check whether particle is within cell, and if not move between cells within the MPI rank, mark for neighbour comm opp_profiler->start("{{lh.kernel}}_only"); diff --git a/scripts/batch/fempic/avon/avon_fempic_mpi_1.sh b/scripts/batch/fempic/avon/avon_fempic_mpi_1.sh index b1f1495..b4aa87b 100755 --- a/scripts/batch/fempic/avon/avon_fempic_mpi_1.sh +++ b/scripts/batch/fempic/avon/avon_fempic_mpi_1.sh @@ -10,12 +10,20 @@ runFolder=$PWD"/MPI1_"$(date +"D_%Y_%m_%d_T_%I_%M_%S") echo "Creating running folder" $runFolder -binary="/home/dcs/csrcnj/phd/OP-PIC/fempic_mpi/bin/mpi_hdf5" +binary="/home/dcs/csrcnj/phd/OP-PIC/app_fempic/bin/mpi_hdf5" echo "Using Binary" $binary -module load GCC/10.3.0 OpenMPI/4.1.1 -module load PETSc/3.15.1 -module load HDF5/1.12.1 +module purge + +# module load GCC/10.3.0 OpenMPI/4.1.1 +# module load PETSc/3.15.1 +# module load HDF5/1.12.1 + +module load intel/2022a +module load HDF5/1.13.1 +# unset I_MPI_PMI_LIBRARY +export PETSC_INSTALL_PATH=/home/dcs/csrcnj/lib_install/petsc-3.21.0_intel2022a +export LD_LIBRARY_PATH=$PETSC_INSTALL_PATH/lib:$LD_LIBRARY_PATH # export I_MPI_PMI_LIBRARY=/usr/lib/x86_64-linux-gnu/libpmi.so # export I_MPI_PMI_LIBRARY=/usr/lib64/pmix/lib/libpmi.so diff --git a/scripts/batch/fempic/avon/avon_fempic_mpi_16.sh b/scripts/batch/fempic/avon/avon_fempic_mpi_16.sh index 1fe6cfd..b27eda6 100755 --- a/scripts/batch/fempic/avon/avon_fempic_mpi_16.sh +++ b/scripts/batch/fempic/avon/avon_fempic_mpi_16.sh @@ -10,12 +10,20 @@ runFolder=$PWD"/MPI16_"$(date +"D_%Y_%m_%d_T_%I_%M_%S") echo "Creating running folder" $runFolder -binary="/home/dcs/csrcnj/phd/OP-PIC/fempic_mpi/bin/mpi_hdf5" +binary="/home/dcs/csrcnj/phd/OP-PIC/app_fempic/bin/mpi_hdf5" echo "Using Binary" $binary -module load GCC/10.3.0 OpenMPI/4.1.1 -module load PETSc/3.15.1 -module load HDF5/1.12.1 +module purge + +# module load GCC/10.3.0 OpenMPI/4.1.1 +# module load PETSc/3.15.1 +# module load HDF5/1.12.1 + +module load intel/2022a +module load HDF5/1.13.1 +# unset I_MPI_PMI_LIBRARY +export PETSC_INSTALL_PATH=/home/dcs/csrcnj/lib_install/petsc-3.21.0_intel2022a +export LD_LIBRARY_PATH=$PETSC_INSTALL_PATH/lib:$LD_LIBRARY_PATH # export I_MPI_PMI_LIBRARY=/usr/lib/x86_64-linux-gnu/libpmi.so # export I_MPI_PMI_LIBRARY=/usr/lib64/pmix/lib/libpmi.so diff --git a/scripts/batch/fempic/avon/avon_fempic_mpi_2.sh b/scripts/batch/fempic/avon/avon_fempic_mpi_2.sh index 7d11d50..610a086 100755 --- a/scripts/batch/fempic/avon/avon_fempic_mpi_2.sh +++ b/scripts/batch/fempic/avon/avon_fempic_mpi_2.sh @@ -10,12 +10,20 @@ runFolder=$PWD"/MPI2_"$(date +"D_%Y_%m_%d_T_%I_%M_%S") echo "Creating running folder" $runFolder -binary="/home/dcs/csrcnj/phd/OP-PIC/fempic_mpi/bin/mpi_hdf5" +binary="/home/dcs/csrcnj/phd/OP-PIC/app_fempic/bin/mpi_hdf5" echo "Using Binary" $binary -module load GCC/10.3.0 OpenMPI/4.1.1 -module load PETSc/3.15.1 -module load HDF5/1.12.1 +module purge + +# module load GCC/10.3.0 OpenMPI/4.1.1 +# module load 
PETSc/3.15.1 +# module load HDF5/1.12.1 + +module load intel/2022a +module load HDF5/1.13.1 +# unset I_MPI_PMI_LIBRARY +export PETSC_INSTALL_PATH=/home/dcs/csrcnj/lib_install/petsc-3.21.0_intel2022a +export LD_LIBRARY_PATH=$PETSC_INSTALL_PATH/lib:$LD_LIBRARY_PATH # export I_MPI_PMI_LIBRARY=/usr/lib/x86_64-linux-gnu/libpmi.so # export I_MPI_PMI_LIBRARY=/usr/lib64/pmix/lib/libpmi.so diff --git a/scripts/batch/fempic/avon/avon_fempic_mpi_32.sh b/scripts/batch/fempic/avon/avon_fempic_mpi_32.sh index 986dd37..ebe8c51 100755 --- a/scripts/batch/fempic/avon/avon_fempic_mpi_32.sh +++ b/scripts/batch/fempic/avon/avon_fempic_mpi_32.sh @@ -10,12 +10,20 @@ runFolder=$PWD"/MPI32_"$(date +"D_%Y_%m_%d_T_%I_%M_%S") echo "Creating running folder" $runFolder -binary="/home/dcs/csrcnj/phd/OP-PIC/fempic_mpi/bin/mpi_hdf5" +binary="/home/dcs/csrcnj/phd/OP-PIC/app_fempic/bin/mpi_hdf5" echo "Using Binary" $binary -module load GCC/10.3.0 OpenMPI/4.1.1 -module load PETSc/3.15.1 -module load HDF5/1.12.1 +module purge + +# module load GCC/10.3.0 OpenMPI/4.1.1 +# module load PETSc/3.15.1 +# module load HDF5/1.12.1 + +module load intel/2022a +module load HDF5/1.13.1 +# unset I_MPI_PMI_LIBRARY +export PETSC_INSTALL_PATH=/home/dcs/csrcnj/lib_install/petsc-3.21.0_intel2022a +export LD_LIBRARY_PATH=$PETSC_INSTALL_PATH/lib:$LD_LIBRARY_PATH # export I_MPI_PMI_LIBRARY=/usr/lib/x86_64-linux-gnu/libpmi.so # export I_MPI_PMI_LIBRARY=/usr/lib64/pmix/lib/libpmi.so diff --git a/scripts/batch/fempic/avon/avon_fempic_mpi_4.sh b/scripts/batch/fempic/avon/avon_fempic_mpi_4.sh index b6940cb..1d8b157 100755 --- a/scripts/batch/fempic/avon/avon_fempic_mpi_4.sh +++ b/scripts/batch/fempic/avon/avon_fempic_mpi_4.sh @@ -10,12 +10,20 @@ runFolder=$PWD"/MPI4_"$(date +"D_%Y_%m_%d_T_%I_%M_%S") echo "Creating running folder" $runFolder -binary="/home/dcs/csrcnj/phd/OP-PIC/fempic_mpi/bin/mpi_hdf5" +binary="/home/dcs/csrcnj/phd/OP-PIC/app_fempic/bin/mpi_hdf5" echo "Using Binary" $binary -module load GCC/10.3.0 OpenMPI/4.1.1 -module load PETSc/3.15.1 -module load HDF5/1.12.1 +module purge + +# module load GCC/10.3.0 OpenMPI/4.1.1 +# module load PETSc/3.15.1 +# module load HDF5/1.12.1 + +module load intel/2022a +module load HDF5/1.13.1 +# unset I_MPI_PMI_LIBRARY +export PETSC_INSTALL_PATH=/home/dcs/csrcnj/lib_install/petsc-3.21.0_intel2022a +export LD_LIBRARY_PATH=$PETSC_INSTALL_PATH/lib:$LD_LIBRARY_PATH # export I_MPI_PMI_LIBRARY=/usr/lib/x86_64-linux-gnu/libpmi.so # export I_MPI_PMI_LIBRARY=/usr/lib64/pmix/lib/libpmi.so diff --git a/scripts/batch/fempic/avon/avon_fempic_mpi_8.sh b/scripts/batch/fempic/avon/avon_fempic_mpi_8.sh index e6f04c1..6b74d6d 100755 --- a/scripts/batch/fempic/avon/avon_fempic_mpi_8.sh +++ b/scripts/batch/fempic/avon/avon_fempic_mpi_8.sh @@ -10,12 +10,20 @@ runFolder=$PWD"/MPI8_"$(date +"D_%Y_%m_%d_T_%I_%M_%S") echo "Creating running folder" $runFolder -binary="/home/dcs/csrcnj/phd/OP-PIC/fempic_mpi/bin/mpi_hdf5" +binary="/home/dcs/csrcnj/phd/OP-PIC/app_fempic/bin/mpi_hdf5" echo "Using Binary" $binary -module load GCC/10.3.0 OpenMPI/4.1.1 -module load PETSc/3.15.1 -module load HDF5/1.12.1 +module purge + +# module load GCC/10.3.0 OpenMPI/4.1.1 +# module load PETSc/3.15.1 +# module load HDF5/1.12.1 + +module load intel/2022a +module load HDF5/1.13.1 +# unset I_MPI_PMI_LIBRARY +export PETSC_INSTALL_PATH=/home/dcs/csrcnj/lib_install/petsc-3.21.0_intel2022a +export LD_LIBRARY_PATH=$PETSC_INSTALL_PATH/lib:$LD_LIBRARY_PATH # export I_MPI_PMI_LIBRARY=/usr/lib/x86_64-linux-gnu/libpmi.so # export 
I_MPI_PMI_LIBRARY=/usr/lib64/pmix/lib/libpmi.so diff --git a/scripts/batch/fempic/avon/avon_fempic_mpi_energy21.sh b/scripts/batch/fempic/avon/avon_fempic_mpi_energy21.sh index 98381a0..a40ae26 100755 --- a/scripts/batch/fempic/avon/avon_fempic_mpi_energy21.sh +++ b/scripts/batch/fempic/avon/avon_fempic_mpi_energy21.sh @@ -12,17 +12,20 @@ echo "Start date and time: $(date +"%Y-%m-%d %H:%M:%S")" runFolder=$PWD"/MPI_E_"$(date +"D_%Y_%m_%d_T_%I_%M_%S") echo "Creating running folder" $runFolder -binary="/home/dcs/csrcnj/phd/OP-PIC/fempic_mpi/bin/mpi_hdf5" +binary="/home/dcs/csrcnj/phd/OP-PIC/app_fempic/bin/mpi_hdf5" echo "Using Binary" $binary +module purge + # module load GCC/10.3.0 OpenMPI/4.1.1 # module load PETSc/3.15.1 # module load HDF5/1.12.1 -module purge module load intel/2022a module load HDF5/1.13.1 - +# unset I_MPI_PMI_LIBRARY +export PETSC_INSTALL_PATH=/home/dcs/csrcnj/lib_install/petsc-3.21.0_intel2022a +export LD_LIBRARY_PATH=$PETSC_INSTALL_PATH/lib:$LD_LIBRARY_PATH # export I_MPI_PMI_LIBRARY=/usr/lib/x86_64-linux-gnu/libpmi.so # export I_MPI_PMI_LIBRARY=/usr/lib64/pmix/lib/libpmi.so
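
The OpenMP-related hunks above replace direct `globalMover->markParticleToMove()` calls inside parallel loops with per-thread vectors (`gbl_move_indices_per_thr`) that are drained serially by `opp_gather_gbl_move_indices()`. The following is a minimal, self-contained sketch of that gather pattern, not OP-PIC code: the `MoveInfo` struct, the selection condition and the particle counts are invented for illustration.

```cpp
// Sketch only: illustrates the per-thread gather pattern used by gbl_move_indices_per_thr.
// Compile with -fopenmp. MoveInfo and the "needs to move" test are made up for this example.
#include <omp.h>
#include <cstddef>
#include <cstdio>
#include <vector>

struct MoveInfo { int part_idx; int rank; int gbl_cell_idx; };

int main() {
    const int nthreads = omp_get_max_threads();
    const int nparts = 1000;
    std::vector<std::vector<MoveInfo>> per_thread(nthreads);

    #pragma omp parallel for
    for (int thr = 0; thr < nthreads; thr++) {
        // Same chunking arithmetic the generated loops use.
        const int start  = (nparts * thr) / nthreads;
        const int finish = (nparts * (thr + 1)) / nthreads;
        for (int i = start; i < finish; i++) {
            if (i % 7 == 0) // stand-in for "particle must leave this rank"
                per_thread[thr].push_back({i, i % 4, i % 32}); // lock-free: thread-local vector
        }
    }

    // Serial gather: this is where markParticleToMove() would be called per recorded entry.
    std::size_t total = 0;
    for (const auto& vec : per_thread)
        total += vec.size();
    std::printf("marked %zu particles for global move\n", total);
    return 0;
}
```

Appending to a thread-private vector avoids the lock or atomic that a shared container would need, and the serial pass afterwards stays cheap because only the particles that actually leave the rank are recorded.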
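`move_loop_dh_init_host.hpp.jinja` also changes the structured-mesh generation loop from `#pragma omp parallel for` to `#pragma omp parallel for private(x, y, z)`. The sketch below shows the general reason, assuming the real loop behaves the same way (the loop body here is a placeholder): `x`, `y` and `z` are declared before the loop, so by default every thread shares them and races on the writes; `private` gives each thread its own copy.

```cpp
// Sketch only: why private(x, y, z) matters when the temporaries are declared
// outside the parallel loop. The loop body is a placeholder, not the template's work.
#include <cmath>
#include <cstdio>

int main() {
    const int n = 1000;
    double sum = 0.0;
    double x = 0.0, y = 0.0, z = 0.0; // shared by default if not listed in private(...)

    #pragma omp parallel for private(x, y, z) reduction(+ : sum)
    for (int i = 0; i < n; i++) {
        x = 0.1 * i; // with private(...), each thread writes its own x/y/z
        y = 0.2 * i;
        z = 0.3 * i;
        sum += std::sqrt(x * x + y * y + z * z);
    }
    std::printf("sum = %f\n", sum);
    return 0;
}
```

Declaring the temporaries inside the loop body would achieve the same effect; the `private` clause keeps the generated code closer to the existing template structure.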
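Finally, the `omp/loop_host.hpp.jinja` hunk changes the chunk bounds from `(iter_size * thr) / nthreads` to `((size_t)iter_size * thr) / nthreads`. The sketch below, with made-up numbers, shows the overflow this avoids: for large iteration counts the intermediate product no longer fits in a 32-bit `int`.

```cpp
// Sketch only: promoting to size_t before the multiply keeps the intermediate
// product exact; 150000000 * 31 would overflow a 32-bit int.
#include <cstddef>
#include <cstdio>

int main() {
    const int iter_size = 150000000; // illustrative particle count
    const int nthreads  = 32;
    const int thr       = 31;

    const std::size_t start  = ((std::size_t)iter_size * thr) / nthreads;
    const std::size_t finish = ((std::size_t)iter_size * (thr + 1)) / nthreads;
    std::printf("thread %d handles [%zu, %zu)\n", thr, start, finish);
    return 0;
}
```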