diff --git a/app_cabanapic/configs/cabana.param b/app_cabanapic/configs/cabana.param index f0d6b441..8f75e2a7 100644 --- a/app_cabanapic/configs/cabana.param +++ b/app_cabanapic/configs/cabana.param @@ -18,7 +18,7 @@ STRING cluster = block # System parameters BOOL opp_auto_sort = false REAL opp_allocation_multiple = 1.1 -INT opp_threads_per_block = 256 +INT opp_threads_per_block = 512 BOOL opp_profile_all = false BOOL print_final = true INT opp_gpu_direct = 0 @@ -28,7 +28,8 @@ STRING opp_fill = HoleFill_All INT opp_fill_period = 4 BOOL opp_global_move = false -BOOL use_reg_red = false +BOOL opp_segmented_red = false +INT gpu_reduction_arrays = 16 STRING hdf_filename = /home/zl/phd/neso_test/OP-PIC_Configs/cabana/mesh_files/cab_48000.hdf5 INT domain_expansion = 1 diff --git a/app_cabanapic_cg/cabana_opp.cpp b/app_cabanapic_cg/cabana_opp.cpp index 60287172..e8e7aba0 100644 --- a/app_cabanapic_cg/cabana_opp.cpp +++ b/app_cabanapic_cg/cabana_opp.cpp @@ -1,5 +1,5 @@ -// Auto-generated at 2025-01-21 14:53:57.354254 by opp-translator +// Auto-generated at 2025-02-03 16:39:08.296851 by opp-translator /* BSD 3-Clause License diff --git a/app_cabanapic_cg/cuda/interpolate_mesh_fields_kernel_loop.hpp b/app_cabanapic_cg/cuda/interpolate_mesh_fields_kernel_loop.hpp index 471c0783..f680bd28 100644 --- a/app_cabanapic_cg/cuda/interpolate_mesh_fields_kernel_loop.hpp +++ b/app_cabanapic_cg/cuda/interpolate_mesh_fields_kernel_loop.hpp @@ -18,6 +18,12 @@ __constant__ OPP_INT opp_k1_map0_stride_d; namespace opp_k1 { +enum Dim { + x = 0, + y = 1, + z = 2, +}; + enum CellInterp { ex = 0, dexdy, @@ -39,12 +45,6 @@ enum CellInterp { dcbzdz, }; -enum Dim { - x = 0, - y = 1, - z = 2, -}; - __device__ inline void interpolate_mesh_fields_kernel( const double* cell0_e, const double* cell0_b, diff --git a/app_cabanapic_cg/hip/compute_energy_kernel_loop.hpp b/app_cabanapic_cg/hip/compute_energy_kernel_loop.hpp index 2bc9ebb6..e431a9a9 100644 --- a/app_cabanapic_cg/hip/compute_energy_kernel_loop.hpp +++ b/app_cabanapic_cg/hip/compute_energy_kernel_loop.hpp @@ -142,10 +142,10 @@ void opp_par_loop_all__compute_energy_kernel(opp_set set, for (int d = 0; d < 1; ++d) arg2_host_data[d] += ((OPP_REAL *)args[2].data)[b * 1 + d]; } - args[2].data = (char *)arg2_host_data; opp_mpi_reduce(&args[2], arg2_host_data); + opp_set_dirtybit_grouped(nargs, args, Device_GPU); OPP_DEVICE_SYNCHRONIZE(); diff --git a/app_cabanapic_cg/hip/get_max_x_values_kernel_loop.hpp b/app_cabanapic_cg/hip/get_max_x_values_kernel_loop.hpp index f43888c1..6527a896 100644 --- a/app_cabanapic_cg/hip/get_max_x_values_kernel_loop.hpp +++ b/app_cabanapic_cg/hip/get_max_x_values_kernel_loop.hpp @@ -202,7 +202,6 @@ void opp_par_loop_all__get_max_x_values_kernel(opp_set set, for (int d = 0; d < 1; ++d) arg5_host_data[d] = MAX(arg5_host_data[d], ((OPP_REAL *)args[5].data)[b * 1 + d]); } - args[1].data = (char *)arg1_host_data; opp_mpi_reduce(&args[1], arg1_host_data); @@ -212,6 +211,7 @@ void opp_par_loop_all__get_max_x_values_kernel(opp_set set, args[5].data = (char *)arg5_host_data; opp_mpi_reduce(&args[5], arg5_host_data); + opp_set_dirtybit_grouped(nargs, args, Device_GPU); OPP_DEVICE_SYNCHRONIZE(); diff --git a/app_cabanapic_cg/hip/move_deposit_kernel_loop.hpp b/app_cabanapic_cg/hip/move_deposit_kernel_loop.hpp index 34d0be95..e7b05381 100644 --- a/app_cabanapic_cg/hip/move_deposit_kernel_loop.hpp +++ b/app_cabanapic_cg/hip/move_deposit_kernel_loop.hpp @@ -38,33 +38,6 @@ enum CellAcc { jfz = 2 * 4, }; -enum CellInterp { - ex = 0, - dexdy, - dexdz, - d2exdydz, - ey, - deydz, - deydx, - d2eydzdx, - ez, - dezdx, - dezdy, - d2ezdxdy, - cbx, - dcbxdx, - cby, - dcbydy, - cbz, - dcbzdz, -}; - -enum Dim { - x = 0, - y = 1, - z = 2, -}; - __device__ inline void weight_current_to_accumulator_kernel( double* cell_acc, const double* q, @@ -94,6 +67,33 @@ __device__ inline void weight_current_to_accumulator_kernel( cell_acc[CellAcc::jfz + 3] += v3; } +enum Dim { + x = 0, + y = 1, + z = 2, +}; + +enum CellInterp { + ex = 0, + dexdy, + dexdz, + d2exdydz, + ey, + deydz, + deydx, + d2eydzdx, + ez, + dezdx, + dezdy, + d2ezdxdy, + cbx, + dcbxdx, + cby, + dcbydy, + cbz, + dcbzdz, +}; + __device__ inline void move_deposit_kernel( char& opp_move_status_flag, const bool opp_move_hop_iter_one_flag, // Added by code-gen const OPP_INT* opp_c2c, OPP_INT* opp_p2c, // Added by code-gen @@ -481,12 +481,17 @@ void opp_particle_move__move_deposit_kernel(opp_set set, opp_map c2c_map, opp_ma int num_blocks = 200; + opp_init_particle_move(set, nargs, args); + const int array_count = opp_params->get("gpu_reduction_arrays"); if (!opp_use_segmented_reductions) { opp_create_thread_level_data(args[5]); } - opp_init_particle_move(set, nargs, args); + + opp_profiler->start("Mv_AllMv0"); + // ---------------------------------------------------------------------------- + // Multi-hop move particles within current MPI rank and if not mark for neighbour comm opp_mem::dev_copy_to_symbol(OPP_comm_iteration_d, &OPP_comm_iteration, 1); num_blocks = (OPP_iter_end - OPP_iter_start - 1) / block_size + 1; @@ -498,9 +503,6 @@ void opp_particle_move__move_deposit_kernel(opp_set set, opp_map c2c_map, opp_ma opp_mem::dev_copy_to_symbol(opp_k2_dat4_stride_d, &opp_k2_dat4_stride, &(args[4].dat->set->set_capacity), 1); opp_mem::dev_copy_to_symbol(opp_k2_dat5_stride_d, &opp_k2_dat5_stride, &(args[5].dat->set->set_capacity), 1); - opp_profiler->start("Mv_AllMv0"); - // ---------------------------------------------------------------------------- - // Multi-hop move particles within current MPI rank and if not mark for neighbour comm if (!opp_use_segmented_reductions) // Do atomics ---------- { opp_profiler->start("move_kernel_only"); @@ -524,7 +526,6 @@ void opp_particle_move__move_deposit_kernel(opp_set set, opp_map c2c_map, opp_ma OPP_DEVICE_SYNCHRONIZE(); opp_profiler->end("move_kernel_only"); } - else // Do segmented reductions ---------- { opp_mem::dev_copy_to_symbol(opp_k2_sr_set_stride_d, &opp_k2_sr_set_stride, &set->size, 1); @@ -575,6 +576,11 @@ void opp_particle_move__move_deposit_kernel(opp_set set, opp_map c2c_map, opp_ma // then iterate over the newly added particles while (opp_finalize_particle_move(set)) { + opp_init_particle_move(set, nargs, args); + + opp_mem::dev_copy_to_symbol(OPP_comm_iteration_d, &OPP_comm_iteration, 1); + num_blocks = (OPP_iter_end - OPP_iter_start - 1) / block_size + 1; + opp_mem::dev_copy_to_symbol(opp_k2_dat0_stride_d, &opp_k2_dat0_stride, &(args[0].dat->set->set_capacity), 1); opp_mem::dev_copy_to_symbol(opp_k2_dat1_stride_d, &opp_k2_dat1_stride, &(args[1].dat->set->set_capacity), 1); opp_mem::dev_copy_to_symbol(opp_k2_dat2_stride_d, &opp_k2_dat2_stride, &(args[2].dat->set->set_capacity), 1); @@ -582,10 +588,6 @@ void opp_particle_move__move_deposit_kernel(opp_set set, opp_map c2c_map, opp_ma opp_mem::dev_copy_to_symbol(opp_k2_dat4_stride_d, &opp_k2_dat4_stride, &(args[4].dat->set->set_capacity), 1); opp_mem::dev_copy_to_symbol(opp_k2_dat5_stride_d, &opp_k2_dat5_stride, &(args[5].dat->set->set_capacity), 1); - opp_init_particle_move(set, nargs, args); - opp_mem::dev_copy_to_symbol(OPP_comm_iteration_d, &OPP_comm_iteration, 1); - - num_blocks = (OPP_iter_end - OPP_iter_start - 1) / block_size + 1; if (!opp_use_segmented_reductions) // Do atomics ---------- { opp_profiler->start("move_kernel_only"); @@ -609,7 +611,6 @@ void opp_particle_move__move_deposit_kernel(opp_set set, opp_map c2c_map, opp_ma OPP_DEVICE_SYNCHRONIZE(); opp_profiler->end("move_kernel_only"); } - else // Do segmented reductions ---------- { // opp_mem::dev_copy_to_symbol(opp_k2_sr_set_stride_d, &opp_k2_sr_set_stride, &set->size, 1); @@ -656,6 +657,7 @@ void opp_particle_move__move_deposit_kernel(opp_set set, opp_map c2c_map, opp_ma if (!opp_use_segmented_reductions) { opp_reduce_thread_level_data(args[5]); + } else { opp_sr::clear_arrays(sr_dat5_keys_dv, sr_dat5_values_dv, sr_dat5_keys_dv2, sr_dat5_values_dv2); diff --git a/app_cabanapic_cg/hip/opp_kernels.cpp b/app_cabanapic_cg/hip/opp_kernels.cpp index 06ba6849..cd5a0a5d 100644 --- a/app_cabanapic_cg/hip/opp_kernels.cpp +++ b/app_cabanapic_cg/hip/opp_kernels.cpp @@ -37,6 +37,14 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "opp_hip.h" #include "device_kernels/hip_inline_kernels.h" +OPP_REAL CONST_dt[1]; +OPP_REAL CONST_qsp[1]; +OPP_REAL CONST_cdt_d[3]; +OPP_REAL CONST_p[3]; +OPP_REAL CONST_qdt_2mc[1]; +OPP_REAL CONST_dt_eps0[1]; +OPP_REAL CONST_acc_coef[3]; + __constant__ OPP_REAL CONST_dt_d[1]; __constant__ OPP_REAL CONST_qsp_d[1]; __constant__ OPP_REAL CONST_cdt_d_d[3]; @@ -52,30 +60,37 @@ void opp_decl_const_impl(int dim, int size, char* data, const char* name) { if (!strcmp(name, "CONST_dt")) { cutilSafeCall(hipMemcpyToSymbol(HIP_SYMBOL(CONST_dt_d), data, dim * size)); + std::memcpy(&CONST_dt, data, (size*dim)); return; } if (!strcmp(name, "CONST_qsp")) { cutilSafeCall(hipMemcpyToSymbol(HIP_SYMBOL(CONST_qsp_d), data, dim * size)); + std::memcpy(&CONST_qsp, data, (size*dim)); return; } if (!strcmp(name, "CONST_cdt_d")) { cutilSafeCall(hipMemcpyToSymbol(HIP_SYMBOL(CONST_cdt_d_d), data, dim * size)); + std::memcpy(&CONST_cdt_d, data, (size*dim)); return; } if (!strcmp(name, "CONST_p")) { cutilSafeCall(hipMemcpyToSymbol(HIP_SYMBOL(CONST_p_d), data, dim * size)); + std::memcpy(&CONST_p, data, (size*dim)); return; } if (!strcmp(name, "CONST_qdt_2mc")) { cutilSafeCall(hipMemcpyToSymbol(HIP_SYMBOL(CONST_qdt_2mc_d), data, dim * size)); + std::memcpy(&CONST_qdt_2mc, data, (size*dim)); return; } if (!strcmp(name, "CONST_dt_eps0")) { cutilSafeCall(hipMemcpyToSymbol(HIP_SYMBOL(CONST_dt_eps0_d), data, dim * size)); + std::memcpy(&CONST_dt_eps0, data, (size*dim)); return; } if (!strcmp(name, "CONST_acc_coef")) { cutilSafeCall(hipMemcpyToSymbol(HIP_SYMBOL(CONST_acc_coef_d), data, dim * size)); + std::memcpy(&CONST_acc_coef, data, (size*dim)); return; } diff --git a/app_cabanapic_cg/hip/update_ghosts_kernel_loop.hpp b/app_cabanapic_cg/hip/update_ghosts_kernel_loop.hpp index d47a54ed..3895a44c 100644 --- a/app_cabanapic_cg/hip/update_ghosts_kernel_loop.hpp +++ b/app_cabanapic_cg/hip/update_ghosts_kernel_loop.hpp @@ -118,9 +118,9 @@ __global__ void opp_dev_sr_update_ghosts_kernel( // Used for Segmented Reduction ); for (int d = 0; d < 3; ++d) { - sr_dat1_values[n + opp_k6_sr_set_stride_d * d] = arg2_0_local[d]; + sr_dat1_values[1 * n + 0 + opp_k6_sr_set_stride_d * d] = arg2_0_local[d]; } - sr_dat1_keys[n] = map0[opp_k6_map0_stride_d * 0 + n]; + sr_dat1_keys[1 * n + 0] = map0[opp_k6_map0_stride_d * 0 + n]; } } @@ -212,19 +212,21 @@ void opp_par_loop_all__update_ghosts_kernel(opp_set set, else // Do segmented reductions ---------- { - opp_mem::dev_copy_to_symbol(opp_k6_sr_set_stride_d, &opp_k6_sr_set_stride, &(iter_size), 1); - size_t operating_size_dat1 = 0, resize_size_dat1 = 0; operating_size_dat1 += (size_t)(args[2].dat->dim); resize_size_dat1 += (size_t)(args[2].dat->dim); - operating_size_dat1 *= (size_t)(iter_size); // this might need to be + set->exec_size ..., or simply iter_size | even for opp_k6_sr_set_stride + operating_size_dat1 *= (size_t)(iter_size); resize_size_dat1 *= (size_t)(set->set_capacity); - opp_sr::init_arrays(args[1].dat->dim, operating_size_dat1, resize_size_dat1, - sr_dat1_keys_dv, sr_dat1_values_dv, sr_dat1_keys_dv2, sr_dat1_values_dv2); + int k6_stride = 0; + k6_stride += iter_size * 1; + opp_mem::dev_copy_to_symbol(opp_k6_sr_set_stride_d, &opp_k6_sr_set_stride, &(k6_stride), 1); + opp_sr::init_arrays(args[2].dat->dim, operating_size_dat1, resize_size_dat1, + sr_dat1_keys_dv, sr_dat1_values_dv, sr_dat1_keys_dv2, sr_dat1_values_dv2); + // Create key/value pairs opp_profiler->start("SR_CrKeyVal"); opp_dev_sr_update_ghosts_kernel<<>>( @@ -241,8 +243,8 @@ void opp_par_loop_all__update_ghosts_kernel(opp_set set, OPP_DEVICE_SYNCHRONIZE(); opp_profiler->end("SR_CrKeyVal"); - opp_sr::do_segmented_reductions(args[2], iter_size, - sr_dat1_keys_dv, sr_dat1_values_dv, sr_dat1_keys_dv2, sr_dat1_values_dv2); + opp_sr::do_segmented_reductions(args[2], k6_stride, + sr_dat1_keys_dv, sr_dat1_values_dv, sr_dat1_keys_dv2, sr_dat1_values_dv2); } } args[3].data = (char *)arg3_host_data; diff --git a/app_fempic/configs/coarse.param b/app_fempic/configs/coarse.param index 88fc3075..7701f8e1 100644 --- a/app_fempic/configs/coarse.param +++ b/app_fempic/configs/coarse.param @@ -17,7 +17,7 @@ STRING global_mesh = /home/zl/phd/box_mesh_gen/48000/mesh.dat STRING inlet_mesh = /home/zl/phd/box_mesh_gen/48000/inlet.dat STRING wall_mesh = /home/zl/phd/box_mesh_gen/48000/wall.dat # OR -STRING hdf_filename = /home/zl/phd/neso_test/OP-PIC_Configs/fempic/mesh_files/box_48000.hdf5 +STRING hdf_filename = /home/zl/phd/box_mesh_gen/hdf5/box_48000.hdf5 STRING rand_file = /home/zl/phd/box_mesh_gen/random_100k.dat @@ -25,20 +25,22 @@ BOOL invert_normals = false BOOL opp_auto_sort = false REAL opp_allocation_multiple = 11 -INT opp_threads_per_block = 256 +INT opp_threads_per_block = 512 BOOL opp_profile_all = false INT opp_gpu_direct = 0 - -REAL grid_spacing = 25e-6 BOOL print_final = true BOOL opp_global_move = false +REAL grid_spacing = 25e-6 +BOOL opp_dh_data_generate = false +# Use opp_dh_data_generate = true and OPP_DH_DATA_DUMP=1 in args to dump the structured mesh # block k-means mpi-block STRING cluster = block -BOOL use_reg_red = false +BOOL opp_segmented_red = false +INT gpu_reduction_arrays = 16 # HoleFill_All Sort_All Shuffle_All Sort_Periodic Shuffle_Periodic STRING opp_fill = Shuffle_Periodic -INT opp_fill_period = 20 +INT opp_fill_period = 4 \ No newline at end of file diff --git a/app_fempic_cg/configs/coarse.param b/app_fempic_cg/configs/coarse.param index c6e86143..745e8aaf 100644 --- a/app_fempic_cg/configs/coarse.param +++ b/app_fempic_cg/configs/coarse.param @@ -17,7 +17,7 @@ STRING global_mesh = /home/zl/phd/box_mesh_gen/48000/mesh.dat STRING inlet_mesh = /home/zl/phd/box_mesh_gen/48000/inlet.dat STRING wall_mesh = /home/zl/phd/box_mesh_gen/48000/wall.dat # OR -STRING hdf_filename = /home/zl/phd/neso_test/OP-PIC_Configs/fempic/mesh_files/box_48000.hdf5 +STRING hdf_filename = /home/zl/phd/box_mesh_gen/hdf5/box_96000.hdf5 STRING rand_file = /home/zl/phd/box_mesh_gen/random_100k.dat @@ -25,7 +25,7 @@ BOOL invert_normals = false BOOL opp_auto_sort = false REAL opp_allocation_multiple = 11 -INT opp_threads_per_block = 256 +INT opp_threads_per_block = 512 BOOL opp_profile_all = false INT opp_gpu_direct = 0 @@ -38,7 +38,7 @@ BOOL opp_dh_data_generate = false # block k-means mpi-block STRING cluster = block -BOOL use_reg_red = false +BOOL opp_segmented_red = false INT gpu_reduction_arrays = 16 # HoleFill_All Sort_All Shuffle_All Sort_Periodic Shuffle_Periodic diff --git a/app_fempic_cg/fempic_opp.cpp b/app_fempic_cg/fempic_opp.cpp index 4f3d550c..618c2776 100644 --- a/app_fempic_cg/fempic_opp.cpp +++ b/app_fempic_cg/fempic_opp.cpp @@ -1,5 +1,5 @@ -// Auto-generated at 2025-01-21 14:55:58.979812 by opp-translator +// Auto-generated at 2025-02-03 16:29:35.442468 by opp-translator /* BSD 3-Clause License diff --git a/app_fempic_cg/hip/deposit_charge_on_nodes_kernel_loop.hpp b/app_fempic_cg/hip/deposit_charge_on_nodes_kernel_loop.hpp index fa2d50c2..0877e0c0 100644 --- a/app_fempic_cg/hip/deposit_charge_on_nodes_kernel_loop.hpp +++ b/app_fempic_cg/hip/deposit_charge_on_nodes_kernel_loop.hpp @@ -17,6 +17,9 @@ __constant__ OPP_INT opp_k5_sr_set_stride_d; thrust::device_vector sr_dat1_keys_dv; thrust::device_vector sr_dat1_values_dv; +thrust::device_vector sr_dat1_keys_dv2; +thrust::device_vector sr_dat1_values_dv2; + namespace opp_k5 { __device__ inline void deposit_charge_on_nodes_kernel( const double *part_lc, @@ -152,23 +155,22 @@ __global__ void opp_dev_sr_deposit_charge_on_nodes_kernel( // Used for Segmented arg4_3_local // n_charge_den ); - int offset = 0; - for (int d = 0; d < 1; ++d, ++offset) { - sr_dat1_values[n + opp_k5_sr_set_stride_d * offset] = arg1_0_local[d]; - sr_dat1_keys[n + opp_k5_sr_set_stride_d * offset] = map0[opp_k5_map0_stride_d * 0 + opp_p2c[0]] + (d * opp_k5_dat1_stride_d); - } - for (int d = 0; d < 1; ++d, ++offset) { - sr_dat1_values[n + opp_k5_sr_set_stride_d * offset] = arg2_1_local[d]; - sr_dat1_keys[n + opp_k5_sr_set_stride_d * offset] = map0[opp_k5_map0_stride_d * 1 + opp_p2c[0]] + (d * opp_k5_dat1_stride_d); - } - for (int d = 0; d < 1; ++d, ++offset) { - sr_dat1_values[n + opp_k5_sr_set_stride_d * offset] = arg3_2_local[d]; - sr_dat1_keys[n + opp_k5_sr_set_stride_d * offset] = map0[opp_k5_map0_stride_d * 2 + opp_p2c[0]] + (d * opp_k5_dat1_stride_d); - } - for (int d = 0; d < 1; ++d, ++offset) { - sr_dat1_values[n + opp_k5_sr_set_stride_d * offset] = arg4_3_local[d]; - sr_dat1_keys[n + opp_k5_sr_set_stride_d * offset] = map0[opp_k5_map0_stride_d * 3 + opp_p2c[0]] + (d * opp_k5_dat1_stride_d); - } + for (int d = 0; d < 1; ++d) { + sr_dat1_values[4 * n + 0 + opp_k5_sr_set_stride_d * d] = arg1_0_local[d]; + } + sr_dat1_keys[4 * n + 0] = map0[opp_k5_map0_stride_d * 0 + opp_p2c[0]]; + for (int d = 0; d < 1; ++d) { + sr_dat1_values[4 * n + 1 + opp_k5_sr_set_stride_d * d] = arg2_1_local[d]; + } + sr_dat1_keys[4 * n + 1] = map0[opp_k5_map0_stride_d * 1 + opp_p2c[0]]; + for (int d = 0; d < 1; ++d) { + sr_dat1_values[4 * n + 2 + opp_k5_sr_set_stride_d * d] = arg3_2_local[d]; + } + sr_dat1_keys[4 * n + 2] = map0[opp_k5_map0_stride_d * 2 + opp_p2c[0]]; + for (int d = 0; d < 1; ++d) { + sr_dat1_values[4 * n + 3 + opp_k5_sr_set_stride_d * d] = arg4_3_local[d]; + } + sr_dat1_keys[4 * n + 3] = map0[opp_k5_map0_stride_d * 3 + opp_p2c[0]]; } } @@ -220,7 +222,7 @@ void opp_par_loop_all__deposit_charge_on_nodes_kernel(opp_set set, const OPP_INT end = iter_size; num_blocks = (end - start - 1) / block_size + 1; - if (!opp_params->get("use_reg_red")) // Do atomics ---------- + if (!opp_use_segmented_reductions) // Do atomics ---------- { const int array_count = opp_params->get("gpu_reduction_arrays"); OPP_REAL** arg1_dat_thread_data_d = opp_create_thread_level_data(args[1]); @@ -238,8 +240,6 @@ void opp_par_loop_all__deposit_charge_on_nodes_kernel(opp_set set, else // Do segmented reductions ---------- { - opp_mem::dev_copy_to_symbol(opp_k5_sr_set_stride_d, &opp_k5_sr_set_stride, &(set->size), 1); - size_t operating_size_dat1 = 0, resize_size_dat1 = 0; operating_size_dat1 += (size_t)(args[1].dat->dim); @@ -251,13 +251,15 @@ void opp_par_loop_all__deposit_charge_on_nodes_kernel(opp_set set, operating_size_dat1 += (size_t)(args[4].dat->dim); resize_size_dat1 += (size_t)(args[4].dat->dim); - operating_size_dat1 *= (size_t)(set->size); + operating_size_dat1 *= (size_t)(iter_size); resize_size_dat1 *= (size_t)(set->set_capacity); - if (resize_size_dat1 > sr_dat1_keys_dv.size()) { // resize only if current vector is small - sr_dat1_keys_dv.resize(resize_size_dat1, 0); - sr_dat1_values_dv.resize(resize_size_dat1, 0); - } + int k5_stride = 0; + k5_stride += iter_size * 4; + opp_mem::dev_copy_to_symbol(opp_k5_sr_set_stride_d, &opp_k5_sr_set_stride, &(k5_stride), 1); + + opp_sr::init_arrays(args[1].dat->dim, operating_size_dat1, resize_size_dat1, + sr_dat1_keys_dv, sr_dat1_values_dv, sr_dat1_keys_dv2, sr_dat1_values_dv2); // Create key/value pairs opp_profiler->start("SR_CrKeyVal"); @@ -273,41 +275,11 @@ void opp_par_loop_all__deposit_charge_on_nodes_kernel(opp_set set, OPP_DEVICE_SYNCHRONIZE(); opp_profiler->end("SR_CrKeyVal"); - // Sort by keys to bring the identical keys together - opp_profiler->start("SR_SortByKey"); - thrust::sort_by_key(sr_dat1_keys_dv.begin(), sr_dat1_keys_dv.begin() + operating_size_dat1, - sr_dat1_values_dv.begin()); - opp_profiler->end("SR_SortByKey"); - - // Compute the unique keys and their corresponding values - opp_profiler->start("SR_RedByKey"); - auto new_end = thrust::reduce_by_key( - sr_dat1_keys_dv.begin(), sr_dat1_keys_dv.begin() + operating_size_dat1, - sr_dat1_values_dv.begin(), - sr_dat1_keys_dv.begin(), - sr_dat1_values_dv.begin()); - opp_profiler->end("SR_RedByKey"); - - const size_t reduced_size = (new_end.first - sr_dat1_keys_dv.begin()); - - // Assign reduced values to the nodes using keys/values - opp_profiler->start("SR_Assign"); - opp_k5::assign_values<<>> ( // TODO : check whether num_blocks is correct - opp_get_dev_raw_ptr(sr_dat1_keys_dv), - opp_get_dev_raw_ptr(sr_dat1_values_dv), - (OPP_REAL *) args[1].data_d, - 0, reduced_size); - OPP_DEVICE_SYNCHRONIZE(); - opp_profiler->end("SR_Assign"); - - // Last: clear the thrust vectors if this is the last iteration (avoid crash) - if (opp_params->get("num_steps") == (OPP_main_loop_iter + 1)) { - OPP_DEVICE_SYNCHRONIZE(); - sr_dat1_values_dv.clear(); sr_dat1_values_dv.shrink_to_fit(); - sr_dat1_keys_dv.clear(); sr_dat1_keys_dv.shrink_to_fit(); - } + opp_sr::do_segmented_reductions(args[1], k5_stride, + sr_dat1_keys_dv, sr_dat1_values_dv, sr_dat1_keys_dv2, sr_dat1_values_dv2); } } + opp_sr::clear_arrays(sr_dat1_keys_dv, sr_dat1_values_dv, sr_dat1_keys_dv2, sr_dat1_values_dv2); opp_set_dirtybit_grouped(nargs, args, Device_GPU); OPP_DEVICE_SYNCHRONIZE(); diff --git a/app_fempic_cg/hip/get_final_max_values_kernel_loop.hpp b/app_fempic_cg/hip/get_final_max_values_kernel_loop.hpp index c696362f..e2d65070 100644 --- a/app_fempic_cg/hip/get_final_max_values_kernel_loop.hpp +++ b/app_fempic_cg/hip/get_final_max_values_kernel_loop.hpp @@ -161,13 +161,13 @@ void opp_par_loop_all__get_final_max_values_kernel(opp_set set, for (int d = 0; d < 1; ++d) arg3_host_data[d] = MAX(arg3_host_data[d], ((OPP_REAL *)args[3].data)[b * 1 + d]); } - args[1].data = (char *)arg1_host_data; opp_mpi_reduce(&args[1], arg1_host_data); args[3].data = (char *)arg3_host_data; opp_mpi_reduce(&args[3], arg3_host_data); + opp_set_dirtybit_grouped(nargs, args, Device_GPU); OPP_DEVICE_SYNCHRONIZE(); diff --git a/app_fempic_cg/hip/get_max_cef_kernel_loop.hpp b/app_fempic_cg/hip/get_max_cef_kernel_loop.hpp index cb0b776f..227ee156 100644 --- a/app_fempic_cg/hip/get_max_cef_kernel_loop.hpp +++ b/app_fempic_cg/hip/get_max_cef_kernel_loop.hpp @@ -124,10 +124,10 @@ void opp_par_loop_all__get_max_cef_kernel(opp_set set, for (int d = 0; d < 1; ++d) arg1_host_data[d] = MAX(arg1_host_data[d], ((OPP_REAL *)args[1].data)[b * 1 + d]); } - args[1].data = (char *)arg1_host_data; opp_mpi_reduce(&args[1], arg1_host_data); + opp_set_dirtybit_grouped(nargs, args, Device_GPU); OPP_DEVICE_SYNCHRONIZE(); diff --git a/app_fempic_cg/hip/move_kernel_loop.hpp b/app_fempic_cg/hip/move_kernel_loop.hpp index aaa4a284..8baaed01 100644 --- a/app_fempic_cg/hip/move_kernel_loop.hpp +++ b/app_fempic_cg/hip/move_kernel_loop.hpp @@ -15,6 +15,8 @@ __constant__ OPP_INT opp_k4_dat2_stride_d; __constant__ OPP_INT opp_k4_dat3_stride_d; __constant__ OPP_INT opp_k4_c2c_map_stride_d; + + namespace opp_k4 { namespace host { @@ -114,6 +116,7 @@ __device__ inline void move_kernel( } } +//-------------------------------------------------------------- __global__ void opp_dev_move_kernel( const OPP_REAL *__restrict__ dat0, // p_pos OPP_REAL *__restrict__ dat1, // p_lc @@ -130,11 +133,9 @@ __global__ void opp_dev_move_kernel( const OPP_INT end ) { - const int thread_id = threadIdx.x + blockIdx.x * blockDim.x; - - if (thread_id + start < end) { + const int n = OPP_DEVICE_GLOBAL_LINEAR_ID + start; - const int n = thread_id + start; + if (n < end) { OPP_INT *opp_p2c = (p2c_map + n); if (opp_p2c[0] == MAX_CELL_INDEX) { @@ -144,8 +145,7 @@ __global__ void opp_dev_move_kernel( char move_flag = OPP_NEED_MOVE; bool iter_one_flag = (OPP_comm_iteration_d > 0) ? false : true; - do - { + do { const OPP_INT p2c = opp_p2c[0]; // get the value here, since the kernel might change it const OPP_INT* opp_c2c = c2c_map + p2c; @@ -162,8 +162,10 @@ __global__ void opp_dev_move_kernel( *particle_remove_count, particle_remove_indices, move_particle_indices, move_cell_indices, move_count)); } + } + void opp_particle_move__move_kernel(opp_set set, opp_map c2c_map, opp_map p2c_map, opp_arg arg0, // p_pos | OPP_READ opp_arg arg1, // p_lc | OPP_WRITE @@ -202,10 +204,6 @@ void opp_particle_move__move_kernel(opp_set set, opp_map c2c_map, opp_map p2c_ma int num_blocks = 200; opp_init_particle_move(set, nargs, args); - - opp_mem::dev_copy_to_symbol(OPP_comm_iteration_d, &OPP_comm_iteration, 1); - num_blocks = (OPP_iter_end - OPP_iter_start - 1) / block_size + 1; - if (useGlobalMove) { #ifdef USE_MPI @@ -227,6 +225,7 @@ void opp_particle_move__move_kernel(opp_set set, opp_map c2c_map, opp_map p2c_ma } // check whether particles need to be moved via the global move routine + num_blocks = (OPP_iter_end - OPP_iter_start - 1) / block_size + 1; opp_dev_checkForGlobalMove3D_kernel<<>>( (OPP_REAL*)args[0].data_d, // p_pos (OPP_INT *)args[4].data_d, // p2c_map @@ -254,16 +253,22 @@ void opp_particle_move__move_kernel(opp_set set, opp_map c2c_map, opp_map p2c_ma #endif } - opp_mem::dev_copy_to_symbol(opp_k4_dat0_stride_d, &opp_k4_dat0_stride, &(args[0].dat->set->set_capacity), 1); - opp_mem::dev_copy_to_symbol(opp_k4_dat1_stride_d, &opp_k4_dat1_stride, &(args[1].dat->set->set_capacity), 1); - opp_mem::dev_copy_to_symbol(opp_k4_dat2_stride_d, &opp_k4_dat2_stride, &(args[2].dat->set->set_capacity), 1); - opp_mem::dev_copy_to_symbol(opp_k4_dat3_stride_d, &opp_k4_dat3_stride, &(args[3].dat->set->set_capacity), 1); + opp_profiler->start("Mv_AllMv0"); // ---------------------------------------------------------------------------- // check whether all particles not marked for global comm is within cell, // and if not mark to move between cells within the MPI rank, mark for neighbour comm + + opp_mem::dev_copy_to_symbol(OPP_comm_iteration_d, &OPP_comm_iteration, 1); + num_blocks = (OPP_iter_end - OPP_iter_start - 1) / block_size + 1; + + opp_mem::dev_copy_to_symbol(opp_k4_dat0_stride_d, &opp_k4_dat0_stride, &(args[0].dat->set->set_capacity), 1); + opp_mem::dev_copy_to_symbol(opp_k4_dat1_stride_d, &opp_k4_dat1_stride, &(args[1].dat->set->set_capacity), 1); + opp_mem::dev_copy_to_symbol(opp_k4_dat2_stride_d, &opp_k4_dat2_stride, &(args[2].dat->set->set_capacity), 1); + opp_mem::dev_copy_to_symbol(opp_k4_dat3_stride_d, &opp_k4_dat3_stride, &(args[3].dat->set->set_capacity), 1); + { opp_profiler->start("move_kernel_only"); opp_dev_move_kernel<<>>( @@ -281,11 +286,13 @@ void opp_particle_move__move_kernel(opp_set set, opp_map c2c_map, opp_map p2c_ma OPP_iter_start, OPP_iter_end ); - OPP_DEVICE_SYNCHRONIZE(); + OPP_DEVICE_SYNCHRONIZE(); opp_profiler->end("move_kernel_only"); } - opp_profiler->end("Mv_AllMv0"); + + opp_profiler->end("Mv_AllMv0"); + #ifdef USE_MPI // ---------------------------------------------------------------------------- // finalize the global move routine and iterate over newly added particles and check whether they need neighbour comm @@ -308,6 +315,7 @@ void opp_particle_move__move_kernel(opp_set set, opp_map c2c_map, opp_map p2c_ma const int start2 = (set->size - set->diff); const int end2 = set->size; num_blocks = (end2 - start2 - 1) / block_size + 1; + opp_mem::dev_copy_to_symbol(opp_k4_dat0_stride_d, &opp_k4_dat0_stride, &(args[0].dat->set->set_capacity), 1); opp_mem::dev_copy_to_symbol(opp_k4_dat1_stride_d, &opp_k4_dat1_stride, &(args[1].dat->set->set_capacity), 1); opp_mem::dev_copy_to_symbol(opp_k4_dat2_stride_d, &opp_k4_dat2_stride, &(args[2].dat->set->set_capacity), 1); @@ -342,15 +350,16 @@ void opp_particle_move__move_kernel(opp_set set, opp_map c2c_map, opp_map p2c_ma // then iterate over the newly added particles while (opp_finalize_particle_move(set)) { + opp_init_particle_move(set, nargs, args); + + opp_mem::dev_copy_to_symbol(OPP_comm_iteration_d, &OPP_comm_iteration, 1); + num_blocks = (OPP_iter_end - OPP_iter_start - 1) / block_size + 1; + opp_mem::dev_copy_to_symbol(opp_k4_dat0_stride_d, &opp_k4_dat0_stride, &(args[0].dat->set->set_capacity), 1); opp_mem::dev_copy_to_symbol(opp_k4_dat1_stride_d, &opp_k4_dat1_stride, &(args[1].dat->set->set_capacity), 1); opp_mem::dev_copy_to_symbol(opp_k4_dat2_stride_d, &opp_k4_dat2_stride, &(args[2].dat->set->set_capacity), 1); opp_mem::dev_copy_to_symbol(opp_k4_dat3_stride_d, &opp_k4_dat3_stride, &(args[3].dat->set->set_capacity), 1); - opp_init_particle_move(set, nargs, args); - opp_mem::dev_copy_to_symbol(OPP_comm_iteration_d, &OPP_comm_iteration, 1); - - num_blocks = (OPP_iter_end - OPP_iter_start - 1) / block_size + 1; { opp_profiler->start("move_kernel_only"); opp_dev_move_kernel<<>>( @@ -368,9 +377,10 @@ void opp_particle_move__move_kernel(opp_set set, opp_map c2c_map, opp_map p2c_ma OPP_iter_start, OPP_iter_end ); - OPP_DEVICE_SYNCHRONIZE(); + OPP_DEVICE_SYNCHRONIZE(); opp_profiler->end("move_kernel_only"); } + } opp_set_dirtybit_grouped(nargs, args, Device_GPU); diff --git a/app_neso_advection/configs/advec.param b/app_neso_advection/configs/advec.param index e262c518..11cefa36 100644 --- a/app_neso_advection/configs/advec.param +++ b/app_neso_advection/configs/advec.param @@ -13,8 +13,14 @@ REAL opp_allocation_multiple = 1.1 INT opp_threads_per_block = 256 BOOL opp_profile_all = false INT opp_gpu_direct = 0 + BOOL opp_global_move = false REAL grid_spacing = 0.5 +BOOL opp_dh_data_generate = false +# Use opp_dh_data_generate = true and OPP_DH_DATA_DUMP=1 in args to dump the structured mesh + +BOOL opp_segmented_red = false +INT gpu_reduction_arrays = 16 # HoleFill_All Sort_All Shuffle_All Sort_Periodic Shuffle_Periodic STRING opp_fill = HoleFill_All diff --git a/app_neso_advection_cg/advec_opp.cpp b/app_neso_advection_cg/advec_opp.cpp index 3cc754d6..87e32855 100644 --- a/app_neso_advection_cg/advec_opp.cpp +++ b/app_neso_advection_cg/advec_opp.cpp @@ -1,5 +1,5 @@ -// Auto-generated at 2025-01-21 15:02:40.426657 by opp-translator +// Auto-generated at 2025-02-03 16:37:08.638646 by opp-translator /* BSD 3-Clause License diff --git a/app_neso_advection_cg/configs/advec.param b/app_neso_advection_cg/configs/advec.param index 4d9f2554..11cefa36 100644 --- a/app_neso_advection_cg/configs/advec.param +++ b/app_neso_advection_cg/configs/advec.param @@ -19,6 +19,9 @@ REAL grid_spacing = 0.5 BOOL opp_dh_data_generate = false # Use opp_dh_data_generate = true and OPP_DH_DATA_DUMP=1 in args to dump the structured mesh +BOOL opp_segmented_red = false +INT gpu_reduction_arrays = 16 + # HoleFill_All Sort_All Shuffle_All Sort_Periodic Shuffle_Periodic STRING opp_fill = HoleFill_All INT opp_fill_period = 4 \ No newline at end of file diff --git a/app_neso_advection_cg/hip/move_kernel_loop.hpp b/app_neso_advection_cg/hip/move_kernel_loop.hpp index 99397b3d..d224480e 100644 --- a/app_neso_advection_cg/hip/move_kernel_loop.hpp +++ b/app_neso_advection_cg/hip/move_kernel_loop.hpp @@ -14,11 +14,48 @@ __constant__ OPP_INT opp_k2_c2c_map_stride_d; namespace opp_k2 { + +namespace host { +enum CellMap { + xd_y = 0, + xu_y, + x_yd, + x_yu +}; + enum Dim { x = 0, y = 1, }; +inline void move_kernel(const double* p_pos, const double* c_pos_ll) +{ + // check for x direction movement + const double p_pos_x = p_pos[Dim::x]; + if (p_pos_x < c_pos_ll[Dim::x]) { + opp_p2c[0] = opp_c2c[CellMap::xd_y]; + { opp_move_status_flag = OPP_NEED_MOVE; }; return; + } + if (p_pos_x > (c_pos_ll[Dim::x] + CONST_cell_width[0])) { + opp_p2c[0] = opp_c2c[CellMap::xu_y]; + { opp_move_status_flag = OPP_NEED_MOVE; }; return; + } + + // check for y direction movement + const double p_pos_y = p_pos[Dim::y]; + if (p_pos_y < c_pos_ll[Dim::y]) { + opp_p2c[0] = opp_c2c[CellMap::x_yd]; + { opp_move_status_flag = OPP_NEED_MOVE; }; return; + } + if (p_pos_y > (c_pos_ll[Dim::y] + CONST_cell_width[0])) { + opp_p2c[0] = opp_c2c[CellMap::x_yu]; + { opp_move_status_flag = OPP_NEED_MOVE; }; return; + } + + { opp_move_status_flag = OPP_MOVE_DONE; }; +} +} + enum CellMap { xd_y = 0, xu_y, @@ -26,6 +63,11 @@ enum CellMap { x_yu }; +enum Dim { + x = 0, + y = 1, +}; + __device__ inline void move_kernel(char& opp_move_status_flag, const bool opp_move_hop_iter_one_flag, // Added by code-gen const OPP_INT* opp_c2c, OPP_INT* opp_p2c, // Added by code-gen const double* p_pos, const double* c_pos_ll) @@ -71,18 +113,19 @@ __global__ void opp_dev_move_kernel( const OPP_INT end ) { - const int thread_id = threadIdx.x + blockIdx.x * blockDim.x; + const int n = OPP_DEVICE_GLOBAL_LINEAR_ID + start; - if (thread_id + start < end) { - - const int n = thread_id + start; + if (n < end) { OPP_INT *opp_p2c = (p2c_map + n); + if (opp_p2c[0] == MAX_CELL_INDEX) { + return; + } + char move_flag = OPP_NEED_MOVE; bool iter_one_flag = (OPP_comm_iteration_d > 0) ? false : true; - do - { + do { const OPP_INT p2c = opp_p2c[0]; // get the value here, since the kernel might change it const OPP_INT* opp_c2c = c2c_map + p2c; @@ -100,7 +143,6 @@ __global__ void opp_dev_move_kernel( } -//-------------------------------------------------------------- void opp_particle_move__move_kernel(opp_set set, opp_map c2c_map, opp_map p2c_map, opp_arg arg0, // p_pos | OPP_READ @@ -118,7 +160,7 @@ void opp_particle_move__move_kernel(opp_set set, opp_map c2c_map, opp_map p2c_ma args[1] = arg1; args[2] = opp_arg_dat(p2c_map->p2c_dat, OPP_RW); // required to make dirty or should manually make it dirty - const int iter_size = opp_mpi_halo_exchanges_grouped(set, nargs, args, Device_GPU); + opp_mpi_halo_exchanges_grouped(set, nargs, args, Device_GPU); const OPP_INT c2c_stride = c2c_map->from->size + c2c_map->from->exec_size + c2c_map->from->nonexec_size; @@ -135,17 +177,155 @@ void opp_particle_move__move_kernel(opp_set set, opp_map c2c_map, opp_map p2c_ma int num_blocks = 200; - do + opp_init_particle_move(set, nargs, args); + if (useGlobalMove) { + +#ifdef USE_MPI + globalMover->initGlobalMove(); + opp_init_dh_device(set); +#endif + opp_profiler->start("GblMv_Move"); + + opp_mem::dev_copy_to_symbol(cellMapper_pos_stride_d, &cellMapper_pos_stride, &(args[0].dat->set->set_capacity), 1); + opp_mem::dev_copy_to_symbol(OPP_rank_d, &OPP_rank, 1); + + hipError_t err1 = hipMemcpyToSymbol(HIP_SYMBOL(opp_minSavedDHGrid_d), opp_minSavedDHGrid, 3 * sizeof(size_t)); + if (err1 != hipSuccess) { + throw std::runtime_error(std::string("dev_copy_to_symbol: ") + hipGetErrorString(err1)); + } + hipError_t err2 = hipMemcpyToSymbol(HIP_SYMBOL(opp_maxSavedDHGrid_d), opp_maxSavedDHGrid, 3 * sizeof(size_t)); + if (err2 != hipSuccess) { + throw std::runtime_error(std::string("dev_copy_to_symbol: ") + hipGetErrorString(err2)); + } + + // check whether particles need to be moved via the global move routine + num_blocks = (OPP_iter_end - OPP_iter_start - 1) / block_size + 1; + opp_dev_checkForGlobalMove2D_kernel<<>>( + (OPP_REAL*)args[0].data_d, // p_pos + (OPP_INT *)args[2].data_d, // p2c_map + cellMapper->structMeshToCellMapping_d, + cellMapper->structMeshToRankMapping_d, + cellMapper->oneOverGridSpacing_d, + cellMapper->minGlbCoordinate_d, + cellMapper->globalGridDims_d, + cellMapper->globalGridSize_d, + set->particle_remove_count_d, + OPP_remove_particle_indices_d, + dh_indices_d.part_indices, + dh_indices_d.cell_indices, + dh_indices_d.rank_indices, + dh_indices_d.move_count, + OPP_iter_start, OPP_iter_end + ); + OPP_DEVICE_SYNCHRONIZE(); + + opp_profiler->end("GblMv_Move"); + +#ifdef USE_MPI + opp_gather_dh_move_indices(set); + globalMover->communicate(set); +#endif + } + + + + + opp_profiler->start("Mv_AllMv0"); + // ---------------------------------------------------------------------------- + // check whether all particles not marked for global comm is within cell, + // and if not mark to move between cells within the MPI rank, mark for neighbour comm + + opp_mem::dev_copy_to_symbol(OPP_comm_iteration_d, &OPP_comm_iteration, 1); + num_blocks = (OPP_iter_end - OPP_iter_start - 1) / block_size + 1; + + opp_mem::dev_copy_to_symbol(opp_k2_dat0_stride_d, &opp_k2_dat0_stride, &(args[0].dat->set->set_capacity), 1); + opp_mem::dev_copy_to_symbol(opp_k2_dat1_stride_d, &opp_k2_dat1_stride, &(args[1].dat->set->set_capacity), 1); + { - opp_mem::dev_copy_to_symbol(opp_k2_dat0_stride_d, &opp_k2_dat0_stride, &(args[0].dat->set->set_capacity), 1); - opp_mem::dev_copy_to_symbol(opp_k2_dat1_stride_d, &opp_k2_dat1_stride, &(args[1].dat->set->set_capacity), 1); + opp_profiler->start("move_kernel_only"); + opp_dev_move_kernel<<>>( + (OPP_REAL *)args[0].data_d, // p_pos + (OPP_REAL *)args[1].data_d, // c_pos_ll + (OPP_INT *)args[2].data_d, // p2c_map + (OPP_INT *)c2c_map->map_d, // c2c_map + (OPP_INT *)set->particle_remove_count_d, + (OPP_INT *)OPP_remove_particle_indices_d, + (OPP_INT *)OPP_move_particle_indices_d, + (OPP_INT *)OPP_move_cell_indices_d, + (OPP_INT *)OPP_move_count_d, + OPP_iter_start, + OPP_iter_end + ); + OPP_DEVICE_SYNCHRONIZE(); + opp_profiler->end("move_kernel_only"); + } + + + opp_profiler->end("Mv_AllMv0"); + +#ifdef USE_MPI + // ---------------------------------------------------------------------------- + // finalize the global move routine and iterate over newly added particles and check whether they need neighbour comm + if (useGlobalMove) { + + opp_profiler->start("GblMv_finalize"); + const int finalized = globalMover->finalize(set); + opp_profiler->end("GblMv_finalize"); + + if (finalized > 0) { + opp_profiler->start("GblMv_AllMv"); + + // need to change arg data since particle resize in globalMover::finalize could change the pointer in dat->data + for (int i = 0; i < nargs; i++) + if (args[i].argtype == OPP_ARG_DAT && args[i].dat->set->is_particle) + args[i].data_d = args[i].dat->data_d; + + // check whether the new particle is within cell, and if not move between cells within the MPI rank, + // mark for neighbour comm. Do only for the globally moved particles + const int start2 = (set->size - set->diff); + const int end2 = set->size; + num_blocks = (end2 - start2 - 1) / block_size + 1; + + opp_mem::dev_copy_to_symbol(opp_k2_dat0_stride_d, &opp_k2_dat0_stride, &(args[0].dat->set->set_capacity), 1); + opp_mem::dev_copy_to_symbol(opp_k2_dat1_stride_d, &opp_k2_dat1_stride, &(args[1].dat->set->set_capacity), 1); + + opp_profiler->start("move_kernel_only"); + opp_dev_move_kernel<<>>( + (OPP_REAL *)args[0].data_d, // p_pos + (OPP_REAL *)args[1].data_d, // c_pos_ll + (OPP_INT *)args[2].data_d, // p2c_map + (OPP_INT *)c2c_map->map_d, // c2c_map + (OPP_INT *)set->particle_remove_count_d, + (OPP_INT *)OPP_remove_particle_indices_d, + (OPP_INT *)OPP_move_particle_indices_d, + (OPP_INT *)OPP_move_cell_indices_d, + (OPP_INT *)OPP_move_count_d, + start2, + end2 + ); + OPP_DEVICE_SYNCHRONIZE(); + opp_profiler->end("move_kernel_only"); + + opp_profiler->end("GblMv_AllMv"); + } + } +#endif + + // ---------------------------------------------------------------------------- + // Do neighbour communication and if atleast one particle is received by the currect rank, + // then iterate over the newly added particles + while (opp_finalize_particle_move(set)) { opp_init_particle_move(set, nargs, args); - opp_mem::dev_copy_to_symbol(OPP_comm_iteration_d, &OPP_comm_iteration, 1); + opp_mem::dev_copy_to_symbol(OPP_comm_iteration_d, &OPP_comm_iteration, 1); num_blocks = (OPP_iter_end - OPP_iter_start - 1) / block_size + 1; + + opp_mem::dev_copy_to_symbol(opp_k2_dat0_stride_d, &opp_k2_dat0_stride, &(args[0].dat->set->set_capacity), 1); + opp_mem::dev_copy_to_symbol(opp_k2_dat1_stride_d, &opp_k2_dat1_stride, &(args[1].dat->set->set_capacity), 1); { + opp_profiler->start("move_kernel_only"); opp_dev_move_kernel<<>>( (OPP_REAL *)args[0].data_d, // p_pos (OPP_REAL *)args[1].data_d, // c_pos_ll @@ -159,21 +339,83 @@ void opp_particle_move__move_kernel(opp_set set, opp_map c2c_map, opp_map p2c_ma OPP_iter_start, OPP_iter_end ); + OPP_DEVICE_SYNCHRONIZE(); + opp_profiler->end("move_kernel_only"); } - } while (opp_finalize_particle_move(set)); + } opp_set_dirtybit_grouped(nargs, args, Device_GPU); OPP_DEVICE_SYNCHRONIZE(); opp_profiler->end("move_kernel"); } + void opp_init_direct_hop_cg(double grid_spacing, const opp_dat c_gbl_id, const opp::BoundingBox& b_box, opp_map c2c_map, opp_map p2c_map, opp_arg arg0, // p_pos | OPP_READ opp_arg arg1 // c_pos_ll | OPP_READ ) { opp_profiler->start("Setup_Mover"); - + + useGlobalMove = opp_params->get("opp_global_move"); + + if (OPP_DBG) opp_printf("opp_init_direct_hop_cg", "START useGlobalMove=%s", useGlobalMove ? "YES" : "NO"); + + if (useGlobalMove) { + + const int nargs = 3; + opp_arg args[nargs]; + + args[0] = arg0; + args[1] = arg1; + args[2] = opp_arg_dat(p2c_map->p2c_dat, OPP_RW); // required to make dirty or should manually make it dirty + +#ifdef USE_MPI + opp_mpi_halo_exchanges_grouped(c_gbl_id->set, nargs, args, Device_CPU); + + comm = std::make_shared(MPI_COMM_WORLD); + globalMover = std::make_unique(comm->comm_parent); + + opp_mpi_halo_wait_all(nargs, args); +#endif + + boundingBox = std::make_shared(b_box); + cellMapper = std::make_shared(boundingBox, grid_spacing, comm); + + const int c_set_size = c_gbl_id->set->size; + + // lambda function for dh mesh search loop + auto all_cell_checker = [&](const opp_point& point, int& cid) { + + for (int ci = 0; ci < c_set_size; ++ci) { + opp_move_status_flag = OPP_NEED_MOVE; + opp_move_hop_iter_one_flag = true; + + int temp_ci = ci; // we dont want to get iterating ci changed within the kernel, hence get a copy + + opp_p2c = &(temp_ci); + opp_c2c = &((c2c_map->map)[temp_ci * 4]); + + opp_k2::host::move_kernel( + (const OPP_REAL*)&point, + (const OPP_REAL *)args[1].data + (temp_ci * 2) // c_pos_ll| OPP_READ + ); + if (opp_move_status_flag == OPP_MOVE_DONE) { + cid = temp_ci; + break; + } + } + }; + + if (opp_params->get("opp_dh_data_generate")) { + cellMapper->generateStructuredMesh(c_gbl_id->set, c_gbl_id, all_cell_checker); + } + else { + cellMapper->generateStructuredMeshFromFile(c_gbl_id->set, c_gbl_id); + } + } + opp_profiler->end("Setup_Mover"); } + diff --git a/app_neso_advection_cg/hip/opp_kernels.cpp b/app_neso_advection_cg/hip/opp_kernels.cpp index e59e7640..c745e8ac 100644 --- a/app_neso_advection_cg/hip/opp_kernels.cpp +++ b/app_neso_advection_cg/hip/opp_kernels.cpp @@ -37,6 +37,11 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "opp_hip.h" #include "device_kernels/hip_inline_kernels.h" +OPP_REAL CONST_extents[2]; +OPP_REAL CONST_dt[1]; +OPP_REAL CONST_cell_width[1]; +OPP_INT CONST_ndimcells[2]; + __constant__ OPP_REAL CONST_extents_d[2]; __constant__ OPP_REAL CONST_dt_d[1]; __constant__ OPP_REAL CONST_cell_width_d[1]; @@ -49,18 +54,22 @@ void opp_decl_const_impl(int dim, int size, char* data, const char* name) { if (!strcmp(name, "CONST_extents")) { cutilSafeCall(hipMemcpyToSymbol(HIP_SYMBOL(CONST_extents_d), data, dim * size)); + std::memcpy(&CONST_extents, data, (size*dim)); return; } if (!strcmp(name, "CONST_dt")) { cutilSafeCall(hipMemcpyToSymbol(HIP_SYMBOL(CONST_dt_d), data, dim * size)); + std::memcpy(&CONST_dt, data, (size*dim)); return; } if (!strcmp(name, "CONST_cell_width")) { cutilSafeCall(hipMemcpyToSymbol(HIP_SYMBOL(CONST_cell_width_d), data, dim * size)); + std::memcpy(&CONST_cell_width, data, (size*dim)); return; } if (!strcmp(name, "CONST_ndimcells")) { cutilSafeCall(hipMemcpyToSymbol(HIP_SYMBOL(CONST_ndimcells_d), data, dim * size)); + std::memcpy(&CONST_ndimcells, data, (size*dim)); return; } diff --git a/app_neso_advection_cg/hip/verify_kernel_loop.hpp b/app_neso_advection_cg/hip/verify_kernel_loop.hpp index 7479826d..98b21c40 100644 --- a/app_neso_advection_cg/hip/verify_kernel_loop.hpp +++ b/app_neso_advection_cg/hip/verify_kernel_loop.hpp @@ -170,10 +170,10 @@ void opp_par_loop_all__verify_kernel(opp_set set, for (int d = 0; d < 1; ++d) arg2_host_data[d] += ((OPP_INT *)args[2].data)[b * 1 + d]; } - args[2].data = (char *)arg2_host_data; opp_mpi_reduce(&args[2], arg2_host_data); + opp_set_dirtybit_grouped(nargs, args, Device_GPU); OPP_DEVICE_SYNCHRONIZE(); diff --git a/app_neso_advection_cg/mpi/move_kernel_loop.hpp b/app_neso_advection_cg/mpi/move_kernel_loop.hpp index 2d3d3584..3f689927 100644 --- a/app_neso_advection_cg/mpi/move_kernel_loop.hpp +++ b/app_neso_advection_cg/mpi/move_kernel_loop.hpp @@ -257,7 +257,7 @@ void opp_init_direct_hop_cg(double grid_spacing, const opp_dat c_gbl_id, const o else { cellMapper->generateStructuredMeshFromFile(c_gbl_id->set, c_gbl_id); } - + opp_profiler->reg("GlbToLocal"); opp_profiler->reg("GblMv_Move"); opp_profiler->reg("GblMv_AllMv"); diff --git a/app_neso_advection_cg/seq/move_kernel_loop.hpp b/app_neso_advection_cg/seq/move_kernel_loop.hpp index 662e1529..ea921f5a 100644 --- a/app_neso_advection_cg/seq/move_kernel_loop.hpp +++ b/app_neso_advection_cg/seq/move_kernel_loop.hpp @@ -190,7 +190,7 @@ void opp_init_direct_hop_cg(double grid_spacing, const opp_dat c_gbl_id, const o else { cellMapper->generateStructuredMeshFromFile(c_gbl_id->set, c_gbl_id); } - + opp_profiler->reg("GlbToLocal"); opp_profiler->reg("GblMv_Move"); opp_profiler->reg("GblMv_AllMv"); diff --git a/app_simpic/configs/system.param b/app_simpic/configs/system.param index 86424391..804f88ca 100644 --- a/app_simpic/configs/system.param +++ b/app_simpic/configs/system.param @@ -23,7 +23,7 @@ BOOL opp_profile_all = false INT opp_gpu_direct = 0 BOOL opp_global_move = false REAL grid_spacing = 0.5 -BOOL use_reg_red = false +BOOL opp_segmented_red = false # HoleFill_All Sort_All Shuffle_All Sort_Periodic Shuffle_Periodic STRING opp_fill = Shuffle_Periodic diff --git a/app_simpic_cg/configs/system.param b/app_simpic_cg/configs/system.param index 86424391..804f88ca 100644 --- a/app_simpic_cg/configs/system.param +++ b/app_simpic_cg/configs/system.param @@ -23,7 +23,7 @@ BOOL opp_profile_all = false INT opp_gpu_direct = 0 BOOL opp_global_move = false REAL grid_spacing = 0.5 -BOOL use_reg_red = false +BOOL opp_segmented_red = false # HoleFill_All Sort_All Shuffle_All Sort_Periodic Shuffle_Periodic STRING opp_fill = Shuffle_Periodic diff --git a/opp_translator/resources/templates/cpp/hip/loop_host.hpp.jinja b/opp_translator/resources/templates/cpp/hip/loop_host.hpp.jinja index 38b4279d..7ff1fee1 100644 --- a/opp_translator/resources/templates/cpp/hip/loop_host.hpp.jinja +++ b/opp_translator/resources/templates/cpp/hip/loop_host.hpp.jinja @@ -148,6 +148,9 @@ __constant__ OPP_INT opp_k{{kernel_idx}}_sr_set_stride_d; {% for arg in lh.args|dat|indirect|reduction|not_already_mapped(lh) if config.seg_red %} thrust::device_vector sr_dat{{lh.dat(arg).id}}_keys_dv; thrust::device_vector<{{lh.dat(arg).typ}}> sr_dat{{lh.dat(arg).id}}_values_dv; + +thrust::device_vector sr_dat{{lh.dat(arg).id}}_keys_dv2; +thrust::device_vector<{{lh.dat(arg).typ}}> sr_dat{{lh.dat(arg).id}}_values_dv2; {% endfor %} {% endblock %} @@ -322,14 +325,21 @@ __global__ void opp_dev_sr_{{lh.kernel}}( // Used for Segmented Reductions {% endfor %} ); - int offset = 0; + {% set counters = namespace(data={}) %} + {% set occurrences = namespace(data={}) %} + {% for arg in lh.args|dat|indirect|reduction|not_already_mapped(lh) %} + {% set _ = counters.data.update({arg.dat_id: 0}) %} + {% set x_ = occurrences.data.update({arg.dat_id: 0}) %} + {% endfor %} + {% for arg in lh.args|dat|indirect|reduction %} + {% set _ = counters.data.update({arg.dat_id: counters.data[arg.dat_id] + 1}) %} + {% endfor %} {% for arg in lh.args_expanded|indirect|reduction if config.atomics %} - for (int d = 0; {{opt_hip_cond_comp(arg)}}d < {{lh.dat(arg).dim}}; ++d, ++offset) { - {#% for datx in lh.dats|same_iter_set_dat(lh) %#} - sr_dat{{arg.dat_id}}_values[n + opp_k{{kernel_idx}}_sr_set_stride_d * offset] = arg{{arg.id}}_{{arg.map_idx}}_local[d]; - sr_dat{{arg.dat_id}}_keys[n + opp_k{{kernel_idx}}_sr_set_stride_d * offset] = {{map_lookup(arg, kernel_idx)}} + (d{{stride_hip(arg)}}); - {#% endfor %#} - } + for (int d = 0; {{opt_hip_cond_comp(arg)}}d < {{lh.dat(arg).dim}}; ++d) { + sr_dat{{arg.dat_id}}_values[{{ counters.data[arg.dat_id] }} * n + {{ occurrences.data[arg.dat_id] }} + opp_k{{kernel_idx}}_sr_set_stride_d * d] = arg{{arg.id}}_{{arg.map_idx}}_local[d]; + } + sr_dat{{arg.dat_id}}_keys[{{ counters.data[arg.dat_id] }} * n + {{ occurrences.data[arg.dat_id] }}] = {{map_lookup(arg, kernel_idx)}}; + {% set y_ = occurrences.data.update({arg.dat_id: occurrences.data[arg.dat_id] + 1}) %} {% endfor %} } {% for arg in lh.args|gbl|reduction %} @@ -567,7 +577,7 @@ opp_dev_sr_{{lh.kernel}}<< 0 and config.atomics and config.seg_red %} - if (!opp_params->get("use_reg_red")) // Do atomics ---------- + if (!opp_use_segmented_reductions) // Do atomics ---------- {% endif %} {% if lh is direct or config.atomics %} { @@ -596,8 +606,6 @@ opp_dev_sr_{{lh.kernel}}<< 0 and config.seg_red %} { - opp_mem::dev_copy_to_symbol(opp_k{{kernel_idx}}_sr_set_stride_d, &opp_k{{kernel_idx}}_sr_set_stride, &(set->size), 1); - {% for arg in lh.args|dat|indirect|reduction|not_already_mapped(lh) %} size_t operating_size_dat{{arg.dat_id}} = 0, resize_size_dat{{arg.dat_id}} = 0; {% endfor %} @@ -608,15 +616,26 @@ opp_dev_sr_{{lh.kernel}}<<size); + operating_size_dat{{arg.dat_id}} *= (size_t)(iter_size); resize_size_dat{{arg.dat_id}} *= (size_t)(set->set_capacity); {% endfor %} + {% set counters = namespace(data={}) %} {% for arg in lh.args|dat|indirect|reduction|not_already_mapped(lh) %} - if (resize_size_dat{{arg.dat_id}} > sr_dat{{arg.dat_id}}_keys_dv.size()) { // resize only if current vector is small - sr_dat{{arg.dat_id}}_keys_dv.resize(resize_size_dat{{arg.dat_id}}, 0); - sr_dat{{arg.dat_id}}_values_dv.resize(resize_size_dat{{arg.dat_id}}, 0); - } + {% set _ = counters.data.update({arg.dat_id: 0}) %} + {% endfor %} + {% for arg in lh.args|dat|indirect|reduction %} + {% set _ = counters.data.update({arg.dat_id: counters.data[arg.dat_id] + 1}) %} + {% endfor %} + int k{{kernel_idx}}_stride = 0; + {% for arg in lh.args|dat|indirect|reduction|not_already_mapped(lh) %} + k{{kernel_idx}}_stride += iter_size * {{ counters.data[arg.dat_id] }}; + {% endfor %} + opp_mem::dev_copy_to_symbol(opp_k{{kernel_idx}}_sr_set_stride_d, &opp_k{{kernel_idx}}_sr_set_stride, &(k{{kernel_idx}}_stride), 1); + + {% for arg in lh.args|dat|indirect|reduction|not_already_mapped(lh) %} + opp_sr::init_arrays<{{lh.dat(arg).typ}}>(args[{{arg.id}}].dat->dim, operating_size_dat{{arg.dat_id}}, resize_size_dat{{arg.dat_id}}, + sr_dat{{arg.dat_id}}_keys_dv, sr_dat{{arg.dat_id}}_values_dv, sr_dat{{arg.dat_id}}_keys_dv2, sr_dat{{arg.dat_id}}_values_dv2); {% endfor %} // Create key/value pairs @@ -627,39 +646,8 @@ opp_dev_sr_{{lh.kernel}}<<end("SR_CrKeyVal"); {% for arg in lh.args|dat|indirect|reduction|not_already_mapped(lh) %} - // Sort by keys to bring the identical keys together - opp_profiler->start("SR_SortByKey"); - thrust::sort_by_key(sr_dat{{arg.dat_id}}_keys_dv.begin(), sr_dat{{arg.dat_id}}_keys_dv.begin() + operating_size_dat{{arg.dat_id}}, - sr_dat{{arg.dat_id}}_values_dv.begin()); - opp_profiler->end("SR_SortByKey"); - - // Compute the unique keys and their corresponding values - opp_profiler->start("SR_RedByKey"); - auto new_end = thrust::reduce_by_key( - sr_dat{{arg.dat_id}}_keys_dv.begin(), sr_dat{{arg.dat_id}}_keys_dv.begin() + operating_size_dat{{arg.dat_id}}, - sr_dat{{arg.dat_id}}_values_dv.begin(), - sr_dat{{arg.dat_id}}_keys_dv.begin(), - sr_dat{{arg.dat_id}}_values_dv.begin()); - opp_profiler->end("SR_RedByKey"); - - const size_t reduced_size = (new_end.first - sr_dat{{arg.dat_id}}_keys_dv.begin()); - - // Assign reduced values to the nodes using keys/values - opp_profiler->start("SR_Assign"); - opp_k{{kernel_idx}}::assign_values<<>> ( // TODO : check whether num_blocks is correct - opp_get_dev_raw_ptr(sr_dat{{arg.dat_id}}_keys_dv), - opp_get_dev_raw_ptr(sr_dat{{arg.dat_id}}_values_dv), - (OPP_REAL *) args[{{arg.dat_id}}].data_d, - 0, reduced_size); - OPP_DEVICE_SYNCHRONIZE(); - opp_profiler->end("SR_Assign"); - - // Last: clear the thrust vectors if this is the last iteration (avoid crash) - if (opp_params->get("num_steps") == (OPP_main_loop_iter + 1)) { - OPP_DEVICE_SYNCHRONIZE(); - sr_dat{{arg.dat_id}}_values_dv.clear(); sr_dat{{arg.dat_id}}_values_dv.shrink_to_fit(); - sr_dat{{arg.dat_id}}_keys_dv.clear(); sr_dat{{arg.dat_id}}_keys_dv.shrink_to_fit(); - } + opp_sr::do_segmented_reductions<{{lh.dat(arg).typ}}>(args[{{arg.id}}], k{{kernel_idx}}_stride, + sr_dat{{arg.dat_id}}_keys_dv, sr_dat{{arg.dat_id}}_values_dv, sr_dat{{arg.dat_id}}_keys_dv2, sr_dat{{arg.dat_id}}_values_dv2); {% endfor %} } {% endif %} @@ -694,7 +682,6 @@ opp_dev_sr_{{lh.kernel}}<<(sr_dat{{arg.dat_id}}_keys_dv, sr_dat{{arg.dat_id}}_values_dv, sr_dat{{arg.dat_id}}_keys_dv2, sr_dat{{arg.dat_id}}_values_dv2); + {% endfor %} + opp_set_dirtybit_grouped(nargs, args, Device_GPU); OPP_DEVICE_SYNCHRONIZE(); {% if lh is double_indirect_reduc %} diff --git a/opp_translator/resources/templates/cpp/hip/master_kernel.cpp.jinja b/opp_translator/resources/templates/cpp/hip/master_kernel.cpp.jinja index 33342e0d..17732786 100644 --- a/opp_translator/resources/templates/cpp/hip/master_kernel.cpp.jinja +++ b/opp_translator/resources/templates/cpp/hip/master_kernel.cpp.jinja @@ -10,6 +10,10 @@ {% endblock %} {% block const_decls %} + {% for const in app.consts() %} +{{const.typ}} {{const.ptr}}[{{const.dim}}]; + {% endfor %} + {% for const in app.consts() %} __constant__ {{const.typ}} {{const.ptr}}_d[{{const.dim}}]; {% endfor %} @@ -18,5 +22,6 @@ __constant__ {{const.typ}} {{const.ptr}}_d[{{const.dim}}]; {% block const_decl_func %} {% call(const) const_decl_func("") %} cutilSafeCall(hipMemcpyToSymbol(HIP_SYMBOL({{const.ptr}}_d), data, dim * size)); + std::memcpy(&{{const.ptr}}, data, (size*dim)); {%- endcall %} {% endblock %} \ No newline at end of file diff --git a/opp_translator/resources/templates/cpp/hip/move_loop_host.hpp.jinja b/opp_translator/resources/templates/cpp/hip/move_loop_host.hpp.jinja index 4d7b99d2..2d6ad1ce 100644 --- a/opp_translator/resources/templates/cpp/hip/move_loop_host.hpp.jinja +++ b/opp_translator/resources/templates/cpp/hip/move_loop_host.hpp.jinja @@ -3,98 +3,19 @@ {% block kernel %} namespace opp_k{{kernel_idx}} { -{{kernel_func}} -{% if lh is indirect and lh.args|dat|indirect|reduction|length > 0 and config.seg_red %} -// Segmented Reductions Routines -// -------------------------------------------------------------- -__global__ void assign_values( - const OPP_INT *__restrict keys, - const OPP_REAL *__restrict values, - OPP_REAL *__restrict dat, - const int start, - const int end) -{ - const int tid = threadIdx.x + blockIdx.x * blockDim.x; - - if (tid + start < end) - { - const int n = tid + start; - const int mapping = keys[n]; - dat[mapping] += values[n]; - } -} - -//-------------------------------------------------------------- -__global__ void sequence_OPP_INT_values( - OPP_INT *__restrict values, - const int start, - const int end) -{ - const int tid = threadIdx.x + blockIdx.x * blockDim.x; - - if (tid + start < end) - { - const int n = tid + start; - values[n] = n; - } -} - -//-------------------------------------------------------------- -__global__ void reset_OPP_INT_values( - OPP_INT *__restrict values, - const int start, - const int end) -{ - const int tid = threadIdx.x + blockIdx.x * blockDim.x; - - if (tid + start < end) - { - const int n = tid + start; - values[n] = 0; - } -} - -//-------------------------------------------------------------- -__global__ void reset_OPP_REAL_values( - OPP_REAL *__restrict values, - const int start, - const int end) -{ - const int tid = threadIdx.x + blockIdx.x * blockDim.x; - - if (tid + start < end) - { - const int n = tid + start; - values[n] = 0.0; - } -} - -//-------------------------------------------------------------- -__global__ void assign_values_by_key( - const OPP_INT *__restrict indices, - const OPP_REAL *__restrict values_in, - OPP_REAL *__restrict values_out, - const int start, const int end, const int dim) -{ - const int tid = threadIdx.x + blockIdx.x * blockDim.x; - - if (tid + start < end) - { - const int n = tid + start; - const int idx = indices[n]; +{% if lh.dh_loop_required %} - for (int d = 0; d < dim; d++) { - values_out[n + d * opp_k2_sr_set_stride_d] = values_in[idx + d * opp_k2_sr_set_stride_d]; - } - } +namespace host { +{{host_kernel_func}} } {% endif %} +{{kernel_func}} } {% endblock %} -{% macro stride_hip(arg) -%} +{% macro stride_device(arg) -%} {{-" * opp_k%s_dat%s_stride_d" % (kernel_idx, arg.dat_id) if lh.dat(arg) is soa-}} {%- endmacro %} @@ -120,21 +41,21 @@ __global__ void assign_values_by_key( {% endif %} {% endmacro %} -{% macro opt_hip_cond(arg) %} +{% macro opt_device_cond(arg) %} {%- if arg is opt -%}optflags & 1 << {{lh.optIdx(arg)}}{%- endif -%} {% endmacro %} -{% macro opt_hip_cond_comp(arg) %} - {%- if arg is opt -%}{{opt_hip_cond(arg)}} && {% endif -%} +{% macro opt_device_cond_comp(arg) %} + {%- if arg is opt -%}{{opt_device_cond(arg)}} && {% endif -%} {% endmacro %} -{% macro opt_hip_tern(arg, alt = "NULL") %} - {%- if arg is opt -%}{{opt_hip_cond(arg)}} ? {{caller()}} : {{alt}}{%- else -%}{{caller()}}{%- endif -%} +{% macro opt_device_tern(arg, alt = "NULL") %} + {%- if arg is opt -%}{{opt_device_cond(arg)}} ? {{caller()}} : {{alt}}{%- else -%}{{caller()}}{%- endif -%} {% endmacro %} -{% macro opt_hip_if(arg) %} +{% macro opt_device_if(arg) %} {% if arg is opt %} - if ({{opt_hip_cond(arg)}}) { + if ({{opt_device_cond(arg)}}) { {{caller()|indent-}} {{"}"|indent(first = true)}} {% else %} @@ -155,7 +76,7 @@ map{{arg.map_id}}[opp_k{{kernel_idx}}_map{{lh.map(arg).id}}_stride_d * {{arg.map {%- endif -%} {%- endmacro %} -{% macro arg_to_pointer_hip(arg) -%} +{% macro arg_to_pointer_device(arg) -%} {%- if arg is gbl %} {% if arg is reduction %} @@ -195,6 +116,31 @@ gbl{{arg.id}} {%- endif -%} {%- endmacro %} +{%- macro arg_to_pointer_dh(arg, lh) %} + {%- if arg.id == 0 -%} + (const OPP_REAL*)&point, + {%- elif arg is gbl %} + {%- set cast = arg.typ -%} + ({{cast}} *)args[{{arg.id}}].data{%-if arg.id+1 != lh.args|length-%},{%-endif-%} + {%- else -%} + + {%- if arg is direct -%} + {%- set offset = "" -%} + {%- elif arg is double_indirect -%} + {%- set offset = " + (map%s[%d] * %d)" % (arg.map_id, arg.map_idx, lh.dat(arg).dim) -%} + {%- elif arg is indirect -%} + {%- set offset = " + (temp_ci * %d)" % (lh.dat(arg).dim) -%} + {%- endif -%} + + {# 0 is OP.AccessType.READ #} + {%- if arg.access_type.value == 0 -%} + (const {{lh.dat(arg).typ}} *)args[{{arg.id}}].data{{offset}}{%-if arg.id+1 != lh.args|length-%},{%-endif%} // {% if arg is dat -%}{{lh.dat(arg).ptr}} {%- endif -%} | OPP_{{arg.access_type.name}} + {%- else -%} + arg{{arg.id}}_temp{%-if arg.id+1 != lh.args|length-%},{%-endif%} // {% if arg is dat -%}{{lh.dat(arg).ptr}} {%- endif -%} | OPP_{{arg.access_type.name}} + {%- endif -%} + {%- endif -%} +{%- endmacro -%} + {% block prologue %} {{super()}} {% for dat in lh.dats|soa %} @@ -234,7 +180,11 @@ thrust::device_vector<{{lh.dat(arg).typ}}> sr_dat{{lh.dat(arg).id}}_values_dv2; __global__ void opp_dev_{{lh.kernel}}( {{-"\n const unsigned optflags," if lh.args|opt|length > 0}} {% for dat in lh.dats %} + {% if dat is indirect_reduction(lh) and lh is particle_loop %} + {{dat.typ}} **__restrict__ dat{{dat.id}}, const OPP_INT dat{{dat.id}}_arr_count, // {{dat.ptr}} + {% else %} {{"const " if dat is read_in(lh)}}{{dat.typ}} *__restrict__ dat{{dat.id}}, // {{dat.ptr}} + {% endif %} {% endfor %} OPP_INT *__restrict__ p2c_map, const OPP_INT *__restrict__ c2c_map, @@ -244,6 +194,9 @@ __global__ void opp_dev_{{lh.kernel}}( {% for arg in lh.args|gbl %} {{"const " if arg.access_type == OP.AccessType.Read}}{{arg.typ}} *gbl{{arg.id}}, {% endfor %} + {% if lh is injected_loop %} + const OPP_INT inj_start, + {% endif %} OPP_INT *__restrict__ particle_remove_count, OPP_INT *__restrict__ particle_remove_indices, OPP_INT *__restrict__ move_particle_indices, @@ -255,7 +208,7 @@ __global__ void opp_dev_{{lh.kernel}}( { {% for arg in lh.args|gbl|reduction %} {{arg.typ}} gbl{{arg.id}}_local[{{arg.dim}}]; - for (int d = 0; {{opt_hip_cond_comp(arg)}}d < {{arg.dim}}; ++d) + for (int d = 0; {{opt_device_cond_comp(arg)}}d < {{arg.dim}}; ++d) gbl{{arg.id}}_local[d] = {% if arg is inc -%} {{arg.typ}}_ZERO {%- else -%} @@ -263,13 +216,17 @@ __global__ void opp_dev_{{lh.kernel}}( {%- endif -%}; {% endfor %} - const int thread_id = threadIdx.x + blockIdx.x * blockDim.x; - - if (thread_id + start < end) { + const int n = OPP_DEVICE_GLOBAL_LINEAR_ID + start; - const int n = {{"thread_id + start" if config.atomics else "col_reord[thread_id + start]"}}; + if (n < end) { OPP_INT *opp_p2c = (p2c_map + n); + {% if lh.dh_loop_required %} + if (opp_p2c[0] == MAX_CELL_INDEX) { + return; + } + {% endif %} + char move_flag = OPP_NEED_MOVE; bool iter_one_flag = (OPP_comm_iteration_d > 0) ? false : true; @@ -281,13 +238,18 @@ __global__ void opp_dev_{{lh.kernel}}( {% endif %} {% endfor %} - do - { + {% for dat in lh.dats if config.atomics and lh is particle_loop %} + {% if dat is indirect_reduction(lh) %} + {{dat.typ}}* tmp{{dat.id}} = dat{{dat.id}}[threadIdx.x % dat{{dat.id}}_arr_count]; + + {% endif %} + {% endfor %} + do { const OPP_INT p2c = opp_p2c[0]; // get the value here, since the kernel might change it const OPP_INT* opp_c2c = c2c_map + p2c; {% for arg in lh.args_expanded|dat|indirect|reduction if config.atomics %} - for (int d = 0; {{opt_hip_cond_comp(arg)}}d < {{lh.dat(arg).dim}}; ++d) + for (int d = 0; {{opt_device_cond_comp(arg)}}d < {{lh.dat(arg).dim}}; ++d) {% if arg is p2c_mapped and not double_indirect %} arg{{arg.id}}_p2c_local[d] = {{lh.dat(arg).typ}}_ZERO; {% else %} @@ -298,16 +260,21 @@ __global__ void opp_dev_{{lh.kernel}}( opp_k{{kernel_idx}}::{{lh.kernel}}( move_flag, iter_one_flag, opp_c2c, opp_p2c, {% for arg in lh.args %} - {%+ call opt_hip_tern(arg) %}{{arg_to_pointer_hip(arg)}}{% endcall %}{{"," if not loop.last}} // {% if arg is dat %}{{lh.dat(arg).ptr}} {% endif +%} + {%+ call opt_device_tern(arg) %}{{arg_to_pointer_device(arg)}}{% endcall %}{{"," if not loop.last}} // {% if arg is dat %}{{lh.dat(arg).ptr}} {% endif +%} {% endfor %} ); + {% if lh is particle_loop and config.atomics %} + {% set dat_name = "tmp" -%} + {% else %} + {% set dat_name = "dat" -%} + {% endif %} {% for arg in lh.args_expanded|dat|indirect|reduction if config.atomics %} - for (int d = 0; {{opt_hip_cond_comp(arg)}}d < {{lh.dat(arg).dim}}; ++d) + for (int d = 0; {{opt_device_cond_comp(arg)}}d < {{lh.dat(arg).dim}}; ++d) {% if arg is p2c_mapped and not double_indirect %} - atomicAdd(dat{{arg.dat_id}} + {{map_lookup(arg, kernel_idx)}} + (d{{stride_hip(arg)}}), arg{{arg.id}}_p2c_local[d]); + atomicAdd({{dat_name}}{{arg.dat_id}} + {{map_lookup(arg, kernel_idx)}} + (d{{stride_device(arg)}}), arg{{arg.id}}_p2c_local[d]); {% else %} - atomicAdd(dat{{arg.dat_id}} + {{map_lookup(arg, kernel_idx)}} + (d{{stride_hip(arg)}}), arg{{arg.id}}_{{arg.map_idx}}_local[d]); // TODO: this looks incorrect + atomicAdd({{dat_name}}{{arg.dat_id}} + {{map_lookup(arg, kernel_idx)}} + (d{{stride_device(arg)}}), arg{{arg.id}}_{{arg.map_idx}}_local[d]); // TODO: this looks incorrect {% endif %} {% endfor %} } while (opp_part_check_status_device(move_flag, iter_one_flag, opp_p2c, n, @@ -316,7 +283,7 @@ __global__ void opp_dev_{{lh.kernel}}( } {% for arg in lh.args|gbl|reduction %} - for (int d = 0; {{opt_hip_cond_comp(arg)}}d < {{arg.dim}}; ++d) + for (int d = 0; {{opt_device_cond_comp(arg)}}d < {{arg.dim}}; ++d) opp_reduction(gbl{{arg.id}} + blockIdx.x * {{arg.dim}} + d, gbl{{arg.id}}_local[d]); {% endfor %} {{ caller() }} @@ -351,16 +318,13 @@ __global__ void opp_dev_sr_{{lh.kernel}}( // Used for Segmented Reductions const OPP_INT end ) { - const int thread_id = threadIdx.x + blockIdx.x * blockDim.x; - - if (thread_id + start < end) { + const int n = OPP_DEVICE_GLOBAL_LINEAR_ID + start; - const int n = thread_id + start; + if (n < end) { OPP_INT *opp_p2c = (p2c_map + n); char move_flag = OPP_NEED_MOVE; bool iter_one_flag = (OPP_comm_iteration_d > 0) ? false : true; - bool on_old_cell = true; {% for arg in lh.args_expanded|dat|indirect|reduction if config.seg_red %} {% if arg is p2c_mapped and not double_indirect %} @@ -370,13 +334,12 @@ __global__ void opp_dev_sr_{{lh.kernel}}( // Used for Segmented Reductions {% endif %} {% endfor %} - do - { + do { const OPP_INT p2c = opp_p2c[0]; // get the value here, since the kernel might change it const OPP_INT* opp_c2c = c2c_map + p2c; {% for arg in lh.args_expanded|dat|indirect|reduction if config.seg_red %} - for (int d = 0; {{opt_hip_cond_comp(arg)}}d < {{lh.dat(arg).dim}}; ++d) + for (int d = 0; {{opt_device_cond_comp(arg)}}d < {{lh.dat(arg).dim}}; ++d) {% if arg is p2c_mapped and not double_indirect %} arg{{arg.id}}_p2c_local[d] = {{lh.dat(arg).typ}}_ZERO; {% else %} @@ -387,30 +350,25 @@ __global__ void opp_dev_sr_{{lh.kernel}}( // Used for Segmented Reductions opp_k{{kernel_idx}}::{{lh.kernel}}( move_flag, iter_one_flag, opp_c2c, opp_p2c, {% for arg in lh.args %} - {%+ call opt_hip_tern(arg) %}{{arg_to_pointer_hip(arg)}}{% endcall %}{{"," if not loop.last}} // {% if arg is dat %}{{lh.dat(arg).ptr}} {% endif +%} + {%+ call opt_device_tern(arg) %}{{arg_to_pointer_device(arg)}}{% endcall %}{{"," if not loop.last}} // {% if arg is dat %}{{lh.dat(arg).ptr}} {% endif +%} {% endfor %} ); {% for arg in lh.args_expanded|dat|indirect|reduction if config.seg_red %} - if (on_old_cell) - { - int offset = 0; - for (int d = 0; d < {{lh.dat(arg).dim}}; ++d, ++offset) { - sr_dat{{arg.dat_id}}_values[n + opp_k{{kernel_idx}}_sr_set_stride_d * offset] = arg{{arg.dat_id}}_p2c_local[d]; + if (iter_one_flag) { + for (int d = 0; d < {{lh.dat(arg).dim}}; ++d) { + sr_dat{{arg.dat_id}}_values[n + opp_k{{kernel_idx}}_sr_set_stride_d * d] = arg{{arg.dat_id}}_p2c_local[d]; } sr_dat{{arg.dat_id}}_keys[n] = p2c; // TODO : Generate for double indirections too! } - else - { - for (int d = 0; {{opt_hip_cond_comp(arg)}}d < {{lh.dat(arg).dim}}; ++d) + else { + for (int d = 0; {{opt_device_cond_comp(arg)}}d < {{lh.dat(arg).dim}}; ++d) {% if arg is p2c_mapped and not double_indirect %} - atomicAdd(dat{{arg.dat_id}} + {{map_lookup(arg, kernel_idx)}} + (d{{stride_hip(arg)}}), arg{{arg.id}}_p2c_local[d]); + atomicAdd(dat{{arg.dat_id}} + {{map_lookup(arg, kernel_idx)}} + (d{{stride_device(arg)}}), arg{{arg.id}}_p2c_local[d]); {% else %} - atomicAdd(dat{{arg.dat_id}} + {{map_lookup(arg, kernel_idx)}} + (d{{stride_hip(arg)}}), arg{{arg.id}}_{{arg.map_idx}}_local[d]); // TODO: this looks incorrect + atomicAdd(dat{{arg.dat_id}} + {{map_lookup(arg, kernel_idx)}} + (d{{stride_device(arg)}}), arg{{arg.id}}_{{arg.map_idx}}_local[d]); // TODO: this looks incorrect {% endif %} } - - on_old_cell = false; {% endfor %} } while (opp_part_check_status_device(move_flag, iter_one_flag, opp_p2c, n, @@ -428,8 +386,8 @@ __global__ void opp_dev_sr_{{lh.kernel}}( // Used for Segmented Reductions {% endcall %} {% endif %} -//-------------------------------------------------------------- {% if lh is indirect and lh.args|dat|indirect|reduction|length > 0 and config.seg_red %} +//-------------------------------------------------------------- {% call seg_red_kernel_wrapper() %} {% endcall %} {% endif %} @@ -460,7 +418,7 @@ void opp_particle_move__{{lh.kernel}}(opp_set set, opp_map c2c_map, opp_map p2c_ {% endfor %} args[{{lh.args|length}}] = opp_arg_dat(p2c_map->p2c_dat, OPP_RW); // required to make dirty or should manually make it dirty - const int iter_size = opp_mpi_halo_exchanges_grouped(set, nargs, args, Device_GPU); + opp_mpi_halo_exchanges_grouped(set, nargs, args, Device_GPU); {% if lh is double_indirect_reduc %} #ifdef USE_MPI @@ -474,16 +432,6 @@ void opp_particle_move__{{lh.kernel}}(opp_set set, opp_map c2c_map, opp_map p2c_ opp_mem::dev_copy_to_symbol(opp_k{{kernel_idx}}_c2c_map_stride_d, &opp_k{{kernel_idx}}_c2c_map_stride, &c2c_stride, 1); opp_mpi_halo_wait_all(nargs, args); - {% if lh.args|opt|length > 0 %} - unsigned optflags = 0; - - {% for arg in lh.args|opt %} - {% call opt_if(arg) %} - optflags |= 1 << {{lh.optIdx(arg)}}; - {% endcall %} - - {% endfor %} - {% endif %} {% for arg in lh.args|gbl %} {{arg.typ}} *arg{{arg.id}}_host_data = ({{arg.typ}} *)args[{{arg.id}}].data;{{"\n" if loop.last}} {% endfor %} @@ -523,69 +471,18 @@ void opp_particle_move__{{lh.kernel}}(opp_set set, opp_map c2c_map, opp_map p2c_ int num_blocks = 200; - do - { - {% for dat in lh.dats|soa %} - opp_mem::dev_copy_to_symbol(opp_k{{kernel_idx}}_dat{{dat.id}}_stride_d, &opp_k{{kernel_idx}}_dat{{dat.id}}_stride, &(args[{{dat.arg_id}}].dat->set->set_capacity), 1); - {% endfor %} - {% for map in lh.maps %} - opp_mem::dev_copy_to_symbol(opp_k{{kernel_idx}}_map{{map.id}}_stride_d, &opp_k{{kernel_idx}}_map{{map.id}}_stride, &(args[{{map.arg_id}}].size), 1); - {% endfor %} - - opp_init_particle_move(set, nargs, args); - opp_mem::dev_copy_to_symbol(OPP_comm_iteration_d, &OPP_comm_iteration, 1); - - {% if lh.args|gbl|reduction|length > 0 %} - {% if lh is direct %} - int max_blocks = num_blocks; - {% elif config.atomics %} - int max_blocks = (MAX(set->core_size, set->size + set->exec_size - set->core_size) - 1) / block_size + 1; - {% else %} - int max_blocks = 0; - for (int col = 0; col < plan->ncolors; ++col) - max_blocks = MAX(max_blocks, plan->ncolblk[col]); - {% endif %} - - int reduction_bytes = 0; - int reduction_size = 0; - - {% for arg in lh.args|gbl|reduction %} - {% call opt_if(arg) %} - reduction_bytes += ROUND_UP(max_blocks * {{arg.dim}} * sizeof({{arg.typ}})); - reduction_size = MAX(reduction_size, sizeof({{arg.typ}})); - {% endcall %} - {% endfor %} - - opp_reallocReductArrays(reduction_bytes); - reduction_bytes = 0; - - {% for arg in lh.args|gbl|reduction %} - {% call opt_if(arg) %} - args[{{arg.id}}].data = OPP_reduct_h + reduction_bytes; - args[{{arg.id}}].data_d = OPP_reduct_d + reduction_bytes; - - for (int b = 0; b < max_blocks; ++b) { - for (int d = 0; d < {{arg.dim}}; ++d) - (({{arg.typ}} *)args[{{arg.id}}].data)[b * {{arg.dim}} + d] = {% if arg.access_type == OP.AccessType.INC -%} - {{arg.typ}}_ZERO - {%- else -%} - arg{{arg.id}}_host_data[d] - {%- endif %}; - } - - reduction_bytes += ROUND_UP(max_blocks * {{arg.dim}} * sizeof({{arg.typ}})); - {% endcall %} - {% endfor %} - - opp_mvReductArraysToDevice(reduction_bytes); - {% endif %} + opp_init_particle_move(set, nargs, args); {% endblock %} {% macro kernel_call(shared_size) %} opp_dev_{{lh.kernel}}<< 0}}>>>( {% for dat in lh.dats %} + {% if dat is indirect_reduction(lh) and lh is particle_loop %} + ({{dat.typ}}**)args[{{dat.arg_id}}].dat->thread_data_d, array_count, // {{dat.ptr}} + {% else %} ({{dat.typ}} *)args[{{dat.arg_id}}].data_d, // {{dat.ptr}} + {% endif %} {% endfor %} (OPP_INT *)args[{{lh.args|length}}].data_d, // p2c_map (OPP_INT *)c2c_map->map_d, // c2c_map @@ -636,144 +533,145 @@ opp_dev_sr_{{lh.kernel}}<<initGlobalMove(); + opp_init_dh_device(set); +#endif + opp_profiler->start("GblMv_Move"); + + opp_mem::dev_copy_to_symbol(cellMapper_pos_stride_d, &cellMapper_pos_stride, &(args[0].dat->set->set_capacity), 1); + opp_mem::dev_copy_to_symbol(OPP_rank_d, &OPP_rank, 1); + + hipError_t err1 = hipMemcpyToSymbol(HIP_SYMBOL(opp_minSavedDHGrid_d), opp_minSavedDHGrid, 3 * sizeof(size_t)); + if (err1 != hipSuccess) { + throw std::runtime_error(std::string("dev_copy_to_symbol: ") + hipGetErrorString(err1)); + } + hipError_t err2 = hipMemcpyToSymbol(HIP_SYMBOL(opp_maxSavedDHGrid_d), opp_maxSavedDHGrid, 3 * sizeof(size_t)); + if (err2 != hipSuccess) { + throw std::runtime_error(std::string("dev_copy_to_symbol: ") + hipGetErrorString(err2)); + } + + // check whether particles need to be moved via the global move routine num_blocks = (OPP_iter_end - OPP_iter_start - 1) / block_size + 1; - {% endif %} + opp_dev_checkForGlobalMove{{lh.dat(lh.args[0]).dim}}D_kernel<<>>( + (OPP_REAL*)args[0].data_d, // {{lh.dat(lh.args[0]).ptr}} + (OPP_INT *)args[{{lh.args|length}}].data_d, // p2c_map + cellMapper->structMeshToCellMapping_d, + cellMapper->structMeshToRankMapping_d, + cellMapper->oneOverGridSpacing_d, + cellMapper->minGlbCoordinate_d, + cellMapper->globalGridDims_d, + cellMapper->globalGridSize_d, + set->particle_remove_count_d, + OPP_remove_particle_indices_d, + dh_indices_d.part_indices, + dh_indices_d.cell_indices, + dh_indices_d.rank_indices, + dh_indices_d.move_count, + OPP_iter_start, OPP_iter_end + ); + OPP_DEVICE_SYNCHRONIZE(); + + opp_profiler->end("GblMv_Move"); + +#ifdef USE_MPI + opp_gather_dh_move_indices(set); + globalMover->communicate(set); +#endif + } +{%- endmacro %} - {% if lh is indirect and lh.args|dat|indirect|reduction|length > 0 and config.atomics and config.seg_red %} - if (!opp_params->get("use_reg_red")) // Do atomics ---------- - {% endif %} - {% if lh is direct or config.atomics %} - { +{%- macro dh_finalize() %} +#ifdef USE_MPI + // ---------------------------------------------------------------------------- + // finalize the global move routine and iterate over newly added particles and check whether they need neighbour comm + if (useGlobalMove) { + + opp_profiler->start("GblMv_finalize"); + const int finalized = globalMover->finalize(set); + opp_profiler->end("GblMv_finalize"); + + if (finalized > 0) { + opp_profiler->start("GblMv_AllMv"); + + // need to change arg data since particle resize in globalMover::finalize could change the pointer in dat->data + for (int i = 0; i < nargs; i++) + if (args[i].argtype == OPP_ARG_DAT && args[i].dat->set->is_particle) + args[i].data_d = args[i].dat->data_d; + + // check whether the new particle is within cell, and if not move between cells within the MPI rank, + // mark for neighbour comm. Do only for the globally moved particles + const int start2 = (set->size - set->diff); + const int end2 = set->size; + num_blocks = (end2 - start2 - 1) / block_size + 1; + + {% for dat in lh.dats|soa %} + opp_mem::dev_copy_to_symbol(opp_k{{kernel_idx}}_dat{{dat.id}}_stride_d, &opp_k{{kernel_idx}}_dat{{dat.id}}_stride, &(args[{{dat.arg_id}}].dat->set->set_capacity), 1); + {% endfor %} + {% for map in lh.maps %} + opp_mem::dev_copy_to_symbol(opp_k{{kernel_idx}}_map{{map.id}}_stride_d, &opp_k{{kernel_idx}}_map{{map.id}}_stride, &(args[{{map.arg_id}}].size), 1); + {% endfor %} + + opp_profiler->start("move_kernel_only"); {{kernel_call("(reduction_size * block_size)", - "OPP_iter_start", "OPP_iter_end")|indent(12)}} + "start2", "end2")|indent(12)}} + OPP_DEVICE_SYNCHRONIZE(); + opp_profiler->end("move_kernel_only"); + + opp_profiler->end("GblMv_AllMv"); } - {% endif %} - {% if lh is indirect and lh.args|dat|indirect|reduction|length > 0 and config.atomics and config.seg_red %} - else // Do segmented reductions ---------- - {% endif %} - {% if lh is indirect and lh.args|dat|indirect|reduction|length > 0 and config.seg_red %} - { - opp_mem::dev_copy_to_symbol(opp_k{{kernel_idx}}_sr_set_stride_d, &opp_k{{kernel_idx}}_sr_set_stride, &set->size, 1); + } +#endif +{%- endmacro %} - {% for arg in lh.args|dat|indirect|reduction|not_already_mapped(lh) %} - size_t operating_size_dat{{arg.dat_id}} = 0, resize_size_dat{{arg.dat_id}} = 0; - {% endfor %} - {% for arg in lh.args|dat|indirect|reduction %} - operating_size_dat{{arg.dat_id}} += (size_t)1; - resize_size_dat{{arg.dat_id}} += (size_t)1; - {% endfor %} +{%- macro generate_storage_for_global_reductions() -%} + {% if lh.args|gbl|reduction|length > 0 %} + {% if lh is direct %} + int max_blocks = num_blocks; + {% elif config.atomics %} + int max_blocks = (MAX(set->core_size, set->size + set->exec_size - set->core_size) - 1) / block_size + 1; + {% endif %} - {% for arg in lh.args|dat|indirect|reduction|not_already_mapped(lh) %} - operating_size_dat{{arg.dat_id}} *= (size_t)(set->size); - resize_size_dat{{arg.dat_id}} *= (size_t)(set->set_capacity); - {% endfor %} + int reduction_bytes = 0; + int reduction_size = 0; - // Resize the key/value device arrays only if current vector is small - opp_profiler->start("SRM_Resize"); - {% for arg in lh.args|dat|indirect|reduction|not_already_mapped(lh) %} - if (resize_size_dat{{arg.dat_id}} > sr_dat{{arg.dat_id}}_keys_dv.size()) { - sr_dat{{arg.dat_id}}_keys_dv.resize(resize_size_dat{{arg.dat_id}}, 0); - sr_dat{{arg.dat_id}}_keys_dv2.resize(resize_size_dat{{arg.dat_id}}, 0); - sr_dat{{arg.dat_id}}_values_dv.resize(resize_size_dat{{arg.dat_id}} * (args[{{arg.id}}].dat->dim), 0); - sr_dat{{arg.dat_id}}_values_dv2.resize(resize_size_dat{{arg.dat_id}} * (args[{{arg.id}}].dat->dim), 0); - } + {% for arg in lh.args|gbl|reduction %} + {% call opt_if(arg) %} + reduction_bytes += ROUND_UP(max_blocks * {{arg.dim}} * sizeof({{arg.typ}})); + reduction_size = MAX(reduction_size, sizeof({{arg.typ}})); + {% endcall %} {% endfor %} - opp_profiler->end("SRM_Resize"); - // Reset the key/value device arrays - opp_profiler->start("SRM_Init"); - {% for arg in lh.args|dat|indirect|reduction|not_already_mapped(lh) %} - opp_k{{kernel_idx}}::reset_OPP_INT_values<<>>( - opp_get_dev_raw_ptr(sr_dat{{arg.dat_id}}_keys_dv), 0, sr_dat{{arg.dat_id}}_keys_dv.size()); - opp_k{{kernel_idx}}::sequence_OPP_INT_values<<>>( - opp_get_dev_raw_ptr(sr_dat{{arg.dat_id}}_keys_dv2), 0, sr_dat{{arg.dat_id}}_keys_dv2.size()); - - const int num_blocks2 = (sr_dat{{arg.dat_id}}_values_dv.size() - 1) / block_size + 1; - opp_k{{kernel_idx}}::reset_{{lh.dat(arg).typ}}_values<<>>( - opp_get_dev_raw_ptr<{{lh.dat(arg).typ}}>(sr_dat{{arg.dat_id}}_values_dv), 0, sr_dat{{arg.dat_id}}_values_dv.size()); - // opp_k{{kernel_idx}}::reset_{{lh.dat(arg).typ}}_values<<>>( - // opp_get_dev_raw_ptr<{{lh.dat(arg).typ}}>(sr_dat{{arg.dat_id}}_values_dv2), 0, sr_dat{{arg.dat_id}}_values_dv2.size()); - OPP_DEVICE_SYNCHRONIZE(); - {% endfor %} - opp_profiler->end("SRM_Init"); + opp_reallocReductArrays(reduction_bytes); + reduction_bytes = 0; - // Create key/value pairs - opp_profiler->start("SRM_CrKeyVal"); - {{seg_red_kernel_call("(reduction_size * block_size)", - "OPP_iter_start", "OPP_iter_end")|indent(12)}} - OPP_DEVICE_SYNCHRONIZE(); - opp_profiler->end("SRM_CrKeyVal"); + {% for arg in lh.args|gbl|reduction %} + {% call opt_if(arg) %} + args[{{arg.id}}].data = OPP_reduct_h + reduction_bytes; + args[{{arg.id}}].data_d = OPP_reduct_d + reduction_bytes; - {% for arg in lh.args|dat|indirect|reduction|not_already_mapped(lh) %} - // Sort by keys to bring the identical keys together and store the order in sr_dat{{arg.dat_id}}_keys_dv2 - opp_profiler->start("SRM_SortByKey"); - thrust::sort_by_key(thrust::device, - sr_dat{{arg.dat_id}}_keys_dv.begin(), sr_dat{{arg.dat_id}}_keys_dv.begin() + operating_size_dat{{arg.dat_id}}, - sr_dat{{arg.dat_id}}_keys_dv2.begin()); - opp_profiler->end("SRM_SortByKey"); - - // Sort values according to sr_dat{{arg.dat_id}}_keys_dv2 - opp_profiler->start("SRM_AssignByKey"); - opp_k{{kernel_idx}}::assign_values_by_key<<>>( - opp_get_dev_raw_ptr(sr_dat{{arg.dat_id}}_keys_dv2), - opp_get_dev_raw_ptr<{{lh.dat(arg).typ}}>(sr_dat{{arg.dat_id}}_values_dv), - opp_get_dev_raw_ptr<{{lh.dat(arg).typ}}>(sr_dat{{arg.dat_id}}_values_dv2), - 0, operating_size_dat{{arg.dat_id}}, {{lh.dat(arg).dim}}); - OPP_DEVICE_SYNCHRONIZE(); - opp_profiler->end("SRM_AssignByKey"); - - // Compute the unique keys and their corresponding values - opp_profiler->start("SRM_RedByKey"); - auto new_end = thrust::reduce_by_key(thrust::device, - sr_dat{{arg.dat_id}}_keys_dv.begin(), sr_dat{{arg.dat_id}}_keys_dv.begin() + operating_size_dat{{arg.dat_id}}, - sr_dat{{arg.dat_id}}_values_dv2.begin(), - sr_dat{{arg.dat_id}}_keys_dv2.begin(), - sr_dat{{arg.dat_id}}_values_dv.begin()); - const size_t reduced_size = (new_end.first - sr_dat{{arg.dat_id}}_keys_dv2.begin()); - - {% if lh.dat(arg).dim > 1 %} - for (int d = 1; d < {{lh.dat(arg).dim}}; ++d) { - auto new_end = thrust::reduce_by_key(thrust::device, - sr_dat{{arg.dat_id}}_keys_dv.begin(), sr_dat{{arg.dat_id}}_keys_dv.begin() + operating_size_dat{{arg.dat_id}}, - sr_dat{{arg.dat_id}}_values_dv2.begin() + d * opp_k{{kernel_idx}}_sr_set_stride, - thrust::make_discard_iterator(), sr_dat{{arg.dat_id}}_values_dv.begin() + d * opp_k{{kernel_idx}}_sr_set_stride); - } - {% endif %} - opp_profiler->end("SRM_RedByKey"); - - // Assign reduced values to the nodes using keys/values - opp_profiler->start("SRM_Assign"); - num_blocks = reduced_size / block_size + 1; - for (int d = 0; d < {{lh.dat(arg).dim}}; ++d) { // Could invoke the kernel once and have all dims updated with that - opp_k{{kernel_idx}}::assign_values<<>> ( - opp_get_dev_raw_ptr(sr_dat{{arg.dat_id}}_keys_dv2), - (opp_get_dev_raw_ptr<{{lh.dat(arg).typ}}>(sr_dat{{arg.dat_id}}_values_dv) + d * opp_k{{kernel_idx}}_sr_set_stride), - (({{lh.dat(arg).typ}} *) args[{{arg.dat_id}}].data_d) + d * opp_k{{kernel_idx}}_dat{{arg.dat_id}}_stride, - 0, reduced_size); - } - OPP_DEVICE_SYNCHRONIZE(); - opp_profiler->end("SRM_Assign"); - - // Last: clear the thrust vectors if this is the last iteration (avoid crash) - opp_profiler->start("SRM_Clear"); - if (opp_params->get("num_steps") == (OPP_main_loop_iter + 1)) { - OPP_DEVICE_SYNCHRONIZE(); - sr_dat{{arg.dat_id}}_values_dv.clear(); sr_dat{{arg.dat_id}}_values_dv.shrink_to_fit(); - sr_dat{{arg.dat_id}}_keys_dv.clear(); sr_dat{{arg.dat_id}}_keys_dv.shrink_to_fit(); - } - opp_profiler->end("SRM_Clear"); - {% endfor %} - } - {% endif %} + for (int b = 0; b < max_blocks; ++b) { + for (int d = 0; d < {{arg.dim}}; ++d) + (({{arg.typ}} *)args[{{arg.id}}].data)[b * {{arg.dim}} + d] = {% if arg.access_type == OP.AccessType.INC -%} + {{arg.typ}}_ZERO + {%- else -%} + arg{{arg.id}}_host_data[d] + {%- endif %}; + } - } while (opp_finalize_particle_move(set)); + reduction_bytes += ROUND_UP(max_blocks * {{arg.dim}} * sizeof({{arg.typ}})); + {% endcall %} + {% endfor %} -{% endblock %} + opp_mvReductArraysToDevice(reduction_bytes); + {% endif %} +{%- endmacro %} -{% block host_epilogue %} +{%- macro reduce_global_reductions() -%} {% if lh.args|gbl|read_write|length > 0 or lh.args|gbl|write|length > 0 %} mvConstArraysToHost(const_bytes); @@ -804,6 +702,145 @@ opp_dev_sr_{{lh.kernel}}<< 0 and lh is particle_loop %} + const int array_count = opp_params->get("gpu_reduction_arrays"); + {% endif %} + {% if lh is indirect and lh.args|dat|indirect|reduction|length > 0 and config.atomics %} + if (!opp_use_segmented_reductions) { + {% for dat in lh.dats %} + {% if dat is indirect_reduction(lh) and lh is particle_loop %} + opp_create_thread_level_data<{{dat.typ}}>(args[{{dat.arg_id}}]); + {% endif %} + {% endfor %} + } + {% endif %} +{%- endmacro %} + +{%- macro reduce_particle_indirect_reduction_arrays() -%} + {% for dat in lh.dats %} + {% if dat is indirect_reduction(lh) and lh is particle_loop %} + opp_reduce_thread_level_data<{{dat.typ}}>(args[{{dat.id}}]); + {% endif %} + {% endfor %} +{%- endmacro %} + +{% macro multihop_move(pre) %} + opp_mem::dev_copy_to_symbol(OPP_comm_iteration_d, &OPP_comm_iteration, 1); + num_blocks = (OPP_iter_end - OPP_iter_start - 1) / block_size + 1; + + {% for dat in lh.dats|soa %} + opp_mem::dev_copy_to_symbol(opp_k{{kernel_idx}}_dat{{dat.id}}_stride_d, &opp_k{{kernel_idx}}_dat{{dat.id}}_stride, &(args[{{dat.arg_id}}].dat->set->set_capacity), 1); + {% endfor %} + {% for map in lh.maps %} + opp_mem::dev_copy_to_symbol(opp_k{{kernel_idx}}_map{{map.id}}_stride_d, &opp_k{{kernel_idx}}_map{{map.id}}_stride, &(args[{{map.arg_id}}].size), 1); + {% endfor %} + + {% if lh is indirect and lh.args|dat|indirect|reduction|length > 0 and config.atomics and config.seg_red %} + if (!opp_use_segmented_reductions) // Do atomics ---------- + {% endif %} + {% if lh is direct or config.atomics %} + { + opp_profiler->start("move_kernel_only"); + {{kernel_call("(reduction_size * block_size)", + "OPP_iter_start", "OPP_iter_end")|indent(8)}} + OPP_DEVICE_SYNCHRONIZE(); + opp_profiler->end("move_kernel_only"); + } + {% endif %} + {% if lh is indirect and lh.args|dat|indirect|reduction|length > 0 and config.atomics and config.seg_red %} + else // Do segmented reductions ---------- + {% endif %} + {% if lh is indirect and lh.args|dat|indirect|reduction|length > 0 and config.seg_red %} + { + {{pre}}opp_mem::dev_copy_to_symbol(opp_k{{kernel_idx}}_sr_set_stride_d, &opp_k{{kernel_idx}}_sr_set_stride, &set->size, 1); + + {% for arg in lh.args|dat|indirect|reduction|not_already_mapped(lh) %} + {{pre}}size_t operating_size_dat{{arg.dat_id}} = 0, resize_size_dat{{arg.dat_id}} = 0; + {% endfor %} + + {% for arg in lh.args|dat|indirect|reduction %} + {{pre}}operating_size_dat{{arg.dat_id}} += (size_t)1; + {{pre}}resize_size_dat{{arg.dat_id}} += (size_t)1; + {% endfor %} + + {% for arg in lh.args|dat|indirect|reduction|not_already_mapped(lh) %} + {{pre}}operating_size_dat{{arg.dat_id}} *= (size_t)(OPP_iter_end - OPP_iter_start); + {{pre}}resize_size_dat{{arg.dat_id}} *= (size_t)(set->set_capacity); + {% endfor %} + + {% for arg in lh.args|dat|indirect|reduction|not_already_mapped(lh) %} + {{pre}}opp_sr::init_arrays<{{lh.dat(arg).typ}}>(args[{{arg.id}}].dat->dim, operating_size_dat{{arg.dat_id}}, resize_size_dat{{arg.dat_id}}, + {{pre}} sr_dat{{arg.dat_id}}_keys_dv, sr_dat{{arg.dat_id}}_values_dv, sr_dat{{arg.dat_id}}_keys_dv2, sr_dat{{arg.dat_id}}_values_dv2); + {% endfor %} + + // Create key/value pairs + opp_profiler->start("SRM_CrKeyVal"); + {{seg_red_kernel_call("(reduction_size * block_size)", + "OPP_iter_start", "OPP_iter_end")|indent(8)}} + OPP_DEVICE_SYNCHRONIZE(); + opp_profiler->end("SRM_CrKeyVal"); + + {% for arg in lh.args|dat|indirect|reduction|not_already_mapped(lh) %} + {{pre}}opp_sr::do_segmented_reductions<{{lh.dat(arg).typ}}>(args[{{arg.dat_id}}], (OPP_iter_end - OPP_iter_start), + {{pre}} sr_dat{{arg.dat_id}}_keys_dv, sr_dat{{arg.dat_id}}_values_dv, sr_dat{{arg.dat_id}}_keys_dv2, sr_dat{{arg.dat_id}}_values_dv2); + {% endfor %} + } + {%- endif %} +{% endmacro %} + +{% block host_loop %} + {% if lh.dh_loop_required %} +{{ dh_check_for_global_move() }} + + {% endif%} +{{ generate_storage_for_global_reductions() }} +{{ create_arrays_for_particle_indirect_reductions() }} + + opp_profiler->start("Mv_AllMv0"); + {% if lh.dh_loop_required %} + // ---------------------------------------------------------------------------- + // check whether all particles not marked for global comm is within cell, + // and if not mark to move between cells within the MPI rank, mark for neighbour comm + {% else %} + // ---------------------------------------------------------------------------- + // Multi-hop move particles within current MPI rank and if not mark for neighbour comm + {% endif %} + +{{ multihop_move("") }} + + opp_profiler->end("Mv_AllMv0"); + {% if lh.dh_loop_required %} +{{ dh_finalize() }} + {% endif%} + + // ---------------------------------------------------------------------------- + // Do neighbour communication and if atleast one particle is received by the currect rank, + // then iterate over the newly added particles + while (opp_finalize_particle_move(set)) { + + opp_init_particle_move(set, nargs, args); + + {{ multihop_move("// ")|indent(4) }} + } + +{{ reduce_global_reductions() }} + {%- if lh is indirect and lh.args|dat|indirect|reduction|length > 0 %} + if (!opp_use_segmented_reductions) { + {{ reduce_particle_indirect_reduction_arrays() }} + } + else { + {% for arg in lh.args|dat|indirect|reduction|not_already_mapped(lh) if config.seg_red %} + opp_sr::clear_arrays<{{lh.dat(arg).typ}}>(sr_dat{{arg.dat_id}}_keys_dv, sr_dat{{arg.dat_id}}_values_dv, sr_dat{{arg.dat_id}}_keys_dv2, sr_dat{{arg.dat_id}}_values_dv2); + {% endfor %} + } + + {% endif %} +{% endblock %} + +{% block host_epilogue %} opp_set_dirtybit_grouped(nargs, args, Device_GPU); OPP_DEVICE_SYNCHRONIZE(); {% if lh is double_indirect_reduc %} @@ -818,6 +855,7 @@ opp_dev_sr_{{lh.kernel}}<<start("Setup_Mover"); - + + useGlobalMove = opp_params->get("opp_global_move"); + + if (OPP_DBG) opp_printf("opp_init_direct_hop_cg", "START useGlobalMove=%s", useGlobalMove ? "YES" : "NO"); + + if (useGlobalMove) { + + const int nargs = {{lh.args|length + 1}}; + opp_arg args[nargs]; + + {% for arg in lh.args %} + args[{{loop.index0}}] = {{arg_dat_redef(arg) if lh.args[arg.id] is vec else "arg%d" % arg.id}}; + {% endfor %} + args[{{lh.args|length}}] = opp_arg_dat(p2c_map->p2c_dat, OPP_RW); // required to make dirty or should manually make it dirty + +#ifdef USE_MPI + opp_mpi_halo_exchanges_grouped(c_gbl_id->set, nargs, args, Device_CPU); + + comm = std::make_shared(MPI_COMM_WORLD); + globalMover = std::make_unique(comm->comm_parent); + + opp_mpi_halo_wait_all(nargs, args); +#endif + + boundingBox = std::make_shared(b_box); + cellMapper = std::make_shared(boundingBox, grid_spacing, comm); + + const int c_set_size = c_gbl_id->set->size; + + // lambda function for dh mesh search loop + auto all_cell_checker = [&](const opp_point& point, int& cid) { + {% for arg in lh.args|dat %} + {% if arg.access_type.value in [1, 3, 4, 5, 2] %} + // we dont want to change the original arrays during dh mesh generation, hence duplicate except OPP_READ + {{lh.dat(arg).typ}} arg{{arg.id}}_temp[{{lh.dat(arg).dim}}]; + {%- endif %} + {%- endfor +%} + for (int ci = 0; ci < c_set_size; ++ci) { + opp_move_status_flag = OPP_NEED_MOVE; + opp_move_hop_iter_one_flag = true; + + int temp_ci = ci; // we dont want to get iterating ci changed within the kernel, hence get a copy + + opp_p2c = &(temp_ci); + opp_c2c = &((c2c_map->map)[temp_ci * {{lh.c2c_map.dim}}]); + {% if lh is indirect %} + {% for map in lh.maps %} + const OPP_INT *map{{map.id}} = args[{{map.arg_id}}].map_data + (temp_ci * {{map.dim}}); + {%- endfor %} + {%- endif %} + {%- for arg in lh.args|dat %} + {% if arg.access_type.value == 2 %} + + {%- if arg is double_indirect -%} + {%- set offset = " + (map%s[%d] * %d)" % (arg.map_id, arg.map_idx, lh.dat(arg).dim) -%} + {%- elif arg is indirect -%} + {%- set offset = " + (temp_ci * %d)" % (lh.dat(arg).dim) -%} + {%- endif %} + // arg{{arg.id}} is OPP_RW, hence get a copy just incase + std::memcpy(&arg{{arg.id}}_temp, ({{lh.dat(arg).typ}} *)args[{{arg.id}}].data{{offset}}, (sizeof({{lh.dat(arg).typ}}) * {{lh.dat(arg).dim}})); + {% endif %} + {%- endfor %} + + opp_k{{kernel_idx}}::host::{{lh.kernel}}( + {% for arg in lh.args %} + {{arg_to_pointer_dh(arg,lh)}} + {% endfor %} + ); + if (opp_move_status_flag == OPP_MOVE_DONE) { + cid = temp_ci; + break; + } + } + }; + + if (opp_params->get("opp_dh_data_generate")) { + cellMapper->generateStructuredMesh(c_gbl_id->set, c_gbl_id, all_cell_checker); + } + else { + cellMapper->generateStructuredMeshFromFile(c_gbl_id->set, c_gbl_id); + } + } + opp_profiler->end("Setup_Mover"); } + {% endif %} {% endblock %} \ No newline at end of file diff --git a/scripts/batch/advection/archer2/config.param b/scripts/batch/advection/archer2/config.param index 75c74925..b5fd20cf 100644 --- a/scripts/batch/advection/archer2/config.param +++ b/scripts/batch/advection/archer2/config.param @@ -15,6 +15,7 @@ BOOL opp_profile_all = false INT opp_gpu_direct = 0 BOOL opp_global_move = true REAL grid_spacing = 0.5 +BOOL opp_segmented_red = false # HoleFill_All Sort_All Shuffle_All Sort_Periodic Shuffle_Periodic STRING opp_fill = Shuffle_Periodic diff --git a/scripts/batch/advection/avon/config.param b/scripts/batch/advection/avon/config.param index a7c1de43..b5fd20cf 100644 --- a/scripts/batch/advection/avon/config.param +++ b/scripts/batch/advection/avon/config.param @@ -1,15 +1,22 @@ # Simulation parameters INT max_iter = 250 -INT nx = 32 -INT ny = 32 -REAL dt = 0.1 +INT nx = 1024 +INT ny = 1024 +REAL dt = 0.5 REAL cell_width = 0.5 -INT n_particles = 6000000 - +INT npart_per_cell = 100 +BOOL verify_particles = false # System parameters BOOL opp_auto_sort = false -INT opp_allocation_multiple = 1 +REAL opp_allocation_multiple = 1.1 INT opp_threads_per_block = 512 BOOL opp_profile_all = false -INT \ No newline at end of file +INT opp_gpu_direct = 0 +BOOL opp_global_move = true +REAL grid_spacing = 0.5 +BOOL opp_segmented_red = false + +# HoleFill_All Sort_All Shuffle_All Sort_Periodic Shuffle_Periodic +STRING opp_fill = Shuffle_Periodic +INT opp_fill_period = 4 \ No newline at end of file diff --git a/scripts/batch/advection/pan/config.param b/scripts/batch/advection/pan/config.param index 75c74925..b5fd20cf 100644 --- a/scripts/batch/advection/pan/config.param +++ b/scripts/batch/advection/pan/config.param @@ -15,6 +15,7 @@ BOOL opp_profile_all = false INT opp_gpu_direct = 0 BOOL opp_global_move = true REAL grid_spacing = 0.5 +BOOL opp_segmented_red = false # HoleFill_All Sort_All Shuffle_All Sort_Periodic Shuffle_Periodic STRING opp_fill = Shuffle_Periodic diff --git a/scripts/batch/cabana/archer2/cabana.param b/scripts/batch/cabana/archer2/cabana.param index 183020ca..5dcf904d 100644 --- a/scripts/batch/cabana/archer2/cabana.param +++ b/scripts/batch/cabana/archer2/cabana.param @@ -28,7 +28,7 @@ STRING opp_fill = HoleFill_All INT opp_fill_period = 4 BOOL opp_global_move = false -BOOL use_reg_red = false +BOOL opp_segmented_red = false STRING hdf_filename = /home/zl/phd/neso_test/OP-PIC_Configs/cabana/mesh_files/cab_48000.hdf5 INT domain_expansion = 1 diff --git a/scripts/batch/cabana/bede/cabana.param b/scripts/batch/cabana/bede/cabana.param index 62770eb2..7c9b3a97 100644 --- a/scripts/batch/cabana/bede/cabana.param +++ b/scripts/batch/cabana/bede/cabana.param @@ -28,7 +28,7 @@ STRING opp_fill = HoleFill_All INT opp_fill_period = 4 BOOL opp_global_move = false -BOOL use_reg_red = false +BOOL opp_segmented_red = false STRING hdf_filename = /home/zl/phd/neso_test/OP-PIC_Configs/cabana/mesh_files/cab_48000.hdf5 INT domain_expansion = 1 diff --git a/scripts/batch/cabana/cosma/cabana.param b/scripts/batch/cabana/cosma/cabana.param index 0bfe6b27..61c43487 100644 --- a/scripts/batch/cabana/cosma/cabana.param +++ b/scripts/batch/cabana/cosma/cabana.param @@ -28,7 +28,7 @@ STRING opp_fill = HoleFill_All INT opp_fill_period = 4 BOOL opp_global_move = false -BOOL use_reg_red = false +BOOL opp_segmented_red = false INT gpu_reduction_arrays = 16 STRING hdf_filename = /home/zl/phd/neso_test/OP-PIC_Configs/cabana/mesh_files/cab_48000.hdf5 diff --git a/scripts/batch/cabana/lumi/cabana.param b/scripts/batch/cabana/lumi/cabana.param index 45463fb2..5f116dcb 100644 --- a/scripts/batch/cabana/lumi/cabana.param +++ b/scripts/batch/cabana/lumi/cabana.param @@ -28,7 +28,7 @@ STRING opp_fill = HoleFill_All INT opp_fill_period = 4 BOOL opp_global_move = false -BOOL use_reg_red = false +BOOL opp_segmented_red = false INT domain_expansion = -1 diff --git a/scripts/batch/fempic/archer2/box_archer.param b/scripts/batch/fempic/archer2/box_archer.param index 696fc390..68d71249 100644 --- a/scripts/batch/fempic/archer2/box_archer.param +++ b/scripts/batch/fempic/archer2/box_archer.param @@ -36,7 +36,7 @@ BOOL opp_global_move = false # block k-means mpi-block STRING cluster = block -BOOL use_reg_red = false +BOOL opp_segmented_red = false # HoleFill_All Sort_All Shuffle_All Sort_Periodic Shuffle_Periodic STRING opp_fill = Shuffle_Periodic diff --git a/scripts/batch/fempic/avon/box_avon.param b/scripts/batch/fempic/avon/box_avon.param index 495822b0..c55e9976 100644 --- a/scripts/batch/fempic/avon/box_avon.param +++ b/scripts/batch/fempic/avon/box_avon.param @@ -38,7 +38,7 @@ BOOL opp_global_move = false # block k-means mpi-block STRING cluster = block -BOOL use_reg_red = true +BOOL opp_segmented_red = true INT sed_red_block = 100000000 # HoleFill_All Sort_All Shuffle_All Sort_Periodic Shuffle_Periodic diff --git a/scripts/batch/fempic/bede/box_bede.param b/scripts/batch/fempic/bede/box_bede.param index bba91f81..226dd17c 100644 --- a/scripts/batch/fempic/bede/box_bede.param +++ b/scripts/batch/fempic/bede/box_bede.param @@ -32,7 +32,7 @@ INT opp_gpu_direct = 0 # block k-means STRING cluster = block -BOOL use_reg_red = false +BOOL opp_segmented_red = false # HoleFill_All Sort_All Shuffle_All Sort_Periodic Shuffle_Periodic STRING opp_fill = Shuffle_Periodic diff --git a/scripts/batch/fempic/cosma/coarse.param b/scripts/batch/fempic/cosma/coarse.param index f4335908..fa6940f3 100644 --- a/scripts/batch/fempic/cosma/coarse.param +++ b/scripts/batch/fempic/cosma/coarse.param @@ -38,7 +38,7 @@ BOOL opp_dh_data_generate = false # block k-means mpi-block STRING cluster = block -BOOL use_reg_red = false +BOOL opp_segmented_red = false INT gpu_reduction_arrays = 16 # HoleFill_All Sort_All Shuffle_All Sort_Periodic Shuffle_Periodic diff --git a/scripts/batch/fempic/cosma/fempic_mpi_IntelMAX1100_hdf5.sh b/scripts/batch/fempic/cosma/fempic_mpi_IntelMAX1100_hdf5.sh index b5ba0a8b..130b67f1 100755 --- a/scripts/batch/fempic/cosma/fempic_mpi_IntelMAX1100_hdf5.sh +++ b/scripts/batch/fempic/cosma/fempic_mpi_IntelMAX1100_hdf5.sh @@ -49,7 +49,7 @@ for gpus in 2; do sed -i "s|STRING hdf_filename = /box_48000.hdf5|STRING hdf_filename = /cosma/home/do018/dc-lant1/phd/Artifacts/mesh_files/box_${actual_config}.hdf5|" ${currentfilename} if [ "$use_seg_red" -eq 1 ]; then - sed -i "s/BOOL use_reg_red = false/BOOL use_reg_red = true/" ${currentfilename} + sed -i "s/BOOL opp_segmented_red = false/BOOL opp_segmented_red = true/" ${currentfilename} fi if [ "$use_hole_fill" -eq 1 ]; then sed -i "s/STRING opp_fill = Shuffle_Periodic/STRING opp_fill = HoleFill_All/" ${currentfilename} diff --git a/scripts/batch/fempic/cosma/fempic_mpi_MI300X_dat.sh b/scripts/batch/fempic/cosma/fempic_mpi_MI300X_dat.sh index 9f0f13b3..d7b73a76 100755 --- a/scripts/batch/fempic/cosma/fempic_mpi_MI300X_dat.sh +++ b/scripts/batch/fempic/cosma/fempic_mpi_MI300X_dat.sh @@ -56,7 +56,7 @@ for config in 48000 96000 192000; do sed -i "s|STRING wall_mesh = /wall.dat|STRING wall_mesh = /cosma/home/do018/dc-lant1/phd/box_mesh_gen/${actual_config}/wall.dat|" ${currentfilename} if [ "$use_seg_red" -eq 1 ]; then - sed -i "s/BOOL use_reg_red = false/BOOL use_reg_red = true/" ${currentfilename} + sed -i "s/BOOL opp_segmented_red = false/BOOL opp_segmented_red = true/" ${currentfilename} fi if [ "$use_hole_fill" -eq 1 ]; then sed -i "s/STRING opp_fill = Shuffle_Periodic/STRING opp_fill = HoleFill_All/" ${currentfilename} diff --git a/scripts/batch/fempic/cosma/fempic_mpi_MI300X_hdf5.sh b/scripts/batch/fempic/cosma/fempic_mpi_MI300X_hdf5.sh index 49efa125..b5af8f6c 100755 --- a/scripts/batch/fempic/cosma/fempic_mpi_MI300X_hdf5.sh +++ b/scripts/batch/fempic/cosma/fempic_mpi_MI300X_hdf5.sh @@ -49,7 +49,7 @@ for gpus in 8 4 2 1; do sed -i "s|STRING hdf_filename = /box_48000.hdf5|STRING hdf_filename = /cosma/home/do018/dc-lant1/phd/Artifacts/mesh_files/box_${actual_config}.hdf5|" ${currentfilename} if [ "$use_seg_red" -eq 1 ]; then - sed -i "s/BOOL use_reg_red = false/BOOL use_reg_red = true/" ${currentfilename} + sed -i "s/BOOL opp_segmented_red = false/BOOL opp_segmented_red = true/" ${currentfilename} fi if [ "$use_hole_fill" -eq 1 ]; then sed -i "s/STRING opp_fill = Shuffle_Periodic/STRING opp_fill = HoleFill_All/" ${currentfilename} diff --git a/scripts/batch/fempic/lumi/box_fempic.param b/scripts/batch/fempic/lumi/box_fempic.param index 9aa203e6..4c0ad587 100644 --- a/scripts/batch/fempic/lumi/box_fempic.param +++ b/scripts/batch/fempic/lumi/box_fempic.param @@ -32,7 +32,7 @@ BOOL opp_global_move = false # block k-means mpi-block STRING cluster = block -BOOL use_reg_red = false +BOOL opp_segmented_red = false # HoleFill_All Sort_All Shuffle_All Sort_Periodic Shuffle_Periodic STRING opp_fill = Shuffle_Periodic diff --git a/scripts/batch/fempic/lumi/fempic_energy10.sh b/scripts/batch/fempic/lumi/fempic_energy10.sh index cd555b01..fdbec606 100755 --- a/scripts/batch/fempic/lumi/fempic_energy10.sh +++ b/scripts/batch/fempic/lumi/fempic_energy10.sh @@ -76,7 +76,7 @@ for run in 1 2 3; do echo "RUNNING -> 1e18 On "$totalGPUs" GPUs with unsafe atomics" srun --cpu-bind=${CPU_BIND} ${binary} ${currentfilename} | tee $folder/log_N${num_nodes}_G${totalGPUs}_C${config}_D10_UA_R${run}.log; - sed -i "s/BOOL use_reg_red = false/BOOL use_reg_red = true/" ${currentfilename} + sed -i "s/BOOL opp_segmented_red = false/BOOL opp_segmented_red = true/" ${currentfilename} echo "RUNNING -> 1e18 On "$totalGPUs" GPUs with seg red" srun --cpu-bind=${CPU_BIND} ${binary} ${currentfilename} | tee $folder/log_N${num_nodes}_G${totalGPUs}_C${config}_D10_SR_R${run}.log; # --------------------- @@ -116,7 +116,7 @@ for run in 1 2 3; do echo "RUNNING -> 1e18 On "$totalGPUs" GPUs with unsafe atomics" srun --cpu-bind=${CPU_BIND} ${binary} ${currentfilename} | tee $folder/log_N${num_nodes}_G${totalGPUs}_C${config}_D13_UA_R${run}.log; - sed -i "s/BOOL use_reg_red = false/BOOL use_reg_red = true/" ${currentfilename} + sed -i "s/BOOL opp_segmented_red = false/BOOL opp_segmented_red = true/" ${currentfilename} echo "RUNNING -> 1e18 On "$totalGPUs" GPUs with seg red" srun --cpu-bind=${CPU_BIND} ${binary} ${currentfilename} | tee $folder/log_N${num_nodes}_G${totalGPUs}_C${config}_D13_SR_R${run}.log; # --------------------- diff --git a/scripts/batch/fempic/lumi/fempic_energy5.sh b/scripts/batch/fempic/lumi/fempic_energy5.sh index 39eed55d..26f4c5a4 100755 --- a/scripts/batch/fempic/lumi/fempic_energy5.sh +++ b/scripts/batch/fempic/lumi/fempic_energy5.sh @@ -76,7 +76,7 @@ for run in 1 2 3; do echo "RUNNING -> 1e18 On "$totalGPUs" GPUs with unsafe atomics" srun --cpu-bind=${CPU_BIND} ${binary} ${currentfilename} | tee $folder/log_N${num_nodes}_G${totalGPUs}_C${config}_D10_UA_R${run}.log; - sed -i "s/BOOL use_reg_red = false/BOOL use_reg_red = true/" ${currentfilename} + sed -i "s/BOOL opp_segmented_red = false/BOOL opp_segmented_red = true/" ${currentfilename} echo "RUNNING -> 1e18 On "$totalGPUs" GPUs with seg red" srun --cpu-bind=${CPU_BIND} ${binary} ${currentfilename} | tee $folder/log_N${num_nodes}_G${totalGPUs}_C${config}_D10_SR_R${run}.log; # --------------------- @@ -116,7 +116,7 @@ for run in 1 2 3; do echo "RUNNING -> 1e18 On "$totalGPUs" GPUs with unsafe atomics" srun --cpu-bind=${CPU_BIND} ${binary} ${currentfilename} | tee $folder/log_N${num_nodes}_G${totalGPUs}_C${config}_D13_UA_R${run}.log; - sed -i "s/BOOL use_reg_red = false/BOOL use_reg_red = true/" ${currentfilename} + sed -i "s/BOOL opp_segmented_red = false/BOOL opp_segmented_red = true/" ${currentfilename} echo "RUNNING -> 1e18 On "$totalGPUs" GPUs with seg red" srun --cpu-bind=${CPU_BIND} ${binary} ${currentfilename} | tee $folder/log_N${num_nodes}_G${totalGPUs}_C${config}_D13_SR_R${run}.log; # --------------------- diff --git a/scripts/batch/fempic/lumi/fempic_mpi_1.sh b/scripts/batch/fempic/lumi/fempic_mpi_1.sh index a9bd4f43..6e44151c 100755 --- a/scripts/batch/fempic/lumi/fempic_mpi_1.sh +++ b/scripts/batch/fempic/lumi/fempic_mpi_1.sh @@ -82,7 +82,7 @@ for run in 1 2 3; do sed -i "s/STRING hdf_filename = /STRING hdf_filename = ${escaped_folder}\/box_${config}.hdf5/" ${currentfilename} sed -i "s/STRING rand_file = /STRING rand_file = ${escaped_folder}\/random_100k.dat/" ${currentfilename} if [ "$use_seg_red" -eq 1 ]; then - sed -i "s/BOOL use_reg_red = false/BOOL use_reg_red = true/" ${currentfilename} + sed -i "s/BOOL opp_segmented_red = false/BOOL opp_segmented_red = true/" ${currentfilename} fi # --------------------- @@ -126,7 +126,7 @@ for run in 1 2 3; do sed -i "s/STRING hdf_filename = /STRING hdf_filename = ${escaped_folder}\/box_${config}.hdf5/" ${currentfilename} sed -i "s/STRING rand_file = /STRING rand_file = ${escaped_folder}\/random_100k.dat/" ${currentfilename} if [ "$use_seg_red" -eq 1 ]; then - sed -i "s/BOOL use_reg_red = false/BOOL use_reg_red = true/" ${currentfilename} + sed -i "s/BOOL opp_segmented_red = false/BOOL opp_segmented_red = true/" ${currentfilename} fi # --------------------- @@ -170,7 +170,7 @@ for run in 1 2 3; do sed -i "s/STRING hdf_filename = /STRING hdf_filename = ${escaped_folder}\/box_${config}.hdf5/" ${currentfilename} sed -i "s/STRING rand_file = /STRING rand_file = ${escaped_folder}\/random_100k.dat/" ${currentfilename} if [ "$use_seg_red" -eq 1 ]; then - sed -i "s/BOOL use_reg_red = false/BOOL use_reg_red = true/" ${currentfilename} + sed -i "s/BOOL opp_segmented_red = false/BOOL opp_segmented_red = true/" ${currentfilename} fi # --------------------- @@ -214,7 +214,7 @@ for run in 1 2 3; do sed -i "s/STRING hdf_filename = /STRING hdf_filename = ${escaped_folder}\/box_${config}.hdf5/" ${currentfilename} sed -i "s/STRING rand_file = /STRING rand_file = ${escaped_folder}\/random_100k.dat/" ${currentfilename} if [ "$use_seg_red" -eq 1 ]; then - sed -i "s/BOOL use_reg_red = false/BOOL use_reg_red = true/" ${currentfilename} + sed -i "s/BOOL opp_segmented_red = false/BOOL opp_segmented_red = true/" ${currentfilename} fi # --------------------- diff --git a/scripts/batch/fempic/lumi/fempic_mpi_16.sh b/scripts/batch/fempic/lumi/fempic_mpi_16.sh index a7cfba43..e06e51d1 100755 --- a/scripts/batch/fempic/lumi/fempic_mpi_16.sh +++ b/scripts/batch/fempic/lumi/fempic_mpi_16.sh @@ -90,7 +90,7 @@ for run in 1 2 3; do # 1 2 3 sed -i "s/STRING hdf_filename = /STRING hdf_filename = ${escaped_folder}\/box_${config}.hdf5/" ${currentfilename} sed -i "s/STRING rand_file = /STRING rand_file = ${escaped_folder}\/random_100k.dat/" ${currentfilename} if [ "$use_seg_red" -eq 1 ]; then - sed -i "s/BOOL use_reg_red = false/BOOL use_reg_red = true/" ${currentfilename} + sed -i "s/BOOL opp_segmented_red = false/BOOL opp_segmented_red = true/" ${currentfilename} fi if [ "$config" -eq 12288000 ]; then sed -i "s/REAL plasma_den = 1e18/REAL plasma_den = 9.9e17/" ${currentfilename} diff --git a/scripts/batch/fempic/lumi/fempic_mpi_2.sh b/scripts/batch/fempic/lumi/fempic_mpi_2.sh index 4e372b60..0a91b86e 100755 --- a/scripts/batch/fempic/lumi/fempic_mpi_2.sh +++ b/scripts/batch/fempic/lumi/fempic_mpi_2.sh @@ -81,7 +81,7 @@ for run in 1 2 3; do sed -i "s/STRING hdf_filename = /STRING hdf_filename = ${escaped_folder}\/box_${config}.hdf5/" ${currentfilename} sed -i "s/STRING rand_file = /STRING rand_file = ${escaped_folder}\/random_100k.dat/" ${currentfilename} if [ "$use_seg_red" -eq 1 ]; then - sed -i "s/BOOL use_reg_red = false/BOOL use_reg_red = true/" ${currentfilename} + sed -i "s/BOOL opp_segmented_red = false/BOOL opp_segmented_red = true/" ${currentfilename} fi # --------------------- diff --git a/scripts/batch/fempic/lumi/fempic_mpi_32.sh b/scripts/batch/fempic/lumi/fempic_mpi_32.sh index 28bf9c3f..5c6ede49 100755 --- a/scripts/batch/fempic/lumi/fempic_mpi_32.sh +++ b/scripts/batch/fempic/lumi/fempic_mpi_32.sh @@ -91,7 +91,7 @@ for run in 1 2 3; do sed -i "s/STRING hdf_filename = /STRING hdf_filename = ${escaped_folder}\/box_${config}.hdf5/" ${currentfilename} sed -i "s/STRING rand_file = /STRING rand_file = ${escaped_folder}\/random_100k.dat/" ${currentfilename} if [ "$use_seg_red" -eq 1 ]; then - sed -i "s/BOOL use_reg_red = false/BOOL use_reg_red = true/" ${currentfilename} + sed -i "s/BOOL opp_segmented_red = false/BOOL opp_segmented_red = true/" ${currentfilename} fi if [ "$config" -eq 12288000 ]; then sed -i "s/REAL plasma_den = 1e18/REAL plasma_den = 9.9e17/" ${currentfilename} diff --git a/scripts/batch/fempic/lumi/fempic_mpi_4.sh b/scripts/batch/fempic/lumi/fempic_mpi_4.sh index e8c4cc7b..260e42f2 100755 --- a/scripts/batch/fempic/lumi/fempic_mpi_4.sh +++ b/scripts/batch/fempic/lumi/fempic_mpi_4.sh @@ -80,7 +80,7 @@ for run in 1 2 3; do sed -i "s/STRING hdf_filename = /STRING hdf_filename = ${escaped_folder}\/box_${config}.hdf5/" ${currentfilename} sed -i "s/STRING rand_file = /STRING rand_file = ${escaped_folder}\/random_100k.dat/" ${currentfilename} if [ "$use_seg_red" -eq 1 ]; then - sed -i "s/BOOL use_reg_red = false/BOOL use_reg_red = true/" ${currentfilename} + sed -i "s/BOOL opp_segmented_red = false/BOOL opp_segmented_red = true/" ${currentfilename} fi # --------------------- diff --git a/scripts/batch/fempic/lumi/fempic_mpi_8.sh b/scripts/batch/fempic/lumi/fempic_mpi_8.sh index 89107ed1..fa0ba26f 100755 --- a/scripts/batch/fempic/lumi/fempic_mpi_8.sh +++ b/scripts/batch/fempic/lumi/fempic_mpi_8.sh @@ -80,7 +80,7 @@ for run in 1 2 3; do sed -i "s/STRING hdf_filename = /STRING hdf_filename = ${escaped_folder}\/box_${config}.hdf5/" ${currentfilename} sed -i "s/STRING rand_file = /STRING rand_file = ${escaped_folder}\/random_100k.dat/" ${currentfilename} if [ "$use_seg_red" -eq 1 ]; then - sed -i "s/BOOL use_reg_red = false/BOOL use_reg_red = true/" ${currentfilename} + sed -i "s/BOOL opp_segmented_red = false/BOOL opp_segmented_red = true/" ${currentfilename} fi # ---------------------