From 211981f3a05121087c2871208d1b2868fa787737 Mon Sep 17 00:00:00 2001 From: ZamanLantra Date: Fri, 25 Oct 2024 12:06:15 +0100 Subject: [PATCH] fempic with dh-codegen --- .../calculate_new_pos_vel_kernel_loop.hpp | 15 +- .../compute_electric_field_kernel_loop.hpp | 20 +- ...ompute_node_charge_density_kernel_loop.hpp | 10 +- .../deposit_charge_on_nodes_kernel_loop.hpp | 20 +- .../cuda/get_final_max_values_kernel_loop.hpp | 10 +- .../cuda/get_max_cef_kernel_loop.hpp | 5 +- .../cuda/init_boundary_pot_kernel_loop.hpp | 10 +- .../cuda/inject_ions_kernel_loop.hpp | 55 +-- app_fempic_cg/cuda/move_kernel_loop.hpp | 330 ++++++++++++++---- app_fempic_cg/cuda/opp_kernels.cu | 20 +- app_fempic_cg/mpi/move_kernel_loop.hpp | 205 ++--------- app_fempic_cg/omp/move_kernel_loop.hpp | 233 ++----------- app_fempic_cg/seq/move_kernel_loop.hpp | 205 ++--------- .../calculate_new_pos_vel_kernel_loop.hpp | 2 +- .../sycl/inject_ions_kernel_loop.hpp | 2 +- .../device_kernels/cuda_inline_kernels.h | 159 ++++++--- opp_lib/src/cuda/opp_direct_hop_cuda.cu | 66 +++- .../cpp/cuda/move_loop_host.hpp.jinja | 2 +- scripts/source/pan_oneapi2021.3 | 7 +- scripts/source/viking_gcc12.2 | 41 +++ 20 files changed, 623 insertions(+), 794 deletions(-) create mode 100644 scripts/source/viking_gcc12.2 diff --git a/app_fempic_cg/cuda/calculate_new_pos_vel_kernel_loop.hpp b/app_fempic_cg/cuda/calculate_new_pos_vel_kernel_loop.hpp index 1c177c7..1345d21 100644 --- a/app_fempic_cg/cuda/calculate_new_pos_vel_kernel_loop.hpp +++ b/app_fempic_cg/cuda/calculate_new_pos_vel_kernel_loop.hpp @@ -76,18 +76,9 @@ void opp_par_loop_all__calculate_new_pos_vel_kernel(opp_set set, opp_iterate_typ const int iter_size = opp_mpi_halo_exchanges_grouped(set, nargs, args, Device_GPU); - if (opp_k3_dat0_stride != args[0].dat->set->set_capacity) { - opp_k3_dat0_stride = args[0].dat->set->set_capacity; - cutilSafeCall(cudaMemcpyToSymbol(opp_k3_dat0_stride_d, &opp_k3_dat0_stride, sizeof(OPP_INT))); - } - if (opp_k3_dat1_stride != 
args[1].dat->set->set_capacity) { - opp_k3_dat1_stride = args[1].dat->set->set_capacity; - cutilSafeCall(cudaMemcpyToSymbol(opp_k3_dat1_stride_d, &opp_k3_dat1_stride, sizeof(OPP_INT))); - } - if (opp_k3_dat2_stride != args[2].dat->set->set_capacity) { - opp_k3_dat2_stride = args[2].dat->set->set_capacity; - cutilSafeCall(cudaMemcpyToSymbol(opp_k3_dat2_stride_d, &opp_k3_dat2_stride, sizeof(OPP_INT))); - } + opp_mem::dev_copy_to_symbol(opp_k3_dat0_stride_d, &opp_k3_dat0_stride, &(args[0].dat->set->set_capacity), 1); + opp_mem::dev_copy_to_symbol(opp_k3_dat1_stride_d, &opp_k3_dat1_stride, &(args[1].dat->set->set_capacity), 1); + opp_mem::dev_copy_to_symbol(opp_k3_dat2_stride_d, &opp_k3_dat2_stride, &(args[2].dat->set->set_capacity), 1); #ifdef OPP_BLOCK_SIZE_3 const int block_size = OPP_BLOCK_SIZE_3; diff --git a/app_fempic_cg/cuda/compute_electric_field_kernel_loop.hpp b/app_fempic_cg/cuda/compute_electric_field_kernel_loop.hpp index cff6fc7..673e077 100644 --- a/app_fempic_cg/cuda/compute_electric_field_kernel_loop.hpp +++ b/app_fempic_cg/cuda/compute_electric_field_kernel_loop.hpp @@ -93,22 +93,10 @@ void opp_par_loop_all__compute_electric_field_kernel(opp_set set, opp_iterate_ty const int iter_size = opp_mpi_halo_exchanges_grouped(set, nargs, args, Device_GPU); - if (opp_k7_dat0_stride != args[0].dat->set->set_capacity) { - opp_k7_dat0_stride = args[0].dat->set->set_capacity; - cutilSafeCall(cudaMemcpyToSymbol(opp_k7_dat0_stride_d, &opp_k7_dat0_stride, sizeof(OPP_INT))); - } - if (opp_k7_dat1_stride != args[1].dat->set->set_capacity) { - opp_k7_dat1_stride = args[1].dat->set->set_capacity; - cutilSafeCall(cudaMemcpyToSymbol(opp_k7_dat1_stride_d, &opp_k7_dat1_stride, sizeof(OPP_INT))); - } - if (opp_k7_dat2_stride != args[2].dat->set->set_capacity) { - opp_k7_dat2_stride = args[2].dat->set->set_capacity; - cutilSafeCall(cudaMemcpyToSymbol(opp_k7_dat2_stride_d, &opp_k7_dat2_stride, sizeof(OPP_INT))); - } - if (opp_k7_map0_stride != args[2].size) { - 
opp_k7_map0_stride = args[2].size; - cutilSafeCall(cudaMemcpyToSymbol(opp_k7_map0_stride_d, &opp_k7_map0_stride, sizeof(OPP_INT))); - } + opp_mem::dev_copy_to_symbol(opp_k7_dat0_stride_d, &opp_k7_dat0_stride, &(args[0].dat->set->set_capacity), 1); + opp_mem::dev_copy_to_symbol(opp_k7_dat1_stride_d, &opp_k7_dat1_stride, &(args[1].dat->set->set_capacity), 1); + opp_mem::dev_copy_to_symbol(opp_k7_dat2_stride_d, &opp_k7_dat2_stride, &(args[2].dat->set->set_capacity), 1); + opp_mem::dev_copy_to_symbol(opp_k7_map0_stride_d, &opp_k7_map0_stride, &(args[2].size), 1); #ifdef OPP_BLOCK_SIZE_7 const int block_size = OPP_BLOCK_SIZE_7; diff --git a/app_fempic_cg/cuda/compute_node_charge_density_kernel_loop.hpp b/app_fempic_cg/cuda/compute_node_charge_density_kernel_loop.hpp index 37ef251..e7f6474 100644 --- a/app_fempic_cg/cuda/compute_node_charge_density_kernel_loop.hpp +++ b/app_fempic_cg/cuda/compute_node_charge_density_kernel_loop.hpp @@ -61,14 +61,8 @@ void opp_par_loop_all__compute_node_charge_density_kernel(opp_set set, opp_itera const int iter_size = opp_mpi_halo_exchanges_grouped(set, nargs, args, Device_GPU); - if (opp_k6_dat0_stride != args[0].dat->set->set_capacity) { - opp_k6_dat0_stride = args[0].dat->set->set_capacity; - cutilSafeCall(cudaMemcpyToSymbol(opp_k6_dat0_stride_d, &opp_k6_dat0_stride, sizeof(OPP_INT))); - } - if (opp_k6_dat1_stride != args[1].dat->set->set_capacity) { - opp_k6_dat1_stride = args[1].dat->set->set_capacity; - cutilSafeCall(cudaMemcpyToSymbol(opp_k6_dat1_stride_d, &opp_k6_dat1_stride, sizeof(OPP_INT))); - } + opp_mem::dev_copy_to_symbol(opp_k6_dat0_stride_d, &opp_k6_dat0_stride, &(args[0].dat->set->set_capacity), 1); + opp_mem::dev_copy_to_symbol(opp_k6_dat1_stride_d, &opp_k6_dat1_stride, &(args[1].dat->set->set_capacity), 1); #ifdef OPP_BLOCK_SIZE_6 const int block_size = OPP_BLOCK_SIZE_6; diff --git a/app_fempic_cg/cuda/deposit_charge_on_nodes_kernel_loop.hpp b/app_fempic_cg/cuda/deposit_charge_on_nodes_kernel_loop.hpp index 
a1a87ac..9dc8510 100644 --- a/app_fempic_cg/cuda/deposit_charge_on_nodes_kernel_loop.hpp +++ b/app_fempic_cg/cuda/deposit_charge_on_nodes_kernel_loop.hpp @@ -199,18 +199,9 @@ void opp_par_loop_all__deposit_charge_on_nodes_kernel(opp_set set, opp_iterate_t #endif - if (opp_k5_dat0_stride != args[0].dat->set->set_capacity) { - opp_k5_dat0_stride = args[0].dat->set->set_capacity; - cutilSafeCall(cudaMemcpyToSymbol(opp_k5_dat0_stride_d, &opp_k5_dat0_stride, sizeof(OPP_INT))); - } - if (opp_k5_dat1_stride != args[1].dat->set->set_capacity) { - opp_k5_dat1_stride = args[1].dat->set->set_capacity; - cutilSafeCall(cudaMemcpyToSymbol(opp_k5_dat1_stride_d, &opp_k5_dat1_stride, sizeof(OPP_INT))); - } - if (opp_k5_map0_stride != args[1].size) { - opp_k5_map0_stride = args[1].size; - cutilSafeCall(cudaMemcpyToSymbol(opp_k5_map0_stride_d, &opp_k5_map0_stride, sizeof(OPP_INT))); - } + opp_mem::dev_copy_to_symbol(opp_k5_dat0_stride_d, &opp_k5_dat0_stride, &(args[0].dat->set->set_capacity), 1); + opp_mem::dev_copy_to_symbol(opp_k5_dat1_stride_d, &opp_k5_dat1_stride, &(args[1].dat->set->set_capacity), 1); + opp_mem::dev_copy_to_symbol(opp_k5_map0_stride_d, &opp_k5_map0_stride, &(args[1].size), 1); #ifdef OPP_BLOCK_SIZE_5 const int block_size = OPP_BLOCK_SIZE_5; @@ -241,10 +232,7 @@ void opp_par_loop_all__deposit_charge_on_nodes_kernel(opp_set set, opp_iterate_t else // Do segmented reductions ---------- { - if (opp_k5_sr_set_stride != set->size) { - opp_k5_sr_set_stride = set->size; - cutilSafeCall(cudaMemcpyToSymbol(opp_k5_sr_set_stride_d, &opp_k5_sr_set_stride, sizeof(OPP_INT))); - } + opp_mem::dev_copy_to_symbol(opp_k5_sr_set_stride_d, &opp_k5_sr_set_stride, &(set->size), 1); size_t operating_size_dat1 = 0, resize_size_dat1 = 0; diff --git a/app_fempic_cg/cuda/get_final_max_values_kernel_loop.hpp b/app_fempic_cg/cuda/get_final_max_values_kernel_loop.hpp index 3d38ef4..bd13093 100644 --- a/app_fempic_cg/cuda/get_final_max_values_kernel_loop.hpp +++ 
b/app_fempic_cg/cuda/get_final_max_values_kernel_loop.hpp @@ -89,14 +89,8 @@ void opp_par_loop_all__get_final_max_values_kernel(opp_set set, opp_iterate_type OPP_REAL *arg1_host_data = (OPP_REAL *)args[1].data; OPP_REAL *arg3_host_data = (OPP_REAL *)args[3].data; - if (opp_k9_dat0_stride != args[0].dat->set->set_capacity) { - opp_k9_dat0_stride = args[0].dat->set->set_capacity; - cutilSafeCall(cudaMemcpyToSymbol(opp_k9_dat0_stride_d, &opp_k9_dat0_stride, sizeof(OPP_INT))); - } - if (opp_k9_dat1_stride != args[2].dat->set->set_capacity) { - opp_k9_dat1_stride = args[2].dat->set->set_capacity; - cutilSafeCall(cudaMemcpyToSymbol(opp_k9_dat1_stride_d, &opp_k9_dat1_stride, sizeof(OPP_INT))); - } + opp_mem::dev_copy_to_symbol(opp_k9_dat0_stride_d, &opp_k9_dat0_stride, &(args[0].dat->set->set_capacity), 1); + opp_mem::dev_copy_to_symbol(opp_k9_dat1_stride_d, &opp_k9_dat1_stride, &(args[2].dat->set->set_capacity), 1); #ifdef OPP_BLOCK_SIZE_9 const int block_size = OPP_BLOCK_SIZE_9; diff --git a/app_fempic_cg/cuda/get_max_cef_kernel_loop.hpp b/app_fempic_cg/cuda/get_max_cef_kernel_loop.hpp index 92bc763..fb507dd 100644 --- a/app_fempic_cg/cuda/get_max_cef_kernel_loop.hpp +++ b/app_fempic_cg/cuda/get_max_cef_kernel_loop.hpp @@ -70,10 +70,7 @@ void opp_par_loop_all__get_max_cef_kernel(opp_set set, opp_iterate_type, OPP_REAL *arg1_host_data = (OPP_REAL *)args[1].data; - if (opp_k8_dat0_stride != args[0].dat->set->set_capacity) { - opp_k8_dat0_stride = args[0].dat->set->set_capacity; - cutilSafeCall(cudaMemcpyToSymbol(opp_k8_dat0_stride_d, &opp_k8_dat0_stride, sizeof(OPP_INT))); - } + opp_mem::dev_copy_to_symbol(opp_k8_dat0_stride_d, &opp_k8_dat0_stride, &(args[0].dat->set->set_capacity), 1); #ifdef OPP_BLOCK_SIZE_8 const int block_size = OPP_BLOCK_SIZE_8; diff --git a/app_fempic_cg/cuda/init_boundary_pot_kernel_loop.hpp b/app_fempic_cg/cuda/init_boundary_pot_kernel_loop.hpp index db68580..8814916 100644 --- a/app_fempic_cg/cuda/init_boundary_pot_kernel_loop.hpp +++ 
b/app_fempic_cg/cuda/init_boundary_pot_kernel_loop.hpp @@ -71,14 +71,8 @@ void opp_par_loop_all__init_boundary_pot_kernel(opp_set set, opp_iterate_type, const int iter_size = opp_mpi_halo_exchanges_grouped(set, nargs, args, Device_GPU); - if (opp_k1_dat0_stride != args[0].dat->set->set_capacity) { - opp_k1_dat0_stride = args[0].dat->set->set_capacity; - cutilSafeCall(cudaMemcpyToSymbol(opp_k1_dat0_stride_d, &opp_k1_dat0_stride, sizeof(OPP_INT))); - } - if (opp_k1_dat1_stride != args[1].dat->set->set_capacity) { - opp_k1_dat1_stride = args[1].dat->set->set_capacity; - cutilSafeCall(cudaMemcpyToSymbol(opp_k1_dat1_stride_d, &opp_k1_dat1_stride, sizeof(OPP_INT))); - } + opp_mem::dev_copy_to_symbol(opp_k1_dat0_stride_d, &opp_k1_dat0_stride, &(args[0].dat->set->set_capacity), 1); + opp_mem::dev_copy_to_symbol(opp_k1_dat1_stride_d, &opp_k1_dat1_stride, &(args[1].dat->set->set_capacity), 1); #ifdef OPP_BLOCK_SIZE_1 const int block_size = OPP_BLOCK_SIZE_1; diff --git a/app_fempic_cg/cuda/inject_ions_kernel_loop.hpp b/app_fempic_cg/cuda/inject_ions_kernel_loop.hpp index 442ea01..653ab15 100644 --- a/app_fempic_cg/cuda/inject_ions_kernel_loop.hpp +++ b/app_fempic_cg/cuda/inject_ions_kernel_loop.hpp @@ -143,50 +143,17 @@ void opp_par_loop_injected__inject_ions_kernel(opp_set set, opp_iterate_type, const int iter_size = set->diff; const int inj_start = (set->size - set->diff); - if (opp_k2_dat0_stride != args[0].dat->set->set_capacity) { - opp_k2_dat0_stride = args[0].dat->set->set_capacity; - cutilSafeCall(cudaMemcpyToSymbol(opp_k2_dat0_stride_d, &opp_k2_dat0_stride, sizeof(OPP_INT))); - } - if (opp_k2_dat1_stride != args[1].dat->set->set_capacity) { - opp_k2_dat1_stride = args[1].dat->set->set_capacity; - cutilSafeCall(cudaMemcpyToSymbol(opp_k2_dat1_stride_d, &opp_k2_dat1_stride, sizeof(OPP_INT))); - } - if (opp_k2_dat2_stride != args[2].size) { - opp_k2_dat2_stride = args[2].size; - cutilSafeCall(cudaMemcpyToSymbol(opp_k2_dat2_stride_d, &opp_k2_dat2_stride, 
sizeof(OPP_INT))); - } - if (opp_k2_dat3_stride != args[3].size) { - opp_k2_dat3_stride = args[3].size; - cutilSafeCall(cudaMemcpyToSymbol(opp_k2_dat3_stride_d, &opp_k2_dat3_stride, sizeof(OPP_INT))); - } - if (opp_k2_dat4_stride != args[4].dat->set->set_capacity) { - opp_k2_dat4_stride = args[4].dat->set->set_capacity; - cutilSafeCall(cudaMemcpyToSymbol(opp_k2_dat4_stride_d, &opp_k2_dat4_stride, sizeof(OPP_INT))); - } - if (opp_k2_dat5_stride != args[5].dat->set->set_capacity) { - opp_k2_dat5_stride = args[5].dat->set->set_capacity; - cutilSafeCall(cudaMemcpyToSymbol(opp_k2_dat5_stride_d, &opp_k2_dat5_stride, sizeof(OPP_INT))); - } - if (opp_k2_dat6_stride != args[6].dat->set->set_capacity) { - opp_k2_dat6_stride = args[6].dat->set->set_capacity; - cutilSafeCall(cudaMemcpyToSymbol(opp_k2_dat6_stride_d, &opp_k2_dat6_stride, sizeof(OPP_INT))); - } - if (opp_k2_dat7_stride != args[7].dat->set->set_capacity) { - opp_k2_dat7_stride = args[7].dat->set->set_capacity; - cutilSafeCall(cudaMemcpyToSymbol(opp_k2_dat7_stride_d, &opp_k2_dat7_stride, sizeof(OPP_INT))); - } - if (opp_k2_dat8_stride != args[8].dat->set->set_capacity) { - opp_k2_dat8_stride = args[8].dat->set->set_capacity; - cutilSafeCall(cudaMemcpyToSymbol(opp_k2_dat8_stride_d, &opp_k2_dat8_stride, sizeof(OPP_INT))); - } - if (opp_k2_dat9_stride != args[9].dat->set->set_capacity) { - opp_k2_dat9_stride = args[9].dat->set->set_capacity; - cutilSafeCall(cudaMemcpyToSymbol(opp_k2_dat9_stride_d, &opp_k2_dat9_stride, sizeof(OPP_INT))); - } - if (opp_k2_map0_stride != args[4].size) { - opp_k2_map0_stride = args[4].size; - cutilSafeCall(cudaMemcpyToSymbol(opp_k2_map0_stride_d, &opp_k2_map0_stride, sizeof(OPP_INT))); - } + opp_mem::dev_copy_to_symbol(opp_k2_dat0_stride_d, &opp_k2_dat0_stride, &(args[0].dat->set->set_capacity), 1); + opp_mem::dev_copy_to_symbol(opp_k2_dat1_stride_d, &opp_k2_dat1_stride, &(args[1].dat->set->set_capacity), 1); + opp_mem::dev_copy_to_symbol(opp_k2_dat2_stride_d, &opp_k2_dat2_stride, 
&(args[2].size), 1); + opp_mem::dev_copy_to_symbol(opp_k2_dat3_stride_d, &opp_k2_dat3_stride, &(args[3].size), 1); + opp_mem::dev_copy_to_symbol(opp_k2_dat4_stride_d, &opp_k2_dat4_stride, &(args[4].dat->set->set_capacity), 1); + opp_mem::dev_copy_to_symbol(opp_k2_dat5_stride_d, &opp_k2_dat5_stride, &(args[5].dat->set->set_capacity), 1); + opp_mem::dev_copy_to_symbol(opp_k2_dat6_stride_d, &opp_k2_dat6_stride, &(args[6].dat->set->set_capacity), 1); + opp_mem::dev_copy_to_symbol(opp_k2_dat7_stride_d, &opp_k2_dat7_stride, &(args[7].dat->set->set_capacity), 1); + opp_mem::dev_copy_to_symbol(opp_k2_dat8_stride_d, &opp_k2_dat8_stride, &(args[8].dat->set->set_capacity), 1); + opp_mem::dev_copy_to_symbol(opp_k2_dat9_stride_d, &opp_k2_dat9_stride, &(args[9].dat->set->set_capacity), 1); + opp_mem::dev_copy_to_symbol(opp_k2_map0_stride_d, &opp_k2_map0_stride, &(args[4].size), 1); #ifdef OPP_BLOCK_SIZE_2 const int block_size = OPP_BLOCK_SIZE_2; diff --git a/app_fempic_cg/cuda/move_kernel_loop.hpp b/app_fempic_cg/cuda/move_kernel_loop.hpp index 2df3be7..2b388ce 100644 --- a/app_fempic_cg/cuda/move_kernel_loop.hpp +++ b/app_fempic_cg/cuda/move_kernel_loop.hpp @@ -16,6 +16,60 @@ __constant__ OPP_INT opp_k4_dat3_stride_d; __constant__ OPP_INT opp_k4_c2c_map_stride_d; namespace opp_k4 { + +namespace host { +inline void move_kernel( + const double *point_pos, double* point_lc, + const double *cell_volume, const double *cell_det +) { + + const double coefficient2 = (1.0 / 6.0) / (*cell_volume); + + for (int i=0; i<4; i++) { /*loop over vertices*/ + + point_lc[i] = coefficient2 * ( + cell_det[i * 4 + 0] - + cell_det[i * 4 + 1] * point_pos[0] + + cell_det[i * 4 + 2] * point_pos[1] - + cell_det[i * 4 + 3] * point_pos[2]); + } + + if (!(point_lc[0] < 0.0 || + point_lc[0] > 1.0 || + point_lc[1] < 0.0 || + point_lc[1] > 1.0 || + point_lc[2] < 0.0 || + point_lc[2] > 1.0 || + point_lc[3] < 0.0 || + point_lc[3] > 1.0)) { + + { opp_move_status_flag = OPP_MOVE_DONE; }; + return; + } + + // 
outside the last known cell, find most negative weight and + // use that cell_index to reduce computations + int min_i = 0; + double min_lc = point_lc[0]; + + for (int i=1; i<4; i++) { + if (point_lc[i] < min_lc) { + min_lc = point_lc[i]; + min_i = i; + } + } + + if (opp_c2c[min_i] >= 0) { // is there a neighbor in this direction? + (*opp_p2c) = opp_c2c[min_i]; + { opp_move_status_flag = OPP_NEED_MOVE; }; + } + else { + (*opp_p2c) = INT_MAX; + { opp_move_status_flag = OPP_NEED_REMOVE; }; + } +} +} + __device__ inline void move_kernel( char& opp_move_status_flag, const bool opp_move_hop_iter_one_flag, // Added by code-gen const OPP_INT* opp_c2c, OPP_INT* opp_p2c, // Added by code-gen @@ -68,46 +122,6 @@ __device__ inline void move_kernel( { opp_move_status_flag = OPP_NEED_REMOVE; }; } } - -//******************************************************************************* -// Returns true only if another hop is required by the current rank -__device__ inline bool opp_part_check_status_cuda(char& move_flag, bool& iter_one_flag, - int* cell_id, int particle_index, int& remove_count, int *remove_particle_indices, - int *move_particle_indices, int *move_cell_indices, int *move_count) -{ - iter_one_flag = false; - - if (move_flag == OPP_MOVE_DONE) - { - return false; - } - else if (move_flag == OPP_NEED_REMOVE) - { - *cell_id = MAX_CELL_INDEX; - const int removeArrayIndex = atomicAdd(&remove_count, 1); - remove_particle_indices[removeArrayIndex] = particle_index; - - return false; - } - else if (*cell_id >= OPP_cells_set_size_d) - { - // cell_id cell is not owned by the current mpi rank, need to communicate - int moveArrayIndex = atomicAdd(move_count, 1); - move_particle_indices[moveArrayIndex] = particle_index; - move_cell_indices[moveArrayIndex] = *cell_id; - - // Needs to be removed from the current rank, - // particle packing will be done just prior exchange and removal - move_flag = OPP_NEED_REMOVE; - const int removeArrayIndex = atomicAdd(&remove_count, 1); - 
remove_particle_indices[removeArrayIndex] = particle_index; - - return false; - } - - // cell_id is an own cell and move_flag == OPP_NEED_MOVE - return true; -} } __global__ void opp_dev_move_kernel( @@ -133,6 +147,10 @@ __global__ void opp_dev_move_kernel( const int n = thread_id + start; OPP_INT *opp_p2c = (p2c_map + n); + if (opp_p2c[0] == MAX_CELL_INDEX) { + return; + } + char move_flag = OPP_NEED_MOVE; bool iter_one_flag = (OPP_comm_iteration_d > 0) ? false : true; @@ -150,7 +168,7 @@ __global__ void opp_dev_move_kernel( ); - } while (opp_k4::opp_part_check_status_cuda(move_flag, iter_one_flag, opp_p2c, n, + } while (opp_part_check_status_device(move_flag, iter_one_flag, opp_p2c, n, *particle_remove_count, particle_remove_indices, move_particle_indices, move_cell_indices, move_count)); } @@ -177,16 +195,11 @@ void opp_particle_move__move_kernel(opp_set set, opp_map c2c_map, opp_map p2c_ma args[4] = opp_arg_dat(p2c_map->p2c_dat, OPP_RW); // required to make dirty or should manually make it dirty const int iter_size = opp_mpi_halo_exchanges_grouped(set, nargs, args, Device_GPU); - - if (OPP_cells_set_size != set->cells_set->size) { - OPP_cells_set_size = set->cells_set->size; - cutilSafeCall(cudaMemcpyToSymbol(OPP_cells_set_size_d, &OPP_cells_set_size, sizeof(int))); - } + const OPP_INT c2c_stride = c2c_map->from->size + c2c_map->from->exec_size + c2c_map->from->nonexec_size; - if (opp_k4_c2c_map_stride != c2c_stride) { - opp_k4_c2c_map_stride = c2c_stride; - cutilSafeCall(cudaMemcpyToSymbol(opp_k4_c2c_map_stride_d, &opp_k4_c2c_map_stride, sizeof(OPP_INT))); - } + + opp_mem::dev_copy_to_symbol(OPP_cells_set_size_d, &OPP_cells_set_size, &(set->cells_set->size), 1); + opp_mem::dev_copy_to_symbol(opp_k4_c2c_map_stride_d, &opp_k4_c2c_map_stride, &c2c_stride, 1); opp_mpi_halo_wait_all(nargs, args); @@ -198,30 +211,139 @@ void opp_particle_move__move_kernel(opp_set set, opp_map c2c_map, opp_map p2c_ma int num_blocks = 200; - do - { - if (opp_k4_dat0_stride != 
args[0].dat->set->set_capacity) { - opp_k4_dat0_stride = args[0].dat->set->set_capacity; - cutilSafeCall(cudaMemcpyToSymbol(opp_k4_dat0_stride_d, &opp_k4_dat0_stride, sizeof(OPP_INT))); - } - if (opp_k4_dat1_stride != args[1].dat->set->set_capacity) { - opp_k4_dat1_stride = args[1].dat->set->set_capacity; - cutilSafeCall(cudaMemcpyToSymbol(opp_k4_dat1_stride_d, &opp_k4_dat1_stride, sizeof(OPP_INT))); - } - if (opp_k4_dat2_stride != args[2].dat->set->set_capacity) { - opp_k4_dat2_stride = args[2].dat->set->set_capacity; - cutilSafeCall(cudaMemcpyToSymbol(opp_k4_dat2_stride_d, &opp_k4_dat2_stride, sizeof(OPP_INT))); - } - if (opp_k4_dat3_stride != args[3].dat->set->set_capacity) { - opp_k4_dat3_stride = args[3].dat->set->set_capacity; - cutilSafeCall(cudaMemcpyToSymbol(opp_k4_dat3_stride_d, &opp_k4_dat3_stride, sizeof(OPP_INT))); - } + opp_init_particle_move(set, nargs, args); + + opp_mem::dev_copy_to_symbol(OPP_comm_iteration_d, &OPP_comm_iteration, 1); + + num_blocks = (OPP_iter_end - OPP_iter_start - 1) / block_size + 1; + + if (useGlobalMove) { + +#ifdef USE_MPI + globalMover->initGlobalMove(); + opp_init_dh_device(set); +#endif + opp_profiler->start("GblMv_Move"); + + opp_mem::dev_copy_to_symbol(cellMapper_pos_stride_d, &cellMapper_pos_stride, &(args[0].dat->set->set_capacity), 1); + opp_mem::dev_copy_to_symbol(OPP_rank_d, &OPP_rank, 1); + + // check whether particles needs to be moved over global move routine + opp_dev_checkForGlobalMove3D_kernel<<>>( + (OPP_REAL*)args[0].data_d, // p_pos + (OPP_INT *)args[4].data_d, // p2c_map + cellMapper->structMeshToCellMapping_d, + cellMapper->structMeshToRankMapping_d, + cellMapper->oneOverGridSpacing_d, + cellMapper->minGlbCoordinate_d, + cellMapper->globalGridDims_d, + cellMapper->globalGridSize_d, + set->particle_remove_count_d, + OPP_remove_particle_indices_d, + dh_indices_d.part_indices, + dh_indices_d.cell_indices, + dh_indices_d.rank_indices, + dh_indices_d.move_count, + OPP_iter_start, OPP_iter_end + ); + 
OPP_DEVICE_SYNCHRONIZE(); + + opp_profiler->end("GblMv_Move"); + +#ifdef USE_MPI + opp_gather_dh_move_indices(set); + globalMover->communicate(set); +#endif + } + + opp_mem::dev_copy_to_symbol(opp_k4_dat0_stride_d, &opp_k4_dat0_stride, &(args[0].dat->set->set_capacity), 1); + opp_mem::dev_copy_to_symbol(opp_k4_dat1_stride_d, &opp_k4_dat1_stride, &(args[1].dat->set->set_capacity), 1); + opp_mem::dev_copy_to_symbol(opp_k4_dat2_stride_d, &opp_k4_dat2_stride, &(args[2].dat->set->set_capacity), 1); + opp_mem::dev_copy_to_symbol(opp_k4_dat3_stride_d, &opp_k4_dat3_stride, &(args[3].dat->set->set_capacity), 1); + + + opp_profiler->start("Mv_AllMv0"); + // ---------------------------------------------------------------------------- + // check whether all particles not marked for global comm is within cell, + // and if not mark to move between cells within the MPI rank, mark for neighbour comm + opp_profiler->start("move_kernel_only"); + opp_dev_move_kernel<<>>( + (OPP_REAL *)args[0].data_d, // p_pos + (OPP_REAL *)args[1].data_d, // p_lc + (OPP_REAL *)args[2].data_d, // c_volume + (OPP_REAL *)args[3].data_d, // c_det + (OPP_INT *)args[4].data_d, // p2c_map + (OPP_INT *)c2c_map->map_d, // c2c_map + (OPP_INT *)set->particle_remove_count_d, + (OPP_INT *)OPP_remove_particle_indices_d, + (OPP_INT *)OPP_move_particle_indices_d, + (OPP_INT *)OPP_move_cell_indices_d, + (OPP_INT *)OPP_move_count_d, + OPP_iter_start, + OPP_iter_end + ); + OPP_DEVICE_SYNCHRONIZE(); + opp_profiler->end("move_kernel_only"); + opp_profiler->end("Mv_AllMv0"); + + // ---------------------------------------------------------------------------- + // finalize the global move routine and iterate over newly added particles and check whether they need neighbour comm + if (useGlobalMove && globalMover->finalize(set) > 0) { + + opp_profiler->start("GblMv_AllMv"); + + // need to change arg data since particle resize in globalMover::finalize could change the pointer in dat->data + for (int i = 0; i < nargs; i++) + if 
(args[i].argtype == OPP_ARG_DAT && args[i].dat->set->is_particle) + args[i].data_d = args[i].dat->data_d; + + // check whether the new particle is within cell, and if not move between cells within the MPI rank, + // mark for neighbour comm. Do only for the globally moved particles + const int start2 = (set->size - set->diff); + const int end2 = set->size; + num_blocks = (end2 - start2 - 1) / block_size + 1; + opp_mem::dev_copy_to_symbol(opp_k4_dat0_stride_d, &opp_k4_dat0_stride, &(args[0].dat->set->set_capacity), 1); + opp_mem::dev_copy_to_symbol(opp_k4_dat1_stride_d, &opp_k4_dat1_stride, &(args[1].dat->set->set_capacity), 1); + opp_mem::dev_copy_to_symbol(opp_k4_dat2_stride_d, &opp_k4_dat2_stride, &(args[2].dat->set->set_capacity), 1); + opp_mem::dev_copy_to_symbol(opp_k4_dat3_stride_d, &opp_k4_dat3_stride, &(args[3].dat->set->set_capacity), 1); + + opp_profiler->start("move_kernel_only"); + opp_dev_move_kernel<<>>( + (OPP_REAL *)args[0].data_d, // p_pos + (OPP_REAL *)args[1].data_d, // p_lc + (OPP_REAL *)args[2].data_d, // c_volume + (OPP_REAL *)args[3].data_d, // c_det + (OPP_INT *)args[4].data_d, // p2c_map + (OPP_INT *)c2c_map->map_d, // c2c_map + (OPP_INT *)set->particle_remove_count_d, + (OPP_INT *)OPP_remove_particle_indices_d, + (OPP_INT *)OPP_move_particle_indices_d, + (OPP_INT *)OPP_move_cell_indices_d, + (OPP_INT *)OPP_move_count_d, + start2, + end2 + ); + OPP_DEVICE_SYNCHRONIZE(); + opp_profiler->end("move_kernel_only"); + + opp_profiler->end("GblMv_AllMv"); + } + + // ---------------------------------------------------------------------------- + // Do neighbour communication and if atleast one particle is received by the currect rank, + // then iterate over the newly added particles + while (opp_finalize_particle_move(set)) { + + opp_mem::dev_copy_to_symbol(opp_k4_dat0_stride_d, &opp_k4_dat0_stride, &(args[0].dat->set->set_capacity), 1); + opp_mem::dev_copy_to_symbol(opp_k4_dat1_stride_d, &opp_k4_dat1_stride, &(args[1].dat->set->set_capacity), 1); + 
opp_mem::dev_copy_to_symbol(opp_k4_dat2_stride_d, &opp_k4_dat2_stride, &(args[2].dat->set->set_capacity), 1); + opp_mem::dev_copy_to_symbol(opp_k4_dat3_stride_d, &opp_k4_dat3_stride, &(args[3].dat->set->set_capacity), 1); opp_init_particle_move(set, nargs, args); - cutilSafeCall(cudaMemcpyToSymbol(OPP_comm_iteration_d, &OPP_comm_iteration, sizeof(int))); + opp_mem::dev_copy_to_symbol(OPP_comm_iteration_d, &OPP_comm_iteration, 1); num_blocks = (OPP_iter_end - OPP_iter_start - 1) / block_size + 1; - + opp_profiler->start("move_kernel_only"); opp_dev_move_kernel<<>>( (OPP_REAL *)args[0].data_d, // p_pos (OPP_REAL *)args[1].data_d, // p_lc @@ -237,14 +359,16 @@ void opp_particle_move__move_kernel(opp_set set, opp_map c2c_map, opp_map p2c_ma OPP_iter_start, OPP_iter_end ); - - } while (opp_finalize_particle_move(set)); + OPP_DEVICE_SYNCHRONIZE(); + opp_profiler->end("move_kernel_only"); + } opp_set_dirtybit_grouped(nargs, args, Device_GPU); OPP_DEVICE_SYNCHRONIZE(); opp_profiler->end("move_kernel"); } + void opp_init_direct_hop_cg(double grid_spacing, int dim, const opp_dat c_gbl_id, const opp::BoundingBox& b_box, opp_map c2c_map, opp_map p2c_map, opp_arg arg0, // p_pos | OPP_READ @@ -253,6 +377,66 @@ void opp_init_direct_hop_cg(double grid_spacing, int dim, const opp_dat c_gbl_id opp_arg arg3 // c_det | OPP_READ ) { opp_profiler->start("Setup_Mover"); - + + useGlobalMove = opp_params->get("opp_global_move"); + + if (OPP_DBG) opp_printf("opp_init_direct_hop_cg", "START useGlobalMove=%s", useGlobalMove ? 
"YES" : "NO"); + + if (useGlobalMove) { + + const int nargs = 5; + opp_arg args[nargs]; + + args[0] = arg0; + args[1] = arg1; + args[2] = arg2; + args[3] = arg3; + args[4] = opp_arg_dat(p2c_map->p2c_dat, OPP_RW); // required to make dirty or should manually make it dirty + +#ifdef USE_MPI + opp_mpi_halo_exchanges_grouped(c_gbl_id->set, nargs, args, Device_CPU); + + comm = std::make_shared(MPI_COMM_WORLD); + globalMover = std::make_unique(comm->comm_parent); + + opp_mpi_halo_wait_all(nargs, args); +#endif + + boundingBox = std::make_shared(b_box); + cellMapper = std::make_shared(boundingBox, grid_spacing, comm); + + const int c_set_size = c_gbl_id->set->size; + + // lambda function for dh mesh search loop + auto all_cell_checker = [&](const opp_point& point, int& cid) { + + // we dont want to change the original arrays during dh mesh generation, hence duplicate except OPP_READ + OPP_REAL arg1_temp[4]; + for (int ci = 0; ci < c_set_size; ++ci) { + opp_move_status_flag = OPP_NEED_MOVE; + opp_move_hop_iter_one_flag = true; + + int temp_ci = ci; // we dont want to get iterating ci changed within the kernel, hence get a copy + + opp_p2c = &(temp_ci); + opp_c2c = &((c2c_map->map)[temp_ci * 4]); + + opp_k4::host::move_kernel( + (const OPP_REAL*)&point, + arg1_temp, // p_lc| OPP_WRITE + (const OPP_REAL *)args[2].data + (temp_ci * 1), // c_volume| OPP_READ + (const OPP_REAL *)args[3].data + (temp_ci * 16) // c_det| OPP_READ + ); + if (opp_move_status_flag == OPP_MOVE_DONE) { + cid = temp_ci; + break; + } + } + }; + + cellMapper->generateStructuredMesh(c_gbl_id->set, c_gbl_id, all_cell_checker); + } + opp_profiler->end("Setup_Mover"); } + diff --git a/app_fempic_cg/cuda/opp_kernels.cu b/app_fempic_cg/cuda/opp_kernels.cu index d91a2b0..195224f 100644 --- a/app_fempic_cg/cuda/opp_kernels.cu +++ b/app_fempic_cg/cuda/opp_kernels.cu @@ -35,7 +35,15 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
//********************************************* #include "opp_cuda.h" +#include "device_kernels/cuda_inline_kernels.h" +OPP_REAL CONST_spwt[1]; +OPP_REAL CONST_ion_velocity[1]; +OPP_REAL CONST_dt[1]; +OPP_REAL CONST_plasma_den[1]; +OPP_REAL CONST_mass[1]; +OPP_REAL CONST_charge[1]; +OPP_REAL CONST_wall_potential[1]; __constant__ OPP_REAL CONST_spwt_d[1]; __constant__ OPP_REAL CONST_ion_velocity_d[1]; @@ -44,11 +52,6 @@ __constant__ OPP_REAL CONST_plasma_den_d[1]; __constant__ OPP_REAL CONST_mass_d[1]; __constant__ OPP_REAL CONST_charge_d[1]; __constant__ OPP_REAL CONST_wall_potential_d[1]; - -__constant__ int OPP_cells_set_size_d; -int OPP_cells_set_size; - -__constant__ int OPP_comm_iteration_d; void opp_decl_const_impl(int dim, int size, char* data, const char* name) { @@ -57,30 +60,37 @@ void opp_decl_const_impl(int dim, int size, char* data, const char* name) { if (!strcmp(name, "CONST_spwt")) { cutilSafeCall(cudaMemcpyToSymbol(CONST_spwt_d, data, dim * size)); + std::memcpy(&CONST_spwt, data, (size*dim)); return; } if (!strcmp(name, "CONST_ion_velocity")) { cutilSafeCall(cudaMemcpyToSymbol(CONST_ion_velocity_d, data, dim * size)); + std::memcpy(&CONST_ion_velocity, data, (size*dim)); return; } if (!strcmp(name, "CONST_dt")) { cutilSafeCall(cudaMemcpyToSymbol(CONST_dt_d, data, dim * size)); + std::memcpy(&CONST_dt, data, (size*dim)); return; } if (!strcmp(name, "CONST_plasma_den")) { cutilSafeCall(cudaMemcpyToSymbol(CONST_plasma_den_d, data, dim * size)); + std::memcpy(&CONST_plasma_den, data, (size*dim)); return; } if (!strcmp(name, "CONST_mass")) { cutilSafeCall(cudaMemcpyToSymbol(CONST_mass_d, data, dim * size)); + std::memcpy(&CONST_mass, data, (size*dim)); return; } if (!strcmp(name, "CONST_charge")) { cutilSafeCall(cudaMemcpyToSymbol(CONST_charge_d, data, dim * size)); + std::memcpy(&CONST_charge, data, (size*dim)); return; } if (!strcmp(name, "CONST_wall_potential")) { cutilSafeCall(cudaMemcpyToSymbol(CONST_wall_potential_d, data, dim * size)); + 
std::memcpy(&CONST_wall_potential, data, (size*dim)); return; } diff --git a/app_fempic_cg/mpi/move_kernel_loop.hpp b/app_fempic_cg/mpi/move_kernel_loop.hpp index 8414c71..2746447 100644 --- a/app_fempic_cg/mpi/move_kernel_loop.hpp +++ b/app_fempic_cg/mpi/move_kernel_loop.hpp @@ -214,177 +214,6 @@ void opp_particle_move__move_kernel(opp_set set, opp_map c2c_map, opp_map p2c_ma opp_profiler->end("move_kernel"); } -// ---------------------------------------------------------------------------- -inline void gen_dh_structured_mesh(opp_set set, const opp_dat c_gbl_id, opp_map c2c_map, - const int nargs, opp_arg* args) { - - if (OPP_rank == 0) - opp_printf("APP", "gen_dh_structured_mesh START cells [%s] global grid dims %zu %zu %zu", - set->name, cellMapper->globalGridDimsX, cellMapper->globalGridDimsY, cellMapper->globalGridDimsZ); - - const int set_size_inc_halo = set->size + set->exec_size + set->nonexec_size; - if (set_size_inc_halo <= 0) { - opp_printf("APP", "Error... set_size_inc_halo <= 0 for set %s", set->name); - opp_abort("Error... 
APP set_size_inc_halo <= 0"); - } - - - std::map removed_coords; - const opp_point& min_glb_coords = boundingBox->getGlobalMin(); - const opp_point& maxCoordinate = boundingBox->getLocalMax(); // required for GET_VERT define - opp_move_hop_iter_one_flag = true; - - // lambda function for dh mesh search loop - auto all_cell_checker = [&](const opp_point& point, int& cid) { - - // we dont want to change the original arrays during dh mesh generation, hence duplicate except OPP_READ - OPP_REAL arg1_temp[4]; - - for (int ci = 0; ci < set->size; ++ci) { - opp_move_status_flag = OPP_NEED_MOVE; - - int temp_ci = ci; // we dont want to get iterating ci changed within the kernel, hence get a copy - - opp_p2c = &(temp_ci); - opp_c2c = &((c2c_map->map)[temp_ci * 4]); - - - opp_k4::move_kernel( - (const OPP_REAL*)&point, - arg1_temp, // p_lc| OPP_WRITE - (const OPP_REAL *)args[2].data + (temp_ci * 1), // c_volume| OPP_READ - (const OPP_REAL *)args[3].data + (temp_ci * 16) // c_det| OPP_READ - ); - if (opp_move_status_flag == OPP_MOVE_DONE) { - cid = temp_ci; - break; - } - } - }; - - cellMapper->createStructMeshMappingArrays(); - - // Step 1 : Get centroids of the structured mesh cells and try to relate them to unstructured mesh indices - if (OPP_rank == 0) opp_printf("APP", "gen_dh_structured_mesh Step 1 Start"); - opp_profiler->start("Setup_Mover_s1"); - double x = 0.0, y = 0.0, z = 0.0; - - for (size_t dz = cellMapper->localGridStart.z; dz < cellMapper->localGridEnd.z; dz++) { - z = min_glb_coords.z + dz * cellMapper->gridSpacing; - for (size_t dy = cellMapper->localGridStart.y; dy < cellMapper->localGridEnd.y; dy++) { - y = min_glb_coords.y + dy * cellMapper->gridSpacing; - for (size_t dx = cellMapper->localGridStart.x; dx < cellMapper->localGridEnd.x; dx++) { - x = min_glb_coords.x + dx * cellMapper->gridSpacing; - - size_t index = (dx + dy * cellMapper->globalGridDimsX + dz * cellMapper->globalGridDimsXY); - - const opp_point centroid = 
cellMapper->getCentroidOfBox(opp_point(x, y ,z)); - int cid = MAX_CELL_INDEX; - - all_cell_checker(centroid, cid); // Find in which cell this centroid lies - - if (cid == MAX_CELL_INDEX) { - removed_coords.insert(std::make_pair(index, opp_point(x, y ,z))); - } - else if (cid < set->size) { // write only if the structured cell belong to the current MPI rank - cellMapper->enrichStructuredMesh(index, ((int*)c_gbl_id->data)[cid], OPP_rank); - } - } - } - } - opp_profiler->end("Setup_Mover_s1"); - - // Step 2 : For MPI, get the inter-node values reduced to the structured mesh - if (OPP_rank == 0) opp_printf("APP", "gen_dh_structured_mesh Step 2 Start"); - opp_profiler->start("Setup_Mover_s2"); -#ifdef USE_MPI - cellMapper->reduceInterNodeMappings(1); - - // The marked structured cells from this rank might be filled by another rank, so if already filled, - // no need to recalculate from current rank - for (auto it = removed_coords.begin(); it != removed_coords.end(); ) { - size_t removed_idx = it->first; - if (cellMapper->structMeshToRankMapping[removed_idx] != MAX_CELL_INDEX) { - it = removed_coords.erase(it); // This structured index is already written by another rank - // opp_printf("APP", "index %zu already in %d", this->structMeshToRankMapping[removed_idx], removed_idx); - } - else { - ++it; - } - } - - cellMapper->waitBarrier(); -#endif - opp_profiler->end("Setup_Mover_s2"); - - // Step 3 : Iterate over NEED_REMOVE points, Check whether atleast one vertex of the structured mesh is within - // an unstructured mesh cell. 
If multiple are found, get the minimum cell index to match with MPI - if (OPP_rank == 0) opp_printf("APP", "gen_dh_structured_mesh Step 3 Start"); - opp_profiler->start("Setup_Mover_s3"); - for (auto& p : removed_coords) { - - const size_t index = p.first; - double &x = p.second.x, &y = p.second.y, &z = p.second.z; - - const double gs = cellMapper->gridSpacing; - int most_suitable_cid = MAX_CELL_INDEX, most_suitable_gbl_cid = MAX_CELL_INDEX; - - std::array vertices = { - opp_point(GET_VERT(x,x), GET_VERT(y,y), GET_VERT(z,z)), - opp_point(GET_VERT(x,x), GET_VERT(y,y+gs), GET_VERT(z,z)), - opp_point(GET_VERT(x,x+gs), GET_VERT(y,y), GET_VERT(z,z)), - opp_point(GET_VERT(x,x+gs), GET_VERT(y,y+gs), GET_VERT(z,z)), - opp_point(GET_VERT(x,x), GET_VERT(y,y), GET_VERT(z,z+gs)), - opp_point(GET_VERT(x,x), GET_VERT(y,y+gs), GET_VERT(z,z+gs)), - opp_point(GET_VERT(x,x+gs), GET_VERT(y,y), GET_VERT(z,z+gs)), - opp_point(GET_VERT(x,x+gs), GET_VERT(y,y+gs), GET_VERT(z,z+gs)), - - }; - - for (const auto& point : vertices) { - int cid = MAX_CELL_INDEX; - - all_cell_checker(point, cid); - - if ((cid != MAX_CELL_INDEX) && (cid < set->size)) { - const int gbl_cid = ((OPP_INT*)c_gbl_id->data)[cid]; - if (most_suitable_gbl_cid > gbl_cid) { - most_suitable_gbl_cid = gbl_cid; - most_suitable_cid = cid; - } - } - } - - // Allow neighbours to write on-behalf of the current rank, to reduce issues - cellMapper->lockWindows(); - const int avail_gbl_cid = cellMapper->structMeshToCellMapping[index]; - if ((most_suitable_gbl_cid != MAX_CELL_INDEX) && (most_suitable_gbl_cid < avail_gbl_cid) && - (most_suitable_cid < set->size)) { - cellMapper->enrichStructuredMesh(index, most_suitable_gbl_cid, OPP_rank); - } - cellMapper->unlockWindows(); - } - opp_profiler->end("Setup_Mover_s3"); - - // Step 4 : For MPI, get the inter-node values reduced to the structured mesh - if (OPP_rank == 0) opp_printf("APP", "gen_dh_structured_mesh Step 4 Start"); - opp_profiler->start("Setup_Mover_s4"); -#ifdef USE_MPI - 
cellMapper->reduceInterNodeMappings(2); -#endif - opp_profiler->end("Setup_Mover_s4"); - - // Step 5 : For MPI, convert the global cell coordinates to rank local coordinates for increased performance - if (OPP_rank == 0) opp_printf("APP", "gen_dh_structured_mesh Step 5 Start"); - opp_profiler->start("Setup_Mover_s5"); -#ifdef USE_MPI - cellMapper->convertToLocalMappings(c_gbl_id); -#endif - opp_profiler->end("Setup_Mover_s5"); - - if (OPP_rank == 0) opp_printf("APP", "gen_dh_structured_mesh DONE"); -} - void opp_init_direct_hop_cg(double grid_spacing, int dim, const opp_dat c_gbl_id, const opp::BoundingBox& b_box, opp_map c2c_map, opp_map p2c_map, opp_arg arg0, // p_pos | OPP_READ @@ -398,13 +227,14 @@ void opp_init_direct_hop_cg(double grid_spacing, int dim, const opp_dat c_gbl_id if (useGlobalMove) { - const int nargs = 4; + const int nargs = 5; opp_arg args[nargs]; args[0] = arg0; args[1] = arg1; args[2] = arg2; args[3] = arg3; + args[4] = opp_arg_dat(p2c_map->p2c_dat, OPP_RW); // required to make dirty or should manually make it dirty #ifdef USE_MPI opp_mpi_halo_exchanges(c_gbl_id->set, nargs, args); @@ -418,7 +248,36 @@ void opp_init_direct_hop_cg(double grid_spacing, int dim, const opp_dat c_gbl_id boundingBox = std::make_shared(b_box); cellMapper = std::make_shared(boundingBox, grid_spacing, comm); - gen_dh_structured_mesh(c_gbl_id->set, c_gbl_id, c2c_map, nargs, args); + const int c_set_size = c_gbl_id->set->size; + + // lambda function for dh mesh search loop + auto all_cell_checker = [&](const opp_point& point, int& cid) { + + // we dont want to change the original arrays during dh mesh generation, hence duplicate except OPP_READ + OPP_REAL arg1_temp[4]; + for (int ci = 0; ci < c_set_size; ++ci) { + opp_move_status_flag = OPP_NEED_MOVE; + opp_move_hop_iter_one_flag = true; + + int temp_ci = ci; // we dont want to get iterating ci changed within the kernel, hence get a copy + + opp_p2c = &(temp_ci); + opp_c2c = &((c2c_map->map)[temp_ci * 4]); + + 
opp_k4::move_kernel( + (const OPP_REAL*)&point, + arg1_temp, // p_lc| OPP_WRITE + (const OPP_REAL *)args[2].data + (temp_ci * 1), // c_volume| OPP_READ + (const OPP_REAL *)args[3].data + (temp_ci * 16) // c_det| OPP_READ + ); + if (opp_move_status_flag == OPP_MOVE_DONE) { + cid = temp_ci; + break; + } + } + }; + + cellMapper->generateStructuredMesh(c_gbl_id->set, c_gbl_id, all_cell_checker); opp_profiler->reg("GlbToLocal"); opp_profiler->reg("GblMv_Move"); diff --git a/app_fempic_cg/omp/move_kernel_loop.hpp b/app_fempic_cg/omp/move_kernel_loop.hpp index da9fde6..79ea243 100644 --- a/app_fempic_cg/omp/move_kernel_loop.hpp +++ b/app_fempic_cg/omp/move_kernel_loop.hpp @@ -247,204 +247,6 @@ void opp_particle_move__move_kernel(opp_set set, opp_map c2c_map, opp_map p2c_ma opp_profiler->end("move_kernel"); } -// ---------------------------------------------------------------------------- -inline void gen_dh_structured_mesh(opp_set set, const opp_dat c_gbl_id, opp_map c2c_map, - const int nargs, opp_arg* args) { - - if (OPP_rank == 0) - opp_printf("APP", "gen_dh_structured_mesh START cells [%s] global grid dims %zu %zu %zu", - set->name, cellMapper->globalGridDimsX, cellMapper->globalGridDimsY, cellMapper->globalGridDimsZ); - - const int set_size_inc_halo = set->size + set->exec_size + set->nonexec_size; - if (set_size_inc_halo <= 0) { - opp_printf("APP", "Error... set_size_inc_halo <= 0 for set %s", set->name); - opp_abort("Error... 
APP set_size_inc_halo <= 0"); - } - - - std::map removed_coords; - const opp_point& min_glb_coords = boundingBox->getGlobalMin(); - const opp_point& maxCoordinate = boundingBox->getLocalMax(); // required for GET_VERT define - bool opp_move_hop_iter_one_flag = true; - - // lambda function for dh mesh search loop - auto all_cell_checker = [&](const opp_point& point, int& cid) { - - // we dont want to change the original arrays during dh mesh generation, hence duplicate except OPP_READ - OPP_REAL arg1_temp[4]; - - for (int ci = 0; ci < set->size; ++ci) { - char opp_move_status_flag = OPP_NEED_MOVE; - - int temp_ci = ci; // we dont want to get iterating ci changed within the kernel, hence get a copy - - OPP_INT* opp_p2c = &(temp_ci); - OPP_INT* opp_c2c = &((c2c_map->map)[temp_ci * 4]); - - - opp_k4::move_kernel( - opp_move_status_flag, opp_move_hop_iter_one_flag, opp_c2c, opp_p2c, - (const OPP_REAL*)&point, - arg1_temp, // p_lc| OPP_WRITE - (const OPP_REAL *)args[2].data + (temp_ci * 1), // c_volume| OPP_READ - (const OPP_REAL *)args[3].data + (temp_ci * 16) // c_det| OPP_READ - ); - if (opp_move_status_flag == OPP_MOVE_DONE) { - cid = temp_ci; - break; - } - } - }; - - cellMapper->createStructMeshMappingArrays(); - - // Step 1 : Get centroids of the structured mesh cells and try to relate them to unstructured mesh indices - if (OPP_rank == 0) opp_printf("APP", "gen_dh_structured_mesh Step 1 Start"); - opp_profiler->start("Setup_Mover_s1"); - double x = 0.0, y = 0.0, z = 0.0; - - #pragma omp parallel for private(x, y, z) - for (size_t dz = cellMapper->localGridStart.z; dz < cellMapper->localGridEnd.z; dz++) { - z = min_glb_coords.z + dz * cellMapper->gridSpacing; - for (size_t dy = cellMapper->localGridStart.y; dy < cellMapper->localGridEnd.y; dy++) { - y = min_glb_coords.y + dy * cellMapper->gridSpacing; - for (size_t dx = cellMapper->localGridStart.x; dx < cellMapper->localGridEnd.x; dx++) { - x = min_glb_coords.x + dx * cellMapper->gridSpacing; - - size_t index = 
(dx + dy * cellMapper->globalGridDimsX + dz * cellMapper->globalGridDimsXY); - - const opp_point centroid = cellMapper->getCentroidOfBox(opp_point(x, y ,z)); - int cid = MAX_CELL_INDEX; - - all_cell_checker(centroid, cid); // Find in which cell this centroid lies - - if (cid == MAX_CELL_INDEX) { - #pragma omp critical - { - removed_coords.insert(std::make_pair(index, opp_point(x, y ,z))); - } - } - else if (cid < set->size) { // write only if the structured cell belong to the current MPI rank - cellMapper->enrichStructuredMesh(index, ((int*)c_gbl_id->data)[cid], OPP_rank); - } - } - } - } - opp_profiler->end("Setup_Mover_s1"); - - // Step 2 : For MPI, get the inter-node values reduced to the structured mesh - if (OPP_rank == 0) opp_printf("APP", "gen_dh_structured_mesh Step 2 Start"); - opp_profiler->start("Setup_Mover_s2"); -#ifdef USE_MPI - cellMapper->reduceInterNodeMappings(1); - - // The marked structured cells from this rank might be filled by another rank, so if already filled, - // no need to recalculate from current rank - for (auto it = removed_coords.begin(); it != removed_coords.end(); ) { - size_t removed_idx = it->first; - if (cellMapper->structMeshToRankMapping[removed_idx] != MAX_CELL_INDEX) { - it = removed_coords.erase(it); // This structured index is already written by another rank - // opp_printf("APP", "index %zu already in %d", this->structMeshToRankMapping[removed_idx], removed_idx); - } - else { - ++it; - } - } - - cellMapper->waitBarrier(); -#endif - opp_profiler->end("Setup_Mover_s2"); - - // Step 3 : Iterate over NEED_REMOVE points, Check whether atleast one vertex of the structured mesh is within - // an unstructured mesh cell. 
If multiple are found, get the minimum cell index to match with MPI - if (OPP_rank == 0) opp_printf("APP", "gen_dh_structured_mesh Step 3 Start"); - opp_profiler->start("Setup_Mover_s3"); - std::vector removed_coords_keys; - removed_coords_keys.reserve(removed_coords.size()); - for (const auto& pair : removed_coords) - removed_coords_keys.push_back(pair.first); - - std::vector>> tmp_add_per_thr; - tmp_add_per_thr.resize(opp_nthreads); - - #pragma omp parallel for - for (int thr = 0; thr < opp_nthreads; thr++) - { - const size_t start = (removed_coords_keys.size() * thr) / opp_nthreads; - const size_t finish = (removed_coords_keys.size() * (thr+1)) / opp_nthreads; - - for (size_t i = start; i < finish; i++) - { - - const size_t index = removed_coords_keys[i]; - opp_point& p = removed_coords[index]; - double &x = p.x, &y = p.y, &z = p.z; - - const double gs = cellMapper->gridSpacing; - int most_suitable_cid = MAX_CELL_INDEX, most_suitable_gbl_cid = MAX_CELL_INDEX; - - std::array vertices = { - opp_point(GET_VERT(x,x), GET_VERT(y,y), GET_VERT(z,z)), - opp_point(GET_VERT(x,x), GET_VERT(y,y+gs), GET_VERT(z,z)), - opp_point(GET_VERT(x,x+gs), GET_VERT(y,y), GET_VERT(z,z)), - opp_point(GET_VERT(x,x+gs), GET_VERT(y,y+gs), GET_VERT(z,z)), - opp_point(GET_VERT(x,x), GET_VERT(y,y), GET_VERT(z,z+gs)), - opp_point(GET_VERT(x,x), GET_VERT(y,y+gs), GET_VERT(z,z+gs)), - opp_point(GET_VERT(x,x+gs), GET_VERT(y,y), GET_VERT(z,z+gs)), - opp_point(GET_VERT(x,x+gs), GET_VERT(y,y+gs), GET_VERT(z,z+gs)), - - }; - - for (const auto& point : vertices) { - int cid = MAX_CELL_INDEX; - - all_cell_checker(point, cid); - - if ((cid != MAX_CELL_INDEX) && (cid < set->size)) { - const int gbl_cid = ((OPP_INT*)c_gbl_id->data)[cid]; - if (most_suitable_gbl_cid > gbl_cid) { - most_suitable_gbl_cid = gbl_cid; - most_suitable_cid = cid; - } - } - } - - // Allow neighbours to write on-behalf of the current rank, to reduce issues - const int avail_gbl_cid = cellMapper->structMeshToCellMapping[index]; - if 
((most_suitable_gbl_cid != MAX_CELL_INDEX) && (most_suitable_gbl_cid < avail_gbl_cid) && - (most_suitable_cid < set->size)) { tmp_add_per_thr[thr].push_back(std::make_pair(index, most_suitable_gbl_cid)); - } - } - } - - cellMapper->lockWindows(); - for (auto& thread_vec : tmp_add_per_thr) { - for (auto& thread_data : thread_vec) { - cellMapper->enrichStructuredMesh(thread_data.first, thread_data.second, OPP_rank); - } - } - cellMapper->unlockWindows(); - opp_profiler->end("Setup_Mover_s3"); - - // Step 4 : For MPI, get the inter-node values reduced to the structured mesh - if (OPP_rank == 0) opp_printf("APP", "gen_dh_structured_mesh Step 4 Start"); - opp_profiler->start("Setup_Mover_s4"); -#ifdef USE_MPI - cellMapper->reduceInterNodeMappings(2); -#endif - opp_profiler->end("Setup_Mover_s4"); - - // Step 5 : For MPI, convert the global cell coordinates to rank local coordinates for increased performance - if (OPP_rank == 0) opp_printf("APP", "gen_dh_structured_mesh Step 5 Start"); - opp_profiler->start("Setup_Mover_s5"); -#ifdef USE_MPI - cellMapper->convertToLocalMappings(c_gbl_id); -#endif - opp_profiler->end("Setup_Mover_s5"); - - if (OPP_rank == 0) opp_printf("APP", "gen_dh_structured_mesh DONE"); -} - void opp_init_direct_hop_cg(double grid_spacing, int dim, const opp_dat c_gbl_id, const opp::BoundingBox& b_box, opp_map c2c_map, opp_map p2c_map, opp_arg arg0, // p_pos | OPP_READ @@ -458,13 +260,14 @@ void opp_init_direct_hop_cg(double grid_spacing, int dim, const opp_dat c_gbl_id if (useGlobalMove) { - const int nargs = 4; + const int nargs = 5; opp_arg args[nargs]; args[0] = arg0; args[1] = arg1; args[2] = arg2; args[3] = arg3; + args[4] = opp_arg_dat(p2c_map->p2c_dat, OPP_RW); // required to make dirty or should manually make it dirty #ifdef USE_MPI opp_mpi_halo_exchanges(c_gbl_id->set, nargs, args); @@ -478,7 +281,37 @@ void opp_init_direct_hop_cg(double grid_spacing, int dim, const opp_dat c_gbl_id boundingBox = std::make_shared(b_box); cellMapper = 
std::make_shared(boundingBox, grid_spacing, comm); - gen_dh_structured_mesh(c_gbl_id->set, c_gbl_id, c2c_map, nargs, args); + const int c_set_size = c_gbl_id->set->size; + + // lambda function for dh mesh search loop + auto all_cell_checker = [&](const opp_point& point, int& cid) { + + // we dont want to change the original arrays during dh mesh generation, hence duplicate except OPP_READ + OPP_REAL arg1_temp[4]; + for (int ci = 0; ci < c_set_size; ++ci) { + char opp_move_status_flag = OPP_NEED_MOVE; + bool opp_move_hop_iter_one_flag = true; + + int temp_ci = ci; // we dont want to get iterating ci changed within the kernel, hence get a copy + + OPP_INT* opp_p2c = &(temp_ci); + OPP_INT* opp_c2c = &((c2c_map->map)[temp_ci * 4]); + + opp_k4::move_kernel( + opp_move_status_flag, opp_move_hop_iter_one_flag, opp_c2c, opp_p2c, + (const OPP_REAL*)&point, + arg1_temp, // p_lc| OPP_WRITE + (const OPP_REAL *)args[2].data + (temp_ci * 1), // c_volume| OPP_READ + (const OPP_REAL *)args[3].data + (temp_ci * 16) // c_det| OPP_READ + ); + if (opp_move_status_flag == OPP_MOVE_DONE) { + cid = temp_ci; + break; + } + } + }; + + cellMapper->generateStructuredMesh(c_gbl_id->set, c_gbl_id, all_cell_checker); opp_profiler->reg("GlbToLocal"); opp_profiler->reg("GblMv_Move"); diff --git a/app_fempic_cg/seq/move_kernel_loop.hpp b/app_fempic_cg/seq/move_kernel_loop.hpp index c24371a..b1917a1 100644 --- a/app_fempic_cg/seq/move_kernel_loop.hpp +++ b/app_fempic_cg/seq/move_kernel_loop.hpp @@ -147,177 +147,6 @@ void opp_particle_move__move_kernel(opp_set set, opp_map c2c_map, opp_map p2c_ma opp_profiler->end("move_kernel"); } -// ---------------------------------------------------------------------------- -inline void gen_dh_structured_mesh(opp_set set, const opp_dat c_gbl_id, opp_map c2c_map, - const int nargs, opp_arg* args) { - - if (OPP_rank == 0) - opp_printf("APP", "gen_dh_structured_mesh START cells [%s] global grid dims %zu %zu %zu", - set->name, cellMapper->globalGridDimsX, 
cellMapper->globalGridDimsY, cellMapper->globalGridDimsZ); - - const int set_size_inc_halo = set->size + set->exec_size + set->nonexec_size; - if (set_size_inc_halo <= 0) { - opp_printf("APP", "Error... set_size_inc_halo <= 0 for set %s", set->name); - opp_abort("Error... APP set_size_inc_halo <= 0"); - } - - - std::map removed_coords; - const opp_point& min_glb_coords = boundingBox->getGlobalMin(); - const opp_point& maxCoordinate = boundingBox->getLocalMax(); // required for GET_VERT define - opp_move_hop_iter_one_flag = true; - - // lambda function for dh mesh search loop - auto all_cell_checker = [&](const opp_point& point, int& cid) { - - // we dont want to change the original arrays during dh mesh generation, hence duplicate except OPP_READ - OPP_REAL arg1_temp[4]; - - for (int ci = 0; ci < set->size; ++ci) { - opp_move_status_flag = OPP_NEED_MOVE; - - int temp_ci = ci; // we dont want to get iterating ci changed within the kernel, hence get a copy - - opp_p2c = &(temp_ci); - opp_c2c = &((c2c_map->map)[temp_ci * 4]); - - - opp_k4::move_kernel( - (const OPP_REAL*)&point, - arg1_temp, // p_lc| OPP_WRITE - (const OPP_REAL *)args[2].data + (temp_ci * 1), // c_volume| OPP_READ - (const OPP_REAL *)args[3].data + (temp_ci * 16) // c_det| OPP_READ - ); - if (opp_move_status_flag == OPP_MOVE_DONE) { - cid = temp_ci; - break; - } - } - }; - - cellMapper->createStructMeshMappingArrays(); - - // Step 1 : Get centroids of the structured mesh cells and try to relate them to unstructured mesh indices - if (OPP_rank == 0) opp_printf("APP", "gen_dh_structured_mesh Step 1 Start"); - opp_profiler->start("Setup_Mover_s1"); - double x = 0.0, y = 0.0, z = 0.0; - - for (size_t dz = cellMapper->localGridStart.z; dz < cellMapper->localGridEnd.z; dz++) { - z = min_glb_coords.z + dz * cellMapper->gridSpacing; - for (size_t dy = cellMapper->localGridStart.y; dy < cellMapper->localGridEnd.y; dy++) { - y = min_glb_coords.y + dy * cellMapper->gridSpacing; - for (size_t dx = 
cellMapper->localGridStart.x; dx < cellMapper->localGridEnd.x; dx++) { - x = min_glb_coords.x + dx * cellMapper->gridSpacing; - - size_t index = (dx + dy * cellMapper->globalGridDimsX + dz * cellMapper->globalGridDimsXY); - - const opp_point centroid = cellMapper->getCentroidOfBox(opp_point(x, y ,z)); - int cid = MAX_CELL_INDEX; - - all_cell_checker(centroid, cid); // Find in which cell this centroid lies - - if (cid == MAX_CELL_INDEX) { - removed_coords.insert(std::make_pair(index, opp_point(x, y ,z))); - } - else if (cid < set->size) { // write only if the structured cell belong to the current MPI rank - cellMapper->enrichStructuredMesh(index, ((int*)c_gbl_id->data)[cid], OPP_rank); - } - } - } - } - opp_profiler->end("Setup_Mover_s1"); - - // Step 2 : For MPI, get the inter-node values reduced to the structured mesh - if (OPP_rank == 0) opp_printf("APP", "gen_dh_structured_mesh Step 2 Start"); - opp_profiler->start("Setup_Mover_s2"); -#ifdef USE_MPI - cellMapper->reduceInterNodeMappings(1); - - // The marked structured cells from this rank might be filled by another rank, so if already filled, - // no need to recalculate from current rank - for (auto it = removed_coords.begin(); it != removed_coords.end(); ) { - size_t removed_idx = it->first; - if (cellMapper->structMeshToRankMapping[removed_idx] != MAX_CELL_INDEX) { - it = removed_coords.erase(it); // This structured index is already written by another rank - // opp_printf("APP", "index %zu already in %d", this->structMeshToRankMapping[removed_idx], removed_idx); - } - else { - ++it; - } - } - - cellMapper->waitBarrier(); -#endif - opp_profiler->end("Setup_Mover_s2"); - - // Step 3 : Iterate over NEED_REMOVE points, Check whether atleast one vertex of the structured mesh is within - // an unstructured mesh cell. 
If multiple are found, get the minimum cell index to match with MPI - if (OPP_rank == 0) opp_printf("APP", "gen_dh_structured_mesh Step 3 Start"); - opp_profiler->start("Setup_Mover_s3"); - for (auto& p : removed_coords) { - - const size_t index = p.first; - double &x = p.second.x, &y = p.second.y, &z = p.second.z; - - const double gs = cellMapper->gridSpacing; - int most_suitable_cid = MAX_CELL_INDEX, most_suitable_gbl_cid = MAX_CELL_INDEX; - - std::array vertices = { - opp_point(GET_VERT(x,x), GET_VERT(y,y), GET_VERT(z,z)), - opp_point(GET_VERT(x,x), GET_VERT(y,y+gs), GET_VERT(z,z)), - opp_point(GET_VERT(x,x+gs), GET_VERT(y,y), GET_VERT(z,z)), - opp_point(GET_VERT(x,x+gs), GET_VERT(y,y+gs), GET_VERT(z,z)), - opp_point(GET_VERT(x,x), GET_VERT(y,y), GET_VERT(z,z+gs)), - opp_point(GET_VERT(x,x), GET_VERT(y,y+gs), GET_VERT(z,z+gs)), - opp_point(GET_VERT(x,x+gs), GET_VERT(y,y), GET_VERT(z,z+gs)), - opp_point(GET_VERT(x,x+gs), GET_VERT(y,y+gs), GET_VERT(z,z+gs)), - - }; - - for (const auto& point : vertices) { - int cid = MAX_CELL_INDEX; - - all_cell_checker(point, cid); - - if ((cid != MAX_CELL_INDEX) && (cid < set->size)) { - const int gbl_cid = ((OPP_INT*)c_gbl_id->data)[cid]; - if (most_suitable_gbl_cid > gbl_cid) { - most_suitable_gbl_cid = gbl_cid; - most_suitable_cid = cid; - } - } - } - - // Allow neighbours to write on-behalf of the current rank, to reduce issues - cellMapper->lockWindows(); - const int avail_gbl_cid = cellMapper->structMeshToCellMapping[index]; - if ((most_suitable_gbl_cid != MAX_CELL_INDEX) && (most_suitable_gbl_cid < avail_gbl_cid) && - (most_suitable_cid < set->size)) { - cellMapper->enrichStructuredMesh(index, most_suitable_gbl_cid, OPP_rank); - } - cellMapper->unlockWindows(); - } - opp_profiler->end("Setup_Mover_s3"); - - // Step 4 : For MPI, get the inter-node values reduced to the structured mesh - if (OPP_rank == 0) opp_printf("APP", "gen_dh_structured_mesh Step 4 Start"); - opp_profiler->start("Setup_Mover_s4"); -#ifdef USE_MPI - 
cellMapper->reduceInterNodeMappings(2); -#endif - opp_profiler->end("Setup_Mover_s4"); - - // Step 5 : For MPI, convert the global cell coordinates to rank local coordinates for increased performance - if (OPP_rank == 0) opp_printf("APP", "gen_dh_structured_mesh Step 5 Start"); - opp_profiler->start("Setup_Mover_s5"); -#ifdef USE_MPI - cellMapper->convertToLocalMappings(c_gbl_id); -#endif - opp_profiler->end("Setup_Mover_s5"); - - if (OPP_rank == 0) opp_printf("APP", "gen_dh_structured_mesh DONE"); -} - void opp_init_direct_hop_cg(double grid_spacing, int dim, const opp_dat c_gbl_id, const opp::BoundingBox& b_box, opp_map c2c_map, opp_map p2c_map, opp_arg arg0, // p_pos | OPP_READ @@ -331,13 +160,14 @@ void opp_init_direct_hop_cg(double grid_spacing, int dim, const opp_dat c_gbl_id if (useGlobalMove) { - const int nargs = 4; + const int nargs = 5; opp_arg args[nargs]; args[0] = arg0; args[1] = arg1; args[2] = arg2; args[3] = arg3; + args[4] = opp_arg_dat(p2c_map->p2c_dat, OPP_RW); // required to make dirty or should manually make it dirty #ifdef USE_MPI opp_mpi_halo_exchanges(c_gbl_id->set, nargs, args); @@ -351,7 +181,36 @@ void opp_init_direct_hop_cg(double grid_spacing, int dim, const opp_dat c_gbl_id boundingBox = std::make_shared(b_box); cellMapper = std::make_shared(boundingBox, grid_spacing, comm); - gen_dh_structured_mesh(c_gbl_id->set, c_gbl_id, c2c_map, nargs, args); + const int c_set_size = c_gbl_id->set->size; + + // lambda function for dh mesh search loop + auto all_cell_checker = [&](const opp_point& point, int& cid) { + + // we dont want to change the original arrays during dh mesh generation, hence duplicate except OPP_READ + OPP_REAL arg1_temp[4]; + for (int ci = 0; ci < c_set_size; ++ci) { + opp_move_status_flag = OPP_NEED_MOVE; + opp_move_hop_iter_one_flag = true; + + int temp_ci = ci; // we dont want to get iterating ci changed within the kernel, hence get a copy + + opp_p2c = &(temp_ci); + opp_c2c = &((c2c_map->map)[temp_ci * 4]); + + 
opp_k4::move_kernel( + (const OPP_REAL*)&point, + arg1_temp, // p_lc| OPP_WRITE + (const OPP_REAL *)args[2].data + (temp_ci * 1), // c_volume| OPP_READ + (const OPP_REAL *)args[3].data + (temp_ci * 16) // c_det| OPP_READ + ); + if (opp_move_status_flag == OPP_MOVE_DONE) { + cid = temp_ci; + break; + } + } + }; + + cellMapper->generateStructuredMesh(c_gbl_id->set, c_gbl_id, all_cell_checker); opp_profiler->reg("GlbToLocal"); opp_profiler->reg("GblMv_Move"); diff --git a/app_fempic_cg/sycl/calculate_new_pos_vel_kernel_loop.hpp b/app_fempic_cg/sycl/calculate_new_pos_vel_kernel_loop.hpp index cb78d78..59d63d2 100644 --- a/app_fempic_cg/sycl/calculate_new_pos_vel_kernel_loop.hpp +++ b/app_fempic_cg/sycl/calculate_new_pos_vel_kernel_loop.hpp @@ -52,8 +52,8 @@ void opp_par_loop_all__calculate_new_pos_vel_kernel(opp_set set, opp_iterate_typ const OPP_INT* opp_k3_dat1_stride_sycl = opp_k3_dat1_stride_s; const OPP_INT* opp_k3_dat2_stride_sycl = opp_k3_dat2_stride_s; - const OPP_REAL* CONST_mass_sycl = CONST_mass_s; const OPP_REAL* CONST_dt_sycl = CONST_dt_s; + const OPP_REAL* CONST_mass_sycl = CONST_mass_s; const OPP_REAL* CONST_charge_sycl = CONST_charge_s; OPP_REAL* dat0_sycl = (OPP_REAL*)args[0].data_d; // c_ef diff --git a/app_fempic_cg/sycl/inject_ions_kernel_loop.hpp b/app_fempic_cg/sycl/inject_ions_kernel_loop.hpp index e7e96a6..d1fba3a 100644 --- a/app_fempic_cg/sycl/inject_ions_kernel_loop.hpp +++ b/app_fempic_cg/sycl/inject_ions_kernel_loop.hpp @@ -99,9 +99,9 @@ void opp_par_loop_injected__inject_ions_kernel(opp_set set, opp_iterate_type, const OPP_INT* opp_k2_dat9_stride_sycl = opp_k2_dat9_stride_s; const OPP_INT* opp_k2_map0_stride_sycl = opp_k2_map0_stride_s; + const OPP_REAL* CONST_dt_sycl = CONST_dt_s; const OPP_REAL* CONST_ion_velocity_sycl = CONST_ion_velocity_s; const OPP_REAL* CONST_mass_sycl = CONST_mass_s; - const OPP_REAL* CONST_dt_sycl = CONST_dt_s; const OPP_REAL* CONST_charge_sycl = CONST_charge_s; OPP_REAL* dat0_sycl = (OPP_REAL*)args[0].data_d; // 
p_pos diff --git a/opp_lib/include/device_kernels/cuda_inline_kernels.h b/opp_lib/include/device_kernels/cuda_inline_kernels.h index 30af35b..6c2338d 100644 --- a/opp_lib/include/device_kernels/cuda_inline_kernels.h +++ b/opp_lib/include/device_kernels/cuda_inline_kernels.h @@ -211,6 +211,25 @@ __inline__ __device__ size_t opp_dev_findStructuredCellIndex2D(const OPP_REAL* p return (index >= globalGridSize[0]) ? MAX_CELL_INDEX : index; } +//******************************************************************************* +__inline__ __device__ size_t opp_dev_findStructuredCellIndex3D(const OPP_REAL* pos, + const OPP_REAL* oneOverGridSpace, const OPP_REAL* minGlbCoordinate, + const size_t* globalGridDims, const size_t* globalGridSize) +{ + // Round to the nearest integer to minimize rounding errors + const size_t xIndex = (size_t)((pos[0 * cellMapper_pos_stride_d] - minGlbCoordinate[0]) * + oneOverGridSpace[0]); + const size_t yIndex = (size_t)((pos[1 * cellMapper_pos_stride_d] - minGlbCoordinate[1]) * + oneOverGridSpace[0]); + const size_t zIndex = (size_t)((pos[2 * cellMapper_pos_stride_d] - minGlbCoordinate[2]) * + oneOverGridSpace[0]); + + // Calculate the cell index mapping index + const size_t index = xIndex + (yIndex * globalGridDims[0]) + (zIndex * globalGridDims[3]); + + return (index >= globalGridSize[0]) ? 
MAX_CELL_INDEX : index; +} + //******************************************************************************* __inline__ __device__ void opp_dev_remove_dh_particle(OPP_INT& p2c, const OPP_INT part_idx, OPP_INT *__restrict__ remove_count, OPP_INT *__restrict__ remove_part_indices) @@ -231,6 +250,64 @@ __inline__ __device__ void opp_dev_move_dh_particle(const OPP_INT part_idx, cons arr_cid[moveArrayIndex] = cid; } +//******************************************************************************* +__inline__ __device__ void opp_dev_checkForGlobalMove( + const size_t struct_cell_id, + const int part_idx, + OPP_INT* __restrict__ p2c_map, + const OPP_INT* __restrict__ struct_mesh_to_cell_map, + const OPP_INT* __restrict__ struct_mesh_to_rank_map, + OPP_INT* __restrict__ remove_count, + OPP_INT* __restrict__ remove_part_indices, + OPP_INT* __restrict__ move_part_indices, + OPP_INT* __restrict__ move_cell_indices, + OPP_INT* __restrict__ move_rank_indices, + OPP_INT* __restrict__ move_count) +{ + if (struct_cell_id == MAX_CELL_INDEX) { // This happens when point is out of the unstructured mesh + opp_dev_remove_dh_particle(p2c_map[part_idx], part_idx, remove_count, remove_part_indices); + return; + } + +#ifdef USE_MPI + const OPP_INT struct_cell_rank = struct_mesh_to_rank_map[struct_cell_id]; + + // Check whether the particles need global moving, if yes start global moving process, + // if no, move to the closest local cell + if (struct_cell_rank != OPP_rank_d) { + + if (struct_cell_rank == MAX_CELL_INDEX) { + opp_dev_remove_dh_particle(p2c_map[part_idx], part_idx, remove_count, remove_part_indices); + return; + } + + // Due to renumbering local cell indices will be different to global, hence do global comm with global indices + const OPP_INT unstruct_cell_id = struct_mesh_to_cell_map[struct_cell_id]; + + if (unstruct_cell_id == MAX_CELL_INDEX) { + opp_dev_remove_dh_particle(p2c_map[part_idx], part_idx, remove_count, remove_part_indices); + return; + } + + // if the new
rank is not the current rank, mark the particle to be sent via global comm + opp_dev_move_dh_particle(part_idx, struct_cell_rank, unstruct_cell_id, move_count, + move_part_indices, move_cell_indices, move_rank_indices); + + // remove the dh moved particle from the current rank + opp_dev_remove_dh_particle(p2c_map[part_idx], part_idx, remove_count, remove_part_indices); + return; + } + else +#endif + { + p2c_map[part_idx] = struct_mesh_to_cell_map[struct_cell_id]; + if (p2c_map[part_idx] == MAX_CELL_INDEX) { // Particle is outside the mesh, need to remove + opp_dev_remove_dh_particle(p2c_map[part_idx], part_idx, remove_count, remove_part_indices); + return; + } + } +} + //******************************************************************************* __global__ void opp_dev_checkForGlobalMove2D_kernel( const OPP_REAL* __restrict__ p_pos, @@ -254,53 +331,45 @@ __global__ void opp_dev_checkForGlobalMove2D_kernel( const int part_idx = thread_id + start; if (part_idx < end) { - const size_t struct_cell_id = opp_dev_findStructuredCellIndex2D(p_pos + part_idx, - oneOverGridSpacing, minGlbCoordinate, - globalGridDims, globalGridSize); + oneOverGridSpacing, minGlbCoordinate, + globalGridDims, globalGridSize); - if (struct_cell_id == MAX_CELL_INDEX) { // This happens when point is out of the unstructured mesh - - opp_dev_remove_dh_particle(p2c_map[part_idx], part_idx, remove_count, remove_part_indices); - return; - } - -#ifdef USE_MPI - const OPP_INT struct_cell_rank = struct_mesh_to_rank_map[struct_cell_id]; - - // Check whether the paticles need global moving, if yes start global moving process, - // if no, move to the closest local cell - if (struct_cell_rank != OPP_rank_d) { - - if (struct_cell_rank == MAX_CELL_INDEX) { - opp_dev_remove_dh_particle(p2c_map[part_idx], part_idx, remove_count, remove_part_indices); - return; - } - - // Due to renumbering local cell indices will be different to global, hence do global comm with global indices - const OPP_INT unstruct_cell_id 
= struct_mesh_to_cell_map[struct_cell_id]; + opp_dev_checkForGlobalMove(struct_cell_id, part_idx, p2c_map, struct_mesh_to_cell_map, + struct_mesh_to_rank_map, remove_count, remove_part_indices, move_part_indices, + move_cell_indices, move_rank_indices, move_count); + } +} - if (unstruct_cell_id == MAX_CELL_INDEX) { - opp_dev_remove_dh_particle(p2c_map[part_idx], part_idx, remove_count, remove_part_indices); - return; - } +//******************************************************************************* +__global__ void opp_dev_checkForGlobalMove3D_kernel( + const OPP_REAL* __restrict__ p_pos, + OPP_INT* __restrict__ p2c_map, + const OPP_INT* __restrict__ struct_mesh_to_cell_map, + const OPP_INT* __restrict__ struct_mesh_to_rank_map, + const OPP_REAL* __restrict__ oneOverGridSpacing, + const OPP_REAL* __restrict__ minGlbCoordinate, + const size_t* __restrict__ globalGridDims, + const size_t* __restrict__ globalGridSize, + OPP_INT* __restrict__ remove_count, + OPP_INT* __restrict__ remove_part_indices, + OPP_INT* __restrict__ move_part_indices, + OPP_INT* __restrict__ move_cell_indices, + OPP_INT* __restrict__ move_rank_indices, + OPP_INT* __restrict__ move_count, + const OPP_INT start, + const OPP_INT end) +{ + const int thread_id = threadIdx.x + blockIdx.x * blockDim.x; + const int part_idx = thread_id + start; - // if the new rank is not the current rank, mark the particle to be sent via global comm - opp_dev_move_dh_particle(part_idx, struct_cell_rank, unstruct_cell_id, move_count, - move_part_indices, move_cell_indices, move_rank_indices); - - // remove the dh moved particle from the current rank - opp_dev_remove_dh_particle(p2c_map[part_idx], part_idx, remove_count, remove_part_indices); - return; - } - else -#endif - { - p2c_map[part_idx] = struct_mesh_to_cell_map[struct_cell_id]; - if (p2c_map[part_idx] == MAX_CELL_INDEX) { // Particle is outside the mesh, need to remove - opp_dev_remove_dh_particle(p2c_map[part_idx], part_idx, remove_count, 
remove_part_indices); - return; - } - } + if (part_idx < end) { + const size_t struct_cell_id = opp_dev_findStructuredCellIndex3D(p_pos + part_idx, + oneOverGridSpacing, minGlbCoordinate, + globalGridDims, globalGridSize); + + opp_dev_checkForGlobalMove(struct_cell_id, part_idx, p2c_map, struct_mesh_to_cell_map, + struct_mesh_to_rank_map, remove_count, remove_part_indices, move_part_indices, + move_cell_indices, move_rank_indices, move_count); } -} +} \ No newline at end of file diff --git a/opp_lib/src/cuda/opp_direct_hop_cuda.cu b/opp_lib/src/cuda/opp_direct_hop_cuda.cu index 2f7a611..8bcac08 100644 --- a/opp_lib/src/cuda/opp_direct_hop_cuda.cu +++ b/opp_lib/src/cuda/opp_direct_hop_cuda.cu @@ -32,6 +32,10 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "opp_direct_hop_core.h" #include +#ifdef USE_OMP +#include +#endif + //******************************************************************************* using namespace opp; @@ -465,15 +469,23 @@ void CellMapper::generateStructuredMesh(opp_set set, const opp_dat c_gbl_id, createStructMeshMappingArrays(); +#ifdef USE_OMP + const int omp_nthreads = omp_get_max_threads(); + opp_printf("OPP", "generateStructuredMesh omp_nthreads %d", omp_nthreads); +#endif + // Step 1 : Get centroids of the structured mesh cells and try to relate them to unstructured mesh indices if (OPP_rank == 0) opp_printf("OPP", "generateStructuredMesh Step 1 Start"); opp_profiler->start("Setup_Mover_s1"); double x = 0.0, y = 0.0, z = 0.0; - + +#ifdef USE_OMP + #pragma omp parallel for private(x,y,z) num_threads(6) +#endif for (size_t dz = localGridStart.z; dz < localGridEnd.z; dz++) { z = min_glb_coords.z + dz * gridSpacing; for (size_t dy = localGridStart.y; dy < localGridEnd.y; dy++) { - y = min_glb_coords.y + dy * gridSpacing; + y = min_glb_coords.y + dy * gridSpacing; for (size_t dx = localGridStart.x; dx < localGridEnd.x; dx++) { x = min_glb_coords.x + dx * gridSpacing; @@ -485,7 +497,14 @@ void 
CellMapper::generateStructuredMesh(opp_set set, const opp_dat c_gbl_id, all_cell_checker(centroid, cid); // Find in which cell this centroid lies if (cid == MAX_CELL_INDEX) { +#ifdef USE_OMP + #pragma omp critical + { + removed_coords.insert(std::make_pair(index, opp_point(x, y ,z))); + } +#else removed_coords.insert(std::make_pair(index, opp_point(x, y ,z))); +#endif } else if (cid < set->size) { // write only if the structured cell belong to the current MPI rank enrichStructuredMesh(index, ((int*)c_gbl_id->data)[cid], OPP_rank); @@ -522,10 +541,33 @@ void CellMapper::generateStructuredMesh(opp_set set, const opp_dat c_gbl_id, // an unstructured mesh cell. If multiple are found, get the minimum cell index to match with MPI if (OPP_rank == 0) opp_printf("OPP", "generateStructuredMesh Step 3 Start"); opp_profiler->start("Setup_Mover_s3"); +#ifdef USE_OMP + std::vector removed_coords_keys; + removed_coords_keys.reserve(removed_coords.size()); + for (const auto& pair : removed_coords) + removed_coords_keys.push_back(pair.first); + + std::vector>> tmp_add_per_thr; + tmp_add_per_thr.resize(omp_nthreads); + + #pragma omp parallel for num_threads(6) + for (int thr = 0; thr < omp_nthreads; thr++) + { + const size_t start = (removed_coords_keys.size() * thr) / omp_nthreads; + const size_t finish = (removed_coords_keys.size() * (thr+1)) / omp_nthreads; + + for (size_t i = start; i < finish; i++) + { + + const size_t index = removed_coords_keys[i]; + opp_point& p = removed_coords[index]; + double &x = p.x, &y = p.y, &z = p.z; +#else for (auto& p : removed_coords) { const size_t index = p.first; double &x = p.second.x, &y = p.second.y, &z = p.second.z; +#endif const double gs = gridSpacing; int most_suitable_cid = MAX_CELL_INDEX, most_suitable_gbl_cid = MAX_CELL_INDEX; @@ -552,13 +594,23 @@ void CellMapper::generateStructuredMesh(opp_set set, const opp_dat c_gbl_id, } // Allow neighbours to write on-behalf of the current rank, to reduce issues +#ifndef USE_OMP lockWindows(); 
+#endif const int avail_gbl_cid = structMeshToCellMapping[index]; if ((most_suitable_gbl_cid != MAX_CELL_INDEX) && (most_suitable_gbl_cid < avail_gbl_cid) && - (most_suitable_cid < set->size)) { - enrichStructuredMesh(index, most_suitable_gbl_cid, OPP_rank); + (most_suitable_cid < set->size)) { +#ifdef USE_OMP + tmp_add_per_thr[thr].push_back(std::make_pair(index, most_suitable_gbl_cid)); +#else + enrichStructuredMesh(index, most_suitable_gbl_cid, OPP_rank); +#endif } +#ifndef USE_OMP unlockWindows(); +#else + } +#endif } opp_profiler->end("Setup_Mover_s3"); @@ -622,7 +674,9 @@ void opp_init_dh_device(opp_set set) if (dh_indices_d.move_count == nullptr) dh_indices_d.move_count = opp_mem::dev_malloc(1); -if (OPP_DBG) opp_printf("OPP", "opp_init_dh_device JJJJJ %d %d", dh_indices_h.capacity, dh_indices_d.capacity); + + if (OPP_DBG) opp_printf("OPP", "opp_init_dh_device capacities %d %d", + dh_indices_h.capacity, dh_indices_d.capacity); } *(dh_indices_h.move_count) = 0; @@ -631,6 +685,7 @@ if (OPP_DBG) opp_printf("OPP", "opp_init_dh_device JJJJJ %d %d", dh_indices_h.ca if (OPP_DBG) opp_printf("OPP", "opp_init_dh_device END"); } +#ifdef USE_MPI //******************************************************************************* // gathers all global move information into the global mover for communication void opp_gather_dh_move_indices(opp_set set) @@ -840,3 +895,4 @@ void dh_particle_packer_gpu::unpack(opp_set set, const std::mapfrom->size + c2c_map->from->exec_size + c2c_map->from->nonexec_size; opp_mem::dev_copy_to_symbol(OPP_cells_set_size_d, &OPP_cells_set_size, &(set->cells_set->size), 1); - opp_mem::dev_copy_to_symbol(opp_k2_c2c_map_stride_d, &opp_k2_c2c_map_stride, &c2c_stride, 1); + opp_mem::dev_copy_to_symbol(opp_k{{kernel_idx}}_c2c_map_stride_d, &opp_k{{kernel_idx}}_c2c_map_stride, &c2c_stride, 1); opp_mpi_halo_wait_all(nargs, args); {% for arg in lh.args|gbl %} diff --git a/scripts/source/pan_oneapi2021.3 b/scripts/source/pan_oneapi2021.3 index 
3dc36a7..f501373 100644 --- a/scripts/source/pan_oneapi2021.3 +++ b/scripts/source/pan_oneapi2021.3 @@ -1,5 +1,7 @@ #!/bin/bash +# salloc -p v100 --time 1:00:00 + module purge export OPP_COMPILER=intel @@ -56,7 +58,8 @@ elif [ "$(hostname)" == 'demos' ]; then export CUDA_VISIBLE_DEVICES=0 echo $NV_ARCH $(hostname) unset CUDA_HOME - + export CUDA_ARCH=90 + export PETSC_INSTALL_PATH=/home/zl/lib_install/petsc-3.20.5-nvhpc23.7-mpi_demos else echo The machine not configured, PETSC_INSTALL_PATH and some other variables may be missing! @@ -73,6 +76,8 @@ if [ "$(hostname)" != 'demos' ]; then echo "Source intel compilers" source /opt/intel/oneapi/2021.3/setvars.sh + export CPPFLAGS_ADD="-fp-model precise" + # source /opt/intel/oneapi/2024.0/oneapi-vars.sh export MPI_INSTALL_PATH=/opt/intel/oneapi/2021.3/mpi/latest diff --git a/scripts/source/viking_gcc12.2 b/scripts/source/viking_gcc12.2 new file mode 100644 index 0000000..ba609b0 --- /dev/null +++ b/scripts/source/viking_gcc12.2 @@ -0,0 +1,41 @@ +#!/bin/bash + +module purge + +module load GCC/12.2.0 +module load OpenMPI/4.1.4-GCC-12.2.0 +module load PETSc/3.19.2-foss-2022b +module load CUDA/12.2.2 +module load HDF5/1.14.0-gompi-2022b + +export NVCCFLAGS_ADD='-gencode arch=compute_90,code=sm_90' + +export OPP=$HOME/phd/OP-PIC +export OPP_INSTALL_PATH=$OPP/opp_install +export OPP_TRANSLATOR=$OPP/opp_translator/translator +export OPP_PATH=$OPP/opp_lib + +export CC_COMPILER=g++ +export MPI_COMPILER=mpicxx + +source $OPP/opp_translator/opp_venv/bin/activate + + +alias nv='nvidia-smi -l 1' +alias l='cd $HOME/phd/OP-PIC/opp_lib' +alias f='cd $HOME/phd/OP-PIC/app_fempic' +alias fc='cd $HOME/phd/OP-PIC/app_fempic_cg' +alias ac='cd $HOME/phd/OP-PIC/app_neso_advection_cg' +alias a='cd $HOME/phd/OP-PIC/app_neso_advection' +alias c='cd $HOME/phd/OP-PIC/app_cabanapic' +alias t='cd $HOME/phd/OP-PIC/opp_translator' + + +# export CUDA_INSTALL_PATH=/opt/apps/eb/software/NVHPC/23.7-CUDA-12.1.1/Linux_x86_64/23.7/cuda +# export 
CUDA_INSTALL_PATH=/opt/apps/eb/software/CUDA/12.2.2 +# export NVCCFLAGS_ADD='-gencode arch=compute_90,code=sm_90' +# export PETSC_INSTALL_PATH=/opt/apps/eb/software/PETSc/3.19.2-foss-2022b +# export LD_LIBRARY_PATH=$PETSC_INSTALL_PATH/lib:$LD_LIBRARY_PATH +# module load NVHPC/23.7-CUDA-12.1.1 +# module load OpenMPI/4.1.5-NVHPC-23.7-CUDA-12.1.1 +