Skip to content

Commit

Permalink
dh, scatter-atomics, seg-red -- hip codegen done
Browse files Browse the repository at this point in the history
  • Loading branch information
ZamanLantra committed Feb 3, 2025
1 parent 4ffeefb commit 3eb8519
Show file tree
Hide file tree
Showing 51 changed files with 957 additions and 567 deletions.
5 changes: 3 additions & 2 deletions app_cabanapic/configs/cabana.param
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@ STRING cluster = block
# System parameters
BOOL opp_auto_sort = false
REAL opp_allocation_multiple = 1.1
INT opp_threads_per_block = 256
INT opp_threads_per_block = 512
BOOL opp_profile_all = false
BOOL print_final = true
INT opp_gpu_direct = 0
Expand All @@ -28,7 +28,8 @@ STRING opp_fill = HoleFill_All
INT opp_fill_period = 4

BOOL opp_global_move = false
BOOL use_reg_red = false
BOOL opp_segmented_red = false
INT gpu_reduction_arrays = 16

STRING hdf_filename = /home/zl/phd/neso_test/OP-PIC_Configs/cabana/mesh_files/cab_48000.hdf5
INT domain_expansion = 1
Expand Down
2 changes: 1 addition & 1 deletion app_cabanapic_cg/cabana_opp.cpp
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@

// Auto-generated at 2025-01-21 14:53:57.354254 by opp-translator
// Auto-generated at 2025-02-03 16:39:08.296851 by opp-translator
/*
BSD 3-Clause License
Expand Down
12 changes: 6 additions & 6 deletions app_cabanapic_cg/cuda/interpolate_mesh_fields_kernel_loop.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,12 @@ __constant__ OPP_INT opp_k1_map0_stride_d;


namespace opp_k1 {
enum Dim {
x = 0,
y = 1,
z = 2,
};

enum CellInterp {
ex = 0,
dexdy,
Expand All @@ -39,12 +45,6 @@ enum CellInterp {
dcbzdz,
};

enum Dim {
x = 0,
y = 1,
z = 2,
};

__device__ inline void interpolate_mesh_fields_kernel(
const double* cell0_e,
const double* cell0_b,
Expand Down
2 changes: 1 addition & 1 deletion app_cabanapic_cg/hip/compute_energy_kernel_loop.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -142,10 +142,10 @@ void opp_par_loop_all__compute_energy_kernel(opp_set set,
for (int d = 0; d < 1; ++d)
arg2_host_data[d] += ((OPP_REAL *)args[2].data)[b * 1 + d];
}

args[2].data = (char *)arg2_host_data;
opp_mpi_reduce(&args[2], arg2_host_data);


opp_set_dirtybit_grouped(nargs, args, Device_GPU);
OPP_DEVICE_SYNCHRONIZE();

Expand Down
2 changes: 1 addition & 1 deletion app_cabanapic_cg/hip/get_max_x_values_kernel_loop.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -202,7 +202,6 @@ void opp_par_loop_all__get_max_x_values_kernel(opp_set set,
for (int d = 0; d < 1; ++d)
arg5_host_data[d] = MAX(arg5_host_data[d], ((OPP_REAL *)args[5].data)[b * 1 + d]);
}

args[1].data = (char *)arg1_host_data;
opp_mpi_reduce(&args[1], arg1_host_data);

Expand All @@ -212,6 +211,7 @@ void opp_par_loop_all__get_max_x_values_kernel(opp_set set,
args[5].data = (char *)arg5_host_data;
opp_mpi_reduce(&args[5], arg5_host_data);


opp_set_dirtybit_grouped(nargs, args, Device_GPU);
OPP_DEVICE_SYNCHRONIZE();

Expand Down
76 changes: 39 additions & 37 deletions app_cabanapic_cg/hip/move_deposit_kernel_loop.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -38,33 +38,6 @@ enum CellAcc {
jfz = 2 * 4,
};

enum CellInterp {
ex = 0,
dexdy,
dexdz,
d2exdydz,
ey,
deydz,
deydx,
d2eydzdx,
ez,
dezdx,
dezdy,
d2ezdxdy,
cbx,
dcbxdx,
cby,
dcbydy,
cbz,
dcbzdz,
};

enum Dim {
x = 0,
y = 1,
z = 2,
};

__device__ inline void weight_current_to_accumulator_kernel(
double* cell_acc,
const double* q,
Expand Down Expand Up @@ -94,6 +67,33 @@ __device__ inline void weight_current_to_accumulator_kernel(
cell_acc[CellAcc::jfz + 3] += v3;
}

enum Dim {
x = 0,
y = 1,
z = 2,
};

enum CellInterp {
ex = 0,
dexdy,
dexdz,
d2exdydz,
ey,
deydz,
deydx,
d2eydzdx,
ez,
dezdx,
dezdy,
d2ezdxdy,
cbx,
dcbxdx,
cby,
dcbydy,
cbz,
dcbzdz,
};

__device__ inline void move_deposit_kernel(
char& opp_move_status_flag, const bool opp_move_hop_iter_one_flag, // Added by code-gen
const OPP_INT* opp_c2c, OPP_INT* opp_p2c, // Added by code-gen
Expand Down Expand Up @@ -481,12 +481,17 @@ void opp_particle_move__move_deposit_kernel(opp_set set, opp_map c2c_map, opp_ma

int num_blocks = 200;

opp_init_particle_move(set, nargs, args);

const int array_count = opp_params->get<OPP_INT>("gpu_reduction_arrays");
if (!opp_use_segmented_reductions) {
opp_create_thread_level_data<OPP_REAL>(args[5]);
}

opp_init_particle_move(set, nargs, args);

opp_profiler->start("Mv_AllMv0");
// ----------------------------------------------------------------------------
// Multi-hop move particles within current MPI rank and if not mark for neighbour comm

opp_mem::dev_copy_to_symbol<OPP_INT>(OPP_comm_iteration_d, &OPP_comm_iteration, 1);
num_blocks = (OPP_iter_end - OPP_iter_start - 1) / block_size + 1;
Expand All @@ -498,9 +503,6 @@ void opp_particle_move__move_deposit_kernel(opp_set set, opp_map c2c_map, opp_ma
opp_mem::dev_copy_to_symbol<OPP_INT>(opp_k2_dat4_stride_d, &opp_k2_dat4_stride, &(args[4].dat->set->set_capacity), 1);
opp_mem::dev_copy_to_symbol<OPP_INT>(opp_k2_dat5_stride_d, &opp_k2_dat5_stride, &(args[5].dat->set->set_capacity), 1);

opp_profiler->start("Mv_AllMv0");
// ----------------------------------------------------------------------------
// Multi-hop move particles within current MPI rank and if not mark for neighbour comm
if (!opp_use_segmented_reductions) // Do atomics ----------
{
opp_profiler->start("move_kernel_only");
Expand All @@ -524,7 +526,6 @@ void opp_particle_move__move_deposit_kernel(opp_set set, opp_map c2c_map, opp_ma
OPP_DEVICE_SYNCHRONIZE();
opp_profiler->end("move_kernel_only");
}

else // Do segmented reductions ----------
{
opp_mem::dev_copy_to_symbol<OPP_INT>(opp_k2_sr_set_stride_d, &opp_k2_sr_set_stride, &set->size, 1);
Expand Down Expand Up @@ -575,17 +576,18 @@ void opp_particle_move__move_deposit_kernel(opp_set set, opp_map c2c_map, opp_ma
// then iterate over the newly added particles
while (opp_finalize_particle_move(set)) {

opp_init_particle_move(set, nargs, args);

opp_mem::dev_copy_to_symbol<OPP_INT>(OPP_comm_iteration_d, &OPP_comm_iteration, 1);
num_blocks = (OPP_iter_end - OPP_iter_start - 1) / block_size + 1;

opp_mem::dev_copy_to_symbol<OPP_INT>(opp_k2_dat0_stride_d, &opp_k2_dat0_stride, &(args[0].dat->set->set_capacity), 1);
opp_mem::dev_copy_to_symbol<OPP_INT>(opp_k2_dat1_stride_d, &opp_k2_dat1_stride, &(args[1].dat->set->set_capacity), 1);
opp_mem::dev_copy_to_symbol<OPP_INT>(opp_k2_dat2_stride_d, &opp_k2_dat2_stride, &(args[2].dat->set->set_capacity), 1);
opp_mem::dev_copy_to_symbol<OPP_INT>(opp_k2_dat3_stride_d, &opp_k2_dat3_stride, &(args[3].dat->set->set_capacity), 1);
opp_mem::dev_copy_to_symbol<OPP_INT>(opp_k2_dat4_stride_d, &opp_k2_dat4_stride, &(args[4].dat->set->set_capacity), 1);
opp_mem::dev_copy_to_symbol<OPP_INT>(opp_k2_dat5_stride_d, &opp_k2_dat5_stride, &(args[5].dat->set->set_capacity), 1);

opp_init_particle_move(set, nargs, args);
opp_mem::dev_copy_to_symbol<OPP_INT>(OPP_comm_iteration_d, &OPP_comm_iteration, 1);

num_blocks = (OPP_iter_end - OPP_iter_start - 1) / block_size + 1;
if (!opp_use_segmented_reductions) // Do atomics ----------
{
opp_profiler->start("move_kernel_only");
Expand All @@ -609,7 +611,6 @@ void opp_particle_move__move_deposit_kernel(opp_set set, opp_map c2c_map, opp_ma
OPP_DEVICE_SYNCHRONIZE();
opp_profiler->end("move_kernel_only");
}

else // Do segmented reductions ----------
{
// opp_mem::dev_copy_to_symbol<OPP_INT>(opp_k2_sr_set_stride_d, &opp_k2_sr_set_stride, &set->size, 1);
Expand Down Expand Up @@ -656,6 +657,7 @@ void opp_particle_move__move_deposit_kernel(opp_set set, opp_map c2c_map, opp_ma

if (!opp_use_segmented_reductions) {
opp_reduce_thread_level_data<OPP_REAL>(args[5]);

}
else {
opp_sr::clear_arrays<OPP_REAL>(sr_dat5_keys_dv, sr_dat5_values_dv, sr_dat5_keys_dv2, sr_dat5_values_dv2);
Expand Down
15 changes: 15 additions & 0 deletions app_cabanapic_cg/hip/opp_kernels.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,14 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include "opp_hip.h"
#include "device_kernels/hip_inline_kernels.h"

OPP_REAL CONST_dt[1];
OPP_REAL CONST_qsp[1];
OPP_REAL CONST_cdt_d[3];
OPP_REAL CONST_p[3];
OPP_REAL CONST_qdt_2mc[1];
OPP_REAL CONST_dt_eps0[1];
OPP_REAL CONST_acc_coef[3];

__constant__ OPP_REAL CONST_dt_d[1];
__constant__ OPP_REAL CONST_qsp_d[1];
__constant__ OPP_REAL CONST_cdt_d_d[3];
Expand All @@ -52,30 +60,37 @@ void opp_decl_const_impl(int dim, int size, char* data, const char* name) {

if (!strcmp(name, "CONST_dt")) {
cutilSafeCall(hipMemcpyToSymbol(HIP_SYMBOL(CONST_dt_d), data, dim * size));
std::memcpy(&CONST_dt, data, (size*dim));
return;
}
if (!strcmp(name, "CONST_qsp")) {
cutilSafeCall(hipMemcpyToSymbol(HIP_SYMBOL(CONST_qsp_d), data, dim * size));
std::memcpy(&CONST_qsp, data, (size*dim));
return;
}
if (!strcmp(name, "CONST_cdt_d")) {
cutilSafeCall(hipMemcpyToSymbol(HIP_SYMBOL(CONST_cdt_d_d), data, dim * size));
std::memcpy(&CONST_cdt_d, data, (size*dim));
return;
}
if (!strcmp(name, "CONST_p")) {
cutilSafeCall(hipMemcpyToSymbol(HIP_SYMBOL(CONST_p_d), data, dim * size));
std::memcpy(&CONST_p, data, (size*dim));
return;
}
if (!strcmp(name, "CONST_qdt_2mc")) {
cutilSafeCall(hipMemcpyToSymbol(HIP_SYMBOL(CONST_qdt_2mc_d), data, dim * size));
std::memcpy(&CONST_qdt_2mc, data, (size*dim));
return;
}
if (!strcmp(name, "CONST_dt_eps0")) {
cutilSafeCall(hipMemcpyToSymbol(HIP_SYMBOL(CONST_dt_eps0_d), data, dim * size));
std::memcpy(&CONST_dt_eps0, data, (size*dim));
return;
}
if (!strcmp(name, "CONST_acc_coef")) {
cutilSafeCall(hipMemcpyToSymbol(HIP_SYMBOL(CONST_acc_coef_d), data, dim * size));
std::memcpy(&CONST_acc_coef, data, (size*dim));
return;
}

Expand Down
20 changes: 11 additions & 9 deletions app_cabanapic_cg/hip/update_ghosts_kernel_loop.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -118,9 +118,9 @@ __global__ void opp_dev_sr_update_ghosts_kernel( // Used for Segmented Reduction
);

for (int d = 0; d < 3; ++d) {
sr_dat1_values[n + opp_k6_sr_set_stride_d * d] = arg2_0_local[d];
sr_dat1_values[1 * n + 0 + opp_k6_sr_set_stride_d * d] = arg2_0_local[d];
}
sr_dat1_keys[n] = map0[opp_k6_map0_stride_d * 0 + n];
sr_dat1_keys[1 * n + 0] = map0[opp_k6_map0_stride_d * 0 + n];
}

}
Expand Down Expand Up @@ -212,19 +212,21 @@ void opp_par_loop_all__update_ghosts_kernel(opp_set set,

else // Do segmented reductions ----------
{
opp_mem::dev_copy_to_symbol<OPP_INT>(opp_k6_sr_set_stride_d, &opp_k6_sr_set_stride, &(iter_size), 1);

size_t operating_size_dat1 = 0, resize_size_dat1 = 0;

operating_size_dat1 += (size_t)(args[2].dat->dim);
resize_size_dat1 += (size_t)(args[2].dat->dim);

operating_size_dat1 *= (size_t)(iter_size); // this might need to be + set->exec_size ..., or simply iter_size | even for opp_k6_sr_set_stride
operating_size_dat1 *= (size_t)(iter_size);
resize_size_dat1 *= (size_t)(set->set_capacity);

opp_sr::init_arrays<OPP_REAL>(args[1].dat->dim, operating_size_dat1, resize_size_dat1,
sr_dat1_keys_dv, sr_dat1_values_dv, sr_dat1_keys_dv2, sr_dat1_values_dv2);
int k6_stride = 0;
k6_stride += iter_size * 1;
opp_mem::dev_copy_to_symbol<OPP_INT>(opp_k6_sr_set_stride_d, &opp_k6_sr_set_stride, &(k6_stride), 1);

opp_sr::init_arrays<OPP_REAL>(args[2].dat->dim, operating_size_dat1, resize_size_dat1,
sr_dat1_keys_dv, sr_dat1_values_dv, sr_dat1_keys_dv2, sr_dat1_values_dv2);

// Create key/value pairs
opp_profiler->start("SR_CrKeyVal");
opp_dev_sr_update_ghosts_kernel<<<num_blocks, block_size>>>(
Expand All @@ -241,8 +243,8 @@ void opp_par_loop_all__update_ghosts_kernel(opp_set set,
OPP_DEVICE_SYNCHRONIZE();
opp_profiler->end("SR_CrKeyVal");

opp_sr::do_segmented_reductions<OPP_REAL>(args[2], iter_size,
sr_dat1_keys_dv, sr_dat1_values_dv, sr_dat1_keys_dv2, sr_dat1_values_dv2);
opp_sr::do_segmented_reductions<OPP_REAL>(args[2], k6_stride,
sr_dat1_keys_dv, sr_dat1_values_dv, sr_dat1_keys_dv2, sr_dat1_values_dv2);
}
}
args[3].data = (char *)arg3_host_data;
Expand Down
14 changes: 8 additions & 6 deletions app_fempic/configs/coarse.param
Original file line number Diff line number Diff line change
Expand Up @@ -17,28 +17,30 @@ STRING global_mesh = /home/zl/phd/box_mesh_gen/48000/mesh.dat
STRING inlet_mesh = /home/zl/phd/box_mesh_gen/48000/inlet.dat
STRING wall_mesh = /home/zl/phd/box_mesh_gen/48000/wall.dat
# OR
STRING hdf_filename = /home/zl/phd/neso_test/OP-PIC_Configs/fempic/mesh_files/box_48000.hdf5
STRING hdf_filename = /home/zl/phd/box_mesh_gen/hdf5/box_48000.hdf5

STRING rand_file = /home/zl/phd/box_mesh_gen/random_100k.dat

BOOL invert_normals = false

BOOL opp_auto_sort = false
REAL opp_allocation_multiple = 11
INT opp_threads_per_block = 256
INT opp_threads_per_block = 512

BOOL opp_profile_all = false
INT opp_gpu_direct = 0

REAL grid_spacing = 25e-6
BOOL print_final = true

BOOL opp_global_move = false
REAL grid_spacing = 25e-6
BOOL opp_dh_data_generate = false
# Use opp_dh_data_generate = true and OPP_DH_DATA_DUMP=1 in args to dump the structured mesh

# block k-means mpi-block
STRING cluster = block
BOOL use_reg_red = false
BOOL opp_segmented_red = false
INT gpu_reduction_arrays = 16

# HoleFill_All Sort_All Shuffle_All Sort_Periodic Shuffle_Periodic
STRING opp_fill = Shuffle_Periodic
INT opp_fill_period = 20
INT opp_fill_period = 4
6 changes: 3 additions & 3 deletions app_fempic_cg/configs/coarse.param
Original file line number Diff line number Diff line change
Expand Up @@ -17,15 +17,15 @@ STRING global_mesh = /home/zl/phd/box_mesh_gen/48000/mesh.dat
STRING inlet_mesh = /home/zl/phd/box_mesh_gen/48000/inlet.dat
STRING wall_mesh = /home/zl/phd/box_mesh_gen/48000/wall.dat
# OR
STRING hdf_filename = /home/zl/phd/neso_test/OP-PIC_Configs/fempic/mesh_files/box_48000.hdf5
STRING hdf_filename = /home/zl/phd/box_mesh_gen/hdf5/box_96000.hdf5

STRING rand_file = /home/zl/phd/box_mesh_gen/random_100k.dat

BOOL invert_normals = false

BOOL opp_auto_sort = false
REAL opp_allocation_multiple = 11
INT opp_threads_per_block = 256
INT opp_threads_per_block = 512

BOOL opp_profile_all = false
INT opp_gpu_direct = 0
Expand All @@ -38,7 +38,7 @@ BOOL opp_dh_data_generate = false

# block k-means mpi-block
STRING cluster = block
BOOL use_reg_red = false
BOOL opp_segmented_red = false
INT gpu_reduction_arrays = 16

# HoleFill_All Sort_All Shuffle_All Sort_Periodic Shuffle_Periodic
Expand Down
2 changes: 1 addition & 1 deletion app_fempic_cg/fempic_opp.cpp
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@

// Auto-generated at 2025-01-21 14:55:58.979812 by opp-translator
// Auto-generated at 2025-02-03 16:29:35.442468 by opp-translator
/*
BSD 3-Clause License
Expand Down
Loading

0 comments on commit 3eb8519

Please sign in to comment.