bug_fix for dh mpi mesh creation
ZamanLantra committed Nov 13, 2024
1 parent: 85f16c1, commit: ea05811
Showing 14 changed files with 147 additions and 55 deletions.
29 changes: 20 additions & 9 deletions app_fempic_cg/Makefile
@@ -49,12 +49,23 @@ ifeq ($(DEBUG_LOG), 1)
SYCLFLAGS += -DDEBUG_LOG
endif

HOPS ?= 1
ifeq ($(HOPS), 1)
CPPFLAGS += -DLOG_HOPS
NVCCFLAGS += -DLOG_HOPS
HIPCCFLAGS += -DLOG_HOPS
SYCLFLAGS += -DLOG_HOPS
endif

PETSC_INC = -I$(PETSC_INSTALL_PATH)/include
PETSC_LIB = -Wl,-rpath,$(PETSC_INSTALL_PATH)/lib -L$(PETSC_INSTALL_PATH)/lib -lpetsc

CUDA_INC = -I$(CUDA_INSTALL_PATH)/include
CUDA_LIB = -L$(CUDA_INSTALL_PATH)/lib64 -lcudart -lcuda

# CUDA_INC = -I$(CUDA_INSTALL_PATH)/include -I$(MPI_INSTALL_PATH)/include
# CUDA_LIB = -L$(CUDA_INSTALL_PATH)/lib64 -lcudart -lcuda -I$(MPI_INSTALL_PATH)/lib -lmpi

HIP_INC = -I$(ROCM_INSTALL_DIR)/include -I$(ROCM_THRUST_DIR)/include -I$(ROCM_PRIM_DIR)/include
HIP_LIB = -L$(ROCM_INSTALL_DIR)/lib -lamdhip64
# for archer2 gpus
@@ -94,7 +105,7 @@ seq: mklib fempic_opp+seq opp_kernels+seq field_solver+seq
$(OBJ)/fempic_opp_seq.o \
$(OBJ)/opp_kernels_seq.o \
$(OBJ)/field_solver_cpu_seq.o \
$(ALL_LIBS) -lopp_seq
$(ALL_LIBS) -lopp_seq -lz

# ----------------------------------------------------------------------------------------------------------------------
opp_kernels+omp:
@@ -109,7 +120,7 @@ omp: mklib opp_kernels+omp fempic_opp+omp field_solver+omp
$(OBJ)/fempic_opp_omp.o \
$(OBJ)/opp_kernels_omp.o \
$(OBJ)/field_solver_cpu_omp.o \
$(ALL_LIBS) -lopp_omp
$(ALL_LIBS) -lopp_omp -lz

# ----------------------------------------------------------------------------------------------------------------------
fempic_opp+omp_mpi:
@@ -126,15 +137,15 @@ omp_mpi: mklib opp_kernels+omp_mpi field_solver+omp_mpi fempic_opp+omp_mpi
$(OBJ)/fempic_opp_omp_mpi.o \
$(OBJ)/opp_kernels_omp_mpi.o \
$(OBJ)/field_solver_cpu_omp_mpi.o \
$(ALL_LIBS) -lopp_omp_mpi
$(ALL_LIBS) -lopp_omp_mpi -lz

omp_mpi_hdf5: mklib opp_kernels+omp_mpi field_solver+omp_mpi fempic_opp+omp_mpi opp_mpi_hdf5+omp_mpi
$(MPICPP) $(CPPFLAGS) -fopenmp -o $(BIN)/omp_mpi_hdf5 \
$(OBJ)/fempic_opp_omp_mpi.o \
$(OBJ)/opp_kernels_omp_mpi.o \
$(OBJ)/field_solver_cpu_omp_mpi.o \
$(OBJ)/opp_mpi_hdf5_omp_mpi.o \
$(ALL_LIBS) -lopp_omp_mpi -lhdf5
$(ALL_LIBS) -lopp_omp_mpi -lhdf5 -lz

# ----------------------------------------------------------------------------------------------------------------------
fempic_opp+mpi:
@@ -151,15 +162,15 @@ mpi: mklib fempic_opp+mpi opp_kernels+mpi field_solver+mpi
$(OBJ)/fempic_opp_mpi.o \
$(OBJ)/opp_kernels_mpi.o \
$(OBJ)/field_solver_cpu_mpi.o \
$(ALL_LIBS) -lopp_mpi
$(ALL_LIBS) -lopp_mpi -lz

mpi_hdf5: mklib fempic_opp+mpi opp_kernels+mpi field_solver+mpi opp_mpi_hdf5+mpi
$(MPICPP) $(CPPFLAGS) -DUSE_MPI -o $(BIN)/mpi_hdf5 \
$(OBJ)/fempic_opp_mpi.o \
$(OBJ)/opp_mpi_hdf5_mpi_hdf5.o \
$(OBJ)/opp_kernels_mpi.o \
$(OBJ)/field_solver_cpu_mpi.o \
$(ALL_LIBS) -lopp_mpi -lhdf5
$(ALL_LIBS) -lopp_mpi -lhdf5 -lz

# ----------------------------------------------------------------------------------------------------------------------
opp_kernels+cuda:
@@ -174,7 +185,7 @@ cuda: mklib opp_kernels+cuda fempic_opp+cuda field_solver+cuda
$(OBJ)/fempic_opp_cuda.o \
$(OBJ)/opp_kernels_cuda.o \
$(OBJ)/field_solver_cuda.o \
$(ALL_LIBS) $(CUDA_LIB) -lopp_cuda
$(ALL_LIBS) $(CUDA_LIB) -lopp_cuda -lz

# ----------------------------------------------------------------------------------------------------------------------
opp_kernels+cuda_mpi:
@@ -191,15 +202,15 @@ cuda_mpi: mklib opp_kernels+cuda_mpi fempic_opp+cuda_mpi field_solver+cuda_mpi
$(OBJ)/fempic_opp_cuda_mpi.o \
$(OBJ)/opp_kernels_cuda_mpi.o \
$(OBJ)/field_solver_cuda_mpi.o \
$(ALL_LIBS) $(CUDA_LIB) -lopp_cuda_mpi
$(ALL_LIBS) $(CUDA_LIB) -lopp_cuda_mpi -lz

cuda_mpi_hdf5: mklib opp_kernels+cuda_mpi fempic_opp+cuda_mpi field_solver+cuda_mpi opp_mpi_hdf5+cuda_mpi
$(MPICPP) $(CPPFLAGS) -DUSE_MPI -o $(BIN)/cuda_mpi_hdf5 \
$(OBJ)/field_solver_cuda_mpi_hdf5.o \
$(OBJ)/fempic_hdf5_opp_cuda_mpi_hdf5.o \
$(OBJ)/opp_mpi_hdf5_cuda_mpi_hdf5.o \
$(OBJ)/opp_kernels_cuda_mpi_hdf5.o \
$(ALL_LIBS) $(CUDA_LIB) -lopp_cuda_mpi -lhdf5
$(ALL_LIBS) $(CUDA_LIB) -lopp_cuda_mpi -lhdf5 -lz

# ----------------------------------------------------------------------------------------------------------------------
fempic_opp+hip:
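The new HOPS option only adds the LOG_HOPS define to every backend's flags; the hop accounting itself lives in the application code. As a minimal sketch of the pattern this flag enables (the helper below and its call site are assumptions, not part of this commit), using stand-ins for the counters that the fempic_misc.h changes below reduce:

#include <algorithm>

// Stand-ins for the runtime's counters; the real definitions live in the
// OP-PIC library, not here.
int OPP_move_max_hops = 0;
int OPP_move_moreX_hops = 0;
#define X_HOPS 5 // mirrors the constant this commit adds to opp_defs.h

// Hypothetical per-particle hook: record how many cell-to-cell hops one
// particle needed during the move phase. Compiles to nothing unless
// LOG_HOPS is defined.
inline void opp_record_hops(int hops) {
#ifdef LOG_HOPS
    OPP_move_max_hops = std::max(OPP_move_max_hops, hops);
    if (hops > X_HOPS)
        ++OPP_move_moreX_hops;
#endif
}

Note that HOPS ?= 1 makes the logging opt-out rather than opt-in: build with HOPS=0 to strip it.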
9 changes: 5 additions & 4 deletions app_fempic_cg/configs/coarse.param
@@ -24,21 +24,22 @@ STRING rand_file = /home/zl/phd/box_mesh_gen/random_100k.dat
BOOL invert_normals = false

BOOL opp_auto_sort = false
REAL opp_allocation_multiple = 11
REAL opp_allocation_multiple = 13
INT opp_threads_per_block = 256

BOOL opp_profile_all = false
INT opp_gpu_direct = 0

REAL grid_spacing = 25e-6
BOOL print_final = true

BOOL opp_global_move = false
REAL grid_spacing = 25e-6
BOOL opp_dh_data_generate = false
# Use opp_dh_data_generate = true and OPP_DH_DATA_DUMP=1 in args to dump the structured mesh

# block k-means mpi-block
STRING cluster = block
BOOL use_reg_red = false

# HoleFill_All Sort_All Shuffle_All Sort_Periodic Shuffle_Periodic
STRING opp_fill = Shuffle_Periodic
INT opp_fill_period = 20
INT opp_fill_period = 4
8 changes: 4 additions & 4 deletions app_fempic_cg/fempic_misc.h
@@ -226,7 +226,7 @@ inline std::string get_global_level_log(double max_c_ef, double max_n_potential,
int64_t global_part_size = 0, global_inj_size = 0, global_removed = 0;
int64_t glb_parts, gbl_max_parts, gbl_min_parts;
int64_t glb_part_comms, gbl_max_part_comms, gbl_min_part_comms;
int64_t glb_max_hops, gbl_max_max_hops, gbl_min_max_hops;
int64_t glb_sum_max_hops, gbl_max_max_hops, gbl_min_max_hops;
int global_max_comm_iteration = 0, gbl_move_moreX_hops = 0;

#ifdef USE_MPI
@@ -252,7 +252,7 @@ inline std::string get_global_level_log(double max_c_ef, double max_n_potential,

get_global_values(local_part_count, glb_parts, gbl_max_parts, gbl_min_parts);
get_global_values(OPP_part_comm_count_per_iter, glb_part_comms, gbl_max_part_comms, gbl_min_part_comms);
get_global_values(OPP_move_max_hops, glb_max_hops, gbl_max_max_hops, gbl_min_max_hops);
get_global_values(OPP_move_max_hops, glb_sum_max_hops, gbl_max_max_hops, gbl_min_max_hops);

log += std::string("\t np: ") + str(global_part_size, "%" PRId64);
log += std::string(" (") + str(global_inj_size, "%" PRId64);
@@ -269,8 +269,8 @@ inline std::string get_global_level_log(double max_c_ef, double max_n_potential,
log += std::string(" Max: ") + str(gbl_max_part_comms, "%" PRId64);
#ifdef LOG_HOPS
log += std::string(" | Hops: Min: ") + str(gbl_min_max_hops, "%" PRId64);
log += std::string(" Max: ") + str(glb_max_hops, "%" PRId64);
log += std::string(" | moreX_hops: ") + str(gbl_move_moreX_hops, "%d");
log += std::string(" Max: ") + str(gbl_max_max_hops, "%" PRId64);
log += std::string(" | more") + std::to_string(X_HOPS) + "_hops: " + str(gbl_move_moreX_hops, "%d");
#endif

return log;
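The rename from glb_max_hops to glb_sum_max_hops matches what get_global_values() appears to compute: the sum, maximum and minimum of a per-rank value. That helper is not shown in this diff; a hedged sketch of its likely MPI shape (the real one may use Allreduce or differ in signature):

#include <mpi.h>
#include <cstdint>

// Sketch only: reduce one per-rank value to its global sum, max and min
// on the root rank.
inline void get_global_values(int64_t local, int64_t& glb_sum,
                              int64_t& glb_max, int64_t& glb_min) {
    MPI_Reduce(&local, &glb_sum, 1, MPI_INT64_T, MPI_SUM, 0, MPI_COMM_WORLD);
    MPI_Reduce(&local, &glb_max, 1, MPI_INT64_T, MPI_MAX, 0, MPI_COMM_WORLD);
    MPI_Reduce(&local, &glb_min, 1, MPI_INT64_T, MPI_MIN, 0, MPI_COMM_WORLD);
}

Under this reading, the first output is a sum of per-rank maxima, so the old glb_max_hops name was misleading: the true global maximum is gbl_max_max_hops, which the rewritten "Max:" log line now prints instead of the sum.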
20 changes: 10 additions & 10 deletions app_neso_advection_cg/Makefile
@@ -68,7 +68,7 @@ seq: mklib advec_opp+seq opp_kernels+seq
$(CPP) $(CPPFLAGS) -o $(BIN)/seq \
$(OBJ)/advec_opp_seq.o \
$(OBJ)/opp_kernels_seq.o \
$(ALL_INC) $(ALL_LIBS) -lopp_seq
$(ALL_INC) $(ALL_LIBS) -lopp_seq -lz

# ------------------------------------------------------------------------------------------
advec_opp+mpi:
@@ -79,7 +79,7 @@ mpi: mklib advec_opp+mpi opp_kernels+mpi
$(MPICPP) $(CPPFLAGS) -DUSE_MPI -o $(BIN)/mpi \
$(OBJ)/advec_opp_mpi.o \
$(OBJ)/opp_kernels_mpi.o \
$(ALL_INC) $(ALL_LIBS) -lopp_mpi
$(ALL_INC) $(ALL_LIBS) -lopp_mpi -lz

# ------------------------------------------------------------------------------------------
advec_opp+omp:
@@ -90,7 +90,7 @@ omp: mklib advec_opp+omp opp_kernels+omp
$(CPP) $(CPPFLAGS) -fopenmp -o $(BIN)/omp \
$(OBJ)/advec_opp_omp.o \
$(OBJ)/opp_kernels_omp.o \
$(ALL_INC) $(ALL_LIBS) -lopp_omp
$(ALL_INC) $(ALL_LIBS) -lopp_omp -lz

# ------------------------------------------------------------------------------------------
advec_opp+omp_mpi:
@@ -101,7 +101,7 @@ omp_mpi: mklib advec_opp+omp_mpi opp_kernels+omp_mpi
$(MPICPP) $(CPPFLAGS) -fopenmp -o $(BIN)/omp_mpi \
$(OBJ)/advec_opp_omp_mpi.o \
$(OBJ)/opp_kernels_omp_mpi.o \
$(ALL_INC) $(ALL_LIBS) -lopp_omp_mpi
$(ALL_INC) $(ALL_LIBS) -lopp_omp_mpi -lz

# ------------------------------------------------------------------------------------------
advec_opp+cuda:
@@ -112,7 +112,7 @@ cuda: mklib advec_opp+cuda opp_kernels+cuda
$(CPP) $(CPPFLAGS) -o $(BIN)/cuda \
$(OBJ)/advec_opp_cuda.o \
$(OBJ)/opp_kernels_cuda.o \
$(ALL_INC) $(CUDA_INC) $(ALL_LIBS) $(CUDA_LIB) -lopp_cuda
$(ALL_INC) $(CUDA_INC) $(ALL_LIBS) $(CUDA_LIB) -lopp_cuda -lz

# ------------------------------------------------------------------------------------------
advec_opp+cuda_mpi:
@@ -123,7 +123,7 @@ cuda_mpi: mklib advec_opp+cuda_mpi opp_kernels+cuda_mpi
$(MPICPP) $(CPPFLAGS) -DUSE_MPI -o $(BIN)/cuda_mpi \
$(OBJ)/advec_opp_cuda_mpi.o \
$(OBJ)/opp_kernels_cuda_mpi.o \
$(ALL_INC) $(CUDA_INC) $(ALL_LIBS) $(CUDA_LIB) -lopp_cuda_mpi
$(ALL_INC) $(CUDA_INC) $(ALL_LIBS) $(CUDA_LIB) -lopp_cuda_mpi -lz

# ------------------------------------------------------------------------------------------
advec_opp+sycl:
@@ -134,7 +134,7 @@ sycl: mklib advec_opp+sycl opp_kernels+sycl
$(SYCLCC) $(SYCLFLAGS) -o $(BIN)/sycl \
$(OBJ)/advec_opp_sycl.o \
$(OBJ)/opp_kernels_sycl.o \
$(ALL_INC) $(ALL_LIBS) -lopp_sycl
$(ALL_INC) $(ALL_LIBS) -lopp_sycl -lz

# ------------------------------------------------------------------------------------------
advec_opp+sycl_mpi:
@@ -145,7 +145,7 @@ sycl_mpi: mklib advec_opp+sycl_mpi opp_kernels+sycl_mpi
$(MPICPP) $(SYCLFLAGS) -o $(BIN)/sycl_mpi \
$(OBJ)/advec_opp_sycl_mpi.o \
$(OBJ)/opp_kernels_sycl_mpi.o \
$(ALL_INC) $(ALL_LIBS) -lopp_sycl_mpi
$(ALL_INC) $(ALL_LIBS) -lopp_sycl_mpi -lz

# ------------------------------------------------------------------------------------------
advec_opp+hip:
@@ -156,7 +156,7 @@ hip: mklib advec_opp+hip opp_kernels+hip
$(HIPCC) $(HIPCCFLAGS) -o $(BIN)/hip \
$(OBJ)/advec_opp_hip.o \
$(OBJ)/opp_kernels_hip.o \
$(ALL_INC) $(HIP_INC) $(ALL_LIBS) -lopp_hip
$(ALL_INC) $(HIP_INC) $(ALL_LIBS) -lopp_hip -lz

# ------------------------------------------------------------------------------------------
advec_opp+hip_mpi:
@@ -167,7 +167,7 @@ hip_mpi: mklib advec_opp+hip_mpi opp_kernels+hip_mpi
$(HIPCC) $(HIPCCFLAGS) -o $(BIN)/hip_mpi \
$(OBJ)/advec_opp_hip_mpi.o \
$(OBJ)/opp_kernels_hip_mpi.o \
$(ALL_INC) $(HIP_INC) $(ALL_LIBS) -lopp_hip_mpi -lmpi
$(ALL_INC) $(HIP_INC) $(ALL_LIBS) -lopp_hip_mpi -lmpi -lz

# ------------------------------------------------------------------------------------------
clean:
3 changes: 3 additions & 0 deletions app_neso_advection_cg/configs/advec.param
@@ -13,8 +13,11 @@ REAL opp_allocation_multiple = 1.1
INT opp_threads_per_block = 256
BOOL opp_profile_all = false
INT opp_gpu_direct = 0

BOOL opp_global_move = false
REAL grid_spacing = 0.5
BOOL opp_dh_data_generate = false
# Use opp_dh_data_generate = true and OPP_DH_DATA_DUMP=1 in args to dump the structured mesh

# HoleFill_All Sort_All Shuffle_All Sort_Periodic Shuffle_Periodic
STRING opp_fill = HoleFill_All
2 changes: 2 additions & 0 deletions app_neso_advection_cg/cuda/move_kernel_loop.hpp
@@ -246,6 +246,7 @@ void opp_particle_move__move_kernel(opp_set set, opp_map c2c_map, opp_map p2c_ma
opp_profiler->end("move_kernel_only");
opp_profiler->end("Mv_AllMv0");

#ifdef USE_MPI
// ----------------------------------------------------------------------------
// finalize the global move routine and iterate over newly added particles and check whether they need neighbour comm
if (useGlobalMove && globalMover->finalize(set) > 0) {
@@ -284,6 +285,7 @@ void opp_particle_move__move_kernel(opp_set set, opp_map c2c_map, opp_map p2c_ma

opp_profiler->end("GblMv_AllMv");
}
#endif

// ----------------------------------------------------------------------------
// Do neighbour communication and if at least one particle is received by the current rank,
11 changes: 8 additions & 3 deletions app_neso_advection_cg/omp/move_kernel_loop.hpp
@@ -229,7 +229,7 @@ void opp_particle_move__move_kernel(opp_set set, opp_map c2c_map, opp_map p2c_ma
opp_profiler->end("move_kernel");
}

void opp_init_direct_hop_cg(double grid_spacing, int dim, const opp_dat c_gbl_id, const opp::BoundingBox& b_box,
void opp_init_direct_hop_cg(double grid_spacing, const opp_dat c_gbl_id, const opp::BoundingBox& b_box,
opp_map c2c_map, opp_map p2c_map,
opp_arg arg0, // p_pos | OPP_READ
opp_arg arg1 // c_pos_ll | OPP_READ
@@ -285,8 +285,13 @@ void opp_init_direct_hop_cg(double grid_spacing, int dim, const opp_dat c_gbl_id
}
};

cellMapper->generateStructuredMesh(c_gbl_id->set, c_gbl_id, all_cell_checker);

if (opp_params->get<OPP_BOOL>("opp_dh_data_generate")) {
cellMapper->generateStructuredMesh(c_gbl_id->set, c_gbl_id, all_cell_checker);
}
else {
cellMapper->generateStructuredMeshFromFile(c_gbl_id->set, c_gbl_id);
}

opp_profiler->reg("GlbToLocal");
opp_profiler->reg("GblMv_Move");
opp_profiler->reg("GblMv_AllMv");
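The new else branch calls generateStructuredMeshFromFile(), so the structured direct-hop mesh can apparently be loaded from a previously dumped file (see the opp_dh_data_generate comments in the .param files above) instead of being rebuilt on every run. That would also explain the -lz added to every link line, if the dump is zlib-compressed. The reader below is a sketch under those assumptions; the function name, file layout and dump format are not confirmed by this diff:

#include <zlib.h>
#include <cstdint>
#include <vector>

// Hypothetical zlib reader for a dumped structured mesh: a count followed
// by that many global cell indices.
inline bool read_dh_mesh_dump(const char* path, std::vector<int>& cell_indices) {
    gzFile f = gzopen(path, "rb");
    if (!f) return false;
    uint64_t count = 0;
    if (gzread(f, &count, sizeof(count)) != (int)sizeof(count)) {
        gzclose(f);
        return false;
    }
    cell_indices.resize(count);
    const int bytes = (int)(count * sizeof(int));
    const bool ok = (gzread(f, cell_indices.data(), bytes) == bytes);
    gzclose(f);
    return ok;
}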
18 changes: 14 additions & 4 deletions opp_lib/include/device_kernels/cuda_inline_kernels.h
@@ -44,6 +44,9 @@ OPP_INT cellMapper_pos_stride = -1;
__constant__ OPP_INT OPP_rank_d;
__constant__ OPP_INT OPP_comm_size_d;

__constant__ size_t opp_maxSavedDHGrid_d[3];
__constant__ size_t opp_minSavedDHGrid_d[3];

//*******************************************************************************
// Returns true only if another hop is required by the current rank
__inline__ __device__ bool opp_part_check_status_device(char& move_flag, bool& iter_one_flag,
@@ -205,10 +208,13 @@ __inline__ __device__ size_t opp_dev_findStructuredCellIndex2D(const OPP_REAL* p
const size_t yIndex = (size_t)((pos[1 * cellMapper_pos_stride_d] - minGlbCoordinate[1]) *
oneOverGridSpace[0]);

const bool isOutOfCuboid = ((xIndex >= opp_maxSavedDHGrid_d[0] || xIndex < opp_minSavedDHGrid_d[0]) ||
(yIndex >= opp_maxSavedDHGrid_d[1] || yIndex < opp_minSavedDHGrid_d[1]));

// Calculate the cell index mapping index
const size_t index = xIndex + (yIndex * globalGridDims[0]);

return (index >= globalGridSize[0]) ? MAX_CELL_INDEX : index;
return (isOutOfCuboid) ? MAX_CELL_INDEX : index;
}

//*******************************************************************************
@@ -224,10 +230,14 @@ __inline__ __device__ size_t opp_dev_findStructuredCellIndex3D(const OPP_REAL* p
const size_t zIndex = (size_t)((pos[2 * cellMapper_pos_stride_d] - minGlbCoordinate[2]) *
oneOverGridSpace[0]);

const bool isOutOfCuboid = ((xIndex >= opp_maxSavedDHGrid_d[0] || xIndex < opp_minSavedDHGrid_d[0]) ||
(yIndex >= opp_maxSavedDHGrid_d[1] || yIndex < opp_minSavedDHGrid_d[1]) ||
(zIndex >= opp_maxSavedDHGrid_d[2] || zIndex < opp_minSavedDHGrid_d[2]));

// Calculate the cell index mapping index
const size_t index = xIndex + (yIndex * globalGridDims[0]) + (zIndex * globalGridDims[3]);

return (index >= globalGridSize[0]) ? MAX_CELL_INDEX : index;
return (isOutOfCuboid) ? MAX_CELL_INDEX : index;
}

//*******************************************************************************
@@ -358,8 +368,8 @@ __global__ void opp_dev_checkForGlobalMove3D_kernel(
OPP_INT* __restrict__ move_rank_indices,
OPP_INT* __restrict__ move_count,
const OPP_INT start,
const OPP_INT end)
{
const OPP_INT end
) {
const int thread_id = threadIdx.x + blockIdx.x * blockDim.x;
const int part_idx = thread_id + start;

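The 2D/3D index functions above now reject a particle whose per-axis grid position falls outside the saved sub-cuboid, rather than only checking the flattened index against the total grid size; a position can be inside the flattened range yet outside the stored box, which would previously map it to a wrong cell. The host must fill the new __constant__ bounds before any kernel launch; a sketch of that upload (the host-side names are assumptions, and the real upload site is not part of this diff):

#include <cuda_runtime.h>
#include <cstddef>

__constant__ size_t opp_maxSavedDHGrid_d[3];
__constant__ size_t opp_minSavedDHGrid_d[3];

// Sketch: copy the saved direct-hop grid bounds to the device constants.
inline void opp_upload_dh_grid_bounds(const size_t max_b[3], const size_t min_b[3]) {
    cudaMemcpyToSymbol(opp_maxSavedDHGrid_d, max_b, 3 * sizeof(size_t));
    cudaMemcpyToSymbol(opp_minSavedDHGrid_d, min_b, 3 * sizeof(size_t));
}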
2 changes: 2 additions & 0 deletions opp_lib/include/opp_defs.h
@@ -141,6 +141,8 @@ constexpr int MIN_INT = std::numeric_limits<int>::min();
#define MAX_CELL_INDEX INT_MAX
#define OPP_OUT_OF_SAVED_DOMAIN ULONG_MAX

#define X_HOPS 5

#define UNUSED(expr) do { (void)(expr); } while (0)

#define OPP_READ 0
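X_HOPS = 5 is what turns the literal "moreX_hops" label in fempic_misc.h into "more5_hops". A trivial illustration of the fragment that log line now builds:

#include <string>

#define X_HOPS 5 // as added to opp_defs.h above

// Illustration only: the hops fragment assembled in get_global_level_log().
inline std::string more_hops_label(int count) {
    return "more" + std::to_string(X_HOPS) + "_hops: " + std::to_string(count);
}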