diff --git a/app_fempic/Makefile b/app_fempic/Makefile index 1281ccd..598d25c 100644 --- a/app_fempic/Makefile +++ b/app_fempic/Makefile @@ -94,7 +94,7 @@ fempic_opp+seq: opp_kernels+seq: $(CPP) $(CPPFLAGS) -c seq/opp_kernels.cpp -o $(OBJ)/opp_kernels_seq.o $(ALL_INC) field_solver+seq: - $(CPP) $(CPPFLAGS) -c field_solver/cpu.cpp -o $(OBJ)/field_solver_cpu_seq.o $(ALL_INC) + $(CPP) $(CPPFLAGS) -c field_solver/field_solver.cpp -o $(OBJ)/field_solver_cpu_seq.o $(ALL_INC) seq: mklib fempic_opp+seq opp_kernels+seq field_solver+seq $(CPP) $(CPPFLAGS) -o $(BIN)/seq \ @@ -109,7 +109,7 @@ opp_kernels+omp: fempic_opp+omp: $(CPP) $(CPPFLAGS) -fopenmp -DUSE_OMP -c fempic_opp.cpp -o $(OBJ)/fempic_opp_omp.o $(ALL_INC) field_solver+omp: - $(CPP) $(CPPFLAGS) -fopenmp -DUSE_OMP -c field_solver/cpu.cpp -o $(OBJ)/field_solver_cpu_omp.o $(ALL_INC) + $(CPP) $(CPPFLAGS) -fopenmp -DUSE_OMP -c field_solver/field_solver.cpp -o $(OBJ)/field_solver_cpu_omp.o $(ALL_INC) omp: mklib opp_kernels+omp fempic_opp+omp field_solver+omp $(CPP) $(CPPFLAGS) -fopenmp -o $(BIN)/omp \ @@ -250,11 +250,11 @@ hip_mpi_hdf5: mklib fempic_opp+hip_mpi opp_kernels+hip_mpi field_solver+hip_mpi # ---------------------------------------------------------------------------------------------------------------------- fempic_opp+sycl: - $(SYCLCC) $(SYCLFLAGS) -DUSE_SYCL -c fempic_opp.cpp -o $(OBJ)/fempic_opp_sycl.o $(ALL_INC) + $(SYCLCC) $(SYCLFLAGS) -c fempic_opp.cpp -o $(OBJ)/fempic_opp_sycl.o $(ALL_INC) opp_kernels+sycl: - $(SYCLCC) $(SYCLFLAGS) -DUSE_SYCL -c sycl/opp_kernels.cpp -o $(OBJ)/opp_kernels_sycl.o $(ALL_INC) + $(SYCLCC) $(SYCLFLAGS) -c sycl/opp_kernels.cpp -o $(OBJ)/opp_kernels_sycl.o $(ALL_INC) field_solver+sycl: - $(SYCLCC) $(SYCLFLAGS) -DUSE_SYCL -c field_solver/field_solver.cpp -o $(OBJ)/field_solver_sycl.o $(ALL_INC) + $(SYCLCC) $(SYCLFLAGS) -c field_solver/field_solver.cpp -o $(OBJ)/field_solver_sycl.o $(ALL_INC) sycl: mklib fempic_opp+sycl opp_kernels+sycl field_solver+sycl $(SYCLCC) $(SYCLFLAGS) -o $(BIN)/sycl \ @@ -265,23 +265,23 @@ sycl: mklib fempic_opp+sycl opp_kernels+sycl field_solver+sycl # ---------------------------------------------------------------------------------------------------------------------- fempic_opp+sycl_mpi: - $(MPICPP) $(SYCLFLAGS) -DUSE_MPI -DUSE_SYCL -c fempic_opp.cpp -o $(OBJ)/fempic_opp_sycl_mpi.o $(ALL_INC) + $(MPICPP) $(SYCLFLAGS) -DUSE_MPI -c fempic_opp.cpp -o $(OBJ)/fempic_opp_sycl_mpi.o $(ALL_INC) opp_kernels+sycl_mpi: - $(MPICPP) $(SYCLFLAGS) -DUSE_MPI -DUSE_SYCL -c sycl/opp_kernels.cpp -o $(OBJ)/opp_kernels_sycl_mpi.o $(ALL_INC) + $(MPICPP) $(SYCLFLAGS) -DUSE_MPI -c sycl/opp_kernels.cpp -o $(OBJ)/opp_kernels_sycl_mpi.o $(ALL_INC) field_solver+sycl_mpi: - $(MPICPP) $(SYCLFLAGS) -DUSE_MPI -DUSE_SYCL -c field_solver/field_solver.cpp -o $(OBJ)/field_solver_sycl_mpi.o $(ALL_INC) + $(MPICPP) $(SYCLFLAGS) -DUSE_MPI -c field_solver/field_solver.cpp -o $(OBJ)/field_solver_sycl_mpi.o $(ALL_INC) opp_mpi_hdf5+sycl_mpi: - $(MPICPP) $(SYCLFLAGS) -DUSE_MPI -DUSE_SYCL -c $(OPP_PATH)/src/hdf5/opp_mpi_hdf5.cpp -o $(OBJ)/opp_mpi_hdf5_sycl_mpi_hdf5.o $(ALL_INC) + $(MPICPP) $(SYCLFLAGS) -DUSE_MPI -c $(OPP_PATH)/src/hdf5/opp_mpi_hdf5.cpp -o $(OBJ)/opp_mpi_hdf5_sycl_mpi_hdf5.o $(ALL_INC) sycl_mpi: mklib fempic_opp+sycl_mpi opp_kernels+sycl_mpi field_solver+sycl_mpi - $(MPICPP) $(CPPFLAGS) -DUSE_MPI -o $(BIN)/sycl_mpi \ + $(MPICPP) $(SYCLFLAGS) -DUSE_MPI -o $(BIN)/sycl_mpi \ $(OBJ)/fempic_opp_sycl_mpi.o \ $(OBJ)/opp_kernels_sycl_mpi.o \ $(OBJ)/field_solver_sycl_mpi.o \ $(ALL_LIBS) $(SYCL_LIB) -lopp_sycl_mpi sycl_mpi_hdf5: mklib fempic_opp+sycl_mpi opp_kernels+sycl_mpi field_solver+sycl_mpi opp_mpi_hdf5+sycl_mpi - $(MPICPP) $(CPPFLAGS) -DUSE_MPI -o $(BIN)/sycl_mpi_hdf5 \ + $(MPICPP) $(SYCLFLAGS) -DUSE_MPI -o $(BIN)/sycl_mpi_hdf5 \ $(OBJ)/fempic_hdf5_opp_sycl_mpi_hdf5.o \ $(OBJ)/opp_kernels_sycl_mpi_hdf5.o \ $(OBJ)/opp_mpi_hdf5_sycl_mpi_hdf5.o \ diff --git a/opp_lib/include/opp_cuda.h b/opp_lib/include/opp_cuda.h index 04f04b4..6952cd5 100644 --- a/opp_lib/include/opp_cuda.h +++ b/opp_lib/include/opp_cuda.h @@ -490,7 +490,7 @@ class opp_mem { // Copy data from host to device, create new device arrays if requested template inline static void copy_host_to_dev(T*& data_d, const T *data_h, size_t copy_count, - bool create_new = false, size_t alloc_count = 0) { + bool no_wait = false, bool create_new = false, size_t alloc_count = 0) { if (create_new) { if (data_d != nullptr) opp_mem::dev_free(data_d); diff --git a/opp_lib/include/opp_sycl.h b/opp_lib/include/opp_sycl.h index 4c7a8c4..24feff7 100644 --- a/opp_lib/include/opp_sycl.h +++ b/opp_lib/include/opp_sycl.h @@ -47,9 +47,6 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. constexpr bool debug_mem = false; -#define cutilSafeCall(err) // these need to be remmoved -#define cutilCheckMsg(msg) // these need to be remmoved - #define OPP_GPU_THREADS_PER_BLOCK 32 #define OPP_PARTICLE_MOVE_DONE { m.move_status = OPP_MOVE_DONE; } @@ -58,31 +55,25 @@ constexpr bool debug_mem = false; #define OPP_DO_ONCE (m.iteration_one) #define OPP_MOVE_RESET_FLAGS { m.move_status = OPP_MOVE_DONE; m.iteration_one = true; } +#define OPP_DEVICE_SYNCHRONIZE() opp_queue->wait() +#define OPP_DEVICE_GLOBAL_LINEAR_ID (item.get_global_linear_id()) +#define OPP_GLOBAL_FUNCTION inline +#define OPP_DEVICE_FUNCTION +#define ADDITIONAL_PARAMETERS , sycl::nd_item<1> item +#define OPP_ATOMIC_FETCH_ADD(address, value) opp_atomic_fetch_add(address, value) + extern int* opp_saved_mesh_relation_d; -extern size_t opp_saved_mesh_relation_size; -extern dpct::device_vector ps_cell_index_dv; -extern dpct::device_vector ps_swap_indices_dv; -// extern char *OPP_need_remove_flags_d; +extern OPP_INT* hf_from_indices_dp; +extern OPP_INT* ps_swap_indices_dp; extern int *OPP_move_particle_indices_d; extern int *OPP_move_cell_indices_d; extern int *OPP_move_count_d; -// extern dpct::device_vector OPP_thrust_move_particle_indices_d; -// extern dpct::device_vector OPP_thrust_move_cell_indices_d; - extern int *OPP_remove_particle_indices_d; -// extern int *OPP_remove_count_d; -// extern dpct::device_vector OPP_thrust_remove_particle_indices_d; -// extern dpct::device_vector ps_to_indices_dv; -extern dpct::device_vector hf_from_indices_dv; -extern dpct::device_vector hf_sequence_dv; - -extern std::map> - cell_indices_hv; // cellid in the foreign rank, arrange according to rank -extern std::map> - particle_indices_hv; // particle ids to send, arrange according to rank +extern std::map> cell_indices_hv; // cellid in the foreign rank, arrange according to rank +extern std::map> particle_indices_hv; // particle ids to send, arrange according to rank extern std::map> particle_indices_dv; extern std::map> send_data; extern std::map> recv_data; @@ -96,59 +87,187 @@ extern char *OPP_consts_h, *OPP_consts_d; extern std::vector opp_consts; //************************************************************************************************* - -void __cudaSafeCall(dpct::err0 err, const char *file, const int line); - -void __cutilCheckMsg(const char *errorMessage, const char *file, const int line); - void opp_sycl_init(int argc, char **argv); void opp_sycl_exit(); -// Copy a map from host to device void opp_upload_map(opp_map map, bool create_new = false); +void opp_create_dat_device_arrays(opp_dat dat, bool create_new = false); /*******************************************************************************/ - void opp_halo_create(); void opp_halo_destroy(); /*******************************************************************************/ - void opp_init_double_indirect_reductions_cuda(int nargs, opp_arg *args); void opp_exchange_double_indirect_reductions_cuda(int nargs, opp_arg *args) ; void opp_complete_double_indirect_reductions_cuda(int nargs, opp_arg *args); /*******************************************************************************/ - - void print_dat_to_txtfile_mpi(opp_dat dat, const char *file_name); void opp_mpi_print_dat_to_txtfile(opp_dat dat, const char *file_name); -void opp_copy_host_to_device(void **data_d, void **data_h, size_t copy_size, - size_t alloc_size = 0, bool create_new = false); - -void opp_create_dat_device_arrays(opp_dat dat, bool create_new = false); - void opp_finalize_particle_move_cuda(opp_set set); - void particle_sort_device(opp_set set, bool hole_filling); /*******************************************************************************/ +class opp_mem { -inline void opp_mpi_reduce(opp_arg *args, double *data) -{ -#ifdef USE_MPI - opp_mpi_reduce_double(args, data); -#else - (void)args; - (void)data; -#endif -} +public: + + // Allocate host memory + template + inline static T* host_malloc(size_t count) { + return (T*)malloc(count * sizeof(T)); + } + + // Free host memory + template + inline static void host_free(T* ptr) { + if (ptr) + free(ptr); + ptr = nullptr; + } + + // Reallocate host memory + template + inline static void host_realloc(T*& ptr, size_t new_size) { + T* tmp_ptr = (T*)realloc(ptr, new_size); + ptr = tmp_ptr; + } + + // Allocate device memory + template + inline static T* dev_malloc(size_t count) { + if (count <= 0) return nullptr; + try { + T* ptr = sycl::malloc_device(count * sizeof(T), *opp_queue); + if (debug_mem) opp_printf("dev_malloc", "[%p][%zu]", ptr, count); + return ptr; + } + catch (const sycl::exception &e) { + throw std::runtime_error(std::string("dev_malloc: ") + e.what()); + } + } + + // Free device memory + template + inline static void dev_free(T*& ptr) { + if (ptr) { + if (debug_mem) opp_printf("dev_free", "[%p]", ptr); + sycl::free(ptr, *opp_queue); + ptr = nullptr; + } + } + + // Copy memory from one device pointer to another + template + inline static void dev_memcpy(T*& dst, const T* src, size_t cpy_count) { + if (debug_mem) opp_printf("dev_memcpy", "[%p]->[%p] cpy_count[%zu]", src, dst, cpy_count); + opp_queue->memcpy(dst, src, cpy_count * sizeof(T)).wait(); + } -inline void opp_mpi_reduce(opp_arg *args, int *data) + // Resize device memory + template + inline static void dev_realloc(T*& ptr, size_t& current_size, const size_t& new_size) { + if (new_size <= 0) + throw std::runtime_error("dev_realloc: New Realloc size invalid - " + std::to_string(new_size)); + T* new_ptr = opp_mem::dev_malloc(new_size); + if (debug_mem) opp_printf("dev_realloc", "created [%p] old [%p]", new_ptr, ptr); + if (ptr) { + const size_t copy_size = std::min(current_size, new_size); + opp_mem::dev_memcpy(new_ptr, ptr, copy_size); + opp_mem::dev_free(ptr); + current_size = new_size; + } + if (debug_mem) opp_printf("dev_realloc", "[%p]->[%p] cpy_count[%zu]", ptr, new_ptr, current_size); + ptr = new_ptr; + } + + // Resize device memory (only increasing size) + template + inline static void dev_resize(T*& ptr, size_t& current_size, const size_t& new_size) { + if (debug_mem) opp_printf("dev_resize", "[%p] %zu -> %zu", ptr, current_size, new_size); + if (new_size > current_size) { + opp_mem::dev_realloc(ptr, current_size, new_size); + } + } + + // initialize device memory with a specific value + template + inline static void dev_memset(T* ptr, size_t count, T value) { + opp_queue->fill(ptr, value, count).wait(); + } + + // Allocate and initialize device memory with a specific value + template + inline static T* dev_malloc_set(size_t count, T value) { + T* ptr = opp_mem::dev_malloc(count); + opp_mem::dev_memset(ptr, count, value); + return ptr; + } + + // Copy data from host to device, create new device arrays if requested + template + inline static void copy_host_to_dev(T*& data_d, const T *data_h, size_t copy_count, + bool no_wait = false, bool create_new = false, size_t alloc_count = 0) { + try { + if (create_new) { + if (data_d != nullptr) + opp_mem::dev_free(data_d); + data_d = opp_mem::dev_malloc(alloc_count); + } + opp_queue->memcpy(data_d, data_h, copy_count * sizeof(T)); + if (!no_wait) + opp_queue->wait(); + if (debug_mem) opp_printf("copy_host_to_dev", "[%p]->[%p] copy_count[%zu]", data_h, data_d, copy_count); + } + catch (const sycl::exception &e) { + throw std::runtime_error(std::string("opp_mem::copy_host_to_dev: ") + e.what()); + } + } + + // Copy data from device to host, no dot create new host arrays since it can be allocated differently, + // like malloc, new, stack, std::vector<>, hence free mechanism is unknown + template + inline static void copy_dev_to_host(T* data_h, const T *data_d, size_t copy_count, + bool no_wait = false) { + try { + opp_queue->memcpy(data_h, data_d, copy_count * sizeof(T)); + if (!no_wait) + opp_queue->wait(); + if (debug_mem) opp_printf("copy_dev_to_host", "[%p]->[%p] copy_count[%zu]", data_d, data_h, copy_count); + } + catch (const sycl::exception &e) { + throw std::runtime_error(std::string("opp_mem::copy_dev_to_host: ") + e.what()); + } + } + + // Copy data from device to device, create new device arrays if requested + template + inline static void copy_dev_to_dev(T*& data_d, const T *data_h, size_t copy_count, + bool no_wait = false, bool create_new = false, size_t alloc_count = 0) { + try { + return opp_mem::copy_host_to_dev(data_d, data_h, copy_count, no_wait, create_new, alloc_count); + } + catch (const sycl::exception &e) { + throw std::runtime_error(std::string("opp_mem::copy_dev_to_dev: ") + e.what()); + } + } +}; + +/*******************************************************************************/ +template +inline void opp_mpi_reduce(opp_arg *args, T *data) { #ifdef USE_MPI - opp_mpi_reduce_int(args, data); + if constexpr (std::is_same::value) { + opp_mpi_reduce_double(args, data); + } else if constexpr (std::is_same::value) { + opp_mpi_reduce_int(args, data); + } else { + static_assert(std::is_same::value || std::is_same::value, + "Unsupported data type for opp_mpi_reduce."); + } #else (void)args; (void)data; @@ -159,15 +278,11 @@ inline void opp_mpi_reduce(opp_arg *args, int *data) // routines to resize constant/reduct arrays, if necessary void opp_reallocReductArrays(int reduct_bytes); - void opp_mvReductArraysToDevice(int reduct_bytes); - void opp_mvReductArraysToHost(int reduct_bytes); void opp_reallocConstArrays(int consts_bytes); - void opp_mvConstArraysToDevice(int consts_bytes); - void opp_mvConstArraysToHost(int consts_bytes); template @@ -325,159 +440,7 @@ void copy_according_to_index(dpct::device_vector *in_dat_dv, 0, 0, size, dim); } -// template -// void opp_device_memset(T* data, T value, size_t size) { -// opp_queue->submit([&](sycl::handler& cgh) { -// cgh.parallel_for(sycl::range<1>(size), [=](sycl::id<1> idx) { -// data[idx] = value; -// }); -// }).wait(); // Wait for the kernel to finish -// } - -/*******************************************************************************/ -class opp_mem { - -public: - - // Allocate host memory - template - inline static T* host_malloc(size_t count) { - return (T*)malloc(count * sizeof(T)); - } - - // Free host memory - template - inline static void host_free(T* ptr) { - if (ptr) - free(ptr); - ptr = nullptr; - } - - // Reallocate host memory - template - inline static void host_realloc(T*& ptr, size_t new_size) { - T* tmp_ptr = (T*)realloc(ptr, new_size); - ptr = tmp_ptr; - } - - // Allocate device memory - template - inline static T* dev_malloc(size_t count) { - if (count <= 0) return nullptr; - try { - T* ptr = sycl::malloc_device(count * sizeof(T), *opp_queue); - if (debug_mem) opp_printf("dev_malloc", "[%p][%zu]", ptr, count); - return ptr; - } - catch (const sycl::exception &e) { - throw std::runtime_error(std::string("dev_malloc: ") + e.what()); - } - } - - // Free device memory - template - inline static void dev_free(T*& ptr) { - if (ptr) { - if (debug_mem) opp_printf("dev_free", "[%p]", ptr); - sycl::free(ptr, *opp_queue); - } - } - - // Copy memory from one device pointer to another - template - inline static void dev_memcpy(T*& dst, const T* src, size_t cpy_count) { - if (debug_mem) opp_printf("dev_memcpy", "[%p]->[%p] cpy_count[%zu]", src, dst, cpy_count); - opp_queue->memcpy(dst, src, cpy_count * sizeof(T)).wait(); - } - - // Resize device memory - template - inline static void dev_realloc(T*& ptr, size_t& current_size, const size_t& new_size) { - if (new_size <= 0) - throw std::runtime_error("dev_realloc: New Realloc size invalid - " + std::to_string(new_size)); - T* new_ptr = opp_mem::dev_malloc(new_size); - if (debug_mem) opp_printf("dev_realloc", "created [%p] old [%p]", new_ptr, ptr); - if (ptr) { - const size_t copy_size = std::min(current_size, new_size); - opp_mem::dev_memcpy(new_ptr, ptr, copy_size); - opp_mem::dev_free(ptr); - current_size = new_size; - if (debug_mem) opp_printf("dev_realloc", "[%p]->[%p] cpy_count[%zu]", ptr, new_ptr, copy_size); - } - ptr = new_ptr; - } - - // Resize device memory (only increasing size) - template - inline static void dev_resize(T*& ptr, size_t& current_size, const size_t& new_size) { - if (debug_mem) opp_printf("dev_resize", "[%p] %zu -> %zu", ptr, current_size, new_size); - if (new_size > current_size) { - opp_mem::dev_realloc(ptr, current_size, new_size); - } - } - - // initialize device memory with a specific value - template - inline static void dev_memset(T* ptr, size_t count, T value) { - opp_queue->fill(ptr, value, count).wait(); - } - - // Allocate and initialize device memory with a specific value - template - inline static T* dev_malloc_set(size_t count, T value) { - T* ptr = opp_mem::dev_malloc(count); - opp_mem::dev_memset(ptr, count, value); - return ptr; - } - - // Copy data from host to device, create new device arrays if requested - template - inline static void copy_host_to_dev(T*& data_d, const T *data_h, size_t copy_count, - bool no_wait = false, bool create_new = false, size_t alloc_count = 0) { - try { - if (create_new) { - if (data_d != nullptr) - opp_mem::dev_free(data_d); - data_d = opp_mem::dev_malloc(alloc_count); - } - opp_queue->memcpy(data_d, data_h, copy_count * sizeof(T)); - if (!no_wait) - opp_queue->wait(); - if (debug_mem) opp_printf("copy_host_to_dev", "[%p]->[%p] copy_count[%zu]", data_h, data_d, copy_count); - } - catch (const sycl::exception &e) { - throw std::runtime_error(std::string("opp_mem::copy_host_to_dev: ") + e.what()); - } - } - - // Copy data from device to host, no dot create new host arrays since it can be allocated differently, - // like malloc, new, stack, std::vector<>, hence free mechanism is unknown - template - inline static void copy_dev_to_host(T* data_h, const T *data_d, size_t copy_count, - bool no_wait = false) { - try { - opp_queue->memcpy(data_h, data_d, copy_count * sizeof(T)); - if (!no_wait) - opp_queue->wait(); - if (debug_mem) opp_printf("copy_dev_to_host", "[%p]->[%p] copy_count[%zu]", data_d, data_h, copy_count); - } - catch (const sycl::exception &e) { - throw std::runtime_error(std::string("opp_mem::copy_dev_to_host: ") + e.what()); - } - } -}; - /*******************************************************************************/ -#define OPP_DEVICE_SYNCHRONIZE() opp_queue->wait() -#define OPP_DEVICE_GLOBAL_LINEAR_ID (item.get_global_linear_id()) -#define OPP_GLOBAL_FUNCTION inline -#define OPP_DEVICE_FUNCTION -#define ADDITIONAL_PARAMETERS , sycl::nd_item<1> item -#define OPP_ATOMIC_FETCH_ADD(address, value) opp_atomic_fetch_add(address, value) - -/*******************************************************************************/ - -//**************************************** template void copy_from( const T* in_dat_d, T* out_dat_d, @@ -497,7 +460,7 @@ void copy_from( } } -//**************************************** +/*******************************************************************************/ template void copy_from_to( const T* in_dat_d, T* out_dat_d, @@ -518,6 +481,7 @@ void copy_from_to( } } +/*******************************************************************************/ template void opp_register_const(T*& ptr, const size_t count) { if (ptr == nullptr) { @@ -526,15 +490,32 @@ void opp_register_const(T*& ptr, const size_t count) { } } +/*******************************************************************************/ template T opp_atomic_fetch_add(T* address, T value) { return dpct::atomic_fetch_add(address, value); } +/*******************************************************************************/ inline void opp_set_stride(OPP_INT*& data_d, OPP_INT& data_h, OPP_INT new_data) { opp_register_const(data_d, 1); if (data_h != new_data) { data_h = new_data; opp_mem::copy_host_to_dev(data_d, &data_h, 1); } -} \ No newline at end of file +} + +/*******************************************************************************/ +template +inline void write_T_array_to_file(const T* array, size_t size, const std::string& filename) { + std::ofstream outFile(filename); + if (!outFile) { + std::cerr << "Error opening file: " << filename << std::endl; + return; + } + outFile << size << " 1 -- 0 0\n"; + for (int i = 0; i < size; ++i) { + outFile << " " << array[i] << "\n"; + } + outFile.close(); +} diff --git a/opp_lib/src/sycl/opp_increase_part_count.cpp b/opp_lib/src/sycl/opp_increase_part_count.cpp index d0ded83..7878655 100644 --- a/opp_lib/src/sycl/opp_increase_part_count.cpp +++ b/opp_lib/src/sycl/opp_increase_part_count.cpp @@ -141,13 +141,12 @@ void opp_inc_part_count_with_distribution(opp_set set, int num_particles_to_inse opp_printf("opp_inc_part_count_with_distribution", "Calculating all from new"); - opp_queue->submit([&](sycl::handler &cgh) { cgh.parallel_for(sycl::nd_range<1>(nthread*nblocks,nthread), kernel); }).wait(); } else { - const size_t copy_size = (end - start) * sizeof(int); + const size_t copy_size = (end - start); int* inj_mesh_relations_d = (int *)mesh_rel_dat->data_d + inj_start; if (opp_saved_mesh_relation_d == nullptr) { @@ -156,16 +155,15 @@ void opp_inc_part_count_with_distribution(opp_set set, int num_particles_to_inse "Allocating saved_mesh_relation_d with size [%zu]", copy_size); opp_saved_mesh_relation_size = copy_size; - opp_saved_mesh_relation_d = (int *)sycl::malloc_device( - copy_size, *opp_queue); + opp_saved_mesh_relation_d = opp_mem::dev_malloc(copy_size); opp_queue->submit([&](sycl::handler &cgh) { cgh.parallel_for(sycl::nd_range<1>(nthread*nblocks,nthread), kernel); }).wait(); // save the mesh relation data for next iteration - opp_queue->memcpy(opp_saved_mesh_relation_d, inj_mesh_relations_d, - copy_size).wait(); + opp_mem::copy_dev_to_dev(opp_saved_mesh_relation_d, + inj_mesh_relations_d, copy_size); } else { if (OPP_DBG) @@ -179,8 +177,8 @@ void opp_inc_part_count_with_distribution(opp_set set, int num_particles_to_inse } // Copy from the saved mesh relation data - opp_queue->memcpy(inj_mesh_relations_d, opp_saved_mesh_relation_d, - copy_size).wait(); + opp_mem::copy_dev_to_dev(inj_mesh_relations_d, + opp_saved_mesh_relation_d, copy_size); } } } diff --git a/opp_lib/src/sycl/opp_particle_mover.cpp b/opp_lib/src/sycl/opp_particle_mover.cpp index 5f06702..cd856db 100644 --- a/opp_lib/src/sycl/opp_particle_mover.cpp +++ b/opp_lib/src/sycl/opp_particle_mover.cpp @@ -47,14 +47,16 @@ void opp_part_unpack_device_direct(opp_set set); //******************************************************************************* bool opp_finalize_particle_move(opp_set set) { - opp_profiler->start("Mv_Finalize"); - OPP_DEVICE_SYNCHRONIZE(); + opp_profiler->start("Mv_Finalize"); + // this is the exchange particle count + OPP_move_count_h = -1; opp_mem::copy_dev_to_host(&OPP_move_count_h, OPP_move_count_d, 1, true); // remove count is the addition of removed particles and the exchange count + set->particle_remove_count = -1; opp_mem::copy_dev_to_host(&(set->particle_remove_count), set->particle_remove_count_d, 1); if (OPP_DBG) @@ -86,22 +88,27 @@ bool opp_finalize_particle_move(opp_set set) if (OPP_DBG) opp_printf("opp_finalize_particle_move", "hole fill set [%s]", set->name); - + opp_profiler->start("Mv_holefill"); particle_hole_fill_device(set); + opp_profiler->end("Mv_holefill"); } else if (OPP_fill_type == OPP_Sort_All || OPP_fill_type == OPP_Sort_Periodic) { if (OPP_DBG) opp_printf("opp_finalize_particle_move", "sort set [%s]", set->name); + opp_profiler->start("Mv_sort"); opp_particle_sort(set); + opp_profiler->end("Mv_sort"); } else if (OPP_fill_type == OPP_Shuffle_All || OPP_fill_type == OPP_Shuffle_Periodic) { if (OPP_DBG) opp_printf("opp_finalize_particle_move", "shuffle set [%s]", set->name); + opp_profiler->start("Mv_shuffle"); particle_sort_device(set, true); // true will shuffle the particles + opp_profiler->end("Mv_shuffle"); } else { opp_abort("OPP_fill_type is undefined"); @@ -146,9 +153,9 @@ bool opp_finalize_particle_move(opp_set set) OPP_comm_iteration++; opp_profiler->end("Mv_Finalize"); - return true; // need to run another communication iteration (particle move loop) #else + opp_profiler->end("Mv_Finalize"); return false; #endif } @@ -171,7 +178,7 @@ void opp_init_particle_move(opp_set set, int nargs, opp_arg *args) opp_mem::dev_memcpy(set->particle_remove_count_d, &(set->particle_remove_count), 1); - const size_t buffer_alloc_size = (size_t)(set->size * OPP_part_alloc_mult / 2); + const size_t buffer_alloc_size = (size_t)(set->set_capacity / 4); if (buffer_alloc_size > opp_move_particle_indices_h) { opp_mem::dev_resize(OPP_move_particle_indices_d, @@ -868,9 +875,7 @@ void opp_part_pack_and_exchange_device_direct(opp_set set) mpi_buffers->buffers[it->first].buf_export_index = 0; // make export indices to zero for next iteration } - // for (const auto& x : streams) cudaStreamDestroy(x.second); - cutilSafeCall( - DPCT_CHECK_ERROR(dpct::get_current_device().queues_wait_and_throw())); + OPP_DEVICE_SYNCHRONIZE(); opp_profiler->end("Mv_PackExDir"); #endif @@ -995,8 +1000,7 @@ void opp_part_unpack_device_direct(opp_set set) } } - cutilSafeCall(DPCT_CHECK_ERROR( - dpct::get_current_device().queues_wait_and_throw())); + OPP_DEVICE_SYNCHRONIZE(); } opp_profiler->end("Mv_UnpackDir"); diff --git a/opp_lib/src/sycl/opp_particle_organize.cpp b/opp_lib/src/sycl/opp_particle_organize.cpp index 4f736f9..e07fdb3 100644 --- a/opp_lib/src/sycl/opp_particle_organize.cpp +++ b/opp_lib/src/sycl/opp_particle_organize.cpp @@ -33,11 +33,12 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include constexpr int opp_const_threads_per_block = 192; +constexpr int const_blocks = 200; -dpct::device_vector hf_from_indices_dv; // holefill from indices - starting in reverse order) -dpct::device_vector hf_sequence_dv; // sequence of numbers required for hole filling -dpct::device_vector ps_cell_index_dv; // cell indices required for sorting/swapping -dpct::device_vector ps_swap_indices_dv; // swap (from mapping) indices required for sorting/swapping +size_t hf_from_indices_size = 0; +OPP_INT* hf_from_indices_dp = nullptr; // holefill from indices - starting in reverse order +size_t ps_swap_indices_size = 0; +OPP_INT* ps_swap_indices_dp = nullptr; // swap (from mapping) indices required for sorting/swapping //**************************************** void opp_particle_sort(opp_set set) @@ -45,30 +46,9 @@ void opp_particle_sort(opp_set set) particle_sort_device(set, false); } -//**************************************** -unsigned int seed = 123; // Seed for random number generator -struct RandomFunctor -{ - mutable dpl::default_engine rng; - - RandomFunctor(unsigned int seed) : rng(seed) {} - - int operator()(int value) const { - if (value == MAX_CELL_INDEX) { - return value; // Keep the value of MAX_CELL_INDEX unchanged - } - else { - dpl::uniform_int_distribution dist(0, 1000000000); - return dist(rng); // Assign random number < 100 - } - } -}; - - - //**************************************** template -void sort_dat_according_to_index(opp_dat dat, const dpct::device_vector &swap_indices, +void sort_dat_according_to_index(opp_dat dat, const OPP_INT* swap_indices, const int set_capacity, const int size, bool shuffle, const int out_start_idx) { // if hole filling and second communicator iteration: arrange only the newly added particles @@ -82,14 +62,13 @@ void sort_dat_according_to_index(opp_dat dat, const dpct::device_vector &sw T *dat_data = (T *)(dat->data_d); T *dat_swap_data = (T *)(dat->data_swap_d); - const int *swap_indices_ptr = (int *)dpct::get_raw_pointer(swap_indices.data()); + const int *swap_indices_ptr = swap_indices; const int dat_dim = dat->dim; const int nblocks = (size - 1) / opp_const_threads_per_block + 1; // data will be swapped from dat->data_d to dat->data_swap_d according to swap_indices opp_queue->submit([&](sycl::handler &cgh) { - cgh.parallel_for( sycl::nd_range<1>(opp_const_threads_per_block*nblocks,opp_const_threads_per_block), [=](sycl::nd_item<1> item) { @@ -104,13 +83,11 @@ void sort_dat_according_to_index(opp_dat dat, const dpct::device_vector &sw }); if (shuffle && OPP_comm_iteration != 0) { - OPP_DEVICE_SYNCHRONIZE(); // Now the valid data is in dat->data_swap_d, but since it is not the 1st communicator iteration // we only swap the newly added indices (since newly added particle count is small compared to // existing particles), simply copy the required indices from dat->data_swap_d to dat->data_d - for (int d = 0; d < dat->dim; d++) std::copy(dpl::execution::make_device_policy(*opp_queue), (dat_swap_data + d * set_capacity + out_start_idx), @@ -147,52 +124,73 @@ void particle_sort_device(opp_set set, bool shuffle) set->size, set->diff, set_capacity, set_size_plus_removed, (shuffle ? "TRUE" : "FALSE"), OPP_comm_iteration, sort_start_index, sort_size); - opp_profiler->start("PS_Resize"); - ps_cell_index_dv.reserve(set->set_capacity); - ps_cell_index_dv.resize(sort_size); - opp_profiler->end("PS_Resize"); - OPP_INT* cellIdx_dp = (OPP_INT*)set->mesh_relation_dat->data_d; // this is cell id data of particle set + OPP_INT* ps_cell_index_dp = (OPP_INT*)set->mesh_relation_dat->data_swap_d; // temp using swap array if (shuffle) { - opp_profiler->start("PS_Shuffle"); // randomize the cell indices to minimize shared memory issues in later PIC routines // The below will create random numbers for each index, and MAX_CELL_INDEX for removed, - std::transform(dpl::execution::make_device_policy(*opp_queue), - cellIdx_dp + sort_start_index, cellIdx_dp + set_size_plus_removed, - ps_cell_index_dv.begin(), - RandomFunctor(seed)); + opp_profiler->start("PS_Shuffle"); + const uint32_t shuffle_seed = 123; + const int size = (set_size_plus_removed - sort_start_index); + opp_queue->submit([&](sycl::handler& cgh) { + cgh.parallel_for(sycl::nd_range<1>(OPP_gpu_threads_per_block * const_blocks, OPP_gpu_threads_per_block), + [=](sycl::nd_item<1> item) { + dpl::uniform_int_distribution dist(0, 1000); + for (size_t i = item.get_global_linear_id(); i < size; i += item.get_global_range()[0] ){ + dpl::minstd_rand engine(shuffle_seed, i); + ps_cell_index_dp[i] = (cellIdx_dp[i + sort_start_index] == MAX_CELL_INDEX) ? + MAX_CELL_INDEX : dist(engine); + } + } + ); + }); + OPP_DEVICE_SYNCHRONIZE(); opp_profiler->end("PS_Shuffle"); } else { // copy the cell index to the thrust vector for sorting opp_profiler->start("PS_CopyCID"); - std::copy(dpl::execution::make_device_policy(*opp_queue), - cellIdx_dp + sort_start_index, cellIdx_dp + set_size_plus_removed, - ps_cell_index_dv.begin()); + const int size = (set_size_plus_removed - sort_start_index); + opp_queue->submit([&](sycl::handler& cgh) { + cgh.parallel_for(sycl::nd_range<1>(OPP_gpu_threads_per_block * const_blocks, OPP_gpu_threads_per_block), + [=](sycl::nd_item<1> item) { + for (int i = item.get_global_linear_id(); i < size; i += item.get_global_range()[0] ){ + ps_cell_index_dp[i] = cellIdx_dp[i + sort_start_index]; + } + } + ); + }); + OPP_DEVICE_SYNCHRONIZE(); opp_profiler->end("PS_CopyCID"); } // Create a sequence of numbers starting from sort_start_index to be used as swap indices opp_profiler->start("PS_Sequence"); - ps_swap_indices_dv.reserve(set->set_capacity); - ps_swap_indices_dv.resize(sort_size); - dpct::iota(dpl::execution::make_device_policy(*opp_queue), - ps_swap_indices_dv.begin(), ps_swap_indices_dv.end(), - sort_start_index); + opp_mem::dev_resize(ps_swap_indices_dp, ps_swap_indices_size, (size_t)set->set_capacity); + opp_queue->submit([&](sycl::handler& cgh) { + OPP_INT* ps_swap_indices = ps_swap_indices_dp; + cgh.parallel_for(sycl::nd_range<1>(OPP_gpu_threads_per_block * const_blocks, OPP_gpu_threads_per_block), + [=](sycl::nd_item<1> item) { + for (int i = item.get_global_linear_id(); i < sort_size; i += item.get_global_range()[0] ){ + ps_swap_indices[i] = i; + } + } + ); + }); + OPP_DEVICE_SYNCHRONIZE(); opp_profiler->end("PS_Sequence"); - // sort ps_swap_indices_dv, with the key ps_cell_index_dv. Both keys and values will be sorted + // sort ps_swap_indices_dp, with the key ps_cell_index_dp. Both keys and values will be sorted opp_profiler->start("PS_SortKey"); - dpct::sort(dpl::execution::make_device_policy(*opp_queue), - ps_cell_index_dv.begin(), ps_cell_index_dv.end(), - ps_swap_indices_dv.begin()); + dpl::sort_by_key(dpl::execution::make_device_policy(*opp_queue), + ps_cell_index_dp, ps_cell_index_dp + sort_size, + ps_swap_indices_dp); opp_profiler->end("PS_SortKey"); + // Reorder the dats according to ps_swap_indices_dp (from mapping) opp_profiler->start("PS_Dats"); - for (int i = 0; i < (int)set->particle_dats->size(); i++) { - - opp_dat& dat = set->particle_dats->at(i); + for (opp_dat& dat : *(set->particle_dats)) { if (!(strstr(dat->type, ":soa") != NULL || OPP_auto_soa || (dat->dim > 1))) { std::cerr << "particle_sort_device not implemented for non SOA data structures [dat " @@ -202,13 +200,12 @@ void particle_sort_device(opp_set set, bool shuffle) if (strcmp(dat->type, "int") == 0) { - sort_dat_according_to_index(dat, ps_swap_indices_dv, + sort_dat_according_to_index(dat, ps_swap_indices_dp, set_capacity, sort_size, shuffle, sort_start_index); } else if (strcmp(dat->type, "double") == 0) { - dpct::has_capability_or_fail(opp_queue->get_device(), {sycl::aspect::fp64}); - sort_dat_according_to_index(dat, ps_swap_indices_dv, + sort_dat_according_to_index(dat, ps_swap_indices_dp, set_capacity, sort_size, shuffle, sort_start_index); } else { @@ -224,47 +221,59 @@ void particle_sort_device(opp_set set, bool shuffle) // This assumes all the device data to be valid void particle_hole_fill_device(opp_set set) { - const int set_capacity = set->set_capacity; - const int part_remove_count = set->particle_remove_count; - const int set_size_plus_removed = set->size + part_remove_count; - const int nblocks = (part_remove_count - 1) / opp_const_threads_per_block + 1; + const OPP_INT set_capacity = set->set_capacity; + const OPP_INT part_remove_count = set->particle_remove_count; + const OPP_INT set_size_plus_removed = set->size + part_remove_count; + const OPP_INT nblocks = (part_remove_count - 1) / opp_const_threads_per_block + 1; if (OPP_DBG) opp_printf("particle_hole_fill_device", "remove=%d set_size+removed=%d capacity=%d", part_remove_count, set_size_plus_removed, set_capacity); - int sort_start_index = 0; - int sort_size = set_size_plus_removed; - // sort OPP_remove_particle_indices_d since it can be in shuffled state opp_profiler->start("HF_SORT"); dpl::sort(dpl::execution::make_device_policy(*opp_queue), - OPP_remove_particle_indices_d, - OPP_remove_particle_indices_d + part_remove_count); + OPP_remove_particle_indices_d, OPP_remove_particle_indices_d + part_remove_count); opp_profiler->end("HF_SORT"); - // resize hf_sequence_dv and hf_from_indices_dv if required - if (hf_sequence_dv.capacity() < sort_size) { - - hf_sequence_dv.resize(set_capacity); - dpct::iota(dpl::execution::make_device_policy(*opp_queue), - hf_sequence_dv.begin(), hf_sequence_dv.end(), 0); + // resize only if hf_from_indices_size < set_capacity + opp_mem::dev_resize(hf_from_indices_dp, hf_from_indices_size, (size_t)set_capacity); - hf_from_indices_dv.reserve(set_size_plus_removed); - } - hf_from_indices_dv.resize(set_size_plus_removed); + OPP_INT* swap_indices_dp = (OPP_INT *)set->particle_remove_count_d; // using dev ptr temporarily + OPP_INT tmp = set_size_plus_removed - 1; + opp_mem::copy_host_to_dev(swap_indices_dp, &tmp, 1); - // get the particle indices in reverse order whose cell index is not MAX_CELL_INDEX + // Find the hole fill from indices (order may not be preserved) opp_profiler->start("HF_COPY_IF"); - auto end_iter1 = dpct::copy_if(dpl::execution::make_device_policy(*opp_queue), - dpl::make_reverse_iterator(hf_sequence_dv.begin() + set_size_plus_removed), - dpl::make_reverse_iterator(hf_sequence_dv.begin() + sort_start_index), - dpl::make_reverse_iterator(((OPP_INT*)set->mesh_relation_dat->data_d) + set_size_plus_removed), - hf_from_indices_dv.begin(), - [](int i) { return i != MAX_CELL_INDEX; }); - hf_from_indices_dv.resize(part_remove_count); + opp_queue->submit([&](sycl::handler& cgh) { + const OPP_INT* cid_dp = (OPP_INT*)set->mesh_relation_dat->data_d; + OPP_INT* hf_from_indices = hf_from_indices_dp; + cgh.parallel_for( + sycl::nd_range<1>(opp_const_threads_per_block*nblocks,opp_const_threads_per_block), + [=](sycl::nd_item<1> item) { + const OPP_INT idx = item.get_global_linear_id(); + if (idx < part_remove_count) { + while (true) { + const OPP_INT pos = opp_atomic_fetch_add(swap_indices_dp, -1); + if (cid_dp[pos] != MAX_CELL_INDEX) { + hf_from_indices[idx] = pos; + break; + } + } + } + } + ); + }); + OPP_DEVICE_SYNCHRONIZE(); opp_profiler->end("HF_COPY_IF"); + // Sort the hole fill from indices to avoid hole-filling issues + opp_profiler->start("HF_COPY_IF_SORT"); + dpl::sort(dpl::execution::make_device_policy(*opp_queue), + hf_from_indices_dp, hf_from_indices_dp + part_remove_count, + std::greater()); + opp_profiler->end("HF_COPY_IF_SORT"); + // For all the dats, fill the holes using the swap_indices opp_profiler->start("HF_Dats"); for (opp_dat& dat : *(set->particle_dats)) { @@ -278,12 +287,10 @@ void particle_hole_fill_device(opp_set set) if (strcmp(dat->type, "int") == 0) { opp_queue->submit([&](sycl::handler &cgh) { - OPP_INT *dat_data = (OPP_INT *)(dat->data_d); - const int *from_indices = (int *)dpct::get_raw_pointer(hf_from_indices_dv.data()); + const int *from_indices = hf_from_indices_dp; const int *remove_indices = OPP_remove_particle_indices_d; const int dat_dim = dat->dim; - cgh.parallel_for( sycl::nd_range<1>(opp_const_threads_per_block*nblocks,opp_const_threads_per_block), [=](sycl::nd_item<1> item) { @@ -291,7 +298,7 @@ void particle_hole_fill_device(opp_set set) dat_data, dat_data, from_indices, remove_indices, set_capacity, set_capacity, - 0, sort_start_index, + 0, 0, dat_dim, part_remove_count, item); }); @@ -299,15 +306,11 @@ void particle_hole_fill_device(opp_set set) } else if (strcmp(dat->type, "double") == 0) { - dpct::has_capability_or_fail(opp_queue->get_device(), {sycl::aspect::fp64}); - opp_queue->submit([&](sycl::handler &cgh) { - OPP_REAL *dat_data = (OPP_REAL *)(dat->data_d); - const int *from_indices = (int *)dpct::get_raw_pointer(hf_from_indices_dv.data()); + const int *from_indices = hf_from_indices_dp; const int *remove_indices = OPP_remove_particle_indices_d; const int dat_dim = dat->dim; - cgh.parallel_for( sycl::nd_range<1>(opp_const_threads_per_block*nblocks,opp_const_threads_per_block), [=](sycl::nd_item<1> item) { @@ -315,7 +318,7 @@ void particle_hole_fill_device(opp_set set) dat_data, dat_data, from_indices, remove_indices, set_capacity, set_capacity, - 0, sort_start_index, + 0, 0, dat_dim, part_remove_count, item); }); diff --git a/opp_lib/src/sycl/opp_sycl.cpp b/opp_lib/src/sycl/opp_sycl.cpp index 1e7817a..7afb791 100644 --- a/opp_lib/src/sycl/opp_sycl.cpp +++ b/opp_lib/src/sycl/opp_sycl.cpp @@ -119,8 +119,8 @@ void opp_exit() boundingBox.reset(); comm.reset(); - opp_host_free(OPP_reduct_h); - opp_host_free(OPP_consts_h); + opp_mem::host_free(OPP_reduct_h); + opp_mem::host_free(OPP_consts_h); #ifdef USE_MPI opp_halo_destroy(); // free memory allocated to halos and mpi_buffers @@ -205,8 +205,8 @@ void opp_sycl_init(int argc, char **argv) { sycl::device selected_device = selected_devices[int_rank % selected_devices.size()]; opp_queue = new sycl::queue(selected_device); - float *test = sycl::malloc_device(1, *opp_queue); - sycl::free(test, *opp_queue); + float *test = opp_mem::dev_malloc(1); + opp_mem::dev_free(test); OPP_hybrid_gpu = 1; } catch (sycl::exception const &exc) { @@ -241,10 +241,8 @@ void opp_sycl_exit() opp_mem::dev_free(OPP_remove_particle_indices_d); opp_mem::dev_free(OPP_move_count_d); - ps_cell_index_dv.clear(); ps_cell_index_dv.shrink_to_fit(); - ps_swap_indices_dv.clear(); ps_swap_indices_dv.shrink_to_fit(); - hf_from_indices_dv.clear(); hf_from_indices_dv.shrink_to_fit(); - hf_sequence_dv.clear(); hf_sequence_dv.shrink_to_fit(); + opp_mem::dev_free(hf_from_indices_dp); + opp_mem::dev_free(ps_swap_indices_dp); opp_mem::dev_free(OPP_reduct_d); opp_mem::dev_free(OPP_consts_d); @@ -252,26 +250,6 @@ void opp_sycl_exit() for (auto& a : opp_consts) opp_mem::dev_free(a); - // send_part_cell_idx_dv.clear(); - // send_part_cell_idx_dv.shrink_to_fit(); - - // temp_int_dv.clear(); - // temp_int_dv.shrink_to_fit(); - - // temp_real_dv.clear(); - // temp_real_dv.shrink_to_fit(); - - // OPP_thrust_move_particle_indices_d.clear(); - // OPP_thrust_move_particle_indices_d.shrink_to_fit(); - - // OPP_thrust_move_cell_indices_d.clear(); - // OPP_thrust_move_cell_indices_d.shrink_to_fit(); - - // OPP_thrust_remove_particle_indices_d.clear(); - // OPP_thrust_remove_particle_indices_d.shrink_to_fit(); - - // ps_to_indices_dv.clear(); ps_to_indices_dv.shrink_to_fit(); - // below are for GPU direct particle communication for (auto it = particle_indices_hv.begin(); it != particle_indices_hv.end(); it++) it->second.clear(); for (auto it = cell_indices_hv.begin(); it != cell_indices_hv.end(); it++) it->second.clear(); @@ -352,7 +330,7 @@ opp_map opp_decl_map_txt(opp_set from, opp_set to, int dim, const char* file_nam { int* map_data = (int*)opp_load_from_file_core(file_name, from->size, dim, "int", sizeof(int)); opp_map map = opp_decl_map(from, to, dim, map_data, name); - opp_host_free(map_data); + opp_mem::host_free(map_data); return map; } @@ -373,7 +351,7 @@ opp_dat opp_decl_dat_txt(opp_set set, int dim, opp_data_type dtype, const char* else dat = opp_decl_dat_core(set, dim, type.c_str(), size, (char*)dat_data, name); - opp_host_free(dat_data); + opp_mem::host_free(dat_data); return dat; } @@ -576,9 +554,9 @@ void opp_mpi_print_dat_to_txtfile(opp_dat dat, const char *file_name) print_dat_to_txtfile_mpi(temp, prefixed_file_name.c_str()); - opp_host_free(temp->data); - opp_host_free(temp->set); - opp_host_free(temp); + opp_mem::host_free(temp->data); + opp_mem::host_free(temp->set); + opp_mem::host_free(temp); #endif } @@ -672,26 +650,26 @@ void opp_reallocReductArrays(int reduct_bytes) { if (reduct_bytes > OPP_reduct_bytes) { if (OPP_reduct_bytes > 0) { - opp_host_free(OPP_reduct_h); - sycl::free(OPP_reduct_d, *opp_queue); + opp_mem::host_free(OPP_reduct_h); + opp_mem::dev_free(OPP_reduct_d); } OPP_reduct_bytes = 4 * reduct_bytes; // 4 is arbitrary, more than needed - OPP_reduct_h = (char *)opp_host_malloc(OPP_reduct_bytes); - OPP_reduct_d = (char *)sycl::malloc_device(OPP_reduct_bytes, *opp_queue); + OPP_reduct_h = opp_mem::host_malloc(OPP_reduct_bytes); + OPP_reduct_d = opp_mem::dev_malloc(OPP_reduct_bytes); // use opp_mem::dev_realloc instead } } //**************************************** void opp_mvReductArraysToDevice(int reduct_bytes) { - opp_queue->memcpy(OPP_reduct_d, OPP_reduct_h, reduct_bytes).wait(); + opp_mem::copy_host_to_dev(OPP_reduct_d, OPP_reduct_h, reduct_bytes); } //**************************************** void opp_mvReductArraysToHost(int reduct_bytes) { OPP_DEVICE_SYNCHRONIZE(); - opp_queue->memcpy(OPP_reduct_h, OPP_reduct_d, reduct_bytes).wait(); + opp_mem::copy_dev_to_host(OPP_reduct_h, OPP_reduct_d, reduct_bytes); } //******************************************************************************* @@ -701,26 +679,26 @@ void opp_reallocConstArrays(int consts_bytes) { if (OPP_consts_bytes > 0) { - opp_host_free(OPP_consts_h); - sycl::free(OPP_consts_d, *opp_queue); + opp_mem::host_free(OPP_consts_h); + opp_mem::dev_free(OPP_consts_d); } OPP_consts_bytes = 4 * consts_bytes; // 4 is arbitrary, more than needed - OPP_consts_h = (char *)opp_host_malloc(OPP_consts_bytes); - OPP_consts_d = (char *)sycl::malloc_device( OPP_consts_bytes, *opp_queue); + OPP_consts_h = opp_mem::host_malloc(OPP_consts_bytes); + OPP_consts_d = opp_mem::dev_malloc(OPP_consts_bytes); // use opp_mem::dev_realloc instead } } //**************************************** void opp_mvConstArraysToDevice(int consts_bytes) { - opp_queue->memcpy(OPP_consts_d, OPP_consts_h, consts_bytes).wait(); + opp_mem::copy_host_to_dev(OPP_consts_d, OPP_consts_h, consts_bytes); } //**************************************** void opp_mvConstArraysToHost(int consts_bytes) { OPP_DEVICE_SYNCHRONIZE(); - opp_queue->memcpy(OPP_consts_h, OPP_consts_d, consts_bytes).wait(); + opp_mem::copy_dev_to_host(OPP_consts_h, OPP_consts_d, consts_bytes); } //******************************************************************************* diff --git a/opp_lib/src/sycl/opp_sycl_halo.cpp b/opp_lib/src/sycl/opp_sycl_halo.cpp index b306751..0e07c9e 100644 --- a/opp_lib/src/sycl/opp_sycl_halo.cpp +++ b/opp_lib/src/sycl/opp_sycl_halo.cpp @@ -74,8 +74,7 @@ void opp_halo_create() OPP_import_nonexec_list[set->index]->size); dat->buffer_d_r = opp_mem::dev_malloc(size); if (OPP_DBG) - opp_printf("opp_halo_create", "buffer_d_r Alloc %zu bytes for %s dat", - size, dat->name); + opp_printf("opp_halo_create", "buffer_d_r Alloc %zu bytes for %s dat", size, dat->name); } const size_t size = (size_t)dat->size * (size_t)(OPP_export_exec_list[set->index]->size + @@ -83,8 +82,7 @@ void opp_halo_create() dat->buffer_d = opp_mem::dev_malloc(size); if (OPP_DBG) - opp_printf("opp_halo_create", "buffer_d Alloc %zu bytes for %s dat", - size, dat->name); + opp_printf("opp_halo_create", "buffer_d Alloc %zu bytes for %s dat", size, dat->name); } } } @@ -101,20 +99,21 @@ void opp_halo_destroy() #ifdef USE_MPI __opp_halo_destroy(); - if (OPP_hybrid_gpu) - { + if (OPP_hybrid_gpu) { if (OPP_DBG) opp_printf("opp_halo_destroy", "Destroying sycl halo buffers START"); - for (auto& dat : opp_dats) - { + for (auto& dat : opp_dats) { opp_mem::dev_free(dat->buffer_d_r); opp_mem::dev_free(dat->buffer_d); } - for (int i = 0; i < (int)opp_sets.size(); i++) - { + for (size_t i = 0; i < opp_sets.size(); i++) { opp_mem::dev_free(export_exec_list_d[i]); opp_mem::dev_free(export_nonexec_list_d[i]); + opp_mem::dev_free(export_exec_list_disps_d[i]); + opp_mem::dev_free(export_nonexec_list_disps_d[i]); + opp_mem::dev_free(import_exec_list_disps_d[i]); + opp_mem::dev_free(import_nonexec_list_disps_d[i]); } if (OPP_DBG) opp_printf("opp_halo_destroy", "Destroying sycl halo buffers END"); @@ -233,7 +232,7 @@ HOST DIRECT LOOP 0 Not Dirty 0 0 0 | 0 Not Dirty 1 Host Dirty 1 1 0 | 0 Not Dirty */ -void markExInfo(opp_arg& arg, DeviceType device, bool direct_loop, opp_HaloExInfo& exInfo) { +void mark_ex_info(opp_arg& arg, DeviceType device, bool direct_loop, opp_HaloExInfo& exInfo) { // return if the arg is not for a dat or it is not read somehow if (!arg.opt || arg.argtype != OPP_ARG_DAT || !(arg.acc == OPP_READ || arg.acc == OPP_RW)) @@ -264,7 +263,7 @@ void markExInfo(opp_arg& arg, DeviceType device, bool direct_loop, opp_HaloExInf } -void changeDatFlags(opp_arg& arg, DeviceType device, bool direct_loop) { +void change_dat_flags(opp_arg& arg, DeviceType device, bool direct_loop) { if (!arg.opt || arg.argtype != OPP_ARG_DAT || !(arg.acc == OPP_READ || arg.acc == OPP_RW)) return; @@ -281,7 +280,7 @@ void changeDatFlags(opp_arg& arg, DeviceType device, bool direct_loop) { arg.dat->dirtybit = 0; } -void generateHaloExchangeInfo(opp_set iter_set, int nargs, opp_arg *args, DeviceType device) { +void generate_halo_ex_info(opp_set iter_set, int nargs, opp_arg *args, DeviceType device) { haloExInfo.clear(); bool direct_loop = true; @@ -304,7 +303,7 @@ void generateHaloExchangeInfo(opp_set iter_set, int nargs, opp_arg *args, Device } if (!already_done) { - markExInfo(args[n], device, direct_loop, exInfo); + mark_ex_info(args[n], device, direct_loop, exInfo); } else { exInfo.skip = true; @@ -316,7 +315,7 @@ void generateHaloExchangeInfo(opp_set iter_set, int nargs, opp_arg *args, Device for (int n = 0; n < nargs; n++) { if (!haloExInfo[n].skip) { - changeDatFlags(args[n], device, direct_loop); + change_dat_flags(args[n], device, direct_loop); } } @@ -334,7 +333,7 @@ int opp_mpi_halo_exchanges_grouped(opp_set set, int nargs, opp_arg *args, Device current_device = device; int size = set->size; - generateHaloExchangeInfo(set, nargs, args, device); + generate_halo_ex_info(set, nargs, args, device); for (int n = 0; n < nargs; n++) { @@ -437,8 +436,7 @@ void __opp_mpi_device_halo_exchange(opp_arg *arg, int exec_flag) char *outptr_nonexec = NULL; // opp_printf("opp_mpi_halo_exchange_dev", "2"); if (OPP_gpu_direct) { outptr_exec = arg->dat->buffer_d; - outptr_nonexec = - arg->dat->buffer_d + exp_exec_list->size * arg->dat->size; + outptr_nonexec = arg->dat->buffer_d + exp_exec_list->size * arg->dat->size; OPP_DEVICE_SYNCHRONIZE(); } @@ -564,24 +562,14 @@ void __opp_mpi_device_halo_wait_all(opp_arg *arg) { int init = dat->set->size * dat->size; int size = (dat->set->exec_size + dat->set->nonexec_size) * dat->size; - /* - DPCT1124:26: cudaMemcpyAsync is migrated to asynchronous memcpy API. - While the origin API might be synchronous, it depends on the type of - operand memory, so you may need to call wait() on event return by - memcpy API to ensure synchronization behavior. - */ + opp_queue->memcpy(dat->buffer_d_r, dat->data + init, size).wait(); scatter_data_from_buffer(*arg); } else { int init = dat->set->size * dat->size; - /* - DPCT1124:27: cudaMemcpyAsync is migrated to asynchronous memcpy API. - While the origin API might be synchronous, it depends on the type of - operand memory, so you may need to call wait() on event return by - memcpy API to ensure synchronization behavior. - */ + opp_queue->memcpy( dat->data_d + init, dat->data + init, (OPP_import_exec_list[dat->set->index]->size + @@ -621,9 +609,13 @@ void __opp_mpi_device_halo_wait_all(int nargs, opp_arg *args) if (OPP_DBG) opp_printf("__opp_mpi_device_halo_wait_all", "END"); } - - - +inline void clean_int_array_hd(int **list_d) { + if (list_d != NULL) { + for (size_t s = 0; s < opp_sets.size(); s++) + opp_mem::dev_free(list_d[opp_sets[s]->index]); + opp_mem::host_free(list_d); + } +} void opp_mv_halo_list_device() { @@ -631,191 +623,116 @@ void opp_mv_halo_list_device() #ifdef USE_MPI - if (export_exec_list_d != NULL) - { - for (int s = 0; s < (int)opp_sets.size(); s++) - if (export_exec_list_d[opp_sets[s]->index] != NULL) - opp_mem::dev_free(export_exec_list_d[opp_sets[s]->index]); - free(export_exec_list_d); - } - export_exec_list_d = (int **)malloc(sizeof(int *) * (int)opp_sets.size()); - - for (int s = 0; s < (int)opp_sets.size(); s++) // for each set - { - opp_set set = opp_sets[s]; - export_exec_list_d[set->index] = NULL; + clean_int_array_hd(export_exec_list_d); + export_exec_list_d = opp_mem::host_malloc(opp_sets.size()); + for (opp_set& set : opp_sets) { + export_exec_list_d[set->index] = nullptr; if (set->is_particle) continue; - opp_copy_host_to_device((void **)&(export_exec_list_d[set->index]), - (void **)&(OPP_export_exec_list[set->index]->list), - OPP_export_exec_list[set->index]->size * sizeof(int), - OPP_export_exec_list[set->index]->size * sizeof(int), true); + const size_t count = (size_t)OPP_export_exec_list[set->index]->size; + opp_mem::copy_host_to_dev(export_exec_list_d[set->index], + OPP_export_exec_list[set->index]->list, count, false, true, count); if (OPP_DBG) opp_printf("opp_mv_halo_list_device", "export_exec_list_d Alloc %zu bytes for set %s", OPP_export_exec_list[set->index]->size * sizeof(int), set->name); } - if (export_nonexec_list_d != NULL) - { - for (int s = 0; s < (int)opp_sets.size(); s++) - if (export_nonexec_list_d[opp_sets[s]->index] != NULL) - opp_mem::dev_free(export_nonexec_list_d[opp_sets[s]->index]); - free(export_nonexec_list_d); - } - export_nonexec_list_d = (int **)malloc(sizeof(int *) * (int)opp_sets.size()); + clean_int_array_hd(export_nonexec_list_d); + export_nonexec_list_d = opp_mem::host_malloc(opp_sets.size()); + for (opp_set& set : opp_sets) { - for (int s = 0; s < (int)opp_sets.size(); s++) // for each set - { - opp_set set = opp_sets[s]; - export_nonexec_list_d[set->index] = NULL; - + export_nonexec_list_d[set->index] = nullptr; if (set->is_particle) continue; - opp_copy_host_to_device((void **)&(export_nonexec_list_d[set->index]), - (void **)&(OPP_export_nonexec_list[set->index]->list), - OPP_export_nonexec_list[set->index]->size * sizeof(int), - OPP_export_nonexec_list[set->index]->size * sizeof(int), true); - + const size_t count = (size_t)OPP_export_nonexec_list[set->index]->size; + opp_mem::copy_host_to_dev(export_nonexec_list_d[set->index], + OPP_export_nonexec_list[set->index]->list, count, false, true, count); + if (OPP_DBG) opp_printf("opp_mv_halo_list_device", "export_nonexec_list_d Alloc %zu bytes for set %s", OPP_export_nonexec_list[set->index]->size * sizeof(int), set->name); } - //for grouped, we need the disps array on device too - if (export_exec_list_disps_d != NULL) - { - for (int s = 0; s < (int)opp_sets.size(); s++) - if (export_exec_list_disps_d[opp_sets[s]->index] != NULL) - opp_mem::dev_free(export_exec_list_disps_d[opp_sets[s]->index]); - free(export_exec_list_disps_d); - } - export_exec_list_disps_d = (int **)malloc(sizeof(int *) * (int)opp_sets.size()); - - for (int s = 0; s < (int)opp_sets.size(); s++) // for each set - { - opp_set set = opp_sets[s]; - export_exec_list_disps_d[set->index] = NULL; + clean_int_array_hd(export_exec_list_disps_d); + export_exec_list_disps_d = opp_mem::host_malloc(opp_sets.size()); + for (opp_set& set : opp_sets) { + export_exec_list_disps_d[set->index] = nullptr; if (set->is_particle) continue; //make sure end size is there too - OPP_export_exec_list[set->index] - ->disps[OPP_export_exec_list[set->index]->ranks_size] = - OPP_export_exec_list[set->index]->ranks_size == 0 - ? 0 - : OPP_export_exec_list[set->index] - ->disps[OPP_export_exec_list[set->index]->ranks_size - 1] + - OPP_export_exec_list[set->index] - ->sizes[OPP_export_exec_list[set->index]->ranks_size - 1]; - opp_copy_host_to_device((void **)&(export_exec_list_disps_d[set->index]), - (void **)&(OPP_export_exec_list[set->index]->disps), - (OPP_export_exec_list[set->index]->ranks_size+1) * sizeof(int), - (OPP_export_exec_list[set->index]->ranks_size+1) * sizeof(int), true); + OPP_export_exec_list[set->index]->disps[OPP_export_exec_list[set->index]->ranks_size] = + OPP_export_exec_list[set->index]->ranks_size == 0 ? 0 : + OPP_export_exec_list[set->index]->disps[OPP_export_exec_list[set->index]->ranks_size - 1] + + OPP_export_exec_list[set->index]->sizes[OPP_export_exec_list[set->index]->ranks_size - 1]; - if (OPP_DBG) opp_printf("opp_mv_halo_list_device", "export_exec_list_disps_d Alloc %zu bytes for set %s", - (OPP_export_exec_list[set->index]->ranks_size+1) * sizeof(int), set->name); - } + const size_t count = (size_t)(OPP_export_exec_list[set->index]->ranks_size + 1); + opp_mem::copy_host_to_dev(export_exec_list_disps_d[set->index], + OPP_export_exec_list[set->index]->disps, count, false, true, count); - if (export_nonexec_list_disps_d != NULL) - { - for (int s = 0; s < (int)opp_sets.size(); s++) - if (export_nonexec_list_disps_d[opp_sets[s]->index] != NULL) - opp_mem::dev_free(export_nonexec_list_disps_d[opp_sets[s]->index]); - free(export_nonexec_list_disps_d); + if (OPP_DBG) opp_printf("opp_mv_halo_list_device", "export_exec_list_disps_d Alloc %zu bytes for set %s", + (OPP_export_exec_list[set->index]->ranks_size + 1) * sizeof(int), set->name); } - export_nonexec_list_disps_d = (int **)malloc(sizeof(int *) * (int)opp_sets.size()); - for (int s = 0; s < (int)opp_sets.size(); s++) { // for each set - opp_set set = opp_sets[s]; - export_nonexec_list_disps_d[set->index] = NULL; + clean_int_array_hd(export_nonexec_list_disps_d); + export_nonexec_list_disps_d = opp_mem::host_malloc(opp_sets.size()); + for (opp_set& set : opp_sets) { + export_nonexec_list_disps_d[set->index] = nullptr; if (set->is_particle) continue; //make sure end size is there too - OPP_export_nonexec_list[set->index] - ->disps[OPP_export_nonexec_list[set->index]->ranks_size] = - OPP_export_nonexec_list[set->index]->ranks_size == 0 - ? 0 - : OPP_export_nonexec_list[set->index] - ->disps[OPP_export_nonexec_list[set->index]->ranks_size - - 1] + - OPP_export_nonexec_list[set->index] - ->sizes[OPP_export_nonexec_list[set->index]->ranks_size - - 1]; - opp_copy_host_to_device((void **)&(export_nonexec_list_disps_d[set->index]), - (void **)&(OPP_export_nonexec_list[set->index]->disps), - (OPP_export_nonexec_list[set->index]->ranks_size+1) * sizeof(int), - (OPP_export_nonexec_list[set->index]->ranks_size+1) * sizeof(int), true); + OPP_export_nonexec_list[set->index]->disps[OPP_export_nonexec_list[set->index]->ranks_size] = + OPP_export_nonexec_list[set->index]->ranks_size == 0 ? 0 : + (OPP_export_nonexec_list[set->index] ->disps[OPP_export_nonexec_list[set->index]->ranks_size - 1] + + OPP_export_nonexec_list[set->index] ->sizes[OPP_export_nonexec_list[set->index]->ranks_size - 1]); + const size_t count = (size_t)(OPP_export_nonexec_list[set->index]->ranks_size + 1); + opp_mem::copy_host_to_dev(export_nonexec_list_disps_d[set->index], + OPP_export_nonexec_list[set->index]->disps, count, false, true, count); + if (OPP_DBG) opp_printf("opp_mv_halo_list_device", "export_nonexec_list_disps_d Alloc %zu bytes for set %s", (OPP_export_nonexec_list[set->index]->ranks_size+1) * sizeof(int), set->name); } - if (import_exec_list_disps_d != NULL) { - for (int s = 0; s < (int)opp_sets.size(); s++) - if (import_exec_list_disps_d[opp_sets[s]->index] != NULL) - opp_mem::dev_free(import_exec_list_disps_d[opp_sets[s]->index]); - free(import_exec_list_disps_d); - } - import_exec_list_disps_d = (int **)malloc(sizeof(int *) * (int)opp_sets.size()); - - for (int s = 0; s < (int)opp_sets.size(); s++) // for each set - { - opp_set set = opp_sets[s]; - import_exec_list_disps_d[set->index] = NULL; + clean_int_array_hd(import_exec_list_disps_d); + import_exec_list_disps_d = opp_mem::host_malloc(opp_sets.size()); + for (opp_set& set : opp_sets) { + import_exec_list_disps_d[set->index] = nullptr; if (set->is_particle) continue; //make sure end size is there too - OPP_import_exec_list[set->index] - ->disps[OPP_import_exec_list[set->index]->ranks_size] = - OPP_import_exec_list[set->index]->ranks_size == 0 - ? 0 - : OPP_import_exec_list[set->index] - ->disps[OPP_import_exec_list[set->index]->ranks_size - 1] + - OPP_import_exec_list[set->index] - ->sizes[OPP_import_exec_list[set->index]->ranks_size - 1]; - opp_copy_host_to_device((void **)&(import_exec_list_disps_d[set->index]), - (void **)&(OPP_import_exec_list[set->index]->disps), - (OPP_import_exec_list[set->index]->ranks_size+1) * sizeof(int), - (OPP_import_exec_list[set->index]->ranks_size+1) * sizeof(int), true); + OPP_import_exec_list[set->index]->disps[OPP_import_exec_list[set->index]->ranks_size] = + OPP_import_exec_list[set->index]->ranks_size == 0 ? 0 : + (OPP_import_exec_list[set->index]->disps[OPP_import_exec_list[set->index]->ranks_size - 1] + + OPP_import_exec_list[set->index]->sizes[OPP_import_exec_list[set->index]->ranks_size - 1]); + + const size_t count = (size_t)(OPP_import_exec_list[set->index]->ranks_size + 1); + opp_mem::copy_host_to_dev(import_exec_list_disps_d[set->index], + OPP_import_exec_list[set->index]->disps, count, false, true, count); if (OPP_DBG) opp_printf("opp_mv_halo_list_device", "import_exec_list_disps_d Alloc %zu bytes for set %s", (OPP_import_exec_list[set->index]->ranks_size+1) * sizeof(int), set->name); } - if (import_nonexec_list_disps_d != NULL) - { - for (int s = 0; s < (int)opp_sets.size(); s++) - if (import_nonexec_list_disps_d[opp_sets[s]->index] != NULL) - opp_mem::dev_free(import_nonexec_list_disps_d[opp_sets[s]->index]); - free(import_nonexec_list_disps_d); - } - import_nonexec_list_disps_d = (int **)malloc(sizeof(int *) * (int)opp_sets.size()); - - for (int s = 0; s < (int)opp_sets.size(); s++) // for each set - { - opp_set set = opp_sets[s]; - import_nonexec_list_disps_d[set->index] = NULL; + clean_int_array_hd(import_nonexec_list_disps_d); + import_nonexec_list_disps_d = opp_mem::host_malloc(opp_sets.size()); + for (opp_set& set : opp_sets) { + import_nonexec_list_disps_d[set->index] = nullptr; if (set->is_particle) continue; - - //make sure end size is there too - OPP_import_nonexec_list[set->index] - ->disps[OPP_import_nonexec_list[set->index]->ranks_size] = - OPP_import_nonexec_list[set->index]->ranks_size == 0 - ? 0 - : OPP_import_nonexec_list[set->index] - ->disps[OPP_import_nonexec_list[set->index]->ranks_size - - 1] + - OPP_import_nonexec_list[set->index] - ->sizes[OPP_import_nonexec_list[set->index]->ranks_size - - 1]; - opp_copy_host_to_device((void **)&(import_nonexec_list_disps_d[set->index]), - (void **)&(OPP_import_nonexec_list[set->index]->disps), - (OPP_import_nonexec_list[set->index]->ranks_size+1) * sizeof(int), - (OPP_import_nonexec_list[set->index]->ranks_size+1) * sizeof(int), true); + //make sure end size is there too + OPP_import_nonexec_list[set->index]->disps[OPP_import_nonexec_list[set->index]->ranks_size] = + OPP_import_nonexec_list[set->index]->ranks_size == 0 ? 0 : + OPP_import_nonexec_list[set->index]->disps[OPP_import_nonexec_list[set->index]->ranks_size - 1] + + OPP_import_nonexec_list[set->index]->sizes[OPP_import_nonexec_list[set->index]->ranks_size - 1]; + + const size_t count = (size_t)(OPP_import_nonexec_list[set->index]->ranks_size + 1); + opp_mem::copy_host_to_dev(import_nonexec_list_disps_d[set->index], + OPP_import_nonexec_list[set->index]->disps, count, false, true, count); + if (OPP_DBG) opp_printf("opp_mv_halo_list_device", "import_nonexec_list_disps_d Alloc %zu bytes for set %s", (OPP_import_nonexec_list[set->index]->ranks_size+1) * sizeof(int), set->name); } diff --git a/opp_lib/src/sycl/opp_sycl_utils.cpp b/opp_lib/src/sycl/opp_sycl_utils.cpp index 73a61e3..fa39b94 100644 --- a/opp_lib/src/sycl/opp_sycl_utils.cpp +++ b/opp_lib/src/sycl/opp_sycl_utils.cpp @@ -44,7 +44,8 @@ void opp_create_dat_device_arrays(opp_dat dat, bool create_new) const size_t alloc_count = (size_t)(dat->set->set_capacity * dat->size); dat->data_d = opp_mem::dev_malloc(alloc_count); - dat->data_swap_d = opp_mem::dev_malloc(alloc_count); + if (OPP_fill_type != OPP_HoleFill_All) + dat->data_swap_d = opp_mem::dev_malloc(alloc_count); if (OPP_DBG) opp_printf("opp_create_dat_device_arrays", "Device array of dat [%s][%p][%p] [%zubytes]", @@ -60,10 +61,12 @@ void opp_download_dat(opp_dat dat) const size_t set_size = dat->set->set_capacity; if (strstr(dat->type, ":soa") != NULL || (OPP_auto_soa && dat->dim > 1)) { - if (OPP_DBG) opp_printf("opp_download_dat", "GPU->CPU SOA | %s", dat->name); + if (OPP_DBG) + opp_printf("opp_download_dat", "GPU->CPU SOA | %s to %p from %p", + dat->name, dat->data, dat->data_d); std::vector tmp_data(dat->size * set_size); - opp_mem::copy_dev_to_host(tmp_data.data(), dat->data_d, set_size * dat->size); + opp_mem::copy_dev_to_host(tmp_data.data(), dat->data_d, set_size * dat->size); int element_size = dat->size / dat->dim; for (int i = 0; i < dat->dim; i++) { @@ -76,7 +79,9 @@ void opp_download_dat(opp_dat dat) } } else { - if (OPP_DBG) opp_printf("opp_download_dat", "GPU->CPU NON-SOA| %s", dat->name); + if (OPP_DBG) + opp_printf("opp_download_dat", "GPU->CPU NON-SOA| %s to %p from %p", + dat->name, dat->data, dat->data_d); opp_mem::copy_dev_to_host(dat->data, dat->data_d, set_size * dat->size); } @@ -195,16 +200,6 @@ void opp_upload_particle_set(opp_set particles_set, bool realloc) // TO BE DISCARDED --- Use opp_mem:: routines //**************************************** -void opp_copy_host_to_device(void **data_d, void **data_h, size_t copy_size, - size_t alloc_size, bool create_new) -{ - if (create_new) { - if (*data_d != NULL) sycl::free(*data_d, *opp_queue); - *data_d = (void*)sycl::malloc_device(alloc_size, *opp_queue); - } - - opp_queue->memcpy(*data_d, *data_h, copy_size).wait(); -} //******************************************************************************* void* opp_host_malloc(size_t size) @@ -227,65 +222,3 @@ void opp_host_free(void* ptr) } //******************************************************************************* - -// //**************************************** -// void opp_create_dat_device_arrays(opp_dat dat, bool create_new) -// { -// if (OPP_DBG) -// opp_printf("opp_create_dat_device_arrays", "%s %s", dat->name, dat->type); - -// char* temp_char_d = nullptr; - -// if (strcmp(dat->type, "double") == 0) -// { -// if (dat->set->size > 0) -// { -// if (create_new && dat->thrust_real) -// { -// delete dat->thrust_real; -// delete dat->thrust_real_sort; -// } - -// dat->thrust_real = new dpct::device_vector( -// dat->set->set_capacity * dat->dim); -// dat->data_d = -// (char *)dpct::get_raw_pointer(dat->thrust_real->data()); - -// dat->thrust_real_sort = new dpct::device_vector( -// dat->set->set_capacity * dat->dim); -// temp_char_d = -// (char *)dpct::get_raw_pointer(dat->thrust_real_sort->data()); -// } -// } -// else if (strcmp(dat->type, "int") == 0 ) -// { -// if (dat->set->size > 0) -// { -// if (create_new && dat->thrust_int) -// { -// delete dat->thrust_int; -// delete dat->thrust_int_sort; -// } - -// dat->thrust_int = -// new dpct::device_vector(dat->set->set_capacity * dat->dim); -// dat->data_d = -// (char *)dpct::get_raw_pointer(dat->thrust_int->data()); - -// dat->thrust_int_sort = -// new dpct::device_vector(dat->set->set_capacity * dat->dim); -// temp_char_d = -// (char *)dpct::get_raw_pointer(dat->thrust_int_sort->data()); -// } -// } -// else -// { -// std::cerr << "opp_create_dat_device_arrays DEVICE not implemented for type: " -// << dat->type << " dat name: " << dat->name << std::endl; -// opp_abort(); -// } - -// if (OPP_DBG) -// opp_printf("opp_create_dat_device_arrays", "Device array of dat [%s][%p][%p] Capacity [%d]", -// dat->name, dat->data_d, temp_char_d, dat->set->set_capacity * dat->dim); -// } diff --git a/opp_translator/resources/templates/cpp/sycl/move_loop_host.hpp.jinja b/opp_translator/resources/templates/cpp/sycl/move_loop_host.hpp.jinja index 4b9b4f7..ce74df4 100644 --- a/opp_translator/resources/templates/cpp/sycl/move_loop_host.hpp.jinja +++ b/opp_translator/resources/templates/cpp/sycl/move_loop_host.hpp.jinja @@ -354,7 +354,6 @@ auto opp_part_check_status = move_cell_indices[moveIdx] = c_idx[0]; // To be removed from the current rank, packing will be done prior exchange & removal - move_flag = OPP_NEED_REMOVE; const int removeIdx = opp_atomic_fetch_add(remove_count, 1); remove_part_indices[removeIdx] = p_idx;