diff --git a/.jenkins/continuous.groovy b/.jenkins/continuous.groovy index fde45b3f..1e2283c7 100644 --- a/.jenkins/continuous.groovy +++ b/.jenkins/continuous.groovy @@ -39,7 +39,7 @@ pipeline { -DKRS_ENABLE_TESTS=ON \ -DCMAKE_CXX_FLAGS=-Werror \ .. && \ - make -j8 && cd unit_tests && mpirun -np 2 ./KokkosRemote_TestAll''' + make -j8 && cd unit_tests && mpirun -np 2 ./KokkosRemoteSpaces_TestAll''' } } } diff --git a/CMakeLists.txt b/CMakeLists.txt index 988bb660..95aea5ec 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -1,6 +1,6 @@ cmake_minimum_required (VERSION 3.12) -project(KokkosRemote +project(KokkosRemoteSpaces LANGUAGES CXX VERSION 0.1.0) @@ -108,58 +108,58 @@ if (KRS_ENABLE_RACERLIB) list(APPEND HEADERS ${DIR_HDRS}) endif() -add_library(kokkosremote ${SOURCES} ${HEADERS}) -add_library(Kokkos::kokkosremote ALIAS kokkosremote) -target_link_libraries(kokkosremote PUBLIC Kokkos::kokkos) +add_library(kokkosremotespaces ${SOURCES} ${HEADERS}) +add_library(Kokkos::kokkosremotespaces ALIAS kokkosremotespaces) +target_link_libraries(kokkosremotespaces PUBLIC Kokkos::kokkos) foreach(DIR ${SOURCE_DIRS}) - target_include_directories(kokkosremote PUBLIC $) + target_include_directories(kokkosremotespaces PUBLIC $) foreach(BACKEND_NAME ${PUBLIC_DEPS}) - target_compile_definitions(kokkosremote PUBLIC KRS_ENABLE_${BACKEND_NAME}) + target_compile_definitions(kokkosremotespaces PUBLIC KRS_ENABLE_${BACKEND_NAME}) endforeach() endforeach() if (KRS_ENABLE_RACERLIB) - target_include_directories(kokkosremote PUBLIC $) - target_compile_definitions(kokkosremote PUBLIC KRS_ENABLE_RACERLIB) + target_include_directories(kokkosremotespaces PUBLIC $) + target_compile_definitions(kokkosremotespaces PUBLIC KRS_ENABLE_RACERLIB) endif() -target_include_directories(kokkosremote PUBLIC $) -target_include_directories(kokkosremote PUBLIC $) +target_include_directories(kokkosremotespaces PUBLIC $) +target_include_directories(kokkosremotespaces PUBLIC $) if(KRS_ENABLE_DEBUG OR CMAKE_BUILD_TYPE STREQUAL "Debug") - target_compile_definitions(kokkosremote PUBLIC KOKKOS_REMOTE_SPACES_ENABLE_DEBUG) + target_compile_definitions(kokkosremotespaces PUBLIC KOKKOS_REMOTE_SPACES_ENABLE_DEBUG) message(STATUS "Enabled build mode: debug") endif() if (PRIVATE_DEPS) - target_link_libraries(kokkosremote PRIVATE ${PRIVATE_DEPS}) + target_link_libraries(kokkosremotespaces PRIVATE ${PRIVATE_DEPS}) endif() if (PUBLIC_DEPS) - target_link_libraries(kokkosremote PUBLIC ${PUBLIC_DEPS}) + target_link_libraries(kokkosremotespaces PUBLIC ${PUBLIC_DEPS}) endif() if (KRS_ENABLE_NVSHMEMSPACE) - target_link_libraries(kokkosremote PRIVATE "-lnvidia-ml") + target_link_libraries(kokkosremotespaces PRIVATE "-lnvidia-ml") endif() include(GNUInstallDirs) include(CMakePackageConfigHelpers) configure_package_config_file( - KokkosRemoteConfig.cmake.in - "${CMAKE_CURRENT_BINARY_DIR}/KokkosRemoteConfig.cmake" + KokkosRemoteSpacesConfig.cmake.in + "${CMAKE_CURRENT_BINARY_DIR}/KokkosRemoteSpacesConfig.cmake" INSTALL_DESTINATION ${CMAKE_INSTALL_FULL_LIBDIR}/cmake ) write_basic_package_version_file( - "${CMAKE_CURRENT_BINARY_DIR}/KokkosRemoteConfigVersion.cmake" + "${CMAKE_CURRENT_BINARY_DIR}/KokkosRemoteSpacesConfigVersion.cmake" VERSION "${KokkosRemote_VERSION}" COMPATIBILITY AnyNewerVersion ) install(FILES - "${CMAKE_CURRENT_BINARY_DIR}/KokkosRemoteConfig.cmake" - "${CMAKE_CURRENT_BINARY_DIR}/KokkosRemoteConfigVersion.cmake" - DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/KokkosRemote) + "${CMAKE_CURRENT_BINARY_DIR}/KokkosRemoteSpacesConfig.cmake" + 
"${CMAKE_CURRENT_BINARY_DIR}/KokkosRemoteSpacesConfigVersion.cmake" + DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/KokkosRemoteSpaces) install(FILES ${HEADERS} @@ -167,17 +167,17 @@ install(FILES ) install( - TARGETS kokkosremote ${PUBLIC_DEPS} - EXPORT KokkosRemoteTargets + TARGETS kokkosremotespaces ${PUBLIC_DEPS} + EXPORT KokkosRemoteSpacesTargets RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR} LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR} ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR} ) install(EXPORT - KokkosRemoteTargets + KokkosRemoteSpacesTargets NAMESPACE Kokkos:: - DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/KokkosRemote + DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/KokkosRemoteSpaces ) IF (KRS_ENABLE_EXAMPLES) diff --git a/KokkosRemoteConfig.cmake.in b/KokkosRemoteSpacesConfig.cmake.in similarity index 79% rename from KokkosRemoteConfig.cmake.in rename to KokkosRemoteSpacesConfig.cmake.in index 2efef89e..fa3e219d 100644 --- a/KokkosRemoteConfig.cmake.in +++ b/KokkosRemoteSpacesConfig.cmake.in @@ -7,5 +7,5 @@ set(Kokkos_DIR "@Kokkos_DIR@") find_dependency(Kokkos REQUIRED) GET_FILENAME_COMPONENT(Kokkos_CMAKE_DIR "${CMAKE_CURRENT_LIST_FILE}" PATH) -INCLUDE("${Kokkos_CMAKE_DIR}/KokkosRemoteTargets.cmake") +INCLUDE("${Kokkos_CMAKE_DIR}/KokkosRemoteSpacesTargets.cmake") UNSET(Kokkos_CMAKE_DIR) diff --git a/README.md b/README.md index a97d11ae..f17c8718 100644 --- a/README.md +++ b/README.md @@ -107,7 +107,7 @@ find_package(KokkosRemote REQUIRED) add_executable(MatVec matvec.cpp) target_link_libraries(MatVec PRIVATE \ - Kokkos::kokkoskernels Kokkos::kokkosremote) + Kokkos::kokkoskernels Kokkos::kokkosremotespaces) ``` This cmake build fike can be used as @@ -119,5 +119,4 @@ cmake .. -DKokkosKernels_ROOT=$KokkosKernels_INSTALL_PATH -DKokkosRemote_ROOT=$K *Note: Kokkos Remote Spaces is in an experimental development stage.* ### Contact -Jan Ciesko, Engineering Lead, -jciesko@sandia.gov +Jan Ciesko, jciesko@sandia.gov diff --git a/examples/benchmarks/CMakeLists.txt b/examples/benchmarks/CMakeLists.txt index 5bec9b93..93034738 100644 --- a/examples/benchmarks/CMakeLists.txt +++ b/examples/benchmarks/CMakeLists.txt @@ -1,4 +1,4 @@ add_subdirectory(poissonaccess) add_subdirectory(misslatency) add_subdirectory(randomaccess) -add_subdirectory(stream) +add_subdirectory(access_overhead) diff --git a/examples/benchmarks/access_overhead/CMakeLists.txt b/examples/benchmarks/access_overhead/CMakeLists.txt new file mode 100755 index 00000000..7d06712b --- /dev/null +++ b/examples/benchmarks/access_overhead/CMakeLists.txt @@ -0,0 +1,7 @@ +FILE(GLOB SRCS *.cpp) + +foreach(file ${SRCS}) + get_filename_component(test_name ${file} NAME_WE) + add_executable(${test_name} ${file}) + target_link_libraries(${test_name} PRIVATE Kokkos::kokkosremotespaces) +endforeach() diff --git a/examples/benchmarks/access_overhead/access_overhead.cpp b/examples/benchmarks/access_overhead/access_overhead.cpp new file mode 100644 index 00000000..d75248e9 --- /dev/null +++ b/examples/benchmarks/access_overhead/access_overhead.cpp @@ -0,0 +1,268 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +// Contact: Jan Ciesko (jciesko@sandia.gov) +// +//@HEADER + +#include +#include +#include +#include +#include +#include +#include + +#define CHECK_FOR_CORRECTNESS + +using RemoteSpace_t = Kokkos::Experimental::DefaultRemoteMemorySpace; +using RemoteView_t = Kokkos::View; +using PlainView_t = Kokkos::View; +using UnmanagedView_t = + Kokkos::View>; +using HostView_t = typename RemoteView_t::HostMirror; +struct InitTag {}; +struct UpdateTag {}; +struct CheckTag {}; +using policy_init_t = Kokkos::RangePolicy; +using policy_update_t = Kokkos::RangePolicy; +using policy_check_t = Kokkos::RangePolicy; +#define default_N 134217728 +#define default_iters 3 + +std::string modes[3] = {"Kokkos::View", "Kokkos::RemoteView", + "Kokkos::LocalProxyView"}; + +struct Args_t { + int mode = 0; + int N = default_N; + int iters = default_iters; +}; + +void print_help() { + printf("Options (default):\n"); + printf(" -N IARG: (%i) num elements in the vector\n", default_N); + printf(" -I IARG: (%i) num repititions\n", default_iters); + printf(" -M IARG: (%i) mode (view type)\n", 0); + printf(" modes:\n"); + printf(" 0: Kokkos (Normal) View\n"); + printf(" 1: Kokkos Remote View\n"); + printf(" 2: Kokkos Unmanaged View\n"); +} + +// read command line args +bool read_args(int argc, char *argv[], Args_t &args) { + for (int i = 1; i < argc; i++) { + if (strcmp(argv[i], "-h") == 0) { + print_help(); + return false; + } + } + + for (int i = 1; i < argc; i++) { + if (strcmp(argv[i], "-N") == 0) args.N = atoi(argv[i + 1]); + if (strcmp(argv[i], "-I") == 0) args.iters = atoi(argv[i + 1]); + if (strcmp(argv[i], "-M") == 0) args.mode = atoi(argv[i + 1]); + } + return true; +} + +template +struct Access; + +template +struct Access::value>> { + size_t N; /* size of vector */ + int iters; /* number of iterations */ + int mode; /* View type */ + + ViewType_t v; + + Access(Args_t args) + : N(args.N), + iters(args.iters), + v(std::string(typeid(v).name()), args.N), + mode(args.mode){}; + + KOKKOS_FUNCTION + void operator()(const InitTag &, const size_t i) const { v(i) = 0; } + + KOKKOS_FUNCTION + void operator()(const UpdateTag &, const size_t i) const { v(i) += 1; } + + KOKKOS_FUNCTION + void operator()(const CheckTag &, const size_t i) const { + assert(v(i) == iters * 1.0); + } + + // run copy benchmark + void run() { + Kokkos::Timer timer; + double time_a, time_b; + time_a = time_b = 0; + double time = 0; + + Kokkos::parallel_for("access_overhead-init", policy_init_t({0}, {N}), + *this); + Kokkos::fence(); +#ifdef KRS_ENABLE_NVSHMEMSPACE + nvshmem_barrier_all(); // Not sure why this impacts perf +#endif + + time_a = timer.seconds(); + for (int i = 0; i < iters; i++) { + Kokkos::parallel_for("access_overhead", policy_update_t({0}, {N}), *this); + RemoteSpace_t().fence(); + } + time_b = timer.seconds(); + time += time_b - time_a; + +#ifdef CHECK_FOR_CORRECTNESS + Kokkos::parallel_for("access_overhead-check", policy_check_t({0}, {N}), + *this); + Kokkos::fence(); +#endif + + double gups = 1e-9 * ((N * iters) / time); + double size = N * sizeof(double) / 1024.0 / 1024.0; + double bw = gups * sizeof(double); + printf("access_overhead,%s,%lu,%lf,%lu,%lf,%lf,%lf\n", modes[mode].c_str(), + N, size, iters, time, gups, bw); + } +}; + +template +struct Access::value>> { + size_t N; /* size of vector */ + int iters; /* number of iterations */ + int mode; /* View type */ + + UnmanagedView_t v; + RemoteView_t rv; + + Access(Args_t args) + : N(args.N), + iters(args.iters), + 
rv(std::string(typeid(v).name()), args.N), + mode(args.mode) { + v = ViewType_t(rv.data(), N); + }; + + KOKKOS_FUNCTION + void operator()(const InitTag &, const size_t i) const { v(i) = 0; } + + KOKKOS_FUNCTION + void operator()(const UpdateTag &, const size_t i) const { v(i) += 1; } + + KOKKOS_FUNCTION + void operator()(const CheckTag &, const size_t i) const { + assert(v(i) == iters * 1.0); + } + + // run copy benchmark + void run() { + Kokkos::Timer timer; + double time_a, time_b; + time_a = time_b = 0; + double time = 0; + + Kokkos::parallel_for("access_overhead-init", policy_init_t({0}, {N}), + *this); + Kokkos::fence(); + + time_a = timer.seconds(); + for (int i = 0; i < iters; i++) { + Kokkos::parallel_for("access_overhead", policy_update_t({0}, {N}), *this); + RemoteSpace_t().fence(); + } + time_b = timer.seconds(); + time += time_b - time_a; + +#ifdef CHECK_FOR_CORRECTNESS + Kokkos::parallel_for("access_overhead-check", policy_check_t({0}, {N}), + *this); + Kokkos::fence(); +#endif + + double gups = 1e-9 * ((N * iters) / time); + double size = N * sizeof(double) / 1024.0 / 1024.0; + double bw = gups * sizeof(double); + printf("access_overhead,%s,%lu,%lf,%lu,%lf,%lf,%lf\n", modes[mode].c_str(), + N, size, iters, time, gups, bw); + } +}; + +int main(int argc, char *argv[]) { + int mpi_thread_level_available; + int mpi_thread_level_required = MPI_THREAD_MULTIPLE; + +#ifdef KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_SERIAL + mpi_thread_level_required = MPI_THREAD_SINGLE; +#endif + + MPI_Init_thread(&argc, &argv, mpi_thread_level_required, + &mpi_thread_level_available); + assert(mpi_thread_level_available >= mpi_thread_level_required); + +#ifdef KRS_ENABLE_SHMEMSPACE + shmem_init_thread(mpi_thread_level_required, &mpi_thread_level_available); + assert(mpi_thread_level_available >= mpi_thread_level_required); +#endif + +#ifdef KRS_ENABLE_NVSHMEMSPACE + MPI_Comm mpi_comm; + nvshmemx_init_attr_t attr; + mpi_comm = MPI_COMM_WORLD; + attr.mpi_comm = &mpi_comm; + nvshmemx_init_attr(NVSHMEMX_INIT_WITH_MPI_COMM, &attr); +#endif + + Kokkos::initialize(argc, argv); + + do { + Args_t args; + if (!read_args(argc, argv, args)) { + break; + }; + + if (args.mode == 0) { + Access s(args); + s.run(); + } else if (args.mode == 1) { + Access s(args); + s.run(); + } else if (args.mode == 2) { + Access s(args); + s.run(); + } else { + printf("invalid mode selected (%d)\n", args.mode); + } + } while (false); + + Kokkos::fence(); + + Kokkos::finalize(); +#ifdef KRS_ENABLE_SHMEMSPACE + shmem_finalize(); +#endif +#ifdef KRS_ENABLE_NVSHMEMSPACE + nvshmem_finalize(); +#endif + MPI_Finalize(); + return 0; +} + +#undef CHECK_FOR_CORRECTNESS diff --git a/examples/benchmarks/access_overhead/access_overhead_noThis.cpp b/examples/benchmarks/access_overhead/access_overhead_noThis.cpp new file mode 100644 index 00000000..65c47164 --- /dev/null +++ b/examples/benchmarks/access_overhead/access_overhead_noThis.cpp @@ -0,0 +1,271 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +// Contact: Jan Ciesko (jciesko@sandia.gov) +// +//@HEADER + +#include +#include +#include +#include +#include +#include +#include + +#define CHECK_FOR_CORRECTNESS + +using RemoteSpace_t = Kokkos::Experimental::DefaultRemoteMemorySpace; +using RemoteView_t = Kokkos::View; +using PlainView_t = Kokkos::View; +using UnmanagedView_t = + Kokkos::View>; +using HostView_t = typename RemoteView_t::HostMirror; +using StreamIndex = size_t; +using policy_t = Kokkos::RangePolicy>; + +#define default_N 134217728 +#define default_iters 3 + +std::string modes[3] = {"Kokkos::View", "Kokkos::RemoteView", + "Kokkos::LocalProxyView"}; + +struct Args_t { + int mode = 0; + size_t N = default_N; + int iters = default_iters; +}; + +void print_help() { + printf("Options (default):\n"); + printf(" -N IARG: (%i) num elements in the vector\n", default_N); + printf(" -I IARG: (%i) num repititions\n", default_iters); + printf(" -M IARG: (%i) mode (view type)\n", 0); + printf(" modes:\n"); + printf(" 0: Kokkos (Normal) View\n"); + printf(" 1: Kokkos Remote View\n"); + printf(" 2: Kokkos Unmanaged View\n"); +} + +// read command line args +bool read_args(int argc, char* argv[], Args_t& args) { + for (int i = 1; i < argc; i++) { + if (strcmp(argv[i], "-h") == 0) { + print_help(); + return false; + } + } + + for (int i = 1; i < argc; i++) { + if (strcmp(argv[i], "-N") == 0) args.N = atoi(argv[i + 1]); + if (strcmp(argv[i], "-I") == 0) args.iters = atoi(argv[i + 1]); + if (strcmp(argv[i], "-M") == 0) args.mode = atol(argv[i + 1]); + } + return true; +} + +// run copy benchmark +void run_1(Args_t& args) { + Kokkos::Timer timer; + double time_a, time_b; + time_a = time_b = 0; + double time = 0; + using ViewType_t = PlainView_t; + ViewType_t v("PlainView_t", args.N); + + size_t N = args.N; /* size of vector */ + int iters = args.iters; /* number of iterations */ + int mode = args.mode; /* View type */ + + Kokkos::parallel_for( + "access_overhead-init", policy_t({0}, {N}), + KOKKOS_LAMBDA(const size_t i) { v(i) = 0.0; }); + + Kokkos::fence(); +#ifdef KRS_ENABLE_NVSHMEMSPACE + nvshmem_barrier_all(); // Not sure why this impacts perf +#endif + + time_a = timer.seconds(); + for (int i = 0; i < iters; i++) { + Kokkos::parallel_for( + "access_overhead", policy_t({0}, {N}), + KOKKOS_LAMBDA(const size_t i) { v(i) += 1; }); + RemoteSpace_t().fence(); + } + time_b = timer.seconds(); + time += time_b - time_a; + +#ifdef CHECK_FOR_CORRECTNESS + Kokkos::parallel_for( + "access_overhead-check", policy_t({0}, {N}), + KOKKOS_LAMBDA(const size_t i) { assert(v(i) == iters * 1.0); }); + Kokkos::fence(); +#endif + + double gups = 1e-9 * ((N * iters) / time); + double size = N * sizeof(double) / 1024.0 / 1024.0; + double bw = gups * sizeof(double); + printf("access_overhead-noThis,%s,%lu,%lf,%lu,%lf,%lf,%lf\n", + modes[mode].c_str(), N, size, iters, time, gups, bw); +} + +// run copy benchmark +void run_2(Args_t& args) { + Kokkos::Timer timer; + double time_a, time_b; + time_a = time_b = 0; + double time = 0; + using ViewType_t = RemoteView_t; + ViewType_t v("RemoteView_t", args.N); + + size_t N = args.N; /* size of vector */ + int iters = args.iters; /* number of iterations */ + int mode = args.mode; /* View type */ + + Kokkos::parallel_for( + "access_overhead-init", policy_t({0}, {N}), + KOKKOS_LAMBDA(const size_t i) { v(i) = 0.0; }); + + Kokkos::fence(); +#ifdef KRS_ENABLE_NVSHMEMSPACE + nvshmem_barrier_all(); // Not sure why this impacts perf +#endif + + time_a = 
timer.seconds(); + for (int i = 0; i < iters; i++) { + Kokkos::parallel_for( + "access_overhead", policy_t({0}, {N}), + KOKKOS_LAMBDA(const size_t i) { v(i) += 1; }); + RemoteSpace_t().fence(); + } + time_b = timer.seconds(); + time += time_b - time_a; + +#ifdef CHECK_FOR_CORRECTNESS + Kokkos::parallel_for( + "access_overhead-check", policy_t({0}, {N}), + KOKKOS_LAMBDA(const size_t i) { assert(v(i) == iters * 1.0); }); + Kokkos::fence(); +#endif + + double gups = 1e-9 * ((N * iters) / time); + double size = N * sizeof(double) / 1024.0 / 1024.0; + double bw = gups * sizeof(double); + printf("access_overhead-noThis,%s,%lu,%lf,%lu,%lf,%lf,%lf\n", + modes[mode].c_str(), N, size, iters, time, gups, bw); +} + +// run copy benchmark +void run_3(Args_t& args) { + Kokkos::Timer timer; + double time_a, time_b; + time_a = time_b = 0; + double time = 0; + + size_t N = args.N; /* size of vector */ + int iters = args.iters; /* number of iterations */ + int mode = args.mode; /* View type */ + + RemoteView_t rv("RemoteView_t", args.N); + UnmanagedView_t v(rv.data(), N); + + Kokkos::parallel_for( + "access_overhead-init", policy_t({0}, {N}), + KOKKOS_LAMBDA(const size_t i) { v(i) = 0.0; }); + + Kokkos::fence(); +#ifdef KRS_ENABLE_NVSHMEMSPACE + nvshmem_barrier_all(); // Not sure why this impacts perf +#endif + + time_a = timer.seconds(); + for (int i = 0; i < iters; i++) { + Kokkos::parallel_for( + "access_overhead", policy_t({0}, {N}), + KOKKOS_LAMBDA(const size_t i) { v(i) += 1; }); + RemoteSpace_t().fence(); + } + time_b = timer.seconds(); + time += time_b - time_a; + +#ifdef CHECK_FOR_CORRECTNESS + Kokkos::parallel_for( + "access_overhead-check", policy_t({0}, {N}), + KOKKOS_LAMBDA(const size_t i) { assert(v(i) == iters * 1.0); }); + Kokkos::fence(); +#endif + + double gups = 1e-9 * ((N * iters) / time); + double size = N * sizeof(double) / 1024.0 / 1024.0; + double bw = gups * sizeof(double); + printf("access_overhead-noThis,%s,%lu,%lf,%lu,%lf,%lf,%lf\n", + modes[mode].c_str(), N, size, iters, time, gups, bw); +} + +int main(int argc, char* argv[]) { + int mpi_thread_level_available; + int mpi_thread_level_required = MPI_THREAD_MULTIPLE; + +#ifdef KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_SERIAL + mpi_thread_level_required = MPI_THREAD_SINGLE; +#endif + + MPI_Init_thread(&argc, &argv, mpi_thread_level_required, + &mpi_thread_level_available); + assert(mpi_thread_level_available >= mpi_thread_level_required); + +#ifdef KRS_ENABLE_SHMEMSPACE + shmem_init_thread(mpi_thread_level_required, &mpi_thread_level_available); + assert(mpi_thread_level_available >= mpi_thread_level_required); +#endif + +#ifdef KRS_ENABLE_NVSHMEMSPACE + MPI_Comm mpi_comm; + nvshmemx_init_attr_t attr; + mpi_comm = MPI_COMM_WORLD; + attr.mpi_comm = &mpi_comm; + nvshmemx_init_attr(NVSHMEMX_INIT_WITH_MPI_COMM, &attr); +#endif + + Kokkos::initialize(argc, argv); + + do { + Args_t args; + if (!read_args(argc, argv, args)) { + break; + }; + if (args.mode == 0) { + run_1(args); + } else if (args.mode == 1) { + run_2(args); + } else if (args.mode == 2) { + run_3(args); + } else { + printf("invalid mode selected (%d)\n", args.mode); + } + } while (false); + + Kokkos::finalize(); +#ifdef KRS_ENABLE_SHMEMSPACE + shmem_finalize(); +#endif +#ifdef KRS_ENABLE_NVSHMEMSPACE + nvshmem_finalize(); +#endif + MPI_Finalize(); + return 0; +} + +#undef CHECK_FOR_CORRECTNESS diff --git a/examples/benchmarks/access_overhead/access_overhead_p2p.cpp b/examples/benchmarks/access_overhead/access_overhead_p2p.cpp new file mode 100644 index 00000000..37adb3cc --- 
/dev/null +++ b/examples/benchmarks/access_overhead/access_overhead_p2p.cpp @@ -0,0 +1,574 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +// Contact: Jan Ciesko (jciesko@sandia.gov) +// +//@HEADER + +#include +#include +#include +#include +#include +#include +#include + +// Uncomment to enable device-aware MPI +//#define CUDA_AWARE_MPI + +#define CHECK_FOR_CORRECTNESS + +using RemoteSpace_t = Kokkos::Experimental::DefaultRemoteMemorySpace; +using RemoteView_t = Kokkos::View; +using PlainView_t = Kokkos::View; +using UnmanagedView_t = + Kokkos::View>; +using HostView_t = typename RemoteView_t::HostMirror; + +// Tags +struct InitTag {}; +struct UpdateTag {}; +struct UpdateTag_put {}; +struct UpdateTag_get {}; +struct CheckTag {}; +struct CheckTag_put {}; +struct CheckTag_get {}; + +// Exec policies +using policy_init_t = Kokkos::RangePolicy; +using policy_update_t = Kokkos::RangePolicy; +using policy_update_put_t = Kokkos::RangePolicy; +using policy_update_get_t = Kokkos::RangePolicy; +using team_policy_get_update_t = Kokkos::TeamPolicy; +using team_policy_put_update_t = Kokkos::TeamPolicy; +using policy_check_t = Kokkos::RangePolicy; +using policy_check_put_t = Kokkos::RangePolicy; +using policy_check_get_t = Kokkos::RangePolicy; + +// Default values +#define default_Mode 0 +#define default_N 134217728 +#define default_Iters 3 +#define default_RmaOp 0 // get +#define TAG 0 + +std::string modes[3] = {"Kokkos::View", "Kokkos::RemoteView", + "Kokkos::RemoteViewBlockTransfer"}; + +enum { RMA_GET, RMA_PUT }; + +struct Args_t { + int mode = default_Mode; + int N = default_N; + int iters = default_Iters; + int rma_op = default_RmaOp; +}; + +void print_help() { + printf("Options (default):\n"); + printf(" -N IARG: (%i) num elements in the vector\n", default_N); + printf(" -I IARG: (%i) num repititions\n", default_Iters); + printf(" -M IARG: (%i) mode (view type)\n", default_Mode); + printf(" -O IARG: (%i) rma operation (0...get, 1...put)\n", default_RmaOp); + printf(" modes:\n"); + printf(" 0: Kokkos (Normal) View\n"); + printf(" 1: Kokkos Remote View\n"); +} + +// read command line args +bool read_args(int argc, char *argv[], Args_t &args) { + for (int i = 1; i < argc; i++) { + if (strcmp(argv[i], "-h") == 0) { + print_help(); + return false; + } + } + + for (int i = 1; i < argc; i++) { + if (strcmp(argv[i], "-N") == 0) args.N = atoi(argv[i + 1]); + if (strcmp(argv[i], "-I") == 0) args.iters = atoi(argv[i + 1]); + if (strcmp(argv[i], "-M") == 0) args.mode = atoi(argv[i + 1]); + if (strcmp(argv[i], "-O") == 0) args.rma_op = atoi(argv[i + 1]); + } + return true; +} + +template +struct Access; + +template +struct Access_LDC; + +template +struct Access::value>> { + size_t N; /* size of vector */ + int iters; /* number of iterations */ + int mode; /* View type */ + int rma_op; + + int my_rank, other_rank, num_ranks; + + ViewType_t v; + + Access(Args_t args) + : N(args.N), iters(args.iters), mode(args.mode), rma_op(args.rma_op) { + MPI_Comm_rank(MPI_COMM_WORLD, &my_rank); + MPI_Comm_size(MPI_COMM_WORLD, &num_ranks); + 
assert(num_ranks == 2); + other_rank = my_rank ^ 1; + v = ViewType_t(std::string(typeid(v).name()), num_ranks * args.N); + }; + + KOKKOS_FUNCTION + void operator()(const InitTag &, const size_t i) const { v(i) = 1.0; } + + KOKKOS_FUNCTION + void operator()(const UpdateTag_get &, const size_t i) const { + v(i) += v(other_rank * N + i); + } + + KOKKOS_FUNCTION + void operator()(const UpdateTag_put &, const size_t i) const { + v(other_rank * N + i) = v(i); + } + + KOKKOS_FUNCTION + void operator()(const CheckTag_get &, const size_t i) const { + assert(v(i) == iters * 1.0 + 1.0); + } + + KOKKOS_FUNCTION + void operator()(const CheckTag_put &, const size_t i) const { + assert(v(i) == 1.0); + } + + // run copy benchmark + void run() { + Kokkos::Timer timer; + double time_a, time_b; + time_a = time_b = 0; + double time = 0; + + auto local_range = Kokkos::Experimental::get_local_range(2 * v.size()); + Kokkos::parallel_for("access_overhead-init", + policy_init_t(local_range.first, local_range.second), + *this); + RemoteSpace_t().fence(); + + if (rma_op == RMA_GET) { + for (int i = 0; i < iters; i++) { + if (my_rank == 0) { + time_a = timer.seconds(); + Kokkos::parallel_for( + "access_overhead", + policy_update_get_t(local_range.first, local_range.second), + *this); + RemoteSpace_t().fence(); + time_b = timer.seconds(); + time += time_b - time_a; + } else { + RemoteSpace_t().fence(); + } + } + } else if (rma_op == RMA_PUT) { + for (int i = 0; i < iters; i++) { + if (my_rank == 0) { + time_a = timer.seconds(); + Kokkos::parallel_for( + "access_overhead", + policy_update_put_t(local_range.first, local_range.second), + *this); + RemoteSpace_t().fence(); + time_b = timer.seconds(); + time += time_b - time_a; + } else { + RemoteSpace_t().fence(); + } + } + } else { + printf("What rma_op is this? 
Exiting.\n"); + exit(1); + } + + if (rma_op == RMA_GET) { + // check on rank 0 + if (my_rank == 0) { +#ifdef CHECK_FOR_CORRECTNESS + Kokkos::parallel_for( + "access_overhead-check", + policy_check_get_t(local_range.first, local_range.second), *this); + Kokkos::fence(); +#endif + } + } else { + // check on rank 1 + if (my_rank == 1) { +#ifdef CHECK_FOR_CORRECTNESS + Kokkos::parallel_for( + "access_overhead-check", + policy_check_put_t(local_range.first, local_range.second), *this); + Kokkos::fence(); +#endif + } + } + + if (my_rank == 0) { + double gups = 1e-9 * ((N * iters) / time); + double size = N * sizeof(double) / 1024.0 / 1024.0; + double bw = gups * sizeof(double); + if (rma_op == RMA_GET) { + printf("access_overhead_p2p_get,%s,%lu,%lf,%lu,%lf,%lf,%lf\n", + modes[mode].c_str(), N, size, iters, time, gups, bw); + } else { + printf("access_overhead_p2p_put,%s,%lu,%lf,%lu,%lf,%lf,%lf\n", + modes[mode].c_str(), N, size, iters, time, gups, bw); + } + } + } +}; + +template +struct Access_LDC< + ViewType_t, + typename std::enable_if_t::value>> { + size_t N; /* size of vector */ + int iters; /* number of iterations */ + int mode; /* View type */ + int rma_op; + + int my_rank, other_rank, num_ranks; + + ViewType_t v, v_tmp; + ViewType_t v_subview_remote; + + Access_LDC(Args_t args) + : N(args.N), iters(args.iters), mode(args.mode), rma_op(args.rma_op) { + MPI_Comm_rank(MPI_COMM_WORLD, &my_rank); + MPI_Comm_size(MPI_COMM_WORLD, &num_ranks); + assert(num_ranks == 2); + other_rank = my_rank ^ 1; + v = ViewType_t(std::string(typeid(v).name()), num_ranks * args.N); + v_tmp = ViewType_t(std::string(typeid(v).name()), num_ranks * args.N); + }; + + KOKKOS_FUNCTION + void operator()(const size_t i) const { + double val1 = v_tmp(i); + double val2 = v(i); + printf("debug: %li, %f, %f\n", i, val1, val2); + } + + KOKKOS_FUNCTION + void operator()(const InitTag &, const size_t i) const { v(i) = my_rank; } + + KOKKOS_FUNCTION + void operator()(const CheckTag_get &, const size_t i) const { + assert(v(i) == iters * 1.0 + 1.0); + } + + KOKKOS_FUNCTION + void operator()(const CheckTag_put &, const size_t i) const { + assert(v(i) == iters * 1.0); + } + + KOKKOS_FUNCTION + void operator()(const UpdateTag &, const size_t i) const { v(i) += v_tmp(i); } + + KOKKOS_FUNCTION + void operator()(const UpdateTag_get &, + typename team_policy_get_update_t::member_type team) const { + Kokkos::single(Kokkos::PerTeam(team), [&]() { + auto local_range = Kokkos::Experimental::get_local_range(num_ranks * N); + auto remote_range = + Kokkos::Experimental::get_range(num_ranks * N, other_rank); + auto v_subview_remote = Kokkos::subview(v, remote_range); + auto v_tmp_subview_local = Kokkos::subview(v_tmp, local_range); + Kokkos::Experimental::RemoteSpaces::local_deep_copy(v_tmp_subview_local, + v_subview_remote); + }); + } + + KOKKOS_FUNCTION + void operator()(const UpdateTag_put &, + typename team_policy_put_update_t::member_type team) const { + Kokkos::single(Kokkos::PerTeam(team), [&]() { + auto local_range = Kokkos::Experimental::get_local_range(num_ranks * N); + auto remote_range = + Kokkos::Experimental::get_range(num_ranks * N, other_rank); + auto v_subview_remote = Kokkos::subview(v, remote_range); + auto v_tmp_subview_local = Kokkos::subview(v_tmp, local_range); + Kokkos::Experimental::RemoteSpaces::local_deep_copy(v_subview_remote, + v_tmp_subview_local); + }); + } + + // run copy benchmark + void run() { + Kokkos::Timer timer; + double time_a, time_b; + time_a = time_b = 0; + double time = 0; + auto local_range = 
Kokkos::Experimental::get_local_range(num_ranks * N); + + Kokkos::parallel_for("access_overhead-init", + policy_init_t(local_range.first, local_range.second), + *this); + Kokkos::fence(); + MPI_Barrier(MPI_COMM_WORLD); + + if (rma_op == RMA_GET) { + for (int i = 0; i < iters; i++) { + if (my_rank == 0) { + time_a = timer.seconds(); + Kokkos::parallel_for("block_transfer", team_policy_get_update_t(1, 1), + *this); + Kokkos::fence(); +#ifdef KRS_ENABLE_DEBUG + Kokkos::parallel_for( + "printf values for debugging", + Kokkos::RangePolicy(local_range.first, local_range.second), + *this); +#endif + Kokkos::parallel_for( + "update", policy_update_t(local_range.first, local_range.second), + *this); + RemoteSpace_t().fence(); + time_b = timer.seconds(); + time += time_b - time_a; + } else { + RemoteSpace_t().fence(); + } + } + } else if (rma_op == RMA_PUT) { + for (int i = 0; i < iters; i++) { + if (my_rank == 0) { + time_a = timer.seconds(); + Kokkos::parallel_for("block_transfer", team_policy_put_update_t(1, 1), + *this); + Kokkos::fence(); + Kokkos::parallel_for( + "access_overhead", + policy_update_t(local_range.first, local_range.second), *this); + + RemoteSpace_t().fence(); + time_b = timer.seconds(); + time += time_b - time_a; + } else { + RemoteSpace_t().fence(); + } + } + } else { + printf("What rma_op is this? Exiting.\n"); + exit(1); + } + +#ifdef CHECK_FOR_CORRECTNESS + if (rma_op == RMA_GET) { + // check on rank 0 + if (my_rank == 0) { + Kokkos::parallel_for( + "access_overhead-check", + policy_check_get_t(local_range.first, local_range.second), *this); + Kokkos::fence(); + } + } else { + // check on rank 1 + if (my_rank == 1) { + Kokkos::parallel_for( + "access_overhead-check", + policy_check_put_t(local_range.first, local_range.second), *this); + Kokkos::fence(); + } + } +#endif + + if (my_rank == 0) { + double gups = 1e-9 * ((N * iters) / time); + double size = N * sizeof(double) / 1024.0 / 1024.0; + double bw = gups * sizeof(double); + if (rma_op == RMA_GET) { + printf("access_overhead_p2p_get,%s,%lu,%lf,%lu,%lf,%lf,%lf\n", + modes[mode].c_str(), N, size, iters, time, gups, bw); + } else { + printf("access_overhead_p2p_put,%s,%lu,%lf,%lu,%lf,%lf,%lf\n", + modes[mode].c_str(), N, size, iters, time, gups, bw); + } + } + } +}; + +template +struct Access::value>> { + size_t N; /* size of vector */ + int iters; /* number of iterations */ + int mode; /* View type */ + + int my_rank, other_rank, num_ranks = 0; + + ViewType_t v; + ViewType_t v_tmp; + + Access(Args_t args) + : N(args.N), + iters(args.iters), + v(std::string(typeid(v).name()), args.N), + v_tmp(std::string(typeid(v).name()) + "_tmp", args.N), + mode(args.mode) { + MPI_Comm_rank(MPI_COMM_WORLD, &my_rank); + MPI_Comm_size(MPI_COMM_WORLD, &num_ranks); + other_rank = my_rank ^ 1; + assert(num_ranks == 2); + }; + + KOKKOS_FUNCTION + void operator()(const InitTag &, const size_t i) const { v(i) = 1.0; } + + KOKKOS_FUNCTION + void operator()(const UpdateTag &, const size_t i) const { v(i) += v_tmp(i); } + + KOKKOS_FUNCTION + void operator()(const CheckTag &, const size_t i) const { + assert(v(i) == iters * 1.0 + 1.0); + } + + // run copy benchmark + void run() { + Kokkos::Timer timer; + double time_a, time_b; + time_a = time_b = 0; + double time = 0; + + Kokkos::parallel_for("access_overhead-init", policy_init_t({0}, {N}), + *this); + Kokkos::fence(); + MPI_Barrier(MPI_COMM_WORLD); + + for (int i = 0; i < iters; i++) { + time_a = timer.seconds(); + + if (my_rank == 1) { +#ifndef CUDA_AWARE_MPI + auto v_tmp_host = 
Kokkos::create_mirror_view(v_tmp); + Kokkos::deep_copy(v_tmp_host, v); + MPI_Send(v_tmp_host.data(), N, MPI_DOUBLE, other_rank, TAG, + MPI_COMM_WORLD); +#else + MPI_Send(v.data(), N, MPI_DOUBLE, other_rank, TAG, MPI_COMM_WORLD); +#endif + } else { +#ifndef CUDA_AWARE_MPI + auto v_tmp_host = Kokkos::create_mirror_view(v_tmp); + MPI_Recv(v_tmp_host.data(), N, MPI_DOUBLE, other_rank, TAG, + MPI_COMM_WORLD, MPI_STATUS_IGNORE); + Kokkos::deep_copy(v_tmp, v_tmp_host); +#else + MPI_Recv(v_tmp.data(), N, MPI_DOUBLE, other_rank, TAG, MPI_COMM_WORLD, + MPI_STATUS_IGNORE); +#endif + Kokkos::parallel_for("access_overhead", policy_update_t({0}, {N}), + *this); + + Kokkos::fence(); + + time_b = timer.seconds(); + time += time_b - time_a; + } + MPI_Barrier(MPI_COMM_WORLD); + } + + if (my_rank == 0) { +#ifdef CHECK_FOR_CORRECTNESS + Kokkos::parallel_for("access_overhead-check", policy_check_t({0}, {N}), + *this); + Kokkos::fence(); +#endif + + double gups = 1e-9 * ((N * iters) / time); + double size = N * sizeof(double) / 1024.0 / 1024.0; + double bw = gups * sizeof(double); +#ifdef CUDA_AWARE_MPI + printf("access_overhead,%s,%lu,%lf,%lu,%lf,%lf,%lf\n", + (modes[mode] + "-MPIIsCudaAware").c_str(), N, size, iters, time, + gups, bw); +#else + printf("access_overhead,%s,%lu,%lf,%lu,%lf,%lf,%lf\n", + modes[mode].c_str(), N, size, iters, time, gups, bw); +#endif + } + MPI_Barrier(MPI_COMM_WORLD); + } +}; + +int main(int argc, char *argv[]) { + int mpi_thread_level_available; + int mpi_thread_level_required = MPI_THREAD_MULTIPLE; + +#ifdef KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_SERIAL + mpi_thread_level_required = MPI_THREAD_SINGLE; +#endif + + MPI_Init_thread(&argc, &argv, mpi_thread_level_required, + &mpi_thread_level_available); + assert(mpi_thread_level_available >= mpi_thread_level_required); + +#ifdef KRS_ENABLE_SHMEMSPACE + shmem_init_thread(mpi_thread_level_required, &mpi_thread_level_available); + assert(mpi_thread_level_available >= mpi_thread_level_required); +#endif + +#ifdef KRS_ENABLE_NVSHMEMSPACE + MPI_Comm mpi_comm; + nvshmemx_init_attr_t attr; + mpi_comm = MPI_COMM_WORLD; + attr.mpi_comm = &mpi_comm; + nvshmemx_init_attr(NVSHMEMX_INIT_WITH_MPI_COMM, &attr); +#endif + + Kokkos::initialize(argc, argv); + + do { + Args_t args; + if (!read_args(argc, argv, args)) { + break; + }; + if (args.mode == 0) { + Access s(args); + s.run(); + } else if (args.mode == 1) { + Access s(args); + s.run(); + } else if (args.mode == 2) { + Access_LDC s(args); + s.run(); + } else { + printf("invalid mode selected (%d)\n", args.mode); + } + } while (false); + + Kokkos::fence(); + + Kokkos::finalize(); +#ifdef KRS_ENABLE_SHMEMSPACE + shmem_finalize(); +#endif +#ifdef KRS_ENABLE_NVSHMEMSPACE + nvshmem_finalize(); +#endif + MPI_Finalize(); + return 0; +} + +#undef CHECK_FOR_CORRECTNESS \ No newline at end of file diff --git a/examples/benchmarks/access_overhead/access_overhead_stream.cpp b/examples/benchmarks/access_overhead/access_overhead_stream.cpp new file mode 100644 index 00000000..085a3c69 --- /dev/null +++ b/examples/benchmarks/access_overhead/access_overhead_stream.cpp @@ -0,0 +1,250 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. 
+// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#include +#include +#include +#include + +#include + +#define STREAM_ARRAY_SIZE 134217728 +#define STREAM_NTIMES 20 + +using RemoteSpace_t = Kokkos::Experimental::DefaultRemoteMemorySpace; +using StreamDeviceArray = Kokkos::View; +using StreamHostArray = Kokkos::View; +using StreamIndex = size_t; +using Policy = Kokkos::RangePolicy>; + +void perform_set(StreamDeviceArray& a, const double scalar) { + Kokkos::parallel_for( + "set", Policy(0, a.extent(0)), + KOKKOS_LAMBDA(const StreamIndex i) { a(i) = scalar; }); + RemoteSpace_t().fence(); +} + +void perform_incr(StreamDeviceArray& a, const double scalar) { + Kokkos::parallel_for( + "set", Policy(0, a.extent(0)), + KOKKOS_LAMBDA(const StreamIndex i) { a(i) += scalar; }); + RemoteSpace_t().fence(); +} + +void perform_copy(StreamDeviceArray& a, StreamDeviceArray& b) { + Kokkos::parallel_for( + "copy", Policy(0, a.extent(0)), KOKKOS_LAMBDA(const StreamIndex i) { + double tmp = a(i); + b(i) = tmp; + }); + RemoteSpace_t().fence(); +} + +void perform_scale(StreamDeviceArray& b, StreamDeviceArray& c, + const double scalar) { + Kokkos::parallel_for( + "scale", Policy(0, b.extent(0)), + KOKKOS_LAMBDA(const StreamIndex i) { b(i) = scalar * c(i); }); + RemoteSpace_t().fence(); +} + +void perform_add(StreamDeviceArray& a, StreamDeviceArray& b, + StreamDeviceArray& c) { + Kokkos::parallel_for( + "add", Policy(0, a.extent(0)), + KOKKOS_LAMBDA(const StreamIndex i) { c(i) = a(i) + b(i); }); + RemoteSpace_t().fence(); +} + +void perform_triad(StreamDeviceArray& a, StreamDeviceArray& b, + StreamDeviceArray& c, const double scalar) { + Kokkos::parallel_for( + "triad", Policy(0, a.extent(0)), + KOKKOS_LAMBDA(const StreamIndex i) { a(i) = b(i) + scalar * c(i); }); + RemoteSpace_t().fence(); +} + +int perform_validation(StreamHostArray& a, StreamHostArray& b, + StreamHostArray& c, const StreamIndex arraySize, + const double scalar) { + double ai = 1.0; + double bi = 2.0; + double ci = 0.0; + + for (StreamIndex i = 0; i < arraySize; ++i) { + ci = ai; + bi = scalar * ci; + ci = ai + bi; + ai = bi + scalar * ci; + }; + + double aError = 0.0; + double bError = 0.0; + double cError = 0.0; + + for (StreamIndex i = 0; i < arraySize; ++i) { + aError = std::abs(a[i] - ai); + bError = std::abs(b[i] - bi); + cError = std::abs(c[i] - ci); + } + + double aAvgError = aError / (double)arraySize; + double bAvgError = bError / (double)arraySize; + double cAvgError = cError / (double)arraySize; + + const double epsilon = 1.0e-13; + int errorCount = 0; + + if (std::abs(aAvgError / ai) > epsilon) { + fprintf(stderr, "Error: validation check on View a failed.\n"); + errorCount++; + } + + if (std::abs(bAvgError / bi) > epsilon) { + fprintf(stderr, "Error: validation check on View b failed.\n"); + errorCount++; + } + + if (std::abs(cAvgError / ci) > epsilon) { + fprintf(stderr, "Error: validation check on View c failed.\n"); + errorCount++; + } + + if (errorCount == 0) { + printf("All solutions checked and verified.\n"); + } + + return errorCount; +} + +int run_benchmark(uint64_t size, uint64_t reps, uint64_t ls, uint64_t ts, + uint64_t vs) { + StreamDeviceArray dev_a("a", size); + StreamDeviceArray dev_b("b", size); + StreamDeviceArray dev_c("c", size); + + StreamHostArray host_a("a", size); + StreamHostArray host_b("b", size); + StreamHostArray host_c("c", size); + + const double scalar = 3.0; + + double setTime = std::numeric_limits::max(); + 
double incrTime = std::numeric_limits::max(); + double copyTime = std::numeric_limits::max(); + double scaleTime = std::numeric_limits::max(); + double addTime = std::numeric_limits::max(); + double triadTime = std::numeric_limits::max(); + + Kokkos::parallel_for( + "init", Kokkos::RangePolicy<>(0, size), KOKKOS_LAMBDA(const int i) { + dev_a(i) = 1.0; + dev_b(i) = 2.0; + dev_c(i) = 0.0; + }); + + Kokkos::Timer timer; + + for (StreamIndex k = 0; k < reps; ++k) { + timer.reset(); + perform_set(dev_c, 1.5); + setTime = std::min(setTime, timer.seconds()); + + timer.reset(); + perform_incr(dev_c, 1); + incrTime = std::min(incrTime, timer.seconds()); + + timer.reset(); + perform_copy(dev_a, dev_c); + copyTime = std::min(copyTime, timer.seconds()); + + timer.reset(); + perform_scale(dev_b, dev_c, scalar); + scaleTime = std::min(scaleTime, timer.seconds()); + + timer.reset(); + perform_add(dev_a, dev_b, dev_c); + addTime = std::min(addTime, timer.seconds()); + + timer.reset(); + perform_triad(dev_a, dev_b, dev_c, scalar); + triadTime = std::min(triadTime, timer.seconds()); + } + + printf("%lu,%lu,%lu,%lu,%li,%li,%.5f,%.5f,%.5f,%.5f,%.5f,%.5f\n", ls, ts, vs, + static_cast(size), (size * sizeof(double)) >> 20, + ((size * sizeof(double)) >> 20) * 3, + (1.0e-06 * 1.0 * (double)sizeof(double) * (double)size) / setTime, + (1.0e-06 * 1.0 * (double)sizeof(double) * (double)size) / incrTime, + (1.0e-06 * 2.0 * (double)sizeof(double) * (double)size) / copyTime, + (1.0e-06 * 2.0 * (double)sizeof(double) * (double)size) / scaleTime, + (1.0e-06 * 3.0 * (double)sizeof(double) * (double)size) / addTime, + (1.0e-06 * 3.0 * (double)sizeof(double) * (double)size) / triadTime); + + Kokkos::deep_copy(host_a, dev_a); + Kokkos::deep_copy(host_b, dev_b); + Kokkos::deep_copy(host_c, dev_c); + + return perform_validation(host_a, host_b, host_c, size, scalar); +} + +int main(int argc, char* argv[]) { + int mpi_thread_level_available; + int mpi_thread_level_required = MPI_THREAD_MULTIPLE; + +#ifdef KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_SERIAL + mpi_thread_level_required = MPI_THREAD_SINGLE; +#endif + + MPI_Init_thread(&argc, &argv, mpi_thread_level_required, + &mpi_thread_level_available); + assert(mpi_thread_level_available >= mpi_thread_level_required); + +#ifdef KRS_ENABLE_SHMEMSPACE + shmem_init_thread(mpi_thread_level_required, &mpi_thread_level_available); + assert(mpi_thread_level_available >= mpi_thread_level_required); +#endif + +#ifdef KRS_ENABLE_NVSHMEMSPACE + MPI_Comm mpi_comm; + nvshmemx_init_attr_t attr; + mpi_comm = MPI_COMM_WORLD; + attr.mpi_comm = &mpi_comm; + nvshmemx_init_attr(NVSHMEMX_INIT_WITH_MPI_COMM, &attr); +#endif + + uint64_t array_size = STREAM_ARRAY_SIZE; + uint64_t repetitions = STREAM_NTIMES; + uint64_t ls = 32, vs = 32, ts = 32; + + array_size = argc > 1 ? atoi(argv[1]) : array_size; + repetitions = argc > 2 ? atoi(argv[2]) : repetitions; + ls = argc > 3 ? atoi(argv[3]) : ls; + ts = argc > 4 ? atoi(argv[4]) : ts; + vs = argc > 5 ? 
atoi(argv[5]) : vs; + + Kokkos::initialize(argc, argv); + const int rc = run_benchmark(array_size, repetitions, ls, ts, vs); + Kokkos::finalize(); +#ifdef KRS_ENABLE_SHMEMSPACE + shmem_finalize(); +#endif +#ifdef KRS_ENABLE_NVSHMEMSPACE + nvshmem_finalize(); +#endif + MPI_Finalize(); + return rc; +} \ No newline at end of file diff --git a/examples/benchmarks/access_overhead/access_overhead_teams.cpp b/examples/benchmarks/access_overhead/access_overhead_teams.cpp new file mode 100644 index 00000000..cb2724c5 --- /dev/null +++ b/examples/benchmarks/access_overhead/access_overhead_teams.cpp @@ -0,0 +1,319 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +// Contact: Jan Ciesko (jciesko@sandia.gov) +// +//@HEADER + +#include +#include +#include +#include +#include +#include +#include + +#define CHECK_FOR_CORRECTNESS + +#define LEAGE_SIZE 128 +#define TEAM_SIZE 1024 +#define VECTOR_LEN 1 + +using RemoteSpace_t = Kokkos::Experimental::DefaultRemoteMemorySpace; +using RemoteView_t = Kokkos::View; +using PlainView_t = Kokkos::View; +using UnmanagedView_t = + Kokkos::View>; +using HostView_t = typename RemoteView_t::HostMirror; +struct InitTag {}; +struct UpdateTag {}; +struct CheckTag {}; +using policy_init_t = Kokkos::RangePolicy; +using policy_update_t = Kokkos::TeamPolicy; +using policy_check_t = Kokkos::RangePolicy; + +using StreamIndex = size_t; + +using team_t = const policy_update_t::member_type; + +#define default_N 134217728 +#define default_iters 3 + +#define default_LS 64 +#define default_TS 64 + +std::string modes[3] = {"Kokkos::View", "Kokkos::RemoteView", + "Kokkos::LocalProxyView"}; + +struct Args_t { + int mode = 0; + int N = default_N; + int iters = default_iters; + int LS = default_LS; + int TS = default_TS; +}; + +void print_help() { + printf("Options (default):\n"); + printf(" -N IARG: (%i) num elements in the vector\n", default_N); + printf(" -I IARG: (%i) num repititions\n", default_iters); + printf(" -M IARG: (%i) mode (view type)\n", 0); + printf(" -LS IARG: (%i) num leagues\n", default_LS); + printf(" -TS IARG: (%i) num theads\n", default_TS); + printf(" modes:\n"); + printf(" 0: Kokkos (Normal) View\n"); + printf(" 1: Kokkos Remote View\n"); + printf(" 2: Kokkos Unmanaged View\n"); +} + +// read command line args +bool read_args(int argc, char *argv[], Args_t &args) { + for (int i = 1; i < argc; i++) { + if (strcmp(argv[i], "-h") == 0) { + print_help(); + return false; + } + } + + for (int i = 1; i < argc; i++) { + if (strcmp(argv[i], "-N") == 0) args.N = atoi(argv[i + 1]); + if (strcmp(argv[i], "-I") == 0) args.iters = atoi(argv[i + 1]); + if (strcmp(argv[i], "-M") == 0) args.mode = atoi(argv[i + 1]); + if (strcmp(argv[i], "-LS") == 0) args.LS = atoi(argv[i + 1]); + if (strcmp(argv[i], "-TS") == 0) args.TS = atoi(argv[i + 1]); + } + return true; +} + +template +struct Access; + +template +struct Access::value>> { + size_t N; /* size of vector */ + int iters; /* number of iterations */ + int mode; /* View type */ + int ls; + int ts; + + ViewType_t v; + + Access(Args_t args) + : N(args.N), + 
iters(args.iters), + v(std::string(typeid(v).name()), args.N), + mode(args.mode), + ls(args.LS), + ts(args.TS){}; + + KOKKOS_FUNCTION + void operator()(const InitTag &, const size_t i) const { v(i) = 0; } + + KOKKOS_FUNCTION + void operator()(const UpdateTag &, team_t &thread) const { + const int64_t iters_per_team = N / ls; + const int64_t iters_per_team_mod = N % ls; + const int64_t first_i = thread.league_rank() * iters_per_team; + const int64_t last_i = thread.league_rank() == thread.league_size() - 1 + ? first_i + iters_per_team + iters_per_team_mod + : first_i + iters_per_team; + Kokkos::parallel_for(Kokkos::TeamThreadRange(thread, first_i, last_i), + [=](const StreamIndex i) { v(i) += 1; }); + } + + KOKKOS_FUNCTION + void operator()(const CheckTag &, const size_t i) const { + assert(v(i) == iters * 1.0); + } + + // run copy benchmark + void run() { + Kokkos::Timer timer; + double time_a, time_b; + time_a = time_b = 0; + double time = 0; + + Kokkos::parallel_for("access_overhead-init", policy_init_t({0}, {N}), + *this); + Kokkos::fence(); +#ifdef KRS_ENABLE_NVSHMEMSPACE + nvshmem_barrier_all(); // Not sure why this impacts perf +#endif + + auto policy = policy_update_t(ls, ts, 1); + + for (int i = 0; i < iters; i++) { + time_a = timer.seconds(); + Kokkos::parallel_for("access_overhead", policy, *this); + RemoteSpace_t().fence(); + time_b = timer.seconds(); + time += time_b - time_a; + } + +#ifdef CHECK_FOR_CORRECTNESS + Kokkos::parallel_for("access_overhead-check", policy_check_t({0}, {N}), + *this); + Kokkos::fence(); +#endif + + double gups = 1e-9 * ((N * iters) / time); + double size = N * sizeof(double) / 1024.0 / 1024.0; + double bw = gups * sizeof(double); + printf("access_overhead_teams,%s,%lu,%lf,%lu,%lf,%i,%i,%lf,%lf\n", + modes[mode].c_str(), N, size, iters, time, ls, ts, gups, bw); + } +}; + +template +struct Access::value>> { + size_t N; /* size of vector */ + int iters; /* number of iterations */ + int mode; /* View type */ + int ls; + int ts; + + UnmanagedView_t v; + RemoteView_t rv; + + Access(Args_t args) + : N(args.N), + iters(args.iters), + rv(std::string(typeid(v).name()), args.N), + mode(args.mode), + ls(args.LS), + ts(args.TS) { + v = ViewType_t(rv.data(), args.N); + }; + + KOKKOS_FUNCTION + void operator()(const InitTag &, const size_t i) const { v(i) = 0; } + + KOKKOS_FUNCTION + void operator()(const UpdateTag &, team_t &thread) const { + const int64_t iters_per_team = N / ls; + const int64_t iters_per_team_mod = N % ls; + const int64_t first_i = thread.league_rank() * iters_per_team; + const int64_t last_i = thread.league_rank() == thread.league_size() - 1 + ? 
first_i + iters_per_team + iters_per_team_mod + : first_i + iters_per_team; + Kokkos::parallel_for(Kokkos::TeamThreadRange(thread, first_i, last_i), + [=](const StreamIndex i) { v(i) += 1; }); + } + + KOKKOS_FUNCTION + void operator()(const CheckTag &, const size_t i) const { + assert(v(i) == iters * 1.0); + } + + // run copy benchmark + void run() { + Kokkos::Timer timer; + double time_a, time_b; + time_a = time_b = 0; + double time = 0; + + Kokkos::parallel_for("access_overhead-init", policy_init_t({0}, {N}), + *this); + + Kokkos::fence(); +#ifdef KRS_ENABLE_NVSHMEMSPACE + nvshmem_barrier_all(); // Not sure why this impacts perf +#endif + + auto policy = policy_update_t(ls, ts, 1); + for (int i = 0; i < iters; i++) { + time_a = timer.seconds(); + Kokkos::parallel_for("access_overhead", policy, *this); + RemoteSpace_t().fence(); + time_b = timer.seconds(); + time += time_b - time_a; + } + +#ifdef CHECK_FOR_CORRECTNESS + Kokkos::parallel_for("access_overhead-check", policy_check_t({0}, {N}), + *this); + Kokkos::fence(); +#endif + + double gups = 1e-9 * ((N * iters) / time); + double size = N * sizeof(double) / 1024.0 / 1024.0; + double bw = gups * sizeof(double); + printf("access_overhead_teams,%s,%lu,%lf,%lu,%lf,%i,%i,%lf,%lf\n", + modes[mode].c_str(), N, size, iters, time, ls, ts, gups, bw); + } +}; + +int main(int argc, char *argv[]) { + int mpi_thread_level_available; + int mpi_thread_level_required = MPI_THREAD_MULTIPLE; + +#ifdef KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_SERIAL + mpi_thread_level_required = MPI_THREAD_SINGLE; +#endif + + MPI_Init_thread(&argc, &argv, mpi_thread_level_required, + &mpi_thread_level_available); + assert(mpi_thread_level_available >= mpi_thread_level_required); + +#ifdef KRS_ENABLE_SHMEMSPACE + shmem_init_thread(mpi_thread_level_required, &mpi_thread_level_available); + assert(mpi_thread_level_available >= mpi_thread_level_required); +#endif + +#ifdef KRS_ENABLE_NVSHMEMSPACE + MPI_Comm mpi_comm; + nvshmemx_init_attr_t attr; + mpi_comm = MPI_COMM_WORLD; + attr.mpi_comm = &mpi_comm; + nvshmemx_init_attr(NVSHMEMX_INIT_WITH_MPI_COMM, &attr); +#endif + + Kokkos::initialize(argc, argv); + + do { + Args_t args; + if (!read_args(argc, argv, args)) { + break; + }; + + if (args.mode == 0) { + Access s(args); + s.run(); + } else if (args.mode == 1) { + Access s(args); + s.run(); + } else if (args.mode == 2) { + Access s(args); + s.run(); + } else { + printf("invalid mode selected (%d)\n", args.mode); + } + } while (false); + + Kokkos::fence(); + + Kokkos::finalize(); +#ifdef KRS_ENABLE_SHMEMSPACE + shmem_finalize(); +#endif +#ifdef KRS_ENABLE_NVSHMEMSPACE + nvshmem_finalize(); +#endif + MPI_Finalize(); + return 0; +} + +#undef CHECK_FOR_CORRECTNESS diff --git a/examples/benchmarks/access_overhead/scripts/run_over_kernel_conf.sh b/examples/benchmarks/access_overhead/scripts/run_over_kernel_conf.sh new file mode 100644 index 00000000..ff845511 --- /dev/null +++ b/examples/benchmarks/access_overhead/scripts/run_over_kernel_conf.sh @@ -0,0 +1,40 @@ +#/bin/bash +BENCHMARK=$1 +DEFAULT_SIZE=134217728 + +#exports +export OMP_PROC_BIND=spread +export OMP_PLACES=threads +export OMP_NUM_THREADS=32 + +ITERS=10 +DS=$DEFAULT_SIZE +#print header +HASH=`date | md5sum | head -c 5` +FILENAME=$BENCHMARK_$HASH_kernel_conf.res +echo "name,type,N,size,iters,time,ls,ts,gups,bw" | tee $FILENAME + +#run test over kernel params +for LS in 4 8 16 32 64 128 256 512 1024; do + for TS in 32 64 128 256 512 1024; do + for reps in $(seq 1 3); do + ./$BENCHMARK -N $DS -I $ITERS -M 0 -LS $LS -TS $TS | tee 
-a $FILENAME + done + done +done + +for LS in 4 8 16 32 64 128 256 512 1024; do + for TS in 32 64 128 256 512 1024; do + for reps in $(seq 1 3); do + ./$BENCHMARK -N $DS -I $ITERS -M 1 -LS $LS -TS $TS | tee -a $FILENAME + done + done +done + +for LS in 4 8 16 32 64 128 256 512 1024; do + for TS in 32 64 128 256 512 1024; do + for reps in $(seq 1 3); do + ./$BENCHMARK -N $DS -I $ITERS -M 2 -LS $LS -TS $TS | tee -a $FILENAME + done + done +done \ No newline at end of file diff --git a/examples/benchmarks/access_overhead/scripts/run_over_size.sh b/examples/benchmarks/access_overhead/scripts/run_over_size.sh new file mode 100644 index 00000000..9569dd00 --- /dev/null +++ b/examples/benchmarks/access_overhead/scripts/run_over_size.sh @@ -0,0 +1,43 @@ +#/bin/bash +BENCHMARK=$1 +DEFAULT_SIZE=1000 + +#exports +export OMP_PROC_BIND=spread +export OMP_PLACES=threads +export OMP_NUM_THREADS=32 + +ITERS=200 + +#print header +HASH=`date|md5sum|head -c 5` +FILENAME="${BENCHMARK}_${HASH}.res" +echo $FILENAME +echo "name,type,N,size,iters,time,gups,bw" | tee $FILENAME + +#run test over size +SIZE=$DEFAULT_SIZE +for S in $(seq 1 21); do + for reps in $(seq 1 3); do + ./$BENCHMARK -N $SIZE -I $ITERS -M 0 | tee -a $FILENAME + done + let SIZE=$SIZE*2 +done + +#run test over size +let SIZE=$DEFAULT_SIZE +for S in $(seq 1 21); do + for reps in $(seq 1 3); do + ./$BENCHMARK -N $SIZE -I $ITERS -M 1 | tee -a $FILENAME + done + let SIZE=$SIZE*2 +done + +#run test over size +let SIZE=$DEFAULT_SIZE +for S in $(seq 1 21); do + for reps in $(seq 1 3); do + ./$BENCHMARK -N $SIZE -I $ITERS -M 2 | tee -a $FILENAME + done + let SIZE=$SIZE*2 +done diff --git a/examples/benchmarks/access_overhead/scripts/run_over_size_p2p.sh b/examples/benchmarks/access_overhead/scripts/run_over_size_p2p.sh new file mode 100644 index 00000000..4f5cb28f --- /dev/null +++ b/examples/benchmarks/access_overhead/scripts/run_over_size_p2p.sh @@ -0,0 +1,59 @@ +#/bin/bash +BENCHMARK=$1 +HOST=$2 +DEFAULT_SIZE=1000 + +#exports +export OMP_PROC_BIND=spread +export OMP_PLACES=threads +export OMP_NUM_THREADS=32 + +ITERS=30 + +DS=$DATA_SIZE +#print header +HASH=`date|md5sum|head -c 5` +FILENAME="${BENCHMARK}_${HASH}_p2p.res" +echo $FILENAME +echo "name,type,N,size,iters,time,gups,bw" | tee $FILENAME +VARS0="--bind-to core --map-by socket -x CUDA_VISIBLE_DEVICES=0,1 -x NVSHMEM_SYMMETRIC_SIZE=10737418240" +VARS1="-x UCX_WARN_UNUSED_ENV_VARS=n -x HCOLL_RCACHE=^ucs -x LD_LIBRARY_PATH=/g/g92/ciesko1/software/nvshmem_src_2.9.0-2/install/lib:$LD_LIBRARY_PATH" +#VARS2="-x :$LD_LIBRARY_PATH" + +# #run test over size +# let SIZE=$DEFAULT_SIZE +# for S in $(seq 1 21); do +# for reps in $(seq 1 3); do +# mpirun -np 2 $VARS0 $VARS1 $VARS2 -host $HOST ./$BENCHMARK"_cudaawarempi" -N $SIZE -I $ITERS -M 0 | tee -a $FILENAME +# done +# let SIZE=$SIZE*2 +# done + +# #run test over size +# let SIZE=$DEFAULT_SIZE +# for S in $(seq 1 21); do +# for reps in $(seq 1 3); do +# mpirun -np 2 $VARS0 $VARS1 $VARS2 -host $HOST ./$BENCHMARK -N $SIZE -I $ITERS -M 0 | tee -a $FILENAME +# done +# let SIZE=$SIZE*2 +# done + +# #run test over size +# let SIZE=$DEFAULT_SIZE +# for S in $(seq 1 21); do +# for reps in $(seq 1 3); do +# mpirun -np 2 $VARS0 $VARS1 $VARS2 -host $HOST ./$BENCHMARK -N $SIZE -I $ITERS -M 1 | tee -a $FILENAME +# done +# let SIZE=$SIZE*2 +# done + +#run test over size +let SIZE=$DEFAULT_SIZE +for S in $(seq 1 21); do + for reps in $(seq 1 3); do + mpirun -np 2 $VARS0 $VARS1 $VARS2 -host $HOST ./$BENCHMARK -N $SIZE -I $ITERS -M 2 | tee -a $FILENAME + done + let 
SIZE=$SIZE*2 +done + + diff --git a/examples/benchmarks/misslatency/CMakeLists.txt b/examples/benchmarks/misslatency/CMakeLists.txt index dff1613c..f790285f 100644 --- a/examples/benchmarks/misslatency/CMakeLists.txt +++ b/examples/benchmarks/misslatency/CMakeLists.txt @@ -1,4 +1,4 @@ add_executable(misslatency misslatency.cpp) add_executable(misslatency_partitioned misslatency_partitioned.cpp) -target_link_libraries(misslatency PRIVATE Kokkos::kokkosremote) -target_link_libraries(misslatency_partitioned PRIVATE Kokkos::kokkosremote) +target_link_libraries(misslatency PRIVATE Kokkos::kokkosremotespaces) +target_link_libraries(misslatency_partitioned PRIVATE Kokkos::kokkosremotespaces) diff --git a/examples/benchmarks/poissonaccess/CMakeLists.txt b/examples/benchmarks/poissonaccess/CMakeLists.txt index d8c90ece..9cd5b231 100644 --- a/examples/benchmarks/poissonaccess/CMakeLists.txt +++ b/examples/benchmarks/poissonaccess/CMakeLists.txt @@ -1,2 +1,2 @@ add_executable(poissonaccess poissonaccess.cpp) -target_link_libraries(poissonaccess PRIVATE Kokkos::kokkosremote) +target_link_libraries(poissonaccess PRIVATE Kokkos::kokkosremotespaces) diff --git a/examples/benchmarks/randomaccess/CMakeLists.txt b/examples/benchmarks/randomaccess/CMakeLists.txt index bdcb14ab..cd86bb2b 100644 --- a/examples/benchmarks/randomaccess/CMakeLists.txt +++ b/examples/benchmarks/randomaccess/CMakeLists.txt @@ -1,2 +1,2 @@ add_executable(randomaccess randomaccess.cpp) -target_link_libraries(randomaccess PRIVATE Kokkos::kokkosremote) +target_link_libraries(randomaccess PRIVATE Kokkos::kokkosremotespaces) diff --git a/examples/benchmarks/stream/CMakeLists.txt b/examples/benchmarks/stream/CMakeLists.txt deleted file mode 100755 index d2ca4141..00000000 --- a/examples/benchmarks/stream/CMakeLists.txt +++ /dev/null @@ -1,4 +0,0 @@ -add_executable(stream stream_benchmark.cpp) -target_link_libraries(stream PRIVATE Kokkos::kokkosremote) -target_include_directories(stream PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}) -target_compile_definitions(stream PRIVATE KRS_ENABLE_NVSHMEM_PTR) diff --git a/examples/benchmarks/stream/stream_benchmark.cpp b/examples/benchmarks/stream/stream_benchmark.cpp deleted file mode 100644 index 183b0947..00000000 --- a/examples/benchmarks/stream/stream_benchmark.cpp +++ /dev/null @@ -1,204 +0,0 @@ -/* A micro benchmark ported mainly from Heat3D to test overhead of RMA */ - -#include -#include -#include -#include - -template -struct SpaceInstance { - static ExecSpace create() { return ExecSpace(); } - static void destroy(ExecSpace&) {} - static bool overlap() { return false; } -}; - -#ifndef KOKKOS_ENABLE_DEBUG -#ifdef KOKKOS_ENABLE_CUDA -template <> -struct SpaceInstance { - static Kokkos::Cuda create() { - cudaStream_t stream; - cudaStreamCreate(&stream); - return Kokkos::Cuda(stream); - } - static void destroy(Kokkos::Cuda& space) { - cudaStream_t stream = space.cuda_stream(); - cudaStreamDestroy(stream); - } - static bool overlap() { /* returns true if you can overlap */ - bool value = true; - auto local_rank_str = std::getenv("CUDA_LAUNCH_BLOCKING"); - if (local_rank_str) { - value = (std::stoi(local_rank_str) == 0); - } - return value; - } -}; -#endif /* KOKKOS_ENABLE_CUDA */ -#endif /* KOKKOS_ENABLE_DEBUG */ - -using RemoteSpace_t = Kokkos::Experimental::DefaultRemoteMemorySpace; -using RemoteView_t = Kokkos::View; -using PlainView_t = Kokkos::View; -using UnmanagedView_t = - Kokkos::View>; -using HostView_t = typename RemoteView_t::HostMirror; - -template -struct Stream_Manager { - int len; /* size 
of vector */ - int N; /* number of iterations */ - int indication_of_lack_of_cpp_knowledge; - - // Temperature and delta Temperature - HostView_t V_h; - ViewType_t V; - - Stream_Manager(int a) : indication_of_lack_of_cpp_knowledge(a) { - // populate with defaults, set the rest in initialize_views. - len = 8000000; - V_h = HostView_t(); - V = ViewType_t(); - N = 10000; - } - - void initialize_views() { - /* how to handle unmanaged? */ - // if (std::is_same::value) { - // R = RemoteView_t("System::Vector", len); - // V = ViewType_t(R.data(), len); - // } - // else { - V = ViewType_t("System::Vector", len); - // } - V_h = HostView_t("Host::Vector", V.extent(0)); - - Kokkos::deep_copy(V_h, 0); - Kokkos::deep_copy(V, V_h); - - printf("My Vector: [%i, %i]\n", 0, len - 1); - } - - void print_help() { - printf("Options (default):\n"); - printf(" -l IARG: (%i) num elements in the V vector\n", len); - printf(" -N IARG: (%i) num repititions\n", N); - } - - // check command line args - bool check_args(int argc, char* argv[]) { - for (int i = 1; i < argc; i++) { - if (strcmp(argv[i], "-h") == 0) { - print_help(); - return false; - } - } - for (int i = 1; i < argc; i++) { - if (strcmp(argv[i], "-l") == 0) len = atoi(argv[i + 1]); - if (strcmp(argv[i], "-len") == 0) len = atoi(argv[i + 1]); - if (strcmp(argv[i], "-N") == 0) N = atoi(argv[i + 1]); - } - initialize_views(); - return true; - } - struct stream_benchmark {}; - KOKKOS_FUNCTION - void operator()(stream_benchmark, int i) const { V(i) += 1; } - void Stream_Benchmark() { - using policy_t = Kokkos::RangePolicy; - Kokkos::parallel_for("stream", policy_t({0}, {len}), *this); - } - - // run copy benchmark - void timestep() { - Kokkos::Timer timer; - double time_a, time_b; - time_a = time_b = 0; - double time_stream = 0; - double old_time = 0.0; - for (int t = 0; t <= N; t++) { - time_a = timer.seconds(); - Stream_Benchmark(); - RemoteSpace_t().fence(); - time_b = timer.seconds(); - time_stream += time_b - time_a; - if ((t % 400 == 0 || t == N)) { - double time = timer.seconds(); - Kokkos::deep_copy(V_h, V); - printf("%d V_h(0)=%lf Time (%lf %lf)\n", t, V_h(0), time, - time - old_time); - printf(" stream: %lf\n", time_stream); - old_time = time; - } - } - } -}; - -int main(int argc, char* argv[]) { - int mpi_thread_level_available; - int mpi_thread_level_required = MPI_THREAD_MULTIPLE; - -#ifdef KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_SERIAL - mpi_thread_level_required = MPI_THREAD_SINGLE; -#endif - - MPI_Init_thread(&argc, &argv, mpi_thread_level_required, - &mpi_thread_level_available); - assert(mpi_thread_level_available >= mpi_thread_level_required); - -#ifdef KRS_ENABLE_SHMEMSPACE - shmem_init_thread(mpi_thread_level_required, &mpi_thread_level_available); - assert(mpi_thread_level_available >= mpi_thread_level_required); -#endif - -#ifdef KRS_ENABLE_NVSHMEMSPACE - MPI_Comm mpi_comm; - nvshmemx_init_attr_t attr; - mpi_comm = MPI_COMM_WORLD; - attr.mpi_comm = &mpi_comm; - nvshmemx_init_attr(NVSHMEMX_INIT_WITH_MPI_COMM, &attr); -#endif - - Kokkos::initialize(argc, argv); - { - /* use 'mode' variable to pack any of three benchmarks into one here */ - int mode = 0; - for (int i = 1; i < argc; i++) { - if (strcmp(argv[i], "-h") == 0) { - printf("modes:\n"); - printf(" 0: Kokkos (Normal) View\n"); - printf(" 1: Kokkos Remote View\n"); - printf(" 2: Kokkos Unmanaged View\n"); - printf(" -m IARG: (%d) which mode to choose\n", mode); - break; - } - if (strcmp(argv[i], "-m") == 0 || strcmp(argv[i], "-type") == 0) { - mode = atoi(argv[i + 1]); - } - } - - if (mode 
== 0) { - Stream_Manager sys(0); - if (sys.check_args(argc, argv)) sys.timestep(); - } else if (mode == 1) { - Stream_Manager sys(0); - if (sys.check_args(argc, argv)) sys.timestep(); - } else if (mode == 2) { - printf("unmanaged views not handled yet."); - // Stream_Manager sys(0); - // if (sys.check_args(argc, argv)) - // sys.timestep(); - } else { - printf("invalid mode selected (%d)\n", mode); - } - } - Kokkos::finalize(); -#ifdef KRS_ENABLE_SHMEMSPACE - shmem_finalize(); -#endif -#ifdef KRS_ENABLE_NVSHMEMSPACE - nvshmem_finalize(); -#endif - MPI_Finalize(); - return 0; -} diff --git a/examples/cgsolve/rma/CMakeLists.txt b/examples/cgsolve/rma/CMakeLists.txt index ecba0829..37e7ef47 100644 --- a/examples/cgsolve/rma/CMakeLists.txt +++ b/examples/cgsolve/rma/CMakeLists.txt @@ -1,4 +1,4 @@ add_executable(cgsolve cgsolve.cpp) -target_link_libraries(cgsolve PRIVATE Kokkos::kokkosremote) +target_link_libraries(cgsolve PRIVATE Kokkos::kokkosremotespaces) target_include_directories(cgsolve PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}) target_compile_definitions(cgsolve PRIVATE KRS_ENABLE_NVSHMEM_PTR) diff --git a/examples/empty_project/CMakeLists.txt b/examples/empty_project/CMakeLists.txt new file mode 100644 index 00000000..020a71ca --- /dev/null +++ b/examples/empty_project/CMakeLists.txt @@ -0,0 +1,8 @@ +cmake_minimum_required(VERSION 3.25) +project(MyProject) + +find_package(KokkosRemoteSpaces REQUIRED) + +add_executable(project project.cpp) +target_link_libraries(project PRIVATE Kokkos::kokkosremotespaces) +target_include_directories(project PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}) diff --git a/examples/empty_project/project.cpp b/examples/empty_project/project.cpp new file mode 100644 index 00000000..19410d52 --- /dev/null +++ b/examples/empty_project/project.cpp @@ -0,0 +1,30 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +// Contact: Jan Ciesko (jciesko@sandia.gov) +// +//@HEADER + +#include +#include + +int main(int argc, char *argv[]) { + comm_init(argc, argv); + Kokkos::initialize(argc, argv); + { + // Your application code here + } + Kokkos::finalize(); + comm_fini(); +} diff --git a/examples/empty_project/project.hpp b/examples/empty_project/project.hpp new file mode 100644 index 00000000..c7f1719e --- /dev/null +++ b/examples/empty_project/project.hpp @@ -0,0 +1,53 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +// Contact: Jan Ciesko (jciesko@sandia.gov) +// +//@HEADER + +void comm_init(int argc, char* argv[]) { + int mpi_thread_level_available; + int mpi_thread_level_required = MPI_THREAD_MULTIPLE; + +#ifdef KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_SERIAL + mpi_thread_level_required = MPI_THREAD_SINGLE; +#endif + + MPI_Init_thread(&argc, &argv, mpi_thread_level_required, + &mpi_thread_level_available); + assert(mpi_thread_level_available >= mpi_thread_level_required); + +#ifdef KRS_ENABLE_SHMEMSPACE + shmem_init_thread(mpi_thread_level_required, &mpi_thread_level_available); + assert(mpi_thread_level_available >= mpi_thread_level_required); +#endif + +#ifdef KRS_ENABLE_NVSHMEMSPACE + MPI_Comm mpi_comm; + nvshmemx_init_attr_t attr; + mpi_comm = MPI_COMM_WORLD; + attr.mpi_comm = &mpi_comm; + nvshmemx_init_attr(NVSHMEMX_INIT_WITH_MPI_COMM, &attr); +#endif +} + +void comm_fini() { +#ifdef KRS_ENABLE_SHMEMSPACE + shmem_finalize(); +#endif +#ifdef KRS_ENABLE_NVSHMEMSPACE + nvshmem_finalize(); +#endif + MPI_Finalize(); +} diff --git a/examples/heat3d/mpi/CMakeLists.txt b/examples/heat3d/mpi/CMakeLists.txt index af4ab548..f0e95815 100755 --- a/examples/heat3d/mpi/CMakeLists.txt +++ b/examples/heat3d/mpi/CMakeLists.txt @@ -1,3 +1,3 @@ add_executable(mpi_heat3d heat3d.cpp) -target_link_libraries(mpi_heat3d PRIVATE Kokkos::kokkosremote) +target_link_libraries(mpi_heat3d PRIVATE Kokkos::kokkosremotespaces) target_include_directories(mpi_heat3d PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}) diff --git a/examples/heat3d/mpi/heat3d.cpp b/examples/heat3d/mpi/heat3d.cpp index 4c809d3e..d6415471 100644 --- a/examples/heat3d/mpi/heat3d.cpp +++ b/examples/heat3d/mpi/heat3d.cpp @@ -8,7 +8,7 @@ struct SpaceInstance { static bool overlap() { return false; } }; -#ifndef KOKKOS_ENABLE_DEBUG +#ifndef KOKKOS_REMOTE_SPACES_ENABLE_DEBUG #ifdef KOKKOS_ENABLE_CUDA template <> struct SpaceInstance { @@ -42,6 +42,9 @@ struct CommHelper { // My rank int me; + // N ranks + int nranks; + // My pos in proc grid int x, y, z; @@ -50,7 +53,7 @@ struct CommHelper { CommHelper(MPI_Comm comm_) { comm = comm_; - int nranks; + MPI_Comm_size(comm, &nranks); MPI_Comm_rank(comm, &me); @@ -71,10 +74,12 @@ struct CommHelper { front = z == 0 ? -1 : me - nx * ny; back = z == nz - 1 ? 
-1 : me + nx * ny; +#if KOKKOS_REMOTE_SPACES_ENABLE_DEBUG printf("NumRanks: %i Me: %i Grid: %i %i %i MyPos: %i %i %i\n", nranks, me, nx, ny, nz, x, y, z); printf("Me: %i MyNeighs: %i %i %i %i %i %i\n", me, left, right, down, up, front, back); +#endif } template @@ -121,7 +126,7 @@ struct System { Kokkos::View T, dT; // Halo data using buffer_t = - Kokkos::View; + Kokkos::View; buffer_t T_left, T_right, T_up, T_down, T_front, T_back; buffer_t T_left_out, T_right_out, T_up_out, T_down_out, T_front_out, T_back_out; @@ -152,21 +157,25 @@ struct System { X_lo = Y_lo = Z_lo = 0; X_hi = Y_hi = Z_hi = X; N = 10000; - I = 100; - T = Kokkos::View(); - dT = Kokkos::View(); - T0 = 0.0; - dt = 0.1; - q = 1.0; - sigma = 1.0; - P = 1.0; - E_left = SpaceInstance::create(); - E_right = SpaceInstance::create(); - E_up = SpaceInstance::create(); - E_down = SpaceInstance::create(); - E_front = SpaceInstance::create(); - E_back = SpaceInstance::create(); - E_bulk = SpaceInstance::create(); +#if KOKKOS_REMOTE_SPACES_ENABLE_DEBUG + I = 10; +#else + I = N - 1; +#endif + T = Kokkos::View(); + dT = Kokkos::View(); + T0 = 0.0; + dt = 0.1; + q = 1.0; + sigma = 1.0; + P = 1.0; + E_left = SpaceInstance::create(); + E_right = SpaceInstance::create(); + E_up = SpaceInstance::create(); + E_down = SpaceInstance::create(); + E_front = SpaceInstance::create(); + E_back = SpaceInstance::create(); + E_bulk = SpaceInstance::create(); } void destroy_exec_spaces() { @@ -193,8 +202,10 @@ struct System { Z_hi = Z_lo + dZ; if (Z_hi > Z) Z_hi = Z; +#if KOKKOS_REMOTE_SPACES_ENABLE_DEBUG printf("My Domain: %i (%i %i %i) (%i %i %i)\n", comm.me, X_lo, Y_lo, Z_lo, X_hi, Y_hi, Z_hi); +#endif T = Kokkos::View("System::T", X_hi - X_lo, Y_hi - Y_lo, Z_hi - Z_lo); dT = Kokkos::View("System::dT", T.extent(0), T.extent(1), @@ -271,32 +282,44 @@ struct System { void timestep() { Kokkos::Timer timer; double old_time = 0.0; + double time_all = 0.0; + double GUPs = 0.0; double time_a, time_b, time_c, time_d; - double time_inner, time_surface, time_compute; - time_inner = time_surface = time_compute = 0.0; + double time_inner, time_surface, time_update; + time_inner = time_surface = time_update = 0.0; for (int t = 0; t <= N; t++) { if (t > N / 2) P = 0.0; - pack_T_halo(); time_a = timer.seconds(); - compute_inner_dT(); + pack_T_halo(); // Overlap O1 + compute_inner_dT(); // Overlap O1 Kokkos::fence(); time_b = timer.seconds(); exchange_T_halo(); compute_surface_dT(); Kokkos::fence(); time_c = timer.seconds(); - double T_ave = compute_T(); + double T_ave = update_T(); time_d = timer.seconds(); time_inner += time_b - time_a; time_surface += time_c - time_b; - time_compute += time_d - time_c; + time_update += time_d - time_c; T_ave /= 1e-9 * (X * Y * Z); if ((t % I == 0 || t == N) && (comm.me == 0)) { double time = timer.seconds(); - printf("%i T=%lf Time (%lf %lf)\n", t, T_ave, time, time - old_time); - printf(" inner + surface: %lf compute: %lf\n", - time_inner + time_surface, time_compute); - old_time = time; + time_all += time - old_time; + GUPs += 1e-9 * (dT.size() / time_inner); +#if KOKKOS_REMOTE_SPACES_ENABLE_DEBUG + if ((t % I == 0 || t == N) && (comm.me == 0)) { +#else + if ((t == N) && (comm.me == 0)) { +#endif + printf("heat3D,Kokkos+MPI,%i,%i,%lf,%lf,%lf,%lf,%lf,%lf,%lf,%i,%f\n", + comm.nranks, t, T_ave, time_inner, time_surface, time_update, + time - old_time, /* time last iter */ + time_all, /* current runtime */ + GUPs / t, X, 1e-6 * (X * sizeof(double))); + old_time = time; + } } } } @@ -549,11 +572,10 @@ struct System { // Some 
compilers have deduction issues if this were just a tagged operator // So did a full Functor here instead - struct ComputeT { + struct UpdateT { Kokkos::View T, dT; double dt; - ComputeT(Kokkos::View T_, Kokkos::View dT_, - double dt_) + UpdateT(Kokkos::View T_, Kokkos::View dT_, double dt_) : T(T_), dT(dT_), dt(dt_) {} KOKKOS_FUNCTION void operator()(int x, int y, int z, double& sum_T) const { @@ -562,19 +584,19 @@ struct System { } }; - double compute_T() { + double update_T() { using policy_t = Kokkos::MDRangePolicy, Kokkos::IndexType>; - int X = T.extent(0); - int Y = T.extent(1); - int Z = T.extent(2); - double my_T; + int X = T.extent(0); + int Y = T.extent(1); + int Z = T.extent(2); + double my_T = 0.0; Kokkos::parallel_reduce( - "ComputeT", + "UpdateT", Kokkos::Experimental::require( policy_t(E_bulk, {0, 0, 0}, {X, Y, Z}, {10, 10, 10}), Kokkos::Experimental::WorkItemProperty::HintLightWeight), - ComputeT(T, dT, dt), my_T); + UpdateT(T, dT, dt), my_T); double sum_T; MPI_Allreduce(&my_T, &sum_T, 1, MPI_DOUBLE, MPI_SUM, comm.comm); return sum_T; diff --git a/examples/heat3d/opt_rma/CMakeLists.txt b/examples/heat3d/opt_rma/CMakeLists.txt index 9af638e6..d360bc86 100755 --- a/examples/heat3d/opt_rma/CMakeLists.txt +++ b/examples/heat3d/opt_rma/CMakeLists.txt @@ -1,4 +1,4 @@ add_executable(opt_rma_heat3d heat3d.cpp) -target_link_libraries(opt_rma_heat3d PRIVATE Kokkos::kokkosremote) +target_link_libraries(opt_rma_heat3d PRIVATE Kokkos::kokkosremotespaces) target_include_directories(opt_rma_heat3d PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}) target_compile_definitions(opt_rma_heat3d PRIVATE KRS_ENABLE_NVSHMEM_PTR) diff --git a/examples/heat3d/opt_rma/comm.hpp b/examples/heat3d/opt_rma/comm.hpp new file mode 100644 index 00000000..14abe07b --- /dev/null +++ b/examples/heat3d/opt_rma/comm.hpp @@ -0,0 +1,35 @@ +void comm_init(int argc, char* argv[]) { + int mpi_thread_level_available; + int mpi_thread_level_required = MPI_THREAD_MULTIPLE; + +#ifdef KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_SERIAL + mpi_thread_level_required = MPI_THREAD_SINGLE; +#endif + + MPI_Init_thread(&argc, &argv, mpi_thread_level_required, + &mpi_thread_level_available); + assert(mpi_thread_level_available >= mpi_thread_level_required); + +#ifdef KRS_ENABLE_SHMEMSPACE + shmem_init_thread(mpi_thread_level_required, &mpi_thread_level_available); + assert(mpi_thread_level_available >= mpi_thread_level_required); +#endif + +#ifdef KRS_ENABLE_NVSHMEMSPACE + MPI_Comm mpi_comm; + nvshmemx_init_attr_t attr; + mpi_comm = MPI_COMM_WORLD; + attr.mpi_comm = &mpi_comm; + nvshmemx_init_attr(NVSHMEMX_INIT_WITH_MPI_COMM, &attr); +#endif +} + +void comm_fini() { +#ifdef KRS_ENABLE_SHMEMSPACE + shmem_finalize(); +#endif +#ifdef KRS_ENABLE_NVSHMEMSPACE + nvshmem_finalize(); +#endif + MPI_Finalize(); +} \ No newline at end of file diff --git a/examples/heat3d/opt_rma/heat3d.cpp b/examples/heat3d/opt_rma/heat3d.cpp index ea9aa724..17541283 100644 --- a/examples/heat3d/opt_rma/heat3d.cpp +++ b/examples/heat3d/opt_rma/heat3d.cpp @@ -21,39 +21,10 @@ #include #include -template -struct SpaceInstance { - static ExecSpace create() { return ExecSpace(); } - static void destroy(ExecSpace&) {} - static bool overlap() { return false; } -}; - -#ifndef KOKKOS_ENABLE_DEBUG -#ifdef KOKKOS_ENABLE_CUDA -template <> -struct SpaceInstance { - static Kokkos::Cuda create() { - cudaStream_t stream; - cudaStreamCreate(&stream); - return Kokkos::Cuda(stream); - } - static void destroy(Kokkos::Cuda& space) { - cudaStream_t stream = space.cuda_stream(); - 
cudaStreamDestroy(stream); - } - static bool overlap() { /* returns true if you can overlap */ - bool value = true; - auto local_rank_str = std::getenv("CUDA_LAUNCH_BLOCKING"); - if (local_rank_str) { - value = (std::stoi(local_rank_str) == 0); - } - return value; - } -}; -#endif /* KOKKOS_ENABLE_CUDA */ -#endif /* KOKKOS_ENABLE_DEBUG */ +#include using RemoteSpace_t = Kokkos::Experimental::DefaultRemoteMemorySpace; +using LocalView_t = Kokkos::View; using RemoteView_t = Kokkos::View; using UnmanagedView_t = Kokkos::View>; @@ -98,10 +69,12 @@ struct CommHelper { left = right = down = up = front = back = -1; x = y = z = 0; +#if KOKKOS_REMOTE_SPACES_ENABLE_DEBUG printf("NumRanks: %i Me: %i (old Grid): %i %i %i MyPos: %i %i %i\n", nranks, me, nx, ny, nz, x, y, z); printf("Me: %d MyNeighbors: %i %i %i %i %i %i\n", me, left, right, down, up, front, back); +#endif } }; @@ -134,12 +107,11 @@ struct System { int I; // Temperature and delta Temperature - RemoteView_t T, dT; + LocalView_t dT; + RemoteView_t T; UnmanagedView_t dT_u; HostView_t T_h; - Kokkos::DefaultExecutionSpace E_bulk; - // Initial Temmperature double T0; @@ -166,20 +138,16 @@ struct System { my_lo_x = 0; my_hi_x = 0; N = 10000; - I = 100; - T_h = HostView_t(); - T = RemoteView_t(); - dT = RemoteView_t(); - dT_u = UnmanagedView_t(); - T0 = 0.0; - dt = 0.1; - q = 1.0; - sigma = 1.0; - P = 1.0; - E_bulk = SpaceInstance::create(); - } - void destroy_exec_spaces() { - SpaceInstance::destroy(E_bulk); +#if KOKKOS_REMOTE_SPACES_ENABLE_DEBUG + I = 10; +#else + I = N - 1; +#endif + T0 = 0.0; + dt = 0.1; + q = 1.0; + sigma = 1.0; + P = 1.0; } void setup_subdomain() { @@ -211,11 +179,13 @@ struct System { my_lo_x = local_range.first; my_hi_x = local_range.second + 1; +#if KOKKOS_REMOTE_SPACES_ENABLE_DEBUG printf("My Domain: %i (%i %i %i) (%i %i %i)\n", comm.me, my_lo_x, Y_lo, Z_lo, my_hi_x, Y_hi, Z_hi); +#endif T = RemoteView_t("System::T", dX, dY, dZ); T_h = HostView_t("Host::T", T.extent(0), dY, dZ); - dT = RemoteView_t("System::dT", dX, dY, dZ); + dT = LocalView_t("System::dT", T.extent(0), dY, dZ); dT_u = UnmanagedView_t(dT.data(), dX, dY, dZ); Kokkos::deep_copy(T_h, T0); @@ -260,91 +230,17 @@ struct System { return true; } - // only computethe inner updates - struct ComputeInnerDT {}; - KOKKOS_FUNCTION - void operator()(ComputeInnerDT, int x, int y, int z) const { - double dT_xyz = 0.0; - double T_xyz = T(x, y, z); - dT_xyz += q * (T(x - 1, y, z) - T_xyz); - dT_xyz += q * (T(x + 1, y, z) - T_xyz); - dT_xyz += q * (T(x, y - 1, z) - T_xyz); - dT_xyz += q * (T(x, y + 1, z) - T_xyz); - dT_xyz += q * (T(x, y, z - 1) - T_xyz); - dT_xyz += q * (T(x, y, z + 1) - T_xyz); - dT(x, y, z) = dT_xyz; - } - - void compute_inner_dT() { - using policy_t = - Kokkos::MDRangePolicy, ComputeInnerDT, int>; - Kokkos::parallel_for( - "ComputeInnerDT", - Kokkos::Experimental::require( - policy_t(E_bulk, {my_lo_x + 1, 1, 1}, {my_hi_x - 1, Y - 1, Z - 1}), - Kokkos::Experimental::WorkItemProperty::HintLightWeight), - *this); - } - // compute both inner and outer updates. This function is suitable for both. 
- struct ComputeAllDT {}; + struct ComputeDT {}; KOKKOS_FUNCTION - void operator()(ComputeAllDT, int x, int y, int z) const { - double dT_xyz = 0.0; - double T_xyz = T(x, y, z); - // printf("begin computeAllDT with x,y,z=(%i,%i,%i)\n", x, y, z); + void operator()(ComputeDT, int x, int y, int z) const { + double dT_xyz = 0.0; + double T_xyz = T(x, y, z); int num_surfaces = 0; -// # if LIKELY_IN_IF -#if 0 - if (x == 0) { - num_surfaces += 1; - // Incoming Power - if(X_lo == 0) dT_xyz += P; - } - else { - dT_xyz += q * (T(x-1,y ,z ) - T_xyz); - // printf("x access computeAllDT with x,y,z=(%i,%i,%i)\n", x, y, z); - } - - if (x == X-1) { - num_surfaces += 1; - } - else { - dT_xyz += q * (T(x+1,y ,z ) - T_xyz); - } - - if (y == 0) { - num_surfaces += 1; - } - else { - dT_xyz += q * (T(x ,y-1,z ) - T_xyz); - } - - if (y == Y-1) { - num_surfaces += 1; - } - else { - dT_xyz += q * (T(x ,y+1,z ) - T_xyz); - } - - if (z == 0) { - num_surfaces += 1; - } - else { - dT_xyz += q * (T(x ,y ,z-1) - T_xyz); - } - if (z == Z-1) { - num_surfaces += 1; - } - else { - dT_xyz += q * (T(x ,y ,z+1) - T_xyz); - } -#else if (x > 0) { dT_xyz += q * (T(x - 1, y, z) - T_xyz); - // printf("x access computeAllDT with x,y,z=(%i,%i,%i)\n", x, y, z); } else { num_surfaces += 1; // Incoming Power @@ -380,70 +276,18 @@ struct System { } else { num_surfaces += 1; } -#endif /* LIKELY_IN_IF */ // radiation dT_xyz -= sigma * T_xyz * T_xyz * T_xyz * T_xyz * num_surfaces; - dT(x, y, z) = dT_xyz; - // double saved = dT(x,y,z); - // printf("conclude computeAllDT with x,y,z=(%i,%i,%i) (%lf) (%lf)\n", x, y, - // z, dT_xyz, saved); - } - - void compute_outer_dT() { - using policy_t = Kokkos::MDRangePolicy, ComputeAllDT, int>; - // left - Kokkos::parallel_for( - "ComputeAllDTLeft", - Kokkos::Experimental::require( - policy_t(E_bulk, {my_lo_x, 0, 0}, {my_lo_x + 1, Y, Z}), - Kokkos::Experimental::WorkItemProperty::HintLightWeight), - *this); - // right - Kokkos::parallel_for( - "ComputeAllDTRight", - Kokkos::Experimental::require( - policy_t(E_bulk, {my_hi_x - 1, 0, 0}, {my_hi_x, Y, Z}), - Kokkos::Experimental::WorkItemProperty::HintLightWeight), - *this); - // bottom - Kokkos::parallel_for( - "ComputeAllDTBottom", - Kokkos::Experimental::require( - policy_t(E_bulk, {my_lo_x + 1, 0, 0}, {my_hi_x - 1, 1, Z}), - Kokkos::Experimental::WorkItemProperty::HintLightWeight), - *this); - // top - Kokkos::parallel_for( - "ComputeAllDTTop", - Kokkos::Experimental::require( - policy_t(E_bulk, {my_lo_x + 1, Y - 1, 0}, {my_hi_x - 1, Y, Z}), - Kokkos::Experimental::WorkItemProperty::HintLightWeight), - *this); - // front - Kokkos::parallel_for( - "ComputeAllDTFront", - Kokkos::Experimental::require( - policy_t(E_bulk, {my_lo_x + 1, 0 + 1, 0}, {my_hi_x - 1, Y - 1, 1}), - Kokkos::Experimental::WorkItemProperty::HintLightWeight), - *this); - // back - Kokkos::parallel_for( - "ComputeAllDTBack", - Kokkos::Experimental::require( - policy_t(E_bulk, {my_lo_x + 1, 0 + 1, Z - 1}, - {my_hi_x - 1, Y - 1, Z}), - Kokkos::Experimental::WorkItemProperty::HintLightWeight), - *this); } - void compute_all_dT() { - using policy_t = Kokkos::MDRangePolicy, ComputeAllDT, int>; + void compute_dT() { + using policy_t = Kokkos::MDRangePolicy, ComputeDT, int>; Kokkos::parallel_for( - "ComputeAllDT", + "ComputeDT", Kokkos::Experimental::require( - policy_t(E_bulk, {my_lo_x, 0, 0}, {my_hi_x, Y, Z}, {16, 8, 8}), + policy_t({my_lo_x, 0, 0}, {my_hi_x, Y, Z}, {16, 8, 8}), Kokkos::Experimental::WorkItemProperty::HintLightWeight), *this); } @@ -463,14 +307,14 @@ struct System { } }; - 
double compute_T() { + double update_T() { using policy_t = Kokkos::MDRangePolicy, Kokkos::IndexType>; double my_T; Kokkos::parallel_reduce( "ComputeT", Kokkos::Experimental::require( - policy_t(E_bulk, {my_lo_x, 0, 0}, {my_hi_x, Y, Z}, {10, 10, 10}), + policy_t({my_lo_x, 0, 0}, {my_hi_x, Y, Z}, {10, 10, 10}), Kokkos::Experimental::WorkItemProperty::HintLightWeight), computeT(T, dT_u, dt), my_T); double sum_T; @@ -485,71 +329,51 @@ struct System { void timestep() { Kokkos::Timer timer; double old_time = 0.0; - double time_a, time_c, time_d; - double time_compute, time_all; - time_all = time_compute = 0.0; + double GUPs = 0.0; + double time_a, time_b, time_c, time_update, time_compute, time_all; + time_all = time_update = time_compute = 0.0; for (int t = 0; t <= N; t++) { if (t > N / 2) P = 0.0; /* stop heat in halfway through */ time_a = timer.seconds(); - compute_all_dT(); + compute_dT(); RemoteSpace_t().fence(); - Kokkos::DefaultExecutionSpace().fence(); + time_b = timer.seconds(); + double T_ave = update_T(); time_c = timer.seconds(); - double T_ave = compute_T(); - time_d = timer.seconds(); - time_all += time_c - time_a; - time_compute += time_d - time_c; + time_compute += time_b - time_a; + time_update += time_c - time_b; T_ave /= 1e-9 * (X * Y * Z); if ((t % I == 0 || t == N) && (comm.me == 0)) { double time = timer.seconds(); - printf("%d T=%lf Time (%lf %lf)\n", t, T_ave, time, time - old_time); - printf(" inner + surface: %lf compute: %lf\n", time_all, - time_compute); - old_time = time; + time_all += time - old_time; + GUPs += 1e-9 * (dT.size() / time_compute); +#if KOKKOS_REMOTE_SPACES_ENABLE_DEBUG + if ((t % I == 0 || t == N) && (comm.me == 0)) { +#else + if ((t == N) && (comm.me == 0)) { +#endif + printf( + "heat3D,KokkosRemoteSpaces_localproxy,%i,%i,%lf,%lf,%lf,%lf,%lf,%" + "lf,%lf,%i,%f\n", + comm.nranks, t, T_ave, 0.0, time_compute, time_update, + time - old_time, /* time last iter */ + time_all, /* current runtime */ + GUPs / t, X, 1e-6 * (dT.size() * sizeof(double))); + old_time = time; + } } } } }; int main(int argc, char* argv[]) { - int mpi_thread_level_available; - int mpi_thread_level_required = MPI_THREAD_MULTIPLE; - -#ifdef KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_SERIAL - mpi_thread_level_required = MPI_THREAD_SINGLE; -#endif - - MPI_Init_thread(&argc, &argv, mpi_thread_level_required, - &mpi_thread_level_available); - assert(mpi_thread_level_available >= mpi_thread_level_required); - -#ifdef KRS_ENABLE_SHMEMSPACE - shmem_init_thread(mpi_thread_level_required, &mpi_thread_level_available); - assert(mpi_thread_level_available >= mpi_thread_level_required); -#endif - -#ifdef KRS_ENABLE_NVSHMEMSPACE - MPI_Comm mpi_comm; - nvshmemx_init_attr_t attr; - mpi_comm = MPI_COMM_WORLD; - attr.mpi_comm = &mpi_comm; - nvshmemx_init_attr(NVSHMEMX_INIT_WITH_MPI_COMM, &attr); -#endif - + comm_init(argc, argv); Kokkos::initialize(argc, argv); { System sys(MPI_COMM_WORLD); - if (sys.check_args(argc, argv)) sys.timestep(); - sys.destroy_exec_spaces(); } Kokkos::finalize(); -#ifdef KRS_ENABLE_SHMEMSPACE - shmem_finalize(); -#endif -#ifdef KRS_ENABLE_NVSHMEMSPACE - nvshmem_finalize(); -#endif - MPI_Finalize(); + comm_fini(); return 0; } diff --git a/examples/heat3d/partitioned_rma/CMakeLists.txt b/examples/heat3d/partitioned_rma/CMakeLists.txt index 6637aff2..7645ef42 100755 --- a/examples/heat3d/partitioned_rma/CMakeLists.txt +++ b/examples/heat3d/partitioned_rma/CMakeLists.txt @@ -1,4 +1,4 @@ add_executable(partitioned_heat3d heat3d.cpp) -target_link_libraries(partitioned_heat3d PRIVATE 
Kokkos::kokkosremote) +target_link_libraries(partitioned_heat3d PRIVATE Kokkos::kokkosremotespaces) target_include_directories(partitioned_heat3d PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}) target_compile_definitions(partitioned_heat3d PRIVATE KRS_ENABLE_NVSHMEM_PTR) diff --git a/examples/heat3d/partitioned_rma/comm.hpp b/examples/heat3d/partitioned_rma/comm.hpp new file mode 100644 index 00000000..14abe07b --- /dev/null +++ b/examples/heat3d/partitioned_rma/comm.hpp @@ -0,0 +1,35 @@ +void comm_init(int argc, char* argv[]) { + int mpi_thread_level_available; + int mpi_thread_level_required = MPI_THREAD_MULTIPLE; + +#ifdef KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_SERIAL + mpi_thread_level_required = MPI_THREAD_SINGLE; +#endif + + MPI_Init_thread(&argc, &argv, mpi_thread_level_required, + &mpi_thread_level_available); + assert(mpi_thread_level_available >= mpi_thread_level_required); + +#ifdef KRS_ENABLE_SHMEMSPACE + shmem_init_thread(mpi_thread_level_required, &mpi_thread_level_available); + assert(mpi_thread_level_available >= mpi_thread_level_required); +#endif + +#ifdef KRS_ENABLE_NVSHMEMSPACE + MPI_Comm mpi_comm; + nvshmemx_init_attr_t attr; + mpi_comm = MPI_COMM_WORLD; + attr.mpi_comm = &mpi_comm; + nvshmemx_init_attr(NVSHMEMX_INIT_WITH_MPI_COMM, &attr); +#endif +} + +void comm_fini() { +#ifdef KRS_ENABLE_SHMEMSPACE + shmem_finalize(); +#endif +#ifdef KRS_ENABLE_NVSHMEMSPACE + nvshmem_finalize(); +#endif + MPI_Finalize(); +} \ No newline at end of file diff --git a/examples/heat3d/partitioned_rma/heat3d.cpp b/examples/heat3d/partitioned_rma/heat3d.cpp index 17b319e1..9119ca02 100644 --- a/examples/heat3d/partitioned_rma/heat3d.cpp +++ b/examples/heat3d/partitioned_rma/heat3d.cpp @@ -21,39 +21,10 @@ #include #include -template -struct SpaceInstance { - static ExecSpace create() { return ExecSpace(); } - static void destroy(ExecSpace&) {} - static bool overlap() { return false; } -}; - -#ifndef KOKKOS_ENABLE_DEBUG -#ifdef KOKKOS_ENABLE_CUDA -template <> -struct SpaceInstance { - static Kokkos::Cuda create() { - cudaStream_t stream; - cudaStreamCreate(&stream); - return Kokkos::Cuda(stream); - } - static void destroy(Kokkos::Cuda& space) { - cudaStream_t stream = space.cuda_stream(); - cudaStreamDestroy(stream); - } - static bool overlap() { /* returns true if you can overlap */ - bool value = true; - auto local_rank_str = std::getenv("CUDA_LAUNCH_BLOCKING"); - if (local_rank_str) { - value = (std::stoi(local_rank_str) == 0); - } - return value; - } -}; -#endif /* KOKKOS_ENABLE_CUDA */ -#endif /* KOKKOS_ENABLE_DEBUG */ +#include using RemoteSpace_t = Kokkos::Experimental::DefaultRemoteMemorySpace; +using LocalView_t = Kokkos::View; using RemoteView_t = Kokkos::View; using HostView_t = @@ -95,10 +66,12 @@ struct CommHelper { front = (z == 0) ? -1 : me - nx * ny; back = (z == nz - 1) ? -1 : me + nx * ny; +#if KOKKOS_REMOTE_SPACES_ENABLE_DEBUG printf("NumRanks: %i Me: %i Grid: %i %i %i MyPos: %i %i %i\n", nranks, me, nx, ny, nz, x, y, z); printf("Me: %d MyNeighbors: %i %i %i %i %i %i\n", me, left, right, down, up, front, back); +#endif } }; @@ -131,13 +104,10 @@ struct System { int I; // Temperature and delta Temperature - RemoteView_t T, dT; + RemoteView_t T; + LocalView_t dT; HostView_t T_h; - // TODO: what with this? 
- Kokkos::DefaultExecutionSpace E_bulk; - // Kokkos::DefaultExecutionSpace::memory_space E_bulk; - // Initial Temmperature double T0; @@ -165,22 +135,18 @@ struct System { Y_ra = Y; Z_ra = Z; N = 10000; - I = 100; - T_h = HostView_t(); - T = RemoteView_t(); - dT = RemoteView_t(); - T0 = 0.0; - dt = 0.1; - q = 1.0; - sigma = 1.0; - P = 1.0; - E_bulk = SpaceInstance::create(); - // E_bulk = - // SpaceInstance::create(); - } - void destroy_exec_spaces() { - SpaceInstance::destroy(E_bulk); +#if KOKKOS_REMOTE_SPACES_ENABLE_DEBUG + I = 10; +#else + I = N - 1; +#endif + T0 = 0.0; + dt = 0.1; + q = 1.0; + sigma = 1.0; + P = 1.0; } + void setup_subdomain() { int dX = (X + comm.nx - 1) / comm.nx; /* Divide the space up to each MPI rank */ @@ -201,12 +167,13 @@ struct System { if (Z_hi > Z) Z_hi = Z; Z_ra = Z_hi - Z_lo; +#if KOKKOS_REMOTE_SPACES_ENABLE_DEBUG printf("My Domain: %i (%i %i %i) (%i %i %i)\n", comm.me, X_lo, Y_lo, Z_lo, X_hi, Y_hi, Z_hi); +#endif T_h = HostView_t("Host::T", 1, dX, dY, dZ); T = RemoteView_t("System::T", comm.nranks, dX, dY, dZ); - dT = RemoteView_t("System::dT", comm.nranks, T.extent(1), T.extent(2), - T.extent(3)); + dT = LocalView_t("System::dT", 1, T.extent(1), T.extent(2), T.extent(3)); Kokkos::deep_copy(T_h, T0); Kokkos::deep_copy(T, T_h); } @@ -274,7 +241,7 @@ struct System { Kokkos::parallel_for( "ComputerInnerDT", Kokkos::Experimental::require( - policy_t(E_bulk, {1, 1, 1}, {myX - 1, myY - 1, myZ - 1}), + policy_t({1, 1, 1}, {myX - 1, myY - 1, myZ - 1}), Kokkos::Experimental::WorkItemProperty::HintLightWeight), *this); } @@ -362,12 +329,6 @@ struct System { ((y == (myY - 1) && Y_hi == Y) ? 1 : 0) + ((z == 0 && Z_lo == 0) ? 1 : 0) + ((z == (myZ - 1) && Z_hi == Z) ? 1 : 0); - // printf("point xyz=(%i %i %i) on process %d with %d surfaces\n" - // " (xyzlo=(%i %i %i)\n" - // " (xyzhi=(%i %i %i)\n\n", - // x,y,z,comm.me,num_surfaces, - // X_lo, Y_lo, Z_lo, - // X_hi, Y_hi, Z_hi); dT_xyz -= sigma * T_xyz * T_xyz * T_xyz * T_xyz * num_surfaces; dT(comm.me, x, y, z) = dT_xyz; } @@ -392,37 +353,37 @@ struct System { Kokkos::parallel_for( "ComputeSurfaceDT_Left", Kokkos::Experimental::require( - policy_left_t(E_bulk, {0, 0}, {Y, Z}), + policy_left_t({0, 0}, {Y, Z}), Kokkos::Experimental::WorkItemProperty::HintLightWeight), *this); Kokkos::parallel_for( "ComputeSurfaceDT_Right", Kokkos::Experimental::require( - policy_right_t(E_bulk, {0, 0}, {Y, Z}), + policy_right_t({0, 0}, {Y, Z}), Kokkos::Experimental::WorkItemProperty::HintLightWeight), *this); Kokkos::parallel_for( "ComputeSurfaceDT_Down", Kokkos::Experimental::require( - policy_down_t(E_bulk, {1, 0}, {X - 1, Z}), + policy_down_t({1, 0}, {X - 1, Z}), Kokkos::Experimental::WorkItemProperty::HintLightWeight), *this); Kokkos::parallel_for( "ComputeSurfaceDT_Up", Kokkos::Experimental::require( - policy_up_t(E_bulk, {1, 0}, {X - 1, Z}), + policy_up_t({1, 0}, {X - 1, Z}), Kokkos::Experimental::WorkItemProperty::HintLightWeight), *this); Kokkos::parallel_for( "ComputeSurfaceDT_Front", Kokkos::Experimental::require( - policy_front_t(E_bulk, {1, 1}, {X - 1, Y - 1}), + policy_front_t({1, 1}, {X - 1, Y - 1}), Kokkos::Experimental::WorkItemProperty::HintLightWeight), *this); Kokkos::parallel_for( "ComputeSurfaceDT_Back", Kokkos::Experimental::require( - policy_back_t(E_bulk, {1, 1}, {X - 1, Y - 1}), + policy_back_t({1, 1}, {X - 1, Y - 1}), Kokkos::Experimental::WorkItemProperty::HintLightWeight), *this); } @@ -430,19 +391,19 @@ struct System { // Some compilers have deduction issues if this were just a tagget operator // So it is 
instead a full Functor struct computeT { - RemoteView_t T, dT; + RemoteView_t T; + LocalView_t dT; double dt; - computeT(RemoteView_t T_, RemoteView_t dT_, double dt_) + computeT(RemoteView_t T_, LocalView_t dT_, double dt_) : T(T_), dT(dT_), dt(dt_) {} KOKKOS_FUNCTION void operator()(int me, int x, int y, int z, double& sum_T) const { sum_T += T(me, x, y, z); - // printf("just added %lf\n", T(me,x,y,z)); T(me, x, y, z) += dt * dT(me, x, y, z); } }; - double compute_T() { + double update_T() { using policy_t = Kokkos::MDRangePolicy, Kokkos::IndexType>; int X = X_ra; @@ -452,7 +413,7 @@ struct System { Kokkos::parallel_reduce( "ComputeT", Kokkos::Experimental::require( - policy_t(E_bulk, {comm.me, 0, 0, 0}, {comm.me + 1, X, Y, Z}), + policy_t({comm.me, 0, 0, 0}, {comm.me + 1, X, Y, Z}), Kokkos::Experimental::WorkItemProperty::HintLightWeight), computeT(T, dT, dt), my_T); double sum_T; @@ -465,89 +426,53 @@ struct System { void timestep() { Kokkos::Timer timer; double old_time = 0.0; - double time_a, time_b, time_c, time_d; - double time_inner, time_surface, time_compute; - time_inner = time_surface = time_compute = 0.0; + double GUPs = 0.0; + double time_a, time_b, time_c, time_update, time_compute, time_all; + time_all = time_update = time_compute = 0.0; for (int t = 0; t <= N; t++) { if (t > N / 2) P = 0.0; /* stop heat in halfway through */ time_a = timer.seconds(); computer_inner_dT(); - Kokkos::fence(); - time_b = timer.seconds(); + RemoteSpace_t().fence(); compute_surface_dT(); - Kokkos::fence(); + RemoteSpace_t().fence(); + time_b = timer.seconds(); + double T_ave = update_T(); time_c = timer.seconds(); - double T_ave = compute_T(); - time_d = timer.seconds(); - time_inner += time_b - time_a; - time_surface += time_c - time_b; - time_compute += time_d - time_c; + time_compute += time_b - time_a; + time_update += time_c - time_b; T_ave /= 1e-9 * (X * Y * Z); - // if((t%I == 0 || t == N)) - // { - // Kokkos::deep_copy(T_h, T); - // printf("process (%d) my T(0,0,0,0): %lf\n", comm.me,T_h(0, 0, 0, - // 0)); printf("process (%d) my T(0,0,1,1): %lf\n", comm.me,T_h(0, 0, - // 1, 1)); printf("process (%d) my T(1,0,0,0): %lf\n", comm.me,T_h(0, - // 0, 0, 0)); printf("process (%d) my T(0,1,0,0): %lf\n", - // comm.me,T_h(0, 1, 0, 0)); printf("process (%d) my T(0,1,1,1): %lf\n", - // comm.me,T_h(0, 1, 1, 1)); printf("process (%d) my T(0,50,50,50): - // %lf\n", comm.me,T_h(0, 50, 50, 50)); - // } if ((t % I == 0 || t == N) && (comm.me == 0)) { double time = timer.seconds(); - printf("%d T=%lf Time (%lf %lf)\n", t, T_ave, time, time - old_time); - printf(" inner: %lf surface: %lf compute: %lf\n", time_inner, - time_surface, time_compute); - old_time = time; + time_all += time - old_time; + GUPs += 1e-9 * (dT.size() / time_compute); +#if KOKKOS_REMOTE_SPACES_ENABLE_DEBUG + if ((t % I == 0 || t == N) && (comm.me == 0)) { +#else + if ((t == N) && (comm.me == 0)) { +#endif + printf( + "heat3D,KokkosRemoteSpaces_partitioned,%i,%i,%lf,%lf,%lf,%lf,%lf," + "%lf,%lf,%i,%f\n", + comm.nranks, t, T_ave, 0.0, time_compute, time_update, + time - old_time, /* time last iter */ + time_all, /* current runtime */ + GUPs / t, X, 1e-6 * (dT.size() * sizeof(double))); + old_time = time; + } } } } }; int main(int argc, char* argv[]) { - int mpi_thread_level_available; - int mpi_thread_level_required = MPI_THREAD_MULTIPLE; - -#ifdef KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_SERIAL - mpi_thread_level_required = MPI_THREAD_SINGLE; -#endif - - MPI_Init_thread(&argc, &argv, mpi_thread_level_required, - &mpi_thread_level_available); 
- assert(mpi_thread_level_available >= mpi_thread_level_required); - -#ifdef KRS_ENABLE_SHMEMSPACE - shmem_init_thread(mpi_thread_level_required, &mpi_thread_level_available); - assert(mpi_thread_level_available >= mpi_thread_level_required); -#endif - -#ifdef KRS_ENABLE_NVSHMEMSPACE - MPI_Comm mpi_comm; - nvshmemx_init_attr_t attr; - mpi_comm = MPI_COMM_WORLD; - attr.mpi_comm = &mpi_comm; - nvshmemx_init_attr(NVSHMEMX_INIT_WITH_MPI_COMM, &attr); -#endif - - int myPE, numPEs; - MPI_Comm_rank(MPI_COMM_WORLD, &myPE); - MPI_Comm_size(MPI_COMM_WORLD, &numPEs); - + comm_init(argc, argv); Kokkos::initialize(argc, argv); { System sys(MPI_COMM_WORLD); - if (sys.check_args(argc, argv)) sys.timestep(); - sys.destroy_exec_spaces(); } Kokkos::finalize(); -#ifdef KRS_ENABLE_SHMEMSPACE - shmem_finalize(); -#endif -#ifdef KRS_ENABLE_NVSHMEMSPACE - nvshmem_finalize(); -#endif - MPI_Finalize(); + comm_fini(); return 0; } diff --git a/examples/heat3d/rma/CMakeLists.txt b/examples/heat3d/rma/CMakeLists.txt index ff514542..5f077946 100755 --- a/examples/heat3d/rma/CMakeLists.txt +++ b/examples/heat3d/rma/CMakeLists.txt @@ -1,4 +1,4 @@ add_executable(rma_heat3d heat3d.cpp) -target_link_libraries(rma_heat3d PRIVATE Kokkos::kokkosremote) +target_link_libraries(rma_heat3d PRIVATE Kokkos::kokkosremotespaces) target_include_directories(rma_heat3d PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}) target_compile_definitions(rma_heat3d PRIVATE KRS_ENABLE_NVSHMEM_PTR) diff --git a/examples/heat3d/rma/comm.hpp b/examples/heat3d/rma/comm.hpp new file mode 100644 index 00000000..14abe07b --- /dev/null +++ b/examples/heat3d/rma/comm.hpp @@ -0,0 +1,35 @@ +void comm_init(int argc, char* argv[]) { + int mpi_thread_level_available; + int mpi_thread_level_required = MPI_THREAD_MULTIPLE; + +#ifdef KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_SERIAL + mpi_thread_level_required = MPI_THREAD_SINGLE; +#endif + + MPI_Init_thread(&argc, &argv, mpi_thread_level_required, + &mpi_thread_level_available); + assert(mpi_thread_level_available >= mpi_thread_level_required); + +#ifdef KRS_ENABLE_SHMEMSPACE + shmem_init_thread(mpi_thread_level_required, &mpi_thread_level_available); + assert(mpi_thread_level_available >= mpi_thread_level_required); +#endif + +#ifdef KRS_ENABLE_NVSHMEMSPACE + MPI_Comm mpi_comm; + nvshmemx_init_attr_t attr; + mpi_comm = MPI_COMM_WORLD; + attr.mpi_comm = &mpi_comm; + nvshmemx_init_attr(NVSHMEMX_INIT_WITH_MPI_COMM, &attr); +#endif +} + +void comm_fini() { +#ifdef KRS_ENABLE_SHMEMSPACE + shmem_finalize(); +#endif +#ifdef KRS_ENABLE_NVSHMEMSPACE + nvshmem_finalize(); +#endif + MPI_Finalize(); +} \ No newline at end of file diff --git a/examples/heat3d/rma/heat3d.cpp b/examples/heat3d/rma/heat3d.cpp index 2659622a..306ede17 100644 --- a/examples/heat3d/rma/heat3d.cpp +++ b/examples/heat3d/rma/heat3d.cpp @@ -21,42 +21,12 @@ #include #include -template -struct SpaceInstance { - static ExecSpace create() { return ExecSpace(); } - static void destroy(ExecSpace&) {} - static bool overlap() { return false; } -}; - -#ifndef KOKKOS_ENABLE_DEBUG -#ifdef KOKKOS_ENABLE_CUDA -template <> -struct SpaceInstance { - static Kokkos::Cuda create() { - cudaStream_t stream; - cudaStreamCreate(&stream); - return Kokkos::Cuda(stream); - } - static void destroy(Kokkos::Cuda& space) { - cudaStream_t stream = space.cuda_stream(); - cudaStreamDestroy(stream); - } - static bool overlap() { /* returns true if you can overlap */ - bool value = true; - auto local_rank_str = std::getenv("CUDA_LAUNCH_BLOCKING"); - if (local_rank_str) { - value = (std::stoi(local_rank_str) 
== 0); - } - return value; - } -}; -#endif /* KOKKOS_ENABLE_CUDA */ -#endif /* KOKKOS_ENABLE_DEBUG */ +#include using RemoteSpace_t = Kokkos::Experimental::DefaultRemoteMemorySpace; +using LocalView_t = Kokkos::View; using RemoteView_t = Kokkos::View; using HostView_t = typename RemoteView_t::HostMirror; -// Kokkos::View; struct CommHelper { MPI_Comm comm; @@ -96,11 +66,12 @@ struct CommHelper { left = right = down = up = front = back = -1; x = y = z = 0; - +#if KOKKOS_REMOTE_SPACES_ENABLE_DEBUG printf("NumRanks: %i Me: %i (old Grid): %i %i %i MyPos: %i %i %i\n", nranks, me, nx, ny, nz, x, y, z); printf("Me: %d MyNeighbors: %i %i %i %i %i %i\n", me, left, right, down, up, front, back); +#endif } }; @@ -134,11 +105,10 @@ struct System { int I; // Temperature and delta Temperature - RemoteView_t T, dT; + RemoteView_t T; + LocalView_t dT; HostView_t T_h; - Kokkos::DefaultExecutionSpace E_bulk; - // Initial Temmperature double T0; @@ -168,19 +138,16 @@ struct System { my_lo_x = 0; my_hi_x = 0; N = 10000; - I = 100; - T_h = HostView_t(); - T = RemoteView_t(); - dT = RemoteView_t(); - T0 = 0.0; - dt = 0.1; - q = 1.0; - sigma = 1.0; - P = 1.0; - E_bulk = SpaceInstance::create(); - } - void destroy_exec_spaces() { - SpaceInstance::destroy(E_bulk); +#if KOKKOS_REMOTE_SPACES_ENABLE_DEBUG + I = 10; +#else + I = N - 1; +#endif + T0 = 0.0; + dt = 0.1; + q = 1.0; + sigma = 1.0; + P = 1.0; } void setup_subdomain() { @@ -218,11 +185,13 @@ struct System { my_lo_x = local_range.first; my_hi_x = local_range.second + 1; +#if KOKKOS_REMOTE_SPACES_ENABLE_DEBUG printf("My Domain: %i (%i %i %i) (%i %i %i)\n", comm.me, my_lo_x, Y_lo, Z_lo, my_hi_x, Y_hi, Z_hi); +#endif T = RemoteView_t("System::T", dX, dY, dZ); T_h = HostView_t("Host::T", T.extent(0), dY, dZ); - dT = RemoteView_t("System::dT", dX, dY, dZ); + dT = LocalView_t("System::dT", dX, dY, dZ); Kokkos::deep_copy(T_h, T0); Kokkos::deep_copy(T, T_h); @@ -266,91 +235,17 @@ struct System { return true; } - // only computethe inner updates - struct ComputeInnerDT {}; - KOKKOS_FUNCTION - void operator()(ComputeInnerDT, int x, int y, int z) const { - double dT_xyz = 0.0; - double T_xyz = T(x, y, z); - dT_xyz += q * (T(x - 1, y, z) - T_xyz); - dT_xyz += q * (T(x + 1, y, z) - T_xyz); - dT_xyz += q * (T(x, y - 1, z) - T_xyz); - dT_xyz += q * (T(x, y + 1, z) - T_xyz); - dT_xyz += q * (T(x, y, z - 1) - T_xyz); - dT_xyz += q * (T(x, y, z + 1) - T_xyz); - dT(x, y, z) = dT_xyz; - } - - void compute_inner_dT() { - using policy_t = - Kokkos::MDRangePolicy, ComputeInnerDT, int>; - Kokkos::parallel_for( - "ComputeInnerDT", - Kokkos::Experimental::require( - policy_t(E_bulk, {my_lo_x + 1, 1, 1}, {my_hi_x - 1, Y - 1, Z - 1}), - Kokkos::Experimental::WorkItemProperty::HintLightWeight), - *this); - } - // compute both inner and outer updates. This function is suitable for both. 
- struct ComputeAllDT {}; + struct ComputeDT {}; KOKKOS_FUNCTION - void operator()(ComputeAllDT, int x, int y, int z) const { - double dT_xyz = 0.0; - double T_xyz = T(x, y, z); - // printf("begin computeAllDT with x,y,z=(%i,%i,%i)\n", x, y, z); + void operator()(ComputeDT, int x, int y, int z) const { + double dT_xyz = 0.0; + double T_xyz = T(x, y, z); int num_surfaces = 0; -// # if LIKELY_IN_IF -#if 0 - if (x == 0) { - num_surfaces += 1; - // Incoming Power - if(X_lo == 0) dT_xyz += P; - } - else { - dT_xyz += q * (T(x-1,y ,z ) - T_xyz); - // printf("x access computeAllDT with x,y,z=(%i,%i,%i)\n", x, y, z); - } - - if (x == X-1) { - num_surfaces += 1; - } - else { - dT_xyz += q * (T(x+1,y ,z ) - T_xyz); - } - - if (y == 0) { - num_surfaces += 1; - } - else { - dT_xyz += q * (T(x ,y-1,z ) - T_xyz); - } - - if (y == Y-1) { - num_surfaces += 1; - } - else { - dT_xyz += q * (T(x ,y+1,z ) - T_xyz); - } - - if (z == 0) { - num_surfaces += 1; - } - else { - dT_xyz += q * (T(x ,y ,z-1) - T_xyz); - } - if (z == Z-1) { - num_surfaces += 1; - } - else { - dT_xyz += q * (T(x ,y ,z+1) - T_xyz); - } -#else if (x > 0) { dT_xyz += q * (T(x - 1, y, z) - T_xyz); - // printf("x access computeAllDT with x,y,z=(%i,%i,%i)\n", x, y, z); } else { num_surfaces += 1; // Incoming Power @@ -386,80 +281,30 @@ struct System { } else { num_surfaces += 1; } -#endif /* LIKELY_IN_IF */ // radiation dT_xyz -= sigma * T_xyz * T_xyz * T_xyz * T_xyz * num_surfaces; dT(x, y, z) = dT_xyz; - // double saved = dT(x,y,z); - // printf("conclude computeAllDT with x,y,z=(%i,%i,%i) (%lf) (%lf)\n", x, y, - // z, dT_xyz, saved); - } - - void compute_outer_dT() { - using policy_t = Kokkos::MDRangePolicy, ComputeAllDT, int>; - // left - Kokkos::parallel_for( - "ComputeAllDTLeft", - Kokkos::Experimental::require( - policy_t(E_bulk, {my_lo_x, 0, 0}, {my_lo_x + 1, Y, Z}), - Kokkos::Experimental::WorkItemProperty::HintLightWeight), - *this); - // right - Kokkos::parallel_for( - "ComputeAllDTRight", - Kokkos::Experimental::require( - policy_t(E_bulk, {my_hi_x - 1, 0, 0}, {my_hi_x, Y, Z}), - Kokkos::Experimental::WorkItemProperty::HintLightWeight), - *this); - // bottom - Kokkos::parallel_for( - "ComputeAllDTBottom", - Kokkos::Experimental::require( - policy_t(E_bulk, {my_lo_x + 1, 0, 0}, {my_hi_x - 1, 1, Z}), - Kokkos::Experimental::WorkItemProperty::HintLightWeight), - *this); - // top - Kokkos::parallel_for( - "ComputeAllDTTop", - Kokkos::Experimental::require( - policy_t(E_bulk, {my_lo_x + 1, Y - 1, 0}, {my_hi_x - 1, Y, Z}), - Kokkos::Experimental::WorkItemProperty::HintLightWeight), - *this); - // front - Kokkos::parallel_for( - "ComputeAllDTFront", - Kokkos::Experimental::require( - policy_t(E_bulk, {my_lo_x + 1, 0 + 1, 0}, {my_hi_x - 1, Y - 1, 1}), - Kokkos::Experimental::WorkItemProperty::HintLightWeight), - *this); - // back - Kokkos::parallel_for( - "ComputeAllDTBack", - Kokkos::Experimental::require( - policy_t(E_bulk, {my_lo_x + 1, 0 + 1, Z - 1}, - {my_hi_x - 1, Y - 1, Z}), - Kokkos::Experimental::WorkItemProperty::HintLightWeight), - *this); } - void compute_all_dT() { - using policy_t = Kokkos::MDRangePolicy, ComputeAllDT, int>; + void compute_dT() { + using policy_t = Kokkos::MDRangePolicy, ComputeDT, int>; Kokkos::parallel_for( - "ComputeAllDT", + "ComputeDT", Kokkos::Experimental::require( - policy_t(E_bulk, {my_lo_x, 0, 0}, {my_hi_x, Y, Z}, {16, 8, 8}), + policy_t({my_lo_x, 0, 0}, {my_hi_x, Y, Z}, {16, 8, 8}), Kokkos::Experimental::WorkItemProperty::HintLightWeight), *this); } // Some compilers have deduction issues if this 
were just a tagget operator // So it is instead a full Functor - struct computeT { - RemoteView_t T, dT; + struct updateT { + RemoteView_t T; + LocalView_t dT; double dt; - computeT(RemoteView_t T_, RemoteView_t dT_, double dt_) + updateT(RemoteView_t T_, LocalView_t dT_, double dt_) : T(T_), dT(dT_), dt(dt_) {} KOKKOS_FUNCTION void operator()(int x, int y, int z, double& sum_T) const { @@ -468,19 +313,19 @@ struct System { } }; - double compute_T() { + double update_T() { using policy_t = Kokkos::MDRangePolicy, Kokkos::IndexType>; double my_T; Kokkos::parallel_reduce( "ComputeT", Kokkos::Experimental::require( - policy_t(E_bulk, {my_lo_x, 0, 0}, {my_hi_x, Y, Z}, {10, 10, 10}), + policy_t({my_lo_x, 0, 0}, {my_hi_x, Y, Z}, {10, 10, 10}), Kokkos::Experimental::WorkItemProperty::HintLightWeight), - computeT(T, dT, dt), my_T); + updateT(T, dT, dt), my_T); double sum_T; RemoteSpace_t().fence(); - Kokkos::DefaultExecutionSpace().fence(); + Kokkos::fence(); MPI_Allreduce(&my_T, &sum_T, 1, MPI_DOUBLE, MPI_SUM, comm.comm); /* also a barrier */ return sum_T; @@ -490,71 +335,51 @@ struct System { void timestep() { Kokkos::Timer timer; double old_time = 0.0; - double time_a, time_c, time_d; - double time_compute, time_all; - time_all = time_compute = 0.0; + double GUPs = 0.0; + double time_a, time_b, time_c, time_update, time_compute, time_all; + time_all = time_update = time_compute = 0.0; for (int t = 0; t <= N; t++) { if (t > N / 2) P = 0.0; /* stop heat in halfway through */ time_a = timer.seconds(); - compute_all_dT(); + compute_dT(); RemoteSpace_t().fence(); - Kokkos::DefaultExecutionSpace().fence(); + time_b = timer.seconds(); + double T_ave = update_T(); time_c = timer.seconds(); - double T_ave = compute_T(); - time_d = timer.seconds(); - time_all += time_c - time_a; - time_compute += time_d - time_c; + time_compute += time_b - time_a; + time_update += time_c - time_b; T_ave /= 1e-9 * (X * Y * Z); if ((t % I == 0 || t == N) && (comm.me == 0)) { double time = timer.seconds(); - printf("%d T=%lf Time (%lf %lf)\n", t, T_ave, time, time - old_time); - printf(" inner + surface: %lf compute: %lf\n", time_all, - time_compute); - old_time = time; + time_all += time - old_time; + GUPs += 1e-9 * (dT.size() / time_compute); +#if KOKKOS_REMOTE_SPACES_ENABLE_DEBUG + if ((t % I == 0 || t == N) && (comm.me == 0)) { +#else + if ((t == N) && (comm.me == 0)) { +#endif + printf( + "heat3D,KokkosRemoteSpaces,%i,%i,%lf,%lf,%lf,%lf,%lf,%lf,%lf,%i,%" + "f\n", + comm.nranks, t, T_ave, 0.0, time_compute, time_update, + time - old_time, /* time last iter */ + time_all, /* current runtime */ + GUPs / t, X, 1e-6 * (dT.size() * sizeof(double))); + old_time = time; + } } } } }; int main(int argc, char* argv[]) { - int mpi_thread_level_available; - int mpi_thread_level_required = MPI_THREAD_MULTIPLE; - -#ifdef KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_SERIAL - mpi_thread_level_required = MPI_THREAD_SINGLE; -#endif - - MPI_Init_thread(&argc, &argv, mpi_thread_level_required, - &mpi_thread_level_available); - assert(mpi_thread_level_available >= mpi_thread_level_required); - -#ifdef KRS_ENABLE_SHMEMSPACE - shmem_init_thread(mpi_thread_level_required, &mpi_thread_level_available); - assert(mpi_thread_level_available >= mpi_thread_level_required); -#endif - -#ifdef KRS_ENABLE_NVSHMEMSPACE - MPI_Comm mpi_comm; - nvshmemx_init_attr_t attr; - mpi_comm = MPI_COMM_WORLD; - attr.mpi_comm = &mpi_comm; - nvshmemx_init_attr(NVSHMEMX_INIT_WITH_MPI_COMM, &attr); -#endif - + comm_init(argc, argv); Kokkos::initialize(argc, argv); { System 
sys(MPI_COMM_WORLD); - if (sys.check_args(argc, argv)) sys.timestep(); - sys.destroy_exec_spaces(); } Kokkos::finalize(); -#ifdef KRS_ENABLE_SHMEMSPACE - shmem_finalize(); -#endif -#ifdef KRS_ENABLE_NVSHMEMSPACE - nvshmem_finalize(); -#endif - MPI_Finalize(); + comm_fini(); return 0; } diff --git a/examples/heat3d/scripts/run_over_size.sh b/examples/heat3d/scripts/run_over_size.sh new file mode 100644 index 00000000..f8fee846 --- /dev/null +++ b/examples/heat3d/scripts/run_over_size.sh @@ -0,0 +1,154 @@ +#/bin/bash +BENCHMARK=$1 +HOST1=$2 +HOST2=$3 +HOST3=$4 +HOST4=$5 + +DEFAULT_SIZE=10 + +#exports +export OMP_PROC_BIND=spread +export OMP_PLACES=threads +export OMP_NUM_THREADS=32 + +ITERS=2500 + +DS=$DATA_SIZE + +VARS0="--bind-to core -x NVSHMEM_SYMMETRIC_SIZE=10737418240" +VARS1="-x UCX_WARN_UNUSED_ENV_VARS=n -x HCOLL_RCACHE=^ucs -x LD_LIBRARY_PATH=/g/g92/ciesko1/software/nvshmem_src_2.9.0-2/install/lib:$LD_LIBRARY_PATH" +HASH=`date|md5sum|head -c 5` + +#===================================== +#===================================== +#===================================== +#===================================== + +# TYPE="1x1" +# FILENAME="${BENCHMARK}_${HASH}_${TYPE}_p2p.res" +# echo $FILENAME +# echo "name,type,ranks,step,t_avg,time_inner,time_surface,time_update,time_last_iter,time_all,GUPs,view_size_elems,view_size(MB)" | tee $FILENAME +# SIZE=$DEFAULT_SIZE +# for S in $(seq 1 7); do +# for reps in $(seq 1 3); do +# mpirun -x CUDA_VISIBLE_DEVICES=0 -np 1 -npernode 1 $VARS0 $VARS1 $VARS2 -host "$HOST1:1" ./$BENCHMARK -X $SIZE -Y $SIZE -Z $SIZE -N $ITERS | tee -a $FILENAME +# done +# let SIZE=$SIZE*2 +# done + +# TYPE="1x2" +# FILENAME="${BENCHMARK}_${HASH}_${TYPE}_p2p.res" +# echo $FILENAME +# echo "name,type,ranks,step,t_avg,time_inner,time_surface,time_update,time_last_iter,time_all,GUPs,view_size_elems,view_size(MB)" | tee $FILENAME + +# # #run test over size +# SIZE=$DEFAULT_SIZE +# for S in $(seq 1 7); do +# for reps in $(seq 1 3); do +# mpirun -x CUDA_VISIBLE_DEVICES=0,1 -np 2 -npernode 2 $VARS0 $VARS1 $VARS2 -host "$HOST1:2" ./$BENCHMARK -X $SIZE -Y $SIZE -Z $SIZE -N $ITERS | tee -a $FILENAME +# done +# let SIZE=$SIZE*2 +# done + +# TYPE="1x4" +# FILENAME="${BENCHMARK}_${HASH}_${TYPE}_p2p.res" +# echo $FILENAME +# echo "name,type,ranks,step,t_avg,time_inner,time_surface,time_update,time_last_iter,time_all,GUPs,view_size_elems,view_size(MB)" | tee $FILENAME +# # #run test over size +# SIZE=$DEFAULT_SIZE +# for S in $(seq 1 7); do +# for reps in $(seq 1 3); do +# mpirun -x CUDA_VISIBLE_DEVICES=0,1,2,3 -np 4 -npernode 4 $VARS0 $VARS1 $VARS2 -host "$HOST1:4" ./$BENCHMARK -X $SIZE -Y $SIZE -Z $SIZE -N $ITERS | tee -a $FILENAME +# done +# let SIZE=$SIZE*2 +# done + +#===================================== +#===================================== +#===================================== +#===================================== + +# TYPE="2x1" +# FILENAME="${BENCHMARK}_${HASH}_${TYPE}_p2p.res" +# echo $FILENAME +# echo "name,type,ranks,step,t_avg,time_inner,time_surface,time_update,time_last_iter,time_all,GUPs,view_size_elems,view_size(MB)" | tee $FILENAME +# SIZE=$DEFAULT_SIZE +# for S in $(seq 1 7); do +# for reps in $(seq 1 3); do +# mpirun -x CUDA_VISIBLE_DEVICES=0 -np 2 -npernode 1 $VARS0 $VARS1 $VARS2 -host "$HOST1:1,$HOST2:1" ./$BENCHMARK -X $SIZE -Y $SIZE -Z $SIZE -N $ITERS | tee -a $FILENAME +# done +# let SIZE=$SIZE*2 +# done + +# TYPE="2x2" +# FILENAME="${BENCHMARK}_${HASH}_${TYPE}_p2p.res" +# echo $FILENAME +# echo 
"name,type,ranks,step,t_avg,time_inner,time_surface,time_update,time_last_iter,time_all,GUPs,view_size_elems,view_size(MB)" | tee $FILENAME + +# # #run test over size +# SIZE=$DEFAULT_SIZE +# for S in $(seq 1 7); do +# for reps in $(seq 1 3); do +# mpirun -x CUDA_VISIBLE_DEVICES=0,1 -np 4 -npernode 2 $VARS0 $VARS1 $VARS2 -host "$HOST1:2,$HOST2:2" ./$BENCHMARK -X $SIZE -Y $SIZE -Z $SIZE -N $ITERS | tee -a $FILENAME +# done +# let SIZE=$SIZE*2 +# done + +TYPE="2x4" +FILENAME="${BENCHMARK}_${HASH}_${TYPE}_p2p.res" +echo $FILENAME +echo "name,type,ranks,step,t_avg,time_inner,time_surface,time_update,time_last_iter,time_all,GUPs,view_size_elems,view_size(MB)" | tee $FILENAME +# #run test over size +SIZE=$DEFAULT_SIZE +for S in $(seq 1 7); do + for reps in $(seq 1 3); do + mpirun -x CUDA_VISIBLE_DEVICES=0,1,2,3 -np 8 -npernode 4 $VARS0 $VARS1 $VARS2 -host "$HOST1:4,$HOST2:4" ./$BENCHMARK -X $SIZE -Y $SIZE -Z $SIZE -N $ITERS | tee -a $FILENAME + done + let SIZE=$SIZE*2 +done + + +#===================================== +#===================================== +#===================================== +#===================================== + +# TYPE="4x1" +# FILENAME="${BENCHMARK}_${HASH}_${TYPE}_p2p.res" +# echo $FILENAME +# echo "name,type,ranks,step,t_avg,time_inner,time_surface,time_update,time_last_iter,time_all,GUPs,view_size_elems,view_size(MB)" | tee $FILENAME +# SIZE=$DEFAULT_SIZE +# for S in $(seq 1 7); do +# for reps in $(seq 1 3); do +# mpirun -x CUDA_VISIBLE_DEVICES=0 -np 4 -npernode 1 $VARS0 $VARS1 $VARS2 -host $HOST1:1,$HOST2:1,$HOST3:1,$HOST4:1 ./$BENCHMARK -X $SIZE -Y $SIZE -Z $SIZE -N $ITERS | tee -a $FILENAME +# done +# let SIZE=$SIZE*2 +# done + +# TYPE="4x2" +# FILENAME="${BENCHMARK}_${HASH}_${TYPE}_p2p.res" +# echo $FILENAME +# echo "name,type,ranks,step,t_avg,time_inner,time_surface,time_update,time_last_iter,time_all,GUPs,view_size_elems,view_size(MB)" | tee $FILENAME + +# # #run test over size +# SIZE=$DEFAULT_SIZE +# for S in $(seq 1 7); do +# for reps in $(seq 1 3); do +# mpirun -x CUDA_VISIBLE_DEVICES=0,1 -np 8 --map-by ppr:2:node $VARS0 $VARS1 $VARS2 -host $HOST1,$HOST2,$HOST3,$HOST4 ./$BENCHMARK -X $SIZE -Y $SIZE -Z $SIZE -N $ITERS | tee -a $FILENAME +# done +# let SIZE=$SIZE*2 +# done + +# TYPE="4x4" +# FILENAME="${BENCHMARK}_${HASH}_${TYPE}_p2p.res" +# echo $FILENAME +# echo "name,type,ranks,step,t_avg,time_inner,time_surface,time_update,time_last_iter,time_all,GUPs,view_size_elems,view_size(MB)" | tee $FILENAME +# # #run test over size +# SIZE=$DEFAULT_SIZE +# for S in $(seq 1 7); do +# for reps in $(seq 1 3); do +# mpirun -x CUDA_VISIBLE_DEVICES=0,1,2,3 -np 16 --map-by ppr:4:node $VARS0 $VARS1 $VARS2 -host $HOST1,$HOST2,$HOST3,$HOST4 ./$BENCHMARK -X $SIZE -Y $SIZE -Z $SIZE -N $ITERS | tee -a $FILENAME +# done +# let SIZE=$SIZE*2 +# done diff --git a/examples/matvec/multi-node/CMakeLists.txt b/examples/matvec/multi-node/CMakeLists.txt index fb5c8d20..a10c1cf9 100644 --- a/examples/matvec/multi-node/CMakeLists.txt +++ b/examples/matvec/multi-node/CMakeLists.txt @@ -1,5 +1,5 @@ add_executable(matvec_multi matvec.cpp) -target_link_libraries(matvec_multi PRIVATE Kokkos::kokkosremote) +target_link_libraries(matvec_multi PRIVATE Kokkos::kokkosremotespaces) target_include_directories(matvec_multi PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}) diff --git a/examples/matvec/multi-node/matvec.cpp b/examples/matvec/multi-node/matvec.cpp index 8ac3f29b..93301d35 100644 --- a/examples/matvec/multi-node/matvec.cpp +++ b/examples/matvec/multi-node/matvec.cpp @@ -38,8 +38,8 @@ using 
VectorHost_r_t = using VectorHost_t = Kokkos::View; using MatrixHost_t = Kokkos::View; -using Vector_t = Kokkos::View; -using Matrix_t = Kokkos::View; +using Vector_t = Kokkos::View; +using Matrix_t = Kokkos::View; int main(int argc, char *argv[]) { int mpi_thread_level_available; @@ -95,8 +95,11 @@ int main(int argc, char *argv[]) { Kokkos::deep_copy(b_h, 0.0); Kokkos::deep_copy(x_h, 1.0); - auto A = Kokkos::create_mirror_view_and_copy(Kokkos::CudaSpace(), A_h); - auto b = Kokkos::create_mirror_view_and_copy(Kokkos::CudaSpace(), b_h); + using DeviceMemorySpace = + typename Kokkos::DefaultExecutionSpace::memory_space; + + auto A = Kokkos::create_mirror_view_and_copy(DeviceMemorySpace{}, A_h); + auto b = Kokkos::create_mirror_view_and_copy(DeviceMemorySpace{}, b_h); // Copy host device data into global vector Kokkos::deep_copy(x, x_h); diff --git a/examples/matvec/single-node/CMakeLists.txt b/examples/matvec/single-node/CMakeLists.txt index 88367a5b..d0be2206 100644 --- a/examples/matvec/single-node/CMakeLists.txt +++ b/examples/matvec/single-node/CMakeLists.txt @@ -1,5 +1,5 @@ add_executable(matvec_single matvec.cpp) -target_link_libraries(matvec_single PRIVATE Kokkos::kokkosremote) +target_link_libraries(matvec_single PRIVATE Kokkos::kokkosremotespaces) target_include_directories(matvec_single PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}) diff --git a/examples/matvec/single-node/matvec.cpp b/examples/matvec/single-node/matvec.cpp index 1547697d..5742ce95 100644 --- a/examples/matvec/single-node/matvec.cpp +++ b/examples/matvec/single-node/matvec.cpp @@ -30,8 +30,8 @@ using VALUE_T = double; using VectorHost_t = Kokkos::View; using MatrixHost_t = Kokkos::View; -using Vector_t = Kokkos::View; -using Matrix_t = Kokkos::View; +using Vector_t = Kokkos::View; +using Matrix_t = Kokkos::View; int main(int argc, char *argv[]) { // Vars @@ -56,9 +56,12 @@ int main(int argc, char *argv[]) { Kokkos::deep_copy(b_h, 0.0); Kokkos::deep_copy(x_h, 1.0); - auto A = Kokkos::create_mirror_view_and_copy(Kokkos::CudaSpace(), A_h); - auto b = Kokkos::create_mirror_view_and_copy(Kokkos::CudaSpace(), b_h); - auto x = Kokkos::create_mirror_view_and_copy(Kokkos::CudaSpace(), x_h); + using DeviceMemorySpace = + typename Kokkos::DefaultExecutionSpace::memory_space; + + auto A = Kokkos::create_mirror_view_and_copy(DeviceMemorySpace{}, A_h); + auto b = Kokkos::create_mirror_view_and_copy(DeviceMemorySpace{}, b_h); + auto x = Kokkos::create_mirror_view_and_copy(DeviceMemorySpace{}, x_h); Kokkos::Timer timer; Kokkos::parallel_for( diff --git a/examples/vectorshift/multi-node/CMakeLists.txt b/examples/vectorshift/multi-node/CMakeLists.txt index f7af3110..4e1e7cae 100644 --- a/examples/vectorshift/multi-node/CMakeLists.txt +++ b/examples/vectorshift/multi-node/CMakeLists.txt @@ -1,3 +1,3 @@ add_executable(shift_multi vectorshift.cpp) -target_link_libraries(shift_multi PRIVATE Kokkos::kokkosremote) +target_link_libraries(shift_multi PRIVATE Kokkos::kokkosremotespaces) target_include_directories(shift_multi PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}) diff --git a/examples/vectorshift/single-node/CMakeLists.txt b/examples/vectorshift/single-node/CMakeLists.txt index af78f442..eb5a4d9b 100644 --- a/examples/vectorshift/single-node/CMakeLists.txt +++ b/examples/vectorshift/single-node/CMakeLists.txt @@ -1,5 +1,5 @@ add_executable(shift_single vectorshift.cpp) -target_link_libraries(shift_single PRIVATE Kokkos::kokkosremote) +target_link_libraries(shift_single PRIVATE Kokkos::kokkosremotespaces) target_include_directories(shift_single PRIVATE 
${CMAKE_CURRENT_SOURCE_DIR}) diff --git a/scripts/docker/Dockerfile.openmpi b/scripts/docker/Dockerfile.openmpi index 784986de..53a777a6 100644 --- a/scripts/docker/Dockerfile.openmpi +++ b/scripts/docker/Dockerfile.openmpi @@ -94,7 +94,7 @@ RUN export KOKKOS_SOURCE_DIR=${SOURCE_DIR}/kokkos && \ export KOKKOS_INSTALL_DIR=${INSTALL_DIR}/kokkos && \ cd ${SOURCE_DIR} && git clone https://github.com/kokkos/kokkos && \ cd kokkos && \ - git checkout 4.0.00 && \ + git checkout 4.1.00 && \ mkdir -p ${KOKKOS_BUILD_DIR} && \ cd ${KOKKOS_BUILD_DIR} && \ cmake -DCMAKE_BUILD_TYPE=Release \ diff --git a/src/core/Kokkos_RemoteSpaces_DeepCopy.hpp b/src/core/Kokkos_RemoteSpaces_DeepCopy.hpp index b1e36169..ce91c37e 100644 --- a/src/core/Kokkos_RemoteSpaces_DeepCopy.hpp +++ b/src/core/Kokkos_RemoteSpaces_DeepCopy.hpp @@ -309,7 +309,7 @@ void view_copy_(const DstType& dst, const SrcType& src) { } // Figure out iteration order in case we need it - int64_t strides[DstType::Rank + 1]; + int64_t strides[DstType::rank + 1]; dst.stride(strides); Kokkos::Iterate iterate; if (Kokkos::is_layouttiled::value) { @@ -329,7 +329,7 @@ void view_copy_(const DstType& dst, const SrcType& src) { Kokkos::PartitionedLayoutStride>::value || std::is_same::value) { - if (strides[0] > strides[DstType::Rank - 1]) + if (strides[0] > strides[DstType::rank - 1]) iterate = Kokkos::Iterate::Right; else iterate = Kokkos::Iterate::Left; @@ -348,40 +348,40 @@ void view_copy_(const DstType& dst, const SrcType& src) { if (DstExecCanAccessSrc) { if (iterate == Kokkos::Iterate::Right) Kokkos::Impl::ViewCopy_( + dst_execution_space, DstType::rank, int64_t>( dst, src); else Kokkos::Impl::ViewCopy_( + dst_execution_space, DstType::rank, int64_t>( dst, src); } else { if (iterate == Kokkos::Iterate::Right) Kokkos::Impl::ViewCopy_( + src_execution_space, DstType::rank, int64_t>( dst, src); else Kokkos::Impl::ViewCopy_( + src_execution_space, DstType::rank, int64_t>( dst, src); } } else { if (DstExecCanAccessSrc) { if (iterate == Kokkos::Iterate::Right) Kokkos::Impl::ViewCopy_(dst, + dst_execution_space, DstType::rank, int>(dst, src); else Kokkos::Impl::ViewCopy_(dst, + dst_execution_space, DstType::rank, int>(dst, src); } else { if (iterate == Kokkos::Iterate::Right) Kokkos::Impl::ViewCopy_(dst, + src_execution_space, DstType::rank, int>(dst, src); else Kokkos::Impl::ViewCopy_(dst, + src_execution_space, DstType::rank, int>(dst, src); } } @@ -517,19 +517,19 @@ inline void deep_copy( "match: "); message += dst.label(); message += "("; - for (int r = 0; r < dst_type::Rank - 1; r++) { + for (int r = 0; r < dst_type::rank - 1; r++) { message += std::to_string(dst.extent(r)); message += ","; } - message += std::to_string(dst.extent(dst_type::Rank - 1)); + message += std::to_string(dst.extent(dst_type::rank - 1)); message += ") "; message += src.label(); message += "("; - for (int r = 0; r < src_type::Rank - 1; r++) { + for (int r = 0; r < src_type::rank - 1; r++) { message += std::to_string(src.extent(r)); message += ","; } - message += std::to_string(src.extent(src_type::Rank - 1)); + message += std::to_string(src.extent(src_type::rank - 1)); message += ") "; Kokkos::Impl::throw_runtime_exception(message); @@ -604,19 +604,19 @@ inline void deep_copy( "Deprecation Error: Kokkos::deep_copy extents of views don't match: "); message += dst.label(); message += "("; - for (int r = 0; r < dst_type::Rank - 1; r++) { + for (int r = 0; r < dst_type::rank - 1; r++) { message += std::to_string(dst.extent(r)); message += ","; } - message += 
std::to_string(dst.extent(dst_type::Rank - 1)); + message += std::to_string(dst.extent(dst_type::rank - 1)); message += ") "; message += src.label(); message += "("; - for (int r = 0; r < src_type::Rank - 1; r++) { + for (int r = 0; r < src_type::rank - 1; r++) { message += std::to_string(src.extent(r)); message += ","; } - message += std::to_string(src.extent(src_type::Rank - 1)); + message += std::to_string(src.extent(src_type::rank - 1)); message += ") "; Kokkos::Impl::throw_runtime_exception(message); @@ -746,19 +746,19 @@ inline void deep_copy( "match: "); message += dst.label(); message += "("; - for (int r = 0; r < dst_type::Rank - 1; r++) { + for (int r = 0; r < dst_type::rank - 1; r++) { message += std::to_string(dst.extent(r)); message += ","; } - message += std::to_string(dst.extent(dst_type::Rank - 1)); + message += std::to_string(dst.extent(dst_type::rank - 1)); message += ") "; message += src.label(); message += "("; - for (int r = 0; r < src_type::Rank - 1; r++) { + for (int r = 0; r < src_type::rank - 1; r++) { message += std::to_string(src.extent(r)); message += ","; } - message += std::to_string(src.extent(src_type::Rank - 1)); + message += std::to_string(src.extent(src_type::rank - 1)); message += ") "; Kokkos::Impl::throw_runtime_exception(message); @@ -816,19 +816,19 @@ inline void deep_copy( "Deprecation Error: Kokkos::deep_copy extents of views don't match: "); message += dst.label(); message += "("; - for (int r = 0; r < dst_type::Rank - 1; r++) { + for (int r = 0; r < dst_type::rank - 1; r++) { message += std::to_string(dst.extent(r)); message += ","; } - message += std::to_string(dst.extent(dst_type::Rank - 1)); + message += std::to_string(dst.extent(dst_type::rank - 1)); message += ") "; message += src.label(); message += "("; - for (int r = 0; r < src_type::Rank - 1; r++) { + for (int r = 0; r < src_type::rank - 1; r++) { message += std::to_string(src.extent(r)); message += ","; } - message += std::to_string(src.extent(src_type::Rank - 1)); + message += std::to_string(src.extent(src_type::rank - 1)); message += ") "; Kokkos::Impl::throw_runtime_exception(message); diff --git a/src/core/Kokkos_RemoteSpaces_Error.hpp b/src/core/Kokkos_RemoteSpaces_Error.hpp new file mode 100644 index 00000000..ffcb843d --- /dev/null +++ b/src/core/Kokkos_RemoteSpaces_Error.hpp @@ -0,0 +1,142 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. 
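
For orientation, the hunks above track the newer Kokkos spelling of the view rank member (lowercase rank instead of Rank). A minimal, illustrative sketch of rank-generic code written against that spelling (the view type and function name are arbitrary, not part of this patch):

#include <Kokkos_Core.hpp>
#include <cstdio>

// Illustrative only: iterate over all but the last extent of an arbitrary view
// using the lowercase `rank` member, matching the change made above.
template <class ViewType>
void print_leading_extents(const ViewType &v) {
  for (size_t r = 0; r + 1 < static_cast<size_t>(ViewType::rank); ++r)
    std::printf("extent(%zu) = %zu\n", r, static_cast<size_t>(v.extent(r)));
}
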
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +// Contact: Jan Ciesko (jciesko@sandia.gov) +// +//@HEADER + +#ifndef KOKKOS_REMOTESPACES_ERROR_HPP +#define KOKKOS_REMOTESPACES_ERROR_HPP + +namespace Kokkos { +namespace Impl { +namespace Experimental { + +class RemoteSpacesMemoryAllocationFailure : public std::bad_alloc { + public: + enum class FailureMode { + OutOfMemoryError, + AllocationNotAligned, + InvalidAllocationSize, + Unknown + }; + enum class AllocationMechanism { + SHMEMMALLOCDEFAULT, + SHMEMMALLOC, + NVSHMEMMALLOC, + ROCSHMEMMALLOC, + MPIWINALLOC + }; + + private: + size_t m_attempted_size; + size_t m_attempted_alignment; + FailureMode m_failure_mode; + AllocationMechanism m_mechanism; + + public: + RemoteSpacesMemoryAllocationFailure( + size_t arg_attempted_size, size_t arg_attempted_alignment, + FailureMode arg_failure_mode = FailureMode::OutOfMemoryError, + AllocationMechanism arg_mechanism = + AllocationMechanism::SHMEMMALLOCDEFAULT) noexcept + : m_attempted_size(arg_attempted_size), + m_attempted_alignment(arg_attempted_alignment), + m_failure_mode(arg_failure_mode), + m_mechanism(arg_mechanism) {} + + RemoteSpacesMemoryAllocationFailure() noexcept = delete; + + RemoteSpacesMemoryAllocationFailure( + RemoteSpacesMemoryAllocationFailure const &) noexcept = default; + RemoteSpacesMemoryAllocationFailure( + RemoteSpacesMemoryAllocationFailure &&) noexcept = default; + + RemoteSpacesMemoryAllocationFailure &operator =( + RemoteSpacesMemoryAllocationFailure const &) noexcept = default; + RemoteSpacesMemoryAllocationFailure &operator =( + RemoteSpacesMemoryAllocationFailure &&) noexcept = default; + + ~RemoteSpacesMemoryAllocationFailure() noexcept override = default; + + [[nodiscard]] const char *what() const noexcept override { + if (m_failure_mode == FailureMode::OutOfMemoryError) { + return "Memory allocation error: out of memory"; + } else if (m_failure_mode == FailureMode::AllocationNotAligned) { + return "Memory allocation error: allocation result was under-aligned"; + } + + return nullptr; // unreachable + } + + [[nodiscard]] size_t attempted_size() const noexcept { + return m_attempted_size; + } + + [[nodiscard]] size_t attempted_alignment() const noexcept { + return m_attempted_alignment; + } + + [[nodiscard]] AllocationMechanism allocation_mechanism() const noexcept { + return m_mechanism; + } + + [[nodiscard]] FailureMode failure_mode() const noexcept { + return m_failure_mode; + } + + void print_error_message(std::ostream &o) const { + o << "Allocation of size " << Impl::human_memory_size(m_attempted_size); + o << " failed"; + switch (m_failure_mode) { + case FailureMode::OutOfMemoryError: + o << ", likely due to insufficient memory."; + break; + case FailureMode::AllocationNotAligned: + o << " because the allocation was improperly aligned."; + break; + case FailureMode::InvalidAllocationSize: + o << " because the requested allocation size is not a valid size for " + "the" + " requested allocation mechanism (it's probably too large)."; + break; + case FailureMode::Unknown: o << " because of an unknown error."; + } + o << " (The allocation mechanism was "; + switch (m_mechanism) { + case AllocationMechanism::SHMEMMALLOC: o << "shmem_malloc()."; break; + case AllocationMechanism::NVSHMEMMALLOC: o << "nvshmem_malloc()."; break; + case AllocationMechanism::ROCSHMEMMALLOC: + o << "rocshmem_malloc()."; + break; + case AllocationMechanism::MPIWINALLOC: o << "MPI_Win_allocate()."; + } + append_additional_error_information(o); + o << ")" << std::endl; + } + 
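
As a usage sketch only (the space instance and the byte count are placeholder assumptions, not part of this header), callers can catch this exception and inspect the failure mode, much like _do_allocation() does further down in this patch:

// Illustrative only: report an allocation failure raised by a remote space.
try {
  void *ptr = space.allocate("MyViewLabel", nbytes);  // 'space', 'nbytes' assumed
  (void)ptr;
} catch (Kokkos::Impl::Experimental::RemoteSpacesMemoryAllocationFailure const &f) {
  f.print_error_message(std::cerr);
  if (f.failure_mode() ==
      Kokkos::Impl::Experimental::RemoteSpacesMemoryAllocationFailure::
          FailureMode::OutOfMemoryError) {
    // back off: retry with a smaller request or abort cleanly
  }
}
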
+ [[nodiscard]] std::string get_error_message() const { + std::ostringstream out; + print_error_message(out); + return out.str(); + } + + virtual void append_additional_error_information(std::ostream &) const {} +}; + +} // namespace Experimental +} // namespace Impl +} // namespace Kokkos + +#endif // KOKKOS_REMOTESPACES_ERROR_HPP \ No newline at end of file diff --git a/src/core/Kokkos_RemoteSpaces_LocalDeepCopy.hpp b/src/core/Kokkos_RemoteSpaces_LocalDeepCopy.hpp index f67998aa..5b6e7477 100644 --- a/src/core/Kokkos_RemoteSpaces_LocalDeepCopy.hpp +++ b/src/core/Kokkos_RemoteSpaces_LocalDeepCopy.hpp @@ -25,18 +25,6 @@ namespace Kokkos { namespace Experimental { namespace RemoteSpaces { -#ifdef KRS_ENABLE_NVSHMEMSPACE -typedef NVSHMEMSpace DefaultRemoteMemorySpace; -#else -#ifdef KRS_ENABLE_SHMEMSPACE -typedef SHMEMSpace DefaultRemoteMemorySpace; -#else -#ifdef KRS_ENABLE_MPISPACE -typedef MPISpace DefaultRemoteMemorySpace; -#endif -#endif -#endif - /** \brief A local deep copy between views of the default specialization, * compatible type, same non-zero rank. */ @@ -50,8 +38,55 @@ void KOKKOS_INLINE_FUNCTION local_deep_copy_contiguous( std::is_same::specialize, Kokkos::Experimental::RemoteSpaceSpecializeTag>::value)>:: type * = nullptr) { - Kokkos::parallel_for(Kokkos::TeamThreadRange(team, src.span()), - [&](const int &i) { dst.data()[i] = src.data()[i]; }); + int src_rank = src.impl_map().get_owning_pe(); + int dst_rank = dst.impl_map().get_owning_pe(); + int my_rank = get_my_pe(); + + if (src_rank != my_rank && dst_rank != my_rank) + static_assert( + "local_deep_copy allows only one view with remote data access"); + + // We use the data ptr explicitly thus expecting that a subview starts at the + // beginning of the local allocaton. We need to add the offset = sum of + // offsets in all non-leading dimenions to the ptr to support the generic + // case. 
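// What follows wraps the contiguous spans in BlockDataHandle objects and, after
// a team barrier, lets a single thread per team issue one bulk get() (remote
// source) or one bulk put() (remote destination); when both views are local,
// the copy falls back to an element-wise TeamThreadRange loop.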
+ using src_data_block_t = + Kokkos::Impl::BlockDataHandle::value_type, + ViewTraits>; + using dst_data_block_t = + Kokkos::Impl::BlockDataHandle::value_type, + ViewTraits>; + if (src_rank != my_rank) { + team.team_barrier(); + Kokkos::single(Kokkos::PerTeam(team), [&]() { +#ifdef KRS_ENABLE_MPISPACE + src_data_block_t data_block = src_data_block_t( + dst.data(), src.impl_map().handle().loc.win, + src.impl_map().handle().loc.offset, src.span(), src_rank); +#else + src_data_block_t data_block = + src_data_block_t(dst.data(), src.data(), src.span(), src_rank); +#endif + data_block.get(); + }); + } else if (dst_rank != my_rank) { + team.team_barrier(); + Kokkos::single(Kokkos::PerTeam(team), [&]() { +#ifdef KRS_ENABLE_MPISPACE + dst_data_block_t data_block = dst_data_block_t( + src.data(), dst.impl_map().handle().loc.win, + dst.impl_map().handle().loc.offset, dst.span(), src_rank); +#else + dst_data_block_t data_block = + dst_data_block_t(dst.data(), src.data(), dst.span(), dst_rank); +#endif + data_block.put(); + }); + } else { + // Data resides within the node, copy as usual + Kokkos::parallel_for(Kokkos::TeamThreadRange(team, src.span()), + [&](const int &i) { dst.data()[i] = src.data()[i]; }); + } } template @@ -63,8 +98,48 @@ void KOKKOS_INLINE_FUNCTION local_deep_copy_contiguous( std::is_same::specialize, Kokkos::Experimental::RemoteSpaceSpecializeTag>::value)>:: type * = nullptr) { - for (size_t i = 0; i < src.span(); ++i) { - dst.data()[i] = src.data()[i]; + int src_rank = src.impl_map().get_owning_pe(); + int dst_rank = dst.impl_map().get_owning_pe(); + int my_rank = get_my_pe(); + + if (src_rank != my_rank && dst_rank != my_rank) + static_assert( + "local_deep_copy allows only one view with remote data access"); + + using src_data_block_t = + Kokkos::Impl::BlockDataHandle::value_type, + ViewTraits>; + using dst_data_block_t = + Kokkos::Impl::BlockDataHandle::value_type, + ViewTraits>; + + // We use the data ptr explicitly thus expecting that a subview starts at the + // beginning of the local allocaton. We need to add the offset = sum of + // offsets in all non-leading dimenions to the ptr to support the generic + // case. 
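
A brief usage sketch of these local_deep_copy overloads (the view names, extents, and the DefaultRemoteMemorySpace alias are illustrative assumptions, not part of this file): one team bulk-fetches the row owned by a neighboring PE into a node-local buffer.

// Illustrative only: pull a remote row into local memory with one bulk get.
using RemoteSpace_t = Kokkos::Experimental::DefaultRemoteMemorySpace;
using RemoteView_t  = Kokkos::View<double **, Kokkos::PartitionedLayoutRight, RemoteSpace_t>;
using LocalView_t   = Kokkos::View<double *>;

int my_pe    = Kokkos::Experimental::get_my_pe();
int num_pes  = Kokkos::Experimental::get_num_pes();
int other_pe = (my_pe + 1) % num_pes;

RemoteView_t data("Data", num_pes, 1024);
LocalView_t  buf("Buf", 1024);

Kokkos::parallel_for(
    "fetch_remote_row", Kokkos::TeamPolicy<>(1, Kokkos::AUTO),
    KOKKOS_LAMBDA(const Kokkos::TeamPolicy<>::member_type &team) {
      auto row = Kokkos::subview(data, other_pe, Kokkos::ALL);
      Kokkos::Experimental::RemoteSpaces::local_deep_copy(team, buf, row);
    });
RemoteSpace_t().fence();
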
+ if (src_rank != my_rank) { +#ifdef KRS_ENABLE_MPISPACE + src_data_block_t data_block = src_data_block_t( + dst.data(), src.impl_map().handle().loc.win, + src.impl_map().handle().loc.offset, src.span(), src_rank); +#else + src_data_block_t data_block = + src_data_block_t(dst.data(), src.data(), src.span(), src_rank); +#endif + data_block.get(); + } else if (dst_rank != my_rank) { +#ifdef KRS_ENABLE_MPISPACE + dst_data_block_t data_block = dst_data_block_t( + src.data(), dst.impl_map().handle().loc.win, + dst.impl_map().handle().loc.offset, dst.span(), src_rank); +#else + dst_data_block_t data_block = + dst_data_block_t(dst.data(), src.data(), dst.span(), dst_rank); +#endif + data_block.put(); + } else { + // Data resides within the node, copy as usual + for (size_t i = 0; i < src.span(); ++i) dst.data()[i] = src.data()[i]; } } @@ -96,7 +171,6 @@ void KOKKOS_INLINE_FUNCTION local_deep_copy_contiguous( } // Accepts (team, src_view, dst_view) - template void KOKKOS_INLINE_FUNCTION local_deep_copy( const TeamType &team, const View &dst, @@ -360,8 +434,6 @@ void KOKKOS_INLINE_FUNCTION local_deep_copy( } } -// Accepts (src_view, dst_view) - template void KOKKOS_INLINE_FUNCTION local_deep_copy( const View &dst, const View &src, @@ -376,12 +448,7 @@ void KOKKOS_INLINE_FUNCTION local_deep_copy( if (dst.data() == nullptr) { return; } - - const size_t N = dst.extent(0); - - for (size_t i = 0; i < N; ++i) { - dst(i) = src(i); - } + Kokkos::Experimental::RemoteSpaces::local_deep_copy_contiguous(dst, src); } template diff --git a/src/core/Kokkos_RemoteSpaces_ViewLayout.hpp b/src/core/Kokkos_RemoteSpaces_ViewLayout.hpp index 83300e2f..e1f397d2 100644 --- a/src/core/Kokkos_RemoteSpaces_ViewLayout.hpp +++ b/src/core/Kokkos_RemoteSpaces_ViewLayout.hpp @@ -142,7 +142,7 @@ struct SubviewLegalArgsCompileTime::value)) || ((CurrentArg >= RankDest) && (std::is_integral::value)) || ((CurrentArg < RankDest) && - (std::is_same::value)) || + (std::is_same::value)) || ((CurrentArg == 0) && (Kokkos::Impl::is_integral_extent_type::value))) && (SubviewLegalArgsCompileTime< @@ -174,7 +174,7 @@ struct SubviewLegalArgsCompileTime::value)) || ((CurrentArg >= RankSrc - RankDest) && - (std::is_same::value))) && + (std::is_same::value))) && (SubviewLegalArgsCompileTime< Kokkos::PartitionedLayoutRight, Kokkos::PartitionedLayoutRight, RankDest, RankSrc, CurrentArg + 1, SubViewArgs...>::value) @@ -187,7 +187,7 @@ struct SubviewLegalArgsCompileTime { enum { value = ((CurrentArg == RankSrc - 1) && - (std::is_same::value)) + (std::is_same::value)) }; }; diff --git a/src/core/Kokkos_RemoteSpaces_ViewMapping.hpp b/src/core/Kokkos_RemoteSpaces_ViewMapping.hpp index 06c722d2..910892f7 100644 --- a/src/core/Kokkos_RemoteSpaces_ViewMapping.hpp +++ b/src/core/Kokkos_RemoteSpaces_ViewMapping.hpp @@ -25,88 +25,57 @@ /** \brief View mapping for non-specialized data type and standard layout */ namespace Kokkos { - namespace Experimental { -template -std::pair get_range( - T &v, size_t pe, - typename std::enable_if::value>::type * = nullptr) { - static_assert(!(std::is_same::value || - std::is_same::value || - std::is_same::value), - "get_local_range over partitioned layouts are not allowed"); - - // JC: Error out also in this case as we need to access the original dim0 of - // the View and not the rounded dim0 of the View. 
Fix would need to add - // get_mapping to View - static_assert((std::is_same::value || - std::is_same::value || - std::is_same::value), - "get_local_range overload currently unsupported"); - - size_t extent_dim0 = v.extent(0); - return getRange(extent_dim0, pe); +KOKKOS_INLINE_FUNCTION +size_t get_indexing_block_size(size_t size) { + int num_pes; + size_t block; + num_pes = Kokkos::Experimental::get_num_pes(); + block = (size + static_cast(num_pes) - 1) / num_pes; + return block; } template -std::pair get_local_range( - T &v, - typename std::enable_if::value>::type * = nullptr) { - static_assert(!(std::is_same::value || - std::is_same::value || - std::is_same::value), - "get_local_range over partitioned layouts are not allowed"); - - // JC: Error out also in this case as we need to access the original dim0 of - // the View and not the rounded dim0 of the View. Fix would need to add - // get_mapping to View - static_assert((std::is_same::value || - std::is_same::value || - std::is_same::value), - "get_local_range overload currently unsupported"); - - size_t pe = get_my_pe(); - size_t extent_dim0 = v.extent(0); - return getRange(extent_dim0, pe); +KOKKOS_INLINE_FUNCTION Kokkos::pair getRange(T size, int pe) { + T start, end; + T block = get_indexing_block_size(size); + start = static_cast(pe) * block; + end = (static_cast(pe) + 1) * block; + + T num_pes = Kokkos::Experimental::get_num_pes(); + if (size < num_pes) { + T diff = (num_pes * block) - size; + if (pe > num_pes - 1 - diff) end--; + } else { + if (pe == num_pes - 1) { + size_t diff = size - (num_pes - 1) * block; + end = start + diff; + } + } + return Kokkos::pair(start, end); } template -std::pair get_range( - T size, size_t pe, - typename std::enable_if::value>::type * = nullptr) { +KOKKOS_INLINE_FUNCTION Kokkos::pair get_range(T size, int pe) { return getRange(size, pe); } template -std::pair get_local_range( - T size, - typename std::enable_if::value>::type * = nullptr) { - size_t pe = get_my_pe(); +KOKKOS_INLINE_FUNCTION Kokkos::pair get_local_range(T size) { + auto pe = Kokkos::Experimental::get_my_pe(); return getRange(size, pe); } } // namespace Experimental -namespace Impl { - /* * ViewMapping class used by View copy-ctr and subview() to specialize new * (sub-) view type */ +namespace Impl { + template class ViewMapping< typename std::enable_if<( @@ -274,15 +243,16 @@ class ViewMapping< dst.m_offset_remote_dim = extents.domain_offset(0); dst.dim0_is_pe = R0; + dst.isSubView = true; + #ifdef KRS_ENABLE_MPISPACE // Subviews propagate MPI_Window of the original view dst.m_handle = ViewDataHandle::assign( - src.m_handle, + src.m_handle, src.m_handle.loc.win, src.m_offset(0, extents.domain_offset(1), extents.domain_offset(2), extents.domain_offset(3), extents.domain_offset(4), extents.domain_offset(5), extents.domain_offset(6), - extents.domain_offset(7)), - src.m_handle.win); + extents.domain_offset(7))); #else dst.m_handle = ViewDataHandle::assign( src.m_handle, @@ -330,6 +300,8 @@ class ViewMapping { // with a partitioned layout always expects dim0 to be rank id size_t dim0_is_pe; + bool isSubView = false; + int m_num_pes; int pe; @@ -465,6 +437,9 @@ class ViewMapping { return m_handle.ptr; } + /** \brief Query raw pointer to memory */ + KOKKOS_INLINE_FUNCTION handle_type handle() const { return m_handle; } + //---------------------------------------- // The View class performs all rank and bounds checking before // calling these element reference methods. 
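
For orientation, a short sketch of how these range helpers are typically consumed (the extent and view name are illustrative; the returned pair is treated as a half-open interval here):

// Illustrative only: each PE updates just its own block of a symmetric view,
// using get_local_range() to obtain the locally owned index interval.
using RemoteSpace_t = Kokkos::Experimental::DefaultRemoteMemorySpace;
using RemoteView_t  = Kokkos::View<double *, RemoteSpace_t>;

int n = 1 << 20;                                        // global extent, assumed
RemoteView_t v("V", n);
auto local = Kokkos::Experimental::get_local_range(n);  // Kokkos::pair {start, end}

Kokkos::parallel_for(
    "scale_local_block", Kokkos::RangePolicy<>(local.first, local.second),
    KOKKOS_LAMBDA(const int i) { v(i) *= 2.0; });
RemoteSpace_t().fence();
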
@@ -814,25 +789,35 @@ class ViewMapping { // Implements global views struct dim0_offsets { - size_t pe, offset; + int pe; + size_t offset; }; // TODO: move this to kokkos::view_offset (new template specialization // on RemoteSpace space type for all default layouts and also one for // all partitioned laytouts. Wait for mdspan.) - template + template KOKKOS_INLINE_FUNCTION dim0_offsets compute_dim0_offsets(const I0 &_i0) const { - size_t target_pe, dim0_mod, i0; + int target_pe; + size_t dim0_mod, i0; i0 = static_cast(_i0); assert(m_local_dim0); - target_pe = i0 / m_local_dim0; + target_pe = static_cast(i0 / m_local_dim0); dim0_mod = i0 % m_local_dim0; return {target_pe, dim0_mod}; } - template + KOKKOS_INLINE_FUNCTION int get_owning_pe() const { + // If subview with a valid m_offset_remote_dim, + // compute the corresponding PE + if (isSubView) return compute_dim0_offsets(m_offset_remote_dim).pe; + // Else, return the current PE as the current PE is the owner + // of the local allocation + return pe; + } + template KOKKOS_INLINE_FUNCTION const reference_type reference( const I0 &i0, typename std::enable_if< @@ -1014,7 +999,8 @@ class ViewMapping { m_offset(), m_offset_remote_dim(0), m_local_dim0(0), - dim0_is_pe(1) { + dim0_is_pe(1), + isSubView(false) { m_num_pes = Kokkos::Experimental::get_num_pes(); pe = Kokkos::Experimental::get_my_pe(); } @@ -1026,7 +1012,8 @@ class ViewMapping { pe(rhs.pe), m_offset_remote_dim(rhs.m_offset_remote_dim), m_local_dim0(rhs.m_local_dim0), - dim0_is_pe(rhs.dim0_is_pe) {} + dim0_is_pe(rhs.dim0_is_pe), + isSubView(rhs.isSubView) {} KOKKOS_INLINE_FUNCTION ViewMapping &operator=(const ViewMapping &rhs) { m_handle = rhs.m_handle; @@ -1036,6 +1023,7 @@ class ViewMapping { m_local_dim0 = rhs.m_local_dim0; dim0_is_pe = rhs.dim0_is_pe; pe = rhs.pe; + isSubView = rhs.isSubView; return *this; } @@ -1046,7 +1034,8 @@ class ViewMapping { pe(rhs.pe), m_offset_remote_dim(rhs.m_offset_remote_dim), m_local_dim0(rhs.m_local_dim0), - dim0_is_pe(0) {} + dim0_is_pe(rhs.dim0_is_pe), + isSubView(rhs.isSubView) {} KOKKOS_INLINE_FUNCTION ViewMapping &operator=(ViewMapping &&rhs) { m_handle = rhs.m_handle; @@ -1056,6 +1045,7 @@ class ViewMapping { m_offset_remote_dim = rhs.m_offset_remote_dim; m_local_dim0 = rhs.m_local_dim0; dim0_is_pe = rhs.dim0_is_pe; + isSubView = rhs.isSubView; return *this; } @@ -1069,7 +1059,9 @@ class ViewMapping { : m_offset_remote_dim(0), m_handle( ((Kokkos::Impl::ViewCtorProp const &)arg_prop) - .value) { + .value) + + { typedef typename Traits::value_type value_type; typedef std::integral_constant< unsigned, Kokkos::Impl::ViewCtorProp::allow_padding diff --git a/src/impl/mpispace/Kokkos_MPISpace.cpp b/src/impl/mpispace/Kokkos_MPISpace.cpp index 5df80fcd..13ebc5b5 100644 --- a/src/impl/mpispace/Kokkos_MPISpace.cpp +++ b/src/impl/mpispace/Kokkos_MPISpace.cpp @@ -16,7 +16,6 @@ // //@HEADER -#include #include #include #include @@ -37,6 +36,22 @@ void MPISpace::impl_set_allocation_mode(const int allocation_mode_) { void MPISpace::impl_set_extent(const int64_t extent_) { extent = extent_; } void *MPISpace::allocate(const size_t arg_alloc_size) const { + return allocate("[unlabeled]", arg_alloc_size); +} + +void *MPISpace::allocate(const char *arg_label, const size_t arg_alloc_size, + const size_t + + arg_logical_size) const { + return impl_allocate(arg_label, arg_alloc_size, arg_logical_size); +} + +void *MPISpace::impl_allocate( + const char *arg_label, const size_t arg_alloc_size, + const size_t arg_logical_size, + const Kokkos::Tools::SpaceHandle 
arg_handle) const { + const size_t reported_size = + (arg_logical_size > 0) ? arg_logical_size : arg_alloc_size; static_assert(sizeof(void *) == sizeof(uintptr_t), "Error sizeof(void*) != sizeof(uintptr_t)"); @@ -44,11 +59,18 @@ void *MPISpace::allocate(const size_t arg_alloc_size) const { Kokkos::Impl::is_integral_power_of_two(Kokkos::Impl::MEMORY_ALIGNMENT), "Memory alignment must be power of two"); - void *ptr = 0; + constexpr uintptr_t alignment = Kokkos::Impl::MEMORY_ALIGNMENT; + constexpr uintptr_t alignment_mask = alignment - 1; + + void *ptr = nullptr; + if (arg_alloc_size) { + // Over-allocate to and round up to guarantee proper alignment. + size_t size_padded = arg_alloc_size + sizeof(void *) + alignment; + if (allocation_mode == Kokkos::Experimental::Symmetric) { current_win = MPI_WIN_NULL; - MPI_Win_allocate(arg_alloc_size, 1, MPI_INFO_NULL, MPI_COMM_WORLD, &ptr, + MPI_Win_allocate(size_padded, 1, MPI_INFO_NULL, MPI_COMM_WORLD, &ptr, ¤t_win); assert(current_win != MPI_WIN_NULL); @@ -70,41 +92,92 @@ void *MPISpace::allocate(const size_t arg_alloc_size) const { Kokkos::abort("MPISpace only supports symmetric allocation policy."); } } + using MemAllocFailure = + Kokkos::Impl::Experimental::RemoteSpacesMemoryAllocationFailure; + using MemAllocFailureMode = Kokkos::Impl::Experimental:: + RemoteSpacesMemoryAllocationFailure::FailureMode; + + if ((ptr == nullptr) || (reinterpret_cast(ptr) == ~uintptr_t(0)) + // MPI_Win_allocate may allocate non-alligned to + // Kokkos::Impl::MEMORY_ALIGNMENT + // || + // (reinterpret_cast(ptr) & alignment_mask)*/ + ) { + MemAllocFailureMode failure_mode = + MemAllocFailureMode::AllocationNotAligned; + if (ptr == nullptr) { + failure_mode = MemAllocFailureMode::OutOfMemoryError; + } + MemAllocFailure::AllocationMechanism alloc_mec = + MemAllocFailure::AllocationMechanism::MPIWINALLOC; + throw MemAllocFailure(arg_alloc_size, alignment, failure_mode, alloc_mec); + } + + if (Kokkos::Profiling::profileLibraryLoaded()) { + Kokkos::Profiling::allocateData(arg_handle, arg_label, ptr, reported_size); + } return ptr; } -void MPISpace::deallocate(void *const, const size_t) const { - int last_valid; - for (last_valid = 0; last_valid < mpi_windows.size(); ++last_valid) { - if (mpi_windows[last_valid] == MPI_WIN_NULL) break; - } +void MPISpace::deallocate(void *const arg_alloc_ptr, + const size_t arg_alloc_size) const { + deallocate("[unlabeled]", arg_alloc_ptr, arg_alloc_size); +} - last_valid--; - for (int i = 0; i < mpi_windows.size(); ++i) { - if (mpi_windows[i] == current_win) { - mpi_windows[i] = mpi_windows[last_valid]; - mpi_windows[last_valid] = MPI_WIN_NULL; - break; +void MPISpace::deallocate(const char *arg_label, void *const arg_alloc_ptr, + const size_t arg_alloc_size, + const size_t + + arg_logical_size) const { + impl_deallocate(arg_label, arg_alloc_ptr, arg_alloc_size, arg_logical_size); +} + +void MPISpace::impl_deallocate( + const char *arg_label, void *const arg_alloc_ptr, + const size_t arg_alloc_size, const size_t arg_logical_size, + const Kokkos::Tools::SpaceHandle arg_handle) const { + if (arg_alloc_ptr) { + Kokkos::fence("HostSpace::impl_deallocate before free"); + fence(); + size_t reported_size = + (arg_logical_size > 0) ? 
arg_logical_size : arg_alloc_size; + if (Kokkos::Profiling::profileLibraryLoaded()) { + Kokkos::Profiling::deallocateData(arg_handle, arg_label, arg_alloc_ptr, + reported_size); + } + + int last_valid; + for (last_valid = 0; last_valid < mpi_windows.size(); ++last_valid) { + if (mpi_windows[last_valid] == MPI_WIN_NULL) break; } - } - assert(current_win != MPI_WIN_NULL); - MPI_Win_unlock_all(current_win); - MPI_Win_free(¤t_win); + last_valid--; + for (int i = 0; i < mpi_windows.size(); ++i) { + if (mpi_windows[i] == current_win) { + mpi_windows[i] = mpi_windows[last_valid]; + mpi_windows[last_valid] = MPI_WIN_NULL; + break; + } + } - // We pass a mempory space instance do multiple Views thus - // setting "current_win = MPI_WIN_NULL;" will result in a wrong handle if - // subsequent view runs out of scope - // Fixme: The following only works when views are allocated sequentially - // We need a thread-safe map to associate views and windows + assert(current_win != MPI_WIN_NULL); + MPI_Win_unlock_all(current_win); + MPI_Win_free(¤t_win); - if (last_valid != 0) - current_win = mpi_windows[last_valid - 1]; - else - current_win = MPI_WIN_NULL; + // We pass a mempory space instance do multiple Views thus + // setting "current_win = MPI_WIN_NULL;" will result in a wrong handle if + // subsequent view runs out of scope + // Fixme: The following only works when views are allocated sequentially + // We need a thread-safe map to associate views and windows + + if (last_valid != 0) + current_win = mpi_windows[last_valid - 1]; + else + current_win = MPI_WIN_NULL; + } } -void MPISpace::fence() { +void MPISpace::fence() const { for (int i = 0; i < mpi_windows.size(); i++) { if (mpi_windows[i] != MPI_WIN_NULL) { MPI_Win_flush_all(mpi_windows[i]); @@ -127,34 +200,6 @@ size_t get_my_pe() { return rank; } -KOKKOS_FUNCTION -size_t get_indexing_block_size(size_t size) { - size_t num_pes, block; - num_pes = get_num_pes(); - block = (size + num_pes - 1) / num_pes; - return block; -} - -std::pair getRange(size_t size, size_t pe) { - size_t start, end; - size_t block = get_indexing_block_size(size); - start = pe * block; - end = (pe + 1) * block; - - size_t num_pes = get_num_pes(); - - if (size < num_pes) { - size_t diff = (num_pes * block) - size; - if (pe > num_pes - 1 - diff) end--; - } else { - if (pe == num_pes - 1) { - size_t diff = size - (num_pes - 1) * block; - end = start + diff; - } - end--; - } - return std::make_pair(start, end); -} } // namespace Experimental namespace Impl { diff --git a/src/impl/mpispace/Kokkos_MPISpace.hpp b/src/impl/mpispace/Kokkos_MPISpace.hpp index 7f90dcc2..358830dc 100644 --- a/src/impl/mpispace/Kokkos_MPISpace.hpp +++ b/src/impl/mpispace/Kokkos_MPISpace.hpp @@ -29,7 +29,6 @@ #include #include #include -/*--------------------------------------------------------------------------*/ namespace Kokkos { namespace Experimental { @@ -66,21 +65,37 @@ class MPISpace { explicit MPISpace(const MPI_Comm &); + /**\brief Allocate untracked memory in the space */ void *allocate(const size_t arg_alloc_size) const; + void *allocate(const char *arg_label, const size_t arg_alloc_size, + const size_t arg_logical_size = 0) const; + /**\brief Deallocate untracked memory in the space */ void deallocate(void *const arg_alloc_ptr, const size_t arg_alloc_size) const; + void deallocate(const char *arg_label, void *const arg_alloc_ptr, + const size_t arg_alloc_size, + const size_t arg_logical_size = 0) const; - void *allocate(const int *gids, const int &arg_local_alloc_size) const; - - void 
deallocate(const int *gids, void *const arg_alloc_ptr, - const size_t arg_alloc_size) const; + private: + template + friend class Kokkos::Experimental::LogicalMemorySpace; + + void *impl_allocate(const char *arg_label, const size_t arg_alloc_size, + const size_t arg_logical_size = 0, + const Kokkos::Tools::SpaceHandle = + Kokkos::Tools::make_space_handle(name())) const; + void impl_deallocate(const char *arg_label, void *const arg_alloc_ptr, + const size_t arg_alloc_size, + const size_t arg_logical_size = 0, + const Kokkos::Tools::SpaceHandle = + Kokkos::Tools::make_space_handle(name())) const; + public: /**\brief Return Name of the MemorySpace */ static constexpr const char *name() { return m_name; } - void fence(); + void fence() const; - int *rank_list; int allocation_mode; int64_t extent; @@ -98,8 +113,6 @@ class MPISpace { size_t get_num_pes(); size_t get_my_pe(); -size_t get_indexing_block_size(size_t size); -std::pair getRange(size_t size, size_t pe); } // namespace Experimental } // namespace Kokkos @@ -145,18 +158,43 @@ struct MemorySpaceAccess { enum { deepcopy = true }; }; +// MPI locality based on an MPI window and offset +typedef struct MPIAccessLocation { + mutable MPI_Win win; + size_t offset; + KOKKOS_INLINE_FUNCTION + MPIAccessLocation() { + win = MPI_WIN_NULL; + offset = 0; + } + + KOKKOS_INLINE_FUNCTION + MPIAccessLocation(MPI_Win win_, size_t offset_) { + win = win_; + offset = offset_; + } + + KOKKOS_INLINE_FUNCTION + void operator=(const MPIAccessLocation &val) { + win = val.win; + offset = val.offset; + } +} MPIAccessLocation; + } // namespace Impl } // namespace Kokkos +#include #include #include -#include #include #include #include #include +#include #include #include +#include #include #endif // #define KOKKOS_MPISPACE_HPP diff --git a/src/impl/mpispace/Kokkos_MPISpace_AllocationRecord.cpp b/src/impl/mpispace/Kokkos_MPISpace_AllocationRecord.cpp index 2220df65..55562167 100644 --- a/src/impl/mpispace/Kokkos_MPISpace_AllocationRecord.cpp +++ b/src/impl/mpispace/Kokkos_MPISpace_AllocationRecord.cpp @@ -22,39 +22,42 @@ namespace Kokkos { namespace Impl { -template -SharedAllocationRecord:: - SharedAllocationRecord( - const ExecutionSpace &execution_space, - const Kokkos::Experimental::MPISpace &arg_space, - const std::string &arg_label, const size_t arg_alloc_size, - const SharedAllocationRecord::function_type arg_dealloc) - // Pass through allocated [ SharedAllocationHeader , user_memory ] - // Pass through deallocation function - : SharedAllocationRecord( - execution_space, #ifdef KOKKOS_ENABLE_DEBUG - &SharedAllocationRecord::s_root_record, +SharedAllocationRecord + SharedAllocationRecord::s_root_record; #endif - reinterpret_cast(arg_space.allocate( - sizeof(SharedAllocationHeader) + arg_alloc_size)), - sizeof(SharedAllocationHeader) + arg_alloc_size, arg_dealloc, - arg_label), - m_space(arg_space) { -#if defined(KOKKOS_ENABLE_PROFILING) - if (Kokkos::Profiling::profileLibraryLoaded()) { - Kokkos::Profiling::allocateData( - Kokkos::Profiling::SpaceHandle(arg_space.name()), arg_label, data(), - arg_alloc_size); + +SharedAllocationRecord::~SharedAllocationRecord() { + m_space.deallocate(m_label.c_str(), + SharedAllocationRecord::m_alloc_ptr, + SharedAllocationRecord::m_alloc_size, + (SharedAllocationRecord::m_alloc_size - + sizeof(SharedAllocationHeader))); +} + +SharedAllocationHeader *_do_allocation( + Kokkos::Experimental::MPISpace const &space, std::string const &label, + size_t alloc_size) { + using MemAllocFailure = + 
Kokkos::Impl::Experimental::RemoteSpacesMemoryAllocationFailure; + try { + return reinterpret_cast( + space.allocate(alloc_size)); + } catch (MemAllocFailure const &failure) { + if (failure.failure_mode() == + MemAllocFailure::FailureMode::AllocationNotAligned) { + // TODO: delete the misaligned memory + } + + std::cerr << "Kokkos failed to allocate memory for label \"" << label + << "\". Allocation using MemorySpace named \"" << space.name() + << " failed with the following error: "; + failure.print_error_message(std::cerr); + std::cerr.flush(); + Kokkos::Impl::throw_runtime_exception("Memory allocation failure"); } -#endif - // Fill in the Header information - RecordBase::m_alloc_ptr->m_record = - static_cast *>(this); - strncpy(RecordBase::m_alloc_ptr->m_label, arg_label.c_str(), - SharedAllocationHeader::maximum_label_length); - win = m_space.current_win; + return nullptr; // unreachable } SharedAllocationRecord:: @@ -64,120 +67,34 @@ SharedAllocationRecord:: const SharedAllocationRecord::function_type arg_dealloc) // Pass through allocated [ SharedAllocationHeader , user_memory ] // Pass through deallocation function - : SharedAllocationRecord( + : base_t( #ifdef KOKKOS_ENABLE_DEBUG &SharedAllocationRecord::s_root_record, #endif - reinterpret_cast(arg_space.allocate( - sizeof(SharedAllocationHeader) + arg_alloc_size)), + Impl::checked_allocation_with_header(arg_space, arg_label, + arg_alloc_size), sizeof(SharedAllocationHeader) + arg_alloc_size, arg_dealloc, arg_label), m_space(arg_space) { -#if defined(KOKKOS_ENABLE_PROFILING) - if (Kokkos::Profiling::profileLibraryLoaded()) { - Kokkos::Profiling::allocateData( - Kokkos::Profiling::SpaceHandle(arg_space.name()), arg_label, data(), - arg_alloc_size); - } -#endif - // Fill in the Header information - RecordBase::m_alloc_ptr->m_record = - static_cast *>(this); - strncpy(RecordBase::m_alloc_ptr->m_label, arg_label.c_str(), - SharedAllocationHeader::maximum_label_length); + this->base_t::_fill_host_accessible_header_info(*RecordBase::m_alloc_ptr, + arg_label); win = m_space.current_win; } -SharedAllocationRecord::~SharedAllocationRecord() { -#if defined(KOKKOS_ENABLE_PROFILING) - if (Kokkos::Profiling::profileLibraryLoaded()) { - SharedAllocationHeader header; - Kokkos::Profiling::deallocateData( - Kokkos::Profiling::SpaceHandle( - Kokkos::Experimental::SHMEMSpace::name()), - header.m_label, data(), size()); - } -#endif - - m_space.deallocate(SharedAllocationRecord::m_alloc_ptr, - SharedAllocationRecord::m_alloc_size); -} - -SharedAllocationRecord - SharedAllocationRecord::s_root_record; - -void SharedAllocationRecord::deallocate( - SharedAllocationRecord *arg_rec) { - delete static_cast(arg_rec); -} - -void * -SharedAllocationRecord::allocate_tracked( - const Kokkos::Experimental::MPISpace &arg_space, - const std::string &arg_alloc_label, const size_t arg_alloc_size) { - if (!arg_alloc_size) return (void *)0; - - SharedAllocationRecord *const r = - allocate(arg_space, arg_alloc_label, arg_alloc_size); - RecordBase::increment(r); - return r->data(); -} - -void SharedAllocationRecord::deallocate_tracked(void *const - arg_alloc_ptr) { - if (arg_alloc_ptr != 0) { - SharedAllocationRecord *const r = get_record(arg_alloc_ptr); - RecordBase::decrement(r); - } -} - -void *SharedAllocationRecord:: - reallocate_tracked(void *const arg_alloc_ptr, const size_t arg_alloc_size) { - SharedAllocationRecord *const r_old = get_record(arg_alloc_ptr); - SharedAllocationRecord *const r_new = - allocate(r_old->m_space, r_old->get_label(), arg_alloc_size); - - 
Kokkos::Impl::DeepCopy( - r_new->data(), r_old->data(), r_new->size()); - - RecordBase::increment(r_new); - RecordBase::decrement(r_old); - - return r_new->data(); -} +} // namespace Impl +} // namespace Kokkos -SharedAllocationRecord - *SharedAllocationRecord::get_record( - void *alloc_ptr) { - typedef SharedAllocationHeader Header; - typedef SharedAllocationRecord - RecordHost; +#define KOKKOS_IMPL_PUBLIC_INCLUDE - SharedAllocationHeader const *const head = - alloc_ptr ? Header::get_header(alloc_ptr) : (SharedAllocationHeader *)0; - RecordHost *const record = - head ? static_cast(head->m_record) : (RecordHost *)0; +#include - if (!alloc_ptr || record->m_alloc_ptr != head) { - Kokkos::Impl::throw_runtime_exception(std::string( - "Kokkos::Impl::SharedAllocationRecord< Kokkos::Experimental::MPISpace " - ", void >::get_record ERROR")); - } +namespace Kokkos { +namespace Impl { - return record; -} +template class SharedAllocationRecordCommon; -// Iterate records to print orphaned memory ... -void SharedAllocationRecord:: - print_records(std::ostream &s, const Kokkos::Experimental::MPISpace &, - bool detail) { - SharedAllocationRecord::print_host_accessible_records( - s, "MPISpace", &s_root_record, detail); -} +#undef KOKKOS_IMPL_PUBLIC_INCLUDE } // namespace Impl } // namespace Kokkos diff --git a/src/impl/mpispace/Kokkos_MPISpace_AllocationRecord.hpp b/src/impl/mpispace/Kokkos_MPISpace_AllocationRecord.hpp index 70b5970b..5c1c9058 100644 --- a/src/impl/mpispace/Kokkos_MPISpace_AllocationRecord.hpp +++ b/src/impl/mpispace/Kokkos_MPISpace_AllocationRecord.hpp @@ -15,87 +15,82 @@ // Contact: Jan Ciesko (jciesko@sandia.gov) // //@HEADER -#ifndef KOKKOS_MPI_ALLOCREC_HPP -#define KOKKOS_MPI_ALLOCREC_HPP -#include +#ifndef KOKKOS_REMOTESPACES_MPI_ALLOCREC_HPP +#define KOKKOS_REMOTESPACES_MPI_ALLOCREC_HPP -/*--------------------------------------------------------------------------*/ +#include namespace Kokkos { namespace Impl { template <> class SharedAllocationRecord - : public SharedAllocationRecord { + : public SharedAllocationRecordCommon { private: friend Kokkos::Experimental::MPISpace; + friend class SharedAllocationRecordCommon; - typedef SharedAllocationRecord RecordBase; - - SharedAllocationRecord(const SharedAllocationRecord &) = delete; - SharedAllocationRecord &operator=(const SharedAllocationRecord &) = delete; + using base_t = SharedAllocationRecordCommon; + using RecordBase = SharedAllocationRecord; - static void deallocate(RecordBase *); + SharedAllocationRecord(const SharedAllocationRecord&) = delete; + SharedAllocationRecord& operator=(const SharedAllocationRecord&) = delete; - /**\brief Root record for tracked allocations from this MPISpace instance */ +#ifdef KOKKOS_ENABLE_DEBUG + /**\brief Root record for tracked allocations from this HostSpace instance */ static RecordBase s_root_record; +#endif + + const Kokkos::Experimental::MPISpace m_space; protected: ~SharedAllocationRecord(); SharedAllocationRecord() = default; + // This constructor does not forward to the one without exec_space arg + // in order to work around https://github.com/kokkos/kokkos/issues/5258 + // This constructor is templated so I can't just put it into the cpp file + // like the other constructor. 
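// The templated constructor below mirrors the non-templated one: it ignores the
// execution-space argument and delegates to the SharedAllocationRecordCommon
// base, allocating header plus payload via checked_allocation_with_header() and
// then filling in the host-accessible header label.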
template SharedAllocationRecord( - const ExecutionSpace &execution_space, - const Kokkos::Experimental::MPISpace &arg_space, - const std::string &arg_label, const size_t arg_alloc_size, - const RecordBase::function_type arg_dealloc = &deallocate); + const ExecutionSpace& /* exec_space*/, + const Kokkos::Experimental::MPISpace& arg_space, + const std::string& arg_label, const size_t arg_alloc_size, + const RecordBase::function_type arg_dealloc = &deallocate) + : base_t( +#ifdef KOKKOS_ENABLE_DEBUG + &SharedAllocationRecord::s_root_record, +#endif + Impl::checked_allocation_with_header(arg_space, arg_label, + arg_alloc_size), + sizeof(SharedAllocationHeader) + arg_alloc_size, arg_dealloc, + arg_label), + m_space(arg_space) { + this->base_t::_fill_host_accessible_header_info(*RecordBase::m_alloc_ptr, + arg_label); + } SharedAllocationRecord( - const Kokkos::Experimental::MPISpace &arg_space, - const std::string &arg_label, const size_t arg_alloc_size, + const Kokkos::Experimental::MPISpace& arg_space, + const std::string& arg_label, const size_t arg_alloc_size, const RecordBase::function_type arg_dealloc = &deallocate); public: - const Kokkos::Experimental::MPISpace m_space; - MPI_Win win; - inline std::string get_label() const { - return std::string(RecordBase::head()->m_label); + KOKKOS_INLINE_FUNCTION static SharedAllocationRecord* allocate( + const Kokkos::Experimental::MPISpace& arg_space, + const std::string& arg_label, const size_t arg_alloc_size) { + KOKKOS_IF_ON_HOST((return new SharedAllocationRecord(arg_space, arg_label, + arg_alloc_size);)) + KOKKOS_IF_ON_DEVICE(((void)arg_space; (void)arg_label; (void)arg_alloc_size; + return nullptr;)) } - - KOKKOS_INLINE_FUNCTION static SharedAllocationRecord *allocate( - const Kokkos::Experimental::MPISpace &arg_space, - const std::string &arg_label, const size_t arg_alloc_size) { -#if defined(KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST) - return new SharedAllocationRecord(arg_space, arg_label, arg_alloc_size); -#else - return (SharedAllocationRecord *)0; -#endif - } - - /**\brief Allocate tracked memory in the space */ - static void *allocate_tracked(const Kokkos::Experimental::MPISpace &arg_space, - const std::string &arg_label, - const size_t arg_alloc_size); - - /**\brief Reallocate tracked memory in the space */ - static void *reallocate_tracked(void *const arg_alloc_ptr, - const size_t arg_alloc_size); - - /**\brief Deallocate tracked memory in the space */ - static void deallocate_tracked(void *const arg_alloc_ptr); - - static SharedAllocationRecord *get_record(void *arg_alloc_ptr); - - static void print_records(std::ostream &, - const Kokkos::Experimental::MPISpace &, - bool detail = false); }; } // namespace Impl } // namespace Kokkos -#endif // KOKKOS_MPI_ALLOCREC_HPP +#endif // KOKKOS_REMOTESPACES_MPI_ALLOCREC_HPP diff --git a/src/impl/mpispace/Kokkos_MPISpace_BlockOps.hpp b/src/impl/mpispace/Kokkos_MPISpace_BlockOps.hpp new file mode 100644 index 00000000..69f395f7 --- /dev/null +++ b/src/impl/mpispace/Kokkos_MPISpace_BlockOps.hpp @@ -0,0 +1,116 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +// Contact: Jan Ciesko (jciesko@sandia.gov) +// +//@HEADER + +#ifndef KOKKOS_REMOTESPACES_MPISPACE_BLOCK_OPS_HPP +#define KOKKOS_REMOTESPACES_MPISPACE_BLOCK_OPS_HPP + +#include +#include + +namespace Kokkos { +namespace Impl { + +#define KOKKOS_REMOTESPACES_PUT(type, mpi_type) \ + static KOKKOS_INLINE_FUNCTION void mpi_block_type_put( \ + const type *ptr, const size_t offset, const size_t nelems, const int pe, \ + const MPI_Win &win) { \ + assert(win != MPI_WIN_NULL); \ + int _typesize; \ + MPI_Request request; \ + MPI_Type_size(mpi_type, &_typesize); \ + const void *src_adr = ptr + offset; \ + size_t win_offset = sizeof(SharedAllocationHeader) + offset * _typesize; \ + MPI_Rput(src_adr, nelems, mpi_type, pe, win_offset, nelems, mpi_type, win, \ + &request); \ + MPI_Wait(&request, MPI_STATUS_IGNORE); \ + } + +KOKKOS_REMOTESPACES_PUT(char, MPI_SIGNED_CHAR) +KOKKOS_REMOTESPACES_PUT(unsigned char, MPI_UNSIGNED_CHAR) +KOKKOS_REMOTESPACES_PUT(short, MPI_SHORT) +KOKKOS_REMOTESPACES_PUT(unsigned short, MPI_UNSIGNED_SHORT) +KOKKOS_REMOTESPACES_PUT(int, MPI_INT) +KOKKOS_REMOTESPACES_PUT(unsigned int, MPI_UNSIGNED) +KOKKOS_REMOTESPACES_PUT(long, MPI_INT64_T) +KOKKOS_REMOTESPACES_PUT(long long, MPI_LONG_LONG) +KOKKOS_REMOTESPACES_PUT(unsigned long long, MPI_UNSIGNED_LONG_LONG) +KOKKOS_REMOTESPACES_PUT(unsigned long, MPI_UNSIGNED_LONG) +KOKKOS_REMOTESPACES_PUT(float, MPI_FLOAT) +KOKKOS_REMOTESPACES_PUT(double, MPI_DOUBLE) + +#undef KOKKOS_REMOTESPACES_PUT + +#define KOKKOS_REMOTESPACES_GET(type, mpi_type) \ + static KOKKOS_INLINE_FUNCTION void mpi_block_type_get( \ + type *ptr, const size_t offset, const size_t nelems, const int pe, \ + const MPI_Win &win) { \ + assert(win != MPI_WIN_NULL); \ + int _typesize; \ + MPI_Request request; \ + MPI_Type_size(mpi_type, &_typesize); \ + void *dst_adr = ptr + offset; \ + size_t win_offset = sizeof(SharedAllocationHeader) + offset * _typesize; \ + MPI_Rget(dst_adr, nelems, mpi_type, pe, win_offset, nelems, mpi_type, win, \ + &request); \ + MPI_Wait(&request, MPI_STATUS_IGNORE); \ + } + +KOKKOS_REMOTESPACES_GET(char, MPI_SIGNED_CHAR) +KOKKOS_REMOTESPACES_GET(unsigned char, MPI_UNSIGNED_CHAR) +KOKKOS_REMOTESPACES_GET(short, MPI_SHORT) +KOKKOS_REMOTESPACES_GET(unsigned short, MPI_UNSIGNED_SHORT) +KOKKOS_REMOTESPACES_GET(int, MPI_INT) +KOKKOS_REMOTESPACES_GET(unsigned int, MPI_UNSIGNED) +KOKKOS_REMOTESPACES_GET(long, MPI_INT64_T) +KOKKOS_REMOTESPACES_GET(long long, MPI_LONG_LONG) +KOKKOS_REMOTESPACES_GET(unsigned long long, MPI_UNSIGNED_LONG_LONG) +KOKKOS_REMOTESPACES_GET(unsigned long, MPI_UNSIGNED_LONG) +KOKKOS_REMOTESPACES_GET(float, MPI_FLOAT) +KOKKOS_REMOTESPACES_GET(double, MPI_DOUBLE) + +#undef KOKKOS_REMOTESPACES_GET + +template +struct MPIBlockDataElement {}; + +// Atomic Operators +template +struct MPIBlockDataElement { + const MPI_Win win; + T *ptr; + int offset; + int pe; + size_t nelems; + typedef const T const_value_type; + typedef T non_const_value_type; + + KOKKOS_INLINE_FUNCTION + MPIBlockDataElement(T *ptr_, MPI_Win win_, int pe_, size_t i_, size_t size_) + : win(win_), ptr(ptr_), offset(i_), pe(pe_), nelems(size_) {} + + KOKKOS_INLINE_FUNCTION + void put() const { mpi_block_type_put(ptr, offset, nelems, pe, win); } + + KOKKOS_INLINE_FUNCTION + void get() const { mpi_block_type_get(ptr, offset, nelems, pe, win); } +}; + +} // namespace Impl +} // namespace Kokkos + +#endif // KOKKOS_REMOTESPACES_MPISPACE_BLOCK_OPS_HPP diff --git a/src/impl/mpispace/Kokkos_MPISpace_DataHandle.hpp 
b/src/impl/mpispace/Kokkos_MPISpace_DataHandle.hpp index c69d65a9..a54f99d5 100644 --- a/src/impl/mpispace/Kokkos_MPISpace_DataHandle.hpp +++ b/src/impl/mpispace/Kokkos_MPISpace_DataHandle.hpp @@ -25,34 +25,60 @@ namespace Impl { template struct MPIDataHandle { T *ptr; - mutable MPI_Win win; - size_t win_offset; + MPIAccessLocation loc; KOKKOS_INLINE_FUNCTION - MPIDataHandle() : ptr(NULL), win(MPI_WIN_NULL), win_offset(0) {} + MPIDataHandle() : ptr(NULL), loc(MPI_WIN_NULL, 0) {} KOKKOS_INLINE_FUNCTION - MPIDataHandle(T *ptr_, MPI_Win &win_, size_t offset_ = 0) - : ptr(ptr_), win(win_), win_offset(offset_) {} + MPIDataHandle(T *ptr_, MPI_Win win_ = MPI_WIN_NULL, size_t offset_ = 0) + : ptr(ptr_ + offset_), loc(win_, offset_) {} KOKKOS_INLINE_FUNCTION MPIDataHandle(MPIDataHandle const &arg) - : ptr(arg.ptr), win(arg.win), win_offset(arg.win_offset) {} - - template - KOKKOS_INLINE_FUNCTION MPIDataHandle(SrcTraits const &arg) - : ptr(arg.ptr), win(arg.win), win_offset(arg.win_offset) {} + : ptr(arg.ptr), loc(arg.loc) {} template KOKKOS_INLINE_FUNCTION MPIDataElement operator()( const int &pe, const iType &i) const { - assert(win != MPI_WIN_NULL); - MPIDataElement element(&win, pe, i + win_offset); + assert(loc.win != MPI_WIN_NULL); + MPIDataElement element(&loc.win, pe, i + loc.offset); return element; } KOKKOS_INLINE_FUNCTION - T *operator+(size_t &offset) const { return ptr + offset; } + MPIDataHandle operator+(size_t &offset) { + return MPIDataHandle(ptr += offset, loc.offset += offset); + } +}; + +template +struct BlockDataHandle { + T *ptr; + MPIAccessLocation loc; + size_t pe; + size_t elems; + + KOKKOS_INLINE_FUNCTION + BlockDataHandle(T *ptr_, MPI_Win win_, size_t offset_, size_t elems_, + size_t pe_) + : ptr(ptr_), loc(win_, offset_), elems(elems_), pe(pe_) {} + + KOKKOS_INLINE_FUNCTION + BlockDataHandle(BlockDataHandle const &arg) + : ptr(arg.ptr), loc(arg.loc), elems(arg.elems), pe(arg.pe) {} + + KOKKOS_INLINE_FUNCTION + void get() { + MPIBlockDataElement element(ptr, loc.win, pe, loc.offset, elems); + element.get(); + } + + KOKKOS_INLINE_FUNCTION + void put() { + MPIBlockDataElement element(ptr, loc.win, pe, loc.offset, elems); + element.put(); + } }; template @@ -83,6 +109,12 @@ struct ViewDataHandle< return handle_type(arg_data_ptr + offset); } + template + KOKKOS_INLINE_FUNCTION static handle_type assign( + SrcHandleType const arg_data_ptr, MPI_Win win, size_t offset) { + return handle_type(arg_data_ptr.ptr, win, offset); + } + template KOKKOS_INLINE_FUNCTION static handle_type assign( SrcHandleType const arg_data_ptr) { diff --git a/src/impl/nvshmemspace/Kokkos_NVSHMEMSpace.cpp b/src/impl/nvshmemspace/Kokkos_NVSHMEMSpace.cpp index 37521436..23ce01dd 100644 --- a/src/impl/nvshmemspace/Kokkos_NVSHMEMSpace.cpp +++ b/src/impl/nvshmemspace/Kokkos_NVSHMEMSpace.cpp @@ -32,7 +32,23 @@ void NVSHMEMSpace::impl_set_allocation_mode(const int allocation_mode_) { void NVSHMEMSpace::impl_set_extent(const int64_t extent_) { extent = extent_; } -void *NVSHMEMSpace::allocate(const size_t arg_alloc_size) const { +void *SHMEMSpace::allocate(const size_t arg_alloc_size) const { + return allocate("[unlabeled]", arg_alloc_size); +} + +void *SHMEMSpace::allocate(const char *arg_label, const size_t arg_alloc_size, + const size_t + + arg_logical_size) const { + return impl_allocate(arg_label, arg_alloc_size, arg_logical_size); +} + +void *NVSHMEMSpace::impl_allocate( + const char *arg_label, const size_t arg_alloc_size, + const size_t arg_logical_size, + const Kokkos::Tools::SpaceHandle arg_handle) 
const { + const size_t reported_size = + (arg_logical_size > 0) ? arg_logical_size : arg_alloc_size; static_assert(sizeof(void *) == sizeof(uintptr_t), "Error sizeof(void*) != sizeof(uintptr_t)"); @@ -40,21 +56,90 @@ void *NVSHMEMSpace::allocate(const size_t arg_alloc_size) const { Kokkos::Impl::is_integral_power_of_two(Kokkos::Impl::MEMORY_ALIGNMENT), "Memory alignment must be power of two"); - void *ptr = 0; + constexpr uintptr_t alignment = Kokkos::Impl::MEMORY_ALIGNMENT; + constexpr uintptr_t alignment_mask = alignment - 1; + + void *ptr = nullptr; + if (arg_alloc_size) { + // Over-allocate to and round up to guarantee proper alignment. + size_t size_padded = arg_alloc_size + sizeof(void *) + alignment; + if (allocation_mode == Kokkos::Experimental::Symmetric) { int num_pes = nvshmem_n_pes(); int my_id = nvshmem_my_pe(); ptr = nvshmem_malloc(arg_alloc_size); } else { - Kokkos::abort("NVSHMEMSpace only supports symmetric allocation policy."); + Kokkos::abort("SHMEMSpace only supports symmetric allocation policy."); } + + if (ptr) { + auto address = reinterpret_cast(ptr); + + // offset enough to record the alloc_ptr + address += sizeof(void *); + uintptr_t rem = address % alignment; + uintptr_t offset = rem ? (alignment - rem) : 0u; + address += offset; + ptr = reinterpret_cast(address); + // record the alloc'd pointer + address -= sizeof(void *); + *reinterpret_cast(address) = ptr; + } + } + + using MemAllocFailure = + Kokkos::Impl::Experimental::RemoteSpacesMemoryAllocationFailure; + using MemAllocFailureMode = Kokkos::Impl::Experimental:: + RemoteSpacesMemoryAllocationFailure::FailureMode; + + if ((ptr == nullptr) || (reinterpret_cast(ptr) == ~uintptr_t(0)) || + (reinterpret_cast(ptr) & alignment_mask)) { + MemAllocFailureMode failure_mode = + MemAllocFailureMode::AllocationNotAligned; + if (ptr == nullptr) { + failure_mode = MemAllocFailureMode::OutOfMemoryError; + } + + MemAllocFailure::AllocationMechanism alloc_mec = + MemAllocFailure::AllocationMechanism::NVSHMEMMALLOC; + throw MemAllocFailure(arg_alloc_size, alignment, failure_mode, alloc_mec); + } + + if (Kokkos::Profiling::profileLibraryLoaded()) { + Kokkos::Profiling::allocateData(arg_handle, arg_label, ptr, reported_size); } return ptr; } -void NVSHMEMSpace::deallocate(void *const arg_alloc_ptr, const size_t) const { - nvshmem_free(arg_alloc_ptr); +void NVSHMEMSpace::deallocate(void *const arg_alloc_ptr, + const size_t arg_alloc_size) const { + deallocate("[unlabeled]", arg_alloc_ptr, arg_alloc_size); +} + +void NVSHMEMSpace::deallocate(const char *arg_label, void *const arg_alloc_ptr, + const size_t arg_alloc_size, + const size_t + + arg_logical_size) const { + impl_deallocate(arg_label, arg_alloc_ptr, arg_alloc_size, arg_logical_size); +} + +void NVSHMEMSpace::impl_deallocate( + const char *arg_label, void *const arg_alloc_ptr, + const size_t arg_alloc_size, const size_t arg_logical_size, + const Kokkos::Tools::SpaceHandle arg_handle) const { + if (arg_alloc_ptr) { + Kokkos::fence("HostSpace::impl_deallocate before free"); + fence(); + size_t reported_size = + (arg_logical_size > 0) ? 
arg_logical_size : arg_alloc_size; + if (Kokkos::Profiling::profileLibraryLoaded()) { + Kokkos::Profiling::deallocateData(arg_handle, arg_label, arg_alloc_ptr, + reported_size); + } + nvshmem_free(arg_alloc_ptr); + } } void NVSHMEMSpace::fence() { @@ -63,39 +148,10 @@ void NVSHMEMSpace::fence() { } KOKKOS_FUNCTION -size_t get_num_pes() { return nvshmem_n_pes(); } +int get_num_pes() { return nvshmem_n_pes(); } KOKKOS_FUNCTION -size_t get_my_pe() { return nvshmem_my_pe(); } - -KOKKOS_FUNCTION -size_t get_indexing_block_size(size_t size) { - size_t num_pes, block; - num_pes = get_num_pes(); - block = (size + num_pes - 1) / num_pes; - return block; -} - -std::pair getRange(size_t size, size_t pe) { - size_t start, end; - size_t block = get_indexing_block_size(size); - start = pe * block; - end = (pe + 1) * block; - - size_t num_pes = get_num_pes(); - - if (size < num_pes) { - size_t diff = (num_pes * block) - size; - if (pe > num_pes - 1 - diff) end--; - } else { - if (pe == num_pes - 1) { - size_t diff = size - (num_pes - 1) * block; - end = start + diff; - } - end--; - } - return std::make_pair(start, end); -} +int get_my_pe() { return nvshmem_my_pe(); } } // namespace Experimental diff --git a/src/impl/nvshmemspace/Kokkos_NVSHMEMSpace.hpp b/src/impl/nvshmemspace/Kokkos_NVSHMEMSpace.hpp index 7183389d..761e9a06 100644 --- a/src/impl/nvshmemspace/Kokkos_NVSHMEMSpace.hpp +++ b/src/impl/nvshmemspace/Kokkos_NVSHMEMSpace.hpp @@ -30,7 +30,6 @@ #include #include #include -/*--------------------------------------------------------------------------*/ namespace Kokkos { namespace Experimental { @@ -58,15 +57,32 @@ class NVSHMEMSpace { explicit NVSHMEMSpace(const MPI_Comm &); + /**\brief Allocate untracked memory in the space */ void *allocate(const size_t arg_alloc_size) const; + void *allocate(const char *arg_label, const size_t arg_alloc_size, + const size_t arg_logical_size = 0) const; + /**\brief Deallocate untracked memory in the space */ void deallocate(void *const arg_alloc_ptr, const size_t arg_alloc_size) const; + void deallocate(const char *arg_label, void *const arg_alloc_ptr, + const size_t arg_alloc_size, + const size_t arg_logical_size = 0) const; - void *allocate(const int *gids, const int &arg_local_alloc_size) const; - - void deallocate(const int *gids, void *const arg_alloc_ptr, - const size_t arg_alloc_size) const; + private: + template + friend class Kokkos::Experimental::LogicalMemorySpace; + + void *impl_allocate(const char *arg_label, const size_t arg_alloc_size, + const size_t arg_logical_size = 0, + const Kokkos::Tools::SpaceHandle = + Kokkos::Tools::make_space_handle(name())) const; + void impl_deallocate(const char *arg_label, void *const arg_alloc_ptr, + const size_t arg_alloc_size, + const size_t arg_logical_size = 0, + const Kokkos::Tools::SpaceHandle = + Kokkos::Tools::make_space_handle(name())) const; + public: /**\brief Return Name of the MemorySpace */ static constexpr const char *name() { return m_name; } @@ -85,12 +101,9 @@ class NVSHMEMSpace { }; KOKKOS_FUNCTION -size_t get_num_pes(); +int get_num_pes(); KOKKOS_FUNCTION -size_t get_my_pe(); -KOKKOS_FUNCTION -size_t get_indexing_block_size(size_t size); -std::pair getRange(size_t size, size_t pe); +int get_my_pe(); } // namespace Experimental } // namespace Kokkos @@ -142,15 +155,17 @@ struct MemorySpaceAccess #include #include -#include #include #include #include #include +#include #include #include +#include #include #endif // #define KOKKOS_NVSHMEMSPACE_HPP diff --git 
a/src/impl/nvshmemspace/Kokkos_NVSHMEMSpace_AllocationRecord.hpp b/src/impl/nvshmemspace/Kokkos_NVSHMEMSpace_AllocationRecord.hpp index 6a0af81f..76a227fc 100644 --- a/src/impl/nvshmemspace/Kokkos_NVSHMEMSpace_AllocationRecord.hpp +++ b/src/impl/nvshmemspace/Kokkos_NVSHMEMSpace_AllocationRecord.hpp @@ -16,8 +16,8 @@ // //@HEADER -#ifndef KOKKOS_NVSHMEM_ALLOCREC_HPP -#define KOKKOS_NVSHMEM_ALLOCREC_HPP +#ifndef KOKKOS_REMOTESPACES_NVSHMEM_ALLOCREC_HPP +#define KOKKOS_REMOTESPACES_NVSHMEM_ALLOCREC_HPP #include @@ -99,4 +99,4 @@ class SharedAllocationRecord } // namespace Impl } // namespace Kokkos -#endif // KOKKOS_NVSHMEM_ALLOCREC_HPP +#endif // KOKKOS_REMOTESPACES_NVSHMEM_ALLOCREC_HPP diff --git a/src/impl/nvshmemspace/Kokkos_NVSHMEMSpace_BlockOps.hpp b/src/impl/nvshmemspace/Kokkos_NVSHMEMSpace_BlockOps.hpp new file mode 100644 index 00000000..72405af8 --- /dev/null +++ b/src/impl/nvshmemspace/Kokkos_NVSHMEMSpace_BlockOps.hpp @@ -0,0 +1,97 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +// Contact: Jan Ciesko (jciesko@sandia.gov) +// +//@HEADER + +#ifndef KOKKOS_REMOTESPACES_NVSHMEM_BLOCK_OPS_HPP +#define KOKKOS_REMOTESPACES_NVSHMEM_BLOCK_OPS_HPP + +#include +#include + +namespace Kokkos { +namespace Impl { + +#define KOKKOS_REMOTESPACES_PUT(type, op) \ + static KOKKOS_INLINE_FUNCTION void shmem_block_type_put( \ + type *dst, const type *src, size_t nelems, int pe) { \ + op(dst, src, nelems, pe); \ + } + +KOKKOS_REMOTESPACES_PUT(char, nvshmem_char_put) +KOKKOS_REMOTESPACES_PUT(unsigned char, nvshmem_uchar_put) +KOKKOS_REMOTESPACES_PUT(short, nvshmem_short_put) +KOKKOS_REMOTESPACES_PUT(unsigned short, nvshmem_ushort_put) +KOKKOS_REMOTESPACES_PUT(int, nvshmem_int_put) +KOKKOS_REMOTESPACES_PUT(unsigned int, nvshmem_uint_put) +KOKKOS_REMOTESPACES_PUT(long, nvshmem_long_put) +KOKKOS_REMOTESPACES_PUT(unsigned long, nvshmem_ulong_put) +KOKKOS_REMOTESPACES_PUT(long long, nvshmem_longlong_put) +KOKKOS_REMOTESPACES_PUT(unsigned long long, nvshmem_ulonglong_put) +KOKKOS_REMOTESPACES_PUT(float, nvshmem_float_put) +KOKKOS_REMOTESPACES_PUT(double, nvshmem_double_put) + +#undef KOKKOS_REMOTESPACES_PUT + +#define KOKKOS_REMOTESPACES_GET(type, op) \ + static KOKKOS_INLINE_FUNCTION void shmem_block_type_get( \ + type *dst, const type *src, size_t nelems, int pe) { \ + op(dst, src, nelems, pe); \ + } + +KOKKOS_REMOTESPACES_GET(char, nvshmem_char_get) +KOKKOS_REMOTESPACES_GET(unsigned char, nvshmem_uchar_get) +KOKKOS_REMOTESPACES_GET(short, nvshmem_short_get) +KOKKOS_REMOTESPACES_GET(unsigned short, nvshmem_ushort_get) +KOKKOS_REMOTESPACES_GET(int, nvshmem_int_get) +KOKKOS_REMOTESPACES_GET(unsigned int, nvshmem_uint_get) +KOKKOS_REMOTESPACES_GET(long, nvshmem_long_get) +KOKKOS_REMOTESPACES_GET(unsigned long, nvshmem_ulong_get) +KOKKOS_REMOTESPACES_GET(long long, nvshmem_longlong_get) +KOKKOS_REMOTESPACES_GET(unsigned long long, nvshmem_ulonglong_get) +KOKKOS_REMOTESPACES_GET(float, nvshmem_float_get) +KOKKOS_REMOTESPACES_GET(double, nvshmem_double_get) + +#undef KOKKOS_REMOTESPACES_GET + +template +struct NVSHMEMBlockDataElement {}; 
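// Usage sketch (illustrative only, not part of the patch): each macro above
// expands to one typed overload of shmem_block_type_put()/get(), so a
// contiguous block transfer picks the matching nvshmem_<type>_put/get call by
// ordinary overload resolution. Assuming `sym` is a symmetric allocation and
// `pe` a valid processing element:
//
//   double local[N];
//   shmem_block_type_get(local, sym, N, pe);  // resolves to nvshmem_double_get
//   shmem_block_type_put(sym, local, N, pe);  // resolves to nvshmem_double_put
//
// The NVSHMEMBlockDataElement specialization below bundles exactly these
// arguments (dst, src, nelems, pe) behind its get()/put() members.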
+ +// Atomic Operators +template +struct NVSHMEMBlockDataElement { + typedef const T const_value_type; + typedef T non_const_value_type; + T *src; + T *dst; + size_t nelems; + int pe; + + KOKKOS_INLINE_FUNCTION + NVSHMEMBlockDataElement(T *src_, T *dst_, size_t size_, int pe_) + : src(src_), dst(dst_), nelems(size_), pe(pe_) {} + + KOKKOS_INLINE_FUNCTION + void put() const { shmem_block_type_put(dst, src, nelems, pe); } + + KOKKOS_INLINE_FUNCTION + void get() const { shmem_block_type_get(dst, src, nelems, pe); } +}; + +} // namespace Impl +} // namespace Kokkos + +#endif // KOKKOS_REMOTESPACES_NVSHMEM_BLOCK_OPS_HPP diff --git a/src/impl/nvshmemspace/Kokkos_NVSHMEMSpace_DataHandle.hpp b/src/impl/nvshmemspace/Kokkos_NVSHMEMSpace_DataHandle.hpp index e0cc7393..93d13d91 100644 --- a/src/impl/nvshmemspace/Kokkos_NVSHMEMSpace_DataHandle.hpp +++ b/src/impl/nvshmemspace/Kokkos_NVSHMEMSpace_DataHandle.hpp @@ -35,10 +35,6 @@ struct NVSHMEMDataHandle { KOKKOS_INLINE_FUNCTION NVSHMEMDataHandle(NVSHMEMDataHandle const &arg) : ptr(arg.ptr) {} - template - KOKKOS_INLINE_FUNCTION NVSHMEMDataHandle(SrcTraits const &arg) - : ptr(arg.ptr) {} - template KOKKOS_INLINE_FUNCTION NVSHMEMDataElement operator()( const int &pe, const iType &i) const { @@ -50,6 +46,38 @@ struct NVSHMEMDataHandle { T *operator+(size_t &offset) const { return ptr + offset; } }; +template +struct BlockDataHandle { + T *src; + T *dst; + size_t elems; + int pe; + + KOKKOS_INLINE_FUNCTION + BlockDataHandle(T *src_, T *dst_, size_t elems_, int pe_) + : src(src_), dst(dst_), elems(elems_), pe(pe_) {} + + KOKKOS_INLINE_FUNCTION + BlockDataHandle(BlockDataHandle const &arg) + : src(arg.src), dst(arg.dst), elems(arg.elems), pe(arg.pe_) {} + + template + KOKKOS_INLINE_FUNCTION BlockDataHandle(SrcTraits const &arg) + : src(arg.src), dst(arg.dst), elems(arg.elems), pe(arg.pe_) {} + + KOKKOS_INLINE_FUNCTION + void get() { + NVSHMEMBlockDataElement element(src, dst, elems, pe); + element.get(); + } + + KOKKOS_INLINE_FUNCTION + void put() { + NVSHMEMBlockDataElement element(src, dst, elems, pe); + element.put(); + } +}; + template struct ViewDataHandle< Traits, typename std::enable_if_t +#include + +namespace Kokkos { +namespace Impl { + +#define KOKKOS_REMOTESPACES_PUT(type, op) \ + static KOKKOS_INLINE_FUNCTION void shmem_block_type_put( \ + type *dst, const type *src, size_t nelems, int pe) { \ + op(dst, src, nelems, pe); \ + } + +KOKKOS_REMOTESPACES_PUT(char, nvshmem_char_put) +KOKKOS_REMOTESPACES_PUT(unsigned char, nvshmem_uchar_put) +KOKKOS_REMOTESPACES_PUT(short, nvshmem_short_put) +KOKKOS_REMOTESPACES_PUT(unsigned short, nvshmem_ushort_put) +KOKKOS_REMOTESPACES_PUT(int, nvshmem_int_put) +KOKKOS_REMOTESPACES_PUT(unsigned int, nvshmem_uint_put) +KOKKOS_REMOTESPACES_PUT(long, nvshmem_long_put) +KOKKOS_REMOTESPACES_PUT(unsigned long, nvshmem_ulong_put) +KOKKOS_REMOTESPACES_PUT(long long, nvshmem_longlong_put) +KOKKOS_REMOTESPACES_PUT(unsigned long long, nvshmem_ulonglong_put) +KOKKOS_REMOTESPACES_PUT(float, nvshmem_float_put) +KOKKOS_REMOTESPACES_PUT(double, nvshmem_double_put) + +#undef KOKKOS_REMOTESPACES_PUT + +#define KOKKOS_REMOTESPACES_GET(type, op) \ + static KOKKOS_INLINE_FUNCTION void shmem_block_type_get( \ + type *dst, const type *src, size_t nelems, int pe) { \ + op(dst, src, nelems, pe); \ + } + +KOKKOS_REMOTESPACES_GET(char, nvshmem_char_get) +KOKKOS_REMOTESPACES_GET(unsigned char, nvshmem_uchar_get) +KOKKOS_REMOTESPACES_GET(short, nvshmem_short_get) +KOKKOS_REMOTESPACES_GET(unsigned short, nvshmem_ushort_get) 
+KOKKOS_REMOTESPACES_GET(int, nvshmem_int_get) +KOKKOS_REMOTESPACES_GET(unsigned int, nvshmem_uint_get) +KOKKOS_REMOTESPACES_GET(long, nvshmem_long_get) +KOKKOS_REMOTESPACES_GET(unsigned long, nvshmem_ulong_get) +KOKKOS_REMOTESPACES_GET(long long, nvshmem_longlong_get) +KOKKOS_REMOTESPACES_GET(unsigned long long, nvshmem_ulonglong_get) +KOKKOS_REMOTESPACES_GET(float, nvshmem_float_get) +KOKKOS_REMOTESPACES_GET(double, nvshmem_double_get) + +#undef KOKKOS_REMOTESPACES_GET + +template +struct NVSHMEMBlockDataElement {}; + +// Atomic Operators +template +struct NVSHMEMBlockDataElement { + typedef const T const_value_type; + typedef T non_const_value_type; + T *src; + T *dst; + size_t nelems; + int pe; + + KOKKOS_INLINE_FUNCTION + NVSHMEMBlockDataElement(T *src_, T *dst_, size_t size_, int pe_) + : src(src_), dst(dst_), nelems(size_), pe(pe_) {} + + KOKKOS_INLINE_FUNCTION + void put() const { shmem_block_type_put(dst, src, nelems, pe); } + + KOKKOS_INLINE_FUNCTION + void get() const { shmem_block_type_get(dst, src, nelems, pe); } +}; + +} // namespace Impl +} // namespace Kokkos + +#endif // KOKKOS_REMOTESPACES_NVSHMEM_BLOCK_OPS_HPP diff --git a/src/impl/rocshmemspace/Kokkos_ROCSHMEMSpace.cpp b/src/impl/rocshmemspace/Kokkos_ROCSHMEMSpace.cpp index cf540b9c..1663e782 100644 --- a/src/impl/rocshmemspace/Kokkos_ROCSHMEMSpace.cpp +++ b/src/impl/rocshmemspace/Kokkos_ROCSHMEMSpace.cpp @@ -32,7 +32,23 @@ void ROCSHMEMSpace::impl_set_allocation_mode(const int allocation_mode_) { void ROCSHMEMSpace::impl_set_extent(const int64_t extent_) { extent = extent_; } -void *ROCSHMEMSpace::allocate(const size_t arg_alloc_size) const { +void *SHMEMSpace::allocate(const size_t arg_alloc_size) const { + return allocate("[unlabeled]", arg_alloc_size); +} + +void *SHMEMSpace::allocate(const char *arg_label, const size_t arg_alloc_size, + const size_t + + arg_logical_size) const { + return impl_allocate(arg_label, arg_alloc_size, arg_logical_size); +} + +void *ROCSHMEMSpace::impl_allocate( + const char *arg_label, const size_t arg_alloc_size, + const size_t arg_logical_size, + const Kokkos::Tools::SpaceHandle arg_handle) const { + const size_t reported_size = + (arg_logical_size > 0) ? arg_logical_size : arg_alloc_size; static_assert(sizeof(void *) == sizeof(uintptr_t), "Error sizeof(void*) != sizeof(uintptr_t)"); @@ -40,21 +56,90 @@ void *ROCSHMEMSpace::allocate(const size_t arg_alloc_size) const { Kokkos::Impl::is_integral_power_of_two(Kokkos::Impl::MEMORY_ALIGNMENT), "Memory alignment must be power of two"); - void *ptr = 0; + constexpr uintptr_t alignment = Kokkos::Impl::MEMORY_ALIGNMENT; + constexpr uintptr_t alignment_mask = alignment - 1; + + void *ptr = nullptr; + if (arg_alloc_size) { + // Over-allocate to and round up to guarantee proper alignment. + size_t size_padded = arg_alloc_size + sizeof(void *) + alignment; + if (allocation_mode == Kokkos::Experimental::Symmetric) { int num_pes = roc_shmem_n_pes(); int my_id = roc_shmem_my_pe(); ptr = roc_shmem_malloc(arg_alloc_size); } else { - Kokkos::abort("ROCSHMEMSpace only supports symmetric allocation policy."); + Kokkos::abort("SHMEMSpace only supports symmetric allocation policy."); } + + if (ptr) { + auto address = reinterpret_cast(ptr); + + // offset enough to record the alloc_ptr + address += sizeof(void *); + uintptr_t rem = address % alignment; + uintptr_t offset = rem ? 
(alignment - rem) : 0u; + address += offset; + ptr = reinterpret_cast(address); + // record the alloc'd pointer + address -= sizeof(void *); + *reinterpret_cast(address) = ptr; + } + } + + using MemAllocFailure = + Kokkos::Impl::Experimental::RemoteSpacesMemoryAllocationFailure; + using MemAllocFailureMode = Kokkos::Impl::Experimental:: + RemoteSpacesMemoryAllocationFailure::FailureMode; + + if ((ptr == nullptr) || (reinterpret_cast(ptr) == ~uintptr_t(0)) || + (reinterpret_cast(ptr) & alignment_mask)) { + MemAllocFailureMode failure_mode = + MemAllocFailureMode::AllocationNotAligned; + if (ptr == nullptr) { + failure_mode = MemAllocFailureMode::OutOfMemoryError; + } + + MemAllocFailure::AllocationMechanism alloc_mec = + MemAllocFailure::AllocationMechanism::ROCSHMEMMALLOC; + throw MemAllocFailure(arg_alloc_size, alignment, failure_mode, alloc_mec); + } + + if (Kokkos::Profiling::profileLibraryLoaded()) { + Kokkos::Profiling::allocateData(arg_handle, arg_label, ptr, reported_size); } return ptr; } -void ROCSHMEMSpace::deallocate(void *const arg_alloc_ptr, const size_t) const { - roc_shmem_free(arg_alloc_ptr); +void ROCSHMEMSpace::deallocate(void *const arg_alloc_ptr, + const size_t arg_alloc_size) const { + deallocate("[unlabeled]", arg_alloc_ptr, arg_alloc_size); +} + +void ROCSHMEMSpace::deallocate(const char *arg_label, void *const arg_alloc_ptr, + const size_t arg_alloc_size, + const size_t + + arg_logical_size) const { + impl_deallocate(arg_label, arg_alloc_ptr, arg_alloc_size, arg_logical_size); +} + +void ROCSHMEMSpace::impl_deallocate( + const char *arg_label, void *const arg_alloc_ptr, + const size_t arg_alloc_size, const size_t arg_logical_size, + const Kokkos::Tools::SpaceHandle arg_handle) const { + if (arg_alloc_ptr) { + Kokkos::fence("HostSpace::impl_deallocate before free"); + fence(); + size_t reported_size = + (arg_logical_size > 0) ? 
arg_logical_size : arg_alloc_size; + if (Kokkos::Profiling::profileLibraryLoaded()) { + Kokkos::Profiling::deallocateData(arg_handle, arg_label, arg_alloc_ptr, + reported_size); + } + roc_shmem_free(arg_alloc_ptr); + } } void ROCSHMEMSpace::fence() { @@ -68,35 +153,6 @@ size_t get_num_pes() { return roc_shmem_n_pes(); } KOKKOS_FUNCTION size_t get_my_pe() { return roc_shmem_my_pe(); } -KOKKOS_FUNCTION -size_t get_indexing_block_size(size_t size) { - size_t num_pes, block; - num_pes = get_num_pes(); - block = (size + num_pes - 1) / num_pes; - return block; -} - -std::pair getRange(size_t size, size_t pe) { - size_t start, end; - size_t block = get_indexing_block_size(size); - start = pe * block; - end = (pe + 1) * block; - - size_t num_pes = get_num_pes(); - - if (size < num_pes) { - size_t diff = (num_pes * block) - size; - if (pe > num_pes - 1 - diff) end--; - } else { - if (pe == num_pes - 1) { - size_t diff = size - (num_pes - 1) * block; - end = start + diff; - } - end--; - } - return std::make_pair(start, end); -} - } // namespace Experimental namespace Impl { diff --git a/src/impl/rocshmemspace/Kokkos_ROCSHMEMSpace.hpp b/src/impl/rocshmemspace/Kokkos_ROCSHMEMSpace.hpp index 7020add2..6e6a2de0 100644 --- a/src/impl/rocshmemspace/Kokkos_ROCSHMEMSpace.hpp +++ b/src/impl/rocshmemspace/Kokkos_ROCSHMEMSpace.hpp @@ -28,7 +28,6 @@ #include #include -/*--------------------------------------------------------------------------*/ namespace Kokkos { namespace Experimental { @@ -56,14 +55,30 @@ class ROCSHMEMSpace { explicit ROCSHMEMSpace(const MPI_Comm &); + /**\brief Allocate untracked memory in the space */ void *allocate(const size_t arg_alloc_size) const; + void *allocate(const char *arg_label, const size_t arg_alloc_size, + const size_t arg_logical_size = 0) const; + /**\brief Deallocate untracked memory in the space */ void deallocate(void *const arg_alloc_ptr, const size_t arg_alloc_size) const; + void deallocate(const char *arg_label, void *const arg_alloc_ptr, + const size_t arg_alloc_size, + const size_t arg_logical_size = 0) const; - void *allocate(const int *gids, const int &arg_local_alloc_size) const; - - void deallocate(const int *gids, void *const arg_alloc_ptr, - const size_t arg_alloc_size) const; + private: + template + friend class Kokkos::Experimental::LogicalMemorySpace; + + void *impl_allocate(const char *arg_label, const size_t arg_alloc_size, + const size_t arg_logical_size = 0, + const Kokkos::Tools::SpaceHandle = + Kokkos::Tools::make_space_handle(name())) const; + void impl_deallocate(const char *arg_label, void *const arg_alloc_ptr, + const size_t arg_alloc_size, + const size_t arg_logical_size = 0, + const Kokkos::Tools::SpaceHandle = + Kokkos::Tools::make_space_handle(name())) const; /**\brief Return Name of the MemorySpace */ static constexpr const char *name() { return m_name; } @@ -86,9 +101,6 @@ KOKKOS_FUNCTION size_t get_num_pes(); KOKKOS_FUNCTION size_t get_my_pe(); -KOKKOS_FUNCTION -size_t get_indexing_block_size(size_t size); -std::pair getRange(size_t size, size_t pe); } // namespace Experimental } // namespace Kokkos @@ -140,15 +152,17 @@ struct MemorySpaceAccess #include #include -#include #include #include #include #include +#include #include #include +#include #include #endif // #define KOKKOS_ROCSHMEMSPACE_HPP diff --git a/src/impl/rocshmemspace/Kokkos_ROCSHMEMSpace_AllocationRecord.hpp b/src/impl/rocshmemspace/Kokkos_ROCSHMEMSpace_AllocationRecord.hpp index 5933345e..0bc2dec2 100644 --- 
a/src/impl/rocshmemspace/Kokkos_ROCSHMEMSpace_AllocationRecord.hpp +++ b/src/impl/rocshmemspace/Kokkos_ROCSHMEMSpace_AllocationRecord.hpp @@ -16,8 +16,8 @@ // //@HEADER -#ifndef KOKKOS_ROCSHMEM_ALLOCREC_HPP -#define KOKKOS_ROCSHMEM_ALLOCREC_HPP +#ifndef KOKKOS_REMOTESPACES_ROCSHMEM_ALLOCREC_HPP +#define KOKKOS_REMOTESPACES_ROCSHMEM_ALLOCREC_HPP #include @@ -101,4 +101,4 @@ class SharedAllocationRecord } // namespace Impl } // namespace Kokkos -#endif // KOKKOS_ROCSHMEM_ALLOCREC_HPP +#endif // KOKKOS_REMOTESPACES_ROCSHMEM_ALLOCREC_HPP diff --git a/src/impl/rocshmemspace/Kokkos_ROCSHMEMSpace_BlockOps.hpp b/src/impl/rocshmemspace/Kokkos_ROCSHMEMSpace_BlockOps.hpp new file mode 100644 index 00000000..c07bc50f --- /dev/null +++ b/src/impl/rocshmemspace/Kokkos_ROCSHMEMSpace_BlockOps.hpp @@ -0,0 +1,97 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +// Contact: Jan Ciesko (jciesko@sandia.gov) +// +//@HEADER + +#ifndef KOKKOS_REMOTESPACES_ROCSHMEM_BLOCK_OPS_HPP +#define KOKKOS_REMOTESPACES_ROCSHMEM_BLOCK_OPS_HPP + +#include +#include + +namespace Kokkos { +namespace Impl { + +#define KOKKOS_REMOTESPACES_PUT(type, op) \ + static KOKKOS_INLINE_FUNCTION void shmem_block_type_put( \ + type *dst, const type *src, size_t nelems, int pe) { \ + op(dst, src, nelems, pe); \ + } + +KOKKOS_REMOTESPACES_PUT(char, roc_shmem_char_put) +KOKKOS_REMOTESPACES_PUT(unsigned char, roc_shmem_uchar_put) +KOKKOS_REMOTESPACES_PUT(short, roc_shmem_short_put) +KOKKOS_REMOTESPACES_PUT(unsigned short, roc_shmem_ushort_put) +KOKKOS_REMOTESPACES_PUT(int, roc_shmem_int_put) +KOKKOS_REMOTESPACES_PUT(unsigned int, roc_shmem_uint_put) +KOKKOS_REMOTESPACES_PUT(long, roc_shmem_long_put) +KOKKOS_REMOTESPACES_PUT(unsigned long, roc_shmem_ulong_put) +KOKKOS_REMOTESPACES_PUT(long long, roc_shmem_longlong_put) +KOKKOS_REMOTESPACES_PUT(unsigned long long, roc_shmem_ulonglong_put) +KOKKOS_REMOTESPACES_PUT(float, roc_shmem_float_put) +KOKKOS_REMOTESPACES_PUT(double, roc_shmem_double_put) + +#undef KOKKOS_REMOTESPACES_PUT + +#define KOKKOS_REMOTESPACES_GET(type, op) \ + static KOKKOS_INLINE_FUNCTION void shmem_block_type_get( \ + type *dst, const type *src, size_t nelems, int pe) { \ + op(dst, src, nelems, pe); \ + } + +KOKKOS_REMOTESPACES_GET(char, roc_shmem_char_get) +KOKKOS_REMOTESPACES_GET(unsigned char, roc_shmem_uchar_get) +KOKKOS_REMOTESPACES_GET(short, roc_shmem_short_get) +KOKKOS_REMOTESPACES_GET(unsigned short, roc_shmem_ushort_get) +KOKKOS_REMOTESPACES_GET(int, roc_shmem_int_get) +KOKKOS_REMOTESPACES_GET(unsigned int, roc_shmem_uint_get) +KOKKOS_REMOTESPACES_GET(long, roc_shmem_long_get) +KOKKOS_REMOTESPACES_GET(unsigned long, roc_shmem_ulong_get) +KOKKOS_REMOTESPACES_GET(long long, roc_shmem_longlong_get) +KOKKOS_REMOTESPACES_GET(unsigned long long, roc_shmem_ulonglong_get) +KOKKOS_REMOTESPACES_GET(float, roc_shmem_float_get) +KOKKOS_REMOTESPACES_GET(double, roc_shmem_double_get) + +#undef KOKKOS_REMOTESPACES_GET + +template +struct ROCSHMEMBlockDataElement {}; + +// Atomic Operators +template +struct ROCSHMEMBlockDataElement { + typedef const T 
const_value_type; + typedef T non_const_value_type; + T *src; + T *dst; + size_t nelems; + int pe; + + KOKKOS_INLINE_FUNCTION + ROCSHMEMBlockDataElement(T *src_, T *dst_, size_t size_, int pe_) + : src(src_), dst(dst_), nelems(size_), pe(pe_) {} + + KOKKOS_INLINE_FUNCTION + void put() const { shmem_block_type_put(dst, src, nelems, pe); } + + KOKKOS_INLINE_FUNCTION + void get() const { shmem_block_type_get(dst, src, nelems, pe); } +}; + +} // namespace Impl +} // namespace Kokkos + +#endif // KOKKOS_REMOTESPACES_ROCSHMEM_BLOCK_OPS_HPP diff --git a/src/impl/rocshmemspace/Kokkos_ROCSHMEMSpace_DataHandle.hpp b/src/impl/rocshmemspace/Kokkos_ROCSHMEMSpace_DataHandle.hpp index 62a64af6..0ce6cc18 100644 --- a/src/impl/rocshmemspace/Kokkos_ROCSHMEMSpace_DataHandle.hpp +++ b/src/impl/rocshmemspace/Kokkos_ROCSHMEMSpace_DataHandle.hpp @@ -43,6 +43,38 @@ struct ROCSHMEMDataHandle { T *operator+(size_t &offset) const { return ptr + offset; } }; +template +struct BlockDataHandle { + T *src; + T *dst; + size_t elems; + int pe; + + KOKKOS_INLINE_FUNCTION + BlockDataHandle(T *src_, T *dst_, size_t elems_, int pe_) + : src(src_), dst(dst_), elems(elems_), pe(pe_) {} + + KOKKOS_INLINE_FUNCTION + BlockDataHandle(BlockDataHandle const &arg) + : src(arg.src), dst(arg.dst), elems(arg.elems), pe(arg.pe_) {} + + template + KOKKOS_INLINE_FUNCTION BlockDataHandle(SrcTraits const &arg) + : src(arg.src), dst(arg.dst), elems(arg.elems), pe(arg.pe_) {} + + KOKKOS_INLINE_FUNCTION + void get() { + ROCSHMEMBlockDataElement element(src, dst, elems, pe); + element.get(); + } + + KOKKOS_INLINE_FUNCTION + void put() { + ROCSHMEMBlockDataElement element(src, dst, elems, pe); + element.put(); + } +}; + template struct ViewDataHandle< Traits, typename std::enable_if #include #include -//---------------------------------------------------------------------------- -//---------------------------------------------------------------------------- namespace Kokkos { namespace Experimental { @@ -35,6 +32,22 @@ void SHMEMSpace::impl_set_allocation_mode(const int allocation_mode_) { void SHMEMSpace::impl_set_extent(const int64_t extent_) { extent = extent_; } void *SHMEMSpace::allocate(const size_t arg_alloc_size) const { + return allocate("[unlabeled]", arg_alloc_size); +} + +void *SHMEMSpace::allocate(const char *arg_label, const size_t arg_alloc_size, + const size_t + + arg_logical_size) const { + return impl_allocate(arg_label, arg_alloc_size, arg_logical_size); +} + +void *SHMEMSpace::impl_allocate( + const char *arg_label, const size_t arg_alloc_size, + const size_t arg_logical_size, + const Kokkos::Tools::SpaceHandle arg_handle) const { + const size_t reported_size = + (arg_logical_size > 0) ? arg_logical_size : arg_alloc_size; static_assert(sizeof(void *) == sizeof(uintptr_t), "Error sizeof(void*) != sizeof(uintptr_t)"); @@ -42,59 +55,100 @@ void *SHMEMSpace::allocate(const size_t arg_alloc_size) const { Kokkos::Impl::is_integral_power_of_two(Kokkos::Impl::MEMORY_ALIGNMENT), "Memory alignment must be power of two"); - void *ptr = 0; + constexpr uintptr_t alignment = Kokkos::Impl::MEMORY_ALIGNMENT; + constexpr uintptr_t alignment_mask = alignment - 1; + + void *ptr = nullptr; + if (arg_alloc_size) { + // Over-allocate to and round up to guarantee proper alignment. 
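  // Worked example (illustrative, assuming MEMORY_ALIGNMENT == 64 and a raw
  // shmem_malloc() result of 0x1008):
  //   address = 0x1008 + sizeof(void*) = 0x1010
  //   rem     = 0x1010 % 64            = 16
  //   offset  = 64 - 16                = 48
  //   returned pointer                 = 0x1040   (64-byte aligned)
  // and the word at 0x1038 records the pointer so the padding remains
  // accounted for. size_padded = arg_alloc_size + sizeof(void*) + alignment
  // guarantees the aligned block still fits inside the symmetric allocation.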
+ size_t size_padded = arg_alloc_size + sizeof(void *) + alignment; + if (allocation_mode == Kokkos::Experimental::Symmetric) { int num_pes = shmem_n_pes(); int my_id = shmem_my_pe(); - ptr = shmem_malloc(arg_alloc_size); + ptr = shmem_malloc(size_padded); } else { Kokkos::abort("SHMEMSpace only supports symmetric allocation policy."); } + + if (ptr) { + auto address = reinterpret_cast(ptr); + + // offset enough to record the alloc_ptr + address += sizeof(void *); + uintptr_t rem = address % alignment; + uintptr_t offset = rem ? (alignment - rem) : 0u; + address += offset; + ptr = reinterpret_cast(address); + // record the alloc'd pointer + address -= sizeof(void *); + *reinterpret_cast(address) = ptr; + } } - return ptr; -} -void SHMEMSpace::deallocate(void *const arg_alloc_ptr, const size_t) const { - shmem_free(arg_alloc_ptr); + using MemAllocFailure = + Kokkos::Impl::Experimental::RemoteSpacesMemoryAllocationFailure; + using MemAllocFailureMode = Kokkos::Impl::Experimental:: + RemoteSpacesMemoryAllocationFailure::FailureMode; + + if ((ptr == nullptr) || (reinterpret_cast(ptr) == ~uintptr_t(0)) || + (reinterpret_cast(ptr) & alignment_mask)) { + MemAllocFailureMode failure_mode = + MemAllocFailureMode::AllocationNotAligned; + if (ptr == nullptr) { + failure_mode = MemAllocFailureMode::OutOfMemoryError; + } + + MemAllocFailure::AllocationMechanism alloc_mec = + MemAllocFailure::AllocationMechanism::SHMEMMALLOC; + throw MemAllocFailure(arg_alloc_size, alignment, failure_mode, alloc_mec); + } + + if (Kokkos::Profiling::profileLibraryLoaded()) { + Kokkos::Profiling::allocateData(arg_handle, arg_label, ptr, reported_size); + } + return ptr; } -void SHMEMSpace::fence() { - Kokkos::fence(); - shmem_barrier_all(); +void SHMEMSpace::deallocate(void *const arg_alloc_ptr, + const size_t arg_alloc_size) const { + deallocate("[unlabeled]", arg_alloc_ptr, arg_alloc_size); } -size_t get_num_pes() { return shmem_n_pes(); } -size_t get_my_pe() { return shmem_my_pe(); } +void SHMEMSpace::deallocate(const char *arg_label, void *const arg_alloc_ptr, + const size_t arg_alloc_size, + const size_t -size_t get_indexing_block_size(size_t size) { - size_t num_pes, block; - num_pes = get_num_pes(); - block = (size + num_pes - 1) / num_pes; - return block; + arg_logical_size) const { + impl_deallocate(arg_label, arg_alloc_ptr, arg_alloc_size, arg_logical_size); } -std::pair getRange(size_t size, size_t pe) { - size_t start, end; - size_t block = get_indexing_block_size(size); - start = pe * block; - end = (pe + 1) * block; - - size_t num_pes = get_num_pes(); - - if (size < num_pes) { - size_t diff = (num_pes * block) - size; - if (pe > num_pes - 1 - diff) end--; - } else { - if (pe == num_pes - 1) { - size_t diff = size - (num_pes - 1) * block; - end = start + diff; +void SHMEMSpace::impl_deallocate( + const char *arg_label, void *const arg_alloc_ptr, + const size_t arg_alloc_size, const size_t arg_logical_size, + const Kokkos::Tools::SpaceHandle arg_handle) const { + if (arg_alloc_ptr) { + Kokkos::fence("HostSpace::impl_deallocate before free"); + fence(); + size_t reported_size = + (arg_logical_size > 0) ? 
arg_logical_size : arg_alloc_size; + if (Kokkos::Profiling::profileLibraryLoaded()) { + Kokkos::Profiling::deallocateData(arg_handle, arg_label, arg_alloc_ptr, + reported_size); } - end--; + shmem_free(arg_alloc_ptr); } - return std::make_pair(start, end); } +void SHMEMSpace::fence() const { + Kokkos::fence(); + shmem_barrier_all(); +} + +size_t get_num_pes() { return shmem_n_pes(); } +size_t get_my_pe() { return shmem_my_pe(); } + } // namespace Experimental namespace Impl { diff --git a/src/impl/shmemspace/Kokkos_SHMEMSpace.hpp b/src/impl/shmemspace/Kokkos_SHMEMSpace.hpp index 8bcca659..fff4df39 100644 --- a/src/impl/shmemspace/Kokkos_SHMEMSpace.hpp +++ b/src/impl/shmemspace/Kokkos_SHMEMSpace.hpp @@ -29,7 +29,6 @@ #include #include #include -/*--------------------------------------------------------------------------*/ namespace Kokkos { namespace Experimental { @@ -66,19 +65,36 @@ class SHMEMSpace { explicit SHMEMSpace(const MPI_Comm &); + /**\brief Allocate untracked memory in the space */ void *allocate(const size_t arg_alloc_size) const; + void *allocate(const char *arg_label, const size_t arg_alloc_size, + const size_t arg_logical_size = 0) const; + /**\brief Deallocate untracked memory in the space */ void deallocate(void *const arg_alloc_ptr, const size_t arg_alloc_size) const; + void deallocate(const char *arg_label, void *const arg_alloc_ptr, + const size_t arg_alloc_size, + const size_t arg_logical_size = 0) const; - void *allocate(const int *gids, const int &arg_local_alloc_size) const; - - void deallocate(const int *gids, void *const arg_alloc_ptr, - const size_t arg_alloc_size) const; + private: + template + friend class Kokkos::Experimental::LogicalMemorySpace; + + void *impl_allocate(const char *arg_label, const size_t arg_alloc_size, + const size_t arg_logical_size = 0, + const Kokkos::Tools::SpaceHandle = + Kokkos::Tools::make_space_handle(name())) const; + void impl_deallocate(const char *arg_label, void *const arg_alloc_ptr, + const size_t arg_alloc_size, + const size_t arg_logical_size = 0, + const Kokkos::Tools::SpaceHandle = + Kokkos::Tools::make_space_handle(name())) const; + public: /**\brief Return Name of the MemorySpace */ static constexpr const char *name() { return m_name; } - void fence(); + void fence() const; int allocation_mode; int64_t extent; @@ -94,8 +110,6 @@ class SHMEMSpace { size_t get_num_pes(); size_t get_my_pe(); -size_t get_indexing_block_size(size_t size); -std::pair getRange(size_t size, size_t pe); } // namespace Experimental } // namespace Kokkos @@ -144,15 +158,17 @@ struct MemorySpaceAccess { } // namespace Impl } // namespace Kokkos +#include #include #include -#include #include #include #include #include +#include #include #include +#include #include #endif // #define KOKKOS_SHMEMSPACE_HPP diff --git a/src/impl/shmemspace/Kokkos_SHMEMSpace_AllocationRecord.cpp b/src/impl/shmemspace/Kokkos_SHMEMSpace_AllocationRecord.cpp index bda96ea2..6fcc9997 100644 --- a/src/impl/shmemspace/Kokkos_SHMEMSpace_AllocationRecord.cpp +++ b/src/impl/shmemspace/Kokkos_SHMEMSpace_AllocationRecord.cpp @@ -22,34 +22,42 @@ namespace Kokkos { namespace Impl { -template -SharedAllocationRecord:: - SharedAllocationRecord( - const ExecutionSpace &execution_space, - const Kokkos::Experimental::SHMEMSpace &arg_space, - const std::string &arg_label, const size_t arg_alloc_size, - const SharedAllocationRecord::function_type arg_dealloc) - // Pass through allocated [ SharedAllocationHeader , user_memory ] - // Pass through deallocation function - : 
SharedAllocationRecord( - execution_space, - reinterpret_cast(arg_space.allocate( - sizeof(SharedAllocationHeader) + arg_alloc_size)), - sizeof(SharedAllocationHeader) + arg_alloc_size, arg_dealloc, - arg_label), - m_space(arg_space) { -#if defined(KOKKOS_ENABLE_PROFILING) - if (Kokkos::Profiling::profileLibraryLoaded()) { - Kokkos::Profiling::allocateData( - Kokkos::Profiling::SpaceHandle(arg_space.name()), arg_label, data(), - arg_alloc_size); - } +#ifdef KOKKOS_ENABLE_DEBUG +SharedAllocationRecord SharedAllocationRecord< + Kokkos::Experimental::SHMEMSpace, void>::s_root_record; #endif - // Fill in the Header information - RecordBase::m_alloc_ptr->m_record = - static_cast *>(this); - strncpy(RecordBase::m_alloc_ptr->m_label, arg_label.c_str(), - SharedAllocationHeader::maximum_label_length); + +SharedAllocationRecord::~SharedAllocationRecord() { + m_space.deallocate(m_label.c_str(), + SharedAllocationRecord::m_alloc_ptr, + SharedAllocationRecord::m_alloc_size, + (SharedAllocationRecord::m_alloc_size - + sizeof(SharedAllocationHeader))); +} + +SharedAllocationHeader *_do_allocation( + Kokkos::Experimental::SHMEMSpace const &space, std::string const &label, + size_t alloc_size) { + using MemAllocFailure = + Kokkos::Impl::Experimental::RemoteSpacesMemoryAllocationFailure; + try { + return reinterpret_cast( + space.allocate(alloc_size)); + } catch (MemAllocFailure const &failure) { + if (failure.failure_mode() == + MemAllocFailure::FailureMode::AllocationNotAligned) { + // TODO: delete the misaligned memory + } + + std::cerr << "Kokkos failed to allocate memory for label \"" << label + << "\". Allocation using MemorySpace named \"" << space.name() + << " failed with the following error: "; + failure.print_error_message(std::cerr); + std::cerr.flush(); + Kokkos::Impl::throw_runtime_exception("Memory allocation failure"); + } + return nullptr; // unreachable } SharedAllocationRecord:: @@ -59,120 +67,33 @@ SharedAllocationRecord:: const SharedAllocationRecord::function_type arg_dealloc) // Pass through allocated [ SharedAllocationHeader , user_memory ] // Pass through deallocation function - : SharedAllocationRecord( + : base_t( #ifdef KOKKOS_ENABLE_DEBUG &SharedAllocationRecord::s_root_record, #endif - reinterpret_cast(arg_space.allocate( - sizeof(SharedAllocationHeader) + arg_alloc_size)), + Impl::checked_allocation_with_header(arg_space, arg_label, + arg_alloc_size), sizeof(SharedAllocationHeader) + arg_alloc_size, arg_dealloc, arg_label), m_space(arg_space) { -#if defined(KOKKOS_ENABLE_PROFILING) - if (Kokkos::Profiling::profileLibraryLoaded()) { - Kokkos::Profiling::allocateData( - Kokkos::Profiling::SpaceHandle(arg_space.name()), arg_label, data(), - arg_alloc_size); - } -#endif - // Fill in the Header information - RecordBase::m_alloc_ptr->m_record = - static_cast *>(this); - strncpy(RecordBase::m_alloc_ptr->m_label, arg_label.c_str(), - SharedAllocationHeader::maximum_label_length); + this->base_t::_fill_host_accessible_header_info(*RecordBase::m_alloc_ptr, + arg_label); } -SharedAllocationRecord::~SharedAllocationRecord() { -#if defined(KOKKOS_ENABLE_PROFILING) - if (Kokkos::Profiling::profileLibraryLoaded()) { - SharedAllocationHeader header; - Kokkos::Profiling::deallocateData( - Kokkos::Profiling::SpaceHandle( - Kokkos::Experimental::SHMEMSpace::name()), - header.m_label, data(), size()); - } -#endif - - m_space.deallocate(SharedAllocationRecord::m_alloc_ptr, - SharedAllocationRecord::m_alloc_size); -} - -SharedAllocationRecord SharedAllocationRecord< - 
Kokkos::Experimental::SHMEMSpace, void>::s_root_record; - -void SharedAllocationRecord::deallocate( - SharedAllocationRecord *arg_rec) { - delete static_cast(arg_rec); -} - -void *SharedAllocationRecord:: - allocate_tracked(const Kokkos::Experimental::SHMEMSpace &arg_space, - const std::string &arg_alloc_label, - const size_t arg_alloc_size) { - if (!arg_alloc_size) return (void *)0; - - SharedAllocationRecord *const r = - allocate(arg_space, arg_alloc_label, arg_alloc_size); - RecordBase::increment(r); - return r->data(); -} - -void SharedAllocationRecord::deallocate_tracked(void *const - arg_alloc_ptr) { - if (arg_alloc_ptr != 0) { - SharedAllocationRecord *const r = get_record(arg_alloc_ptr); - RecordBase::decrement(r); - } -} - -void *SharedAllocationRecord:: - reallocate_tracked(void *const arg_alloc_ptr, const size_t arg_alloc_size) { - SharedAllocationRecord *const r_old = get_record(arg_alloc_ptr); - SharedAllocationRecord *const r_new = - allocate(r_old->m_space, r_old->get_label(), arg_alloc_size); - - Kokkos::Impl::DeepCopy( - r_new->data(), r_old->data(), r_new->size()); - - RecordBase::increment(r_new); - RecordBase::decrement(r_old); - - return r_new->data(); -} +} // namespace Impl +} // namespace Kokkos -SharedAllocationRecord - *SharedAllocationRecord::get_record( - void *alloc_ptr) { - typedef SharedAllocationHeader Header; - typedef SharedAllocationRecord - RecordHost; +#define KOKKOS_IMPL_PUBLIC_INCLUDE - // Copy the header from the allocation - SharedAllocationHeader const *const head = - alloc_ptr ? Header::get_header(alloc_ptr) : (SharedAllocationHeader *)0; - RecordHost *const record = - head ? static_cast(head->m_record) : (RecordHost *)0; +#include - if (!alloc_ptr || record->m_alloc_ptr != head) { - Kokkos::Impl::throw_runtime_exception(std::string( - "Kokkos::Impl::SharedAllocationRecord< " - "Kokkos::Experimental::SHMEMSpace , void >::get_record ERROR")); - } +namespace Kokkos { +namespace Impl { - return record; -} +template class SharedAllocationRecordCommon; -// Iterate records to print orphaned memory ... 
-void SharedAllocationRecord:: - print_records(std::ostream &s, const Kokkos::Experimental::SHMEMSpace &, - bool detail) { - SharedAllocationRecord::print_host_accessible_records( - s, "SHMEMSpace", &s_root_record, detail); -} +#undef KOKKOS_IMPL_PUBLIC_INCLUDE } // namespace Impl } // namespace Kokkos diff --git a/src/impl/shmemspace/Kokkos_SHMEMSpace_AllocationRecord.hpp b/src/impl/shmemspace/Kokkos_SHMEMSpace_AllocationRecord.hpp index 42a1d2a5..e940ef5a 100644 --- a/src/impl/shmemspace/Kokkos_SHMEMSpace_AllocationRecord.hpp +++ b/src/impl/shmemspace/Kokkos_SHMEMSpace_AllocationRecord.hpp @@ -16,87 +16,79 @@ // //@HEADER -#ifndef KOKKOS_SHMEM_ALLOCREC_HPP -#define KOKKOS_SHMEM_ALLOCREC_HPP +#ifndef KOKKOS_REMOTESPACES_SHMEM_ALLOCREC_HPP +#define KOKKOS_REMOTESPACES_SHMEM_ALLOCREC_HPP #include -/*--------------------------------------------------------------------------*/ - namespace Kokkos { namespace Impl { template <> class SharedAllocationRecord - : public SharedAllocationRecord { + : public SharedAllocationRecordCommon { private: friend Kokkos::Experimental::SHMEMSpace; + friend class SharedAllocationRecordCommon; - typedef SharedAllocationRecord RecordBase; - - SharedAllocationRecord(const SharedAllocationRecord &) = delete; - SharedAllocationRecord &operator=(const SharedAllocationRecord &) = delete; + using base_t = SharedAllocationRecordCommon; + using RecordBase = SharedAllocationRecord; - static void deallocate(RecordBase *); + SharedAllocationRecord(const SharedAllocationRecord&) = delete; + SharedAllocationRecord& operator=(const SharedAllocationRecord&) = delete; - /**\brief Root record for tracked allocations from this SHMEMSpace instance - */ +#ifdef KOKKOS_ENABLE_DEBUG + /**\brief Root record for tracked allocations from this HostSpace instance */ static RecordBase s_root_record; +#endif const Kokkos::Experimental::SHMEMSpace m_space; protected: ~SharedAllocationRecord(); - SharedAllocationRecord() = default; + // This constructor does not forward to the one without exec_space arg + // in order to work around https://github.com/kokkos/kokkos/issues/5258 + // This constructor is templated so I can't just put it into the cpp file + // like the other constructor. 
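  // Layout note (illustrative): Impl::checked_allocation_with_header() hands
  // back one symmetric block of sizeof(SharedAllocationHeader) + arg_alloc_size
  // bytes laid out as
  //   [ SharedAllocationHeader | user data ... ]
  // which is why the remote block ops in this patch offset their windows by
  // sizeof(SharedAllocationHeader) before touching user data.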
template SharedAllocationRecord( - const ExecutionSpace &execution_space, - const Kokkos::Experimental::SHMEMSpace &arg_space, - const std::string &arg_label, const size_t arg_alloc_size, - const RecordBase::function_type arg_dealloc = &deallocate); + const ExecutionSpace& /* exec_space*/, + const Kokkos::Experimental::SHMEMSpace& arg_space, + const std::string& arg_label, const size_t arg_alloc_size, + const RecordBase::function_type arg_dealloc = &deallocate) + : base_t( +#ifdef KOKKOS_ENABLE_DEBUG + &SharedAllocationRecord::s_root_record, +#endif + Impl::checked_allocation_with_header(arg_space, arg_label, + arg_alloc_size), + sizeof(SharedAllocationHeader) + arg_alloc_size, arg_dealloc, + arg_label), + m_space(arg_space) { + this->base_t::_fill_host_accessible_header_info(*RecordBase::m_alloc_ptr, + arg_label); + } SharedAllocationRecord( - const Kokkos::Experimental::SHMEMSpace &arg_space, - const std::string &arg_label, const size_t arg_alloc_size, + const Kokkos::Experimental::SHMEMSpace& arg_space, + const std::string& arg_label, const size_t arg_alloc_size, const RecordBase::function_type arg_dealloc = &deallocate); public: - inline std::string get_label() const { - return std::string(RecordBase::head()->m_label); + KOKKOS_INLINE_FUNCTION static SharedAllocationRecord* allocate( + const Kokkos::Experimental::SHMEMSpace& arg_space, + const std::string& arg_label, const size_t arg_alloc_size) { + KOKKOS_IF_ON_HOST((return new SharedAllocationRecord(arg_space, arg_label, + arg_alloc_size);)) + KOKKOS_IF_ON_DEVICE(((void)arg_space; (void)arg_label; (void)arg_alloc_size; + return nullptr;)) } - - KOKKOS_INLINE_FUNCTION static SharedAllocationRecord *allocate( - const Kokkos::Experimental::SHMEMSpace &arg_space, - const std::string &arg_label, const size_t arg_alloc_size) { -#if defined(KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST) - return new SharedAllocationRecord(arg_space, arg_label, arg_alloc_size); -#else - return (SharedAllocationRecord *)0; -#endif - } - - /**\brief Allocate tracked memory in the space */ - static void *allocate_tracked( - const Kokkos::Experimental::SHMEMSpace &arg_space, - const std::string &arg_label, const size_t arg_alloc_size); - - /**\brief Reallocate tracked memory in the space */ - static void *reallocate_tracked(void *const arg_alloc_ptr, - const size_t arg_alloc_size); - - /**\brief Deallocate tracked memory in the space */ - static void deallocate_tracked(void *const arg_alloc_ptr); - - static SharedAllocationRecord *get_record(void *arg_alloc_ptr); - - static void print_records(std::ostream &, - const Kokkos::Experimental::SHMEMSpace &, - bool detail = false); }; } // namespace Impl } // namespace Kokkos -#endif // KOKKOS_SHMEM_ALLOCREC_HPP +#endif // KOKKOS_REMOTESPACES_SHMEM_ALLOCREC_HPP diff --git a/src/impl/shmemspace/Kokkos_SHMEMSpace_BlockOps.hpp b/src/impl/shmemspace/Kokkos_SHMEMSpace_BlockOps.hpp new file mode 100644 index 00000000..5889fefb --- /dev/null +++ b/src/impl/shmemspace/Kokkos_SHMEMSpace_BlockOps.hpp @@ -0,0 +1,97 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +// Contact: Jan Ciesko (jciesko@sandia.gov) +// +//@HEADER + +#ifndef KOKKOS_REMOTESPACES_SHMEM_BLOCK_OPS_HPP +#define KOKKOS_REMOTESPACES_SHMEM_BLOCK_OPS_HPP + +#include +#include + +namespace Kokkos { +namespace Impl { + +#define KOKKOS_REMOTESPACES_PUT(type, op) \ + static KOKKOS_INLINE_FUNCTION void shmem_block_type_put( \ + type *dst, const type *src, size_t nelems, int pe) { \ + op(dst, src, nelems, pe); \ + } + +KOKKOS_REMOTESPACES_PUT(char, shmem_char_put) +KOKKOS_REMOTESPACES_PUT(unsigned char, shmem_uchar_put) +KOKKOS_REMOTESPACES_PUT(short, shmem_short_put) +KOKKOS_REMOTESPACES_PUT(unsigned short, shmem_ushort_put) +KOKKOS_REMOTESPACES_PUT(int, shmem_int_put) +KOKKOS_REMOTESPACES_PUT(unsigned int, shmem_uint_put) +KOKKOS_REMOTESPACES_PUT(long, shmem_long_put) +KOKKOS_REMOTESPACES_PUT(unsigned long, shmem_ulong_put) +KOKKOS_REMOTESPACES_PUT(long long, shmem_longlong_put) +KOKKOS_REMOTESPACES_PUT(unsigned long long, shmem_ulonglong_put) +KOKKOS_REMOTESPACES_PUT(float, shmem_float_put) +KOKKOS_REMOTESPACES_PUT(double, shmem_double_put) + +#undef KOKKOS_REMOTESPACES_PUT + +#define KOKKOS_REMOTESPACES_GET(type, op) \ + static KOKKOS_INLINE_FUNCTION void shmem_block_type_get( \ + type *dst, const type *src, size_t nelems, int pe) { \ + op(dst, src, nelems, pe); \ + } + +KOKKOS_REMOTESPACES_GET(char, shmem_char_get) +KOKKOS_REMOTESPACES_GET(unsigned char, shmem_uchar_get) +KOKKOS_REMOTESPACES_GET(short, shmem_short_get) +KOKKOS_REMOTESPACES_GET(unsigned short, shmem_ushort_get) +KOKKOS_REMOTESPACES_GET(int, shmem_int_get) +KOKKOS_REMOTESPACES_GET(unsigned int, shmem_uint_get) +KOKKOS_REMOTESPACES_GET(long, shmem_long_get) +KOKKOS_REMOTESPACES_GET(unsigned long, shmem_ulong_get) +KOKKOS_REMOTESPACES_GET(long long, shmem_longlong_get) +KOKKOS_REMOTESPACES_GET(unsigned long long, shmem_ulonglong_get) +KOKKOS_REMOTESPACES_GET(float, shmem_float_get) +KOKKOS_REMOTESPACES_GET(double, shmem_double_get) + +#undef KOKKOS_REMOTESPACES_GET + +template +struct SHMEMBlockDataElement {}; + +// Atomic Operators +template +struct SHMEMBlockDataElement { + typedef const T const_value_type; + typedef T non_const_value_type; + T *src; + T *dst; + size_t nelems; + int pe; + + KOKKOS_INLINE_FUNCTION + SHMEMBlockDataElement(T *src_, T *dst_, size_t size_, int pe_) + : src(src_), dst(dst_), nelems(size_), pe(pe_) {} + + KOKKOS_INLINE_FUNCTION + void put() const { shmem_block_type_put(dst, src, nelems, pe); } + + KOKKOS_INLINE_FUNCTION + void get() const { shmem_block_type_get(dst, src, nelems, pe); } +}; + +} // namespace Impl +} // namespace Kokkos + +#endif // KOKKOS_REMOTESPACES_SHMEM_BLOCK_OPS_HPP diff --git a/src/impl/shmemspace/Kokkos_SHMEMSpace_DataHandle.hpp b/src/impl/shmemspace/Kokkos_SHMEMSpace_DataHandle.hpp index fbb9ad6e..1949aeb3 100644 --- a/src/impl/shmemspace/Kokkos_SHMEMSpace_DataHandle.hpp +++ b/src/impl/shmemspace/Kokkos_SHMEMSpace_DataHandle.hpp @@ -35,9 +35,6 @@ struct SHMEMDataHandle { KOKKOS_INLINE_FUNCTION SHMEMDataHandle(SHMEMDataHandle const &arg) : ptr(arg.ptr) {} - template - KOKKOS_INLINE_FUNCTION SHMEMDataHandle(SrcTraits const &arg) : ptr(arg.ptr) {} - template KOKKOS_INLINE_FUNCTION SHMEMDataElement operator()( const int &pe, const iType &i) const { @@ -49,6 +46,38 @@ struct SHMEMDataHandle { T *operator+(size_t &offset) const { return ptr + offset; } }; +template +struct BlockDataHandle { + T *src; + T *dst; + size_t elems; + int pe; + + KOKKOS_INLINE_FUNCTION + BlockDataHandle(T *src_, T *dst_, size_t elems_, 
int pe_) + : src(src_), dst(dst_), elems(elems_), pe(pe_) {} + + KOKKOS_INLINE_FUNCTION + BlockDataHandle(BlockDataHandle const &arg) + : src(arg.src), dst(arg.dst), elems(arg.elems), pe(arg.pe_) {} + + template + KOKKOS_INLINE_FUNCTION BlockDataHandle(SrcTraits const &arg) + : src(arg.src), dst(arg.dst), elems(arg.elems), pe(arg.pe_) {} + + KOKKOS_INLINE_FUNCTION + void get() { + SHMEMBlockDataElement element(src, dst, elems, pe); + element.get(); + } + + KOKKOS_INLINE_FUNCTION + void put() { + SHMEMBlockDataElement element(src, dst, elems, pe); + element.put(); + } +}; + template struct ViewDataHandle< Traits, typename std::enable_if(0); test_atomic_globalview1D(1); @@ -140,7 +137,6 @@ TEST(TEST_CATEGORY, test_atomic_globalview) { test_atomic_globalview3D(1, 1, 1); test_atomic_globalview3D(255, 1024, 3); test_atomic_globalview3D(3, 33, 1024); -#endif } #endif /* TEST_ATOMIC_GLOBALVIEW_HPP_ */ \ No newline at end of file diff --git a/unit_tests/Test_DeepCopy.cpp b/unit_tests/Test_DeepCopy.cpp index 52682da1..6cd61704 100644 --- a/unit_tests/Test_DeepCopy.cpp +++ b/unit_tests/Test_DeepCopy.cpp @@ -24,6 +24,12 @@ #include #include +/* + Deep_copy can move data residing on the local node + We need to take the dim0 offset into account to support deep_copying + between memory spaces. +*/ + using RemoteSpace_t = Kokkos::Experimental::DefaultRemoteMemorySpace; template @@ -128,6 +134,8 @@ void test_deepcopy( Kokkos::parallel_for( "Team", 1, KOKKOS_LAMBDA(const int i) { assert(v_R(my_rank, 0) == (Data_t)0x123); }); + + Kokkos::fence(); } template @@ -153,6 +161,8 @@ void test_deepcopy( Kokkos::parallel_for( "Team", i1, KOKKOS_LAMBDA(const int i) { assert(v_R(my_rank, i) == (Data_t)0x123); }); + + Kokkos::fence(); } template @@ -183,6 +193,7 @@ void test_deepcopy( for (int j = 0; j < i2; ++j) assert(v_R(my_rank, i, j) == (Data_t)0x123); }); + Kokkos::fence(); } TEST(TEST_CATEGORY, test_deepcopy) { diff --git a/unit_tests/Test_LocalDeepCopy.cpp b/unit_tests/Test_LocalDeepCopy.cpp index acb59eb6..a7a633ec 100644 --- a/unit_tests/Test_LocalDeepCopy.cpp +++ b/unit_tests/Test_LocalDeepCopy.cpp @@ -24,13 +24,16 @@ #include #include +enum flavor : int { with_team, without_team }; +enum block_ops : int { get_op, put_op }; + using RemoteSpace_t = Kokkos::Experimental::DefaultRemoteMemorySpace; -template -void test_localdeepcopy( - typename std::enable_if<(std::is_same::value && - std::is_same::value)>::type - * = nullptr) { +template +void test_localdeepcopy(typename std::enable_if_t< + (std::is_same::value && + std::is_same::value && + is_enabled_team == with_team)> * = nullptr) { int my_rank; int num_ranks; MPI_Comm_rank(MPI_COMM_WORLD, &my_rank); @@ -52,7 +55,7 @@ void test_localdeepcopy( [&](const int i) { v_R(my_rank, 0) = 0x123; }); team.team_barrier(); - Kokkos::single(Kokkos::PerThread(team), [&]() { + Kokkos::single(Kokkos::PerTeam(team), [&]() { Kokkos::Experimental::RemoteSpaces::local_deep_copy(team, v_R_cpy, v_R); }); @@ -62,12 +65,85 @@ void test_localdeepcopy( ASSERT_EQ(0x123, v_H(0, 0)); } -template -void test_localdeepcopy( - int i1, - typename std::enable_if<(std::is_same::value && - std::is_same::value)>::type - * = nullptr) { +template +void test_localdeepcopy(typename std::enable_if_t< + (std::is_same::value && + std::is_same::value && + is_enabled_team == without_team)> * = nullptr) { + int my_rank; + int num_ranks; + MPI_Comm_rank(MPI_COMM_WORLD, &my_rank); + MPI_Comm_size(MPI_COMM_WORLD, &num_ranks); + + using ViewRemote_t = Kokkos::View; + using ViewHost_t = typename 
ViewRemote_t::HostMirror; + using TeamPolicy_t = Kokkos::TeamPolicy<>; + + ViewHost_t v_H("HostView", 1, 1); + + ViewRemote_t v_R = ViewRemote_t("RemoteView", num_ranks, 1); + ViewRemote_t v_R_cpy = ViewRemote_t("RemoteView", num_ranks, 1); + + Kokkos::parallel_for( + "Team", TeamPolicy_t(1, Kokkos::AUTO), + KOKKOS_LAMBDA(typename TeamPolicy_t::member_type team) { + Kokkos::parallel_for(Kokkos::TeamThreadRange(team, 1), + [&](const int i) { v_R(my_rank, 0) = 0x123; }); + + team.team_barrier(); + Kokkos::single(Kokkos::PerTeam(team), [&]() { + Kokkos::Experimental::RemoteSpaces::local_deep_copy(v_R_cpy, v_R); + }); + }); + + RemoteSpace_t().fence(); + + Kokkos::deep_copy(v_H, v_R_cpy); + ASSERT_EQ(0x123, v_H(0, 0)); +} + +template +void test_localdeepcopy(int i1, + typename std::enable_if_t< + (std::is_same::value && + std::is_same::value && + is_enabled_team == with_team)> * = nullptr) { + int my_rank; + int num_ranks; + MPI_Comm_rank(MPI_COMM_WORLD, &my_rank); + MPI_Comm_size(MPI_COMM_WORLD, &num_ranks); + + using ViewRemote_t = Kokkos::View; + using ViewHost_t = typename ViewRemote_t::HostMirror; + using TeamPolicy_t = Kokkos::TeamPolicy<>; + + ViewHost_t v_H("HostView", 1, i1); + + ViewRemote_t v_R = ViewRemote_t("RemoteView", num_ranks, i1); + ViewRemote_t v_R_cpy = ViewRemote_t("RemoteView", num_ranks, i1); + + Kokkos::parallel_for( + "Team", TeamPolicy_t(1, Kokkos::AUTO), + KOKKOS_LAMBDA(typename TeamPolicy_t::member_type team) { + Kokkos::parallel_for(Kokkos::TeamThreadRange(team, 1), + [&](const int i) { + for (int j = 0; j < i1; ++j) + v_R(my_rank, j) = 0x123; + }); + team.team_barrier(); + Kokkos::Experimental::RemoteSpaces::local_deep_copy(team, v_R_cpy, v_R); + }); + + Kokkos::deep_copy(v_H, v_R_cpy); + for (int j = 0; j < i1; ++j) ASSERT_EQ(0x123, v_H(0, j)); +} + +template +void test_localdeepcopy(int i1, + typename std::enable_if_t< + (std::is_same::value && + std::is_same::value && + is_enabled_team == without_team)> * = nullptr) { int my_rank; int num_ranks; MPI_Comm_rank(MPI_COMM_WORLD, &my_rank); @@ -92,7 +168,7 @@ void test_localdeepcopy( }); team.team_barrier(); - Kokkos::single(Kokkos::PerThread(team), [&]() { + Kokkos::single(Kokkos::PerTeam(team), [&]() { Kokkos::Experimental::RemoteSpaces::local_deep_copy(v_R_cpy, v_R); }); }); @@ -101,12 +177,12 @@ void test_localdeepcopy( for (int j = 0; j < i1; ++j) ASSERT_EQ(0x123, v_H(0, j)); } -template -void test_localdeepcopy( - int i1, int i2, - typename std::enable_if<(std::is_same::value && - std::is_same::value)>::type - * = nullptr) { +template +void test_localdeepcopy(int i1, int i2, + typename std::enable_if_t< + (std::is_same::value && + std::is_same::value && + is_enabled_team == without_team)> * = nullptr) { int my_rank; int num_ranks; MPI_Comm_rank(MPI_COMM_WORLD, &my_rank); @@ -132,7 +208,7 @@ void test_localdeepcopy( }); team.team_barrier(); - Kokkos::single(Kokkos::PerThread(team), [&]() { + Kokkos::single(Kokkos::PerTeam(team), [&]() { Kokkos::Experimental::RemoteSpaces::local_deep_copy(v_R_cpy, v_R); }); }); @@ -142,21 +218,412 @@ void test_localdeepcopy( for (int j = 0; j < i2; ++j) ASSERT_EQ(0x123, v_H(0, i, j)); } +template +void test_localdeepcopy(int i1, int i2, + typename std::enable_if_t< + (std::is_same::value && + std::is_same::value && + is_enabled_team == with_team)> * = nullptr) { + int my_rank; + int num_ranks; + MPI_Comm_rank(MPI_COMM_WORLD, &my_rank); + MPI_Comm_size(MPI_COMM_WORLD, &num_ranks); + + using ViewRemote_t = Kokkos::View; + using ViewHost_t = typename ViewRemote_t::HostMirror; + 
using TeamPolicy_t = Kokkos::TeamPolicy<>; + + ViewHost_t v_H("HostView", 1, i1, i2); + + ViewRemote_t v_R = ViewRemote_t("RemoteView", num_ranks, i1, i2); + ViewRemote_t v_R_cpy = ViewRemote_t("RemoteView", num_ranks, i1, i2); + + Kokkos::parallel_for( + "Team", TeamPolicy_t(1, Kokkos::AUTO), + KOKKOS_LAMBDA(typename TeamPolicy_t::member_type team) { + Kokkos::parallel_for(Kokkos::TeamThreadRange(team, 1), + [&](const int i) { + for (int j = 0; j < i1; ++j) + for (int k = 0; k < i2; ++k) + v_R(my_rank, j, k) = 0x123; + }); + + team.team_barrier(); + Kokkos::Experimental::RemoteSpaces::local_deep_copy(team, v_R_cpy, v_R); + }); + + Kokkos::deep_copy(v_H, v_R_cpy); + for (int i = 0; i < i1; ++i) + for (int j = 0; j < i2; ++j) ASSERT_EQ(0x123, v_H(0, i, j)); +} + +template +void test_localdeepcopy_withSubview( + int i1, int i2, + typename std::enable_if_t< + (std::is_same::value && + std::is_same::value && + is_enabled_team == without_team && block_op_type == get_op)> * = + nullptr) { + int my_rank; + int prev_rank, next_rank; + int num_ranks; + MPI_Comm_rank(MPI_COMM_WORLD, &my_rank); + MPI_Comm_size(MPI_COMM_WORLD, &num_ranks); + prev_rank = (my_rank - 1) < 0 ? num_ranks - 1 : my_rank - 1; + next_rank = (my_rank + 1) % num_ranks; + + if (num_ranks % 2 && num_ranks > 1) return; // skip + + using ViewRemote_t = Kokkos::View; + using ViewHost_t = typename ViewRemote_t::HostMirror; + using TeamPolicy_t = Kokkos::TeamPolicy<>; + + ViewHost_t v_H("HostView", 1, i1, i2); + + ViewRemote_t v_R = ViewRemote_t("RemoteView", num_ranks, i1, i2); + + auto next_range = Kokkos::Experimental::get_range(num_ranks, next_rank); + auto prev_range = Kokkos::Experimental::get_range(num_ranks, prev_rank); + auto local_range = Kokkos::Experimental::get_local_range(num_ranks); + auto v_R_subview_prev = + Kokkos::subview(v_R, prev_range, Kokkos::ALL, Kokkos::ALL); + auto v_R_subview_next = + Kokkos::subview(v_R, next_range, Kokkos::ALL, Kokkos::ALL); + auto v_R_subview_local = + Kokkos::subview(v_R, local_range, Kokkos::ALL, Kokkos::ALL); + + Kokkos::parallel_for( + "Init", i1, KOKKOS_LAMBDA(const int i) { + for (int j = 0; j < i2; ++j) v_R(my_rank, i, j) = my_rank; + }); + RemoteSpace_t().fence(); + + // Copy from next + if (my_rank % 2 == 0) { + Kokkos::parallel_for( + "Team", TeamPolicy_t(1, 1), + KOKKOS_LAMBDA(typename TeamPolicy_t::member_type team) { + Kokkos::single(Kokkos::PerTeam(team), [&]() { + Kokkos::Experimental::RemoteSpaces::local_deep_copy( + v_R_subview_local, v_R_subview_next); + }); + }); + } + RemoteSpace_t().fence(); + Kokkos::deep_copy(v_H, v_R); + if (my_rank % 2 == 0) { + for (int i = 0; i < i1; ++i) + for (int j = 0; j < i2; ++j) ASSERT_EQ(next_rank, v_H(0, i, j)); + } + + // Copy from previous + if (my_rank % 2 == 0) { + Kokkos::parallel_for( + "Team", TeamPolicy_t(1, 1), + KOKKOS_LAMBDA(typename TeamPolicy_t::member_type team) { + Kokkos::single(Kokkos::PerTeam(team), [&]() { + Kokkos::Experimental::RemoteSpaces::local_deep_copy( + v_R_subview_local, v_R_subview_prev); + }); + }); + } + + RemoteSpace_t().fence(); + Kokkos::deep_copy(v_H, v_R); + if (my_rank % 2 == 0) { + for (int i = 0; i < i1; ++i) + for (int j = 0; j < i2; ++j) ASSERT_EQ(prev_rank, v_H(0, i, j)); + } +} + +template +void test_localdeepcopy_withSubview( + int i1, int i2, + typename std::enable_if_t<( + std::is_same::value && + std::is_same::value && + is_enabled_team == with_team && block_op_type == get_op)> * = nullptr) { + int my_rank; + int prev_rank, next_rank; + int num_ranks; + MPI_Comm_rank(MPI_COMM_WORLD, 
&my_rank); + MPI_Comm_size(MPI_COMM_WORLD, &num_ranks); + prev_rank = (my_rank - 1) < 0 ? num_ranks - 1 : my_rank - 1; + next_rank = (my_rank + 1) % num_ranks; + + if (num_ranks % 2 && num_ranks > 1) return; // skip + + using ViewRemote_t = Kokkos::View; + using ViewHost_t = typename ViewRemote_t::HostMirror; + using TeamPolicy_t = Kokkos::TeamPolicy<>; + + ViewHost_t v_H("HostView", 1, i1, i2); + + ViewRemote_t v_R = ViewRemote_t("RemoteView", num_ranks, i1, i2); + + auto next_range = Kokkos::Experimental::get_range(num_ranks, next_rank); + auto prev_range = Kokkos::Experimental::get_range(num_ranks, prev_rank); + auto local_range = Kokkos::Experimental::get_local_range(num_ranks); + auto v_R_subview_prev = + Kokkos::subview(v_R, prev_range, Kokkos::ALL, Kokkos::ALL); + auto v_R_subview_next = + Kokkos::subview(v_R, next_range, Kokkos::ALL, Kokkos::ALL); + auto v_R_subview_local = + Kokkos::subview(v_R, local_range, Kokkos::ALL, Kokkos::ALL); + + Kokkos::parallel_for( + "Init", i1, KOKKOS_LAMBDA(const int i) { + for (int j = 0; j < i2; ++j) v_R(my_rank, i, j) = my_rank; + }); + RemoteSpace_t().fence(); + + // Copy from next + if (my_rank % 2 == 0) { + Kokkos::parallel_for( + "Team", TeamPolicy_t(1, 1), + KOKKOS_LAMBDA(typename TeamPolicy_t::member_type team) { + Kokkos::Experimental::RemoteSpaces::local_deep_copy( + team, v_R_subview_local, v_R_subview_next); + }); + } + RemoteSpace_t().fence(); + Kokkos::deep_copy(v_H, v_R); + if (my_rank % 2 == 0) { + for (int i = 0; i < i1; ++i) + for (int j = 0; j < i2; ++j) ASSERT_EQ(next_rank, v_H(0, i, j)); + } + + // Copy from previous + if (my_rank % 2 == 0) { + Kokkos::parallel_for( + "Team", TeamPolicy_t(1, 1), + KOKKOS_LAMBDA(typename TeamPolicy_t::member_type team) { + Kokkos::Experimental::RemoteSpaces::local_deep_copy( + team, v_R_subview_local, v_R_subview_prev); + }); + } + + RemoteSpace_t().fence(); + Kokkos::deep_copy(v_H, v_R); + if (my_rank % 2 == 0) { + for (int i = 0; i < i1; ++i) + for (int j = 0; j < i2; ++j) ASSERT_EQ(prev_rank, v_H(0, i, j)); + } +} + +template +void test_localdeepcopy_withSubview( + int i1, int i2, + typename std::enable_if_t< + (std::is_same::value && + std::is_same::value && + is_enabled_team == without_team && block_op_type == put_op)> * = + nullptr) { + int my_rank; + int prev_rank, next_rank; + int num_ranks; + MPI_Comm_rank(MPI_COMM_WORLD, &my_rank); + MPI_Comm_size(MPI_COMM_WORLD, &num_ranks); + prev_rank = (my_rank - 1) < 0 ? 
num_ranks - 1 : my_rank - 1; + next_rank = (my_rank + 1) % num_ranks; + + if (num_ranks % 2 && num_ranks > 1) return; // skip + + using ViewRemote_t = Kokkos::View; + using ViewHost_t = typename ViewRemote_t::HostMirror; + using TeamPolicy_t = Kokkos::TeamPolicy<>; + + ViewHost_t v_H("HostView", 1, i1, i2); + + ViewRemote_t v_R = ViewRemote_t("RemoteView", num_ranks, i1, i2); + + auto next_range = Kokkos::Experimental::get_range(num_ranks, next_rank); + auto prev_range = Kokkos::Experimental::get_range(num_ranks, prev_rank); + auto local_range = Kokkos::Experimental::get_local_range(num_ranks); + auto v_R_subview_prev = + Kokkos::subview(v_R, prev_range, Kokkos::ALL, Kokkos::ALL); + auto v_R_subview_next = + Kokkos::subview(v_R, next_range, Kokkos::ALL, Kokkos::ALL); + auto v_R_subview_local = + Kokkos::subview(v_R, local_range, Kokkos::ALL, Kokkos::ALL); + + Kokkos::parallel_for( + "Init", i1, KOKKOS_LAMBDA(const int i) { + for (int j = 0; j < i2; ++j) v_R(my_rank, i, j) = my_rank; + }); + RemoteSpace_t().fence(); + + // Put to next + if (my_rank % 2 == 0) { + Kokkos::parallel_for( + "Team", TeamPolicy_t(1, 1), + KOKKOS_LAMBDA(typename TeamPolicy_t::member_type team) { + Kokkos::single(Kokkos::PerTeam(team), [&]() { + Kokkos::Experimental::RemoteSpaces::local_deep_copy( + v_R_subview_next, v_R_subview_local); + }); + }); + } + RemoteSpace_t().fence(); + Kokkos::deep_copy(v_H, v_R); + if (my_rank % 2 != 0) { + for (int i = 0; i < i1; ++i) + for (int j = 0; j < i2; ++j) ASSERT_EQ(prev_rank, v_H(0, i, j)); + } + + // Put to previous + if (my_rank % 2 == 0) { + Kokkos::parallel_for( + "Team", TeamPolicy_t(1, 1), + KOKKOS_LAMBDA(typename TeamPolicy_t::member_type team) { + Kokkos::single(Kokkos::PerTeam(team), [&]() { + Kokkos::Experimental::RemoteSpaces::local_deep_copy( + v_R_subview_prev, v_R_subview_local); + }); + }); + } + + RemoteSpace_t().fence(); + Kokkos::deep_copy(v_H, v_R); + if (my_rank % 2 != 0) { + for (int i = 0; i < i1; ++i) + for (int j = 0; j < i2; ++j) ASSERT_EQ(next_rank, v_H(0, i, j)); + } +} + +template +void test_localdeepcopy_withSubview( + int i1, int i2, + typename std::enable_if_t<( + std::is_same::value && + std::is_same::value && + is_enabled_team == with_team && block_op_type == put_op)> * = nullptr) { + int my_rank; + int prev_rank, next_rank; + int num_ranks; + MPI_Comm_rank(MPI_COMM_WORLD, &my_rank); + MPI_Comm_size(MPI_COMM_WORLD, &num_ranks); + prev_rank = (my_rank - 1) < 0 ? 
num_ranks - 1 : my_rank - 1; + next_rank = (my_rank + 1) % num_ranks; + + if (num_ranks % 2 && num_ranks > 1) return; // skip + + using ViewRemote_t = Kokkos::View; + using ViewHost_t = typename ViewRemote_t::HostMirror; + using TeamPolicy_t = Kokkos::TeamPolicy<>; + + ViewHost_t v_H("HostView", 1, i1, i2); + + ViewRemote_t v_R = ViewRemote_t("RemoteView", num_ranks, i1, i2); + + auto next_range = Kokkos::Experimental::get_range(num_ranks, next_rank); + auto prev_range = Kokkos::Experimental::get_range(num_ranks, prev_rank); + auto local_range = Kokkos::Experimental::get_local_range(num_ranks); + auto v_R_subview_prev = + Kokkos::subview(v_R, prev_range, Kokkos::ALL, Kokkos::ALL); + auto v_R_subview_next = + Kokkos::subview(v_R, next_range, Kokkos::ALL, Kokkos::ALL); + auto v_R_subview_local = + Kokkos::subview(v_R, local_range, Kokkos::ALL, Kokkos::ALL); + + Kokkos::parallel_for( + "Init", i1, KOKKOS_LAMBDA(const int i) { + for (int j = 0; j < i2; ++j) v_R(my_rank, i, j) = my_rank; + }); + RemoteSpace_t().fence(); + + // Put to next + if (my_rank % 2 == 0) { + Kokkos::parallel_for( + "Team", TeamPolicy_t(1, 1), + KOKKOS_LAMBDA(typename TeamPolicy_t::member_type team) { + Kokkos::Experimental::RemoteSpaces::local_deep_copy( + team, v_R_subview_next, v_R_subview_local); + }); + } + RemoteSpace_t().fence(); + Kokkos::deep_copy(v_H, v_R); + if (my_rank % 2 != 0) { + for (int i = 0; i < i1; ++i) + for (int j = 0; j < i2; ++j) ASSERT_EQ(prev_rank, v_H(0, i, j)); + } + + // Put to previous + if (my_rank % 2 == 0) { + Kokkos::parallel_for( + "Team", TeamPolicy_t(1, 1), + KOKKOS_LAMBDA(typename TeamPolicy_t::member_type team) { + Kokkos::Experimental::RemoteSpaces::local_deep_copy( + team, v_R_subview_prev, v_R_subview_local); + }); + } + + RemoteSpace_t().fence(); + Kokkos::deep_copy(v_H, v_R); + if (my_rank % 2 != 0) { + for (int i = 0; i < i1; ++i) + for (int j = 0; j < i2; ++j) ASSERT_EQ(next_rank, v_H(0, i, j)); + } +} + TEST(TEST_CATEGORY, test_localdeepcopy) { // Scalar - test_localdeepcopy(); - test_localdeepcopy(); - test_localdeepcopy(); + test_localdeepcopy(); + test_localdeepcopy(); + test_localdeepcopy(); + + // Scalar with Teams + test_localdeepcopy(); + test_localdeepcopy(); + test_localdeepcopy(); // 1D - test_localdeepcopy(50); - test_localdeepcopy(150); - test_localdeepcopy(1500); + test_localdeepcopy(50); + test_localdeepcopy( + 150); + test_localdeepcopy( + 1500); + + // 1D with Teams + test_localdeepcopy(50); + test_localdeepcopy(150); + test_localdeepcopy(1500); // 2D - test_localdeepcopy(50, 20); - test_localdeepcopy(150, 99); - test_localdeepcopy(1500, 2199); + test_localdeepcopy(50, + 20); + test_localdeepcopy( + 150, 99); + test_localdeepcopy( + 1500, 2199); + + // 2D with Teams + test_localdeepcopy(50, 20); + test_localdeepcopy(150, + 99); + test_localdeepcopy(1500, + 2199); + + // 2D with Subviews (get block transfer) + test_localdeepcopy_withSubview(10, 10); + + // 2D with Teams and Subviews (get block transfer) + test_localdeepcopy_withSubview(10, 10); + + // 2D with Subviews (put block transfer) + test_localdeepcopy_withSubview(10, 10); + + // 2D with Teams and Subviews (put block transfer) + test_localdeepcopy_withSubview(10, 10); } #endif /* TEST_LOCAL_DEEP_COPY_HPP_ */ diff --git a/unit_tests/Test_PartitionedSubview.cpp b/unit_tests/Test_PartitionedSubview.cpp index 3bf0dcda..4d498dfe 100644 --- a/unit_tests/Test_PartitionedSubview.cpp +++ b/unit_tests/Test_PartitionedSubview.cpp @@ -153,7 +153,7 @@ void test_partitioned_subview3D(int i1, int i2, int sub1, int 
sub2) { } template -void test_partitioned_subview2D_byRank(int i1, int i2) { +void test_partitioned_subview2D_byRank_localRank(int i1, int i2) { int my_rank; int num_ranks; MPI_Comm_rank(MPI_COMM_WORLD, &my_rank); @@ -168,7 +168,7 @@ void test_partitioned_subview2D_byRank(int i1, int i2) { // Init for (int i = 0; i < i1; ++i) - for (int j = 0; j < i2; ++j) v_h(0, i, j) = VAL; + for (int j = 0; j < i2; ++j) v_h(0, i, j) = my_rank; auto v_sub = Kokkos::subview(v, std::make_pair(my_rank, my_rank + 1), Kokkos::ALL, Kokkos::ALL); @@ -184,7 +184,43 @@ void test_partitioned_subview2D_byRank(int i1, int i2) { Kokkos::deep_copy(v_h, v_sub); for (int i = 0; i < i1; ++i) - for (int j = 0; j < i2; ++j) ASSERT_EQ(v_h(0, i, j), VAL + 1); + for (int j = 0; j < i2; ++j) ASSERT_EQ(v_h(0, i, j), my_rank + 1); +} + +template +void test_partitioned_subview2D_byRank_nextRank(int i1, int i2) { + int my_rank, next_rank; + int num_ranks; + MPI_Comm_rank(MPI_COMM_WORLD, &my_rank); + MPI_Comm_size(MPI_COMM_WORLD, &num_ranks); + + next_rank = (my_rank + 1) % num_ranks; + + using ViewRemote_3D_t = Kokkos::View; + using ViewRemote_2D_t = Kokkos::View; + using ViewHost_3D_t = typename ViewRemote_3D_t::HostMirror; + + ViewRemote_3D_t v = ViewRemote_3D_t("RemoteView", num_ranks, i1, i2); + ViewHost_3D_t v_h("HostView", 1, i1, i2); + + // Init + for (int i = 0; i < i1; ++i) + for (int j = 0; j < i2; ++j) v_h(0, i, j) = my_rank; + + auto v_sub = Kokkos::subview(v, std::make_pair(my_rank, my_rank + 1), + Kokkos::ALL, Kokkos::ALL); + auto v_sub_next = Kokkos::subview(v, next_rank, Kokkos::ALL, Kokkos::ALL); + Kokkos::deep_copy(v_sub, v_h); + + Kokkos::parallel_for( + "Increment", v_sub_next.extent(0), KOKKOS_LAMBDA(const int i) { + for (int j = 0; j < v_sub_next.extent(1); ++j) v_sub_next(i, j)++; + }); + + Kokkos::deep_copy(v_h, v_sub); + + for (int i = 0; i < i1; ++i) + for (int j = 0; j < i2; ++j) ASSERT_EQ(v_h(0, i, j), my_rank + 1); } TEST(TEST_CATEGORY, test_partitioned_subview) { @@ -208,11 +244,26 @@ TEST(TEST_CATEGORY, test_partitioned_subview) { 19); // 2D subview split by dim0 - test_partitioned_subview2D_byRank(8, 1); - test_partitioned_subview2D_byRank(55, - 20); - test_partitioned_subview2D_byRank(50, - 77); + test_partitioned_subview2D_byRank_localRank( + 8, 1); + test_partitioned_subview2D_byRank_localRank( + 55, 20); + test_partitioned_subview2D_byRank_localRank( + 50, 77); + + // 2D subview split by dim0 + test_partitioned_subview2D_byRank_nextRank( + 8, 10); + test_partitioned_subview2D_byRank_nextRank(55, + 20); + test_partitioned_subview2D_byRank_nextRank(50, + 77); // 1D subview test_partitioned_subview1D(4, 4, 0, 0); @@ -232,9 +283,26 @@ TEST(TEST_CATEGORY, test_partitioned_subview) { test_partitioned_subview3D(70, 20, 0, 19); // 2D subview split by dim0 - test_partitioned_subview2D_byRank(8, 1); - test_partitioned_subview2D_byRank(55, 20); - test_partitioned_subview2D_byRank(50, 77); + test_partitioned_subview2D_byRank_localRank(8, + 1); + test_partitioned_subview2D_byRank_localRank( + 55, 20); + test_partitioned_subview2D_byRank_localRank( + 50, 77); + + // 2D subview split by dim0 + test_partitioned_subview2D_byRank_nextRank(8, + 10); + test_partitioned_subview2D_byRank_nextRank(55, + 20); + test_partitioned_subview2D_byRank_nextRank(50, + 77); } #endif /* TEST_PARTITIONED_SUBVIEW_HPP_ */ diff --git a/unit_tests/Test_Reduction.cpp b/unit_tests/Test_Reduction.cpp index a87c6a6d..23465170 100644 --- a/unit_tests/Test_Reduction.cpp +++ b/unit_tests/Test_Reduction.cpp @@ -43,9 +43,10 @@ void 
test_scalar_reduce_1D(int dim0) { // Init for (int i = 0; i < v_h.extent(0); ++i) - v_h(i) = (Data_t)local_range.first + i; + v_h(i) = static_cast(local_range.first + i); Kokkos::deep_copy(v, v_h); + RemoteSpace_t().fence(); Data_t gsum = 0; @@ -74,9 +75,11 @@ void test_scalar_reduce_2D(int dim0, int dim1) { // Init for (int i = 0; i < v_h.extent(0); ++i) for (int j = 0; j < v_h.extent(1); ++j) - v_h(i, j) = (Data_t)(local_range.first + i) * v_h.extent(1) + j; + v_h(i, j) = + static_cast(local_range.first + i) * v_h.extent(1) + j; Kokkos::deep_copy(v, v_h); + RemoteSpace_t().fence(); Data_t gsum = 0; @@ -98,30 +101,23 @@ void test_scalar_reduce_partitioned_1D(int dim1) { MPI_Comm_rank(MPI_COMM_WORLD, &my_rank); MPI_Comm_size(MPI_COMM_WORLD, &num_ranks); - using ViewRemote_3D_t = - Kokkos::View; using ViewRemote_2D_t = - Kokkos::View; - using ViewHost_3D_t = typename ViewRemote_3D_t::HostMirror; - - ViewRemote_3D_t v = - ViewRemote_3D_t("RemoteView", num_ranks /*dim0*/, dim1 / num_ranks); - ViewHost_3D_t v_h("HostView", 1 /*dim0*/, v.extent(1) /*dim1*/); - - auto v_sub = - Kokkos::subview(v, std::make_pair(my_rank, my_rank + 1), Kokkos::ALL); + Kokkos::View; - // Use a more sophisticated function to partition data if needed but may come - // at the expense of operator cost. Here we rely on that KRS internally - // allocates (dim1+num_ranks)/num_ranks symetrically. size_t dim1_block = dim1 / num_ranks; size_t block = dim1_block; size_t start = my_rank * block; + ViewRemote_2D_t v = + ViewRemote_2D_t("RemoteView", num_ranks /*dim0*/, dim1_block); + // Init - for (int i = 0; i < dim1_block; ++i) v_h(0, i) = (Data_t)start + i; + Kokkos::parallel_for( + "Local init", block, KOKKOS_LAMBDA(const int i) { + v(my_rank, i) = static_cast(start + i); + }); - Kokkos::deep_copy(v_sub, v_h); + RemoteSpace_t().fence(); Data_t gsum = 0; Kokkos::parallel_reduce( @@ -134,7 +130,7 @@ void test_scalar_reduce_partitioned_1D(int dim1) { }, gsum); - size_t total = dim1_block * num_ranks; + size_t total = block * num_ranks; ASSERT_EQ((total - 1) * (total) / 2, gsum); } @@ -147,29 +143,26 @@ void test_scalar_reduce_partitioned_2D(int dim1, int dim2) { using ViewRemote_3D_t = Kokkos::View; - using ViewRemote_2D_t = - Kokkos::View; using ViewHost_3D_t = typename ViewRemote_3D_t::HostMirror; - ViewRemote_3D_t v = - ViewRemote_3D_t("RemoteView", num_ranks /*dim0*/, dim1 / num_ranks, dim2); - ViewHost_3D_t v_h("HostView", 1 /*dim0*/, v.extent(1) /*dim1*/, - v.extent(2) /*dim2*/); - - auto v_sub = Kokkos::subview(v, std::make_pair(my_rank, my_rank + 1), - Kokkos::ALL, Kokkos::ALL); - size_t dim1_block = dim1 / num_ranks; size_t block = dim1_block * dim2; size_t start = my_rank * block; - // Init + ViewRemote_3D_t v = + ViewRemote_3D_t("RemoteView", num_ranks /*dim0*/, dim1_block, dim2); + + ViewHost_3D_t v_h = ViewHost_3D_t("HostView", 1 /*dim0*/, dim1_block, dim2); + + auto v_sub = Kokkos::subview(v, Kokkos::pair(my_rank, my_rank + 1), + Kokkos::ALL, Kokkos::ALL); for (int i = 0; i < dim1_block; ++i) for (int j = 0; j < v_h.extent(2); ++j) v_h(0, i, j) = (Data_t)start + i * dim2 + j; - Kokkos::deep_copy(v_sub, v_h); + RemoteSpace_t().fence(); + Data_t gsum = 0; Kokkos::parallel_reduce( "Global reduce", dim1_block * num_ranks, @@ -177,16 +170,20 @@ void test_scalar_reduce_partitioned_2D(int dim1, int dim2) { size_t pe, index; pe = i / dim1_block; index = i % dim1_block; - for (int j = 0; j < dim2; ++j) lsum += v(pe, index, j); + + for (int j = 0; j < v.extent(2); ++j) { + int tmp = v(pe, index, j); + lsum += v(pe, index, 
j); + } }, gsum); - size_t total = dim1_block * num_ranks * dim2; + size_t total = block * num_ranks; ASSERT_EQ((total - 1) * (total) / 2, gsum); } TEST(TEST_CATEGORY, test_reduce) { - // Param 1: array size + // Params: array size // Scalar reduce test_scalar_reduce_1D(0); diff --git a/unit_tests/Test_RemoteAccess.cpp b/unit_tests/Test_RemoteAccess.cpp index fec9f801..7eb19c84 100644 --- a/unit_tests/Test_RemoteAccess.cpp +++ b/unit_tests/Test_RemoteAccess.cpp @@ -41,11 +41,8 @@ void test_remote_accesses(int size) { RemoteView_t v_R = RemoteView_t("RemoteView", num_ranks, size); HostSpace_t v_H("HostView", v_R.extent(0), size); - // Allocate remote view - - RemoteSpace_t().fence(); - int next_rank = (my_rank + 1) % num_ranks; + int prev_rank = (my_rank - 1) < 0 ? num_ranks - 1 : my_rank - 1; Kokkos::parallel_for( "Update", size, KOKKOS_LAMBDA(const int i) { @@ -57,7 +54,7 @@ void test_remote_accesses(int size) { Data_t check(0), ref(0); for (int i = 0; i < size; i++) { check += v_H(0, i); - ref += next_rank * size + i; + ref += prev_rank * size + i; } ASSERT_EQ(check, ref); } @@ -65,7 +62,7 @@ void test_remote_accesses(int size) { TEST(TEST_CATEGORY, test_remote_accesses) { test_remote_accesses(0); test_remote_accesses(1); - test_remote_accesses(122); + test_remote_accesses(64); test_remote_accesses(4567); test_remote_accesses(89); } diff --git a/unit_tests/Test_Subview.cpp b/unit_tests/Test_Subview.cpp index 31393a3e..2368097f 100644 --- a/unit_tests/Test_Subview.cpp +++ b/unit_tests/Test_Subview.cpp @@ -216,7 +216,7 @@ void test_subview3D_DCCopiesSubviewAccess(int i1, int i2, int i3) { TEST(TEST_CATEGORY, test_subview) { // 1D subview - Subview with GlobalLayout - /*test_subview1D(20); + test_subview1D(20); test_subview1D(555); test_subview1D(123); @@ -234,7 +234,7 @@ TEST(TEST_CATEGORY, test_subview) { // deep_copy accessing the subview directly test_subview3D_DCCopiesSubviewAccess(20, 20, 20); test_subview3D_DCCopiesSubviewAccess(55, 11, 13); - test_subview3D_DCCopiesSubviewAccess(13, 31, 23);*/ + test_subview3D_DCCopiesSubviewAccess(13, 31, 23); } #endif /* TEST_SUBVIEW_HPP_ */ \ No newline at end of file
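For reference, the block-transfer pattern exercised by the new Test_LocalDeepCopy cases above looks roughly as follows in application code. This is a minimal sketch, assuming only the calls visible in this patch (Kokkos::Experimental::RemoteSpaces::local_deep_copy, get_range, get_local_range, RemoteSpace_t fences); the element type double, the extent n, and the helper name fetch_neighbor_block are illustrative, and MPI/Kokkos initialization is assumed to happen elsewhere.

#include <Kokkos_Core.hpp>
#include <Kokkos_RemoteSpaces.hpp>
#include <mpi.h>

using RemoteSpace_t = Kokkos::Experimental::DefaultRemoteMemorySpace;

// Fetch the next rank's block of a remote view into the locally owned block,
// mirroring the get_op subview tests above.
void fetch_neighbor_block(int n) {
  int my_rank, num_ranks;
  MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);
  MPI_Comm_size(MPI_COMM_WORLD, &num_ranks);
  const int next_rank = (my_rank + 1) % num_ranks;

  using ViewRemote_t = Kokkos::View<double **, RemoteSpace_t>;
  using TeamPolicy_t = Kokkos::TeamPolicy<>;

  // Dim 0 indexes the owning rank (PE), as in the tests above.
  ViewRemote_t v("RemoteView", num_ranks, n);

  // Subviews selecting the locally owned slice and the neighbor's slice.
  auto local_range = Kokkos::Experimental::get_local_range(num_ranks);
  auto next_range  = Kokkos::Experimental::get_range(num_ranks, next_rank);
  auto v_local = Kokkos::subview(v, local_range, Kokkos::ALL);
  auto v_next  = Kokkos::subview(v, next_range, Kokkos::ALL);

  RemoteSpace_t().fence();

  // One team issues a single block get of the neighbor's slice.
  Kokkos::parallel_for(
      "fetch_block", TeamPolicy_t(1, 1),
      KOKKOS_LAMBDA(typename TeamPolicy_t::member_type team) {
        Kokkos::Experimental::RemoteSpaces::local_deep_copy(team, v_local,
                                                            v_next);
      });

  // Make the transfer visible before any rank reads the data.
  RemoteSpace_t().fence();
}

The put path is symmetric: passing the local subview as the source and a remote subview as the destination issues a block put, as in the put_op tests above.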