From c3905330983e2203a8c118b4acbcbc11245d22c7 Mon Sep 17 00:00:00 2001 From: Maarten Arnst Date: Thu, 9 Nov 2023 17:01:36 +0100 Subject: [PATCH 1/2] Use eventset for roctxconnector --- CMakeLists.txt | 3 +- common/kp_config.hpp.in | 1 + example/CMakeLists.txt | 3 + profiling/all/kp_all.cpp | 6 + profiling/nvtx-connector/Makefile | 2 +- profiling/roctx-connector/CMakeLists.txt | 4 +- .../roctx-connector/kp_roctx_connector.cpp | 133 +++++++++++++----- 7 files changed, 115 insertions(+), 37 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 428402c1a..7e7699414 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -95,7 +95,8 @@ endif() include(cmake/configure_variorum.cmake) set(KOKKOSTOOLS_HAS_CALIPER ${KokkosTools_ENABLE_CALIPER}) -set(KOKKOSTOOLS_HAS_NVTX ${Kokkos_ENABLE_CUDA}) # we assume that enabling CUDA for Kokkos program means nvtx should be available +set(KOKKOSTOOLS_HAS_NVTX ${Kokkos_ENABLE_CUDA}) # we assume that enabling CUDA for Kokkos program means nvtx should be available +set(KOKKOSTOOLS_HAS_ROCTX ${Kokkos_ENABLE_HIP}) # we assume that enabling HIP for Kokkos program means roctx should be available if(DEFINED ENV{VTUNE_HOME}) set(VTune_ROOT $ENV{VTUNE_HOME}) diff --git a/common/kp_config.hpp.in b/common/kp_config.hpp.in index 77c160870..09f2ad0d7 100644 --- a/common/kp_config.hpp.in +++ b/common/kp_config.hpp.in @@ -3,6 +3,7 @@ #define USE_MPI @KOKKOSTOOLS_HAS_MPI@ #cmakedefine KOKKOSTOOLS_HAS_NVTX +#cmakedefine KOKKOSTOOLS_HAS_ROCTX #cmakedefine KOKKOSTOOLS_HAS_CALIPER #cmakedefine KOKKOSTOOLS_HAS_SYSTEMTAP #cmakedefine KOKKOSTOOLS_HAS_VARIORUM diff --git a/example/CMakeLists.txt b/example/CMakeLists.txt index e7490dbcb..88b3b5ac4 100644 --- a/example/CMakeLists.txt +++ b/example/CMakeLists.txt @@ -48,3 +48,6 @@ if(KOKKOSTOOLS_HAS_NVTX) add_kp_test(nvtx_connector "nvtx-connector") add_kp_test(nvtx_focused_connector "nvtx-focused-connector") endif() +if(KOKKOSTOOLS_HAS_ROCTX) + add_kp_test(roctx_connector "roctx-connector") +endif() diff 
--git a/profiling/all/kp_all.cpp b/profiling/all/kp_all.cpp index 67419b039..1c8691274 100644 --- a/profiling/all/kp_all.cpp +++ b/profiling/all/kp_all.cpp @@ -52,6 +52,9 @@ KOKKOSTOOLS_EXTERN_EVENT_SET(VariorumConnector) KOKKOSTOOLS_EXTERN_EVENT_SET(NVTXConnector) KOKKOSTOOLS_EXTERN_EVENT_SET(NVTXFocusedConnector) #endif +#ifdef KOKKOSTOOLS_HAS_ROCTX +KOKKOSTOOLS_EXTERN_EVENT_SET(ROCTXConnector) +#endif #ifdef KOKKOSTOOLS_HAS_CALIPER namespace cali { extern Kokkos::Tools::Experimental::EventSet get_kokkos_event_set( @@ -93,6 +96,9 @@ EventSet get_event_set(const char* profiler, const char* config_str) { #ifdef KOKKOSTOOLS_HAS_NVTX handlers["nvtx-connector"] = NVTXConnector::get_event_set(); handlers["nvtx-focused-connector"] = NVTXFocusedConnector::get_event_set(); +#endif +#ifdef KOKKOSTOOLS_HAS_ROCTX + handlers["roctx-connector"] = ROCTXConnector::get_event_set(); #endif auto e = handlers.find(profiler); if (e != handlers.end()) return e->second; diff --git a/profiling/nvtx-connector/Makefile b/profiling/nvtx-connector/Makefile index 35071bfcf..ab82ca8af 100644 --- a/profiling/nvtx-connector/Makefile +++ b/profiling/nvtx-connector/Makefile @@ -1,5 +1,5 @@ CXX=g++ -CXXFLAGS=-O3 -std=c++11 -g -I$(CUDA_ROOT)/include +CXXFLAGS=-O3 -std=c++11 -g -I$(CUDA_ROOT)/include/ LDFLAGS=-L$(CUDA_ROOT)/lib64 LIBS=-lnvToolsExt SHARED_CXXFLAGS=-shared -fPIC diff --git a/profiling/roctx-connector/CMakeLists.txt b/profiling/roctx-connector/CMakeLists.txt index 2c6857303..a7d18c829 100644 --- a/profiling/roctx-connector/CMakeLists.txt +++ b/profiling/roctx-connector/CMakeLists.txt @@ -3,5 +3,5 @@ find_path(ROCM_ROCTX_INCLUDE roctx.h REQUIRED HINTS $ENV{ROCM_PATH}/include/roct kp_add_library(kp_roctx_connector kp_roctx_connector.cpp) -target_include_directories(kp_roctx_connector PRIVATE ${ROCM_ROCTX_INCLUDE}) -target_link_libraries(kp_roctx_connector PRIVATE ${ROCM_ROCTX_LIB}) +target_include_directories(kp_roctx_connector PUBLIC ${ROCM_ROCTX_INCLUDE}) 
+target_link_libraries(kp_roctx_connector PUBLIC ${ROCM_ROCTX_LIB}) diff --git a/profiling/roctx-connector/kp_roctx_connector.cpp b/profiling/roctx-connector/kp_roctx_connector.cpp index 593210aa3..543696191 100644 --- a/profiling/roctx-connector/kp_roctx_connector.cpp +++ b/profiling/roctx-connector/kp_roctx_connector.cpp @@ -21,6 +21,8 @@ #include #include +#include "kp_core.hpp" + namespace { struct Section { std::string label; @@ -29,29 +31,40 @@ struct Section { std::vector
kokkosp_sections; } // namespace -struct Kokkos_Tools_ToolSettings { - bool requires_global_fencing; - bool padding[255]; -}; +namespace KokkosTools { +namespace ROCTXConnector { -extern "C" void kokkosp_request_tool_settings( +static bool tool_globfences; + +void kokkosp_request_tool_settings( const uint32_t, Kokkos_Tools_ToolSettings* settings) { - settings->requires_global_fencing = false; + if (tool_globfences) { + settings->requires_global_fencing = true; + } else { + settings->requires_global_fencing = false; + } } -extern "C" void kokkosp_init_library(const int loadSeq, - const uint64_t interfaceVer, - const uint32_t /*devInfoCount*/, - void* /*deviceInfo*/) { +void kokkosp_init_library(const int loadSeq, + const uint64_t interfaceVer, + const uint32_t /*devInfoCount*/, + Kokkos_Profiling_KokkosPDeviceInfo* /*deviceInfo*/) { + + const char* tool_global_fences = std::getenv("KOKKOS_TOOLS_GLOBALFENCES"); + if (tool_global_fences) { + tool_globfences = (atoi(tool_global_fences) != 0); + } + std::cout << "-----------------------------------------------------------\n" << "KokkosP: ROC Tracer Connector (sequence is " << loadSeq << ", version: " << interfaceVer << ")\n" + << "Global fences: " << (tool_globfences ? "ON" : "OFF") << "\n" << "-----------------------------------------------------------\n"; roctxMark("Kokkos::Initialization Complete"); } -extern "C" void kokkosp_finalize_library() { +void kokkosp_finalize_library() { std::cout << R"( ----------------------------------------------------------- KokkosP: Finalization of ROC Tracer Connector. Complete. @@ -61,66 +74,120 @@ KokkosP: Finalization of ROC Tracer Connector. Complete. 
roctxMark("Kokkos::Finalization Complete"); } -extern "C" void kokkosp_begin_parallel_for(const char* name, - const uint32_t /*devID*/, - uint64_t* /*kID*/) { +void kokkosp_begin_parallel_for(const char* name, + const uint32_t /*devID*/, + uint64_t* /*kID*/) { roctxRangePush(name); } -extern "C" void kokkosp_end_parallel_for(const uint64_t /*kID*/) { +void kokkosp_end_parallel_for(const uint64_t /*kID*/) { roctxRangePop(); } -extern "C" void kokkosp_begin_parallel_scan(const char* name, - const uint32_t /*devID*/, - uint64_t* /*kID*/) { +void kokkosp_begin_parallel_scan(const char* name, + const uint32_t /*devID*/, + uint64_t* /*kID*/) { roctxRangePush(name); } -extern "C" void kokkosp_end_parallel_scan(const uint64_t /*kID*/) { +void kokkosp_end_parallel_scan(const uint64_t /*kID*/) { roctxRangePop(); } -extern "C" void kokkosp_begin_parallel_reduce(const char* name, - const uint32_t /*devID*/, - uint64_t* /*kID*/) { +void kokkosp_begin_parallel_reduce(const char* name, + const uint32_t /*devID*/, + uint64_t* /*kID*/) { roctxRangePush(name); } -extern "C" void kokkosp_end_parallel_reduce(const uint64_t /*kID*/) { +void kokkosp_end_parallel_reduce(const uint64_t /*kID*/) { roctxRangePop(); } -extern "C" void kokkosp_push_profile_region(char* name) { +void kokkosp_push_profile_region(const char* name) { roctxRangePush(name); } -extern "C" void kokkosp_pop_profile_region() { roctxRangePop(); } +void kokkosp_pop_profile_region() { roctxRangePop(); } -extern "C" void kokkosp_create_profile_section(const char* name, - uint32_t* sID) { +void kokkosp_create_profile_section(const char* name, + uint32_t* sID) { *sID = kokkosp_sections.size(); kokkosp_sections.push_back( {std::string(name), static_cast(-1)}); } -extern "C" void kokkosp_start_profile_section(const uint32_t sID) { +void kokkosp_start_profile_section(const uint32_t sID) { auto& section = kokkosp_sections[sID]; section.id = roctxRangeStart(section.label.c_str()); } -extern "C" void 
kokkosp_stop_profile_section(const uint32_t sID) { +void kokkosp_stop_profile_section(const uint32_t sID) { auto const& section = kokkosp_sections[sID]; roctxRangeStop(section.id); } -extern "C" void kokkosp_destroy_profile_section(const uint32_t sID) { +void kokkosp_destroy_profile_section(const uint32_t sID) { // do nothing } -extern "C" void kokkosp_begin_fence(const char* name, const uint32_t /*devID*/, - uint64_t* fID) { +void kokkosp_profile_event(const char* name) { roctxMark(name); } + +void kokkosp_begin_fence(const char* name, const uint32_t /*devID*/, + uint64_t* fID) { *fID = roctxRangeStart(name); } -extern "C" void kokkosp_end_fence(const uint64_t fID) { roctxRangeStop(fID); } +void kokkosp_end_fence(const uint64_t fID) { roctxRangeStop(fID); } + +Kokkos::Tools::Experimental::EventSet get_event_set() { + Kokkos::Tools::Experimental::EventSet my_event_set; + memset(&my_event_set, 0, + sizeof(my_event_set)); // zero any pointers not set here + my_event_set.request_tool_settings = kokkosp_request_tool_settings; + my_event_set.init = kokkosp_init_library; + my_event_set.finalize = kokkosp_finalize_library; + my_event_set.push_region = kokkosp_push_profile_region; + my_event_set.pop_region = kokkosp_pop_profile_region; + my_event_set.begin_parallel_for = kokkosp_begin_parallel_for; + my_event_set.begin_parallel_reduce = kokkosp_begin_parallel_reduce; + my_event_set.begin_parallel_scan = kokkosp_begin_parallel_scan; + my_event_set.end_parallel_for = kokkosp_end_parallel_for; + my_event_set.end_parallel_reduce = kokkosp_end_parallel_reduce; + my_event_set.end_parallel_scan = kokkosp_end_parallel_scan; + my_event_set.create_profile_section = kokkosp_create_profile_section; + my_event_set.start_profile_section = kokkosp_start_profile_section; + my_event_set.stop_profile_section = kokkosp_stop_profile_section; + my_event_set.destroy_profile_section = kokkosp_destroy_profile_section; + my_event_set.profile_event = kokkosp_profile_event; + 
my_event_set.begin_fence = kokkosp_begin_fence; + my_event_set.end_fence = kokkosp_end_fence; + return my_event_set; +} + +} // namespace ROCTXConnector +} // namespace KokkosTools + +extern "C" { + +namespace impl = KokkosTools::ROCTXConnector; + +EXPOSE_TOOL_SETTINGS(impl::kokkosp_request_tool_settings) +EXPOSE_INIT(impl::kokkosp_init_library) +EXPOSE_FINALIZE(impl::kokkosp_finalize_library) +EXPOSE_PUSH_REGION(impl::kokkosp_push_profile_region) +EXPOSE_POP_REGION(impl::kokkosp_pop_profile_region) +EXPOSE_BEGIN_PARALLEL_FOR(impl::kokkosp_begin_parallel_for) +EXPOSE_END_PARALLEL_FOR(impl::kokkosp_end_parallel_for) +EXPOSE_BEGIN_PARALLEL_SCAN(impl::kokkosp_begin_parallel_scan) +EXPOSE_END_PARALLEL_SCAN(impl::kokkosp_end_parallel_scan) +EXPOSE_BEGIN_PARALLEL_REDUCE(impl::kokkosp_begin_parallel_reduce) +EXPOSE_END_PARALLEL_REDUCE(impl::kokkosp_end_parallel_reduce) +EXPOSE_CREATE_PROFILE_SECTION(impl::kokkosp_create_profile_section) +EXPOSE_START_PROFILE_SECTION(impl::kokkosp_start_profile_section) +EXPOSE_STOP_PROFILE_SECTION(impl::kokkosp_stop_profile_section) +EXPOSE_DESTROY_PROFILE_SECTION(impl::kokkosp_destroy_profile_section) +EXPOSE_PROFILE_EVENT(impl::kokkosp_profile_event); +EXPOSE_BEGIN_FENCE(impl::kokkosp_begin_fence); +EXPOSE_END_FENCE(impl::kokkosp_end_fence); +} // extern "C" From 194a1fd919fcae1e407d5872d916226be1bcb3f0 Mon Sep 17 00:00:00 2001 From: Maarten Arnst Date: Fri, 17 Nov 2023 16:12:46 +0100 Subject: [PATCH 2/2] Improvements thanks to review. 
--- profiling/nvtx-connector/Makefile | 2 +- profiling/roctx-connector/CMakeLists.txt | 4 +- .../roctx-connector/kp_roctx_connector.cpp | 37 ++++++------------- 3 files changed, 14 insertions(+), 29 deletions(-) diff --git a/profiling/nvtx-connector/Makefile b/profiling/nvtx-connector/Makefile index ab82ca8af..35071bfcf 100644 --- a/profiling/nvtx-connector/Makefile +++ b/profiling/nvtx-connector/Makefile @@ -1,5 +1,5 @@ CXX=g++ -CXXFLAGS=-O3 -std=c++11 -g -I$(CUDA_ROOT)/include/ +CXXFLAGS=-O3 -std=c++11 -g -I$(CUDA_ROOT)/include LDFLAGS=-L$(CUDA_ROOT)/lib64 LIBS=-lnvToolsExt SHARED_CXXFLAGS=-shared -fPIC diff --git a/profiling/roctx-connector/CMakeLists.txt b/profiling/roctx-connector/CMakeLists.txt index a7d18c829..2c6857303 100644 --- a/profiling/roctx-connector/CMakeLists.txt +++ b/profiling/roctx-connector/CMakeLists.txt @@ -3,5 +3,5 @@ find_path(ROCM_ROCTX_INCLUDE roctx.h REQUIRED HINTS $ENV{ROCM_PATH}/include/roct kp_add_library(kp_roctx_connector kp_roctx_connector.cpp) -target_include_directories(kp_roctx_connector PUBLIC ${ROCM_ROCTX_INCLUDE}) -target_link_libraries(kp_roctx_connector PUBLIC ${ROCM_ROCTX_LIB}) +target_include_directories(kp_roctx_connector PRIVATE ${ROCM_ROCTX_INCLUDE}) +target_link_libraries(kp_roctx_connector PRIVATE ${ROCM_ROCTX_LIB}) diff --git a/profiling/roctx-connector/kp_roctx_connector.cpp b/profiling/roctx-connector/kp_roctx_connector.cpp index 543696191..6c1ea182e 100644 --- a/profiling/roctx-connector/kp_roctx_connector.cpp +++ b/profiling/roctx-connector/kp_roctx_connector.cpp @@ -36,8 +36,8 @@ namespace ROCTXConnector { static bool tool_globfences; -void kokkosp_request_tool_settings( - const uint32_t, Kokkos_Tools_ToolSettings* settings) { +void kokkosp_request_tool_settings(const uint32_t, + Kokkos_Tools_ToolSettings* settings) { if (tool_globfences) { settings->requires_global_fencing = true; } else { @@ -45,11 +45,9 @@ void kokkosp_request_tool_settings( } } -void kokkosp_init_library(const int loadSeq, - const uint64_t 
interfaceVer, +void kokkosp_init_library(const int loadSeq, const uint64_t interfaceVer, const uint32_t /*devInfoCount*/, Kokkos_Profiling_KokkosPDeviceInfo* /*deviceInfo*/) { - const char* tool_global_fences = std::getenv("KOKKOS_TOOLS_GLOBALFENCES"); if (tool_global_fences) { tool_globfences = (atoi(tool_global_fences) != 0); @@ -58,7 +56,6 @@ void kokkosp_init_library(const int loadSeq, std::cout << "-----------------------------------------------------------\n" << "KokkosP: ROC Tracer Connector (sequence is " << loadSeq << ", version: " << interfaceVer << ")\n" - << "Global fences: " << (tool_globfences ? "ON" : "OFF") << "\n" << "-----------------------------------------------------------\n"; roctxMark("Kokkos::Initialization Complete"); @@ -74,44 +71,32 @@ KokkosP: Finalization of ROC Tracer Connector. Complete. roctxMark("Kokkos::Finalization Complete"); } -void kokkosp_begin_parallel_for(const char* name, - const uint32_t /*devID*/, +void kokkosp_begin_parallel_for(const char* name, const uint32_t /*devID*/, uint64_t* /*kID*/) { roctxRangePush(name); } -void kokkosp_end_parallel_for(const uint64_t /*kID*/) { - roctxRangePop(); -} +void kokkosp_end_parallel_for(const uint64_t /*kID*/) { roctxRangePop(); } -void kokkosp_begin_parallel_scan(const char* name, - const uint32_t /*devID*/, +void kokkosp_begin_parallel_scan(const char* name, const uint32_t /*devID*/, uint64_t* /*kID*/) { roctxRangePush(name); } -void kokkosp_end_parallel_scan(const uint64_t /*kID*/) { - roctxRangePop(); -} +void kokkosp_end_parallel_scan(const uint64_t /*kID*/) { roctxRangePop(); } -void kokkosp_begin_parallel_reduce(const char* name, - const uint32_t /*devID*/, +void kokkosp_begin_parallel_reduce(const char* name, const uint32_t /*devID*/, uint64_t* /*kID*/) { roctxRangePush(name); } -void kokkosp_end_parallel_reduce(const uint64_t /*kID*/) { - roctxRangePop(); -} +void kokkosp_end_parallel_reduce(const uint64_t /*kID*/) { roctxRangePop(); } -void kokkosp_push_profile_region(const 
char* name) { - roctxRangePush(name); -} +void kokkosp_push_profile_region(const char* name) { roctxRangePush(name); } void kokkosp_pop_profile_region() { roctxRangePop(); } -void kokkosp_create_profile_section(const char* name, - uint32_t* sID) { +void kokkosp_create_profile_section(const char* name, uint32_t* sID) { *sID = kokkosp_sections.size(); kokkosp_sections.push_back( {std::string(name), static_cast(-1)});