Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat(nvidia): Add cupti support #323

Merged
merged 1 commit into from
Oct 22, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
54 changes: 54 additions & 0 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,7 @@ IfUpdatedUnsetAll(lo2s_USE_STATIC_LIBS
Libpfm_USE_STATIC_LIBS
X86Adapt_STATIC
x86_energy_STATIC
CUDA_USE_STATIC_LIBS
)

if(lo2s_USE_STATIC_LIBS STREQUAL "OFF")
Expand All @@ -45,6 +46,7 @@ if(lo2s_USE_STATIC_LIBS STREQUAL "OFF")
set(x86_energy_STATIC OFF CACHE BOOL "")
set(Sensors_USE_STATIC_LIBS OFF CACHE BOOL "")
set(Libpfm_USE_STATIC_LIBS OFF CACHE BOOL "")
set(CUDA_USE_STATIC_LIBS OFF CACHE BOOL "")
endif()

if(lo2s_USE_STATIC_LIBS STREQUAL "MOSTLY")
Expand All @@ -56,6 +58,7 @@ if(lo2s_USE_STATIC_LIBS STREQUAL "MOSTLY")
set(x86_energy_STATIC ON CACHE BOOL "")
set(Sensors_USE_STATIC_LIBS ON CACHE BOOL "")
set(Libpfm_USE_STATIC_LIBS ON CACHE BOOL "")
set(CUDA_USE_STATIC_LIBS ON CACHE BOOL "")
set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} -static-libstdc++ -static-libgcc")
endif()

Expand All @@ -68,6 +71,7 @@ if(lo2s_USE_STATIC_LIBS STREQUAL "ALL")
set(x86_energy_STATIC ON CACHE BOOL "")
set(Sensors_USE_STATIC_LIBS ON CACHE BOOL "")
set(Libpfm_USE_STATIC_LIBS ON CACHE BOOL "")
set(CUDA_USE_STATIC_LIBS ON CACHE BOOL "")

# Doesn't seem to work with clang, even though it should,
# but at least it doesn't complain about it either
Expand Down Expand Up @@ -107,6 +111,7 @@ find_package(Sensors)
find_package(Veosinfo)
find_package(Libpfm)
find_package(PkgConfig)
find_package(CUDAToolkit)

if(PkgConfig_FOUND)
pkg_check_modules(Audit audit)
Expand All @@ -129,6 +134,8 @@ CMAKE_DEPENDENT_OPTION(USE_LIBAUDIT "Use libaudit for syscall name resolution" O
add_feature_info("USE_LIBAUDIT" USE_LIBAUDIT "Use libaudit for syscall name resolution.")
CMAKE_DEPENDENT_OPTION(USE_VEOSINFO "Use libveosinfo to sample NEC SX-Aurora Tsubasa cards." ON "Veosinfo_FOUND" OFF)
add_feature_info("USE_VEOSINFO" USE_VEOSINFO "Use libveosinfo to sample NEC SX-Aurora Tsubasa cards.")
# CUPTI support needs the CUDA toolkit *and* its CUPTI component. FindCUDAToolkit
# only creates the CUDA::cupti imported target when the CUPTI library itself was
# located, so the option also depends on that target existing. Without this a
# toolkit installation lacking CUPTI would enable USE_CUPTI by default and then
# fail when linking against the missing CUDA::cupti target.
CMAKE_DEPENDENT_OPTION(USE_CUPTI "Use CUPTI to record CUDA activity." ON "CUDAToolkit_FOUND;TARGET CUDA::cupti" OFF)
add_feature_info("USE_CUPTI" USE_CUPTI "Use CUPTI to record CUDA activity.")
# system configuration checks
CHECK_INCLUDE_FILES(linux/hw_breakpoint.h HAVE_HW_BREAKPOINT_H)
CHECK_STRUCT_HAS_MEMBER("struct perf_event_attr" clockid linux/perf_event.h HAVE_PERF_EVENT_ATTR_CLOCKID)
Expand All @@ -144,6 +151,13 @@ if(NOT CLOCK_GETTIME_FOUND)
unset(CMAKE_REQUIRED_LIBRARIES)
endif()

# Probe for shm_open(): first without extra libraries, then again with librt
# added to the link line of the check. The second result is stored separately
# in SHM_OPEN_FOUND_WITH_RT so later code can decide whether to link rt.
check_function_exists(shm_open SHM_OPEN_FOUND)
if(NOT SHM_OPEN_FOUND)
    set(CMAKE_REQUIRED_LIBRARIES rt)
    check_function_exists(shm_open SHM_OPEN_FOUND_WITH_RT)
    # set() without a value removes the variable again
    set(CMAKE_REQUIRED_LIBRARIES)
endif()

cvonelm marked this conversation as resolved.
Show resolved Hide resolved
CHECK_STRUCT_HAS_BITFIELD("struct perf_event_attr" context_switch linux/perf_event.h HAVE_PERF_RECORD_SWITCH)

if(NOT HAVE_PERF_RECORD_SWITCH)
Expand Down Expand Up @@ -226,6 +240,14 @@ if(NOT CLOCK_GETTIME_FOUND)
endif()
endif()

# shm_open() is mandatory: if it is not available without extra libraries,
# link lo2s against librt when that provides it, otherwise report an error.
if(NOT SHM_OPEN_FOUND AND SHM_OPEN_FOUND_WITH_RT)
    target_link_libraries(lo2s PRIVATE rt)
elseif(NOT SHM_OPEN_FOUND)
    message(SEND_ERROR "Could not find the function shm_open(), but it is required.")
endif()

# handle x86_adapt dependency
if(X86Adapt_FOUND)
target_sources(lo2s PRIVATE
Expand Down Expand Up @@ -306,6 +328,38 @@ if (USE_LIBAUDIT)
endif()
endif()

# Absolute location at which liblo2s_injection.so is installed; substituted
# into build_config.hpp through the LO2S_CUDA_INJECTIONLIB_PATH #cmakedefine.
# CMAKE_INSTALL_FULL_LIBDIR (provided by GNUInstallDirs alongside
# CMAKE_INSTALL_LIBDIR) is used instead of concatenating CMAKE_INSTALL_PREFIX
# and CMAKE_INSTALL_LIBDIR by hand, because the latter yields a bogus path when
# the user configures an absolute CMAKE_INSTALL_LIBDIR.
set(LO2S_CUDA_INJECTIONLIB_PATH "${CMAKE_INSTALL_FULL_LIBDIR}/liblo2s_injection.so")
if(USE_CUPTI)
if(CUDAToolkit_FOUND)
cvonelm marked this conversation as resolved.
Show resolved Hide resolved
# Shared library built from the CUPTI source file; it is installed next to
# the other libraries and its install path is recorded in
# LO2S_CUDA_INJECTIONLIB_PATH.
add_library(lo2s_injection SHARED src/cupti/lib.cpp)
target_include_directories(lo2s_injection
    PRIVATE
        include
        ${CMAKE_CURRENT_BINARY_DIR}/include
)

# Pick the CUPTI flavour matching the static/shared preference for CUDA.
if(NOT CUDA_USE_STATIC_LIBS)
    target_link_libraries(lo2s_injection PRIVATE CUDA::cupti)
else()
    target_link_libraries(lo2s_injection PRIVATE CUDA::cupti_static)
endif()

# Remaining dependencies of the injection library.
target_link_libraries(lo2s_injection
    PRIVATE
        fmt::fmt
        Nitro::log
        Nitro::env
        Nitro::dl
        Nitro::options
        otf2xx::Writer
)

# shm_open() was only found in librt on this system, so link it explicitly.
if(SHM_OPEN_FOUND_WITH_RT)
    target_link_libraries(lo2s_injection PRIVATE rt)
endif()

# Let the lo2s sources know that CUDA support is compiled in.
target_compile_definitions(lo2s PUBLIC HAVE_CUDA)
install(TARGETS lo2s_injection LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR})
else()
message(SEND_ERROR "Cupti not found but requested.")
endif()
endif()



# generate version string used in lo2s
if(Git_FOUND)
Expand Down
4 changes: 4 additions & 0 deletions include/lo2s/build_config.hpp.in
Original file line number Diff line number Diff line change
Expand Up @@ -40,3 +40,7 @@


#cmakedefine LO2S_COPYRIGHT_YEAR "@LO2S_COPYRIGHT_YEAR@"

// The CUDA injection library installation path

#cmakedefine LO2S_CUDA_INJECTIONLIB_PATH "@LO2S_CUDA_INJECTIONLIB_PATH@"
4 changes: 4 additions & 0 deletions include/lo2s/config.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -98,6 +98,10 @@ struct Config
bool use_nec;
std::chrono::microseconds nec_read_interval;
std::chrono::milliseconds nec_check_interval;
// Nvidia CUPTI
bool use_nvidia;
std::string cuda_injectionlib_path;
uint64_t nvidia_ringbuf_size;
};

const Config& config();
Expand Down
50 changes: 50 additions & 0 deletions include/lo2s/cupti/events.hpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,50 @@
/*
* This file is part of the lo2s software.
* Linux OTF2 sampling
*
* Copyright (c) 2024,
* Technische Universitaet Dresden, Germany
*
* lo2s is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* lo2s is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with lo2s. If not, see <http://www.gnu.org/licenses/>.
*/

#pragma once

#include <cstdint>

namespace lo2s
{
namespace cupti
{

// Discriminator stored in event_header::type; identifies which concrete
// event structure follows the header.
enum class EventType : std::uint64_t
{
    CUPTI_KERNEL = 1,
};

// Common prefix of every event record.
struct event_header
{
    // Kind of the record this header belongs to.
    EventType type;
    // Size of the record in bytes. cupti::Reader requests and pops exactly
    // this many bytes per event, so it covers the complete record.
    std::uint64_t size;
};

// Record describing a single CUDA kernel execution.
struct event_kernel
{
    event_header header;
    // Kernel begin and end timestamps; cupti::Reader passes both through the
    // perf time converter before writing them.
    // NOTE(review): clock source and unit are not visible here - confirm.
    std::uint64_t start;
    std::uint64_t end;
    // First character of the kernel name. cupti::Reader builds a std::string
    // from this pointer, so the name is presumably NUL-terminated and extends
    // past the declared single element - confirm against the producer.
    char name[1];
};

} // namespace cupti
} // namespace lo2s
98 changes: 98 additions & 0 deletions include/lo2s/cupti/reader.hpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,98 @@
/*
* This file is part of the lo2s software.
* Linux OTF2 sampling
*
* Copyright (c) 2016,
* Technische Universitaet Dresden, Germany
*
* lo2s is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* lo2s is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with lo2s. If not, see <http://www.gnu.org/licenses/>.
*/

#pragma once

#include <lo2s/config.hpp>
#include <lo2s/cupti/events.hpp>
#include <lo2s/log.hpp>
#include <lo2s/perf/time/converter.hpp>
#include <lo2s/ringbuf.hpp>
#include <lo2s/trace/trace.hpp>
#include <lo2s/types.hpp>

#include <chrono>
#include <cstdlib>
#include <string>

extern "C"
{
#include <sys/timerfd.h>
#include <unistd.h>
}

namespace lo2s
{
namespace cupti
{

/**
 * Reads CUPTI event records of one process from a ring buffer and writes
 * them to the trace.
 *
 * The reader also owns a timer file descriptor (see fd()) created from the
 * configured userspace read interval.
 */
class Reader
{
public:
    /**
     * @param trace   trace that receives the CUDA events
     * @param process process whose "cupti" ring buffer is read
     */
    Reader(trace::Trace& trace, Process process)
    : process_(process), trace_(trace), time_converter_(perf::time::Converter::instance()),
      // NOTE(review): the meaning of the boolean argument is defined by
      // RingBufReader, which is not visible here - confirm.
      ringbuf_reader_("cupti", process.as_pid_t(), true, config().nvidia_ringbuf_size),
      timer_fd_(timerfd_from_ns(config().userspace_read_interval)),
      executable_name_(get_process_exe(process))
    {
    }

    /**
     * Consumes all event records currently available in the ring buffer.
     *
     * Kernel events are written as a calling-context enter/leave pair; records
     * of any other type are skipped. Every processed record is popped using the
     * size stored in its header.
     */
    void read()
    {
        struct event_header* header = nullptr;

        // get() yields nullptr once no complete header is available.
        while ((header = reinterpret_cast<struct event_header*>(
                    ringbuf_reader_.get(sizeof(struct event_header)))) != nullptr)
        {
            if (header->type == EventType::CUPTI_KERNEL)
            {
                struct event_kernel* kernel =
                    reinterpret_cast<struct event_kernel*>(ringbuf_reader_.get(header->size));

                if (kernel == nullptr)
                {
                    // The header is readable but the full record is not. As
                    // get() signals that with nullptr, do not dereference it;
                    // leave the record in the buffer and retry on the next
                    // read() call.
                    return;
                }

                auto& writer = trace_.cuda_writer(Thread(process_.as_thread()));

                std::string kernel_name = kernel->name;
                auto& cu_cctx = trace_.cuda_calling_context(executable_name_, kernel_name);

                // NOTE(review): the literal 2 is the third argument of
                // write_calling_context_enter; its meaning is not visible
                // here - confirm.
                writer.write_calling_context_enter(time_converter_(kernel->start), cu_cctx.ref(),
                                                   2);
                writer.write_calling_context_leave(time_converter_(kernel->end), cu_cctx.ref());
            }

            ringbuf_reader_.pop(header->size);
        }
    }

    /// Returns the timer file descriptor created in the constructor.
    int fd()
    {
        return timer_fd_;
    }

private:
    Process process_;
    trace::Trace& trace_;
    perf::time::Converter& time_converter_;
    RingBufReader ringbuf_reader_;
    // NOTE(review): this descriptor is never closed by this class - confirm
    // whether ownership is handled elsewhere.
    int timer_fd_;
    std::string executable_name_;
};
} // namespace cupti
} // namespace lo2s
8 changes: 8 additions & 0 deletions include/lo2s/measurement_scope.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,7 @@ enum class MeasurementScopeType
NEC_METRIC,
BIO,
SYSCALL,
CUDA,
UNKNOWN
};

Expand Down Expand Up @@ -79,6 +80,11 @@ struct MeasurementScope
return { MeasurementScopeType::SYSCALL, s };
}

// Creates a MeasurementScope of type CUDA for the execution scope `s`.
static MeasurementScope cuda(ExecutionScope s)
{
return { MeasurementScopeType::CUDA, s };
}

friend bool operator==(const MeasurementScope& lhs, const MeasurementScope& rhs)
{
return (lhs.scope == rhs.scope) && lhs.type == rhs.type;
Expand Down Expand Up @@ -111,6 +117,8 @@ struct MeasurementScope
return fmt::format("block layer I/O events for {}", scope.name());
case MeasurementScopeType::SYSCALL:
return fmt::format("syscall events for {}", scope.name());
case lo2s::MeasurementScopeType::CUDA:
return fmt::format("cuda kernel events for {}", scope.name());
default:
throw new std::runtime_error("Unknown ExecutionScopeType!");
}
Expand Down
2 changes: 1 addition & 1 deletion include/lo2s/monitor/abstract_process_monitor.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,7 @@ class AbstractProcessMonitor
virtual void insert_process(Process parent, Process process, std::string proc_name,
bool spawn = false) = 0;
virtual void insert_thread(Process process, Thread thread, std::string name = "",
bool spawn = false) = 0;
bool spawn = false, bool is_process = false) = 0;

virtual void exit_thread(Thread thread) = 0;

Expand Down
3 changes: 2 additions & 1 deletion include/lo2s/monitor/process_monitor.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -45,7 +45,8 @@ class ProcessMonitor : public AbstractProcessMonitor, public MainMonitor
~ProcessMonitor();
void insert_process(Process parent, Process child, std::string proc_name,
bool spawn = false) override;
void insert_thread(Process parent, Thread child, std::string name, bool spawn = false) override;
void insert_thread(Process parent, Thread child, std::string name, bool spawn = false,
bool is_process = false) override;

void exit_thread(Thread thread) override;

Expand Down
7 changes: 5 additions & 2 deletions include/lo2s/monitor/scope_monitor.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -24,14 +24,15 @@
#include <lo2s/monitor/main_monitor.hpp>
#include <lo2s/monitor/poll_monitor.hpp>

#include <lo2s/cupti/reader.hpp>
#include <lo2s/perf/counter/group/writer.hpp>
#include <lo2s/perf/counter/userspace/writer.hpp>

#include <lo2s/perf/sample/writer.hpp>
#include <lo2s/perf/syscall/writer.hpp>

#include <array>
#include <chrono>
#include <memory>
#include <thread>

#include <cstddef>
Expand All @@ -50,7 +51,8 @@ namespace monitor
class ScopeMonitor : public PollMonitor
{
public:
ScopeMonitor(ExecutionScope scope, MainMonitor& parent, bool enable_on_exec);
ScopeMonitor(ExecutionScope scope, MainMonitor& parent, bool enable_on_exec,
bool is_process = false);

void initialize_thread() override;
void finalize_thread() override;
Expand All @@ -74,6 +76,7 @@ class ScopeMonitor : public PollMonitor
std::unique_ptr<perf::sample::Writer> sample_writer_;
std::unique_ptr<perf::counter::group::Writer> group_counter_writer_;
std::unique_ptr<perf::counter::userspace::Writer> userspace_counter_writer_;
std::unique_ptr<cupti::Reader> cupti_reader_;
};
} // namespace monitor
} // namespace lo2s
4 changes: 2 additions & 2 deletions include/lo2s/monitor/system_process_monitor.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -46,8 +46,8 @@ class SystemProcessMonitor : public AbstractProcessMonitor
virtual void insert_process(Process parent, Process process, std::string proc_name,
bool spawn) override;

virtual void insert_thread(Process process, Thread thread, std::string name,
bool spawn) override;
virtual void insert_thread(Process process, Thread thread, std::string name, bool spawn,
bool is_process) override;

virtual void exit_thread(Thread thread) override;

Expand Down
Loading
Loading