Skip to content

Commit

Permalink
Support tracing thread locks with perfetto (#143)
Browse files Browse the repository at this point in the history
- remove sampling and roctracer flat/timeline options
  - unused/unnecessary clutter
- start pthread_gotcha before perfetto
- remove pthread_mutex_gotcha validate
- update timemory submodule with tid fix
  • Loading branch information
jrmadsen authored Aug 31, 2022
1 parent e67afd3 commit 2718596
Show file tree
Hide file tree
Showing 15 changed files with 122 additions and 204 deletions.
8 changes: 0 additions & 8 deletions source/docs/runtime.md
Original file line number Diff line number Diff line change
Expand Up @@ -196,7 +196,6 @@ OMNITRACE_CRITICAL_TRACE_PER_ROW = 0
OMNITRACE_CRITICAL_TRACE_SERIALIZE_NAMES = false
OMNITRACE_DEBUG = false
OMNITRACE_DL_VERBOSE = 0
OMNITRACE_FLAT_SAMPLING = false
OMNITRACE_INSTRUMENTATION_INTERVAL = 1
OMNITRACE_KOKKOS_KERNEL_LOGGER = false
OMNITRACE_PAPI_EVENTS = PAPI_TOT_CYC
Expand All @@ -206,17 +205,14 @@ OMNITRACE_PERFETTO_COMBINE_TRACES = true
OMNITRACE_PERFETTO_FILE = perfetto-trace.proto
OMNITRACE_PERFETTO_FILL_POLICY = discard
OMNITRACE_PERFETTO_SHMEM_SIZE_HINT_KB = 4096
OMNITRACE_ROCTRACER_FLAT_PROFILE = false
OMNITRACE_ROCTRACER_HSA_ACTIVITY = false
OMNITRACE_ROCTRACER_HSA_API = false
OMNITRACE_ROCTRACER_HSA_API_TYPES =
OMNITRACE_ROCTRACER_TIMELINE_PROFILE = false
OMNITRACE_SAMPLING_CPUS =
OMNITRACE_SAMPLING_DELAY = 0.5
OMNITRACE_SAMPLING_FREQ = 10
OMNITRACE_SAMPLING_GPUS = all
OMNITRACE_TIME_OUTPUT = true
OMNITRACE_TIMELINE_SAMPLING = false
OMNITRACE_TIMEMORY_COMPONENTS = wall_clock
OMNITRACE_TRACE_THREAD_LOCKS = false
OMNITRACE_VERBOSE = 0
Expand Down Expand Up @@ -297,7 +293,6 @@ $ omnitrace-avail -S -bd
| OMNITRACE_ENABLE_SIGNAL_HANDLER | Enable signals in timemory_init |
| OMNITRACE_FILE_OUTPUT | Write output to files |
| OMNITRACE_FLAT_PROFILE | Set the label hierarchy mode to defa... |
| OMNITRACE_FLAT_SAMPLING | Ignore hierarchy in all statistical ... |
| OMNITRACE_INPUT_EXTENSIONS | File extensions used when searching ... |
| OMNITRACE_INPUT_PATH | Explicitly specify the input folder ... |
| OMNITRACE_INPUT_PREFIX | Explicitly specify the prefix for in... |
Expand Down Expand Up @@ -328,11 +323,9 @@ $ omnitrace-avail -S -bd
| OMNITRACE_PERFETTO_FILL_POLICY | Behavior when perfetto buffer is ful... |
| OMNITRACE_PERFETTO_SHMEM_SIZE_HINT_KB | Hint for shared-memory buffer size i... |
| OMNITRACE_PRECISION | Set the global output precision for ... |
| OMNITRACE_ROCTRACER_FLAT_PROFILE | Ignore hierarchy in all kernels entr... |
| OMNITRACE_ROCTRACER_HSA_ACTIVITY | Enable HSA activity tracing support |
| OMNITRACE_ROCTRACER_HSA_API | Enable HSA API tracing support |
| OMNITRACE_ROCTRACER_HSA_API_TYPES | HSA API type to collect |
| OMNITRACE_ROCTRACER_TIMELINE_PROFILE | Create unique entries for every kern... |
| OMNITRACE_SAMPLING_CPUS | CPUs to collect frequency informatio... |
| OMNITRACE_SAMPLING_DELAY | Number of seconds to wait before the... |
| OMNITRACE_SAMPLING_FREQ | Number of software interrupts per se... |
Expand All @@ -343,7 +336,6 @@ $ omnitrace-avail -S -bd
| OMNITRACE_SUPPRESS_PARSING | Disable parsing environment |
| OMNITRACE_TEXT_OUTPUT | Write text output files |
| OMNITRACE_TIMELINE_PROFILE | Set the label hierarchy mode to defa... |
| OMNITRACE_TIMELINE_SAMPLING | Create unique entries for every samp... |
| OMNITRACE_TIMEMORY_COMPONENTS | List of components to collect via ti... |
| OMNITRACE_TIME_FORMAT | Customize the folder generation when... |
| OMNITRACE_TIME_OUTPUT | Output data to subfolder w/ a timest... |
Expand Down
31 changes: 20 additions & 11 deletions source/lib/omnitrace/library.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -183,8 +183,8 @@ omnitrace_set_env_hidden(const char* env_name, const char* env_val)

namespace
{
bool _set_mpi_called = false;
std::function<void()> _start_gotcha_callback = []() {};
bool _set_mpi_called = false;
std::function<void()> _preinit_callback = []() {};
} // namespace

extern "C" void
Expand Down Expand Up @@ -223,7 +223,7 @@ omnitrace_set_mpi_hidden(bool use, bool attached)
std::to_string(use).c_str(), std::to_string(attached).c_str(),
std::to_string(get_state()).c_str());

_start_gotcha_callback();
_preinit_callback();
}

//======================================================================================//
Expand Down Expand Up @@ -356,6 +356,9 @@ omnitrace_init_tooling_hidden()

OMNITRACE_SCOPED_SAMPLING_ON_CHILD_THREADS(false);

// start these gotchas once settings have been initialized
get_init_bundle()->start();

if(get_use_sampling()) sampling::block_signals();

if(get_use_critical_trace())
Expand Down Expand Up @@ -554,11 +557,11 @@ omnitrace_init_hidden(const char* _mode, bool _is_binary_rewrite, const char* _a

if(!_set_mpi_called)
{
_start_gotcha_callback = []() { get_gotcha_bundle()->start(); };
_preinit_callback = []() { get_preinit_bundle()->start(); };
}
else
{
get_gotcha_bundle()->start();
get_preinit_bundle()->start();
}
}

Expand Down Expand Up @@ -615,7 +618,7 @@ omnitrace_finalize_hidden(void)
if(_debug_init) config::set_setting_value("OMNITRACE_DEBUG", _debug_value);
} };

auto& _thread_bundle = thread_data<omnitrace_thread_bundle_t>::instance();
auto& _thread_bundle = thread_data<thread_bundle_t>::instance();
if(_thread_bundle) _thread_bundle->stop();

if(dmp::rank() == 0 && get_verbose() >= 0) fprintf(stderr, "\n");
Expand Down Expand Up @@ -644,7 +647,7 @@ omnitrace_finalize_hidden(void)
}
}

// stop the main bundle which shuts down the pthread gotchas
// stop the main bundle which has stats for run
if(get_main_bundle())
{
OMNITRACE_DEBUG_F("Stopping main bundle...\n");
Expand Down Expand Up @@ -690,12 +693,18 @@ omnitrace_finalize_hidden(void)
}
}

// stop the main gotcha which shuts down the pthread gotchas
if(get_init_bundle())
{
OMNITRACE_DEBUG_F("Stopping main gotcha...\n");
get_init_bundle()->stop();
}

// stop the gotcha bundle
if(get_gotcha_bundle())
if(get_preinit_bundle())
{
OMNITRACE_VERBOSE_F(1, "Shutting down miscellaneous gotchas...\n");
get_gotcha_bundle()->stop();
get_gotcha_bundle().reset();
get_preinit_bundle()->stop();
component::mpi_gotcha::shutdown();
}

Expand Down Expand Up @@ -746,7 +755,7 @@ omnitrace_finalize_hidden(void)
// if they are still running (e.g. thread-pool still alive), the
// thread-specific data will be wrong if try to stop them from
// the main thread.
for(auto& itr : thread_data<omnitrace_thread_bundle_t>::instances())
for(auto& itr : thread_data<thread_bundle_t>::instances())
{
if(itr && itr->get<comp::wall_clock>() &&
!itr->get<comp::wall_clock>()->get_is_running())
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -144,7 +144,7 @@ pthread_create_gotcha::wrapper::wrapper(routine_t _routine, void* _arg,
void*
pthread_create_gotcha::wrapper::operator()() const
{
using thread_bundle_data_t = thread_data<omnitrace_thread_bundle_t>;
using thread_bundle_data_t = thread_data<thread_bundle_t>;

if(is_shutdown && *is_shutdown)
{
Expand Down Expand Up @@ -195,7 +195,7 @@ pthread_create_gotcha::wrapper::operator()() const
threading::set_thread_name(TIMEMORY_JOIN(" ", "Thread", _tid).c_str());
if(!thread_bundle_data_t::instances().at(_tid))
{
thread_data<omnitrace_thread_bundle_t>::construct(
thread_data<thread_bundle_t>::construct(
TIMEMORY_JOIN('/', "omnitrace/process", process::get_id(), "thread",
_tid),
quirk::config<quirk::auto_start>{});
Expand Down
34 changes: 4 additions & 30 deletions source/lib/omnitrace/library/components/pthread_mutex_gotcha.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@
#include "library/debug.hpp"
#include "library/runtime.hpp"
#include "library/sampling.hpp"
#include "library/thread_info.hpp"
#include "library/utility.hpp"

#include <timemory/backends/threading.hpp>
Expand Down Expand Up @@ -98,8 +99,6 @@ pthread_mutex_gotcha::configure()
pthread_mutex_gotcha_t::get_initializer() = []() {
if(config::get_trace_thread_locks())
{
validate();

pthread_mutex_gotcha_t::configure(
comp::gotcha_config<0, int, pthread_mutex_t*>{ "pthread_mutex_lock" });

Expand Down Expand Up @@ -161,31 +160,6 @@ pthread_mutex_gotcha::shutdown()
pthread_mutex_gotcha_t::disable();
}

void
pthread_mutex_gotcha::validate()
{
if(config::get_trace_thread_locks() && config::get_use_perfetto())
{
OMNITRACE_PRINT_F("\n");
OMNITRACE_PRINT_F("\n");
OMNITRACE_PRINT_F("\n");
OMNITRACE_PRINT_F(
"The overhead of all the mutex locking internally by perfetto is\n")
OMNITRACE_PRINT_F(
"so significant that all timing data is rendered meaningless.\n");
OMNITRACE_PRINT_F(
"However, mutex locking is effectively non-existant in timemory.\n");
OMNITRACE_PRINT_F("If you want to trace the mutex locking:\n")
OMNITRACE_PRINT_F(" OMNITRACE_USE_TIMEMORY=ON\n");
OMNITRACE_PRINT_F(" OMNITRACE_USE_PERFETTO=OFF\n");
OMNITRACE_PRINT_F("\n");
OMNITRACE_PRINT_F("\n");
OMNITRACE_PRINT_F("\n");
OMNITRACE_FAIL_F("OMNITRACE_USE_PERFETTO and OMNITRACE_TRACE_THREAD_LOCKS cannot "
"both be enabled.\n");
}
}

pthread_mutex_gotcha::pthread_mutex_gotcha(const gotcha_data_t& _data)
: m_data{ &_data }
{}
Expand Down Expand Up @@ -290,9 +264,9 @@ pthread_mutex_gotcha::operator()(int (*_callee)(pthread_t, void**), pthread_t _t
bool
pthread_mutex_gotcha::is_disabled()
{
return (get_state() != ::omnitrace::State::Active ||
get_thread_state() != ThreadState::Enabled ||
(get_use_sampling() && !sampling_enabled_on_child_threads()));
static thread_local const auto& _info = thread_info::get();
return (!_info || _info->is_offset || get_state() != ::omnitrace::State::Active ||
get_thread_state() != ThreadState::Enabled);
}
} // namespace component
} // namespace omnitrace
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -54,7 +54,6 @@ struct pthread_mutex_gotcha : comp::base<pthread_mutex_gotcha, void>
// generate the gotcha wrappers
static void configure();
static void shutdown();
static void validate();

int operator()(int (*)(pthread_mutex_t*), pthread_mutex_t*) const;
int operator()(int (*)(pthread_spinlock_t*), pthread_spinlock_t*) const;
Expand Down
51 changes: 0 additions & 51 deletions source/lib/omnitrace/library/config.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -491,29 +491,6 @@ configure_settings(bool _init)
std::to_string(_sigrt_range),
0, "sampling", "advanced");

OMNITRACE_CONFIG_SETTING(bool, "OMNITRACE_FLAT_SAMPLING",
"Ignore hierarchy in all statistical sampling entries",
_config->get_flat_profile(), "timemory", "sampling",
"data_layout", "advanced");

OMNITRACE_CONFIG_SETTING(
bool, "OMNITRACE_TIMELINE_SAMPLING",
"Create unique entries for every sample when statistical sampling is enabled",
_config->get_timeline_profile(), "timemory", "sampling", "data_layout",
"advanced");

OMNITRACE_CONFIG_SETTING(
bool, "OMNITRACE_ROCTRACER_FLAT_PROFILE",
"Ignore hierarchy in all kernels entries with timemory backend",
_config->get_flat_profile(), "timemory", "roctracer", "data_layout", "rocm",
"advanced");

OMNITRACE_CONFIG_SETTING(
bool, "OMNITRACE_ROCTRACER_TIMELINE_PROFILE",
"Create unique entries for every kernel with timemory backend",
_config->get_timeline_profile(), "timemory", "roctracer", "data_layout", "rocm",
"advanced");

OMNITRACE_CONFIG_SETTING(bool, "OMNITRACE_ROCTRACER_HSA_ACTIVITY",
"Enable HSA activity tracing support", true, "roctracer",
"rocm", "advanced");
Expand Down Expand Up @@ -1615,34 +1592,6 @@ get_sampling_rtoffset()
return static_cast<tim::tsettings<int>&>(*_v->second).get();
}

bool
get_timeline_sampling()
{
static auto _v = get_config()->find("OMNITRACE_TIMELINE_SAMPLING");
return static_cast<tim::tsettings<bool>&>(*_v->second).get();
}

bool
get_flat_sampling()
{
static auto _v = get_config()->find("OMNITRACE_FLAT_SAMPLING");
return static_cast<tim::tsettings<bool>&>(*_v->second).get();
}

bool
get_roctracer_timeline_profile()
{
static auto _v = get_config()->find("OMNITRACE_ROCTRACER_TIMELINE_PROFILE");
return static_cast<tim::tsettings<bool>&>(*_v->second).get();
}

bool
get_roctracer_flat_profile()
{
static auto _v = get_config()->find("OMNITRACE_ROCTRACER_FLAT_PROFILE");
return static_cast<tim::tsettings<bool>&>(*_v->second).get();
}

bool
get_trace_hsa_api()
{
Expand Down
12 changes: 0 additions & 12 deletions source/lib/omnitrace/library/config.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -219,18 +219,6 @@ get_sampling_rtoffset();
bool
get_use_rcclp();

bool
get_timeline_sampling();

bool
get_flat_sampling();

bool
get_roctracer_timeline_profile();

bool
get_roctracer_flat_profile();

bool
get_trace_hsa_api();

Expand Down
36 changes: 9 additions & 27 deletions source/lib/omnitrace/library/roctracer.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -238,12 +238,6 @@ hsa_api_callback(uint32_t domain, uint32_t cid, const void* callback_data, void*
(data->phase == ACTIVITY_API_PHASE_ENTER) ? "on-enter" : "on-exit");

static thread_local int64_t begin_timestamp = 0;
static auto _scope = []() {
auto _v = scope::config{};
if(get_roctracer_timeline_profile()) _v += scope::timeline{};
if(get_roctracer_flat_profile()) _v += scope::flat{};
return _v;
}();

switch(cid)
{
Expand Down Expand Up @@ -320,7 +314,7 @@ hsa_api_callback(uint32_t domain, uint32_t cid, const void* callback_data, void*
if(tasking::roctracer::get_task_group().pool())
tasking::roctracer::get_task_group().exec(
[_name, _beg_ns, _end_ns]() {
roctracer_hsa_bundle_t _bundle{ _name, _scope };
roctracer_hsa_bundle_t _bundle{ _name };
_bundle.start()
.store(std::plus<double>{},
static_cast<double>(_end_ns - _beg_ns))
Expand Down Expand Up @@ -374,14 +368,8 @@ hsa_activity_callback(uint32_t op, activity_record_t* record, void* arg)

if(!_name) return;

auto _beg_ns = record->begin_ns + get_clock_skew();
auto _end_ns = record->end_ns + get_clock_skew();
static auto _scope = []() {
auto _v = scope::config{};
if(get_roctracer_timeline_profile()) _v += scope::timeline{};
if(get_roctracer_flat_profile()) _v += scope::flat{};
return _v;
}();
auto _beg_ns = record->begin_ns + get_clock_skew();
auto _end_ns = record->end_ns + get_clock_skew();

if(get_use_perfetto())
{
Expand All @@ -394,7 +382,7 @@ hsa_activity_callback(uint32_t op, activity_record_t* record, void* arg)
auto _func = [_beg_ns, _end_ns, _name]() {
if(get_use_timemory())
{
roctracer_hsa_bundle_t _bundle{ *_name, _scope };
roctracer_hsa_bundle_t _bundle{ *_name };
_bundle.start()
.store(std::plus<double>{}, static_cast<double>(_end_ns - _beg_ns))
.stop();
Expand Down Expand Up @@ -836,16 +824,10 @@ hip_activity_callback(const char* begin, const char* end, void*)

const char* op_name =
roctracer_op_string(record->domain, record->op, record->kind);
auto _ns_skew = get_clock_skew();
uint64_t _beg_ns = record->begin_ns + _ns_skew;
uint64_t _end_ns = record->end_ns + _ns_skew;
auto _corr_id = record->correlation_id;
static auto _scope = []() {
auto _v = scope::config{};
if(get_roctracer_timeline_profile()) _v += scope::timeline{};
if(get_roctracer_flat_profile()) _v += scope::flat{};
return _v;
}();
auto _ns_skew = get_clock_skew();
uint64_t _beg_ns = record->begin_ns + _ns_skew;
uint64_t _end_ns = record->end_ns + _ns_skew;
auto _corr_id = record->correlation_id;

auto& _keys = get_roctracer_key_data();
auto& _tids = get_roctracer_tid_data();
Expand Down Expand Up @@ -936,7 +918,7 @@ hip_activity_callback(const char* begin, const char* end, void*)
if(_found && _name != nullptr && get_use_timemory())
{
auto _func = [_beg_ns, _end_ns, _name]() {
roctracer_bundle_t _bundle{ _name, _scope };
roctracer_bundle_t _bundle{ _name };
_bundle.start()
.store(std::plus<double>{}, static_cast<double>(_end_ns - _beg_ns))
.stop()
Expand Down
Loading

0 comments on commit 2718596

Please sign in to comment.