diff --git a/cmake/Packages.cmake b/cmake/Packages.cmake index 93559c2c2..b6c6d7a7b 100644 --- a/cmake/Packages.cmake +++ b/cmake/Packages.cmake @@ -314,8 +314,7 @@ if(OMNITRACE_BUILD_DYNINST) TARGETS ${_LIB} DESTINATION ${CMAKE_INSTALL_LIBDIR}/omnitrace COMPONENT dyninst - PUBLIC_HEADER DESTINATION ${PROJECT_BINARY_DIR}/.discard/omnitrace/include - ) + PUBLIC_HEADER DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}/omnitrace/dyninst) endif() endforeach() diff --git a/source/lib/common/defines.h.in b/source/lib/common/defines.h.in index 8f52ec097..9ac4e7fa2 100644 --- a/source/lib/common/defines.h.in +++ b/source/lib/common/defines.h.in @@ -97,7 +97,7 @@ #define OMNITRACE_STRINGIZE(X) OMNITRACE_STRINGIZE2(X) #define OMNITRACE_STRINGIZE2(X) #X #define OMNITRACE_VAR_NAME_COMBINE(X, Y) X##Y -#define OMNITRACE_VARIABLE(Y) OMNITRACE_VAR_NAME_COMBINE(_omni_var_, Y) +#define OMNITRACE_VARIABLE(X, Y) OMNITRACE_VAR_NAME_COMBINE(X, Y) #define OMNITRACE_LINESTR OMNITRACE_STRINGIZE(__LINE__) #define OMNITRACE_ESC(...) 
__VA_ARGS__ diff --git a/source/lib/omnitrace/library.cpp b/source/lib/omnitrace/library.cpp index 0365e8606..7d234eb4a 100644 --- a/source/lib/omnitrace/library.cpp +++ b/source/lib/omnitrace/library.cpp @@ -337,27 +337,26 @@ omnitrace_init_tooling_hidden() if(get_state() > State::Active) return; if(get_use_process_sampling()) { - pthread_gotcha::push_enable_sampling_on_child_threads(false); + OMNITRACE_SCOPED_SAMPLING_ON_CHILD_THREADS(false); process_sampler::setup(); - pthread_gotcha::pop_enable_sampling_on_child_threads(); } if(get_use_sampling()) { - pthread_gotcha::push_enable_sampling_on_child_threads(false); + OMNITRACE_SCOPED_SAMPLING_ON_CHILD_THREADS(false); sampling::setup(); - pthread_gotcha::pop_enable_sampling_on_child_threads(); - pthread_gotcha::push_enable_sampling_on_child_threads(get_use_sampling()); + } + if(get_use_sampling()) + { + push_enable_sampling_on_child_threads(get_use_sampling()); sampling::unblock_signals(); } get_main_bundle()->start(); set_state(State::Active); // set to active as very last operation } }; - if(get_use_sampling()) - { - pthread_gotcha::push_enable_sampling_on_child_threads(false); - sampling::block_signals(); - } + OMNITRACE_SCOPED_SAMPLING_ON_CHILD_THREADS(false); + + if(get_use_sampling()) sampling::block_signals(); if(get_use_critical_trace()) { @@ -426,8 +425,8 @@ omnitrace_init_tooling_hidden() for(const auto& itr : _disabled_categories) { - OMNITRACE_VERBOSE(1, "Disabling perfetto track event category: %s\n", - itr.c_str()); + OMNITRACE_VERBOSE_F(1, "Disabling perfetto track event category: %s\n", + itr.c_str()); track_event_cfg.add_disabled_categories(itr); } @@ -581,6 +580,8 @@ omnitrace_finalize_hidden(void) return; } + if(get_verbose() >= 0 || get_debug()) fprintf(stderr, "\n"); + OMNITRACE_VERBOSE_F(0, "finalizing...\n"); thread_info::set_stop(comp::wall_clock::record()); @@ -604,8 +605,8 @@ omnitrace_finalize_hidden(void) set_state(State::Finalized); - 
pthread_gotcha::push_enable_sampling_on_child_threads(false); - pthread_gotcha::set_sampling_on_all_future_threads(false); + push_enable_sampling_on_child_threads(false); + set_sampling_on_all_future_threads(false); auto _debug_init = get_debug_finalize(); auto _debug_value = get_debug(); @@ -614,8 +615,6 @@ omnitrace_finalize_hidden(void) if(_debug_init) config::set_setting_value("OMNITRACE_DEBUG", _debug_value); } }; - OMNITRACE_DEBUG_F("\n"); - auto& _thread_bundle = thread_data::instance(); if(_thread_bundle) _thread_bundle->stop(); @@ -713,7 +712,7 @@ omnitrace_finalize_hidden(void) comp::roctracer::shutdown(); // join extra thread(s) used by roctracer - OMNITRACE_VERBOSE_F(1, "Waiting on roctracer tasks...\n"); + OMNITRACE_VERBOSE_F(2, "Waiting on roctracer tasks...\n"); tasking::join(); } @@ -734,10 +733,11 @@ omnitrace_finalize_hidden(void) // report the high-level metrics for the process if(get_main_bundle()) { + if(get_verbose() >= 0 || get_debug()) fprintf(stderr, "\n"); std::string _msg = JOIN("", *get_main_bundle()); auto _pos = _msg.find(">>> "); if(_pos != std::string::npos) _msg = _msg.substr(_pos + 5); - OMNITRACE_PRINT("%s\n", _msg.c_str()); + OMNITRACE_VERBOSE_F(0, "%s\n", _msg.c_str()); OMNITRACE_DEBUG_F("Resetting main bundle...\n"); get_main_bundle()->reset(); } @@ -754,10 +754,12 @@ omnitrace_finalize_hidden(void) std::string _msg = JOIN("", *itr); auto _pos = _msg.find(">>> "); if(_pos != std::string::npos) _msg = _msg.substr(_pos + 5); - OMNITRACE_VERBOSE(0, "%s\n", _msg.c_str()); + OMNITRACE_VERBOSE_F(0, "%s\n", _msg.c_str()); } } + if(get_verbose() >= 0 || get_debug()) fprintf(stderr, "\n"); + // ensure that all the MT instances are flushed if(get_use_sampling()) { @@ -813,6 +815,16 @@ omnitrace_finalize_hidden(void) tasking::join(); } + // shutdown tasking before timemory is finalized, especially the roctracer thread-pool + OMNITRACE_VERBOSE_F(1, "Shutting down thread-pools...\n"); + tasking::shutdown(); + + if(get_use_code_coverage()) + 
{ + OMNITRACE_VERBOSE_F(1, "Post-processing the code coverage...\n"); + coverage::post_process(); + } + bool _perfetto_output_error = false; if(get_use_perfetto() && !is_system_backend()) { @@ -821,11 +833,7 @@ omnitrace_finalize_hidden(void) OMNITRACE_CI_THROW(tracing_session == nullptr, "Null pointer to the tracing session"); - if(get_verbose() >= 0) fprintf(stderr, "\n"); - if(get_verbose() >= 0 || get_debug()) - fprintf(stderr, "%s[%s][%s]|%i> Flushing perfetto...%s\n", - tim::log::color::info(), TIMEMORY_PROJECT_NAME, OMNITRACE_FUNCTION, - dmp::rank(), tim::log::color::end()); + OMNITRACE_VERBOSE_F(0, "Finalizing perfetto...\n"); // Make sure the last event is closed for this example. perfetto::TrackEvent::Flush(); @@ -905,16 +913,6 @@ omnitrace_finalize_hidden(void) } } - // shutdown tasking before timemory is finalized, especially the roctracer thread-pool - OMNITRACE_VERBOSE_F(1, "Shutting down thread-pools...\n"); - tasking::shutdown(); - - OMNITRACE_VERBOSE_F(1, "Shutting down thread-pools...\n"); - if(get_use_code_coverage()) - { - coverage::post_process(); - } - tim::manager::instance()->add_metadata([](auto& ar) { auto _maps = tim::procfs::read_maps(process::get_id()); auto _libs = std::set{}; diff --git a/source/lib/omnitrace/library/components/pthread_create_gotcha.cpp b/source/lib/omnitrace/library/components/pthread_create_gotcha.cpp index bfc4e3991..6b9fab818 100644 --- a/source/lib/omnitrace/library/components/pthread_create_gotcha.cpp +++ b/source/lib/omnitrace/library/components/pthread_create_gotcha.cpp @@ -22,7 +22,6 @@ #include "library/components/pthread_create_gotcha.hpp" #include "library/components/category_region.hpp" -#include "library/components/pthread_gotcha.hpp" #include "library/components/roctracer.hpp" #include "library/config.hpp" #include "library/debug.hpp" @@ -213,9 +212,8 @@ pthread_create_gotcha::wrapper::operator()() const if(m_enable_sampling) { _is_sampling = true; - 
pthread_gotcha::push_enable_sampling_on_child_threads(false); + OMNITRACE_SCOPED_SAMPLING_ON_CHILD_THREADS(false); _signals = sampling::setup(); - pthread_gotcha::pop_enable_sampling_on_child_threads(); sampling::unblock_signals(); } } @@ -336,7 +334,7 @@ pthread_create_gotcha::operator()(pthread_t* thread, const pthread_attr_t* attr, auto _active = (get_state() == ::omnitrace::State::Active && !_disabled); auto _coverage = (get_mode() == Mode::Coverage); auto _use_sampling = get_use_sampling(); - auto _sample_child = pthread_gotcha::sampling_enabled_on_child_threads(); + auto _sample_child = sampling_enabled_on_child_threads(); auto _tid = utility::get_thread_index(); auto _use_bundle = (_active && !_coverage); const auto& _info = thread_info::init(!_active || !_sample_child || _disabled); diff --git a/source/lib/omnitrace/library/components/pthread_gotcha.cpp b/source/lib/omnitrace/library/components/pthread_gotcha.cpp index d98fab67d..6dab1b9b6 100644 --- a/source/lib/omnitrace/library/components/pthread_gotcha.cpp +++ b/source/lib/omnitrace/library/components/pthread_gotcha.cpp @@ -69,14 +69,6 @@ namespace using bundle_t = tim::lightweight_tuple; -auto& -get_sampling_on_child_threads_history(int64_t _idx = utility::get_thread_index()) -{ - static auto _v = utility::get_filled_array( - []() { return utility::get_reserved_vector(32); }); - return _v.at(_idx); -} - auto& get_bundle() { @@ -112,51 +104,6 @@ pthread_gotcha::shutdown() } } -bool -pthread_gotcha::sampling_enabled_on_child_threads() -{ - return sampling_on_child_threads(); -} - -bool -pthread_gotcha::push_enable_sampling_on_child_threads(bool _v) -{ - bool _last = sampling_on_child_threads(); - sampling_on_child_threads() = _v; - auto& _hist = get_sampling_on_child_threads_history(); - _hist.emplace_back(_last); - return _last; -} - -bool -pthread_gotcha::pop_enable_sampling_on_child_threads() -{ - auto& _hist = get_sampling_on_child_threads_history(); - if(!_hist.empty()) - { - bool _restored = 
_hist.back(); - _hist.pop_back(); - sampling_on_child_threads() = _restored; - } - return sampling_on_child_threads(); -} - -void -pthread_gotcha::set_sampling_on_all_future_threads(bool _v) -{ - for(size_t i = 0; i < max_supported_threads; ++i) - get_sampling_on_child_threads_history(i).emplace_back(_v); -} - -bool& -pthread_gotcha::sampling_on_child_threads() -{ - static thread_local bool _v = get_sampling_on_child_threads_history().empty() - ? false - : get_sampling_on_child_threads_history().back(); - return _v; -} - void pthread_gotcha::start() { diff --git a/source/lib/omnitrace/library/components/pthread_gotcha.hpp b/source/lib/omnitrace/library/components/pthread_gotcha.hpp index 0711812ac..7edc6f791 100644 --- a/source/lib/omnitrace/library/components/pthread_gotcha.hpp +++ b/source/lib/omnitrace/library/components/pthread_gotcha.hpp @@ -42,22 +42,7 @@ struct pthread_gotcha : tim::component::base static void configure(); static void shutdown(); - // query current value - static bool sampling_enabled_on_child_threads(); - - // use this to disable sampling in a region (e.g. 
right before thread creation) - static bool push_enable_sampling_on_child_threads(bool _v); - - // use this to restore previous setting - static bool pop_enable_sampling_on_child_threads(); - - // make sure every newly created thead starts with this value - static void set_sampling_on_all_future_threads(bool _v); - static void start(); static void stop(); - -private: - static bool& sampling_on_child_threads(); }; } // namespace omnitrace diff --git a/source/lib/omnitrace/library/components/pthread_mutex_gotcha.cpp b/source/lib/omnitrace/library/components/pthread_mutex_gotcha.cpp index c1587b6ed..bb6eee219 100644 --- a/source/lib/omnitrace/library/components/pthread_mutex_gotcha.cpp +++ b/source/lib/omnitrace/library/components/pthread_mutex_gotcha.cpp @@ -22,7 +22,6 @@ #include "library/components/pthread_mutex_gotcha.hpp" #include "library/components/category_region.hpp" -#include "library/components/pthread_gotcha.hpp" #include "library/config.hpp" #include "library/critical_trace.hpp" #include "library/debug.hpp" @@ -293,7 +292,7 @@ pthread_mutex_gotcha::is_disabled() { return (get_state() != ::omnitrace::State::Active || get_thread_state() != ThreadState::Enabled || - (get_use_sampling() && !pthread_gotcha::sampling_enabled_on_child_threads())); + (get_use_sampling() && !sampling_enabled_on_child_threads())); } } // namespace component } // namespace omnitrace diff --git a/source/lib/omnitrace/library/components/roctracer.cpp b/source/lib/omnitrace/library/components/roctracer.cpp index 39bdfc097..a5a26f61d 100644 --- a/source/lib/omnitrace/library/components/roctracer.cpp +++ b/source/lib/omnitrace/library/components/roctracer.cpp @@ -22,13 +22,13 @@ #include "library/components/roctracer.hpp" #include "library/common.hpp" -#include "library/components/pthread_gotcha.hpp" #include "library/config.hpp" #include "library/debug.hpp" #include "library/defines.hpp" #include "library/dynamic_library.hpp" #include "library/redirect.hpp" #include 
"library/roctracer.hpp" +#include "library/runtime.hpp" #include "library/sampling.hpp" #include "library/thread_data.hpp" @@ -121,7 +121,7 @@ roctracer::setup() roctracer_is_setup() = true; OMNITRACE_VERBOSE_F(1, "setting up roctracer...\n"); - pthread_gotcha::push_enable_sampling_on_child_threads(false); + OMNITRACE_SCOPED_SAMPLING_ON_CHILD_THREADS(false); dynamic_library _amdhip64{ "OMNITRACE_ROCTRACER_LIBAMDHIP64", find_library_path("libamdhip64.so", @@ -169,8 +169,6 @@ roctracer::setup() for(auto& itr : roctracer_setup_routines()) itr.second(); - pthread_gotcha::pop_enable_sampling_on_child_threads(); - OMNITRACE_VERBOSE_F(1, "roctracer is setup\n"); } diff --git a/source/lib/omnitrace/library/config.cpp b/source/lib/omnitrace/library/config.cpp index 192b939f7..db232d98c 100644 --- a/source/lib/omnitrace/library/config.cpp +++ b/source/lib/omnitrace/library/config.cpp @@ -92,6 +92,42 @@ get_available_perfetto_categories() return _v; } +template +std::set +parse_numeric_range(std::string _input_string, const std::string& _label) +{ + for(auto& itr : _input_string) + itr = tolower(itr); + auto _result = std::set{}; + for(const auto& _v : tim::delimit(_input_string, ",; \t")) + { + if(_v.find_first_not_of("0123456789-") != std::string::npos) + { + OMNITRACE_VERBOSE_F( + 0, + "Invalid %s specification. Only numerical values (e.g., 0) or " + "ranges (e.g., 0-7) are permitted. Ignoring %s...", + _label.c_str(), _v.c_str()); + continue; + } + if(_v.find('-') != std::string::npos) + { + auto _vv = tim::delimit(_v, "-"); + OMNITRACE_CONDITIONAL_THROW( + _vv.size() != 2, + "Invalid %s range specification: %s. Required format N-M, e.g. 0-4", + _label.c_str(), _v.c_str()); + for(int64_t i = std::stol(_vv.at(0)); i <= std::stol(_vv.at(1)); ++i) + _result.emplace(i); + } + else + { + _result.emplace(std::stol(_v)); + } + } + return _result; +} + #define OMNITRACE_CONFIG_SETTING(TYPE, ENV_NAME, DESCRIPTION, INITIAL_VALUE, ...) 
\ [&]() { \ auto _ret = _config->insert( \ @@ -334,12 +370,21 @@ configure_settings(bool _init) "delivered. Defaults to OMNITRACE_SAMPLING_DELAY when <= 0.0", -1.0, "sampling", "advanced"); + OMNITRACE_CONFIG_SETTING(double, "OMNITRACE_SAMPLING_DURATION", + "If > 0.0, time (in seconds) to sample before stopping", 0.0, + "sampling", "process_sampling"); + OMNITRACE_CONFIG_SETTING( double, "OMNITRACE_PROCESS_SAMPLING_FREQ", "Number of measurements per second when OMNITTRACE_USE_PROCESS_SAMPLING=ON. If " "set to zero, uses OMNITRACE_SAMPLING_FREQ value", 0.0, "process_sampling"); + OMNITRACE_CONFIG_SETTING(double, "OMNITRACE_PROCESS_SAMPLING_DURATION", + "If > 0.0, time (in seconds) to sample before stopping. If " + "less than zero, uses OMNITRACE_SAMPLING_DURATION", + -1.0, "sampling", "process_sampling"); + OMNITRACE_CONFIG_SETTING( std::string, "OMNITRACE_SAMPLING_CPUS", "CPUs to collect frequency information for. Values should be separated by commas " @@ -359,6 +404,29 @@ configure_settings(bool _init) "'all' and 'none' suppresses all GPU sampling", std::string{ "all" }, "rocm_smi", "rocm", "process_sampling"); + OMNITRACE_CONFIG_SETTING( + std::string, "OMNITRACE_SAMPLING_TIDS", + "Limit call-stack sampling to specific thread IDs, starting at zero for the main " + "thread. Be aware that some libraries, such as ROCm may create additional " + "threads which increment the TID count. However, no threads started by omnitrace " + "will increment the TID count. Values should be separated by commas and can be " + "explicit or ranges, e.g. 0,1,5-8. An empty value implies all TIDs.", + std::string{}, "sampling", "advanced"); + + OMNITRACE_CONFIG_SETTING( + std::string, "OMNITRACE_SAMPLING_CPUTIME_TIDS", + "Same as OMNITRACE_SAMPLING_TIDS but applies specifically to samplers whose " + "timers are based on the CPU-time. 
This is useful when both " + "OMNITRACE_SAMPLING_CPUTIME=ON and OMNITRACE_SAMPLING_REALTIME=ON", + std::string{}, "sampling", "advanced"); + + OMNITRACE_CONFIG_SETTING( + std::string, "OMNITRACE_SAMPLING_REALTIME_TIDS", + "Same as OMNITRACE_SAMPLING_TIDS but applies specifically to samplers whose " + "timers are based on the real (wall) time. This is useful when both " + "OMNITRACE_SAMPLING_CPUTIME=ON and OMNITRACE_SAMPLING_REALTIME=ON", + std::string{}, "sampling", "advanced"); + auto _backend = tim::get_env_choice( "OMNITRACE_PERFETTO_BACKEND", (_system_backend) ? "system" // if OMNITRACE_PERFETTO_BACKEND_SYSTEM is true, @@ -480,7 +548,7 @@ configure_settings(bool _init) OMNITRACE_CONFIG_SETTING(size_t, "OMNITRACE_PERFETTO_BUFFER_SIZE_KB", "Size of perfetto buffer (in KB)", size_t{ 1024000 }, - "perfetto", "data", "advanced"); + "perfetto", "data"); OMNITRACE_CONFIG_SETTING(bool, "OMNITRACE_PERFETTO_COMBINE_TRACES", "Combine Perfetto traces. If not explicitly set, it will " @@ -695,9 +763,13 @@ configure_settings(bool _init) tim::delimit(_config->get("OMNITRACE_CONFIG_FILE"), ";:")) { if(_config->get_suppress_config()) continue; + OMNITRACE_BASIC_VERBOSE(1, "Reading config file %s\n", itr.c_str()); _config->read(itr); - if(_config->get("OMNITRACE_CI") && _main_proc) + + if(_main_proc && + ((_config->get("OMNITRACE_CI") && settings::verbose() >= 0) || + settings::verbose() >= 1 || settings::debug())) { std::ifstream _in{ itr }; std::stringstream _iss{}; @@ -709,7 +781,7 @@ configure_settings(bool _init) } if(!_iss.str().empty()) { - OMNITRACE_BASIC_PRINT("config file '%s':\n%s\n", itr.c_str(), + OMNITRACE_BASIC_PRINT("config file '%s':\n%s", itr.c_str(), _iss.str().c_str()); } } @@ -1753,6 +1825,13 @@ get_sampling_real_delay() return _val; } +double +get_sampling_duration() +{ + static auto _v = get_config()->find("OMNITRACE_SAMPLING_DURATION"); + return static_cast&>(*_v->second).get(); +} + std::string get_sampling_cpus() { @@ -1760,6 +1839,30 @@ 
get_sampling_cpus() return static_cast&>(*_v->second).get(); } +std::set +get_sampling_tids() +{ + static auto _v = get_config()->find("OMNITRACE_SAMPLING_TIDS"); + return parse_numeric_range<>( + static_cast&>(*_v->second).get(), "thread IDs"); +} + +std::set +get_sampling_cpu_tids() +{ + static auto _v = get_config()->find("OMNITRACE_SAMPLING_CPUTIME_TIDS"); + return parse_numeric_range<>( + static_cast&>(*_v->second).get(), "thread IDs"); +} + +std::set +get_sampling_real_tids() +{ + static auto _v = get_config()->find("OMNITRACE_SAMPLING_REALTIME_TIDS"); + return parse_numeric_range<>( + static_cast&>(*_v->second).get(), "thread IDs"); +} + int64_t get_critical_trace_count() { @@ -1777,6 +1880,13 @@ get_process_sampling_freq() return _val; } +double +get_process_sampling_duration() +{ + static auto _v = get_config()->find("OMNITRACE_PROCESS_SAMPLING_DURATION"); + return static_cast&>(*_v->second).get(); +} + std::string get_sampling_gpus() { diff --git a/source/lib/omnitrace/library/config.hpp b/source/lib/omnitrace/library/config.hpp index 86aa540b3..824f96cf4 100644 --- a/source/lib/omnitrace/library/config.hpp +++ b/source/lib/omnitrace/library/config.hpp @@ -298,12 +298,24 @@ get_sampling_cpu_delay(); double get_sampling_real_delay(); +double +get_sampling_duration(); + std::string get_sampling_cpus(); +std::set +get_sampling_cpu_tids(); + +std::set +get_sampling_real_tids(); + double get_process_sampling_freq(); +double +get_process_sampling_duration(); + std::string get_sampling_gpus(); diff --git a/source/lib/omnitrace/library/cpu_freq.cpp b/source/lib/omnitrace/library/cpu_freq.cpp index 1f26a37ef..917193515 100644 --- a/source/lib/omnitrace/library/cpu_freq.cpp +++ b/source/lib/omnitrace/library/cpu_freq.cpp @@ -160,8 +160,9 @@ write_perfetto_counter_track(index&& _idx, Args... 
_args) void post_process() { - OMNITRACE_PRINT("Post-processing %zu cpu frequency and memory usage entries...\n", - cpu_data.size()); + OMNITRACE_VERBOSE(1, + "Post-processing %zu cpu frequency and memory usage entries...\n", + cpu_data.size()); auto _process_frequencies = [](size_t _idx, size_t _offset) { using freq_track = perfetto_counter_track; diff --git a/source/lib/omnitrace/library/process_sampler.cpp b/source/lib/omnitrace/library/process_sampler.cpp index 6126b6cdd..9fc941e93 100644 --- a/source/lib/omnitrace/library/process_sampler.cpp +++ b/source/lib/omnitrace/library/process_sampler.cpp @@ -21,7 +21,6 @@ // SOFTWARE. #include "library/process_sampler.hpp" -#include "library/components/pthread_gotcha.hpp" #include "library/config.hpp" #include "library/cpu_freq.hpp" #include "library/debug.hpp" @@ -86,10 +85,16 @@ sampler::poll(std::atomic* _state, nsec_t _interval, promise_t* _ready) itr->config(); OMNITRACE_VERBOSE( - 1, "Thread sampler polling at an interval of %f seconds...\n", + 1, "Background process sampling polling at an interval of %f seconds...\n", std::chrono::duration_cast>(_interval).count()); + auto _duration = config::get_process_sampling_duration(); + if(_duration < 0.0) _duration = config::get_sampling_duration(); + bool _has_duration = (_duration > 0.0); + auto _now = std::chrono::steady_clock::now(); + auto _end = + _now + std::chrono::nanoseconds{ static_cast(_duration * units::sec) }; while(_state && _state->load() != State::Finalized && get_state() != State::Finalized) { std::this_thread::sleep_until(_now); @@ -100,12 +105,23 @@ sampler::poll(std::atomic* _state, nsec_t _interval, promise_t* _ready) for(auto& itr : instances) itr->sample(); get_sampler_is_sampling().store(false); + if(_has_duration && _now >= _end) break; while(_now < std::chrono::steady_clock::now()) _now += _interval; } + // ensure this is always false get_sampler_is_sampling().store(false); + if(_has_duration && _now >= _end && get_state() != State::Finalized) 
+ { + OMNITRACE_VERBOSE( + 1, + "Background process sampling duration of %f seconds has elapsed. " + "Shutting down process sampling...\n", + _duration); + } + OMNITRACE_CONDITIONAL_BASIC_PRINT(get_debug(), "Thread sampler polling completed...\n"); @@ -155,12 +171,12 @@ sampler::setup() auto _fut = _prom.get_future(); polling_finished = std::make_unique(); + OMNITRACE_SCOPED_SAMPLING_ON_CHILD_THREADS(false); + set_state(State::PreInit); - pthread_gotcha::push_enable_sampling_on_child_threads(false); get_thread() = std::make_unique(&poll, &get_sampler_state(), msec_t{ _msec_freq }, &_prom); _fut.wait(); - pthread_gotcha::pop_enable_sampling_on_child_threads(); set_state(State::Active); } diff --git a/source/lib/omnitrace/library/rocm.cpp b/source/lib/omnitrace/library/rocm.cpp index fd8a4131f..2786b065d 100644 --- a/source/lib/omnitrace/library/rocm.cpp +++ b/source/lib/omnitrace/library/rocm.cpp @@ -167,7 +167,7 @@ extern "C" if(!tim::settings::enabled()) return true; roctracer_is_init() = true; - pthread_gotcha::push_enable_sampling_on_child_threads(false); + OMNITRACE_SCOPED_SAMPLING_ON_CHILD_THREADS(false); OMNITRACE_BASIC_VERBOSE_F(1 || rocm::on_load_trace, "Loading ROCm tooling...\n"); tim::consume_parameters(table, runtime_version, failed_tool_count, @@ -308,7 +308,6 @@ extern "C" "failed! OMNITRACE_ROCPROFILER_LIBRARY=%s\n", _rocprof.filename.c_str()); } - pthread_gotcha::pop_enable_sampling_on_child_threads(); OMNITRACE_BASIC_VERBOSE_F(2 || rocm::on_load_trace, "Loading... %s\n", (_success) ? 
"Done" : "Failed"); diff --git a/source/lib/omnitrace/library/rocm_smi.cpp b/source/lib/omnitrace/library/rocm_smi.cpp index 523e94b9e..33a348fda 100644 --- a/source/lib/omnitrace/library/rocm_smi.cpp +++ b/source/lib/omnitrace/library/rocm_smi.cpp @@ -33,12 +33,12 @@ #include "library/rocm_smi.hpp" #include "library/common.hpp" #include "library/components/fwd.hpp" -#include "library/components/pthread_gotcha.hpp" #include "library/config.hpp" #include "library/critical_trace.hpp" #include "library/debug.hpp" #include "library/gpu.hpp" #include "library/perfetto.hpp" +#include "library/runtime.hpp" #include "library/state.hpp" #include "library/thread_info.hpp" @@ -328,7 +328,7 @@ setup() if(is_initialized() || !get_use_rocm_smi()) return; - pthread_gotcha::push_enable_sampling_on_child_threads(false); + OMNITRACE_SCOPED_SAMPLING_ON_CHILD_THREADS(false); // assign the data value to determined by rocm-smi data::device_count = device_count(); @@ -402,8 +402,6 @@ setup() _e.what()); data::device_list = {}; } - - pthread_gotcha::pop_enable_sampling_on_child_threads(); } void diff --git a/source/lib/omnitrace/library/runtime.cpp b/source/lib/omnitrace/library/runtime.cpp index f323c39d1..85c04aca4 100644 --- a/source/lib/omnitrace/library/runtime.cpp +++ b/source/lib/omnitrace/library/runtime.cpp @@ -52,6 +52,26 @@ namespace omnitrace { +namespace +{ +auto& +get_sampling_on_child_threads_history(int64_t _idx = utility::get_thread_index()) +{ + static auto _v = utility::get_filled_array( + []() { return utility::get_reserved_vector(32); }); + return _v.at(_idx); +} + +bool& +sampling_on_child_threads() +{ + static thread_local bool _v = get_sampling_on_child_threads_history().empty() + ? 
false + : get_sampling_on_child_threads_history().back(); + return _v; +} +} // namespace + int get_realtime_signal() { @@ -254,4 +274,40 @@ pop_thread_state() } return get_thread_state(); } + +bool +sampling_enabled_on_child_threads() +{ + return sampling_on_child_threads(); +} + +bool +push_enable_sampling_on_child_threads(bool _v) +{ + bool _last = sampling_on_child_threads(); + sampling_on_child_threads() = _v; + auto& _hist = get_sampling_on_child_threads_history(); + _hist.emplace_back(_last); + return _last; +} + +bool +pop_enable_sampling_on_child_threads() +{ + auto& _hist = get_sampling_on_child_threads_history(); + if(!_hist.empty()) + { + bool _restored = _hist.back(); + _hist.pop_back(); + sampling_on_child_threads() = _restored; + } + return sampling_on_child_threads(); +} + +void +set_sampling_on_all_future_threads(bool _v) +{ + for(size_t i = 0; i < max_supported_threads; ++i) + get_sampling_on_child_threads_history(i).emplace_back(_v); +} } // namespace omnitrace diff --git a/source/lib/omnitrace/library/runtime.hpp b/source/lib/omnitrace/library/runtime.hpp index 6608fd737..b88a7b876 100644 --- a/source/lib/omnitrace/library/runtime.hpp +++ b/source/lib/omnitrace/library/runtime.hpp @@ -117,11 +117,39 @@ struct scoped_thread_state scoped_thread_state(ThreadState _v) { push_thread_state(_v); } ~scoped_thread_state() { pop_thread_state(); } }; + +// query current value +bool +sampling_enabled_on_child_threads(); + +// use this to disable sampling in a region (e.g. 
right before thread creation) +bool +push_enable_sampling_on_child_threads(bool _v); + +// use this to restore previous setting +bool +pop_enable_sampling_on_child_threads(); + +// make sure every newly created thread starts with this value +void +set_sampling_on_all_future_threads(bool _v); + +struct scoped_child_sampling +{ + scoped_child_sampling(bool _v) { push_enable_sampling_on_child_threads(_v); } + ~scoped_child_sampling() { pop_enable_sampling_on_child_threads(); } +}; } // namespace omnitrace #define OMNITRACE_SCOPED_THREAD_STATE(STATE) \ - ::omnitrace::scoped_thread_state OMNITRACE_VARIABLE( \ - OMNITRACE_VAR_NAME_COMBINE(scoped_thread_state_, __LINE__)) \ + ::omnitrace::scoped_thread_state OMNITRACE_VARIABLE(_scoped_thread_state_, __LINE__) \ { \ ::omnitrace::STATE \ } + +#define OMNITRACE_SCOPED_SAMPLING_ON_CHILD_THREADS(VALUE) \ + ::omnitrace::scoped_child_sampling OMNITRACE_VARIABLE(_scoped_child_sampling_, \ + __LINE__) \ + { \ + VALUE \ + } diff --git a/source/lib/omnitrace/library/sampling.cpp b/source/lib/omnitrace/library/sampling.cpp index 1ce180516..a35758a91 100644 --- a/source/lib/omnitrace/library/sampling.cpp +++ b/source/lib/omnitrace/library/sampling.cpp @@ -26,7 +26,6 @@ #include "library/components/backtrace_metrics.hpp" #include "library/components/backtrace_timestamp.hpp" #include "library/components/fwd.hpp" -#include "library/components/pthread_gotcha.hpp" #include "library/config.hpp" #include "library/debug.hpp" #include "library/ptl.hpp" @@ -60,6 +59,8 @@ #include #include +#include +#include #include #include #include @@ -161,6 +162,79 @@ get_sampler_running(int64_t _tid) return _v.at(_tid); } +auto& +get_duration_cv() +{ + static auto _v = std::condition_variable{}; + return _v; +} + +auto& +get_duration_thread() +{ + static auto _v = std::unique_ptr{}; + return _v; +} + +void +start_duration_thread() +{ + static std::mutex _start_mutex{}; + std::unique_lock _start_lk{ _start_mutex, std::defer_lock }; + 
if(!_start_lk.owns_lock()) _start_lk.lock(); + + if(!get_duration_thread() && config::get_sampling_duration() > 0.0) + { + // we may need to protect against recursion bc of pthread wrapper + static bool _protect = false; + if(_protect) return; + _protect = true; + auto _now = std::chrono::steady_clock::now(); + auto _end = _now + std::chrono::nanoseconds{ static_cast( + config::get_sampling_duration() * units::sec) }; + auto _func = [_end]() { + thread_info::init(true); + std::mutex _mutex{}; + bool _wait = true; + while(_wait) + { + _wait = false; + std::unique_lock _lk{ _mutex }; + get_duration_cv().wait_until(_lk, _end); + auto _premature = (std::chrono::steady_clock::now() < _end); + auto _finalized = (get_state() == State::Finalized); + if(_premature && !_finalized) + { + // protect against spurious wakeups + OMNITRACE_VERBOSE( + 2, "%sSpurious wakeup of sampling duration thread...\n", + tim::log::color::warning()); + _wait = true; + } + else if(_finalized) + { + break; + } + else + { + OMNITRACE_VERBOSE(1, + "Sampling duration of %f seconds has elapsed. " + "Shutting down sampling...\n", + config::get_sampling_duration()); + shutdown(); + } + } + }; + + OMNITRACE_VERBOSE(1, "Sampling will be disabled after %f seconds...\n", + config::get_sampling_duration()); + + OMNITRACE_SCOPED_SAMPLING_ON_CHILD_THREADS(false); + get_duration_thread() = std::make_unique(_func); + _protect = false; + } +} + std::set configure(bool _setup, int64_t _tid = threading::get_id()) { @@ -170,10 +244,24 @@ configure(bool _setup, int64_t _tid = threading::get_id()) bool _is_running = (!_running) ? 
false : *_running; auto& _signal_types = sampling::get_signal_types(_tid); - pthread_gotcha::push_enable_sampling_on_child_threads(false); - auto _dtor = scope::destructor{ []() { - pthread_gotcha::pop_enable_sampling_on_child_threads(); - } }; + OMNITRACE_SCOPED_SAMPLING_ON_CHILD_THREADS(false); + + auto&& _cpu_tids = get_sampling_cpu_tids(); + auto&& _real_tids = get_sampling_real_tids(); + + auto _erase_tid_signal = [_tid, &_signal_types](auto& _tids, int _signum) { + if(!_tids.empty()) + { + if(_tids.count(_tid) == 0) + { + OMNITRACE_VERBOSE(3, "Disabling SIG%i from thread %li\n", _signum, _tid); + _signal_types->erase(_signum); + } + } + }; + + _erase_tid_signal(_cpu_tids, get_cputime_signal()); + _erase_tid_signal(_real_tids, get_realtime_signal()); if(_setup && !_sampler && !_is_running && !_signal_types->empty()) { @@ -253,6 +341,7 @@ configure(bool _setup, int64_t _tid = threading::get_id()) *_running = true; sampling::get_sampler_init(_tid)->sample(); + start_duration_thread(); _sampler->start(); } else if(!_setup && _sampler && _is_running) @@ -265,6 +354,7 @@ configure(bool _setup, int64_t _tid = threading::get_id()) sampling::block_signals(*_signal_types); } + get_duration_cv().notify_one(); if(_tid == 0) { // this propagates to all threads @@ -278,6 +368,12 @@ configure(bool _setup, int64_t _tid = threading::get_id()) *get_sampler_running(i) = false; } } + + if(get_duration_thread()) + { + get_duration_thread()->join(); + get_duration_thread().reset(); + } } _sampler->stop(); @@ -363,8 +459,8 @@ post_process() for(size_t i = 0; i < max_supported_threads; ++i) backtrace_metrics::configure(false, i); - OMNITRACE_VERBOSE(1 || get_debug_sampling(), "Post-processing sampling data...\n"); - + size_t _total_data = 0; + size_t _total_threads = 0; for(size_t i = 0; i < max_supported_threads; ++i) { auto& _sampler = get_sampler(i); @@ -398,7 +494,7 @@ post_process() _sampler->stop(); auto& _raw_data = _sampler->get_data(); - OMNITRACE_VERBOSE(0 || 
get_debug_sampling(), + OMNITRACE_VERBOSE(2 || get_debug_sampling(), "Sampler data for thread %lu has %zu initial entries...\n", i, _raw_data.size()); @@ -430,23 +526,27 @@ post_process() continue; } - OMNITRACE_VERBOSE(0 || get_debug_sampling(), + OMNITRACE_VERBOSE(2 || get_debug_sampling(), "Sampler data for thread %lu has %zu valid entries...\n", i, _raw_data.size()); + _total_data += _raw_data.size(); + _total_threads += 1; + if(get_use_perfetto()) post_process_perfetto(i, _init, _data); if(get_use_timemory()) post_process_timemory(i, _init, _data); } - OMNITRACE_VERBOSE(0 || get_debug_sampling(), - "Post-processing sampling entries completed\n"); + OMNITRACE_VERBOSE(3 || get_debug_sampling(), "Destroying samplers...\n"); for(size_t i = 0; i < max_supported_threads; ++i) { get_sampler(i).reset(); } - OMNITRACE_VERBOSE(0 || get_debug_sampling(), "Post-processing samplers destroyed\n"); + OMNITRACE_VERBOSE(1 || get_debug_sampling(), + "Collected %zu samples from %zu threads...\n", _total_data, + _total_threads); } namespace @@ -535,17 +635,8 @@ post_process_perfetto(int64_t _tid, const bundle_t* _init, } }; - if(_tid == 0 && config::get_mode() == Mode::Sampling && - config::get_perfetto_fill_policy() == "discard") - { - _process_perfetto(_data); - } - else - { - pthread_gotcha::push_enable_sampling_on_child_threads(false); - std::thread{ _process_perfetto_wrapper }.join(); - pthread_gotcha::pop_enable_sampling_on_child_threads(); - } + OMNITRACE_SCOPED_SAMPLING_ON_CHILD_THREADS(false); + std::thread{ _process_perfetto_wrapper }.join(); } void