[PROF-10589] Add internal metrics for GVL profiling performance #3993

Merged · 6 commits · Oct 11, 2024
docs/ProfilingDevelopment.md (3 additions & 3 deletions)
@@ -209,9 +209,9 @@ from the VM callbacks and also messing with cpu/wall-time accounting for threads

The `ThreadContext` collector exposes three APIs for GVL profiling:

* `void thread_context_collector_on_gvl_waiting(VALUE thread)`
* `bool thread_context_collector_on_gvl_running(VALUE thread)`
* `VALUE thread_context_collector_sample_after_gvl_running(VALUE self_instance)`
* `thread_context_collector_on_gvl_waiting`
* `thread_context_collector_on_gvl_running`
* `thread_context_collector_sample_after_gvl_running`

The intuition here is that:

@@ -180,6 +180,12 @@ struct cpu_and_wall_time_worker_state {
// # GVL profiling stats
// How many times we triggered the after_gvl_running sampling
unsigned int after_gvl_running;
// How many times we skipped the after_gvl_running sampling
unsigned int gvl_dont_sample;
// Min/max/total wall-time spent on gvl sampling
uint64_t gvl_sampling_time_ns_min;
uint64_t gvl_sampling_time_ns_max;
uint64_t gvl_sampling_time_ns_total;
} stats;
};
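
The new timing fields follow the same aggregation pattern already used for the cpu and allocation sampling stats: keep a running min, max, and total per sample, and derive the average only at report time. A minimal standalone sketch of that pattern (hypothetical names, not code from this PR):

#include <inttypes.h>
#include <stdio.h>

typedef struct {
  unsigned int count;
  uint64_t min_ns; // seeded with UINT64_MAX so the first sample always wins the min comparison
  uint64_t max_ns;
  uint64_t total_ns;
} timing_stats;

static void record_sample(timing_stats *stats, uint64_t sampling_time_ns) {
  if (sampling_time_ns < stats->min_ns) stats->min_ns = sampling_time_ns;
  if (sampling_time_ns > stats->max_ns) stats->max_ns = sampling_time_ns;
  stats->total_ns += sampling_time_ns;
  stats->count++;
}

int main(void) {
  timing_stats stats = {.min_ns = UINT64_MAX};
  uint64_t samples[] = {1200, 800, 2500};
  for (int i = 0; i < 3; i++) record_sample(&stats, samples[i]);

  // The average is only derived at report time, and only when at least one sample was recorded.
  double avg_ns = stats.count > 0 ? (double) stats.total_ns / stats.count : 0.0;
  printf("min=%" PRIu64 " max=%" PRIu64 " total=%" PRIu64 " avg=%.1f\n",
         stats.min_ns, stats.max_ns, stats.total_ns, avg_ns);
  return 0;
}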

@@ -230,6 +236,7 @@ static VALUE _native_resume_signals(DDTRACE_UNUSED VALUE self);
static void on_gvl_event(rb_event_flag_t event_id, const rb_internal_thread_event_data_t *event_data, DDTRACE_UNUSED void *_unused);
static void after_gvl_running_from_postponed_job(DDTRACE_UNUSED void *_unused);
#endif
static VALUE rescued_after_gvl_running_from_postponed_job(VALUE self_instance);
static VALUE _native_gvl_profiling_hook_active(DDTRACE_UNUSED VALUE self, VALUE instance);

// We're using `on_newobj_event` function with `rb_add_event_hook2`, which requires in its public signature a function
@@ -1042,7 +1049,12 @@ static VALUE _native_stats(DDTRACE_UNUSED VALUE self, VALUE instance) {
ID2SYM(rb_intern("allocations_during_sample")), /* => */ state->allocation_profiling_enabled ? UINT2NUM(state->stats.allocations_during_sample) : Qnil,

// GVL profiling stats
ID2SYM(rb_intern("after_gvl_running")), /* => */ UINT2NUM(state->stats.after_gvl_running),
ID2SYM(rb_intern("after_gvl_running")), /* => */ UINT2NUM(state->stats.after_gvl_running),
ID2SYM(rb_intern("gvl_dont_sample")), /* => */ UINT2NUM(state->stats.gvl_dont_sample),
ID2SYM(rb_intern("gvl_sampling_time_ns_min")), /* => */ RUBY_NUM_OR_NIL(state->stats.gvl_sampling_time_ns_min, != UINT64_MAX, ULL2NUM),
ID2SYM(rb_intern("gvl_sampling_time_ns_max")), /* => */ RUBY_NUM_OR_NIL(state->stats.gvl_sampling_time_ns_max, > 0, ULL2NUM),
ID2SYM(rb_intern("gvl_sampling_time_ns_total")), /* => */ RUBY_NUM_OR_NIL(state->stats.gvl_sampling_time_ns_total, > 0, ULL2NUM),
ID2SYM(rb_intern("gvl_sampling_time_ns_avg")), /* => */ RUBY_AVG_OR_NIL(state->stats.gvl_sampling_time_ns_total, state->stats.after_gvl_running),
};
for (long unsigned int i = 0; i < VALUE_COUNT(arguments); i += 2) rb_hash_aset(stats_as_hash, arguments[i], arguments[i+1]);
return stats_as_hash;
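
`RUBY_NUM_OR_NIL` and `RUBY_AVG_OR_NIL` are helper macros defined elsewhere in the extension and are not part of this diff; the reporting above reads naturally if they behave roughly like this sketch (an assumption about their shape, not their verbatim definitions):

// Assumed behavior of the reporting helpers used above (sketch only):
// - RUBY_NUM_OR_NIL returns the converted number only when `value condition` holds, else nil.
//   For example, `RUBY_NUM_OR_NIL(min, != UINT64_MAX, ULL2NUM)` reports nil when no sample was ever
//   taken, because the min is still at its UINT64_MAX seed value.
// - RUBY_AVG_OR_NIL divides a total by a count, returning nil when the count is zero.
#include <ruby.h>

#define RUBY_NUM_OR_NIL(value, condition, conv) ((value condition) ? conv(value) : Qnil)
#define RUBY_AVG_OR_NIL(total, count) ((count) == 0 ? Qnil : DBL2NUM(((double) (total)) / (count)))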
@@ -1080,8 +1092,10 @@ static void reset_stats_not_thread_safe(struct cpu_and_wall_time_worker_state *s
// Given the expected infrequency of resetting (~once per 60s profile) and the auxiliary/non-critical nature of these stats
// this momentary loss of accuracy is deemed acceptable to keep overhead to a minimum.
state->stats = (struct stats) {
.cpu_sampling_time_ns_min = UINT64_MAX, // Since we always take the min between existing and latest sample
.allocation_sampling_time_ns_min = UINT64_MAX, // Since we always take the min between existing and latest sample
// All these values are initialized to the highest possible value, since we always take the min between the existing and latest sample
.cpu_sampling_time_ns_min = UINT64_MAX,
.allocation_sampling_time_ns_min = UINT64_MAX,
.gvl_sampling_time_ns_min = UINT64_MAX,
};
}

@@ -1315,23 +1329,29 @@ static VALUE _native_resume_signals(DDTRACE_UNUSED VALUE self) {
} else if (event_id == RUBY_INTERNAL_THREAD_EVENT_RESUMED) { /* running/runnable */
// Interesting note: A RUBY_INTERNAL_THREAD_EVENT_RESUMED is guaranteed to be called with the GVL being acquired.
// (And... I think target_thread will be == rb_thread_current()?)
//
// But we're not sure if we're on the main Ractor yet. The thread context collector can actually help here:
// it tags threads it's tracking, so if a thread is tagged then by definition we know that thread belongs to the main
// Ractor. Thus, if we really really wanted to access the state, we could do it after making sure we're on the correct Ractor.
// Ractor. Thus, if we get an ON_GVL_RUNNING_UNKNOWN result we shouldn't touch any state, but otherwise we're good to go.

#ifdef USE_GVL_PROFILING_3_2_WORKAROUNDS
target_thread = gvl_profiling_state_maybe_initialize();
#endif

bool should_sample = thread_context_collector_on_gvl_running(target_thread);
on_gvl_running_result result = thread_context_collector_on_gvl_running(target_thread);

if (should_sample) {
// should_sample is only true if a thread belongs to the main Ractor, so we're good to go
if (result == ON_GVL_RUNNING_SAMPLE) {
#ifndef NO_POSTPONED_TRIGGER
rb_postponed_job_trigger(after_gvl_running_from_postponed_job_handle);
#else
rb_postponed_job_register_one(0, after_gvl_running_from_postponed_job, NULL);
#endif
} else if (result == ON_GVL_RUNNING_DONT_SAMPLE) {
struct cpu_and_wall_time_worker_state *state = active_sampler_instance_state; // Read from global variable, see "sampler global state safety" note above

if (state == NULL) return; // This should not happen, but just in case...

state->stats.gvl_dont_sample++;
}
} else {
// This is a very delicate time and it's hard for us to raise an exception so let's at least complain to stderr
@@ -1347,11 +1367,32 @@ static VALUE _native_resume_signals(DDTRACE_UNUSED VALUE self) {

state->during_sample = true;

safely_call(thread_context_collector_sample_after_gvl_running, state->thread_context_collector_instance, state->self_instance);
// Rescue against any exceptions that happen during sampling
safely_call(rescued_after_gvl_running_from_postponed_job, state->self_instance, state->self_instance);

state->during_sample = false;
}

static VALUE rescued_after_gvl_running_from_postponed_job(VALUE self_instance) {
struct cpu_and_wall_time_worker_state *state;
TypedData_Get_Struct(self_instance, struct cpu_and_wall_time_worker_state, &cpu_and_wall_time_worker_typed_data, state);

long wall_time_ns_before_sample = monotonic_wall_time_now_ns(RAISE_ON_FAILURE);
thread_context_collector_sample_after_gvl_running(state->thread_context_collector_instance, rb_thread_current(), wall_time_ns_before_sample);
long wall_time_ns_after_sample = monotonic_wall_time_now_ns(RAISE_ON_FAILURE);

long delta_ns = wall_time_ns_after_sample - wall_time_ns_before_sample;

// Guard against wall-time going backwards, see https://github.com/DataDog/dd-trace-rb/pull/2336 for discussion.
uint64_t sampling_time_ns = delta_ns < 0 ? 0 : delta_ns;

state->stats.gvl_sampling_time_ns_min = uint64_min_of(sampling_time_ns, state->stats.gvl_sampling_time_ns_min);
state->stats.gvl_sampling_time_ns_max = uint64_max_of(sampling_time_ns, state->stats.gvl_sampling_time_ns_max);
state->stats.gvl_sampling_time_ns_total += sampling_time_ns;

state->stats.after_gvl_running++;

state->during_sample = false;
return Qnil;
}
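
`uint64_min_of` and `uint64_max_of` are likewise defined outside this diff; the timing update above assumes they are plain min/max helpers along these lines (a sketch, not the verbatim definitions):

#include <stdint.h>

// Assumed shape of the min/max helpers used in the timing update above (sketch only):
static inline uint64_t uint64_min_of(uint64_t a, uint64_t b) { return a < b ? a : b; }
static inline uint64_t uint64_max_of(uint64_t a, uint64_t b) { return a > b ? a : b; }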

static VALUE _native_gvl_profiling_hook_active(DDTRACE_UNUSED VALUE self, VALUE instance) {
@@ -1763,14 +1763,14 @@ static uint64_t otel_span_id_to_uint(VALUE otel_span_id) {

// This function can get called from outside the GVL and even on non-main Ractors
__attribute__((warn_unused_result))
bool thread_context_collector_on_gvl_running_with_threshold(gvl_profiling_thread thread, uint32_t waiting_for_gvl_threshold_ns) {
on_gvl_running_result thread_context_collector_on_gvl_running_with_threshold(gvl_profiling_thread thread, uint32_t waiting_for_gvl_threshold_ns) {
intptr_t gvl_waiting_at = gvl_profiling_state_get(thread);

// Thread was not being profiled / not waiting on gvl
if (gvl_waiting_at == 0 || gvl_waiting_at == GVL_WAITING_ENABLED_EMPTY) return false;
if (gvl_waiting_at == 0 || gvl_waiting_at == GVL_WAITING_ENABLED_EMPTY) return ON_GVL_RUNNING_UNKNOWN;

// @ivoanjo: I'm not sure if this can happen -- It means we should've sampled already but haven't gotten the chance yet?
if (gvl_waiting_at < 0) return true;
if (gvl_waiting_at < 0) return ON_GVL_RUNNING_SAMPLE;

long waiting_for_gvl_duration_ns = monotonic_wall_time_now_ns(DO_NOT_RAISE_ON_FAILURE) - gvl_waiting_at;

@@ -1786,11 +1786,11 @@ static uint64_t otel_span_id_to_uint(VALUE otel_span_id) {
gvl_profiling_state_set(thread, GVL_WAITING_ENABLED_EMPTY);
}

return should_sample;
return should_sample ? ON_GVL_RUNNING_SAMPLE : ON_GVL_RUNNING_DONT_SAMPLE;
}

__attribute__((warn_unused_result))
bool thread_context_collector_on_gvl_running(gvl_profiling_thread thread) {
on_gvl_running_result thread_context_collector_on_gvl_running(gvl_profiling_thread thread) {
return thread_context_collector_on_gvl_running_with_threshold(thread, global_waiting_for_gvl_threshold_ns);
}

@@ -1822,7 +1822,7 @@ static uint64_t otel_span_id_to_uint(VALUE otel_span_id) {
//
// NOTE: In normal use, current_thread is expected to be == rb_thread_current(); the `current_thread` parameter only
// exists to enable testing.
VALUE thread_context_collector_sample_after_gvl_running_with_thread(VALUE self_instance, VALUE current_thread) {
VALUE thread_context_collector_sample_after_gvl_running(VALUE self_instance, VALUE current_thread, long current_monotonic_wall_time_ns) {
struct thread_context_collector_state *state;
TypedData_Get_Struct(self_instance, struct thread_context_collector_state, &thread_context_collector_typed_data, state);

@@ -1854,14 +1854,10 @@ static uint64_t otel_span_id_to_uint(VALUE otel_span_id) {
thread_context,
thread_context->sampling_buffer,
cpu_time_for_thread,
monotonic_wall_time_now_ns(RAISE_ON_FAILURE)
current_monotonic_wall_time_ns
);

return Qtrue; // To allow this to be called from rb_rescue2
}

VALUE thread_context_collector_sample_after_gvl_running(VALUE self_instance) {
return thread_context_collector_sample_after_gvl_running_with_thread(self_instance, rb_thread_current());
return Qtrue;
}

// This method is intended to be called from update_metrics_and_sample. It exists to handle extra sampling steps we
@@ -1980,13 +1976,17 @@ static uint64_t otel_span_id_to_uint(VALUE otel_span_id) {
static VALUE _native_on_gvl_running(DDTRACE_UNUSED VALUE self, VALUE thread) {
ENFORCE_THREAD(thread);

return thread_context_collector_on_gvl_running(thread_from_thread_object(thread)) ? Qtrue : Qfalse;
return thread_context_collector_on_gvl_running(thread_from_thread_object(thread)) == ON_GVL_RUNNING_SAMPLE ? Qtrue : Qfalse;
}

static VALUE _native_sample_after_gvl_running(DDTRACE_UNUSED VALUE self, VALUE collector_instance, VALUE thread) {
ENFORCE_THREAD(thread);

return thread_context_collector_sample_after_gvl_running_with_thread(collector_instance, thread);
return thread_context_collector_sample_after_gvl_running(
collector_instance,
thread,
monotonic_wall_time_now_ns(RAISE_ON_FAILURE)
);
}

static VALUE _native_apply_delta_to_cpu_time_at_previous_sample_ns(DDTRACE_UNUSED VALUE self, VALUE collector_instance, VALUE thread, VALUE delta_ns) {
@@ -18,7 +18,13 @@ __attribute__((warn_unused_result)) bool thread_context_collector_on_gc_finish(V
VALUE enforce_thread_context_collector_instance(VALUE object);

#ifndef NO_GVL_INSTRUMENTATION
typedef enum {
ON_GVL_RUNNING_UNKNOWN, // Thread is not known, it may not even be from the current Ractor
ON_GVL_RUNNING_DONT_SAMPLE, // Thread is known, but "Waiting for GVL" period was too small to be sampled
ON_GVL_RUNNING_SAMPLE, // Thread is known, and "Waiting for GVL" period should be sampled
} on_gvl_running_result;

void thread_context_collector_on_gvl_waiting(gvl_profiling_thread thread);
__attribute__((warn_unused_result)) bool thread_context_collector_on_gvl_running(gvl_profiling_thread thread);
VALUE thread_context_collector_sample_after_gvl_running(VALUE self_instance);
__attribute__((warn_unused_result)) on_gvl_running_result thread_context_collector_on_gvl_running(gvl_profiling_thread thread);
VALUE thread_context_collector_sample_after_gvl_running(VALUE self_instance, VALUE current_thread, long current_monotonic_wall_time_ns);
#endif
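
Putting the three declarations together, a VM hook is expected to consume them roughly as in the sketch below. This is illustrative only: the header name, the `monotonic_wall_time_now_ns` helper, and calling the sampling function inline are assumptions made for brevity (in the actual worker the sampling call is deferred to a postponed job, as shown in the C changes above).

#include <ruby.h>
#include "collectors_thread_context.h" // assumed header name for the declarations in this diff

static void on_thread_resumed_sketch(gvl_profiling_thread thread, VALUE collector_instance) {
  on_gvl_running_result result = thread_context_collector_on_gvl_running(thread);

  if (result == ON_GVL_RUNNING_SAMPLE) {
    // The "Waiting for GVL" period was long enough to sample. The caller now passes an explicit
    // timestamp, which also lets it measure how long the sampling itself took (feeding the new
    // gvl_sampling_time_ns_* stats).
    long now_ns = monotonic_wall_time_now_ns(RAISE_ON_FAILURE); // assumed helper from the codebase
    thread_context_collector_sample_after_gvl_running(collector_instance, rb_thread_current(), now_ns);
  } else if (result == ON_GVL_RUNNING_DONT_SAMPLE) {
    // Thread is tracked, but the waiting period was below the threshold; the worker bumps its
    // gvl_dont_sample counter instead of sampling.
  } else {
    // ON_GVL_RUNNING_UNKNOWN: the thread isn't tracked (it may not even belong to the main Ractor),
    // so no profiler state should be touched.
  }
}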
@@ -507,7 +507,52 @@
expect(waiting_for_gvl_time).to be_within(5).percent_of(total_time), \
"Expected waiting_for_gvl_time to be close to total_time, debug_failures: #{debug_failures}"

expect(cpu_and_wall_time_worker.stats.fetch(:after_gvl_running)).to be > 0
expect(cpu_and_wall_time_worker.stats).to match(
hash_including(
after_gvl_running: be > 0,
gvl_sampling_time_ns_min: be > 0,
gvl_sampling_time_ns_max: be > 0,
gvl_sampling_time_ns_total: be > 0,
gvl_sampling_time_ns_avg: be > 0,
)
)
end

context "when 'Waiting for GVL' periods are below waiting_for_gvl_threshold_ns" do
let(:options) do
one_second_in_ns = 1_000_000_000
collector = build_thread_context_collector(recorder, waiting_for_gvl_threshold_ns: one_second_in_ns)

{thread_context_collector: collector}
end

it "does not trigger extra samples" do
background_thread_affected_by_gvl_contention
ready_queue_2.pop

start
wait_until_running

sleep 0.1
background_thread_affected_by_gvl_contention.kill

cpu_and_wall_time_worker.stop

# Note: There may still be "Waiting for GVL" samples in the output, but these samples come from the
# periodic cpu/wall-time sampling, not from samples directly triggered by the end of a "Waiting for GVL" period.

expect(cpu_and_wall_time_worker.stats.fetch(:gvl_dont_sample)).to be > 0

expect(cpu_and_wall_time_worker.stats).to match(
hash_including(
after_gvl_running: 0,
gvl_sampling_time_ns_min: nil,
gvl_sampling_time_ns_max: nil,
gvl_sampling_time_ns_total: nil,
gvl_sampling_time_ns_avg: nil,
)
)
end
end
end
end
@@ -1137,6 +1182,11 @@
allocation_sampler_snapshot: nil,
allocations_during_sample: nil,
after_gvl_running: 0,
gvl_dont_sample: 0,
gvl_sampling_time_ns_min: nil,
gvl_sampling_time_ns_max: nil,
gvl_sampling_time_ns_total: nil,
gvl_sampling_time_ns_avg: nil,
}
)
end
@@ -1417,11 +1467,12 @@ def build_another_instance
described_class.new(**worker_settings)
end

def build_thread_context_collector(recorder)
def build_thread_context_collector(recorder, **options)
Datadog::Profiling::Collectors::ThreadContext.for_testing(
recorder: recorder,
endpoint_collection_enabled: endpoint_collection_enabled,
timeline_enabled: timeline_enabled,
**options,
)
end
