[PROF-10589] Add internal metrics for GVL profiling performance #3993

Merged · 6 commits · Oct 11, 2024
docs/ProfilingDevelopment.md (3 additions & 3 deletions)
@@ -209,9 +209,9 @@ from the VM callbacks and also messing with cpu/wall-time accounting for threads

The `ThreadContext` collector exposes three APIs for GVL profiling:

* `void thread_context_collector_on_gvl_waiting(VALUE thread)`
* `bool thread_context_collector_on_gvl_running(VALUE thread)`
* `VALUE thread_context_collector_sample_after_gvl_running(VALUE self_instance)`
* `thread_context_collector_on_gvl_waiting`
* `thread_context_collector_on_gvl_running`
* `thread_context_collector_sample_after_gvl_running`

The intuition here is that:

@@ -180,6 +180,12 @@ struct cpu_and_wall_time_worker_state {
// # GVL profiling stats
// How many times we triggered the after_gvl_running sampling
unsigned int after_gvl_running;
// How many times we skipped the after_gvl_running sampling
unsigned int gvl_dont_sample;
// Min/max/total wall-time spent on gvl sampling
uint64_t gvl_sampling_time_ns_min;
uint64_t gvl_sampling_time_ns_max;
uint64_t gvl_sampling_time_ns_total;
} stats;
};
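
The new timing fields follow the same aggregation pattern already used for the cpu and allocation sampling stats: keep a running min, max, and total per sample, and derive the average only at report time. A minimal standalone sketch of that pattern (hypothetical names, not code from this PR):

#include <inttypes.h>
#include <stdio.h>

typedef struct {
  unsigned int count;
  uint64_t min_ns; // seeded with UINT64_MAX so the first sample always wins the min comparison
  uint64_t max_ns;
  uint64_t total_ns;
} timing_stats;

static void record_sample(timing_stats *stats, uint64_t sampling_time_ns) {
  if (sampling_time_ns < stats->min_ns) stats->min_ns = sampling_time_ns;
  if (sampling_time_ns > stats->max_ns) stats->max_ns = sampling_time_ns;
  stats->total_ns += sampling_time_ns;
  stats->count++;
}

int main(void) {
  timing_stats stats = {.min_ns = UINT64_MAX};
  uint64_t samples[] = {1200, 800, 2500};
  for (int i = 0; i < 3; i++) record_sample(&stats, samples[i]);

  // The average is only derived at report time, and only when at least one sample was recorded.
  double avg_ns = stats.count > 0 ? (double) stats.total_ns / stats.count : 0.0;
  printf("min=%" PRIu64 " max=%" PRIu64 " total=%" PRIu64 " avg=%.1f\n",
         stats.min_ns, stats.max_ns, stats.total_ns, avg_ns);
  return 0;
}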

@@ -230,6 +236,7 @@ static VALUE _native_resume_signals(DDTRACE_UNUSED VALUE self);
static void on_gvl_event(rb_event_flag_t event_id, const rb_internal_thread_event_data_t *event_data, DDTRACE_UNUSED void *_unused);
static void after_gvl_running_from_postponed_job(DDTRACE_UNUSED void *_unused);
#endif
static VALUE rescued_after_gvl_running_from_postponed_job(VALUE self_instance);
static VALUE _native_gvl_profiling_hook_active(DDTRACE_UNUSED VALUE self, VALUE instance);

// We're using `on_newobj_event` function with `rb_add_event_hook2`, which requires in its public signature a function
@@ -1042,7 +1049,12 @@ static VALUE _native_stats(DDTRACE_UNUSED VALUE self, VALUE instance) {
ID2SYM(rb_intern("allocations_during_sample")), /* => */ state->allocation_profiling_enabled ? UINT2NUM(state->stats.allocations_during_sample) : Qnil,

// GVL profiling stats
ID2SYM(rb_intern("after_gvl_running")), /* => */ UINT2NUM(state->stats.after_gvl_running),
ID2SYM(rb_intern("after_gvl_running")), /* => */ UINT2NUM(state->stats.after_gvl_running),
ID2SYM(rb_intern("gvl_dont_sample")), /* => */ UINT2NUM(state->stats.gvl_dont_sample),
ID2SYM(rb_intern("gvl_sampling_time_ns_min")), /* => */ RUBY_NUM_OR_NIL(state->stats.gvl_sampling_time_ns_min, != UINT64_MAX, ULL2NUM),
ID2SYM(rb_intern("gvl_sampling_time_ns_max")), /* => */ RUBY_NUM_OR_NIL(state->stats.gvl_sampling_time_ns_max, > 0, ULL2NUM),
ID2SYM(rb_intern("gvl_sampling_time_ns_total")), /* => */ RUBY_NUM_OR_NIL(state->stats.gvl_sampling_time_ns_total, > 0, ULL2NUM),
ID2SYM(rb_intern("gvl_sampling_time_ns_avg")), /* => */ RUBY_AVG_OR_NIL(state->stats.gvl_sampling_time_ns_total, state->stats.after_gvl_running),
};
for (long unsigned int i = 0; i < VALUE_COUNT(arguments); i += 2) rb_hash_aset(stats_as_hash, arguments[i], arguments[i+1]);
return stats_as_hash;
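
`RUBY_NUM_OR_NIL` and `RUBY_AVG_OR_NIL` are helper macros defined elsewhere in the extension and are not part of this diff; the reporting above reads naturally if they behave roughly like this sketch (an assumption about their shape, not their verbatim definitions):

// Assumed behavior of the reporting helpers used above (sketch only):
// - RUBY_NUM_OR_NIL returns the converted number only when `value condition` holds, else nil.
//   For example, `RUBY_NUM_OR_NIL(min, != UINT64_MAX, ULL2NUM)` reports nil when no sample was ever
//   taken, because the min is still at its UINT64_MAX seed value.
// - RUBY_AVG_OR_NIL divides a total by a count, returning nil when the count is zero.
#include <ruby.h>

#define RUBY_NUM_OR_NIL(value, condition, conv) ((value condition) ? conv(value) : Qnil)
#define RUBY_AVG_OR_NIL(total, count) ((count) == 0 ? Qnil : DBL2NUM(((double) (total)) / (count)))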
@@ -1080,8 +1092,10 @@ static void reset_stats_not_thread_safe(struct cpu_and_wall_time_worker_state *s
// Given the expected infrequency of resetting (~once per 60s profile) and the auxiliary/non-critical nature of these stats
// this momentary loss of accuracy is deemed acceptable to keep overhead to a minimum.
state->stats = (struct stats) {
.cpu_sampling_time_ns_min = UINT64_MAX, // Since we always take the min between existing and latest sample
.allocation_sampling_time_ns_min = UINT64_MAX, // Since we always take the min between existing and latest sample
// All these values are initialized to the highest possible value, since we always take the min between the existing and latest sample
.cpu_sampling_time_ns_min = UINT64_MAX,
.allocation_sampling_time_ns_min = UINT64_MAX,
.gvl_sampling_time_ns_min = UINT64_MAX,
};
}

@@ -1315,23 +1329,29 @@ static VALUE _native_resume_signals(DDTRACE_UNUSED VALUE self) {
} else if (event_id == RUBY_INTERNAL_THREAD_EVENT_RESUMED) { /* running/runnable */
// Interesting note: A RUBY_INTERNAL_THREAD_EVENT_RESUMED is guaranteed to be called with the GVL being acquired.
// (And... I think target_thread will be == rb_thread_current()?)
//
// But we're not sure if we're on the main Ractor yet. The thread context collector can actually help here:
// it tags threads it's tracking, so if a thread is tagged then by definition we know that thread belongs to the main
// Ractor. Thus, if we really really wanted to access the state, we could do it after making sure we're on the correct Ractor.
// Ractor. Thus, if we get an ON_GVL_RUNNING_UNKNOWN result we shouldn't touch any state, but otherwise we're good to go.

#ifdef USE_GVL_PROFILING_3_2_WORKAROUNDS
target_thread = gvl_profiling_state_maybe_initialize();
#endif

bool should_sample = thread_context_collector_on_gvl_running(target_thread);
on_gvl_running_result result = thread_context_collector_on_gvl_running(target_thread);

if (should_sample) {
// should_sample is only true if a thread belongs to the main Ractor, so we're good to go
if (result == ON_GVL_RUNNING_SAMPLE) {
#ifndef NO_POSTPONED_TRIGGER
rb_postponed_job_trigger(after_gvl_running_from_postponed_job_handle);
#else
rb_postponed_job_register_one(0, after_gvl_running_from_postponed_job, NULL);
#endif
} else if (result == ON_GVL_RUNNING_DONT_SAMPLE) {
struct cpu_and_wall_time_worker_state *state = active_sampler_instance_state; // Read from global variable, see "sampler global state safety" note above

if (state == NULL) return; // This should not happen, but just in case...

state->stats.gvl_dont_sample++;
}
} else {
// This is a very delicate time and it's hard for us to raise an exception so let's at least complain to stderr
@@ -1347,11 +1367,32 @@ static VALUE _native_resume_signals(DDTRACE_UNUSED VALUE self) {

state->during_sample = true;

safely_call(thread_context_collector_sample_after_gvl_running, state->thread_context_collector_instance, state->self_instance);
// Rescue against any exceptions that happen during sampling
safely_call(rescued_after_gvl_running_from_postponed_job, state->self_instance, state->self_instance);

state->during_sample = false;
}

static VALUE rescued_after_gvl_running_from_postponed_job(VALUE self_instance) {
struct cpu_and_wall_time_worker_state *state;
TypedData_Get_Struct(self_instance, struct cpu_and_wall_time_worker_state, &cpu_and_wall_time_worker_typed_data, state);

long wall_time_ns_before_sample = monotonic_wall_time_now_ns(RAISE_ON_FAILURE);
thread_context_collector_sample_after_gvl_running(state->thread_context_collector_instance, rb_thread_current(), wall_time_ns_before_sample);
long wall_time_ns_after_sample = monotonic_wall_time_now_ns(RAISE_ON_FAILURE);

long delta_ns = wall_time_ns_after_sample - wall_time_ns_before_sample;

// Guard against wall-time going backwards, see https://github.com/DataDog/dd-trace-rb/pull/2336 for discussion.
uint64_t sampling_time_ns = delta_ns < 0 ? 0 : delta_ns;

state->stats.gvl_sampling_time_ns_min = uint64_min_of(sampling_time_ns, state->stats.gvl_sampling_time_ns_min);
state->stats.gvl_sampling_time_ns_max = uint64_max_of(sampling_time_ns, state->stats.gvl_sampling_time_ns_max);
state->stats.gvl_sampling_time_ns_total += sampling_time_ns;

state->stats.after_gvl_running++;

state->during_sample = false;
return Qnil;
}
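
`uint64_min_of` and `uint64_max_of` are likewise defined outside this diff; the timing update above assumes they are plain min/max helpers along these lines (a sketch, not the verbatim definitions):

#include <stdint.h>

// Assumed shape of the min/max helpers used in the timing update above (sketch only):
static inline uint64_t uint64_min_of(uint64_t a, uint64_t b) { return a < b ? a : b; }
static inline uint64_t uint64_max_of(uint64_t a, uint64_t b) { return a > b ? a : b; }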

static VALUE _native_gvl_profiling_hook_active(DDTRACE_UNUSED VALUE self, VALUE instance) {
@@ -1763,14 +1763,14 @@ static uint64_t otel_span_id_to_uint(VALUE otel_span_id) {

// This function can get called from outside the GVL and even on non-main Ractors
__attribute__((warn_unused_result))
bool thread_context_collector_on_gvl_running_with_threshold(gvl_profiling_thread thread, uint32_t waiting_for_gvl_threshold_ns) {
on_gvl_running_result thread_context_collector_on_gvl_running_with_threshold(gvl_profiling_thread thread, uint32_t waiting_for_gvl_threshold_ns) {
intptr_t gvl_waiting_at = gvl_profiling_state_get(thread);

// Thread was not being profiled / not waiting on gvl
if (gvl_waiting_at == 0 || gvl_waiting_at == GVL_WAITING_ENABLED_EMPTY) return false;
if (gvl_waiting_at == 0 || gvl_waiting_at == GVL_WAITING_ENABLED_EMPTY) return ON_GVL_RUNNING_UNKNOWN;

// @ivoanjo: I'm not sure if this can happen -- It means we should've sampled already but haven't gotten the chance yet?
if (gvl_waiting_at < 0) return true;
if (gvl_waiting_at < 0) return ON_GVL_RUNNING_SAMPLE;

long waiting_for_gvl_duration_ns = monotonic_wall_time_now_ns(DO_NOT_RAISE_ON_FAILURE) - gvl_waiting_at;

@@ -1786,11 +1786,11 @@ static uint64_t otel_span_id_to_uint(VALUE otel_span_id) {
gvl_profiling_state_set(thread, GVL_WAITING_ENABLED_EMPTY);
}

return should_sample;
return should_sample ? ON_GVL_RUNNING_SAMPLE : ON_GVL_RUNNING_DONT_SAMPLE;
}

__attribute__((warn_unused_result))
bool thread_context_collector_on_gvl_running(gvl_profiling_thread thread) {
on_gvl_running_result thread_context_collector_on_gvl_running(gvl_profiling_thread thread) {
return thread_context_collector_on_gvl_running_with_threshold(thread, global_waiting_for_gvl_threshold_ns);
}

@@ -1822,7 +1822,7 @@ static uint64_t otel_span_id_to_uint(VALUE otel_span_id) {
//
// NOTE: In normal use, current_thread is expected to be == rb_thread_current(); the `current_thread` parameter only
// exists to enable testing.
VALUE thread_context_collector_sample_after_gvl_running_with_thread(VALUE self_instance, VALUE current_thread) {
VALUE thread_context_collector_sample_after_gvl_running(VALUE self_instance, VALUE current_thread, long current_monotonic_wall_time_ns) {
struct thread_context_collector_state *state;
TypedData_Get_Struct(self_instance, struct thread_context_collector_state, &thread_context_collector_typed_data, state);

@@ -1854,14 +1854,10 @@ static uint64_t otel_span_id_to_uint(VALUE otel_span_id) {
thread_context,
thread_context->sampling_buffer,
cpu_time_for_thread,
monotonic_wall_time_now_ns(RAISE_ON_FAILURE)
current_monotonic_wall_time_ns
);

return Qtrue; // To allow this to be called from rb_rescue2
}

VALUE thread_context_collector_sample_after_gvl_running(VALUE self_instance) {
return thread_context_collector_sample_after_gvl_running_with_thread(self_instance, rb_thread_current());
return Qtrue;
}

// This method is intended to be called from update_metrics_and_sample. It exists to handle extra sampling steps we
@@ -1980,13 +1976,17 @@ static uint64_t otel_span_id_to_uint(VALUE otel_span_id) {
static VALUE _native_on_gvl_running(DDTRACE_UNUSED VALUE self, VALUE thread) {
ENFORCE_THREAD(thread);

return thread_context_collector_on_gvl_running(thread_from_thread_object(thread)) ? Qtrue : Qfalse;
return thread_context_collector_on_gvl_running(thread_from_thread_object(thread)) == ON_GVL_RUNNING_SAMPLE ? Qtrue : Qfalse;
}

static VALUE _native_sample_after_gvl_running(DDTRACE_UNUSED VALUE self, VALUE collector_instance, VALUE thread) {
ENFORCE_THREAD(thread);

return thread_context_collector_sample_after_gvl_running_with_thread(collector_instance, thread);
return thread_context_collector_sample_after_gvl_running(
collector_instance,
thread,
monotonic_wall_time_now_ns(RAISE_ON_FAILURE)
);
}

static VALUE _native_apply_delta_to_cpu_time_at_previous_sample_ns(DDTRACE_UNUSED VALUE self, VALUE collector_instance, VALUE thread, VALUE delta_ns) {
@@ -18,7 +18,13 @@ __attribute__((warn_unused_result)) bool thread_context_collector_on_gc_finish(V
VALUE enforce_thread_context_collector_instance(VALUE object);

#ifndef NO_GVL_INSTRUMENTATION
typedef enum {
ON_GVL_RUNNING_UNKNOWN, // Thread is not known, it may not even be from the current Ractor
ON_GVL_RUNNING_DONT_SAMPLE, // Thread is known, but "Waiting for GVL" period was too small to be sampled
ON_GVL_RUNNING_SAMPLE, // Thread is known, and "Waiting for GVL" period should be sampled
} on_gvl_running_result;

void thread_context_collector_on_gvl_waiting(gvl_profiling_thread thread);
__attribute__((warn_unused_result)) bool thread_context_collector_on_gvl_running(gvl_profiling_thread thread);
VALUE thread_context_collector_sample_after_gvl_running(VALUE self_instance);
__attribute__((warn_unused_result)) on_gvl_running_result thread_context_collector_on_gvl_running(gvl_profiling_thread thread);
VALUE thread_context_collector_sample_after_gvl_running(VALUE self_instance, VALUE current_thread, long current_monotonic_wall_time_ns);
#endif
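
Putting the three declarations together, a VM hook is expected to consume them roughly as in the sketch below. This is illustrative only: the header name, the `monotonic_wall_time_now_ns` helper, and calling the sampling function inline are assumptions made for brevity (in the actual worker the sampling call is deferred to a postponed job, as shown in the C changes above).

#include <ruby.h>
#include "collectors_thread_context.h" // assumed header name for the declarations in this diff

static void on_thread_resumed_sketch(gvl_profiling_thread thread, VALUE collector_instance) {
  on_gvl_running_result result = thread_context_collector_on_gvl_running(thread);

  if (result == ON_GVL_RUNNING_SAMPLE) {
    // The "Waiting for GVL" period was long enough to sample. The caller now passes an explicit
    // timestamp, which also lets it measure how long the sampling itself took (feeding the new
    // gvl_sampling_time_ns_* stats).
    long now_ns = monotonic_wall_time_now_ns(RAISE_ON_FAILURE); // assumed helper from the codebase
    thread_context_collector_sample_after_gvl_running(collector_instance, rb_thread_current(), now_ns);
  } else if (result == ON_GVL_RUNNING_DONT_SAMPLE) {
    // Thread is tracked, but the waiting period was below the threshold; the worker bumps its
    // gvl_dont_sample counter instead of sampling.
  } else {
    // ON_GVL_RUNNING_UNKNOWN: the thread isn't tracked (it may not even belong to the main Ractor),
    // so no profiler state should be touched.
  }
}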
@@ -507,7 +507,52 @@
expect(waiting_for_gvl_time).to be_within(5).percent_of(total_time), \
"Expected waiting_for_gvl_time to be close to total_time, debug_failures: #{debug_failures}"

expect(cpu_and_wall_time_worker.stats.fetch(:after_gvl_running)).to be > 0
expect(cpu_and_wall_time_worker.stats).to match(
hash_including(
after_gvl_running: be > 0,
gvl_sampling_time_ns_min: be > 0,
gvl_sampling_time_ns_max: be > 0,
gvl_sampling_time_ns_total: be > 0,
gvl_sampling_time_ns_avg: be > 0,
)
)
end

context "when 'Waiting for GVL' periods are below waiting_for_gvl_threshold_ns" do
let(:options) do
one_second_in_ns = 1_000_000_000
collector = build_thread_context_collector(recorder, waiting_for_gvl_threshold_ns: one_second_in_ns)

{thread_context_collector: collector}
end

it "does not trigger extra samples" do
background_thread_affected_by_gvl_contention
ready_queue_2.pop

start
wait_until_running

sleep 0.1
background_thread_affected_by_gvl_contention.kill

cpu_and_wall_time_worker.stop

# Note: There may still be "Waiting for GVL" samples in the output, but these samples come from the
# periodic cpu/wall-time sampling, not from samples directly triggered by the end of a "Waiting for GVL" period.

expect(cpu_and_wall_time_worker.stats.fetch(:gvl_dont_sample)).to be > 0

expect(cpu_and_wall_time_worker.stats).to match(
hash_including(
after_gvl_running: 0,
gvl_sampling_time_ns_min: nil,
gvl_sampling_time_ns_max: nil,
gvl_sampling_time_ns_total: nil,
gvl_sampling_time_ns_avg: nil,
)
)
end
end
end
end
@@ -1137,6 +1182,11 @@
allocation_sampler_snapshot: nil,
allocations_during_sample: nil,
after_gvl_running: 0,
gvl_dont_sample: 0,
gvl_sampling_time_ns_min: nil,
gvl_sampling_time_ns_max: nil,
gvl_sampling_time_ns_total: nil,
gvl_sampling_time_ns_avg: nil,
}
)
end
@@ -1417,11 +1467,12 @@ def build_another_instance
described_class.new(**worker_settings)
end

def build_thread_context_collector(recorder)
def build_thread_context_collector(recorder, **options)
Datadog::Profiling::Collectors::ThreadContext.for_testing(
recorder: recorder,
endpoint_collection_enabled: endpoint_collection_enabled,
timeline_enabled: timeline_enabled,
**options,
)
end
