From fac2c197d46e603d398f67bac51edc34c612fb30 Mon Sep 17 00:00:00 2001
From: Ivo Anjo
Date: Thu, 10 Oct 2024 16:53:32 +0100
Subject: [PATCH 1/6] Introduce stat to track when we skip samples at the end
 of "Waiting for GVL"

This is useful because our main control for "Waiting for GVL" is the
waiting_for_gvl_threshold_ns, and thus we can use this stat to monitor
how many events get skipped for being under that threshold.

Note that regular (periodic) samples can still observe the "Waiting for
GVL" state, but there will not be any special samples created to
represent the beginning or end of "Waiting for GVL".
---
 docs/ProfilingDevelopment.md              |  6 ++--
 .../collectors_cpu_and_wall_time_worker.c | 17 ++++++++---
 .../collectors_thread_context.c           | 12 ++++----
 .../collectors_thread_context.h           |  8 ++++-
 .../cpu_and_wall_time_worker_spec.rb      | 30 ++++++++++++++++++-
 5 files changed, 58 insertions(+), 15 deletions(-)

diff --git a/docs/ProfilingDevelopment.md b/docs/ProfilingDevelopment.md
index 9155735a546..309bac05f57 100644
--- a/docs/ProfilingDevelopment.md
+++ b/docs/ProfilingDevelopment.md
@@ -209,9 +209,9 @@ from the VM callbacks and also messing with cpu/wall-time accounting for threads
 
 The `ThreadContext` collector exposes three APIs for GVL profiling:
 
-* `void thread_context_collector_on_gvl_waiting(VALUE thread)`
-* `bool thread_context_collector_on_gvl_running(VALUE thread)`
-* `VALUE thread_context_collector_sample_after_gvl_running(VALUE self_instance)`
+* `thread_context_collector_on_gvl_waiting(VALUE thread)`
+* `thread_context_collector_on_gvl_running(VALUE thread)`
+* `thread_context_collector_sample_after_gvl_running(VALUE self_instance)`
 
 The intuition here is that:
 
diff --git a/ext/datadog_profiling_native_extension/collectors_cpu_and_wall_time_worker.c b/ext/datadog_profiling_native_extension/collectors_cpu_and_wall_time_worker.c
index c1b872bf7ef..9f65dd66b37 100644
--- a/ext/datadog_profiling_native_extension/collectors_cpu_and_wall_time_worker.c
+++ b/ext/datadog_profiling_native_extension/collectors_cpu_and_wall_time_worker.c
@@ -180,6 +180,8 @@ struct cpu_and_wall_time_worker_state {
     // # GVL profiling stats
     // How many times we triggered the after_gvl_running sampling
     unsigned int after_gvl_running;
+    // How many times we skipped the after_gvl_running sampling
+    unsigned int gvl_dont_sample;
   } stats;
 };
 
@@ -1043,6 +1045,7 @@ static VALUE _native_stats(DDTRACE_UNUSED VALUE self, VALUE instance) {
 
       // GVL profiling stats
       ID2SYM(rb_intern("after_gvl_running")), /* => */ UINT2NUM(state->stats.after_gvl_running),
+      ID2SYM(rb_intern("gvl_dont_sample")), /* => */ UINT2NUM(state->stats.gvl_dont_sample),
   };
   for (long unsigned int i = 0; i < VALUE_COUNT(arguments); i += 2) rb_hash_aset(stats_as_hash, arguments[i], arguments[i+1]);
   return stats_as_hash;
@@ -1315,23 +1318,29 @@ static VALUE _native_resume_signals(DDTRACE_UNUSED VALUE self) {
   } else if (event_id == RUBY_INTERNAL_THREAD_EVENT_RESUMED) { /* running/runnable */
     // Interesting note: A RUBY_INTERNAL_THREAD_EVENT_RESUMED is guaranteed to be called with the GVL being acquired.
     // (And... I think target_thread will be == rb_thread_current()?)
+    //
     // But we're not sure if we're on the main Ractor yet. The thread context collector can actually help here:
     // it tags threads it's tracking, so if a thread is tagged then by definition we know that thread belongs to the main
-    // Ractor. Thus, if we really really wanted to access the state, we could do it after making sure we're on the correct Ractor.
+    // Ractor. Thus, if we get an ON_GVL_RUNNING_UNKNOWN result we shouldn't touch any state, but otherwise we're good to go.
 
     #ifdef USE_GVL_PROFILING_3_2_WORKAROUNDS
       target_thread = gvl_profiling_state_maybe_initialize();
     #endif
 
-    bool should_sample = thread_context_collector_on_gvl_running(target_thread);
+    on_gvl_running_result result = thread_context_collector_on_gvl_running(target_thread);
 
-    if (should_sample) {
-      // should_sample is only true if a thread belongs to the main Ractor, so we're good to go
+    if (result == ON_GVL_RUNNING_SAMPLE) {
       #ifndef NO_POSTPONED_TRIGGER
         rb_postponed_job_trigger(after_gvl_running_from_postponed_job_handle);
       #else
         rb_postponed_job_register_one(0, after_gvl_running_from_postponed_job, NULL);
       #endif
+    } else if (result == ON_GVL_RUNNING_DONT_SAMPLE) {
+      struct cpu_and_wall_time_worker_state *state = active_sampler_instance_state; // Read from global variable, see "sampler global state safety" note above
+
+      if (state == NULL) return; // This should not happen, but just in case...
+
+      state->stats.gvl_dont_sample++;
     }
   } else {
     // This is a very delicate time and it's hard for us to raise an exception so let's at least complain to stderr
diff --git a/ext/datadog_profiling_native_extension/collectors_thread_context.c b/ext/datadog_profiling_native_extension/collectors_thread_context.c
index 52b2e8df049..6b79ed05b6c 100644
--- a/ext/datadog_profiling_native_extension/collectors_thread_context.c
+++ b/ext/datadog_profiling_native_extension/collectors_thread_context.c
@@ -1763,14 +1763,14 @@ static uint64_t otel_span_id_to_uint(VALUE otel_span_id) {
 
   // This function can get called from outside the GVL and even on non-main Ractors
   __attribute__((warn_unused_result))
-  bool thread_context_collector_on_gvl_running_with_threshold(gvl_profiling_thread thread, uint32_t waiting_for_gvl_threshold_ns) {
+  on_gvl_running_result thread_context_collector_on_gvl_running_with_threshold(gvl_profiling_thread thread, uint32_t waiting_for_gvl_threshold_ns) {
    intptr_t gvl_waiting_at = gvl_profiling_state_get(thread);
 
     // Thread was not being profiled / not waiting on gvl
-    if (gvl_waiting_at == 0 || gvl_waiting_at == GVL_WAITING_ENABLED_EMPTY) return false;
+    if (gvl_waiting_at == 0 || gvl_waiting_at == GVL_WAITING_ENABLED_EMPTY) return ON_GVL_RUNNING_UNKNOWN;
 
     // @ivoanjo: I'm not sure if this can happen -- It means we should've sampled already but haven't gotten the chance yet?
-    if (gvl_waiting_at < 0) return true;
+    if (gvl_waiting_at < 0) return ON_GVL_RUNNING_SAMPLE;
 
     long waiting_for_gvl_duration_ns = monotonic_wall_time_now_ns(DO_NOT_RAISE_ON_FAILURE) - gvl_waiting_at;
 
@@ -1786,11 +1786,11 @@ static uint64_t otel_span_id_to_uint(VALUE otel_span_id) {
       gvl_profiling_state_set(thread, GVL_WAITING_ENABLED_EMPTY);
     }
 
-    return should_sample;
+    return should_sample ? ON_GVL_RUNNING_SAMPLE : ON_GVL_RUNNING_DONT_SAMPLE;
   }
 
   __attribute__((warn_unused_result))
-  bool thread_context_collector_on_gvl_running(gvl_profiling_thread thread) {
+  on_gvl_running_result thread_context_collector_on_gvl_running(gvl_profiling_thread thread) {
     return thread_context_collector_on_gvl_running_with_threshold(thread, global_waiting_for_gvl_threshold_ns);
   }
 
@@ -1980,7 +1980,7 @@ static uint64_t otel_span_id_to_uint(VALUE otel_span_id) {
   static VALUE _native_on_gvl_running(DDTRACE_UNUSED VALUE self, VALUE thread) {
     ENFORCE_THREAD(thread);
 
-    return thread_context_collector_on_gvl_running(thread_from_thread_object(thread)) ? Qtrue : Qfalse;
+    return thread_context_collector_on_gvl_running(thread_from_thread_object(thread)) == ON_GVL_RUNNING_SAMPLE ? Qtrue : Qfalse;
   }
 
   static VALUE _native_sample_after_gvl_running(DDTRACE_UNUSED VALUE self, VALUE collector_instance, VALUE thread) {
diff --git a/ext/datadog_profiling_native_extension/collectors_thread_context.h b/ext/datadog_profiling_native_extension/collectors_thread_context.h
index 1204e4fe97f..d17fe7d6ea9 100644
--- a/ext/datadog_profiling_native_extension/collectors_thread_context.h
+++ b/ext/datadog_profiling_native_extension/collectors_thread_context.h
@@ -18,7 +18,13 @@ __attribute__((warn_unused_result)) bool thread_context_collector_on_gc_finish(V
 VALUE enforce_thread_context_collector_instance(VALUE object);
 
 #ifndef NO_GVL_INSTRUMENTATION
+  typedef enum {
+    ON_GVL_RUNNING_UNKNOWN,     // Thread is not known, it may not even be from the current Ractor
+    ON_GVL_RUNNING_DONT_SAMPLE, // Thread is known, but "Waiting for GVL" period was too small to be sampled
+    ON_GVL_RUNNING_SAMPLE,      // Thread is known, and "Waiting for GVL" period should be sampled
+  } on_gvl_running_result;
+
   void thread_context_collector_on_gvl_waiting(gvl_profiling_thread thread);
-  __attribute__((warn_unused_result)) bool thread_context_collector_on_gvl_running(gvl_profiling_thread thread);
+  __attribute__((warn_unused_result)) on_gvl_running_result thread_context_collector_on_gvl_running(gvl_profiling_thread thread);
   VALUE thread_context_collector_sample_after_gvl_running(VALUE self_instance);
 #endif
diff --git a/spec/datadog/profiling/collectors/cpu_and_wall_time_worker_spec.rb b/spec/datadog/profiling/collectors/cpu_and_wall_time_worker_spec.rb
index 3b05165de74..43045da98a2 100644
--- a/spec/datadog/profiling/collectors/cpu_and_wall_time_worker_spec.rb
+++ b/spec/datadog/profiling/collectors/cpu_and_wall_time_worker_spec.rb
@@ -509,6 +509,32 @@
 
           expect(cpu_and_wall_time_worker.stats.fetch(:after_gvl_running)).to be > 0
         end
+
+        context "when 'Waiting for GVL' periods are below waiting_for_gvl_threshold_ns" do
+          let(:options) do
+            ten_seconds_as_ns = 10_000_000_000
+            collector = build_thread_context_collector(recorder, waiting_for_gvl_threshold_ns: ten_seconds_as_ns)
+
+            {thread_context_collector: collector}
+          end
+
+          it "does not trigger extra samples" do
+            background_thread_affected_by_gvl_contention
+            ready_queue_2.pop
+
+            start
+            wait_until_running
+
+            sleep 0.1
+            background_thread_affected_by_gvl_contention.kill
+
+            cpu_and_wall_time_worker.stop
+
+            expect(cpu_and_wall_time_worker.stats.fetch(:after_gvl_running)).to be 0
+            expect(cpu_and_wall_time_worker.stats.fetch(:gvl_dont_sample))
+              .to be > 100 # Arbitrary, on my machine I see 250k on a run
+          end
+        end
       end
     end
 
@@ -1137,6 +1163,7 @@
           allocation_sampler_snapshot: nil,
           allocations_during_sample: nil,
           after_gvl_running: 0,
+          gvl_dont_sample: 0,
         }
       )
     end
@@ -1417,11 +1444,12 @@ def build_another_instance
     described_class.new(**worker_settings)
   end
 
-  def build_thread_context_collector(recorder)
+  def build_thread_context_collector(recorder, **options)
     Datadog::Profiling::Collectors::ThreadContext.for_testing(
       recorder: recorder,
       endpoint_collection_enabled: endpoint_collection_enabled,
       timeline_enabled: timeline_enabled,
+      **options,
     )
   end

From da8af37a3f2a7bcc554c2e215a0fa5cc78558b18 Mon Sep 17 00:00:00 2001
From: Ivo Anjo
Date: Fri, 11 Oct 2024 10:51:34 +0100
Subject: [PATCH 2/6] Refactor: Introduce
 `rescued_after_gvl_running_from_postponed_job` instead of calling
 ThreadContext directly

This structure is similar to the one we're already using for
`sample_from_postponed_job` => `rescued_sample_from_postponed_job` and
enables us to have more control over the parameters sent to the
ThreadContext collector.
---
 .../collectors_cpu_and_wall_time_worker.c | 15 +++++++++++++--
 1 file changed, 13 insertions(+), 2 deletions(-)

diff --git a/ext/datadog_profiling_native_extension/collectors_cpu_and_wall_time_worker.c b/ext/datadog_profiling_native_extension/collectors_cpu_and_wall_time_worker.c
index 9f65dd66b37..638fbb4e0eb 100644
--- a/ext/datadog_profiling_native_extension/collectors_cpu_and_wall_time_worker.c
+++ b/ext/datadog_profiling_native_extension/collectors_cpu_and_wall_time_worker.c
@@ -232,6 +232,7 @@ static VALUE _native_resume_signals(DDTRACE_UNUSED VALUE self);
 static void on_gvl_event(rb_event_flag_t event_id, const rb_internal_thread_event_data_t *event_data, DDTRACE_UNUSED void *_unused);
 static void after_gvl_running_from_postponed_job(DDTRACE_UNUSED void *_unused);
 #endif
+static VALUE rescued_after_gvl_running_from_postponed_job(VALUE self_instance);
 static VALUE _native_gvl_profiling_hook_active(DDTRACE_UNUSED VALUE self, VALUE instance);
 
 // We're using `on_newobj_event` function with `rb_add_event_hook2`, which requires in its public signature a function
@@ -1356,11 +1357,21 @@ static VALUE _native_resume_signals(DDTRACE_UNUSED VALUE self) {
 
     state->during_sample = true;
 
-    safely_call(thread_context_collector_sample_after_gvl_running, state->thread_context_collector_instance, state->self_instance);
+    // Rescue against any exceptions that happen during sampling
+    safely_call(rescued_after_gvl_running_from_postponed_job, state->self_instance, state->self_instance);
+
+    state->during_sample = false;
+  }
+
+  static VALUE rescued_after_gvl_running_from_postponed_job(VALUE self_instance) {
+    struct cpu_and_wall_time_worker_state *state;
+    TypedData_Get_Struct(self_instance, struct cpu_and_wall_time_worker_state, &cpu_and_wall_time_worker_typed_data, state);
+
+    thread_context_collector_sample_after_gvl_running(state->thread_context_collector_instance);
 
     state->stats.after_gvl_running++;
 
-    state->during_sample = false;
+    return Qnil;
   }
 
   static VALUE _native_gvl_profiling_hook_active(DDTRACE_UNUSED VALUE self, VALUE instance) {

From 9cfdd0377ffc24a263e16a81770c21e58d4db8ad Mon Sep 17 00:00:00 2001
From: Ivo Anjo
Date: Fri, 11 Oct 2024 11:01:39 +0100
Subject: [PATCH 3/6] Refactor: Remove intermediate
 `thread_context_collector_sample_after_gvl_running` function

Now that the `CpuAndWallTimeWorker` does not call
`thread_context_collector_sample_after_gvl_running` directly from a
`rb_rescue`, it can use the version of the method with extra arguments
directly.
---
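Note for reviewers: the reason the rescue boundary forces single-argument
functions is that Ruby's `rb_rescue2` only forwards one `VALUE` to the
protected function, so any multi-argument call has to happen *inside* the
trampoline. A minimal standalone sketch of that shape (names below are
illustrative, not from this patch series; in this codebase the pattern is
wrapped by `safely_call`):

```c
#include <ruby.h>

// rb_rescue2 can only forward a single VALUE to the protected function, so
// anything called under the rescue boundary must pack its inputs into one
// argument (here: the instance itself, from which state is re-fetched).
static VALUE protected_sample(VALUE self_instance) {
  // ... sampling code that may raise goes here ...
  return Qnil;
}

// Exception handler: swallow the error so the VM keeps running.
static VALUE on_error(VALUE data, VALUE exception) {
  (void) data; (void) exception;
  return Qnil;
}

static VALUE call_with_rescue(VALUE self_instance) {
  // Exception class list is varargs, terminated by (VALUE) 0
  return rb_rescue2(protected_sample, self_instance, on_error, Qnil, rb_eException, (VALUE) 0);
}
```

With the trampoline owning that constraint, the inner sampling function is
free to grow extra parameters, which is exactly what the next patches do.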
 .../collectors_cpu_and_wall_time_worker.c |  2 +-
 .../collectors_thread_context.c           | 10 +++-------
 .../collectors_thread_context.h           |  2 +-
 3 files changed, 5 insertions(+), 9 deletions(-)

diff --git a/ext/datadog_profiling_native_extension/collectors_cpu_and_wall_time_worker.c b/ext/datadog_profiling_native_extension/collectors_cpu_and_wall_time_worker.c
index 638fbb4e0eb..0923fc40f59 100644
--- a/ext/datadog_profiling_native_extension/collectors_cpu_and_wall_time_worker.c
+++ b/ext/datadog_profiling_native_extension/collectors_cpu_and_wall_time_worker.c
@@ -1367,7 +1367,7 @@ static VALUE _native_resume_signals(DDTRACE_UNUSED VALUE self) {
     struct cpu_and_wall_time_worker_state *state;
     TypedData_Get_Struct(self_instance, struct cpu_and_wall_time_worker_state, &cpu_and_wall_time_worker_typed_data, state);
 
-    thread_context_collector_sample_after_gvl_running(state->thread_context_collector_instance);
+    thread_context_collector_sample_after_gvl_running(state->thread_context_collector_instance, rb_thread_current());
 
     state->stats.after_gvl_running++;
 
diff --git a/ext/datadog_profiling_native_extension/collectors_thread_context.c b/ext/datadog_profiling_native_extension/collectors_thread_context.c
index 6b79ed05b6c..cdf85f494f2 100644
--- a/ext/datadog_profiling_native_extension/collectors_thread_context.c
+++ b/ext/datadog_profiling_native_extension/collectors_thread_context.c
@@ -1822,7 +1822,7 @@ static uint64_t otel_span_id_to_uint(VALUE otel_span_id) {
   //
   // NOTE: In normal use, current_thread is expected to be == rb_thread_current(); the `current_thread` parameter only
   // exists to enable testing.
-  VALUE thread_context_collector_sample_after_gvl_running_with_thread(VALUE self_instance, VALUE current_thread) {
+  VALUE thread_context_collector_sample_after_gvl_running(VALUE self_instance, VALUE current_thread) {
     struct thread_context_collector_state *state;
     TypedData_Get_Struct(self_instance, struct thread_context_collector_state, &thread_context_collector_typed_data, state);
 
@@ -1857,11 +1857,7 @@ static uint64_t otel_span_id_to_uint(VALUE otel_span_id) {
       monotonic_wall_time_now_ns(RAISE_ON_FAILURE)
     );
 
-    return Qtrue; // To allow this to be called from rb_rescue2
-  }
-
-  VALUE thread_context_collector_sample_after_gvl_running(VALUE self_instance) {
-    return thread_context_collector_sample_after_gvl_running_with_thread(self_instance, rb_thread_current());
+    return Qtrue;
   }
 
   // This method is intended to be called from update_metrics_and_sample. It exists to handle extra sampling steps we
@@ -1986,7 +1982,7 @@ static uint64_t otel_span_id_to_uint(VALUE otel_span_id) {
   static VALUE _native_sample_after_gvl_running(DDTRACE_UNUSED VALUE self, VALUE collector_instance, VALUE thread) {
     ENFORCE_THREAD(thread);
 
-    return thread_context_collector_sample_after_gvl_running_with_thread(collector_instance, thread);
+    return thread_context_collector_sample_after_gvl_running(collector_instance, thread);
   }
 
   static VALUE _native_apply_delta_to_cpu_time_at_previous_sample_ns(DDTRACE_UNUSED VALUE self, VALUE collector_instance, VALUE thread, VALUE delta_ns) {
diff --git a/ext/datadog_profiling_native_extension/collectors_thread_context.h b/ext/datadog_profiling_native_extension/collectors_thread_context.h
index d17fe7d6ea9..994fb8762ef 100644
--- a/ext/datadog_profiling_native_extension/collectors_thread_context.h
+++ b/ext/datadog_profiling_native_extension/collectors_thread_context.h
@@ -26,5 +26,5 @@ VALUE enforce_thread_context_collector_instance(VALUE object);
 
   void thread_context_collector_on_gvl_waiting(gvl_profiling_thread thread);
   __attribute__((warn_unused_result)) on_gvl_running_result thread_context_collector_on_gvl_running(gvl_profiling_thread thread);
-  VALUE thread_context_collector_sample_after_gvl_running(VALUE self_instance);
+  VALUE thread_context_collector_sample_after_gvl_running(VALUE self_instance, VALUE current_thread);
 #endif

From d1965176af5d9be20eecde6ee621893a29f715bd Mon Sep 17 00:00:00 2001
From: Ivo Anjo
Date: Fri, 11 Oct 2024 11:05:22 +0100
Subject: [PATCH 4/6] Refactor: Pass in timestamp from CpuAndWallTimeWorker
 instead of measuring directly

This is similar to what we do for `thread_context_collector_sample` and
makes it easier for the CpuAndWallTimeWorker to keep stats on how long
sampling took, without reading the clock more often than necessary.
---
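Note for reviewers: a standalone sketch of the clock-handling shape being
adopted here (helper names are stand-ins, not the codebase's; in the real
code `monotonic_wall_time_now_ns` plays the clock-reading role, and the
resulting duration feeds the worker's stats):

```c
#include <stdint.h>
#include <time.h>

// Stand-in for monotonic_wall_time_now_ns: a single monotonic clock read.
static int64_t monotonic_now_ns(void) {
  struct timespec ts;
  clock_gettime(CLOCK_MONOTONIC, &ts);
  return ((int64_t) ts.tv_sec) * 1000000000 + ts.tv_nsec;
}

// The callee records the sample using the timestamp it is handed, instead
// of reading the clock again itself.
static void sample(int64_t current_monotonic_wall_time_ns) {
  (void) current_monotonic_wall_time_ns; // ... record sample at this time ...
}

static void sample_and_track_timing(void) {
  int64_t before_ns = monotonic_now_ns();
  sample(before_ns); // the "before" read doubles as the sample timestamp
  int64_t after_ns = monotonic_now_ns();
  // The caller only pays for one extra read to time the sampling itself
  int64_t sampling_time_ns = after_ns - before_ns;
  (void) sampling_time_ns; // fed into min/max/total stats in the next patch
}
```

Reading the clock once and passing the timestamp down means the caller's
before/after reads serve both as the sample timestamp and as the
sampling-duration measurement.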
 .../collectors_cpu_and_wall_time_worker.c |  3 ++-
 .../collectors_thread_context.c           | 10 +++++++---
 .../collectors_thread_context.h           |  2 +-
 3 files changed, 10 insertions(+), 5 deletions(-)

diff --git a/ext/datadog_profiling_native_extension/collectors_cpu_and_wall_time_worker.c b/ext/datadog_profiling_native_extension/collectors_cpu_and_wall_time_worker.c
index 0923fc40f59..da5e286bd7f 100644
--- a/ext/datadog_profiling_native_extension/collectors_cpu_and_wall_time_worker.c
+++ b/ext/datadog_profiling_native_extension/collectors_cpu_and_wall_time_worker.c
@@ -1367,7 +1367,8 @@ static VALUE _native_resume_signals(DDTRACE_UNUSED VALUE self) {
     struct cpu_and_wall_time_worker_state *state;
     TypedData_Get_Struct(self_instance, struct cpu_and_wall_time_worker_state, &cpu_and_wall_time_worker_typed_data, state);
 
-    thread_context_collector_sample_after_gvl_running(state->thread_context_collector_instance, rb_thread_current());
+    long wall_time_ns_before_sample = monotonic_wall_time_now_ns(RAISE_ON_FAILURE);
+    thread_context_collector_sample_after_gvl_running(state->thread_context_collector_instance, rb_thread_current(), wall_time_ns_before_sample);
 
     state->stats.after_gvl_running++;
 
diff --git a/ext/datadog_profiling_native_extension/collectors_thread_context.c b/ext/datadog_profiling_native_extension/collectors_thread_context.c
index cdf85f494f2..9f249602733 100644
--- a/ext/datadog_profiling_native_extension/collectors_thread_context.c
+++ b/ext/datadog_profiling_native_extension/collectors_thread_context.c
@@ -1822,7 +1822,7 @@ static uint64_t otel_span_id_to_uint(VALUE otel_span_id) {
   //
   // NOTE: In normal use, current_thread is expected to be == rb_thread_current(); the `current_thread` parameter only
   // exists to enable testing.
-  VALUE thread_context_collector_sample_after_gvl_running(VALUE self_instance, VALUE current_thread) {
+  VALUE thread_context_collector_sample_after_gvl_running(VALUE self_instance, VALUE current_thread, long current_monotonic_wall_time_ns) {
     struct thread_context_collector_state *state;
     TypedData_Get_Struct(self_instance, struct thread_context_collector_state, &thread_context_collector_typed_data, state);
 
@@ -1854,7 +1854,7 @@ static uint64_t otel_span_id_to_uint(VALUE otel_span_id) {
       thread_context,
       thread_context->sampling_buffer,
       cpu_time_for_thread,
-      monotonic_wall_time_now_ns(RAISE_ON_FAILURE)
+      current_monotonic_wall_time_ns
     );
 
     return Qtrue;
@@ -1982,7 +1982,11 @@ static uint64_t otel_span_id_to_uint(VALUE otel_span_id) {
   static VALUE _native_sample_after_gvl_running(DDTRACE_UNUSED VALUE self, VALUE collector_instance, VALUE thread) {
     ENFORCE_THREAD(thread);
 
-    return thread_context_collector_sample_after_gvl_running(collector_instance, thread);
+    return thread_context_collector_sample_after_gvl_running(
+      collector_instance,
+      thread,
+      monotonic_wall_time_now_ns(RAISE_ON_FAILURE)
+    );
   }
 
   static VALUE _native_apply_delta_to_cpu_time_at_previous_sample_ns(DDTRACE_UNUSED VALUE self, VALUE collector_instance, VALUE thread, VALUE delta_ns) {
diff --git a/ext/datadog_profiling_native_extension/collectors_thread_context.h b/ext/datadog_profiling_native_extension/collectors_thread_context.h
index 994fb8762ef..9a10f0297b3 100644
--- a/ext/datadog_profiling_native_extension/collectors_thread_context.h
+++ b/ext/datadog_profiling_native_extension/collectors_thread_context.h
@@ -26,5 +26,5 @@ VALUE enforce_thread_context_collector_instance(VALUE object);
 
   void thread_context_collector_on_gvl_waiting(gvl_profiling_thread thread);
   __attribute__((warn_unused_result)) on_gvl_running_result thread_context_collector_on_gvl_running(gvl_profiling_thread thread);
-  VALUE thread_context_collector_sample_after_gvl_running(VALUE self_instance, VALUE current_thread);
+  VALUE thread_context_collector_sample_after_gvl_running(VALUE self_instance, VALUE current_thread, long current_monotonic_wall_time_ns);
 #endif

From 7bba03eebbc3093231c922fa0856109e574f1d7b Mon Sep 17 00:00:00 2001
From: Ivo Anjo
Date: Fri, 11 Oct 2024 11:26:15 +0100
Subject: [PATCH 5/6] Introduce internal stats to track time spent in extra
 GVL profiling samples

---
 docs/ProfilingDevelopment.md              |  6 ++--
 .../collectors_cpu_and_wall_time_worker.c | 28 ++++++++++++++++---
 .../cpu_and_wall_time_worker_spec.rb      | 28 +++++++++++++++++--
 3 files changed, 53 insertions(+), 9 deletions(-)

diff --git a/docs/ProfilingDevelopment.md b/docs/ProfilingDevelopment.md
index 309bac05f57..26fa6cd66c5 100644
--- a/docs/ProfilingDevelopment.md
+++ b/docs/ProfilingDevelopment.md
@@ -209,9 +209,9 @@ from the VM callbacks and also messing with cpu/wall-time accounting for threads
 
 The `ThreadContext` collector exposes three APIs for GVL profiling:
 
-* `thread_context_collector_on_gvl_waiting(VALUE thread)`
-* `thread_context_collector_on_gvl_running(VALUE thread)`
-* `thread_context_collector_sample_after_gvl_running(VALUE self_instance)`
+* `thread_context_collector_on_gvl_waiting`
+* `thread_context_collector_on_gvl_running`
+* `thread_context_collector_sample_after_gvl_running`
 
 The intuition here is that:
 
diff --git a/ext/datadog_profiling_native_extension/collectors_cpu_and_wall_time_worker.c b/ext/datadog_profiling_native_extension/collectors_cpu_and_wall_time_worker.c
index da5e286bd7f..7dbecfde7bd 100644
--- a/ext/datadog_profiling_native_extension/collectors_cpu_and_wall_time_worker.c
+++ b/ext/datadog_profiling_native_extension/collectors_cpu_and_wall_time_worker.c
@@ -182,6 +182,10 @@ struct cpu_and_wall_time_worker_state {
     unsigned int after_gvl_running;
     // How many times we skipped the after_gvl_running sampling
     unsigned int gvl_dont_sample;
+    // Min/max/total wall-time spent on gvl sampling
+    uint64_t gvl_sampling_time_ns_min;
+    uint64_t gvl_sampling_time_ns_max;
+    uint64_t gvl_sampling_time_ns_total;
   } stats;
 };
 
@@ -1045,8 +1049,12 @@ static VALUE _native_stats(DDTRACE_UNUSED VALUE self, VALUE instance) {
     ID2SYM(rb_intern("allocations_during_sample")), /* => */ state->allocation_profiling_enabled ? UINT2NUM(state->stats.allocations_during_sample) : Qnil,
 
     // GVL profiling stats
-    ID2SYM(rb_intern("after_gvl_running")), /* => */ UINT2NUM(state->stats.after_gvl_running),
-    ID2SYM(rb_intern("gvl_dont_sample")), /* => */ UINT2NUM(state->stats.gvl_dont_sample),
+    ID2SYM(rb_intern("after_gvl_running")),          /* => */ UINT2NUM(state->stats.after_gvl_running),
+    ID2SYM(rb_intern("gvl_dont_sample")),            /* => */ UINT2NUM(state->stats.gvl_dont_sample),
+    ID2SYM(rb_intern("gvl_sampling_time_ns_min")),   /* => */ RUBY_NUM_OR_NIL(state->stats.gvl_sampling_time_ns_min, != UINT64_MAX, ULL2NUM),
+    ID2SYM(rb_intern("gvl_sampling_time_ns_max")),   /* => */ RUBY_NUM_OR_NIL(state->stats.gvl_sampling_time_ns_max, > 0, ULL2NUM),
+    ID2SYM(rb_intern("gvl_sampling_time_ns_total")), /* => */ RUBY_NUM_OR_NIL(state->stats.gvl_sampling_time_ns_total, > 0, ULL2NUM),
+    ID2SYM(rb_intern("gvl_sampling_time_ns_avg")),   /* => */ RUBY_AVG_OR_NIL(state->stats.gvl_sampling_time_ns_total, state->stats.after_gvl_running),
   };
   for (long unsigned int i = 0; i < VALUE_COUNT(arguments); i += 2) rb_hash_aset(stats_as_hash, arguments[i], arguments[i+1]);
   return stats_as_hash;
@@ -1084,8 +1092,10 @@ static void reset_stats_not_thread_safe(struct cpu_and_wall_time_worker_state *s
   // Given the expected infrequency of resetting (~once per 60s profile) and the auxiliary/non-critical nature of these stats
   // this momentary loss of accuracy is deemed acceptable to keep overhead to a minimum.
   state->stats = (struct stats) {
-    .cpu_sampling_time_ns_min = UINT64_MAX, // Since we always take the min between existing and latest sample
-    .allocation_sampling_time_ns_min = UINT64_MAX, // Since we always take the min between existing and latest sample
+    // All these values are initialized to their highest value possible since we always take the min between existing and latest sample
+    .cpu_sampling_time_ns_min = UINT64_MAX,
+    .allocation_sampling_time_ns_min = UINT64_MAX,
+    .gvl_sampling_time_ns_min = UINT64_MAX,
  };
 }
 
@@ -1369,6 +1379,16 @@ static VALUE _native_resume_signals(DDTRACE_UNUSED VALUE self) {
 
     long wall_time_ns_before_sample = monotonic_wall_time_now_ns(RAISE_ON_FAILURE);
     thread_context_collector_sample_after_gvl_running(state->thread_context_collector_instance, rb_thread_current(), wall_time_ns_before_sample);
+    long wall_time_ns_after_sample = monotonic_wall_time_now_ns(RAISE_ON_FAILURE);
+
+    long delta_ns = wall_time_ns_after_sample - wall_time_ns_before_sample;
+
+    // Guard against wall-time going backwards, see https://github.com/DataDog/dd-trace-rb/pull/2336 for discussion.
+    uint64_t sampling_time_ns = delta_ns < 0 ? 0 : delta_ns;
+
+    state->stats.gvl_sampling_time_ns_min = uint64_min_of(sampling_time_ns, state->stats.gvl_sampling_time_ns_min);
+    state->stats.gvl_sampling_time_ns_max = uint64_max_of(sampling_time_ns, state->stats.gvl_sampling_time_ns_max);
+    state->stats.gvl_sampling_time_ns_total += sampling_time_ns;
 
     state->stats.after_gvl_running++;
 
diff --git a/spec/datadog/profiling/collectors/cpu_and_wall_time_worker_spec.rb b/spec/datadog/profiling/collectors/cpu_and_wall_time_worker_spec.rb
index 43045da98a2..3626af65b7e 100644
--- a/spec/datadog/profiling/collectors/cpu_and_wall_time_worker_spec.rb
+++ b/spec/datadog/profiling/collectors/cpu_and_wall_time_worker_spec.rb
@@ -507,7 +507,15 @@
           expect(waiting_for_gvl_time).to be_within(5).percent_of(total_time), \
             "Expected waiting_for_gvl_time to be close to total_time, debug_failures: #{debug_failures}"
 
-          expect(cpu_and_wall_time_worker.stats.fetch(:after_gvl_running)).to be > 0
+          expect(cpu_and_wall_time_worker.stats).to match(
+            hash_including(
+              after_gvl_running: be > 0,
+              gvl_sampling_time_ns_min: be > 0,
+              gvl_sampling_time_ns_max: be > 0,
+              gvl_sampling_time_ns_total: be > 0,
+              gvl_sampling_time_ns_avg: be > 0,
+            )
+          )
         end
 
         context "when 'Waiting for GVL' periods are below waiting_for_gvl_threshold_ns" do
@@ -530,9 +538,21 @@
 
             cpu_and_wall_time_worker.stop
 
-            expect(cpu_and_wall_time_worker.stats.fetch(:after_gvl_running)).to be 0
+            # Note: There may still be "Waiting for GVL" samples in the output, but these samples will come from the
+            # periodic cpu/wall-sampling, not samples directly triggered by the end of a "Waiting for GVL" period.
+
             expect(cpu_and_wall_time_worker.stats.fetch(:gvl_dont_sample))
               .to be > 100 # Arbitrary, on my machine I see 250k on a run
+
+            expect(cpu_and_wall_time_worker.stats).to match(
+              hash_including(
+                after_gvl_running: 0,
+                gvl_sampling_time_ns_min: nil,
+                gvl_sampling_time_ns_max: nil,
+                gvl_sampling_time_ns_total: nil,
+                gvl_sampling_time_ns_avg: nil,
+              )
+            )
           end
         end
       end
@@ -1164,6 +1184,10 @@
           allocations_during_sample: nil,
           after_gvl_running: 0,
           gvl_dont_sample: 0,
+          gvl_sampling_time_ns_min: nil,
+          gvl_sampling_time_ns_max: nil,
+          gvl_sampling_time_ns_total: nil,
+          gvl_sampling_time_ns_avg: nil,
         }
       )
     end

From b2b331b0176d8b607a7dad547375354dd2247ee2 Mon Sep 17 00:00:00 2001
From: Ivo Anjo
Date: Fri, 11 Oct 2024 12:28:36 +0100
Subject: [PATCH 6/6] Relax expectation on spec

Flaked in CI with...

```
1) Datadog::Profiling::Collectors::CpuAndWallTimeWorker#start
   when main thread is sleeping but a background thread is working
   when GVL profiling is enabled
   when 'Waiting for GVL' periods are below waiting_for_gvl_threshold_ns
   does not trigger extra samples

   Failure/Error:
     expect(cpu_and_wall_time_worker.stats.fetch(:gvl_dont_sample))
       .to be > 100 # Arbitrary, on my machine I see 250k on a run

     expected: > 100
          got:   5
```
---
 .../profiling/collectors/cpu_and_wall_time_worker_spec.rb | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/spec/datadog/profiling/collectors/cpu_and_wall_time_worker_spec.rb b/spec/datadog/profiling/collectors/cpu_and_wall_time_worker_spec.rb
index 3626af65b7e..1960c4bb1bf 100644
--- a/spec/datadog/profiling/collectors/cpu_and_wall_time_worker_spec.rb
+++ b/spec/datadog/profiling/collectors/cpu_and_wall_time_worker_spec.rb
@@ -541,8 +541,7 @@
             # Note: There may still be "Waiting for GVL" samples in the output, but these samples will come from the
             # periodic cpu/wall-sampling, not samples directly triggered by the end of a "Waiting for GVL" period.
 
-            expect(cpu_and_wall_time_worker.stats.fetch(:gvl_dont_sample))
-              .to be > 100 # Arbitrary, on my machine I see 250k on a run
+            expect(cpu_and_wall_time_worker.stats.fetch(:gvl_dont_sample)).to be > 0
 
             expect(cpu_and_wall_time_worker.stats).to match(
               hash_including(