[PROF-7252] Add Profiling.allocation_count API for new profiler
**What does this PR do?**:

This PR adds a new profiler public API:
`Datadog::Profiling.allocation_count`.

The public documentation for this API is as follows:

> Returns an ever-increasing counter of the number of allocations
> observed by the profiler in this thread.
>
> Note 1: This counter may not start from zero on new threads. It
> should only be used to measure how many
> allocations have happened between two calls to this API:
> ```ruby
> allocations_before = Datadog::Profiling.allocation_count
> do_some_work()
> allocations_after = Datadog::Profiling.allocation_count
> puts "Allocations during do_some_work: #{allocations_after - allocations_before}"
> ```
> (This is similar to some OS-based time representations.)
>
> Note 2: All fibers in the same thread will share the same counter
> values.
>
> Only available when the profiler is running, the new CPU Profiling
> 2.0 profiler is in use, and allocation-related
> features are not disabled via configuration.
> For instructions on enabling CPU Profiling 2.0 see the ddtrace
> release notes.
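
To make Note 2 a bit more concrete: each thread gets its own counter, while
fibers within a thread share it. Here is a small illustrative sketch
(hypothetical code, not part of this PR, and assuming the profiler is running
with CPU Profiling 2.0), mirroring the behavior exercised by the new specs
below:

```ruby
# Two threads allocating concurrently: each thread's counter only reflects its own
# allocations (fibers inside a thread would share that thread's counter).
deltas = Array.new(2) do
  Thread.new do
    before = Datadog::Profiling.allocation_count
    1_000.times { Object.new }
    Datadog::Profiling.allocation_count - before
  end
end.map(&:value)

# Each delta is roughly 1_000 (plus a bit of profiler overhead), independent of what
# the other thread allocated in the meantime.
puts deltas.inspect
```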

As long as CPU Profiling 2.0 is in use, this API is enabled by
default. To disable it, this PR adds a new setting:

```ruby
Datadog.configure do |c|
  c.profiling.advanced.allocation_counting_enabled = # ...
end
```
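
When the API is unavailable (for example, the profiler is not running),
`Datadog::Profiling.allocation_count` returns `nil`, as the spec changes below
verify, so callers should guard on that. A hypothetical consumer-side sketch,
not part of this PR:

```ruby
# Hypothetical helper: measure allocations around a block, degrading gracefully
# when the counter is unavailable (nil).
def allocations_during
  before = Datadog::Profiling.allocation_count
  return yield if before.nil?

  result = yield
  puts "Allocations: #{Datadog::Profiling.allocation_count - before}"
  result
end

allocations_during { 50.times { Object.new } }
```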

**Motivation**:

This feature has long been something we've wanted to provide in
ddtrace; see issues #2164 and #468, as well as PRs #1891, #1805,
and #597.

As part of the ongoing work on enabling allocation profiling,
counting allocations is very cheap: the profiler needs to install a
`RUBY_INTERNAL_EVENT_NEWOBJ` tracepoint anyway, so it's just a
matter of also incrementing a counter inside it.

**Additional Notes**:

Note that this does not yet change any user-visible behavior in
ddtrace. I'm counting on @marcotc to pick up the task of using this
API to make some tracing magic :)

**How to test the change?**:

This change includes code coverage.
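
The new examples live in the spec file touched by this PR; something along
these lines should run them locally (assuming the profiling native extension
is already built for your Ruby):

```
bundle exec rspec spec/datadog/profiling/collectors/cpu_and_wall_time_worker_spec.rb -e '_native_allocation_count'
```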

---

Fixes #2164
ivoanjo committed Feb 20, 2023
1 parent ec2076a commit 55ebe2c
Showing 3 changed files with 101 additions and 14 deletions.
@@ -80,6 +80,7 @@ struct cpu_and_wall_time_worker_state {
atomic_bool should_run;

bool gc_profiling_enabled;
bool allocation_counting_enabled;
VALUE self_instance;
VALUE cpu_and_wall_time_collector_instance;
VALUE idle_sampling_helper_instance;
@@ -95,6 +96,8 @@ struct cpu_and_wall_time_worker_state {
// Used to get gc start/finish information
VALUE gc_tracepoint;

VALUE object_allocation_tracepoint;

struct stats {
// How many times we tried to trigger a sample
unsigned int trigger_sample_attempts;
@@ -155,6 +158,8 @@ static void grab_gvl_and_sample(void);
static void reset_stats(struct cpu_and_wall_time_worker_state *state);
static void sleep_for(uint64_t time_ns);
static VALUE _native_allocation_count(DDTRACE_UNUSED VALUE self);
static void on_newobj_event(DDTRACE_UNUSED VALUE tracepoint_data, DDTRACE_UNUSED void *unused);
static void disable_tracepoints(struct cpu_and_wall_time_worker_state *state);

// Note on sampler global state safety:
//
@@ -165,7 +170,14 @@ static VALUE _native_allocation_count(DDTRACE_UNUSED VALUE self);
// This global state is needed because a bunch of functions on this file need to access it from situations
// (e.g. signal handler) where it's impossible or just awkward to pass it as an argument.
static VALUE active_sampler_instance = Qnil;
struct cpu_and_wall_time_worker_state *active_sampler_instance_state = NULL;
static struct cpu_and_wall_time_worker_state *active_sampler_instance_state = NULL;

// Used to implement CpuAndWallTimeWorker._native_allocation_count . To be able to use cheap thread-local variables
// (here with `__thread`, see https://gcc.gnu.org/onlinedocs/gcc/Thread-Local.html), this needs to be global.
//
// Carryover of state between profiler instances can happen and is not considered to be a problem -- see expectations for this
// API documented in profiling.rb .
__thread uint64_t allocation_count = 0;

void collectors_cpu_and_wall_time_worker_init(VALUE profiling_module) {
rb_global_variable(&active_sampler_instance);
@@ -220,13 +232,15 @@ static VALUE _native_new(VALUE klass) {

atomic_init(&state->should_run, false);
state->gc_profiling_enabled = false;
state->allocation_counting_enabled = false;
state->cpu_and_wall_time_collector_instance = Qnil;
state->idle_sampling_helper_instance = Qnil;
state->owner_thread = Qnil;
dynamic_sampling_rate_init(&state->dynamic_sampling_rate);
state->failure_exception = Qnil;
state->stop_thread = Qnil;
state->gc_tracepoint = Qnil;
state->object_allocation_tracepoint = Qnil;
reset_stats(state);

return state->self_instance = TypedData_Wrap_Struct(klass, &cpu_and_wall_time_worker_typed_data, state);
@@ -238,17 +252,20 @@ static VALUE _native_initialize(
VALUE cpu_and_wall_time_collector_instance,
VALUE gc_profiling_enabled,
VALUE idle_sampling_helper_instance,
DDTRACE_UNUSED VALUE allocation_counting_enabled
VALUE allocation_counting_enabled
) {
ENFORCE_BOOLEAN(gc_profiling_enabled);
ENFORCE_BOOLEAN(allocation_counting_enabled);

struct cpu_and_wall_time_worker_state *state;
TypedData_Get_Struct(self_instance, struct cpu_and_wall_time_worker_state, &cpu_and_wall_time_worker_typed_data, state);

state->gc_profiling_enabled = (gc_profiling_enabled == Qtrue);
state->allocation_counting_enabled = (allocation_counting_enabled == Qtrue);
state->cpu_and_wall_time_collector_instance = enforce_cpu_and_wall_time_collector_instance(cpu_and_wall_time_collector_instance);
state->idle_sampling_helper_instance = idle_sampling_helper_instance;
state->gc_tracepoint = rb_tracepoint_new(Qnil, RUBY_INTERNAL_EVENT_GC_ENTER | RUBY_INTERNAL_EVENT_GC_EXIT, on_gc_event, NULL /* unused */);
state->object_allocation_tracepoint = rb_tracepoint_new(Qnil, RUBY_INTERNAL_EVENT_NEWOBJ, on_newobj_event, NULL /* unused */);

return Qtrue;
}
@@ -263,6 +280,7 @@ static void cpu_and_wall_time_worker_typed_data_mark(void *state_ptr) {
rb_gc_mark(state->failure_exception);
rb_gc_mark(state->stop_thread);
rb_gc_mark(state->gc_tracepoint);
rb_gc_mark(state->object_allocation_tracepoint);
}

// Called in a background thread created in CpuAndWallTimeWorker#start
@@ -279,14 +297,13 @@ static VALUE _native_sampling_loop(DDTRACE_UNUSED VALUE _self, VALUE instance) {
);
} else {
// The previously active thread seems to have died without cleaning up after itself.
// In this case, we can still go ahead and start the profiler BUT we make sure to disable any existing GC tracepoint
// In this case, we can still go ahead and start the profiler BUT we make sure to disable any existing tracepoint
// first as:
// a) If this is a new instance of the CpuAndWallTimeWorker, we don't want the tracepoint from the old instance
// being kept around
// b) If this is the same instance of the CpuAndWallTimeWorker if we call enable on a tracepoint that is already
// enabled, it will start firing more than once, see https://bugs.ruby-lang.org/issues/19114 for details.

rb_tracepoint_disable(old_state->gc_tracepoint);
disable_tracepoints(old_state);
}
}

@@ -312,7 +329,7 @@ static VALUE _native_sampling_loop(DDTRACE_UNUSED VALUE _self, VALUE instance) {

// The sample trigger loop finished (either cleanly or with an error); let's clean up

rb_tracepoint_disable(state->gc_tracepoint);
disable_tracepoints(state);

active_sampler_instance_state = NULL;
active_sampler_instance = Qnil;
@@ -364,8 +381,8 @@ static VALUE stop(VALUE self_instance, VALUE optional_exception) {
atomic_store(&state->should_run, false);
state->failure_exception = optional_exception;

// Disable the GC tracepoint as soon as possible, so the VM doesn't keep on calling it
rb_tracepoint_disable(state->gc_tracepoint);
// Disable the tracepoints as soon as possible, so the VM doesn't keep on calling them
disable_tracepoints(state);

return Qtrue;
}
@@ -532,10 +549,11 @@ static VALUE release_gvl_and_run_sampling_trigger_loop(VALUE instance) {
struct cpu_and_wall_time_worker_state *state;
TypedData_Get_Struct(instance, struct cpu_and_wall_time_worker_state, &cpu_and_wall_time_worker_typed_data, state);

// Final preparations: Setup signal handler and enable tracepoint. We run these here and not in `_native_sampling_loop`
// Final preparations: Setup signal handler and enable tracepoints. We run these here and not in `_native_sampling_loop`
// because they may raise exceptions.
install_sigprof_signal_handler(handle_sampling_signal, "handle_sampling_signal");
if (state->gc_profiling_enabled) rb_tracepoint_enable(state->gc_tracepoint);
if (state->allocation_counting_enabled) rb_tracepoint_enable(state->object_allocation_tracepoint);

rb_thread_call_without_gvl(run_sampling_trigger_loop, state, interrupt_sampling_trigger_loop, state);

@@ -699,7 +717,7 @@ static VALUE _native_reset_after_fork(DDTRACE_UNUSED VALUE self, VALUE instance)
TypedData_Get_Struct(instance, struct cpu_and_wall_time_worker_state, &cpu_and_wall_time_worker_typed_data, state);

// Disable all tracepoints, so that there are no more attempts to mutate the profile
rb_tracepoint_disable(state->gc_tracepoint);
disable_tracepoints(state);

reset_stats(state);

@@ -781,6 +799,23 @@ static void sleep_for(uint64_t time_ns) {
}

static VALUE _native_allocation_count(DDTRACE_UNUSED VALUE self) {
  // TODO
  return Qnil;
  bool is_profiler_running = active_sampler_instance_state != NULL;

  return is_profiler_running ? ULL2NUM(allocation_count) : Qnil;
}

// Implements memory-related profiling events. This function is called by Ruby via the `object_allocation_tracepoint`
// when the RUBY_INTERNAL_EVENT_NEWOBJ event is triggered.
static void on_newobj_event(DDTRACE_UNUSED VALUE tracepoint_data, DDTRACE_UNUSED void *unused) {
  // Update thread-local allocation count
  if (RB_UNLIKELY(allocation_count == UINT64_MAX)) {
    allocation_count = 0;
  } else {
    allocation_count++;
  }
}

static void disable_tracepoints(struct cpu_and_wall_time_worker_state *state) {
  rb_tracepoint_disable(state->gc_tracepoint);
  rb_tracepoint_disable(state->object_allocation_tracepoint);
}
1 change: 1 addition & 0 deletions lib/datadog/profiling.rb
@@ -51,6 +51,7 @@ def self.start_if_enabled
# allocations_after = Datadog::Profiling.allocation_count
# puts "Allocations during do_some_work: #{allocations_after - allocations_before}"
# ```
# (This is similar to some OS-based time representations.)
#
# Note 2: All fibers in the same thread will share the same counter values.
#
55 changes: 53 additions & 2 deletions spec/datadog/profiling/collectors/cpu_and_wall_time_worker_spec.rb
@@ -525,8 +525,59 @@
describe '._native_allocation_count' do
  subject(:_native_allocation_count) { described_class._native_allocation_count }

  # TODO
  it { is_expected.to be nil }
  context 'when CpuAndWallTimeWorker has not been started' do
    it { is_expected.to be nil }
  end

  context 'when CpuAndWallTimeWorker has been started' do
    before do
      cpu_and_wall_time_worker.start
      wait_until_running
    end

    after do
      cpu_and_wall_time_worker.stop
    end

    it 'returns the number of allocations between two calls of the method' do
      before_allocations = described_class._native_allocation_count
      100.times { Object.new }
      after_allocations = described_class._native_allocation_count

      # The profiler can (currently) cause extra allocations, which is why this is not exactly 100
      expect(after_allocations - before_allocations).to be >= 100
      expect(after_allocations - before_allocations).to be < 110
    end

    it 'returns different numbers of allocations for different threads' do
      t1_can_run = Queue.new
      t1_has_run = Queue.new
      before_t1 = nil
      after_t1 = nil

      background_t1 = Thread.new do
        before_t1 = described_class._native_allocation_count
        t1_can_run.pop

        100.times { Object.new }
        after_t1 = described_class._native_allocation_count
        t1_has_run << true
      end

      before_allocations = described_class._native_allocation_count
      t1_can_run << true
      t1_has_run.pop
      after_allocations = described_class._native_allocation_count

      background_t1.join

      # This test checks that even though we observed >= 100 allocations in a background thread t1, the counters for
      # the current thread were not affected by this change (even though they may be slightly affected by the profiler)

      expect(after_t1 - before_t1).to be >= 100
      expect(after_allocations - before_allocations).to be < 10
    end
  end
end

def wait_until_running
