Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[NO-TICKET] Redesign GC profiling to add timeline support and reduce overhead #3313

Merged
merged 21 commits into from
Dec 14, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
21 commits
Select commit Hold shift + click to select a range
f3da457
Refactor placeholder stack recording to be reusable
ivoanjo Dec 4, 2023
3bca251
Garbage Collection samples no longer collect stacks
ivoanjo Dec 4, 2023
1f5926b
Remove record_buffer/extra_frames_in_record_buffer abstraction
ivoanjo Dec 5, 2023
6f922aa
Extract `record_placeholder_stack_in_native_code` again
ivoanjo Dec 5, 2023
7ce8d90
Remove old comment
ivoanjo Dec 5, 2023
c376568
Add timeline event type to stack recorder
ivoanjo Dec 6, 2023
14d9789
Introduce gc profiling helper
ivoanjo Dec 8, 2023
c64d78a
Minor: Add long_max_of/long_min_of helpers
ivoanjo Dec 8, 2023
0ac923a
[NO-TICKET] Redesign GC profiling to add timeline support and reduce …
ivoanjo Dec 8, 2023
a2c8913
Remove `SAMPLE_IN_GC` enum entry
ivoanjo Dec 8, 2023
ab39dce
Remove `sample_type` enum
ivoanjo Dec 8, 2023
f3a5c97
Fix rubocop nitpick
ivoanjo Dec 8, 2023
b78d772
Tweak code to avoid triggering warnings on the buggy gcc that our tra…
ivoanjo Dec 8, 2023
d97f4dc
Tweak spec to avoid flakiness
ivoanjo Dec 8, 2023
0127e3a
Update outdated comment
ivoanjo Dec 8, 2023
e8d7b4e
Reorder steps in on_gc_finish as suggested during PR review
ivoanjo Dec 11, 2023
c6b791d
Add explanation for why we add both cpu/wall-time duration as well as…
ivoanjo Dec 11, 2023
2265fc3
Avoid duplicate boilerplate when calling `record_placeholder_stack`
ivoanjo Dec 11, 2023
bfdd1b7
Minor fix to spec after branch rebase on top of master
ivoanjo Dec 13, 2023
5f42107
Fix spec flakyness
ivoanjo Dec 13, 2023
a3bdd74
Fix another flaky spec
ivoanjo Dec 13, 2023
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion benchmarks/profiler_sample_loop_v2.rb
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,8 @@ def create_profiler
@recorder = Datadog::Profiling::StackRecorder.new(
cpu_time_enabled: true,
alloc_samples_enabled: false,
heap_samples_enabled: false
heap_samples_enabled: false,
timeline_enabled: false,
)
@collector = Datadog::Profiling::Collectors::ThreadContext.new(
recorder: @recorder, max_frames: 400, tracer: nil, endpoint_collection_enabled: false, timeline_enabled: false
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -718,28 +718,11 @@ static void on_gc_event(VALUE tracepoint_data, DDTRACE_UNUSED void *unused) {
if (event == RUBY_INTERNAL_EVENT_GC_ENTER) {
thread_context_collector_on_gc_start(state->thread_context_collector_instance);
} else if (event == RUBY_INTERNAL_EVENT_GC_EXIT) {
// Design: In an earlier iteration of this feature (see https://github.com/DataDog/dd-trace-rb/pull/2308) we
// actually had a single method to implement the behavior of both thread_context_collector_on_gc_finish
// and thread_context_collector_sample_after_gc (the latter is called via after_gc_from_postponed_job).
//
// Unfortunately, then we discovered the safety issue around no allocations, and thus decided to separate them -- so that
// the sampling could run outside the tight safety constraints of the garbage collection process.
//
// There is a downside: The sample is now taken very very shortly afterwards the GC finishes, and not immediately
// as the GC finishes, which means the stack captured may by affected by "skid", e.g. point slightly after where
// it should be pointing at.
// Alternatives to solve this would be to capture no stack for garbage collection (as we do for Java and .net);
// making the sampling process allocation-safe (very hard); or separate stack sampling from sample recording,
// e.g. enabling us to capture the stack in thread_context_collector_on_gc_finish and do the rest later
// (medium hard).

thread_context_collector_on_gc_finish(state->thread_context_collector_instance);
// We use rb_postponed_job_register_one to ask Ruby to run thread_context_collector_sample_after_gc after if
// fully finishes the garbage collection, so that one is allowed to do allocations and throw exceptions as usual.
//
// Note: If we ever want to get rid of rb_postponed_job_register_one, remember not to clobber Ruby exceptions, as
// this function does this helpful job for us now -- https://github.com/ruby/ruby/commit/a98e343d39c4d7bf1e2190b076720f32d9f298b3.
rb_postponed_job_register_one(0, after_gc_from_postponed_job, NULL);
bool should_flush = thread_context_collector_on_gc_finish(state->thread_context_collector_instance);

// We use rb_postponed_job_register_one to ask Ruby to run thread_context_collector_sample_after_gc when the
// thread collector flags it's time to flush.
if (should_flush) rb_postponed_job_register_one(0, after_gc_from_postponed_job, NULL);
}
}

Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,156 @@
#include <ruby.h>
#include <datadog/profiling.h>

#include "collectors_gc_profiling_helper.h"

// This helper is used by the Datadog::Profiling::Collectors::ThreadContext to profile garbage collection.
// It's tested through that class' interfaces.
// ---

// Used when retrieving GC information from the VM.
// All these are symbols, but we don't need to mark them since we ask for them to be interned (and thus live forever)
static VALUE state_sym;
static VALUE marking_sym;
static VALUE sweeping_sym;
static VALUE none_sym;
static VALUE gc_by_sym;
static VALUE newobj_sym;
static VALUE malloc_sym;
static VALUE method_sym;
static VALUE capi_sym;
static VALUE stress_sym;
static VALUE major_by_sym;
static VALUE nofree_sym;
static VALUE oldgen_sym;
static VALUE shady_sym;
static VALUE force_sym;
static VALUE oldmalloc_sym;

static ddog_CharSlice major_gc_reason_pretty(VALUE major_gc_reason);
static ddog_CharSlice gc_cause_pretty(VALUE gc_cause);
static ddog_CharSlice gc_type_pretty(VALUE major_gc_reason, VALUE gc_state);

void gc_profiling_init(void) {
// This function lazy-interns a few constants, which may trigger allocations. Since we want to call it during GC as
// well, when allocations are not allowed, we call it once here so that the constants get defined ahead of time.
rb_gc_latest_gc_info(rb_hash_new());

// Used to query and look up the results of GC information
state_sym = ID2SYM(rb_intern_const("state"));
marking_sym = ID2SYM(rb_intern_const("marking"));
sweeping_sym = ID2SYM(rb_intern_const("sweeping"));
none_sym = ID2SYM(rb_intern_const("none"));
gc_by_sym = ID2SYM(rb_intern_const("gc_by"));
newobj_sym = ID2SYM(rb_intern_const("newobj"));
malloc_sym = ID2SYM(rb_intern_const("malloc"));
method_sym = ID2SYM(rb_intern_const("method"));
capi_sym = ID2SYM(rb_intern_const("capi"));
stress_sym = ID2SYM(rb_intern_const("stress"));
major_by_sym = ID2SYM(rb_intern_const("major_by"));
nofree_sym = ID2SYM(rb_intern_const("nofree"));
oldgen_sym = ID2SYM(rb_intern_const("oldgen"));
shady_sym = ID2SYM(rb_intern_const("shady"));
force_sym = ID2SYM(rb_intern_const("force"));
oldmalloc_sym = ID2SYM(rb_intern_const("oldmalloc"));
state_sym = ID2SYM(rb_intern_const("state"));
none_sym = ID2SYM(rb_intern_const("none"));
}

bool gc_profiling_has_major_gc_finished(void) {
return rb_gc_latest_gc_info(state_sym) == none_sym && rb_gc_latest_gc_info(major_by_sym) != Qnil;
}

uint8_t gc_profiling_set_metadata(ddog_prof_Label *labels, int labels_length) {
uint8_t max_label_count =
1 + // thread id
1 + // thread name
1 + // state
1 + // event
1 + // gc reason
1 + // gc cause
1; // gc type

if (max_label_count > labels_length) {
rb_raise(rb_eArgError, "BUG: gc_profiling_set_metadata invalid labels_length (%d) < max_label_count (%d)", labels_length, max_label_count);
}

uint8_t label_pos = 0;

labels[label_pos++] = (ddog_prof_Label) {
.key = DDOG_CHARSLICE_C("thread id"),
.str = DDOG_CHARSLICE_C("GC"),
.num = 0, // This shouldn't be needed but the tracer-2.7 docker image ships a buggy gcc that complains about this
};

labels[label_pos++] = (ddog_prof_Label) {
.key = DDOG_CHARSLICE_C("thread name"),
.str = DDOG_CHARSLICE_C("Garbage Collection"),
.num = 0, // Workaround, same as above
};

labels[label_pos++] = (ddog_prof_Label) {
.key = DDOG_CHARSLICE_C("state"),
.str = DDOG_CHARSLICE_C("had cpu"),
.num = 0, // Workaround, same as above
};

labels[label_pos++] = (ddog_prof_Label) {
.key = DDOG_CHARSLICE_C("event"),
.str = DDOG_CHARSLICE_C("gc"),
.num = 0, // Workaround, same as above
};

VALUE major_by = rb_gc_latest_gc_info(major_by_sym);
if (major_by != Qnil) {
labels[label_pos++] = (ddog_prof_Label) {
.key = DDOG_CHARSLICE_C("gc reason"),
.str = major_gc_reason_pretty(major_by),
};
}

labels[label_pos++] = (ddog_prof_Label) {
.key = DDOG_CHARSLICE_C("gc cause"),
.str = gc_cause_pretty(rb_gc_latest_gc_info(gc_by_sym)),
};

labels[label_pos++] = (ddog_prof_Label) {
.key = DDOG_CHARSLICE_C("gc type"),
.str = gc_type_pretty(major_by, rb_gc_latest_gc_info(state_sym)),
};

if (label_pos > max_label_count) {
rb_raise(rb_eRuntimeError, "BUG: gc_profiling_set_metadata unexpected label_pos (%d) > max_label_count (%d)", label_pos, max_label_count);
}

return label_pos;
}

static ddog_CharSlice major_gc_reason_pretty(VALUE major_gc_reason) {
if (major_gc_reason == nofree_sym ) return DDOG_CHARSLICE_C("not enough free slots (NOFREE)");
if (major_gc_reason == oldgen_sym ) return DDOG_CHARSLICE_C("old generation full (OLDGEN)");
if (major_gc_reason == shady_sym ) return DDOG_CHARSLICE_C("too many objects without write barriers (SHADY)");
if (major_gc_reason == force_sym ) return DDOG_CHARSLICE_C("requested (FORCE)");
if (major_gc_reason == oldmalloc_sym) return DDOG_CHARSLICE_C("heap bytes allocated threshold (OLDMALLOC)");
return DDOG_CHARSLICE_C("unknown");
}

static ddog_CharSlice gc_cause_pretty(VALUE gc_cause) {
if (gc_cause == newobj_sym) return DDOG_CHARSLICE_C("object allocation");
if (gc_cause == malloc_sym) return DDOG_CHARSLICE_C("malloc()");
if (gc_cause == method_sym) return DDOG_CHARSLICE_C("GC.start()");
if (gc_cause == capi_sym ) return DDOG_CHARSLICE_C("rb_gc()");
if (gc_cause == stress_sym) return DDOG_CHARSLICE_C("stress");
return DDOG_CHARSLICE_C("unknown");
}

static ddog_CharSlice gc_type_pretty(VALUE major_gc_reason, VALUE gc_state) {
if (major_gc_reason != Qnil) {
if (gc_state == marking_sym ) return DDOG_CHARSLICE_C("major (ongoing, marking)");
if (gc_state == sweeping_sym) return DDOG_CHARSLICE_C("major (ongoing, sweeping)");
return DDOG_CHARSLICE_C("major");
} else {
// As we delay flushing events when a minor GC finishes, it's not relevant to include the observed state of the
// minor GC, as we often won't record a marking -> sweeping -> done cycle, as it happens too quickly.
return DDOG_CHARSLICE_C("minor");
}
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
#pragma once

void gc_profiling_init(void);
bool gc_profiling_has_major_gc_finished(void);
uint8_t gc_profiling_set_metadata(ddog_prof_Label *labels, int labels_length);
Loading
Loading