Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[PROF-6463] Implement support for endpoint profiling in CpuAndWallTime collector #2384

Merged
merged 2 commits into from
Nov 22, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -66,10 +66,12 @@
#define IS_NOT_WALL_TIME false
#define MISSING_TRACER_CONTEXT_KEY 0

static ID at_active_trace_id; // id of :@active_trace in Ruby
static ID at_root_span_id; // id of :@root_span in Ruby
static ID at_active_span_id; // id of :@active_span in Ruby
static ID at_active_trace_id; // id of :@active_trace in Ruby
static ID at_id_id; // id of :@id in Ruby
static ID at_resource_id; // id of :@resource in Ruby
static ID at_root_span_id; // id of :@root_span in Ruby
static ID at_type_id; // id of :@type in Ruby

// Contains state for a single CpuAndWallTime instance
struct cpu_and_wall_time_collector_state {
Expand Down Expand Up @@ -129,6 +131,7 @@ struct trace_identifiers {
ddog_CharSlice span_id;
char local_root_span_id_buffer[MAXIMUM_LENGTH_64_BIT_IDENTIFIER];
char span_id_buffer[MAXIMUM_LENGTH_64_BIT_IDENTIFIER];
VALUE trace_endpoint;
};

static void cpu_and_wall_time_collector_typed_data_mark(void *state_ptr);
Expand Down Expand Up @@ -165,6 +168,7 @@ static long wall_time_now_ns(bool raise_on_failure);
static long thread_id_for(VALUE thread);
static VALUE _native_stats(VALUE self, VALUE collector_instance);
static void trace_identifiers_for(struct cpu_and_wall_time_collector_state *state, VALUE thread, struct trace_identifiers *trace_identifiers_result);
static bool is_type_web(VALUE root_span_type);
static VALUE _native_reset_after_fork(DDTRACE_UNUSED VALUE self, VALUE collector_instance);

void collectors_cpu_and_wall_time_init(VALUE profiling_module) {
Expand Down Expand Up @@ -194,10 +198,12 @@ void collectors_cpu_and_wall_time_init(VALUE profiling_module) {
rb_define_singleton_method(testing_module, "_native_per_thread_context", _native_per_thread_context, 1);
rb_define_singleton_method(testing_module, "_native_stats", _native_stats, 1);

at_active_trace_id = rb_intern_const("@active_trace");
at_root_span_id = rb_intern_const("@root_span");
at_active_span_id = rb_intern_const("@active_span");
at_active_trace_id = rb_intern_const("@active_trace");
at_id_id = rb_intern_const("@id");
at_resource_id = rb_intern_const("@resource");
at_root_span_id = rb_intern_const("@root_span");
at_type_id = rb_intern_const("@type");
}

// This structure is used to define a Ruby object that stores a pointer to a struct cpu_and_wall_time_collector_state
Expand Down Expand Up @@ -572,12 +578,29 @@ static void trigger_sample_for_thread(
};
}

struct trace_identifiers trace_identifiers_result = {.valid = false};
struct trace_identifiers trace_identifiers_result = {.valid = false, .trace_endpoint = Qnil};
trace_identifiers_for(state, thread, &trace_identifiers_result);

if (trace_identifiers_result.valid) {
labels[label_pos++] = (ddog_Label) {.key = DDOG_CHARSLICE_C("local root span id"), .str = trace_identifiers_result.local_root_span_id};
labels[label_pos++] = (ddog_Label) {.key = DDOG_CHARSLICE_C("span id"), .str = trace_identifiers_result.span_id};

if (trace_identifiers_result.trace_endpoint != Qnil) {
// The endpoint gets recorded in a different way because it is mutable in the tracer and can change during a
// trace.
//
// Instead of each sample for the same local_root_span_id getting a potentially-different endpoint,
// `record_endpoint` (via libdatadog) keeps a list of local_root_span_id values and their most-recently-seen
// endpoint values, and at serialization time the most-recently-seen endpoint is applied to all relevant samples.
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

most-recently-seen endpoint is applied to all relevant samples

This is a very reasonable approach! 👍 👍 👍

//
// This is why the endpoint is not directly added in this function to the labels array, although it will later
// show up in the array in the output pprof.
record_endpoint(
state->recorder_instance,
trace_identifiers_result.local_root_span_id,
char_slice_from_ruby_string(trace_identifiers_result.trace_endpoint)
);
}
}

// The number of times `label_pos++` shows up in this function needs to match `max_label_count`. To avoid "oops I
Expand Down Expand Up @@ -864,6 +887,24 @@ static void trace_identifiers_for(struct cpu_and_wall_time_collector_state *stat
};

trace_identifiers_result->valid = true;

VALUE root_span_type = rb_ivar_get(root_span, at_type_id /* @type */);
if (root_span_type == Qnil || !is_type_web(root_span_type)) return;

VALUE trace_resource = rb_ivar_get(active_trace, at_resource_id /* @resource */);
if (RB_TYPE_P(trace_resource, T_STRING)) {
trace_identifiers_result->trace_endpoint = trace_resource;
} else if (trace_resource == Qnil) {
// Fall back to resource from span, if any
trace_identifiers_result->trace_endpoint = rb_ivar_get(root_span, at_resource_id /* @resource */);
}
}

static bool is_type_web(VALUE root_span_type) {
ENFORCE_TYPE(root_span_type, T_STRING);

return RSTRING_LEN(root_span_type) == strlen("web") &&
(memcmp("web", StringValuePtr(root_span_type), strlen("web")) == 0);
}

// After the Ruby VM forks, this method gets called in the child process to clean up any leftover state from the parent.
Expand Down
10 changes: 6 additions & 4 deletions lib/datadog/profiling/collectors/cpu_and_wall_time.rb
Original file line number Diff line number Diff line change
Expand Up @@ -30,10 +30,12 @@ def reset_after_fork
private

def safely_extract_context_key_from(tracer)
tracer &&
tracer.respond_to?(:provider) &&
# NOTE: instance_variable_get always works, even on nil -- it just returns nil if the variable doesn't exist
tracer.provider.instance_variable_get(:@context).instance_variable_get(:@key)
provider = tracer && tracer.respond_to?(:provider) && tracer.provider

return unless provider

context = provider.instance_variable_get(:@context)
context && context.instance_variable_get(:@key)
end
end
end
Expand Down
110 changes: 107 additions & 3 deletions spec/datadog/profiling/collectors/cpu_and_wall_time_spec.rb
Original file line number Diff line number Diff line change
Expand Up @@ -327,11 +327,15 @@ def stats
end

context 'when thread has a tracer context, and a trace is in progress' do
let(:root_span_type) { 'not-web' }

let(:t1) do
Thread.new(ready_queue) do |ready_queue|
Datadog::Tracing.trace('profiler.test') do
Datadog::Tracing.trace('profiler.test.inner') do |span, trace|
@t1_span_id = span.id
Datadog::Tracing.trace('profiler.test', type: root_span_type) do |_span, trace|
@t1_trace = trace

Datadog::Tracing.trace('profiler.test.inner') do |inner_span|
@t1_span_id = inner_span.id
@t1_local_root_span_id = trace.send(:root_span).id
ready_queue << true
sleep
Expand Down Expand Up @@ -361,6 +365,106 @@ def stats
:'span id' => @t1_span_id.to_s,
)
end

it 'does not include the "trace endpoint" label' do
sample

expect(t1_sample.fetch(:labels).keys).to_not include(:'trace endpoint')
end

context 'when local root span type is web' do
let(:root_span_type) { 'web' }

it 'includes the "trace endpoint" label in the samples' do
sample

expect(t1_sample.fetch(:labels)).to include(:'trace endpoint' => 'profiler.test')
end

describe 'trace vs root span resource mutation' do
let(:t1) do
Thread.new(ready_queue) do |ready_queue|
Datadog::Tracing.trace('profiler.test', type: root_span_type) do |span, trace|
trace.resource = trace_resource
span.resource = root_span_resource

Datadog::Tracing.trace('profiler.test.inner') do |inner_span|
@t1_span_id = inner_span.id
@t1_local_root_span_id = trace.send(:root_span).id
ready_queue << true
sleep
end
end
end
end

context 'when the trace resource is nil but the root span resource is not nil' do
let(:trace_resource) { nil }
let(:root_span_resource) { 'root_span_resource' }

it 'includes the "trace endpoint" label in the samples with the root span resource' do
sample

expect(t1_sample.fetch(:labels)).to include(:'trace endpoint' => 'root_span_resource')
end
end

context 'when both the trace resource and the root span resource are specified' do
let(:trace_resource) { 'trace_resource' }
let(:root_span_resource) { 'root_span_resource' }

it 'includes the "trace endpoint" label in the samples with the trace resource' do
sample

expect(t1_sample.fetch(:labels)).to include(:'trace endpoint' => 'trace_resource')
end
end

context 'when both the trace resource and the root span resource are nil' do
let(:trace_resource) { nil }
let(:root_span_resource) { nil }

it 'does not include the "trace endpoint" label' do
sample

expect(t1_sample.fetch(:labels).keys).to_not include(:'trace endpoint')
end
end
end

context 'when resource is changed after a sample was taken' do
before do
sample
@t1_trace.resource = 'changed_after_first_sample'
end

it 'changes the "trace endpoint" label in all samples' do
sample

t1_samples = samples_for_thread(samples, t1)

expect(t1_samples).to have(1).item
expect(t1_samples.first.fetch(:labels)).to include(:'trace endpoint' => 'changed_after_first_sample')
expect(t1_samples.first.fetch(:values)).to include(:'cpu-samples' => 2)
end

context 'when the resource is changed multiple times' do
it 'changes the "trace endpoint" label in all samples' do
sample

@t1_trace.resource = 'changed_after_second_sample'

sample

t1_samples = samples_for_thread(samples, t1)

expect(t1_samples).to have(1).item
expect(t1_samples.first.fetch(:labels)).to include(:'trace endpoint' => 'changed_after_second_sample')
expect(t1_samples.first.fetch(:values)).to include(:'cpu-samples' => 3)
end
end
end
end
end
end
end
Expand Down
2 changes: 1 addition & 1 deletion spec/datadog/profiling/spec_helper.rb
Original file line number Diff line number Diff line change
Expand Up @@ -65,7 +65,7 @@ def samples_from_pprof(pprof_data)
labels: sample.label.map do |it|
[
string_table[it.key].to_sym,
it.str != 0 ? string_table[it.str] : raise('Unexpected: label encoded as number instead of string'),
it.num == 0 ? string_table[it.str] : raise('Unexpected: label encoded as number instead of string'),
]
end.to_h,
}
Expand Down