Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[NO-TICKET] Add memory leak testing for profiling + fix (small) memory leak in profiler #3852

Merged
merged 18 commits into from
Aug 23, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
18 commits
Select commit Hold shift + click to select a range
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
15 changes: 15 additions & 0 deletions .github/workflows/test-memory-leaks.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
name: Test for memory leaks
on: [push]
jobs:
test-memcheck:
runs-on: ubuntu-24.04
steps:
- uses: actions/checkout@v4
- uses: ruby/setup-ruby@v1
with:
ruby-version: 3.4.0-preview1 # TODO: Use stable version once 3.4 is out
bundler-cache: true # runs 'bundle install' and caches installed gems automatically
bundler: latest
cache-version: v1 # bump this to invalidate cache
- run: sudo apt install -y valgrind && valgrind --version
- run: bundle exec rake compile spec:profiling:memcheck
8 changes: 7 additions & 1 deletion Gemfile
Original file line number Diff line number Diff line change
Expand Up @@ -44,7 +44,12 @@ gem 'webmock', '>= 3.10.0'

gem 'rexml', '>= 3.2.7' # https://www.ruby-lang.org/en/news/2024/05/16/dos-rexml-cve-2024-35176/

gem 'webrick', '>= 1.7.0' if RUBY_VERSION >= '3.0.0' # No longer bundled by default since Ruby 3.0
if RUBY_VERSION.start_with?('3.4.')
# ruby 3.4 breaks stable webrick; we need this fix until a version later than 1.8.1 comes out
gem 'webrick', git: 'https://github.com/ruby/webrick.git', ref: '0c600e169bd4ae267cb5eeb6197277c848323bbe'
elsif RUBY_VERSION >= '3.0.0' # No longer bundled by default since Ruby 3.0
gem 'webrick', '>= 1.7.0'
end

gem 'yard', '~> 0.9' # NOTE: YardDoc is generated with ruby 3.2 in GitHub Actions

Expand Down Expand Up @@ -82,6 +87,7 @@ group :check do
gem 'rbs', '~> 3.2.0', require: false
gem 'steep', '~> 1.6.0', require: false
end
gem 'ruby_memcheck', '>= 3' if RUBY_VERSION >= '3.4.0' && RUBY_PLATFORM != 'java'
gem 'standard', require: false
end

Expand Down
31 changes: 31 additions & 0 deletions Rakefile
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,18 @@ require 'rspec/core/rake_task'
require 'rake/extensiontask'
require 'yard'
require 'os'
if Gem.loaded_specs.key? 'ruby_memcheck'
require 'ruby_memcheck'
require 'ruby_memcheck/rspec/rake_task'

RubyMemcheck.config(
# If there's an error, print the suppression for that error, to allow us to easily skip such an error if it's
# a false-positive / something in the VM we can't fix.
valgrind_generate_suppressions: true,
# This feature provides better quality data -- I couldn't get good output out of ruby_memcheck without it.
use_only_ruby_free_at_exit: true,
)
end

Dir.glob('tasks/*.rake').each { |r| import r }

Expand Down Expand Up @@ -314,6 +326,25 @@ namespace :spec do
t.rspec_opts = [*args.to_a, '-t ractors'].join(' ')
end

desc 'Run spec:profiling:main tests with memory leak checking'
if Gem.loaded_specs.key?('ruby_memcheck')
RubyMemcheck::RSpec::RakeTask.new(:memcheck) do |t, args|
t.pattern = 'spec/datadog/profiling/**/*_spec.rb,spec/datadog/profiling_spec.rb'
# Some of our specs use multi-threading + busy looping, or multiple processes, or are just really really slow.
# We skip running these when running under valgrind.
# (As a reminder, by default valgrind simulates a sequential/single-threaded execution).
#
# @ivoanjo: I previously tried https://github.com/Shopify/ruby_memcheck/issues/51 but in some cases valgrind
# would give incomplete output, causing a "FATAL: Premature end of data in tag valgrindoutput line 3" error in
# ruby_memcheck. I did not figure out why exactly.
t.rspec_opts = [*args.to_a, '-t ~ractors -t ~memcheck_valgrind_skip'].join(' ')
end
else
task :memcheck do
raise 'Memcheck requires the ruby_memcheck gem to be installed'
end
end

# Make sure each profiling test suite has a dependency on compiled native extensions
Rake::Task[:all].prerequisite_tasks.each { |t| t.enhance([:compile_native_extensions]) }
end
Expand Down
4 changes: 2 additions & 2 deletions benchmarks/profiler_gc.rb
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,7 @@ def run_benchmark
x.report('profiler gc') do
Datadog::Profiling::Collectors::ThreadContext::Testing._native_on_gc_start(@collector)
Datadog::Profiling::Collectors::ThreadContext::Testing._native_on_gc_finish(@collector)
Datadog::Profiling::Collectors::ThreadContext::Testing._native_sample_after_gc(@collector)
Datadog::Profiling::Collectors::ThreadContext::Testing._native_sample_after_gc(@collector, false)
end

x.save! "#{File.basename(__FILE__)}-results.json" unless VALIDATE_BENCHMARK_MODE
Expand All @@ -61,7 +61,7 @@ def run_benchmark
estimated_gc_per_minute.times do
Datadog::Profiling::Collectors::ThreadContext::Testing._native_on_gc_start(@collector)
Datadog::Profiling::Collectors::ThreadContext::Testing._native_on_gc_finish(@collector)
Datadog::Profiling::Collectors::ThreadContext::Testing._native_sample_after_gc(@collector)
Datadog::Profiling::Collectors::ThreadContext::Testing._native_sample_after_gc(@collector, false)
end

@recorder.serialize
Expand Down
59 changes: 46 additions & 13 deletions ext/datadog_profiling_native_extension/collectors_stack.c
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,8 @@ static VALUE _native_sample(
VALUE max_frames,
VALUE in_gc
);
static VALUE native_sample_do(VALUE args);
static VALUE native_sample_ensure(VALUE args);
static void maybe_add_placeholder_frames_omitted(VALUE thread, sampling_buffer* buffer, char *frames_omitted_message, int frames_omitted_message_size);
static void record_placeholder_stack_in_native_code(VALUE recorder_instance, sample_values values, sample_labels labels);
static void maybe_trim_template_random_ids(ddog_CharSlice *name_slice, ddog_CharSlice *filename_slice);
Expand All @@ -51,6 +53,16 @@ void collectors_stack_init(VALUE profiling_module) {
rb_global_variable(&missing_string);
}

struct native_sample_args {
VALUE in_gc;
VALUE recorder_instance;
sample_values values;
sample_labels labels;
VALUE thread;
ddog_prof_Location *locations;
sampling_buffer *buffer;
};

// This method exists only to enable testing Datadog::Profiling::Collectors::Stack behavior using RSpec.
// It SHOULD NOT be used for other purposes.
static VALUE _native_sample(
Expand Down Expand Up @@ -105,33 +117,54 @@ static VALUE _native_sample(
};
}

int max_frames_requested = NUM2INT(max_frames);
if (max_frames_requested < 0) rb_raise(rb_eArgError, "Invalid max_frames: value must not be negative");
int max_frames_requested = sampling_buffer_check_max_frames(NUM2INT(max_frames));

ddog_prof_Location *locations = ruby_xcalloc(max_frames_requested, sizeof(ddog_prof_Location));
sampling_buffer *buffer = sampling_buffer_new(max_frames_requested, locations);

ddog_prof_Slice_Label slice_labels = {.ptr = labels, .len = labels_count};

if (in_gc == Qtrue) {
struct native_sample_args args_struct = {
.in_gc = in_gc,
.recorder_instance = recorder_instance,
.values = values,
.labels = (sample_labels) {.labels = slice_labels, .state_label = state_label},
.thread = thread,
.locations = locations,
.buffer = buffer,
};

return rb_ensure(native_sample_do, (VALUE) &args_struct, native_sample_ensure, (VALUE) &args_struct);
}

static VALUE native_sample_do(VALUE args) {
struct native_sample_args *args_struct = (struct native_sample_args *) args;

if (args_struct->in_gc == Qtrue) {
record_placeholder_stack(
recorder_instance,
values,
(sample_labels) {.labels = slice_labels, .state_label = state_label},
args_struct->recorder_instance,
args_struct->values,
args_struct->labels,
DDOG_CHARSLICE_C("Garbage Collection")
);
} else {
sample_thread(
thread,
buffer,
recorder_instance,
values,
(sample_labels) {.labels = slice_labels, .state_label = state_label}
args_struct->thread,
args_struct->buffer,
args_struct->recorder_instance,
args_struct->values,
args_struct->labels
);
}

ruby_xfree(locations);
sampling_buffer_free(buffer);
return Qtrue;
}

static VALUE native_sample_ensure(VALUE args) {
struct native_sample_args *args_struct = (struct native_sample_args *) args;

ruby_xfree(args_struct->locations);
sampling_buffer_free(args_struct->buffer);

return Qtrue;
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -182,7 +182,7 @@ static VALUE _native_initialize(
static VALUE _native_sample(VALUE self, VALUE collector_instance, VALUE profiler_overhead_stack_thread);
static VALUE _native_on_gc_start(VALUE self, VALUE collector_instance);
static VALUE _native_on_gc_finish(VALUE self, VALUE collector_instance);
static VALUE _native_sample_after_gc(DDTRACE_UNUSED VALUE self, VALUE collector_instance);
static VALUE _native_sample_after_gc(DDTRACE_UNUSED VALUE self, VALUE collector_instance, VALUE reset_monotonic_to_system_state);
void update_metrics_and_sample(
struct thread_context_collector_state *state,
VALUE thread_being_sampled,
Expand Down Expand Up @@ -261,7 +261,7 @@ void collectors_thread_context_init(VALUE profiling_module) {
rb_define_singleton_method(testing_module, "_native_sample_allocation", _native_sample_allocation, 3);
rb_define_singleton_method(testing_module, "_native_on_gc_start", _native_on_gc_start, 1);
rb_define_singleton_method(testing_module, "_native_on_gc_finish", _native_on_gc_finish, 1);
rb_define_singleton_method(testing_module, "_native_sample_after_gc", _native_sample_after_gc, 1);
rb_define_singleton_method(testing_module, "_native_sample_after_gc", _native_sample_after_gc, 2);
rb_define_singleton_method(testing_module, "_native_thread_list", _native_thread_list, 0);
rb_define_singleton_method(testing_module, "_native_per_thread_context", _native_per_thread_context, 1);
rb_define_singleton_method(testing_module, "_native_stats", _native_stats, 1);
Expand Down Expand Up @@ -439,7 +439,16 @@ static VALUE _native_on_gc_finish(DDTRACE_UNUSED VALUE self, VALUE collector_ins

// This method exists only to enable testing Datadog::Profiling::Collectors::ThreadContext behavior using RSpec.
// It SHOULD NOT be used for other purposes.
static VALUE _native_sample_after_gc(DDTRACE_UNUSED VALUE self, VALUE collector_instance) {
static VALUE _native_sample_after_gc(DDTRACE_UNUSED VALUE self, VALUE collector_instance, VALUE reset_monotonic_to_system_state) {
ENFORCE_BOOLEAN(reset_monotonic_to_system_state);

struct thread_context_collector_state *state;
TypedData_Get_Struct(collector_instance, struct thread_context_collector_state, &thread_context_collector_typed_data, state);

if (reset_monotonic_to_system_state == Qtrue) {
state->time_converter_state = (monotonic_to_system_epoch_state) MONOTONIC_TO_SYSTEM_EPOCH_INITIALIZER;
}
Comment on lines +448 to +450
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can't seem to find any code overriding this state. Why do we need to do this?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

See commit description for c8f2338 -- it's my attempt at fixing a flaky spec that showed up.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Ack! (not used to seeing useful commit descriptions 😂 )

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Ack! (not used to seeing useful commit descriptions 😂 )

🔥🔥🔥


thread_context_collector_sample_after_gc(collector_instance);
return Qtrue;
}
Expand Down Expand Up @@ -1223,6 +1232,9 @@ static VALUE _native_reset_after_fork(DDTRACE_UNUSED VALUE self, VALUE collector
struct thread_context_collector_state *state;
TypedData_Get_Struct(collector_instance, struct thread_context_collector_state, &thread_context_collector_typed_data, state);

// Release all context memory before clearing the existing context
st_foreach(state->hash_map_per_thread_context, hash_map_per_thread_context_free_values, 0 /* unused */);

st_clear(state->hash_map_per_thread_context);

state->stats = (struct stats) {}; // Resets all stats back to zero
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -182,7 +182,7 @@ uint64_t native_thread_id_for(VALUE thread) {
#if !defined(NO_THREAD_TID) && defined(RB_THREAD_T_HAS_NATIVE_ID)
#ifndef NO_RB_NATIVE_THREAD
struct rb_native_thread* native_thread = thread_struct_from_object(thread)->nt;
if (native_thread == NULL) rb_raise(rb_eRuntimeError, "BUG: rb_native_thread* is null. Is this Ruby running with RUBY_MN_THREADS=1?");
if (native_thread == NULL) return 0;
return native_thread->tid;
#else
return thread_struct_from_object(thread)->tid;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -168,7 +168,7 @@
# See native bits for more details.
let(:options) { {**super(), skip_idle_samples_for_testing: true} }

it "triggers sampling and records the results" do
it "triggers sampling and records the results", :memcheck_valgrind_skip do
start

all_samples = loop_until do
Expand All @@ -182,6 +182,7 @@
it(
"keeps statistics on how many samples were triggered by the background thread, " \
"as well as how many samples were requested from the VM",
:memcheck_valgrind_skip,
) do
start

Expand Down Expand Up @@ -337,7 +338,7 @@
background_thread.join
end

it "is able to sample even when the main thread is sleeping" do
it "is able to sample even when the main thread is sleeping", :memcheck_valgrind_skip do
background_thread
ready_queue.pop

Expand Down Expand Up @@ -546,7 +547,7 @@
# and we clamp it if it goes over the limit.
# But the total amount of allocations recorded should match the number we observed, and thus we record the
# remainder above the clamped value as a separate "Skipped Samples" step.
context "with a high allocation rate" do
context "with a high allocation rate", :memcheck_valgrind_skip do
let(:options) { {**super(), dynamic_sampling_rate_overhead_target_percentage: 0.1} }
let(:thread_that_allocates_as_fast_as_possible) { Thread.new { loop { BasicObject.new } } }

Expand Down
6 changes: 3 additions & 3 deletions spec/datadog/profiling/collectors/thread_context_spec.rb
Original file line number Diff line number Diff line change
Expand Up @@ -71,8 +71,8 @@ def on_gc_finish
described_class::Testing._native_on_gc_finish(cpu_and_wall_time_collector)
end

def sample_after_gc
described_class::Testing._native_sample_after_gc(cpu_and_wall_time_collector)
def sample_after_gc(reset_monotonic_to_system_state: false)
described_class::Testing._native_sample_after_gc(cpu_and_wall_time_collector, reset_monotonic_to_system_state)
end

def sample_allocation(weight:, new_object: Object.new)
Expand Down Expand Up @@ -1019,7 +1019,7 @@ def self.otel_sdk_available?
end

it "creates a Garbage Collection sample using the timestamp set by on_gc_finish, converted to epoch ns" do
sample_after_gc
sample_after_gc(reset_monotonic_to_system_state: true)

expect(gc_sample.labels.fetch(:end_timestamp_ns)).to be_between(@time_before, @time_after)
end
Expand Down
2 changes: 1 addition & 1 deletion spec/datadog/profiling/native_extension_spec.rb
Original file line number Diff line number Diff line change
Expand Up @@ -120,7 +120,7 @@
it { is_expected.to be false }
end

describe "correctness" do
describe "correctness", :memcheck_valgrind_skip do
let(:ready_queue) { Queue.new }
let(:background_thread) do
Thread.new do
Expand Down
2 changes: 1 addition & 1 deletion spec/datadog/profiling/validate_benchmarks_spec.rb
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
require "datadog/profiling/spec_helper"

RSpec.describe "Profiling benchmarks" do
RSpec.describe "Profiling benchmarks", :memcheck_valgrind_skip do
before { skip_if_profiling_not_supported(self) }

around do |example|
Expand Down
1 change: 1 addition & 0 deletions spec/datadog/release_gem_spec.rb
Original file line number Diff line number Diff line change
Expand Up @@ -59,6 +59,7 @@
|tasks
|yard
|vendor/rbs
|suppressions
)/
}x

Expand Down
64 changes: 64 additions & 0 deletions suppressions/ruby-3.4.supp
Original file line number Diff line number Diff line change
@@ -0,0 +1,64 @@
# This is a valgrind suppression configuration file.
#
# We use it together with the ruby_memcheck gem to find issues in the dd-trace-rb native extensions; in some cases
# we need to ignore potential issues as they're not something we can fix (e.g. outside our code.)
#
# See https://valgrind.org/docs/manual/manual-core.html#manual-core.suppress for details.

# Ruby weak maps seem to be able to cause invalid reads?
{
ruby-weak-map
Memcheck:Addr8
fun:wmap_cmp
fun:find_table_bin_ind
fun:st_general_foreach
fun:rb_st_foreach
...
}

# When a Ruby process forks, it looks like Ruby doesn't clean up the memory of old threads?
{
ruby-native-thread-memory
Memcheck:Leak
fun:calloc
fun:calloc1
fun:rb_gc_impl_calloc
fun:native_thread_alloc
fun:native_thread_create_dedicated
fun:native_thread_create
fun:thread_create_core
...
}

# When a Ruby process forks, it looks like Ruby doesn't clean up the memory of old threads?
{
ruby-native-thread-memory-2
Memcheck:Leak
fun:calloc
fun:calloc1
fun:objspace_xcalloc
fun:ruby_xcalloc_body
fun:native_thread_alloc
fun:native_thread_create_dedicated
fun:native_thread_create
fun:thread_create_core
...
}

# We don't care about the pkg-config external tool
{
pkg-config-memory
Memcheck:Leak
...
obj:/usr/bin/pkg-config
...
}

# We don't care about the tr external tool
{
pkg-config-memory
Memcheck:Leak
...
obj:/usr/bin/tr
...
}
Loading