Skip to content

Commit

Permalink
Randomized allocation sampling (#104955)
Browse files Browse the repository at this point in the history
* Add Randomized Allocation Sampling

This feature allows profilers to do allocation profiling based off randomized samples. It has better theoretical and empirically observed accuracy than our current allocation profiling approaches while also maintaining low performance overhead. It is designed for use in production profiling scenarios. For more information about usage and implementation, see the included doc docs/design/features/RandomizedAllocationSampling.md

Much of this code was originally written in #100356 by @chrisnas, but for logistical reasons we superseded it with this new PR.
  • Loading branch information
noahfalk authored Nov 14, 2024
1 parent 8d21e13 commit 1c4c009
Show file tree
Hide file tree
Showing 35 changed files with 2,358 additions and 23 deletions.
317 changes: 317 additions & 0 deletions docs/design/features/RandomizedAllocationSampling.md

Large diffs are not rendered by default.

12 changes: 7 additions & 5 deletions src/coreclr/inc/eventtracebase.h
Original file line number Diff line number Diff line change
Expand Up @@ -1331,17 +1331,19 @@ namespace ETW
#define ETWLoaderStaticLoad 0 // Static reference load
#define ETWLoaderDynamicLoad 1 // Dynamic assembly load

#if defined (FEATURE_EVENT_TRACE)
EXTERN_C DOTNET_TRACE_CONTEXT MICROSOFT_WINDOWS_DOTNETRUNTIME_PROVIDER_DOTNET_Context;
EXTERN_C DOTNET_TRACE_CONTEXT MICROSOFT_WINDOWS_DOTNETRUNTIME_PRIVATE_PROVIDER_DOTNET_Context;
EXTERN_C DOTNET_TRACE_CONTEXT MICROSOFT_WINDOWS_DOTNETRUNTIME_RUNDOWN_PROVIDER_DOTNET_Context;
EXTERN_C DOTNET_TRACE_CONTEXT MICROSOFT_WINDOWS_DOTNETRUNTIME_STRESS_PROVIDER_DOTNET_Context;
#endif // FEATURE_EVENT_TRACE

#if defined(FEATURE_EVENT_TRACE) && !defined(HOST_UNIX)
//
// The ONE and only ONE global instantiation of this class
//
extern ETW::CEtwTracer * g_pEtwTracer;

EXTERN_C DOTNET_TRACE_CONTEXT MICROSOFT_WINDOWS_DOTNETRUNTIME_PROVIDER_DOTNET_Context;
EXTERN_C DOTNET_TRACE_CONTEXT MICROSOFT_WINDOWS_DOTNETRUNTIME_PRIVATE_PROVIDER_DOTNET_Context;
EXTERN_C DOTNET_TRACE_CONTEXT MICROSOFT_WINDOWS_DOTNETRUNTIME_RUNDOWN_PROVIDER_DOTNET_Context;
EXTERN_C DOTNET_TRACE_CONTEXT MICROSOFT_WINDOWS_DOTNETRUNTIME_STRESS_PROVIDER_DOTNET_Context;

//
// Special Handling of Startup events
//
Expand Down
1 change: 1 addition & 0 deletions src/coreclr/nativeaot/Runtime/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -49,6 +49,7 @@ set(COMMON_RUNTIME_SOURCES
${GC_DIR}/handletablescan.cpp
${GC_DIR}/objecthandle.cpp
${GC_DIR}/softwarewritewatch.cpp
${CLR_SRC_NATIVE_DIR}/minipal/xoshiro128pp.c
)

set(SERVER_GC_SOURCES
Expand Down
80 changes: 78 additions & 2 deletions src/coreclr/nativeaot/Runtime/GCHelpers.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,12 @@

#include "gcdesc.h"

#ifdef FEATURE_EVENT_TRACE
#include "clretwallmain.h"
#else // FEATURE_EVENT_TRACE
#include "etmdummy.h"
#endif // FEATURE_EVENT_TRACE

#define RH_LARGE_OBJECT_SIZE 85000

MethodTable g_FreeObjectEEType;
Expand Down Expand Up @@ -471,6 +477,24 @@ EXTERN_C int64_t QCALLTYPE RhGetTotalAllocatedBytesPrecise()
return allocated;
}

// Emits the AllocationSampled event for an object that was picked by randomized
// allocation sampling.
//
//   flags                - GC allocation flags; used to classify which heap the object went to
//   size                 - size value forwarded to the event
//   samplingBudgetOffset - sampling budget value forwarded to the event
//   orObject             - the allocated object; forwarded to the event as an address
//
// Nothing is emitted when FEATURE_EVENT_TRACE is not defined, or when no
// last-allocated type has been recorded.
void FireAllocationSampled(GC_ALLOC_FLAGS flags, size_t size, size_t samplingBudgetOffset, Object* orObject)
{
#ifdef FEATURE_EVENT_TRACE
    void* pLastAllocType = GetLastAllocEEType();
    if (pLastAllocType == nullptr)
    {
        return;
    }

    // Note: Just as for AllocationTick, the type name cannot be retrieved
    WCHAR* typeName = nullptr;

    // Heap classification reported in the event:
    //   2 = pinned object heap, 1 = large object heap, 0 = SOH.
    // The pinned flag takes precedence over the large-object flag.
    unsigned int heapKind = 0;
    if (flags & GC_ALLOC_PINNED_OBJECT_HEAP)
    {
        heapKind = 2;
    }
    else if (flags & GC_ALLOC_LARGE_OBJECT_HEAP)
    {
        heapKind = 1;
    }

    FireEtwAllocationSampled(heapKind, GetClrInstanceId(), pLastAllocType, typeName, (BYTE*)orObject, size, samplingBudgetOffset);
#endif
}

static Object* GcAllocInternal(MethodTable* pEEType, uint32_t uFlags, uintptr_t numElements, Thread* pThread)
{
ASSERT(!pThread->IsDoNotTriggerGcSet());
Expand Down Expand Up @@ -539,8 +563,47 @@ static Object* GcAllocInternal(MethodTable* pEEType, uint32_t uFlags, uintptr_t
// Save the MethodTable for instrumentation purposes.
tls_pLastAllocationEEType = pEEType;

Object* pObject = GCHeapUtilities::GetGCHeap()->Alloc(pThread->GetAllocContext(), cbSize, uFlags);
pThread->GetEEAllocContext()->UpdateCombinedLimit();
// check for dynamic allocation sampling
ee_alloc_context* pEEAllocContext = pThread->GetEEAllocContext();
gc_alloc_context* pAllocContext = pEEAllocContext->GetGCAllocContext();
bool isSampled = false;
size_t availableSpace = 0;
size_t samplingBudget = 0;

bool isRandomizedSamplingEnabled = ee_alloc_context::IsRandomizedSamplingEnabled();
if (isRandomizedSamplingEnabled)
{
// The number of bytes we can allocate before we need to emit a sampling event.
// This calculation is only valid if combined_limit < alloc_limit.
samplingBudget = (size_t)(pEEAllocContext->combined_limit - pAllocContext->alloc_ptr);

// The number of bytes available in the current allocation context
availableSpace = (size_t)(pAllocContext->alloc_limit - pAllocContext->alloc_ptr);

// Check to see if the allocated object overlaps a sampled byte
// in this AC. This happens when both:
// 1) The AC contains a sampled byte (combined_limit < alloc_limit)
// 2) The object is large enough to overlap it (samplingBudget < aligned_size)
//
// Note that the AC could have no remaining space for allocations (alloc_ptr =
// alloc_limit = combined_limit). When a thread hasn't done any SOH allocations
// yet it also starts in an empty state where alloc_ptr = alloc_limit =
// combined_limit = nullptr. The (1) check handles both of these situations
// properly as an empty AC can not have a sampled byte inside of it.
isSampled =
(pEEAllocContext->combined_limit < pAllocContext->alloc_limit) &&
(samplingBudget < cbSize);

// if the object overflows the AC, we need to sample the remaining bytes
// the sampling budget only included at most the bytes inside the AC
if (cbSize > availableSpace && !isSampled)
{
samplingBudget = ee_alloc_context::ComputeGeometricRandom() + availableSpace;
isSampled = (samplingBudget < cbSize);
}
}

Object* pObject = GCHeapUtilities::GetGCHeap()->Alloc(pAllocContext, cbSize, uFlags);
if (pObject == NULL)
return NULL;

Expand All @@ -551,6 +614,19 @@ static Object* GcAllocInternal(MethodTable* pEEType, uint32_t uFlags, uintptr_t
((Array*)pObject)->InitArrayLength((uint32_t)numElements);
}

if (isSampled)
{
FireAllocationSampled((GC_ALLOC_FLAGS)uFlags, cbSize, samplingBudget, pObject);
}

// There are a variety of conditions that may have invalidated the previous combined_limit value
// such as not allocating the object in the AC memory region (UOH allocations), moving the AC, adding
// extra alignment padding, allocating a new AC, or allocating an object that consumed the sampling budget.
// Rather than test for all the different invalidation conditions individually we conservatively always
// recompute it. If sampling isn't enabled this inlined function is just trivially setting
// combined_limit=alloc_limit.
pEEAllocContext->UpdateCombinedLimit(isRandomizedSamplingEnabled);

if (uFlags & GC_ALLOC_USER_OLD_HEAP)
GCHeapUtilities::GetGCHeap()->PublishObject((uint8_t*)pObject);

Expand Down
5 changes: 5 additions & 0 deletions src/coreclr/nativeaot/Runtime/disabledeventtrace.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,11 @@

void EventTracing_Initialize() { }

// Stub used when event tracing is disabled: no level/keyword combination is
// ever reported as enabled, so callers always take their "tracing off" path.
bool IsRuntimeProviderEnabled(uint8_t level, uint64_t keyword)
{
    // Both arguments are intentionally ignored in this build flavor.
    (void)level;
    (void)keyword;
    return false;
}

void ETW::GCLog::FireGcStart(ETW_GC_INFO * pGcInfo) { }

#ifdef FEATURE_ETW
Expand Down
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
# Native runtime events supported by aot runtime.

AllocationSampled
BGC1stConEnd
BGC1stNonConEnd
BGC1stSweepEnd
Expand Down
7 changes: 6 additions & 1 deletion src/coreclr/nativeaot/Runtime/eventtrace.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,11 @@ DOTNET_TRACE_CONTEXT MICROSOFT_WINDOWS_DOTNETRUNTIME_PRIVATE_PROVIDER_DOTNET_Con
MICROSOFT_WINDOWS_DOTNETRUNTIME_PRIVATE_PROVIDER_EVENTPIPE_Context
};

// Function wrapper around RUNTIME_PROVIDER_CATEGORY_ENABLED so that code which
// only sees the IsRuntimeProviderEnabled declaration can query whether the
// runtime provider is enabled for the given level and keyword.
bool IsRuntimeProviderEnabled(uint8_t level, uint64_t keyword)
{
    const bool isEnabled = RUNTIME_PROVIDER_CATEGORY_ENABLED(level, keyword);
    return isEnabled;
}

volatile LONGLONG ETW::GCLog::s_l64LastClientSequenceNumber = 0;

//---------------------------------------------------------------------------------------
Expand Down Expand Up @@ -300,4 +305,4 @@ void EventPipeEtwCallbackDotNETRuntimePrivate(
_Inout_opt_ PVOID CallbackContext)
{
EtwCallbackCommon(DotNETRuntimePrivate, ControlCode, Level, MatchAnyKeyword, FilterData, true);
}
}
3 changes: 3 additions & 0 deletions src/coreclr/nativeaot/Runtime/eventtracebase.h
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,8 @@ void InitializeEventTracing();

#ifdef FEATURE_EVENT_TRACE

bool IsRuntimeProviderEnabled(uint8_t level, uint64_t keyword);

// !!!!!!! NOTE !!!!!!!!
// The flags must match those in the ETW manifest exactly
// !!!!!!! NOTE !!!!!!!!
Expand Down Expand Up @@ -102,6 +104,7 @@ struct ProfilingScanContext;
#define CLR_GCHEAPSURVIVALANDMOVEMENT_KEYWORD 0x400000
#define CLR_MANAGEDHEAPCOLLECT_KEYWORD 0x800000
#define CLR_GCHEAPANDTYPENAMES_KEYWORD 0x1000000
#define CLR_ALLOCATIONSAMPLING_KEYWORD 0x80000000000

//
// Using KEYWORDZERO means when checking the events category ignore the keyword
Expand Down
9 changes: 9 additions & 0 deletions src/coreclr/nativeaot/Runtime/gctoclreventsink.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
#include "common.h"
#include "gctoclreventsink.h"
#include "thread.h"
#include "eventtracebase.h"

GCToCLREventSink g_gcToClrEventSink;

Expand Down Expand Up @@ -174,6 +175,14 @@ void GCToCLREventSink::FireGCAllocationTick_V4(uint64_t allocationAmount,
{
LIMITED_METHOD_CONTRACT;

#ifdef FEATURE_EVENT_TRACE
if (IsRuntimeProviderEnabled(TRACE_LEVEL_INFORMATION, CLR_ALLOCATIONSAMPLING_KEYWORD))
{
// skip AllocationTick if AllocationSampled is emitted
return;
}
#endif // FEATURE_EVENT_TRACE

void * typeId = GetLastAllocEEType();
WCHAR * name = nullptr;

Expand Down
7 changes: 7 additions & 0 deletions src/coreclr/nativeaot/Runtime/thread.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,13 @@ static Thread* g_RuntimeInitializingThread;

#endif //!DACCESS_COMPILE

// Initializes this generator's state, seeding it with the tick count truncated
// to 32 bits.
// NOTE(review): generators constructed while the truncated tick count has the
// same value would receive the same seed - confirm whether that matters for
// sampling quality.
ee_alloc_context::PerThreadRandom::PerThreadRandom()
{
    const uint32_t seed = (uint32_t)PalGetTickCount64();
    minipal_xoshiro128pp_init(&random_state, seed);
}

// Definition of the thread-local generator instance declared in ee_alloc_context.
thread_local ee_alloc_context::PerThreadRandom ee_alloc_context::t_random = PerThreadRandom();

PInvokeTransitionFrame* Thread::GetTransitionFrame()
{
if (ThreadStore::GetSuspendingThread() == this)
Expand Down
15 changes: 14 additions & 1 deletion src/coreclr/nativeaot/Runtime/thread.h
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@

#include "StackFrameIterator.h"
#include "slist.h" // DefaultSListTraits
#include <minipal/xoshiro128pp.h>

struct gc_alloc_context;
class RuntimeInstance;
Expand Down Expand Up @@ -113,7 +114,19 @@ struct ee_alloc_context

gc_alloc_context* GetGCAllocContext();
uint8_t* GetCombinedLimit();
void UpdateCombinedLimit();
void UpdateCombinedLimit(bool samplingEnabled);
static bool IsRandomizedSamplingEnabled();
static uint32_t ComputeGeometricRandom();

// Random number source used by the randomized allocation sampling code.
// ee_alloc_context holds one instance per thread via the thread_local t_random member.
struct PerThreadRandom
{
// Generator state handed to the minipal_xoshiro128pp_* functions.
minipal_xoshiro128pp random_state;

// Seeds random_state; defined out of line in thread.cpp.
PerThreadRandom();
// Returns a random double in the range [0, 1); defined in thread.inl.
double NextDouble();
};

static thread_local PerThreadRandom t_random;
};


Expand Down
48 changes: 44 additions & 4 deletions src/coreclr/nativeaot/Runtime/thread.inl
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,9 @@

#ifndef DACCESS_COMPILE

#include "eventtracebase.h"

// Scale factor, in bytes (100 KiB), applied to the random draw in
// ee_alloc_context::ComputeGeometricRandom() to produce a sampling budget.
const uint32_t SamplingDistributionMean = (100 * 1024);

inline gc_alloc_context* ee_alloc_context::GetGCAllocContext()
{
Expand All @@ -22,11 +24,49 @@ struct _thread_inl_gc_alloc_context
uint8_t* alloc_limit;
};

inline void ee_alloc_context::UpdateCombinedLimit()

// Reports whether randomized allocation sampling is currently requested, i.e.
// whether the runtime provider is enabled at informational level with the
// allocation-sampling keyword. Always false in builds without event tracing.
inline bool ee_alloc_context::IsRandomizedSamplingEnabled()
{
#ifndef FEATURE_EVENT_TRACE
    return false;
#else // FEATURE_EVENT_TRACE
    return IsRuntimeProviderEnabled(TRACE_LEVEL_INFORMATION, CLR_ALLOCATIONSAMPLING_KEYWORD);
#endif // !FEATURE_EVENT_TRACE
}

inline void ee_alloc_context::UpdateCombinedLimit(bool samplingEnabled)
{
_thread_inl_gc_alloc_context* gc_alloc_context = (_thread_inl_gc_alloc_context*)GetGCAllocContext();
if (!samplingEnabled)
{
combined_limit = gc_alloc_context->alloc_limit;
}
else
{
// compute the next sampling budget based on a geometric distribution
size_t samplingBudget = ComputeGeometricRandom();

// if the sampling limit is larger than the allocation context, no sampling will occur in this AC
// We do Min() prior to adding to alloc_ptr to ensure alloc_ptr+samplingBudget doesn't cause an overflow.

size_t size = gc_alloc_context->alloc_limit - gc_alloc_context->alloc_ptr;
combined_limit = gc_alloc_context->alloc_ptr + min(samplingBudget, size);
}
}

// Draws a random sampling budget (in bytes) for this thread.
//
// A uniform value u in [0, 1) is taken from the thread's generator and mapped
// through -log(1 - u) * SamplingDistributionMean; the result is truncated to an
// unsigned 32-bit integer, which yields a sample from the Geometric distribution.
inline uint32_t ee_alloc_context::ComputeGeometricRandom()
{
    const double u = t_random.NextDouble();
    return (uint32_t)(-log(1 - u) * SamplingDistributionMean);
}

// Returns a random double in the range [0, 1).
// Takes the next 32-bit value from random_state and scales it by 1 / 2^32
// (UINT32_MAX + 1.0), so the largest possible result is strictly below 1.
inline double ee_alloc_context::PerThreadRandom::NextDouble()
{
// NOTE(review): the comment and the combined_limit assignment immediately below
// look like removed-side lines of the previous UpdateCombinedLimit() body that the
// diff rendering interleaved here - confirm they are not part of this function.
// The randomized allocation sampling feature is being submitted in stages. For now sampling is never enabled so
// combined_limit is always the same as alloc_limit.
combined_limit = ((_thread_inl_gc_alloc_context*)GetGCAllocContext())->alloc_limit;
uint32_t value = minipal_xoshiro128pp_next(&random_state);
return value * (1.0/(UINT32_MAX+1.0));
}

// Set the m_pDeferredTransitionFrame field for GC allocation helpers that setup transition frame
Expand Down
Loading

0 comments on commit 1c4c009

Please sign in to comment.