Skip to content

Commit

Permalink
Optimize runtime metrics (#5304)
Browse files Browse the repository at this point in the history
  • Loading branch information
kevingosse authored Mar 15, 2024
1 parent 623d195 commit 3dab836
Show file tree
Hide file tree
Showing 6 changed files with 332 additions and 2 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -96,6 +96,9 @@
<assembly fullname="Microsoft.VisualStudio.TestPlatform.Common" />
<assembly fullname="Microsoft.VisualStudio.TestPlatform.MSTest.TestAdapter" />
<assembly fullname="Microsoft.VisualStudio.TestPlatform.TestFramework" />
<assembly fullname="Microsoft.Win32.Primitives">
<type fullname="System.ComponentModel.Win32Exception" />
</assembly>
<assembly fullname="MongoDB.Driver.Core" />
<assembly fullname="MySql.Data" />
<assembly fullname="MySqlConnector" />
Expand Down Expand Up @@ -712,6 +715,7 @@
<type fullname="System.Runtime.CompilerServices.ConfiguredValueTaskAwaitable`1/ConfiguredValueTaskAwaiter" />
<type fullname="System.Runtime.CompilerServices.DefaultInterpolatedStringHandler" />
<type fullname="System.Runtime.CompilerServices.ExtensionAttribute" />
<type fullname="System.Runtime.CompilerServices.FixedBufferAttribute" />
<type fullname="System.Runtime.CompilerServices.FormattableStringFactory" />
<type fullname="System.Runtime.CompilerServices.IAsyncStateMachine" />
<type fullname="System.Runtime.CompilerServices.ICriticalNotifyCompletion" />
Expand All @@ -732,6 +736,7 @@
<type fullname="System.Runtime.CompilerServices.TaskAwaiter" />
<type fullname="System.Runtime.CompilerServices.TaskAwaiter`1" />
<type fullname="System.Runtime.CompilerServices.TupleElementNamesAttribute" />
<type fullname="System.Runtime.CompilerServices.UnsafeValueTypeAttribute" />
<type fullname="System.Runtime.CompilerServices.YieldAwaitable" />
<type fullname="System.Runtime.CompilerServices.YieldAwaitable/YieldAwaiter" />
<type fullname="System.Runtime.ConstrainedExecution.CriticalFinalizerObject" />
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,226 @@
// <copyright file="ProcessSnapshotRuntimeInformation.cs" company="Datadog">
// Unless explicitly stated otherwise all files in this repository are licensed under the Apache 2 License.
// This product includes software developed at Datadog (https://www.datadoghq.com/). Copyright 2017 Datadog, Inc.
// </copyright>

#nullable enable

using System;
using System.ComponentModel;
using System.Runtime.InteropServices;
using Datadog.Trace.Logging;

namespace Datadog.Trace.RuntimeMetrics;

// ReSharper disable InconsistentNaming UnusedMember.Local
internal class ProcessSnapshotRuntimeInformation
{
private static readonly IDatadogLogger Log = DatadogLogging.GetLoggerFor<ProcessSnapshotRuntimeInformation>();

private enum PSS_PROCESS_FLAGS
{
PSS_PROCESS_FLAGS_NONE = 0x00000000,
PSS_PROCESS_FLAGS_PROTECTED = 0x00000001,
PSS_PROCESS_FLAGS_WOW64 = 0x00000002,
PSS_PROCESS_FLAGS_RESERVED_03 = 0x00000004,
PSS_PROCESS_FLAGS_RESERVED_04 = 0x00000008,
PSS_PROCESS_FLAGS_FROZEN = 0x00000010
}

private enum PSS_QUERY_INFORMATION_CLASS
{
PSS_QUERY_PROCESS_INFORMATION = 0,
PSS_QUERY_VA_CLONE_INFORMATION = 1,
PSS_QUERY_AUXILIARY_PAGES_INFORMATION = 2,
PSS_QUERY_VA_SPACE_INFORMATION = 3,
PSS_QUERY_HANDLE_INFORMATION = 4,
PSS_QUERY_THREAD_INFORMATION = 5,
PSS_QUERY_HANDLE_TRACE_INFORMATION = 6,
PSS_QUERY_PERFORMANCE_COUNTERS = 7
}

[Flags]
private enum PSS_CAPTURE_FLAGS : uint
{
PSS_CAPTURE_NONE = 0x00000000,
PSS_CAPTURE_VA_CLONE = 0x00000001,
PSS_CAPTURE_RESERVED_00000002 = 0x00000002,
PSS_CAPTURE_HANDLES = 0x00000004,
PSS_CAPTURE_HANDLE_NAME_INFORMATION = 0x00000008,
PSS_CAPTURE_HANDLE_BASIC_INFORMATION = 0x00000010,
PSS_CAPTURE_HANDLE_TYPE_SPECIFIC_INFORMATION = 0x00000020,
PSS_CAPTURE_HANDLE_TRACE = 0x00000040,
PSS_CAPTURE_THREADS = 0x00000080,
PSS_CAPTURE_THREAD_CONTEXT = 0x00000100,
PSS_CAPTURE_THREAD_CONTEXT_EXTENDED = 0x00000200,
PSS_CAPTURE_RESERVED_00000400 = 0x00000400,
PSS_CAPTURE_VA_SPACE = 0x00000800,
PSS_CAPTURE_VA_SPACE_SECTION_INFORMATION = 0x00001000,
PSS_CREATE_BREAKAWAY_OPTIONAL = 0x04000000,
PSS_CREATE_BREAKAWAY = 0x08000000,
PSS_CREATE_FORCE_BREAKAWAY = 0x10000000,
PSS_CREATE_USE_VM_ALLOCATIONS = 0x20000000,
PSS_CREATE_MEASURE_PERFORMANCE = 0x40000000,
PSS_CREATE_RELEASE_SECTION = 0x80000000
}

// The value of the current process handle on Windows is hardcoded to -1
// https://learn.microsoft.com/en-us/windows/win32/api/processthreadsapi/nf-processthreadsapi-getcurrentprocess#remarks
private static IntPtr CurrentProcessHandle => new(-1);

public static unsafe bool GetCurrentProcessMetrics(out TimeSpan userProcessorTime, out TimeSpan systemCpuTime, out int threadCount, out long privateMemorySize)
{
var snapshotHandle = IntPtr.Zero;

try
{
var result = PssCaptureSnapshot(CurrentProcessHandle, PSS_CAPTURE_FLAGS.PSS_CAPTURE_THREADS, 0, out snapshotHandle);

if (result != 0)
{
throw new Win32Exception(result, $"PssCaptureSnapshot failed with code {result}");
}

PSS_THREAD_INFORMATION threadInformation = default;

result = PssQuerySnapshot(snapshotHandle, PSS_QUERY_INFORMATION_CLASS.PSS_QUERY_THREAD_INFORMATION, &threadInformation, Marshal.SizeOf<PSS_THREAD_INFORMATION>());

if (result != 0)
{
throw new Win32Exception(result, $"PssQuerySnapshot with PSS_QUERY_THREAD_INFORMATION failed with code {result}");
}

threadCount = threadInformation.ThreadsCaptured;

long userTime;
long kernelTime;
long memoryUsage;

if (Environment.Is64BitProcess)
{
PSS_PROCESS_INFORMATION_64 processInformation;

result = PssQuerySnapshot(snapshotHandle, PSS_QUERY_INFORMATION_CLASS.PSS_QUERY_PROCESS_INFORMATION, &processInformation, Marshal.SizeOf<PSS_PROCESS_INFORMATION_64>());

if (result != 0)
{
throw new Win32Exception(result, $"PssQuerySnapshot with PSS_QUERY_PROCESS_INFORMATION (64 bits) failed with code {result}");
}

userTime = (long)processInformation.UserTime;
kernelTime = (long)processInformation.KernelTime;
memoryUsage = (long)processInformation.PrivateUsage;
}
else
{
PSS_PROCESS_INFORMATION_32 processInformation;

result = PssQuerySnapshot(snapshotHandle, PSS_QUERY_INFORMATION_CLASS.PSS_QUERY_PROCESS_INFORMATION, &processInformation, Marshal.SizeOf<PSS_PROCESS_INFORMATION_32>());

if (result != 0)
{
throw new Win32Exception(result, $"PssQuerySnapshot with PSS_QUERY_PROCESS_INFORMATION (32 bits) failed with code {result}");
}

userTime = (long)processInformation.UserTime;
kernelTime = (long)processInformation.KernelTime;
memoryUsage = (long)processInformation.PrivateUsage;
}

userProcessorTime = TimeSpan.FromTicks(userTime);
systemCpuTime = TimeSpan.FromTicks(kernelTime);
privateMemorySize = memoryUsage;
return true;
}
finally
{
if (snapshotHandle != IntPtr.Zero)
{
var result = PssFreeSnapshot(CurrentProcessHandle, snapshotHandle);

if (result != 0)
{
Log.Error<IntPtr, int>("PssFreeSnapshot returned an error, the tracer might be leaking memory. Handle: {Handle}. Error code: {Result}.", snapshotHandle, result);
}
}
}
}

[DllImport("kernel32.dll")]
private static extern int PssCaptureSnapshot(IntPtr processHandle, PSS_CAPTURE_FLAGS captureFlags, int threadContextFlags, out IntPtr snapshotHandle);

[DllImport("kernel32.dll")]
private static extern int PssFreeSnapshot(IntPtr processHandle, IntPtr snapshotHandle);

[DllImport("kernel32.dll")]
private static extern unsafe int PssQuerySnapshot(IntPtr snapshotHandle, PSS_QUERY_INFORMATION_CLASS informationClass, void* buffer, int bufferLength);

[StructLayout(LayoutKind.Sequential)]
private struct PSS_THREAD_INFORMATION
{
public int ThreadsCaptured;
public int ContextLength;
}

[StructLayout(LayoutKind.Sequential, Pack = 8)]
private unsafe struct PSS_PROCESS_INFORMATION_64
{
public uint ExitStatus;
public IntPtr PebBaseAddress;
public nint AffinityMask;
public int BasePriority;
public uint ProcessId;
public uint ParentProcessId;
public PSS_PROCESS_FLAGS Flags;
public ulong CreateTime;
public ulong ExitTime;
public ulong KernelTime;
public ulong UserTime;
public uint PriorityClass;
public nint PeakVirtualSize;
public nint VirtualSize;
public uint PageFaultCount;
public nint PeakWorkingSetSize;
public nint WorkingSetSize;
public nint QuotaPeakPagedPoolUsage;
public nint QuotaPagedPoolUsage;
public nint QuotaPeakNonPagedPoolUsage;
public nint QuotaNonPagedPoolUsage;
public nint PagefileUsage;
public nint PeakPagefileUsage;
public nuint PrivateUsage;
public uint ExecuteFlags;
public fixed char ImageFileName[260];
}

[StructLayout(LayoutKind.Sequential, Pack = 4)]
private unsafe struct PSS_PROCESS_INFORMATION_32
{
public uint ExitStatus;
public IntPtr PebBaseAddress;
public nint AffinityMask;
public int BasePriority;
public uint ProcessId;
public uint ParentProcessId;
public PSS_PROCESS_FLAGS Flags;
public ulong CreateTime;
public ulong ExitTime;
public ulong KernelTime;
public ulong UserTime;
public uint PriorityClass;
public nuint PeakVirtualSize;
public nuint VirtualSize;
public uint PageFaultCount;
public nuint PeakWorkingSetSize;
public nuint WorkingSetSize;
public nint QuotaPeakPagedPoolUsage;
public nint QuotaPagedPoolUsage;
public nint QuotaPeakNonPagedPoolUsage;
public nint QuotaNonPagedPoolUsage;
public nint PagefileUsage;
public nint PeakPagefileUsage;
public nuint PrivateUsage;
public uint ExecuteFlags;
public fixed char ImageFileName[260];
}
}
28 changes: 26 additions & 2 deletions tracer/src/Datadog.Trace/RuntimeMetrics/RuntimeMetricsWriter.cs
Original file line number Diff line number Diff line change
Expand Up @@ -10,8 +10,6 @@
using System.Runtime.ExceptionServices;
using System.Threading;
using Datadog.Trace.Logging;
using Datadog.Trace.PlatformHelpers;
using Datadog.Trace.Util;
using Datadog.Trace.Vendors.StatsdClient;

namespace Datadog.Trace.RuntimeMetrics
Expand All @@ -25,9 +23,13 @@ internal class RuntimeMetricsWriter : IDisposable
private const string ProcessMetrics = $"{MetricsNames.ThreadsCount}, {MetricsNames.CommittedMemory}, {MetricsNames.CpuUserTime}, {MetricsNames.CpuSystemTime}, {MetricsNames.CpuPercentage}";
#endif

private static readonly Version Windows81Version = new(6, 3, 9600);

private static readonly IDatadogLogger Log = DatadogLogging.GetLoggerFor<RuntimeMetricsWriter>();
private static readonly Func<IDogStatsd, TimeSpan, bool, IRuntimeMetricsListener> InitializeListenerFunc = InitializeListener;

private static int _pssConsecutiveFailures;

private readonly Process _process;

private readonly TimeSpan _delay;
Expand Down Expand Up @@ -233,6 +235,28 @@ private void FirstChanceException(object sender, FirstChanceExceptionEventArgs e

private void GetCurrentProcessMetrics(out TimeSpan userProcessorTime, out TimeSpan systemCpuTime, out int threadCount, out long privateMemorySize)
{
if (_pssConsecutiveFailures < 3 && Environment.OSVersion.Platform == PlatformID.Win32NT && Environment.OSVersion.Version > Windows81Version)
{
try
{
ProcessSnapshotRuntimeInformation.GetCurrentProcessMetrics(out userProcessorTime, out systemCpuTime, out threadCount, out privateMemorySize);
_pssConsecutiveFailures = 0;
}
catch
{
var consecutiveFailures = Interlocked.Increment(ref _pssConsecutiveFailures);

if (consecutiveFailures >= 3)
{
Log.Error("Pss failed 3 times in a row, falling back to the Process API");
}

throw;
}

return;
}

_process.Refresh();
userProcessorTime = _process.UserProcessorTime;
systemCpuTime = _process.PrivilegedProcessorTime;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -4,10 +4,14 @@
// </copyright>

using System;
using System.Diagnostics;
using System.Linq;
using System.Threading;
using System.Threading.Tasks;
using Datadog.Trace.RuntimeMetrics;
using Datadog.Trace.TestHelpers;
using Datadog.Trace.Vendors.StatsdClient;
using FluentAssertions;
using Moq;
using Xunit;

Expand Down Expand Up @@ -100,6 +104,75 @@ public void ShouldCaptureFirstChanceExceptions()
}
}

[SkippableFact]
public async Task ShouldCaptureProcessMetrics()
{
// This test is specifically targeting process metrics collected with PSS
SkipOn.Platform(SkipOn.PlatformValue.Linux);
SkipOn.Platform(SkipOn.PlatformValue.MacOs);

var statsd = new Mock<IDogStatsd>();
var listener = new Mock<IRuntimeMetricsListener>();

using (new RuntimeMetricsWriter(statsd.Object, TimeSpan.FromSeconds(1), false, (_, _, _) => listener.Object))
{
var expectedNumberOfThreads = Process.GetCurrentProcess().Threads.Count;

var tcs = new TaskCompletionSource<bool>();

double? actualNumberOfThreads = null;
double? userCpuTime = null;
double? kernelCpuTime = null;
double? memoryUsage = null;

statsd.Setup(s => s.Gauge(MetricsNames.ThreadsCount, It.IsAny<double>(), It.IsAny<double>(), It.IsAny<string[]>()))
.Callback<string, double, double, string[]>((_, value, _, _) => actualNumberOfThreads = value);

statsd.Setup(s => s.Gauge(MetricsNames.CommittedMemory, It.IsAny<double>(), It.IsAny<double>(), It.IsAny<string[]>()))
.Callback<string, double, double, string[]>((_, value, _, _) => memoryUsage = value);

statsd.Setup(s => s.Gauge(MetricsNames.CpuUserTime, It.IsAny<double>(), It.IsAny<double>(), It.IsAny<string[]>()))
.Callback<string, double, double, string[]>((_, value, _, _) => userCpuTime = value);

statsd.Setup(s => s.Gauge(MetricsNames.CpuSystemTime, It.IsAny<double>(), It.IsAny<double>(), It.IsAny<string[]>()))
.Callback<string, double, double, string[]>((_, value, _, _) => kernelCpuTime = value);

// CPU percentage is the last pushed event
statsd.Setup(s => s.Gauge(MetricsNames.CpuPercentage, It.IsAny<double>(), It.IsAny<double>(), It.IsAny<string[]>()))
.Callback<string, double, double, string[]>((_, _, _, _) => tcs.TrySetResult(true));

// Spin a bit to eat CPU
var sw = System.Diagnostics.Stopwatch.StartNew();

while (sw.Elapsed < TimeSpan.FromMilliseconds(50))
{
Thread.SpinWait(10);
}

var timeout = Task.Delay(TimeSpan.FromSeconds(30));

await Task.WhenAny(tcs.Task, timeout);

tcs.Task.IsCompleted.Should().BeTrue();

actualNumberOfThreads.Should().NotBeNull();

// To future generations: if 100 is not enough, feel free to bump it up. We're really just checking that the value is "realistic".
actualNumberOfThreads.Should().NotBeNull().And.BeGreaterThan(0).And.BeInRange(expectedNumberOfThreads - 100, expectedNumberOfThreads + 100);

// CPU time and memory usage can vary wildly, so don't try too hard to validate
userCpuTime.Should().NotBeNull().And.BeGreaterThan(0);

// Unfortunately we can't guarantee that the process will be eating kernel time, so greater or equal
kernelCpuTime.Should().NotBeNull().And.BeGreaterThanOrEqualTo(0);

// Between 10MB and 100GB seems realistic.
// If in the future the tests runner really get below 10MB, congratulations!
// If it gets above 100GB, God save us all.
memoryUsage.Should().NotBeNull().And.BeInRange(10.0 * 1024 * 1024, 100.0 * 1024 * 1024 * 1024);
}
}

[Fact]
public void CleanupResources()
{
Expand Down
Loading

0 comments on commit 3dab836

Please sign in to comment.