Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Optimize runtime metrics #5304

Merged
merged 8 commits into from
Mar 15, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -96,6 +96,9 @@
<assembly fullname="Microsoft.VisualStudio.TestPlatform.Common" />
<assembly fullname="Microsoft.VisualStudio.TestPlatform.MSTest.TestAdapter" />
<assembly fullname="Microsoft.VisualStudio.TestPlatform.TestFramework" />
<assembly fullname="Microsoft.Win32.Primitives">
<type fullname="System.ComponentModel.Win32Exception" />
</assembly>
<assembly fullname="MongoDB.Driver.Core" />
<assembly fullname="MySql.Data" />
<assembly fullname="MySqlConnector" />
Expand Down Expand Up @@ -713,6 +716,7 @@
<type fullname="System.Runtime.CompilerServices.ConfiguredValueTaskAwaitable`1/ConfiguredValueTaskAwaiter" />
<type fullname="System.Runtime.CompilerServices.DefaultInterpolatedStringHandler" />
<type fullname="System.Runtime.CompilerServices.ExtensionAttribute" />
<type fullname="System.Runtime.CompilerServices.FixedBufferAttribute" />
<type fullname="System.Runtime.CompilerServices.FormattableStringFactory" />
<type fullname="System.Runtime.CompilerServices.IAsyncStateMachine" />
<type fullname="System.Runtime.CompilerServices.ICriticalNotifyCompletion" />
Expand All @@ -733,6 +737,7 @@
<type fullname="System.Runtime.CompilerServices.TaskAwaiter" />
<type fullname="System.Runtime.CompilerServices.TaskAwaiter`1" />
<type fullname="System.Runtime.CompilerServices.TupleElementNamesAttribute" />
<type fullname="System.Runtime.CompilerServices.UnsafeValueTypeAttribute" />
<type fullname="System.Runtime.CompilerServices.YieldAwaitable" />
<type fullname="System.Runtime.CompilerServices.YieldAwaitable/YieldAwaiter" />
<type fullname="System.Runtime.ConstrainedExecution.CriticalFinalizerObject" />
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,226 @@
// <copyright file="ProcessSnapshotRuntimeInformation.cs" company="Datadog">
// Unless explicitly stated otherwise all files in this repository are licensed under the Apache 2 License.
// This product includes software developed at Datadog (https://www.datadoghq.com/). Copyright 2017 Datadog, Inc.
// </copyright>

#nullable enable

using System;
using System.ComponentModel;
using System.Runtime.InteropServices;
using Datadog.Trace.Logging;

namespace Datadog.Trace.RuntimeMetrics;

// ReSharper disable InconsistentNaming UnusedMember.Local
internal class ProcessSnapshotRuntimeInformation
{
private static readonly IDatadogLogger Log = DatadogLogging.GetLoggerFor<ProcessSnapshotRuntimeInformation>();

private enum PSS_PROCESS_FLAGS
{
PSS_PROCESS_FLAGS_NONE = 0x00000000,
PSS_PROCESS_FLAGS_PROTECTED = 0x00000001,
PSS_PROCESS_FLAGS_WOW64 = 0x00000002,
PSS_PROCESS_FLAGS_RESERVED_03 = 0x00000004,
PSS_PROCESS_FLAGS_RESERVED_04 = 0x00000008,
PSS_PROCESS_FLAGS_FROZEN = 0x00000010
}

private enum PSS_QUERY_INFORMATION_CLASS
{
PSS_QUERY_PROCESS_INFORMATION = 0,
PSS_QUERY_VA_CLONE_INFORMATION = 1,
PSS_QUERY_AUXILIARY_PAGES_INFORMATION = 2,
PSS_QUERY_VA_SPACE_INFORMATION = 3,
PSS_QUERY_HANDLE_INFORMATION = 4,
PSS_QUERY_THREAD_INFORMATION = 5,
PSS_QUERY_HANDLE_TRACE_INFORMATION = 6,
PSS_QUERY_PERFORMANCE_COUNTERS = 7
}

[Flags]
private enum PSS_CAPTURE_FLAGS : uint
{
PSS_CAPTURE_NONE = 0x00000000,
PSS_CAPTURE_VA_CLONE = 0x00000001,
PSS_CAPTURE_RESERVED_00000002 = 0x00000002,
PSS_CAPTURE_HANDLES = 0x00000004,
PSS_CAPTURE_HANDLE_NAME_INFORMATION = 0x00000008,
PSS_CAPTURE_HANDLE_BASIC_INFORMATION = 0x00000010,
PSS_CAPTURE_HANDLE_TYPE_SPECIFIC_INFORMATION = 0x00000020,
PSS_CAPTURE_HANDLE_TRACE = 0x00000040,
PSS_CAPTURE_THREADS = 0x00000080,
PSS_CAPTURE_THREAD_CONTEXT = 0x00000100,
PSS_CAPTURE_THREAD_CONTEXT_EXTENDED = 0x00000200,
PSS_CAPTURE_RESERVED_00000400 = 0x00000400,
PSS_CAPTURE_VA_SPACE = 0x00000800,
PSS_CAPTURE_VA_SPACE_SECTION_INFORMATION = 0x00001000,
PSS_CREATE_BREAKAWAY_OPTIONAL = 0x04000000,
PSS_CREATE_BREAKAWAY = 0x08000000,
PSS_CREATE_FORCE_BREAKAWAY = 0x10000000,
PSS_CREATE_USE_VM_ALLOCATIONS = 0x20000000,
PSS_CREATE_MEASURE_PERFORMANCE = 0x40000000,
PSS_CREATE_RELEASE_SECTION = 0x80000000
}

// The value of the current process handle on Windows is hardcoded to -1
// https://learn.microsoft.com/en-us/windows/win32/api/processthreadsapi/nf-processthreadsapi-getcurrentprocess#remarks
private static IntPtr CurrentProcessHandle => new(-1);

public static unsafe bool GetCurrentProcessMetrics(out TimeSpan userProcessorTime, out TimeSpan systemCpuTime, out int threadCount, out long privateMemorySize)
{
var snapshotHandle = IntPtr.Zero;

try
{
var result = PssCaptureSnapshot(CurrentProcessHandle, PSS_CAPTURE_FLAGS.PSS_CAPTURE_THREADS, 0, out snapshotHandle);

if (result != 0)
{
throw new Win32Exception(result, $"PssCaptureSnapshot failed with code {result}");
}

PSS_THREAD_INFORMATION threadInformation = default;

result = PssQuerySnapshot(snapshotHandle, PSS_QUERY_INFORMATION_CLASS.PSS_QUERY_THREAD_INFORMATION, &threadInformation, Marshal.SizeOf<PSS_THREAD_INFORMATION>());

if (result != 0)
{
throw new Win32Exception(result, $"PssQuerySnapshot with PSS_QUERY_THREAD_INFORMATION failed with code {result}");
}

threadCount = threadInformation.ThreadsCaptured;

long userTime;
long kernelTime;
long memoryUsage;

if (Environment.Is64BitProcess)
{
PSS_PROCESS_INFORMATION_64 processInformation;

result = PssQuerySnapshot(snapshotHandle, PSS_QUERY_INFORMATION_CLASS.PSS_QUERY_PROCESS_INFORMATION, &processInformation, Marshal.SizeOf<PSS_PROCESS_INFORMATION_64>());

if (result != 0)
{
throw new Win32Exception(result, $"PssQuerySnapshot with PSS_QUERY_PROCESS_INFORMATION (64 bits) failed with code {result}");
}

userTime = (long)processInformation.UserTime;
kernelTime = (long)processInformation.KernelTime;
memoryUsage = (long)processInformation.PrivateUsage;
}
else
{
PSS_PROCESS_INFORMATION_32 processInformation;

result = PssQuerySnapshot(snapshotHandle, PSS_QUERY_INFORMATION_CLASS.PSS_QUERY_PROCESS_INFORMATION, &processInformation, Marshal.SizeOf<PSS_PROCESS_INFORMATION_32>());

if (result != 0)
{
throw new Win32Exception(result, $"PssQuerySnapshot with PSS_QUERY_PROCESS_INFORMATION (32 bits) failed with code {result}");
}

userTime = (long)processInformation.UserTime;
kernelTime = (long)processInformation.KernelTime;
memoryUsage = (long)processInformation.PrivateUsage;
}

userProcessorTime = TimeSpan.FromTicks(userTime);
systemCpuTime = TimeSpan.FromTicks(kernelTime);
privateMemorySize = memoryUsage;
return true;
}
finally
{
if (snapshotHandle != IntPtr.Zero)
{
var result = PssFreeSnapshot(CurrentProcessHandle, snapshotHandle);

if (result != 0)
{
Log.Error<IntPtr, int>("PssFreeSnapshot returned an error, the tracer might be leaking memory. Handle: {Handle}. Error code: {Result}.", snapshotHandle, result);
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

:panic:

}
}
}
}

[DllImport("kernel32.dll")]
private static extern int PssCaptureSnapshot(IntPtr processHandle, PSS_CAPTURE_FLAGS captureFlags, int threadContextFlags, out IntPtr snapshotHandle);

[DllImport("kernel32.dll")]
private static extern int PssFreeSnapshot(IntPtr processHandle, IntPtr snapshotHandle);

[DllImport("kernel32.dll")]
private static extern unsafe int PssQuerySnapshot(IntPtr snapshotHandle, PSS_QUERY_INFORMATION_CLASS informationClass, void* buffer, int bufferLength);

[StructLayout(LayoutKind.Sequential)]
private struct PSS_THREAD_INFORMATION
{
public int ThreadsCaptured;
public int ContextLength;
}

[StructLayout(LayoutKind.Sequential, Pack = 8)]
private unsafe struct PSS_PROCESS_INFORMATION_64
{
public uint ExitStatus;
public IntPtr PebBaseAddress;
public nint AffinityMask;
public int BasePriority;
public uint ProcessId;
public uint ParentProcessId;
public PSS_PROCESS_FLAGS Flags;
public ulong CreateTime;
public ulong ExitTime;
public ulong KernelTime;
public ulong UserTime;
public uint PriorityClass;
public nint PeakVirtualSize;
public nint VirtualSize;
public uint PageFaultCount;
public nint PeakWorkingSetSize;
public nint WorkingSetSize;
public nint QuotaPeakPagedPoolUsage;
public nint QuotaPagedPoolUsage;
public nint QuotaPeakNonPagedPoolUsage;
public nint QuotaNonPagedPoolUsage;
public nint PagefileUsage;
public nint PeakPagefileUsage;
public nuint PrivateUsage;
public uint ExecuteFlags;
public fixed char ImageFileName[260];
}

[StructLayout(LayoutKind.Sequential, Pack = 4)]
private unsafe struct PSS_PROCESS_INFORMATION_32
{
public uint ExitStatus;
public IntPtr PebBaseAddress;
public nint AffinityMask;
public int BasePriority;
public uint ProcessId;
public uint ParentProcessId;
public PSS_PROCESS_FLAGS Flags;
public ulong CreateTime;
public ulong ExitTime;
public ulong KernelTime;
public ulong UserTime;
public uint PriorityClass;
public nuint PeakVirtualSize;
public nuint VirtualSize;
public uint PageFaultCount;
public nuint PeakWorkingSetSize;
public nuint WorkingSetSize;
public nint QuotaPeakPagedPoolUsage;
public nint QuotaPagedPoolUsage;
public nint QuotaPeakNonPagedPoolUsage;
public nint QuotaNonPagedPoolUsage;
public nint PagefileUsage;
public nint PeakPagefileUsage;
public nuint PrivateUsage;
public uint ExecuteFlags;
public fixed char ImageFileName[260];
}
}
28 changes: 26 additions & 2 deletions tracer/src/Datadog.Trace/RuntimeMetrics/RuntimeMetricsWriter.cs
Original file line number Diff line number Diff line change
Expand Up @@ -10,8 +10,6 @@
using System.Runtime.ExceptionServices;
using System.Threading;
using Datadog.Trace.Logging;
using Datadog.Trace.PlatformHelpers;
using Datadog.Trace.Util;
using Datadog.Trace.Vendors.StatsdClient;

namespace Datadog.Trace.RuntimeMetrics
Expand All @@ -25,9 +23,13 @@ internal class RuntimeMetricsWriter : IDisposable
private const string ProcessMetrics = $"{MetricsNames.ThreadsCount}, {MetricsNames.CommittedMemory}, {MetricsNames.CpuUserTime}, {MetricsNames.CpuSystemTime}, {MetricsNames.CpuPercentage}";
#endif

private static readonly Version Windows81Version = new(6, 3, 9600);

private static readonly IDatadogLogger Log = DatadogLogging.GetLoggerFor<RuntimeMetricsWriter>();
private static readonly Func<IDogStatsd, TimeSpan, bool, IRuntimeMetricsListener> InitializeListenerFunc = InitializeListener;

private static int _pssConsecutiveFailures;

private readonly Process _process;

private readonly TimeSpan _delay;
Expand Down Expand Up @@ -233,6 +235,28 @@ private void FirstChanceException(object sender, FirstChanceExceptionEventArgs e

private void GetCurrentProcessMetrics(out TimeSpan userProcessorTime, out TimeSpan systemCpuTime, out int threadCount, out long privateMemorySize)
{
if (_pssConsecutiveFailures < 3 && Environment.OSVersion.Platform == PlatformID.Win32NT && Environment.OSVersion.Version > Windows81Version)
{
try
{
ProcessSnapshotRuntimeInformation.GetCurrentProcessMetrics(out userProcessorTime, out systemCpuTime, out threadCount, out privateMemorySize);
_pssConsecutiveFailures = 0;
}
catch
{
var consecutiveFailures = Interlocked.Increment(ref _pssConsecutiveFailures);

if (consecutiveFailures >= 3)
{
Log.Error("Pss failed 3 times in a row, falling back to the Process API");
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Could this be a Warning message instead?

Copy link
Collaborator Author

@kevingosse kevingosse Mar 12, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I made it an error because it's not supposed to ever happen, and I'd like to see it in telemetry logs if it does.

}

throw;
}

return;
}

_process.Refresh();
userProcessorTime = _process.UserProcessorTime;
systemCpuTime = _process.PrivilegedProcessorTime;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -4,10 +4,14 @@
// </copyright>

using System;
using System.Diagnostics;
using System.Linq;
using System.Threading;
using System.Threading.Tasks;
using Datadog.Trace.RuntimeMetrics;
using Datadog.Trace.TestHelpers;
using Datadog.Trace.Vendors.StatsdClient;
using FluentAssertions;
using Moq;
using Xunit;

Expand Down Expand Up @@ -100,6 +104,75 @@ public void ShouldCaptureFirstChanceExceptions()
}
}

[SkippableFact]
public async Task ShouldCaptureProcessMetrics()
{
// This test is specifically targeting process metrics collected with PSS
SkipOn.Platform(SkipOn.PlatformValue.Linux);
SkipOn.Platform(SkipOn.PlatformValue.MacOs);

var statsd = new Mock<IDogStatsd>();
var listener = new Mock<IRuntimeMetricsListener>();

using (new RuntimeMetricsWriter(statsd.Object, TimeSpan.FromSeconds(1), false, (_, _, _) => listener.Object))
{
var expectedNumberOfThreads = Process.GetCurrentProcess().Threads.Count;

var tcs = new TaskCompletionSource<bool>();

double? actualNumberOfThreads = null;
double? userCpuTime = null;
double? kernelCpuTime = null;
double? memoryUsage = null;

statsd.Setup(s => s.Gauge(MetricsNames.ThreadsCount, It.IsAny<double>(), It.IsAny<double>(), It.IsAny<string[]>()))
.Callback<string, double, double, string[]>((_, value, _, _) => actualNumberOfThreads = value);

statsd.Setup(s => s.Gauge(MetricsNames.CommittedMemory, It.IsAny<double>(), It.IsAny<double>(), It.IsAny<string[]>()))
.Callback<string, double, double, string[]>((_, value, _, _) => memoryUsage = value);

statsd.Setup(s => s.Gauge(MetricsNames.CpuUserTime, It.IsAny<double>(), It.IsAny<double>(), It.IsAny<string[]>()))
.Callback<string, double, double, string[]>((_, value, _, _) => userCpuTime = value);

statsd.Setup(s => s.Gauge(MetricsNames.CpuSystemTime, It.IsAny<double>(), It.IsAny<double>(), It.IsAny<string[]>()))
.Callback<string, double, double, string[]>((_, value, _, _) => kernelCpuTime = value);

// CPU percentage is the last pushed event
statsd.Setup(s => s.Gauge(MetricsNames.CpuPercentage, It.IsAny<double>(), It.IsAny<double>(), It.IsAny<string[]>()))
.Callback<string, double, double, string[]>((_, _, _, _) => tcs.TrySetResult(true));

// Spin a bit to eat CPU
var sw = System.Diagnostics.Stopwatch.StartNew();

while (sw.Elapsed < TimeSpan.FromMilliseconds(50))
{
Thread.SpinWait(10);
}

var timeout = Task.Delay(TimeSpan.FromSeconds(30));

await Task.WhenAny(tcs.Task, timeout);

tcs.Task.IsCompleted.Should().BeTrue();

actualNumberOfThreads.Should().NotBeNull();

// To future generations: if 100 is not enough, feel free to bump it up. We're really just checking that the value is "realistic".
actualNumberOfThreads.Should().NotBeNull().And.BeGreaterThan(0).And.BeInRange(expectedNumberOfThreads - 100, expectedNumberOfThreads + 100);

// CPU time and memory usage can vary wildly, so don't try too hard to validate
userCpuTime.Should().NotBeNull().And.BeGreaterThan(0);

// Unfortunately we can't guarantee that the process will be eating kernel time, so greater or equal
kernelCpuTime.Should().NotBeNull().And.BeGreaterThanOrEqualTo(0);

// Between 10MB and 100GB seems realistic.
// If in the future the tests runner really get below 10MB, congratulations!
// If it gets above 100GB, God save us all.
memoryUsage.Should().NotBeNull().And.BeInRange(10.0 * 1024 * 1024, 100.0 * 1024 * 1024 * 1024);
}
}

[Fact]
public void CleanupResources()
{
Expand Down
Loading
Loading