Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Optimize runtime metrics #5304

Merged
merged 8 commits into from
Mar 15, 2024
Merged
Show file tree
Hide file tree
Changes from 3 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
@@ -0,0 +1,226 @@
// <copyright file="ProcessSnapshotRuntimeInformation.cs" company="Datadog">
// Unless explicitly stated otherwise all files in this repository are licensed under the Apache 2 License.
// This product includes software developed at Datadog (https://www.datadoghq.com/). Copyright 2017 Datadog, Inc.
// </copyright>

#nullable enable

using System;
using System.ComponentModel;
using System.Runtime.InteropServices;
using Datadog.Trace.Logging;

namespace Datadog.Trace.RuntimeMetrics;

// ReSharper disable InconsistentNaming UnusedMember.Local
internal class ProcessSnapshotRuntimeInformation
{
private static readonly IDatadogLogger Log = DatadogLogging.GetLoggerFor<ProcessSnapshotRuntimeInformation>();

private enum PSS_PROCESS_FLAGS
{
PSS_PROCESS_FLAGS_NONE = 0x00000000,
PSS_PROCESS_FLAGS_PROTECTED = 0x00000001,
PSS_PROCESS_FLAGS_WOW64 = 0x00000002,
PSS_PROCESS_FLAGS_RESERVED_03 = 0x00000004,
PSS_PROCESS_FLAGS_RESERVED_04 = 0x00000008,
PSS_PROCESS_FLAGS_FROZEN = 0x00000010
}

private enum PSS_QUERY_INFORMATION_CLASS
{
PSS_QUERY_PROCESS_INFORMATION = 0,
PSS_QUERY_VA_CLONE_INFORMATION = 1,
PSS_QUERY_AUXILIARY_PAGES_INFORMATION = 2,
PSS_QUERY_VA_SPACE_INFORMATION = 3,
PSS_QUERY_HANDLE_INFORMATION = 4,
PSS_QUERY_THREAD_INFORMATION = 5,
PSS_QUERY_HANDLE_TRACE_INFORMATION = 6,
PSS_QUERY_PERFORMANCE_COUNTERS = 7
}

[Flags]
private enum PSS_CAPTURE_FLAGS : uint
{
PSS_CAPTURE_NONE = 0x00000000,
PSS_CAPTURE_VA_CLONE = 0x00000001,
PSS_CAPTURE_RESERVED_00000002 = 0x00000002,
PSS_CAPTURE_HANDLES = 0x00000004,
PSS_CAPTURE_HANDLE_NAME_INFORMATION = 0x00000008,
PSS_CAPTURE_HANDLE_BASIC_INFORMATION = 0x00000010,
PSS_CAPTURE_HANDLE_TYPE_SPECIFIC_INFORMATION = 0x00000020,
PSS_CAPTURE_HANDLE_TRACE = 0x00000040,
PSS_CAPTURE_THREADS = 0x00000080,
PSS_CAPTURE_THREAD_CONTEXT = 0x00000100,
PSS_CAPTURE_THREAD_CONTEXT_EXTENDED = 0x00000200,
PSS_CAPTURE_RESERVED_00000400 = 0x00000400,
PSS_CAPTURE_VA_SPACE = 0x00000800,
PSS_CAPTURE_VA_SPACE_SECTION_INFORMATION = 0x00001000,
PSS_CREATE_BREAKAWAY_OPTIONAL = 0x04000000,
PSS_CREATE_BREAKAWAY = 0x08000000,
PSS_CREATE_FORCE_BREAKAWAY = 0x10000000,
PSS_CREATE_USE_VM_ALLOCATIONS = 0x20000000,
PSS_CREATE_MEASURE_PERFORMANCE = 0x40000000,
PSS_CREATE_RELEASE_SECTION = 0x80000000
}

// The value of the current process handle on Windows is hardcoded to 1
kevingosse marked this conversation as resolved.
Show resolved Hide resolved
// https://learn.microsoft.com/en-us/windows/win32/api/processthreadsapi/nf-processthreadsapi-getcurrentprocess#remarks
private static IntPtr CurrentProcessHandle => new(-1);

public static unsafe bool GetCurrentProcessMetrics(out TimeSpan userProcessorTime, out TimeSpan systemCpuTime, out int threadCount, out long privateMemorySize)
{
var snapshotHandle = IntPtr.Zero;

try
{
var result = PssCaptureSnapshot(CurrentProcessHandle, PSS_CAPTURE_FLAGS.PSS_CAPTURE_THREADS, 0, out snapshotHandle);

if (result != 0)
{
throw new Win32Exception(result, $"PssCaptureSnapshot failed with code {result}");
}

PSS_THREAD_INFORMATION threadInformation = default;

result = PssQuerySnapshot(snapshotHandle, PSS_QUERY_INFORMATION_CLASS.PSS_QUERY_THREAD_INFORMATION, &threadInformation, Marshal.SizeOf<PSS_THREAD_INFORMATION>());

if (result != 0)
{
throw new Win32Exception(result, $"PssQuerySnapshot with PSS_QUERY_THREAD_INFORMATION failed with code {result}");
}

threadCount = threadInformation.ThreadsCaptured;

long userTime;
long kernelTime;
long memoryUsage;

if (Environment.Is64BitProcess)
{
PSS_PROCESS_INFORMATION_64 processInformation;

result = PssQuerySnapshot(snapshotHandle, PSS_QUERY_INFORMATION_CLASS.PSS_QUERY_PROCESS_INFORMATION, &processInformation, Marshal.SizeOf<PSS_PROCESS_INFORMATION_64>());

if (result != 0)
{
throw new Win32Exception(result, $"PssQuerySnapshot with PSS_QUERY_PROCESS_INFORMATION (64 bits) failed with code {result}");
}

userTime = (long)processInformation.UserTime;
kernelTime = (long)processInformation.KernelTime;
memoryUsage = (long)processInformation.PrivateUsage;
}
else
{
PSS_PROCESS_INFORMATION_32 processInformation;

result = PssQuerySnapshot(snapshotHandle, PSS_QUERY_INFORMATION_CLASS.PSS_QUERY_PROCESS_INFORMATION, &processInformation, Marshal.SizeOf<PSS_PROCESS_INFORMATION_32>());

if (result != 0)
{
throw new Win32Exception(result, $"PssQuerySnapshot with PSS_QUERY_PROCESS_INFORMATION (32 bits) failed with code {result}");
}

userTime = (long)processInformation.UserTime;
kernelTime = (long)processInformation.KernelTime;
memoryUsage = (long)processInformation.PrivateUsage;
}

userProcessorTime = TimeSpan.FromTicks(userTime);
systemCpuTime = TimeSpan.FromTicks(kernelTime);
privateMemorySize = memoryUsage;
return true;
}
finally
{
if (snapshotHandle != IntPtr.Zero)
{
var result = PssFreeSnapshot(CurrentProcessHandle, snapshotHandle);

if (result != 0)
{
Log.Error<IntPtr, int>("PssFreeSnapshot returned an error, the tracer might be leaking memory. Handle: {Handle}. Error code: {Result}.", snapshotHandle, result);
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

:panic:

}
}
}
}

[DllImport("kernel32.dll")]
private static extern int PssCaptureSnapshot(IntPtr processHandle, PSS_CAPTURE_FLAGS captureFlags, int threadContextFlags, out IntPtr snapshotHandle);

[DllImport("kernel32.dll")]
private static extern int PssFreeSnapshot(IntPtr processHandle, IntPtr snapshotHandle);

[DllImport("kernel32.dll")]
private static extern unsafe int PssQuerySnapshot(IntPtr snapshotHandle, PSS_QUERY_INFORMATION_CLASS informationClass, void* buffer, int bufferLength);

[StructLayout(LayoutKind.Sequential)]
private struct PSS_THREAD_INFORMATION
{
public int ThreadsCaptured;
public int ContextLength;
}

[StructLayout(LayoutKind.Sequential, Pack = 8)]
private unsafe struct PSS_PROCESS_INFORMATION_64
{
public uint ExitStatus;
public IntPtr PebBaseAddress;
public nint AffinityMask;
public int BasePriority;
public uint ProcessId;
public uint ParentProcessId;
public PSS_PROCESS_FLAGS Flags;
public ulong CreateTime;
public ulong ExitTime;
public ulong KernelTime;
public ulong UserTime;
public uint PriorityClass;
public nint PeakVirtualSize;
public nint VirtualSize;
public uint PageFaultCount;
public nint PeakWorkingSetSize;
public nint WorkingSetSize;
public nint QuotaPeakPagedPoolUsage;
public nint QuotaPagedPoolUsage;
public nint QuotaPeakNonPagedPoolUsage;
public nint QuotaNonPagedPoolUsage;
public nint PagefileUsage;
public nint PeakPagefileUsage;
public nuint PrivateUsage;
public uint ExecuteFlags;
public fixed char ImageFileName[260];
}

[StructLayout(LayoutKind.Sequential, Pack = 4)]
private unsafe struct PSS_PROCESS_INFORMATION_32
{
public uint ExitStatus;
public IntPtr PebBaseAddress;
public nint AffinityMask;
public int BasePriority;
public uint ProcessId;
public uint ParentProcessId;
public PSS_PROCESS_FLAGS Flags;
public ulong CreateTime;
public ulong ExitTime;
public ulong KernelTime;
public ulong UserTime;
public uint PriorityClass;
public nuint PeakVirtualSize;
public nuint VirtualSize;
public uint PageFaultCount;
public nuint PeakWorkingSetSize;
public nuint WorkingSetSize;
public nint QuotaPeakPagedPoolUsage;
public nint QuotaPagedPoolUsage;
public nint QuotaPeakNonPagedPoolUsage;
public nint QuotaNonPagedPoolUsage;
public nint PagefileUsage;
public nint PeakPagefileUsage;
public nuint PrivateUsage;
public uint ExecuteFlags;
public fixed char ImageFileName[260];
}
}
26 changes: 26 additions & 0 deletions tracer/src/Datadog.Trace/RuntimeMetrics/RuntimeMetricsWriter.cs
Original file line number Diff line number Diff line change
Expand Up @@ -25,9 +25,13 @@ internal class RuntimeMetricsWriter : IDisposable
private const string ProcessMetrics = $"{MetricsNames.ThreadsCount}, {MetricsNames.CommittedMemory}, {MetricsNames.CpuUserTime}, {MetricsNames.CpuSystemTime}, {MetricsNames.CpuPercentage}";
#endif

private static readonly Version Windows81Version = new(6, 3, 9600);

private static readonly IDatadogLogger Log = DatadogLogging.GetLoggerFor<RuntimeMetricsWriter>();
private static readonly Func<IDogStatsd, TimeSpan, bool, IRuntimeMetricsListener> InitializeListenerFunc = InitializeListener;

private static int _pssConsecutiveFailures;

private readonly Process _process;

private readonly TimeSpan _delay;
Expand Down Expand Up @@ -233,6 +237,28 @@ private void FirstChanceException(object sender, FirstChanceExceptionEventArgs e

private void GetCurrentProcessMetrics(out TimeSpan userProcessorTime, out TimeSpan systemCpuTime, out int threadCount, out long privateMemorySize)
{
if (_pssConsecutiveFailures < 3 && Environment.OSVersion.Platform == PlatformID.Win32NT && Environment.OSVersion.Version > Windows81Version)
{
try
{
ProcessSnapshotRuntimeInformation.GetCurrentProcessMetrics(out userProcessorTime, out systemCpuTime, out threadCount, out privateMemorySize);
_pssConsecutiveFailures = 0;
}
catch
{
_pssConsecutiveFailures += 1;
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We can technically have more than one RuntimeMetricsWriter in version conflict, right? Should we use interlocked here?

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

But then they're different RuntimeMetricsWriter and they all have their own counter

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

True. And we never have more than one instance of the same type, when we replace the tracer instance for example (before the old one has been disposed)?

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Though honestly I can't think of any reason not to use an interlocked here so I guess I can play safe and add one


if (_pssConsecutiveFailures >= 3)
{
Log.Error("Pss failed 3 times in a row, falling back to the Process API");
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Could this be a Warning message instead?

Copy link
Collaborator Author

@kevingosse kevingosse Mar 12, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I made it an error because it's not supposed to ever happen, and I'd like to see it in telemetry logs if it does.

}

throw;
}

return;
}

_process.Refresh();
userProcessorTime = _process.UserProcessorTime;
systemCpuTime = _process.PrivilegedProcessorTime;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -4,10 +4,13 @@
// </copyright>

using System;
using System.Diagnostics;
using System.Linq;
using System.Threading;
using System.Threading.Tasks;
using Datadog.Trace.RuntimeMetrics;
using Datadog.Trace.Vendors.StatsdClient;
using FluentAssertions;
using Moq;
using Xunit;

Expand Down Expand Up @@ -100,6 +103,64 @@ public void ShouldCaptureFirstChanceExceptions()
}
}

[Fact]
public async Task ShouldCaptureProcessMetrics()
{
var statsd = new Mock<IDogStatsd>();
var listener = new Mock<IRuntimeMetricsListener>();

using (new RuntimeMetricsWriter(statsd.Object, TimeSpan.FromSeconds(1), false, (_, _, _) => listener.Object))
{
var expectedNumberOfThreads = Process.GetCurrentProcess().Threads.Count;

var tcs = new TaskCompletionSource<bool>();

double? actualNumberOfThreads = null;
double? userCpuTime = null;
double? kernelCpuTime = null;
double? memoryUsage = null;

statsd.Setup(s => s.Gauge(MetricsNames.ThreadsCount, It.IsAny<double>(), It.IsAny<double>(), It.IsAny<string[]>()))
.Callback<string, double, double, string[]>((_, value, _, _) => actualNumberOfThreads = value);

statsd.Setup(s => s.Gauge(MetricsNames.CommittedMemory, It.IsAny<double>(), It.IsAny<double>(), It.IsAny<string[]>()))
.Callback<string, double, double, string[]>((_, value, _, _) => memoryUsage = value);

statsd.Setup(s => s.Gauge(MetricsNames.CpuUserTime, It.IsAny<double>(), It.IsAny<double>(), It.IsAny<string[]>()))
.Callback<string, double, double, string[]>((_, value, _, _) => userCpuTime = value);

statsd.Setup(s => s.Gauge(MetricsNames.CpuSystemTime, It.IsAny<double>(), It.IsAny<double>(), It.IsAny<string[]>()))
.Callback<string, double, double, string[]>((_, value, _, _) => kernelCpuTime = value);

// CPU percentage is the last pushed event
statsd.Setup(s => s.Gauge(MetricsNames.CpuPercentage, It.IsAny<double>(), It.IsAny<double>(), It.IsAny<string[]>()))
.Callback<string, double, double, string[]>((_, _, _, _) => tcs.TrySetResult(true));

var timeout = Task.Delay(TimeSpan.FromSeconds(30));

await Task.WhenAny(tcs.Task, timeout);

tcs.Task.IsCompleted.Should().BeTrue();

actualNumberOfThreads.Should().NotBeNull();

// To future generations: if 10 is not enough, feel free to bump it up. We're really just checking that the value is "realistic".
actualNumberOfThreads.Should()
.NotBeNull().And.BeGreaterThan(0).And.BeInRange(expectedNumberOfThreads - 10, expectedNumberOfThreads + 10);

// CPU time and memory usage can vary wildly, so don't try too hard to validate
userCpuTime.Should().NotBeNull().And.BeGreaterThan(0);

// Unfortunately we can't guarantee that the process will be eating kernel time, so greater or equal
kernelCpuTime.Should().NotBeNull().And.BeGreaterThanOrEqualTo(0);

// Between 10MB and 100GB seems realistic.
// If in the future the tests runner really get below 10MB, congratulations!
// If it gets above 100GB, God save us all.
memoryUsage.Should().NotBeNull().And.BeInRange(10.0 * 1024 * 1024, 100.0 * 1024 * 1024 * 1024);
}
}

[Fact]
public void CleanupResources()
{
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@ Microsoft.AspNetCore.Routing, Version=2.0.0.0, Culture=neutral, PublicKeyToken=a
Microsoft.AspNetCore.Routing.Abstractions, Version=2.0.0.0, Culture=neutral, PublicKeyToken=adb9793829ddae60
Microsoft.Extensions.Primitives, Version=2.0.0.0, Culture=neutral, PublicKeyToken=adb9793829ddae60
Microsoft.Net.Http.Headers, Version=2.0.0.0, Culture=neutral, PublicKeyToken=adb9793829ddae60
Microsoft.Win32.Primitives, Version=6.0.0.0, Culture=neutral, PublicKeyToken=b03f5f7f11d50a3a
System.Collections, Version=6.0.0.0, Culture=neutral, PublicKeyToken=b03f5f7f11d50a3a
System.Collections.Concurrent, Version=6.0.0.0, Culture=neutral, PublicKeyToken=b03f5f7f11d50a3a
System.Collections.NonGeneric, Version=6.0.0.0, Culture=neutral, PublicKeyToken=b03f5f7f11d50a3a
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@ Microsoft.AspNetCore.Routing, Version=2.0.0.0, Culture=neutral, PublicKeyToken=a
Microsoft.AspNetCore.Routing.Abstractions, Version=2.0.0.0, Culture=neutral, PublicKeyToken=adb9793829ddae60
Microsoft.Extensions.Primitives, Version=2.0.0.0, Culture=neutral, PublicKeyToken=adb9793829ddae60
Microsoft.Net.Http.Headers, Version=2.0.0.0, Culture=neutral, PublicKeyToken=adb9793829ddae60
Microsoft.Win32.Primitives, Version=4.1.2.0, Culture=neutral, PublicKeyToken=b03f5f7f11d50a3a
System.Collections, Version=4.1.2.0, Culture=neutral, PublicKeyToken=b03f5f7f11d50a3a
System.Collections.Concurrent, Version=4.0.15.0, Culture=neutral, PublicKeyToken=b03f5f7f11d50a3a
System.Collections.NonGeneric, Version=4.1.2.0, Culture=neutral, PublicKeyToken=b03f5f7f11d50a3a
Expand Down
Loading