Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion eng/Versions.props
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
<PropertyGroup Label="Version settings">
<MajorVersion>9</MajorVersion>
<MinorVersion>4</MinorVersion>
<PatchVersion>1</PatchVersion>
<PatchVersion>2</PatchVersion>
<PreReleaseVersionLabel>preview</PreReleaseVersionLabel>
<PreReleaseVersionIteration>1</PreReleaseVersionIteration>
<VersionPrefix>$(MajorVersion).$(MinorVersion).$(PatchVersion)</VersionPrefix>
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,11 @@ internal sealed class LinuxUtilizationProvider : ISnapshotProvider
{
private const double One = 1.0;
private const long Hundred = 100L;
private const double CpuLimitThreshold110Percent = 1.1;

// Counters that track how often CPU utilization exceeds the limit thresholds
private readonly Counter<long>? _cpuUtilizationLimit100PercentExceededCounter;
private readonly Counter<long>? _cpuUtilizationLimit110PercentExceededCounter;

private readonly object _cpuLocker = new();
private readonly object _memoryLocker = new();
Expand All @@ -38,6 +43,8 @@ internal sealed class LinuxUtilizationProvider : ISnapshotProvider
private double _memoryPercentage;
private long _previousCgroupCpuTime;
private long _previousHostCpuTime;
private long _cpuUtilizationLimit100PercentExceeded;
private long _cpuUtilizationLimit110PercentExceeded;
public SystemResources Resources { get; }

public LinuxUtilizationProvider(IOptions<ResourceMonitoringOptions> options, ILinuxUtilizationParser parser,
Expand Down Expand Up @@ -77,17 +84,21 @@ public LinuxUtilizationProvider(IOptions<ResourceMonitoringOptions> options, ILi

// Try to get the CPU request from cgroup
cpuRequest = _parser.GetCgroupRequestCpuV2();
_ = meter.CreateObservableGauge(name: ResourceUtilizationInstruments.ContainerCpuLimitUtilization, observeValue: () => CpuUtilizationWithoutHostDelta() / cpuLimit, unit: "1");

// Initialize the counters
_cpuUtilizationLimit100PercentExceededCounter = meter.CreateCounter<long>("cpu_utilization_limit_100_percent_exceeded");
_cpuUtilizationLimit110PercentExceededCounter = meter.CreateCounter<long>("cpu_utilization_limit_110_percent_exceeded");
_ = meter.CreateObservableGauge(name: ResourceUtilizationInstruments.ContainerCpuLimitUtilization, observeValue: () => CpuUtilizationLimit(cpuLimit), unit: "1");
_ = meter.CreateObservableGauge(name: ResourceUtilizationInstruments.ContainerCpuRequestUtilization, observeValue: () => CpuUtilizationWithoutHostDelta() / cpuRequest, unit: "1");
}
else
{
_ = meter.CreateObservableGauge(name: ResourceUtilizationInstruments.ContainerCpuLimitUtilization, observeValue: () => CpuUtilization() * _scaleRelativeToCpuLimit, unit: "1");
_ = meter.CreateObservableGauge(name: ResourceUtilizationInstruments.ContainerCpuRequestUtilization, observeValue: () => CpuUtilization() * _scaleRelativeToCpuRequest, unit: "1");
_ = meter.CreateObservableGauge(name: ResourceUtilizationInstruments.ProcessCpuUtilization, observeValue: () => CpuUtilization() * _scaleRelativeToCpuRequest, unit: "1");
}

_ = meter.CreateObservableGauge(name: ResourceUtilizationInstruments.ContainerMemoryLimitUtilization, observeValue: MemoryUtilization, unit: "1");
_ = meter.CreateObservableGauge(name: ResourceUtilizationInstruments.ProcessCpuUtilization, observeValue: () => CpuUtilization() * _scaleRelativeToCpuRequest, unit: "1");
_ = meter.CreateObservableGauge(name: ResourceUtilizationInstruments.ProcessMemoryUtilization, observeValue: MemoryUtilization, unit: "1");

// cpuRequest is a CPU request (aka guaranteed number of CPU units) for pod; for host it's 1 core
Expand Down Expand Up @@ -138,6 +149,34 @@ public double CpuUtilizationWithoutHostDelta()
return _lastCpuCoresUsed;
}

/// <summary>
/// Reports CPU usage as a ratio of the supplied CPU limit and records every reading that exceeds the limit.
/// </summary>
/// <remarks>
/// A ratio above 1.0 adds one to the 100% counter, bumps its running total and logs that total.
/// A ratio that is also above <c>CpuLimitThreshold110Percent</c> does the same for the 110% counter.
/// </remarks>
/// <param name="cpuLimit">The CPU limit that the current usage is divided by.</param>
/// <returns>The value of <c>CpuUtilizationWithoutHostDelta()</c> divided by <paramref name="cpuLimit"/>.</returns>
public double CpuUtilizationLimit(float cpuLimit)
{
    double ratio = CpuUtilizationWithoutHostDelta() / cpuLimit;

    if (ratio > 1.0)
    {
        // Over 100% of the limit: publish to the metric, keep the local tally, and log it.
        _cpuUtilizationLimit100PercentExceededCounter?.Add(1);
        _cpuUtilizationLimit100PercentExceeded += 1;
        Log.CounterMessage100(_logger, _cpuUtilizationLimit100PercentExceeded);

        // The 110% threshold is higher than 100%, so it can only be crossed inside this branch.
        if (ratio > CpuLimitThreshold110Percent)
        {
            _cpuUtilizationLimit110PercentExceededCounter?.Add(1);
            _cpuUtilizationLimit110PercentExceeded += 1;
            Log.CounterMessage110(_logger, _cpuUtilizationLimit110PercentExceeded);
        }
    }

    return ratio;
}

public double CpuUtilization()
{
DateTimeOffset now = _timeProvider.GetUtcNow();
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -44,4 +44,16 @@ public static partial void CpuUsageDataV2(
long previousCgroupCpuTime,
double actualElapsedNanoseconds,
double cpuCores);

/// <summary>
/// Writes a debug-level message reporting the supplied counter value for CPU utilization above 100%.
/// </summary>
/// <param name="logger">The logger that receives the message.</param>
/// <param name="counterValue">The value substituted for <c>{counterValue}</c> in the message template.</param>
[LoggerMessage(5, LogLevel.Debug, "CPU utilization exceeded 100%: Counter = {counterValue}")]
public static partial void CounterMessage100(ILogger logger, long counterValue);

/// <summary>
/// Writes a debug-level message reporting the supplied counter value for CPU utilization above 110%.
/// </summary>
/// <param name="logger">The logger that receives the message.</param>
/// <param name="counterValue">The value substituted for <c>{counterValue}</c> in the message template.</param>
[LoggerMessage(6, LogLevel.Debug, "CPU utilization exceeded 110%: Counter = {counterValue}")]
public static partial void CounterMessage110(ILogger logger, long counterValue);
}
16 changes: 16 additions & 0 deletions src/Shared/Instruments/ResourceUtilizationInstruments.cs
Original file line number Diff line number Diff line change
Expand Up @@ -65,6 +65,22 @@ internal static class ResourceUtilizationInstruments
/// The type of an instrument is <see cref="System.Diagnostics.Metrics.ObservableUpDownCounter{T}"/>.
/// </remarks>
public const string SystemNetworkConnections = "system.network.connections";

/// <summary>
/// The name of an instrument to count occurrences when CPU utilization exceeds 100% of the limit.
/// </summary>
/// <remarks>
/// The type of an instrument is <see cref="System.Diagnostics.Metrics.Counter{T}"/>.
/// </remarks>
public const string CpuUtilizationLimit100PercentExceeded = "cpu.utilization.limit.100percent.exceeded";

/// <summary>
/// The name of an instrument to count occurrences when CPU utilization exceeds 110% of the limit.
/// </summary>
/// <remarks>
/// The type of an instrument is <see cref="System.Diagnostics.Metrics.Counter{T}"/>.
/// </remarks>
public const string CpuUtilizationLimit110PercentExceeded = "cpu.utilization.limit.110percent.exceeded";
}

#pragma warning disable CS1574
Original file line number Diff line number Diff line change
Expand Up @@ -258,7 +258,7 @@ public void Provider_Registers_Instruments_CgroupV2_WithoutHostCpu()
listener.Start();
listener.RecordObservableInstruments();

Assert.Equal(5, samples.Count);
Assert.Equal(4, samples.Count);

Assert.Contains(samples, x => x.instrument.Name == ResourceUtilizationInstruments.ContainerCpuLimitUtilization);
Assert.True(double.IsNaN(samples.Single(i => i.instrument.Name == ResourceUtilizationInstruments.ContainerCpuLimitUtilization).value));
Expand All @@ -269,9 +269,6 @@ public void Provider_Registers_Instruments_CgroupV2_WithoutHostCpu()
Assert.Contains(samples, x => x.instrument.Name == ResourceUtilizationInstruments.ContainerMemoryLimitUtilization);
Assert.Equal(1, samples.Single(i => i.instrument.Name == ResourceUtilizationInstruments.ContainerMemoryLimitUtilization).value);

Assert.Contains(samples, x => x.instrument.Name == ResourceUtilizationInstruments.ProcessCpuUtilization);
Assert.True(double.IsNaN(samples.Single(i => i.instrument.Name == ResourceUtilizationInstruments.ProcessCpuUtilization).value));

Assert.Contains(samples, x => x.instrument.Name == ResourceUtilizationInstruments.ProcessMemoryUtilization);
Assert.Equal(1, samples.Single(i => i.instrument.Name == ResourceUtilizationInstruments.ProcessMemoryUtilization).value);
}
Expand Down
Loading