diff --git a/src/Libraries/Microsoft.Extensions.Diagnostics.ResourceMonitoring/Linux/LinuxUtilizationProvider.cs b/src/Libraries/Microsoft.Extensions.Diagnostics.ResourceMonitoring/Linux/LinuxUtilizationProvider.cs index 4090bbb5619..af13b8100ba 100644 --- a/src/Libraries/Microsoft.Extensions.Diagnostics.ResourceMonitoring/Linux/LinuxUtilizationProvider.cs +++ b/src/Libraries/Microsoft.Extensions.Diagnostics.ResourceMonitoring/Linux/LinuxUtilizationProvider.cs @@ -2,7 +2,10 @@ // The .NET Foundation licenses this file to you under the MIT license. using System; +using System.Collections.Generic; using System.Diagnostics.Metrics; +using System.Linq; +using System.Threading; using Microsoft.Extensions.Logging; using Microsoft.Extensions.Logging.Abstractions; using Microsoft.Extensions.Options; @@ -33,6 +36,10 @@ internal sealed class LinuxUtilizationProvider : ISnapshotProvider private readonly double _scaleRelativeToCpuRequest; private readonly double _scaleRelativeToCpuRequestForTrackerApi; + private readonly TimeSpan _retryInterval = TimeSpan.FromMinutes(5); + private DateTimeOffset _lastFailure = DateTimeOffset.MinValue; + private int _measurementsUnavailable; + private DateTimeOffset _refreshAfterCpu; private DateTimeOffset _refreshAfterMemory; @@ -94,18 +101,44 @@ public LinuxUtilizationProvider(IOptions options, ILi // Initialize the counters _cpuUtilizationLimit100PercentExceededCounter = meter.CreateCounter("cpu_utilization_limit_100_percent_exceeded"); _cpuUtilizationLimit110PercentExceededCounter = meter.CreateCounter("cpu_utilization_limit_110_percent_exceeded"); - _ = meter.CreateObservableGauge(name: ResourceUtilizationInstruments.ContainerCpuLimitUtilization, observeValue: () => CpuUtilizationLimit(cpuLimit), unit: "1"); - _ = meter.CreateObservableGauge(name: ResourceUtilizationInstruments.ContainerCpuRequestUtilization, observeValue: () => CpuUtilizationWithoutHostDelta() / cpuRequest, unit: "1"); + + _ = meter.CreateObservableGauge( + 
ResourceUtilizationInstruments.ContainerCpuLimitUtilization, + () => GetMeasurementWithRetry(() => CpuUtilizationLimit(cpuLimit)), + "1"); + + _ = meter.CreateObservableGauge( + name: ResourceUtilizationInstruments.ContainerCpuRequestUtilization, + observeValues: () => GetMeasurementWithRetry(() => CpuUtilizationWithoutHostDelta() / cpuRequest), + unit: "1"); } else { - _ = meter.CreateObservableGauge(name: ResourceUtilizationInstruments.ContainerCpuLimitUtilization, observeValue: () => CpuUtilization() * _scaleRelativeToCpuLimit, unit: "1"); - _ = meter.CreateObservableGauge(name: ResourceUtilizationInstruments.ContainerCpuRequestUtilization, observeValue: () => CpuUtilization() * _scaleRelativeToCpuRequest, unit: "1"); - _ = meter.CreateObservableGauge(name: ResourceUtilizationInstruments.ProcessCpuUtilization, observeValue: () => CpuUtilization() * _scaleRelativeToCpuRequest, unit: "1"); + _ = meter.CreateObservableGauge( + name: ResourceUtilizationInstruments.ContainerCpuLimitUtilization, + observeValues: () => GetMeasurementWithRetry(() => CpuUtilization() * _scaleRelativeToCpuLimit), + unit: "1"); + + _ = meter.CreateObservableGauge( + name: ResourceUtilizationInstruments.ContainerCpuRequestUtilization, + observeValues: () => GetMeasurementWithRetry(() => CpuUtilization() * _scaleRelativeToCpuRequest), + unit: "1"); + + _ = meter.CreateObservableGauge( + name: ResourceUtilizationInstruments.ProcessCpuUtilization, + observeValues: () => GetMeasurementWithRetry(() => CpuUtilization() * _scaleRelativeToCpuRequest), + unit: "1"); } - _ = meter.CreateObservableGauge(name: ResourceUtilizationInstruments.ContainerMemoryLimitUtilization, observeValue: MemoryUtilization, unit: "1"); - _ = meter.CreateObservableGauge(name: ResourceUtilizationInstruments.ProcessMemoryUtilization, observeValue: MemoryUtilization, unit: "1"); + _ = meter.CreateObservableGauge( + name: ResourceUtilizationInstruments.ContainerMemoryLimitUtilization, + observeValues: () => 
GetMeasurementWithRetry(() => MemoryUtilization()),
+            unit: "1");
+
+        _ = meter.CreateObservableGauge(
+            name: ResourceUtilizationInstruments.ProcessMemoryUtilization,
+            observeValues: () => GetMeasurementWithRetry(() => MemoryUtilization()),
+            unit: "1");
 
         // cpuRequest is a CPU request (aka guaranteed number of CPU units) for pod, for host its 1 core
         // cpuLimit is a CPU limit (aka max CPU units available) for a pod or for a host.
@@ -288,4 +321,52 @@ public Snapshot GetSnapshot()
             userTimeSinceStart: TimeSpan.FromTicks((long)(cgroupTime / Hundred * _scaleRelativeToCpuRequestForTrackerApi)),
             memoryUsageInBytes: memoryUsed);
     }
+
+    /// <summary>
+    /// Invokes <paramref name="func"/> to produce one gauge measurement, and stops invoking it for
+    /// <see cref="_retryInterval"/> after it fails because a file is missing or inaccessible.
+    /// </summary>
+    /// <param name="func">Delegate that computes the current metric value.</param>
+    /// <returns>
+    /// A single measurement on success; an empty sequence while measurements are marked unavailable
+    /// or when <paramref name="func"/> throws one of the handled exceptions.
+    /// </returns>
+    /// <remarks>
+    /// The unavailable flag and the failure timestamp are instance-wide, so a handled failure in one
+    /// gauge pauses every gauge routed through this method. Any other exception type propagates to
+    /// the caller and does not start the pause.
+    /// </remarks>
+    private IEnumerable<Measurement<double>> GetMeasurementWithRetry(Func<double> func)
+    {
+        // A handled failure happened less than one retry interval ago: skip the read entirely.
+        if (Volatile.Read(ref _measurementsUnavailable) == 1 &&
+            _timeProvider.GetUtcNow() - _lastFailure < _retryInterval)
+        {
+            return Enumerable.Empty<Measurement<double>>();
+        }
+
+        try
+        {
+            double result = func();
+
+            // The read succeeded; clear the flag if an earlier failure had set it.
+            if (Volatile.Read(ref _measurementsUnavailable) == 1)
+            {
+                _ = Interlocked.Exchange(ref _measurementsUnavailable, 0);
+            }
+
+            return new[] { new Measurement<double>(result) };
+        }
+        catch (Exception ex) when (
+            ex is System.IO.FileNotFoundException ||
+            ex is System.IO.DirectoryNotFoundException ||
+            ex is System.UnauthorizedAccessException)
+        {
+            // Remember when the failure happened, then mark measurements unavailable.
+            _lastFailure = _timeProvider.GetUtcNow();
+            _ = Interlocked.Exchange(ref _measurementsUnavailable, 1);
+
+            return Enumerable.Empty<Measurement<double>>();
+        }
+    }
 }
diff --git a/test/Libraries/Microsoft.Extensions.Diagnostics.ResourceMonitoring.Tests/Linux/LinuxUtilizationProviderTests.cs b/test/Libraries/Microsoft.Extensions.Diagnostics.ResourceMonitoring.Tests/Linux/LinuxUtilizationProviderTests.cs
index e6e9a282eca..6ee3c40d44d 100644
--- a/test/Libraries/Microsoft.Extensions.Diagnostics.ResourceMonitoring.Tests/Linux/LinuxUtilizationProviderTests.cs
+++ b/test/Libraries/Microsoft.Extensions.Diagnostics.ResourceMonitoring.Tests/Linux/LinuxUtilizationProviderTests.cs
@@ -9,6 +9,7 @@
 using System.Threading.Tasks;
 using 
Microsoft.Extensions.Diagnostics.ResourceMonitoring.Test.Helpers;
 using Microsoft.Extensions.Logging.Testing;
+using Microsoft.Extensions.Time.Testing;
 using Microsoft.Shared.Instruments;
 using Microsoft.TestUtilities;
 using Moq;
@@ -272,4 +273,145 @@ public void Provider_Registers_Instruments_CgroupV2_WithoutHostCpu()
         Assert.Contains(samples, x => x.instrument.Name == ResourceUtilizationInstruments.ProcessMemoryUtilization);
         Assert.Equal(1, samples.Single(i => i.instrument.Name == ResourceUtilizationInstruments.ProcessMemoryUtilization).value);
     }
+
+    // A handled read failure must suppress measurements for the retry interval and recover afterwards.
+    [Fact]
+    public void Provider_GetMeasurementWithRetry_HandlesExceptionAndRecovers()
+    {
+        var logger = new FakeLogger<LinuxUtilizationProvider>();
+        var options = Options.Options.Create(new ResourceMonitoringOptions());
+        using var meter = new Meter(nameof(Provider_GetMeasurementWithRetry_HandlesExceptionAndRecovers));
+        var meterFactoryMock = new Mock<IMeterFactory>();
+        meterFactoryMock.Setup(x => x.Create(It.IsAny<MeterOptions>()))
+            .Returns(meter);
+
+        // The first memory read fails with a handled exception; every later read succeeds.
+        var callCount = 0;
+        var parserMock = new Mock<ILinuxUtilizationParser>();
+        parserMock.Setup(p => p.GetMemoryUsageInBytes()).Returns(() =>
+        {
+            callCount++;
+            if (callCount <= 1)
+            {
+                throw new FileNotFoundException("Simulated failure to read file");
+            }
+
+            return 420UL;
+        });
+        parserMock.Setup(p => p.GetAvailableMemoryInBytes()).Returns(1000UL);
+        parserMock.Setup(p => p.GetCgroupRequestCpu()).Returns(10f);
+        parserMock.Setup(p => p.GetCgroupLimitedCpus()).Returns(12f);
+
+        var fakeTime = new FakeTimeProvider(DateTimeOffset.UtcNow);
+
+        // Constructing the provider registers its gauges on the meter; the instance itself is not used.
+        _ = new LinuxUtilizationProvider(options, parserMock.Object, meterFactoryMock.Object, logger, fakeTime);
+
+        using var listener = new MeterListener
+        {
+            InstrumentPublished = (instrument, listener) =>
+            {
+                if (ReferenceEquals(meter, instrument.Meter))
+                {
+                    listener.EnableMeasurementEvents(instrument);
+                }
+            }
+        };
+
+        var samples = new List<(Instrument instrument, double value)>();
+        listener.SetMeasurementEventCallback<double>((instrument, value, _, _) =>
+        {
+            if (ReferenceEquals(meter, instrument.Meter))
+            {
+                samples.Add((instrument, value));
+            }
+        });
+
+        listener.Start();
+
+        // First collection: the read throws, so no memory sample is published.
+        listener.RecordObservableInstruments();
+        Assert.DoesNotContain(samples, x => x.instrument.Name == ResourceUtilizationInstruments.ProcessMemoryUtilization);
+
+        // One minute later is still inside the retry interval, so nothing is published.
+        fakeTime.Advance(TimeSpan.FromMinutes(1));
+        listener.RecordObservableInstruments();
+        Assert.DoesNotContain(samples, x => x.instrument.Name == ResourceUtilizationInstruments.ProcessMemoryUtilization);
+
+        // Six minutes after the failure the retry interval has elapsed and the read succeeds.
+        fakeTime.Advance(TimeSpan.FromMinutes(5));
+        listener.RecordObservableInstruments();
+        var metric = samples.Single(x => x.instrument.Name == ResourceUtilizationInstruments.ProcessMemoryUtilization);
+        Assert.Equal(0.42, metric.value);
+
+        parserMock.Verify(p => p.GetMemoryUsageInBytes(), Times.Exactly(2));
+    }
+
+    // An exception type that is not handled must propagate and must not start the retry pause.
+    [Fact]
+    public void Provider_GetMeasurementWithRetry_UnhandledException_DoesNotBlockFutureReads()
+    {
+        var logger = new FakeLogger<LinuxUtilizationProvider>();
+        var options = Options.Options.Create(new ResourceMonitoringOptions());
+        using var meter = new Meter(nameof(Provider_GetMeasurementWithRetry_UnhandledException_DoesNotBlockFutureReads));
+        var meterFactoryMock = new Mock<IMeterFactory>();
+        meterFactoryMock.Setup(x => x.Create(It.IsAny<MeterOptions>()))
+            .Returns(meter);
+
+        // The first two memory reads throw an exception type the provider does not handle.
+        var callCount = 0;
+        var parserMock = new Mock<ILinuxUtilizationParser>();
+        parserMock.Setup(p => p.GetMemoryUsageInBytes()).Returns(() =>
+        {
+            callCount++;
+            if (callCount <= 2)
+            {
+                throw new InvalidOperationException("Simulated unhandled exception");
+            }
+
+            return 1234UL;
+        });
+        parserMock.Setup(p => p.GetAvailableMemoryInBytes()).Returns(2000UL);
+        parserMock.Setup(p => p.GetCgroupRequestCpu()).Returns(10f);
+        parserMock.Setup(p => p.GetCgroupLimitedCpus()).Returns(12f);
+
+        var fakeTime = new FakeTimeProvider(DateTimeOffset.UtcNow);
+
+        // Constructing the provider registers its gauges on the meter; the instance itself is not used.
+        _ = new LinuxUtilizationProvider(options, parserMock.Object, meterFactoryMock.Object, logger, fakeTime);
+
+        using var listener = new MeterListener
+        {
+            InstrumentPublished = (instrument, listener) =>
+            {
+                if (ReferenceEquals(meter, instrument.Meter))
+                {
+                    listener.EnableMeasurementEvents(instrument);
+                }
+            }
+        };
+
+        var samples = new List<(Instrument instrument, double value)>();
+        listener.SetMeasurementEventCallback<double>((instrument, value, _, _) =>
+        {
+            if (ReferenceEquals(meter, instrument.Meter))
+            {
+                samples.Add((instrument, value));
+            }
+        });
+
+        listener.Start();
+
+        // The unhandled exception escapes the gauge callbacks and surfaces from the listener.
+        Assert.Throws<AggregateException>(() => listener.RecordObservableInstruments());
+        Assert.DoesNotContain(samples, x => x.instrument.Name == ResourceUtilizationInstruments.ProcessMemoryUtilization);
+
+        // Only one minute has passed, yet the next collection reads again and publishes a value.
+        fakeTime.Advance(TimeSpan.FromMinutes(1));
+        listener.RecordObservableInstruments();
+        var metric = samples.Single(x => x.instrument.Name == ResourceUtilizationInstruments.ProcessMemoryUtilization);
+        Assert.Equal(1234d / 2000d, metric.value, 0.01);
+
+        parserMock.Verify(p => p.GetMemoryUsageInBytes(), Times.Exactly(3));
+    }
 }