Skip to content

Commit

Permalink
Client Telemetry: Adds Thread Starvation Information (#3004)
Browse files Browse the repository at this point in the history
Adding 2 new Thread Starvation metrics in client telemetry:

1. Metrics Name: SystemPool_ThreadWaitInterval
    Metrics Unit: MilliSecond
It reports how much time thread creation is taking.

2. Metrics Name: SystemPool_IsThreadStarving_True
    Metrics Unit: Count
It reports how many times thread starvation was detected. No histogram calculation is involved.
  • Loading branch information
sourabh1007 committed Jan 31, 2022
1 parent 2e2a9cd commit 5719a0c
Show file tree
Hide file tree
Showing 6 changed files with 95 additions and 21 deletions.
2 changes: 2 additions & 0 deletions Microsoft.Azure.Cosmos/src/Telemetry/ClientTelemetryHelper.cs
Original file line number Diff line number Diff line change
Expand Up @@ -109,6 +109,8 @@ internal static void RecordSystemUsage(
systemInfoCollection.Add(TelemetrySystemUsage.GetCpuInfo(systemUsageHistory.Values));
systemInfoCollection.Add(TelemetrySystemUsage.GetMemoryRemainingInfo(systemUsageHistory.Values));
systemInfoCollection.Add(TelemetrySystemUsage.GetAvailableThreadsInfo(systemUsageHistory.Values));
systemInfoCollection.Add(TelemetrySystemUsage.GetThreadWaitIntervalInMs(systemUsageHistory.Values));
systemInfoCollection.Add(TelemetrySystemUsage.GetThreadStarvationSignalCount(systemUsageHistory.Values));
}

/// <summary>
Expand Down
12 changes: 11 additions & 1 deletion Microsoft.Azure.Cosmos/src/Telemetry/ClientTelemetryOptions.cs
Original file line number Diff line number Diff line change
Expand Up @@ -54,7 +54,17 @@ internal static class ClientTelemetryOptions
internal const long AvailableThreadsMin = 1;
internal const int AvailableThreadsPrecision = 2;
internal const String AvailableThreadsName = "SystemPool_AvailableThreads";
internal const String AvailableThreadsUnit = "ThreadCount";
internal const String AvailableThreadsUnit = "ThreadCount";

// Thread wait interval histogram bounds are expressed in ticks: minimum of 1 tick, maximum of 1 second (TimeSpan.TicksPerSecond)
internal const long ThreadWaitIntervalInMsMax = TimeSpan.TicksPerSecond;
internal const long ThreadWaitIntervalInMsMin = 1;
internal const int ThreadWaitIntervalInMsPrecision = 2;
internal const string ThreadWaitIntervalInMsName = "SystemPool_ThreadWaitInterval";
internal const string ThreadWaitIntervalInMsUnit = "MilliSecond";

internal const string IsThreadStarvingName = "SystemPool_IsThreadStarving_True";
internal const string IsThreadStarvingUnit = "Count";

internal const string DefaultVmMetadataUrL = "http://169.254.169.254/metadata/instance?api-version=2020-06-01";
internal const double DefaultTimeStampInSeconds = 600;
Expand Down
10 changes: 5 additions & 5 deletions Microsoft.Azure.Cosmos/src/Telemetry/MetricInfo.cs
Original file line number Diff line number Diff line change
Expand Up @@ -21,11 +21,11 @@ internal MetricInfo(string metricsName, string unitName)

public MetricInfo(string metricsName,
string unitName,
double mean,
long count,
long min,
long max,
IReadOnlyDictionary<double, double> percentiles)
double mean = 0,
long count = 0,
long min = 0,
long max = 0,
IReadOnlyDictionary<double, double> percentiles = null)
: this(metricsName, unitName)
{
this.Mean = mean;
Expand Down
5 changes: 5 additions & 0 deletions Microsoft.Azure.Cosmos/src/Telemetry/SystemInfo.cs
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,11 @@ internal SystemInfo(string metricsName, string unitName)
this.MetricInfo = new MetricInfo(metricsName, unitName);
}

/// <summary>
/// Creates a SystemInfo for a count-only metric: only the name, unit and count are supplied,
/// and the remaining MetricInfo values are left at that constructor's defaults.
/// </summary>
/// <param name="metricsName">Name of the metric.</param>
/// <param name="unitName">Unit of the metric.</param>
/// <param name="count">Value stored as the metric's count.</param>
internal SystemInfo(string metricsName, string unitName, int count)
{
    MetricInfo countOnlyMetric = new MetricInfo(metricsName, unitName, count: count);
    this.MetricInfo = countOnlyMetric;
}

public SystemInfo(MetricInfo metricInfo)
{
this.MetricInfo = metricInfo;
Expand Down
60 changes: 57 additions & 3 deletions Microsoft.Azure.Cosmos/src/Telemetry/TelemetrySystemUsage.cs
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@

namespace Microsoft.Azure.Cosmos.Telemetry
{
using System;
using System.Collections.Generic;
using HdrHistogram;
using Microsoft.Azure.Documents.Rntbd;
Expand Down Expand Up @@ -35,7 +36,7 @@ public static SystemInfo GetCpuInfo(IReadOnlyCollection<SystemUsageLoad> systemU
long? infoToRecord = (long?)load.CpuUsage * ClientTelemetryOptions.HistogramPrecisionFactor;
if (infoToRecord.HasValue)
{
histogram.RecordValue((long)infoToRecord);
histogram.RecordValue(infoToRecord.Value);
}
}

Expand Down Expand Up @@ -64,7 +65,7 @@ public static SystemInfo GetMemoryRemainingInfo(IReadOnlyCollection<SystemUsageL
long? infoToRecord = (long?)load.MemoryAvailable;
if (infoToRecord.HasValue)
{
histogram.RecordValue((long)infoToRecord);
histogram.RecordValue(infoToRecord.Value);
}
}

Expand Down Expand Up @@ -93,7 +94,7 @@ public static SystemInfo GetAvailableThreadsInfo(IReadOnlyCollection<SystemUsage
long? infoToRecord = (long?)load.ThreadInfo?.AvailableThreads;
if (infoToRecord.HasValue)
{
histogram.RecordValue((long)infoToRecord);
histogram.RecordValue(infoToRecord.Value);
}
}

Expand All @@ -105,5 +106,58 @@ public static SystemInfo GetAvailableThreadsInfo(IReadOnlyCollection<SystemUsage
return systemInfo;
}

/// <summary>
/// Counts how many of the given system usage samples have the thread starvation flag set.
/// Only a plain count is reported; no histogram is built for this metric.
/// </summary>
/// <param name="systemUsageCollection">System usage samples to inspect.</param>
/// <returns>SystemInfo whose metric count is the number of samples flagged as thread starving.</returns>
public static SystemInfo GetThreadStarvationSignalCount(IReadOnlyCollection<SystemUsageLoad> systemUsageCollection)
{
    int starvationSignals = 0;
    foreach (SystemUsageLoad sample in systemUsageCollection)
    {
        // Samples without thread info, or whose flag is not set to true, are not counted.
        if (sample.ThreadInfo?.IsThreadStarving == true)
        {
            starvationSignals++;
        }
    }

    return new SystemInfo(
        metricsName: ClientTelemetryOptions.IsThreadStarvingName,
        unitName: ClientTelemetryOptions.IsThreadStarvingUnit,
        count: starvationSignals);
}

/// <summary>
/// Collects the thread wait interval samples (reported in milliseconds) and aggregates them using a histogram.
/// Each sample is converted to ticks before being recorded; samples outside the histogram's trackable
/// range are saturated to the nearest bound instead of being passed through unchanged.
/// </summary>
/// <param name="systemUsageCollection">System usage samples to aggregate.</param>
/// <returns>
/// SystemInfo for the thread wait interval metric; aggregated values are only set when at least one
/// sample was recorded.
/// </returns>
public static SystemInfo GetThreadWaitIntervalInMs(IReadOnlyCollection<SystemUsageLoad> systemUsageCollection)
{
    LongConcurrentHistogram histogram = new LongConcurrentHistogram(ClientTelemetryOptions.ThreadWaitIntervalInMsMin,
        ClientTelemetryOptions.ThreadWaitIntervalInMsMax,
        ClientTelemetryOptions.ThreadWaitIntervalInMsPrecision);

    SystemInfo systemInfo = new SystemInfo(ClientTelemetryOptions.ThreadWaitIntervalInMsName, ClientTelemetryOptions.ThreadWaitIntervalInMsUnit);
    foreach (SystemUsageLoad load in systemUsageCollection)
    {
        double? infoToRecord = load.ThreadInfo?.ThreadWaitIntervalInMs;
        if (infoToRecord.HasValue)
        {
            long waitIntervalInTicks = TimeSpan.FromMilliseconds(infoToRecord.Value).Ticks;

            // The histogram cannot record negative values or values above its configured maximum.
            // A wait longer than the maximum is exactly the starvation case this metric is meant to
            // surface, so saturate into [0, max] rather than letting RecordValue fail and abort the
            // whole aggregation.
            long recordableTicks = Math.Min(
                Math.Max(waitIntervalInTicks, 0L),
                ClientTelemetryOptions.ThreadWaitIntervalInMsMax);

            histogram.RecordValue(recordableTicks);
        }
    }

    if (histogram.TotalCount > 0)
    {
        systemInfo.SetAggregators(histogram, ClientTelemetryOptions.TicksToMsFactor);
    }

    return systemInfo;
}
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -22,8 +22,6 @@ namespace Microsoft.Azure.Cosmos.SDK.EmulatorTests
using Newtonsoft.Json;
using Documents.Rntbd;
using System.Globalization;
using System.Linq;
using Microsoft.VisualBasic;

[TestClass]
public class ClientTelemetryTests : BaseCosmosClientHelper
Expand Down Expand Up @@ -765,7 +763,9 @@ private static void AssertSystemLevelInformation(List<SystemInfo> actualSystemIn
{
{ ClientTelemetryOptions.CpuName, ClientTelemetryOptions.CpuUnit },
{ ClientTelemetryOptions.MemoryName, ClientTelemetryOptions.MemoryUnit },
{ ClientTelemetryOptions.AvailableThreadsName, ClientTelemetryOptions.AvailableThreadsUnit }
{ ClientTelemetryOptions.AvailableThreadsName, ClientTelemetryOptions.AvailableThreadsUnit },
{ ClientTelemetryOptions.IsThreadStarvingName, ClientTelemetryOptions.IsThreadStarvingUnit },
{ ClientTelemetryOptions.ThreadWaitIntervalInMsName, ClientTelemetryOptions.ThreadWaitIntervalInMsUnit }
};

Dictionary<string, string> actualMetricNameUnitMap = new Dictionary<string, string>();
Expand All @@ -781,16 +781,19 @@ private static void AssertSystemLevelInformation(List<SystemInfo> actualSystemIn
Assert.AreEqual(systemInfo.MetricInfo.UnitName, actualMetricNameUnitMap[systemInfo.MetricInfo.MetricsName]);
}

Assert.IsTrue(systemInfo.MetricInfo.Count > 0, "MetricInfo Count is not greater than 0");
Assert.IsNotNull(systemInfo.MetricInfo.Percentiles, "Percentiles is null");
Assert.IsTrue(systemInfo.MetricInfo.Mean >= 0, "MetricInfo Mean is not greater than or equal to 0");
Assert.IsTrue(systemInfo.MetricInfo.Max >= 0, "MetricInfo Max is not greater than or equal to 0");
Assert.IsTrue(systemInfo.MetricInfo.Min >= 0, "MetricInfo Min is not greater than or equal to 0");
if(!systemInfo.MetricInfo.MetricsName.Equals(ClientTelemetryOptions.IsThreadStarvingName))
{
Assert.IsTrue(systemInfo.MetricInfo.Count > 0, $"MetricInfo ({systemInfo.MetricInfo.MetricsName}) Count is not greater than 0");
Assert.IsNotNull(systemInfo.MetricInfo.Percentiles, $"Percentiles is null for metrics ({systemInfo.MetricInfo.MetricsName})");
}
Assert.IsTrue(systemInfo.MetricInfo.Mean >= 0, $"MetricInfo ({systemInfo.MetricInfo.MetricsName}) Mean is not greater than or equal to 0");
Assert.IsTrue(systemInfo.MetricInfo.Max >= 0, $"MetricInfo ({systemInfo.MetricInfo.MetricsName}) Max is not greater than or equal to 0");
Assert.IsTrue(systemInfo.MetricInfo.Min >= 0, $"MetricInfo ({systemInfo.MetricInfo.MetricsName}) Min is not greater than or equal to 0");
if (systemInfo.MetricInfo.MetricsName.Equals(ClientTelemetryOptions.CpuName))
{
Assert.IsTrue(systemInfo.MetricInfo.Mean <= 100, "MetricInfo Mean is not greater than 100 for CPU Usage");
Assert.IsTrue(systemInfo.MetricInfo.Max <= 100, "MetricInfo Max is not greater than 100 for CPU Usage");
Assert.IsTrue(systemInfo.MetricInfo.Min <= 100, "MetricInfo Min is not greater than 100 for CPU Usage");
Assert.IsTrue(systemInfo.MetricInfo.Mean <= 100, $"MetricInfo ({systemInfo.MetricInfo.MetricsName}) Mean is not greater than 100 for CPU Usage");
Assert.IsTrue(systemInfo.MetricInfo.Max <= 100, $"MetricInfo ({systemInfo.MetricInfo.MetricsName}) Max is not greater than 100 for CPU Usage");
Assert.IsTrue(systemInfo.MetricInfo.Min <= 100, $"MetricInfo ({systemInfo.MetricInfo.MetricsName}) Min is not greater than 100 for CPU Usage");
};
}

Expand Down Expand Up @@ -851,7 +854,7 @@ private static void AssertAccountLevelInformation(List<ClientTelemetryProperties
actualOperationList.AddRange(telemetryInfo.OperationInfo);
actualSystemInformation.AddRange(telemetryInfo.SystemInfo);

Assert.AreEqual(3, telemetryInfo.SystemInfo.Count, $"System Information Count doesn't Match; {JsonConvert.SerializeObject(telemetryInfo.SystemInfo)}");
Assert.AreEqual(5, telemetryInfo.SystemInfo.Count, $"System Information Count doesn't Match; {JsonConvert.SerializeObject(telemetryInfo.SystemInfo)}");

Assert.IsNotNull(telemetryInfo.GlobalDatabaseAccountName, "GlobalDatabaseAccountName is null");
Assert.IsNotNull(telemetryInfo.DateTimeUtc, "Timestamp is null");
Expand Down

0 comments on commit 5719a0c

Please sign in to comment.