Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Client Telemetry: Adds Thread Starvation Information #3004

Merged
merged 6 commits into from
Jan 31, 2022
Merged
Show file tree
Hide file tree
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -109,6 +109,8 @@ internal static void RecordSystemUsage(
systemInfoCollection.Add(TelemetrySystemUsage.GetCpuInfo(systemUsageHistory.Values));
systemInfoCollection.Add(TelemetrySystemUsage.GetMemoryRemainingInfo(systemUsageHistory.Values));
systemInfoCollection.Add(TelemetrySystemUsage.GetAvailableThreadsInfo(systemUsageHistory.Values));
systemInfoCollection.Add(TelemetrySystemUsage.GetThreadWaitIntervalInMs(systemUsageHistory.Values));
systemInfoCollection.Add(TelemetrySystemUsage.GetIsThreadStarving(systemUsageHistory.Values));
sourabh1007 marked this conversation as resolved.
Show resolved Hide resolved
}

/// <summary>
Expand Down
12 changes: 11 additions & 1 deletion Microsoft.Azure.Cosmos/src/Telemetry/ClientTelemetryOptions.cs
Original file line number Diff line number Diff line change
Expand Up @@ -54,7 +54,17 @@ internal static class ClientTelemetryOptions
internal const long AvailableThreadsMin = 1;
internal const int AvailableThreadsPrecision = 2;
internal const String AvailableThreadsName = "SystemPool_AvailableThreads";
internal const String AvailableThreadsUnit = "ThreadCount";
internal const String AvailableThreadsUnit = "ThreadCount";

// Thread wait intervals are recorded in the histogram as ticks: minimum of 1 tick, maximum of 1 second (TimeSpan.TicksPerSecond ticks)
internal const long ThreadWaitIntervalInMsMax = TimeSpan.TicksPerSecond;
internal const long ThreadWaitIntervalInMsMin = 1;
internal const int ThreadWaitIntervalInMsPrecision = 2;
internal const string ThreadWaitIntervalInMsName = "SystemPool_ThreadWaitInterval";
internal const string ThreadWaitIntervalInMsUnit = "MilliSecond";

internal const string IsThreadStarvingName = "SystemPool_IsThreadStarving_True";
internal const string IsThreadStarvingUnit = "Count";

internal const string DefaultVmMetadataUrL = "http://169.254.169.254/metadata/instance?api-version=2020-06-01";
internal const double DefaultTimeStampInSeconds = 600;
Expand Down
10 changes: 5 additions & 5 deletions Microsoft.Azure.Cosmos/src/Telemetry/MetricInfo.cs
Original file line number Diff line number Diff line change
Expand Up @@ -21,11 +21,11 @@ internal MetricInfo(string metricsName, string unitName)

public MetricInfo(string metricsName,
string unitName,
double mean,
long count,
long min,
long max,
IReadOnlyDictionary<double, double> percentiles)
double mean = 0,
long count = 0,
long min = 0,
long max = 0,
IReadOnlyDictionary<double, double> percentiles = null)
: this(metricsName, unitName)
{
this.Mean = mean;
Expand Down
5 changes: 5 additions & 0 deletions Microsoft.Azure.Cosmos/src/Telemetry/SystemInfo.cs
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,11 @@ internal SystemInfo(string metricsName, string unitName)
this.MetricInfo = new MetricInfo(metricsName, unitName);
}

/// <summary>
/// Creates a SystemInfo for a metric that is reported as a plain count.
/// </summary>
/// <param name="metricsName">Name of the metric</param>
/// <param name="unitName">Unit the metric is expressed in</param>
/// <param name="count">Value passed to the MetricInfo constructor as its count</param>
internal SystemInfo(string metricsName, string unitName, int count)
{
    MetricInfo countOnlyMetric = new MetricInfo(metricsName, unitName, count: count);
    this.MetricInfo = countOnlyMetric;
}

public SystemInfo(MetricInfo metricInfo)
{
this.MetricInfo = metricInfo;
Expand Down
60 changes: 57 additions & 3 deletions Microsoft.Azure.Cosmos/src/Telemetry/TelemetrySystemUsage.cs
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@

namespace Microsoft.Azure.Cosmos.Telemetry
{
using System;
using System.Collections.Generic;
using HdrHistogram;
using Microsoft.Azure.Documents.Rntbd;
Expand Down Expand Up @@ -35,7 +36,7 @@ public static SystemInfo GetCpuInfo(IReadOnlyCollection<SystemUsageLoad> systemU
long? infoToRecord = (long?)load.CpuUsage * ClientTelemetryOptions.HistogramPrecisionFactor;
if (infoToRecord.HasValue)
{
histogram.RecordValue((long)infoToRecord);
histogram.RecordValue(infoToRecord.Value);
}
}

Expand Down Expand Up @@ -64,7 +65,7 @@ public static SystemInfo GetMemoryRemainingInfo(IReadOnlyCollection<SystemUsageL
long? infoToRecord = (long?)load.MemoryAvailable;
if (infoToRecord.HasValue)
{
histogram.RecordValue((long)infoToRecord);
histogram.RecordValue(infoToRecord.Value);
}
}

Expand Down Expand Up @@ -93,7 +94,7 @@ public static SystemInfo GetAvailableThreadsInfo(IReadOnlyCollection<SystemUsage
long? infoToRecord = (long?)load.ThreadInfo?.AvailableThreads;
if (infoToRecord.HasValue)
{
histogram.RecordValue((long)infoToRecord);
histogram.RecordValue(infoToRecord.Value);
}
}

Expand All @@ -105,5 +106,58 @@ public static SystemInfo GetAvailableThreadsInfo(IReadOnlyCollection<SystemUsage
return systemInfo;
}

/// <summary>
/// Counts how many of the supplied usage snapshots are flagged as thread starving.
/// </summary>
/// <param name="systemUsageCollection">Usage snapshots to inspect; a snapshot with no thread info, or with no starvation flag, is not counted</param>
/// <returns>SystemInfo carrying the thread starvation metric name, its unit and the number of flagged snapshots</returns>
public static SystemInfo GetIsThreadStarving(IReadOnlyCollection<SystemUsageLoad> systemUsageCollection)
{
    int starvingSnapshots = 0;
    foreach (SystemUsageLoad usage in systemUsageCollection)
    {
        // Lifted comparison: a null ThreadInfo or a null flag never equals true.
        if (usage.ThreadInfo?.IsThreadStarving == true)
        {
            starvingSnapshots++;
        }
    }

    return new SystemInfo(
        metricsName: ClientTelemetryOptions.IsThreadStarvingName,
        unitName: ClientTelemetryOptions.IsThreadStarvingUnit,
        count: starvingSnapshots);
}

/// <summary>
/// Collects the thread wait interval of every usage snapshot and aggregates the samples using a histogram.
/// </summary>
/// <param name="systemUsageCollection">Usage snapshots to aggregate; a snapshot with no thread info, or with no wait interval, is skipped</param>
/// <returns>
/// SystemInfo carrying the thread wait interval metric name and unit. Aggregated values are set
/// only when at least one sample was recorded.
/// </returns>
public static SystemInfo GetThreadWaitIntervalInMs(IReadOnlyCollection<SystemUsageLoad> systemUsageCollection)
{
    LongConcurrentHistogram histogram = new LongConcurrentHistogram(ClientTelemetryOptions.ThreadWaitIntervalInMsMin,
        ClientTelemetryOptions.ThreadWaitIntervalInMsMax,
        ClientTelemetryOptions.ThreadWaitIntervalInMsPrecision);

    SystemInfo systemInfo = new SystemInfo(ClientTelemetryOptions.ThreadWaitIntervalInMsName, ClientTelemetryOptions.ThreadWaitIntervalInMsUnit);
    foreach (SystemUsageLoad load in systemUsageCollection)
    {
        double? infoToRecord = load.ThreadInfo?.ThreadWaitIntervalInMs;
        if (infoToRecord.HasValue)
        {
            // Samples are recorded in ticks and converted back to milliseconds when aggregating.
            long waitIntervalInTicks = TimeSpan.FromMilliseconds(infoToRecord.Value).Ticks;

            // The histogram is configured with ThreadWaitIntervalInMsMax as its highest trackable value,
            // and HdrHistogram's RecordValue is documented to throw for values above that limit.
            // A starved thread pool can easily wait longer than the configured maximum, so saturate the
            // sample at the maximum instead of letting telemetry collection fail.
            histogram.RecordValue(Math.Min(waitIntervalInTicks, ClientTelemetryOptions.ThreadWaitIntervalInMsMax));
        }
    }

    if (histogram.TotalCount > 0)
    {
        systemInfo.SetAggregators(histogram, ClientTelemetryOptions.TicksToMsFactor);
    }

    return systemInfo;
}
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -22,8 +22,6 @@ namespace Microsoft.Azure.Cosmos.SDK.EmulatorTests
using Newtonsoft.Json;
using Documents.Rntbd;
using System.Globalization;
using System.Linq;
using Microsoft.VisualBasic;

[TestClass]
public class ClientTelemetryTests : BaseCosmosClientHelper
Expand Down Expand Up @@ -765,7 +763,9 @@ private static void AssertSystemLevelInformation(List<SystemInfo> actualSystemIn
{
{ ClientTelemetryOptions.CpuName, ClientTelemetryOptions.CpuUnit },
{ ClientTelemetryOptions.MemoryName, ClientTelemetryOptions.MemoryUnit },
{ ClientTelemetryOptions.AvailableThreadsName, ClientTelemetryOptions.AvailableThreadsUnit }
{ ClientTelemetryOptions.AvailableThreadsName, ClientTelemetryOptions.AvailableThreadsUnit },
{ ClientTelemetryOptions.IsThreadStarvingName, ClientTelemetryOptions.IsThreadStarvingUnit },
{ ClientTelemetryOptions.ThreadWaitIntervalInMsName, ClientTelemetryOptions.ThreadWaitIntervalInMsUnit }
};

Dictionary<string, string> actualMetricNameUnitMap = new Dictionary<string, string>();
Expand All @@ -781,16 +781,19 @@ private static void AssertSystemLevelInformation(List<SystemInfo> actualSystemIn
Assert.AreEqual(systemInfo.MetricInfo.UnitName, actualMetricNameUnitMap[systemInfo.MetricInfo.MetricsName]);
}

Assert.IsTrue(systemInfo.MetricInfo.Count > 0, "MetricInfo Count is not greater than 0");
Assert.IsNotNull(systemInfo.MetricInfo.Percentiles, "Percentiles is null");
Assert.IsTrue(systemInfo.MetricInfo.Mean >= 0, "MetricInfo Mean is not greater than or equal to 0");
Assert.IsTrue(systemInfo.MetricInfo.Max >= 0, "MetricInfo Max is not greater than or equal to 0");
Assert.IsTrue(systemInfo.MetricInfo.Min >= 0, "MetricInfo Min is not greater than or equal to 0");
if(!systemInfo.MetricInfo.MetricsName.Equals(ClientTelemetryOptions.IsThreadStarvingName))
{
Assert.IsTrue(systemInfo.MetricInfo.Count > 0, $"MetricInfo ({systemInfo.MetricInfo.MetricsName}) Count is not greater than 0");
Assert.IsNotNull(systemInfo.MetricInfo.Percentiles, $"Percentiles is null for metrics ({systemInfo.MetricInfo.MetricsName})");
}
Assert.IsTrue(systemInfo.MetricInfo.Mean >= 0, $"MetricInfo ({systemInfo.MetricInfo.MetricsName}) Mean is not greater than or equal to 0");
Assert.IsTrue(systemInfo.MetricInfo.Max >= 0, $"MetricInfo ({systemInfo.MetricInfo.MetricsName}) Max is not greater than or equal to 0");
Assert.IsTrue(systemInfo.MetricInfo.Min >= 0, $"MetricInfo ({systemInfo.MetricInfo.MetricsName}) Min is not greater than or equal to 0");
if (systemInfo.MetricInfo.MetricsName.Equals(ClientTelemetryOptions.CpuName))
{
Assert.IsTrue(systemInfo.MetricInfo.Mean <= 100, "MetricInfo Mean is not greater than 100 for CPU Usage");
Assert.IsTrue(systemInfo.MetricInfo.Max <= 100, "MetricInfo Max is not greater than 100 for CPU Usage");
Assert.IsTrue(systemInfo.MetricInfo.Min <= 100, "MetricInfo Min is not greater than 100 for CPU Usage");
Assert.IsTrue(systemInfo.MetricInfo.Mean <= 100, $"MetricInfo ({systemInfo.MetricInfo.MetricsName}) Mean is not greater than 100 for CPU Usage");
Assert.IsTrue(systemInfo.MetricInfo.Max <= 100, $"MetricInfo ({systemInfo.MetricInfo.MetricsName}) Max is not greater than 100 for CPU Usage");
Assert.IsTrue(systemInfo.MetricInfo.Min <= 100, $"MetricInfo ({systemInfo.MetricInfo.MetricsName}) Min is not greater than 100 for CPU Usage");
};
}

Expand Down Expand Up @@ -851,7 +854,7 @@ private static void AssertAccountLevelInformation(List<ClientTelemetryProperties
actualOperationList.AddRange(telemetryInfo.OperationInfo);
actualSystemInformation.AddRange(telemetryInfo.SystemInfo);

Assert.AreEqual(3, telemetryInfo.SystemInfo.Count, $"System Information Count doesn't Match; {JsonConvert.SerializeObject(telemetryInfo.SystemInfo)}");
Assert.AreEqual(5, telemetryInfo.SystemInfo.Count, $"System Information Count doesn't Match; {JsonConvert.SerializeObject(telemetryInfo.SystemInfo)}");

Assert.IsNotNull(telemetryInfo.GlobalDatabaseAccountName, "GlobalDatabaseAccountName is null");
Assert.IsNotNull(telemetryInfo.DateTimeUtc, "Timestamp is null");
Expand Down