Skip to content

Commit

Permalink
Client Telemetry: Adds a limit on retry count of Job Failure (#3161)
Browse files Browse the repository at this point in the history
Right now, client telemetry get enabled at VM level, so if there are calls to different accounts (which include non-allowed accounts). With current design, Telemetry job will keep running in background even it is failing continuously (for non-allowed accounts).
So, This PR will stop the client telemetry job after 3 continuous failures.
  • Loading branch information
sourabh1007 authored Apr 28, 2022
1 parent 636703b commit eec5de5
Show file tree
Hide file tree
Showing 2 changed files with 63 additions and 7 deletions.
23 changes: 20 additions & 3 deletions Microsoft.Azure.Cosmos/src/Telemetry/ClientTelemetry.cs
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,8 @@ namespace Microsoft.Azure.Cosmos.Telemetry
/// </summary>
internal class ClientTelemetry : IDisposable
{
private const int allowedNumberOfFailures = 3;

private static readonly Uri endpointUrl = ClientTelemetryOptions.GetClientTelemetryEndpoint();
private static readonly TimeSpan observingWindow = ClientTelemetryOptions.GetScheduledTimeSpan();

Expand All @@ -44,6 +46,8 @@ internal class ClientTelemetry : IDisposable
private ConcurrentDictionary<OperationInfo, (LongConcurrentHistogram latency, LongConcurrentHistogram requestcharge)> operationInfoMap
= new ConcurrentDictionary<OperationInfo, (LongConcurrentHistogram latency, LongConcurrentHistogram requestcharge)>();

private int numberOfFailures = 0;

/// <summary>
/// Factory method to intiakize telemetry object and start observer task
/// </summary>
Expand Down Expand Up @@ -127,6 +131,12 @@ private async Task EnrichAndSendAsync()
{
while (!this.cancellationTokenSource.IsCancellationRequested)
{
if (this.numberOfFailures == allowedNumberOfFailures)
{
this.Dispose();
break;
}

// Load account information if not available, cache is already implemented
if (String.IsNullOrEmpty(this.clientTelemetryInfo.GlobalDatabaseAccountName))
{
Expand Down Expand Up @@ -335,16 +345,21 @@ await this.tokenProvider.AddAuthorizationHeaderAsync(

if (!response.IsSuccessStatusCode)
{
this.numberOfFailures++;

DefaultTrace.TraceError("Juno API response not successful. Status Code : {0}, Message : {1}", response.StatusCode, response.ReasonPhrase);
}
else
{
this.numberOfFailures = 0; // Ressetting failure counts on success call.
DefaultTrace.TraceInformation("Telemetry data sent successfully.");
}

}
catch (Exception ex)
{
this.numberOfFailures++;

DefaultTrace.TraceError("Exception while sending telemetry data : {0}", ex.Message);
}
finally
Expand All @@ -367,9 +382,11 @@ private void Reset()
/// </summary>
public void Dispose()
{
this.cancellationTokenSource.Cancel();
this.cancellationTokenSource.Dispose();

if (!this.cancellationTokenSource.IsCancellationRequested)
{
this.cancellationTokenSource.Cancel();
this.cancellationTokenSource.Dispose();
}
this.telemetryTask = null;
}
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -730,7 +730,7 @@ public async Task CreateItemWithSubStatusCodeTest(ConnectionMode mode)

Container container = await this.CreateClientAndContainer(
mode: mode,
customHandler: httpHandler);
customHttpHandler: httpHandler);
try
{
ToDoActivity testItem = ToDoActivity.CreateRandomToDoActivity("MyTestPkValue");
Expand Down Expand Up @@ -960,6 +960,45 @@ private static void AssertAccountLevelInformation(

}

[TestMethod]
public async Task CheckMisconfiguredTelemetryEndpoint_should_stop_the_job()
{
int retryCounter = 0;
HttpClientHandlerHelper customHttpHandler = new HttpClientHandlerHelper
{
RequestCallBack = (request, cancellation) =>
{
if (request.RequestUri.AbsoluteUri.Equals(ClientTelemetryOptions.GetClientTelemetryEndpoint().AbsoluteUri))
{
retryCounter++;
throw new Exception("Exception while sending telemetry");
}

return null;
}
};

Container container = await this.CreateClientAndContainer(
mode: ConnectionMode.Direct,
customHttpHandler: customHttpHandler);

// Create an item
ToDoActivity testItem = ToDoActivity.CreateRandomToDoActivity("MyTestPkValue");
ItemResponse<ToDoActivity> createResponse = await container.CreateItemAsync<ToDoActivity>(testItem);
ToDoActivity testItemCreated = createResponse.Resource;

// Read an Item
ItemResponse<ToDoActivity> response = await container.ReadItemAsync<ToDoActivity>(testItem.id, new Cosmos.PartitionKey(testItem.id));

await Task.Delay(1500);

response = await container.ReadItemAsync<ToDoActivity>(testItem.id, new Cosmos.PartitionKey(testItem.id));

await Task.Delay(3500);

Assert.AreEqual(3, retryCounter);
}

private static ItemBatchOperation CreateItem(string itemId)
{
var testItem = new { id = itemId, Status = itemId };
Expand All @@ -970,7 +1009,7 @@ private async Task<Container> CreateClientAndContainer(ConnectionMode mode,
Microsoft.Azure.Cosmos.ConsistencyLevel? consistency = null,
bool isLargeContainer = false,
bool isAzureInstance = false,
HttpClientHandlerHelper customHandler = null)
HttpClientHandlerHelper customHttpHandler = null)
{
if (consistency.HasValue)
{
Expand All @@ -979,13 +1018,13 @@ private async Task<Container> CreateClientAndContainer(ConnectionMode mode,
}

HttpClientHandlerHelper handlerHelper;
if (customHandler == null)
if (customHttpHandler == null)
{
handlerHelper = isAzureInstance ? this.httpHandler : this.httpHandlerForNonAzureInstance;
}
else
{
handlerHelper = customHandler;
handlerHelper = customHttpHandler;
}

this.cosmosClientBuilder = this.cosmosClientBuilder
Expand Down

0 comments on commit eec5de5

Please sign in to comment.