Skip to content

Commit

Permalink
ClientRetryPolicy: Adds Cross Regional Retry Logic on 429/3092 (#4691)
Browse files Browse the repository at this point in the history
* Initial code changes to throw 503 on 429/3092.

* Updated client retry policy. Added more tests to cover 429/3092.

* Code changes to update direct package version. Updating the tests.

* Code changes to refactor client retry policy.

* Minor code cleanup.

* Reverting the direct version bump up change.

* Code changes to address some of the review comments.

* Code changes to move failover logic in client retry policy.

* Minor code clean up.

* Code changes to clean up some cosmetic items.

* Further clean up.

* Code changes to address review comments.

* Minor refactor to address cosmetic update.

* Code changes to address cosmetic review comment.
  • Loading branch information
kundadebdatta authored and NaluTripician committed Sep 18, 2024
1 parent 0fdb5e4 commit 5e55837
Show file tree
Hide file tree
Showing 5 changed files with 185 additions and 13 deletions.
95 changes: 85 additions & 10 deletions Microsoft.Azure.Cosmos/src/ClientRetryPolicy.cs
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,7 @@ internal sealed class ClientRetryPolicy : IDocumentClientRetryPolicy
private int serviceUnavailableRetryCount;
private bool isReadRequest;
private bool canUseMultipleWriteLocations;
private bool isMultiMasterWriteRequest;
private Uri locationEndpoint;
private RetryContext retryContext;
private DocumentServiceRequest documentServiceRequest;
Expand All @@ -57,6 +58,7 @@ public ClientRetryPolicy(
this.sessionTokenRetryCount = 0;
this.serviceUnavailableRetryCount = 0;
this.canUseMultipleWriteLocations = false;
this.isMultiMasterWriteRequest = false;
this.isPertitionLevelFailoverEnabled = isPertitionLevelFailoverEnabled;
}

Expand Down Expand Up @@ -97,6 +99,23 @@ public async Task<ShouldRetryResult> ShouldRetryAsync(

if (exception is DocumentClientException clientException)
{
// Today, the only scenario where we would treat a throttling (429) exception as service unavailable is when we
// get 429 (TooManyRequests) with sub status code 3092 (System Resource Not Available). Note that this is applicable
// for write requests targeted to a multiple master account. In such case, the 429/3092 will be treated as 503. The
// reason to keep the code out of the throttling retry policy is that in the near future, the 3092 sub status code
// might not be a throttling scenario at all and the status code in that case would be different than 429.
if (this.ShouldMarkEndpointUnavailableOnSystemResourceUnavailableForWrite(
clientException.StatusCode,
clientException.GetSubStatus()))
{
DefaultTrace.TraceError(
"Operation will NOT be retried on local region. Treating SystemResourceUnavailable (429/3092) as ServiceUnavailable (503). Status code: {0}, sub status code: {1}.",
StatusCodes.TooManyRequests, SubStatusCodes.SystemResourceUnavailable);

return this.TryMarkEndpointUnavailableForPkRangeAndRetryOnServiceUnavailable(
shouldMarkEndpointUnavailableForPkRange: true);
}

ShouldRetryResult shouldRetryResult = await this.ShouldRetryInternalAsync(
clientException?.StatusCode,
clientException?.GetSubStatus());
Expand Down Expand Up @@ -143,6 +162,23 @@ public async Task<ShouldRetryResult> ShouldRetryAsync(
return shouldRetryResult;
}

// Today, the only scenario where we would treat a throttling (429) exception as service unavailable is when we
// get 429 (TooManyRequests) with sub status code 3092 (System Resource Not Available). Note that this is applicable
// for write requests targeted to a multiple master account. In such case, the 429/3092 will be treated as 503. The
// reason to keep the code out of the throttling retry policy is that in the near future, the 3092 sub status code
// might not be a throttling scenario at all and the status code in that case would be different than 429.
if (this.ShouldMarkEndpointUnavailableOnSystemResourceUnavailableForWrite(
cosmosResponseMessage.StatusCode,
cosmosResponseMessage?.Headers.SubStatusCode))
{
DefaultTrace.TraceError(
"Operation will NOT be retried on local region. Treating SystemResourceUnavailable (429/3092) as ServiceUnavailable (503). Status code: {0}, sub status code: {1}.",
StatusCodes.TooManyRequests, SubStatusCodes.SystemResourceUnavailable);

return this.TryMarkEndpointUnavailableForPkRangeAndRetryOnServiceUnavailable(
shouldMarkEndpointUnavailableForPkRange: true);
}

return await this.throttlingRetry.ShouldRetryAsync(cosmosResponseMessage, cancellationToken);
}

Expand All @@ -156,6 +192,8 @@ public void OnBeforeSendRequest(DocumentServiceRequest request)
this.isReadRequest = request.IsReadOnlyRequest;
this.canUseMultipleWriteLocations = this.globalEndpointManager.CanUseMultipleWriteLocations(request);
this.documentServiceRequest = request;
this.isMultiMasterWriteRequest = !this.isReadRequest
&& (this.globalEndpointManager?.CanSupportMultipleWriteLocations(request) ?? false);

// clear previous location-based routing directive
request.RequestContext.ClearRouteToLocation();
Expand Down Expand Up @@ -274,16 +312,8 @@ private async Task<ShouldRetryResult> ShouldRetryInternalAsync(
// Received 503 due to client connect timeout or Gateway
if (statusCode == HttpStatusCode.ServiceUnavailable)
{
DefaultTrace.TraceWarning("ClientRetryPolicy: ServiceUnavailable. Refresh cache and retry. Failed Location: {0}; ResourceAddress: {1}",
this.documentServiceRequest?.RequestContext?.LocationEndpointToRoute?.ToString() ?? string.Empty,
this.documentServiceRequest?.ResourceAddress ?? string.Empty);

// Mark the partition as unavailable.
// Let the ClientRetry logic decide if the request should be retried
this.partitionKeyRangeLocationCache.TryMarkEndpointUnavailableForPartitionKeyRange(
this.documentServiceRequest);

return this.ShouldRetryOnServiceUnavailable();
return this.TryMarkEndpointUnavailableForPkRangeAndRetryOnServiceUnavailable(
shouldMarkEndpointUnavailableForPkRange: true);
}

return null;
Expand Down Expand Up @@ -406,6 +436,33 @@ private ShouldRetryResult ShouldRetryOnSessionNotAvailable(DocumentServiceReques
}
}

/// <summary>
/// Attempts to mark the endpoint associated with the current partition key range as unavailable and determines if
/// a retry should be performed due to a ServiceUnavailable (503) response. This method is invoked when a 503
/// Service Unavailable response is received, indicating that the service might be temporarily unavailable.
/// It optionally marks the partition key range as unavailable, which will influence future routing decisions.
/// </summary>
/// <param name="shouldMarkEndpointUnavailableForPkRange">A boolean flag indicating whether the endpoint for the
/// current partition key range should be marked as unavailable.</param>
/// <returns>An instance of <see cref="ShouldRetryResult"/> indicating whether the operation should be retried.</returns>
private ShouldRetryResult TryMarkEndpointUnavailableForPkRangeAndRetryOnServiceUnavailable(
bool shouldMarkEndpointUnavailableForPkRange)
{
DefaultTrace.TraceWarning("ClientRetryPolicy: ServiceUnavailable. Refresh cache and retry. Failed Location: {0}; ResourceAddress: {1}",
this.documentServiceRequest?.RequestContext?.LocationEndpointToRoute?.ToString() ?? string.Empty,
this.documentServiceRequest?.ResourceAddress ?? string.Empty);

if (shouldMarkEndpointUnavailableForPkRange)
{
// Mark the partition as unavailable.
// Let the ClientRetry logic decide if the request should be retried
this.partitionKeyRangeLocationCache.TryMarkEndpointUnavailableForPartitionKeyRange(
this.documentServiceRequest);
}

return this.ShouldRetryOnServiceUnavailable();
}

/// <summary>
/// For a ServiceUnavailable (503.0) we could be having a timeout from Direct/TCP locally or a request to Gateway request with a similar response due to an endpoint not yet available.
/// We try and retry the request only if there are other regions available. The retry logic is applicable for single master write accounts as well.
Expand Down Expand Up @@ -449,6 +506,24 @@ private ShouldRetryResult ShouldRetryOnServiceUnavailable()
return ShouldRetryResult.RetryAfter(TimeSpan.Zero);
}

/// <summary>
/// Returns a boolean flag indicating if the endpoint should be marked as unavailable
/// due to a 429 response with a sub status code of 3092 (system resource unavailable).
/// This is applicable for write requests targeted for multi master accounts.
/// </summary>
/// <param name="statusCode">An instance of <see cref="HttpStatusCode"/> containing the status code.</param>
/// <param name="subStatusCode">An instance of <see cref="SubStatusCodes"/> containing the sub status code.</param>
/// <returns>A boolean flag indicating is the endpoint should be marked as unavailable.</returns>
private bool ShouldMarkEndpointUnavailableOnSystemResourceUnavailableForWrite(
HttpStatusCode? statusCode,
SubStatusCodes? subStatusCode)
{
return this.isMultiMasterWriteRequest
&& statusCode.HasValue
&& (int)statusCode.Value == (int)StatusCodes.TooManyRequests
&& subStatusCode == SubStatusCodes.SystemResourceUnavailable;
}

private sealed class RetryContext
{
public int RetryLocationIndex { get; set; }
Expand Down
14 changes: 14 additions & 0 deletions Microsoft.Azure.Cosmos/src/Routing/GlobalEndpointManager.cs
Original file line number Diff line number Diff line change
Expand Up @@ -534,6 +534,20 @@ public virtual async Task RefreshLocationAsync(bool forceRefresh = false)

await this.RefreshDatabaseAccountInternalAsync(forceRefresh: forceRefresh);
}

/// <summary>
/// Determines whether the current configuration and state of the service allow for supporting multiple write locations.
/// This method returns True is the AvailableWriteLocations in LocationCache is more than 1. Otherwise, it returns False.
/// </summary>
/// <param name="request">The document service request for which the write location support is being evaluated.</param>
/// <returns>A boolean flag indicating if the available write locations are more than one.</returns>
public bool CanSupportMultipleWriteLocations(DocumentServiceRequest request)
{
return this.locationCache.CanUseMultipleWriteLocations()
&& this.locationCache.GetAvailableWriteLocations()?.Count > 1
&& (request.ResourceType == ResourceType.Document ||
(request.ResourceType == ResourceType.StoredProcedure && request.OperationType == OperationType.Execute));
}

#pragma warning disable VSTHRD100 // Avoid async void methods
private async void StartLocationBackgroundRefreshLoop()
Expand Down
2 changes: 2 additions & 0 deletions Microsoft.Azure.Cosmos/src/Routing/IGlobalEndpointManager.cs
Original file line number Diff line number Diff line change
Expand Up @@ -36,5 +36,7 @@ internal interface IGlobalEndpointManager : IDisposable
ReadOnlyDictionary<string, Uri> GetAvailableWriteEndpointsByLocation();

ReadOnlyDictionary<string, Uri> GetAvailableReadEndpointsByLocation();

bool CanSupportMultipleWriteLocations(DocumentServiceRequest request);
}
}
11 changes: 8 additions & 3 deletions Microsoft.Azure.Cosmos/src/Routing/LocationCache.cs
Original file line number Diff line number Diff line change
Expand Up @@ -263,6 +263,11 @@ public ReadOnlyCollection<string> GetAvailableReadLocations()
{
return this.locationInfo.AvailableReadLocations;
}

public ReadOnlyCollection<string> GetAvailableWriteLocations()
{
return this.locationInfo.AvailableWriteLocations;
}

/// <summary>
/// Resolves request to service endpoint.
Expand Down Expand Up @@ -532,8 +537,8 @@ public bool CanUseMultipleWriteLocations(DocumentServiceRequest request)
return this.CanUseMultipleWriteLocations() &&
(request.ResourceType == ResourceType.Document ||
(request.ResourceType == ResourceType.StoredProcedure && request.OperationType == Documents.OperationType.ExecuteJavaScript));
}

}

private void ClearStaleEndpointUnavailabilityInfo()
{
if (this.locationUnavailablityInfoByEndpoint.Any())
Expand Down Expand Up @@ -768,7 +773,7 @@ private ReadOnlyDictionary<string, Uri> GetEndpointByLocation(IEnumerable<Accoun
return new ReadOnlyDictionary<string, Uri>(endpointsByLocation);
}

private bool CanUseMultipleWriteLocations()
internal bool CanUseMultipleWriteLocations()
{
return this.useMultipleWriteLocations && this.enableMultipleWriteLocations;
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -86,6 +86,82 @@ public void MultimasterMetadataWriteRetryTest()
retryPolicy.OnBeforeSendRequest(request);
Assert.AreEqual(request.RequestContext.LocationEndpointToRoute, ClientRetryPolicyTests.Location1Endpoint);
}

/// <summary>
/// Test to validate that when 429.3092 is thrown from the service, write requests on
/// a multi master account should be converted to 503 and retried to the next region.
/// </summary>
[TestMethod]
[DataRow(true, DisplayName = "Validate retry policy with multi master write account.")]
[DataRow(false, DisplayName = "Validate retry policy with single master write account.")]
public async Task ShouldRetryAsync_WhenRequestThrottledWithResourceNotAvailable_ShouldThrow503OnMultiMasterWriteAndRetryOnNextRegion(
bool isMultiMasterAccount)
{
// Arrange.
const bool enableEndpointDiscovery = true;
using GlobalEndpointManager endpointManager = this.Initialize(
useMultipleWriteLocations: isMultiMasterAccount,
enableEndpointDiscovery: enableEndpointDiscovery,
isPreferredLocationsListEmpty: false,
multimasterMetadataWriteRetryTest: true);

await endpointManager.RefreshLocationAsync();

ClientRetryPolicy retryPolicy = new (
endpointManager,
this.partitionKeyRangeLocationCache,
new RetryOptions(),
enableEndpointDiscovery,
false);

// Creates a sample write request.
DocumentServiceRequest request = this.CreateRequest(
isReadRequest: false,
isMasterResourceType: false);

// On first attempt should get (default/non hub) location.
retryPolicy.OnBeforeSendRequest(request);
Assert.AreEqual(request.RequestContext.LocationEndpointToRoute, ClientRetryPolicyTests.Location1Endpoint);

// Creation of 429.3092 Error.
HttpStatusCode throttleException = HttpStatusCode.TooManyRequests;
SubStatusCodes resourceNotAvailable = SubStatusCodes.SystemResourceUnavailable;

Exception innerException = new ();
Mock<INameValueCollection> nameValueCollection = new ();
DocumentClientException documentClientException = new (
message: "SystemResourceUnavailable: 429 with 3092 occurred.",
innerException: innerException,
statusCode: throttleException,
substatusCode: resourceNotAvailable,
requestUri: request.RequestContext.LocationEndpointToRoute,
responseHeaders: nameValueCollection.Object);

// Act.
Task<ShouldRetryResult> shouldRetry = retryPolicy.ShouldRetryAsync(
documentClientException,
new CancellationToken());

// Assert.
Assert.IsTrue(shouldRetry.Result.ShouldRetry);
retryPolicy.OnBeforeSendRequest(request);

if (isMultiMasterAccount)
{
Assert.AreEqual(
expected: ClientRetryPolicyTests.Location2Endpoint,
actual: request.RequestContext.LocationEndpointToRoute,
message: "The request should be routed to the next region, since the accound is a multi master write account and the request" +
"failed with 429.309 which got converted into 503 internally. This should trigger another retry attempt to the next region.");
}
else
{
Assert.AreEqual(
expected: ClientRetryPolicyTests.Location1Endpoint,
actual: request.RequestContext.LocationEndpointToRoute,
message: "Since this is asingle master account, the write request should not be retried on the next region.");
}
}

/// <summary>
/// Tests to see if different 503 substatus codes are handeled correctly
Expand Down

0 comments on commit 5e55837

Please sign in to comment.