Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

ClientRetryPolicy: Adds Cross Regional Retry Logic on 429/3092 #4691

Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
18 commits
Select commit Hold shift + click to select a range
e18aa95
Initial code changes to throw 503 on 429/3092.
kundadebdatta Sep 11, 2024
e44f080
Updated client retry policy. Added more tests to cover 429/3092.
kundadebdatta Sep 11, 2024
9006b5f
Code changes to update direct package version. Updating the tests.
kundadebdatta Sep 12, 2024
e7710c1
Code changes to refactor client retry policy.
kundadebdatta Sep 12, 2024
5a1b505
Minor code cleanup.
kundadebdatta Sep 12, 2024
ebf1a11
Merge branch 'master' into users/kundadebdatta/4390_cross_regional_re…
kundadebdatta Sep 12, 2024
91cc3ea
Reverting the direct version bump up change.
kundadebdatta Sep 13, 2024
b479714
Merge branch 'master' into users/kundadebdatta/4656_cross_regional_re…
kundadebdatta Sep 13, 2024
6ebb9e6
Code changes to address some of the review comments.
kundadebdatta Sep 13, 2024
47cacdf
Merge branch 'master' into users/kundadebdatta/4656_cross_regional_re…
kundadebdatta Sep 13, 2024
e6e2766
Code changes to move failover logic in client retry policy.
kundadebdatta Sep 14, 2024
9575986
Minor code clean up.
kundadebdatta Sep 14, 2024
0c5304d
Code changes to clean up some cosmetic items.
kundadebdatta Sep 15, 2024
ce3c62f
Further clean up.
kundadebdatta Sep 15, 2024
bf5e649
Merge branch 'master' into users/kundadebdatta/4656_cross_regional_re…
kundadebdatta Sep 16, 2024
973f1f1
Code changes to address review comments.
kundadebdatta Sep 17, 2024
40560cc
Minor refactor to address cosmetic update.
kundadebdatta Sep 17, 2024
a198a04
Code changes to address cosmetic review comment.
kundadebdatta Sep 17, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
95 changes: 85 additions & 10 deletions Microsoft.Azure.Cosmos/src/ClientRetryPolicy.cs
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,7 @@ internal sealed class ClientRetryPolicy : IDocumentClientRetryPolicy
private int serviceUnavailableRetryCount;
private bool isReadRequest;
private bool canUseMultipleWriteLocations;
private bool isMultiMasterWriteRequest;
private Uri locationEndpoint;
private RetryContext retryContext;
private DocumentServiceRequest documentServiceRequest;
Expand All @@ -57,6 +58,7 @@ public ClientRetryPolicy(
this.sessionTokenRetryCount = 0;
this.serviceUnavailableRetryCount = 0;
this.canUseMultipleWriteLocations = false;
this.isMultiMasterWriteRequest = false;
this.isPertitionLevelFailoverEnabled = isPertitionLevelFailoverEnabled;
}

Expand Down Expand Up @@ -97,6 +99,23 @@ public async Task<ShouldRetryResult> ShouldRetryAsync(

if (exception is DocumentClientException clientException)
{
// Today, the only scenario where we would treat a throttling (429) exception as service unavailable is when we
// get 429 (TooManyRequests) with sub status code 3092 (System Resource Not Available). Note that this is applicable
// for write requests targeted to a multiple master account. In such case, the 429/3092 will be treated as 503. The
// reason to keep the code out of the throttling retry policy is that in the near future, the 3092 sub status code
// might not be a throttling scenario at all and the status code in that case would be different than 429.
if (this.ShouldMarkEndpointUnavailableOnSystemResourceUnavailableForWrite(
clientException.StatusCode,
clientException.GetSubStatus()))
{
DefaultTrace.TraceError(
"Operation will NOT be retried on local region. Treating SystemResourceUnavailable (429/3092) as ServiceUnavailable (503). Status code: {0}, sub status code: {1}.",
StatusCodes.TooManyRequests, SubStatusCodes.SystemResourceUnavailable);

return this.TryMarkEndpointUnavailableForPkRangeAndRetryOnServiceUnavailable(
shouldMarkEndpointUnavailableForPkRange: true);
}

ShouldRetryResult shouldRetryResult = await this.ShouldRetryInternalAsync(
clientException?.StatusCode,
clientException?.GetSubStatus());
Expand Down Expand Up @@ -143,6 +162,23 @@ public async Task<ShouldRetryResult> ShouldRetryAsync(
return shouldRetryResult;
}

// Today, the only scenario where we would treat a throttling (429) exception as service unavailable is when we
// get 429 (TooManyRequests) with sub status code 3092 (System Resource Not Available). Note that this is applicable
// for write requests targeted to a multiple master account. In such case, the 429/3092 will be treated as 503. The
// reason to keep the code out of the throttling retry policy is that in the near future, the 3092 sub status code
// might not be a throttling scenario at all and the status code in that case would be different than 429.
if (this.ShouldMarkEndpointUnavailableOnSystemResourceUnavailableForWrite(
cosmosResponseMessage.StatusCode,
cosmosResponseMessage?.Headers.SubStatusCode))
{
DefaultTrace.TraceError(
"Operation will NOT be retried on local region. Treating SystemResourceUnavailable (429/3092) as ServiceUnavailable (503). Status code: {0}, sub status code: {1}.",
StatusCodes.TooManyRequests, SubStatusCodes.SystemResourceUnavailable);

return this.TryMarkEndpointUnavailableForPkRangeAndRetryOnServiceUnavailable(
shouldMarkEndpointUnavailableForPkRange: true);
}

return await this.throttlingRetry.ShouldRetryAsync(cosmosResponseMessage, cancellationToken);
}

Expand All @@ -156,6 +192,8 @@ public void OnBeforeSendRequest(DocumentServiceRequest request)
this.isReadRequest = request.IsReadOnlyRequest;
this.canUseMultipleWriteLocations = this.globalEndpointManager.CanUseMultipleWriteLocations(request);
this.documentServiceRequest = request;
this.isMultiMasterWriteRequest = !this.isReadRequest
&& (this.globalEndpointManager?.CanSupportMultipleWriteLocations(request) ?? false);

// clear previous location-based routing directive
request.RequestContext.ClearRouteToLocation();
Expand Down Expand Up @@ -274,16 +312,8 @@ private async Task<ShouldRetryResult> ShouldRetryInternalAsync(
// Received 503 due to client connect timeout or Gateway
if (statusCode == HttpStatusCode.ServiceUnavailable)
{
DefaultTrace.TraceWarning("ClientRetryPolicy: ServiceUnavailable. Refresh cache and retry. Failed Location: {0}; ResourceAddress: {1}",
this.documentServiceRequest?.RequestContext?.LocationEndpointToRoute?.ToString() ?? string.Empty,
this.documentServiceRequest?.ResourceAddress ?? string.Empty);

// Mark the partition as unavailable.
// Let the ClientRetry logic decide if the request should be retried
this.partitionKeyRangeLocationCache.TryMarkEndpointUnavailableForPartitionKeyRange(
this.documentServiceRequest);

return this.ShouldRetryOnServiceUnavailable();
return this.TryMarkEndpointUnavailableForPkRangeAndRetryOnServiceUnavailable(
shouldMarkEndpointUnavailableForPkRange: true);
}

return null;
Expand Down Expand Up @@ -406,6 +436,33 @@ private ShouldRetryResult ShouldRetryOnSessionNotAvailable(DocumentServiceReques
}
}

/// <summary>
/// Attempts to mark the endpoint associated with the current partition key range as unavailable and determines if
/// a retry should be performed due to a ServiceUnavailable (503) response. This method is invoked when a 503
/// Service Unavailable response is received, indicating that the service might be temporarily unavailable.
/// It optionally marks the partition key range as unavailable, which will influence future routing decisions.
/// </summary>
/// <param name="shouldMarkEndpointUnavailableForPkRange">A boolean flag indicating whether the endpoint for the
/// current partition key range should be marked as unavailable.</param>
/// <returns>An instance of <see cref="ShouldRetryResult"/> indicating whether the operation should be retried.</returns>
private ShouldRetryResult TryMarkEndpointUnavailableForPkRangeAndRetryOnServiceUnavailable(
bool shouldMarkEndpointUnavailableForPkRange)
{
DefaultTrace.TraceWarning("ClientRetryPolicy: ServiceUnavailable. Refresh cache and retry. Failed Location: {0}; ResourceAddress: {1}",
this.documentServiceRequest?.RequestContext?.LocationEndpointToRoute?.ToString() ?? string.Empty,
this.documentServiceRequest?.ResourceAddress ?? string.Empty);

if (shouldMarkEndpointUnavailableForPkRange)
{
// Mark the partition as unavailable.
// Let the ClientRetry logic decide if the request should be retried
this.partitionKeyRangeLocationCache.TryMarkEndpointUnavailableForPartitionKeyRange(
this.documentServiceRequest);
}

return this.ShouldRetryOnServiceUnavailable();
}

/// <summary>
/// For a ServiceUnavailable (503.0) we could be having a timeout from Direct/TCP locally or a request to Gateway request with a similar response due to an endpoint not yet available.
/// We try and retry the request only if there are other regions available. The retry logic is applicable for single master write accounts as well.
Expand Down Expand Up @@ -449,6 +506,24 @@ private ShouldRetryResult ShouldRetryOnServiceUnavailable()
return ShouldRetryResult.RetryAfter(TimeSpan.Zero);
}

/// <summary>
/// Returns a boolean flag indicating if the endpoint should be marked as unavailable
/// due to a 429 response with a sub status code of 3092 (system resource unavailable).
/// This is applicable for write requests targeted for multi master accounts.
/// </summary>
/// <param name="statusCode">An instance of <see cref="HttpStatusCode"/> containing the status code.</param>
/// <param name="subStatusCode">An instance of <see cref="SubStatusCodes"/> containing the sub status code.</param>
/// <returns>A boolean flag indicating is the endpoint should be marked as unavailable.</returns>
private bool ShouldMarkEndpointUnavailableOnSystemResourceUnavailableForWrite(
HttpStatusCode? statusCode,
SubStatusCodes? subStatusCode)
{
return this.isMultiMasterWriteRequest
&& statusCode.HasValue
kirankumarkolli marked this conversation as resolved.
Show resolved Hide resolved
&& (int)statusCode.Value == (int)StatusCodes.TooManyRequests
&& subStatusCode == SubStatusCodes.SystemResourceUnavailable;
}

private sealed class RetryContext
{
public int RetryLocationIndex { get; set; }
Expand Down
14 changes: 14 additions & 0 deletions Microsoft.Azure.Cosmos/src/Routing/GlobalEndpointManager.cs
Original file line number Diff line number Diff line change
Expand Up @@ -534,6 +534,20 @@ public virtual async Task RefreshLocationAsync(bool forceRefresh = false)

await this.RefreshDatabaseAccountInternalAsync(forceRefresh: forceRefresh);
}

/// <summary>
/// Determines whether the current configuration and state of the service allow for supporting multiple write locations.
/// This method returns True is the AvailableWriteLocations in LocationCache is more than 1. Otherwise, it returns False.
/// </summary>
/// <param name="request">The document service request for which the write location support is being evaluated.</param>
/// <returns>A boolean flag indicating if the available write locations are more than one.</returns>
public bool CanSupportMultipleWriteLocations(DocumentServiceRequest request)
kundadebdatta marked this conversation as resolved.
Show resolved Hide resolved
{
return this.locationCache.CanUseMultipleWriteLocations()
&& this.locationCache.GetAvailableWriteLocations()?.Count > 1
&& (request.ResourceType == ResourceType.Document ||
(request.ResourceType == ResourceType.StoredProcedure && request.OperationType == OperationType.Execute));
}

#pragma warning disable VSTHRD100 // Avoid async void methods
private async void StartLocationBackgroundRefreshLoop()
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -36,5 +36,7 @@ internal interface IGlobalEndpointManager : IDisposable
ReadOnlyDictionary<string, Uri> GetAvailableWriteEndpointsByLocation();

ReadOnlyDictionary<string, Uri> GetAvailableReadEndpointsByLocation();

bool CanSupportMultipleWriteLocations(DocumentServiceRequest request);
}
}
11 changes: 8 additions & 3 deletions Microsoft.Azure.Cosmos/src/Routing/LocationCache.cs
Original file line number Diff line number Diff line change
Expand Up @@ -263,6 +263,11 @@ public ReadOnlyCollection<string> GetAvailableReadLocations()
{
return this.locationInfo.AvailableReadLocations;
}

public ReadOnlyCollection<string> GetAvailableWriteLocations()
{
return this.locationInfo.AvailableWriteLocations;
}

/// <summary>
/// Resolves request to service endpoint.
Expand Down Expand Up @@ -532,8 +537,8 @@ public bool CanUseMultipleWriteLocations(DocumentServiceRequest request)
return this.CanUseMultipleWriteLocations() &&
(request.ResourceType == ResourceType.Document ||
(request.ResourceType == ResourceType.StoredProcedure && request.OperationType == Documents.OperationType.ExecuteJavaScript));
}

}
private void ClearStaleEndpointUnavailabilityInfo()
{
if (this.locationUnavailablityInfoByEndpoint.Any())
Expand Down Expand Up @@ -768,7 +773,7 @@ private ReadOnlyDictionary<string, Uri> GetEndpointByLocation(IEnumerable<Accoun
return new ReadOnlyDictionary<string, Uri>(endpointsByLocation);
}

private bool CanUseMultipleWriteLocations()
internal bool CanUseMultipleWriteLocations()
{
return this.useMultipleWriteLocations && this.enableMultipleWriteLocations;
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -86,6 +86,82 @@ public void MultimasterMetadataWriteRetryTest()
retryPolicy.OnBeforeSendRequest(request);
Assert.AreEqual(request.RequestContext.LocationEndpointToRoute, ClientRetryPolicyTests.Location1Endpoint);
}

/// <summary>
/// Test to validate that when 429.3092 is thrown from the service, write requests on
/// a multi master account should be converted to 503 and retried to the next region.
/// </summary>
[TestMethod]
[DataRow(true, DisplayName = "Validate retry policy with multi master write account.")]
[DataRow(false, DisplayName = "Validate retry policy with single master write account.")]
public async Task ShouldRetryAsync_WhenRequestThrottledWithResourceNotAvailable_ShouldThrow503OnMultiMasterWriteAndRetryOnNextRegion(
bool isMultiMasterAccount)
{
// Arrange.
const bool enableEndpointDiscovery = true;
using GlobalEndpointManager endpointManager = this.Initialize(
useMultipleWriteLocations: isMultiMasterAccount,
enableEndpointDiscovery: enableEndpointDiscovery,
isPreferredLocationsListEmpty: false,
multimasterMetadataWriteRetryTest: true);

await endpointManager.RefreshLocationAsync();

ClientRetryPolicy retryPolicy = new (
endpointManager,
this.partitionKeyRangeLocationCache,
new RetryOptions(),
enableEndpointDiscovery,
false);

// Creates a sample write request.
DocumentServiceRequest request = this.CreateRequest(
isReadRequest: false,
isMasterResourceType: false);

// On first attempt should get (default/non hub) location.
retryPolicy.OnBeforeSendRequest(request);
Assert.AreEqual(request.RequestContext.LocationEndpointToRoute, ClientRetryPolicyTests.Location1Endpoint);

// Creation of 429.3092 Error.
HttpStatusCode throttleException = HttpStatusCode.TooManyRequests;
SubStatusCodes resourceNotAvailable = SubStatusCodes.SystemResourceUnavailable;

Exception innerException = new ();
Mock<INameValueCollection> nameValueCollection = new ();
DocumentClientException documentClientException = new (
message: "SystemResourceUnavailable: 429 with 3092 occurred.",
innerException: innerException,
statusCode: throttleException,
substatusCode: resourceNotAvailable,
requestUri: request.RequestContext.LocationEndpointToRoute,
responseHeaders: nameValueCollection.Object);

// Act.
Task<ShouldRetryResult> shouldRetry = retryPolicy.ShouldRetryAsync(
documentClientException,
new CancellationToken());

// Assert.
Assert.IsTrue(shouldRetry.Result.ShouldRetry);
retryPolicy.OnBeforeSendRequest(request);

if (isMultiMasterAccount)
{
Assert.AreEqual(
expected: ClientRetryPolicyTests.Location2Endpoint,
actual: request.RequestContext.LocationEndpointToRoute,
message: "The request should be routed to the next region, since the accound is a multi master write account and the request" +
"failed with 429.309 which got converted into 503 internally. This should trigger another retry attempt to the next region.");
}
else
{
Assert.AreEqual(
expected: ClientRetryPolicyTests.Location1Endpoint,
actual: request.RequestContext.LocationEndpointToRoute,
message: "Since this is asingle master account, the write request should not be retried on the next region.");
}
}

/// <summary>
/// Tests to see if different 503 substatus codes are handeled correctly
Expand Down
Loading