-
Notifications
You must be signed in to change notification settings - Fork 491
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
ClientRetryPolicy
: Adds Cross Regional Retry Logic on 429/3092
and 410/1022
#4677
Changes from all commits
e18aa95
e44f080
9006b5f
e7710c1
5a1b505
ebf1a11
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -48,6 +48,7 @@ public ClientRetryPolicy( | |
{ | ||
this.throttlingRetry = new ResourceThrottleRetryPolicy( | ||
retryOptions.MaxRetryAttemptsOnThrottledRequests, | ||
endpointManager: globalEndpointManager, | ||
retryOptions.MaxRetryWaitTimeInSeconds); | ||
|
||
this.globalEndpointManager = globalEndpointManager; | ||
|
@@ -120,7 +121,18 @@ public async Task<ShouldRetryResult> ShouldRetryAsync( | |
} | ||
} | ||
|
||
return await this.throttlingRetry.ShouldRetryAsync(exception, cancellationToken); | ||
ShouldRetryResult throttleRetryResult = await this.throttlingRetry.ShouldRetryAsync(exception, cancellationToken); | ||
|
||
// Today, the only scenario where we would receive a ServiceUnavailableException from the Throttling Retry Policy | ||
// is when we get 410 (Gone) with sub status code 3092 (System Resource Not Available). Note that this is applicable | ||
// for write requests targeted to a multiple master account. In such case, the 410/3092 will get converted into 503. | ||
if (throttleRetryResult.ExceptionToThrow is ServiceUnavailableException) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Curious: Alternative is to do 429/3092 check before throttlingPolicy here explicitly, that seems direct and simpler right? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. The error combination we are trying to handle here is On creating and throwing the exception part, I will double check if this can be optimized, but from the scenario handling perspective, I feel @FabianMeiswinkel : Any thoughts on this ? |
||
{ | ||
return this.TryMarkEndpointUnavailableForPkRangeAndRetryOnServiceUnavailable( | ||
shouldMarkEndpointUnavailableForPkRange: true); | ||
} | ||
|
||
return throttleRetryResult; | ||
} | ||
|
||
/// <summary> | ||
|
@@ -143,7 +155,18 @@ public async Task<ShouldRetryResult> ShouldRetryAsync( | |
return shouldRetryResult; | ||
} | ||
|
||
return await this.throttlingRetry.ShouldRetryAsync(cosmosResponseMessage, cancellationToken); | ||
ShouldRetryResult throttleRetryResult = await this.throttlingRetry.ShouldRetryAsync(cosmosResponseMessage, cancellationToken); | ||
|
||
// Today, the only scenario where we would receive a ServiceUnavailableException from the Throttling Retry Policy | ||
// is when we get 410 (Gone) with sub status code 3092 (System Resource Not Available). Note that this is applicable | ||
// for write requests targeted to a multiple master account. In such case, the 410/3092 will get converted into 503. | ||
if (throttleRetryResult.ExceptionToThrow is ServiceUnavailableException) | ||
{ | ||
return this.TryMarkEndpointUnavailableForPkRangeAndRetryOnServiceUnavailable( | ||
shouldMarkEndpointUnavailableForPkRange: true); | ||
} | ||
|
||
return throttleRetryResult; | ||
} | ||
|
||
/// <summary> | ||
|
@@ -177,6 +200,7 @@ public void OnBeforeSendRequest(DocumentServiceRequest request) | |
// This enables marking the endpoint unavailability on endpoint failover/unreachability | ||
this.locationEndpoint = this.globalEndpointManager.ResolveServiceEndpoint(request); | ||
request.RequestContext.RouteToLocation(this.locationEndpoint); | ||
this.throttlingRetry.OnBeforeSendRequest(request); | ||
} | ||
|
||
private async Task<ShouldRetryResult> ShouldRetryInternalAsync( | ||
|
@@ -274,16 +298,8 @@ private async Task<ShouldRetryResult> ShouldRetryInternalAsync( | |
// Received 503 due to client connect timeout or Gateway | ||
if (statusCode == HttpStatusCode.ServiceUnavailable) | ||
{ | ||
DefaultTrace.TraceWarning("ClientRetryPolicy: ServiceUnavailable. Refresh cache and retry. Failed Location: {0}; ResourceAddress: {1}", | ||
this.documentServiceRequest?.RequestContext?.LocationEndpointToRoute?.ToString() ?? string.Empty, | ||
this.documentServiceRequest?.ResourceAddress ?? string.Empty); | ||
|
||
// Mark the partition as unavailable. | ||
// Let the ClientRetry logic decide if the request should be retried | ||
this.partitionKeyRangeLocationCache.TryMarkEndpointUnavailableForPartitionKeyRange( | ||
this.documentServiceRequest); | ||
|
||
return this.ShouldRetryOnServiceUnavailable(); | ||
return this.TryMarkEndpointUnavailableForPkRangeAndRetryOnServiceUnavailable( | ||
shouldMarkEndpointUnavailableForPkRange: true); | ||
} | ||
|
||
return null; | ||
|
@@ -406,6 +422,33 @@ private ShouldRetryResult ShouldRetryOnSessionNotAvailable(DocumentServiceReques | |
} | ||
} | ||
|
||
/// <summary> | ||
/// Attempts to mark the endpoint associated with the current partition key range as unavailable and determines if | ||
/// a retry should be performed due to a ServiceUnavailable (503) response. This method is invoked when a 503 | ||
/// Service Unavailable response is received, indicating that the service might be temporarily unavailable. | ||
/// It optionally marks the partition key range as unavailable, which will influence future routing decisions. | ||
/// </summary> | ||
/// <param name="shouldMarkEndpointUnavailableForPkRange">A boolean flag indicating whether the endpoint for the | ||
/// current partition key range should be marked as unavailable.</param> | ||
/// <returns>An instance of <see cref="ShouldRetryResult"/> indicating whether the operation should be retried.</returns> | ||
private ShouldRetryResult TryMarkEndpointUnavailableForPkRangeAndRetryOnServiceUnavailable( | ||
bool shouldMarkEndpointUnavailableForPkRange) | ||
{ | ||
DefaultTrace.TraceWarning("ClientRetryPolicy: ServiceUnavailable. Refresh cache and retry. Failed Location: {0}; ResourceAddress: {1}", | ||
this.documentServiceRequest?.RequestContext?.LocationEndpointToRoute?.ToString() ?? string.Empty, | ||
this.documentServiceRequest?.ResourceAddress ?? string.Empty); | ||
|
||
if (shouldMarkEndpointUnavailableForPkRange) | ||
{ | ||
// Mark the partition as unavailable. | ||
// Let the ClientRetry logic decide if the request should be retried | ||
this.partitionKeyRangeLocationCache.TryMarkEndpointUnavailableForPartitionKeyRange( | ||
this.documentServiceRequest); | ||
} | ||
|
||
return this.ShouldRetryOnServiceUnavailable(); | ||
} | ||
|
||
/// <summary> | ||
/// For a ServiceUnavailable (503.0) we could be having a timeout from Direct/TCP locally or a request to Gateway request with a similar response due to an endpoint not yet available. | ||
/// We try and retry the request only if there are other regions available. The retry logic is applicable for single master write accounts as well. | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -529,11 +529,12 @@ public bool ShouldRefreshEndpoints(out bool canRefreshInBackground) | |
|
||
public bool CanUseMultipleWriteLocations(DocumentServiceRequest request) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Its used in other scenario like evaluating for sessionToken sending or not. These are two different usecases and should be distinguished. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Ideally the background account refresh ( |
||
{ | ||
return this.CanUseMultipleWriteLocations() && | ||
(request.ResourceType == ResourceType.Document || | ||
return this.CanUseMultipleWriteLocations() | ||
&& this.locationInfo.AvailableWriteLocations.Count > 1 | ||
&& (request.ResourceType == ResourceType.Document || | ||
(request.ResourceType == ResourceType.StoredProcedure && request.OperationType == Documents.OperationType.ExecuteJavaScript)); | ||
} | ||
|
||
} | ||
private void ClearStaleEndpointUnavailabilityInfo() | ||
{ | ||
if (this.locationUnavailablityInfoByEndpoint.Any()) | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
410/1022 is directly coming from direct package right? No changes are needed.
How about a separate PR just with that scope?
In-general that's the pattern we are following where direct revision with features it brings.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Make sense. However, if we are splitting this PR into two (410/1022 and 429/3092), then the Direct PR needed to go first since this work (329/3092) has dependency on the sub status codes, which is originated from the Direct package.