diff --git a/Microsoft.Azure.Cosmos/src/ClientRetryPolicy.cs b/Microsoft.Azure.Cosmos/src/ClientRetryPolicy.cs
index 3137a6f042..6b3cd3f882 100644
--- a/Microsoft.Azure.Cosmos/src/ClientRetryPolicy.cs
+++ b/Microsoft.Azure.Cosmos/src/ClientRetryPolicy.cs
@@ -35,6 +35,7 @@ internal sealed class ClientRetryPolicy : IDocumentClientRetryPolicy
         private int serviceUnavailableRetryCount;
         private bool isReadRequest;
         private bool canUseMultipleWriteLocations;
+        private bool isMultiMasterWriteRequest;
         private Uri locationEndpoint;
         private RetryContext retryContext;
         private DocumentServiceRequest documentServiceRequest;
@@ -57,6 +58,7 @@ public ClientRetryPolicy(
             this.sessionTokenRetryCount = 0;
             this.serviceUnavailableRetryCount = 0;
             this.canUseMultipleWriteLocations = false;
+            this.isMultiMasterWriteRequest = false;
             this.isPertitionLevelFailoverEnabled = isPertitionLevelFailoverEnabled;
         }
 
@@ -97,6 +99,23 @@ public async Task<ShouldRetryResult> ShouldRetryAsync(
 
             if (exception is DocumentClientException clientException)
             {
+                // Today, the only scenario where we would treat a throttling (429) exception as service unavailable is when we
+                // get 429 (TooManyRequests) with sub status code 3092 (System Resource Not Available). Note that this is applicable
+                // for write requests targeted to a multiple master account. In such case, the 429/3092 will be treated as 503. The
+                // reason to keep the code out of the throttling retry policy is that in the near future, the 3092 sub status code
+                // might not be a throttling scenario at all and the status code in that case would be different than 429.
+                if (this.ShouldMarkEndpointUnavailableOnSystemResourceUnavailableForWrite(
+                    clientException.StatusCode,
+                    clientException.GetSubStatus()))
+                {
+                    DefaultTrace.TraceError(
+                        "Operation will NOT be retried on local region. Treating SystemResourceUnavailable (429/3092) as ServiceUnavailable (503). Status code: {0}, sub status code: {1}.",
+                        StatusCodes.TooManyRequests, SubStatusCodes.SystemResourceUnavailable);
+
+                    return this.TryMarkEndpointUnavailableForPkRangeAndRetryOnServiceUnavailable(
+                        shouldMarkEndpointUnavailableForPkRange: true);
+                }
+
                 ShouldRetryResult shouldRetryResult = await this.ShouldRetryInternalAsync(
                     clientException?.StatusCode,
                     clientException?.GetSubStatus());
@@ -143,6 +162,23 @@ public async Task<ShouldRetryResult> ShouldRetryAsync(
                 return shouldRetryResult;
             }
 
+            // Today, the only scenario where we would treat a throttling (429) exception as service unavailable is when we
+            // get 429 (TooManyRequests) with sub status code 3092 (System Resource Not Available). Note that this is applicable
+            // for write requests targeted to a multiple master account. In such case, the 429/3092 will be treated as 503. The
+            // reason to keep the code out of the throttling retry policy is that in the near future, the 3092 sub status code
+            // might not be a throttling scenario at all and the status code in that case would be different than 429.
+            if (this.ShouldMarkEndpointUnavailableOnSystemResourceUnavailableForWrite(
+                cosmosResponseMessage.StatusCode,
+                cosmosResponseMessage?.Headers.SubStatusCode))
+            {
+                DefaultTrace.TraceError(
+                    "Operation will NOT be retried on local region. Treating SystemResourceUnavailable (429/3092) as ServiceUnavailable (503). Status code: {0}, sub status code: {1}.",
+                    StatusCodes.TooManyRequests, SubStatusCodes.SystemResourceUnavailable);
+
+                return this.TryMarkEndpointUnavailableForPkRangeAndRetryOnServiceUnavailable(
+                    shouldMarkEndpointUnavailableForPkRange: true);
+            }
+
             return await this.throttlingRetry.ShouldRetryAsync(cosmosResponseMessage, cancellationToken);
         }
 
@@ -156,6 +192,8 @@ public void OnBeforeSendRequest(DocumentServiceRequest request)
             this.isReadRequest = request.IsReadOnlyRequest;
             this.canUseMultipleWriteLocations = this.globalEndpointManager.CanUseMultipleWriteLocations(request);
             this.documentServiceRequest = request;
+            this.isMultiMasterWriteRequest = !this.isReadRequest
+                && (this.globalEndpointManager?.CanSupportMultipleWriteLocations(request) ?? false);
 
             // clear previous location-based routing directive
             request.RequestContext.ClearRouteToLocation();
@@ -274,16 +312,8 @@ private async Task<ShouldRetryResult> ShouldRetryInternalAsync(
             // Received 503 due to client connect timeout or Gateway
             if (statusCode == HttpStatusCode.ServiceUnavailable)
             {
-                DefaultTrace.TraceWarning("ClientRetryPolicy: ServiceUnavailable. Refresh cache and retry. Failed Location: {0}; ResourceAddress: {1}",
-                    this.documentServiceRequest?.RequestContext?.LocationEndpointToRoute?.ToString() ?? string.Empty,
-                    this.documentServiceRequest?.ResourceAddress ?? string.Empty);
-
-                // Mark the partition as unavailable.
-                // Let the ClientRetry logic decide if the request should be retried
-                this.partitionKeyRangeLocationCache.TryMarkEndpointUnavailableForPartitionKeyRange(
-                    this.documentServiceRequest);
-
-                return this.ShouldRetryOnServiceUnavailable();
+                return this.TryMarkEndpointUnavailableForPkRangeAndRetryOnServiceUnavailable(
+                    shouldMarkEndpointUnavailableForPkRange: true);
             }
 
             return null;
@@ -406,6 +436,33 @@ private ShouldRetryResult ShouldRetryOnSessionNotAvailable(DocumentServiceReques
             }
         }
 
+        /// <summary>
+        /// Attempts to mark the endpoint associated with the current partition key range as unavailable and determines if
+        /// a retry should be performed due to a ServiceUnavailable (503) response. This method is invoked when a 503
+        /// Service Unavailable response is received, indicating that the service might be temporarily unavailable.
+        /// It optionally marks the partition key range as unavailable, which will influence future routing decisions.
+        /// </summary>
+        /// <param name="shouldMarkEndpointUnavailableForPkRange">A boolean flag indicating whether the endpoint for the
+        /// current partition key range should be marked as unavailable.</param>
+        /// <returns>An instance of <see cref="ShouldRetryResult"/> indicating whether the operation should be retried.</returns>
+        private ShouldRetryResult TryMarkEndpointUnavailableForPkRangeAndRetryOnServiceUnavailable(
+            bool shouldMarkEndpointUnavailableForPkRange)
+        {
+            DefaultTrace.TraceWarning("ClientRetryPolicy: ServiceUnavailable. Refresh cache and retry. Failed Location: {0}; ResourceAddress: {1}",
+                this.documentServiceRequest?.RequestContext?.LocationEndpointToRoute?.ToString() ?? string.Empty,
+                this.documentServiceRequest?.ResourceAddress ?? string.Empty);
+
+            if (shouldMarkEndpointUnavailableForPkRange)
+            {
+                // Mark the partition as unavailable.
+                // Let the ClientRetry logic decide if the request should be retried
+                this.partitionKeyRangeLocationCache.TryMarkEndpointUnavailableForPartitionKeyRange(
+                    this.documentServiceRequest);
+            }
+
+            return this.ShouldRetryOnServiceUnavailable();
+        }
+
         /// <summary>
         /// For a ServiceUnavailable (503.0) we could be having a timeout from Direct/TCP locally or a request to Gateway request with a similar response due to an endpoint not yet available.
        /// We try and retry the request only if there are other regions available. The retry logic is applicable for single master write accounts as well.
@@ -449,6 +506,24 @@ private ShouldRetryResult ShouldRetryOnServiceUnavailable()
             return ShouldRetryResult.RetryAfter(TimeSpan.Zero);
         }
 
+        /// <summary>
+        /// Returns a boolean flag indicating if the endpoint should be marked as unavailable
+        /// due to a 429 response with a sub status code of 3092 (system resource unavailable).
+        /// This is applicable for write requests targeted for multi master accounts.
+        /// </summary>
+        /// <param name="statusCode">An instance of <see cref="HttpStatusCode"/> containing the status code.</param>
+        /// <param name="subStatusCode">An instance of <see cref="SubStatusCodes"/> containing the sub status code.</param>
+        /// <returns>A boolean flag indicating if the endpoint should be marked as unavailable.</returns>
+        private bool ShouldMarkEndpointUnavailableOnSystemResourceUnavailableForWrite(
+            HttpStatusCode? statusCode,
+            SubStatusCodes? subStatusCode)
+        {
+            return this.isMultiMasterWriteRequest
+                && statusCode.HasValue
+                && (int)statusCode.Value == (int)StatusCodes.TooManyRequests
+                && subStatusCode == SubStatusCodes.SystemResourceUnavailable;
+        }
+
         private sealed class RetryContext
         {
             public int RetryLocationIndex { get; set; }
diff --git a/Microsoft.Azure.Cosmos/src/Routing/GlobalEndpointManager.cs b/Microsoft.Azure.Cosmos/src/Routing/GlobalEndpointManager.cs
index 4e5184177b..3c0a0c7633 100644
--- a/Microsoft.Azure.Cosmos/src/Routing/GlobalEndpointManager.cs
+++ b/Microsoft.Azure.Cosmos/src/Routing/GlobalEndpointManager.cs
@@ -534,6 +534,20 @@ public virtual async Task RefreshLocationAsync(bool forceRefresh = false)
             await this.RefreshDatabaseAccountInternalAsync(forceRefresh: forceRefresh);
         }
+
+        /// <summary>
+        /// Determines whether the current configuration and state of the service allow for supporting multiple write locations.
+        /// This method returns True if the number of AvailableWriteLocations in the LocationCache is more than 1. Otherwise, it returns False.
+        /// </summary>
+        /// <param name="request">The document service request for which the write location support is being evaluated.</param>
+        /// <returns>A boolean flag indicating if the available write locations are more than one.</returns>
+        public bool CanSupportMultipleWriteLocations(DocumentServiceRequest request)
+        {
+            return this.locationCache.CanUseMultipleWriteLocations()
+                && this.locationCache.GetAvailableWriteLocations()?.Count > 1
+                && (request.ResourceType == ResourceType.Document ||
+                (request.ResourceType == ResourceType.StoredProcedure && request.OperationType == OperationType.Execute));
+        }
 
 #pragma warning disable VSTHRD100 // Avoid async void methods
         private async void StartLocationBackgroundRefreshLoop()
diff --git a/Microsoft.Azure.Cosmos/src/Routing/IGlobalEndpointManager.cs b/Microsoft.Azure.Cosmos/src/Routing/IGlobalEndpointManager.cs
index 427bbb8d2d..fe93027aec 100644
--- a/Microsoft.Azure.Cosmos/src/Routing/IGlobalEndpointManager.cs
+++ b/Microsoft.Azure.Cosmos/src/Routing/IGlobalEndpointManager.cs
@@ -36,5 +36,7 @@ internal interface IGlobalEndpointManager : IDisposable
         ReadOnlyDictionary<string, Uri> GetAvailableWriteEndpointsByLocation();
 
         ReadOnlyDictionary<string, Uri> GetAvailableReadEndpointsByLocation();
+
+        bool CanSupportMultipleWriteLocations(DocumentServiceRequest request);
     }
 }
\ No newline at end of file
diff --git a/Microsoft.Azure.Cosmos/src/Routing/LocationCache.cs b/Microsoft.Azure.Cosmos/src/Routing/LocationCache.cs
index c95223f044..cc108a722e 100644
--- a/Microsoft.Azure.Cosmos/src/Routing/LocationCache.cs
+++ b/Microsoft.Azure.Cosmos/src/Routing/LocationCache.cs
@@ -263,6 +263,11 @@ public ReadOnlyCollection<string> GetAvailableReadLocations()
         {
             return this.locationInfo.AvailableReadLocations;
         }
+
+        public ReadOnlyCollection<string> GetAvailableWriteLocations()
+        {
+            return this.locationInfo.AvailableWriteLocations;
+        }
 
         /// <summary>
         /// Resolves request to service endpoint.
@@ -532,8 +537,8 @@ public bool CanUseMultipleWriteLocations(DocumentServiceRequest request)
             return this.CanUseMultipleWriteLocations()
                 && (request.ResourceType == ResourceType.Document ||
                 (request.ResourceType == ResourceType.StoredProcedure && request.OperationType == Documents.OperationType.ExecuteJavaScript));
-        }
-
+        }
+
         private void ClearStaleEndpointUnavailabilityInfo()
         {
             if (this.locationUnavailablityInfoByEndpoint.Any())
@@ -768,7 +773,7 @@ private ReadOnlyDictionary<string, Uri> GetEndpointByLocation(IEnumerable
             return new ReadOnlyDictionary<string, Uri>(endpointsByLocation);
         }
 
-        private bool CanUseMultipleWriteLocations()
+        internal bool CanUseMultipleWriteLocations()
         {
             return this.useMultipleWriteLocations && this.enableMultipleWriteLocations;
         }
diff --git a/Microsoft.Azure.Cosmos/tests/Microsoft.Azure.Cosmos.Tests/ClientRetryPolicyTests.cs b/Microsoft.Azure.Cosmos/tests/Microsoft.Azure.Cosmos.Tests/ClientRetryPolicyTests.cs
index d4c56bae2e..caa56b2911 100644
--- a/Microsoft.Azure.Cosmos/tests/Microsoft.Azure.Cosmos.Tests/ClientRetryPolicyTests.cs
+++ b/Microsoft.Azure.Cosmos/tests/Microsoft.Azure.Cosmos.Tests/ClientRetryPolicyTests.cs
@@ -86,6 +86,82 @@ public void MultimasterMetadataWriteRetryTest()
             retryPolicy.OnBeforeSendRequest(request);
             Assert.AreEqual(request.RequestContext.LocationEndpointToRoute, ClientRetryPolicyTests.Location1Endpoint);
         }
+
+        /// <summary>
+        /// Test to validate that when 429.3092 is thrown from the service, write requests on
+        /// a multi master account should be converted to 503 and retried to the next region.
+        /// </summary>
+        [TestMethod]
+        [DataRow(true, DisplayName = "Validate retry policy with multi master write account.")]
+        [DataRow(false, DisplayName = "Validate retry policy with single master write account.")]
+        public async Task ShouldRetryAsync_WhenRequestThrottledWithResourceNotAvailable_ShouldThrow503OnMultiMasterWriteAndRetryOnNextRegion(
+            bool isMultiMasterAccount)
+        {
+            // Arrange.
+            const bool enableEndpointDiscovery = true;
+            using GlobalEndpointManager endpointManager = this.Initialize(
+                useMultipleWriteLocations: isMultiMasterAccount,
+                enableEndpointDiscovery: enableEndpointDiscovery,
+                isPreferredLocationsListEmpty: false,
+                multimasterMetadataWriteRetryTest: true);
+
+            await endpointManager.RefreshLocationAsync();
+
+            ClientRetryPolicy retryPolicy = new (
+                endpointManager,
+                this.partitionKeyRangeLocationCache,
+                new RetryOptions(),
+                enableEndpointDiscovery,
+                false);
+
+            // Creates a sample write request.
+            DocumentServiceRequest request = this.CreateRequest(
+                isReadRequest: false,
+                isMasterResourceType: false);
+
+            // On first attempt should get (default/non hub) location.
+            retryPolicy.OnBeforeSendRequest(request);
+            Assert.AreEqual(request.RequestContext.LocationEndpointToRoute, ClientRetryPolicyTests.Location1Endpoint);
+
+            // Creation of 429.3092 Error.
+            HttpStatusCode throttleException = HttpStatusCode.TooManyRequests;
+            SubStatusCodes resourceNotAvailable = SubStatusCodes.SystemResourceUnavailable;
+
+            Exception innerException = new ();
+            Mock<INameValueCollection> nameValueCollection = new ();
+            DocumentClientException documentClientException = new (
+                message: "SystemResourceUnavailable: 429 with 3092 occurred.",
+                innerException: innerException,
+                statusCode: throttleException,
+                substatusCode: resourceNotAvailable,
+                requestUri: request.RequestContext.LocationEndpointToRoute,
+                responseHeaders: nameValueCollection.Object);
+
+            // Act.
+            Task<ShouldRetryResult> shouldRetry = retryPolicy.ShouldRetryAsync(
+                documentClientException,
+                new CancellationToken());
+
+            // Assert.
+            Assert.IsTrue(shouldRetry.Result.ShouldRetry);
+            retryPolicy.OnBeforeSendRequest(request);
+
+            if (isMultiMasterAccount)
+            {
+                Assert.AreEqual(
+                    expected: ClientRetryPolicyTests.Location2Endpoint,
+                    actual: request.RequestContext.LocationEndpointToRoute,
+                    message: "The request should be routed to the next region, since the account is a multi master write account and the request " +
+                        "failed with 429.3092, which got converted into 503 internally. This should trigger another retry attempt to the next region.");
+            }
+            else
+            {
+                Assert.AreEqual(
+                    expected: ClientRetryPolicyTests.Location1Endpoint,
+                    actual: request.RequestContext.LocationEndpointToRoute,
+                    message: "Since this is a single master account, the write request should not be retried on the next region.");
+            }
+        }
 
         /// <summary>
         /// Tests to see if different 503 substatus codes are handeled correctly