From e18aa95c462cd718d3cf77dd8f70e28e74f05af8 Mon Sep 17 00:00:00 2001 From: Debdatta Kunda Date: Wed, 11 Sep 2024 12:54:46 -0700 Subject: [PATCH 01/14] Initial code changes to throw 503 on 429/3092. --- .../src/Batch/BatchAsyncContainerExecutor.cs | 8 +- .../src/ClientRetryPolicy.cs | 1 + Microsoft.Azure.Cosmos/src/DocumentClient.cs | 5 +- .../src/MetadataRequestThrottleRetryPolicy.cs | 1 + .../src/ResourceThrottleRetryPolicy.cs | 38 +++- .../src/Routing/LocationCache.cs | 9 +- .../Batch/BatchAsyncBatcherTests.cs | 41 ++-- .../Batch/BatchAsyncOperationContextTests.cs | 36 +++- ...lkPartitionKeyRangeGoneRetryPolicyTests.cs | 34 +++- .../ResourceThrottleRetryPolicyTests.cs | 180 +++++++++++++++++- 10 files changed, 312 insertions(+), 41 deletions(-) diff --git a/Microsoft.Azure.Cosmos/src/Batch/BatchAsyncContainerExecutor.cs b/Microsoft.Azure.Cosmos/src/Batch/BatchAsyncContainerExecutor.cs index be166ed5a9..69eddb2fbc 100644 --- a/Microsoft.Azure.Cosmos/src/Batch/BatchAsyncContainerExecutor.cs +++ b/Microsoft.Azure.Cosmos/src/Batch/BatchAsyncContainerExecutor.cs @@ -92,7 +92,11 @@ public virtual async Task AddAsync( ItemBatchOperationContext context = new ItemBatchOperationContext( resolvedPartitionKeyRangeId, trace, - BatchAsyncContainerExecutor.GetRetryPolicy(this.cosmosContainer, operation.OperationType, this.retryOptions)); + BatchAsyncContainerExecutor.GetRetryPolicy( + this.cosmosContainer, + this.cosmosClientContext?.DocumentClient?.GlobalEndpointManager, + operation.OperationType, + this.retryOptions)); if (itemRequestOptions != null && itemRequestOptions.AddRequestHeaders != null) { @@ -159,6 +163,7 @@ internal virtual async Task ValidateOperationAsync( private static IDocumentClientRetryPolicy GetRetryPolicy( ContainerInternal containerInternal, + GlobalEndpointManager endpointManager, OperationType operationType, RetryOptions retryOptions) { @@ -167,6 +172,7 @@ private static IDocumentClientRetryPolicy GetRetryPolicy( operationType, new ResourceThrottleRetryPolicy( retryOptions.MaxRetryAttemptsOnThrottledRequests, + endpointManager, retryOptions.MaxRetryWaitTimeInSeconds)); } diff --git a/Microsoft.Azure.Cosmos/src/ClientRetryPolicy.cs b/Microsoft.Azure.Cosmos/src/ClientRetryPolicy.cs index 3137a6f042..d0b74e2fc7 100644 --- a/Microsoft.Azure.Cosmos/src/ClientRetryPolicy.cs +++ b/Microsoft.Azure.Cosmos/src/ClientRetryPolicy.cs @@ -48,6 +48,7 @@ public ClientRetryPolicy( { this.throttlingRetry = new ResourceThrottleRetryPolicy( retryOptions.MaxRetryAttemptsOnThrottledRequests, + endpointManager: globalEndpointManager, retryOptions.MaxRetryWaitTimeInSeconds); this.globalEndpointManager = globalEndpointManager; diff --git a/Microsoft.Azure.Cosmos/src/DocumentClient.cs b/Microsoft.Azure.Cosmos/src/DocumentClient.cs index 946fa4cb08..0f4ce7c9fd 100644 --- a/Microsoft.Azure.Cosmos/src/DocumentClient.cs +++ b/Microsoft.Azure.Cosmos/src/DocumentClient.cs @@ -988,8 +988,9 @@ internal virtual void Initialize(Uri serviceEndpoint, this.initializeTaskFactory = (_) => TaskHelper.InlineIfPossible( () => this.GetInitializationTaskAsync(storeClientFactory: storeClientFactory), new ResourceThrottleRetryPolicy( - this.ConnectionPolicy.RetryOptions.MaxRetryAttemptsOnThrottledRequests, - this.ConnectionPolicy.RetryOptions.MaxRetryWaitTimeInSeconds)); + maxAttemptCount: this.ConnectionPolicy.RetryOptions.MaxRetryAttemptsOnThrottledRequests, + endpointManager: this.GlobalEndpointManager, + maxWaitTimeInSeconds: this.ConnectionPolicy.RetryOptions.MaxRetryWaitTimeInSeconds)); // Create the task to start the initialize task // Task will be awaited on in the EnsureValidClientAsync diff --git a/Microsoft.Azure.Cosmos/src/MetadataRequestThrottleRetryPolicy.cs b/Microsoft.Azure.Cosmos/src/MetadataRequestThrottleRetryPolicy.cs index 928d2f2e87..8f56139957 100644 --- a/Microsoft.Azure.Cosmos/src/MetadataRequestThrottleRetryPolicy.cs +++ b/Microsoft.Azure.Cosmos/src/MetadataRequestThrottleRetryPolicy.cs @@ -72,6 +72,7 @@ public MetadataRequestThrottleRetryPolicy( this.throttlingRetryPolicy = new ResourceThrottleRetryPolicy( maxRetryAttemptsOnThrottledRequests, + this.globalEndpointManager, maxRetryWaitTimeInSeconds); this.retryContext = new MetadataRetryContext diff --git a/Microsoft.Azure.Cosmos/src/ResourceThrottleRetryPolicy.cs b/Microsoft.Azure.Cosmos/src/ResourceThrottleRetryPolicy.cs index a99bc594f7..c74ee89e82 100644 --- a/Microsoft.Azure.Cosmos/src/ResourceThrottleRetryPolicy.cs +++ b/Microsoft.Azure.Cosmos/src/ResourceThrottleRetryPolicy.cs @@ -9,6 +9,7 @@ namespace Microsoft.Azure.Cosmos using System.Threading; using System.Threading.Tasks; using Microsoft.Azure.Cosmos.Core.Trace; + using Microsoft.Azure.Cosmos.Routing; using Microsoft.Azure.Documents; // Retry when we receive the throttling from server. @@ -19,12 +20,15 @@ internal sealed class ResourceThrottleRetryPolicy : IDocumentClientRetryPolicy private readonly uint backoffDelayFactor; private readonly int maxAttemptCount; private readonly TimeSpan maxWaitTimeInMilliseconds; + private readonly IGlobalEndpointManager globalEndpointManager; private int currentAttemptCount; private TimeSpan cumulativeRetryDelay; + private bool? isMultiMasterWriteRegion; public ResourceThrottleRetryPolicy( int maxAttemptCount, + IGlobalEndpointManager endpointManager, int maxWaitTimeInSeconds = DefaultMaxWaitTimeInSeconds, uint backoffDelayFactor = 1) { @@ -33,6 +37,7 @@ public ResourceThrottleRetryPolicy( throw new ArgumentException("maxWaitTimeInSeconds", "maxWaitTimeInSeconds must be less than " + (int.MaxValue / 1000)); } + this.globalEndpointManager = endpointManager; this.maxAttemptCount = maxAttemptCount; this.backoffDelayFactor = backoffDelayFactor; this.maxWaitTimeInMilliseconds = TimeSpan.FromSeconds(maxWaitTimeInSeconds); @@ -59,7 +64,9 @@ public Task ShouldRetryAsync( return Task.FromResult(ShouldRetryResult.NoRetry()); } - return this.ShouldRetryInternalAsync(dce.RetryAfter); + return this.ShouldRetryInternalAsync( + dce?.GetSubStatus(), + dce?.RetryAfter); } DefaultTrace.TraceError( @@ -88,11 +95,34 @@ public Task ShouldRetryAsync( return Task.FromResult(ShouldRetryResult.NoRetry()); } - return this.ShouldRetryInternalAsync(cosmosResponseMessage?.Headers.RetryAfter); + return this.ShouldRetryInternalAsync( + cosmosResponseMessage?.Headers.SubStatusCode, + cosmosResponseMessage?.Headers.RetryAfter, + cosmosResponseMessage?.CosmosException); } - private Task ShouldRetryInternalAsync(TimeSpan? retryAfter) + private Task ShouldRetryInternalAsync( + SubStatusCodes? subStatusCode, + TimeSpan? retryAfter, + Exception exception = null) { + if (this.isMultiMasterWriteRegion.HasValue + && this.isMultiMasterWriteRegion.Value + && subStatusCode != null + && subStatusCode == SubStatusCodes.AadTokenExpired) + { + DefaultTrace.TraceError( + "Operation will NOT be retried. Converting 429/3092 to 503. Current attempt {0} sub status code: {1}.", + this.currentAttemptCount, SubStatusCodes.AadTokenExpired); + + ServiceUnavailableException exceptionToThrow = ServiceUnavailableException.Create( + SubStatusCodes.AadTokenExpired, + innerException: exception); + + return Task.FromResult( + ShouldRetryResult.NoRetry(exceptionToThrow)); + } + TimeSpan retryDelay = TimeSpan.Zero; if (this.currentAttemptCount < this.maxAttemptCount && this.CheckIfRetryNeeded(retryAfter, out retryDelay)) @@ -133,6 +163,8 @@ private object GetExceptionMessage(Exception exception) /// The request being sent to the service. public void OnBeforeSendRequest(DocumentServiceRequest request) { + this.isMultiMasterWriteRegion = !request.IsReadOnlyRequest + && (this.globalEndpointManager?.CanUseMultipleWriteLocations(request) ?? false); } /// diff --git a/Microsoft.Azure.Cosmos/src/Routing/LocationCache.cs b/Microsoft.Azure.Cosmos/src/Routing/LocationCache.cs index c95223f044..eb14c08c0c 100644 --- a/Microsoft.Azure.Cosmos/src/Routing/LocationCache.cs +++ b/Microsoft.Azure.Cosmos/src/Routing/LocationCache.cs @@ -529,11 +529,12 @@ public bool ShouldRefreshEndpoints(out bool canRefreshInBackground) public bool CanUseMultipleWriteLocations(DocumentServiceRequest request) { - return this.CanUseMultipleWriteLocations() && - (request.ResourceType == ResourceType.Document || + return this.CanUseMultipleWriteLocations() + && this.locationInfo.AvailableWriteLocations.Count > 1 + && (request.ResourceType == ResourceType.Document || (request.ResourceType == ResourceType.StoredProcedure && request.OperationType == Documents.OperationType.ExecuteJavaScript)); - } - + } + private void ClearStaleEndpointUnavailabilityInfo() { if (this.locationUnavailablityInfoByEndpoint.Any()) diff --git a/Microsoft.Azure.Cosmos/tests/Microsoft.Azure.Cosmos.Tests/Batch/BatchAsyncBatcherTests.cs b/Microsoft.Azure.Cosmos/tests/Microsoft.Azure.Cosmos.Tests/Batch/BatchAsyncBatcherTests.cs index acc83d3ff4..1b421152bd 100644 --- a/Microsoft.Azure.Cosmos/tests/Microsoft.Azure.Cosmos.Tests/Batch/BatchAsyncBatcherTests.cs +++ b/Microsoft.Azure.Cosmos/tests/Microsoft.Azure.Cosmos.Tests/Batch/BatchAsyncBatcherTests.cs @@ -23,6 +23,23 @@ public class BatchAsyncBatcherTests { private static readonly Exception expectedException = new Exception(); private static readonly BatchPartitionMetric metric = new BatchPartitionMetric(); + private GlobalEndpointManager mockedEndpointManager; + + [TestInitialize] + public void Initialize() + { + Mock mockedClient = new(); + + this.mockedEndpointManager = new( + mockedClient.Object, + new ConnectionPolicy()); + } + + [TestCleanup] + public void Cleanup() + { + this.mockedEndpointManager.Dispose(); + } private ItemBatchOperation CreateItemBatchOperation(bool withContext = false) { @@ -565,12 +582,12 @@ public async Task RetrierGetsCalledOnSplit() IDocumentClientRetryPolicy retryPolicy1 = new BulkExecutionRetryPolicy( GetSplitEnabledContainer(), OperationType.Read, - new ResourceThrottleRetryPolicy(1)); + new ResourceThrottleRetryPolicy(1, this.mockedEndpointManager)); IDocumentClientRetryPolicy retryPolicy2 = new BulkExecutionRetryPolicy( GetSplitEnabledContainer(), OperationType.Read, - new ResourceThrottleRetryPolicy(1)); + new ResourceThrottleRetryPolicy(1, this.mockedEndpointManager)); ItemBatchOperation operation1 = this.CreateItemBatchOperation(); ItemBatchOperation operation2 = this.CreateItemBatchOperation(); @@ -594,12 +611,12 @@ public async Task RetrierGetsCalledOnCompletingSplit() IDocumentClientRetryPolicy retryPolicy1 = new BulkExecutionRetryPolicy( GetSplitEnabledContainer(), OperationType.Read, - new ResourceThrottleRetryPolicy(1)); + new ResourceThrottleRetryPolicy(1, this.mockedEndpointManager)); IDocumentClientRetryPolicy retryPolicy2 = new BulkExecutionRetryPolicy( GetSplitEnabledContainer(), OperationType.Read, - new ResourceThrottleRetryPolicy(1)); + new ResourceThrottleRetryPolicy(1, this.mockedEndpointManager)); ItemBatchOperation operation1 = this.CreateItemBatchOperation(); ItemBatchOperation operation2 = this.CreateItemBatchOperation(); @@ -623,12 +640,12 @@ public async Task RetrierGetsCalledOnCompletingPartitionMigration() IDocumentClientRetryPolicy retryPolicy1 = new BulkExecutionRetryPolicy( GetSplitEnabledContainer(), OperationType.Read, - new ResourceThrottleRetryPolicy(1)); + new ResourceThrottleRetryPolicy(1, this.mockedEndpointManager)); IDocumentClientRetryPolicy retryPolicy2 = new BulkExecutionRetryPolicy( GetSplitEnabledContainer(), OperationType.Read, - new ResourceThrottleRetryPolicy(1)); + new ResourceThrottleRetryPolicy(1, this.mockedEndpointManager)); ItemBatchOperation operation1 = this.CreateItemBatchOperation(); ItemBatchOperation operation2 = this.CreateItemBatchOperation(); @@ -672,17 +689,17 @@ public async Task RetrierGetsCalledOn413_3402() IDocumentClientRetryPolicy retryPolicy1 = new BulkExecutionRetryPolicy( GetSplitEnabledContainer(), OperationType.Read, - new ResourceThrottleRetryPolicy(1)); + new ResourceThrottleRetryPolicy(1, this.mockedEndpointManager)); IDocumentClientRetryPolicy retryPolicy2 = new BulkExecutionRetryPolicy( GetSplitEnabledContainer(), OperationType.Read, - new ResourceThrottleRetryPolicy(1)); + new ResourceThrottleRetryPolicy(1, this.mockedEndpointManager)); IDocumentClientRetryPolicy retryPolicy3 = new BulkExecutionRetryPolicy( GetSplitEnabledContainer(), OperationType.Create, - new ResourceThrottleRetryPolicy(1)); + new ResourceThrottleRetryPolicy(1, this.mockedEndpointManager)); ItemBatchOperation operation1 = this.CreateItemBatchOperation(); ItemBatchOperation operation2 = this.CreateItemBatchOperation(); @@ -710,17 +727,17 @@ public async Task RetrierGetsCalledOn413_NoSubstatus() IDocumentClientRetryPolicy retryPolicy1 = new BulkExecutionRetryPolicy( GetSplitEnabledContainer(), OperationType.Read, - new ResourceThrottleRetryPolicy(1)); + new ResourceThrottleRetryPolicy(1, this.mockedEndpointManager)); IDocumentClientRetryPolicy retryPolicy2 = new BulkExecutionRetryPolicy( GetSplitEnabledContainer(), OperationType.Read, - new ResourceThrottleRetryPolicy(1)); + new ResourceThrottleRetryPolicy(1, this.mockedEndpointManager)); IDocumentClientRetryPolicy retryPolicy3 = new BulkExecutionRetryPolicy( GetSplitEnabledContainer(), OperationType.Create, - new ResourceThrottleRetryPolicy(1)); + new ResourceThrottleRetryPolicy(1, this.mockedEndpointManager)); ItemBatchOperation operation1 = this.CreateItemBatchOperation(); ItemBatchOperation operation2 = this.CreateItemBatchOperation(); diff --git a/Microsoft.Azure.Cosmos/tests/Microsoft.Azure.Cosmos.Tests/Batch/BatchAsyncOperationContextTests.cs b/Microsoft.Azure.Cosmos/tests/Microsoft.Azure.Cosmos.Tests/Batch/BatchAsyncOperationContextTests.cs index 8e3c6bcbc3..f73e63368b 100644 --- a/Microsoft.Azure.Cosmos/tests/Microsoft.Azure.Cosmos.Tests/Batch/BatchAsyncOperationContextTests.cs +++ b/Microsoft.Azure.Cosmos/tests/Microsoft.Azure.Cosmos.Tests/Batch/BatchAsyncOperationContextTests.cs @@ -18,13 +18,31 @@ namespace Microsoft.Azure.Cosmos.Tests [TestClass] public class BatchAsyncOperationContextTests { + private GlobalEndpointManager mockedEndpointManager; + + [TestInitialize] + public void Initialize() + { + Mock mockedClient = new(); + + this.mockedEndpointManager = new( + mockedClient.Object, + new ConnectionPolicy()); + } + + [TestCleanup] + public void Cleanup() + { + this.mockedEndpointManager.Dispose(); + } + [TestMethod] public async Task TraceIsJoinedOnCompletionWithRetry() { IDocumentClientRetryPolicy retryPolicy = new BulkExecutionRetryPolicy( Mock.Of(), OperationType.Read, - new ResourceThrottleRetryPolicy(1)); + new ResourceThrottleRetryPolicy(1, this.mockedEndpointManager)); Trace rootTrace = Trace.GetRootTrace(name: "RootTrace"); @@ -65,7 +83,7 @@ public async Task TraceIsJoinedOnCompletionWithoutRetry() IDocumentClientRetryPolicy retryPolicy = new BulkExecutionRetryPolicy( Mock.Of(), OperationType.Read, - new ResourceThrottleRetryPolicy(1)); + new ResourceThrottleRetryPolicy(1, this.mockedEndpointManager)); Trace rootTrace = Trace.GetRootTrace(name: "RootTrace"); @@ -179,7 +197,7 @@ public async Task ShouldRetry_WithPolicy_OnSuccess() IDocumentClientRetryPolicy retryPolicy = new BulkExecutionRetryPolicy( Mock.Of(), OperationType.Read, - new ResourceThrottleRetryPolicy(1)); + new ResourceThrottleRetryPolicy(1, this.mockedEndpointManager)); TransactionalBatchOperationResult result = new TransactionalBatchOperationResult(HttpStatusCode.OK); ItemBatchOperation operation = new ItemBatchOperation(OperationType.Create, 0, Cosmos.PartitionKey.Null); operation.AttachContext(new ItemBatchOperationContext(string.Empty, NoOpTrace.Singleton, retryPolicy)); @@ -193,7 +211,7 @@ public async Task ShouldRetry_WithPolicy_On429() IDocumentClientRetryPolicy retryPolicy = new BulkExecutionRetryPolicy( Mock.Of(), OperationType.Read, - new ResourceThrottleRetryPolicy(1)); + new ResourceThrottleRetryPolicy(1, this.mockedEndpointManager)); TransactionalBatchOperationResult result = new TransactionalBatchOperationResult((HttpStatusCode)StatusCodes.TooManyRequests); ItemBatchOperation operation = new ItemBatchOperation(OperationType.Create, 0, Cosmos.PartitionKey.Null); operation.AttachContext(new ItemBatchOperationContext(string.Empty, NoOpTrace.Singleton, retryPolicy)); @@ -207,7 +225,7 @@ public async Task ShouldRetry_WithPolicy_On413_3402() IDocumentClientRetryPolicy retryPolicy = new BulkExecutionRetryPolicy( Mock.Of(), OperationType.Read, - new ResourceThrottleRetryPolicy(1)); + new ResourceThrottleRetryPolicy(1, this.mockedEndpointManager)); TransactionalBatchOperationResult result = new TransactionalBatchOperationResult(HttpStatusCode.RequestEntityTooLarge) { SubStatusCode = (SubStatusCodes)3402 }; ItemBatchOperation operation = new ItemBatchOperation(OperationType.Create, 0, Cosmos.PartitionKey.Null); operation.AttachContext(new ItemBatchOperationContext(string.Empty, NoOpTrace.Singleton, retryPolicy)); @@ -221,7 +239,7 @@ public async Task ShouldRetry_WithPolicy_On413_0() IDocumentClientRetryPolicy retryPolicy = new BulkExecutionRetryPolicy( Mock.Of(), OperationType.Create, - new ResourceThrottleRetryPolicy(1)); + new ResourceThrottleRetryPolicy(1, this.mockedEndpointManager)); TransactionalBatchOperationResult result = new TransactionalBatchOperationResult(HttpStatusCode.RequestEntityTooLarge); ItemBatchOperation operation = new ItemBatchOperation(OperationType.Create, 0, Cosmos.PartitionKey.Null); operation.AttachContext(new ItemBatchOperationContext(string.Empty, NoOpTrace.Singleton, retryPolicy)); @@ -235,7 +253,7 @@ public async Task ShouldRetry_WithPolicy_OnSplit() IDocumentClientRetryPolicy retryPolicy = new BulkExecutionRetryPolicy( GetSplitEnabledContainer(), OperationType.Read, - new ResourceThrottleRetryPolicy(1)); + new ResourceThrottleRetryPolicy(1, this.mockedEndpointManager)); TransactionalBatchOperationResult result = new TransactionalBatchOperationResult(HttpStatusCode.Gone) { SubStatusCode = SubStatusCodes.PartitionKeyRangeGone }; ItemBatchOperation operation = new ItemBatchOperation(OperationType.Create, 0, Cosmos.PartitionKey.Null); operation.AttachContext(new ItemBatchOperationContext(string.Empty, NoOpTrace.Singleton, retryPolicy)); @@ -249,7 +267,7 @@ public async Task ShouldRetry_WithPolicy_OnCompletingSplit() IDocumentClientRetryPolicy retryPolicy = new BulkExecutionRetryPolicy( GetSplitEnabledContainer(), OperationType.Read, - new ResourceThrottleRetryPolicy(1)); + new ResourceThrottleRetryPolicy(1, this.mockedEndpointManager)); TransactionalBatchOperationResult result = new TransactionalBatchOperationResult(HttpStatusCode.Gone) { SubStatusCode = SubStatusCodes.CompletingSplit }; ItemBatchOperation operation = new ItemBatchOperation(OperationType.Create, 0, Cosmos.PartitionKey.Null); operation.AttachContext(new ItemBatchOperationContext(string.Empty, NoOpTrace.Singleton, retryPolicy)); @@ -263,7 +281,7 @@ public async Task ShouldRetry_WithPolicy_OnCompletingPartitionMigration() IDocumentClientRetryPolicy retryPolicy = new BulkExecutionRetryPolicy( GetSplitEnabledContainer(), OperationType.Read, - new ResourceThrottleRetryPolicy(1)); + new ResourceThrottleRetryPolicy(1, this.mockedEndpointManager)); TransactionalBatchOperationResult result = new TransactionalBatchOperationResult(HttpStatusCode.Gone) { SubStatusCode = SubStatusCodes.CompletingPartitionMigration }; ItemBatchOperation operation = new ItemBatchOperation(OperationType.Create, 0, Cosmos.PartitionKey.Null); operation.AttachContext(new ItemBatchOperationContext(string.Empty, NoOpTrace.Singleton, retryPolicy)); diff --git a/Microsoft.Azure.Cosmos/tests/Microsoft.Azure.Cosmos.Tests/BulkPartitionKeyRangeGoneRetryPolicyTests.cs b/Microsoft.Azure.Cosmos/tests/Microsoft.Azure.Cosmos.Tests/BulkPartitionKeyRangeGoneRetryPolicyTests.cs index 1f87d40635..487b272dfc 100644 --- a/Microsoft.Azure.Cosmos/tests/Microsoft.Azure.Cosmos.Tests/BulkPartitionKeyRangeGoneRetryPolicyTests.cs +++ b/Microsoft.Azure.Cosmos/tests/Microsoft.Azure.Cosmos.Tests/BulkPartitionKeyRangeGoneRetryPolicyTests.cs @@ -18,13 +18,31 @@ namespace Microsoft.Azure.Cosmos.Tests [TestClass] public class BulkPartitionKeyRangeGoneRetryPolicyTests { + private GlobalEndpointManager mockedEndpointManager; + + [TestInitialize] + public void Initialize() + { + Mock mockedClient = new (); + + this.mockedEndpointManager = new ( + mockedClient.Object, + new ConnectionPolicy()); + } + + [TestCleanup] + public void Cleanup() + { + this.mockedEndpointManager.Dispose(); + } + [TestMethod] public async Task NotRetryOnSuccess() { IDocumentClientRetryPolicy retryPolicy = new BulkExecutionRetryPolicy( Mock.Of(), OperationType.Read, - new ResourceThrottleRetryPolicy(1)); + new ResourceThrottleRetryPolicy(1, this.mockedEndpointManager)); TransactionalBatchOperationResult result = new TransactionalBatchOperationResult(HttpStatusCode.OK); ShouldRetryResult shouldRetryResult = await retryPolicy.ShouldRetryAsync(result.ToResponseMessage(), default); @@ -37,7 +55,7 @@ public async Task RetriesOn429() IDocumentClientRetryPolicy retryPolicy = new BulkExecutionRetryPolicy( Mock.Of(), OperationType.Read, - new ResourceThrottleRetryPolicy(1)); + new ResourceThrottleRetryPolicy(1, this.mockedEndpointManager)); TransactionalBatchOperationResult result = new TransactionalBatchOperationResult((HttpStatusCode)StatusCodes.TooManyRequests); ShouldRetryResult shouldRetryResult = await retryPolicy.ShouldRetryAsync(result.ToResponseMessage(), default); @@ -50,7 +68,7 @@ public async Task RetriesOn413_3204() IDocumentClientRetryPolicy retryPolicy = new BulkExecutionRetryPolicy( Mock.Of(), OperationType.Read, - new ResourceThrottleRetryPolicy(1)); + new ResourceThrottleRetryPolicy(1, this.mockedEndpointManager)); TransactionalBatchOperationResult result = new TransactionalBatchOperationResult(HttpStatusCode.RequestEntityTooLarge) { SubStatusCode = (SubStatusCodes)3402 }; ShouldRetryResult shouldRetryResult = await retryPolicy.ShouldRetryAsync(result.ToResponseMessage(), default); @@ -63,7 +81,7 @@ public async Task RetriesOn413_0() IDocumentClientRetryPolicy retryPolicy = new BulkExecutionRetryPolicy( Mock.Of(), OperationType.Create, - new ResourceThrottleRetryPolicy(1)); + new ResourceThrottleRetryPolicy(1, this.mockedEndpointManager)); TransactionalBatchOperationResult result = new TransactionalBatchOperationResult(HttpStatusCode.RequestEntityTooLarge); ShouldRetryResult shouldRetryResult = await retryPolicy.ShouldRetryAsync(result.ToResponseMessage(), default); @@ -76,7 +94,7 @@ public async Task RetriesOnSplits() IDocumentClientRetryPolicy retryPolicy = new BulkExecutionRetryPolicy( GetSplitEnabledContainer(), OperationType.Read, - new ResourceThrottleRetryPolicy(1)); + new ResourceThrottleRetryPolicy(1, this.mockedEndpointManager)); TransactionalBatchOperationResult result = new TransactionalBatchOperationResult(HttpStatusCode.Gone) { SubStatusCode = SubStatusCodes.PartitionKeyRangeGone }; ShouldRetryResult shouldRetryResult = await retryPolicy.ShouldRetryAsync(result.ToResponseMessage(), default); @@ -89,7 +107,7 @@ public async Task RetriesOnSplits_UpToMax() IDocumentClientRetryPolicy retryPolicy = new BulkExecutionRetryPolicy( GetSplitEnabledContainer(), OperationType.Read, - new ResourceThrottleRetryPolicy(1)); + new ResourceThrottleRetryPolicy(1, this.mockedEndpointManager)); TransactionalBatchOperationResult result = new TransactionalBatchOperationResult(HttpStatusCode.Gone) { SubStatusCode = SubStatusCodes.PartitionKeyRangeGone }; ShouldRetryResult shouldRetryResult; @@ -109,7 +127,7 @@ public async Task RetriesOnCompletingSplits() IDocumentClientRetryPolicy retryPolicy = new BulkExecutionRetryPolicy( GetSplitEnabledContainer(), OperationType.Read, - new ResourceThrottleRetryPolicy(1)); + new ResourceThrottleRetryPolicy(1, this.mockedEndpointManager)); TransactionalBatchOperationResult result = new TransactionalBatchOperationResult(HttpStatusCode.Gone) { SubStatusCode = SubStatusCodes.CompletingSplit }; ShouldRetryResult shouldRetryResult = await retryPolicy.ShouldRetryAsync(result.ToResponseMessage(), default); @@ -122,7 +140,7 @@ public async Task RetriesOnCompletingPartitionMigrationSplits() IDocumentClientRetryPolicy retryPolicy = new BulkExecutionRetryPolicy( GetSplitEnabledContainer(), OperationType.Read, - new ResourceThrottleRetryPolicy(1)); + new ResourceThrottleRetryPolicy(1, this.mockedEndpointManager)); TransactionalBatchOperationResult result = new TransactionalBatchOperationResult(HttpStatusCode.Gone) { SubStatusCode = SubStatusCodes.CompletingPartitionMigration }; ShouldRetryResult shouldRetryResult = await retryPolicy.ShouldRetryAsync(result.ToResponseMessage(), default); diff --git a/Microsoft.Azure.Cosmos/tests/Microsoft.Azure.Cosmos.Tests/ResourceThrottleRetryPolicyTests.cs b/Microsoft.Azure.Cosmos/tests/Microsoft.Azure.Cosmos.Tests/ResourceThrottleRetryPolicyTests.cs index 274bbcb8e9..8b10f3332a 100644 --- a/Microsoft.Azure.Cosmos/tests/Microsoft.Azure.Cosmos.Tests/ResourceThrottleRetryPolicyTests.cs +++ b/Microsoft.Azure.Cosmos/tests/Microsoft.Azure.Cosmos.Tests/ResourceThrottleRetryPolicyTests.cs @@ -6,14 +6,24 @@ namespace Microsoft.Azure.Cosmos.Tests { using System; using System.Collections.Generic; + using System.Collections.ObjectModel; + using System.Collections.Specialized; using System.Diagnostics; + using System.Net; + using System.Threading; using System.Threading.Tasks; + using global::Azure.Core; + using Microsoft.Azure.Cosmos.Client.Tests; using Microsoft.Azure.Cosmos.Core.Trace; + using Microsoft.Azure.Cosmos.Routing; + using Microsoft.Azure.Documents; using Microsoft.VisualStudio.TestTools.UnitTesting; + using Moq; [TestClass] public class ResourceThrottleRetryPolicyTests { + private static readonly Uri DefaultEndpoint = new ("https://default.documents.azure.com"); private readonly List existingListener = new List(); private SourceSwitch existingSourceSwitch; @@ -44,8 +54,11 @@ public void ResetTraceConfiguration() [TestMethod] public async Task DoesNotSerializeExceptionOnTracingDisabled() { + Mock mockedClient = new(); + GlobalEndpointManager endpointManager = new(mockedClient.Object, new ConnectionPolicy()); + // No listeners - ResourceThrottleRetryPolicy policy = new ResourceThrottleRetryPolicy(0); + ResourceThrottleRetryPolicy policy = new ResourceThrottleRetryPolicy(0, endpointManager); CustomException exception = new CustomException(); await policy.ShouldRetryAsync(exception, default); Assert.AreEqual(0, exception.ToStringCount, "Exception was serialized"); @@ -54,15 +67,178 @@ public async Task DoesNotSerializeExceptionOnTracingDisabled() [TestMethod] public async Task DoesSerializeExceptionOnTracingEnabled() { + Mock mockedClient = new(); + GlobalEndpointManager endpointManager = new(mockedClient.Object, new ConnectionPolicy()); + // Let the default trace listener DefaultTrace.TraceSource.Switch = new SourceSwitch("ClientSwitch", "Error"); DefaultTrace.TraceSource.Listeners.Add(new DefaultTraceListener()); - ResourceThrottleRetryPolicy policy = new ResourceThrottleRetryPolicy(0); + ResourceThrottleRetryPolicy policy = new ResourceThrottleRetryPolicy(0, endpointManager); CustomException exception = new CustomException(); await policy.ShouldRetryAsync(exception, default); Assert.AreEqual(1, exception.ToStringCount, "Exception was not serialized"); } + [TestMethod] + [DataRow(true, DisplayName = "Validate retry policy with multi master write account.")] + [DataRow(false, DisplayName = "Validate retry policy with single master write account.")] + public async Task ShouldRetryAsync_WhenResourceNotAvailableThrown_ShouldThrow503OnMultiMasterWrite(bool isMultiMasterAccount) + { + Documents.Collections.INameValueCollection requestHeaders = new Documents.Collections.DictionaryNameValueCollection(); + + GlobalEndpointManager endpointManager = await this.InitializeEndpointManager( + useMultipleWriteLocations: isMultiMasterAccount, + enableEndpointDiscovery: true, + isPreferredLocationsListEmpty: false, + enforceSingleMasterSingleWriteLocation: !isMultiMasterAccount); + + ResourceThrottleRetryPolicy policy = new (0, endpointManager); + + DocumentServiceRequest request = new( + OperationType.Create, + ResourceType.Document, + "dbs/db/colls/coll1/docs/doc1", + null, + AuthorizationTokenType.PrimaryMasterKey, + requestHeaders); + + policy.OnBeforeSendRequest(request); + + DocumentClientException dce = new ( + "429 with 3092 occurred.", + HttpStatusCode.TooManyRequests, + SubStatusCodes.AadTokenExpired); + + ShouldRetryResult shouldRetryResult = await policy.ShouldRetryAsync(dce, default); + + if (isMultiMasterAccount) + { + Assert.IsFalse(shouldRetryResult.ShouldRetry); + Assert.IsNotNull(shouldRetryResult.ExceptionToThrow); + Assert.AreEqual(typeof(ServiceUnavailableException), shouldRetryResult.ExceptionToThrow.GetType()); + } + else + { + Assert.IsFalse(shouldRetryResult.ShouldRetry); + Assert.IsNull(shouldRetryResult.ExceptionToThrow); + } + } + + private async Task InitializeEndpointManager( + bool useMultipleWriteLocations, + bool enableEndpointDiscovery, + bool isPreferredLocationsListEmpty, + bool enforceSingleMasterSingleWriteLocation = false, // Some tests depend on the Initialize to create an account with multiple write locations, even when not multi master + ReadOnlyCollection preferedRegionListOverride = null, + bool isExcludeRegionsTest = false) + { + ReadOnlyCollection preferredLocations; + AccountProperties databaseAccount = ResourceThrottleRetryPolicyTests.CreateDatabaseAccount( + useMultipleWriteLocations, + enforceSingleMasterSingleWriteLocation, + isExcludeRegionsTest); + + if (isPreferredLocationsListEmpty) + { + preferredLocations = new List().AsReadOnly(); + } + else + { + // Allow for override at the test method level if needed + preferredLocations = preferedRegionListOverride ?? new List() + { + "location1", + "location2", + "location3" + }.AsReadOnly(); + } + + Mock mockedClient = new Mock(); + mockedClient.Setup(owner => owner.ServiceEndpoint).Returns(ResourceThrottleRetryPolicyTests.DefaultEndpoint); + mockedClient.Setup(owner => owner.GetDatabaseAccountInternalAsync(It.IsAny(), It.IsAny())).ReturnsAsync(databaseAccount); + + ConnectionPolicy connectionPolicy = new ConnectionPolicy() + { + EnableEndpointDiscovery = enableEndpointDiscovery, + UseMultipleWriteLocations = useMultipleWriteLocations, + }; + + foreach (string preferredLocation in preferredLocations) + { + connectionPolicy.PreferredLocations.Add(preferredLocation); + } + + GlobalEndpointManager endpointManager = new GlobalEndpointManager(mockedClient.Object, connectionPolicy); + await endpointManager.RefreshLocationAsync(false); + + return endpointManager; + } + + private static AccountProperties CreateDatabaseAccount( + bool useMultipleWriteLocations, + bool enforceSingleMasterSingleWriteLocation, + bool isExcludeRegionsTest = false) + { + Uri Location1Endpoint = new ("https://location1.documents.azure.com"); + Uri Location2Endpoint = new ("https://location2.documents.azure.com"); + Uri Location3Endpoint = new ("https://location3.documents.azure.com"); + Uri Location4Endpoint = new ("https://location4.documents.azure.com"); + + Collection writeLocations = isExcludeRegionsTest ? + + new Collection() + { + { new AccountRegion() { Name = "default", Endpoint = ResourceThrottleRetryPolicyTests.DefaultEndpoint.ToString() } }, + { new AccountRegion() { Name = "location1", Endpoint = Location1Endpoint.ToString() } }, + { new AccountRegion() { Name = "location2", Endpoint = Location2Endpoint.ToString() } }, + { new AccountRegion() { Name = "location3", Endpoint = Location3Endpoint.ToString() } }, + } : + new Collection() + { + { new AccountRegion() { Name = "location1", Endpoint = Location1Endpoint.ToString() } }, + { new AccountRegion() { Name = "location2", Endpoint = Location2Endpoint.ToString() } }, + { new AccountRegion() { Name = "location3", Endpoint = Location3Endpoint.ToString() } }, + }; + + if (!useMultipleWriteLocations + && enforceSingleMasterSingleWriteLocation) + { + // Some pre-existing tests depend on the account having multiple write locations even on single master setup + // Newer tests can correctly define a single master account (single write region) without breaking existing tests + writeLocations = isExcludeRegionsTest ? + new Collection() + { + { new AccountRegion() { Name = "default", Endpoint = ResourceThrottleRetryPolicyTests.DefaultEndpoint.ToString() } } + } : + new Collection() + { + { new AccountRegion() { Name = "location1", Endpoint = Location1Endpoint.ToString() } } + }; + } + + AccountProperties databaseAccount = new () + { + EnableMultipleWriteLocations = useMultipleWriteLocations, + ReadLocationsInternal = isExcludeRegionsTest ? + new Collection() + { + { new AccountRegion() { Name = "default", Endpoint = ResourceThrottleRetryPolicyTests.DefaultEndpoint.ToString() } }, + { new AccountRegion() { Name = "location1", Endpoint = Location1Endpoint.ToString() } }, + { new AccountRegion() { Name = "location2", Endpoint = Location2Endpoint.ToString() } }, + { new AccountRegion() { Name = "location4", Endpoint = Location4Endpoint.ToString() } }, + } : + new Collection() + { + { new AccountRegion() { Name = "location1", Endpoint = Location1Endpoint.ToString() } }, + { new AccountRegion() { Name = "location2", Endpoint = Location2Endpoint.ToString() } }, + { new AccountRegion() { Name = "location4", Endpoint = Location4Endpoint.ToString() } }, + }, + WriteLocationsInternal = writeLocations + }; + + return databaseAccount; + } + private class CustomException : Exception { public int ToStringCount { get; private set; } = 0; From e44f080e7856e3b422b4cb6b5423d8fa6726b926 Mon Sep 17 00:00:00 2001 From: Debdatta Kunda Date: Wed, 11 Sep 2024 16:38:15 -0700 Subject: [PATCH 02/14] Updated client retry policy. Added more tests to cover 429/3092. --- .../src/ClientRetryPolicy.cs | 39 +++++++++++++- .../ClientRetryPolicyTests.cs | 53 +++++++++++++++++++ 2 files changed, 90 insertions(+), 2 deletions(-) diff --git a/Microsoft.Azure.Cosmos/src/ClientRetryPolicy.cs b/Microsoft.Azure.Cosmos/src/ClientRetryPolicy.cs index d0b74e2fc7..d3cf115219 100644 --- a/Microsoft.Azure.Cosmos/src/ClientRetryPolicy.cs +++ b/Microsoft.Azure.Cosmos/src/ClientRetryPolicy.cs @@ -121,7 +121,24 @@ public async Task ShouldRetryAsync( } } - return await this.throttlingRetry.ShouldRetryAsync(exception, cancellationToken); + ShouldRetryResult throttleRetryResult = await this.throttlingRetry.ShouldRetryAsync(exception, cancellationToken); + + // Received 503 due to client connect timeout or Gateway + if (throttleRetryResult.ExceptionToThrow is ServiceUnavailableException) + { + DefaultTrace.TraceWarning("ClientRetryPolicy: ServiceUnavailable. Refresh cache and retry. Failed Location: {0}; ResourceAddress: {1}", + this.documentServiceRequest?.RequestContext?.LocationEndpointToRoute?.ToString() ?? string.Empty, + this.documentServiceRequest?.ResourceAddress ?? string.Empty); + + // Mark the partition as unavailable. + // Let the ClientRetry logic decide if the request should be retried + this.partitionKeyRangeLocationCache.TryMarkEndpointUnavailableForPartitionKeyRange( + this.documentServiceRequest); + + return this.ShouldRetryOnServiceUnavailable(); + } + + return throttleRetryResult; } /// @@ -144,7 +161,24 @@ public async Task ShouldRetryAsync( return shouldRetryResult; } - return await this.throttlingRetry.ShouldRetryAsync(cosmosResponseMessage, cancellationToken); + ShouldRetryResult throttleRetryResult = await this.throttlingRetry.ShouldRetryAsync(cosmosResponseMessage, cancellationToken); + + // Received 503 due to client connect timeout or Gateway + if (throttleRetryResult.ExceptionToThrow is ServiceUnavailableException) + { + DefaultTrace.TraceWarning("ClientRetryPolicy: ServiceUnavailable. Refresh cache and retry. Failed Location: {0}; ResourceAddress: {1}", + this.documentServiceRequest?.RequestContext?.LocationEndpointToRoute?.ToString() ?? string.Empty, + this.documentServiceRequest?.ResourceAddress ?? string.Empty); + + // Mark the partition as unavailable. + // Let the ClientRetry logic decide if the request should be retried + this.partitionKeyRangeLocationCache.TryMarkEndpointUnavailableForPartitionKeyRange( + this.documentServiceRequest); + + return this.ShouldRetryOnServiceUnavailable(); + } + + return throttleRetryResult; } /// @@ -178,6 +212,7 @@ public void OnBeforeSendRequest(DocumentServiceRequest request) // This enables marking the endpoint unavailability on endpoint failover/unreachability this.locationEndpoint = this.globalEndpointManager.ResolveServiceEndpoint(request); request.RequestContext.RouteToLocation(this.locationEndpoint); + this.throttlingRetry.OnBeforeSendRequest(request); } private async Task ShouldRetryInternalAsync( diff --git a/Microsoft.Azure.Cosmos/tests/Microsoft.Azure.Cosmos.Tests/ClientRetryPolicyTests.cs b/Microsoft.Azure.Cosmos/tests/Microsoft.Azure.Cosmos.Tests/ClientRetryPolicyTests.cs index d4c56bae2e..a26342ebf2 100644 --- a/Microsoft.Azure.Cosmos/tests/Microsoft.Azure.Cosmos.Tests/ClientRetryPolicyTests.cs +++ b/Microsoft.Azure.Cosmos/tests/Microsoft.Azure.Cosmos.Tests/ClientRetryPolicyTests.cs @@ -86,6 +86,59 @@ public void MultimasterMetadataWriteRetryTest() retryPolicy.OnBeforeSendRequest(request); Assert.AreEqual(request.RequestContext.LocationEndpointToRoute, ClientRetryPolicyTests.Location1Endpoint); } + + /// + /// Tests behavior of Multimaster Accounts on metadata writes where the default location is not the hub region + /// + [TestMethod] + public async Task MultimasterWith4293029WriteRetryTest() + { + const bool enableEndpointDiscovery = true; + + //Creates GlobalEndpointManager where enableEndpointDiscovery is False and + //Default location is false + using GlobalEndpointManager endpointManager = this.Initialize( + useMultipleWriteLocations: true, + enableEndpointDiscovery: enableEndpointDiscovery, + isPreferredLocationsListEmpty: false, + multimasterMetadataWriteRetryTest: true); + + await endpointManager.RefreshLocationAsync(); + + ClientRetryPolicy retryPolicy = new ClientRetryPolicy(endpointManager, this.partitionKeyRangeLocationCache, new RetryOptions(), enableEndpointDiscovery, false); + + //Creates a metadata write request + DocumentServiceRequest request = this.CreateRequest(false, false); + + //On first attempt should get incorrect (default/non hub) location + retryPolicy.OnBeforeSendRequest(request); + Assert.AreEqual(request.RequestContext.LocationEndpointToRoute, ClientRetryPolicyTests.Location1Endpoint); + + //Creation of 403.3 Error + HttpStatusCode throttleException = HttpStatusCode.TooManyRequests; + SubStatusCodes resourceNotAvailable = SubStatusCodes.AadTokenExpired; + Exception forbiddenWriteFail = new Exception(); + Mock nameValueCollection = new Mock(); + + DocumentClientException documentClientException = new DocumentClientException( + message: "Multimaster Metadata Write Fail", + innerException: forbiddenWriteFail, + statusCode: throttleException, + substatusCode: resourceNotAvailable, + requestUri: request.RequestContext.LocationEndpointToRoute, + responseHeaders: nameValueCollection.Object); + + CancellationToken cancellationToken = new CancellationToken(); + + //Tests behavior of should retry + Task shouldRetry = retryPolicy.ShouldRetryAsync(documentClientException, cancellationToken); + + Assert.IsTrue(shouldRetry.Result.ShouldRetry); + + //Now since the retry context is not null, should route to the hub region + retryPolicy.OnBeforeSendRequest(request); + Assert.AreEqual(request.RequestContext.LocationEndpointToRoute, ClientRetryPolicyTests.Location2Endpoint); + } /// /// Tests to see if different 503 substatus codes are handeled correctly From 9006b5fb192d761fdf6efd4b038bc101d172c0b2 Mon Sep 17 00:00:00 2001 From: Debdatta Kunda Date: Thu, 12 Sep 2024 14:27:08 -0700 Subject: [PATCH 03/14] Code changes to update direct package version. Updating the tests. --- Directory.Build.props | 2 +- .../src/ResourceThrottleRetryPolicy.cs | 8 +- .../ClientRetryPolicyTests.cs | 81 ++++++++++++------- .../ResourceThrottleRetryPolicyTests.cs | 7 +- 4 files changed, 61 insertions(+), 37 deletions(-) diff --git a/Directory.Build.props b/Directory.Build.props index 6e3714f8a0..04fc0bc918 100644 --- a/Directory.Build.props +++ b/Directory.Build.props @@ -3,7 +3,7 @@ 3.43.0 3.44.0 preview.0 - 3.35.0 + 3.36.0 2.0.4 2.1.0 preview4 diff --git a/Microsoft.Azure.Cosmos/src/ResourceThrottleRetryPolicy.cs b/Microsoft.Azure.Cosmos/src/ResourceThrottleRetryPolicy.cs index c74ee89e82..0084e86769 100644 --- a/Microsoft.Azure.Cosmos/src/ResourceThrottleRetryPolicy.cs +++ b/Microsoft.Azure.Cosmos/src/ResourceThrottleRetryPolicy.cs @@ -109,14 +109,14 @@ private Task ShouldRetryInternalAsync( if (this.isMultiMasterWriteRegion.HasValue && this.isMultiMasterWriteRegion.Value && subStatusCode != null - && subStatusCode == SubStatusCodes.AadTokenExpired) + && subStatusCode == SubStatusCodes.SystemResourceUnavailable) { DefaultTrace.TraceError( - "Operation will NOT be retried. Converting 429/3092 to 503. Current attempt {0} sub status code: {1}.", - this.currentAttemptCount, SubStatusCodes.AadTokenExpired); + "Operation will NOT be retried. Converting SystemResourceUnavailable (429/3092) to ServiceUnavailable (503). Current attempt {0} sub status code: {1}.", + this.currentAttemptCount, SubStatusCodes.SystemResourceUnavailable); ServiceUnavailableException exceptionToThrow = ServiceUnavailableException.Create( - SubStatusCodes.AadTokenExpired, + SubStatusCodes.SystemResourceUnavailable, innerException: exception); return Task.FromResult( diff --git a/Microsoft.Azure.Cosmos/tests/Microsoft.Azure.Cosmos.Tests/ClientRetryPolicyTests.cs b/Microsoft.Azure.Cosmos/tests/Microsoft.Azure.Cosmos.Tests/ClientRetryPolicyTests.cs index a26342ebf2..caa56b2911 100644 --- a/Microsoft.Azure.Cosmos/tests/Microsoft.Azure.Cosmos.Tests/ClientRetryPolicyTests.cs +++ b/Microsoft.Azure.Cosmos/tests/Microsoft.Azure.Cosmos.Tests/ClientRetryPolicyTests.cs @@ -88,56 +88,79 @@ public void MultimasterMetadataWriteRetryTest() } /// - /// Tests behavior of Multimaster Accounts on metadata writes where the default location is not the hub region + /// Test to validate that when 429.3092 is thrown from the service, write requests on + /// a multi master account should be converted to 503 and retried to the next region. /// - [TestMethod] - public async Task MultimasterWith4293029WriteRetryTest() - { + [TestMethod] + [DataRow(true, DisplayName = "Validate retry policy with multi master write account.")] + [DataRow(false, DisplayName = "Validate retry policy with single master write account.")] + public async Task ShouldRetryAsync_WhenRequestThrottledWithResourceNotAvailable_ShouldThrow503OnMultiMasterWriteAndRetryOnNextRegion( + bool isMultiMasterAccount) + { + // Arrange. const bool enableEndpointDiscovery = true; - - //Creates GlobalEndpointManager where enableEndpointDiscovery is False and - //Default location is false using GlobalEndpointManager endpointManager = this.Initialize( - useMultipleWriteLocations: true, + useMultipleWriteLocations: isMultiMasterAccount, enableEndpointDiscovery: enableEndpointDiscovery, isPreferredLocationsListEmpty: false, multimasterMetadataWriteRetryTest: true); await endpointManager.RefreshLocationAsync(); - ClientRetryPolicy retryPolicy = new ClientRetryPolicy(endpointManager, this.partitionKeyRangeLocationCache, new RetryOptions(), enableEndpointDiscovery, false); + ClientRetryPolicy retryPolicy = new ( + endpointManager, + this.partitionKeyRangeLocationCache, + new RetryOptions(), + enableEndpointDiscovery, + false); - //Creates a metadata write request - DocumentServiceRequest request = this.CreateRequest(false, false); + // Creates a sample write request. + DocumentServiceRequest request = this.CreateRequest( + isReadRequest: false, + isMasterResourceType: false); - //On first attempt should get incorrect (default/non hub) location + // On first attempt should get (default/non hub) location. retryPolicy.OnBeforeSendRequest(request); Assert.AreEqual(request.RequestContext.LocationEndpointToRoute, ClientRetryPolicyTests.Location1Endpoint); - //Creation of 403.3 Error + // Creation of 429.3092 Error. HttpStatusCode throttleException = HttpStatusCode.TooManyRequests; - SubStatusCodes resourceNotAvailable = SubStatusCodes.AadTokenExpired; - Exception forbiddenWriteFail = new Exception(); - Mock nameValueCollection = new Mock(); + SubStatusCodes resourceNotAvailable = SubStatusCodes.SystemResourceUnavailable; - DocumentClientException documentClientException = new DocumentClientException( - message: "Multimaster Metadata Write Fail", - innerException: forbiddenWriteFail, + Exception innerException = new (); + Mock nameValueCollection = new (); + DocumentClientException documentClientException = new ( + message: "SystemResourceUnavailable: 429 with 3092 occurred.", + innerException: innerException, statusCode: throttleException, substatusCode: resourceNotAvailable, requestUri: request.RequestContext.LocationEndpointToRoute, responseHeaders: nameValueCollection.Object); - CancellationToken cancellationToken = new CancellationToken(); - - //Tests behavior of should retry - Task shouldRetry = retryPolicy.ShouldRetryAsync(documentClientException, cancellationToken); - - Assert.IsTrue(shouldRetry.Result.ShouldRetry); - - //Now since the retry context is not null, should route to the hub region - retryPolicy.OnBeforeSendRequest(request); - Assert.AreEqual(request.RequestContext.LocationEndpointToRoute, ClientRetryPolicyTests.Location2Endpoint); + // Act. + Task shouldRetry = retryPolicy.ShouldRetryAsync( + documentClientException, + new CancellationToken()); + + // Assert. + Assert.IsTrue(shouldRetry.Result.ShouldRetry); + retryPolicy.OnBeforeSendRequest(request); + + if (isMultiMasterAccount) + { + Assert.AreEqual( + expected: ClientRetryPolicyTests.Location2Endpoint, + actual: request.RequestContext.LocationEndpointToRoute, + message: "The request should be routed to the next region, since the accound is a multi master write account and the request" + + "failed with 429.309 which got converted into 503 internally. This should trigger another retry attempt to the next region."); + } + else + { + Assert.AreEqual( + expected: ClientRetryPolicyTests.Location1Endpoint, + actual: request.RequestContext.LocationEndpointToRoute, + message: "Since this is asingle master account, the write request should not be retried on the next region."); + } } /// diff --git a/Microsoft.Azure.Cosmos/tests/Microsoft.Azure.Cosmos.Tests/ResourceThrottleRetryPolicyTests.cs b/Microsoft.Azure.Cosmos/tests/Microsoft.Azure.Cosmos.Tests/ResourceThrottleRetryPolicyTests.cs index 8b10f3332a..7e314e6e07 100644 --- a/Microsoft.Azure.Cosmos/tests/Microsoft.Azure.Cosmos.Tests/ResourceThrottleRetryPolicyTests.cs +++ b/Microsoft.Azure.Cosmos/tests/Microsoft.Azure.Cosmos.Tests/ResourceThrottleRetryPolicyTests.cs @@ -82,7 +82,8 @@ public async Task DoesSerializeExceptionOnTracingEnabled() [TestMethod] [DataRow(true, DisplayName = "Validate retry policy with multi master write account.")] [DataRow(false, DisplayName = "Validate retry policy with single master write account.")] - public async Task ShouldRetryAsync_WhenResourceNotAvailableThrown_ShouldThrow503OnMultiMasterWrite(bool isMultiMasterAccount) + public async Task ShouldRetryAsync_WhenResourceNotAvailableThrown_ShouldThrow503OnMultiMasterWrite( + bool isMultiMasterAccount) { Documents.Collections.INameValueCollection requestHeaders = new Documents.Collections.DictionaryNameValueCollection(); @@ -105,9 +106,9 @@ public async Task ShouldRetryAsync_WhenResourceNotAvailableThrown_ShouldThrow503 policy.OnBeforeSendRequest(request); DocumentClientException dce = new ( - "429 with 3092 occurred.", + "SystemResourceUnavailable: 429 with 3092 occurred.", HttpStatusCode.TooManyRequests, - SubStatusCodes.AadTokenExpired); + SubStatusCodes.SystemResourceUnavailable); ShouldRetryResult shouldRetryResult = await policy.ShouldRetryAsync(dce, default); From e7710c1a2d0f65539f67f5e63ac5e28c55a55a03 Mon Sep 17 00:00:00 2001 From: Debdatta Kunda Date: Thu, 12 Sep 2024 15:52:10 -0700 Subject: [PATCH 04/14] Code changes to refactor client retry policy. --- .../src/ClientRetryPolicy.cs | 71 ++++++++++--------- 1 file changed, 39 insertions(+), 32 deletions(-) diff --git a/Microsoft.Azure.Cosmos/src/ClientRetryPolicy.cs b/Microsoft.Azure.Cosmos/src/ClientRetryPolicy.cs index d3cf115219..da467de1c5 100644 --- a/Microsoft.Azure.Cosmos/src/ClientRetryPolicy.cs +++ b/Microsoft.Azure.Cosmos/src/ClientRetryPolicy.cs @@ -123,19 +123,13 @@ public async Task ShouldRetryAsync( ShouldRetryResult throttleRetryResult = await this.throttlingRetry.ShouldRetryAsync(exception, cancellationToken); - // Received 503 due to client connect timeout or Gateway + // Today, the only scenario where we would receive a ServiceUnavailableException from the Throttling Retry Policy + // is when we get 410 (Gone) with sub status code 3092 (System Resource Not Available). Note that this is applicable + // for write requests targeted to a multiple master account. In such case, the 410/3092 will get converted into 503. if (throttleRetryResult.ExceptionToThrow is ServiceUnavailableException) { - DefaultTrace.TraceWarning("ClientRetryPolicy: ServiceUnavailable. Refresh cache and retry. Failed Location: {0}; ResourceAddress: {1}", - this.documentServiceRequest?.RequestContext?.LocationEndpointToRoute?.ToString() ?? string.Empty, - this.documentServiceRequest?.ResourceAddress ?? string.Empty); - - // Mark the partition as unavailable. - // Let the ClientRetry logic decide if the request should be retried - this.partitionKeyRangeLocationCache.TryMarkEndpointUnavailableForPartitionKeyRange( - this.documentServiceRequest); - - return this.ShouldRetryOnServiceUnavailable(); + return this.TryMarkEndpointUnavailableForPkRangeAndRetryOnServiceUnavailable( + shouldMarkEndpointUnavailableForPkRange: true); } return throttleRetryResult; @@ -163,19 +157,13 @@ public async Task ShouldRetryAsync( ShouldRetryResult throttleRetryResult = await this.throttlingRetry.ShouldRetryAsync(cosmosResponseMessage, cancellationToken); - // Received 503 due to client connect timeout or Gateway + // Today, the only scenario where we would receive a ServiceUnavailableException from the Throttling Retry Policy + // is when we get 410 (Gone) with sub status code 3092 (System Resource Not Available). Note that this is applicable + // for write requests targeted to a multiple master account. In such case, the 410/3092 will get converted into 503. if (throttleRetryResult.ExceptionToThrow is ServiceUnavailableException) { - DefaultTrace.TraceWarning("ClientRetryPolicy: ServiceUnavailable. Refresh cache and retry. Failed Location: {0}; ResourceAddress: {1}", - this.documentServiceRequest?.RequestContext?.LocationEndpointToRoute?.ToString() ?? string.Empty, - this.documentServiceRequest?.ResourceAddress ?? string.Empty); - - // Mark the partition as unavailable. - // Let the ClientRetry logic decide if the request should be retried - this.partitionKeyRangeLocationCache.TryMarkEndpointUnavailableForPartitionKeyRange( - this.documentServiceRequest); - - return this.ShouldRetryOnServiceUnavailable(); + return this.TryMarkEndpointUnavailableForPkRangeAndRetryOnServiceUnavailable( + shouldMarkEndpointUnavailableForPkRange: true); } return throttleRetryResult; @@ -310,16 +298,8 @@ private async Task ShouldRetryInternalAsync( // Received 503 due to client connect timeout or Gateway if (statusCode == HttpStatusCode.ServiceUnavailable) { - DefaultTrace.TraceWarning("ClientRetryPolicy: ServiceUnavailable. Refresh cache and retry. Failed Location: {0}; ResourceAddress: {1}", - this.documentServiceRequest?.RequestContext?.LocationEndpointToRoute?.ToString() ?? string.Empty, - this.documentServiceRequest?.ResourceAddress ?? string.Empty); - - // Mark the partition as unavailable. - // Let the ClientRetry logic decide if the request should be retried - this.partitionKeyRangeLocationCache.TryMarkEndpointUnavailableForPartitionKeyRange( - this.documentServiceRequest); - - return this.ShouldRetryOnServiceUnavailable(); + return this.TryMarkEndpointUnavailableForPkRangeAndRetryOnServiceUnavailable( + shouldMarkEndpointUnavailableForPkRange: true); } return null; @@ -442,6 +422,33 @@ private ShouldRetryResult ShouldRetryOnSessionNotAvailable(DocumentServiceReques } } + /// + /// Attempts to mark the endpoint associated with the current partition key range as unavailable and determines if + /// a retry should be performed due to a ServiceUnavailable (503) response. This method is invoked when a 503 + /// Service Unavailable response is received, indicating that the service might be temporarily unavailable. + /// It optionally marks the partition key range as unavailable, which will influence future routing decisions. + /// + /// A boolean flag indicating whether the endpoint for the + /// current partition key range should be marked as unavailable. + /// An instance of indicating whether the operation should be retried. + private ShouldRetryResult TryMarkEndpointUnavailableForPkRangeAndRetryOnServiceUnavailable( + bool shouldMarkEndpointUnavailableForPkRange) + { + DefaultTrace.TraceWarning("ClientRetryPolicy: ServiceUnavailable. Refresh cache and retry. Failed Location: {0}; ResourceAddress: {1}", + this.documentServiceRequest?.RequestContext?.LocationEndpointToRoute?.ToString() ?? string.Empty, + this.documentServiceRequest?.ResourceAddress ?? string.Empty); + + if (shouldMarkEndpointUnavailableForPkRange) + { + // Mark the partition as unavailable. + // Let the ClientRetry logic decide if the request should be retried + this.partitionKeyRangeLocationCache.TryMarkEndpointUnavailableForPartitionKeyRange( + this.documentServiceRequest); + } + + return this.ShouldRetryOnServiceUnavailable(); + } + /// /// For a ServiceUnavailable (503.0) we could be having a timeout from Direct/TCP locally or a request to Gateway request with a similar response due to an endpoint not yet available. /// We try and retry the request only if there are other regions available. The retry logic is applicable for single master write accounts as well. From 5a1b50556b1bf3940c64860177a925df1e97cb69 Mon Sep 17 00:00:00 2001 From: Debdatta Kunda Date: Thu, 12 Sep 2024 15:54:32 -0700 Subject: [PATCH 05/14] Minor code cleanup. --- .../ResourceThrottleRetryPolicyTests.cs | 3 --- 1 file changed, 3 deletions(-) diff --git a/Microsoft.Azure.Cosmos/tests/Microsoft.Azure.Cosmos.Tests/ResourceThrottleRetryPolicyTests.cs b/Microsoft.Azure.Cosmos/tests/Microsoft.Azure.Cosmos.Tests/ResourceThrottleRetryPolicyTests.cs index 7e314e6e07..237acda4ea 100644 --- a/Microsoft.Azure.Cosmos/tests/Microsoft.Azure.Cosmos.Tests/ResourceThrottleRetryPolicyTests.cs +++ b/Microsoft.Azure.Cosmos/tests/Microsoft.Azure.Cosmos.Tests/ResourceThrottleRetryPolicyTests.cs @@ -7,13 +7,10 @@ namespace Microsoft.Azure.Cosmos.Tests using System; using System.Collections.Generic; using System.Collections.ObjectModel; - using System.Collections.Specialized; using System.Diagnostics; using System.Net; using System.Threading; using System.Threading.Tasks; - using global::Azure.Core; - using Microsoft.Azure.Cosmos.Client.Tests; using Microsoft.Azure.Cosmos.Core.Trace; using Microsoft.Azure.Cosmos.Routing; using Microsoft.Azure.Documents; From 91cc3eacb05eba71e12f90a1e3f8a02773237556 Mon Sep 17 00:00:00 2001 From: Debdatta Kunda Date: Fri, 13 Sep 2024 10:56:07 -0700 Subject: [PATCH 06/14] Reverting the direct version bump up change. --- Directory.Build.props | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Directory.Build.props b/Directory.Build.props index 04fc0bc918..6e3714f8a0 100644 --- a/Directory.Build.props +++ b/Directory.Build.props @@ -3,7 +3,7 @@ 3.43.0 3.44.0 preview.0 - 3.36.0 + 3.35.0 2.0.4 2.1.0 preview4 From 6ebb9e69de7498da3a9172103de0508889a99ee5 Mon Sep 17 00:00:00 2001 From: Debdatta Kunda Date: Fri, 13 Sep 2024 12:26:45 -0700 Subject: [PATCH 07/14] Code changes to address some of the review comments. --- .../src/ResourceThrottleRetryPolicy.cs | 2 +- .../src/Routing/GlobalEndpointManager.cs | 12 ++++++++++++ .../src/Routing/IGlobalEndpointManager.cs | 2 ++ Microsoft.Azure.Cosmos/src/Routing/LocationCache.cs | 6 +++++- 4 files changed, 20 insertions(+), 2 deletions(-) diff --git a/Microsoft.Azure.Cosmos/src/ResourceThrottleRetryPolicy.cs b/Microsoft.Azure.Cosmos/src/ResourceThrottleRetryPolicy.cs index 0084e86769..04fcd82359 100644 --- a/Microsoft.Azure.Cosmos/src/ResourceThrottleRetryPolicy.cs +++ b/Microsoft.Azure.Cosmos/src/ResourceThrottleRetryPolicy.cs @@ -164,7 +164,7 @@ private object GetExceptionMessage(Exception exception) public void OnBeforeSendRequest(DocumentServiceRequest request) { this.isMultiMasterWriteRegion = !request.IsReadOnlyRequest - && (this.globalEndpointManager?.CanUseMultipleWriteLocations(request) ?? false); + && (this.globalEndpointManager?.CanSupportMultipleWriteLocations(request) ?? false); } /// diff --git a/Microsoft.Azure.Cosmos/src/Routing/GlobalEndpointManager.cs b/Microsoft.Azure.Cosmos/src/Routing/GlobalEndpointManager.cs index 4e5184177b..6edfe5e57b 100644 --- a/Microsoft.Azure.Cosmos/src/Routing/GlobalEndpointManager.cs +++ b/Microsoft.Azure.Cosmos/src/Routing/GlobalEndpointManager.cs @@ -534,6 +534,18 @@ public virtual async Task RefreshLocationAsync(bool forceRefresh = false) await this.RefreshDatabaseAccountInternalAsync(forceRefresh: forceRefresh); } + + /// + /// Determines whether the current configuration and state of the service allow for supporting multiple write locations. + /// This method returns True is the AvailableWriteLocations in LocationCache is more than 1. Otherwise, it returns False. + /// + /// The document service request for which the write location support is being evaluated. + /// A boolean flag indicating if the available write locations are more than one. + public bool CanSupportMultipleWriteLocations(DocumentServiceRequest request) + { + return this.CanUseMultipleWriteLocations(request) + && this.locationCache.GetAvailableWriteLocations()?.Count > 1; + } #pragma warning disable VSTHRD100 // Avoid async void methods private async void StartLocationBackgroundRefreshLoop() diff --git a/Microsoft.Azure.Cosmos/src/Routing/IGlobalEndpointManager.cs b/Microsoft.Azure.Cosmos/src/Routing/IGlobalEndpointManager.cs index 427bbb8d2d..fe93027aec 100644 --- a/Microsoft.Azure.Cosmos/src/Routing/IGlobalEndpointManager.cs +++ b/Microsoft.Azure.Cosmos/src/Routing/IGlobalEndpointManager.cs @@ -36,5 +36,7 @@ internal interface IGlobalEndpointManager : IDisposable ReadOnlyDictionary GetAvailableWriteEndpointsByLocation(); ReadOnlyDictionary GetAvailableReadEndpointsByLocation(); + + bool CanSupportMultipleWriteLocations(DocumentServiceRequest request); } } \ No newline at end of file diff --git a/Microsoft.Azure.Cosmos/src/Routing/LocationCache.cs b/Microsoft.Azure.Cosmos/src/Routing/LocationCache.cs index eb14c08c0c..c7567d6c44 100644 --- a/Microsoft.Azure.Cosmos/src/Routing/LocationCache.cs +++ b/Microsoft.Azure.Cosmos/src/Routing/LocationCache.cs @@ -263,6 +263,11 @@ public ReadOnlyCollection GetAvailableReadLocations() { return this.locationInfo.AvailableReadLocations; } + + public ReadOnlyCollection GetAvailableWriteLocations() + { + return this.locationInfo.AvailableWriteLocations; + } /// /// Resolves request to service endpoint. @@ -530,7 +535,6 @@ public bool ShouldRefreshEndpoints(out bool canRefreshInBackground) public bool CanUseMultipleWriteLocations(DocumentServiceRequest request) { return this.CanUseMultipleWriteLocations() - && this.locationInfo.AvailableWriteLocations.Count > 1 && (request.ResourceType == ResourceType.Document || (request.ResourceType == ResourceType.StoredProcedure && request.OperationType == Documents.OperationType.ExecuteJavaScript)); } From e6e276635e9ae32b6fa6d19f23ac46a27c93796b Mon Sep 17 00:00:00 2001 From: Debdatta Kunda Date: Fri, 13 Sep 2024 22:00:40 -0700 Subject: [PATCH 08/14] Code changes to move failover logic in client retry policy. --- .../src/Batch/BatchAsyncContainerExecutor.cs | 8 +- .../src/ClientRetryPolicy.cs | 52 +++-- Microsoft.Azure.Cosmos/src/DocumentClient.cs | 5 +- .../src/MetadataRequestThrottleRetryPolicy.cs | 1 - .../src/ResourceThrottleRetryPolicy.cs | 38 +--- .../Batch/BatchAsyncBatcherTests.cs | 41 ++-- .../Batch/BatchAsyncOperationContextTests.cs | 36 +--- ...lkPartitionKeyRangeGoneRetryPolicyTests.cs | 34 +--- .../ResourceThrottleRetryPolicyTests.cs | 178 +----------------- 9 files changed, 70 insertions(+), 323 deletions(-) diff --git a/Microsoft.Azure.Cosmos/src/Batch/BatchAsyncContainerExecutor.cs b/Microsoft.Azure.Cosmos/src/Batch/BatchAsyncContainerExecutor.cs index 69eddb2fbc..be166ed5a9 100644 --- a/Microsoft.Azure.Cosmos/src/Batch/BatchAsyncContainerExecutor.cs +++ b/Microsoft.Azure.Cosmos/src/Batch/BatchAsyncContainerExecutor.cs @@ -92,11 +92,7 @@ public virtual async Task AddAsync( ItemBatchOperationContext context = new ItemBatchOperationContext( resolvedPartitionKeyRangeId, trace, - BatchAsyncContainerExecutor.GetRetryPolicy( - this.cosmosContainer, - this.cosmosClientContext?.DocumentClient?.GlobalEndpointManager, - operation.OperationType, - this.retryOptions)); + BatchAsyncContainerExecutor.GetRetryPolicy(this.cosmosContainer, operation.OperationType, this.retryOptions)); if (itemRequestOptions != null && itemRequestOptions.AddRequestHeaders != null) { @@ -163,7 +159,6 @@ internal virtual async Task ValidateOperationAsync( private static IDocumentClientRetryPolicy GetRetryPolicy( ContainerInternal containerInternal, - GlobalEndpointManager endpointManager, OperationType operationType, RetryOptions retryOptions) { @@ -172,7 +167,6 @@ private static IDocumentClientRetryPolicy GetRetryPolicy( operationType, new ResourceThrottleRetryPolicy( retryOptions.MaxRetryAttemptsOnThrottledRequests, - endpointManager, retryOptions.MaxRetryWaitTimeInSeconds)); } diff --git a/Microsoft.Azure.Cosmos/src/ClientRetryPolicy.cs b/Microsoft.Azure.Cosmos/src/ClientRetryPolicy.cs index da467de1c5..bb2056e25d 100644 --- a/Microsoft.Azure.Cosmos/src/ClientRetryPolicy.cs +++ b/Microsoft.Azure.Cosmos/src/ClientRetryPolicy.cs @@ -35,6 +35,7 @@ internal sealed class ClientRetryPolicy : IDocumentClientRetryPolicy private int serviceUnavailableRetryCount; private bool isReadRequest; private bool canUseMultipleWriteLocations; + private bool? isMultiMasterWriteRegion; private Uri locationEndpoint; private RetryContext retryContext; private DocumentServiceRequest documentServiceRequest; @@ -48,7 +49,6 @@ public ClientRetryPolicy( { this.throttlingRetry = new ResourceThrottleRetryPolicy( retryOptions.MaxRetryAttemptsOnThrottledRequests, - endpointManager: globalEndpointManager, retryOptions.MaxRetryWaitTimeInSeconds); this.globalEndpointManager = globalEndpointManager; @@ -58,6 +58,7 @@ public ClientRetryPolicy( this.sessionTokenRetryCount = 0; this.serviceUnavailableRetryCount = 0; this.canUseMultipleWriteLocations = false; + this.isMultiMasterWriteRegion = false; this.isPertitionLevelFailoverEnabled = isPertitionLevelFailoverEnabled; } @@ -98,6 +99,23 @@ public async Task ShouldRetryAsync( if (exception is DocumentClientException clientException) { + // Today, the only scenario where we would receive a ServiceUnavailableException from the Throttling Retry Policy + // is when we get 429 (TooManyRequests) with sub status code 3092 (System Resource Not Available). Note that this is applicable + // for write requests targeted to a multiple master account. In such case, the 429/3092 will get converted into 503. + if (this.isMultiMasterWriteRegion.HasValue + && this.isMultiMasterWriteRegion.Value + && clientException.StatusCode.HasValue + && (int)clientException.StatusCode.Value == (int)StatusCodes.TooManyRequests + && clientException.GetSubStatus() == SubStatusCodes.SystemResourceUnavailable) + { + DefaultTrace.TraceError( + "Operation will NOT be retried. Converting SystemResourceUnavailable (429/3092) to ServiceUnavailable (503). Status code: {0}, sub status code: {1}.", + StatusCodes.TooManyRequests, SubStatusCodes.SystemResourceUnavailable); + + return this.TryMarkEndpointUnavailableForPkRangeAndRetryOnServiceUnavailable( + shouldMarkEndpointUnavailableForPkRange: true); + } + ShouldRetryResult shouldRetryResult = await this.ShouldRetryInternalAsync( clientException?.StatusCode, clientException?.GetSubStatus()); @@ -121,18 +139,7 @@ public async Task ShouldRetryAsync( } } - ShouldRetryResult throttleRetryResult = await this.throttlingRetry.ShouldRetryAsync(exception, cancellationToken); - - // Today, the only scenario where we would receive a ServiceUnavailableException from the Throttling Retry Policy - // is when we get 410 (Gone) with sub status code 3092 (System Resource Not Available). Note that this is applicable - // for write requests targeted to a multiple master account. In such case, the 410/3092 will get converted into 503. - if (throttleRetryResult.ExceptionToThrow is ServiceUnavailableException) - { - return this.TryMarkEndpointUnavailableForPkRangeAndRetryOnServiceUnavailable( - shouldMarkEndpointUnavailableForPkRange: true); - } - - return throttleRetryResult; + return await this.throttlingRetry.ShouldRetryAsync(exception, cancellationToken); } /// @@ -155,18 +162,23 @@ public async Task ShouldRetryAsync( return shouldRetryResult; } - ShouldRetryResult throttleRetryResult = await this.throttlingRetry.ShouldRetryAsync(cosmosResponseMessage, cancellationToken); - // Today, the only scenario where we would receive a ServiceUnavailableException from the Throttling Retry Policy - // is when we get 410 (Gone) with sub status code 3092 (System Resource Not Available). Note that this is applicable - // for write requests targeted to a multiple master account. In such case, the 410/3092 will get converted into 503. - if (throttleRetryResult.ExceptionToThrow is ServiceUnavailableException) + // is when we get 429 (TooManyRequests) with sub status code 3092 (System Resource Not Available). Note that this is applicable + // for write requests targeted to a multiple master account. In such case, the 429/3092 will get converted into 503. + if (this.isMultiMasterWriteRegion.HasValue + && this.isMultiMasterWriteRegion.Value + && (int)cosmosResponseMessage.StatusCode == (int)StatusCodes.TooManyRequests + && cosmosResponseMessage?.Headers.SubStatusCode == SubStatusCodes.SystemResourceUnavailable) { + DefaultTrace.TraceError( + "Operation will NOT be retried. Converting SystemResourceUnavailable (429/3092) to ServiceUnavailable (503). Status code: {0}, sub status code: {1}.", + StatusCodes.TooManyRequests, SubStatusCodes.SystemResourceUnavailable); + return this.TryMarkEndpointUnavailableForPkRangeAndRetryOnServiceUnavailable( shouldMarkEndpointUnavailableForPkRange: true); } - return throttleRetryResult; + return await this.throttlingRetry.ShouldRetryAsync(cosmosResponseMessage, cancellationToken); } /// @@ -179,6 +191,8 @@ public void OnBeforeSendRequest(DocumentServiceRequest request) this.isReadRequest = request.IsReadOnlyRequest; this.canUseMultipleWriteLocations = this.globalEndpointManager.CanUseMultipleWriteLocations(request); this.documentServiceRequest = request; + this.isMultiMasterWriteRegion = !this.isReadRequest + && (this.globalEndpointManager?.CanSupportMultipleWriteLocations(request) ?? false); // clear previous location-based routing directive request.RequestContext.ClearRouteToLocation(); diff --git a/Microsoft.Azure.Cosmos/src/DocumentClient.cs b/Microsoft.Azure.Cosmos/src/DocumentClient.cs index 0f4ce7c9fd..946fa4cb08 100644 --- a/Microsoft.Azure.Cosmos/src/DocumentClient.cs +++ b/Microsoft.Azure.Cosmos/src/DocumentClient.cs @@ -988,9 +988,8 @@ internal virtual void Initialize(Uri serviceEndpoint, this.initializeTaskFactory = (_) => TaskHelper.InlineIfPossible( () => this.GetInitializationTaskAsync(storeClientFactory: storeClientFactory), new ResourceThrottleRetryPolicy( - maxAttemptCount: this.ConnectionPolicy.RetryOptions.MaxRetryAttemptsOnThrottledRequests, - endpointManager: this.GlobalEndpointManager, - maxWaitTimeInSeconds: this.ConnectionPolicy.RetryOptions.MaxRetryWaitTimeInSeconds)); + this.ConnectionPolicy.RetryOptions.MaxRetryAttemptsOnThrottledRequests, + this.ConnectionPolicy.RetryOptions.MaxRetryWaitTimeInSeconds)); // Create the task to start the initialize task // Task will be awaited on in the EnsureValidClientAsync diff --git a/Microsoft.Azure.Cosmos/src/MetadataRequestThrottleRetryPolicy.cs b/Microsoft.Azure.Cosmos/src/MetadataRequestThrottleRetryPolicy.cs index 8f56139957..928d2f2e87 100644 --- a/Microsoft.Azure.Cosmos/src/MetadataRequestThrottleRetryPolicy.cs +++ b/Microsoft.Azure.Cosmos/src/MetadataRequestThrottleRetryPolicy.cs @@ -72,7 +72,6 @@ public MetadataRequestThrottleRetryPolicy( this.throttlingRetryPolicy = new ResourceThrottleRetryPolicy( maxRetryAttemptsOnThrottledRequests, - this.globalEndpointManager, maxRetryWaitTimeInSeconds); this.retryContext = new MetadataRetryContext diff --git a/Microsoft.Azure.Cosmos/src/ResourceThrottleRetryPolicy.cs b/Microsoft.Azure.Cosmos/src/ResourceThrottleRetryPolicy.cs index 04fcd82359..a99bc594f7 100644 --- a/Microsoft.Azure.Cosmos/src/ResourceThrottleRetryPolicy.cs +++ b/Microsoft.Azure.Cosmos/src/ResourceThrottleRetryPolicy.cs @@ -9,7 +9,6 @@ namespace Microsoft.Azure.Cosmos using System.Threading; using System.Threading.Tasks; using Microsoft.Azure.Cosmos.Core.Trace; - using Microsoft.Azure.Cosmos.Routing; using Microsoft.Azure.Documents; // Retry when we receive the throttling from server. @@ -20,15 +19,12 @@ internal sealed class ResourceThrottleRetryPolicy : IDocumentClientRetryPolicy private readonly uint backoffDelayFactor; private readonly int maxAttemptCount; private readonly TimeSpan maxWaitTimeInMilliseconds; - private readonly IGlobalEndpointManager globalEndpointManager; private int currentAttemptCount; private TimeSpan cumulativeRetryDelay; - private bool? isMultiMasterWriteRegion; public ResourceThrottleRetryPolicy( int maxAttemptCount, - IGlobalEndpointManager endpointManager, int maxWaitTimeInSeconds = DefaultMaxWaitTimeInSeconds, uint backoffDelayFactor = 1) { @@ -37,7 +33,6 @@ public ResourceThrottleRetryPolicy( throw new ArgumentException("maxWaitTimeInSeconds", "maxWaitTimeInSeconds must be less than " + (int.MaxValue / 1000)); } - this.globalEndpointManager = endpointManager; this.maxAttemptCount = maxAttemptCount; this.backoffDelayFactor = backoffDelayFactor; this.maxWaitTimeInMilliseconds = TimeSpan.FromSeconds(maxWaitTimeInSeconds); @@ -64,9 +59,7 @@ public Task ShouldRetryAsync( return Task.FromResult(ShouldRetryResult.NoRetry()); } - return this.ShouldRetryInternalAsync( - dce?.GetSubStatus(), - dce?.RetryAfter); + return this.ShouldRetryInternalAsync(dce.RetryAfter); } DefaultTrace.TraceError( @@ -95,34 +88,11 @@ public Task ShouldRetryAsync( return Task.FromResult(ShouldRetryResult.NoRetry()); } - return this.ShouldRetryInternalAsync( - cosmosResponseMessage?.Headers.SubStatusCode, - cosmosResponseMessage?.Headers.RetryAfter, - cosmosResponseMessage?.CosmosException); + return this.ShouldRetryInternalAsync(cosmosResponseMessage?.Headers.RetryAfter); } - private Task ShouldRetryInternalAsync( - SubStatusCodes? subStatusCode, - TimeSpan? retryAfter, - Exception exception = null) + private Task ShouldRetryInternalAsync(TimeSpan? retryAfter) { - if (this.isMultiMasterWriteRegion.HasValue - && this.isMultiMasterWriteRegion.Value - && subStatusCode != null - && subStatusCode == SubStatusCodes.SystemResourceUnavailable) - { - DefaultTrace.TraceError( - "Operation will NOT be retried. Converting SystemResourceUnavailable (429/3092) to ServiceUnavailable (503). Current attempt {0} sub status code: {1}.", - this.currentAttemptCount, SubStatusCodes.SystemResourceUnavailable); - - ServiceUnavailableException exceptionToThrow = ServiceUnavailableException.Create( - SubStatusCodes.SystemResourceUnavailable, - innerException: exception); - - return Task.FromResult( - ShouldRetryResult.NoRetry(exceptionToThrow)); - } - TimeSpan retryDelay = TimeSpan.Zero; if (this.currentAttemptCount < this.maxAttemptCount && this.CheckIfRetryNeeded(retryAfter, out retryDelay)) @@ -163,8 +133,6 @@ private object GetExceptionMessage(Exception exception) /// The request being sent to the service. public void OnBeforeSendRequest(DocumentServiceRequest request) { - this.isMultiMasterWriteRegion = !request.IsReadOnlyRequest - && (this.globalEndpointManager?.CanSupportMultipleWriteLocations(request) ?? false); } /// diff --git a/Microsoft.Azure.Cosmos/tests/Microsoft.Azure.Cosmos.Tests/Batch/BatchAsyncBatcherTests.cs b/Microsoft.Azure.Cosmos/tests/Microsoft.Azure.Cosmos.Tests/Batch/BatchAsyncBatcherTests.cs index 1b421152bd..acc83d3ff4 100644 --- a/Microsoft.Azure.Cosmos/tests/Microsoft.Azure.Cosmos.Tests/Batch/BatchAsyncBatcherTests.cs +++ b/Microsoft.Azure.Cosmos/tests/Microsoft.Azure.Cosmos.Tests/Batch/BatchAsyncBatcherTests.cs @@ -23,23 +23,6 @@ public class BatchAsyncBatcherTests { private static readonly Exception expectedException = new Exception(); private static readonly BatchPartitionMetric metric = new BatchPartitionMetric(); - private GlobalEndpointManager mockedEndpointManager; - - [TestInitialize] - public void Initialize() - { - Mock mockedClient = new(); - - this.mockedEndpointManager = new( - mockedClient.Object, - new ConnectionPolicy()); - } - - [TestCleanup] - public void Cleanup() - { - this.mockedEndpointManager.Dispose(); - } private ItemBatchOperation CreateItemBatchOperation(bool withContext = false) { @@ -582,12 +565,12 @@ public async Task RetrierGetsCalledOnSplit() IDocumentClientRetryPolicy retryPolicy1 = new BulkExecutionRetryPolicy( GetSplitEnabledContainer(), OperationType.Read, - new ResourceThrottleRetryPolicy(1, this.mockedEndpointManager)); + new ResourceThrottleRetryPolicy(1)); IDocumentClientRetryPolicy retryPolicy2 = new BulkExecutionRetryPolicy( GetSplitEnabledContainer(), OperationType.Read, - new ResourceThrottleRetryPolicy(1, this.mockedEndpointManager)); + new ResourceThrottleRetryPolicy(1)); ItemBatchOperation operation1 = this.CreateItemBatchOperation(); ItemBatchOperation operation2 = this.CreateItemBatchOperation(); @@ -611,12 +594,12 @@ public async Task RetrierGetsCalledOnCompletingSplit() IDocumentClientRetryPolicy retryPolicy1 = new BulkExecutionRetryPolicy( GetSplitEnabledContainer(), OperationType.Read, - new ResourceThrottleRetryPolicy(1, this.mockedEndpointManager)); + new ResourceThrottleRetryPolicy(1)); IDocumentClientRetryPolicy retryPolicy2 = new BulkExecutionRetryPolicy( GetSplitEnabledContainer(), OperationType.Read, - new ResourceThrottleRetryPolicy(1, this.mockedEndpointManager)); + new ResourceThrottleRetryPolicy(1)); ItemBatchOperation operation1 = this.CreateItemBatchOperation(); ItemBatchOperation operation2 = this.CreateItemBatchOperation(); @@ -640,12 +623,12 @@ public async Task RetrierGetsCalledOnCompletingPartitionMigration() IDocumentClientRetryPolicy retryPolicy1 = new BulkExecutionRetryPolicy( GetSplitEnabledContainer(), OperationType.Read, - new ResourceThrottleRetryPolicy(1, this.mockedEndpointManager)); + new ResourceThrottleRetryPolicy(1)); IDocumentClientRetryPolicy retryPolicy2 = new BulkExecutionRetryPolicy( GetSplitEnabledContainer(), OperationType.Read, - new ResourceThrottleRetryPolicy(1, this.mockedEndpointManager)); + new ResourceThrottleRetryPolicy(1)); ItemBatchOperation operation1 = this.CreateItemBatchOperation(); ItemBatchOperation operation2 = this.CreateItemBatchOperation(); @@ -689,17 +672,17 @@ public async Task RetrierGetsCalledOn413_3402() IDocumentClientRetryPolicy retryPolicy1 = new BulkExecutionRetryPolicy( GetSplitEnabledContainer(), OperationType.Read, - new ResourceThrottleRetryPolicy(1, this.mockedEndpointManager)); + new ResourceThrottleRetryPolicy(1)); IDocumentClientRetryPolicy retryPolicy2 = new BulkExecutionRetryPolicy( GetSplitEnabledContainer(), OperationType.Read, - new ResourceThrottleRetryPolicy(1, this.mockedEndpointManager)); + new ResourceThrottleRetryPolicy(1)); IDocumentClientRetryPolicy retryPolicy3 = new BulkExecutionRetryPolicy( GetSplitEnabledContainer(), OperationType.Create, - new ResourceThrottleRetryPolicy(1, this.mockedEndpointManager)); + new ResourceThrottleRetryPolicy(1)); ItemBatchOperation operation1 = this.CreateItemBatchOperation(); ItemBatchOperation operation2 = this.CreateItemBatchOperation(); @@ -727,17 +710,17 @@ public async Task RetrierGetsCalledOn413_NoSubstatus() IDocumentClientRetryPolicy retryPolicy1 = new BulkExecutionRetryPolicy( GetSplitEnabledContainer(), OperationType.Read, - new ResourceThrottleRetryPolicy(1, this.mockedEndpointManager)); + new ResourceThrottleRetryPolicy(1)); IDocumentClientRetryPolicy retryPolicy2 = new BulkExecutionRetryPolicy( GetSplitEnabledContainer(), OperationType.Read, - new ResourceThrottleRetryPolicy(1, this.mockedEndpointManager)); + new ResourceThrottleRetryPolicy(1)); IDocumentClientRetryPolicy retryPolicy3 = new BulkExecutionRetryPolicy( GetSplitEnabledContainer(), OperationType.Create, - new ResourceThrottleRetryPolicy(1, this.mockedEndpointManager)); + new ResourceThrottleRetryPolicy(1)); ItemBatchOperation operation1 = this.CreateItemBatchOperation(); ItemBatchOperation operation2 = this.CreateItemBatchOperation(); diff --git a/Microsoft.Azure.Cosmos/tests/Microsoft.Azure.Cosmos.Tests/Batch/BatchAsyncOperationContextTests.cs b/Microsoft.Azure.Cosmos/tests/Microsoft.Azure.Cosmos.Tests/Batch/BatchAsyncOperationContextTests.cs index f73e63368b..8e3c6bcbc3 100644 --- a/Microsoft.Azure.Cosmos/tests/Microsoft.Azure.Cosmos.Tests/Batch/BatchAsyncOperationContextTests.cs +++ b/Microsoft.Azure.Cosmos/tests/Microsoft.Azure.Cosmos.Tests/Batch/BatchAsyncOperationContextTests.cs @@ -18,31 +18,13 @@ namespace Microsoft.Azure.Cosmos.Tests [TestClass] public class BatchAsyncOperationContextTests { - private GlobalEndpointManager mockedEndpointManager; - - [TestInitialize] - public void Initialize() - { - Mock mockedClient = new(); - - this.mockedEndpointManager = new( - mockedClient.Object, - new ConnectionPolicy()); - } - - [TestCleanup] - public void Cleanup() - { - this.mockedEndpointManager.Dispose(); - } - [TestMethod] public async Task TraceIsJoinedOnCompletionWithRetry() { IDocumentClientRetryPolicy retryPolicy = new BulkExecutionRetryPolicy( Mock.Of(), OperationType.Read, - new ResourceThrottleRetryPolicy(1, this.mockedEndpointManager)); + new ResourceThrottleRetryPolicy(1)); Trace rootTrace = Trace.GetRootTrace(name: "RootTrace"); @@ -83,7 +65,7 @@ public async Task TraceIsJoinedOnCompletionWithoutRetry() IDocumentClientRetryPolicy retryPolicy = new BulkExecutionRetryPolicy( Mock.Of(), OperationType.Read, - new ResourceThrottleRetryPolicy(1, this.mockedEndpointManager)); + new ResourceThrottleRetryPolicy(1)); Trace rootTrace = Trace.GetRootTrace(name: "RootTrace"); @@ -197,7 +179,7 @@ public async Task ShouldRetry_WithPolicy_OnSuccess() IDocumentClientRetryPolicy retryPolicy = new BulkExecutionRetryPolicy( Mock.Of(), OperationType.Read, - new ResourceThrottleRetryPolicy(1, this.mockedEndpointManager)); + new ResourceThrottleRetryPolicy(1)); TransactionalBatchOperationResult result = new TransactionalBatchOperationResult(HttpStatusCode.OK); ItemBatchOperation operation = new ItemBatchOperation(OperationType.Create, 0, Cosmos.PartitionKey.Null); operation.AttachContext(new ItemBatchOperationContext(string.Empty, NoOpTrace.Singleton, retryPolicy)); @@ -211,7 +193,7 @@ public async Task ShouldRetry_WithPolicy_On429() IDocumentClientRetryPolicy retryPolicy = new BulkExecutionRetryPolicy( Mock.Of(), OperationType.Read, - new ResourceThrottleRetryPolicy(1, this.mockedEndpointManager)); + new ResourceThrottleRetryPolicy(1)); TransactionalBatchOperationResult result = new TransactionalBatchOperationResult((HttpStatusCode)StatusCodes.TooManyRequests); ItemBatchOperation operation = new ItemBatchOperation(OperationType.Create, 0, Cosmos.PartitionKey.Null); operation.AttachContext(new ItemBatchOperationContext(string.Empty, NoOpTrace.Singleton, retryPolicy)); @@ -225,7 +207,7 @@ public async Task ShouldRetry_WithPolicy_On413_3402() IDocumentClientRetryPolicy retryPolicy = new BulkExecutionRetryPolicy( Mock.Of(), OperationType.Read, - new ResourceThrottleRetryPolicy(1, this.mockedEndpointManager)); + new ResourceThrottleRetryPolicy(1)); TransactionalBatchOperationResult result = new TransactionalBatchOperationResult(HttpStatusCode.RequestEntityTooLarge) { SubStatusCode = (SubStatusCodes)3402 }; ItemBatchOperation operation = new ItemBatchOperation(OperationType.Create, 0, Cosmos.PartitionKey.Null); operation.AttachContext(new ItemBatchOperationContext(string.Empty, NoOpTrace.Singleton, retryPolicy)); @@ -239,7 +221,7 @@ public async Task ShouldRetry_WithPolicy_On413_0() IDocumentClientRetryPolicy retryPolicy = new BulkExecutionRetryPolicy( Mock.Of(), OperationType.Create, - new ResourceThrottleRetryPolicy(1, this.mockedEndpointManager)); + new ResourceThrottleRetryPolicy(1)); TransactionalBatchOperationResult result = new TransactionalBatchOperationResult(HttpStatusCode.RequestEntityTooLarge); ItemBatchOperation operation = new ItemBatchOperation(OperationType.Create, 0, Cosmos.PartitionKey.Null); operation.AttachContext(new ItemBatchOperationContext(string.Empty, NoOpTrace.Singleton, retryPolicy)); @@ -253,7 +235,7 @@ public async Task ShouldRetry_WithPolicy_OnSplit() IDocumentClientRetryPolicy retryPolicy = new BulkExecutionRetryPolicy( GetSplitEnabledContainer(), OperationType.Read, - new ResourceThrottleRetryPolicy(1, this.mockedEndpointManager)); + new ResourceThrottleRetryPolicy(1)); TransactionalBatchOperationResult result = new TransactionalBatchOperationResult(HttpStatusCode.Gone) { SubStatusCode = SubStatusCodes.PartitionKeyRangeGone }; ItemBatchOperation operation = new ItemBatchOperation(OperationType.Create, 0, Cosmos.PartitionKey.Null); operation.AttachContext(new ItemBatchOperationContext(string.Empty, NoOpTrace.Singleton, retryPolicy)); @@ -267,7 +249,7 @@ public async Task ShouldRetry_WithPolicy_OnCompletingSplit() IDocumentClientRetryPolicy retryPolicy = new BulkExecutionRetryPolicy( GetSplitEnabledContainer(), OperationType.Read, - new ResourceThrottleRetryPolicy(1, this.mockedEndpointManager)); + new ResourceThrottleRetryPolicy(1)); TransactionalBatchOperationResult result = new TransactionalBatchOperationResult(HttpStatusCode.Gone) { SubStatusCode = SubStatusCodes.CompletingSplit }; ItemBatchOperation operation = new ItemBatchOperation(OperationType.Create, 0, Cosmos.PartitionKey.Null); operation.AttachContext(new ItemBatchOperationContext(string.Empty, NoOpTrace.Singleton, retryPolicy)); @@ -281,7 +263,7 @@ public async Task ShouldRetry_WithPolicy_OnCompletingPartitionMigration() IDocumentClientRetryPolicy retryPolicy = new BulkExecutionRetryPolicy( GetSplitEnabledContainer(), OperationType.Read, - new ResourceThrottleRetryPolicy(1, this.mockedEndpointManager)); + new ResourceThrottleRetryPolicy(1)); TransactionalBatchOperationResult result = new TransactionalBatchOperationResult(HttpStatusCode.Gone) { SubStatusCode = SubStatusCodes.CompletingPartitionMigration }; ItemBatchOperation operation = new ItemBatchOperation(OperationType.Create, 0, Cosmos.PartitionKey.Null); operation.AttachContext(new ItemBatchOperationContext(string.Empty, NoOpTrace.Singleton, retryPolicy)); diff --git a/Microsoft.Azure.Cosmos/tests/Microsoft.Azure.Cosmos.Tests/BulkPartitionKeyRangeGoneRetryPolicyTests.cs b/Microsoft.Azure.Cosmos/tests/Microsoft.Azure.Cosmos.Tests/BulkPartitionKeyRangeGoneRetryPolicyTests.cs index 487b272dfc..1f87d40635 100644 --- a/Microsoft.Azure.Cosmos/tests/Microsoft.Azure.Cosmos.Tests/BulkPartitionKeyRangeGoneRetryPolicyTests.cs +++ b/Microsoft.Azure.Cosmos/tests/Microsoft.Azure.Cosmos.Tests/BulkPartitionKeyRangeGoneRetryPolicyTests.cs @@ -18,31 +18,13 @@ namespace Microsoft.Azure.Cosmos.Tests [TestClass] public class BulkPartitionKeyRangeGoneRetryPolicyTests { - private GlobalEndpointManager mockedEndpointManager; - - [TestInitialize] - public void Initialize() - { - Mock mockedClient = new (); - - this.mockedEndpointManager = new ( - mockedClient.Object, - new ConnectionPolicy()); - } - - [TestCleanup] - public void Cleanup() - { - this.mockedEndpointManager.Dispose(); - } - [TestMethod] public async Task NotRetryOnSuccess() { IDocumentClientRetryPolicy retryPolicy = new BulkExecutionRetryPolicy( Mock.Of(), OperationType.Read, - new ResourceThrottleRetryPolicy(1, this.mockedEndpointManager)); + new ResourceThrottleRetryPolicy(1)); TransactionalBatchOperationResult result = new TransactionalBatchOperationResult(HttpStatusCode.OK); ShouldRetryResult shouldRetryResult = await retryPolicy.ShouldRetryAsync(result.ToResponseMessage(), default); @@ -55,7 +37,7 @@ public async Task RetriesOn429() IDocumentClientRetryPolicy retryPolicy = new BulkExecutionRetryPolicy( Mock.Of(), OperationType.Read, - new ResourceThrottleRetryPolicy(1, this.mockedEndpointManager)); + new ResourceThrottleRetryPolicy(1)); TransactionalBatchOperationResult result = new TransactionalBatchOperationResult((HttpStatusCode)StatusCodes.TooManyRequests); ShouldRetryResult shouldRetryResult = await retryPolicy.ShouldRetryAsync(result.ToResponseMessage(), default); @@ -68,7 +50,7 @@ public async Task RetriesOn413_3204() IDocumentClientRetryPolicy retryPolicy = new BulkExecutionRetryPolicy( Mock.Of(), OperationType.Read, - new ResourceThrottleRetryPolicy(1, this.mockedEndpointManager)); + new ResourceThrottleRetryPolicy(1)); TransactionalBatchOperationResult result = new TransactionalBatchOperationResult(HttpStatusCode.RequestEntityTooLarge) { SubStatusCode = (SubStatusCodes)3402 }; ShouldRetryResult shouldRetryResult = await retryPolicy.ShouldRetryAsync(result.ToResponseMessage(), default); @@ -81,7 +63,7 @@ public async Task RetriesOn413_0() IDocumentClientRetryPolicy retryPolicy = new BulkExecutionRetryPolicy( Mock.Of(), OperationType.Create, - new ResourceThrottleRetryPolicy(1, this.mockedEndpointManager)); + new ResourceThrottleRetryPolicy(1)); TransactionalBatchOperationResult result = new TransactionalBatchOperationResult(HttpStatusCode.RequestEntityTooLarge); ShouldRetryResult shouldRetryResult = await retryPolicy.ShouldRetryAsync(result.ToResponseMessage(), default); @@ -94,7 +76,7 @@ public async Task RetriesOnSplits() IDocumentClientRetryPolicy retryPolicy = new BulkExecutionRetryPolicy( GetSplitEnabledContainer(), OperationType.Read, - new ResourceThrottleRetryPolicy(1, this.mockedEndpointManager)); + new ResourceThrottleRetryPolicy(1)); TransactionalBatchOperationResult result = new TransactionalBatchOperationResult(HttpStatusCode.Gone) { SubStatusCode = SubStatusCodes.PartitionKeyRangeGone }; ShouldRetryResult shouldRetryResult = await retryPolicy.ShouldRetryAsync(result.ToResponseMessage(), default); @@ -107,7 +89,7 @@ public async Task RetriesOnSplits_UpToMax() IDocumentClientRetryPolicy retryPolicy = new BulkExecutionRetryPolicy( GetSplitEnabledContainer(), OperationType.Read, - new ResourceThrottleRetryPolicy(1, this.mockedEndpointManager)); + new ResourceThrottleRetryPolicy(1)); TransactionalBatchOperationResult result = new TransactionalBatchOperationResult(HttpStatusCode.Gone) { SubStatusCode = SubStatusCodes.PartitionKeyRangeGone }; ShouldRetryResult shouldRetryResult; @@ -127,7 +109,7 @@ public async Task RetriesOnCompletingSplits() IDocumentClientRetryPolicy retryPolicy = new BulkExecutionRetryPolicy( GetSplitEnabledContainer(), OperationType.Read, - new ResourceThrottleRetryPolicy(1, this.mockedEndpointManager)); + new ResourceThrottleRetryPolicy(1)); TransactionalBatchOperationResult result = new TransactionalBatchOperationResult(HttpStatusCode.Gone) { SubStatusCode = SubStatusCodes.CompletingSplit }; ShouldRetryResult shouldRetryResult = await retryPolicy.ShouldRetryAsync(result.ToResponseMessage(), default); @@ -140,7 +122,7 @@ public async Task RetriesOnCompletingPartitionMigrationSplits() IDocumentClientRetryPolicy retryPolicy = new BulkExecutionRetryPolicy( GetSplitEnabledContainer(), OperationType.Read, - new ResourceThrottleRetryPolicy(1, this.mockedEndpointManager)); + new ResourceThrottleRetryPolicy(1)); TransactionalBatchOperationResult result = new TransactionalBatchOperationResult(HttpStatusCode.Gone) { SubStatusCode = SubStatusCodes.CompletingPartitionMigration }; ShouldRetryResult shouldRetryResult = await retryPolicy.ShouldRetryAsync(result.ToResponseMessage(), default); diff --git a/Microsoft.Azure.Cosmos/tests/Microsoft.Azure.Cosmos.Tests/ResourceThrottleRetryPolicyTests.cs b/Microsoft.Azure.Cosmos/tests/Microsoft.Azure.Cosmos.Tests/ResourceThrottleRetryPolicyTests.cs index 237acda4ea..274bbcb8e9 100644 --- a/Microsoft.Azure.Cosmos/tests/Microsoft.Azure.Cosmos.Tests/ResourceThrottleRetryPolicyTests.cs +++ b/Microsoft.Azure.Cosmos/tests/Microsoft.Azure.Cosmos.Tests/ResourceThrottleRetryPolicyTests.cs @@ -6,21 +6,14 @@ namespace Microsoft.Azure.Cosmos.Tests { using System; using System.Collections.Generic; - using System.Collections.ObjectModel; using System.Diagnostics; - using System.Net; - using System.Threading; using System.Threading.Tasks; using Microsoft.Azure.Cosmos.Core.Trace; - using Microsoft.Azure.Cosmos.Routing; - using Microsoft.Azure.Documents; using Microsoft.VisualStudio.TestTools.UnitTesting; - using Moq; [TestClass] public class ResourceThrottleRetryPolicyTests { - private static readonly Uri DefaultEndpoint = new ("https://default.documents.azure.com"); private readonly List existingListener = new List(); private SourceSwitch existingSourceSwitch; @@ -51,11 +44,8 @@ public void ResetTraceConfiguration() [TestMethod] public async Task DoesNotSerializeExceptionOnTracingDisabled() { - Mock mockedClient = new(); - GlobalEndpointManager endpointManager = new(mockedClient.Object, new ConnectionPolicy()); - // No listeners - ResourceThrottleRetryPolicy policy = new ResourceThrottleRetryPolicy(0, endpointManager); + ResourceThrottleRetryPolicy policy = new ResourceThrottleRetryPolicy(0); CustomException exception = new CustomException(); await policy.ShouldRetryAsync(exception, default); Assert.AreEqual(0, exception.ToStringCount, "Exception was serialized"); @@ -64,179 +54,15 @@ public async Task DoesNotSerializeExceptionOnTracingDisabled() [TestMethod] public async Task DoesSerializeExceptionOnTracingEnabled() { - Mock mockedClient = new(); - GlobalEndpointManager endpointManager = new(mockedClient.Object, new ConnectionPolicy()); - // Let the default trace listener DefaultTrace.TraceSource.Switch = new SourceSwitch("ClientSwitch", "Error"); DefaultTrace.TraceSource.Listeners.Add(new DefaultTraceListener()); - ResourceThrottleRetryPolicy policy = new ResourceThrottleRetryPolicy(0, endpointManager); + ResourceThrottleRetryPolicy policy = new ResourceThrottleRetryPolicy(0); CustomException exception = new CustomException(); await policy.ShouldRetryAsync(exception, default); Assert.AreEqual(1, exception.ToStringCount, "Exception was not serialized"); } - [TestMethod] - [DataRow(true, DisplayName = "Validate retry policy with multi master write account.")] - [DataRow(false, DisplayName = "Validate retry policy with single master write account.")] - public async Task ShouldRetryAsync_WhenResourceNotAvailableThrown_ShouldThrow503OnMultiMasterWrite( - bool isMultiMasterAccount) - { - Documents.Collections.INameValueCollection requestHeaders = new Documents.Collections.DictionaryNameValueCollection(); - - GlobalEndpointManager endpointManager = await this.InitializeEndpointManager( - useMultipleWriteLocations: isMultiMasterAccount, - enableEndpointDiscovery: true, - isPreferredLocationsListEmpty: false, - enforceSingleMasterSingleWriteLocation: !isMultiMasterAccount); - - ResourceThrottleRetryPolicy policy = new (0, endpointManager); - - DocumentServiceRequest request = new( - OperationType.Create, - ResourceType.Document, - "dbs/db/colls/coll1/docs/doc1", - null, - AuthorizationTokenType.PrimaryMasterKey, - requestHeaders); - - policy.OnBeforeSendRequest(request); - - DocumentClientException dce = new ( - "SystemResourceUnavailable: 429 with 3092 occurred.", - HttpStatusCode.TooManyRequests, - SubStatusCodes.SystemResourceUnavailable); - - ShouldRetryResult shouldRetryResult = await policy.ShouldRetryAsync(dce, default); - - if (isMultiMasterAccount) - { - Assert.IsFalse(shouldRetryResult.ShouldRetry); - Assert.IsNotNull(shouldRetryResult.ExceptionToThrow); - Assert.AreEqual(typeof(ServiceUnavailableException), shouldRetryResult.ExceptionToThrow.GetType()); - } - else - { - Assert.IsFalse(shouldRetryResult.ShouldRetry); - Assert.IsNull(shouldRetryResult.ExceptionToThrow); - } - } - - private async Task InitializeEndpointManager( - bool useMultipleWriteLocations, - bool enableEndpointDiscovery, - bool isPreferredLocationsListEmpty, - bool enforceSingleMasterSingleWriteLocation = false, // Some tests depend on the Initialize to create an account with multiple write locations, even when not multi master - ReadOnlyCollection preferedRegionListOverride = null, - bool isExcludeRegionsTest = false) - { - ReadOnlyCollection preferredLocations; - AccountProperties databaseAccount = ResourceThrottleRetryPolicyTests.CreateDatabaseAccount( - useMultipleWriteLocations, - enforceSingleMasterSingleWriteLocation, - isExcludeRegionsTest); - - if (isPreferredLocationsListEmpty) - { - preferredLocations = new List().AsReadOnly(); - } - else - { - // Allow for override at the test method level if needed - preferredLocations = preferedRegionListOverride ?? new List() - { - "location1", - "location2", - "location3" - }.AsReadOnly(); - } - - Mock mockedClient = new Mock(); - mockedClient.Setup(owner => owner.ServiceEndpoint).Returns(ResourceThrottleRetryPolicyTests.DefaultEndpoint); - mockedClient.Setup(owner => owner.GetDatabaseAccountInternalAsync(It.IsAny(), It.IsAny())).ReturnsAsync(databaseAccount); - - ConnectionPolicy connectionPolicy = new ConnectionPolicy() - { - EnableEndpointDiscovery = enableEndpointDiscovery, - UseMultipleWriteLocations = useMultipleWriteLocations, - }; - - foreach (string preferredLocation in preferredLocations) - { - connectionPolicy.PreferredLocations.Add(preferredLocation); - } - - GlobalEndpointManager endpointManager = new GlobalEndpointManager(mockedClient.Object, connectionPolicy); - await endpointManager.RefreshLocationAsync(false); - - return endpointManager; - } - - private static AccountProperties CreateDatabaseAccount( - bool useMultipleWriteLocations, - bool enforceSingleMasterSingleWriteLocation, - bool isExcludeRegionsTest = false) - { - Uri Location1Endpoint = new ("https://location1.documents.azure.com"); - Uri Location2Endpoint = new ("https://location2.documents.azure.com"); - Uri Location3Endpoint = new ("https://location3.documents.azure.com"); - Uri Location4Endpoint = new ("https://location4.documents.azure.com"); - - Collection writeLocations = isExcludeRegionsTest ? - - new Collection() - { - { new AccountRegion() { Name = "default", Endpoint = ResourceThrottleRetryPolicyTests.DefaultEndpoint.ToString() } }, - { new AccountRegion() { Name = "location1", Endpoint = Location1Endpoint.ToString() } }, - { new AccountRegion() { Name = "location2", Endpoint = Location2Endpoint.ToString() } }, - { new AccountRegion() { Name = "location3", Endpoint = Location3Endpoint.ToString() } }, - } : - new Collection() - { - { new AccountRegion() { Name = "location1", Endpoint = Location1Endpoint.ToString() } }, - { new AccountRegion() { Name = "location2", Endpoint = Location2Endpoint.ToString() } }, - { new AccountRegion() { Name = "location3", Endpoint = Location3Endpoint.ToString() } }, - }; - - if (!useMultipleWriteLocations - && enforceSingleMasterSingleWriteLocation) - { - // Some pre-existing tests depend on the account having multiple write locations even on single master setup - // Newer tests can correctly define a single master account (single write region) without breaking existing tests - writeLocations = isExcludeRegionsTest ? - new Collection() - { - { new AccountRegion() { Name = "default", Endpoint = ResourceThrottleRetryPolicyTests.DefaultEndpoint.ToString() } } - } : - new Collection() - { - { new AccountRegion() { Name = "location1", Endpoint = Location1Endpoint.ToString() } } - }; - } - - AccountProperties databaseAccount = new () - { - EnableMultipleWriteLocations = useMultipleWriteLocations, - ReadLocationsInternal = isExcludeRegionsTest ? - new Collection() - { - { new AccountRegion() { Name = "default", Endpoint = ResourceThrottleRetryPolicyTests.DefaultEndpoint.ToString() } }, - { new AccountRegion() { Name = "location1", Endpoint = Location1Endpoint.ToString() } }, - { new AccountRegion() { Name = "location2", Endpoint = Location2Endpoint.ToString() } }, - { new AccountRegion() { Name = "location4", Endpoint = Location4Endpoint.ToString() } }, - } : - new Collection() - { - { new AccountRegion() { Name = "location1", Endpoint = Location1Endpoint.ToString() } }, - { new AccountRegion() { Name = "location2", Endpoint = Location2Endpoint.ToString() } }, - { new AccountRegion() { Name = "location4", Endpoint = Location4Endpoint.ToString() } }, - }, - WriteLocationsInternal = writeLocations - }; - - return databaseAccount; - } - private class CustomException : Exception { public int ToStringCount { get; private set; } = 0; From 9575986f6e318ae5f49a5866b4407243ec0f23df Mon Sep 17 00:00:00 2001 From: Debdatta Kunda Date: Fri, 13 Sep 2024 22:10:11 -0700 Subject: [PATCH 09/14] Minor code clean up. --- .../src/ClientRetryPolicy.cs | 26 ++++++++++++------- 1 file changed, 17 insertions(+), 9 deletions(-) diff --git a/Microsoft.Azure.Cosmos/src/ClientRetryPolicy.cs b/Microsoft.Azure.Cosmos/src/ClientRetryPolicy.cs index bb2056e25d..b4fa5862cc 100644 --- a/Microsoft.Azure.Cosmos/src/ClientRetryPolicy.cs +++ b/Microsoft.Azure.Cosmos/src/ClientRetryPolicy.cs @@ -102,11 +102,9 @@ public async Task ShouldRetryAsync( // Today, the only scenario where we would receive a ServiceUnavailableException from the Throttling Retry Policy // is when we get 429 (TooManyRequests) with sub status code 3092 (System Resource Not Available). Note that this is applicable // for write requests targeted to a multiple master account. In such case, the 429/3092 will get converted into 503. - if (this.isMultiMasterWriteRegion.HasValue - && this.isMultiMasterWriteRegion.Value - && clientException.StatusCode.HasValue - && (int)clientException.StatusCode.Value == (int)StatusCodes.TooManyRequests - && clientException.GetSubStatus() == SubStatusCodes.SystemResourceUnavailable) + if (this.ShouldMarkEndpointUnavailableOnSystemResourceUnavailable( + clientException.StatusCode, + clientException.GetSubStatus())) { DefaultTrace.TraceError( "Operation will NOT be retried. Converting SystemResourceUnavailable (429/3092) to ServiceUnavailable (503). Status code: {0}, sub status code: {1}.", @@ -165,10 +163,9 @@ public async Task ShouldRetryAsync( // Today, the only scenario where we would receive a ServiceUnavailableException from the Throttling Retry Policy // is when we get 429 (TooManyRequests) with sub status code 3092 (System Resource Not Available). Note that this is applicable // for write requests targeted to a multiple master account. In such case, the 429/3092 will get converted into 503. - if (this.isMultiMasterWriteRegion.HasValue - && this.isMultiMasterWriteRegion.Value - && (int)cosmosResponseMessage.StatusCode == (int)StatusCodes.TooManyRequests - && cosmosResponseMessage?.Headers.SubStatusCode == SubStatusCodes.SystemResourceUnavailable) + if (this.ShouldMarkEndpointUnavailableOnSystemResourceUnavailable( + cosmosResponseMessage.StatusCode, + cosmosResponseMessage?.Headers.SubStatusCode)) { DefaultTrace.TraceError( "Operation will NOT be retried. Converting SystemResourceUnavailable (429/3092) to ServiceUnavailable (503). Status code: {0}, sub status code: {1}.", @@ -506,6 +503,17 @@ private ShouldRetryResult ShouldRetryOnServiceUnavailable() return ShouldRetryResult.RetryAfter(TimeSpan.Zero); } + private bool ShouldMarkEndpointUnavailableOnSystemResourceUnavailable( + HttpStatusCode? statusCode, + SubStatusCodes? subStatusCode) + { + return this.isMultiMasterWriteRegion.HasValue + && this.isMultiMasterWriteRegion.Value + && statusCode.HasValue + && (int)statusCode.Value == (int)StatusCodes.TooManyRequests + && subStatusCode == SubStatusCodes.SystemResourceUnavailable; + } + private sealed class RetryContext { public int RetryLocationIndex { get; set; } From 0c5304db66c61ee11821a66036974557dfe212c1 Mon Sep 17 00:00:00 2001 From: Debdatta Kunda Date: Sat, 14 Sep 2024 23:41:02 -0700 Subject: [PATCH 10/14] Code changes to clean up some cosmetic items. --- .../src/ClientRetryPolicy.cs | 29 +++++++++++++------ .../src/Routing/LocationCache.cs | 4 +-- 2 files changed, 22 insertions(+), 11 deletions(-) diff --git a/Microsoft.Azure.Cosmos/src/ClientRetryPolicy.cs b/Microsoft.Azure.Cosmos/src/ClientRetryPolicy.cs index b4fa5862cc..3027abb836 100644 --- a/Microsoft.Azure.Cosmos/src/ClientRetryPolicy.cs +++ b/Microsoft.Azure.Cosmos/src/ClientRetryPolicy.cs @@ -99,15 +99,17 @@ public async Task ShouldRetryAsync( if (exception is DocumentClientException clientException) { - // Today, the only scenario where we would receive a ServiceUnavailableException from the Throttling Retry Policy - // is when we get 429 (TooManyRequests) with sub status code 3092 (System Resource Not Available). Note that this is applicable - // for write requests targeted to a multiple master account. In such case, the 429/3092 will get converted into 503. + // Today, the only scenario where we would treat a throttling (429) exception as service unavailable is when we + // get 429 (TooManyRequests) with sub status code 3092 (System Resource Not Available). Note that this is applicable + // for write requests targeted to a multiple master account. In such case, the 429/3092 will be treated as 503. The + // reason to keep the code out of the throttling retry policy is that in the near future, the 3092 sub status code + // might not be a throttling scenario at all and the status code in that case would be different than 429. if (this.ShouldMarkEndpointUnavailableOnSystemResourceUnavailable( clientException.StatusCode, clientException.GetSubStatus())) { DefaultTrace.TraceError( - "Operation will NOT be retried. Converting SystemResourceUnavailable (429/3092) to ServiceUnavailable (503). Status code: {0}, sub status code: {1}.", + "Operation will NOT be retried. Treating SystemResourceUnavailable (429/3092) as ServiceUnavailable (503). Status code: {0}, sub status code: {1}.", StatusCodes.TooManyRequests, SubStatusCodes.SystemResourceUnavailable); return this.TryMarkEndpointUnavailableForPkRangeAndRetryOnServiceUnavailable( @@ -160,15 +162,17 @@ public async Task ShouldRetryAsync( return shouldRetryResult; } - // Today, the only scenario where we would receive a ServiceUnavailableException from the Throttling Retry Policy - // is when we get 429 (TooManyRequests) with sub status code 3092 (System Resource Not Available). Note that this is applicable - // for write requests targeted to a multiple master account. In such case, the 429/3092 will get converted into 503. + // Today, the only scenario where we would treat a throttling (429) exception as service unavailable is when we + // get 429 (TooManyRequests) with sub status code 3092 (System Resource Not Available). Note that this is applicable + // for write requests targeted to a multiple master account. In such case, the 429/3092 will be treated as 503. The + // reason to keep the code out of the throttling retry policy is that in the near future, the 3092 sub status code + // might not be a throttling scenario at all and the status code in that case would be different than 429. if (this.ShouldMarkEndpointUnavailableOnSystemResourceUnavailable( cosmosResponseMessage.StatusCode, cosmosResponseMessage?.Headers.SubStatusCode)) { DefaultTrace.TraceError( - "Operation will NOT be retried. Converting SystemResourceUnavailable (429/3092) to ServiceUnavailable (503). Status code: {0}, sub status code: {1}.", + "Operation will NOT be retried. Treating SystemResourceUnavailable (429/3092) as ServiceUnavailable (503). Status code: {0}, sub status code: {1}.", StatusCodes.TooManyRequests, SubStatusCodes.SystemResourceUnavailable); return this.TryMarkEndpointUnavailableForPkRangeAndRetryOnServiceUnavailable( @@ -211,7 +215,6 @@ public void OnBeforeSendRequest(DocumentServiceRequest request) // This enables marking the endpoint unavailability on endpoint failover/unreachability this.locationEndpoint = this.globalEndpointManager.ResolveServiceEndpoint(request); request.RequestContext.RouteToLocation(this.locationEndpoint); - this.throttlingRetry.OnBeforeSendRequest(request); } private async Task ShouldRetryInternalAsync( @@ -503,6 +506,14 @@ private ShouldRetryResult ShouldRetryOnServiceUnavailable() return ShouldRetryResult.RetryAfter(TimeSpan.Zero); } + /// + /// Returns a boolean flag indicating if the endpoint should be marked as unavailable + /// due to a 429 response with a sub status code of 3092 (system resource unavailable). + /// This is applicable for write requests targeted for multi master accounts. + /// + /// An instance of containing the status code. + /// An instance of containing the sub status code. + /// A boolean flag indicating is the endpoint should be marked as unavailable. private bool ShouldMarkEndpointUnavailableOnSystemResourceUnavailable( HttpStatusCode? statusCode, SubStatusCodes? subStatusCode) diff --git a/Microsoft.Azure.Cosmos/src/Routing/LocationCache.cs b/Microsoft.Azure.Cosmos/src/Routing/LocationCache.cs index c7567d6c44..cae7f46e7c 100644 --- a/Microsoft.Azure.Cosmos/src/Routing/LocationCache.cs +++ b/Microsoft.Azure.Cosmos/src/Routing/LocationCache.cs @@ -534,8 +534,8 @@ public bool ShouldRefreshEndpoints(out bool canRefreshInBackground) public bool CanUseMultipleWriteLocations(DocumentServiceRequest request) { - return this.CanUseMultipleWriteLocations() - && (request.ResourceType == ResourceType.Document || + return this.CanUseMultipleWriteLocations() && + (request.ResourceType == ResourceType.Document || (request.ResourceType == ResourceType.StoredProcedure && request.OperationType == Documents.OperationType.ExecuteJavaScript)); } From ce3c62fcd48075808933ca46a825b0ac8a4636a0 Mon Sep 17 00:00:00 2001 From: Debdatta Kunda Date: Sat, 14 Sep 2024 23:42:39 -0700 Subject: [PATCH 11/14] Further clean up. --- Microsoft.Azure.Cosmos/src/ClientRetryPolicy.cs | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/Microsoft.Azure.Cosmos/src/ClientRetryPolicy.cs b/Microsoft.Azure.Cosmos/src/ClientRetryPolicy.cs index 3027abb836..80aeade7b3 100644 --- a/Microsoft.Azure.Cosmos/src/ClientRetryPolicy.cs +++ b/Microsoft.Azure.Cosmos/src/ClientRetryPolicy.cs @@ -35,7 +35,7 @@ internal sealed class ClientRetryPolicy : IDocumentClientRetryPolicy private int serviceUnavailableRetryCount; private bool isReadRequest; private bool canUseMultipleWriteLocations; - private bool? isMultiMasterWriteRegion; + private bool isMultiMasterWriteRegion; private Uri locationEndpoint; private RetryContext retryContext; private DocumentServiceRequest documentServiceRequest; @@ -518,8 +518,7 @@ private bool ShouldMarkEndpointUnavailableOnSystemResourceUnavailable( HttpStatusCode? statusCode, SubStatusCodes? subStatusCode) { - return this.isMultiMasterWriteRegion.HasValue - && this.isMultiMasterWriteRegion.Value + return this.isMultiMasterWriteRegion && statusCode.HasValue && (int)statusCode.Value == (int)StatusCodes.TooManyRequests && subStatusCode == SubStatusCodes.SystemResourceUnavailable; From 973f1f18f4a4d9d394d4c634756a9c63a593bb22 Mon Sep 17 00:00:00 2001 From: Debdatta Kunda Date: Mon, 16 Sep 2024 17:02:59 -0700 Subject: [PATCH 12/14] Code changes to address review comments. --- Microsoft.Azure.Cosmos/src/ClientRetryPolicy.cs | 8 ++++---- .../src/Routing/GlobalEndpointManager.cs | 6 ++++-- Microsoft.Azure.Cosmos/src/Routing/LocationCache.cs | 2 +- 3 files changed, 9 insertions(+), 7 deletions(-) diff --git a/Microsoft.Azure.Cosmos/src/ClientRetryPolicy.cs b/Microsoft.Azure.Cosmos/src/ClientRetryPolicy.cs index 80aeade7b3..9966e43bc2 100644 --- a/Microsoft.Azure.Cosmos/src/ClientRetryPolicy.cs +++ b/Microsoft.Azure.Cosmos/src/ClientRetryPolicy.cs @@ -104,12 +104,12 @@ public async Task ShouldRetryAsync( // for write requests targeted to a multiple master account. In such case, the 429/3092 will be treated as 503. The // reason to keep the code out of the throttling retry policy is that in the near future, the 3092 sub status code // might not be a throttling scenario at all and the status code in that case would be different than 429. - if (this.ShouldMarkEndpointUnavailableOnSystemResourceUnavailable( + if (this.ShouldMarkEndpointUnavailableOnSystemResourceUnavailableForWrite( clientException.StatusCode, clientException.GetSubStatus())) { DefaultTrace.TraceError( - "Operation will NOT be retried. Treating SystemResourceUnavailable (429/3092) as ServiceUnavailable (503). Status code: {0}, sub status code: {1}.", + "Operation will NOT be retried on local region. Treating SystemResourceUnavailable (429/3092) as ServiceUnavailable (503). Status code: {0}, sub status code: {1}.", StatusCodes.TooManyRequests, SubStatusCodes.SystemResourceUnavailable); return this.TryMarkEndpointUnavailableForPkRangeAndRetryOnServiceUnavailable( @@ -167,7 +167,7 @@ public async Task ShouldRetryAsync( // for write requests targeted to a multiple master account. In such case, the 429/3092 will be treated as 503. The // reason to keep the code out of the throttling retry policy is that in the near future, the 3092 sub status code // might not be a throttling scenario at all and the status code in that case would be different than 429. - if (this.ShouldMarkEndpointUnavailableOnSystemResourceUnavailable( + if (this.ShouldMarkEndpointUnavailableOnSystemResourceUnavailableForWrite( cosmosResponseMessage.StatusCode, cosmosResponseMessage?.Headers.SubStatusCode)) { @@ -514,7 +514,7 @@ private ShouldRetryResult ShouldRetryOnServiceUnavailable() /// An instance of containing the status code. /// An instance of containing the sub status code. /// A boolean flag indicating is the endpoint should be marked as unavailable. - private bool ShouldMarkEndpointUnavailableOnSystemResourceUnavailable( + private bool ShouldMarkEndpointUnavailableOnSystemResourceUnavailableForWrite( HttpStatusCode? statusCode, SubStatusCodes? subStatusCode) { diff --git a/Microsoft.Azure.Cosmos/src/Routing/GlobalEndpointManager.cs b/Microsoft.Azure.Cosmos/src/Routing/GlobalEndpointManager.cs index 6edfe5e57b..3c0a0c7633 100644 --- a/Microsoft.Azure.Cosmos/src/Routing/GlobalEndpointManager.cs +++ b/Microsoft.Azure.Cosmos/src/Routing/GlobalEndpointManager.cs @@ -543,8 +543,10 @@ public virtual async Task RefreshLocationAsync(bool forceRefresh = false) /// A boolean flag indicating if the available write locations are more than one. public bool CanSupportMultipleWriteLocations(DocumentServiceRequest request) { - return this.CanUseMultipleWriteLocations(request) - && this.locationCache.GetAvailableWriteLocations()?.Count > 1; + return this.locationCache.CanUseMultipleWriteLocations() + && this.locationCache.GetAvailableWriteLocations()?.Count > 1 + && (request.ResourceType == ResourceType.Document || + (request.ResourceType == ResourceType.StoredProcedure && request.OperationType == OperationType.Execute)); } #pragma warning disable VSTHRD100 // Avoid async void methods diff --git a/Microsoft.Azure.Cosmos/src/Routing/LocationCache.cs b/Microsoft.Azure.Cosmos/src/Routing/LocationCache.cs index cae7f46e7c..cc108a722e 100644 --- a/Microsoft.Azure.Cosmos/src/Routing/LocationCache.cs +++ b/Microsoft.Azure.Cosmos/src/Routing/LocationCache.cs @@ -773,7 +773,7 @@ private ReadOnlyDictionary GetEndpointByLocation(IEnumerable(endpointsByLocation); } - private bool CanUseMultipleWriteLocations() + internal bool CanUseMultipleWriteLocations() { return this.useMultipleWriteLocations && this.enableMultipleWriteLocations; } From 40560ccd708973a6f668d4552539fffcc8760858 Mon Sep 17 00:00:00 2001 From: Debdatta Kunda Date: Mon, 16 Sep 2024 17:06:06 -0700 Subject: [PATCH 13/14] Minor refactor to address cosmetic update. --- Microsoft.Azure.Cosmos/src/ClientRetryPolicy.cs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Microsoft.Azure.Cosmos/src/ClientRetryPolicy.cs b/Microsoft.Azure.Cosmos/src/ClientRetryPolicy.cs index 9966e43bc2..786df66541 100644 --- a/Microsoft.Azure.Cosmos/src/ClientRetryPolicy.cs +++ b/Microsoft.Azure.Cosmos/src/ClientRetryPolicy.cs @@ -172,7 +172,7 @@ public async Task ShouldRetryAsync( cosmosResponseMessage?.Headers.SubStatusCode)) { DefaultTrace.TraceError( - "Operation will NOT be retried. Treating SystemResourceUnavailable (429/3092) as ServiceUnavailable (503). Status code: {0}, sub status code: {1}.", + "Operation will NOT be retried on local region. Treating SystemResourceUnavailable (429/3092) as ServiceUnavailable (503). Status code: {0}, sub status code: {1}.", StatusCodes.TooManyRequests, SubStatusCodes.SystemResourceUnavailable); return this.TryMarkEndpointUnavailableForPkRangeAndRetryOnServiceUnavailable( From a198a04a48f0d6866b6777bd70dbf171fe5ad2cb Mon Sep 17 00:00:00 2001 From: Debdatta Kunda Date: Tue, 17 Sep 2024 10:11:47 -0700 Subject: [PATCH 14/14] Code changes to address cosmetic review comment. --- Microsoft.Azure.Cosmos/src/ClientRetryPolicy.cs | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/Microsoft.Azure.Cosmos/src/ClientRetryPolicy.cs b/Microsoft.Azure.Cosmos/src/ClientRetryPolicy.cs index 786df66541..6b3cd3f882 100644 --- a/Microsoft.Azure.Cosmos/src/ClientRetryPolicy.cs +++ b/Microsoft.Azure.Cosmos/src/ClientRetryPolicy.cs @@ -35,7 +35,7 @@ internal sealed class ClientRetryPolicy : IDocumentClientRetryPolicy private int serviceUnavailableRetryCount; private bool isReadRequest; private bool canUseMultipleWriteLocations; - private bool isMultiMasterWriteRegion; + private bool isMultiMasterWriteRequest; private Uri locationEndpoint; private RetryContext retryContext; private DocumentServiceRequest documentServiceRequest; @@ -58,7 +58,7 @@ public ClientRetryPolicy( this.sessionTokenRetryCount = 0; this.serviceUnavailableRetryCount = 0; this.canUseMultipleWriteLocations = false; - this.isMultiMasterWriteRegion = false; + this.isMultiMasterWriteRequest = false; this.isPertitionLevelFailoverEnabled = isPertitionLevelFailoverEnabled; } @@ -192,7 +192,7 @@ public void OnBeforeSendRequest(DocumentServiceRequest request) this.isReadRequest = request.IsReadOnlyRequest; this.canUseMultipleWriteLocations = this.globalEndpointManager.CanUseMultipleWriteLocations(request); this.documentServiceRequest = request; - this.isMultiMasterWriteRegion = !this.isReadRequest + this.isMultiMasterWriteRequest = !this.isReadRequest && (this.globalEndpointManager?.CanSupportMultipleWriteLocations(request) ?? false); // clear previous location-based routing directive @@ -518,7 +518,7 @@ private bool ShouldMarkEndpointUnavailableOnSystemResourceUnavailableForWrite( HttpStatusCode? statusCode, SubStatusCodes? subStatusCode) { - return this.isMultiMasterWriteRegion + return this.isMultiMasterWriteRequest && statusCode.HasValue && (int)statusCode.Value == (int)StatusCodes.TooManyRequests && subStatusCode == SubStatusCodes.SystemResourceUnavailable;