Azure Storage: Abandon prefetched orchestrator messages if the lease is lost #360
Changes from 1 commit
@@ -21,11 +21,14 @@ namespace DurableTask.AzureStorage.Tests
    using System.Threading.Tasks;
    using DurableTask.AzureStorage.Messaging;
    using DurableTask.AzureStorage.Monitoring;
    using DurableTask.AzureStorage.Partitioning;
    using DurableTask.AzureStorage.Tracking;
    using DurableTask.Core;
    using DurableTask.Core.History;
    using Microsoft.VisualStudio.TestTools.UnitTesting;
    using Microsoft.WindowsAzure.Storage;
    using Microsoft.WindowsAzure.Storage.Blob;
    using Microsoft.WindowsAzure.Storage.Queue;
    using Microsoft.WindowsAzure.Storage.Table;

    /// <summary>
@@ -172,11 +175,11 @@ public async Task<List<IListBlobItem>> ListBlobsAsync(CloudBlobDirectory client)
            return results;
        }

        /// <summary>
        /// REQUIREMENT: Workers can be added or removed at any time and control-queue partitions are load-balanced automatically.
        /// REQUIREMENT: No two workers will ever process the same control queue.
        /// </summary>
        [TestMethod]
        public async Task MultiWorkerLeaseMovement()
        {
            const int MaxWorkerCount = 4;
@@ -371,6 +374,75 @@ public async Task TestInstanceAndMessageDistribution()
            }
        }

        /// <summary>
        /// If a partition is lost, verify that all pre-fetched messages associated
        /// with that partition are abandoned and not processed.
        /// </summary>
        [TestMethod]
        public async Task PartitionLost_AbandonPrefetchedSession()
        {
            var settings = new AzureStorageOrchestrationServiceSettings()
            {
                PartitionCount = 1,
                LeaseRenewInterval = TimeSpan.FromMilliseconds(500),
                TaskHubName = TestHelpers.GetTestTaskHubName(),
                StorageConnectionString = TestHelpers.GetTestStorageAccountConnectionString(),
                ControlQueueBufferThreshold = 100,
            };

            // STEP 1: Start up the service and queue up a large number of messages
            var service = new AzureStorageOrchestrationService(settings);
            await service.CreateAsync();
            await service.StartAsync();

            // These instance IDs are set up specifically to bypass message validation logic
            // that might otherwise discard these messages as out-of-order, invalid, etc.
            var sourceInstance = new OrchestrationInstance();
            var targetInstance = new OrchestrationInstance { InstanceId = "@counter@xyz" };

            await TestHelpers.WaitFor(
                condition: () => service.OwnedControlQueues.Any(),
                timeout: TimeSpan.FromSeconds(10));
            ControlQueue controlQueue = service.OwnedControlQueues.Single();

            List<TaskMessage> messages = Enumerable.Range(0, 100).Select(i => new TaskMessage
            {
                Event = new EventRaisedEvent(-1, null),
                SequenceNumber = i,
                OrchestrationInstance = targetInstance,
            }).ToList();

            await messages.ParallelForEachAsync(
                maxConcurrency: 50,
                action: msg => controlQueue.AddMessageAsync(msg, sourceInstance));

            // STEP 2: Force the lease to be stolen and wait for the lease status to update.
            // The orchestration service should detect this and update its state.
            BlobLease lease = (await service.ListBlobLeasesAsync()).Single();
            await lease.Blob.ChangeLeaseAsync(
                proposedLeaseId: Guid.NewGuid().ToString(),
                accessCondition: AccessCondition.GenerateLeaseCondition(lease.Token));
            await TestHelpers.WaitFor(
                condition: () => !service.OwnedControlQueues.Any(),
                timeout: TimeSpan.FromSeconds(10));

            // Small additional delay to account for tiny race condition.
Review comment: Is the race condition that we may have gotten rid of the lease but not yet abandoned our work items yet?

Reply: No, the work item is abandoned as part of the …
            await Task.Delay(250);

            // STEP 3: Try to get an orchestration work item - a null value should be returned
            // because the lease was lost.
            var workItem = await service.LockNextTaskOrchestrationWorkItemAsync(
                TimeSpan.FromMinutes(5),
                CancellationToken.None);
            Assert.IsNull(workItem);

            // STEP 4: Verify that all the enqueued messages were abandoned, i.e. put back
            // onto the queue with their dequeue counts incremented.
            IEnumerable<CloudQueueMessage> queueMessages =
                await controlQueue.InnerQueue.PeekMessagesAsync(settings.ControlQueueBatchSize);
            Assert.IsTrue(queueMessages.All(msg => msg.DequeueCount == 1));
        }

        [TestMethod]
        public async Task MonitorIdleTaskHubDisconnected()
        {
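The new test above leans on TestHelpers.WaitFor to poll for lease ownership changes. That helper is not part of this diff; a minimal polling helper with the same shape might look like the sketch below (the actual implementation in TestHelpers may differ).

        // Minimal sketch of a polling wait helper; name and interval are assumptions.
        static async Task WaitFor(Func<bool> condition, TimeSpan timeout)
        {
            var stopwatch = System.Diagnostics.Stopwatch.StartNew();
            while (!condition())
            {
                if (stopwatch.Elapsed > timeout)
                {
                    throw new TimeoutException("Condition was not met within the allotted time.");
                }

                // A coarse polling interval is enough for the second-level timeouts used above.
                await Task.Delay(100);
            }
        }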
@@ -610,6 +610,13 @@ public async Task<TaskOrchestrationWorkItem> LockNextTaskOrchestrationWorkItemAsync(
                return null;
            }

            // Make sure we still own the partition. If not, abandon the session.
            if (session.ControlQueue.IsReleased)
            {
                await this.AbandonAndReleaseSessionAsync(session);
                return null;
            }
Review comment: This is the relevant change. It turned out that we were already setting …

            session.StartNewLogicalTraceScope();

            List<MessageData> outOfOrderMessages = null;
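For context on what abandoning a prefetched message means at the storage level: each prefetched control-queue message has already been dequeued once (so its DequeueCount is 1) and is temporarily invisible. Abandoning it makes it visible again without deleting it, so the next lease owner can process it. In the WindowsAzure.Storage API that boils down to roughly the sketch below; the service's real abandon path may add details such as a visibility back-off.

            // Sketch only: return a previously dequeued message to the queue so another
            // worker can pick it up. The message keeps the DequeueCount from its earlier dequeue.
            static Task AbandonMessageAsync(CloudQueue queue, CloudQueueMessage message)
            {
                // A zero visibility timeout makes the message available again immediately.
                return queue.UpdateMessageAsync(message, TimeSpan.Zero, MessageUpdateFields.Visibility);
            }

This is also why STEP 4 of the test asserts that every peeked message has DequeueCount == 1: the messages were prefetched exactly once and then put back rather than deleted.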
@@ -221,6 +221,7 @@ public override Task DeleteMessageAsync(MessageData message, SessionBase session
        public void Release()
        {
            this.releaseTokenSource.Cancel();
            this.IsReleased = true;
Review comment: Would love to see a comment that explains this in the code, as it may confuse us in the future otherwise.

Reply: Sure, I can add that.
        }

        public virtual void Dispose()
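The explanatory comment agreed to in that exchange is not part of this commit yet. A wording along these lines would capture the intent; it is a suggestion, not the author's actual follow-up.

        public void Release()
        {
            this.releaseTokenSource.Cancel();

            // Suggested wording (not the author's): mark this control queue as released
            // so that a caller still holding a prefetched work item for the partition can
            // detect that the lease was lost and abandon the work item instead of
            // processing it.
            this.IsReleased = true;
        }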
@@ -179,9 +179,9 @@ await this.storageQueue.AddMessageAsync(
            // We assume that auto-started orchestrations (i.e. instance ids starting with '@')
            // are used exclusively by durable entities; so we can follow
            // a custom naming convention to pass a time parameter.
-           var eventName = eventRaisedEvent.Name;
-           if (eventName.Length >= 3 && eventName[2] == '@'
-               && DateTime.TryParse(eventRaisedEvent.Name.Substring(3), out var scheduledTime))
+           string eventName = eventRaisedEvent.Name;
+           if (eventName != null && eventName.Length >= 3 && eventName[2] == '@'
+               && DateTime.TryParse(eventName.Substring(3), out DateTime scheduledTime))
Review comment: This is a fix for a null-ref issue I observed when developing my test. I don't think we would hit it under normal circumstances with Durable Entities since users don't control the …
            {
                initialVisibilityDelay = scheduledTime.ToUniversalTime() - DateTime.UtcNow;
                if (initialVisibilityDelay < TimeSpan.Zero)
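To illustrate the convention being parsed: the comment above says Durable Entities encodes a delivery time into the raised-event name. Assuming an event name of the form "op@<ISO-8601 time>" (the two-character "op" prefix is an assumption here, not confirmed by this diff), the guarded code resolves the visibility delay like this:

            // Hypothetical event name following the "<two-char prefix>@<ISO-8601 time>" convention.
            string eventName = "op@2021-05-01T12:00:00Z";

            if (eventName != null && eventName.Length >= 3 && eventName[2] == '@'
                && DateTime.TryParse(eventName.Substring(3), out DateTime scheduledTime))
            {
                // Deliver the message at the scheduled time; a negative delay means "deliver now".
                TimeSpan initialVisibilityDelay = scheduledTime.ToUniversalTime() - DateTime.UtcNow;
                if (initialVisibilityDelay < TimeSpan.Zero)
                {
                    initialVisibilityDelay = TimeSpan.Zero;
                }
            }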
Review comment: Like other tests in this class, we're testing methods on AzureStorageOrchestrationService rather than going through TaskHubWorker. We're still going through Azure Storage, but we have a bit more control using this test approach.
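For contrast with the approach described in that comment, driving the same service through DurableTask.Core's TaskHubWorker would look roughly like the sketch below; CounterOrchestration is a placeholder orchestration type, not something defined in this PR. The test instead calls the IOrchestrationService methods directly so it can control exactly when a work item is locked.

            // Sketch of the higher-level TaskHubWorker approach the test deliberately avoids.
            var service = new AzureStorageOrchestrationService(settings);
            await service.CreateIfNotExistsAsync();

            var worker = new TaskHubWorker(service);
            worker.AddTaskOrchestrations(typeof(CounterOrchestration));  // placeholder type
            await worker.StartAsync();

            // ... drive the scenario with a TaskHubClient rather than raw queue messages ...

            await worker.StopAsync();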