From 35fb8ce1455139d1213290cadfadda017c145018 Mon Sep 17 00:00:00 2001 From: Ben Christensen Date: Tue, 1 Oct 2013 13:42:41 -0700 Subject: [PATCH] BugFix: Race condition between run() and timeout This uses compareAndSet to synchronize timeout vs run() completion logic when there is a race. I have not been able to to replicate the issue reported at https://github.com/Netflix/Hystrix/issues/188 but I can see how the existing code is vulnerable to a race condition. This theoretically should fix it. All unit tests pass for existing functionality. All my attempts to create a unit test to reveal the issue failed. --- .../com/netflix/hystrix/HystrixCommand.java | 35 +++++++++++-------- 1 file changed, 21 insertions(+), 14 deletions(-) diff --git a/hystrix-core/src/main/java/com/netflix/hystrix/HystrixCommand.java b/hystrix-core/src/main/java/com/netflix/hystrix/HystrixCommand.java index 27430df17..d6ed64065 100755 --- a/hystrix-core/src/main/java/com/netflix/hystrix/HystrixCommand.java +++ b/hystrix-core/src/main/java/com/netflix/hystrix/HystrixCommand.java @@ -108,10 +108,12 @@ public abstract class HystrixCommand implements HystrixExecutable { private volatile ExecutionResult executionResult = ExecutionResult.EMPTY; /* If this command executed and timed-out */ - private final AtomicBoolean isCommandTimedOut = new AtomicBoolean(false); + private final AtomicReference isCommandTimedOut = new AtomicReference(TimedOutStatus.NOT_EXECUTED); private final AtomicBoolean isExecutionComplete = new AtomicBoolean(false); private final AtomicBoolean isExecutedInThread = new AtomicBoolean(false); + private static enum TimedOutStatus {NOT_EXECUTED, EXECUTED, TIMED_OUT}; + private final HystrixCommandKey commandKey; private final HystrixCommandGroupKey commandGroup; @@ -1008,7 +1010,9 @@ public Subscription onSubscribe(final Observer observer) { @Override public void tick() { - if (originalCommand.isCommandTimedOut.compareAndSet(false, true)) { + // if we can go from NOT_EXECUTED to TIMED_OUT then we do the timeout codepath + // otherwise it means we lost a race and the run() execution completed + if (originalCommand.isCommandTimedOut.compareAndSet(TimedOutStatus.NOT_EXECUTED, TimedOutStatus.TIMED_OUT)) { // do fallback logic // report timeout failure @@ -1163,20 +1167,23 @@ public R call() throws Exception { // execute the command R r = executeCommand(); - if (isCommandTimedOut.get()) { + // if we can go from NOT_EXECUTED to EXECUTED then we did not timeout + if (isCommandTimedOut.compareAndSet(TimedOutStatus.NOT_EXECUTED, TimedOutStatus.EXECUTED)) { + // give the hook an opportunity to modify it + r = executionHook.onComplete(_this, r); + // pass to the observer + observer.onNext(r); + // state changes before termination + preTerminationWork(); + /* now complete which releases the consumer */ + observer.onCompleted(); + return r; + } else { + // this means we lost the race and the timeout logic has or is being executed // state changes before termination preTerminationWork(); return null; } - // give the hook an opportunity to modify it - r = executionHook.onComplete(_this, r); - // pass to the observer - observer.onNext(r); - // state changes before termination - preTerminationWork(); - /* now complete which releases the consumer */ - observer.onCompleted(); - return r; } finally { // pop this off the thread now that it's done Hystrix.endCurrentThreadExecutingCommand(); @@ -1254,7 +1261,7 @@ private R executeCommand() { long duration = System.currentTimeMillis() - startTime; metrics.addCommandExecutionTime(duration); - if (isCommandTimedOut.get()) { + if (isCommandTimedOut.get() == TimedOutStatus.TIMED_OUT) { // the command timed out in the wrapping thread so we will return immediately // and not increment any of the counters below or other such logic return null; @@ -1296,7 +1303,7 @@ private R executeCommand() { logger.warn("Error calling ExecutionHook.endRunFailure", hookException); } - if (isCommandTimedOut.get()) { + if (isCommandTimedOut.get() == TimedOutStatus.TIMED_OUT) { // http://jira/browse/API-4905 HystrixCommand: Error/Timeout Double-count if both occur // this means we have already timed out then we don't count this error stat and we just return // as this means the user-thread has already returned, we've already done fallback logic