Skip to content

Commit

Permalink
BugFix: Race condition between run() and timeout
Browse files Browse the repository at this point in the history
This uses compareAndSet to synchronize timeout vs run() completion logic when there is a race.

I have not been able to to replicate the issue reported at Netflix#188 but I can see how the existing code is vulnerable to a race condition. This theoretically should fix it. All unit tests pass for existing functionality. All my attempts to create a unit test to reveal the issue failed.
  • Loading branch information
benjchristensen committed Oct 1, 2013
1 parent c73ab84 commit 35fb8ce
Showing 1 changed file with 21 additions and 14 deletions.
35 changes: 21 additions & 14 deletions hystrix-core/src/main/java/com/netflix/hystrix/HystrixCommand.java
Original file line number Diff line number Diff line change
Expand Up @@ -108,10 +108,12 @@ public abstract class HystrixCommand<R> implements HystrixExecutable<R> {
private volatile ExecutionResult executionResult = ExecutionResult.EMPTY;

/* If this command executed and timed-out */
private final AtomicBoolean isCommandTimedOut = new AtomicBoolean(false);
private final AtomicReference<TimedOutStatus> isCommandTimedOut = new AtomicReference<TimedOutStatus>(TimedOutStatus.NOT_EXECUTED);
private final AtomicBoolean isExecutionComplete = new AtomicBoolean(false);
private final AtomicBoolean isExecutedInThread = new AtomicBoolean(false);

private static enum TimedOutStatus {NOT_EXECUTED, EXECUTED, TIMED_OUT};

private final HystrixCommandKey commandKey;
private final HystrixCommandGroupKey commandGroup;

Expand Down Expand Up @@ -1008,7 +1010,9 @@ public Subscription onSubscribe(final Observer<? super R> observer) {

@Override
public void tick() {
if (originalCommand.isCommandTimedOut.compareAndSet(false, true)) {
// if we can go from NOT_EXECUTED to TIMED_OUT then we do the timeout codepath
// otherwise it means we lost a race and the run() execution completed
if (originalCommand.isCommandTimedOut.compareAndSet(TimedOutStatus.NOT_EXECUTED, TimedOutStatus.TIMED_OUT)) {
// do fallback logic

// report timeout failure
Expand Down Expand Up @@ -1163,20 +1167,23 @@ public R call() throws Exception {

// execute the command
R r = executeCommand();
if (isCommandTimedOut.get()) {
// if we can go from NOT_EXECUTED to EXECUTED then we did not timeout
if (isCommandTimedOut.compareAndSet(TimedOutStatus.NOT_EXECUTED, TimedOutStatus.EXECUTED)) {
// give the hook an opportunity to modify it
r = executionHook.onComplete(_this, r);
// pass to the observer
observer.onNext(r);
// state changes before termination
preTerminationWork();
/* now complete which releases the consumer */
observer.onCompleted();
return r;
} else {
// this means we lost the race and the timeout logic has or is being executed
// state changes before termination
preTerminationWork();
return null;
}
// give the hook an opportunity to modify it
r = executionHook.onComplete(_this, r);
// pass to the observer
observer.onNext(r);
// state changes before termination
preTerminationWork();
/* now complete which releases the consumer */
observer.onCompleted();
return r;
} finally {
// pop this off the thread now that it's done
Hystrix.endCurrentThreadExecutingCommand();
Expand Down Expand Up @@ -1254,7 +1261,7 @@ private R executeCommand() {
long duration = System.currentTimeMillis() - startTime;
metrics.addCommandExecutionTime(duration);

if (isCommandTimedOut.get()) {
if (isCommandTimedOut.get() == TimedOutStatus.TIMED_OUT) {
// the command timed out in the wrapping thread so we will return immediately
// and not increment any of the counters below or other such logic
return null;
Expand Down Expand Up @@ -1296,7 +1303,7 @@ private R executeCommand() {
logger.warn("Error calling ExecutionHook.endRunFailure", hookException);
}

if (isCommandTimedOut.get()) {
if (isCommandTimedOut.get() == TimedOutStatus.TIMED_OUT) {
// http://jira/browse/API-4905 HystrixCommand: Error/Timeout Double-count if both occur
// this means we have already timed out then we don't count this error stat and we just return
// as this means the user-thread has already returned, we've already done fallback logic
Expand Down

0 comments on commit 35fb8ce

Please sign in to comment.