Skip to content

Commit

Permalink
Report more details of unobtainable ShardLock (#61255)
Browse files Browse the repository at this point in the history
Today a common reason for a `ShardLockObtainFailedException` is when a
shard is removed from a node and then assigned straight back to it again
before the node has had a chance to shut the previous shard instance
down. For instance, this can happen if a node briefly leaves the cluster
holding a primary with no in-sync replicas.

The message in this case is typically as follows:

    obtaining shard lock timed out after 5000ms, previous lock details: [shard creation] trying to lock for [shard creation]

This is pretty hard to interpret, and doesn't raise the important
question: "why didn't the shard shut down sooner?"

With this change we reword the message a bit, report the age of the
shard lock, and adjust the details to report that the lock is held by a
closing shard:

    obtaining shard lock for [starting shard] timed out after [5000ms], lock already held for [closing shard] with age [12345ms]

Relates #38807
  • Loading branch information
DaveCTurner authored Aug 19, 2020
1 parent f4f3a0b commit 98213df
Show file tree
Hide file tree
Showing 4 changed files with 33 additions and 7 deletions.
22 changes: 17 additions & 5 deletions server/src/main/java/org/elasticsearch/env/NodeEnvironment.java
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,7 @@
import org.elasticsearch.common.Randomness;
import org.elasticsearch.common.SuppressForbidden;
import org.elasticsearch.common.UUIDs;
import org.elasticsearch.common.collect.Tuple;
import org.elasticsearch.common.io.FileSystemUtils;
import org.elasticsearch.common.lease.Releasable;
import org.elasticsearch.common.settings.Setting;
Expand Down Expand Up @@ -786,6 +787,11 @@ protected void closeInternal() {
shardLock.release();
logger.trace("released shard lock for [{}]", shardId);
}

@Override
public void setDetails(String details) {
shardLock.setDetails(details);
}
};
}

Expand Down Expand Up @@ -817,13 +823,13 @@ private final class InternalShardLock {
*/
private final Semaphore mutex = new Semaphore(1);
private int waitCount = 1; // guarded by shardLocks
private String lockDetails;
private final ShardId shardId;
private volatile Tuple<Long, String> lockDetails;

InternalShardLock(final ShardId shardId, final String details) {
this.shardId = shardId;
mutex.acquireUninterruptibly();
lockDetails = details;
lockDetails = Tuple.tuple(System.nanoTime(), details);
}

protected void release() {
Expand Down Expand Up @@ -854,17 +860,23 @@ private void decWaitCount() {
void acquire(long timeoutInMillis, final String details) throws ShardLockObtainFailedException {
try {
if (mutex.tryAcquire(timeoutInMillis, TimeUnit.MILLISECONDS)) {
lockDetails = details;
setDetails(details);
} else {
final Tuple<Long, String> lockDetails = this.lockDetails; // single volatile read
throw new ShardLockObtainFailedException(shardId,
"obtaining shard lock timed out after " + timeoutInMillis + "ms, previous lock details: [" + lockDetails +
"] trying to lock for [" + details + "]");
"obtaining shard lock for [" + details + "] timed out after [" + timeoutInMillis +
"ms], lock already held for [" + lockDetails.v2() + "] with age [" +
TimeUnit.NANOSECONDS.toMillis(System.nanoTime() - lockDetails.v1()) + "ms]");
}
} catch (InterruptedException e) {
Thread.currentThread().interrupt();
throw new ShardLockObtainFailedException(shardId, "thread interrupted while trying to obtain shard lock", e);
}
}

public void setDetails(String details) {
lockDetails = Tuple.tuple(System.nanoTime(), details);
}
}

public boolean hasNodeFile() {
Expand Down
9 changes: 8 additions & 1 deletion server/src/main/java/org/elasticsearch/env/ShardLock.java
Original file line number Diff line number Diff line change
Expand Up @@ -54,7 +54,14 @@ public final void close() {
}
}

protected abstract void closeInternal();
protected abstract void closeInternal();

/**
* Update the details of the holder of this lock. These details are displayed alongside a {@link ShardLockObtainFailedException}. Must
* only be called by the holder of this lock.
*/
public void setDetails(String details) {
}

@Override
public String toString() {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -406,7 +406,7 @@ public synchronized IndexShard createShard(
IndexShard indexShard = null;
ShardLock lock = null;
try {
lock = nodeEnv.shardLock(shardId, "shard creation", TimeUnit.SECONDS.toMillis(5));
lock = nodeEnv.shardLock(shardId, "starting shard", TimeUnit.SECONDS.toMillis(5));
eventListener.beforeIndexShardCreated(shardId, indexSettings);
ShardPath path;
try {
Expand Down Expand Up @@ -515,6 +515,9 @@ public synchronized void removeShard(int shardId, String reason) {
private void closeShard(String reason, ShardId sId, IndexShard indexShard, Store store, IndexEventListener listener) {
final int shardId = sId.id();
final Settings indexSettings = this.getIndexSettings().getSettings();
if (store != null) {
store.beforeClose();
}
try {
try {
listener.beforeIndexShardClosed(sId, indexShard, indexSettings);
Expand Down
4 changes: 4 additions & 0 deletions server/src/main/java/org/elasticsearch/index/store/Store.java
Original file line number Diff line number Diff line change
Expand Up @@ -677,6 +677,10 @@ public int refCount() {
return refCounter.refCount();
}

public void beforeClose() {
shardLock.setDetails("closing shard");
}

static final class StoreDirectory extends FilterDirectory {

private final Logger deletesLogger;
Expand Down

0 comments on commit 98213df

Please sign in to comment.