Skip to content

Commit

Permalink
fix cluster not able to spin up issue when disk usage exceeds thresho…
Browse files Browse the repository at this point in the history
…ld (#15258)

* fix cluster not able to spin up issue when disk usage exceeds threshold

Signed-off-by: zane-neo <zaniu@amazon.com>

* Add comment to changes

Signed-off-by: zane-neo <zaniu@amazon.com>

* Add UT to ensure the keepAliveThread starts before node starts

Signed-off-by: zane-neo <zaniu@amazon.com>

* remove unused imports

Signed-off-by: zane-neo <zaniu@amazon.com>

* Fix forbidden API calls check failed issue

Signed-off-by: zane-neo <zaniu@amazon.com>

* format code

Signed-off-by: zane-neo <zaniu@amazon.com>

* format code

Signed-off-by: zane-neo <zaniu@amazon.com>

* change setInstance method to static

Signed-off-by: zane-neo <zaniu@amazon.com>

* Add countdownlatch in test to coordinate the thread to avoid concureency issue caused test failure

Signed-off-by: zane-neo <zaniu@amazon.com>

---------

Signed-off-by: zane-neo <zaniu@amazon.com>
(cherry picked from commit 62081f2)
Signed-off-by: github-actions[bot] <github-actions[bot]@users.noreply.github.com>
  • Loading branch information
github-actions[bot] committed Oct 16, 2024
1 parent 40f3046 commit c57388d
Show file tree
Hide file tree
Showing 3 changed files with 63 additions and 2 deletions.
2 changes: 2 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -82,6 +82,8 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
- Fix protobuf-java leak through client library dependencies ([#16254](https://github.com/opensearch-project/OpenSearch/pull/16254))
- Fix multi-search with template doesn't return status code ([#16265](https://github.com/opensearch-project/OpenSearch/pull/16265))
- Fix wrong default value when setting `index.number_of_routing_shards` to null on index creation ([#16331](https://github.com/opensearch-project/OpenSearch/pull/16331))
- Fix disk usage exceeds threshold cluster can't spin up issue ([#15258](https://github.com/opensearch-project/OpenSearch/pull/15258)))


### Security

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -31,13 +31,15 @@

package org.opensearch.bootstrap;

import org.opensearch.common.logging.LogConfigurator;
import org.opensearch.common.settings.KeyStoreCommandTestCase;
import org.opensearch.common.settings.KeyStoreWrapper;
import org.opensearch.common.settings.SecureSettings;
import org.opensearch.common.settings.Settings;
import org.opensearch.common.util.io.IOUtils;
import org.opensearch.core.common.settings.SecureString;
import org.opensearch.env.Environment;
import org.opensearch.node.Node;
import org.opensearch.test.OpenSearchTestCase;
import org.junit.After;
import org.junit.Before;
Expand All @@ -51,8 +53,14 @@
import java.nio.file.Path;
import java.util.ArrayList;
import java.util.List;
import java.util.concurrent.CountDownLatch;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.atomic.AtomicInteger;

import static org.hamcrest.Matchers.equalTo;
import static org.mockito.Mockito.doAnswer;
import static org.mockito.Mockito.mock;
import static org.mockito.Mockito.verify;

public class BootstrapTests extends OpenSearchTestCase {
Environment env;
Expand Down Expand Up @@ -131,4 +139,38 @@ private void assertPassphraseRead(String source, String expected) {
}
}

public void testInitExecutionOrder() throws Exception {
AtomicInteger order = new AtomicInteger(0);
CountDownLatch countDownLatch = new CountDownLatch(1);
Thread mockThread = new Thread(() -> {
assertEquals(0, order.getAndIncrement());
countDownLatch.countDown();
});

Node mockNode = mock(Node.class);
doAnswer(invocation -> {
try {
boolean threadStarted = countDownLatch.await(1000, TimeUnit.MILLISECONDS);
assertTrue(
"Waited for one second but the keepAliveThread isn't started, please check the execution order of"
+ "keepAliveThread.start and node.start",
threadStarted
);
} catch (InterruptedException e) {
fail("Thread interrupted");
}
assertEquals(1, order.getAndIncrement());
return null;
}).when(mockNode).start();

LogConfigurator.registerErrorListener();
Bootstrap testBootstrap = new Bootstrap(mockThread, mockNode);
Bootstrap.setInstance(testBootstrap);

Bootstrap.startInstance(testBootstrap);

verify(mockNode).start();
assertEquals(2, order.get());
}

}
21 changes: 19 additions & 2 deletions server/src/main/java/org/opensearch/bootstrap/Bootstrap.java
Original file line number Diff line number Diff line change
Expand Up @@ -93,6 +93,17 @@ final class Bootstrap {
private final Thread keepAliveThread;
private final Spawner spawner = new Spawner();

// For testing purpose
static void setInstance(Bootstrap bootstrap) {
INSTANCE = bootstrap;
}

// For testing purpose
Bootstrap(Thread keepAliveThread, Node node) {
this.keepAliveThread = keepAliveThread;
this.node = node;
}

/** creates a new instance */
Bootstrap() {
keepAliveThread = new Thread(new Runnable() {
Expand Down Expand Up @@ -336,8 +347,10 @@ private static Environment createEnvironment(
}

private void start() throws NodeValidationException {
node.start();
// keepAliveThread should start first than node to ensure the cluster can spin up successfully in edge cases:
// https://github.com/opensearch-project/OpenSearch/issues/14791
keepAliveThread.start();
node.start();
}

static void stop() throws IOException {
Expand Down Expand Up @@ -410,7 +423,7 @@ static void init(final boolean foreground, final Path pidFile, final boolean qui
throw new BootstrapException(e);
}

INSTANCE.start();
startInstance(INSTANCE);

Check warning on line 426 in server/src/main/java/org/opensearch/bootstrap/Bootstrap.java

View check run for this annotation

Codecov / codecov/patch

server/src/main/java/org/opensearch/bootstrap/Bootstrap.java#L426

Added line #L426 was not covered by tests

// We don't close stderr if `--quiet` is passed, because that
// hides fatal startup errors. For example, if OpenSearch is
Expand Down Expand Up @@ -462,6 +475,10 @@ static void init(final boolean foreground, final Path pidFile, final boolean qui
}
}

static void startInstance(Bootstrap instance) throws NodeValidationException {
instance.start();
}

@SuppressForbidden(reason = "System#out")
private static void closeSystOut() {
System.out.close();
Expand Down

0 comments on commit c57388d

Please sign in to comment.