Skip to content

Commit 04fff47

Browse files
author
Bhumika Sharma
committed
Add node-left metric
Signed-off-by: Bhumika Sharma <bhumikka@amazon.com>
1 parent d8e2680 commit 04fff47

File tree

3 files changed

+32
-4
lines changed

3 files changed

+32
-4
lines changed

server/src/main/java/org/opensearch/cluster/ClusterManagerMetrics.java

Lines changed: 4 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -23,6 +23,8 @@
2323
*/
2424
public final class ClusterManagerMetrics {
2525

26+
public static final String NODE_ID_TAG = "node_id";
27+
public static final String REASON_TAG = "reason";
2628
private static final String LATENCY_METRIC_UNIT_MS = "ms";
2729
private static final String COUNTER_METRICS_UNIT = "1";
2830

@@ -36,6 +38,7 @@ public final class ClusterManagerMetrics {
3638
public final Counter followerChecksFailureCounter;
3739
public final Counter asyncFetchFailureCounter;
3840
public final Counter asyncFetchSuccessCounter;
41+
public final Counter nodeLeftCounter;
3942

4043
public ClusterManagerMetrics(MetricsRegistry metricsRegistry) {
4144
clusterStateAppliersHistogram = metricsRegistry.createHistogram(
@@ -83,7 +86,7 @@ public ClusterManagerMetrics(MetricsRegistry metricsRegistry) {
8386
"Counter for number of successful async fetches",
8487
COUNTER_METRICS_UNIT
8588
);
86-
89+
nodeLeftCounter = metricsRegistry.createCounter("node.left.count", "Counter for node left operation", COUNTER_METRICS_UNIT);
8790
}
8891

8992
public void recordLatency(Histogram histogram, Double value) {

server/src/main/java/org/opensearch/cluster/coordination/Coordinator.java

Lines changed: 21 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -90,6 +90,7 @@
9090
import org.opensearch.monitor.NodeHealthService;
9191
import org.opensearch.monitor.StatusInfo;
9292
import org.opensearch.node.remotestore.RemoteStoreNodeService;
93+
import org.opensearch.telemetry.metrics.tags.Tags;
9394
import org.opensearch.threadpool.Scheduler;
9495
import org.opensearch.threadpool.ThreadPool.Names;
9596
import org.opensearch.transport.TransportService;
@@ -111,6 +112,12 @@
111112
import java.util.stream.Stream;
112113
import java.util.stream.StreamSupport;
113114

115+
import static org.opensearch.cluster.ClusterManagerMetrics.NODE_ID_TAG;
116+
import static org.opensearch.cluster.ClusterManagerMetrics.REASON_TAG;
117+
import static org.opensearch.cluster.coordination.FollowersChecker.NODE_LEFT_REASON_DISCONNECTED;
118+
import static org.opensearch.cluster.coordination.FollowersChecker.NODE_LEFT_REASON_FOLLOWER_CHECK_RETRY_FAIL;
119+
import static org.opensearch.cluster.coordination.FollowersChecker.NODE_LEFT_REASON_HEALTHCHECK_FAIL;
120+
import static org.opensearch.cluster.coordination.FollowersChecker.NODE_LEFT_REASON_LAGGING;
114121
import static org.opensearch.cluster.coordination.NoClusterManagerBlockService.NO_CLUSTER_MANAGER_BLOCK_ID;
115122
import static org.opensearch.cluster.decommission.DecommissionHelper.nodeCommissioned;
116123
import static org.opensearch.gateway.ClusterStateUpdaters.hideStateIfNotRecovered;
@@ -193,6 +200,7 @@ public class Coordinator extends AbstractLifecycleComponent implements Discovery
193200
private final RemoteStoreNodeService remoteStoreNodeService;
194201
private NodeConnectionsService nodeConnectionsService;
195202
private final ClusterSettings clusterSettings;
203+
private final ClusterManagerMetrics clusterManagerMetrics;
196204

197205
/**
198206
* @param nodeName The name of the node, used to name the {@link java.util.concurrent.ExecutorService} of the {@link SeedHostsResolver}.
@@ -250,6 +258,7 @@ public Coordinator(
250258
this.publishTimeout = PUBLISH_TIMEOUT_SETTING.get(settings);
251259
clusterSettings.addSettingsUpdateConsumer(PUBLISH_TIMEOUT_SETTING, this::setPublishTimeout);
252260
this.publishInfoTimeout = PUBLISH_INFO_TIMEOUT_SETTING.get(settings);
261+
this.clusterManagerMetrics = clusterManagerMetrics;
253262
this.random = random;
254263
this.electionSchedulerFactory = new ElectionSchedulerFactory(settings, random, transportService.getThreadPool());
255264
this.preVoteCollector = new PreVoteCollector(
@@ -359,6 +368,18 @@ private void removeNode(DiscoveryNode discoveryNode, String reason) {
359368
nodeRemovalExecutor,
360369
nodeRemovalExecutor
361370
);
371+
String reasonToPublish = switch (reason) {
372+
case NODE_LEFT_REASON_DISCONNECTED -> "disconnected";
373+
case NODE_LEFT_REASON_LAGGING -> "lagging";
374+
case NODE_LEFT_REASON_FOLLOWER_CHECK_RETRY_FAIL -> "follower.check.fail";
375+
case NODE_LEFT_REASON_HEALTHCHECK_FAIL -> "health.check.fail";
376+
default -> reason;
377+
};
378+
clusterManagerMetrics.incrementCounter(
379+
clusterManagerMetrics.nodeLeftCounter,
380+
1.0,
381+
Optional.ofNullable(Tags.create().addTag(NODE_ID_TAG, discoveryNode.getId()).addTag(REASON_TAG, reasonToPublish))
382+
);
362383
}
363384
}
364385
}

server/src/main/java/org/opensearch/cluster/coordination/FollowersChecker.java

Lines changed: 7 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -86,6 +86,10 @@ public class FollowersChecker {
8686
private static final Logger logger = LogManager.getLogger(FollowersChecker.class);
8787

8888
public static final String FOLLOWER_CHECK_ACTION_NAME = "internal:coordination/fault_detection/follower_check";
89+
public static final String NODE_LEFT_REASON_LAGGING = "lagging";
90+
public static final String NODE_LEFT_REASON_DISCONNECTED = "disconnected";
91+
public static final String NODE_LEFT_REASON_HEALTHCHECK_FAIL = "health check failed";
92+
public static final String NODE_LEFT_REASON_FOLLOWER_CHECK_RETRY_FAIL = "followers check retry count exceeded";
8993

9094
// the time between checks sent to each node
9195
public static final Setting<TimeValue> FOLLOWER_CHECK_INTERVAL_SETTING = Setting.timeSetting(
@@ -398,13 +402,13 @@ public void handleException(TransportException exp) {
398402
final String reason;
399403
if (exp instanceof ConnectTransportException || exp.getCause() instanceof ConnectTransportException) {
400404
logger.info(() -> new ParameterizedMessage("{} disconnected", FollowerChecker.this), exp);
401-
reason = "disconnected";
405+
reason = NODE_LEFT_REASON_DISCONNECTED;
402406
} else if (exp.getCause() instanceof NodeHealthCheckFailureException) {
403407
logger.info(() -> new ParameterizedMessage("{} health check failed", FollowerChecker.this), exp);
404-
reason = "health check failed";
408+
reason = NODE_LEFT_REASON_HEALTHCHECK_FAIL;
405409
} else if (failureCountSinceLastSuccess >= followerCheckRetryCount) {
406410
logger.info(() -> new ParameterizedMessage("{} failed too many times", FollowerChecker.this), exp);
407-
reason = "followers check retry count exceeded";
411+
reason = NODE_LEFT_REASON_FOLLOWER_CHECK_RETRY_FAIL;
408412
} else {
409413
logger.info(() -> new ParameterizedMessage("{} failed, retrying", FollowerChecker.this), exp);
410414
scheduleNextWakeUp();

0 commit comments

Comments
 (0)