|
90 | 90 | import org.opensearch.monitor.NodeHealthService; |
91 | 91 | import org.opensearch.monitor.StatusInfo; |
92 | 92 | import org.opensearch.node.remotestore.RemoteStoreNodeService; |
| 93 | +import org.opensearch.telemetry.metrics.tags.Tags; |
93 | 94 | import org.opensearch.threadpool.Scheduler; |
94 | 95 | import org.opensearch.threadpool.ThreadPool.Names; |
95 | 96 | import org.opensearch.transport.TransportService; |
|
111 | 112 | import java.util.stream.Stream; |
112 | 113 | import java.util.stream.StreamSupport; |
113 | 114 |
|
| 115 | +import static org.opensearch.cluster.ClusterManagerMetrics.NODE_ID_TAG; |
| 116 | +import static org.opensearch.cluster.ClusterManagerMetrics.REASON_TAG; |
| 117 | +import static org.opensearch.cluster.coordination.FollowersChecker.NODE_LEFT_REASON_DISCONNECTED; |
| 118 | +import static org.opensearch.cluster.coordination.FollowersChecker.NODE_LEFT_REASON_FOLLOWER_CHECK_RETRY_FAIL; |
| 119 | +import static org.opensearch.cluster.coordination.FollowersChecker.NODE_LEFT_REASON_HEALTHCHECK_FAIL; |
| 120 | +import static org.opensearch.cluster.coordination.FollowersChecker.NODE_LEFT_REASON_LAGGING; |
114 | 121 | import static org.opensearch.cluster.coordination.NoClusterManagerBlockService.NO_CLUSTER_MANAGER_BLOCK_ID; |
115 | 122 | import static org.opensearch.cluster.decommission.DecommissionHelper.nodeCommissioned; |
116 | 123 | import static org.opensearch.gateway.ClusterStateUpdaters.hideStateIfNotRecovered; |
@@ -193,6 +200,7 @@ public class Coordinator extends AbstractLifecycleComponent implements Discovery |
193 | 200 | private final RemoteStoreNodeService remoteStoreNodeService; |
194 | 201 | private NodeConnectionsService nodeConnectionsService; |
195 | 202 | private final ClusterSettings clusterSettings; |
| 203 | + private final ClusterManagerMetrics clusterManagerMetrics; |
196 | 204 |
|
197 | 205 | /** |
198 | 206 | * @param nodeName The name of the node, used to name the {@link java.util.concurrent.ExecutorService} of the {@link SeedHostsResolver}. |
@@ -250,6 +258,7 @@ public Coordinator( |
250 | 258 | this.publishTimeout = PUBLISH_TIMEOUT_SETTING.get(settings); |
251 | 259 | clusterSettings.addSettingsUpdateConsumer(PUBLISH_TIMEOUT_SETTING, this::setPublishTimeout); |
252 | 260 | this.publishInfoTimeout = PUBLISH_INFO_TIMEOUT_SETTING.get(settings); |
| 261 | + this.clusterManagerMetrics = clusterManagerMetrics; |
253 | 262 | this.random = random; |
254 | 263 | this.electionSchedulerFactory = new ElectionSchedulerFactory(settings, random, transportService.getThreadPool()); |
255 | 264 | this.preVoteCollector = new PreVoteCollector( |
@@ -359,6 +368,18 @@ private void removeNode(DiscoveryNode discoveryNode, String reason) { |
359 | 368 | nodeRemovalExecutor, |
360 | 369 | nodeRemovalExecutor |
361 | 370 | ); |
| 371 | + String reasonToPublish = switch (reason) { |
| 372 | + case NODE_LEFT_REASON_DISCONNECTED -> "disconnected"; |
| 373 | + case NODE_LEFT_REASON_LAGGING -> "lagging"; |
| 374 | + case NODE_LEFT_REASON_FOLLOWER_CHECK_RETRY_FAIL -> "follower.check.fail"; |
| 375 | + case NODE_LEFT_REASON_HEALTHCHECK_FAIL -> "health.check.fail"; |
| 376 | + default -> reason; |
| 377 | + }; |
| 378 | + clusterManagerMetrics.incrementCounter( |
| 379 | + clusterManagerMetrics.nodeLeftCounter, |
| 380 | + 1.0, |
| 381 | + Optional.ofNullable(Tags.create().addTag(NODE_ID_TAG, discoveryNode.getId()).addTag(REASON_TAG, reasonToPublish)) |
| 382 | + ); |
362 | 383 | } |
363 | 384 | } |
364 | 385 | } |
|
0 commit comments