Skip to content

Commit 8c03df6

Browse files
authored
Add preflight checks to Health API to ensure health is obtainable (elastic#86404)
This PR introduces the concept of preflight health indicator services to the new health service. Preflight indicators are structurally identical to regular indicators, but they are executed first when calculating health, and they conditionally block downstream indicators from running when the cluster state is unstable or unknown.
1 parent 3ef46b0 commit 8c03df6

File tree

4 files changed

+519
-44
lines changed

4 files changed

+519
-44
lines changed

docs/changelog/86404.yaml

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
pr: 86404
2+
summary: Add preflight checks to Health API to ensure health is obtainable
3+
area: Health
4+
type: enhancement
5+
issues: []

server/src/main/java/org/elasticsearch/health/HealthService.java

Lines changed: 135 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -9,28 +9,71 @@
99
package org.elasticsearch.health;
1010

1111
import org.elasticsearch.ResourceNotFoundException;
12+
import org.elasticsearch.cluster.service.ClusterService;
1213
import org.elasticsearch.core.Nullable;
14+
import org.elasticsearch.gateway.GatewayService;
1315

16+
import java.util.Collections;
1417
import java.util.HashSet;
1518
import java.util.List;
1619
import java.util.Locale;
20+
import java.util.Map;
1721
import java.util.Set;
1822
import java.util.TreeMap;
1923
import java.util.stream.Collectors;
24+
import java.util.stream.Stream;
2025

26+
import static java.util.function.Predicate.isEqual;
2127
import static java.util.stream.Collectors.collectingAndThen;
2228
import static java.util.stream.Collectors.groupingBy;
2329
import static java.util.stream.Collectors.toList;
30+
import static java.util.stream.Collectors.toMap;
2431

2532
/**
2633
* This service collects health indicators from all modules and plugins of elasticsearch
2734
*/
2835
public class HealthService {
2936

37+
// Visible for testing
38+
static final String UNKNOWN_RESULT_SUMMARY_PREFLIGHT_FAILED = "Could not determine indicator state. Cluster state is not stable. Check "
39+
+ "details for critical issues keeping this indicator from running.";
40+
static final String UNKNOWN_RESULT_SUMMARY_NOT_RECOVERED =
41+
"Could not determine indicator state. The current node handling the health request is not ready to assess the health of the "
42+
+ "cluster. Try again later or execute the health API against a different node.";
43+
44+
/**
45+
* Detail map key that contains the reasons a result was marked as UNKNOWN
46+
*/
47+
private static final String REASON = "reasons";
48+
49+
private static final String CLUSTER_STATE_RECOVERED = "cluster_state_recovered";
50+
private static final SimpleHealthIndicatorDetails DETAILS_UNKNOWN_STATE_NOT_RECOVERED = new SimpleHealthIndicatorDetails(
51+
Map.of(REASON, Map.of(CLUSTER_STATE_RECOVERED, false))
52+
);
53+
54+
private final List<HealthIndicatorService> preflightHealthIndicatorServices;
3055
private final List<HealthIndicatorService> healthIndicatorServices;
56+
private final ClusterService clusterService;
3157

32-
public HealthService(List<HealthIndicatorService> healthIndicatorServices) {
/**
 * Creates a new HealthService.
 *
 * Accepts a list of regular indicator services and a list of preflight indicator services. Preflight indicators are run first and
 * represent serious cascading health problems. If any of the preflight indicators reports a status other than GREEN, the remaining
 * indicators are likely to be degraded in some way or unable to calculate their state correctly, so they are not calculated and
 * return UNKNOWN results instead.
 *
 * Both lists are snapshotted with {@link List#copyOf}, so later changes to the lists passed in by the caller cannot alter which
 * indicators are evaluated. As a consequence, neither list nor any of their elements may be {@code null}.
 *
 * @param preflightHealthIndicatorServices indicators that are run first and represent a serious cascading health problem.
 * @param healthIndicatorServices indicators that are run if the preflight indicators return GREEN results.
 * @param clusterService used to read the current cluster state and check whether it has been recovered.
 */
public HealthService(
    List<HealthIndicatorService> preflightHealthIndicatorServices,
    List<HealthIndicatorService> healthIndicatorServices,
    ClusterService clusterService
) {
    // Defensive copies: the service streams these lists on every health request, so they must not change underneath it.
    this.preflightHealthIndicatorServices = List.copyOf(preflightHealthIndicatorServices);
    this.healthIndicatorServices = List.copyOf(healthIndicatorServices);
    this.clusterService = clusterService;
}
3578

3679
/**
@@ -47,11 +90,52 @@ public HealthService(List<HealthIndicatorService> healthIndicatorServices) {
4790
public List<HealthComponentResult> getHealth(@Nullable String componentName, @Nullable String indicatorName, boolean computeDetails) {
4891
final boolean shouldDrillDownToIndicatorLevel = indicatorName != null;
4992
final boolean showRolledUpComponentStatus = shouldDrillDownToIndicatorLevel == false;
93+
94+
// Is the cluster state recovered? If not, ALL indicators should return UNKNOWN
95+
boolean clusterStateRecovered = clusterService.state()
96+
.getBlocks()
97+
.hasGlobalBlock(GatewayService.STATE_NOT_RECOVERED_BLOCK) == false;
98+
99+
List<HealthIndicatorResult> preflightResults;
100+
if (clusterStateRecovered) {
101+
// Determine if cluster is stable enough to calculate health before running other indicators
102+
preflightResults = preflightHealthIndicatorServices.stream().map(service -> service.calculate(computeDetails)).toList();
103+
} else {
104+
// Mark preflight indicators as UNKNOWN
105+
HealthIndicatorDetails details = computeDetails ? DETAILS_UNKNOWN_STATE_NOT_RECOVERED : HealthIndicatorDetails.EMPTY;
106+
preflightResults = preflightHealthIndicatorServices.stream()
107+
.map(service -> generateUnknownResult(service, UNKNOWN_RESULT_SUMMARY_NOT_RECOVERED, details))
108+
.toList();
109+
}
110+
111+
// If any of these are not GREEN, then we cannot obtain health from other indicators
112+
boolean clusterHealthIsObtainable = preflightResults.isEmpty()
113+
|| preflightResults.stream().map(HealthIndicatorResult::status).allMatch(isEqual(HealthStatus.GREEN));
114+
115+
// Filter remaining indicators by component name and indicator name if present before calculating their results
116+
Stream<HealthIndicatorService> filteredIndicators = healthIndicatorServices.stream()
117+
.filter(service -> componentName == null || service.component().equals(componentName))
118+
.filter(service -> indicatorName == null || service.name().equals(indicatorName));
119+
120+
Stream<HealthIndicatorResult> filteredIndicatorResults;
121+
if (clusterStateRecovered && clusterHealthIsObtainable) {
122+
// Calculate remaining indicators
123+
filteredIndicatorResults = filteredIndicators.map(service -> service.calculate(computeDetails));
124+
} else {
125+
// Mark remaining indicators as UNKNOWN
126+
String unknownSummary = clusterStateRecovered ? UNKNOWN_RESULT_SUMMARY_PREFLIGHT_FAILED : UNKNOWN_RESULT_SUMMARY_NOT_RECOVERED;
127+
HealthIndicatorDetails unknownDetails = healthUnknownReason(preflightResults, clusterStateRecovered, computeDetails);
128+
filteredIndicatorResults = filteredIndicators.map(service -> generateUnknownResult(service, unknownSummary, unknownDetails));
129+
}
130+
131+
// Filter the cluster indicator results by component name and indicator name if present
132+
Stream<HealthIndicatorResult> filteredPreflightResults = preflightResults.stream()
133+
.filter(result -> componentName == null || result.component().equals(componentName))
134+
.filter(result -> indicatorName == null || result.name().equals(indicatorName));
135+
136+
// Combine indicator results
50137
List<HealthComponentResult> components = List.copyOf(
51-
healthIndicatorServices.stream()
52-
.filter(service -> componentName == null || service.component().equals(componentName))
53-
.filter(service -> indicatorName == null || service.name().equals(indicatorName))
54-
.map(service -> service.calculate(computeDetails))
138+
Stream.concat(filteredPreflightResults, filteredIndicatorResults)
55139
.collect(
56140
groupingBy(
57141
HealthIndicatorResult::component,
@@ -76,6 +160,52 @@ public List<HealthComponentResult> getHealth(@Nullable String componentName, @Nu
76160
return components;
77161
}
78162

163+
/**
164+
* Return details to include on health indicator results when health information cannot be obtained due to unstable cluster.
165+
* @param preflightResults Results of indicators used to determine if health checks can happen.
166+
* @param computeDetails If details should be calculated on which indicators are causing the UNKNOWN state.
167+
* @return Details explaining why results are UNKNOWN, or an empty detail set if computeDetails is false.
168+
*/
169+
private HealthIndicatorDetails healthUnknownReason(
170+
List<HealthIndicatorResult> preflightResults,
171+
boolean clusterStateRecovered,
172+
boolean computeDetails
173+
) {
174+
assert clusterStateRecovered == false || preflightResults.isEmpty() == false
175+
: "Requires at least one non-GREEN preflight result or cluster state not recovered";
176+
HealthIndicatorDetails unknownDetails;
177+
if (computeDetails) {
178+
if (clusterStateRecovered) {
179+
// Determine why the cluster is not stable enough for running remaining indicators
180+
Map<String, String> clusterUnstableReasons = preflightResults.stream()
181+
.filter(result -> HealthStatus.GREEN.equals(result.status()) == false)
182+
.collect(toMap(HealthIndicatorResult::name, result -> result.status().xContentValue()));
183+
assert clusterUnstableReasons.isEmpty() == false : "Requires at least one non-GREEN preflight result";
184+
unknownDetails = new SimpleHealthIndicatorDetails(Map.of(REASON, clusterUnstableReasons));
185+
} else {
186+
unknownDetails = DETAILS_UNKNOWN_STATE_NOT_RECOVERED;
187+
}
188+
} else {
189+
unknownDetails = HealthIndicatorDetails.EMPTY;
190+
}
191+
return unknownDetails;
192+
}
193+
194+
/**
195+
* Generates an UNKNOWN result for an indicator
196+
* @param indicatorService the indicator to generate a result for
197+
* @param summary the summary to include for the UNKNOWN result
198+
* @param details the details to include on the result
199+
* @return A result with the UNKNOWN status
200+
*/
201+
private HealthIndicatorResult generateUnknownResult(
202+
HealthIndicatorService indicatorService,
203+
String summary,
204+
HealthIndicatorDetails details
205+
) {
206+
return indicatorService.createIndicator(HealthStatus.UNKNOWN, summary, details, Collections.emptyList(), Collections.emptyList());
207+
}
208+
79209
// Non-private for testing purposes
80210
static HealthComponentResult createComponentFromIndicators(List<HealthIndicatorResult> indicators, boolean showComponentSummary) {
81211
assert indicators.size() > 0 : "Component should not be non empty";

server/src/main/java/org/elasticsearch/node/Node.java

Lines changed: 9 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -98,6 +98,7 @@
9898
import org.elasticsearch.gateway.GatewayService;
9999
import org.elasticsearch.gateway.MetaStateService;
100100
import org.elasticsearch.gateway.PersistedClusterStateService;
101+
import org.elasticsearch.health.HealthIndicatorService;
101102
import org.elasticsearch.health.HealthService;
102103
import org.elasticsearch.http.HttpServerTransport;
103104
import org.elasticsearch.index.IndexSettingProviders;
@@ -1039,16 +1040,22 @@ protected Node(
10391040
}
10401041

10411042
private HealthService createHealthService(ClusterService clusterService, ClusterModule clusterModule) {
1043+
List<HealthIndicatorService> preflightHealthIndicatorServices = Collections.singletonList(
1044+
new InstanceHasMasterHealthIndicatorService(clusterService)
1045+
);
10421046
var serverHealthIndicatorServices = List.of(
1043-
new InstanceHasMasterHealthIndicatorService(clusterService),
10441047
new RepositoryIntegrityHealthIndicatorService(clusterService),
10451048
new ShardsAvailabilityHealthIndicatorService(clusterService, clusterModule.getAllocationService())
10461049
);
10471050
var pluginHealthIndicatorServices = pluginsService.filterPlugins(HealthPlugin.class)
10481051
.stream()
10491052
.flatMap(plugin -> plugin.getHealthIndicatorServices().stream())
10501053
.toList();
1051-
return new HealthService(concatLists(serverHealthIndicatorServices, pluginHealthIndicatorServices));
1054+
return new HealthService(
1055+
preflightHealthIndicatorServices,
1056+
concatLists(serverHealthIndicatorServices, pluginHealthIndicatorServices),
1057+
clusterService
1058+
);
10521059
}
10531060

10541061
private RecoveryPlannerService getRecoveryPlannerService(

0 commit comments

Comments
 (0)