99package org .elasticsearch .health ;
1010
1111import org .elasticsearch .ResourceNotFoundException ;
12+ import org .elasticsearch .cluster .service .ClusterService ;
1213import org .elasticsearch .core .Nullable ;
14+ import org .elasticsearch .gateway .GatewayService ;
1315
16+ import java .util .Collections ;
1417import java .util .HashSet ;
1518import java .util .List ;
1619import java .util .Locale ;
20+ import java .util .Map ;
1721import java .util .Set ;
1822import java .util .TreeMap ;
1923import java .util .stream .Collectors ;
24+ import java .util .stream .Stream ;
2025
26+ import static java .util .function .Predicate .isEqual ;
2127import static java .util .stream .Collectors .collectingAndThen ;
2228import static java .util .stream .Collectors .groupingBy ;
2329import static java .util .stream .Collectors .toList ;
30+ import static java .util .stream .Collectors .toMap ;
2431
2532/**
2633 * This service collects health indicators from all modules and plugins of elasticsearch
2734 */
2835public class HealthService {
2936
37+ // Visible for testing
38+ static final String UNKNOWN_RESULT_SUMMARY_PREFLIGHT_FAILED = "Could not determine indicator state. Cluster state is not stable. Check "
39+ + "details for critical issues keeping this indicator from running." ;
40+ static final String UNKNOWN_RESULT_SUMMARY_NOT_RECOVERED =
41+ "Could not determine indicator state. The current node handling the health request is not ready to assess the health of the "
42+ + "cluster. Try again later or execute the health API against a different node." ;
43+
44+ /**
45+ * Detail map key that contains the reasons a result was marked as UNKNOWN
46+ */
47+ private static final String REASON = "reasons" ;
48+
49+ private static final String CLUSTER_STATE_RECOVERED = "cluster_state_recovered" ;
50+ private static final SimpleHealthIndicatorDetails DETAILS_UNKNOWN_STATE_NOT_RECOVERED = new SimpleHealthIndicatorDetails (
51+ Map .of (REASON , Map .of (CLUSTER_STATE_RECOVERED , false ))
52+ );
53+
54+ private final List <HealthIndicatorService > preflightHealthIndicatorServices ;
3055 private final List <HealthIndicatorService > healthIndicatorServices ;
56+ private final ClusterService clusterService ;
3157
32- public HealthService (List <HealthIndicatorService > healthIndicatorServices ) {
/**
 * Builds a HealthService from two groups of indicator services and the cluster service.
 *
 * Preflight indicators are evaluated before everything else and represent serious, cascading health problems. When any
 * preflight indicator reports a status other than GREEN, the regular indicators are likely to be degraded or unable to
 * compute their state correctly, so they report UNKNOWN instead of being calculated.
 *
 * @param preflightHealthIndicatorServices indicators that are run first and represent a serious cascading health problem.
 * @param healthIndicatorServices indicators that are run only if every preflight indicator returns a GREEN result.
 * @param clusterService provides the cluster state that is consulted when health is requested.
 */
public HealthService(
    List<HealthIndicatorService> preflightHealthIndicatorServices,
    List<HealthIndicatorService> healthIndicatorServices,
    ClusterService clusterService
) {
    this.clusterService = clusterService;
    this.healthIndicatorServices = healthIndicatorServices;
    this.preflightHealthIndicatorServices = preflightHealthIndicatorServices;
}
3578
3679 /**
@@ -47,11 +90,52 @@ public HealthService(List<HealthIndicatorService> healthIndicatorServices) {
4790 public List <HealthComponentResult > getHealth (@ Nullable String componentName , @ Nullable String indicatorName , boolean computeDetails ) {
4891 final boolean shouldDrillDownToIndicatorLevel = indicatorName != null ;
4992 final boolean showRolledUpComponentStatus = shouldDrillDownToIndicatorLevel == false ;
93+
94+ // Is the cluster state recovered? If not, ALL indicators should return UNKNOWN
95+ boolean clusterStateRecovered = clusterService .state ()
96+ .getBlocks ()
97+ .hasGlobalBlock (GatewayService .STATE_NOT_RECOVERED_BLOCK ) == false ;
98+
99+ List <HealthIndicatorResult > preflightResults ;
100+ if (clusterStateRecovered ) {
101+ // Determine if cluster is stable enough to calculate health before running other indicators
102+ preflightResults = preflightHealthIndicatorServices .stream ().map (service -> service .calculate (computeDetails )).toList ();
103+ } else {
104+ // Mark preflight indicators as UNKNOWN
105+ HealthIndicatorDetails details = computeDetails ? DETAILS_UNKNOWN_STATE_NOT_RECOVERED : HealthIndicatorDetails .EMPTY ;
106+ preflightResults = preflightHealthIndicatorServices .stream ()
107+ .map (service -> generateUnknownResult (service , UNKNOWN_RESULT_SUMMARY_NOT_RECOVERED , details ))
108+ .toList ();
109+ }
110+
111+ // If any of these are not GREEN, then we cannot obtain health from other indicators
112+ boolean clusterHealthIsObtainable = preflightResults .isEmpty ()
113+ || preflightResults .stream ().map (HealthIndicatorResult ::status ).allMatch (isEqual (HealthStatus .GREEN ));
114+
115+ // Filter remaining indicators by component name and indicator name if present before calculating their results
116+ Stream <HealthIndicatorService > filteredIndicators = healthIndicatorServices .stream ()
117+ .filter (service -> componentName == null || service .component ().equals (componentName ))
118+ .filter (service -> indicatorName == null || service .name ().equals (indicatorName ));
119+
120+ Stream <HealthIndicatorResult > filteredIndicatorResults ;
121+ if (clusterStateRecovered && clusterHealthIsObtainable ) {
122+ // Calculate remaining indicators
123+ filteredIndicatorResults = filteredIndicators .map (service -> service .calculate (computeDetails ));
124+ } else {
125+ // Mark remaining indicators as UNKNOWN
126+ String unknownSummary = clusterStateRecovered ? UNKNOWN_RESULT_SUMMARY_PREFLIGHT_FAILED : UNKNOWN_RESULT_SUMMARY_NOT_RECOVERED ;
127+ HealthIndicatorDetails unknownDetails = healthUnknownReason (preflightResults , clusterStateRecovered , computeDetails );
128+ filteredIndicatorResults = filteredIndicators .map (service -> generateUnknownResult (service , unknownSummary , unknownDetails ));
129+ }
130+
131+ // Filter the cluster indicator results by component name and indicator name if present
132+ Stream <HealthIndicatorResult > filteredPreflightResults = preflightResults .stream ()
133+ .filter (result -> componentName == null || result .component ().equals (componentName ))
134+ .filter (result -> indicatorName == null || result .name ().equals (indicatorName ));
135+
136+ // Combine indicator results
50137 List <HealthComponentResult > components = List .copyOf (
51- healthIndicatorServices .stream ()
52- .filter (service -> componentName == null || service .component ().equals (componentName ))
53- .filter (service -> indicatorName == null || service .name ().equals (indicatorName ))
54- .map (service -> service .calculate (computeDetails ))
138+ Stream .concat (filteredPreflightResults , filteredIndicatorResults )
55139 .collect (
56140 groupingBy (
57141 HealthIndicatorResult ::component ,
@@ -76,6 +160,52 @@ public List<HealthComponentResult> getHealth(@Nullable String componentName, @Nu
76160 return components ;
77161 }
78162
163+ /**
164+ * Return details to include on health indicator results when health information cannot be obtained due to unstable cluster.
165+ * @param preflightResults Results of indicators used to determine if health checks can happen.
166+ * @param computeDetails If details should be calculated on which indicators are causing the UNKNOWN state.
167+ * @return Details explaining why results are UNKNOWN, or an empty detail set if computeDetails is false.
168+ */
169+ private HealthIndicatorDetails healthUnknownReason (
170+ List <HealthIndicatorResult > preflightResults ,
171+ boolean clusterStateRecovered ,
172+ boolean computeDetails
173+ ) {
174+ assert clusterStateRecovered == false || preflightResults .isEmpty () == false
175+ : "Requires at least one non-GREEN preflight result or cluster state not recovered" ;
176+ HealthIndicatorDetails unknownDetails ;
177+ if (computeDetails ) {
178+ if (clusterStateRecovered ) {
179+ // Determine why the cluster is not stable enough for running remaining indicators
180+ Map <String , String > clusterUnstableReasons = preflightResults .stream ()
181+ .filter (result -> HealthStatus .GREEN .equals (result .status ()) == false )
182+ .collect (toMap (HealthIndicatorResult ::name , result -> result .status ().xContentValue ()));
183+ assert clusterUnstableReasons .isEmpty () == false : "Requires at least one non-GREEN preflight result" ;
184+ unknownDetails = new SimpleHealthIndicatorDetails (Map .of (REASON , clusterUnstableReasons ));
185+ } else {
186+ unknownDetails = DETAILS_UNKNOWN_STATE_NOT_RECOVERED ;
187+ }
188+ } else {
189+ unknownDetails = HealthIndicatorDetails .EMPTY ;
190+ }
191+ return unknownDetails ;
192+ }
193+
194+ /**
195+ * Generates an UNKNOWN result for an indicator
196+ * @param indicatorService the indicator to generate a result for
197+ * @param summary the summary to include for the UNKNOWN result
198+ * @param details the details to include on the result
199+ * @return A result with the UNKNOWN status
200+ */
201+ private HealthIndicatorResult generateUnknownResult (
202+ HealthIndicatorService indicatorService ,
203+ String summary ,
204+ HealthIndicatorDetails details
205+ ) {
206+ return indicatorService .createIndicator (HealthStatus .UNKNOWN , summary , details , Collections .emptyList (), Collections .emptyList ());
207+ }
208+
79209 // Non-private for testing purposes
80210 static HealthComponentResult createComponentFromIndicators (List <HealthIndicatorResult > indicators , boolean showComponentSummary ) {
81211 assert indicators .size () > 0 : "Component should not be non empty" ;
0 commit comments