Skip to content

Commit e34a6d5

Browse files
authored
[Fleet] Add unhealthy reason (input/output/other) to agent metrics (#178605)
## Summary

Closes elastic/ingest-dev#2522

Added `unhealthy_reason` aggregation when querying agent metrics.

The [mapping change](elastic/elasticsearch#106246) and [fleet-server change](elastic/fleet-server#3338) need to be merged first to verify end to end.

Steps to verify:
- enroll an agent with docker
- add endpoint integration, expect an input and output unit error status on the agent doc
- wait a few seconds so that the agent metrics are published
- verify that the agent metrics include `unhealthy_reason`, using the query below

```
GET metrics-fleet_server.agent_status-default/_search
{
  "_source": ["fleet.agents"]
}

"hits": [
  {
    "_index": ".ds-metrics-fleet_server.agent_status-default-2024.03.11-000001",
    "_id": "3JdPioUh-9j8DxQrAAABjjclRhU",
    "_score": 1,
    "_source": {
      "fleet": {
        "agents": {
          "enrolled": 12,
          "healthy": 0,
          "inactive": 0,
          "offline": 11,
          "total": 13,
          "unenrolled": 1,
          "unhealthy": 1,
          "updating": 0,
          "upgrading_step": {
            "downloading": 0,
            "extracting": 0,
            "failed": 0,
            "replacing": 0,
            "requested": 0,
            "restarting": 0,
            "rollback": 0,
            "scheduled": 0,
            "watching": 0
          },
          "unhealthy_reason": {
            "input": 1,
            "output": 1
          }
        }
      }
    }
  },
```

### Checklist

- [x] [Unit or functional tests](https://www.elastic.co/guide/en/kibana/master/development-tests.html) were updated or added to match the most common scenarios
1 parent 300b8ee commit e34a6d5

File tree

6 files changed

+104
-1
lines changed

6 files changed

+104
-1
lines changed

x-pack/plugins/fleet/common/types/models/agent.ts

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -107,6 +107,13 @@ interface AgentBase {
107107
tags?: string[];
108108
components?: FleetServerAgentComponent[];
109109
agent?: FleetServerAgentMetadata;
110+
unhealthy_reason?: UnhealthyReason[];
111+
}
112+
113+
export enum UnhealthyReason {
114+
INPUT = 'input',
115+
OUTPUT = 'output',
116+
OTHER = 'other',
110117
}
111118

112119
export interface AgentMetrics {
@@ -336,6 +343,11 @@ export interface FleetServerAgent {
336343
* Outputs map
337344
*/
338345
outputs?: OutputMap;
346+
347+
/**
348+
* Unhealthy reason: input, output, other
349+
*/
350+
unhealthy_reason?: UnhealthyReason[];
339351
}
340352

341353
/**

x-pack/plugins/fleet/server/services/agents/helpers.ts

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -92,6 +92,7 @@ export function searchHitToAgent(
9292
// key-value pairs
9393
user_provided_metadata: hit._source?.user_provided_metadata!,
9494
local_metadata: hit._source?.local_metadata!,
95+
unhealthy_reason: hit._source?.unhealthy_reason,
9596
};
9697

9798
if (!hit.fields?.status?.length) {

x-pack/plugins/fleet/server/services/metrics/fetch_agent_metrics.test.ts

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -71,6 +71,22 @@ describe('fetchAgentMetrics', () => {
7171
},
7272
],
7373
},
74+
unhealthy_reason: {
75+
buckets: [
76+
{
77+
key: 'input',
78+
doc_count: 2,
79+
},
80+
{
81+
key: 'output',
82+
doc_count: 1,
83+
},
84+
{
85+
key: 'other',
86+
doc_count: 3,
87+
},
88+
],
89+
},
7490
},
7591
});
7692

@@ -95,6 +111,11 @@ describe('fetchAgentMetrics', () => {
95111
scheduled: 0,
96112
watching: 0,
97113
},
114+
unhealthy_reason: {
115+
input: 2,
116+
output: 1,
117+
other: 3,
118+
},
98119
});
99120
});
100121
});

x-pack/plugins/fleet/server/services/metrics/fetch_agent_metrics.ts

Lines changed: 58 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,7 @@ export interface AgentMetrics {
2020
agents: AgentUsage;
2121
agents_per_version: AgentPerVersion[];
2222
upgrading_step: UpgradingSteps;
23+
unhealthy_reason: UnhealthyReason;
2324
}
2425

2526
export interface UpgradingSteps {
@@ -34,6 +35,12 @@ export interface UpgradingSteps {
3435
failed: number;
3536
}
3637

38+
export interface UnhealthyReason {
39+
input: number;
40+
output: number;
41+
other: number;
42+
}
43+
3744
export const fetchAgentMetrics = async (
3845
core: CoreSetup,
3946
abortController: AbortController
@@ -63,6 +70,7 @@ export const fetchAgentMetrics = async (
6370
agents: await getAgentUsage(soClient, esClient),
6471
agents_per_version: await getAgentsPerVersion(esClient, abortController),
6572
upgrading_step: await getUpgradingSteps(esClient, abortController),
73+
unhealthy_reason: await getUnhealthyReason(esClient, abortController),
6674
};
6775
return usage;
6876
};
@@ -195,3 +203,53 @@ export const getUpgradingSteps = async (
195203
return upgradingSteps;
196204
}
197205
};
206+
207+
export const getUnhealthyReason = async (
208+
esClient: ElasticsearchClient,
209+
abortController: AbortController
210+
): Promise<UnhealthyReason> => {
211+
const unhealthyReason = {
212+
input: 0,
213+
output: 0,
214+
other: 0,
215+
};
216+
try {
217+
const response = await retryTransientEsErrors(() =>
218+
esClient.search(
219+
{
220+
index: AGENTS_INDEX,
221+
size: 0,
222+
aggs: {
223+
unhealthy_reason: {
224+
terms: { field: 'unhealthy_reason' },
225+
},
226+
},
227+
},
228+
{ signal: abortController.signal }
229+
)
230+
);
231+
((response?.aggregations?.unhealthy_reason as any)?.buckets ?? []).forEach((bucket: any) => {
232+
switch (bucket.key) {
233+
case 'input':
234+
unhealthyReason.input = bucket.doc_count;
235+
break;
236+
case 'output':
237+
unhealthyReason.output = bucket.doc_count;
238+
break;
239+
case 'other':
240+
unhealthyReason.other = bucket.doc_count;
241+
break;
242+
default:
243+
break;
244+
}
245+
});
246+
return unhealthyReason;
247+
} catch (error) {
248+
if (error.statusCode === 404) {
249+
appContextService.getLogger().debug('Index .fleet-agents does not exist yet.');
250+
} else {
251+
throw error;
252+
}
253+
return unhealthyReason;
254+
}
255+
};

x-pack/plugins/fleet/server/services/metrics/fleet_metrics_task.test.ts

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -104,6 +104,11 @@ describe('fleet metrics task', () => {
104104
count: 2,
105105
},
106106
],
107+
unhealthy_reason: {
108+
input: 2,
109+
output: 1,
110+
other: 3,
111+
},
107112
});
108113
});
109114

@@ -149,6 +154,11 @@ describe('fleet metrics task', () => {
149154
scheduled: 1,
150155
requested: 1,
151156
},
157+
unhealthy_reason: {
158+
input: 2,
159+
output: 1,
160+
other: 3,
161+
},
152162
},
153163
},
154164
}),

x-pack/plugins/fleet/server/services/metrics/fleet_metrics_task.ts

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -20,7 +20,7 @@ import { appContextService } from '../app_context';
2020
import type { AgentMetrics } from './fetch_agent_metrics';
2121

2222
export const TYPE = 'Fleet-Metrics-Task';
23-
export const VERSION = '1.0.0';
23+
export const VERSION = '1.1.0';
2424
const TITLE = 'Fleet Metrics Task';
2525
const TIMEOUT = '1m';
2626
const SCOPE = ['fleet'];
@@ -115,6 +115,7 @@ export class FleetMetricsTask {
115115
unhealthy: agents.unhealthy,
116116
inactive: agents.inactive,
117117
upgrading_step: agentMetrics.upgrading_step,
118+
unhealthy_reason: agentMetrics.unhealthy_reason,
118119
},
119120
},
120121
};

0 commit comments

Comments
 (0)