Skip to content

Commit

Permalink
added telemetry with most common error from agent logs (#146107)
Browse files Browse the repository at this point in the history
## Summary

Closes elastic/ingest-dev#1261

Merged: [elasticsearch
change](elastic/elasticsearch#91701) to give
kibana_system the missing privilege to read logs-elastic_agent* indices.

## Top 3 most common errors in the Elastic Agent logs

Added the most common error messages from the elastic-agent and fleet-server logs to telemetry.

The top errors are found by querying the `message` field with a `sampler`
aggregation that wraps a `categorize_text` aggregation. This is a workaround,
as we can't run an aggregation directly on the `message` field.
```
GET logs-elastic_agent*/_search
{
    "size": 0,
    "query": {
        "bool": {
            "must": [
                {
                    "term": {
                        "log.level": "error"
                    }
                },
                {
                    "range": {
                        "@timestamp": {
                            "gte": "now-1h"
                        }
                    }
                }
            ]
        }
    },
    "aggregations": {
        "message_sample": {
            "sampler": {
                "shard_size": 200
            },
            "aggs": {
                "categories": {
                    "categorize_text": {
                        "field": "message",
                        "size": 10
                    }
                }
            }
        }
    }
}
```

Tested with latest Elasticsearch snapshot, and verified that the logs
are added to telemetry:
```
   {
      "agent_logs_top_errors": [
         "failed to dispatch actions error failed reloading q q q nil nil config failed reloading artifact config for composed snapshot.downloader failed to generate snapshot config failed to detect remote snapshot repo proceeding with configured not an agent uri",
         "fleet-server stderr level info time message No applicable limit for agents using default \\n level info time message No applicable limit for agents using default \\n",
         "stderr panic close of closed channel n ngoroutine running Stop"
      ],
      "fleet_server_logs_top_errors": [
         "Dispatch abort response",
         "error while closing",
         "failed to take ownership"
      ]
   }
```

Did some measurements locally, and the query took a few ms only. I'll
try to check with larger datasets in elastic agent logs too.


### Checklist

- [x] [Unit or functional
tests](https://www.elastic.co/guide/en/kibana/master/development-tests.html)
were updated or added to match the most common scenarios

Co-authored-by: Kibana Machine <42973632+kibanamachine@users.noreply.github.com>
  • Loading branch information
juliaElastic and kibanamachine authored Nov 29, 2022
1 parent 5a86b58 commit 585bf36
Show file tree
Hide file tree
Showing 4 changed files with 133 additions and 0 deletions.
89 changes: 89 additions & 0 deletions x-pack/plugins/fleet/server/collectors/agent_logs.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,89 @@
/*
* Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
* or more contributor license agreements. Licensed under the Elastic License
* 2.0; you may not use this file except in compliance with the Elastic License
* 2.0.
*/

import type { ElasticsearchClient } from '@kbn/core-elasticsearch-server';

import { appContextService } from '../services';

/**
 * Result of the agent logs collector: the most frequent error message
 * categories found in the Elastic Agent log indices.
 */
export interface AgentLogsData {
/** Top error message categories from the `logs-elastic_agent-*` indices. */
agent_logs_top_errors: string[];
/** Top error message categories from the `logs-elastic_agent.fleet_server-*` indices. */
fleet_server_logs_top_errors: string[];
}

/** Number of message categories reported for each log source. */
const TOP_ERRORS_COUNT = 3;

/** Minimal shape of the search response that this collector reads. */
interface CategorizedSearchResponse {
  aggregations?: {
    message_sample?: {
      categories?: {
        buckets?: unknown;
      };
    };
  };
}

/**
 * Builds an empty result.
 *
 * A fresh object (with fresh arrays) is created on every call so that a caller
 * mutating the returned value can never affect what later callers receive.
 */
function createEmptyLogsData(): AgentLogsData {
  return {
    agent_logs_top_errors: [],
    fleet_server_logs_top_errors: [],
  };
}

/** Returns true when the thrown value carries a `statusCode` of 404. */
function isNotFoundError(error: unknown): boolean {
  return (
    typeof error === 'object' &&
    error !== null &&
    (error as { statusCode?: unknown }).statusCode === 404
  );
}

/**
 * Extracts the keys of the first `TOP_ERRORS_COUNT` category buckets from a
 * search response. Missing aggregations yield an empty list, and bucket keys
 * that are not strings are ignored.
 */
function extractTopMessages(response: unknown): string[] {
  const buckets = (response as CategorizedSearchResponse | null | undefined)?.aggregations
    ?.message_sample?.categories?.buckets;
  if (!Array.isArray(buckets)) {
    return [];
  }
  return buckets
    .map((bucket: unknown) => (bucket as { key?: unknown } | null | undefined)?.key)
    .filter((key): key is string => typeof key === 'string')
    .slice(0, TOP_ERRORS_COUNT);
}

/**
 * Queries one index pattern for the most common error messages of the last hour.
 *
 * A 404 response is treated as "no data yet": it is logged at debug level and
 * an empty list is returned. Any other failure is rethrown.
 */
async function fetchTopErrorMessages(
  esClient: ElasticsearchClient,
  index: string
): Promise<string[]> {
  try {
    const response = await esClient.search({
      index,
      size: 0,
      query: {
        bool: {
          filter: [
            { term: { 'log.level': 'error' } },
            { range: { '@timestamp': { gte: 'now-1h' } } },
          ],
        },
      },
      aggs: {
        // The categorization runs on a sample of at most 200 documents per shard.
        message_sample: {
          sampler: {
            shard_size: 200,
          },
          aggs: {
            categories: {
              categorize_text: {
                field: 'message',
                size: 10,
              },
            },
          },
        },
      },
    });
    return extractTopMessages(response);
  } catch (error: unknown) {
    if (isNotFoundError(error)) {
      appContextService.getLogger().debug(`Index pattern ${index} does not exist yet.`);
      return [];
    }
    throw error;
  }
}

/**
 * Collects the top error messages from the Elastic Agent and Fleet Server logs.
 *
 * The two index patterns are queried in parallel, and a missing index (404) for
 * one of them does not discard the result obtained for the other.
 *
 * @param esClient Elasticsearch client; when omitted an empty result is returned.
 * @returns The top error message categories for each log source.
 * @throws Rethrows any search failure whose `statusCode` is not 404.
 */
export async function getAgentLogsTopErrors(
  esClient?: ElasticsearchClient
): Promise<AgentLogsData> {
  if (!esClient) {
    return createEmptyLogsData();
  }

  const [agentErrors, fleetServerErrors] = await Promise.all([
    fetchTopErrorMessages(esClient, 'logs-elastic_agent-*'),
    fetchTopErrorMessages(esClient, 'logs-elastic_agent.fleet_server-*'),
  ]);

  return {
    agent_logs_top_errors: agentErrors,
    fleet_server_logs_top_errors: fleetServerErrors,
  };
}
2 changes: 2 additions & 0 deletions x-pack/plugins/fleet/server/collectors/register.ts
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@ import type { PackageUsage } from './package_collectors';
import { getFleetServerUsage, getFleetServerConfig } from './fleet_server_collector';
import type { FleetServerUsage } from './fleet_server_collector';
import { getAgentPoliciesUsage } from './agent_policies';
import { getAgentLogsTopErrors } from './agent_logs';

export interface Usage {
agents_enabled: boolean;
Expand All @@ -44,6 +45,7 @@ export const fetchFleetUsage = async (
...(await getAgentData(esClient, abortController)),
fleet_server_config: await getFleetServerConfig(soClient),
agent_policies: await getAgentPoliciesUsage(esClient, abortController),
...(await getAgentLogsTopErrors(esClient)),
};
return usage;
};
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -179,6 +179,32 @@ describe('fleet usage telemetry', () => {
refresh: 'wait_for',
});

await esClient.create({
index: 'logs-elastic_agent-default',
id: 'log1',
body: {
log: {
level: 'error',
},
'@timestamp': new Date().toISOString(),
message: 'stderr panic close of closed channel',
},
refresh: 'wait_for',
});

await esClient.create({
index: 'logs-elastic_agent.fleet_server-default',
id: 'log2',
body: {
log: {
level: 'error',
},
'@timestamp': new Date().toISOString(),
message: 'failed to unenroll offline agents',
},
refresh: 'wait_for',
});

const soClient = kbnServer.coreStart.savedObjects.createInternalRepository();
await soClient.create('ingest-package-policies', {
name: 'fleet_server-1',
Expand Down Expand Up @@ -255,6 +281,8 @@ describe('fleet usage telemetry', () => {
],
},
agent_policies: { count: 3, output_types: ['elasticsearch'] },
agent_logs_top_errors: ['stderr panic close of closed channel'],
fleet_server_logs_top_errors: ['failed to unenroll offline agents'],
})
);
});
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -165,4 +165,18 @@ export const fleetUsagesSchema: RootSchema<any> = {
},
},
},
agent_logs_top_errors: {
type: 'array',
items: {
type: 'text',
_meta: { description: 'Top messages from agent error logs' },
},
},
fleet_server_logs_top_errors: {
type: 'array',
items: {
type: 'text',
_meta: { description: 'Top messages from fleet server error logs' },
},
},
};

0 comments on commit 585bf36

Please sign in to comment.