diff --git a/.changelog/10340.txt b/.changelog/10340.txt new file mode 100644 index 000000000000..ff2a882a2543 --- /dev/null +++ b/.changelog/10340.txt @@ -0,0 +1,3 @@ +```release-note:improvement +telemetry: The usage data in the `metrics` API now includes cluster member counts, reporting clients on a per-segment basis. +``` diff --git a/agent/consul/server.go b/agent/consul/server.go index 9382b2e9f5f0..4164970f1fb5 100644 --- a/agent/consul/server.go +++ b/agent/consul/server.go @@ -568,7 +568,15 @@ func NewServer(config *Config, flat Deps) (*Server, error) { WithStateProvider(s.fsm). WithLogger(s.logger). WithDatacenter(s.config.Datacenter). - WithReportingInterval(s.config.MetricsReportingInterval), + WithReportingInterval(s.config.MetricsReportingInterval). + WithGetMembersFunc(func() []serf.Member { + members, err := s.LANMembersAllSegments() + if err != nil { + return []serf.Member{} + } + + return members + }), ) if err != nil { s.Shutdown() @@ -1137,7 +1145,7 @@ func (s *Server) LANMembers() []serf.Member { return s.serfLAN.Members() } -// WANMembers is used to return the members of the LAN cluster +// WANMembers is used to return the members of the WAN cluster func (s *Server) WANMembers() []serf.Member { if s.serfWAN == nil { return nil diff --git a/agent/consul/usagemetrics/usagemetrics.go b/agent/consul/usagemetrics/usagemetrics.go index da09890e5fb8..353e9a45df61 100644 --- a/agent/consul/usagemetrics/usagemetrics.go +++ b/agent/consul/usagemetrics/usagemetrics.go @@ -11,6 +11,7 @@ import ( "github.com/hashicorp/consul/agent/consul/state" "github.com/hashicorp/consul/logging" "github.com/hashicorp/go-hclog" + "github.com/hashicorp/serf/serf" ) var Gauges = []prometheus.GaugeDefinition{ @@ -26,8 +27,18 @@ var Gauges = []prometheus.GaugeDefinition{ Name: []string{"consul", "state", "service_instances"}, Help: "Measures the current number of unique services registered with Consul, based on service name. It is only emitted by Consul servers. 
Added in v1.9.0.", }, + { + Name: []string{"consul", "members", "clients"}, + Help: "Measures the current number of client agents registered with Consul. It is only emitted by Consul servers. Added in v1.9.6.", + }, + { + Name: []string{"consul", "members", "servers"}, + Help: "Measures the current number of server agents registered with Consul. It is only emitted by Consul servers. Added in v1.9.6.", + }, } +type getMembersFunc func() []serf.Member + // Config holds the settings for various parameters for the // UsageMetricsReporter type Config struct { @@ -35,6 +46,7 @@ type Config struct { metricLabels []metrics.Label stateProvider StateProvider tickerInterval time.Duration + getMembersFunc getMembersFunc } // WithDatacenter adds the datacenter as a label to all metrics emitted by the @@ -63,6 +75,12 @@ func (c *Config) WithStateProvider(sp StateProvider) *Config { return c } +// WithGetMembersFunc specifies the function used to identify cluster members +func (c *Config) WithGetMembersFunc(fn getMembersFunc) *Config { + c.getMembersFunc = fn + return c +} + // StateProvider defines an interface for retrieving a state.Store handle. In // non-test code, this is satisfied by the fsm.FSM struct. 
type StateProvider interface { @@ -77,6 +95,7 @@ type UsageMetricsReporter struct { metricLabels []metrics.Label stateProvider StateProvider tickerInterval time.Duration + getMembersFunc getMembersFunc } func NewUsageMetricsReporter(cfg *Config) (*UsageMetricsReporter, error) { @@ -84,6 +103,10 @@ func NewUsageMetricsReporter(cfg *Config) (*UsageMetricsReporter, error) { return nil, errors.New("must provide a StateProvider to usage reporter") } + if cfg.getMembersFunc == nil { + return nil, errors.New("must provide a getMembersFunc to usage reporter") + } + if cfg.logger == nil { cfg.logger = hclog.NewNullLogger() } @@ -98,6 +121,7 @@ func NewUsageMetricsReporter(cfg *Config) (*UsageMetricsReporter, error) { stateProvider: cfg.stateProvider, metricLabels: cfg.metricLabels, tickerInterval: cfg.tickerInterval, + getMembersFunc: cfg.getMembersFunc, } return u, nil @@ -137,4 +161,66 @@ func (u *UsageMetricsReporter) runOnce() { } u.emitServiceUsage(serviceUsage) + + servers, clients := u.memberUsage() + u.emitMemberUsage(servers, clients) +} + +func (u *UsageMetricsReporter) memberUsage() (int, map[string]int) { + if u.getMembersFunc == nil { + return 0, nil + } + + mems := u.getMembersFunc() + if len(mems) <= 0 { + u.logger.Warn("cluster reported zero members") + return 0, nil + } + + servers := 0 + clients := make(map[string]int) + + for _, m := range mems { + if m.Status != serf.StatusAlive { + continue + } + + switch m.Tags["role"] { + case "node": + clients[m.Tags["segment"]]++ + case "consul": + servers++ + } + } + + return servers, clients +} + +func (u *UsageMetricsReporter) emitMemberUsage(servers int, clients map[string]int) { + totalClients := 0 + + for seg, c := range clients { + segmentLabel := metrics.Label{Name: "segment", Value: seg} + labels := append([]metrics.Label{segmentLabel}, u.metricLabels...) 
+ + metrics.SetGaugeWithLabels( + []string{"consul", "members", "clients"}, + float32(c), + labels, + ) + + totalClients += c + } + + metrics.SetGaugeWithLabels( + []string{"consul", "members", "clients"}, + float32(totalClients), + u.metricLabels, + ) + + metrics.SetGaugeWithLabels( + []string{"consul", "members", "servers"}, + float32(servers), + u.metricLabels, + ) } diff --git a/agent/consul/usagemetrics/usagemetrics_oss_test.go b/agent/consul/usagemetrics/usagemetrics_oss_test.go index d4919914ff5c..e232014358d8 100644 --- a/agent/consul/usagemetrics/usagemetrics_oss_test.go +++ b/agent/consul/usagemetrics/usagemetrics_oss_test.go @@ -12,6 +12,7 @@ import ( "github.com/hashicorp/consul/agent/consul/state" "github.com/hashicorp/consul/agent/structs" "github.com/hashicorp/consul/sdk/testutil" + "github.com/hashicorp/serf/serf" ) func newStateStore() (*state.Store, error) { @@ -21,6 +22,7 @@ func newStateStore() (*state.Store, error) { func TestUsageReporter_emitServiceUsage_OSS(t *testing.T) { type testCase struct { modfiyStateStore func(t *testing.T, s *state.Store) + getMembersFunc getMembersFunc expectedGauges map[string]metrics.GaugeValue } cases := map[string]testCase{ @@ -45,24 +47,64 @@ func TestUsageReporter_emitServiceUsage_OSS(t *testing.T) { {Name: "datacenter", Value: "dc1"}, }, }, + "consul.usage.test.consul.members.clients;datacenter=dc1": { + Name: "consul.usage.test.consul.members.clients", + Value: 0, + Labels: []metrics.Label{ + {Name: "datacenter", Value: "dc1"}, + }, + }, + "consul.usage.test.consul.members.servers;datacenter=dc1": { + Name: "consul.usage.test.consul.members.servers", + Value: 0, + Labels: []metrics.Label{ + {Name: "datacenter", Value: "dc1"}, + }, + }, }, + getMembersFunc: func() []serf.Member { return []serf.Member{} }, }, "nodes-and-services": { modfiyStateStore: func(t *testing.T, s *state.Store) { require.Nil(t, s.EnsureNode(1, &structs.Node{Node: "foo", Address: "127.0.0.1"})) require.Nil(t, s.EnsureNode(2, 
&structs.Node{Node: "bar", Address: "127.0.0.2"})) require.Nil(t, s.EnsureNode(3, &structs.Node{Node: "baz", Address: "127.0.0.2"})) + require.Nil(t, s.EnsureNode(4, &structs.Node{Node: "qux", Address: "127.0.0.3"})) // Typical services and some consul services spread across two nodes - require.Nil(t, s.EnsureService(4, "foo", &structs.NodeService{ID: "db", Service: "db", Tags: nil, Address: "", Port: 5000})) - require.Nil(t, s.EnsureService(5, "bar", &structs.NodeService{ID: "api", Service: "api", Tags: nil, Address: "", Port: 5000})) - require.Nil(t, s.EnsureService(6, "foo", &structs.NodeService{ID: "consul", Service: "consul", Tags: nil})) - require.Nil(t, s.EnsureService(7, "bar", &structs.NodeService{ID: "consul", Service: "consul", Tags: nil})) + require.Nil(t, s.EnsureService(5, "foo", &structs.NodeService{ID: "db", Service: "db", Tags: nil, Address: "", Port: 5000})) + require.Nil(t, s.EnsureService(6, "bar", &structs.NodeService{ID: "api", Service: "api", Tags: nil, Address: "", Port: 5000})) + require.Nil(t, s.EnsureService(7, "foo", &structs.NodeService{ID: "consul", Service: "consul", Tags: nil})) + require.Nil(t, s.EnsureService(8, "bar", &structs.NodeService{ID: "consul", Service: "consul", Tags: nil})) + }, + getMembersFunc: func() []serf.Member { + return []serf.Member{ + { + Name: "foo", + Tags: map[string]string{"role": "consul"}, + Status: serf.StatusAlive, + }, + { + Name: "bar", + Tags: map[string]string{"role": "consul"}, + Status: serf.StatusAlive, + }, + { + Name: "baz", + Tags: map[string]string{"role": "node", "segment": "a"}, + Status: serf.StatusAlive, + }, + { + Name: "qux", + Tags: map[string]string{"role": "node", "segment": "b"}, + Status: serf.StatusAlive, + }, + } }, expectedGauges: map[string]metrics.GaugeValue{ "consul.usage.test.consul.state.nodes;datacenter=dc1": { Name: "consul.usage.test.consul.state.nodes", - Value: 3, + Value: 4, Labels: []metrics.Label{{Name: "datacenter", Value: "dc1"}}, }, 
"consul.usage.test.consul.state.services;datacenter=dc1": { @@ -79,6 +121,36 @@ func TestUsageReporter_emitServiceUsage_OSS(t *testing.T) { {Name: "datacenter", Value: "dc1"}, }, }, + "consul.usage.test.consul.members.clients;datacenter=dc1": { + Name: "consul.usage.test.consul.members.clients", + Value: 2, + Labels: []metrics.Label{ + {Name: "datacenter", Value: "dc1"}, + }, + }, + "consul.usage.test.consul.members.servers;datacenter=dc1": { + Name: "consul.usage.test.consul.members.servers", + Value: 2, + Labels: []metrics.Label{ + {Name: "datacenter", Value: "dc1"}, + }, + }, + "consul.usage.test.consul.members.clients;segment=a;datacenter=dc1": { + Name: "consul.usage.test.consul.members.clients", + Value: 1, + Labels: []metrics.Label{ + {Name: "segment", Value: "a"}, + {Name: "datacenter", Value: "dc1"}, + }, + }, + "consul.usage.test.consul.members.clients;segment=b;datacenter=dc1": { + Name: "consul.usage.test.consul.members.clients", + Value: 1, + Labels: []metrics.Label{ + {Name: "segment", Value: "b"}, + {Name: "datacenter", Value: "dc1"}, + }, + }, }, }, } @@ -102,7 +174,8 @@ func TestUsageReporter_emitServiceUsage_OSS(t *testing.T) { new(Config). WithStateProvider(mockStateProvider). WithLogger(testutil.Logger(t)). - WithDatacenter("dc1"), + WithDatacenter("dc1"). 
+ WithGetMembersFunc(tcase.getMembersFunc), ) require.NoError(t, err) diff --git a/agent/consul/usagemetrics/usagemetrics_test.go b/agent/consul/usagemetrics/usagemetrics_test.go index cd34581c6121..1c4be1d5b177 100644 --- a/agent/consul/usagemetrics/usagemetrics_test.go +++ b/agent/consul/usagemetrics/usagemetrics_test.go @@ -11,6 +11,7 @@ import ( "github.com/hashicorp/consul/agent/consul/state" "github.com/hashicorp/consul/agent/structs" "github.com/hashicorp/consul/sdk/testutil" + "github.com/hashicorp/serf/serf" ) type mockStateProvider struct { @@ -25,6 +26,7 @@ func (m *mockStateProvider) State() *state.Store { func TestUsageReporter_Run_Nodes(t *testing.T) { type testCase struct { modfiyStateStore func(t *testing.T, s *state.Store) + getMembersFunc getMembersFunc expectedGauges map[string]metrics.GaugeValue } cases := map[string]testCase{ @@ -36,6 +38,7 @@ func TestUsageReporter_Run_Nodes(t *testing.T) { Labels: []metrics.Label{{Name: "datacenter", Value: "dc1"}}, }, }, + getMembersFunc: func() []serf.Member { return []serf.Member{} }, }, "nodes": { modfiyStateStore: func(t *testing.T, s *state.Store) { @@ -43,12 +46,41 @@ func TestUsageReporter_Run_Nodes(t *testing.T) { require.Nil(t, s.EnsureNode(2, &structs.Node{Node: "bar", Address: "127.0.0.2"})) require.Nil(t, s.EnsureNode(3, &structs.Node{Node: "baz", Address: "127.0.0.2"})) }, + getMembersFunc: func() []serf.Member { + return []serf.Member{ + { + Name: "foo", + Tags: map[string]string{"role": "consul"}, + Status: serf.StatusAlive, + }, + { + Name: "bar", + Tags: map[string]string{"role": "consul"}, + Status: serf.StatusAlive, + }, + { + Name: "baz", + Tags: map[string]string{"role": "node"}, + Status: serf.StatusAlive, + }, + } + }, expectedGauges: map[string]metrics.GaugeValue{ "consul.usage.test.consul.state.nodes;datacenter=dc1": { Name: "consul.usage.test.consul.state.nodes", Value: 3, Labels: []metrics.Label{{Name: "datacenter", Value: "dc1"}}, }, + 
"consul.usage.test.consul.members.clients;datacenter=dc1": { + Name: "consul.usage.test.consul.members.clients", + Value: 1, + Labels: []metrics.Label{{Name: "datacenter", Value: "dc1"}}, + }, + "consul.usage.test.consul.members.servers;datacenter=dc1": { + Name: "consul.usage.test.consul.members.servers", + Value: 2, + Labels: []metrics.Label{{Name: "datacenter", Value: "dc1"}}, + }, }, }, } @@ -73,7 +105,8 @@ func TestUsageReporter_Run_Nodes(t *testing.T) { new(Config). WithStateProvider(mockStateProvider). WithLogger(testutil.Logger(t)). - WithDatacenter("dc1"), + WithDatacenter("dc1"). + WithGetMembersFunc(tcase.getMembersFunc), ) require.NoError(t, err) diff --git a/website/content/docs/agent/telemetry.mdx b/website/content/docs/agent/telemetry.mdx index 241e50d1cfb9..800eedad7122 100644 --- a/website/content/docs/agent/telemetry.mdx +++ b/website/content/docs/agent/telemetry.mdx @@ -11,7 +11,7 @@ description: >- The Consul agent collects various runtime metrics about the performance of different libraries and subsystems. These metrics are aggregated on a ten -second (10s) interval and are retained for one minute. An _interval_ is the period of time between instances of data being collected and aggregated. +second (10s) interval and are retained for one minute. An _interval_ is the period of time between instances of data being collected and aggregated. When telemetry is being streamed to an external metrics store, the interval is defined to be that store's flush interval. @@ -96,7 +96,7 @@ These are some metrics emitted that can help you understand the health of your c **Why it's important:** Autopilot can expose the overall health of your cluster with a simple boolean. -**What to look for:** Alert if `healthy` is 0. Some other indicators of an unhealthy cluster would be: +**What to look for:** Alert if `healthy` is 0. 
Some other indicators of an unhealthy cluster would be: - `consul.raft.commitTime` - This can help reflect the speed of state store changes being performed by the agent. If this number is rising, the server may be experiencing an issue due to degraded resources on the host. @@ -193,6 +193,8 @@ This is a full list of metrics emitted by Consul. | `consul.state.nodes` | Measures the current number of nodes registered with Consul. It is only emitted by Consul servers. Added in v1.9.0. | number of objects | gauge | | `consul.state.services` | Measures the current number of unique services registered with Consul, based on service name. It is only emitted by Consul servers. Added in v1.9.0. | number of objects | gauge | | `consul.state.service_instances` | Measures the current number of unique service instances registered with Consul. It is only emitted by Consul servers. Added in v1.9.0. | number of objects | gauge | +| `consul.members.clients` | Measures the current number of client agents registered with Consul. It is only emitted by Consul servers. Added in v1.9.6. | number of clients | gauge | +| `consul.members.servers` | Measures the current number of server agents registered with Consul. It is only emitted by Consul servers. Added in v1.9.6. | number of servers | gauge | | `consul.dns.stale_queries` | Increments when an agent serves a query within the allowed stale threshold. | queries | counter | | `consul.dns.ptr_query.` | Measures the time spent handling a reverse DNS query for the given node. | ms | timer | | `consul.dns.domain_query.` | Measures the time spent handling a domain query for the given node. | ms | timer |