Skip to content

Commit

Permalink
usagemetrics: add cluster members to metrics API (#10340)
Browse files Browse the repository at this point in the history
This PR adds cluster members to the metrics API. The number of members per
segment are reported as well as the total number of members.

Tested by running a multi-node cluster locally and ensuring the numbers were
correct. Also added unit test coverage to add the new expected gauges to
existing test cases.
  • Loading branch information
Paul Ewing authored and hc-github-team-consul-core committed Jun 3, 2021
1 parent d50875f commit a9c2f6a
Show file tree
Hide file tree
Showing 6 changed files with 218 additions and 13 deletions.
3 changes: 3 additions & 0 deletions .changelog/10340.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
```release-note:improvement
telemetry: The usage data in the `metrics` API now includes cluster member counts, reporting clients on a per segment basis.
```
12 changes: 10 additions & 2 deletions agent/consul/server.go
Original file line number Diff line number Diff line change
Expand Up @@ -569,7 +569,15 @@ func NewServer(config *Config, flat Deps) (*Server, error) {
WithStateProvider(s.fsm).
WithLogger(s.logger).
WithDatacenter(s.config.Datacenter).
WithReportingInterval(s.config.MetricsReportingInterval),
WithReportingInterval(s.config.MetricsReportingInterval).
WithGetMembersFunc(func() []serf.Member {
members, err := s.LANMembersAllSegments()
if err != nil {
return []serf.Member{}
}

return members
}),
)
if err != nil {
s.Shutdown()
Expand Down Expand Up @@ -1138,7 +1146,7 @@ func (s *Server) LANMembers() []serf.Member {
return s.serfLAN.Members()
}

// WANMembers is used to return the members of the LAN cluster
// WANMembers is used to return the members of the WAN cluster
func (s *Server) WANMembers() []serf.Member {
if s.serfWAN == nil {
return nil
Expand Down
86 changes: 86 additions & 0 deletions agent/consul/usagemetrics/usagemetrics.go
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@ import (
"github.com/hashicorp/consul/agent/consul/state"
"github.com/hashicorp/consul/logging"
"github.com/hashicorp/go-hclog"
"github.com/hashicorp/serf/serf"
)

var Gauges = []prometheus.GaugeDefinition{
Expand All @@ -26,15 +27,26 @@ var Gauges = []prometheus.GaugeDefinition{
Name: []string{"consul", "state", "service_instances"},
Help: "Measures the current number of unique services registered with Consul, based on service name. It is only emitted by Consul servers. Added in v1.9.0.",
},
{
Name: []string{"consul", "members", "clients"},
Help: "Measures the current number of client agents registered with Consul. It is only emitted by Consul servers. Added in v1.9.6.",
},
{
Name: []string{"consul", "members", "servers"},
Help: "Measures the current number of server agents registered with Consul. It is only emitted by Consul servers. Added in v1.9.6.",
},
}

type getMembersFunc func() []serf.Member

// Config holds the settings for various parameters for the
// UsageMetricsReporter
type Config struct {
logger hclog.Logger
metricLabels []metrics.Label
stateProvider StateProvider
tickerInterval time.Duration
getMembersFunc getMembersFunc
}

// WithDatacenter adds the datacenter as a label to all metrics emitted by the
Expand Down Expand Up @@ -63,6 +75,12 @@ func (c *Config) WithStateProvider(sp StateProvider) *Config {
return c
}

// WithGetMembersFunc specifies the function used to identify cluster members
func (c *Config) WithGetMembersFunc(fn getMembersFunc) *Config {
c.getMembersFunc = fn
return c
}

// StateProvider defines an inteface for retrieving a state.Store handle. In
// non-test code, this is satisfied by the fsm.FSM struct.
type StateProvider interface {
Expand All @@ -77,13 +95,18 @@ type UsageMetricsReporter struct {
metricLabels []metrics.Label
stateProvider StateProvider
tickerInterval time.Duration
getMembersFunc getMembersFunc
}

func NewUsageMetricsReporter(cfg *Config) (*UsageMetricsReporter, error) {
if cfg.stateProvider == nil {
return nil, errors.New("must provide a StateProvider to usage reporter")
}

if cfg.getMembersFunc == nil {
return nil, errors.New("must provide a getMembersFunc to usage reporter")
}

if cfg.logger == nil {
cfg.logger = hclog.NewNullLogger()
}
Expand All @@ -98,6 +121,7 @@ func NewUsageMetricsReporter(cfg *Config) (*UsageMetricsReporter, error) {
stateProvider: cfg.stateProvider,
metricLabels: cfg.metricLabels,
tickerInterval: cfg.tickerInterval,
getMembersFunc: cfg.getMembersFunc,
}

return u, nil
Expand Down Expand Up @@ -137,4 +161,66 @@ func (u *UsageMetricsReporter) runOnce() {
}

u.emitServiceUsage(serviceUsage)

servers, clients := u.memberUsage()
u.emitMemberUsage(servers, clients)
}

func (u *UsageMetricsReporter) memberUsage() (int, map[string]int) {
if u.getMembersFunc == nil {
return 0, nil
}

mems := u.getMembersFunc()
if len(mems) <= 0 {
u.logger.Warn("cluster reported zero members")
return 0, nil
}

servers := 0
clients := make(map[string]int)

for _, m := range mems {
if m.Status != serf.StatusAlive {
continue
}

switch m.Tags["role"] {
case "node":
clients[m.Tags["segment"]]++
case "consul":
servers++
}
}

return servers, clients
}

func (u *UsageMetricsReporter) emitMemberUsage(servers int, clients map[string]int) {
totalClients := 0

for seg, c := range clients {
segmentLabel := metrics.Label{Name: "segment", Value: seg}
labels := append([]metrics.Label{segmentLabel}, u.metricLabels...)

metrics.SetGaugeWithLabels(
[]string{"consul", "members", "clients"},
float32(c),
labels,
)

totalClients += c
}

metrics.SetGaugeWithLabels(
[]string{"consul", "members", "clients"},
float32(totalClients),
u.metricLabels,
)

metrics.SetGaugeWithLabels(
[]string{"consul", "members", "servers"},
float32(servers),
u.metricLabels,
)
}
85 changes: 79 additions & 6 deletions agent/consul/usagemetrics/usagemetrics_oss_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@ import (
"github.com/hashicorp/consul/agent/consul/state"
"github.com/hashicorp/consul/agent/structs"
"github.com/hashicorp/consul/sdk/testutil"
"github.com/hashicorp/serf/serf"
)

func newStateStore() (*state.Store, error) {
Expand All @@ -21,6 +22,7 @@ func newStateStore() (*state.Store, error) {
func TestUsageReporter_emitServiceUsage_OSS(t *testing.T) {
type testCase struct {
modfiyStateStore func(t *testing.T, s *state.Store)
getMembersFunc getMembersFunc
expectedGauges map[string]metrics.GaugeValue
}
cases := map[string]testCase{
Expand All @@ -45,24 +47,64 @@ func TestUsageReporter_emitServiceUsage_OSS(t *testing.T) {
{Name: "datacenter", Value: "dc1"},
},
},
"consul.usage.test.consul.members.clients;datacenter=dc1": {
Name: "consul.usage.test.consul.members.clients",
Value: 0,
Labels: []metrics.Label{
{Name: "datacenter", Value: "dc1"},
},
},
"consul.usage.test.consul.members.servers;datacenter=dc1": {
Name: "consul.usage.test.consul.members.servers",
Value: 0,
Labels: []metrics.Label{
{Name: "datacenter", Value: "dc1"},
},
},
},
getMembersFunc: func() []serf.Member { return []serf.Member{} },
},
"nodes-and-services": {
modfiyStateStore: func(t *testing.T, s *state.Store) {
require.Nil(t, s.EnsureNode(1, &structs.Node{Node: "foo", Address: "127.0.0.1"}))
require.Nil(t, s.EnsureNode(2, &structs.Node{Node: "bar", Address: "127.0.0.2"}))
require.Nil(t, s.EnsureNode(3, &structs.Node{Node: "baz", Address: "127.0.0.2"}))
require.Nil(t, s.EnsureNode(4, &structs.Node{Node: "qux", Address: "127.0.0.3"}))

// Typical services and some consul services spread across two nodes
require.Nil(t, s.EnsureService(4, "foo", &structs.NodeService{ID: "db", Service: "db", Tags: nil, Address: "", Port: 5000}))
require.Nil(t, s.EnsureService(5, "bar", &structs.NodeService{ID: "api", Service: "api", Tags: nil, Address: "", Port: 5000}))
require.Nil(t, s.EnsureService(6, "foo", &structs.NodeService{ID: "consul", Service: "consul", Tags: nil}))
require.Nil(t, s.EnsureService(7, "bar", &structs.NodeService{ID: "consul", Service: "consul", Tags: nil}))
require.Nil(t, s.EnsureService(5, "foo", &structs.NodeService{ID: "db", Service: "db", Tags: nil, Address: "", Port: 5000}))
require.Nil(t, s.EnsureService(6, "bar", &structs.NodeService{ID: "api", Service: "api", Tags: nil, Address: "", Port: 5000}))
require.Nil(t, s.EnsureService(7, "foo", &structs.NodeService{ID: "consul", Service: "consul", Tags: nil}))
require.Nil(t, s.EnsureService(8, "bar", &structs.NodeService{ID: "consul", Service: "consul", Tags: nil}))
},
getMembersFunc: func() []serf.Member {
return []serf.Member{
{
Name: "foo",
Tags: map[string]string{"role": "consul"},
Status: serf.StatusAlive,
},
{
Name: "bar",
Tags: map[string]string{"role": "consul"},
Status: serf.StatusAlive,
},
{
Name: "baz",
Tags: map[string]string{"role": "node", "segment": "a"},
Status: serf.StatusAlive,
},
{
Name: "qux",
Tags: map[string]string{"role": "node", "segment": "b"},
Status: serf.StatusAlive,
},
}
},
expectedGauges: map[string]metrics.GaugeValue{
"consul.usage.test.consul.state.nodes;datacenter=dc1": {
Name: "consul.usage.test.consul.state.nodes",
Value: 3,
Value: 4,
Labels: []metrics.Label{{Name: "datacenter", Value: "dc1"}},
},
"consul.usage.test.consul.state.services;datacenter=dc1": {
Expand All @@ -79,6 +121,36 @@ func TestUsageReporter_emitServiceUsage_OSS(t *testing.T) {
{Name: "datacenter", Value: "dc1"},
},
},
"consul.usage.test.consul.members.clients;datacenter=dc1": {
Name: "consul.usage.test.consul.members.clients",
Value: 2,
Labels: []metrics.Label{
{Name: "datacenter", Value: "dc1"},
},
},
"consul.usage.test.consul.members.servers;datacenter=dc1": {
Name: "consul.usage.test.consul.members.servers",
Value: 2,
Labels: []metrics.Label{
{Name: "datacenter", Value: "dc1"},
},
},
"consul.usage.test.consul.members.clients;segment=a;datacenter=dc1": {
Name: "consul.usage.test.consul.members.clients",
Value: 1,
Labels: []metrics.Label{
{Name: "segment", Value: "a"},
{Name: "datacenter", Value: "dc1"},
},
},
"consul.usage.test.consul.members.clients;segment=b;datacenter=dc1": {
Name: "consul.usage.test.consul.members.clients",
Value: 1,
Labels: []metrics.Label{
{Name: "segment", Value: "b"},
{Name: "datacenter", Value: "dc1"},
},
},
},
},
}
Expand All @@ -102,7 +174,8 @@ func TestUsageReporter_emitServiceUsage_OSS(t *testing.T) {
new(Config).
WithStateProvider(mockStateProvider).
WithLogger(testutil.Logger(t)).
WithDatacenter("dc1"),
WithDatacenter("dc1").
WithGetMembersFunc(tcase.getMembersFunc),
)
require.NoError(t, err)

Expand Down
35 changes: 34 additions & 1 deletion agent/consul/usagemetrics/usagemetrics_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@ import (
"github.com/hashicorp/consul/agent/consul/state"
"github.com/hashicorp/consul/agent/structs"
"github.com/hashicorp/consul/sdk/testutil"
"github.com/hashicorp/serf/serf"
)

type mockStateProvider struct {
Expand All @@ -25,6 +26,7 @@ func (m *mockStateProvider) State() *state.Store {
func TestUsageReporter_Run_Nodes(t *testing.T) {
type testCase struct {
modfiyStateStore func(t *testing.T, s *state.Store)
getMembersFunc getMembersFunc
expectedGauges map[string]metrics.GaugeValue
}
cases := map[string]testCase{
Expand All @@ -36,19 +38,49 @@ func TestUsageReporter_Run_Nodes(t *testing.T) {
Labels: []metrics.Label{{Name: "datacenter", Value: "dc1"}},
},
},
getMembersFunc: func() []serf.Member { return []serf.Member{} },
},
"nodes": {
modfiyStateStore: func(t *testing.T, s *state.Store) {
require.Nil(t, s.EnsureNode(1, &structs.Node{Node: "foo", Address: "127.0.0.1"}))
require.Nil(t, s.EnsureNode(2, &structs.Node{Node: "bar", Address: "127.0.0.2"}))
require.Nil(t, s.EnsureNode(3, &structs.Node{Node: "baz", Address: "127.0.0.2"}))
},
getMembersFunc: func() []serf.Member {
return []serf.Member{
{
Name: "foo",
Tags: map[string]string{"role": "consul"},
Status: serf.StatusAlive,
},
{
Name: "bar",
Tags: map[string]string{"role": "consul"},
Status: serf.StatusAlive,
},
{
Name: "baz",
Tags: map[string]string{"role": "node"},
Status: serf.StatusAlive,
},
}
},
expectedGauges: map[string]metrics.GaugeValue{
"consul.usage.test.consul.state.nodes;datacenter=dc1": {
Name: "consul.usage.test.consul.state.nodes",
Value: 3,
Labels: []metrics.Label{{Name: "datacenter", Value: "dc1"}},
},
"consul.usage.test.consul.members.clients;datacenter=dc1": {
Name: "consul.usage.test.consul.members.clients",
Value: 1,
Labels: []metrics.Label{{Name: "datacenter", Value: "dc1"}},
},
"consul.usage.test.consul.members.servers;datacenter=dc1": {
Name: "consul.usage.test.consul.members.servers",
Value: 2,
Labels: []metrics.Label{{Name: "datacenter", Value: "dc1"}},
},
},
},
}
Expand All @@ -73,7 +105,8 @@ func TestUsageReporter_Run_Nodes(t *testing.T) {
new(Config).
WithStateProvider(mockStateProvider).
WithLogger(testutil.Logger(t)).
WithDatacenter("dc1"),
WithDatacenter("dc1").
WithGetMembersFunc(tcase.getMembersFunc),
)
require.NoError(t, err)

Expand Down
Loading

0 comments on commit a9c2f6a

Please sign in to comment.