Skip to content

Commit

Permalink
NET-10124: Add metrics endpoint and labels to sync catalog deployment (
Browse files Browse the repository at this point in the history
…#4212)

add initial ideas
  • Loading branch information
NiniOak authored Aug 29, 2024
1 parent aef66b6 commit a02fd53
Show file tree
Hide file tree
Showing 17 changed files with 564 additions and 205 deletions.
3 changes: 3 additions & 0 deletions .changelog/4212.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
```release-note:enhancement
sync-catalog: expose prometheus scrape metrics on sync-catalog pods
```
22 changes: 22 additions & 0 deletions charts/consul/templates/sync-catalog-deployment.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -61,6 +61,13 @@ spec:
"vault.hashicorp.com/namespace": "{{ .Values.global.secretsBackend.vault.vaultNamespace }}"
{{- end }}
{{- end }}
{{- if .Values.syncCatalog.metrics.enabled | default .Values.global.metrics.enabled }}
"prometheus.io/scrape": "true"
{{- if not (hasKey (default "" .Values.syncCatalog.annotations | fromYaml) "prometheus.io/path")}}
"prometheus.io/path": {{ default "/metrics" .Values.syncCatalog.metrics.path }}
{{- end }}
"prometheus.io/port": {{ .Values.syncCatalog.metrics.port | default "20300" | quote }}
{{- end }}
spec:
serviceAccountName: {{ template "consul.fullname" . }}-sync-catalog
volumes:
Expand Down Expand Up @@ -196,6 +203,16 @@ spec:
{{- if .Values.syncCatalog.syncLoadBalancerEndpoints }}
-sync-lb-services-endpoints=true \
{{- end }}
{{- if .Values.syncCatalog.metrics.enabled | default .Values.global.metrics.enabled }}
-enable-metrics \
{{- end }}
{{- if .Values.syncCatalog.metrics.path }}
-metrics-path={{ .Values.syncCatalog.metrics.path }} \
{{- end }}
{{- if .Values.syncCatalog.metrics.port }}
-metrics-port={{ .Values.syncCatalog.metrics.port }} \
{{- end }}
-prometheus-retention-time={{ .Values.global.metrics.agentMetricsRetentionTime }} \
livenessProbe:
httpGet:
path: /health/ready
Expand All @@ -220,6 +237,11 @@ spec:
resources:
{{- toYaml . | nindent 10 }}
{{- end }}
{{- if or (eq (.Values.syncCatalog.metrics.enabled | toString) "-") .Values.syncCatalog.metrics.enabled .Values.global.metrics.enabled }}
ports:
- name: prometheus
containerPort: {{ .Values.syncCatalog.metrics.port | default "20300" | int }}
{{- end }}
{{- if .Values.syncCatalog.priorityClassName }}
priorityClassName: {{ .Values.syncCatalog.priorityClassName | quote }}
{{- end }}
Expand Down
22 changes: 22 additions & 0 deletions charts/consul/test/unit/sync-catalog-deployment.bats
Original file line number Diff line number Diff line change
Expand Up @@ -1025,6 +1025,28 @@ load _helpers
[ "${actual}" = "bar" ]
}
# Renders the sync-catalog deployment with syncCatalog.enabled and
# syncCatalog.metrics.enabled set to true, extracts the pod template
# annotations (with the connect-inject and mesh-inject annotations removed),
# and asserts that each prometheus.io scrape annotation is present.
@test "syncCatalog/Deployment: metrics annotations can be set" {
cd `chart_dir`
# The intermediate output is copied to stderr at both stages of the pipeline.
local object=$(helm template \
-s templates/sync-catalog-deployment.yaml \
--set 'syncCatalog.enabled=true' \
--set 'syncCatalog.metrics.enabled=true' \
. | tee /dev/stderr |
yq -r '.spec.template.metadata.annotations |
del(."consul.hashicorp.com/connect-inject") |
del(."consul.hashicorp.com/mesh-inject")' |
tee /dev/stderr)
# Annotations to check
annotations=("prometheus.io/scrape" "prometheus.io/path" "prometheus.io/port")
# Check each annotation
for annotation in "${annotations[@]}"; do
# Only the presence of the key is checked, not its value.
actual=$(echo "$object" | yq -r "has(\"$annotation\")")
[ "$actual" = "true" ]
done
}
#--------------------------------------------------------------------
# logLevel
Expand Down
13 changes: 13 additions & 0 deletions charts/consul/values.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -2203,6 +2203,19 @@ syncCatalog:
# If false, LoadBalancer endpoints are not synced to Consul.
syncLoadBalancerEndpoints: false

# Metrics settings for syncCatalog
metrics:
# This value enables or disables metrics collection for registered services, overriding the global metrics collection settings.
# @type: boolean
enabled: false
# This value sets the port to use for scraping syncCatalog metrics via prometheus, defaults to 20300 if not set. Must be in the port
# range of 1024-65535.
# @type: int
port: null
# This value sets the path to use for scraping syncCatalog metrics via prometheus, defaults to /metrics if not set.
# @type: string
path: null

ingress:
# Syncs the hostname from a Kubernetes Ingress resource to service registrations
# when a rule matched a service. Currently only supports host based routing and
Expand Down
65 changes: 65 additions & 0 deletions control-plane/catalog/metrics/metrics.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,65 @@
package metrics

import (
"strconv"

"github.com/armon/go-metrics"
metricsutil "github.com/hashicorp/consul-k8s/control-plane/subcommand/common"
)

// Fallback scrape settings used when no usable value is supplied.
const (
	// defaultScrapePort is the port returned by syncCatalogMetricsPort when the
	// requested port cannot be parsed or is outside the accepted range.
	defaultScrapePort = 20300

	// defaultScrapePath is the path returned by syncCatalogMetricsPath when no
	// scrape path is set.
	defaultScrapePath = "/metrics"
)

// Config carries the metrics settings for the sync-catalog process. The values
// originate in the helm chart and reach the controller as command-line flags.
type Config struct {
	// EnableSyncCatalogMetrics reports whether sync-catalog metrics should be
	// enabled by default on a deployed consul-sync-catalog.
	EnableSyncCatalogMetrics bool

	// DefaultPrometheusScrapePath is the default path used for scraping
	// prometheus metrics.
	DefaultPrometheusScrapePath string

	// DefaultPrometheusScrapePort is the default port used for scraping
	// prometheus metrics.
	DefaultPrometheusScrapePort int

	// PrometheusMetricsRetentionTime configures how long metrics are retained
	// in the metrics store.
	PrometheusMetricsRetentionTime string
}

// syncCatalogMetricsPort converts portString into the port used for scraping
// sync-catalog metrics. It returns defaultScrapePort when portString is not a
// valid integer or when the parsed value lies outside the range 1024-65535.
func syncCatalogMetricsPort(portString string) int {
	const (
		lowestAllowedPort  = 1024
		highestAllowedPort = 65535
	)

	requested, err := strconv.Atoi(portString)
	switch {
	case err != nil:
		// Not parseable as an integer: use the default.
		return defaultScrapePort
	case requested < lowestAllowedPort, requested > highestAllowedPort:
		// Privileged or otherwise out-of-range port: use the default.
		return defaultScrapePort
	}

	return requested
}

// syncCatalogMetricsPath resolves the path used for scraping sync-catalog
// metrics. The candidate is handed to metricsutil.GetScrapePath; when that
// helper reports the path as set, the value it returns is used, otherwise
// defaultScrapePath is returned.
func syncCatalogMetricsPath(path string) string {
	scrapePath, isSet := metricsutil.GetScrapePath(path)
	if !isSet {
		// No usable path was supplied: fall back to the built-in default.
		return defaultScrapePath
	}

	return scrapePath
}

// SyncCatalogMetricsConfig assembles a Config from raw settings.
//
// enableMetrics is copied verbatim. metricsPort and metricsPath are normalised
// by syncCatalogMetricsPort and syncCatalogMetricsPath, which substitute the
// package defaults for unusable values. PrometheusMetricsRetentionTime is not
// populated here and keeps its zero value.
func SyncCatalogMetricsConfig(enableMetrics bool, metricsPort, metricsPath string) Config {
	var cfg Config

	cfg.EnableSyncCatalogMetrics = enableMetrics
	cfg.DefaultPrometheusScrapePort = syncCatalogMetricsPort(metricsPort)
	cfg.DefaultPrometheusScrapePath = syncCatalogMetricsPath(metricsPath)

	return cfg
}

// ServiceNameLabel returns a single-element label set that tags a metric with
// the given service name under the "service_name" key.
func ServiceNameLabel(serviceName string) []metrics.Label {
	nameLabel := metrics.Label{Name: "service_name", Value: serviceName}
	return []metrics.Label{nameLabel}
}
6 changes: 6 additions & 0 deletions control-plane/catalog/to-consul/resource.go
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@ import (
"sync"

mapset "github.com/deckarep/golang-set"
"github.com/hashicorp/consul-k8s/control-plane/catalog/metrics"
"github.com/hashicorp/consul-k8s/control-plane/helper/controller"
"github.com/hashicorp/consul-k8s/control-plane/helper/parsetags"
"github.com/hashicorp/consul-k8s/control-plane/namespaces"
Expand Down Expand Up @@ -102,6 +103,11 @@ type ServiceResource struct {
// LoadBalancerEndpointsSync set to true (default false) will sync ServiceTypeLoadBalancer endpoints.
LoadBalancerEndpointsSync bool

// MetricsConfig contains the metrics configuration: whether sync-catalog metrics are enabled,
// the default prometheus scrape path and port, and the metrics retention time. The syncCatalog
// uses this to configure prometheus annotations.
MetricsConfig metrics.Config

// NodeExternalIPSync set to true (the default) syncs NodePort services
// using the node's external ip address. When false, the node's internal
// ip address will be used instead.
Expand Down
82 changes: 82 additions & 0 deletions control-plane/catalog/to-consul/syncer.go
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,8 @@ import (
"sync"
"time"

"github.com/armon/go-metrics"
"github.com/armon/go-metrics/prometheus"
"github.com/cenkalti/backoff"
mapset "github.com/deckarep/golang-set"
"github.com/hashicorp/consul-k8s/control-plane/consul"
Expand All @@ -17,6 +19,41 @@ import (
"github.com/hashicorp/go-hclog"
)

// Metric name components for the catalog sync (Kubernetes to Consul) metrics.
// Each name is spelled out in full so that no two names share a backing array.
var (
	// baseName is the prefix common to every metric name below.
	baseName = []string{"consul", "sync_catalog", "to_consul"}

	// Counters for successful registrations and deregistrations.
	registerName   = []string{"consul", "sync_catalog", "to_consul", "register"}
	deregisterName = []string{"consul", "sync_catalog", "to_consul", "deregister"}

	// Counters for failed registrations and deregistrations.
	registerErrorName   = []string{"consul", "sync_catalog", "to_consul", "register", "error"}
	deregisterErrorName = []string{"consul", "sync_catalog", "to_consul", "deregister", "error"}

	// Gauge name for the sync status.
	syncCatalogStatus = []string{"consul", "sync_catalog", "to_consul", "status"}
)

// SyncToConsulCounters holds the prometheus counter definitions for the
// catalog sync from Kubernetes to Consul: one counter each for successful
// registrations and deregistrations, and one each for register and deregister
// requests that returned an error.
//
// NOTE(review): presumably these definitions are handed to the prometheus sink
// when it is constructed — confirm at the call site.
var SyncToConsulCounters = []prometheus.CounterDefinition{
	{
		Name: registerName,
		Help: "Increments for each service instance registered to Consul via catalog sync",
	},
	{
		Name: deregisterName,
		Help: "Increments for each service deregistered from Consul via catalog sync",
	},
	{
		Name: registerErrorName,
		Help: "Increments whenever a Consul API client returns an error for a catalog sync register request",
	},
	{
		Name: deregisterErrorName,
		// Help text previously ended in a duplicated word ("request request").
		Help: "Increments whenever a Consul API client returns an error for a catalog sync deregister request",
	},
}

// SyncCatalogGauge holds the prometheus gauge definition for the catalog sync
// status: 1 for connected, 0 for disconnected.
var SyncCatalogGauge = []prometheus.GaugeDefinition{{
	Name: syncCatalogStatus,
	Help: "Status of the Consul Client endpoint. 1 for connected, 0 for disconnected",
}}

const (
// ConsulSyncPeriod is how often the syncer will attempt to
// reconcile the expected service states with the remote Consul server.
Expand Down Expand Up @@ -101,6 +138,8 @@ type ConsulSyncer struct {
// watchers is all namespaces mapped to a map of Consul service
// names mapped to a cancel function for watcher routines
watchers map[string]map[string]context.CancelFunc

PrometheusSink *prometheus.PrometheusSink
}

// Sync implements Syncer.
Expand Down Expand Up @@ -433,14 +472,30 @@ func (s *ConsulSyncer) syncFull(ctx context.Context) {
"node-name", r.Node,
"service-id", r.ServiceID,
"service-consul-namespace", r.Namespace)

_, err = consulClient.Catalog().Deregister(r, nil)
if err != nil {
// metric count for error deregistering k8s services from Consul
labels := []metrics.Label{
{Name: "error", Value: err.Error()},
}
s.PrometheusSink.IncrCounterWithLabels(deregisterErrorName, 1, labels)

s.Log.Warn("error deregistering service",
"node-name", r.Node,
"service-id", r.ServiceID,
"service-consul-namespace", r.Namespace,
"err", err)
continue
}

// metric count for deregistering k8s services from Consul
labels := []metrics.Label{
{Name: "id", Value: r.ServiceID},
{Name: "node", Value: r.Node},
{Name: "namespace", Value: r.Namespace},
}
s.PrometheusSink.IncrCounterWithLabels(deregisterName, 1, labels)
}

// Always clear deregistrations, they'll repopulate if we had errors
Expand All @@ -465,6 +520,14 @@ func (s *ConsulSyncer) syncFull(ctx context.Context) {
// Register the service.
_, err = consulClient.Catalog().Register(r, nil)
if err != nil {
// metric count for error syncing K8S services to Consul
label := []metrics.Label{
{Name: "error", Value: err.Error()},
}
s.PrometheusSink.IncrCounterWithLabels(registerErrorName, 1, label)
// Set to 0 if the endpoint is down or returns an error
s.PrometheusSink.SetGauge(syncCatalogStatus, 0)

s.Log.Warn("error registering service",
"node-name", r.Node,
"service-name", r.Service.Service,
Expand All @@ -478,6 +541,25 @@ func (s *ConsulSyncer) syncFull(ctx context.Context) {
"service-name", r.Service.Service,
"consul-namespace-name", r.Service.Namespace,
"service", r.Service)

// metric count and service metadata syncing k8s services to Consul
labels := []metrics.Label{
{Name: "id", Value: r.Service.ID},
{Name: "service", Value: r.Service.Service},
{Name: "node", Value: r.Node},
{Name: "namespace", Value: r.Service.Namespace},
{Name: "datacenter", Value: r.Datacenter},
}

if val, exists := r.Service.Meta["external-k8s-ref-name"]; exists && val != "" {
labels = append(labels, metrics.Label{Name: "external_k8s_ref_name", Value: val})
}
if r.Check != nil {
labels = append(labels, metrics.Label{Name: "status", Value: r.Check.Status})
}
s.PrometheusSink.IncrCounterWithLabels(registerName, 1, labels)
// Set to 1 if the endpoint is healthy
s.PrometheusSink.SetGauge(syncCatalogStatus, 1)
}
}
}
Expand Down
2 changes: 2 additions & 0 deletions control-plane/catalog/to-consul/syncer_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@ import (
"testing"
"time"

"github.com/armon/go-metrics/prometheus"
"github.com/hashicorp/consul/api"
"github.com/hashicorp/consul/sdk/testutil/retry"
"github.com/hashicorp/go-hclog"
Expand Down Expand Up @@ -286,6 +287,7 @@ func testConsulSyncerWithConfig(testClient *test.TestServerClient, configurator
ServicePollPeriod: 50 * time.Millisecond,
ConsulK8STag: TestConsulK8STag,
ConsulNodeName: ConsulSyncNodeName,
PrometheusSink: &prometheus.PrometheusSink{},
}
configurator(s)
s.init()
Expand Down
Loading

0 comments on commit a02fd53

Please sign in to comment.