From b138371ba58cb1566f9ea2ad1d0ad8217e962801 Mon Sep 17 00:00:00 2001 From: Jane Quintero Date: Thu, 22 Apr 2021 09:31:19 -0700 Subject: [PATCH 1/6] add various metrics --- docs/pages/metrics-logs-reference.mdx | 7 +++++++ lib/auth/auth.go | 18 +++++++++++++++++- lib/auth/grpcserver.go | 19 +++++++++++++++++-- lib/auth/methods.go | 1 + lib/auth/sessions.go | 2 +- lib/cache/cache.go | 17 +++++++++++++++++ lib/services/server.go | 10 ++++++++++ lib/srv/authhandlers.go | 24 ++++++++++++++++++++++++ lib/srv/regular/proxy.go | 15 +++++++++++++++ lib/srv/regular/sshserver.go | 15 +++++++++++++++ lib/sshutils/server.go | 17 +++++++++++++++++ metrics.go | 27 +++++++++++++++++++++++++++ 12 files changed, 168 insertions(+), 4 deletions(-) diff --git a/docs/pages/metrics-logs-reference.mdx b/docs/pages/metrics-logs-reference.mdx index 2bec15ec08728..aa66cc156a567 100644 --- a/docs/pages/metrics-logs-reference.mdx +++ b/docs/pages/metrics-logs-reference.mdx @@ -106,3 +106,10 @@ Now you can see the monitoring information by visiting several endpoints: | `auth_generate_requests_throttled_total` | counter | Teleport Auth | Number of throttled requests to generate new server keys | | `auth_generate_requests_total` | counter | Teleport Auth | Number of requests to generate new server keys | | `auth_generate_seconds` | `histogram` | Teleport Auth | Latency for generate requests | +| `cluster_name_not_found_total` | counter | Teleport Auth | Number of times a cluster was not found | +| `heartbeat_connections_received_total` | counter | Teleport Auth | Number of times auth received a heartbeat connection | +| `user_login_total` | counter | Teleport Auth | Number of user logins | +| `failed_connect_to_node_attempts_total` | counter | Teleport Proxy | Number of times a user failed connecting to a node | +| `proxy_connection_limit_exceeded_total` | counter | Teleport Proxy | Number of connections that exceeded the proxy connection limit | +| `certificate_mismatch_total` | counter | 
Teleport Proxy | Number of times there was a certificate mismatch | +| `user_max_concurrent_sessions_hit_total` | counter | Teleport Node | Number of times a user exceeded their concurrent session limit | diff --git a/lib/auth/auth.go b/lib/auth/auth.go index 1fbe0f8219c8d..167a850facca8 100644 --- a/lib/auth/auth.go +++ b/lib/auth/auth.go @@ -201,6 +201,20 @@ var ( Buckets: prometheus.ExponentialBuckets(0.001, 2, 16), }, ) + // UserLoginCount counts user logins + UserLoginCount = prometheus.NewCounter( + prometheus.CounterOpts{ + Name: teleport.MetricUserLoginCount, + Help: "Number of times there was a user login", + }, + ) + + heartbeatsMissedByAuth = prometheus.NewGauge( + prometheus.GaugeOpts{ + Name: teleport.MetricHeartbeatsMissed, + Help: "Number of hearbeats missed by auth server", + }, + ) ) // Server keeps the cluster together. It acts as a certificate authority (CA) for @@ -1662,7 +1676,7 @@ func (a *Server) NewWebSession(req types.NewWebSessionRequest) (services.WebSess BearerTokenExpires: startTime.UTC().Add(bearerTokenTTL), LoginTime: req.LoginTime, } - + UserLoginCount.Inc() return services.NewWebSession(token, services.KindWebSession, services.KindWebSession, sessionSpec), nil } @@ -2537,4 +2551,6 @@ func init() { prometheus.MustRegister(generateThrottledRequestsCount) prometheus.MustRegister(generateRequestsCurrent) prometheus.MustRegister(generateRequestsLatencies) + prometheus.MustRegister(UserLoginCount) + prometheus.MustRegister(heartbeatsMissedByAuth) } diff --git a/lib/auth/grpcserver.go b/lib/auth/grpcserver.go index f46e83bb02478..68cd29099e59b 100644 --- a/lib/auth/grpcserver.go +++ b/lib/auth/grpcserver.go @@ -35,10 +35,11 @@ import ( "github.com/gravitational/teleport/lib/services" "github.com/gravitational/teleport/lib/session" "github.com/gravitational/teleport/lib/utils" - - "github.com/golang/protobuf/ptypes/empty" "github.com/gravitational/trace" "github.com/gravitational/trace/trail" + + "github.com/golang/protobuf/ptypes/empty" + 
"github.com/prometheus/client_golang/prometheus" "github.com/sirupsen/logrus" "google.golang.org/grpc" "google.golang.org/grpc/codes" @@ -66,6 +67,19 @@ func (g *GRPCServer) GetServer() (*grpc.Server, error) { return g.server, nil } +var ( + heartbeatConnectionsReceived = prometheus.NewCounter( + prometheus.CounterOpts{ + Name: teleport.MetricHeartbeatConnectionsReceived, + Help: "Number of auth received a heartbeat", + }, + ) +) + +func init() { + prometheus.MustRegister(heartbeatConnectionsReceived) +} + // EmitAuditEvent emits audit event func (g *GRPCServer) EmitAuditEvent(ctx context.Context, req *events.OneOf) (*empty.Empty, error) { auth, err := g.authenticate(ctx) @@ -91,6 +105,7 @@ func (g *GRPCServer) SendKeepAlives(stream proto.AuthService_SendKeepAlivesServe return trail.ToGRPC(err) } g.Debugf("Got heartbeat connection from %v.", auth.User.GetName()) + heartbeatConnectionsReceived.Inc() for { keepAlive, err := stream.Recv() if err == io.EOF { diff --git a/lib/auth/methods.go b/lib/auth/methods.go index e810bb5295579..3b69df01efddd 100644 --- a/lib/auth/methods.go +++ b/lib/auth/methods.go @@ -393,6 +393,7 @@ func (s *Server) AuthenticateSSHUser(req AuthenticateSSHRequest) (*SSHLoginRespo if err != nil { return nil, trace.Wrap(err) } + UserLoginCount.Inc() return &SSHLoginResponse{ Username: req.Username, Cert: certs.ssh, diff --git a/lib/auth/sessions.go b/lib/auth/sessions.go index 4c19130aace59..4667c8f2edba7 100644 --- a/lib/auth/sessions.go +++ b/lib/auth/sessions.go @@ -98,7 +98,7 @@ func (s *Server) CreateAppSession(ctx context.Context, req services.CreateAppSes return nil, trace.Wrap(err) } log.Debugf("Generated application web session for %v with TTL %v.", req.Username, ttl) - + UserLoginCount.Inc() return session, nil } diff --git a/lib/cache/cache.go b/lib/cache/cache.go index 9cd94bf9b8ad7..21b3c95c17816 100644 --- a/lib/cache/cache.go +++ b/lib/cache/cache.go @@ -31,6 +31,7 @@ import ( "github.com/gravitational/trace" 
"github.com/jonboulle/clockwork" + "github.com/prometheus/client_golang/prometheus" log "github.com/sirupsen/logrus" "go.uber.org/atomic" ) @@ -1014,10 +1015,26 @@ func (c *Cache) GetClusterConfig(opts ...services.MarshalOption) (services.Clust return rg.clusterConfig.GetClusterConfig(services.AddOptions(opts, services.SkipValidation())...) } +var ( + clusterNameNotFound = prometheus.NewCounter( + prometheus.CounterOpts{ + Name: teleport.MetricClusterNameNotFound, + Help: "Number of times a cluster name was not found", + }, + ) +) + +func init() { + prometheus.MustRegister(clusterNameNotFound) +} + // GetClusterName gets the name of the cluster from the backend. func (c *Cache) GetClusterName(opts ...services.MarshalOption) (services.ClusterName, error) { rg, err := c.read() if err != nil { + if trace.IsNotFound(err) { + clusterNameNotFound.Inc() + } return nil, trace.Wrap(err) } defer rg.Release() diff --git a/lib/services/server.go b/lib/services/server.go index 279b041f296b0..0813ed052aeb5 100644 --- a/lib/services/server.go +++ b/lib/services/server.go @@ -19,6 +19,7 @@ package services import ( "encoding/json" "fmt" + "time" "github.com/gravitational/teleport/api/types" "github.com/gravitational/teleport/lib/defaults" @@ -498,3 +499,12 @@ func MarshalServers(s []Server) ([]byte, error) { return bytes, nil } + +// NodeHasMissedKeepAlives checks if node has missed its keep alive +func NodeHasMissedKeepAlives(s Server) bool { + serverExpiry := s.Expiry() + if serverExpiry.Before(time.Now().Add(defaults.ServerAnnounceTTL - (defaults.ServerKeepAliveTTL * 2))) { + return true + } + return false +} diff --git a/lib/srv/authhandlers.go b/lib/srv/authhandlers.go index f94826b2231cb..cad4d0f2a366e 100644 --- a/lib/srv/authhandlers.go +++ b/lib/srv/authhandlers.go @@ -34,6 +34,7 @@ import ( "github.com/gravitational/trace" "github.com/jonboulle/clockwork" + "github.com/prometheus/client_golang/prometheus" log "github.com/sirupsen/logrus" ) @@ -150,6 +151,27 @@ func (h 
*AuthHandlers) CheckPortForward(addr string, ctx *ServerContext) error { return nil } +var ( + failedLoginCount = prometheus.NewCounter( + prometheus.CounterOpts{ + Name: teleport.MetricFailedLoginAttempts, + Help: "Number of times there was a failed login", + }, + ) + + certificateMismatchCount = prometheus.NewCounter( + prometheus.CounterOpts{ + Name: teleport.MetricCertificateMistmatch, + Help: "Number of times there was a certificate mismatch", + }, + ) +) + +func init() { + prometheus.MustRegister(failedLoginCount) + prometheus.MustRegister(certificateMismatchCount) +} + // UserKeyAuth implements SSH client authentication using public keys and is // called by the server every time the client connects. func (h *AuthHandlers) UserKeyAuth(conn ssh.ConnMetadata, key ssh.PublicKey) (*ssh.Permissions, error) { @@ -184,6 +206,7 @@ func (h *AuthHandlers) UserKeyAuth(conn ssh.ConnMetadata, key ssh.PublicKey) (*s // only failed attempts are logged right now recordFailedLogin := func(err error) { + failedLoginCount.Inc() if err := h.Emitter.EmitAuditEvent(h.Server.Context(), &events.AuthAttempt{ Metadata: events.Metadata{ Type: events.AuthAttemptEvent, @@ -222,6 +245,7 @@ func (h *AuthHandlers) UserKeyAuth(conn ssh.ConnMetadata, key ssh.PublicKey) (*s } permissions, err := certChecker.Authenticate(conn, key) if err != nil { + certificateMismatchCount.Inc() recordFailedLogin(err) return nil, trace.Wrap(err) } diff --git a/lib/srv/regular/proxy.go b/lib/srv/regular/proxy.go index ae25898588011..625d2e5405d2c 100644 --- a/lib/srv/regular/proxy.go +++ b/lib/srv/regular/proxy.go @@ -285,6 +285,20 @@ func (t *proxySubsys) proxyToSite( return nil } +var ( + // failedConnectingToNode counts failed attempts to connect to a node + failedConnectingToNode = prometheus.NewCounter( + prometheus.CounterOpts{ + Name: teleport.MetricFailedConnectToNodeAttempts, + Help: "Number of times client failed to connect to a node", + }, + ) +) + +func init() { + 
prometheus.MustRegister(failedConnectingToNode) +} + // proxyToHost establishes a proxy connection from the connected SSH client to the // requested remote node (t.host:t.port) via the given site func (t *proxySubsys) proxyToHost( @@ -434,6 +448,7 @@ func (t *proxySubsys) proxyToHost( ConnType: services.NodeTunnel, }) if err != nil { + failedConnectingToNode.Inc() return trace.Wrap(err) } diff --git a/lib/srv/regular/sshserver.go b/lib/srv/regular/sshserver.go index dd9dfffd0dc11..bc75228c0840b 100644 --- a/lib/srv/regular/sshserver.go +++ b/lib/srv/regular/sshserver.go @@ -52,6 +52,7 @@ import ( "github.com/gravitational/trace" "github.com/jonboulle/clockwork" + "github.com/prometheus/client_golang/prometheus" "github.com/sirupsen/logrus" ) @@ -815,6 +816,19 @@ func (s *Server) serveAgent(ctx *srv.ServerContext) error { return nil } +var ( + userSessionLimitHitCount = prometheus.NewCounter( + prometheus.CounterOpts{ + Name: teleport.MetricUserMaxConcurrentSessionsHit, + Help: "Number of times the user exceeded their max concurrent ssh connections", + }, + ) +) + +func init() { + prometheus.MustRegister(userSessionLimitHitCount) +} + // HandleRequest processes global out-of-band requests. Global out-of-band // requests are processed in order (this way the originator knows which // request we are responding to). If Teleport does not support the request @@ -880,6 +894,7 @@ func (s *Server) HandleNewConn(ctx context.Context, ccx *sshutils.ConnectionCont if err != nil { if strings.Contains(err.Error(), teleport.MaxLeases) { // user has exceeded their max concurrent ssh connections. 
+ userSessionLimitHitCount.Inc() if err := s.EmitAuditEvent(s.ctx, &events.SessionReject{ Metadata: events.Metadata{ Type: events.SessionRejectedEvent, diff --git a/lib/sshutils/server.go b/lib/sshutils/server.go index 0a865584bac10..5da60ae806492 100644 --- a/lib/sshutils/server.go +++ b/lib/sshutils/server.go @@ -37,6 +37,7 @@ import ( "github.com/gravitational/trace" + "github.com/prometheus/client_golang/prometheus" log "github.com/sirupsen/logrus" ) @@ -391,6 +392,19 @@ func (s *Server) trackConnections(delta int32) int32 { return atomic.AddInt32(&s.conns, delta) } +var ( + proxyConnectionLimitHitCount = prometheus.NewCounter( + prometheus.CounterOpts{ + Name: teleport.MetricProxyConnectionLimitHit, + Help: "Number of times proxy connection limit was exceeded", + }, + ) +) + +func init() { + prometheus.MustRegister(proxyConnectionLimitHitCount) +} + // HandleConnection is called every time an SSH server accepts a new // connection from a client. // @@ -407,6 +421,9 @@ func (s *Server) HandleConnection(conn net.Conn) { log.Errorf(err.Error()) } if err := s.limiter.AcquireConnection(remoteAddr); err != nil { + if trace.IsLimitExceeded(err) { + proxyConnectionLimitHitCount.Inc() + } log.Errorf(err.Error()) conn.Close() return diff --git a/metrics.go b/metrics.go index b37812ef3c876..e737da6d7135c 100644 --- a/metrics.go +++ b/metrics.go @@ -43,6 +43,33 @@ const ( // MetricTrustedClusters counts trusted clusters MetricTrustedClusters = "trusted_clusters" + // MetricClusterNameNotFound counts times a cluster name was not found + MetricClusterNameNotFound = "cluster_name_not_found_total" + + // MetricFailedLoginAttempts counts failed login attempts + MetricFailedLoginAttempts = "failed_login_attempts_total" + + // MetricFailedConnectToNodeAttempts counts failed ssh attempts + MetricFailedConnectToNodeAttempts = "failed_connect_to_node_attempts_total" + + // MetricUserMaxConcurrentSessionsHit counts number of times the user exceeded their max concurrent ssh 
connections + MetricUserMaxConcurrentSessionsHit = "user_max_concurrent_sessions_hit_total" + + // MetricProxyConnectionLimitHit counts the number of times proxy connection limit was exceeded + MetricProxyConnectionLimitHit = "proxy_connection_limit_exceeded_total" + + // MetricUserLoginCount counts user logins + MetricUserLoginCount = "user_login_total" + + // MetricHeartbeatConnectionsReceived counts heartbeat connections received by auth + MetricHeartbeatConnectionsReceived = "heartbeat_connections_received_total" + + // MetricCertificateMistmatch counts login failures due to cert mismatch + MetricCertificateMistmatch = "certificate_mismatch_total" + + // MetricHeartbeatsMissed counts the nodes that failed to heartbeat + MetricHeartbeatsMissed = "heartbeats_missed_total" + // TagCluster is a metric tag for a cluster TagCluster = "cluster" ) From a5a5d47c1c51d8c4710716e3a414d0e96e69f7b3 Mon Sep 17 00:00:00 2001 From: Jane Quintero Date: Thu, 22 Apr 2021 09:49:32 -0700 Subject: [PATCH 2/6] lint and update docs --- docs/pages/metrics-logs-reference.mdx | 1 + lib/services/server.go | 5 +---- metrics.go | 2 +- 3 files changed, 3 insertions(+), 5 deletions(-) diff --git a/docs/pages/metrics-logs-reference.mdx b/docs/pages/metrics-logs-reference.mdx index aa66cc156a567..d0abecda96017 100644 --- a/docs/pages/metrics-logs-reference.mdx +++ b/docs/pages/metrics-logs-reference.mdx @@ -112,4 +112,5 @@ Now you can see the monitoring information by visiting several endpoints: | `failed_connect_to_node_attempts_total` | counter | Teleport Proxy | Number of times a user failed connecting to a node | | `proxy_connection_limit_exceeded_total` | counter | Teleport Proxy | Number of connections that exceeded the proxy connection limit | | `certificate_mismatch_total` | counter | Teleport Proxy | Number of times there was a certificate mismatch | +| `failed_login_attempts_total` | counter | Teleport Proxy | Number of failed `tsh login` or `tsh ssh` logins | | 
`user_max_concurrent_sessions_hit_total` | counter | Teleport Node | Number of times a user exceeded their concurrent session limit | diff --git a/lib/services/server.go b/lib/services/server.go index 0813ed052aeb5..3b83e13a460c8 100644 --- a/lib/services/server.go +++ b/lib/services/server.go @@ -503,8 +503,5 @@ func MarshalServers(s []Server) ([]byte, error) { // NodeHasMissedKeepAlives checks if node has missed its keep alive func NodeHasMissedKeepAlives(s Server) bool { serverExpiry := s.Expiry() - if serverExpiry.Before(time.Now().Add(defaults.ServerAnnounceTTL - (defaults.ServerKeepAliveTTL * 2))) { - return true - } - return false + return serverExpiry.Before(time.Now().Add(defaults.ServerAnnounceTTL - (defaults.ServerKeepAliveTTL * 2))) } diff --git a/metrics.go b/metrics.go index e737da6d7135c..77c7eb71546fa 100644 --- a/metrics.go +++ b/metrics.go @@ -64,7 +64,7 @@ const ( // MetricHeartbeatConnectionsReceived counts heartbeat connections received by auth MetricHeartbeatConnectionsReceived = "heartbeat_connections_received_total" - // MetricCertificateMistmatch counts login failures due to cert mismatch + // MetricCertificateMistmatch counts login failures due to cert mismatch MetricCertificateMistmatch = "certificate_mismatch_total" // MetricHeartbeatsMissed counts the nodes that failed to heartbeat From 81c4ad6823544be9750a80ee8fcfeb9f07a8d921 Mon Sep 17 00:00:00 2001 From: Jane Quintero Date: Thu, 22 Apr 2021 11:11:23 -0700 Subject: [PATCH 3/6] periodically check heartbeat status from nodes --- lib/auth/auth.go | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/lib/auth/auth.go b/lib/auth/auth.go index 167a850facca8..0cf584215cc35 100644 --- a/lib/auth/auth.go +++ b/lib/auth/auth.go @@ -56,6 +56,7 @@ import ( "github.com/gravitational/teleport/lib/sshutils" "github.com/gravitational/teleport/lib/tlsca" "github.com/gravitational/teleport/lib/utils" + "github.com/gravitational/teleport/lib/utils/interval" 
"github.com/coreos/go-oidc/oauth2" "github.com/coreos/go-oidc/oidc" @@ -298,7 +299,14 @@ func (a *Server) runPeriodicOperations() { period := defaults.HighResPollingPeriod + time.Duration(r.Intn(int(defaults.HighResPollingPeriod/time.Second)))*time.Second log.Debugf("Ticking with period: %v.", period) ticker := time.NewTicker(period) + // Create a ticker with jitter + heartbeatCheckTicker := interval.New(interval.Config{ + Duration: defaults.ServerKeepAliveTTL * 2, + Jitter: utils.NewSeventhJitter(), + }) + missedKeepAliveCount := 0 defer ticker.Stop() + defer heartbeatCheckTicker.Stop() for { select { case <-a.closeCtx.Done(): @@ -312,6 +320,18 @@ func (a *Server) runPeriodicOperations() { log.Errorf("Failed to perform cert rotation check: %v.", err) } } + case <-heartbeatCheckTicker.Next(): + nodes, err := a.GetNodes(defaults.Namespace, services.SkipValidation()) + if err != nil { + log.Errorf("Failed to update prometheus metric: %v", err) + } + for _, node := range nodes { + if services.NodeHasMissedKeepAlives(node) { + missedKeepAliveCount++ + } + } + // Update prometheus gauge + heartbeatsMissedByAuth.Set(float64(missedKeepAliveCount)) } } } From 63bc38e4e6a5803af17c62c8b91f672ab456db07 Mon Sep 17 00:00:00 2001 From: Jane Quintero Date: Thu, 22 Apr 2021 11:23:40 -0700 Subject: [PATCH 4/6] update docs --- docs/pages/metrics-logs-reference.mdx | 1 + lib/auth/auth.go | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/docs/pages/metrics-logs-reference.mdx b/docs/pages/metrics-logs-reference.mdx index d0abecda96017..db50950d6f184 100644 --- a/docs/pages/metrics-logs-reference.mdx +++ b/docs/pages/metrics-logs-reference.mdx @@ -108,6 +108,7 @@ Now you can see the monitoring information by visiting several endpoints: | `auth_generate_seconds` | `histogram` | Teleport Auth | Latency for generate requests | | `cluster_name_not_found_total` | counter | Teleport Auth | Number of times a cluster was not found | | `heartbeat_connections_received_total` | 
counter | Teleport Auth | Number of times auth received a heartbeat connection | +| `heartbeats_missed_total` | gauge | Teleport Auth | Number of heartbeats missed by the auth server | | `user_login_total` | counter | Teleport Auth | Number of user logins | | `failed_connect_to_node_attempts_total` | counter | Teleport Proxy | Number of times a user failed connecting to a node | | `proxy_connection_limit_exceeded_total` | counter | Teleport Proxy | Number of connections that exceeded the proxy connection limit | diff --git a/lib/auth/auth.go b/lib/auth/auth.go index 0cf584215cc35..ebd54eaf9dff4 100644 --- a/lib/auth/auth.go +++ b/lib/auth/auth.go @@ -299,7 +299,7 @@ func (a *Server) runPeriodicOperations() { period := defaults.HighResPollingPeriod + time.Duration(r.Intn(int(defaults.HighResPollingPeriod/time.Second)))*time.Second log.Debugf("Ticking with period: %v.", period) ticker := time.NewTicker(period) - // Create a ticker with jitter + // Create a ticker with jitter heartbeatCheckTicker := interval.New(interval.Config{ Duration: defaults.ServerKeepAliveTTL * 2, Jitter: utils.NewSeventhJitter(), From 1be02b27ab88ca1728e9a063e093acebf006f0f6 Mon Sep 17 00:00:00 2001 From: Jane Quintero Date: Fri, 23 Apr 2021 16:51:32 -0700 Subject: [PATCH 5/6] fix up help messages --- lib/auth/grpcserver.go | 2 +- lib/srv/regular/proxy.go | 2 +- lib/srv/regular/sshserver.go | 2 +- lib/sshutils/server.go | 2 +- metrics.go | 6 +++--- 5 files changed, 7 insertions(+), 7 deletions(-) diff --git a/lib/auth/grpcserver.go b/lib/auth/grpcserver.go index 68cd29099e59b..c420c3b6edf5e 100644 --- a/lib/auth/grpcserver.go +++ b/lib/auth/grpcserver.go @@ -71,7 +71,7 @@ var ( heartbeatConnectionsReceived = prometheus.NewCounter( prometheus.CounterOpts{ Name: teleport.MetricHeartbeatConnectionsReceived, - Help: "Number of auth received a heartbeat", + Help: "Number of times auth received a heartbeat connection", }, ) ) diff --git a/lib/srv/regular/proxy.go 
b/lib/srv/regular/proxy.go index 625d2e5405d2c..cee559f7612a0 100644 --- a/lib/srv/regular/proxy.go +++ b/lib/srv/regular/proxy.go @@ -290,7 +290,7 @@ var ( failedConnectingToNode = prometheus.NewCounter( prometheus.CounterOpts{ Name: teleport.MetricFailedConnectToNodeAttempts, - Help: "Number of times client failed to connect to a node", + Help: "Number of failed attempts to connect to a node", }, ) ) diff --git a/lib/srv/regular/sshserver.go b/lib/srv/regular/sshserver.go index bc75228c0840b..e3b57cb135517 100644 --- a/lib/srv/regular/sshserver.go +++ b/lib/srv/regular/sshserver.go @@ -820,7 +820,7 @@ var ( userSessionLimitHitCount = prometheus.NewCounter( prometheus.CounterOpts{ Name: teleport.MetricUserMaxConcurrentSessionsHit, - Help: "Number of times the user exceeded their max concurrent ssh connections", + Help: "Number of times a user exceeded their max concurrent ssh connections", }, ) ) diff --git a/lib/sshutils/server.go b/lib/sshutils/server.go index 5da60ae806492..58deb28fdc8d0 100644 --- a/lib/sshutils/server.go +++ b/lib/sshutils/server.go @@ -396,7 +396,7 @@ var ( proxyConnectionLimitHitCount = prometheus.NewCounter( prometheus.CounterOpts{ Name: teleport.MetricProxyConnectionLimitHit, - Help: "Number of times proxy connection limit was exceeded", + Help: "Number of times the proxy connection limit was exceeded", }, ) ) diff --git a/metrics.go b/metrics.go index 77c7eb71546fa..270377cf0342c 100644 --- a/metrics.go +++ b/metrics.go @@ -52,10 +52,10 @@ const ( // MetricFailedConnectToNodeAttempts counts failed ssh attempts MetricFailedConnectToNodeAttempts = "failed_connect_to_node_attempts_total" - // MetricUserMaxConcurrentSessionsHit counts number of times the user exceeded their max concurrent ssh connections + // MetricUserMaxConcurrentSessionsHit counts number of times a user exceeded their max concurrent ssh connections MetricUserMaxConcurrentSessionsHit = "user_max_concurrent_sessions_hit_total" - // MetricProxyConnectionLimitHit counts the 
number of times proxy connection limit was exceeded + // MetricProxyConnectionLimitHit counts the number of times the proxy connection limit was exceeded MetricProxyConnectionLimitHit = "proxy_connection_limit_exceeded_total" // MetricUserLoginCount counts user logins @@ -64,7 +64,7 @@ const ( // MetricHeartbeatConnectionsReceived counts heartbeat connections received by auth MetricHeartbeatConnectionsReceived = "heartbeat_connections_received_total" - // MetricCertificateMistmatch counts login failures due to cert mismatch + // MetricCertificateMistmatch counts login failures due to certificate mismatch MetricCertificateMistmatch = "certificate_mismatch_total" // MetricHeartbeatsMissed counts the nodes that failed to heartbeat From 814fedd9a3f5026821a7a1e70feb8390310c7a12 Mon Sep 17 00:00:00 2001 From: Jane Quintero Date: Tue, 27 Apr 2021 10:02:05 -0700 Subject: [PATCH 6/6] hook cluster name not found counter elsewhere --- lib/auth/auth.go | 2 +- lib/cache/cache.go | 17 ----------------- lib/services/local/configuration.go | 16 ++++++++++++++++ 3 files changed, 17 insertions(+), 18 deletions(-) diff --git a/lib/auth/auth.go b/lib/auth/auth.go index ebd54eaf9dff4..b1783eade0c40 100644 --- a/lib/auth/auth.go +++ b/lib/auth/auth.go @@ -323,7 +323,7 @@ func (a *Server) runPeriodicOperations() { case <-heartbeatCheckTicker.Next(): nodes, err := a.GetNodes(defaults.Namespace, services.SkipValidation()) if err != nil { - log.Errorf("Failed to update prometheus metric: %v", err) + log.Errorf("Failed to load nodes for heartbeat metric calculation: %v", err) } for _, node := range nodes { if services.NodeHasMissedKeepAlives(node) { diff --git a/lib/cache/cache.go b/lib/cache/cache.go index 21b3c95c17816..9cd94bf9b8ad7 100644 --- a/lib/cache/cache.go +++ b/lib/cache/cache.go @@ -31,7 +31,6 @@ import ( "github.com/gravitational/trace" "github.com/jonboulle/clockwork" - "github.com/prometheus/client_golang/prometheus" log "github.com/sirupsen/logrus" "go.uber.org/atomic" ) @@ 
-1015,26 +1014,10 @@ func (c *Cache) GetClusterConfig(opts ...services.MarshalOption) (services.Clust return rg.clusterConfig.GetClusterConfig(services.AddOptions(opts, services.SkipValidation())...) } -var ( - clusterNameNotFound = prometheus.NewCounter( - prometheus.CounterOpts{ - Name: teleport.MetricClusterNameNotFound, - Help: "Number of times a cluster name was not found", - }, - ) -) - -func init() { - prometheus.MustRegister(clusterNameNotFound) -} - // GetClusterName gets the name of the cluster from the backend. func (c *Cache) GetClusterName(opts ...services.MarshalOption) (services.ClusterName, error) { rg, err := c.read() if err != nil { - if trace.IsNotFound(err) { - clusterNameNotFound.Inc() - } return nil, trace.Wrap(err) } defer rg.Release() diff --git a/lib/services/local/configuration.go b/lib/services/local/configuration.go index 2468973bec741..83c62f2ba2c47 100644 --- a/lib/services/local/configuration.go +++ b/lib/services/local/configuration.go @@ -19,12 +19,27 @@ package local import ( "context" + "github.com/gravitational/teleport" "github.com/gravitational/teleport/lib/backend" "github.com/gravitational/teleport/lib/services" "github.com/gravitational/trace" + "github.com/prometheus/client_golang/prometheus" ) +var ( + clusterNameNotFound = prometheus.NewCounter( + prometheus.CounterOpts{ + Name: teleport.MetricClusterNameNotFound, + Help: "Number of times a cluster name was not found", + }, + ) +) + +func init() { + prometheus.MustRegister(clusterNameNotFound) +} + // ClusterConfigurationService is responsible for managing cluster configuration. type ClusterConfigurationService struct { backend.Backend @@ -42,6 +57,7 @@ func (s *ClusterConfigurationService) GetClusterName(opts ...services.MarshalOpt item, err := s.Get(context.TODO(), backend.Key(clusterConfigPrefix, namePrefix)) if err != nil { if trace.IsNotFound(err) { + clusterNameNotFound.Inc() return nil, trace.NotFound("cluster name not found") } return nil, trace.Wrap(err)