Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add additional Prometheus Metrics #6511

Merged
merged 12 commits into from
Apr 28, 2021
9 changes: 9 additions & 0 deletions docs/pages/metrics-logs-reference.mdx
Original file line number Diff line number Diff line change
Expand Up @@ -106,3 +106,12 @@ Now you can see the monitoring information by visiting several endpoints:
| `auth_generate_requests_throttled_total` | counter | Teleport Auth | Number of throttled requests to generate new server keys |
| `auth_generate_requests_total` | counter | Teleport Auth | Number of requests to generate new server keys |
| `auth_generate_seconds` | `histogram` | Teleport Auth | Latency for generate requests |
| `cluster_name_not_found_total` | counter | Teleport Auth | Number of times a cluster name was not found |
| `heartbeat_connections_received_total` | counter | Teleport Auth | Number of times auth received a heartbeat connection |
| `heartbeats_missed_total` | gauge | Teleport Auth | Number of heartbeats missed by the auth server |
| `user_login_total` | counter | Teleport Auth | Number of user logins |
| `failed_connect_to_node_attempts_total` | counter | Teleport Proxy | Number of times a user failed connecting to a node |
| `proxy_connection_limit_exceeded_total` | counter | Teleport Proxy | Number of connections that exceeded the proxy connection limit |
| `certificate_mismatch_total` | counter | Teleport Proxy | Number of times there was a certificate mismatch |
| `failed_login_attempts_total` | counter | Teleport Proxy | Number of failed `tsh login` or `tsh ssh` logins |
| `user_max_concurrent_sessions_hit_total` | counter | Teleport Node | Number of times a user exceeded their concurrent session limit |
38 changes: 37 additions & 1 deletion lib/auth/auth.go
Original file line number Diff line number Diff line change
Expand Up @@ -56,6 +56,7 @@ import (
"github.com/gravitational/teleport/lib/sshutils"
"github.com/gravitational/teleport/lib/tlsca"
"github.com/gravitational/teleport/lib/utils"
"github.com/gravitational/teleport/lib/utils/interval"

"github.com/coreos/go-oidc/oauth2"
"github.com/coreos/go-oidc/oidc"
Expand Down Expand Up @@ -201,6 +202,20 @@ var (
Buckets: prometheus.ExponentialBuckets(0.001, 2, 16),
},
)
// UserLoginCount counts user logins
UserLoginCount = prometheus.NewCounter(
prometheus.CounterOpts{
Name: teleport.MetricUserLoginCount,
Help: "Number of times there was a user login",
},
)

heartbeatsMissedByAuth = prometheus.NewGauge(
prometheus.GaugeOpts{
Name: teleport.MetricHeartbeatsMissed,
Help: "Number of hearbeats missed by auth server",
},
)
)

// Server keeps the cluster together. It acts as a certificate authority (CA) for
Expand Down Expand Up @@ -284,7 +299,14 @@ func (a *Server) runPeriodicOperations() {
period := defaults.HighResPollingPeriod + time.Duration(r.Intn(int(defaults.HighResPollingPeriod/time.Second)))*time.Second
log.Debugf("Ticking with period: %v.", period)
ticker := time.NewTicker(period)
// Create a ticker with jitter
heartbeatCheckTicker := interval.New(interval.Config{
Duration: defaults.ServerKeepAliveTTL * 2,
Jitter: utils.NewSeventhJitter(),
})
missedKeepAliveCount := 0
defer ticker.Stop()
defer heartbeatCheckTicker.Stop()
for {
select {
case <-a.closeCtx.Done():
Expand All @@ -298,6 +320,18 @@ func (a *Server) runPeriodicOperations() {
log.Errorf("Failed to perform cert rotation check: %v.", err)
}
}
case <-heartbeatCheckTicker.Next():
nodes, err := a.GetNodes(defaults.Namespace, services.SkipValidation())
if err != nil {
log.Errorf("Failed to update prometheus metric: %v", err)
quinqu marked this conversation as resolved.
Show resolved Hide resolved
}
for _, node := range nodes {
if services.NodeHasMissedKeepAlives(node) {
missedKeepAliveCount++
}
}
// Update prometheus gauge
heartbeatsMissedByAuth.Set(float64(missedKeepAliveCount))
}
}
}
Expand Down Expand Up @@ -1662,7 +1696,7 @@ func (a *Server) NewWebSession(req types.NewWebSessionRequest) (services.WebSess
BearerTokenExpires: startTime.UTC().Add(bearerTokenTTL),
LoginTime: req.LoginTime,
}

UserLoginCount.Inc()
return services.NewWebSession(token, services.KindWebSession, services.KindWebSession, sessionSpec), nil
}

Expand Down Expand Up @@ -2537,4 +2571,6 @@ func init() {
prometheus.MustRegister(generateThrottledRequestsCount)
prometheus.MustRegister(generateRequestsCurrent)
prometheus.MustRegister(generateRequestsLatencies)
prometheus.MustRegister(UserLoginCount)
prometheus.MustRegister(heartbeatsMissedByAuth)
}
19 changes: 17 additions & 2 deletions lib/auth/grpcserver.go
Original file line number Diff line number Diff line change
Expand Up @@ -35,10 +35,11 @@ import (
"github.com/gravitational/teleport/lib/services"
"github.com/gravitational/teleport/lib/session"
"github.com/gravitational/teleport/lib/utils"

"github.com/golang/protobuf/ptypes/empty"
"github.com/gravitational/trace"
"github.com/gravitational/trace/trail"

"github.com/golang/protobuf/ptypes/empty"
"github.com/prometheus/client_golang/prometheus"
quinqu marked this conversation as resolved.
Show resolved Hide resolved
"github.com/sirupsen/logrus"
"google.golang.org/grpc"
"google.golang.org/grpc/codes"
Expand Down Expand Up @@ -66,6 +67,19 @@ func (g *GRPCServer) GetServer() (*grpc.Server, error) {
return g.server, nil
}

// heartbeatConnectionsReceived is a Prometheus counter exported under the
// name teleport.MetricHeartbeatConnectionsReceived. It tracks how many
// heartbeat connections the auth server has received.
var heartbeatConnectionsReceived = prometheus.NewCounter(prometheus.CounterOpts{
	Name: teleport.MetricHeartbeatConnectionsReceived,
	Help: "Number of times auth received a heartbeat connection",
})

// init registers the heartbeat connection counter with the default
// Prometheus registry; MustRegister panics if registration fails.
func init() {
	prometheus.MustRegister(heartbeatConnectionsReceived)
}

// EmitAuditEvent emits audit event
func (g *GRPCServer) EmitAuditEvent(ctx context.Context, req *events.OneOf) (*empty.Empty, error) {
auth, err := g.authenticate(ctx)
Expand All @@ -91,6 +105,7 @@ func (g *GRPCServer) SendKeepAlives(stream proto.AuthService_SendKeepAlivesServe
return trail.ToGRPC(err)
}
g.Debugf("Got heartbeat connection from %v.", auth.User.GetName())
heartbeatConnectionsReceived.Inc()
for {
keepAlive, err := stream.Recv()
if err == io.EOF {
Expand Down
1 change: 1 addition & 0 deletions lib/auth/methods.go
Original file line number Diff line number Diff line change
Expand Up @@ -393,6 +393,7 @@ func (s *Server) AuthenticateSSHUser(req AuthenticateSSHRequest) (*SSHLoginRespo
if err != nil {
return nil, trace.Wrap(err)
}
UserLoginCount.Inc()
return &SSHLoginResponse{
Username: req.Username,
Cert: certs.ssh,
Expand Down
2 changes: 1 addition & 1 deletion lib/auth/sessions.go
Original file line number Diff line number Diff line change
Expand Up @@ -98,7 +98,7 @@ func (s *Server) CreateAppSession(ctx context.Context, req services.CreateAppSes
return nil, trace.Wrap(err)
}
log.Debugf("Generated application web session for %v with TTL %v.", req.Username, ttl)

UserLoginCount.Inc()
return session, nil
}

Expand Down
17 changes: 17 additions & 0 deletions lib/cache/cache.go
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,7 @@ import (

"github.com/gravitational/trace"
"github.com/jonboulle/clockwork"
"github.com/prometheus/client_golang/prometheus"
log "github.com/sirupsen/logrus"
"go.uber.org/atomic"
)
Expand Down Expand Up @@ -1014,10 +1015,26 @@ func (c *Cache) GetClusterConfig(opts ...services.MarshalOption) (services.Clust
return rg.clusterConfig.GetClusterConfig(services.AddOptions(opts, services.SkipValidation())...)
}

// clusterNameNotFound is a Prometheus counter exported under the name
// teleport.MetricClusterNameNotFound. It tracks how many times a cluster
// name lookup ended in a "not found" error.
var clusterNameNotFound = prometheus.NewCounter(prometheus.CounterOpts{
	Name: teleport.MetricClusterNameNotFound,
	Help: "Number of times a cluster name was not found",
})
quinqu marked this conversation as resolved.
Show resolved Hide resolved

// init registers the clusterNameNotFound counter with the default Prometheus
// registry. MustRegister panics if the collector cannot be registered.
func init() {
prometheus.MustRegister(clusterNameNotFound)
}

// GetClusterName gets the name of the cluster from the backend.
func (c *Cache) GetClusterName(opts ...services.MarshalOption) (services.ClusterName, error) {
rg, err := c.read()
if err != nil {
if trace.IsNotFound(err) {
clusterNameNotFound.Inc()
}
return nil, trace.Wrap(err)
}
defer rg.Release()
Expand Down
7 changes: 7 additions & 0 deletions lib/services/server.go
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@ package services
import (
"encoding/json"
"fmt"
"time"

"github.com/gravitational/teleport/api/types"
"github.com/gravitational/teleport/lib/defaults"
Expand Down Expand Up @@ -498,3 +499,9 @@ func MarshalServers(s []Server) ([]byte, error) {

return bytes, nil
}

// NodeHasMissedKeepAlives reports whether the given server looks like it has
// stopped sending keep-alives. A server is considered to have missed them
// when its expiry time is earlier than
// now + ServerAnnounceTTL - 2*ServerKeepAliveTTL.
func NodeHasMissedKeepAlives(s Server) bool {
	expiry := s.Expiry()
	// Allowance of two keep-alive periods subtracted from the announce TTL.
	window := defaults.ServerAnnounceTTL - 2*defaults.ServerKeepAliveTTL
	deadline := time.Now().Add(window)
	return expiry.Before(deadline)
}
24 changes: 24 additions & 0 deletions lib/srv/authhandlers.go
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,7 @@ import (
"github.com/gravitational/trace"

"github.com/jonboulle/clockwork"
"github.com/prometheus/client_golang/prometheus"
log "github.com/sirupsen/logrus"
)

Expand Down Expand Up @@ -150,6 +151,27 @@ func (h *AuthHandlers) CheckPortForward(addr string, ctx *ServerContext) error {
return nil
}

// failedLoginCount is a Prometheus counter exported under the name
// teleport.MetricFailedLoginAttempts; it tracks failed login attempts.
var failedLoginCount = prometheus.NewCounter(prometheus.CounterOpts{
	Name: teleport.MetricFailedLoginAttempts,
	Help: "Number of times there was a failed login",
})

// certificateMismatchCount is a Prometheus counter exported under the name
// teleport.MetricCertificateMistmatch; it tracks certificate mismatches.
var certificateMismatchCount = prometheus.NewCounter(prometheus.CounterOpts{
	Name: teleport.MetricCertificateMistmatch,
	Help: "Number of times there was a certificate mismatch",
})

// init registers both authentication counters with the default Prometheus
// registry. MustRegister processes the collectors in argument order and
// panics on the first one that fails to register.
func init() {
	prometheus.MustRegister(
		failedLoginCount,
		certificateMismatchCount,
	)
}

// UserKeyAuth implements SSH client authentication using public keys and is
// called by the server every time the client connects.
func (h *AuthHandlers) UserKeyAuth(conn ssh.ConnMetadata, key ssh.PublicKey) (*ssh.Permissions, error) {
Expand Down Expand Up @@ -184,6 +206,7 @@ func (h *AuthHandlers) UserKeyAuth(conn ssh.ConnMetadata, key ssh.PublicKey) (*s

// only failed attempts are logged right now
recordFailedLogin := func(err error) {
failedLoginCount.Inc()
if err := h.Emitter.EmitAuditEvent(h.Server.Context(), &events.AuthAttempt{
Metadata: events.Metadata{
Type: events.AuthAttemptEvent,
Expand Down Expand Up @@ -222,6 +245,7 @@ func (h *AuthHandlers) UserKeyAuth(conn ssh.ConnMetadata, key ssh.PublicKey) (*s
}
permissions, err := certChecker.Authenticate(conn, key)
if err != nil {
certificateMismatchCount.Inc()
recordFailedLogin(err)
return nil, trace.Wrap(err)
}
Expand Down
15 changes: 15 additions & 0 deletions lib/srv/regular/proxy.go
Original file line number Diff line number Diff line change
Expand Up @@ -285,6 +285,20 @@ func (t *proxySubsys) proxyToSite(
return nil
}

// failedConnectingToNode is a Prometheus counter exported under the name
// teleport.MetricFailedConnectToNodeAttempts; it tracks failed attempts to
// connect to a node.
var failedConnectingToNode = prometheus.NewCounter(prometheus.CounterOpts{
	Name: teleport.MetricFailedConnectToNodeAttempts,
	Help: "Number of failed attempts to connect to a node",
})

// init registers the failed-connection counter with the default Prometheus
// registry; MustRegister panics if registration fails.
func init() {
	prometheus.MustRegister(failedConnectingToNode)
}

// proxyToHost establishes a proxy connection from the connected SSH client to the
// requested remote node (t.host:t.port) via the given site
func (t *proxySubsys) proxyToHost(
Expand Down Expand Up @@ -434,6 +448,7 @@ func (t *proxySubsys) proxyToHost(
ConnType: services.NodeTunnel,
})
if err != nil {
failedConnectingToNode.Inc()
return trace.Wrap(err)
}

Expand Down
15 changes: 15 additions & 0 deletions lib/srv/regular/sshserver.go
Original file line number Diff line number Diff line change
Expand Up @@ -52,6 +52,7 @@ import (
"github.com/gravitational/trace"

"github.com/jonboulle/clockwork"
"github.com/prometheus/client_golang/prometheus"
"github.com/sirupsen/logrus"
)

Expand Down Expand Up @@ -815,6 +816,19 @@ func (s *Server) serveAgent(ctx *srv.ServerContext) error {
return nil
}

// userSessionLimitHitCount is a Prometheus counter exported under the name
// teleport.MetricUserMaxConcurrentSessionsHit; it tracks how often a user
// went over their maximum number of concurrent SSH connections.
var userSessionLimitHitCount = prometheus.NewCounter(prometheus.CounterOpts{
	Name: teleport.MetricUserMaxConcurrentSessionsHit,
	Help: "Number of times a user exceeded their max concurrent ssh connections",
})

// init registers the session-limit counter with the default Prometheus
// registry; MustRegister panics if registration fails.
func init() {
	prometheus.MustRegister(userSessionLimitHitCount)
}

// HandleRequest processes global out-of-band requests. Global out-of-band
// requests are processed in order (this way the originator knows which
// request we are responding to). If Teleport does not support the request
Expand Down Expand Up @@ -880,6 +894,7 @@ func (s *Server) HandleNewConn(ctx context.Context, ccx *sshutils.ConnectionCont
if err != nil {
if strings.Contains(err.Error(), teleport.MaxLeases) {
// user has exceeded their max concurrent ssh connections.
userSessionLimitHitCount.Inc()
if err := s.EmitAuditEvent(s.ctx, &events.SessionReject{
Metadata: events.Metadata{
Type: events.SessionRejectedEvent,
Expand Down
17 changes: 17 additions & 0 deletions lib/sshutils/server.go
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,7 @@ import (

"github.com/gravitational/trace"

"github.com/prometheus/client_golang/prometheus"
log "github.com/sirupsen/logrus"
)

Expand Down Expand Up @@ -391,6 +392,19 @@ func (s *Server) trackConnections(delta int32) int32 {
return atomic.AddInt32(&s.conns, delta)
}

// proxyConnectionLimitHitCount is a Prometheus counter exported under the
// name teleport.MetricProxyConnectionLimitHit; it tracks how often the proxy
// connection limit was exceeded.
var proxyConnectionLimitHitCount = prometheus.NewCounter(prometheus.CounterOpts{
	Name: teleport.MetricProxyConnectionLimitHit,
	Help: "Number of times the proxy connection limit was exceeded",
})

// init registers the connection-limit counter with the default Prometheus
// registry; MustRegister panics if registration fails.
func init() {
	prometheus.MustRegister(proxyConnectionLimitHitCount)
}

// HandleConnection is called every time an SSH server accepts a new
// connection from a client.
//
Expand All @@ -407,6 +421,9 @@ func (s *Server) HandleConnection(conn net.Conn) {
log.Errorf(err.Error())
}
if err := s.limiter.AcquireConnection(remoteAddr); err != nil {
if trace.IsLimitExceeded(err) {
proxyConnectionLimitHitCount.Inc()
}
log.Errorf(err.Error())
conn.Close()
return
Expand Down
27 changes: 27 additions & 0 deletions metrics.go
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,33 @@ const (
// MetricTrustedClusters counts trusted clusters
MetricTrustedClusters = "trusted_clusters"

// MetricClusterNameNotFound counts times a cluster name was not found
MetricClusterNameNotFound = "cluster_name_not_found_total"

// MetricFailedLoginAttempts counts failed login attempts
MetricFailedLoginAttempts = "failed_login_attempts_total"

// MetricFailedConnectToNodeAttempts counts failed ssh attempts
MetricFailedConnectToNodeAttempts = "failed_connect_to_node_attempts_total"

// MetricUserMaxConcurrentSessionsHit counts number of times a user exceeded their max concurrent ssh connections
MetricUserMaxConcurrentSessionsHit = "user_max_concurrent_sessions_hit_total"

// MetricProxyConnectionLimitHit counts the number of times the proxy connection limit was exceeded
MetricProxyConnectionLimitHit = "proxy_connection_limit_exceeded_total"

// MetricUserLoginCount counts user logins
MetricUserLoginCount = "user_login_total"

// MetricHeartbeatConnectionsReceived counts heartbeat connections received by auth
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Not sure on the terminology but perhaps these would be better described as heartbeat messages? Correct me if I am wrong but I think a connection is persistently held and used for heartbeats.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

thanks for pointing this out because i did actually hook at the wrong place for heartbeat connections (in grpcserver.go )-- it should still be counting heartbeat connections but based on where i initally hooked -- i believe it counts messages. i will fix this.

MetricHeartbeatConnectionsReceived = "heartbeat_connections_received_total"

// MetricCertificateMistmatch counts login failures due to certificate mismatch
MetricCertificateMistmatch = "certificate_mismatch_total"

// MetricHeartbeatsMissed counts the nodes that failed to heartbeat
MetricHeartbeatsMissed = "heartbeats_missed_total"

// TagCluster is a metric tag for a cluster
TagCluster = "cluster"
)
Expand Down