[v11] Improve trusted cluster observability #18609

Merged · 5 commits · Nov 18, 2022
9 changes: 5 additions & 4 deletions docs/pages/includes/metrics.mdx
@@ -74,11 +74,13 @@
 | `grpc_client_handled_total` | counter | Teleport Proxy | Total number of RPCs completed on the client, regardless of success or failure. |
 | `grpc_client_msg_received_total` | counter | Teleport Proxy | Total number of RPC stream messages received on the client. |
 | `grpc_client_msg_sent_total` | counter | Teleport Proxy | Total number of gRPC stream messages sent by the client. |
-| `proxy_connection_limit_exceeded_total` | counter | Teleport Proxy | Number of connections that exceeded the Proxy Service connection limit. |
-| `proxy_ssh_sessions_total` | gauge | Teleport Proxy | Number of active sessions through this Proxy Service instance. |
-| `proxy_missing_ssh_tunnels` | gauge | Teleport Proxy | Number of missing SSH tunnels. Used to debug if Nodes have discovered all Proxy Service instances. |
+| `proxy_connection_limit_exceeded_total` | counter | Teleport Proxy | Number of connections that exceeded the Proxy Service connection limit. |
+| `proxy_ssh_sessions_total` | gauge | Teleport Proxy | Number of active sessions through this Proxy Service instance. |
+| `proxy_missing_ssh_tunnels` | gauge | Teleport Proxy | Number of missing SSH tunnels. Used to debug if Nodes have discovered all Proxy Service instances. |
+| `remote_clusters` | gauge | Teleport Proxy | Number of inbound connections from leaf clusters. |
 | `teleport_connect_to_node_attempts_total` | counter | Teleport Proxy | Number of SSH connection attempts to a node. Use with `failed_connect_to_node_attempts_total` to get the failure rate. |
 | `teleport_reverse_tunnels_connected` | gauge | Teleport Proxy | Number of reverse SSH tunnels connected to the Teleport Proxy Service by Teleport instances. |
+| `trusted_clusters` | gauge | Teleport Proxy | Number of outbound connections to leaf clusters. |
 
 ## Teleport Nodes
 
@@ -97,7 +99,6 @@
 | `teleport_build_info` | gauge | Teleport | Provides build information of Teleport including gitref (git describe --long --tags), Go version, and Teleport version. The value of this gauge will always be 1. |
 | `teleport_cache_events` | counter | Teleport | Number of events received by a Teleport service cache. Teleport's Auth Service, Proxy Service, and other services cache incoming events related to their service. |
 | `teleport_cache_stale_events` | counter | Teleport | Number of stale events received by a Teleport service cache. A high percentage of stale events can indicate a degraded backend. |
-| `trusted_clusters` | gauge | Teleport | Number of tunnels per state. |
 | `tx` | counter | Teleport | Number of bytes transmitted during an SSH connection. |


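Together the two new gauges cover both directions of cluster-to-cluster tunnels: `remote_clusters` for inbound connections from leaf clusters and `trusted_clusters` for outbound connections to leaf clusters. As a consumption sketch, here is how the new series could be read through the Prometheus HTTP API with prometheus/client_golang, assuming a Prometheus server at http://localhost:9090 that already scrapes the Teleport diagnostics endpoint (the address is illustrative, not part of this change):

package main

import (
    "context"
    "fmt"
    "log"
    "time"

    "github.com/prometheus/client_golang/api"
    v1 "github.com/prometheus/client_golang/api/prometheus/v1"
)

func main() {
    // Hypothetical Prometheus address; adjust for your deployment.
    client, err := api.NewClient(api.Config{Address: "http://localhost:9090"})
    if err != nil {
        log.Fatalf("creating Prometheus client: %v", err)
    }

    ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second)
    defer cancel()

    // A leaf whose trusted_clusters gauge sits at 0 is configured but has
    // no tunnel established, which is exactly the condition worth alerting on.
    result, warnings, err := v1.NewAPI(client).Query(ctx, "trusted_clusters", time.Now())
    if err != nil {
        log.Fatalf("querying trusted_clusters: %v", err)
    }
    if len(warnings) > 0 {
        log.Printf("warnings: %v", warnings)
    }
    fmt.Println(result)
}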
16 changes: 9 additions & 7 deletions lib/auth/trustedcluster.go
@@ -66,10 +66,12 @@ func (a *Server) UpsertTrustedCluster(ctx context.Context, trustedCluster types.
         }
     }
 
+    logger := log.WithField("trusted_cluster", trustedCluster.GetName())
+
     // change state
     switch {
     case exists == true && enable == true:
-        log.Debugf("Enabling existing Trusted Cluster relationship.")
+        logger.Info("Enabling existing Trusted Cluster relationship.")
 
         if err := a.activateCertAuthority(trustedCluster); err != nil {
             if trace.IsNotFound(err) {
@@ -82,7 +84,7 @@ func (a *Server) UpsertTrustedCluster(ctx context.Context, trustedCluster types.
             return nil, trace.Wrap(err)
         }
     case exists == true && enable == false:
-        log.Debugf("Disabling existing Trusted Cluster relationship.")
+        logger.Info("Disabling existing Trusted Cluster relationship.")
 
         if err := a.deactivateCertAuthority(trustedCluster); err != nil {
             if trace.IsNotFound(err) {
@@ -95,7 +97,7 @@ func (a *Server) UpsertTrustedCluster(ctx context.Context, trustedCluster types.
             return nil, trace.Wrap(err)
         }
     case exists == false && enable == true:
-        log.Debugf("Creating enabled Trusted Cluster relationship.")
+        logger.Info("Creating enabled Trusted Cluster relationship.")
 
         if err := a.checkLocalRoles(ctx, trustedCluster.GetRoleMap()); err != nil {
             return nil, trace.Wrap(err)
@@ -119,7 +121,7 @@ func (a *Server) UpsertTrustedCluster(ctx context.Context, trustedCluster types.
         }
 
     case exists == false && enable == false:
-        log.Debugf("Creating disabled Trusted Cluster relationship.")
+        logger.Info("Creating disabled Trusted Cluster relationship.")
 
         if err := a.checkLocalRoles(ctx, trustedCluster.GetRoleMap()); err != nil {
             return nil, trace.Wrap(err)
@@ -157,7 +159,7 @@ func (a *Server) UpsertTrustedCluster(ctx context.Context, trustedCluster types.
             Name: trustedCluster.GetName(),
         },
     }); err != nil {
-        log.WithError(err).Warn("Failed to emit trusted cluster create event.")
+        logger.WithError(err).Warn("Failed to emit trusted cluster create event.")
     }
 
     return tc, nil
@@ -258,7 +260,7 @@ func (a *Server) establishTrust(ctx context.Context, trustedCluster types.Truste
     }
 
     // log the local certificate authorities that we are sending
-    log.Debugf("Sending validate request; token=%s, CAs=%v", backend.MaskKeyName(validateRequest.Token), validateRequest.CAs)
+    log.Infof("Sending validate request; token=%s, CAs=%v", backend.MaskKeyName(validateRequest.Token), validateRequest.CAs)
 
     // send the request to the remote auth server via the proxy
     validateResponse, err := a.sendValidateRequestToProxy(trustedCluster.GetProxyAddress(), &validateRequest)
@@ -271,7 +273,7 @@ func (a *Server) establishTrust(ctx context.Context, trustedCluster types.Truste
     }
 
     // log the remote certificate authorities we are adding
-    log.Debugf("Received validate response; CAs=%v", validateResponse.CAs)
+    log.Infof("Received validate response; CAs=%v", validateResponse.CAs)
 
     for _, ca := range validateResponse.CAs {
         for _, keyPair := range ca.GetActiveKeys().TLS {
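The pattern behind these logging changes is logrus's field-scoped entry: the `logger := log.WithField(...)` added at the top of UpsertTrustedCluster means every later message automatically carries the trusted cluster's name, and promoting the messages from Debugf to Info makes state transitions visible at the default log level. A minimal standalone sketch of the same pattern (the cluster name and error below are made up for illustration):

package main

import (
    "errors"

    log "github.com/sirupsen/logrus"
)

func main() {
    // Derive a child entry once; every message it emits carries the field.
    logger := log.WithField("trusted_cluster", "leaf.example.com")

    logger.Info("Enabling existing Trusted Cluster relationship.")
    // INFO[0000] Enabling existing Trusted Cluster relationship.  trusted_cluster=leaf.example.com

    // WithError attaches the error as one more structured field.
    logger.WithError(errors.New("audit emitter unavailable")).
        Warn("Failed to emit trusted cluster create event.")
}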
4 changes: 4 additions & 0 deletions lib/reversetunnel/agentpool.go
@@ -222,6 +222,10 @@ func (p *AgentPool) GetConnectedProxyGetter() *ConnectedProxyGetter {
 }
 
 func (p *AgentPool) updateConnectedProxies() {
+    if p.IsRemoteCluster {
+        trustedClustersStats.WithLabelValues(p.Cluster).Set(float64(p.active.len()))
+    }
+
     if !p.runtimeConfig.reportConnectedProxies() {
         p.ConnectedProxyGetter.setProxyIDs(nil)
         return
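`trustedClustersStats` itself is defined outside this hunk. A sketch of what a compatible definition could look like with prometheus/client_golang, assuming a single `cluster` label to match the `WithLabelValues(p.Cluster)` call above (the help string and registration path are guesses, not the actual Teleport code):

package reversetunnel

import "github.com/prometheus/client_golang/prometheus"

// A sketch, not the actual Teleport definition: one gauge series per leaf
// cluster, set by updateConnectedProxies to the number of active agents.
var trustedClustersStats = prometheus.NewGaugeVec(
    prometheus.GaugeOpts{
        Name: "trusted_clusters",
        Help: "Number of outbound connections to leaf clusters.",
    },
    []string{"cluster"},
)

func init() {
    // Teleport routes registration through its own metrics helpers;
    // MustRegister is the plain client_golang equivalent.
    prometheus.MustRegister(trustedClustersStats)
}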
6 changes: 3 additions & 3 deletions lib/reversetunnel/localsite.go
@@ -559,15 +559,15 @@ func (s *localSite) handleHeartbeat(rconn *remoteConn, ch ssh.Channel, reqC <-ch
     for {
         select {
         case <-s.srv.ctx.Done():
-            s.log.Infof("closing")
+            logger.Infof("closing")
             return
         case <-proxyResyncTicker.Chan():
             req := discoveryRequest{
                 Proxies: s.srv.proxyWatcher.GetCurrent(),
             }
 
             if err := rconn.sendDiscoveryRequest(req); err != nil {
-                s.log.WithError(err).Debug("Marking connection invalid on error")
+                logger.WithError(err).Debug("Marking connection invalid on error")
                 rconn.markInvalid(err)
                 return
             }
@@ -803,7 +803,7 @@ func (s *localSite) sshTunnelStats() error {
         if n > 10 {
             n = 10
         }
-        log.Debugf("Cluster %v is missing %v tunnels. A small number of missing tunnels is normal, for example, a node could have just been shut down, the proxy restarted, etc. However, if this error persists with an elevated number of missing tunnels, it often indicates nodes can not discover all registered proxies. Check that all of your proxies are behind a load balancer and the load balancer is using a round robin strategy. Some of the missing hosts: %v.", s.domainName, len(missing), missing[:n])
+        s.log.Debugf("Cluster %v is missing %v tunnels. A small number of missing tunnels is normal, for example, a node could have just been shut down, the proxy restarted, etc. However, if this error persists with an elevated number of missing tunnels, it often indicates nodes can not discover all registered proxies. Check that all of your proxies are behind a load balancer and the load balancer is using a round robin strategy. Some of the missing hosts: %v.", s.domainName, len(missing), missing[:n])
     }
     return nil
 }
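`sshTunnelStats` works by diffing the Nodes registered in the cluster against the reverse tunnels currently connected to this Proxy, then logging a sample of at most ten missing hosts. A sketch of that set difference with hypothetical inputs (the real method derives both sets from the Proxy's watchers and connection table):

// missingTunnels returns the registered Node names that have no connected
// reverse tunnel. A persistently large result usually means Nodes cannot
// discover every Proxy instance behind the load balancer.
func missingTunnels(registered []string, connected map[string]bool) []string {
    var missing []string
    for _, name := range registered {
        if !connected[name] {
            missing = append(missing, name)
        }
    }
    return missing
}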
2 changes: 2 additions & 0 deletions lib/reversetunnel/rc_manager.go
@@ -192,6 +192,7 @@ func (w *RemoteClusterTunnelManager) Sync(ctx context.Context) error {
             continue
         }
         pool.Stop()
+        trustedClustersStats.DeleteLabelValues(pool.Cluster)
         delete(w.pools, k)
     }
 
@@ -202,6 +203,7 @@ func (w *RemoteClusterTunnelManager) Sync(ctx context.Context) error {
             continue
         }
 
+        trustedClustersStats.WithLabelValues(k.cluster).Set(0)
         pool, err := w.newAgentPool(ctx, w.cfg, k.cluster, k.addr)
         if err != nil {
             errs = append(errs, trace.Wrap(err))
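Combined with the agentpool.go hunk, these two lines give each leaf cluster's gauge a complete lifecycle: the series appears at 0 as soon as the pool is created, so a configured-but-disconnected leaf is distinguishable from one that was never configured; updateConnectedProxies keeps it current; and stopping the pool removes the series. Condensed, using only the calls visible in this diff and the GaugeVec sketched after agentpool.go above (the cluster name is hypothetical):

// Lifecycle of one leaf cluster's series, using the calls from this diff.
func exampleLifecycle() {
    leaf := "leaf.example.com" // hypothetical cluster name

    trustedClustersStats.WithLabelValues(leaf).Set(0) // pool created: series exists at 0
    trustedClustersStats.WithLabelValues(leaf).Set(3) // agents connected: live count
    trustedClustersStats.DeleteLabelValues(leaf)      // pool stopped: series removed
}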