Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Added metrics for missing SSH tunnels. #8603

Merged
merged 1 commit into from
Oct 16, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions docs/pages/setup/reference/metrics.mdx
Original file line number Diff line number Diff line change
Expand Up @@ -104,6 +104,7 @@ Now you can see the monitoring information by visiting several endpoints:
| `promhttp_metric_handler_requests_in_flight` | gauge | prometheus | Current number of scrapes being served. |
| `promhttp_metric_handler_requests_total` | counter | prometheus | Total number of scrapes by HTTP status code. |
| `proxy_connection_limit_exceeded_total` | counter | Teleport Proxy | Number of connections that exceeded the proxy connection limit. |
| `proxy_missing_ssh_tunnels` | gauge | Teleport Proxy | Number of missing SSH tunnels. Used to debug if nodes have discovered all proxies. |
| `reversetunnel_connected_proxies` | gauge | Teleport | Number of known proxies being sought. |
| `rx` | counter | Teleport | Number of bytes received. |
| `server_interactive_sessions_total` | gauge | Teleport | Number of active sessions. |
Expand Down
95 changes: 93 additions & 2 deletions lib/reversetunnel/localsite.go
Original file line number Diff line number Diff line change
Expand Up @@ -29,16 +29,24 @@ import (
"github.com/gravitational/teleport/api/types"
"github.com/gravitational/teleport/api/utils/sshutils"
"github.com/gravitational/teleport/lib/auth"
"github.com/gravitational/teleport/lib/defaults"
"github.com/gravitational/teleport/lib/services"
"github.com/gravitational/teleport/lib/srv/forward"
"github.com/gravitational/teleport/lib/utils"
"github.com/gravitational/teleport/lib/utils/proxy"
"github.com/prometheus/client_golang/prometheus"

"github.com/gravitational/trace"
"github.com/jonboulle/clockwork"
log "github.com/sirupsen/logrus"
)

func newlocalSite(srv *server, domainName string, client auth.ClientI) (*localSite, error) {
err := utils.RegisterPrometheusCollectors(localClusterCollectors...)
if err != nil {
return nil, trace.Wrap(err)
}

accessPoint, err := srv.newAccessPoint(client, []string{"reverse", domainName})
if err != nil {
return nil, trace.Wrap(err)
Expand All @@ -53,7 +61,7 @@ func newlocalSite(srv *server, domainName string, client auth.ClientI) (*localSi
return nil, trace.Wrap(err)
}

return &localSite{
s := &localSite{
srv: srv,
client: client,
accessPoint: accessPoint,
Expand All @@ -68,7 +76,12 @@ func newlocalSite(srv *server, domainName string, client auth.ClientI) (*localSi
},
}),
offlineThreshold: srv.offlineThreshold,
}, nil
}

// Start periodic functions for the local cluster in the background.
go s.periodicFunctions()

return s, nil
}

// localSite allows to directly access the remote servers
Expand Down Expand Up @@ -468,3 +481,81 @@ func (s *localSite) chanTransportConn(rconn *remoteConn, dreq *sshutils.DialReq)

return conn, nil
}

// periodicFunctions runs the recurring background work for the local cluster.
// On every defaults.ResyncInterval tick it reports SSH tunnel statistics and
// logs a warning if that fails; it returns once the server context is done.
func (s *localSite) periodicFunctions() {
	resync := time.NewTicker(defaults.ResyncInterval)
	defer resync.Stop()

	for {
		select {
		case <-resync.C:
			err := s.sshTunnelStats()
			if err == nil {
				continue
			}
			// A failed report is not fatal: warn and try again on the next tick.
			s.log.Warningf("Failed to report SSH tunnel statistics for: %v: %v.", s.domainName, err)
		case <-s.srv.ctx.Done():
			return
		}
	}
}

// sshTunnelStats reports SSH tunnel statistics for the cluster. It fetches the
// nodes known to the access point, counts the tunnel-mode nodes for which this
// proxy has no remote connection, publishes that count on the
// missingSSHTunnels gauge and, if any are missing, logs a sample of their
// names at debug level.
//
// It returns an error only when the node list cannot be fetched.
func (s *localSite) sshTunnelStats() error {
	servers, err := s.accessPoint.GetNodes(s.srv.ctx, apidefaults.Namespace)
	if err != nil {
		return trace.Wrap(err)
	}

	// Nodes whose expiry is older than "now minus the announce TTL" are treated
	// as stale. The cutoff is the same for every node in this scan, so it is
	// computed once up front.
	cutoff := s.clock.Now().Add(-1 * apidefaults.ServerAnnounceTTL)

	var missing []string
	for _, server := range servers {
		// Skip servers that expired more than one announce TTL ago. This works
		// around an issue with DynamoDB where objects can hang around for 48
		// hours after their TTL value.
		if server.Expiry().Before(cutoff) {
			continue
		}
		// Skip non-IoT SSH servers (they won't have tunnels).
		if !server.GetUseTunnel() {
			continue
		}

		// Check if the tunnel actually exists.
		req := &sshutils.DialReq{
			ServerID: fmt.Sprintf("%v.%v", server.GetName(), s.domainName),
			ConnType: types.NodeTunnel,
		}
		if _, err := s.getRemoteConn(req); err != nil {
			missing = append(missing, server.GetName())
		}
	}

	// Update Prometheus metrics and also log if any tunnels are missing.
	missingSSHTunnels.Set(float64(len(missing)))
	if len(missing) > 0 {
		// Don't show all the missing nodes, thousands could be missing, just show
		// the first 10.
		n := len(missing)
		if n > 10 {
			n = 10
		}
		// Log through the site-scoped logger, like the other methods on
		// localSite, rather than through the package-level logger.
		s.log.Debugf("Cluster %v is missing %v tunnels. A small number of missing tunnels is normal, for example, a node could have just been shut down, the proxy restarted, etc. However, if this error persists with an elevated number of missing tunnels, it often indicates nodes can not discover all registered proxies. Check that all of your proxies are behind a load balancer and the load balancer is using a round robin strategy. Some of the missing hosts: %v.", s.domainName, len(missing), missing[:n])
	}
	return nil
}

// missingSSHTunnels is the gauge published under
// teleport.MetricMissingSSHTunnels; sshTunnelStats sets it to the number of
// tunnel-mode nodes that have no tunnel to this proxy.
var missingSSHTunnels = prometheus.NewGauge(prometheus.GaugeOpts{
	Name: teleport.MetricMissingSSHTunnels,
	Help: "Number of missing SSH tunnels",
})

// localClusterCollectors lists the Prometheus collectors that belong to the
// local cluster.
var localClusterCollectors = []prometheus.Collector{
	missingSSHTunnels,
}
3 changes: 3 additions & 0 deletions metrics.go
Original file line number Diff line number Diff line change
Expand Up @@ -76,6 +76,9 @@ const (
// MetricWatcherEventSizes measures the size of watcher events that are emitted
MetricWatcherEventSizes = "watcher_event_sizes"

// MetricMissingSSHTunnels is the name of the metric that reports the number of missing SSH tunnels for this proxy.
MetricMissingSSHTunnels = "proxy_missing_ssh_tunnels"

// TagCluster is a metric tag for a cluster
TagCluster = "cluster"
)
Expand Down