From 673e91dae9158392a74d90ec2c9944ca7e0d1a02 Mon Sep 17 00:00:00 2001 From: Tim Ross Date: Fri, 6 Dec 2024 10:39:51 -0500 Subject: [PATCH] Prevent races counting missed heartbeats Increments the `heartbeatsMissedByAuth` gauge directly instead of keeping a local count and setting the gauge once. This fixes a race caught in https://github.com/gravitational/teleport/actions/runs/11828850190/job/32959672995?pr=48949. ``` ================== WARNING: DATA RACE Write at 0x00c003857da8 by goroutine 48258: github.com/gravitational/teleport/lib/auth.(*Server).runPeriodicOperations.func3.1() /__w/teleport/teleport/lib/auth/auth.go:1471 +0x84 github.com/gravitational/teleport/lib/services.(*UnifiedResourceCache).getRange.func1.1() /__w/teleport/teleport/lib/services/unified_resource.go:239 +0x14d github.com/google/btree.(*node[go.shape.*uint8]).iterate() /go/pkg/mod/github.com/google/btree@v1.1.3/btree_generic.go:522 +0x62c github.com/google/btree.(*BTreeG[go.shape.*uint8]).AscendRange() /go/pkg/mod/github.com/google/btree@v1.1.3/btree_generic.go:752 +0x104 github.com/google/btree.(*BTreeG[*github.com/gravitational/teleport/lib/services.item]).AscendRange-fm() :1 +0x58 github.com/gravitational/teleport/lib/services.(*UnifiedResourceCache).getRange.func1() /__w/teleport/teleport/lib/services/unified_resource.go:230 +0x7a8 github.com/gravitational/teleport/lib/services.(*UnifiedResourceCache).read() /__w/teleport/teleport/lib/services/unified_resource.go:591 +0x2ad github.com/gravitational/teleport/lib/services.(*UnifiedResourceCache).getRange() /__w/teleport/teleport/lib/services/unified_resource.go:215 +0x2d9 github.com/gravitational/teleport/lib/services.(*UnifiedResourceCache).IterateUnifiedResources() /__w/teleport/teleport/lib/services/unified_resource.go:288 +0xe6 github.com/gravitational/teleport/lib/auth.(*Server).runPeriodicOperations.func3() /__w/teleport/teleport/lib/auth/auth.go:1464 +0x237 Previous read at 0x00c003857da8 by goroutine 48219: github.com/gravitational/teleport/lib/auth.(*Server).runPeriodicOperations.func3() /__w/teleport/teleport/lib/auth/auth.go:1489 +0x2d2 Goroutine 48258 (running) created at: github.com/gravitational/teleport/lib/auth.(*Server).runPeriodicOperations() /__w/teleport/teleport/lib/auth/auth.go:1460 +0xf9e github.com/gravitational/teleport/lib/auth.initCluster.gowrap1() /__w/teleport/teleport/lib/auth/init.go:594 +0x33 Goroutine 48219 (finished) created at: github.com/gravitational/teleport/lib/auth.(*Server).runPeriodicOperations() /__w/teleport/teleport/lib/auth/auth.go:1460 +0xf9e github.com/gravitational/teleport/lib/auth.initCluster.gowrap1() /__w/teleport/teleport/lib/auth/init.go:594 +0x33 ================== ``` --- lib/auth/auth.go | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/lib/auth/auth.go b/lib/auth/auth.go index 7e1a31640cdaa..dedda21f57012 100644 --- a/lib/auth/auth.go +++ b/lib/auth/auth.go @@ -1394,8 +1394,6 @@ func (a *Server) runPeriodicOperations() { defer ticker.Stop() - missedKeepAliveCount := 0 - // Prevent some periodic operations from running for dashboard tenants. if !services.IsDashboard(*modules.GetModules().Features().ToProto()) { ticker.Push(interval.SubInterval[periodicIntervalKey]{ @@ -1497,7 +1495,7 @@ func (a *Server) runPeriodicOperations() { return false, nil } if services.NodeHasMissedKeepAlives(srv) { - missedKeepAliveCount++ + heartbeatsMissedByAuth.Inc() } // TODO(tross) DELETE in v20.0.0 - all invalid hostnames should have been sanitized by then. @@ -1535,9 +1533,6 @@ func (a *Server) runPeriodicOperations() { break } } - - // Update prometheus gauge - heartbeatsMissedByAuth.Set(float64(missedKeepAliveCount)) }() case metricsKey: go a.updateAgentMetrics()