From c3ba32a2b3cd733dab25fa4e9e1c0df72c073939 Mon Sep 17 00:00:00 2001
From: taylanisikdemir
Date: Thu, 6 Jun 2024 00:46:53 -0700
Subject: [PATCH] Fix replication metric emitter shutdown (#6117)

Stop() used to close the done channel and return without waiting for the
emitMetricsLoop goroutine to exit.

The emitter now owns a cancellable context and a sync.WaitGroup:
  - Start() adds the loop goroutine to the WaitGroup before launching it.
  - emitMetricsLoop() marks the WaitGroup done on exit and returns when
    the context is cancelled.
  - Stop() cancels the context and waits up to 5 seconds for the loop to
    exit, logging a warning if the wait times out.
  - determineReplicationLatency() passes the emitter context to
    reader.Read instead of a fresh background context.

The emission interval becomes a struct field (initialised from
metricsEmissionInterval) so tests can shorten it.

Tests: add TestMetricsEmitterStartStop, which starts and stops an emitter
under a goleak check; extract the newClusterMetadata helper; remove the
shardID and maxReadLevel fields from testShardData.
---
Note: goleak.VerifyNone must be deferred so that the leak check runs after
Stop() has returned; calling it as the first statement only inspects the
goroutines that exist before the emitter is started. The index hashes
below are those recorded when the patch was generated.

 .../history/replication/metrics_emitter.go    | 27 +++++++++----
 .../replication/metrics_emitter_test.go       | 39 +++++++++++++------
 2 files changed, 48 insertions(+), 18 deletions(-)

diff --git a/service/history/replication/metrics_emitter.go b/service/history/replication/metrics_emitter.go
index f2a4f0eb0af..dc47f337c3d 100644
--- a/service/history/replication/metrics_emitter.go
+++ b/service/history/replication/metrics_emitter.go
@@ -23,9 +23,10 @@
 package replication
 
 import (
-	ctx "context"
+	"context"
 	"fmt"
 	"strconv"
+	"sync"
 	"sync/atomic"
 	"time"
 
@@ -53,7 +54,10 @@ type (
 		scope          metrics.Scope
 		logger         log.Logger
 		status         int32
-		done           chan struct{}
+		interval       time.Duration
+		ctx            context.Context
+		cancelCtx      context.CancelFunc
+		wg             sync.WaitGroup
 	}
 
 	// metricsEmitterShardData is for testing.
@@ -84,6 +88,7 @@ func NewMetricsEmitter(
 		tag.ClusterName(currentCluster),
 		tag.ShardID(shardID))
 
+	ctx, cancel := context.WithCancel(context.Background())
 	return &MetricsEmitterImpl{
 		shardID:        shardID,
 		currentCluster: currentCluster,
@@ -92,8 +97,10 @@ func NewMetricsEmitter(
 		shardData:      shardData,
 		reader:         reader,
 		scope:          scope,
+		interval:       metricsEmissionInterval,
 		logger:         logger,
-		done:           make(chan struct{}),
+		ctx:            ctx,
+		cancelCtx:      cancel,
 	}
 }
 
@@ -102,6 +109,7 @@ func (m *MetricsEmitterImpl) Start() {
 		return
 	}
 
+	m.wg.Add(1)
 	go m.emitMetricsLoop()
 	m.logger.Info("ReplicationMetricsEmitter started.")
 }
@@ -112,17 +120,22 @@ func (m *MetricsEmitterImpl) Stop() {
 	}
 
 	m.logger.Info("ReplicationMetricsEmitter shutting down.")
-	close(m.done)
+	m.cancelCtx()
+	if !common.AwaitWaitGroup(&m.wg, 5*time.Second) {
+		m.logger.Warn("ReplicationMetricsEmitter timed out on shutdown.")
+	}
 }
 
 func (m *MetricsEmitterImpl) emitMetricsLoop() {
-	ticker := time.NewTicker(metricsEmissionInterval)
+	defer m.wg.Done()
+
+	ticker := time.NewTicker(m.interval)
 	defer ticker.Stop()
 	defer func() { log.CapturePanic(recover(), m.logger, nil) }()
 
 	for {
 		select {
-		case <-m.done:
+		case <-m.ctx.Done():
 			return
 		case <-ticker.C:
 			m.emitMetrics()
@@ -149,7 +162,7 @@ func (m *MetricsEmitterImpl) determineReplicationLatency(remoteClusterName strin
 	logger := m.logger.WithTags(tag.RemoteCluster(remoteClusterName))
 	lastReadTaskID := m.shardData.GetClusterReplicationLevel(remoteClusterName)
 
-	tasks, _, err := m.reader.Read(ctx.Background(), lastReadTaskID, lastReadTaskID+1)
+	tasks, _, err := m.reader.Read(m.ctx, lastReadTaskID, lastReadTaskID+1)
 	if err != nil {
 		logger.Error(fmt.Sprintf(
 			"Error reading when determining replication latency, lastReadTaskID=%v", lastReadTaskID),
diff --git a/service/history/replication/metrics_emitter_test.go b/service/history/replication/metrics_emitter_test.go
index c91bb9b3598..5b7faab53c7 100644
--- a/service/history/replication/metrics_emitter_test.go
+++ b/service/history/replication/metrics_emitter_test.go
@@ -27,6 +27,7 @@ import (
 	"time"
 
 	"github.com/stretchr/testify/assert"
+	"go.uber.org/goleak"
 
 	"github.com/uber/cadence/common/clock"
 	"github.com/uber/cadence/common/cluster"
@@ -43,17 +44,23 @@ var (
 	cluster3 = "cluster3"
 )
 
+func TestMetricsEmitterStartStop(t *testing.T) {
+	defer goleak.VerifyNone(t)
+
+	timeSource := clock.NewMockedTimeSource()
+	metadata := newClusterMetadata(t)
+	testShardData := newTestShardData(timeSource, metadata)
+
+	metricsEmitter := NewMetricsEmitter(1, testShardData, fakeTaskReader{}, metrics.NewNoopMetricsClient())
+	metricsEmitter.interval = 5 * time.Millisecond
+	metricsEmitter.Start()
+	time.Sleep(20 * time.Millisecond) // let the metrics emitter run a few times
+	metricsEmitter.Stop()
+}
+
 func TestMetricsEmitter(t *testing.T) {
 	timeSource := clock.NewMockedTimeSource()
-	metadata := cluster.NewMetadata(0, cluster1, cluster1, map[string]config.ClusterInformation{
-		cluster1: {Enabled: true},
-		cluster2: {Enabled: true},
-		cluster3: {Enabled: true},
-	},
-		func(d string) bool { return false },
-		metrics.NewNoopMetricsClient(),
-		testlogger.New(t),
-	)
+	metadata := newClusterMetadata(t)
 	testShardData := newTestShardData(timeSource, metadata)
 
 	task1 := persistence.ReplicationTaskInfo{TaskID: 1, CreationTime: timeSource.Now().Add(-time.Hour).UnixNano()}
@@ -84,9 +91,7 @@ func TestMetricsEmitter(t *testing.T) {
 }
 
 type testShardData struct {
-	shardID                 int
 	logger                  log.Logger
-	maxReadLevel            int64
 	clusterReplicationLevel map[string]int64
 	timeSource              clock.TimeSource
 	metadata                cluster.Metadata
@@ -121,3 +126,15 @@ func (t testShardData) GetTimeSource() clock.TimeSource {
 func (t testShardData) GetClusterMetadata() cluster.Metadata {
 	return t.metadata
 }
+
+func newClusterMetadata(t *testing.T) cluster.Metadata {
+	return cluster.NewMetadata(0, cluster1, cluster1, map[string]config.ClusterInformation{
+		cluster1: {Enabled: true},
+		cluster2: {Enabled: true},
+		cluster3: {Enabled: true},
+	},
+		func(d string) bool { return false },
+		metrics.NewNoopMetricsClient(),
+		testlogger.New(t),
+	)
+}