Skip to content

Commit

Permalink
Fix replication metric emitter shutdown (cadence-workflow#6117)
Browse files Browse the repository at this point in the history
  • Loading branch information
taylanisikdemir authored and timl3136 committed Jun 6, 2024
1 parent 6a522cd commit c3ba32a
Show file tree
Hide file tree
Showing 2 changed files with 48 additions and 18 deletions.
27 changes: 20 additions & 7 deletions service/history/replication/metrics_emitter.go
Original file line number Diff line number Diff line change
Expand Up @@ -23,9 +23,10 @@
package replication

import (
ctx "context"
"context"
"fmt"
"strconv"
"sync"
"sync/atomic"
"time"

Expand Down Expand Up @@ -53,7 +54,10 @@ type (
scope metrics.Scope
logger log.Logger
status int32
done chan struct{}
interval time.Duration
ctx context.Context
cancelCtx context.CancelFunc
wg sync.WaitGroup
}

// metricsEmitterShardData is for testing.
Expand Down Expand Up @@ -84,6 +88,7 @@ func NewMetricsEmitter(
tag.ClusterName(currentCluster),
tag.ShardID(shardID))

ctx, cancel := context.WithCancel(context.Background())
return &MetricsEmitterImpl{
shardID: shardID,
currentCluster: currentCluster,
Expand All @@ -92,8 +97,10 @@ func NewMetricsEmitter(
shardData: shardData,
reader: reader,
scope: scope,
interval: metricsEmissionInterval,
logger: logger,
done: make(chan struct{}),
ctx: ctx,
cancelCtx: cancel,
}
}

Expand All @@ -102,6 +109,7 @@ func (m *MetricsEmitterImpl) Start() {
return
}

m.wg.Add(1)
go m.emitMetricsLoop()
m.logger.Info("ReplicationMetricsEmitter started.")
}
Expand All @@ -112,17 +120,22 @@ func (m *MetricsEmitterImpl) Stop() {
}

m.logger.Info("ReplicationMetricsEmitter shutting down.")
close(m.done)
m.cancelCtx()
if !common.AwaitWaitGroup(&m.wg, 5*time.Second) {
m.logger.Warn("ReplicationMetricsEmitter timed out on shutdown.")
}
}

func (m *MetricsEmitterImpl) emitMetricsLoop() {
ticker := time.NewTicker(metricsEmissionInterval)
defer m.wg.Done()

ticker := time.NewTicker(m.interval)
defer ticker.Stop()
defer func() { log.CapturePanic(recover(), m.logger, nil) }()

for {
select {
case <-m.done:
case <-m.ctx.Done():
return
case <-ticker.C:
m.emitMetrics()
Expand All @@ -149,7 +162,7 @@ func (m *MetricsEmitterImpl) determineReplicationLatency(remoteClusterName strin
logger := m.logger.WithTags(tag.RemoteCluster(remoteClusterName))
lastReadTaskID := m.shardData.GetClusterReplicationLevel(remoteClusterName)

tasks, _, err := m.reader.Read(ctx.Background(), lastReadTaskID, lastReadTaskID+1)
tasks, _, err := m.reader.Read(m.ctx, lastReadTaskID, lastReadTaskID+1)
if err != nil {
logger.Error(fmt.Sprintf(
"Error reading when determining replication latency, lastReadTaskID=%v", lastReadTaskID),
Expand Down
39 changes: 28 additions & 11 deletions service/history/replication/metrics_emitter_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@ import (
"time"

"github.com/stretchr/testify/assert"
"go.uber.org/goleak"

"github.com/uber/cadence/common/clock"
"github.com/uber/cadence/common/cluster"
Expand All @@ -43,17 +44,23 @@ var (
cluster3 = "cluster3"
)

// TestMetricsEmitterStartStop starts the emitter with a short emission
// interval, lets it tick a few times, stops it, and then checks that no
// goroutines were left running.
func TestMetricsEmitterStartStop(t *testing.T) {
	// goleak.VerifyNone inspects the goroutines that exist at the moment it is
	// called. It has to be deferred so that it runs after Stop; calling it
	// up front would only check the state before the emitter was ever started.
	defer goleak.VerifyNone(t)

	timeSource := clock.NewMockedTimeSource()
	metadata := newClusterMetadata(t)
	testShardData := newTestShardData(timeSource, metadata)

	metricsEmitter := NewMetricsEmitter(1, testShardData, fakeTaskReader{}, metrics.NewNoopMetricsClient())
	// Shorten the emission interval so the loop fires several times during the
	// brief sleep below.
	metricsEmitter.interval = 5 * time.Millisecond
	metricsEmitter.Start()
	time.Sleep(20 * time.Millisecond) // let the metrics emitter run a few times
	metricsEmitter.Stop()
}

func TestMetricsEmitter(t *testing.T) {
timeSource := clock.NewMockedTimeSource()
metadata := cluster.NewMetadata(0, cluster1, cluster1, map[string]config.ClusterInformation{
cluster1: {Enabled: true},
cluster2: {Enabled: true},
cluster3: {Enabled: true},
},
func(d string) bool { return false },
metrics.NewNoopMetricsClient(),
testlogger.New(t),
)
metadata := newClusterMetadata(t)
testShardData := newTestShardData(timeSource, metadata)

task1 := persistence.ReplicationTaskInfo{TaskID: 1, CreationTime: timeSource.Now().Add(-time.Hour).UnixNano()}
Expand Down Expand Up @@ -84,9 +91,7 @@ func TestMetricsEmitter(t *testing.T) {
}

type testShardData struct {
shardID int
logger log.Logger
maxReadLevel int64
clusterReplicationLevel map[string]int64
timeSource clock.TimeSource
metadata cluster.Metadata
Expand Down Expand Up @@ -121,3 +126,15 @@ func (t testShardData) GetTimeSource() clock.TimeSource {
// GetClusterMetadata returns the cluster metadata stored on the test shard data.
func (t testShardData) GetClusterMetadata() cluster.Metadata {
	metadata := t.metadata
	return metadata
}

// newClusterMetadata builds the cluster.Metadata fixture shared by the tests in
// this file. It registers cluster1, cluster2 and cluster3 as enabled, passes
// cluster1 for both cluster-name arguments, supplies a predicate that always
// returns false, and wires in a no-op metrics client and a test logger bound
// to t.
func newClusterMetadata(t *testing.T) cluster.Metadata {
	clusters := map[string]config.ClusterInformation{
		cluster1: {Enabled: true},
		cluster2: {Enabled: true},
		cluster3: {Enabled: true},
	}
	alwaysFalse := func(string) bool { return false }

	return cluster.NewMetadata(
		0,
		cluster1,
		cluster1,
		clusters,
		alwaysFalse,
		metrics.NewNoopMetricsClient(),
		testlogger.New(t),
	)
}

0 comments on commit c3ba32a

Please sign in to comment.