Skip to content

Commit

Permalink
kvserver: add metric to track failed attempts to close timestamps
Browse files Browse the repository at this point in the history
This commit adds a store-level metric to track failed attempts to close
timestamps by the min prop tracker, due to either an epoch mismatch or pending
evaluation(s) below the timestamp it attempted to close.

This is intended to aid our debugging of stuck or lagging closed timestamps.

Release note: None
  • Loading branch information
aayushshah15 committed Oct 7, 2020
1 parent 0a51965 commit 1ccf5c3
Show file tree
Hide file tree
Showing 6 changed files with 39 additions and 2 deletions.
1 change: 1 addition & 0 deletions pkg/kv/kvserver/closedts/closedts.go
Original file line number Diff line number Diff line change
Expand Up @@ -64,6 +64,7 @@ type ReleaseFunc func(context.Context, ctpb.Epoch, roachpb.RangeID, ctpb.LAI)
type TrackerI interface {
// Close is given the next timestamp and the epoch the caller expects to be
// current. It returns a timestamp, a map from range ID to LAI, and a boolean
// (presumably whether the close attempt succeeded; confirm against the
// implementations).
Close(next hlc.Timestamp, expCurEpoch ctpb.Epoch) (hlc.Timestamp, map[roachpb.RangeID]ctpb.LAI, bool)
// Track returns a timestamp together with a ReleaseFunc (presumably to be
// invoked once the tracked work is finished; confirm against the
// implementations).
Track(ctx context.Context) (hlc.Timestamp, ReleaseFunc)
// FailedCloseAttempts reports the number of attempts that failed to close a
// timestamp.
FailedCloseAttempts() int64
}

// A Storage holds the closed timestamps and associated MLAIs for each node. It
Expand Down
3 changes: 3 additions & 0 deletions pkg/kv/kvserver/closedts/container/noop.go
Original file line number Diff line number Diff line change
Expand Up @@ -61,6 +61,9 @@ func (noopEverything) Close(
// Track is the no-op implementation: it hands back the zero timestamp along
// with a release function whose body is empty.
func (noopEverything) Track(ctx context.Context) (hlc.Timestamp, closedts.ReleaseFunc) {
	var zero hlc.Timestamp
	noopRelease := func(context.Context, ctpb.Epoch, roachpb.RangeID, ctpb.LAI) {}
	return zero, noopRelease
}
// FailedCloseAttempts always reports zero for the no-op implementation.
func (noopEverything) FailedCloseAttempts() int64 {
	const none int64 = 0
	return none
}
// VisitAscending is a no-op: the supplied callback is never invoked.
func (noopEverything) VisitAscending(roachpb.NodeID, func(ctpb.Entry) (done bool)) {}
// VisitDescending is a no-op: the supplied callback is never invoked.
func (noopEverything) VisitDescending(roachpb.NodeID, func(ctpb.Entry) (done bool)) {}
// Add is a no-op: the given entry is discarded.
func (noopEverything) Add(roachpb.NodeID, ctpb.Entry) {}
Expand Down
18 changes: 18 additions & 0 deletions pkg/kv/kvserver/closedts/minprop/tracker.go
Original file line number Diff line number Diff line change
Expand Up @@ -82,6 +82,10 @@ type Tracker struct {
leftMLAI, rightMLAI map[roachpb.RangeID]ctpb.LAI
leftRef, rightRef int
leftEpoch, rightEpoch ctpb.Epoch
// failedCloseAttempts keeps track of the number of attempts by the tracker
// that failed to close a timestamp due to an epoch mismatch or pending
// evaluations.
failedCloseAttempts int64
}
}

Expand Down Expand Up @@ -191,6 +195,12 @@ func (t *Tracker) Close(
) (ts hlc.Timestamp, mlai map[roachpb.RangeID]ctpb.LAI, ok bool) {
t.mu.Lock()
defer t.mu.Unlock()
defer func() {
if mlai == nil {
// Record if our attempt to close a timestamp fails.
t.mu.failedCloseAttempts++
}
}()

if log.V(3) {
log.Infof(context.TODO(),
Expand Down Expand Up @@ -293,6 +303,14 @@ func (t *Tracker) Track(ctx context.Context) (hlc.Timestamp, closedts.ReleaseFun
return minProp, release
}

// FailedCloseAttempts reports how many times the tracker has failed to close a
// timestamp because of an epoch mismatch or pending evaluations. The counter
// is read while holding the tracker's mutex.
func (t *Tracker) FailedCloseAttempts() int64 {
	t.mu.Lock()
	attempts := t.mu.failedCloseAttempts
	t.mu.Unlock()
	return attempts
}

// release is the business logic that properly accounts for the release of a
// tracked proposal. It is called from the ReleaseFunc closure returned from
// Track.
Expand Down
12 changes: 10 additions & 2 deletions pkg/kv/kvserver/metrics.go
Original file line number Diff line number Diff line change
Expand Up @@ -1016,6 +1016,12 @@ var (
Measurement: "Nanoseconds",
Unit: metric.Unit_NANOSECONDS,
}
// metaClosedTimestampFailuresToClose is the metadata for the metric that
// reports how many times the min prop tracker failed to close a timestamp,
// either because of an epoch mismatch or because of pending evaluations.
metaClosedTimestampFailuresToClose = metric.Metadata{
Name: "kv.closed_timestamp.failures_to_close",
Help: "Number of times the min prop tracker failed to close timestamps due to epoch mismatch or pending evaluations",
Measurement: "Attempts",
Unit: metric.Unit_COUNT,
}
)

// StoreMetrics is the set of metrics for a given store.
Expand Down Expand Up @@ -1208,7 +1214,8 @@ type StoreMetrics struct {
RangeFeedMetrics *rangefeed.Metrics

// Closed timestamp metrics.
ClosedTimestampMaxBehindNanos *metric.Gauge
ClosedTimestampMaxBehindNanos *metric.Gauge
ClosedTimestampFailuresToClose *metric.Gauge
}

// TenantsStorageMetrics are metrics which are aggregated over all tenants
Expand Down Expand Up @@ -1583,7 +1590,8 @@ func newStoreMetrics(histogramWindow time.Duration) *StoreMetrics {
RangeFeedMetrics: rangefeed.NewMetrics(),

// Closed timestamp metrics.
ClosedTimestampMaxBehindNanos: metric.NewGauge(metaClosedTimestampMaxBehindNanos),
ClosedTimestampMaxBehindNanos: metric.NewGauge(metaClosedTimestampMaxBehindNanos),
ClosedTimestampFailuresToClose: metric.NewGauge(metaClosedTimestampFailuresToClose),
}
storeRegistry.AddMetricStruct(sm)

Expand Down
3 changes: 3 additions & 0 deletions pkg/kv/kvserver/store.go
Original file line number Diff line number Diff line change
Expand Up @@ -2577,6 +2577,9 @@ func (s *Store) updateReplicationGauges(ctx context.Context) error {
nanos := timeutil.Since(minMaxClosedTS.GoTime()).Nanoseconds()
s.metrics.ClosedTimestampMaxBehindNanos.Update(nanos)
}
s.metrics.ClosedTimestampFailuresToClose.Update(
s.cfg.ClosedTimestamp.Tracker.FailedCloseAttempts(),
)

return nil
}
Expand Down
4 changes: 4 additions & 0 deletions pkg/ts/catalog/chart_catalog.go
Original file line number Diff line number Diff line change
Expand Up @@ -599,6 +599,10 @@ var charts = []sectionDescription{
Title: "Count",
Metrics: []string{"follower_reads.success_count"},
},
{
Title: "Failed Attempts To Close",
Metrics: []string{"kv.closed_timestamp.failures_to_close"},
},
},
},
{
Expand Down

0 comments on commit 1ccf5c3

Please sign in to comment.