diff --git a/pkg/util/metric/timing.go b/pkg/util/metric/timing.go new file mode 100644 index 000000000000..641438c9a1b3 --- /dev/null +++ b/pkg/util/metric/timing.go @@ -0,0 +1,147 @@ +// Copyright 2021 The Cockroach Authors. +// +// Use of this software is governed by the Business Source License +// included in the file licenses/BSL.txt. +// +// As of the Change Date specified in that file, in accordance with +// the Business Source License, use of this software will be governed +// by the Apache License, Version 2.0, included in the file +// licenses/APL.txt. + +package metric + +import ( + "context" + "time" +) + +// A Timing is a container that stores Events resulting from a single-threaded +// execution of an operation, which it thus breaks up into contiguous (i.e. +// non-overlapping and without gaps) time intervals. The use of a Timing +// integrates easily into existing code paths and interacts well with both +// metrics and tracing. +// +// With each event added, the caller learns the duration elapsed since the last +// event (or another previous event of choice), which facilitates recording the +// durations of sub-operations to metrics. A "middleware" OnEvent can be set up +// to tie into tracing, so that existing calls to `log.Event` can be lifted to +// calls to `Timing.Event`. +type Timing struct { + Now func() int64 + // OnEvent is called every time Event is called. It will be invoked directly + // from Event, i.e. logging in OnEvent needs to skip two stack frames to get + // to the caller of Event. + OnEvent func(context.Context, interface{}, time.Time) + + ents []entry +} + +type entry struct { + ts int64 // unix nanos + tr interface{} +} + +// Reset resets the Timing. This prepares it for recording a new +// operation while allowing re-use of underlying memory and configuration. +func (tm *Timing) Reset() { + tm.ents = append(tm.ents[:0], entry{ + ts: tm.Now(), + }) +} + +func (tm *Timing) lastIdx() int { + return len(tm.ents) - 1 +} + +type End struct { + Event interface{} +} + +// Event records an event to the timing. It returns the duration +// elapsed since the last Event, and the internal index assigned +// to the new event (for use in Between). The same event may be +// recorded multiple times, which is useful in case of retry +// loops. +// +// Note that the returned duration is only meaningful if the preceding +// event is stable. In the following code, the likelyhood of a third event +// being present (or slipping in down the road) is high and would lead to +// under-reporting of the duration attributed to 'pouncing': +// +// dur := tm.Event(ctx, "pouncing ends") +// doSomethingIncludingMaybeAddAnotherEvent(tm) +// fmt.Printf("pouncing took %s!", dur) +// +// The use of Between is generally preferable to avoid this problem in all but +// trivially correct code. +// +// TODO update comment +func (tm *Timing) Event(ctx context.Context, tr interface{}) int { + ts := tm.Now() + tm.ents = append(tm.ents, entry{ + ts: ts, + tr: tr, + }) + //tm.OnEvent(ctx, tr, ts) + return tm.lastIdx() +} + +// Between returns the duration elapsed between two calls to Event as identified +// by their respective indexes. +func (tm *Timing) Between(from, to int) time.Duration { + return time.Duration(tm.ents[to].ts - tm.ents[from].ts) // TODO overflow checks +} + +func (tm *Timing) Duration() time.Duration { + last := tm.lastIdx() + if last == 0 { + return 0 + } + return tm.Between(0, last) +} + +func (tm *Timing) Summary() map[interface{}]time.Duration { + res := make(map[interface{}]time.Duration) + m := map[interface{}]int{} // idx, key is never &End{X} + + for i := range tm.ents { + // TODO: also compute the duration for which there isn't a single open + // interval. This is "time between the cracks". We may double-account time + // when intervals overlap, but if no interval is open, that's something that + // should go in a separate bucket and be reported. + el := &tm.ents[i] + e, ok := el.tr.(*End) + if !ok { + idx, open := m[el.tr] + if open { + // Invalid nesting, close current interval and start new one. + res[el.tr] += tm.Between(idx, i) + // Fall through. + } + m[el.tr] = i + continue + } + // Closing an existing interval. If it isn't open, that's a programming + // error but we'll handle it gracefully by ignoring it. + idx, open := m[e.Event] + if !open { + continue // ignore + } + res[e.Event] += tm.Between(idx, i) + delete(m, e.Event) + } + for e, idx := range m { + // Close any leftover events. + res[e] += tm.Between(idx, tm.lastIdx()) + } + return res +} + +// TODO(tbg): provide a method to summarize the timings. +// This could power a granular per-range breakdown of execution +// timings via a map[struct{A, B interface{}}ewma.MovingAverage. +// where there is an entry for any events A and B that were ever +// observed in succession. We should only do this if we know that +// the set of possible events is small, for example if they are +// guaranteed to come from a small set of singletons (as is the +// envisioned reality). diff --git a/pkg/util/metric/timing_test.go b/pkg/util/metric/timing_test.go new file mode 100644 index 000000000000..c6ff793e4d05 --- /dev/null +++ b/pkg/util/metric/timing_test.go @@ -0,0 +1,86 @@ +// Copyright 2022 The Cockroach Authors. +// +// Use of this software is governed by the Business Source License +// included in the file licenses/BSL.txt. +// +// As of the Change Date specified in that file, in accordance with +// the Business Source License, use of this software will be governed +// by the Apache License, Version 2.0, included in the file +// licenses/APL.txt. + +package metric + +import ( + "context" + "math/rand" + "testing" + + "github.com/cockroachdb/cockroach/pkg/util/hlc" +) + +func TestTiming(t *testing.T) { + ctx := context.Background() + + const ( + evStart = iota + evLeaseStart + evContentionStart + evSequenceStart + evEvalStart + evReplStart + ) + mc := hlc.NewManualClock(1) + tm := &Timing{Now: func() int64 { + return mc.UnixNano() + }} + r := rand.New(rand.NewSource(123)) + _ = r + tick := func() { + //nanos := r.Int63n(100) + const nanos = 1000 + mc.Increment(nanos) + } + + tm.Event(ctx, evStart) + tick() + tm.Event(ctx, &End{evStart}) + tick() + tick() + tick() + tm.Event(ctx, evStart) + tick() + tm.Event(ctx, &End{evStart}) + + if false { + tm.Event(ctx, evStart) + tick() + tm.Event(ctx, &End{evLeaseStart}) + tick() // where is this accounted for? Can return "gaps" from summary? + tm.Event(ctx, evLeaseStart) + tick() + tm.Event(ctx, &End{evLeaseStart}) + tick() + tm.Event(ctx, evSequenceStart) + tick() + tm.Event(ctx, &End{evSequenceStart}) + tick() + tm.Event(ctx, evContentionStart) + tick() + tm.Event(ctx, &End{evContentionStart}) + tick() + tm.Event(ctx, evSequenceStart) + tick() + tm.Event(ctx, &End{evSequenceStart}) + tick() + tm.Event(ctx, evEvalStart) + tick() + tm.Event(ctx, &End{evEvalStart}) + tick() + tm.Event(ctx, evReplStart) + tick() + tm.Event(ctx, &End{evReplStart}) + } + + //require.EqualValues(t, mc.UnixNano(), 1+tm.Duration()) + t.Error(tm.Summary()) +}