Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

(WIP) storage: add storage lag metrics #1168

Open
wants to merge 1 commit into
base: master
Choose a base branch
from
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion go.mod
Original file line number Diff line number Diff line change
@@ -38,6 +38,7 @@ require (
golang.org/x/net v0.0.0-20220225172249-27dd8689420f
golang.org/x/sync v0.0.0-20210220032951-036812b2e83c
golang.org/x/sys v0.0.0-20220317061510-51cd9980dadf
golang.org/x/time v0.0.0-20220224211638-0e9765cccd65
google.golang.org/grpc v1.44.0
)

@@ -158,7 +159,6 @@ require (
golang.org/x/crypto v0.0.0-20210921155107-089bfa567519 // indirect
golang.org/x/oauth2 v0.0.0-20211104180415-d3ed0bb246c8 // indirect
golang.org/x/text v0.3.7 // indirect
golang.org/x/time v0.0.0-20220224211638-0e9765cccd65 // indirect
golang.org/x/tools v0.1.10 // indirect
golang.org/x/xerrors v0.0.0-20200804184101-5ec99f83aff1 // indirect
google.golang.org/api v0.69.0 // indirect
10 changes: 10 additions & 0 deletions pump/storage/metrics.go
Original file line number Diff line number Diff line change
@@ -108,6 +108,15 @@ var (
Help: "How long the catch up step takes to run.",
Buckets: prometheus.ExponentialBuckets(0.001, 2, 22),
})

commitTsLagHistogram = prometheus.NewHistogramVec(
prometheus.HistogramOpts{
Namespace: "binlog",
Subsystem: "pump_storage",
Name: "commit_ts_lag_time",
Help: "Bucketed histogram of the lag of currently handled maximum commit-ts",
Buckets: prometheus.ExponentialBuckets(0.00005, 2, 18),
}, []string{"type"})
)

// InitMetircs register the metrics to registry
@@ -123,4 +132,5 @@ func InitMetircs(registry *prometheus.Registry) {
registry.MustRegister(storageSizeGauge)
registry.MustRegister(slowChaserCount)
registry.MustRegister(slowChaserCatchUpTimeHistogram)
registry.MustRegister(commitTsLagHistogram)
}
52 changes: 48 additions & 4 deletions pump/storage/sorter.go
Original file line number Diff line number Diff line change
@@ -16,6 +16,10 @@ package storage
import (
"container/list"
"fmt"
"github.com/prometheus/client_golang/prometheus"
"github.com/tikv/client-go/v2/oracle"
"go.uber.org/zap"
"golang.org/x/time/rate"
"math/rand"
"sync"
"sync/atomic"
@@ -105,6 +109,9 @@ type sorter struct {
// save the startTS of txn which we need to wait for the C binlog
waitStartTS map[int64]struct{}

commitTsLagMetrics prometheus.Observer
maxPushItemCommitTs int64 // atomic. Used for metrics only

lock sync.Mutex
cond *sync.Cond
items *list.List
@@ -114,14 +121,16 @@ type sorter struct {

func newSorter(fn func(item sortItem)) *sorter {
sorter := &sorter{
maxTSItemCB: fn,
items: list.New(),
waitStartTS: make(map[int64]struct{}),
maxTSItemCB: fn,
items: list.New(),
waitStartTS: make(map[int64]struct{}),
commitTsLagMetrics: commitTsLagHistogram.With(map[string]string{"type": "sorter"}),
}

sorter.cond = sync.NewCond(&sorter.lock)
sorter.wg.Add(1)
sorter.wg.Add(2)
go sorter.run()
go sorter.updateMetrics()

return sorter
}
@@ -148,6 +157,12 @@ func (s *sorter) pushTSItem(item sortItem) {
if item.tp == pb.BinlogType_Prewrite {
s.waitStartTS[item.start] = struct{}{}
} else {
if item.commit > atomic.LoadInt64(&s.maxPushItemCommitTs) {
// No need to CAS because we are inside a lock, and
// this is the only place where s.maxPushItemCommitTs
// is written to.
atomic.StoreInt64(&s.maxPushItemCommitTs, item.commit)
}
delete(s.waitStartTS, item.start)
s.cond.Signal()
}
@@ -225,6 +240,35 @@ func (s *sorter) run() {
}
}

func (s *sorter) updateMetrics() {
defer s.wg.Done()

tick := time.NewTicker(1 * time.Second)
defer tick.Stop()

rl := rate.NewLimiter(rate.Every(time.Second*10), 1)

for range tick.C {
if s.isClosed() {
return
}

curMaxPushItemCommitTs := atomic.LoadInt64(&s.maxPushItemCommitTs)
maxCommitTsPhysicalMs := oracle.ExtractPhysical(uint64(curMaxPushItemCommitTs))
millisecond := time.Now().UnixMilli() - maxCommitTsPhysicalMs

// Update metrics
s.commitTsLagMetrics.Observe(float64(millisecond) / 1000.0)

if millisecond > 10000 /* 10s */ && rl.Allow() {
// Prints a log if the lag is more than 10 seconds, and
// if the rate limiter allows.
duration := time.Duration(millisecond) * time.Millisecond
log.Info("sorter commit ts lag", zap.Duration("duration", duration))
}
}
}

func (s *sorter) isClosed() bool {
return atomic.LoadInt32(&s.closed) == 1
}
29 changes: 29 additions & 0 deletions pump/storage/storage.go
Original file line number Diff line number Diff line change
@@ -16,6 +16,7 @@ package storage
import (
"context"
"encoding/binary"
"golang.org/x/time/rate"
"math"
"os"
"path"
@@ -119,6 +120,8 @@ type Append struct {

options *Options

maxWriteToKVCommitTs int64 // atomic. Used for metrics only.

close chan struct{}
wg sync.WaitGroup
}
@@ -362,6 +365,14 @@ func (a *Append) updateStatus() {
logStatsTicker := time.NewTicker(time.Second * 10)
defer logStatsTicker.Stop()

updateMetricsTicker := time.NewTicker(time.Second * 1)
defer updateMetricsTicker.Stop()

kvCommitTsLagMetrics := commitTsLagHistogram.With(map[string]string{
"type": "kv",
})
logRL := rate.NewLimiter(rate.Every(time.Second*10), 1)

for {
select {
case <-a.close:
@@ -389,6 +400,20 @@ func (a *Append) updateStatus() {
log.Warn("in WritePaused stat")
}
}
case <-updateMetricsTicker.C:
curKVCommitTs := atomic.LoadInt64(&a.maxWriteToKVCommitTs)
maxCommitTsPhysicalMs := oracle.ExtractPhysical(uint64(curKVCommitTs))
millisecond := time.Now().UnixMilli() - maxCommitTsPhysicalMs

// Update metrics
kvCommitTsLagMetrics.Observe(float64(millisecond) / 1000.0)

if millisecond > 10000 /* 10s */ && logRL.Allow() {
// Prints a log if the lag is more than 10 seconds, and
// if the rate limiter allows.
duration := time.Duration(millisecond) * time.Millisecond
log.Info("kv commit ts lag", zap.Duration("duration", duration))
}
}
}
}
@@ -888,6 +913,10 @@ func (a *Append) writeToKV(reqs chan *request) chan *request {
return
}
for _, req := range bufReqs {
if req.commitTS > atomic.LoadInt64(&a.maxWriteToKVCommitTs) {
atomic.StoreInt64(&a.maxWriteToKVCommitTs, req.commitTS)
}

done <- req
}
}