Skip to content

Commit

Permalink
config: enable user configuration for retry strategy. (#1055)
Browse files Browse the repository at this point in the history
Signed-off-by: Smityz <smityz@qq.com>
Co-authored-by: disksing <i@disksing.com>
  • Loading branch information
Smityz and disksing authored Nov 14, 2023
1 parent 8fc8a52 commit 6ba909c
Show file tree
Hide file tree
Showing 38 changed files with 735 additions and 32 deletions.
File renamed without changes.
File renamed without changes.
207 changes: 207 additions & 0 deletions config/retry/config.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,207 @@
// Copyright 2021 TiKV Authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

// NOTE: The code in this file is based on code from the
// TiDB project, licensed under the Apache License v 2.0
//
// https://github.com/pingcap/tidb/tree/cc5e161ac06827589c4966674597c137cc9e809c/store/tikv/retry/config.go
//

// Copyright 2021 PingCAP, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package retry

import (
"context"
"math"
"math/rand"
"strings"
"time"

"github.com/prometheus/client_golang/prometheus"
tikverr "github.com/tikv/client-go/v2/error"
"github.com/tikv/client-go/v2/internal/logutil"
"github.com/tikv/client-go/v2/kv"
"github.com/tikv/client-go/v2/metrics"
"go.uber.org/zap"
)

// Config is the configuration of the Backoff function.
type Config struct {
name string
metric *prometheus.Observer
fnCfg *BackoffFnCfg
err error
}

// backoffFn is the backoff function which compute the sleep time and do sleep.
type backoffFn func(ctx context.Context, maxSleepMs int) int

func (c *Config) createBackoffFn(vars *kv.Variables) backoffFn {
if strings.EqualFold(c.name, txnLockFastName) {
return newBackoffFn(vars.BackoffLockFast, c.fnCfg.cap, c.fnCfg.jitter)
}
return newBackoffFn(c.fnCfg.base, c.fnCfg.cap, c.fnCfg.jitter)
}

// BackoffFnCfg is the configuration for the backoff func which implements exponential backoff with
// optional jitters.
// See http://www.awsarchitectureblog.com/2015/03/backoff.html
type BackoffFnCfg struct {
base int
cap int
jitter int
}

// NewBackoffFnCfg creates the config for BackoffFn.
func NewBackoffFnCfg(base, cap, jitter int) *BackoffFnCfg {
return &BackoffFnCfg{
base,
cap,
jitter,
}
}

// NewConfig creates a new Config for the Backoff operation.
func NewConfig(name string, metric *prometheus.Observer, backoffFnCfg *BackoffFnCfg, err error) *Config {
return &Config{
name: name,
metric: metric,
fnCfg: backoffFnCfg,
err: err,
}
}

func (c *Config) String() string {
return c.name
}

// SetErrors sets a more detailed error instead of the default bo config.
func (c *Config) SetErrors(err error) {
c.err = err
}

func (c *Config) SetBackoffFnCfg(fnCfg *BackoffFnCfg) {
c.fnCfg = fnCfg
}

const txnLockFastName = "txnLockFast"

// Backoff Config variables.
var (
// TODO: distinguish tikv and tiflash in metrics
BoTiKVRPC = NewConfig("tikvRPC", &metrics.BackoffHistogramRPC, NewBackoffFnCfg(100, 2000, EqualJitter), tikverr.ErrTiKVServerTimeout)
BoTiFlashRPC = NewConfig("tiflashRPC", &metrics.BackoffHistogramRPC, NewBackoffFnCfg(100, 2000, EqualJitter), tikverr.ErrTiFlashServerTimeout)
BoTxnLock = NewConfig("txnLock", &metrics.BackoffHistogramLock, NewBackoffFnCfg(100, 3000, EqualJitter), tikverr.ErrResolveLockTimeout)
BoPDRPC = NewConfig("pdRPC", &metrics.BackoffHistogramPD, NewBackoffFnCfg(500, 3000, EqualJitter), tikverr.NewErrPDServerTimeout(""))
// change base time to 2ms, because it may recover soon.
BoRegionMiss = NewConfig("regionMiss", &metrics.BackoffHistogramRegionMiss, NewBackoffFnCfg(2, 500, NoJitter), tikverr.ErrRegionUnavailable)
BoRegionScheduling = NewConfig("regionScheduling", &metrics.BackoffHistogramRegionScheduling, NewBackoffFnCfg(2, 500, NoJitter), tikverr.ErrRegionUnavailable)
BoTiKVServerBusy = NewConfig("tikvServerBusy", &metrics.BackoffHistogramServerBusy, NewBackoffFnCfg(2000, 10000, EqualJitter), tikverr.ErrTiKVServerBusy)
BoTiKVDiskFull = NewConfig("tikvDiskFull", &metrics.BackoffHistogramTiKVDiskFull, NewBackoffFnCfg(500, 5000, NoJitter), tikverr.ErrTiKVDiskFull)
BoRegionRecoveryInProgress = NewConfig("regionRecoveryInProgress", &metrics.BackoffHistogramRegionRecoveryInProgress, NewBackoffFnCfg(100, 10000, EqualJitter), tikverr.ErrRegionRecoveryInProgress)
BoTiFlashServerBusy = NewConfig("tiflashServerBusy", &metrics.BackoffHistogramServerBusy, NewBackoffFnCfg(2000, 10000, EqualJitter), tikverr.ErrTiFlashServerBusy)
BoTxnNotFound = NewConfig("txnNotFound", &metrics.BackoffHistogramEmpty, NewBackoffFnCfg(2, 500, NoJitter), tikverr.ErrResolveLockTimeout)
BoStaleCmd = NewConfig("staleCommand", &metrics.BackoffHistogramStaleCmd, NewBackoffFnCfg(2, 1000, NoJitter), tikverr.ErrTiKVStaleCommand)
BoMaxTsNotSynced = NewConfig("maxTsNotSynced", &metrics.BackoffHistogramEmpty, NewBackoffFnCfg(2, 500, NoJitter), tikverr.ErrTiKVMaxTimestampNotSynced)
BoMaxDataNotReady = NewConfig("dataNotReady", &metrics.BackoffHistogramDataNotReady, NewBackoffFnCfg(2, 2000, NoJitter), tikverr.ErrRegionDataNotReady)
BoMaxRegionNotInitialized = NewConfig("regionNotInitialized", &metrics.BackoffHistogramEmpty, NewBackoffFnCfg(2, 1000, NoJitter), tikverr.ErrRegionNotInitialized)
BoIsWitness = NewConfig("isWitness", &metrics.BackoffHistogramIsWitness, NewBackoffFnCfg(1000, 10000, EqualJitter), tikverr.ErrIsWitness)
// TxnLockFast's `base` load from vars.BackoffLockFast when create BackoffFn.
BoTxnLockFast = NewConfig(txnLockFastName, &metrics.BackoffHistogramLockFast, NewBackoffFnCfg(2, 3000, EqualJitter), tikverr.ErrResolveLockTimeout)
)

var isSleepExcluded = map[string]int{
BoTiKVServerBusy.name: 600000, // The max excluded limit is 10min.
// add BoTiFlashServerBusy if appropriate
}

// setBackoffExcluded is used for test only.
func setBackoffExcluded(name string, maxVal int) {
if _, ok := isSleepExcluded[name]; ok {
isSleepExcluded[name] = maxVal
}
}

const (
// NoJitter makes the backoff sequence strict exponential.
NoJitter = 1 + iota
// FullJitter applies random factors to strict exponential.
FullJitter
// EqualJitter is also randomized, but prevents very short sleeps.
EqualJitter
// DecorrJitter increases the maximum jitter based on the last random value.
DecorrJitter
)

// newBackoffFn creates a backoff func which implements exponential backoff with
// optional jitters.
// See http://www.awsarchitectureblog.com/2015/03/backoff.html
func newBackoffFn(base, cap, jitter int) backoffFn {
if base < 2 {
// Top prevent panic in 'rand.Intn'.
base = 2
}
attempts := 0
lastSleep := base
return func(ctx context.Context, maxSleepMs int) int {
var sleep int
switch jitter {
case NoJitter:
sleep = expo(base, cap, attempts)
case FullJitter:
v := expo(base, cap, attempts)
sleep = rand.Intn(v)
case EqualJitter:
v := expo(base, cap, attempts)
sleep = v/2 + rand.Intn(v/2)
case DecorrJitter:
sleep = int(math.Min(float64(cap), float64(base+rand.Intn(lastSleep*3-base))))
}
logutil.BgLogger().Debug("backoff",
zap.Int("base", base),
zap.Int("sleep", sleep),
zap.Int("attempts", attempts))

realSleep := sleep
// when set maxSleepMs >= 0 in `tikv.BackoffWithMaxSleep` will force sleep maxSleepMs milliseconds.
if maxSleepMs >= 0 && realSleep > maxSleepMs {
realSleep = maxSleepMs
}
select {
case <-time.After(time.Duration(realSleep) * time.Millisecond):
attempts++
lastSleep = sleep
return realSleep
case <-ctx.Done():
return 0
}
}
}

func expo(base, cap, n int) int {
return int(math.Min(float64(cap), float64(base)*math.Pow(2.0, float64(n))))
}
File renamed without changes.
2 changes: 1 addition & 1 deletion internal/client/client_batch.go
Original file line number Diff line number Diff line change
Expand Up @@ -48,9 +48,9 @@ import (
"github.com/pkg/errors"
"github.com/prometheus/client_golang/prometheus"
"github.com/tikv/client-go/v2/config"
"github.com/tikv/client-go/v2/config/retry"
tikverr "github.com/tikv/client-go/v2/error"
"github.com/tikv/client-go/v2/internal/logutil"
"github.com/tikv/client-go/v2/internal/retry"
"github.com/tikv/client-go/v2/metrics"
"github.com/tikv/client-go/v2/tikvrpc"
"github.com/tikv/client-go/v2/util"
Expand Down
4 changes: 2 additions & 2 deletions internal/client/main_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -25,8 +25,8 @@ func TestMain(m *testing.M) {
util.EnableFailpoints()
opts := []goleak.Option{
goleak.IgnoreTopFunction("google.golang.org/grpc.(*ClientConn).WaitForStateChange"),
goleak.IgnoreTopFunction("github.com/tikv/client-go/v2/internal/retry.newBackoffFn.func1"),
goleak.IgnoreTopFunction("github.com/tikv/client-go/v2/internal/retry.(*Config).createBackoffFn.newBackoffFn.func2"),
goleak.IgnoreTopFunction("github.com/tikv/client-go/v2/config/retry.newBackoffFn.func1"),
goleak.IgnoreTopFunction("github.com/tikv/client-go/v2/config/retry.(*Config).createBackoffFn.newBackoffFn.func2"),
}
goleak.VerifyTestMain(m, opts...)
}
Loading

0 comments on commit 6ba909c

Please sign in to comment.