Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

resource_control: allow configuration of the maximum retry time for the local bucket (#8352) #8360

Merged
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
72 changes: 65 additions & 7 deletions client/resource_group/controller/config.go
Original file line number Diff line number Diff line change
Expand Up @@ -52,6 +52,12 @@ const (
defaultTargetPeriod = 5 * time.Second
// defaultMaxWaitDuration is the max duration to wait for the token before throwing error.
defaultMaxWaitDuration = 30 * time.Second
// defaultLTBTokenRPCMaxDelay is the upper bound of backoff delay for local token bucket RPC.
defaultLTBTokenRPCMaxDelay = 1 * time.Second
// defaultWaitRetryTimes is the number of times to retry when waiting for the token.
defaultWaitRetryTimes = 20
// defaultWaitRetryInterval is the interval to retry when waiting for the token.
defaultWaitRetryInterval = 50 * time.Millisecond
)

const (
Expand All @@ -73,18 +79,36 @@ const (

// Because the resource manager has not been deployed in microservice mode,
// do not enable this function.
defaultDegradedModeWaitDuration = 0
defaultDegradedModeWaitDuration = time.Duration(0)
defaultAvgBatchProportion = 0.7
)

// Config is the configuration of the resource manager controller which includes some option for client needed.
type Config struct {
// TokenRPCParams holds the retry parameters used while waiting for tokens
// from the local token bucket.
type TokenRPCParams struct {
// WaitRetryInterval is the interval between retries when waiting for the token.
// Config.Adjust replaces a zero value with defaultWaitRetryInterval.
WaitRetryInterval Duration `toml:"wait-retry-interval" json:"wait-retry-interval"`

// WaitRetryTimes is the number of times to retry when waiting for the token.
// Config.Adjust derives it from LTBTokenRPCMaxDelay / WaitRetryInterval.
WaitRetryTimes int `toml:"wait-retry-times" json:"wait-retry-times"`
}

// LocalBucketConfig is the client-side configuration for the local token bucket.
// It is not exported to the server side.
type LocalBucketConfig struct {
// TokenRPCParams is embedded but carries explicit tags, so encoding/json
// treats it as a field named "token-rpc-params" rather than flattening it.
TokenRPCParams `toml:"token-rpc-params" json:"token-rpc-params"`
}

// BaseConfig is the configuration of the resource manager controller which includes some option for client needed.
// TODO: unify the configuration for client and server; the server side is in pkg/mcs/resourcemanager/server/config.go.
type BaseConfig struct {
// DegradedModeWaitDuration controls whether the resource control client enables degraded mode when the server is disconnected.
DegradedModeWaitDuration Duration `toml:"degraded-mode-wait-duration" json:"degraded-mode-wait-duration"`

// LTBMaxWaitDuration is the max wait time duration for local token bucket.
LTBMaxWaitDuration Duration `toml:"ltb-max-wait-duration" json:"ltb-max-wait-duration"`

// LTBTokenRPCMaxDelay is the upper bound of backoff delay for local token bucket RPC.
LTBTokenRPCMaxDelay Duration `toml:"ltb-token-rpc-max-delay" json:"ltb-token-rpc-max-delay"`

// RequestUnit is the configuration determines the coefficients of the RRU and WRU cost.
// This configuration should be modified carefully.
RequestUnit RequestUnitConfig `toml:"request-unit" json:"request-unit"`
Expand All @@ -93,13 +117,43 @@ type Config struct {
EnableControllerTraceLog bool `toml:"enable-controller-trace-log" json:"enable-controller-trace-log,string"`
}

// Config is the configuration of the resource manager controller.
// It embeds BaseConfig and LocalBucketConfig, so their fields are promoted
// and can be accessed directly on a Config value.
type Config struct {
BaseConfig
LocalBucketConfig
}

// Adjust fills in missing values and derives the client-side retry settings.
//
// A zero LTBMaxWaitDuration is replaced with defaultMaxWaitDuration, and a
// non-positive WaitRetryInterval is replaced with defaultWaitRetryInterval.
// WaitRetryTimes is then set to LTBTokenRPCMaxDelay / WaitRetryInterval, so
// any previously stored retry count is overwritten by the derived one.
func (c *Config) Adjust() {
	// Validate the configuration. TODO: move validation into a separate function.
	if c.LTBMaxWaitDuration.Duration == 0 {
		c.LTBMaxWaitDuration = NewDuration(defaultMaxWaitDuration)
	}
	// The interval is used as a divisor below, so it must be strictly positive;
	// a negative interval would also produce a negative retry count.
	if c.WaitRetryInterval.Duration <= 0 {
		c.WaitRetryInterval = NewDuration(defaultWaitRetryInterval)
	}
	// Adjust the client settings by deriving the retry count from the maximum
	// delay. The division is done on the 64-bit duration values, so nothing is
	// truncated to a platform-sized int before the quotient is computed. No
	// "already consistent" pre-check is needed: whenever
	// WaitRetryInterval * WaitRetryTimes equals LTBTokenRPCMaxDelay the
	// quotient is WaitRetryTimes itself, so the assignment is then a no-op.
	c.WaitRetryTimes = int(c.LTBTokenRPCMaxDelay.Duration / c.WaitRetryInterval.Duration)
}

// DefaultConfig returns the default resource manager controller configuration.
func DefaultConfig() *Config {
return &Config{
DegradedModeWaitDuration: NewDuration(defaultDegradedModeWaitDuration),
LTBMaxWaitDuration: NewDuration(defaultMaxWaitDuration),
RequestUnit: DefaultRequestUnitConfig(),
EnableControllerTraceLog: false,
BaseConfig: BaseConfig{
DegradedModeWaitDuration: NewDuration(defaultDegradedModeWaitDuration),
RequestUnit: DefaultRequestUnitConfig(),
EnableControllerTraceLog: false,
LTBMaxWaitDuration: NewDuration(defaultMaxWaitDuration),
LTBTokenRPCMaxDelay: NewDuration(defaultLTBTokenRPCMaxDelay),
},
LocalBucketConfig: LocalBucketConfig{
TokenRPCParams: TokenRPCParams{
WaitRetryInterval: NewDuration(defaultWaitRetryInterval),
WaitRetryTimes: defaultWaitRetryTimes,
},
},
}
}

Expand Down Expand Up @@ -155,6 +209,8 @@ type RUConfig struct {

// some config for client
LTBMaxWaitDuration time.Duration
WaitRetryInterval time.Duration
WaitRetryTimes int
DegradedModeWaitDuration time.Duration
}

Expand All @@ -176,6 +232,8 @@ func GenerateRUConfig(config *Config) *RUConfig {
WriteBytesCost: RequestUnit(config.RequestUnit.WriteCostPerByte),
CPUMsCost: RequestUnit(config.RequestUnit.CPUMsCost),
LTBMaxWaitDuration: config.LTBMaxWaitDuration.Duration,
WaitRetryInterval: config.WaitRetryInterval.Duration,
WaitRetryTimes: config.WaitRetryTimes,
DegradedModeWaitDuration: config.DegradedModeWaitDuration.Duration,
}
}
33 changes: 23 additions & 10 deletions client/resource_group/controller/controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -39,8 +39,6 @@ import (

const (
controllerConfigPath = "resource_group/controller"
maxRetry = 10
retryInterval = 50 * time.Millisecond
maxNotificationChanLen = 200
needTokensAmplification = 1.1
trickleReserveDuration = 1250 * time.Millisecond
Expand Down Expand Up @@ -104,6 +102,20 @@ func WithMaxWaitDuration(d time.Duration) ResourceControlCreateOption {
}
}

// WithWaitRetryInterval returns a create option that stores d as the
// WaitRetryInterval of the controller's RU config, i.e. the interval to
// retry when waiting for the token.
func WithWaitRetryInterval(d time.Duration) ResourceControlCreateOption {
	setInterval := func(rc *ResourceGroupsController) {
		rc.ruConfig.WaitRetryInterval = d
	}
	return setInterval
}

// WithWaitRetryTimes returns a create option that stores times as the
// WaitRetryTimes of the controller's RU config, i.e. how many times to
// retry when waiting for the token.
func WithWaitRetryTimes(times int) ResourceControlCreateOption {
	setTimes := func(rc *ResourceGroupsController) {
		rc.ruConfig.WaitRetryTimes = times
	}
	return setTimes
}

var _ ResourceGroupKVInterceptor = (*ResourceGroupsController)(nil)

// ResourceGroupsController implements ResourceGroupKVInterceptor.
Expand Down Expand Up @@ -172,6 +184,7 @@ func NewResourceGroupController(
log.Info("load resource controller config", zap.Reflect("config", config), zap.Reflect("ru-config", controller.ruConfig))
controller.calculators = []ResourceCalculator{newKVCalculator(controller.ruConfig), newSQLCalculator(controller.ruConfig)}
controller.safeRuConfig.Store(controller.ruConfig)
enableControllerTraceLog.Store(config.EnableControllerTraceLog)
return controller, nil
}

Expand All @@ -180,12 +193,13 @@ func loadServerConfig(ctx context.Context, provider ResourceGroupProvider) (*Con
if err != nil {
return nil, err
}
config := DefaultConfig()
defer config.Adjust()
kvs := resp.GetKvs()
if len(kvs) == 0 {
log.Warn("[resource group controller] server does not save config, load config failed")
return DefaultConfig(), nil
return config, nil
}
config := &Config{}
err = json.Unmarshal(kvs[0].GetValue(), config)
if err != nil {
return nil, err
Expand Down Expand Up @@ -288,7 +302,6 @@ func (c *ResourceGroupsController) Start(ctx context.Context) {
watchRetryTimer.Reset(watchRetryInterval)
}
}

case <-emergencyTokenAcquisitionTicker.C:
c.executeOnAllGroups((*groupCostController).resetEmergencyTokenAcquisition)
/* channels */
Expand Down Expand Up @@ -366,10 +379,11 @@ func (c *ResourceGroupsController) Start(ctx context.Context) {
}
for _, item := range resp {
cfgRevision = item.Kv.ModRevision
config := &Config{}
config := DefaultConfig()
if err := json.Unmarshal(item.Kv.Value, config); err != nil {
continue
}
config.Adjust()
c.ruConfig = GenerateRUConfig(config)

// Stay compatible with serverless
Expand All @@ -383,7 +397,6 @@ func (c *ResourceGroupsController) Start(ctx context.Context) {
}
log.Info("load resource controller config after config changed", zap.Reflect("config", config), zap.Reflect("ruConfig", c.ruConfig))
}

case gc := <-c.tokenBucketUpdateChan:
now := gc.run.now
go gc.handleTokenBucketUpdateEvent(c.loopCtx, now)
Expand Down Expand Up @@ -1228,7 +1241,7 @@ func (gc *groupCostController) onRequestWait(
var i int
var d time.Duration
retryLoop:
for i = 0; i < maxRetry; i++ {
for i = 0; i < gc.mainCfg.WaitRetryTimes; i++ {
switch gc.mode {
case rmpb.GroupMode_RawMode:
res := make([]*Reservation, 0, len(requestResourceLimitTypeList))
Expand All @@ -1252,8 +1265,8 @@ func (gc *groupCostController) onRequestWait(
}
}
gc.metrics.requestRetryCounter.Inc()
time.Sleep(retryInterval)
waitDuration += retryInterval
time.Sleep(gc.mainCfg.WaitRetryInterval)
waitDuration += gc.mainCfg.WaitRetryInterval
}
if err != nil {
if errs.ErrClientResourceGroupThrottled.Equal(err) {
Expand Down
25 changes: 18 additions & 7 deletions pkg/mcs/resourcemanager/server/config.go
Original file line number Diff line number Diff line change
Expand Up @@ -59,6 +59,8 @@ const (
defaultDegradedModeWaitDuration = time.Second * 0
// defaultMaxWaitDuration is the max duration to wait for the token before throwing error.
defaultMaxWaitDuration = 30 * time.Second
// defaultLTBTokenRPCMaxDelay is the upper bound of backoff delay for local token bucket RPC.
defaultLTBTokenRPCMaxDelay = 1 * time.Second
)

// Config is the configuration for the resource manager.
Expand Down Expand Up @@ -99,6 +101,9 @@ type ControllerConfig struct {
// LTBMaxWaitDuration is the max wait time duration for local token bucket.
LTBMaxWaitDuration typeutil.Duration `toml:"ltb-max-wait-duration" json:"ltb-max-wait-duration"`

// LTBTokenRPCMaxDelay is the upper bound of backoff delay for local token bucket RPC.
LTBTokenRPCMaxDelay typeutil.Duration `toml:"ltb-token-rpc-max-delay" json:"ltb-token-rpc-max-delay"`

// RequestUnit is the configuration determines the coefficients of the RRU and WRU cost.
// This configuration should be modified carefully.
RequestUnit RequestUnitConfig `toml:"request-unit" json:"request-unit"`
Expand All @@ -112,10 +117,16 @@ func (rmc *ControllerConfig) Adjust(meta *configutil.ConfigMetaData) {
if rmc == nil {
return
}
rmc.RequestUnit.Adjust()

configutil.AdjustDuration(&rmc.DegradedModeWaitDuration, defaultDegradedModeWaitDuration)
configutil.AdjustDuration(&rmc.LTBMaxWaitDuration, defaultMaxWaitDuration)
rmc.RequestUnit.Adjust(meta.Child("request-unit"))
if !meta.IsDefined("degraded-mode-wait-duration") {
configutil.AdjustDuration(&rmc.DegradedModeWaitDuration, defaultDegradedModeWaitDuration)
}
if !meta.IsDefined("ltb-max-wait-duration") {
configutil.AdjustDuration(&rmc.LTBMaxWaitDuration, defaultMaxWaitDuration)
}
if !meta.IsDefined("ltb-token-rpc-max-delay") {
configutil.AdjustDuration(&rmc.LTBTokenRPCMaxDelay, defaultLTBTokenRPCMaxDelay)
}
failpoint.Inject("enableDegradedMode", func() {
configutil.AdjustDuration(&rmc.DegradedModeWaitDuration, time.Second)
})
Expand Down Expand Up @@ -145,7 +156,7 @@ type RequestUnitConfig struct {
}

// Adjust adjusts the configuration and initializes it with the default value if necessary.
func (ruc *RequestUnitConfig) Adjust() {
func (ruc *RequestUnitConfig) Adjust(_ *configutil.ConfigMetaData) {
if ruc == nil {
return
}
Expand Down Expand Up @@ -202,11 +213,11 @@ func (c *Config) Parse(flagSet *pflag.FlagSet) error {
configutil.AdjustCommandLineString(flagSet, &c.ListenAddr, "listen-addr")
configutil.AdjustCommandLineString(flagSet, &c.AdvertiseListenAddr, "advertise-listen-addr")

return c.Adjust(meta, false)
return c.Adjust(meta)
}

// Adjust is used to adjust the resource manager configurations.
func (c *Config) Adjust(meta *toml.MetaData, reloading bool) error {
func (c *Config) Adjust(meta *toml.MetaData) error {
configMetaData := configutil.NewConfigMetadata(meta)
if err := configMetaData.CheckUndecoded(); err != nil {
c.WarningMsgs = append(c.WarningMsgs, err.Error())
Expand Down
8 changes: 5 additions & 3 deletions pkg/mcs/resourcemanager/server/config_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@ func TestControllerConfig(t *testing.T) {
cfgData := `
[controller]
ltb-max-wait-duration = "60s"
ltb-token-rpc-max-delay = "500ms"
degraded-mode-wait-duration = "2s"
[controller.request-unit]
read-base-cost = 1.0
Expand All @@ -39,11 +40,12 @@ read-cpu-ms-cost = 5.0
cfg := NewConfig()
meta, err := toml.Decode(cfgData, &cfg)
re.NoError(err)
err = cfg.Adjust(&meta, false)
err = cfg.Adjust(&meta)
re.NoError(err)

re.Equal(cfg.Controller.DegradedModeWaitDuration.Duration, time.Second*2)
re.Equal(cfg.Controller.LTBMaxWaitDuration.Duration, time.Second*60)
re.Equal(2*time.Second, cfg.Controller.DegradedModeWaitDuration.Duration)
re.Equal(60*time.Second, cfg.Controller.LTBMaxWaitDuration.Duration)
re.Equal(500*time.Millisecond, cfg.Controller.LTBTokenRPCMaxDelay.Duration)
re.LessOrEqual(math.Abs(cfg.Controller.RequestUnit.CPUMsCost-5), 1e-7)
re.LessOrEqual(math.Abs(cfg.Controller.RequestUnit.WriteCostPerByte-4), 1e-7)
re.LessOrEqual(math.Abs(cfg.Controller.RequestUnit.WriteBaseCost-3), 1e-7)
Expand Down
24 changes: 20 additions & 4 deletions tests/integrations/mcs/resourcemanager/resource_manager_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,7 @@ import (
"github.com/tikv/pd/client/resource_group/controller"
"github.com/tikv/pd/pkg/mcs/resourcemanager/server"
"github.com/tikv/pd/pkg/utils/testutil"
"github.com/tikv/pd/pkg/utils/typeutil"
"github.com/tikv/pd/tests"
"go.uber.org/goleak"

Expand Down Expand Up @@ -1362,16 +1363,24 @@ func (suite *resourceManagerClientTestSuite) TestResourceGroupControllerConfigCh

configURL := "/resource-manager/api/v1/config/controller"
waitDuration := 10 * time.Second
tokenRPCMaxDelay := 2 * time.Second
readBaseCost := 1.5
defaultCfg := controller.DefaultConfig()
// failpoint enableDegradedMode will setup and set it be 1s.
defaultCfg.DegradedModeWaitDuration.Duration = time.Second
expectCfg := server.ControllerConfig{
// failpoint enableDegradedMode will setup and set it be 1s.
DegradedModeWaitDuration: typeutil.NewDuration(time.Second),
LTBMaxWaitDuration: typeutil.Duration(defaultCfg.LTBMaxWaitDuration),
LTBTokenRPCMaxDelay: typeutil.Duration(defaultCfg.LTBTokenRPCMaxDelay),
RequestUnit: server.RequestUnitConfig(defaultCfg.RequestUnit),
EnableControllerTraceLog: defaultCfg.EnableControllerTraceLog,
}
expectRUCfg := controller.GenerateRUConfig(defaultCfg)
expectRUCfg.DegradedModeWaitDuration = time.Second
// initial config verification
respString := sendRequest("GET", getAddr()+configURL, nil)
defaultString, err := json.Marshal(defaultCfg)
expectStr, err := json.Marshal(expectCfg)
re.NoError(err)
re.JSONEq(string(respString), string(defaultString))
re.JSONEq(string(respString), string(expectStr))
re.EqualValues(expectRUCfg, c1.GetConfig())

testCases := []struct {
Expand All @@ -1384,6 +1393,13 @@ func (suite *resourceManagerClientTestSuite) TestResourceGroupControllerConfigCh
value: waitDuration,
expected: func(ruConfig *controller.RUConfig) { ruConfig.DegradedModeWaitDuration = waitDuration },
},
{
configJSON: fmt.Sprintf(`{"ltb-token-rpc-max-delay": "%v"}`, tokenRPCMaxDelay),
value: waitDuration,
expected: func(ruConfig *controller.RUConfig) {
ruConfig.WaitRetryTimes = int(tokenRPCMaxDelay / ruConfig.WaitRetryInterval)
},
},
{
configJSON: fmt.Sprintf(`{"ltb-max-wait-duration": "%v"}`, waitDuration),
value: waitDuration,
Expand Down
Loading
Loading