This repository has been archived by the owner on Jul 24, 2024. It is now read-only.

pd: disable location replacement #555

Merged 23 commits on Oct 22, 2020
24 changes: 18 additions & 6 deletions pkg/pdutil/pd.go
@@ -73,11 +73,12 @@ var (

// DefaultPDCfg find by https://github.com/tikv/pd/blob/master/conf/config.toml.
DefaultPDCfg = map[string]interface{}{
"max-merge-region-keys": 200000,
"max-merge-region-size": 20,
"leader-schedule-limit": 4,
"region-schedule-limit": 2048,
"max-snapshot-count": 3,
"max-merge-region-keys": 200000,
"max-merge-region-size": 20,
"leader-schedule-limit": 4,
Collaborator:

According to @Yisaer, these two configs, "leader-schedule-limit" and "region-schedule-limit", don't help the schedule speed. Should we remove them in this PR?

Collaborator Author:

I think we could leave a TODO here and try to merge this first... I'm afraid that if any error occurs during editing, we won't be able to merge this in the current unstable CI environment...

Comment:

Currently "leader-schedule-limit" and "region-schedule-limit" are not supported in temporary configuration; see tikv/pd#3088.

"region-schedule-limit": 2048,
"max-snapshot-count": 3,
"enable-location-replacement": "true",
}
)
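Since restoring a cluster to its normal state means putting these values back, a reset is a single UpdatePDScheduleConfig call with this map. A hedged usage sketch (assuming a *PdController named pd, an existing ctx, and a caller outside pkg/pdutil; not code from this PR):

	// Hedged usage sketch: write the known defaults back to PD once restore work is done.
	if err := pd.UpdatePDScheduleConfig(ctx, pdutil.DefaultPDCfg); err != nil {
		return errors.Annotate(err, "failed to reset PD schedule config to defaults")
	}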

@@ -410,6 +411,7 @@ func (p *PdController) UpdatePDScheduleConfig(
if e == nil {
return nil
}
log.Warn("failed to update PD config, will try next", zap.Error(e), zap.String("pd", addr))
}
return errors.Annotate(berrors.ErrPDUpdateFailed, "failed to update PD schedule config")
}
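For context, the new warning lives in a loop that tries each PD address in turn and returns on the first success; only after every endpoint fails does the function report an error. A self-contained sketch of that retry-over-addresses pattern (a simplified stand-in, not the exact BR implementation; postJSON is a hypothetical helper):

package main

import (
	"context"
	"errors"
	"fmt"
	"log"
)

// postJSON is a hypothetical stand-in for the HTTP POST BR sends to a PD endpoint.
func postJSON(ctx context.Context, url string, cfg map[string]interface{}) error {
	// ... marshal cfg and POST it to url ...
	return errors.New("pd endpoint unreachable") // simulate a failure for the demo
}

// updateScheduleConfig tries every PD address, logs each failure, and only
// reports an error once all endpoints have been exhausted.
func updateScheduleConfig(ctx context.Context, addrs []string, cfg map[string]interface{}) error {
	for _, addr := range addrs {
		err := postJSON(ctx, addr+"/pd/api/v1/config/schedule", cfg)
		if err == nil {
			return nil
		}
		log.Printf("failed to update PD config on %s, will try next: %v", addr, err)
	}
	return errors.New("failed to update PD schedule config on every endpoint")
}

func main() {
	cfg := map[string]interface{}{"enable-location-replacement": "false"}
	err := updateScheduleConfig(context.Background(), []string{"http://127.0.0.1:2379"}, cfg)
	fmt.Println("result:", err)
}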
@@ -444,6 +446,12 @@ func restoreSchedulers(ctx context.Context, pd *PdController, clusterCfg cluster
if err := pd.UpdatePDScheduleConfig(ctx, scheduleLimitCfg); err != nil {
return errors.Annotate(err, "fail to update PD schedule config")
}
if locationPlacement, ok := clusterCfg.scheduleCfg["enable-location-replacement"]; ok {
Member:

How about adding "enable-location-replacement" to pdScheduleLimitCfg?

Collaborator Author (@YuJuncen, Oct 16, 2020):

The map pdScheduleLimitCfg is also used in removeSchedulers, and its entries share a generic trait (every value is a number), so I'm afraid adding enable-location-replacement would require more redundant code.

See here:

br/pkg/pdutil/pd.go, lines 511 to 522 at 6734867:

for _, cfgKey := range pdScheduleLimitCfg {
	value := scheduleCfg[cfgKey]
	if value == nil {
		// Ignore non-exist config.
		continue
	}
	// Speed update PD scheduler by enlarging scheduling limits.
	// Multiply limits by store count but no more than 40.
	// Larger limit may make cluster unstable.
	limit := int(value.(float64))
	scheduleLimitCfg[cfgKey] = math.Min(40, float64(limit*len(stores)))
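The point is that this loop type-asserts every value to float64, so a non-numeric setting such as "enable-location-replacement" would need its own branch. A tiny hypothetical illustration of that constraint (not BR code):

package main

import (
	"fmt"
	"math"
)

func main() {
	storeCount := 3
	scheduleCfg := map[string]interface{}{
		"leader-schedule-limit":        float64(4),
		"enable-location-replacement": "true", // a string, not a number
	}
	for key, value := range scheduleCfg {
		limit, ok := value.(float64)
		if !ok {
			// Non-numeric settings fall outside the generic scaling loop,
			// which is why the PR restores enable-location-replacement separately.
			fmt.Printf("%s: needs dedicated handling\n", key)
			continue
		}
		// Numeric limits are multiplied by the store count, capped at 40.
		fmt.Printf("%s: scaled to %v\n", key, math.Min(40, limit*float64(storeCount)))
	}
}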

log.Debug("restoring config enable-location-replacement", zap.Any("enable-location-placement", locationPlacement))
if err := pd.UpdatePDScheduleConfig(ctx, map[string]interface{}{"enable-location-replacement": locationPlacement}); err != nil {
return err
}
}
return nil
}

@@ -485,6 +493,7 @@ func (p *PdController) RemoveSchedulers(ctx context.Context) (undo utils.UndoFun
}

undo = p.makeUndoFunctionByConfig(clusterConfig{scheduler: removedSchedulers, scheduleCfg: scheduleCfg})
log.Debug("saved PD config", zap.Any("config", scheduleCfg))

disableMergeCfg := make(map[string]interface{})
for _, cfgKey := range pdRegionMergeCfg {
@@ -515,7 +524,10 @@ func (p *PdController) RemoveSchedulers(ctx context.Context) (undo utils.UndoFun
limit := int(value.(float64))
scheduleLimitCfg[cfgKey] = math.Min(40, float64(limit*len(stores)))
}
return undo, p.UpdatePDScheduleConfig(ctx, scheduleLimitCfg)
if err := p.UpdatePDScheduleConfig(ctx, scheduleLimitCfg); err != nil {
return undo, err
}
return undo, p.UpdatePDScheduleConfig(ctx, map[string]interface{}{"enable-location-replacement": "false"})
}

// Close close the connection to pd.
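Taken together, RemoveSchedulers now snapshots the live schedule config, pushes the temporary settings (including enable-location-replacement = "false"), and hands back an undo function that restoreSchedulers uses to write the saved values back. A rough, self-contained sketch of that save-override-restore pattern (illustrative names only, not BR's actual API):

package main

import (
	"context"
	"fmt"
)

type scheduleConfig map[string]interface{}

// withTemporaryConfig snapshots the current config, applies the overrides, and
// returns an undo function that restores the snapshotted values for those keys.
func withTemporaryConfig(
	ctx context.Context,
	get func(context.Context) (scheduleConfig, error),
	set func(context.Context, scheduleConfig) error,
	overrides scheduleConfig,
) (func(context.Context) error, error) {
	saved, err := get(ctx)
	if err != nil {
		return nil, err
	}
	if err := set(ctx, overrides); err != nil {
		return nil, err
	}
	undo := func(ctx context.Context) error {
		restore := scheduleConfig{}
		for k := range overrides {
			if v, ok := saved[k]; ok {
				restore[k] = v
			}
		}
		return set(ctx, restore)
	}
	return undo, nil
}

func main() {
	// In-memory stand-in for the cluster's schedule config.
	cluster := scheduleConfig{"enable-location-replacement": "true"}
	get := func(context.Context) (scheduleConfig, error) {
		cp := scheduleConfig{}
		for k, v := range cluster {
			cp[k] = v
		}
		return cp, nil
	}
	set := func(_ context.Context, c scheduleConfig) error {
		for k, v := range c {
			cluster[k] = v
		}
		return nil
	}

	ctx := context.Background()
	undo, _ := withTemporaryConfig(ctx, get, set, scheduleConfig{"enable-location-replacement": "false"})
	fmt.Println("during restore:", cluster["enable-location-replacement"]) // false
	_ = undo(ctx)
	fmt.Println("after undo:", cluster["enable-location-replacement"]) // true
}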
6 changes: 0 additions & 6 deletions run-test.sh

This file was deleted.

16 changes: 10 additions & 6 deletions tests/br_other/run.sh
@@ -83,7 +83,7 @@ curl "http://localhost:$PPROF_PORT/debug/pprof/trace?seconds=1" 2>&1 > /dev/null
echo "pprof started..."

curl http://$PD_ADDR/pd/api/v1/config/schedule | grep '"disable": false'

curl http://$PD_ADDR/pd/api/v1/config/schedule | jq '."enable-location-replacement"' | grep "false"
backup_fail=0
echo "another backup start expect to fail due to last backup add a lockfile"
run_br --pd $PD_ADDR backup full -s "local://$TEST_DIR/$DB/lock" --concurrency 4 || backup_fail=1
@@ -94,7 +94,7 @@ fi

# check is there still exists scheduler not in pause.
pause_schedulers=$(curl http://$PD_ADDR/pd/api/v1/schedulers?status="paused" | grep "scheduler" | wc -l)
if [ "$pause_schedulers" -ne "3" ];then
if [ "$pause_schedulers" -lt "3" ];then
echo "TEST: [$TEST_NAME] failed because paused scheduler are not enough"
exit 1
fi
@@ -120,6 +120,7 @@ then
exit 1
fi


default_pd_values='{
"max-merge-region-keys": 200000,
"max-merge-region-size": 20,
@@ -136,22 +137,25 @@ for key in $(echo $default_pd_values | jq 'keys[]'); do
fi
done

pd_settings=5

# check is there still exists scheduler in pause.
pause_schedulers=$(curl http://$PD_ADDR/pd/api/v1/schedulers?status="paused" | grep "scheduler" | wc -l)
# There shouldn't be any paused schedulers since BR gracfully shutdown.
if [ "$pause_schedulers" -ne "0" ];then
# There shouldn't be any paused schedulers since BR gracfully shutdown.
if [ "$pause_schedulers" -ne "0" ];then
echo "TEST: [$TEST_NAME] failed because paused scheduler has changed"
exit 1
fi

pd_settings=6

# balance-region scheduler enabled
curl http://$PD_ADDR/pd/api/v1/config/schedule | jq '."schedulers-v2"[] | {disable: .disable, type: ."type" | select (.=="balance-region")}' | grep '"disable": false' || ((pd_settings--))
# balance-leader scheduler enabled
curl http://$PD_ADDR/pd/api/v1/config/schedule | jq '."schedulers-v2"[] | {disable: .disable, type: ."type" | select (.=="balance-leader")}' | grep '"disable": false' || ((pd_settings--))
# hot region scheduler enabled
curl http://$PD_ADDR/pd/api/v1/config/schedule | jq '."schedulers-v2"[] | {disable: .disable, type: ."type" | select (.=="hot-region")}' | grep '"disable": false' || ((pd_settings--))
# location replacement enabled
curl http://$PD_ADDR/pd/api/v1/config/schedule | jq '."enable-location-replacement"' | grep "true" || ((pd_settings--))

# we need reset pd config to default
# until pd has the solution to temporary set these scheduler/configs.
@@ -163,7 +167,7 @@ curl http://$PD_ADDR/pd/api/v1/config/schedule | jq '."max-merge-region-size"' |
# max-merge-region-keys set to default 200000
curl http://$PD_ADDR/pd/api/v1/config/schedule | jq '."max-merge-region-keys"' | grep "200000" || ((pd_settings--))

if [ "$pd_settings" -ne "5" ];then
if [ "$pd_settings" -ne "6" ];then
echo "TEST: [$TEST_NAME] test validate reset pd config failed!"
exit 1
fi
5 changes: 3 additions & 2 deletions tests/br_tiflash/run.sh
@@ -52,12 +52,13 @@ run_br backup full -s "local://$TEST_DIR/$DB" --pd $PD_ADDR
run_sql "DROP DATABASE $DB"
run_br restore full -s "local://$TEST_DIR/$DB" --pd $PD_ADDR

AFTER_BR_COUNT=`run_sql "SELECT count(*) FROM $DB.kv;" | sed -n "s/[^0-9]//g;/^[0-9]*$/p" | tail -n1`
# FIXME after stopping schedulers, tiflash takes many time to sync with TiKV(even 30s isn't enough).
AFTER_BR_COUNT=`run_sql "SELECT count(*) /*+ READ_FROM_STORAGE(TIKV[$DB.kv]) */ FROM $DB.kv;" | sed -n "s/[^0-9]//g;/^[0-9]*$/p" | tail -n1`
if [ $AFTER_BR_COUNT -ne $RECORD_COUNT ]; then
echo "failed to restore, before: $RECORD_COUNT; after: $AFTER_BR_COUNT"
exit 1
fi

run_sql "DROP DATABASE $DB"

echo "TEST $TEST_NAME passed!"
echo "TEST $TEST_NAME passed!"