Skip to content
This repository has been archived by the owner on Nov 24, 2023. It is now read-only.

fix: don't forbid DM-master leader startup for unexpected DDL locks (#1193) #1199

Merged
merged 1 commit into from
Oct 22, 2020
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions .github/workflows/chaos-mesh.yml
Original file line number Diff line number Diff line change
Expand Up @@ -258,7 +258,9 @@ jobs:
mkdir ./logs
sudo cp -r -L /var/log/containers/. ./logs
sudo chown -R runner ./logs
      # Uploading logs as an artifact seems unstable, so we set `continue-on-error: true` here.
- name: Upload logs
continue-on-error: true
uses: actions/upload-artifact@v2
if: ${{ always() }}
with:
Expand Down
2 changes: 2 additions & 0 deletions dm/master/election.go
Original file line number Diff line number Diff line change
Expand Up @@ -57,6 +57,8 @@ func (s *Server) electionNotify(ctx context.Context) {
log.L().Info("current member become the leader", zap.String("current member", s.cfg.Name))
s.leader.Set(oneselfStartingLeader)

// NOTE: for logic errors, we should return `true` here, so that the cluster can still serve requests and the user can fix these errors.
// Otherwise, no member of DM-master could become the leader and the user could not fix the errors online (the cluster might need to be repaired offline with other tools such as etcdctl).
ok := s.startLeaderComponent(ctx)

if !ok {
Expand Down
1 change: 1 addition & 0 deletions dm/master/scheduler/scheduler.go
Original file line number Diff line number Diff line change
Expand Up @@ -159,6 +159,7 @@ func NewScheduler(pLogger *log.Logger, securityCfg config.Security) *Scheduler {
}

// Start starts the scheduler for work.
// NOTE: for logic errors, it should start without returning errors (but report via metrics or log) so that the user can fix them.
func (s *Scheduler) Start(pCtx context.Context, etcdCli *clientv3.Client) error {
s.logger.Info("the scheduler is starting")

Expand Down
5 changes: 4 additions & 1 deletion dm/master/shardddl/optimist.go
Original file line number Diff line number Diff line change
Expand Up @@ -60,6 +60,7 @@ func NewOptimist(pLogger *log.Logger) *Optimist {
}

// Start starts the shard DDL coordination in optimism mode.
// NOTE: for logic errors, it should start without returning errors (but report via metrics or log) so that the user can fix them.
func (o *Optimist) Start(pCtx context.Context, etcdCli *clientv3.Client) error {
o.logger.Info("the shard DDL optimist is starting")

Expand Down Expand Up @@ -246,7 +247,9 @@ func (o *Optimist) rebuildLocks() (revSource, revInfo, revOperation int64, err e
// recover the shard DDL lock based on history shard DDL info & lock operation.
err = o.recoverLocks(ifm, opm)
if err != nil {
return 0, 0, 0, err
// only log the error instead of returning it, so that the startup of the DM-master leader is not blocked.
// then these unexpected locks can be handled by the user.
o.logger.Error("fail to recover locks", log.ShortError(err))
}
return revSource, revInfo, revOperation, nil
}
Expand Down
5 changes: 4 additions & 1 deletion dm/master/shardddl/pessimist.go
Original file line number Diff line number Diff line change
Expand Up @@ -65,6 +65,7 @@ func NewPessimist(pLogger *log.Logger, taskSources func(task string) []string) *
}

// Start starts the shard DDL coordination in pessimism mode.
// NOTE: for logic errors, it should start without returning errors (but report via metrics or log) so that the user can fix them.
func (p *Pessimist) Start(pCtx context.Context, etcdCli *clientv3.Client) error {
p.logger.Info("the shard DDL pessimist is starting")

Expand Down Expand Up @@ -145,7 +146,9 @@ func (p *Pessimist) buildLocks(etcdCli *clientv3.Client) (int64, int64, error) {
// recover the shard DDL lock based on history shard DDL info & lock operation.
err = p.recoverLocks(ifm, opm)
if err != nil {
return 0, 0, err
// only log the error instead of returning it, so that the startup of the DM-master leader is not blocked.
// then these unexpected locks can be handled by the user.
p.logger.Error("fail to recover locks", log.ShortError(err))
}
return rev1, rev2, nil
}
Expand Down
4 changes: 0 additions & 4 deletions pkg/parser/common.go
Original file line number Diff line number Diff line change
Expand Up @@ -38,10 +38,6 @@ const (
// Parse wraps parser.Parse(), makes `parser` suitable for dm
func Parse(p *parser.Parser, sql, charset, collation string) (stmt []ast.StmtNode, err error) {
stmts, warnings, err := p.Parse(sql, charset, collation)
if err != nil {
log.L().Error("parse statement", zap.String("sql", sql), log.ShortError(err))
}

if len(warnings) > 0 {
log.L().Warn("parse statement", zap.String("sql", sql), zap.Errors("warning messages", warnings))
}
Expand Down
67 changes: 67 additions & 0 deletions tests/shardddl3/run.sh
Original file line number Diff line number Diff line change
Expand Up @@ -567,6 +567,71 @@ function DM_RemoveLock() {
"bound" 2
}

# Verify the shard DDL lock status for the given shard mode.
# $1: shard mode, "pessimistic" or "optimistic".
# NOTE(review): run both before and after the DM-master restart, so the exact
# same expectations prove the lock information survives the restart.
function restart_master_check_lock_status() {
	if [[ "$1" = "pessimistic" ]]; then
		# count of 2: the DDL appears in both `blockingDDLs` and `unresolvedGroups`
		run_dm_ctl_with_retry $WORK_DIR "127.0.0.1:$MASTER_PORT" \
			"query-status test" \
			'ALTER TABLE `shardddl`.`tb` ADD COLUMN `c` DOUBLE' 2 \
			'ALTER TABLE `shardddl`.`tb` ADD COLUMN `c` TEXT' 2
		run_dm_ctl_with_retry $WORK_DIR "127.0.0.1:$MASTER_PORT" \
			"show-ddl-locks" \
			'ALTER TABLE `shardddl`.`tb` ADD COLUMN `c`' 1
	else
		run_dm_ctl_with_retry $WORK_DIR "127.0.0.1:$MASTER_PORT" \
			"query-status test" \
			'because schema conflict detected' 1
		run_dm_ctl_with_retry $WORK_DIR "127.0.0.1:$MASTER_PORT" \
			"show-ddl-locks" \
			'mysql-replica-01-`shardddl1`.`tb1`' 1 \
			'mysql-replica-02-`shardddl1`.`tb1`' 1
	fi
}

# Case body: create an unresolvable shard DDL conflict, restart DM-master, and
# check that the unresolved DDL lock information is recovered after the restart
# (i.e. unexpected DDL locks do not forbid the leader from starting).
# $1: shard mode, "pessimistic" or "optimistic".
function DM_RestartMaster_CASE() {
	run_sql_source1 "insert into ${shardddl1}.${tb1} values(1,'aaa');"
	run_sql_source2 "insert into ${shardddl1}.${tb1} values(2,'bbb');"

	check_sync_diff $WORK_DIR $cur/conf/diff_config.toml

	# conflicting DDLs: the same new column with incompatible types.
	run_sql_source1 "alter table ${shardddl1}.${tb1} add column c double;"
	run_sql_source2 "alter table ${shardddl1}.${tb1} add column c text;"

	restart_master_check_lock_status "$1"

	echo "restart dm-master"
	ps aux | grep dm-master | awk '{print $2}' | xargs kill || true
	check_port_offline $MASTER_PORT 20
	run_dm_master $WORK_DIR/master $MASTER_PORT $cur/conf/dm-master.toml
	check_rpc_alive $cur/../bin/check_master_online 127.0.0.1:$MASTER_PORT

	# the same lock status must be observable again after the leader restarts.
	restart_master_check_lock_status "$1"
}

# Run the RestartMaster case once per shard mode, preparing one sharded table
# on each upstream source and cleaning them up afterwards.
function DM_RestartMaster() {
	local prepare_sqls="run_sql_source1 \"create table ${shardddl1}.${tb1} (a int, b varchar(10));\"; \
	run_sql_source2 \"create table ${shardddl1}.${tb1} (a int, b varchar(10));\""

	local mode
	for mode in pessimistic optimistic; do
		run_case RestartMaster "double-source-$mode" \
			"$prepare_sqls" \
			"clean_table" "$mode"
	done
}

function run() {
init_cluster
init_database
Expand All @@ -582,6 +647,8 @@ function run() {
done

DM_RemoveLock

DM_RestartMaster
}

cleanup_data $shardddl
Expand Down