Skip to content

Commit

Permalink
scheduler(ticdc): Fix lost finish operation (#4942) (#5001)
Browse files Browse the repository at this point in the history
close #4963
  • Loading branch information
ti-chi-bot authored Mar 24, 2022
1 parent 96a1104 commit 1e29b06
Show file tree
Hide file tree
Showing 8 changed files with 323 additions and 81 deletions.
67 changes: 58 additions & 9 deletions cdc/processor/agent.go
Original file line number Diff line number Diff line change
Expand Up @@ -67,6 +67,7 @@ type agentImpl struct {

changeFeed model.ChangeFeedID
ownerCaptureID model.CaptureID
ownerRevision int64

clock clock.Clock
barrierSeqs map[p2p.Topic]p2p.Seq
Expand All @@ -85,7 +86,7 @@ func newAgent(
messageRouter p2p.MessageRouter,
executor scheduler.TableExecutor,
changeFeedID model.ChangeFeedID,
) (processorAgent, error) {
) (retVal processorAgent, err error) {
ret := &agentImpl{
messageServer: messageServer,
messageRouter: messageRouter,
Expand Down Expand Up @@ -119,10 +120,20 @@ func newAgent(
zap.Error(err))
return nil, errors.Trace(err)
}
defer func() {
if err != nil {
if err1 := ret.deregisterPeerMessageHandlers(); err1 != nil {
log.Warn("failed to unregister processor message handlers",
zap.String("changefeed", changeFeedID),
zap.Error(err))
}
}
}()

etcdCliCtx, cancel := stdContext.WithTimeout(ctx, getOwnerFromEtcdTimeout)
ownerCaptureID, err := ctx.GlobalVars().EtcdClient.GetOwnerID(etcdCliCtx, etcd.CaptureOwnerKey)
cancel()
defer cancel()
ownerCaptureID, err := ctx.GlobalVars().EtcdClient.
GetOwnerID(etcdCliCtx, etcd.CaptureOwnerKey)
if err != nil {
if err != concurrency.ErrElectionNoLeader {
return nil, errors.Trace(err)
Expand All @@ -133,11 +144,26 @@ func newAgent(
log.Info("no owner found. We will wait for an owner to contact us.",
zap.String("changefeed", changeFeedID),
zap.Error(err))
} else {
ret.ownerCaptureID = ownerCaptureID
log.Debug("found owner",
zap.String("changefeed", changeFeedID),
zap.String("ownerID", ownerCaptureID))
return ret, nil
}

ret.ownerCaptureID = ownerCaptureID
log.Debug("found owner",
zap.String("changefeed", changeFeedID),
zap.String("ownerID", ownerCaptureID))

ret.ownerRevision, err = ctx.GlobalVars().EtcdClient.
GetOwnerRevision(etcdCliCtx, ownerCaptureID)
if err != nil {
if cerror.ErrOwnerNotFound.Equal(err) || cerror.ErrNotOwner.Equal(err) {
// These are expected errors when no owner has been elected
log.Info("no owner found when querying for the owner revision",
zap.String("changefeed", changeFeedID),
zap.Error(err))
ret.ownerCaptureID = ""
return ret, nil
}
return nil, errors.Trace(err)
}
return ret, nil
}
Expand Down Expand Up @@ -166,6 +192,18 @@ func (a *agentImpl) FinishTableOperation(
tableID model.TableID,
epoch model.ProcessorEpoch,
) (done bool, err error) {
topic := model.SyncTopic(a.changeFeed)
if !a.Barrier(ctx) {
if _, exists := a.barrierSeqs[topic]; exists {
log.L().Info("Delay sending FinishTableOperation due to pending sync",
zap.String("changefeedID", a.changeFeed),
zap.String("ownerID", a.ownerCaptureID),
zap.Int64("tableID", tableID),
zap.String("epoch", epoch))
return false, nil
}
}

message := &model.DispatchTableResponseMessage{ID: tableID, Epoch: epoch}
defer func() {
if err != nil {
Expand Down Expand Up @@ -322,8 +360,19 @@ func (a *agentImpl) Barrier(_ context.Context) (done bool) {
return true
}

func (a *agentImpl) OnOwnerChanged(ctx context.Context, newOwnerCaptureID model.CaptureID) {
func (a *agentImpl) OnOwnerChanged(
ctx context.Context,
newOwnerCaptureID model.CaptureID,
newOwnerRev int64,
) {
// The BaseAgent will notify us of an owner change if an AnnounceOwner is received.
// However, we need to filter out the event if we already learned of this owner directly
// from Etcd.
if a.ownerCaptureID == newOwnerCaptureID && a.ownerRevision == newOwnerRev {
return
}
a.ownerCaptureID = newOwnerCaptureID
a.ownerRevision = newOwnerRev
// Note that we clear the pending barriers.
a.barrierSeqs = map[p2p.Topic]p2p.Seq{}
}
Expand Down
Loading

0 comments on commit 1e29b06

Please sign in to comment.