Skip to content

Commit

Permalink
backup: check the store state by last heartbeat (#43099)
Browse files Browse the repository at this point in the history
close #42973
  • Loading branch information
YuJuncen authored Apr 19, 2023
1 parent 268901f commit f22ae5f
Show file tree
Hide file tree
Showing 2 changed files with 31 additions and 3 deletions.
6 changes: 3 additions & 3 deletions br/pkg/backup/push.go
Original file line number Diff line number Diff line change
Expand Up @@ -77,16 +77,16 @@ func (push *pushDown) pushBackup(
store := s
storeID := s.GetId()
lctx := logutil.ContextWithField(ctx, zap.Uint64("store-id", storeID))
if s.GetState() != metapb.StoreState_Up {
logutil.CL(lctx).Warn("skip store", zap.Stringer("State", s.GetState()))
if err := utils.CheckStoreLiveness(s); err != nil {
logutil.CL(lctx).Warn("skip store", logutil.ShortError(err))
continue
}
client, err := push.mgr.GetBackupClient(lctx, storeID)
if err != nil {
// BR should be able to backup even some of stores disconnected.
// The regions managed by this store can be retried at fine-grained backup then.
logutil.CL(lctx).Warn("fail to connect store, skipping", zap.Error(err))
return nil
continue
}
wg.Add(1)
go func() {
Expand Down
28 changes: 28 additions & 0 deletions br/pkg/utils/misc.go
Original file line number Diff line number Diff line change
Expand Up @@ -19,13 +19,24 @@ import (
"time"

"github.com/pingcap/errors"
"github.com/pingcap/kvproto/pkg/metapb"
berrors "github.com/pingcap/tidb/br/pkg/errors"
"github.com/pingcap/tidb/parser/mysql"
"github.com/pingcap/tidb/parser/types"
"google.golang.org/grpc"
"google.golang.org/grpc/credentials"
"google.golang.org/grpc/credentials/insecure"
)

const (
// storeDisconnectionDuration is the longest time that may have passed since a store's
// last heartbeat for the store to still be treated as alive.
// When a store hasn't sent a heartbeat for 100s, it is probably offline, and most of
// its leaders should have been transferred away.
// (How about a network partition between TiKV and PD? Even that is rare.)
// Also note that the offline threshold in PD is 20s, see
// https://github.com/tikv/pd/blob/c40e319f50822678cda71ae62ee2fd70a9cac010/pkg/core/store.go#L523
storeDisconnectionDuration = 100 * time.Second
)

// IsTypeCompatible checks whether type target is compatible with type src
// they're compatible if
// - same null/not null and unsigned flag(maybe we can allow src not null flag, target null flag later)
Expand Down Expand Up @@ -103,3 +114,20 @@ func GRPCConn(ctx context.Context, storeAddr string, tlsConf *tls.Config, opts .
}
return connection, nil
}

// CheckStoreLiveness reports whether the given store should be treated as alive.
//
// It returns nil when the store passes both checks below, and otherwise an error
// annotated on berrors.ErrKVStorage that describes which check failed:
//   - the store state must be metapb.StoreState_Up;
//   - if a last-heartbeat timestamp is present (greater than zero, interpreted as
//     nanoseconds since the Unix epoch), it must not be older than
//     storeDisconnectionDuration.
//
// The heartbeat check exists because, per the original author's note, some versions
// of PD may not set the store state in the gRPC response, so it has to be verified
// manually.
func CheckStoreLiveness(s *metapb.Store) error {
	if state := s.State; state != metapb.StoreState_Up {
		return errors.Annotatef(berrors.ErrKVStorage, "the store state isn't up, it is %s", state)
	}

	hbNanos := s.GetLastHeartbeat()
	if hbNanos <= 0 {
		// The field is absent (default value), so there is nothing to compare against.
		return nil
	}

	elapsed := time.Since(time.Unix(0, hbNanos))
	if elapsed <= storeDisconnectionDuration {
		return nil
	}
	return errors.Annotatef(berrors.ErrKVStorage, "the store last heartbeat is too far, at %s", elapsed)
}

0 comments on commit f22ae5f

Please sign in to comment.