Skip to content

Commit

Permalink
dm: add retry for dm-worker to join dm-master
Browse files Browse the repository at this point in the history
  • Loading branch information
GMHDBJD committed Oct 29, 2024
1 parent 792da42 commit 993a184
Showing 1 changed file with 44 additions and 37 deletions.
81 changes: 44 additions & 37 deletions dm/worker/join.go
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,8 @@ import (
"google.golang.org/grpc/credentials"
)

const retryTimes = 5

// GetJoinURLs gets the endpoints from the join address.
func GetJoinURLs(addrs string) []string {
// TODO: handle pm1=xxxx:1234,pm2=xxxx:1234,pm3=xxxx:1234
Expand Down Expand Up @@ -63,46 +65,51 @@ func (s *Server) JoinMaster(endpoints []string) error {
}

var errorStr string
for _, endpoint := range endpoints {
ctx1, cancel1 := context.WithTimeout(ctx, 3*time.Second)
//nolint:staticcheck
conn, err := grpc.DialContext(
ctx1,
utils.UnwrapScheme(endpoint),
grpc.WithBlock(),
grpcTLS,
grpc.WithBackoffMaxDelay(3*time.Second),
)
cancel1()
if err != nil {
if conn != nil {
conn.Close()
// retry to connect master
for i := 0; i < retryTimes; i++ {
for _, endpoint := range endpoints {
ctx1, cancel1 := context.WithTimeout(ctx, 3*time.Second)
//nolint:staticcheck
conn, err := grpc.DialContext(
ctx1,
utils.UnwrapScheme(endpoint),
grpc.WithBlock(),
grpcTLS,
grpc.WithBackoffMaxDelay(3*time.Second),
)
cancel1()
if err != nil {
if conn != nil {
conn.Close()
}
log.L().Error("fail to dial dm-master", zap.String("endpoint", endpoint), zap.Error(err))
errorStr = err.Error()
continue

Check warning on line 87 in dm/worker/join.go

View check run for this annotation

Codecov / codecov/patch

dm/worker/join.go#L68-L87

Added lines #L68 - L87 were not covered by tests
}
client := pb.NewMasterClient(conn)
ctx1, cancel1 = context.WithTimeout(ctx, 3*time.Second)
resp, err := client.RegisterWorker(ctx1, req)
cancel1()
conn.Close()
if err != nil {
log.L().Error("fail to register worker", zap.String("endpoint", endpoint), zap.Error(err))
errorStr = err.Error()
continue

Check warning on line 97 in dm/worker/join.go

View check run for this annotation

Codecov / codecov/patch

dm/worker/join.go#L89-L97

Added lines #L89 - L97 were not covered by tests
}
if !resp.GetResult() {
log.L().Error("fail to register worker", zap.String("endpoint", endpoint), zap.String("error", resp.Msg))
errorStr = resp.Msg
continue

Check warning on line 102 in dm/worker/join.go

View check run for this annotation

Codecov / codecov/patch

dm/worker/join.go#L99-L102

Added lines #L99 - L102 were not covered by tests
}
log.L().Error("fail to dial dm-master", zap.String("endpoint", endpoint), zap.Error(err))
errorStr = err.Error()
continue
}
client := pb.NewMasterClient(conn)
ctx1, cancel1 = context.WithTimeout(ctx, 3*time.Second)
resp, err := client.RegisterWorker(ctx1, req)
cancel1()
conn.Close()
if err != nil {
log.L().Error("fail to register worker", zap.String("endpoint", endpoint), zap.Error(err))
errorStr = err.Error()
continue
}
if !resp.GetResult() {
log.L().Error("fail to register worker", zap.String("endpoint", endpoint), zap.String("error", resp.Msg))
errorStr = resp.Msg
continue
}

// worker do calls decrypt, but the password is decrypted already,
// but in case we need it later, init it.
encrypt.InitCipher(resp.GetSecretKey())
// worker do calls decrypt, but the password is decrypted already,
// but in case we need it later, init it.
encrypt.InitCipher(resp.GetSecretKey())

Check warning on line 107 in dm/worker/join.go

View check run for this annotation

Codecov / codecov/patch

dm/worker/join.go#L107

Added line #L107 was not covered by tests

return nil
return nil

Check warning on line 109 in dm/worker/join.go

View check run for this annotation

Codecov / codecov/patch

dm/worker/join.go#L109

Added line #L109 was not covered by tests
}
log.L().Warn("retry to connect master", zap.Int("retry", i+1), zap.Int("total", retryTimes))
time.Sleep(retryConnectSleepTime)

Check warning on line 112 in dm/worker/join.go

View check run for this annotation

Codecov / codecov/patch

dm/worker/join.go#L111-L112

Added lines #L111 - L112 were not covered by tests
}
return terror.ErrWorkerFailConnectMaster.Generate(endpoints, errorStr)
}
Expand Down

0 comments on commit 993a184

Please sign in to comment.