Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

pd, openapi(ticdc): retry on "connection reset by peer" when the PD instance is killed, fix health API panic #7069

Merged
merged 9 commits into from
Sep 16, 2022
6 changes: 5 additions & 1 deletion cdc/api/v1/api.go
Original file line number Diff line number Diff line change
Expand Up @@ -872,8 +872,12 @@ func (h *OpenAPI) ServerStatus(c *gin.Context) {
// @Failure 500 {object} model.HTTPError
// @Router /api/v1/health [get]
func (h *OpenAPI) Health(c *gin.Context) {
ctx := c.Request.Context()
if !h.capture.IsOwner() {
middleware.ForwardToOwnerMiddleware(h.capture)(c)
return
}

ctx := c.Request.Context()
health, err := h.statusProvider().IsHealthy(ctx)
if err != nil {
c.IndentedJSON(http.StatusInternalServerError, model.NewHTTPError(err))
Expand Down
3 changes: 2 additions & 1 deletion cdc/api/v1/api_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -896,9 +896,10 @@ func TestHealth(t *testing.T) {
// capture is owner
ctrl := gomock.NewController(t)
cp := mock_capture.NewMockCapture(ctrl)

api := testCase{url: "/api/v1/health", method: "GET"}
sp := mock_owner.NewMockStatusProvider(ctrl)
ownerRouter := newRouter(cp, sp)
api := testCase{url: "/api/v1/health", method: "GET"}

cp.EXPECT().IsReady().Return(true).AnyTimes()
cp.EXPECT().Info().DoAndReturn(func() (model.CaptureInfo, error) {
Expand Down
5 changes: 5 additions & 0 deletions pkg/errorutil/ignore.go
Original file line number Diff line number Diff line change
Expand Up @@ -84,5 +84,10 @@ func IsRetryableEtcdError(err error) bool {
if strings.Contains(etcdErr.Error(), "received prior goaway: code: NO_ERROR") {
return true
}

// This may happen if the PD instance is shut down by `kill -9`, regardless of whether the instance is the leader.
if strings.Contains(etcdErr.Error(), "connection reset by peer") {
overvenus marked this conversation as resolved.
Show resolved Hide resolved
return true
}
return false
}
2 changes: 2 additions & 0 deletions pkg/errorutil/ignore_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -63,6 +63,8 @@ func TestIsRetryableEtcdError(t *testing.T) {
{errors.New("rpc error: code = Unavailable desc = closing transport due to: " +
"connection error: desc = \\\"error reading from server: EOF\\\", " +
"received prior goaway: code: NO_ERROR\""), true},
{errors.New("rpc error: code = Unavailable desc = error reading from server: " +
"xxx: read: connection reset by peer"), true},
}

for _, item := range cases {
Expand Down