Skip to content

Commit

Permalink
Unsafe recover (#4238)
Browse files Browse the repository at this point in the history
* Add support for online unsafe recover (ref #4242)

Signed-off-by: v01dstar <yang.zhang@pingcap.com>

* Allow recreated region to write into PD with an older region epoch

Signed-off-by: v01dstar <yang.zhang@pingcap.com>

* Add test cases to increase coverage

Signed-off-by: v01dstar <yang.zhang@pingcap.com>

* Update test input and expectations

Signed-off-by: v01dstar <yang.zhang@pingcap.com>

* Remove legacy tests

Signed-off-by: v01dstar <yang.zhang@pingcap.com>

* Add admin prefix

Signed-off-by: v01dstar <yang.zhang@pingcap.com>

* Fix a test failure

Signed-off-by: v01dstar <yang.zhang@pingcap.com>

Co-authored-by: Connor <zbk602423539@gmail.com>
  • Loading branch information
v01dstar and Connor1996 authored Nov 1, 2021
1 parent e65f0c5 commit 788f352
Show file tree
Hide file tree
Showing 14 changed files with 1,528 additions and 26 deletions.
2 changes: 1 addition & 1 deletion go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,7 @@ require (
github.com/pingcap/errcode v0.3.0
github.com/pingcap/errors v0.11.5-0.20201126102027-b0a155152ca3
github.com/pingcap/failpoint v0.0.0-20200702092429-9f69995143ce
github.com/pingcap/kvproto v0.0.0-20210819164333-bd5706b9d9f2
github.com/pingcap/kvproto v0.0.0-20211029081837-3c7bd947cf9b
github.com/pingcap/log v0.0.0-20210625125904-98ed8e2eb1c7
github.com/pingcap/sysutil v0.0.0-20210730114356-fcd8a63f68c5
github.com/pingcap/tidb-dashboard v0.0.0-20211008050453-a25c25809529
Expand Down
4 changes: 2 additions & 2 deletions go.sum
Original file line number Diff line number Diff line change
Expand Up @@ -319,8 +319,8 @@ github.com/pingcap/failpoint v0.0.0-20200702092429-9f69995143ce h1:Y1kCxlCtlPTMt
github.com/pingcap/failpoint v0.0.0-20200702092429-9f69995143ce/go.mod h1:w4PEZ5y16LeofeeGwdgZB4ddv9bLyDuIX+ljstgKZyk=
github.com/pingcap/kvproto v0.0.0-20191211054548-3c6b38ea5107/go.mod h1:WWLmULLO7l8IOcQG+t+ItJ3fEcrL5FxF0Wu+HrMy26w=
github.com/pingcap/kvproto v0.0.0-20200411081810-b85805c9476c/go.mod h1:IOdRDPLyda8GX2hE/jO7gqaCV/PNFh8BZQCQZXfIOqI=
github.com/pingcap/kvproto v0.0.0-20210819164333-bd5706b9d9f2 h1:iCXDtZrssbW8mX8WoMQ5TapBOdoV+GF2wGR+QRdcyCs=
github.com/pingcap/kvproto v0.0.0-20210819164333-bd5706b9d9f2/go.mod h1:IOdRDPLyda8GX2hE/jO7gqaCV/PNFh8BZQCQZXfIOqI=
github.com/pingcap/kvproto v0.0.0-20211029081837-3c7bd947cf9b h1:/aj6ITlHSJZmsm4hIMOgJAAZti+Dmq11tCyKedA6Dcs=
github.com/pingcap/kvproto v0.0.0-20211029081837-3c7bd947cf9b/go.mod h1:IOdRDPLyda8GX2hE/jO7gqaCV/PNFh8BZQCQZXfIOqI=
github.com/pingcap/log v0.0.0-20191012051959-b742a5d432e9/go.mod h1:4rbK1p9ILyIfb6hU7OG2CiWSqMXnp3JMbiaVJ6mvoY8=
github.com/pingcap/log v0.0.0-20200511115504-543df19646ad/go.mod h1:4rbK1p9ILyIfb6hU7OG2CiWSqMXnp3JMbiaVJ6mvoY8=
github.com/pingcap/log v0.0.0-20210317133921-96f4fcab92a4/go.mod h1:4rbK1p9ILyIfb6hU7OG2CiWSqMXnp3JMbiaVJ6mvoY8=
Expand Down
9 changes: 9 additions & 0 deletions server/api/router.go
Original file line number Diff line number Diff line change
Expand Up @@ -265,6 +265,15 @@ func createRouter(prefix string, svr *server.Server) *mux.Router {
apiRouter.HandleFunc("/gc/safepoint", serviceGCSafepointHandler.List).Methods("GET")
apiRouter.HandleFunc("/gc/safepoint/{service_id}", serviceGCSafepointHandler.Delete).Methods("DELETE")

// unsafe admin operation API
unsafeOperationHandler := newUnsafeOperationHandler(svr, rd)
clusterRouter.HandleFunc("/admin/unsafe/remove-failed-stores",
unsafeOperationHandler.RemoveFailedStores).Methods("POST")
clusterRouter.HandleFunc("/admin/unsafe/remove-failed-stores/show",
unsafeOperationHandler.GetFailedStoresRemovalStatus).Methods("GET")
clusterRouter.HandleFunc("/admin/unsafe/remove-failed-stores/history",
unsafeOperationHandler.GetFailedStoresRemovalHistory).Methods("GET")

// API to set or unset failpoints
failpoint.Inject("enableFailpointAPI", func() {
apiRouter.PathPrefix("/fail").HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
Expand Down
79 changes: 79 additions & 0 deletions server/api/unsafe_operation.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,79 @@
// Copyright 2021 TiKV Project Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package api

import (
"net/http"

"github.com/tikv/pd/pkg/apiutil"
"github.com/tikv/pd/server"
"github.com/unrolled/render"
)

// unsafeOperationHandler groups the HTTP handlers for the unsafe admin
// operations (the "remove failed stores" endpoints).
type unsafeOperationHandler struct {
// svr is the PD server handed to newUnsafeOperationHandler.
svr *server.Server
// rd is used by the handler methods to write JSON responses.
rd *render.Render
}

// newUnsafeOperationHandler builds an unsafeOperationHandler that keeps the
// given server and renderer for use by its handler methods.
func newUnsafeOperationHandler(svr *server.Server, rd *render.Render) *unsafeOperationHandler {
	handler := new(unsafeOperationHandler)
	handler.svr, handler.rd = svr, rd
	return handler
}

// RemoveFailedStores decodes the JSON request body into a map keyed by store
// ID and hands it to the cluster's unsafe recovery controller.
//
// It responds with 400 when no store is given, with 500 when the controller
// rejects the request, and with 200 once the request has been accepted.
//
// @Tags unsafe
// @Summary Remove failed stores unsafely.
// @Accept json
// @Param body body object true "json params"
// @Produce json
// @Success 200 {string} string "Request has been accepted."
// @Failure 400 {string} string "The input is invalid."
// @Failure 500 {string} string "PD server failed to process the request."
// @Router /admin/unsafe/remove-failed-stores [POST]
func (h *unsafeOperationHandler) RemoveFailedStores(w http.ResponseWriter, r *http.Request) {
	rc := getCluster(r)
	var stores map[uint64]string
	if err := apiutil.ReadJSONRespondError(h.rd, w, r.Body, &stores); err != nil {
		// Nothing is written here: judging by its name, ReadJSONRespondError
		// has presumably already sent the error response — verify in apiutil.
		return
	}
	if len(stores) == 0 {
		h.rd.JSON(w, http.StatusBadRequest, "No store specified")
		return
	}
	if err := rc.GetUnsafeRecoveryController().RemoveFailedStores(stores); err != nil {
		h.rd.JSON(w, http.StatusInternalServerError, err.Error())
		return
	}
	h.rd.JSON(w, http.StatusOK, "Request has been accepted.")
}

// GetFailedStoresRemovalStatus responds with the value returned by the unsafe
// recovery controller's Show method, rendered as JSON with status 200.
//
// @Tags unsafe
// @Summary Show the current status of failed stores removal.
// @Produce json
// @Success 200 {array} string
// @Router /admin/unsafe/remove-failed-stores/show [GET]
func (h *unsafeOperationHandler) GetFailedStoresRemovalStatus(w http.ResponseWriter, r *http.Request) {
	rc := getCluster(r)
	h.rd.JSON(w, http.StatusOK, rc.GetUnsafeRecoveryController().Show())
}

// GetFailedStoresRemovalHistory responds with the value returned by the unsafe
// recovery controller's History method, rendered as JSON with status 200.
//
// @Tags unsafe
// @Summary Show the history of failed stores removal.
// @Produce json
// @Success 200 {array} string
// @Router /admin/unsafe/remove-failed-stores/history [GET]
func (h *unsafeOperationHandler) GetFailedStoresRemovalHistory(w http.ResponseWriter, r *http.Request) {
	rc := getCluster(r)
	h.rd.JSON(w, http.StatusOK, rc.GetUnsafeRecoveryController().History())
}
60 changes: 60 additions & 0 deletions server/api/unsafe_operation_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,60 @@
// Copyright 2021 TiKV Project Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package api

import (
"encoding/json"
"fmt"

. "github.com/pingcap/check"
"github.com/tikv/pd/server"
)

// Register testUnsafeAPISuite with the check test framework.
var _ = Suite(&testUnsafeAPISuite{})

// testUnsafeAPISuite holds the shared state for the unsafe admin API tests.
type testUnsafeAPISuite struct {
// svr is the server created by mustNewServer in SetUpSuite.
svr *server.Server
// cleanup is the function returned by mustNewServer; TearDownSuite calls it.
cleanup cleanUpFunc
// urlPrefix is the base URL of the unsafe admin endpoints, built in SetUpSuite.
urlPrefix string
}

// SetUpSuite creates a server, waits for a leader, records the base URL of
// the unsafe admin endpoints and finally bootstraps the cluster.
func (s *testUnsafeAPISuite) SetUpSuite(c *C) {
	svr, cleanup := mustNewServer(c)
	s.svr, s.cleanup = svr, cleanup
	mustWaitLeader(c, []*server.Server{svr})

	// The address is read only after a leader is available, as before.
	s.urlPrefix = fmt.Sprintf("%s%s/api/v1/admin/unsafe", svr.GetAddr(), apiPrefix)

	mustBootstrapCluster(c, svr)
}

// TearDownSuite invokes the cleanup function stored by SetUpSuite.
func (s *testUnsafeAPISuite) TearDownSuite(c *C) {
s.cleanup()
}

// TestRemoveFailedStores posts a removal request for store 1 and then checks
// that both the show and the history endpoints answer with JSON that decodes
// into a string slice.
func (s *testUnsafeAPISuite) TestRemoveFailedStores(c *C) {
	endpoint := s.urlPrefix + "/remove-failed-stores"

	payload, err := json.Marshal(map[uint64]string{1: ""})
	c.Assert(err, IsNil)
	c.Assert(postJSON(testDialClient, endpoint, payload), IsNil)

	// The same slice receives both responses; only decoding success is checked.
	var output []string
	// Test show
	c.Assert(readJSON(testDialClient, endpoint+"/show", &output), IsNil)
	// Test history
	c.Assert(readJSON(testDialClient, endpoint+"/history", &output), IsNil)
}
8 changes: 8 additions & 0 deletions server/cluster/cluster.go
Original file line number Diff line number Diff line change
Expand Up @@ -125,6 +125,8 @@ type RaftCluster struct {

// It's used to manage components.
componentManager *component.Manager

unsafeRecoveryController *unsafeRecoveryController
}

// Status saves some state information.
Expand Down Expand Up @@ -261,6 +263,7 @@ func (c *RaftCluster) Start(s Server) error {
c.coordinator = newCoordinator(c.ctx, cluster, s.GetHBStreams())
c.regionStats = statistics.NewRegionStatistics(c.opt, c.ruleManager)
c.limiter = NewStoreLimiter(s.GetPersistOptions())
c.unsafeRecoveryController = newUnsafeRecoveryController(cluster)

c.wg.Add(5)
go c.runCoordinator()
Expand Down Expand Up @@ -503,6 +506,11 @@ func (c *RaftCluster) RemoveSuspectRegion(id uint64) {
c.suspectRegions.Remove(id)
}

// GetUnsafeRecoveryController returns the unsafe recovery controller held by
// the cluster. The field is assigned in Start.
// NOTE(review): presumably nil until Start has run — confirm callers only
// reach this on a started cluster.
func (c *RaftCluster) GetUnsafeRecoveryController() *unsafeRecoveryController {
return c.unsafeRecoveryController
}

// AddSuspectKeyRange adds the key range with its ruleID as the key
// The instance of each keyRange is like following format:
// [2][]byte: start key/end key
Expand Down
Loading

0 comments on commit 788f352

Please sign in to comment.