stability: add failover test (#349)
cwen0 authored and weekface committed Mar 27, 2019
1 parent 248459e commit 7dae3c2
Showing 10 changed files with 534 additions and 48 deletions.
3 changes: 3 additions & 0 deletions .gitignore
@@ -3,6 +3,9 @@ images/tidb-operator/bin/
images/tidb-operator-e2e/bin/
images/tidb-operator-e2e/tidb-cluster/
images/tidb-operator-e2e/tidb-operator/
tests/images/stability-test/bin/
tests/images/e2e/bin/
tests/images/fault-trigger/bin/
tests/images/e2e/tidb-cluster/
tests/images/e2e/tidb-operator/
*.tar
12 changes: 12 additions & 0 deletions Makefile
@@ -53,6 +53,18 @@ e2e-docker: e2e-build
e2e-build:
	$(GO) -ldflags '$(LDFLAGS)' -o tests/images/e2e/bin/e2e tests/cmd/e2e/main.go

stability-test-build:
	$(GO) -ldflags '$(LDFLAGS)' -o tests/images/stability-test/bin/stability-test tests/cmd/stability/*.go

stability-test-docker: stability-test-build
	docker build -t "${DOCKER_REGISTRY}/pingcap/tidb-operator-stability-test:latest" tests/images/stability-test

stability-test-push: stability-test-docker
	docker push "${DOCKER_REGISTRY}/pingcap/tidb-operator-stability-test:latest"

fault-trigger:
	$(GO) -ldflags '$(LDFLAGS)' -o tests/images/fault-trigger/bin/fault-trigger tests/cmd/fault-trigger/*.go

test:
	@echo "Run unit tests"
	@$(GOTEST) ./pkg/... -coverprofile=coverage.txt -covermode=atomic && echo "\nUnit tests run successfully!"
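For reference, a typical invocation of the new targets from the repository root might look like the following; the registry value below is only an illustration, and stability-test-push builds the binary and the image first through its prerequisite chain:

    DOCKER_REGISTRY=localhost:5000 make stability-test-push
    make fault-trigger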
2 changes: 2 additions & 0 deletions go.sum
@@ -42,6 +42,7 @@ github.com/google/btree v0.0.0-20180124185431-e89373fe6b4a h1:ZJu5NB1Bk5ms4vw0Xu
github.com/google/btree v0.0.0-20180124185431-e89373fe6b4a/go.mod h1:lNA+9X1NB3Zf8V7Ke586lFgjr2dZNuvo3lPJSGZ5JPQ=
github.com/google/gofuzz v0.0.0-20170612174753-24818f796faf h1:+RRA9JqSOZFfKrOeqr2z77+8R2RKyh8PG66dcu1V0ck=
github.com/google/gofuzz v0.0.0-20170612174753-24818f796faf/go.mod h1:HP5RmnzzSNb993RKQDq4+1A4ia9nllfqcQFTQJedwGI=
github.com/google/uuid v1.0.0 h1:b4Gk+7WdP/d3HZH8EJsZpvV7EtDOgaZLtnaNGIu1adA=
github.com/google/uuid v1.0.0/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo=
github.com/googleapis/gnostic v0.2.0 h1:l6N3VoaVzTncYYW+9yOz2LJJammFZGBO13sqgEhpy9g=
github.com/googleapis/gnostic v0.2.0/go.mod h1:sJBsCZ4ayReDTBIg8b9dl28c5xFWyhBTVRp3pOg5EKY=
@@ -83,6 +84,7 @@ github.com/onsi/ginkgo v1.6.0 h1:Ix8l273rp3QzYgXSR+c8d1fTG7UPgYkOSELPhiY/YGw=
github.com/onsi/ginkgo v1.6.0/go.mod h1:lLunBs/Ym6LB5Z9jYTR76FiuTmxDTDusOGeTQH+WWjE=
github.com/onsi/gomega v1.4.1 h1:PZSj/UFNaVp3KxrzHOcS7oyuWA7LoOY/77yCTEFu21U=
github.com/onsi/gomega v1.4.1/go.mod h1:C1qb7wdrVGGVU+Z6iS04AVkA3Q65CEZX59MT0QO5uiA=
github.com/pborman/uuid v1.2.0 h1:J7Q5mO4ysT1dv8hyrUGHb9+ooztCXu1D8MY8DZYsu3g=
github.com/pborman/uuid v1.2.0/go.mod h1:X/NO0urCmaxf9VXbdlT7C2Yzkj2IKimNn4k+gtPdI/k=
github.com/pingcap/check v0.0.0-20171206051426-1c287c953996 h1:ZBdiJCMan6GSo/aPAM7gywcUKa0z58gczVrnG6TQnAQ=
github.com/pingcap/check v0.0.0-20171206051426-1c287c953996/go.mod h1:B1+S9LNcuMyLH/4HMTViQOJevkGiik3wW2AN9zb2fNQ=
176 changes: 176 additions & 0 deletions tests/actions.go
@@ -52,6 +52,8 @@ const (
	defaultConcurrency = 512
	defaultBatchSize   = 100
	defaultRawSize     = 100

	// period is the grace period after a fault is injected during which the
	// operator is expected not to have marked any component as failed yet.
	period = 5 * time.Minute
)

func NewOperatorActions(cli versioned.Interface, kubeCli kubernetes.Interface, cfg *Config) OperatorActions {
@@ -99,6 +101,8 @@ type OperatorActions interface {
	GetPodUIDMap(info *TidbClusterInfo) (map[string]types.UID, error)
	GetNodeMap(info *TidbClusterInfo, component string) (map[string][]string, error)
	getBackupDir(info *TidbClusterInfo) ([]string, error)
	PendingFailover(info *TidbClusterInfo, faultPoint *time.Time) (bool, error)
	CheckFailover(info *TidbClusterInfo, faultNode string) (bool, error)
}

type operatorActions struct {
@@ -1881,6 +1885,178 @@ func (oa *operatorActions) drainerHealth(info *TidbClusterInfo, hostName string)
	return len(healths.PumpPos) > 0 && healths.Synced
}

func (oa *operatorActions) PendingFailover(info *TidbClusterInfo, faultPoint *time.Time) (bool, error) {
	tc, err := oa.cli.PingcapV1alpha1().TidbClusters(info.Namespace).Get(info.ClusterName, metav1.GetOptions{})
	if err != nil {
		glog.Infof("pending failover, failed to get tidbcluster: [%s], error: %v", info.FullName(), err)
		if strings.Contains(err.Error(), "Client.Timeout exceeded while awaiting headers") {
			glog.Info("create new client")
			newCli, _, err := CreateKubeClient()
			if err != nil {
				glog.Errorf("create new client failed, error: %v", err)
				return false, nil
			}
			oa.cli = newCli
		}
		return false, nil
	}
	deadline := faultPoint.Add(period)
	if time.Now().Before(deadline) {
		if len(tc.Status.PD.FailureMembers) > 0 {
			err := fmt.Errorf("cluster: [%s] the pd member should only be marked as failure after %s", info.FullName(), deadline.Format(time.RFC3339))
			glog.Error(err.Error())
			return false, err
		}
		if len(tc.Status.TiKV.FailureStores) > 0 {
			err := fmt.Errorf("cluster: [%s] the tikv store should only be marked as failure after %s", info.FullName(), deadline.Format(time.RFC3339))
			glog.Error(err.Error())
			return false, err
		}
		if len(tc.Status.TiDB.FailureMembers) > 0 {
			err := fmt.Errorf("cluster: [%s] the tidb member should only be marked as failure after %s", info.FullName(), deadline.Format(time.RFC3339))
			glog.Error(err.Error())
			return false, err
		}

		glog.Infof("cluster: [%s] operator's failover feature is pending", info.FullName())
		return false, nil
	}
	return true, nil
}

func (oa *operatorActions) CheckFailover(info *TidbClusterInfo, node string) (bool, error) {
	selector, err := label.New().Instance(info.ClusterName).Selector()
	if err != nil {
		glog.Errorf("cluster: [%s] create selector failed, error: %v", info.FullName(), err)
		return false, nil
	}
	pods, err := oa.kubeCli.CoreV1().Pods(info.Namespace).List(metav1.ListOptions{LabelSelector: selector.String()})
	if err != nil {
		glog.Errorf("cluster: [%s] query pods failed, error: %v", info.FullName(), err)
		return false, nil
	}

	affectedPods := map[string]*corev1.Pod{}
	for i, pod := range pods.Items {
		if pod.Spec.NodeName == node {
			affectedPods[pod.Name] = &pods.Items[i]
		}
	}
	if len(affectedPods) == 0 {
		glog.Infof("the cluster: [%s] is not affected by node: [%s]", info.FullName(), node)
		return true, nil
	}

	tc, err := oa.cli.PingcapV1alpha1().TidbClusters(info.Namespace).Get(info.ClusterName, metav1.GetOptions{})
	if err != nil {
		glog.Errorf("query tidbcluster: [%s] failed, error: %v", info.FullName(), err)
		return false, nil
	}

	for _, affectedPod := range affectedPods {
		switch affectedPod.Labels[label.ComponentLabelKey] {
		case label.PDLabelVal:
			if !oa.pdFailover(affectedPod, tc) {
				return false, nil
			}
		case label.TiKVLabelVal:
			if !oa.tikvFailover(affectedPod, tc) {
				return false, nil
			}
		case label.TiDBLabelVal:
			if !oa.tidbFailover(affectedPod, tc) {
				return false, nil
			}
		}
	}

	glog.Infof("cluster: [%s] failover has completed", info.FullName())
	return true, nil
}
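For orientation, here is a minimal sketch (not part of this commit) of how a stability test loop might drive these two checks after injecting a fault on one node. The helper name, the poll cadence, and the overall timeout are assumptions for illustration, as is the use of k8s.io/apimachinery/pkg/util/wait imported as wait; PendingFailover, CheckFailover, TidbClusterInfo, and period are the items defined in this file:

func waitForFailover(oa OperatorActions, info *TidbClusterInfo, faultTime time.Time, faultNode string) error {
	// Phase 1: while the grace period is running, no component may be marked
	// as failed; PendingFailover aborts the poll with an error if one is, and
	// reports done once the deadline has passed. The extra minute of timeout
	// leaves room for the final poll tick.
	if err := wait.Poll(10*time.Second, period+time.Minute, func() (bool, error) {
		return oa.PendingFailover(info, &faultTime)
	}); err != nil {
		return err
	}
	// Phase 2: after the grace period, poll until every pod on the faulty
	// node has been failed over and the cluster reports full health again.
	return wait.Poll(10*time.Second, 30*time.Minute, func() (bool, error) {
		return oa.CheckFailover(info, faultNode)
	})
}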

func (oa *operatorActions) pdFailover(pod *corev1.Pod, tc *v1alpha1.TidbCluster) bool {
	failure := false
	for _, failureMember := range tc.Status.PD.FailureMembers {
		if failureMember.PodName == pod.GetName() {
			failure = true
			break
		}
	}
	if !failure {
		glog.Infof("tidbCluster: [%s/%s] member: [%s] has not become a failure member", tc.Namespace, tc.Name, pod.Name)
		return false
	}

	for _, member := range tc.Status.PD.Members {
		if member.Name == pod.GetName() {
			glog.Infof("tidbCluster: [%s/%s] status.members still contains pd member: [%s]", tc.Namespace, tc.Name, pod.Name)
			return false
		}
	}

	if tc.Status.PD.Synced && len(tc.Status.PD.Members) == int(tc.Spec.PD.Replicas) {
		return true
	}

	glog.Infof("cluster: [%s/%s] pd: [%s] failover is still not complete", tc.Namespace, tc.Name, pod.GetName())

	return false
}

func (oa *operatorActions) tikvFailover(pod *corev1.Pod, tc *v1alpha1.TidbCluster) bool {
	failure := false
	for _, failureStore := range tc.Status.TiKV.FailureStores {
		if failureStore.PodName == pod.GetName() {
			failure = true
			break
		}
	}
	if !failure {
		glog.Infof("tidbCluster: [%s/%s] store pod: [%s] has not become a failure store", tc.Namespace, tc.Name, pod.Name)
		return false
	}

	healthCount := 0
	for _, store := range tc.Status.TiKV.Stores {
		if store.State == v1alpha1.TiKVStateUp {
			healthCount++
		}
	}
	if tc.Status.TiKV.Synced && healthCount == int(tc.Spec.TiKV.Replicas) {
		return true
	}

	glog.Infof("cluster: [%s/%s] tikv: [%s] failover is still not complete", tc.Namespace, tc.Name, pod.GetName())
	return false
}

func (oa *operatorActions) tidbFailover(pod *corev1.Pod, tc *v1alpha1.TidbCluster) bool {
	failure := false
	for _, failureMember := range tc.Status.TiDB.FailureMembers {
		if failureMember.PodName == pod.GetName() {
			failure = true
			break
		}
	}
	if !failure {
		glog.Infof("tidbCluster: [%s/%s] member pod: [%s] has not become a failure member", tc.Namespace, tc.Name, pod.Name)
		return false
	}

	healthCount := 0
	for _, member := range tc.Status.TiDB.Members {
		if member.Health {
			healthCount++
		}
	}

	if healthCount == int(tc.Spec.TiDB.Replicas) {
		return true
	}
	glog.Infof("cluster: [%s/%s] tidb: [%s] failover is still not complete", tc.Namespace, tc.Name, pod.GetName())
	return false
}
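Note the asymmetry in the three recovery criteria above: pdFailover only reports success once the failed pod is recorded in FailureMembers, removed from status.members, and the member count is back to spec with PD synced, whereas tikvFailover and tidbFailover accept recovery as soon as the count of Up stores or healthy members matches the spec replicas again.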

func (oa *operatorActions) GetPodUIDMap(info *TidbClusterInfo) (map[string]types.UID, error) {
	result := map[string]types.UID{}

