From c3e931e8e1f626804b305c1a2b44ad6344a8a591 Mon Sep 17 00:00:00 2001 From: weekface Date: Wed, 24 Apr 2019 11:09:08 +0800 Subject: [PATCH 1/3] refine stability --- tests/actions.go | 112 ++++++++++++++++++--------- tests/cmd/e2e/main.go | 4 +- tests/cmd/stability/main.go | 4 +- tests/failover.go | 23 +++--- tests/fault.go | 4 +- tests/pkg/blockwriter/blockwriter.go | 4 +- tests/pkg/ops/exec.go | 6 +- tests/pkg/util/db.go | 2 +- 8 files changed, 98 insertions(+), 61 deletions(-) diff --git a/tests/actions.go b/tests/actions.go index 8fbcd63573..8dd5d9ef13 100644 --- a/tests/actions.go +++ b/tests/actions.go @@ -65,7 +65,7 @@ func NewOperatorActions(cli versioned.Interface, kubeCli kubernetes.Interface, c } const ( - DefaultPollTimeout time.Duration = 10 * time.Minute + DefaultPollTimeout time.Duration = 30 * time.Minute DefaultPollInterval time.Duration = 1 * time.Minute getBackupDirPodName = "get-backup-dir" grafanaUsername = "admin" @@ -147,7 +147,7 @@ type OperatorConfig struct { } type TidbClusterConfig struct { - BackupPVC string + BackupName string Namespace string ClusterName string OperatorTag string @@ -245,6 +245,8 @@ func (oi *OperatorConfig) OperatorHelmSetString(m map[string]string) string { } func (oa *operatorActions) DeployOperator(info *OperatorConfig) error { + glog.Infof("deploying tidb-operator %s", info.ReleaseName) + if info.Tag != "e2e" { if err := oa.cloneOperatorRepo(); err != nil { return err @@ -254,10 +256,7 @@ func (oa *operatorActions) DeployOperator(info *OperatorConfig) error { } } - cmd := fmt.Sprintf(`helm install %s \ - --name %s \ - --namespace %s \ - --set-string %s`, + cmd := fmt.Sprintf(`helm install %s --name %s --namespace %s --set-string %s`, oa.operatorChartPath(info.Tag), info.ReleaseName, info.Namespace, @@ -278,6 +277,8 @@ func (oa *operatorActions) DeployOperatorOrDie(info *OperatorConfig) { } func (oa *operatorActions) CleanOperator(info *OperatorConfig) error { + glog.Infof("cleaning tidb-operator %s", info.ReleaseName) + err := oa.CleanWebHookAndService(info) if err != nil { return err @@ -315,7 +316,7 @@ func (oa *operatorActions) UpgradeOperator(info *OperatorConfig) error { } func (oa *operatorActions) DeployTidbCluster(info *TidbClusterConfig) error { - glog.Infof("begin to deploy tidb cluster cluster[%s] namespace[%s]", info.ClusterName, info.Namespace) + glog.Infof("deploying tidb cluster [%s/%s]", info.Namespace, info.ClusterName) namespace := &corev1.Namespace{ ObjectMeta: metav1.ObjectMeta{ @@ -391,10 +392,10 @@ func (oa *operatorActions) CleanTidbCluster(info *TidbClusterConfig) error { return fmt.Errorf("failed to delete jobs: %v, %s", err, string(res)) } - patchPVCmd := fmt.Sprintf(`kubectl get pv -l %s=%s,%s=%s --output=name | xargs -I {} \ - kubectl patch {} -p '{"spec":{"persistentVolumeReclaimPolicy":"Delete"}}'`, - label.NamespaceLabelKey, info.Namespace, label.InstanceLabelKey, info.ClusterName) - glog.V(4).Info(patchPVCmd) + patchPVCmd := fmt.Sprintf("kubectl get pv | grep %s | grep %s | awk '{print $1}' | "+ + "xargs -I {} kubectl patch pv {} -p '{\"spec\":{\"persistentVolumeReclaimPolicy\":\"Delete\"}}'", + info.Namespace, info.ClusterName) + glog.Info(patchPVCmd) if res, err := exec.Command("/bin/sh", "-c", patchPVCmd).CombinedOutput(); err != nil { return fmt.Errorf("failed to patch pv: %v, %s", err, string(res)) } @@ -429,11 +430,11 @@ func (oa *operatorActions) CleanTidbClusterOrDie(info *TidbClusterConfig) { } func (oa *operatorActions) CheckTidbClusterStatus(info *TidbClusterConfig) error { - glog.Infof("begin to check tidb cluster cluster[%s] namespace[%s]", info.ClusterName, info.Namespace) + glog.Infof("checking tidb cluster [%s/%s] status", info.Namespace, info.ClusterName) ns := info.Namespace tcName := info.ClusterName - if err := wait.PollImmediate(DefaultPollInterval, DefaultPollTimeout, func() (bool, error) { + if err := wait.Poll(DefaultPollInterval, DefaultPollTimeout, func() (bool, error) { var tc *v1alpha1.TidbCluster var err error if tc, err = oa.cli.PingcapV1alpha1().TidbClusters(ns).Get(tcName, metav1.GetOptions{}); err != nil { @@ -448,35 +449,35 @@ func (oa *operatorActions) CheckTidbClusterStatus(info *TidbClusterConfig) error return false, nil } - glog.Infof("check tidb cluster begin tidbMembersReadyFn") + glog.V(4).Infof("check tidb cluster begin tidbMembersReadyFn") if b, err := oa.tidbMembersReadyFn(tc); !b && err == nil { return false, nil } - glog.Infof("check tidb cluster begin reclaimPolicySyncFn") + glog.V(4).Infof("check tidb cluster begin reclaimPolicySyncFn") if b, err := oa.reclaimPolicySyncFn(tc); !b && err == nil { return false, nil } - glog.Infof("check tidb cluster begin metaSyncFn") + glog.V(4).Infof("check tidb cluster begin metaSyncFn") if b, err := oa.metaSyncFn(tc); err != nil { return false, err } else if !b && err == nil { return false, nil } - glog.Infof("check tidb cluster begin schedulerHAFn") + glog.V(4).Infof("check tidb cluster begin schedulerHAFn") if b, err := oa.schedulerHAFn(tc); !b && err == nil { return false, nil } - glog.Infof("check tidb cluster begin passwordIsSet") + glog.V(4).Infof("check tidb cluster begin passwordIsSet") if b, err := oa.passwordIsSet(info); !b && err == nil { return false, nil } if info.Monitor { - glog.Infof("check tidb monitor normal") + glog.V(4).Infof("check tidb monitor normal") if b, err := oa.monitorNormal(info); !b && err == nil { return false, nil } @@ -484,7 +485,7 @@ func (oa *operatorActions) CheckTidbClusterStatus(info *TidbClusterConfig) error return true, nil }); err != nil { glog.Infof("check tidb cluster status failed: %s", err.Error()) - return fmt.Errorf("failed to waiting for tidbcluster %s/%s ready in 10 minutes", ns, tcName) + return fmt.Errorf("failed to waiting for tidbcluster %s/%s ready in 30 minutes", ns, tcName) } return nil @@ -1369,14 +1370,12 @@ func (oa *operatorActions) cloneOperatorRepo() error { } func (oa *operatorActions) checkoutTag(tagName string) error { - cmd := fmt.Sprintf(`cd %s && - git stash -u && - git checkout %s && - mkdir -p %s && - cp -rf charts/tidb-operator %s && - cp -rf charts/tidb-cluster %s && - cp -rf charts/tidb-backup %s`, - oa.cfg.OperatorRepoDir, tagName, filepath.Join(oa.cfg.ChartDir, tagName), oa.operatorChartPath(tagName), oa.tidbClusterChartPath(tagName), oa.backupChartPath(tagName)) + cmd := fmt.Sprintf("cd %s && git stash -u && git checkout %s && "+ + "mkdir -p %s && cp -rf charts/tidb-operator %s && "+ + "cp -rf charts/tidb-cluster %s && cp -rf charts/tidb-backup %s", + oa.cfg.OperatorRepoDir, tagName, + filepath.Join(oa.cfg.ChartDir, tagName), oa.operatorChartPath(tagName), + oa.tidbClusterChartPath(tagName), oa.backupChartPath(tagName)) glog.Info(cmd) res, err := exec.Command("/bin/sh", "-c", cmd).CombinedOutput() if err != nil { @@ -1390,7 +1389,7 @@ func (oa *operatorActions) DeployAdHocBackup(info *TidbClusterConfig) error { glog.Infof("begin to deploy adhoc backup cluster[%s] namespace[%s]", info.ClusterName, info.Namespace) sets := map[string]string{ - "name": info.BackupPVC, + "name": info.BackupName, "mode": "backup", "user": "root", "password": info.Password, @@ -1414,7 +1413,7 @@ func (oa *operatorActions) DeployAdHocBackup(info *TidbClusterConfig) error { func (oa *operatorActions) CheckAdHocBackup(info *TidbClusterConfig) error { glog.Infof("begin to clean adhoc backup cluster[%s] namespace[%s]", info.ClusterName, info.Namespace) - jobName := fmt.Sprintf("%s-%s", info.ClusterName, info.BackupPVC) + jobName := fmt.Sprintf("%s-%s", info.ClusterName, info.BackupName) fn := func() (bool, error) { job, err := oa.kubeCli.BatchV1().Jobs(info.Namespace).Get(jobName, metav1.GetOptions{}) if err != nil { @@ -1438,10 +1437,10 @@ func (oa *operatorActions) CheckAdHocBackup(info *TidbClusterConfig) error { } func (oa *operatorActions) Restore(from *TidbClusterConfig, to *TidbClusterConfig) error { - glog.Infof("begin to deploy restore cluster[%s] namespace[%s]", from.ClusterName, from.Namespace) + glog.Infof("deploying restore cluster[%s/%s]", from.Namespace, from.ClusterName) sets := map[string]string{ - "name": to.BackupPVC, + "name": to.BackupName, "mode": "restore", "user": "root", "password": to.Password, @@ -1464,7 +1463,7 @@ func (oa *operatorActions) Restore(from *TidbClusterConfig, to *TidbClusterConfi func (oa *operatorActions) CheckRestore(from *TidbClusterConfig, to *TidbClusterConfig) error { glog.Infof("begin to check restore backup cluster[%s] namespace[%s]", from.ClusterName, from.Namespace) - jobName := fmt.Sprintf("%s-restore-%s", to.ClusterName, from.BackupPVC) + jobName := fmt.Sprintf("%s-restore-%s", to.ClusterName, from.BackupName) fn := func() (bool, error) { job, err := oa.kubeCli.BatchV1().Jobs(to.Namespace).Get(jobName, metav1.GetOptions{}) if err != nil { @@ -1608,8 +1607,28 @@ func (oa *operatorActions) DeployScheduledBackup(info *TidbClusterConfig) error return nil } +func (oa *operatorActions) disableScheduledBackup(info *TidbClusterConfig) error { + glog.Infof("disabling scheduled backup") + + sets := map[string]string{ + "clusterName": info.ClusterName, + "scheduledBackup.create": "false", + } + + setString := info.TidbClusterHelmSetString(sets) + + cmd := fmt.Sprintf("helm upgrade %s %s --set-string %s", + info.ClusterName, oa.tidbClusterChartPath(info.OperatorTag), setString) + + res, err := exec.Command("/bin/sh", "-c", cmd).CombinedOutput() + if err != nil { + return fmt.Errorf("failed to disable scheduler backup job: %v, %s", err, string(res)) + } + return nil +} + func (oa *operatorActions) CheckScheduledBackup(info *TidbClusterConfig) error { - glog.Infof("begin to check scheduler backup cluster[%s] namespace[%s]", info.ClusterName, info.Namespace) + glog.Infof("checking scheduler backup for tidb cluster[%s/%s]", info.Namespace, info.ClusterName) jobName := fmt.Sprintf("%s-scheduled-backup", info.ClusterName) fn := func() (bool, error) { @@ -1637,14 +1656,26 @@ func (oa *operatorActions) CheckScheduledBackup(info *TidbClusterConfig) error { return false, nil } + succededJobCount := 0 for _, j := range backupJobs { - if j.Status.Succeeded == 0 { - glog.Errorf("cluster [%s] back up job is not completed, please wait! ", info.ClusterName) - return false, nil + if j.Status.Failed > 0 { + return false, fmt.Errorf("cluster [%s/%s] scheduled backup job failed, job: [%s] failed count is: %d", + info.Namespace, info.ClusterName, j.Name, j.Status.Failed) + } + if j.Status.Succeeded > 0 { + succededJobCount++ } } - return true, nil + if succededJobCount >= 3 { + glog.Infof("cluster [%s/%s] scheduled back up job completed count: %d", + info.Namespace, info.ClusterName, succededJobCount) + return true, nil + } + + glog.Infof("cluster [%s/%s] scheduled back up job is not completed, please wait! ", + info.Namespace, info.ClusterName) + return false, nil } err := wait.Poll(DefaultPollInterval, DefaultPollTimeout, fn) @@ -1664,6 +1695,11 @@ func (oa *operatorActions) CheckScheduledBackup(info *TidbClusterConfig) error { return fmt.Errorf("scheduler job failed!") } + err = oa.disableScheduledBackup(info) + if err != nil { + return err + } + return nil } diff --git a/tests/cmd/e2e/main.go b/tests/cmd/e2e/main.go index 5c71b1a71f..126e325d93 100644 --- a/tests/cmd/e2e/main.go +++ b/tests/cmd/e2e/main.go @@ -75,7 +75,7 @@ func main() { UserName: "root", InitSecretName: fmt.Sprintf("%s-set-secret", name1), BackupSecretName: fmt.Sprintf("%s-backup-secret", name1), - BackupPVC: "backup-pvc", + BackupName: "backup", Resources: map[string]string{ "pd.resources.limits.cpu": "1000m", "pd.resources.limits.memory": "2Gi", @@ -106,7 +106,7 @@ func main() { UserName: "root", InitSecretName: fmt.Sprintf("%s-set-secret", name2), BackupSecretName: fmt.Sprintf("%s-backup-secret", name2), - BackupPVC: "backup-pvc", + BackupName: "backup", Resources: map[string]string{ "pd.resources.limits.cpu": "1000m", "pd.resources.limits.memory": "2Gi", diff --git a/tests/cmd/stability/main.go b/tests/cmd/stability/main.go index 7ebf2ad81e..9075e54561 100644 --- a/tests/cmd/stability/main.go +++ b/tests/cmd/stability/main.go @@ -80,7 +80,7 @@ func main() { UserName: "root", InitSecretName: fmt.Sprintf("%s-set-secret", clusterName1), BackupSecretName: fmt.Sprintf("%s-backup-secret", clusterName1), - BackupPVC: "backup-pvc", + BackupName: "backup", Resources: map[string]string{ "pd.resources.limits.cpu": "1000m", "pd.resources.limits.memory": "2Gi", @@ -113,7 +113,7 @@ func main() { UserName: "root", InitSecretName: fmt.Sprintf("%s-set-secret", clusterName2), BackupSecretName: fmt.Sprintf("%s-backup-secret", clusterName2), - BackupPVC: "backup-pvc", + BackupName: "backup", Resources: map[string]string{ "pd.resources.limits.cpu": "1000m", "pd.resources.limits.memory": "2Gi", diff --git a/tests/failover.go b/tests/failover.go index 20d7538ce4..4982d59887 100644 --- a/tests/failover.go +++ b/tests/failover.go @@ -4,7 +4,6 @@ import ( "fmt" "sort" "strings" - "syscall" "time" _ "github.com/go-sql-driver/mysql" @@ -90,7 +89,7 @@ func (oa *operatorActions) TruncateSSTFileThenCheckFailover(info *TidbClusterCon } // restart tikv to ensure sst files - err = tikvOps.KillProcess(info.Namespace, store.PodName, "tikv", 1, syscall.SIGTERM) + err = tikvOps.KillProcess(info.Namespace, store.PodName, "tikv", "tikv-server") if err != nil { glog.Errorf("kill tikv: pod=%s err=%s", store.PodName, err.Error()) return err @@ -124,22 +123,28 @@ func (oa *operatorActions) TruncateSSTFileThenCheckFailover(info *TidbClusterCon }) // make tikv crash - err = tikvOps.KillProcess(info.Namespace, store.PodName, "tikv", 1, syscall.SIGTERM) - if err != nil { - glog.Errorf("kill tikv: pod=%s err=%s", store.PodName, err.Error()) - return err - } + //err = tikvOps.KillProcess(info.Namespace, store.PodName, "tikv", "tikv-server") + //if err != nil { + // glog.Errorf("cluster: [%s/%s] kill tikv: pod=%s err=%s", + // info.Namespace, info.ClusterName, + // store.PodName, err.Error()) + // return err + //} tikvOps.SetPoll(DefaultPollInterval, maxStoreDownTime+tikvFailoverPeriod+failoverTimeout) return tikvOps.PollTiDBCluster(info.Namespace, info.ClusterName, func(tc *v1alpha1.TidbCluster, err error) (bool, error) { - glog.Infof("check failure stores: current=%d origin=%d", len(tc.Status.TiKV.FailureStores), origFailures) + glog.Infof("cluster: [%s/%s] check failure stores: current=%d origin=%d", + info.Namespace, info.ClusterName, + len(tc.Status.TiKV.FailureStores), origFailures) if len(tc.Status.TiKV.FailureStores) <= origFailures { return false, nil } ups := countUpStores(tc) - glog.Infof("check up stores: current=%d origin=%d", ups, origUps) + glog.Infof("cluster: [%s/%s] check up stores: current=%d origin=%d", + info.Namespace, info.ClusterName, + ups, origUps) if ups < origUps { return false, nil } diff --git a/tests/fault.go b/tests/fault.go index 319f57fc6d..df6089fbbc 100644 --- a/tests/fault.go +++ b/tests/fault.go @@ -165,8 +165,6 @@ func (fa *faultTriggerActions) StartNode(physicalNode string, node string) error return err } - glog.Infof("%+v", vms) - for _, vm := range vms { if vm.IP == node && vm.Status == "running" { return nil @@ -325,7 +323,7 @@ func (fa *faultTriggerActions) serviceAction(node string, serverName string, act return err } - glog.Infof("%s %s %s successfully", action, serverName, node) + glog.V(4).Infof("%s %s %s successfully", action, serverName, node) return nil } diff --git a/tests/pkg/blockwriter/blockwriter.go b/tests/pkg/blockwriter/blockwriter.go index 49d62afdf4..dcf090a8c3 100644 --- a/tests/pkg/blockwriter/blockwriter.go +++ b/tests/pkg/blockwriter/blockwriter.go @@ -127,7 +127,7 @@ func (c *BlockWriterCase) generateQuery(ctx context.Context, queryChan chan []st if len(queryChan) < queryChanSize { queryChan <- querys } else { - glog.Infof("[%s] [%s] [action: generate Query] query channel is full, sleep 10 seconds", c, c.ClusterName) + glog.V(4).Infof("[%s] [%s] [action: generate Query] query channel is full, sleep 10 seconds", c, c.ClusterName) util.Sleep(ctx, 10*time.Second) } } @@ -164,7 +164,7 @@ func (bw *blockWriter) run(ctx context.Context, db *sql.DB, queryChan chan []str return default: if err := bw.batchExecute(db, query); err != nil { - glog.Error(err) + glog.V(4).Info(err) time.Sleep(5 * time.Second) continue } diff --git a/tests/pkg/ops/exec.go b/tests/pkg/ops/exec.go index a904d9697a..e97767a941 100644 --- a/tests/pkg/ops/exec.go +++ b/tests/pkg/ops/exec.go @@ -16,9 +16,7 @@ import ( "bytes" "io" "net/url" - "strconv" "strings" - "syscall" "github.com/golang/glog" "github.com/pingcap/errors" @@ -81,9 +79,9 @@ func (cli *ClientOps) ExecWithOptions(options ExecOptions) (string, string, erro return strings.TrimSpace(stdout.String()), strings.TrimSpace(stderr.String()), err } -func (cli *ClientOps) KillProcess(ns string, pod string, container string, pid int, sig syscall.Signal) error { +func (cli *ClientOps) KillProcess(ns string, pod string, container string, pname string) error { _, _, err := cli.ExecWithOptions(ExecOptions{ - Command: []string{"kill", "-" + strconv.Itoa(int(sig)), strconv.Itoa(pid)}, + Command: []string{"pkill", pname}, Namespace: ns, PodName: pod, ContainerName: container, diff --git a/tests/pkg/util/db.go b/tests/pkg/util/db.go index c931f76245..a31ab6c201 100644 --- a/tests/pkg/util/db.go +++ b/tests/pkg/util/db.go @@ -14,6 +14,6 @@ func OpenDB(dsn string, maxIdleConns int) (*sql.DB, error) { } db.SetMaxIdleConns(maxIdleConns) - glog.Info("DB opens successfully") + glog.V(4).Info("DB opens successfully") return db, nil } From e0fd1ce31bf6df7e837a62c4079f77d2857212f8 Mon Sep 17 00:00:00 2001 From: weekface Date: Wed, 24 Apr 2019 14:27:26 +0800 Subject: [PATCH 2/3] fix typo --- tests/actions.go | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/tests/actions.go b/tests/actions.go index 8dd5d9ef13..829ddc1063 100644 --- a/tests/actions.go +++ b/tests/actions.go @@ -1695,12 +1695,7 @@ func (oa *operatorActions) CheckScheduledBackup(info *TidbClusterConfig) error { return fmt.Errorf("scheduler job failed!") } - err = oa.disableScheduledBackup(info) - if err != nil { - return err - } - - return nil + return oa.disableScheduledBackup(info) } func getParentUIDFromJob(j batchv1.Job) (types.UID, bool) { From 0f959809a9c24b18b6e311edb0b1e52aaf8932b9 Mon Sep 17 00:00:00 2001 From: weekface Date: Wed, 24 Apr 2019 15:36:01 +0800 Subject: [PATCH 3/3] address comment --- tests/actions.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/actions.go b/tests/actions.go index 829ddc1063..c682a7df82 100644 --- a/tests/actions.go +++ b/tests/actions.go @@ -65,7 +65,7 @@ func NewOperatorActions(cli versioned.Interface, kubeCli kubernetes.Interface, c } const ( - DefaultPollTimeout time.Duration = 30 * time.Minute + DefaultPollTimeout time.Duration = 10 * time.Minute DefaultPollInterval time.Duration = 1 * time.Minute getBackupDirPodName = "get-backup-dir" grafanaUsername = "admin"