Skip to content

Commit

Permalink
refine stability (#422)
Browse files Browse the repository at this point in the history
* refine logger
* fix scheduled backup case bug
* disable scheduled backup when finished
* fix TruncateSSTFile bug
  • Loading branch information
weekface authored Apr 24, 2019
1 parent d8e43e0 commit d954936
Show file tree
Hide file tree
Showing 8 changed files with 93 additions and 61 deletions.
107 changes: 69 additions & 38 deletions tests/actions.go
Original file line number Diff line number Diff line change
Expand Up @@ -149,7 +149,7 @@ type OperatorConfig struct {
}

type TidbClusterConfig struct {
BackupPVC string
BackupName string
Namespace string
ClusterName string
OperatorTag string
Expand Down Expand Up @@ -247,6 +247,8 @@ func (oi *OperatorConfig) OperatorHelmSetString(m map[string]string) string {
}

func (oa *operatorActions) DeployOperator(info *OperatorConfig) error {
glog.Infof("deploying tidb-operator %s", info.ReleaseName)

if info.Tag != "e2e" {
if err := oa.cloneOperatorRepo(); err != nil {
return err
Expand All @@ -256,10 +258,7 @@ func (oa *operatorActions) DeployOperator(info *OperatorConfig) error {
}
}

cmd := fmt.Sprintf(`helm install %s \
--name %s \
--namespace %s \
--set-string %s`,
cmd := fmt.Sprintf(`helm install %s --name %s --namespace %s --set-string %s`,
oa.operatorChartPath(info.Tag),
info.ReleaseName,
info.Namespace,
Expand All @@ -280,6 +279,8 @@ func (oa *operatorActions) DeployOperatorOrDie(info *OperatorConfig) {
}

func (oa *operatorActions) CleanOperator(info *OperatorConfig) error {
glog.Infof("cleaning tidb-operator %s", info.ReleaseName)

err := oa.CleanWebHookAndService(info)
if err != nil {
return err
Expand Down Expand Up @@ -317,7 +318,7 @@ func (oa *operatorActions) UpgradeOperator(info *OperatorConfig) error {
}

func (oa *operatorActions) DeployTidbCluster(info *TidbClusterConfig) error {
glog.Infof("begin to deploy tidb cluster cluster[%s] namespace[%s]", info.ClusterName, info.Namespace)
glog.Infof("deploying tidb cluster [%s/%s]", info.Namespace, info.ClusterName)

namespace := &corev1.Namespace{
ObjectMeta: metav1.ObjectMeta{
Expand Down Expand Up @@ -393,10 +394,10 @@ func (oa *operatorActions) CleanTidbCluster(info *TidbClusterConfig) error {
return fmt.Errorf("failed to delete jobs: %v, %s", err, string(res))
}

patchPVCmd := fmt.Sprintf(`kubectl get pv -l %s=%s,%s=%s --output=name | xargs -I {} \
kubectl patch {} -p '{"spec":{"persistentVolumeReclaimPolicy":"Delete"}}'`,
label.NamespaceLabelKey, info.Namespace, label.InstanceLabelKey, info.ClusterName)
glog.V(4).Info(patchPVCmd)
patchPVCmd := fmt.Sprintf("kubectl get pv | grep %s | grep %s | awk '{print $1}' | "+
"xargs -I {} kubectl patch pv {} -p '{\"spec\":{\"persistentVolumeReclaimPolicy\":\"Delete\"}}'",
info.Namespace, info.ClusterName)
glog.Info(patchPVCmd)
if res, err := exec.Command("/bin/sh", "-c", patchPVCmd).CombinedOutput(); err != nil {
return fmt.Errorf("failed to patch pv: %v, %s", err, string(res))
}
Expand Down Expand Up @@ -431,11 +432,11 @@ func (oa *operatorActions) CleanTidbClusterOrDie(info *TidbClusterConfig) {
}

func (oa *operatorActions) CheckTidbClusterStatus(info *TidbClusterConfig) error {
glog.Infof("begin to check tidb cluster cluster[%s] namespace[%s]", info.ClusterName, info.Namespace)
glog.Infof("checking tidb cluster [%s/%s] status", info.Namespace, info.ClusterName)

ns := info.Namespace
tcName := info.ClusterName
if err := wait.PollImmediate(oa.pollInterval, DefaultPollTimeout, func() (bool, error) {
if err := wait.Poll(oa.pollInterval, DefaultPollTimeout, func() (bool, error) {
var tc *v1alpha1.TidbCluster
var err error
if tc, err = oa.cli.PingcapV1alpha1().TidbClusters(ns).Get(tcName, metav1.GetOptions{}); err != nil {
Expand All @@ -450,43 +451,43 @@ func (oa *operatorActions) CheckTidbClusterStatus(info *TidbClusterConfig) error
return false, nil
}

glog.Infof("check tidb cluster begin tidbMembersReadyFn")
glog.V(4).Infof("check tidb cluster begin tidbMembersReadyFn")
if b, err := oa.tidbMembersReadyFn(tc); !b && err == nil {
return false, nil
}

glog.Infof("check tidb cluster begin reclaimPolicySyncFn")
glog.V(4).Infof("check tidb cluster begin reclaimPolicySyncFn")
if b, err := oa.reclaimPolicySyncFn(tc); !b && err == nil {
return false, nil
}

glog.Infof("check tidb cluster begin metaSyncFn")
glog.V(4).Infof("check tidb cluster begin metaSyncFn")
if b, err := oa.metaSyncFn(tc); err != nil {
return false, err
} else if !b && err == nil {
return false, nil
}

glog.Infof("check tidb cluster begin schedulerHAFn")
glog.V(4).Infof("check tidb cluster begin schedulerHAFn")
if b, err := oa.schedulerHAFn(tc); !b && err == nil {
return false, nil
}

glog.Infof("check tidb cluster begin passwordIsSet")
glog.V(4).Infof("check tidb cluster begin passwordIsSet")
if b, err := oa.passwordIsSet(info); !b && err == nil {
return false, nil
}

if info.Monitor {
glog.Infof("check tidb monitor normal")
glog.V(4).Infof("check tidb monitor normal")
if b, err := oa.monitorNormal(info); !b && err == nil {
return false, nil
}
}
return true, nil
}); err != nil {
glog.Infof("check tidb cluster status failed: %s", err.Error())
return fmt.Errorf("failed to waiting for tidbcluster %s/%s ready in 10 minutes", ns, tcName)
return fmt.Errorf("failed to waiting for tidbcluster %s/%s ready in 30 minutes", ns, tcName)
}

return nil
Expand Down Expand Up @@ -1371,14 +1372,12 @@ func (oa *operatorActions) cloneOperatorRepo() error {
}

func (oa *operatorActions) checkoutTag(tagName string) error {
cmd := fmt.Sprintf(`cd %s &&
git stash -u &&
git checkout %s &&
mkdir -p %s &&
cp -rf charts/tidb-operator %s &&
cp -rf charts/tidb-cluster %s &&
cp -rf charts/tidb-backup %s`,
oa.cfg.OperatorRepoDir, tagName, filepath.Join(oa.cfg.ChartDir, tagName), oa.operatorChartPath(tagName), oa.tidbClusterChartPath(tagName), oa.backupChartPath(tagName))
cmd := fmt.Sprintf("cd %s && git stash -u && git checkout %s && "+
"mkdir -p %s && cp -rf charts/tidb-operator %s && "+
"cp -rf charts/tidb-cluster %s && cp -rf charts/tidb-backup %s",
oa.cfg.OperatorRepoDir, tagName,
filepath.Join(oa.cfg.ChartDir, tagName), oa.operatorChartPath(tagName),
oa.tidbClusterChartPath(tagName), oa.backupChartPath(tagName))
glog.Info(cmd)
res, err := exec.Command("/bin/sh", "-c", cmd).CombinedOutput()
if err != nil {
Expand All @@ -1392,7 +1391,7 @@ func (oa *operatorActions) DeployAdHocBackup(info *TidbClusterConfig) error {
glog.Infof("begin to deploy adhoc backup cluster[%s] namespace[%s]", info.ClusterName, info.Namespace)

sets := map[string]string{
"name": info.BackupPVC,
"name": info.BackupName,
"mode": "backup",
"user": "root",
"password": info.Password,
Expand All @@ -1416,7 +1415,7 @@ func (oa *operatorActions) DeployAdHocBackup(info *TidbClusterConfig) error {
func (oa *operatorActions) CheckAdHocBackup(info *TidbClusterConfig) error {
glog.Infof("begin to clean adhoc backup cluster[%s] namespace[%s]", info.ClusterName, info.Namespace)

jobName := fmt.Sprintf("%s-%s", info.ClusterName, info.BackupPVC)
jobName := fmt.Sprintf("%s-%s", info.ClusterName, info.BackupName)
fn := func() (bool, error) {
job, err := oa.kubeCli.BatchV1().Jobs(info.Namespace).Get(jobName, metav1.GetOptions{})
if err != nil {
Expand All @@ -1440,10 +1439,10 @@ func (oa *operatorActions) CheckAdHocBackup(info *TidbClusterConfig) error {
}

func (oa *operatorActions) Restore(from *TidbClusterConfig, to *TidbClusterConfig) error {
glog.Infof("begin to deploy restore cluster[%s] namespace[%s]", from.ClusterName, from.Namespace)
glog.Infof("deploying restore cluster[%s/%s]", from.Namespace, from.ClusterName)

sets := map[string]string{
"name": to.BackupPVC,
"name": to.BackupName,
"mode": "restore",
"user": "root",
"password": to.Password,
Expand All @@ -1466,7 +1465,7 @@ func (oa *operatorActions) Restore(from *TidbClusterConfig, to *TidbClusterConfi

func (oa *operatorActions) CheckRestore(from *TidbClusterConfig, to *TidbClusterConfig) error {
glog.Infof("begin to check restore backup cluster[%s] namespace[%s]", from.ClusterName, from.Namespace)
jobName := fmt.Sprintf("%s-restore-%s", to.ClusterName, from.BackupPVC)
jobName := fmt.Sprintf("%s-restore-%s", to.ClusterName, from.BackupName)
fn := func() (bool, error) {
job, err := oa.kubeCli.BatchV1().Jobs(to.Namespace).Get(jobName, metav1.GetOptions{})
if err != nil {
Expand Down Expand Up @@ -1610,8 +1609,28 @@ func (oa *operatorActions) DeployScheduledBackup(info *TidbClusterConfig) error
return nil
}

// disableScheduledBackup turns off the scheduled backup for the cluster by
// upgrading the tidb-cluster helm release with scheduledBackup.create=false.
// It returns an error if the helm upgrade command fails.
func (oa *operatorActions) disableScheduledBackup(info *TidbClusterConfig) error {
	glog.Infof("disabling scheduled backup for tidb cluster [%s/%s]", info.Namespace, info.ClusterName)

	sets := map[string]string{
		"clusterName":            info.ClusterName,
		"scheduledBackup.create": "false",
	}

	setString := info.TidbClusterHelmSetString(sets)

	cmd := fmt.Sprintf("helm upgrade %s %s --set-string %s",
		info.ClusterName, oa.tidbClusterChartPath(info.OperatorTag), setString)
	// Log the full command at verbose level for debugging, consistent with
	// the other helm/kubectl invocations in this file.
	glog.V(4).Info(cmd)

	if res, err := exec.Command("/bin/sh", "-c", cmd).CombinedOutput(); err != nil {
		// "scheduled backup" (not "scheduler backup") — matches the helm value name.
		return fmt.Errorf("failed to disable scheduled backup job: %v, %s", err, string(res))
	}
	return nil
}

func (oa *operatorActions) CheckScheduledBackup(info *TidbClusterConfig) error {
glog.Infof("begin to check scheduler backup cluster[%s] namespace[%s]", info.ClusterName, info.Namespace)
glog.Infof("checking scheduler backup for tidb cluster[%s/%s]", info.Namespace, info.ClusterName)

jobName := fmt.Sprintf("%s-scheduled-backup", info.ClusterName)
fn := func() (bool, error) {
Expand Down Expand Up @@ -1639,14 +1658,26 @@ func (oa *operatorActions) CheckScheduledBackup(info *TidbClusterConfig) error {
return false, nil
}

succededJobCount := 0
for _, j := range backupJobs {
if j.Status.Succeeded == 0 {
glog.Errorf("cluster [%s] back up job is not completed, please wait! ", info.ClusterName)
return false, nil
if j.Status.Failed > 0 {
return false, fmt.Errorf("cluster [%s/%s] scheduled backup job failed, job: [%s] failed count is: %d",
info.Namespace, info.ClusterName, j.Name, j.Status.Failed)
}
if j.Status.Succeeded > 0 {
succededJobCount++
}
}

return true, nil
if succededJobCount >= 3 {
glog.Infof("cluster [%s/%s] scheduled back up job completed count: %d",
info.Namespace, info.ClusterName, succededJobCount)
return true, nil
}

glog.Infof("cluster [%s/%s] scheduled back up job is not completed, please wait! ",
info.Namespace, info.ClusterName)
return false, nil
}

err := wait.Poll(oa.pollInterval, DefaultPollTimeout, fn)
Expand All @@ -1666,7 +1697,7 @@ func (oa *operatorActions) CheckScheduledBackup(info *TidbClusterConfig) error {
return fmt.Errorf("scheduler job failed!")
}

return nil
return oa.disableScheduledBackup(info)
}

func getParentUIDFromJob(j batchv1.Job) (types.UID, bool) {
Expand Down
4 changes: 2 additions & 2 deletions tests/cmd/e2e/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -76,7 +76,7 @@ func main() {
UserName: "root",
InitSecretName: fmt.Sprintf("%s-set-secret", name1),
BackupSecretName: fmt.Sprintf("%s-backup-secret", name1),
BackupPVC: "backup-pvc",
BackupName: "backup",
Resources: map[string]string{
"pd.resources.limits.cpu": "1000m",
"pd.resources.limits.memory": "2Gi",
Expand Down Expand Up @@ -107,7 +107,7 @@ func main() {
UserName: "root",
InitSecretName: fmt.Sprintf("%s-set-secret", name2),
BackupSecretName: fmt.Sprintf("%s-backup-secret", name2),
BackupPVC: "backup-pvc",
BackupName: "backup",
Resources: map[string]string{
"pd.resources.limits.cpu": "1000m",
"pd.resources.limits.memory": "2Gi",
Expand Down
4 changes: 2 additions & 2 deletions tests/cmd/stability/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -80,7 +80,7 @@ func main() {
UserName: "root",
InitSecretName: fmt.Sprintf("%s-set-secret", clusterName1),
BackupSecretName: fmt.Sprintf("%s-backup-secret", clusterName1),
BackupPVC: "backup-pvc",
BackupName: "backup",
Resources: map[string]string{
"pd.resources.limits.cpu": "1000m",
"pd.resources.limits.memory": "2Gi",
Expand Down Expand Up @@ -113,7 +113,7 @@ func main() {
UserName: "root",
InitSecretName: fmt.Sprintf("%s-set-secret", clusterName2),
BackupSecretName: fmt.Sprintf("%s-backup-secret", clusterName2),
BackupPVC: "backup-pvc",
BackupName: "backup",
Resources: map[string]string{
"pd.resources.limits.cpu": "1000m",
"pd.resources.limits.memory": "2Gi",
Expand Down
23 changes: 14 additions & 9 deletions tests/failover.go
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,6 @@ import (
"fmt"
"sort"
"strings"
"syscall"
"time"

_ "github.com/go-sql-driver/mysql"
Expand Down Expand Up @@ -90,7 +89,7 @@ func (oa *operatorActions) TruncateSSTFileThenCheckFailover(info *TidbClusterCon
}

// restart tikv to ensure sst files
err = tikvOps.KillProcess(info.Namespace, store.PodName, "tikv", 1, syscall.SIGTERM)
err = tikvOps.KillProcess(info.Namespace, store.PodName, "tikv", "tikv-server")
if err != nil {
glog.Errorf("kill tikv: pod=%s err=%s", store.PodName, err.Error())
return err
Expand Down Expand Up @@ -124,22 +123,28 @@ func (oa *operatorActions) TruncateSSTFileThenCheckFailover(info *TidbClusterCon
})

// make tikv crash
err = tikvOps.KillProcess(info.Namespace, store.PodName, "tikv", 1, syscall.SIGTERM)
if err != nil {
glog.Errorf("kill tikv: pod=%s err=%s", store.PodName, err.Error())
return err
}
//err = tikvOps.KillProcess(info.Namespace, store.PodName, "tikv", "tikv-server")
//if err != nil {
// glog.Errorf("cluster: [%s/%s] kill tikv: pod=%s err=%s",
// info.Namespace, info.ClusterName,
// store.PodName, err.Error())
// return err
//}

tikvOps.SetPoll(DefaultPollInterval, maxStoreDownTime+tikvFailoverPeriod+failoverTimeout)

return tikvOps.PollTiDBCluster(info.Namespace, info.ClusterName,
func(tc *v1alpha1.TidbCluster, err error) (bool, error) {
glog.Infof("check failure stores: current=%d origin=%d", len(tc.Status.TiKV.FailureStores), origFailures)
glog.Infof("cluster: [%s/%s] check failure stores: current=%d origin=%d",
info.Namespace, info.ClusterName,
len(tc.Status.TiKV.FailureStores), origFailures)
if len(tc.Status.TiKV.FailureStores) <= origFailures {
return false, nil
}
ups := countUpStores(tc)
glog.Infof("check up stores: current=%d origin=%d", ups, origUps)
glog.Infof("cluster: [%s/%s] check up stores: current=%d origin=%d",
info.Namespace, info.ClusterName,
ups, origUps)
if ups < origUps {
return false, nil
}
Expand Down
4 changes: 1 addition & 3 deletions tests/fault.go
Original file line number Diff line number Diff line change
Expand Up @@ -165,8 +165,6 @@ func (fa *faultTriggerActions) StartNode(physicalNode string, node string) error
return err
}

glog.Infof("%+v", vms)

for _, vm := range vms {
if vm.IP == node && vm.Status == "running" {
return nil
Expand Down Expand Up @@ -325,7 +323,7 @@ func (fa *faultTriggerActions) serviceAction(node string, serverName string, act
return err
}

glog.Infof("%s %s %s successfully", action, serverName, node)
glog.V(4).Infof("%s %s %s successfully", action, serverName, node)

return nil
}
Expand Down
4 changes: 2 additions & 2 deletions tests/pkg/blockwriter/blockwriter.go
Original file line number Diff line number Diff line change
Expand Up @@ -127,7 +127,7 @@ func (c *BlockWriterCase) generateQuery(ctx context.Context, queryChan chan []st
if len(queryChan) < queryChanSize {
queryChan <- querys
} else {
glog.Infof("[%s] [%s] [action: generate Query] query channel is full, sleep 10 seconds", c, c.ClusterName)
glog.V(4).Infof("[%s] [%s] [action: generate Query] query channel is full, sleep 10 seconds", c, c.ClusterName)
util.Sleep(ctx, 10*time.Second)
}
}
Expand Down Expand Up @@ -164,7 +164,7 @@ func (bw *blockWriter) run(ctx context.Context, db *sql.DB, queryChan chan []str
return
default:
if err := bw.batchExecute(db, query); err != nil {
glog.Error(err)
glog.V(4).Info(err)
time.Sleep(5 * time.Second)
continue
}
Expand Down
Loading

0 comments on commit d954936

Please sign in to comment.