Skip to content

Commit

Permalink
stability cases added: pd replicas 1 and stop tidb-operator (#496)
Browse files Browse the repository at this point in the history
  • Loading branch information
weekface authored May 27, 2019
1 parent 375304b commit a875bf5
Show file tree
Hide file tree
Showing 11 changed files with 104 additions and 39 deletions.
1 change: 1 addition & 0 deletions go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -75,6 +75,7 @@ require (
github.com/prometheus/common v0.0.0-20180801064454-c7de2306084e // indirect
github.com/prometheus/procfs v0.0.0-20180725123919-05ee40e3a273 // indirect
github.com/renstrom/dedent v1.1.0 // indirect
github.com/robfig/cron v1.1.0 // indirect
github.com/russross/blackfriday v1.5.2+incompatible // indirect
github.com/shurcooL/sanitized_anchor_name v1.0.0 // indirect
github.com/sirupsen/logrus v1.0.6
Expand Down
2 changes: 2 additions & 0 deletions go.sum
Original file line number Diff line number Diff line change
Expand Up @@ -176,6 +176,8 @@ github.com/prometheus/common v0.0.0-20180801064454-c7de2306084e h1:n/3MEhJQjQxrO
github.com/prometheus/common v0.0.0-20180801064454-c7de2306084e/go.mod h1:daVV7qP5qjZbuso7PdcryaAu0sAZbrN9i7WWcTMWvro=
github.com/prometheus/procfs v0.0.0-20180725123919-05ee40e3a273 h1:agujYaXJSxSo18YNX3jzl+4G6Bstwt+kqv47GS12uL0=
github.com/prometheus/procfs v0.0.0-20180725123919-05ee40e3a273/go.mod h1:c3At6R/oaqEKCNdg8wHV1ftS6bRYblBhIjjI8uT2IGk=
github.com/robfig/cron v1.1.0 h1:jk4/Hud3TTdcrJgUOBgsqrZBarcxl6ADIjSC2iniwLY=
github.com/robfig/cron v1.1.0/go.mod h1:JGuDeoQd7Z6yL4zQhZ3OPEVHB7fL6Ka6skscFHfmt2k=
github.com/russross/blackfriday v1.5.2+incompatible h1:/YIL6L1Deczl4O/cQ7ZVdrdKwuB6y7EWpw9LkD8xofE=
github.com/russross/blackfriday v1.5.2+incompatible/go.mod h1:JO/DiYxRf+HjHt06OyowR9PTA263kcR/rfWxYHBV53g=
github.com/russross/blackfriday v2.0.0+incompatible h1:cBXrhZNUf9C+La9/YpS+UHpUT8YD6Td9ZMSU9APFcsk=
Expand Down
8 changes: 6 additions & 2 deletions tests/actions.go
Original file line number Diff line number Diff line change
Expand Up @@ -151,6 +151,7 @@ type OperatorActions interface {
CheckK8sAvailableOrDie(excludeNodes map[string]string, excludePods map[string]*corev1.Pod)
CheckOperatorAvailable(operatorConfig *OperatorConfig) error
CheckTidbClustersAvailable(infos []*TidbClusterConfig) error
CheckOperatorDownOrDie(infos []*TidbClusterConfig)
CheckTidbClustersAvailableOrDie(infos []*TidbClusterConfig)
CheckOneEtcdDownOrDie(operatorConfig *OperatorConfig, clusters []*TidbClusterConfig, faultNode string)
CheckOneApiserverDownOrDie(operatorConfig *OperatorConfig, clusters []*TidbClusterConfig, faultNode string)
Expand Down Expand Up @@ -333,7 +334,7 @@ func (oi *OperatorConfig) OperatorHelmSetString(m map[string]string) string {
"controllerManager.autoFailover": "true",
"scheduler.kubeSchedulerImageName": oi.SchedulerImage,
"controllerManager.logLevel": oi.LogLevel,
"scheduler.logLevel": "2",
"scheduler.logLevel": "4",
"controllerManager.replicas": "2",
"scheduler.replicas": "2",
"imagePullPolicy": string(oi.ImagePullPolicy),
Expand Down Expand Up @@ -2343,7 +2344,10 @@ func (oa *operatorActions) EmitEvent(info *TidbClusterConfig, message string) {
return
}

ce := oa.clusterEvents[info.String()]
ce, ok := oa.clusterEvents[info.String()]
if !ok {
return
}
ce.events = append(ce.events, ev)

// sleep a while to avoid overlapping time
Expand Down
3 changes: 3 additions & 0 deletions tests/backup.go
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,9 @@ import (
func (oa *operatorActions) BackupRestore(from, to *TidbClusterConfig) error {
oa.StopInsertDataTo(from)

// wait for the data insertion to stop completely
time.Sleep(1 * time.Minute)

err := oa.DeployAdHocBackup(from)
if err != nil {
glog.Errorf("cluster:[%s] deploy happen error: %v", from.ClusterName, err)
Expand Down
22 changes: 18 additions & 4 deletions tests/cmd/e2e/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -62,11 +62,10 @@ func main() {
if err != nil {
glog.Fatal(err)
}
// create database and table and insert a column for test backup and restore
initSQL := `"create database record;use record;create table test(t char(32))"`

name1 := "e2e-cluster1"
name2 := "e2e-cluster2"
name3 := "e2e-pd-replicas-1"
clusterInfos := []*tests.TidbClusterConfig{
{
Namespace: name1,
Expand All @@ -77,7 +76,6 @@ func main() {
TiDBImage: fmt.Sprintf("pingcap/tidb:%s", initTidbVersion),
StorageClassName: "local-storage",
Password: "admin",
InitSQL: initSQL,
UserName: "root",
InitSecretName: fmt.Sprintf("%s-set-secret", name1),
BackupSecretName: fmt.Sprintf("%s-backup-secret", name1),
Expand Down Expand Up @@ -120,7 +118,6 @@ func main() {
TiDBImage: fmt.Sprintf("pingcap/tidb:%s", initTidbVersion),
StorageClassName: "local-storage",
Password: "admin",
InitSQL: initSQL,
UserName: "root",
InitSecretName: fmt.Sprintf("%s-set-secret", name2),
BackupSecretName: fmt.Sprintf("%s-backup-secret", name2),
Expand Down Expand Up @@ -154,6 +151,23 @@ func main() {
TiDBTokenLimit: 1000,
PDLogLevel: "info",
},
{
Namespace: name2,
ClusterName: name3,
OperatorTag: conf.OperatorTag,
PDImage: fmt.Sprintf("pingcap/pd:%s", initTidbVersion),
TiKVImage: fmt.Sprintf("pingcap/tikv:%s", initTidbVersion),
TiDBImage: fmt.Sprintf("pingcap/tidb:%s", initTidbVersion),
StorageClassName: "local-storage",
Password: "admin",
UserName: "root",
InitSecretName: fmt.Sprintf("%s-set-secret", name2),
BackupSecretName: fmt.Sprintf("%s-backup-secret", name2),
Resources: map[string]string{
"pd.replicas": "1",
"discovery.image": conf.OperatorImage,
},
},
}

defer func() {
Expand Down
69 changes: 51 additions & 18 deletions tests/cmd/stability/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -21,16 +21,18 @@ import (
"strconv"
"time"

"github.com/pingcap/tidb-operator/tests/slack"

"github.com/golang/glog"
"github.com/jinzhu/copier"
"github.com/pingcap/tidb-operator/tests"
"github.com/pingcap/tidb-operator/tests/pkg/client"
"github.com/pingcap/tidb-operator/tests/slack"
"github.com/robfig/cron"
"k8s.io/apimachinery/pkg/util/wait"
"k8s.io/apiserver/pkg/util/logs"
)

var successCount int

func main() {
logs.InitLogs()
defer logs.FlushLogs()
Expand All @@ -40,13 +42,9 @@ func main() {

conf := tests.ParseConfigOrDie()
cli, kubeCli := client.NewCliOrDie()
fta := tests.NewFaultTriggerAction(cli, kubeCli, conf)
fta.CheckAndRecoverEnvOrDie()

tidbVersion := conf.GetTiDBVersionOrDie()
upgardeTiDBVersions := conf.GetUpgradeTidbVersionsOrDie()

// operator config
operatorCfg := &tests.OperatorConfig{
Namespace: "pingcap",
ReleaseName: "operator",
Expand All @@ -60,11 +58,6 @@ func main() {
ImagePullPolicy: v1.PullAlways,
}

// TODO remove this
// create database and table and insert a column for test backup and restore
initSQL := `"create database record;use record;create table test(t char(32))"`

// two clusters in different namespaces
clusterName1 := "stability-cluster1"
clusterName2 := "stability-cluster2"
cluster1 := &tests.TidbClusterConfig{
Expand All @@ -76,7 +69,6 @@ func main() {
TiDBImage: fmt.Sprintf("pingcap/tidb:%s", tidbVersion),
StorageClassName: "local-storage",
Password: "admin",
InitSQL: initSQL,
UserName: "root",
InitSecretName: fmt.Sprintf("%s-set-secret", clusterName1),
BackupSecretName: fmt.Sprintf("%s-backup-secret", clusterName1),
Expand Down Expand Up @@ -118,7 +110,6 @@ func main() {
TiDBImage: fmt.Sprintf("pingcap/tidb:%s", tidbVersion),
StorageClassName: "local-storage",
Password: "admin",
InitSQL: initSQL,
UserName: "root",
InitSecretName: fmt.Sprintf("%s-set-secret", clusterName2),
BackupSecretName: fmt.Sprintf("%s-backup-secret", clusterName2),
Expand Down Expand Up @@ -156,18 +147,48 @@ func main() {
copier.Copy(clusterRestoreTo, clusterBackupFrom)
clusterRestoreTo.ClusterName = "cluster-restore"

onePDCluster := &tests.TidbClusterConfig{}
copier.Copy(onePDCluster, cluster1)
onePDCluster.ClusterName = "pd-replicas-1"
onePDCluster.Namespace = "pd-replicas-1"
onePDCluster.Resources["pd.replicas"] = "1"

allClusters := []*tests.TidbClusterConfig{cluster1, cluster2, clusterRestoreTo}

fta := tests.NewFaultTriggerAction(cli, kubeCli, conf)
oa := tests.NewOperatorActions(cli, kubeCli, tests.DefaultPollInterval, conf, allClusters)

fta.CheckAndRecoverEnvOrDie()
oa.CheckK8sAvailableOrDie(nil, nil)
go wait.Forever(oa.EventWorker, 10*time.Second)
// start an HTTP server in a goroutine
go oa.StartValidatingAdmissionWebhookServerOrDie(operatorCfg)

defer func() {
oa.DumpAllLogs(operatorCfg, allClusters)
}()
c := cron.New()
c.AddFunc("0 0 10 * * *", func() {
slack.NotifyAndCompletedf("Succeed %d times in the past 24 hours.", successCount)
successCount = 0
})
go c.Start()

fn := func() {
run(oa, fta, conf, operatorCfg, allClusters, cluster1, cluster2,
onePDCluster, upgardeTiDBVersions, clusterRestoreTo, clusterBackupFrom)
}
wait.Forever(fn, 5*time.Minute)
}

func run(oa tests.OperatorActions,
fta tests.FaultTriggerActions,
conf *tests.Config,
operatorCfg *tests.OperatorConfig,
allClusters []*tests.TidbClusterConfig,
cluster1 *tests.TidbClusterConfig,
cluster2 *tests.TidbClusterConfig,
onePDCluster *tests.TidbClusterConfig,
upgardeTiDBVersions []string,
clusterRestoreTo *tests.TidbClusterConfig,
clusterBackupFrom *tests.TidbClusterConfig,
) {
// clean and deploy operator
oa.CleanOperatorOrDie(operatorCfg)
oa.DeployOperatorOrDie(operatorCfg)
Expand All @@ -176,15 +197,22 @@ func main() {
for _, cluster := range allClusters {
oa.CleanTidbClusterOrDie(cluster)
}
oa.CleanTidbClusterOrDie(onePDCluster)

// deploy and check cluster1, cluster2
oa.DeployTidbClusterOrDie(cluster1)
oa.DeployTidbClusterOrDie(cluster2)
oa.DeployTidbClusterOrDie(onePDCluster)
oa.CheckTidbClusterStatusOrDie(cluster1)
oa.CheckTidbClusterStatusOrDie(cluster2)
oa.CheckTidbClusterStatusOrDie(onePDCluster)

oa.CleanTidbClusterOrDie(onePDCluster)

go oa.BeginInsertDataToOrDie(cluster1)
go oa.BeginInsertDataToOrDie(cluster2)
defer oa.StopInsertDataTo(cluster1)
defer oa.StopInsertDataTo(cluster2)

// scale out cluster1 and cluster2
cluster1.ScaleTiDB(3).ScaleTiKV(5).ScalePD(5)
Expand Down Expand Up @@ -252,6 +280,10 @@ func main() {
// backup and restore
oa.BackupRestoreOrDie(clusterBackupFrom, clusterRestoreTo)

oa.CleanOperatorOrDie(operatorCfg)
oa.CheckOperatorDownOrDie(allClusters)
oa.DeployOperatorOrDie(operatorCfg)

// stop a node and failover automatically
physicalNode, node, faultTime := fta.StopNodeOrDie()
oa.EmitEvent(nil, fmt.Sprintf("StopNode: %s on %s", node, physicalNode))
Expand Down Expand Up @@ -283,5 +315,6 @@ func main() {
glog.Errorf("failed to clean temp dirs, this error can be ignored.")
}

slack.NotifyAndCompleted("\nFinished.")
successCount++
glog.Infof("################## Stability test finished at: %v\n\n\n\n", time.Now().Format(time.RFC3339))
}
13 changes: 13 additions & 0 deletions tests/failover.go
Original file line number Diff line number Diff line change
Expand Up @@ -508,6 +508,19 @@ func (oa *operatorActions) CheckOneApiserverDownOrDie(operatorConfig *OperatorCo
})
}

// CheckOperatorDownOrDie checks that Kubernetes and the given TiDB clusters
// are still available while the operator is down.
//
// The check is handed to KeepOrDie with a 3-second and a 10-minute duration.
// NOTE(review): presumably KeepOrDie re-runs the check every 3s for 10 minutes
// and aborts on the first failure; confirm against its definition.
func (oa *operatorActions) CheckOperatorDownOrDie(clusters []*TidbClusterConfig) {
	glog.Infof("checking k8s/tidbCluster status when operator down")

	// One round of verification: Kubernetes first, then the clusters.
	checkOnce := func() error {
		if err := oa.CheckK8sAvailable(nil, nil); err != nil {
			return err
		}
		return oa.CheckTidbClustersAvailable(clusters)
	}

	KeepOrDie(3*time.Second, 10*time.Minute, checkOnce)
}

func (oa *operatorActions) CheckK8sAvailableOrDie(excludeNodes map[string]string, excludePods map[string]*corev1.Pod) {
if err := oa.CheckK8sAvailable(excludeNodes, excludePods); err != nil {
slack.NotifyAndPanic(err)
Expand Down
17 changes: 5 additions & 12 deletions tests/manifests/stability/stability.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -45,18 +45,11 @@ spec:
image: ""
imagePullPolicy: Always
command:
- /bin/sh
- -c
- |
set -e
for i in `seq 1 100`
do
echo "######################## $i begin ########################"
/usr/local/bin/stability-test \
--config=/etc/tidb-operator-stability/config.yaml \
--slack-webhook-url=""
echo "######################## $i end ##########################"
done
- /usr/local/bin/stability-test
- --config=/etc/tidb-operator-stability/config.yaml
- --operator-image=pingcap/tidb-operator:v1.0.0-beta.2
- --operator-tag=v1.0.0-beta.2
- --slack-webhook-url=""
volumeMounts:
- mountPath: /logDir
name: logdir
Expand Down
1 change: 1 addition & 0 deletions tests/pkg/blockwriter/blockwriter.go
Original file line number Diff line number Diff line change
Expand Up @@ -143,6 +143,7 @@ func (bw *blockWriter) batchExecute(db *sql.DB, query string) error {
}

func (bw *blockWriter) run(ctx context.Context, db *sql.DB, queryChan chan []string) {
defer glog.Infof("run stopped")
for {
select {
case <-ctx.Done():
Expand Down
2 changes: 1 addition & 1 deletion tests/pkg/webhook/pods.go
Original file line number Diff line number Diff line change
Expand Up @@ -138,7 +138,7 @@ func admitPods(ar v1beta1.AdmissionReview) *v1beta1.AdmissionResponse {
return &reviewResponse
}

if leader.Name == name && tc.Status.TiDB.StatefulSet.Replicas > 1 {
if leader.Name == name && tc.Status.PD.StatefulSet.Replicas > 1 {
time.Sleep(10 * time.Second)
err := fmt.Errorf("pd is leader, can't be deleted namespace %s name %s", namespace, name)
glog.Error(err)
Expand Down
5 changes: 3 additions & 2 deletions tests/slack/slack.go
Original file line number Diff line number Diff line change
Expand Up @@ -59,7 +59,7 @@ func (attachment *Attachment) AddField(field Field) *Attachment {

func Send(webhookURL string, proxy string, payload Payload) error {
if webhookURL == "" {
return fmt.Errorf("the webhookURL have not set")
return nil
}
body, err := json.Marshal(payload)
if err != nil {
Expand Down Expand Up @@ -154,7 +154,8 @@ func NotifyAndPanic(err error) {
panic(err)
}

func NotifyAndCompleted(msg string) {
func NotifyAndCompletedf(format string, args ...interface{}) {
msg := fmt.Sprintf(format, args...)
sendErr := SendGoodMsg(msg)
if sendErr != nil {
glog.Warningf("failed to notify slack[%s] the massage: %s,error: %v", WebhookURL, msg, sendErr)
Expand Down

0 comments on commit a875bf5

Please sign in to comment.