Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

stability cases added: pd replicas 1 and stop tidb-operator #496

Merged
merged 19 commits into from
May 27, 2019
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -75,6 +75,7 @@ require (
github.com/prometheus/common v0.0.0-20180801064454-c7de2306084e // indirect
github.com/prometheus/procfs v0.0.0-20180725123919-05ee40e3a273 // indirect
github.com/renstrom/dedent v1.1.0 // indirect
github.com/robfig/cron v1.1.0 // indirect
github.com/russross/blackfriday v1.5.2+incompatible // indirect
github.com/shurcooL/sanitized_anchor_name v1.0.0 // indirect
github.com/sirupsen/logrus v1.0.6
Expand Down
2 changes: 2 additions & 0 deletions go.sum
Original file line number Diff line number Diff line change
Expand Up @@ -176,6 +176,8 @@ github.com/prometheus/common v0.0.0-20180801064454-c7de2306084e h1:n/3MEhJQjQxrO
github.com/prometheus/common v0.0.0-20180801064454-c7de2306084e/go.mod h1:daVV7qP5qjZbuso7PdcryaAu0sAZbrN9i7WWcTMWvro=
github.com/prometheus/procfs v0.0.0-20180725123919-05ee40e3a273 h1:agujYaXJSxSo18YNX3jzl+4G6Bstwt+kqv47GS12uL0=
github.com/prometheus/procfs v0.0.0-20180725123919-05ee40e3a273/go.mod h1:c3At6R/oaqEKCNdg8wHV1ftS6bRYblBhIjjI8uT2IGk=
github.com/robfig/cron v1.1.0 h1:jk4/Hud3TTdcrJgUOBgsqrZBarcxl6ADIjSC2iniwLY=
github.com/robfig/cron v1.1.0/go.mod h1:JGuDeoQd7Z6yL4zQhZ3OPEVHB7fL6Ka6skscFHfmt2k=
github.com/russross/blackfriday v1.5.2+incompatible h1:/YIL6L1Deczl4O/cQ7ZVdrdKwuB6y7EWpw9LkD8xofE=
github.com/russross/blackfriday v1.5.2+incompatible/go.mod h1:JO/DiYxRf+HjHt06OyowR9PTA263kcR/rfWxYHBV53g=
github.com/russross/blackfriday v2.0.0+incompatible h1:cBXrhZNUf9C+La9/YpS+UHpUT8YD6Td9ZMSU9APFcsk=
Expand Down
8 changes: 6 additions & 2 deletions tests/actions.go
Original file line number Diff line number Diff line change
Expand Up @@ -151,6 +151,7 @@ type OperatorActions interface {
CheckK8sAvailableOrDie(excludeNodes map[string]string, excludePods map[string]*corev1.Pod)
CheckOperatorAvailable(operatorConfig *OperatorConfig) error
CheckTidbClustersAvailable(infos []*TidbClusterConfig) error
CheckOperatorDownOrDie(infos []*TidbClusterConfig)
CheckTidbClustersAvailableOrDie(infos []*TidbClusterConfig)
CheckOneEtcdDownOrDie(operatorConfig *OperatorConfig, clusters []*TidbClusterConfig, faultNode string)
CheckOneApiserverDownOrDie(operatorConfig *OperatorConfig, clusters []*TidbClusterConfig, faultNode string)
Expand Down Expand Up @@ -333,7 +334,7 @@ func (oi *OperatorConfig) OperatorHelmSetString(m map[string]string) string {
"controllerManager.autoFailover": "true",
"scheduler.kubeSchedulerImageName": oi.SchedulerImage,
"controllerManager.logLevel": oi.LogLevel,
"scheduler.logLevel": "2",
"scheduler.logLevel": "4",
"controllerManager.replicas": "2",
"scheduler.replicas": "2",
"imagePullPolicy": string(oi.ImagePullPolicy),
Expand Down Expand Up @@ -2343,7 +2344,10 @@ func (oa *operatorActions) EmitEvent(info *TidbClusterConfig, message string) {
return
}

ce := oa.clusterEvents[info.String()]
ce, ok := oa.clusterEvents[info.String()]
if !ok {
return
}
ce.events = append(ce.events, ev)

// sleep a while to avoid overlapping time
Expand Down
3 changes: 3 additions & 0 deletions tests/backup.go
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,9 @@ import (
func (oa *operatorActions) BackupRestore(from, to *TidbClusterConfig) error {
oa.StopInsertDataTo(from)

// wait for the data insertion to stop completely
time.Sleep(1 * time.Minute)

err := oa.DeployAdHocBackup(from)
if err != nil {
glog.Errorf("cluster:[%s] deploy happen error: %v", from.ClusterName, err)
Expand Down
22 changes: 18 additions & 4 deletions tests/cmd/e2e/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -62,11 +62,10 @@ func main() {
if err != nil {
glog.Fatal(err)
}
// create database and table and insert a column for test backup and restore
initSQL := `"create database record;use record;create table test(t char(32))"`

name1 := "e2e-cluster1"
name2 := "e2e-cluster2"
name3 := "e2e-pd-replicas-1"
clusterInfos := []*tests.TidbClusterConfig{
{
Namespace: name1,
Expand All @@ -77,7 +76,6 @@ func main() {
TiDBImage: fmt.Sprintf("pingcap/tidb:%s", initTidbVersion),
StorageClassName: "local-storage",
Password: "admin",
InitSQL: initSQL,
UserName: "root",
InitSecretName: fmt.Sprintf("%s-set-secret", name1),
BackupSecretName: fmt.Sprintf("%s-backup-secret", name1),
Expand Down Expand Up @@ -120,7 +118,6 @@ func main() {
TiDBImage: fmt.Sprintf("pingcap/tidb:%s", initTidbVersion),
StorageClassName: "local-storage",
Password: "admin",
InitSQL: initSQL,
UserName: "root",
InitSecretName: fmt.Sprintf("%s-set-secret", name2),
BackupSecretName: fmt.Sprintf("%s-backup-secret", name2),
Expand Down Expand Up @@ -154,6 +151,23 @@ func main() {
TiDBTokenLimit: 1000,
PDLogLevel: "info",
},
{
shuijing198799 marked this conversation as resolved.
Show resolved Hide resolved
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why is the one-PD test different between the e2e test and the stability test? This test may only need to test deploying a TiDB cluster, so I think the e2e test does a lot of extra work.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

in order to cover this issue: #350

Namespace: name2,
ClusterName: name3,
OperatorTag: conf.OperatorTag,
PDImage: fmt.Sprintf("pingcap/pd:%s", initTidbVersion),
TiKVImage: fmt.Sprintf("pingcap/tikv:%s", initTidbVersion),
TiDBImage: fmt.Sprintf("pingcap/tidb:%s", initTidbVersion),
StorageClassName: "local-storage",
Password: "admin",
UserName: "root",
InitSecretName: fmt.Sprintf("%s-set-secret", name2),
BackupSecretName: fmt.Sprintf("%s-backup-secret", name2),
Resources: map[string]string{
"pd.replicas": "1",
"discovery.image": conf.OperatorImage,
},
},
}

defer func() {
Expand Down
69 changes: 51 additions & 18 deletions tests/cmd/stability/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -21,16 +21,18 @@ import (
"strconv"
"time"

"github.com/pingcap/tidb-operator/tests/slack"

"github.com/golang/glog"
"github.com/jinzhu/copier"
"github.com/pingcap/tidb-operator/tests"
"github.com/pingcap/tidb-operator/tests/pkg/client"
"github.com/pingcap/tidb-operator/tests/slack"
"github.com/robfig/cron"
"k8s.io/apimachinery/pkg/util/wait"
"k8s.io/apiserver/pkg/util/logs"
)

var successCount int

func main() {
logs.InitLogs()
defer logs.FlushLogs()
Expand All @@ -40,13 +42,9 @@ func main() {

conf := tests.ParseConfigOrDie()
cli, kubeCli := client.NewCliOrDie()
fta := tests.NewFaultTriggerAction(cli, kubeCli, conf)
fta.CheckAndRecoverEnvOrDie()

tidbVersion := conf.GetTiDBVersionOrDie()
upgardeTiDBVersions := conf.GetUpgradeTidbVersionsOrDie()

// operator config
operatorCfg := &tests.OperatorConfig{
Namespace: "pingcap",
ReleaseName: "operator",
Expand All @@ -60,11 +58,6 @@ func main() {
ImagePullPolicy: v1.PullAlways,
}

// TODO remove this
// create database and table and insert a column for test backup and restore
initSQL := `"create database record;use record;create table test(t char(32))"`

// two clusters in different namespaces
clusterName1 := "stability-cluster1"
clusterName2 := "stability-cluster2"
cluster1 := &tests.TidbClusterConfig{
Expand All @@ -76,7 +69,6 @@ func main() {
TiDBImage: fmt.Sprintf("pingcap/tidb:%s", tidbVersion),
StorageClassName: "local-storage",
Password: "admin",
InitSQL: initSQL,
UserName: "root",
InitSecretName: fmt.Sprintf("%s-set-secret", clusterName1),
BackupSecretName: fmt.Sprintf("%s-backup-secret", clusterName1),
Expand Down Expand Up @@ -118,7 +110,6 @@ func main() {
TiDBImage: fmt.Sprintf("pingcap/tidb:%s", tidbVersion),
StorageClassName: "local-storage",
Password: "admin",
InitSQL: initSQL,
UserName: "root",
InitSecretName: fmt.Sprintf("%s-set-secret", clusterName2),
BackupSecretName: fmt.Sprintf("%s-backup-secret", clusterName2),
Expand Down Expand Up @@ -156,18 +147,48 @@ func main() {
copier.Copy(clusterRestoreTo, clusterBackupFrom)
clusterRestoreTo.ClusterName = "cluster-restore"

onePDCluster := &tests.TidbClusterConfig{}
copier.Copy(onePDCluster, cluster1)
onePDCluster.ClusterName = "pd-replicas-1"
onePDCluster.Namespace = "pd-replicas-1"
onePDCluster.Resources["pd.replicas"] = "1"

allClusters := []*tests.TidbClusterConfig{cluster1, cluster2, clusterRestoreTo}

fta := tests.NewFaultTriggerAction(cli, kubeCli, conf)
oa := tests.NewOperatorActions(cli, kubeCli, tests.DefaultPollInterval, conf, allClusters)

fta.CheckAndRecoverEnvOrDie()
oa.CheckK8sAvailableOrDie(nil, nil)
go wait.Forever(oa.EventWorker, 10*time.Second)
// start an HTTP server in a goroutine
go oa.StartValidatingAdmissionWebhookServerOrDie(operatorCfg)

defer func() {
oa.DumpAllLogs(operatorCfg, allClusters)
}()
c := cron.New()
c.AddFunc("0 0 10 * * *", func() {
slack.NotifyAndCompletedf("Succeed %d times in the past 24 hours.", successCount)
successCount = 0
})
go c.Start()

fn := func() {
run(oa, fta, conf, operatorCfg, allClusters, cluster1, cluster2,
onePDCluster, upgardeTiDBVersions, clusterRestoreTo, clusterBackupFrom)
}
wait.Forever(fn, 5*time.Minute)
}

func run(oa tests.OperatorActions,
fta tests.FaultTriggerActions,
conf *tests.Config,
operatorCfg *tests.OperatorConfig,
allClusters []*tests.TidbClusterConfig,
cluster1 *tests.TidbClusterConfig,
cluster2 *tests.TidbClusterConfig,
onePDCluster *tests.TidbClusterConfig,
upgardeTiDBVersions []string,
clusterRestoreTo *tests.TidbClusterConfig,
clusterBackupFrom *tests.TidbClusterConfig,
) {
// clean and deploy operator
oa.CleanOperatorOrDie(operatorCfg)
oa.DeployOperatorOrDie(operatorCfg)
Expand All @@ -176,15 +197,22 @@ func main() {
for _, cluster := range allClusters {
oa.CleanTidbClusterOrDie(cluster)
}
oa.CleanTidbClusterOrDie(onePDCluster)

// deploy and check cluster1, cluster2 and onePDCluster
oa.DeployTidbClusterOrDie(cluster1)
oa.DeployTidbClusterOrDie(cluster2)
oa.DeployTidbClusterOrDie(onePDCluster)
oa.CheckTidbClusterStatusOrDie(cluster1)
oa.CheckTidbClusterStatusOrDie(cluster2)
oa.CheckTidbClusterStatusOrDie(onePDCluster)

oa.CleanTidbClusterOrDie(onePDCluster)

go oa.BeginInsertDataToOrDie(cluster1)
go oa.BeginInsertDataToOrDie(cluster2)
defer oa.StopInsertDataTo(cluster1)
defer oa.StopInsertDataTo(cluster2)

// scale out cluster1 and cluster2
cluster1.ScaleTiDB(3).ScaleTiKV(5).ScalePD(5)
Expand Down Expand Up @@ -252,6 +280,10 @@ func main() {
// backup and restore
oa.BackupRestoreOrDie(clusterBackupFrom, clusterRestoreTo)

oa.CleanOperatorOrDie(operatorCfg)
oa.CheckOperatorDownOrDie(allClusters)
oa.DeployOperatorOrDie(operatorCfg)

// stop a node and failover automatically
physicalNode, node, faultTime := fta.StopNodeOrDie()
oa.EmitEvent(nil, fmt.Sprintf("StopNode: %s on %s", node, physicalNode))
Expand Down Expand Up @@ -283,5 +315,6 @@ func main() {
glog.Errorf("failed to clean temp dirs, this error can be ignored.")
}

slack.NotifyAndCompleted("\nFinished.")
successCount++
glog.Infof("################## Stability test finished at: %v\n\n\n\n", time.Now().Format(time.RFC3339))
}
13 changes: 13 additions & 0 deletions tests/failover.go
Original file line number Diff line number Diff line change
Expand Up @@ -508,6 +508,19 @@ func (oa *operatorActions) CheckOneApiserverDownOrDie(operatorConfig *OperatorCo
})
}

// CheckOperatorDownOrDie verifies that Kubernetes and the given TiDB clusters
// stay available while the operator is down.
//
// It logs the start of the check and then hands KeepOrDie a probe together
// with a 3-second and a 10-minute duration. The probe calls
// CheckK8sAvailable(nil, nil) first and, only if that succeeds, returns the
// result of CheckTidbClustersAvailable(clusters).
//
// NOTE(review): presumably KeepOrDie re-runs the probe every 3 seconds for
// 10 minutes and aborts on the first error — confirm against KeepOrDie.
func (oa *operatorActions) CheckOperatorDownOrDie(clusters []*TidbClusterConfig) {
	glog.Infof("checking k8s/tidbCluster status when operator down")

	// probe reports the first availability problem it finds, k8s first.
	probe := func() error {
		if k8sErr := oa.CheckK8sAvailable(nil, nil); k8sErr != nil {
			return k8sErr
		}
		return oa.CheckTidbClustersAvailable(clusters)
	}

	KeepOrDie(3*time.Second, 10*time.Minute, probe)
}

func (oa *operatorActions) CheckK8sAvailableOrDie(excludeNodes map[string]string, excludePods map[string]*corev1.Pod) {
if err := oa.CheckK8sAvailable(excludeNodes, excludePods); err != nil {
slack.NotifyAndPanic(err)
Expand Down
17 changes: 5 additions & 12 deletions tests/manifests/stability/stability.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -45,18 +45,11 @@ spec:
image: ""
imagePullPolicy: Always
command:
- /bin/sh
- -c
- |
set -e
for i in `seq 1 100`
do
echo "######################## $i begin ########################"
/usr/local/bin/stability-test \
--config=/etc/tidb-operator-stability/config.yaml \
--slack-webhook-url=""
echo "######################## $i end ##########################"
done
- /usr/local/bin/stability-test
- --config=/etc/tidb-operator-stability/config.yaml
- --operator-image=pingcap/tidb-operator:v1.0.0-beta.2
- --operator-tag=v1.0.0-beta.2
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

You can define image-repo and image-tag to avoid passing the same version twice.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

the operator-tag is used by git clone to check out the source code.

For example, the image may be pingcap/tidb-operator:latest while the operator-tag is master.

- --slack-webhook-url=""
volumeMounts:
- mountPath: /logDir
name: logdir
Expand Down
1 change: 1 addition & 0 deletions tests/pkg/blockwriter/blockwriter.go
Original file line number Diff line number Diff line change
Expand Up @@ -143,6 +143,7 @@ func (bw *blockWriter) batchExecute(db *sql.DB, query string) error {
}

func (bw *blockWriter) run(ctx context.Context, db *sql.DB, queryChan chan []string) {
defer glog.Infof("run stopped")
for {
select {
case <-ctx.Done():
Expand Down
2 changes: 1 addition & 1 deletion tests/pkg/webhook/pods.go
Original file line number Diff line number Diff line change
Expand Up @@ -138,7 +138,7 @@ func admitPods(ar v1beta1.AdmissionReview) *v1beta1.AdmissionResponse {
return &reviewResponse
}

if leader.Name == name && tc.Status.TiDB.StatefulSet.Replicas > 1 {
if leader.Name == name && tc.Status.PD.StatefulSet.Replicas > 1 {
time.Sleep(10 * time.Second)
err := fmt.Errorf("pd is leader, can't be deleted namespace %s name %s", namespace, name)
glog.Error(err)
Expand Down
5 changes: 3 additions & 2 deletions tests/slack/slack.go
Original file line number Diff line number Diff line change
Expand Up @@ -59,7 +59,7 @@ func (attachment *Attachment) AddField(field Field) *Attachment {

func Send(webhookURL string, proxy string, payload Payload) error {
if webhookURL == "" {
return fmt.Errorf("the webhookURL have not set")
return nil
}
body, err := json.Marshal(payload)
if err != nil {
Expand Down Expand Up @@ -154,7 +154,8 @@ func NotifyAndPanic(err error) {
panic(err)
}

func NotifyAndCompleted(msg string) {
func NotifyAndCompletedf(format string, args ...interface{}) {
msg := fmt.Sprintf(format, args...)
sendErr := SendGoodMsg(msg)
if sendErr != nil {
glog.Warningf("failed to notify slack[%s] the massage: %s,error: %v", WebhookURL, msg, sendErr)
Expand Down