
test: ensure pods unaffected when upgrading #955

Merged (14 commits, Oct 16, 2019)
103 changes: 84 additions & 19 deletions tests/actions.go
@@ -26,6 +26,7 @@ import (
"os/exec"
"path/filepath"
"reflect"
"sort"
"strconv"
"strings"
"sync"
@@ -457,6 +458,15 @@ func (oa *operatorActions) CleanOperatorOrDie(info *OperatorConfig) {

func (oa *operatorActions) UpgradeOperator(info *OperatorConfig) error {
glog.Infof("upgrading tidb-operator %s", info.ReleaseName)

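// Record the pods that exist before the upgrade; they are compared with a
// second snapshot afterwards to ensure the upgrade recreated none of them.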
listOptions := metav1.ListOptions{
LabelSelector: labels.SelectorFromSet(
label.New().Labels()).String(),
}
pods1, err := oa.kubeCli.CoreV1().Pods(metav1.NamespaceAll).List(listOptions)
if err != nil {
return err
}
if err := oa.checkoutTag(info.Tag); err != nil {
return err
}
@@ -469,7 +479,42 @@ func (oa *operatorActions) UpgradeOperator(info *OperatorConfig) error {
if err != nil {
return fmt.Errorf("failed to upgrade operator to: %s, %v, %s", info.Image, err, string(res))
}
return nil

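// Wait long enough for any unexpected pod rebuild triggered by the operator
// upgrade to become visible before taking the second pod snapshot.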
time.Sleep(5 * time.Minute)
Contributor:
check ready status? that will be more efficient

Contributor Author:
The main purpose here is to verify that pods are not rebuilt after the operator is updated. The operator itself may become ready again quickly, but rebuilding all the pods could take a while, and that duration is hard to predict, so I just use a sleep here.

Contributor Author:
I will change it to wait.Poll
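
For illustration, a minimal sketch of such a wait.Poll variant, assuming the operator runs as a Deployment named tidb-controller-manager in info.Namespace (both names are assumptions here, and as noted above, operator readiness alone does not bound how long a rolling pod rebuild could take):

err = wait.Poll(5*time.Second, 5*time.Minute, func() (bool, error) {
	// Hypothetical: poll the operator Deployment until all replicas are ready.
	deploy, err := oa.kubeCli.AppsV1().Deployments(info.Namespace).Get(
		"tidb-controller-manager", metav1.GetOptions{})
	if err != nil {
		return false, nil // tolerate transient API errors and keep polling
	}
	if deploy.Spec.Replicas == nil {
		return false, nil
	}
	return deploy.Status.ReadyReplicas == *deploy.Spec.Replicas, nil
})
if err != nil {
	return err
}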

pods2, err := oa.kubeCli.CoreV1().Pods(metav1.NamespaceAll).List(listOptions)
if err != nil {
return err
}

return ensurePodsUnchanged(pods1, pods2)
}

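// ensurePodsUnchanged compares the pod UID sets captured before and after the
// operator upgrade; a pod that had been deleted and recreated would carry a new UID.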
func ensurePodsUnchanged(pods1, pods2 *corev1.PodList) error {
pods1UIDs := getUIDs(pods1)
pods2UIDs := getUIDs(pods2)
if reflect.DeepEqual(pods1UIDs, pods2UIDs) {
glog.V(4).Infof("%#v", pods1)
glog.V(4).Infof("%#v", pods2)
glog.V(4).Infof("%v, %v", pods1UIDs, pods2UIDs)
glog.V(4).Infof("pods unchanged after operator upgraded")
return nil
}

glog.Infof("%#v", pods1)
glog.Infof("%#v", pods2)
glog.Infof("%v, %v", pods1UIDs, pods2UIDs)
return fmt.Errorf("some pods changed after operator upgraded")
}

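// getUIDs returns the UIDs of all pods in the list, sorted so that two
// snapshots can be compared with reflect.DeepEqual.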
func getUIDs(pods *corev1.PodList) []string {
arr := make([]string, 0, len(pods.Items))

for _, pod := range pods.Items {
arr = append(arr, string(pod.UID))
}

sort.Strings(arr)
return arr
}

func (oa *operatorActions) UpgradeOperatorOrDie(info *OperatorConfig) {
@@ -570,6 +615,8 @@ func (oa *operatorActions) CleanTidbCluster(info *TidbClusterConfig) error {
var beforePVNames []string
for _, pv := range pvList.Items {
beforePVNames = append(beforePVNames, pv.GetName())
glog.V(4).Infof("%s, %s, %v", pv.Name, pv.Spec.PersistentVolumeReclaimPolicy, pv.Labels)
glog.V(4).Info(pv.Spec.ClaimRef)
}
glog.V(4).Info(beforePVNames)

@@ -601,24 +648,34 @@ func (oa *operatorActions) CleanTidbCluster(info *TidbClusterConfig) error {
afterPVCNames = append(afterPVCNames, pvc.GetName())
}
glog.V(4).Info(afterPVCNames)

pvList, err = oa.kubeCli.CoreV1().PersistentVolumes().List(metav1.ListOptions{LabelSelector: selector.String()})
if err != nil {
return err
}
var afterPVNames []string
for _, pv := range pvList.Items {
afterPVNames = append(afterPVNames, pv.GetName())
}
glog.V(4).Info(afterPVNames)

if !reflect.DeepEqual(beforePVCNames, afterPVCNames) {
return fmt.Errorf("pvc changed when we delete cluster: %s/%s, before: %v, after: %v",
ns, tcName, beforePVCNames, afterPVCNames)
}
if !reflect.DeepEqual(beforePVNames, afterPVNames) {
return fmt.Errorf("pv changed when we delete cluster: %s/%s, before: %v, after: %v",
ns, tcName, beforePVNames, afterPVNames)
}

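// Poll the PV comparison rather than failing immediately: list errors and
// transient mismatches return (false, nil) so wait.Poll retries until the PV
// set matches the pre-delete snapshot or DefaultPollTimeout expires.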
waitPVFn := func() (done bool, err error) {
pvList, err = oa.kubeCli.CoreV1().PersistentVolumes().List(metav1.ListOptions{LabelSelector: selector.String()})
if err != nil {
return false, nil
}
var afterPVNames []string
for _, pv := range pvList.Items {
afterPVNames = append(afterPVNames, pv.GetName())
}
glog.V(4).Info(afterPVNames)

if !reflect.DeepEqual(beforePVNames, afterPVNames) {
glog.Errorf("pv changed when we delete cluster: %s/%s, before: %v, after: %v",
ns, tcName, beforePVNames, afterPVNames)
return false, nil
}

return true, nil
}

err = wait.Poll(oa.pollInterval, DefaultPollTimeout, waitPVFn)
if err != nil {
return err
}

err = oa.kubeCli.CoreV1().Pods(info.Namespace).Delete(getBackupDirPodName, &metav1.DeleteOptions{})
Expand Down Expand Up @@ -660,9 +717,11 @@ func (oa *operatorActions) CleanTidbCluster(info *TidbClusterConfig) error {
return fmt.Errorf("failed to delete configmaps: %v, %s", err, string(res))
}

patchPVCmd := fmt.Sprintf("kubectl get pv | grep %s | grep %s | awk '{print $1}' | "+
patchPVCmd := fmt.Sprintf("kubectl get pv -l %s=%s,%s=%s,%s=%s | awk '{print $1}' | "+
"xargs -I {} kubectl patch pv {} -p '{\"spec\":{\"persistentVolumeReclaimPolicy\":\"Delete\"}}'",
info.Namespace, info.ClusterName)
label.ManagedByLabelKey, "tidb-operator",
label.NamespaceLabelKey, info.Namespace,
label.InstanceLabelKey, info.ClusterName)
glog.V(4).Info(patchPVCmd)
if res, err := exec.Command("/bin/sh", "-c", patchPVCmd).CombinedOutput(); err != nil {
return fmt.Errorf("failed to patch pv: %v, %s", err, string(res))
@@ -1026,6 +1085,8 @@ func (oa *operatorActions) CheckUpgrade(ctx context.Context, info *TidbClusterCo
replicas := tc.TiKVRealReplicas()
for i := replicas - 1; i >= 0; i-- {
if err := wait.PollImmediate(1*time.Second, 10*time.Minute, func() (done bool, err error) {
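// Compute the expected evict-leader scheduler name for this TiKV pod up front
// so it can be matched even when several schedulers coexist.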
podName := fmt.Sprintf("%s-tikv-%d", tcName, i)
scheduler := fmt.Sprintf("evict-leader-scheduler-%s", findStoreFn(tc, podName))
schedulers, err := pdClient.GetEvictLeaderSchedulers()
if err != nil {
glog.Errorf("failed to get evict leader schedulers, %v", err)
@@ -1034,13 +1095,17 @@ func (oa *operatorActions) CheckUpgrade(ctx context.Context, info *TidbClusterCo
glog.V(4).Infof("index:%d,schedulers:%v,error:%v", i, schedulers, err)
if len(schedulers) > 1 {
glog.Errorf("there are too many evict leader schedulers: %v", schedulers)
for _, s := range schedulers {
if s == scheduler {
glog.Infof("found scheudler: %s", scheduler)
return true, nil
}
}
return false, nil
}
if len(schedulers) == 0 {
return false, nil
}
podName := fmt.Sprintf("%s-tikv-%d", tcName, i)
scheduler := fmt.Sprintf("evict-leader-scheduler-%s", findStoreFn(tc, podName))
if schedulers[0] == scheduler {
glog.Infof("index: %d,the schedulers: %s = %s", i, schedulers[0], scheduler)
return true, nil
2 changes: 1 addition & 1 deletion tests/cluster_info.go
@@ -148,6 +148,6 @@ func (tc *TidbClusterConfig) BuildSubValues(path string) (string, error) {
if err != nil {
return "", err
}
glog.Infof("subValues:\n %s", subValues)
glog.V(4).Infof("subValues:\n %s", subValues)
return subVaulesPath, nil
}
7 changes: 3 additions & 4 deletions tests/cmd/stability/main.go
@@ -190,9 +190,9 @@ func run() {
time.Sleep(30 * time.Second)
oa.CheckTidbClustersAvailableOrDie([]*tests.TidbClusterConfig{cluster})
// rollback conf
cluster.PDPreStartScript = strconv.Quote("")
cluster.TiKVPreStartScript = strconv.Quote("")
cluster.TiDBPreStartScript = strconv.Quote("")
cluster.PDPreStartScript = strconv.Quote("# noop")
cluster.TiKVPreStartScript = strconv.Quote("# noop")
cluster.TiDBPreStartScript = strconv.Quote("# noop")
oa.UpgradeTidbClusterOrDie(cluster)
// wait upgrade complete
oa.CheckUpgradeOrDie(ctx, cluster)
@@ -318,7 +318,6 @@ func run() {
ocfg.Image = cfg.UpgradeOperatorImage
ocfg.Tag = cfg.UpgradeOperatorTag
oa.UpgradeOperatorOrDie(ocfg)
time.Sleep(5 * time.Minute)
postUpgrade := []*tests.TidbClusterConfig{
cluster3,
cluster1,