From 2df0ecd1d68d48d06f1f91c6320b83f97f2f1d4e Mon Sep 17 00:00:00 2001
From: Neil Shen
Date: Wed, 3 Aug 2022 10:50:05 +0800
Subject: [PATCH] ticdc: support graceful upgrade TiCDC pods (#4647)
---
docs/api-references/docs.md | 16 +++++++++
manifests/crd.yaml | 2 ++
.../crd/v1/pingcap.com_tidbclusters.yaml | 2 ++
.../crd/v1beta1/pingcap.com_tidbclusters.yaml | 2 ++
manifests/crd_v1beta1.yaml | 2 ++
.../pingcap/v1alpha1/openapi_generated.go | 8 ++++-
pkg/apis/pingcap/v1alpha1/tidbcluster.go | 22 +++++++++---
pkg/apis/pingcap/v1alpha1/tidbcluster_test.go | 14 ++++++++
pkg/apis/pingcap/v1alpha1/types.go | 6 ++++
.../pingcap/v1alpha1/zz_generated.deepcopy.go | 5 +++
pkg/manager/member/ticdc_scaler.go | 5 +--
pkg/manager/member/ticdc_scaler_test.go | 1 +
pkg/manager/member/ticdc_upgrader.go | 15 +++++---
pkg/manager/member/ticdc_upgrader_test.go | 36 ++++++++++++++-----
14 files changed, 114 insertions(+), 22 deletions(-)
diff --git a/docs/api-references/docs.md b/docs/api-references/docs.md
index c852d793a17..26c35b06f92 100644
--- a/docs/api-references/docs.md
+++ b/docs/api-references/docs.md
@@ -14521,6 +14521,22 @@ string
Defaults to Kubernetes default storage class.
+
+
+gracefulShutdownTimeout
+
+
+Kubernetes meta/v1.Duration
+
+
+ |
+
+(Optional)
+ GracefulShutdownTimeout is the timeout of gracefully shutdown a TiCDC pod.
+Encoded in the format of Go Duration.
+Defaults to 10m
+ |
+
TiCDCStatus
diff --git a/manifests/crd.yaml b/manifests/crd.yaml
index 5182d1d3fb2..5ed02618968 100644
--- a/manifests/crd.yaml
+++ b/manifests/crd.yaml
@@ -21443,6 +21443,8 @@ spec:
type: object
type: object
type: array
+ gracefulShutdownTimeout:
+ type: string
hostNetwork:
type: boolean
image:
diff --git a/manifests/crd/v1/pingcap.com_tidbclusters.yaml b/manifests/crd/v1/pingcap.com_tidbclusters.yaml
index 1c76f0daee3..eb842958286 100644
--- a/manifests/crd/v1/pingcap.com_tidbclusters.yaml
+++ b/manifests/crd/v1/pingcap.com_tidbclusters.yaml
@@ -9134,6 +9134,8 @@ spec:
type: object
type: object
type: array
+ gracefulShutdownTimeout:
+ type: string
hostNetwork:
type: boolean
image:
diff --git a/manifests/crd/v1beta1/pingcap.com_tidbclusters.yaml b/manifests/crd/v1beta1/pingcap.com_tidbclusters.yaml
index da00f2d11f2..1c95047e6fa 100644
--- a/manifests/crd/v1beta1/pingcap.com_tidbclusters.yaml
+++ b/manifests/crd/v1beta1/pingcap.com_tidbclusters.yaml
@@ -9122,6 +9122,8 @@ spec:
type: object
type: object
type: array
+ gracefulShutdownTimeout:
+ type: string
hostNetwork:
type: boolean
image:
diff --git a/manifests/crd_v1beta1.yaml b/manifests/crd_v1beta1.yaml
index 4cc98d753bb..e4dee0205b6 100644
--- a/manifests/crd_v1beta1.yaml
+++ b/manifests/crd_v1beta1.yaml
@@ -21429,6 +21429,8 @@ spec:
type: object
type: object
type: array
+ gracefulShutdownTimeout:
+ type: string
hostNetwork:
type: boolean
image:
diff --git a/pkg/apis/pingcap/v1alpha1/openapi_generated.go b/pkg/apis/pingcap/v1alpha1/openapi_generated.go
index f142d7c9451..5ca09019974 100644
--- a/pkg/apis/pingcap/v1alpha1/openapi_generated.go
+++ b/pkg/apis/pingcap/v1alpha1/openapi_generated.go
@@ -7641,12 +7641,18 @@ func schema_pkg_apis_pingcap_v1alpha1_TiCDCSpec(ref common.ReferenceCallback) co
Format: "",
},
},
+ "gracefulShutdownTimeout": {
+ SchemaProps: spec.SchemaProps{
+ Description: "GracefulShutdownTimeout is the timeout of gracefully shutdown a TiCDC pod. Encoded in the format of Go Duration. Defaults to 10m",
+ Ref: ref("k8s.io/apimachinery/pkg/apis/meta/v1.Duration"),
+ },
+ },
},
Required: []string{"replicas"},
},
},
Dependencies: []string{
- "github.com/pingcap/tidb-operator/pkg/apis/pingcap/v1alpha1.CDCConfigWraper", "github.com/pingcap/tidb-operator/pkg/apis/pingcap/v1alpha1.StorageVolume", "github.com/pingcap/tidb-operator/pkg/apis/pingcap/v1alpha1.SuspendAction", "github.com/pingcap/tidb-operator/pkg/apis/pingcap/v1alpha1.TopologySpreadConstraint", "k8s.io/api/core/v1.Affinity", "k8s.io/api/core/v1.Container", "k8s.io/api/core/v1.EnvFromSource", "k8s.io/api/core/v1.EnvVar", "k8s.io/api/core/v1.LocalObjectReference", "k8s.io/api/core/v1.PodDNSConfig", "k8s.io/api/core/v1.PodSecurityContext", "k8s.io/api/core/v1.Toleration", "k8s.io/api/core/v1.Volume", "k8s.io/api/core/v1.VolumeMount", "k8s.io/apimachinery/pkg/api/resource.Quantity"},
+ "github.com/pingcap/tidb-operator/pkg/apis/pingcap/v1alpha1.CDCConfigWraper", "github.com/pingcap/tidb-operator/pkg/apis/pingcap/v1alpha1.StorageVolume", "github.com/pingcap/tidb-operator/pkg/apis/pingcap/v1alpha1.SuspendAction", "github.com/pingcap/tidb-operator/pkg/apis/pingcap/v1alpha1.TopologySpreadConstraint", "k8s.io/api/core/v1.Affinity", "k8s.io/api/core/v1.Container", "k8s.io/api/core/v1.EnvFromSource", "k8s.io/api/core/v1.EnvVar", "k8s.io/api/core/v1.LocalObjectReference", "k8s.io/api/core/v1.PodDNSConfig", "k8s.io/api/core/v1.PodSecurityContext", "k8s.io/api/core/v1.Toleration", "k8s.io/api/core/v1.Volume", "k8s.io/api/core/v1.VolumeMount", "k8s.io/apimachinery/pkg/api/resource.Quantity", "k8s.io/apimachinery/pkg/apis/meta/v1.Duration"},
}
}
diff --git a/pkg/apis/pingcap/v1alpha1/tidbcluster.go b/pkg/apis/pingcap/v1alpha1/tidbcluster.go
index 1f819963971..66e1871a134 100644
--- a/pkg/apis/pingcap/v1alpha1/tidbcluster.go
+++ b/pkg/apis/pingcap/v1alpha1/tidbcluster.go
@@ -38,6 +38,9 @@ const (
defaultEnablePVReclaim = false
// defaultEvictLeaderTimeout is the timeout limit of evict leader
defaultEvictLeaderTimeout = 1500 * time.Minute
+ // defaultTiCDCGracefulShutdownTimeout is the timeout limit of graceful
+ // shutdown a TiCDC pod.
+ defaultTiCDCGracefulShutdownTimeout = 10 * time.Minute
)
var (
@@ -194,6 +197,14 @@ func (tc *TidbCluster) TiFlashVersion() string {
return "latest"
}
+func (tc *TidbCluster) TiFlashContainerPrivilege() *bool {
+ if tc.Spec.TiFlash == nil || tc.Spec.TiFlash.Privileged == nil {
+ pri := false
+ return &pri
+ }
+ return tc.Spec.TiFlash.Privileged
+}
+
// TiCDCImage return the image used by TiCDC.
//
// If TiCDC isn't specified, return empty string.
@@ -219,12 +230,13 @@ func (tc *TidbCluster) TiCDCImage() string {
return image
}
-func (tc *TidbCluster) TiFlashContainerPrivilege() *bool {
- if tc.Spec.TiFlash == nil || tc.Spec.TiFlash.Privileged == nil {
- pri := false
- return &pri
+// TiCDCGracefulShutdownTimeout returns the timeout of gracefully shutdown
+// a TiCDC pod.
+func (tc *TidbCluster) TiCDCGracefulShutdownTimeout() time.Duration {
+ if tc.Spec.TiCDC != nil && tc.Spec.TiCDC.GracefulShutdownTimeout != nil {
+ return tc.Spec.TiCDC.GracefulShutdownTimeout.Duration
}
- return tc.Spec.TiFlash.Privileged
+ return defaultTiCDCGracefulShutdownTimeout
}
// TiDBImage return the image used by TiDB.
diff --git a/pkg/apis/pingcap/v1alpha1/tidbcluster_test.go b/pkg/apis/pingcap/v1alpha1/tidbcluster_test.go
index ccd8cb79eb6..2aceecb9715 100644
--- a/pkg/apis/pingcap/v1alpha1/tidbcluster_test.go
+++ b/pkg/apis/pingcap/v1alpha1/tidbcluster_test.go
@@ -15,6 +15,7 @@ package v1alpha1
import (
"testing"
+ "time"
. "github.com/onsi/gomega"
apps "k8s.io/api/apps/v1"
@@ -672,6 +673,19 @@ func TestPDVersion(t *testing.T) {
}
}
+func TestTiCDCGracefulShutdownTimeout(t *testing.T) {
+ g := NewGomegaWithT(t)
+
+ tc := newTidbCluster()
+ g.Expect(tc.TiCDCGracefulShutdownTimeout()).To(Equal(defaultTiCDCGracefulShutdownTimeout))
+
+ tc.Spec.TiCDC = &TiCDCSpec{GracefulShutdownTimeout: nil}
+ g.Expect(tc.TiCDCGracefulShutdownTimeout()).To(Equal(defaultTiCDCGracefulShutdownTimeout))
+
+ tc.Spec.TiCDC = &TiCDCSpec{GracefulShutdownTimeout: &metav1.Duration{Duration: time.Minute}}
+ g.Expect(tc.TiCDCGracefulShutdownTimeout()).To(Equal(time.Minute))
+}
+
func TestComponentFunc(t *testing.T) {
t.Run("ComponentIsNormal", func(t *testing.T) {
g := NewGomegaWithT(t)
diff --git a/pkg/apis/pingcap/v1alpha1/types.go b/pkg/apis/pingcap/v1alpha1/types.go
index 5c0371efc8c..e7362282400 100644
--- a/pkg/apis/pingcap/v1alpha1/types.go
+++ b/pkg/apis/pingcap/v1alpha1/types.go
@@ -680,6 +680,12 @@ type TiCDCSpec struct {
// Defaults to Kubernetes default storage class.
// +optional
StorageClassName *string `json:"storageClassName,omitempty"`
+
+ // GracefulShutdownTimeout is the timeout of gracefully shutdown a TiCDC pod.
+ // Encoded in the format of Go Duration.
+ // Defaults to 10m
+ // +optional
+ GracefulShutdownTimeout *metav1.Duration `json:"gracefulShutdownTimeout,omitempty"`
}
// TiCDCConfig is the configuration of tidbcdc
diff --git a/pkg/apis/pingcap/v1alpha1/zz_generated.deepcopy.go b/pkg/apis/pingcap/v1alpha1/zz_generated.deepcopy.go
index b873f783519..700365fd7fa 100644
--- a/pkg/apis/pingcap/v1alpha1/zz_generated.deepcopy.go
+++ b/pkg/apis/pingcap/v1alpha1/zz_generated.deepcopy.go
@@ -5282,6 +5282,11 @@ func (in *TiCDCSpec) DeepCopyInto(out *TiCDCSpec) {
*out = new(string)
**out = **in
}
+ if in.GracefulShutdownTimeout != nil {
+ in, out := &in.GracefulShutdownTimeout, &out.GracefulShutdownTimeout
+ *out = new(metav1.Duration)
+ **out = **in
+ }
return
}
diff --git a/pkg/manager/member/ticdc_scaler.go b/pkg/manager/member/ticdc_scaler.go
index ce654757f81..9450ff8a6a8 100644
--- a/pkg/manager/member/ticdc_scaler.go
+++ b/pkg/manager/member/ticdc_scaler.go
@@ -162,9 +162,6 @@ func gracefulShutdownTiCDC(
return nil
}
-// TODO: support configurable graceful shutdown timeout.
-var ticdcGracefulShutdownTimeout time.Duration = 10 * time.Second
-
func checkTiCDCGracefulShutdownTimeout(
tc *v1alpha1.TidbCluster,
podCtl controller.PodControlInterface,
@@ -186,7 +183,7 @@ func checkTiCDCGracefulShutdownTimeout(
return true, nil
}
- gracefulShutdownTimeout := ticdcGracefulShutdownTimeout
+ gracefulShutdownTimeout := tc.TiCDCGracefulShutdownTimeout()
if time.Now().After(beginTime.Add(gracefulShutdownTimeout)) {
klog.Infof("ticdc.%s: graceful shutdown timeout (threshold: %v) for Pod %s in cluster %s/%s",
action, gracefulShutdownTimeout, podName, ns, tc.GetName())
diff --git a/pkg/manager/member/ticdc_scaler_test.go b/pkg/manager/member/ticdc_scaler_test.go
index 7841a5c9b35..3cc32ac9f95 100644
--- a/pkg/manager/member/ticdc_scaler_test.go
+++ b/pkg/manager/member/ticdc_scaler_test.go
@@ -350,6 +350,7 @@ func TestTiCDCGracefulShutdown(t *testing.T) {
tc := newTidbClusterForPD()
tc.Spec.TiCDC = &v1alpha1.TiCDCSpec{}
+ ticdcGracefulShutdownTimeout := tc.TiCDCGracefulShutdownTimeout()
newPod := func() *corev1.Pod {
return &corev1.Pod{
TypeMeta: metav1.TypeMeta{Kind: "Pod", APIVersion: "v1"},
diff --git a/pkg/manager/member/ticdc_upgrader.go b/pkg/manager/member/ticdc_upgrader.go
index 8b6a53e802c..646b8a25561 100644
--- a/pkg/manager/member/ticdc_upgrader.go
+++ b/pkg/manager/member/ticdc_upgrader.go
@@ -87,9 +87,9 @@ func (u *ticdcUpgrader) Upgrade(tc *v1alpha1.TidbCluster, oldSet *apps.StatefulS
mngerutils.SetUpgradePartition(newSet, *oldSet.Spec.UpdateStrategy.RollingUpdate.Partition)
podOrdinals := helper.GetPodOrdinals(*oldSet.Spec.Replicas, oldSet).List()
- for _i := len(podOrdinals) - 1; _i >= 0; _i-- {
- i := podOrdinals[_i]
- podName := ticdcPodName(tcName, i)
+ for i := len(podOrdinals) - 1; i >= 0; i-- {
+ ordinal := podOrdinals[i]
+ podName := ticdcPodName(tcName, ordinal)
pod, err := u.deps.PodLister.Pods(ns).Get(podName)
if err != nil {
return fmt.Errorf("ticdcUpgrader.Upgrade: failed to get pod %s for cluster %s/%s, error: %s", podName, ns, tcName, err)
@@ -108,7 +108,14 @@ func (u *ticdcUpgrader) Upgrade(tc *v1alpha1.TidbCluster, oldSet *apps.StatefulS
}
continue
}
- mngerutils.SetUpgradePartition(newSet, i)
+
+ err = gracefulShutdownTiCDC(tc, u.deps.CDCControl, u.deps.PodControl, pod, ordinal, "Upgrade")
+ if err != nil {
+ return err
+ }
+ klog.Infof("ticdcUpgrade.Upgrade: %s has graceful shutdown in cluster %s/%s", podName, tc.GetNamespace(), tc.GetName())
+
+ mngerutils.SetUpgradePartition(newSet, ordinal)
return nil
}
diff --git a/pkg/manager/member/ticdc_upgrader_test.go b/pkg/manager/member/ticdc_upgrader_test.go
index e449b590424..2227adfa630 100644
--- a/pkg/manager/member/ticdc_upgrader_test.go
+++ b/pkg/manager/member/ticdc_upgrader_test.go
@@ -34,14 +34,15 @@ func TestTiCDCUpgrader_Upgrade(t *testing.T) {
g := NewGomegaWithT(t)
type testcase struct {
- name string
- changeFn func(*v1alpha1.TidbCluster)
- invalidPod bool
- changePods func(pods []*corev1.Pod)
- missPod bool
- errorExpect bool
- changeOldSet func(set *apps.StatefulSet)
- expectFn func(g *GomegaWithT, tc *v1alpha1.TidbCluster, newSet *apps.StatefulSet)
+ name string
+ changeFn func(*v1alpha1.TidbCluster)
+ invalidPod bool
+ changePods func(pods []*corev1.Pod)
+ missPod bool
+ errorExpect bool
+ changeOldSet func(set *apps.StatefulSet)
+ changeUpgrader func(u *ticdcUpgrader)
+ expectFn func(g *GomegaWithT, tc *v1alpha1.TidbCluster, newSet *apps.StatefulSet)
}
testFn := func(test *testcase, t *testing.T) {
@@ -61,6 +62,9 @@ func TestTiCDCUpgrader_Upgrade(t *testing.T) {
if test.changePods != nil {
test.changePods(pods)
}
+ if test.changeUpgrader != nil {
+ test.changeUpgrader(upgrader.(*ticdcUpgrader))
+ }
for _, pod := range pods {
podInformer.Informer().GetIndexer().Add(pod)
}
@@ -89,6 +93,22 @@ func TestTiCDCUpgrader_Upgrade(t *testing.T) {
g.Expect(newSet.Spec.UpdateStrategy.RollingUpdate.Partition).To(Equal(pointer.Int32Ptr(0)))
},
},
+ {
+ name: "graceful upgrade retry",
+ errorExpect: true,
+ changeUpgrader: func(u *ticdcUpgrader) {
+ u.deps.CDCControl = &cdcCtlMock{
+ // resignOwner returns false to let graceful shutdown retry.
+ resignOwner: func(tc *v1alpha1.TidbCluster, ordinal int32) (ok bool, err error) {
+ return false, nil
+ },
+ }
+ },
+ expectFn: func(g *GomegaWithT, tc *v1alpha1.TidbCluster, newSet *apps.StatefulSet) {
+ g.Expect(tc.Status.TiCDC.Phase).To(Equal(v1alpha1.UpgradePhase))
+ g.Expect(newSet.Spec.UpdateStrategy.RollingUpdate.Partition).To(Equal(pointer.Int32Ptr(1)))
+ },
+ },
{
name: "normal with pod notReady",
changePods: func(pods []*corev1.Pod) {