From 8917fa2827e94f6ede2b8234eeea9050d0856f19 Mon Sep 17 00:00:00 2001 From: xiaojingchen Date: Wed, 22 May 2019 18:35:52 +0800 Subject: [PATCH] support the affinity feature of k8s which define the rule of assigning pods to nodes (#475) --- .../templates/config/_pd-config.tpl | 2 +- .../tidb-cluster/templates/tidb-cluster.yaml | 15 +- charts/tidb-cluster/values.yaml | 103 +++++++-- docs/operation-guide.md | 63 +++++- pkg/apis/pingcap.com/v1alpha1/types.go | 47 ++--- .../v1alpha1/zz_generated.deepcopy.go | 15 ++ pkg/manager/member/pd_member_manager.go | 8 +- pkg/manager/member/tidb_member_manager.go | 8 +- pkg/manager/member/tikv_member_manager.go | 8 +- pkg/util/util.go | 120 ----------- pkg/util/utils_test.go | 196 ------------------ 11 files changed, 195 insertions(+), 390 deletions(-) diff --git a/charts/tidb-cluster/templates/config/_pd-config.tpl b/charts/tidb-cluster/templates/config/_pd-config.tpl index daea702054..ea5d044f5a 100644 --- a/charts/tidb-cluster/templates/config/_pd-config.tpl +++ b/charts/tidb-cluster/templates/config/_pd-config.tpl @@ -82,7 +82,7 @@ max-replicas = {{ .Values.pd.maxReplicas }} # The placement priorities is implied by the order of label keys. # For example, ["zone", "rack"] means that we should place replicas to # different zones first, then to different racks if we don't have enough zones. -location-labels = ["zone", "rack", "host"] +location-labels = ["region", "zone", "rack", "host"] [label-property] # Do not assign region leaders to stores that have these tags. diff --git a/charts/tidb-cluster/templates/tidb-cluster.yaml b/charts/tidb-cluster/templates/tidb-cluster.yaml index b27c85773d..a2d94ea355 100644 --- a/charts/tidb-cluster/templates/tidb-cluster.yaml +++ b/charts/tidb-cluster/templates/tidb-cluster.yaml @@ -33,11 +33,10 @@ spec: {{- if .Values.pd.resources }} {{ toYaml .Values.pd.resources | indent 4 }} {{- end }} - {{- if .Values.pd.nodeSelector }} + affinity: +{{ toYaml .Values.pd.affinity | indent 6 }} nodeSelector: {{ toYaml .Values.pd.nodeSelector | indent 6 }} - {{- end }} - nodeSelectorRequired: {{ .Values.nodeSelectorRequired | default true }} {{- if .Values.pd.tolerations }} tolerations: {{ toYaml .Values.pd.tolerations | indent 4 }} @@ -56,11 +55,10 @@ spec: {{- if .Values.tikv.resources }} {{ toYaml .Values.tikv.resources | indent 4 }} {{- end }} - {{- if .Values.tikv.nodeSelector }} + affinity: +{{ toYaml .Values.tikv.affinity | indent 6 }} nodeSelector: {{ toYaml .Values.tikv.nodeSelector | indent 6 }} - {{- end }} - nodeSelectorRequired: {{ .Values.nodeSelectorRequired | default true }} {{- if .Values.tikv.tolerations }} tolerations: {{ toYaml .Values.tikv.tolerations | indent 4 }} @@ -76,11 +74,10 @@ spec: {{- if .Values.tidb.resources }} {{ toYaml .Values.tidb.resources | indent 4 }} {{- end }} - {{- if .Values.tidb.nodeSelector }} + affinity: +{{ toYaml .Values.tidb.affinity | indent 6 }} nodeSelector: {{ toYaml .Values.tidb.nodeSelector | indent 6 }} - {{- end }} - nodeSelectorRequired: {{ .Values.nodeSelectorRequired | default true }} {{- if .Values.tidb.tolerations }} tolerations: {{ toYaml .Values.tidb.tolerations | indent 4 }} diff --git a/charts/tidb-cluster/values.yaml b/charts/tidb-cluster/values.yaml index 645072caac..18fd9c0d2a 100644 --- a/charts/tidb-cluster/values.yaml +++ b/charts/tidb-cluster/values.yaml @@ -73,16 +73,72 @@ pd: # cpu: 4000m # memory: 4Gi storage: 1Gi - # nodeSelector is used for scheduling pod, - # if nodeSelectorRequired is true, all the following labels must be matched + + ## affinity 
defines pd scheduling rules; it is empty by default.
+  ## Please read the affinity document before setting your scheduling rules:
+  ## ref: https://kubernetes.io/docs/concepts/configuration/assign-pod-node/#affinity-and-anti-affinity
+  affinity: {}
+  ## The following is a typical example of affinity settings:
+  ## The PodAntiAffinity setting in the example spreads PD pods across topology nodes as far as possible, improving the disaster tolerance of PD on Kubernetes.
+  ## The NodeAffinity setting in the example ensures that PD pods can only be scheduled to nodes with the label kind="pd".
+  # affinity:
+  #   podAntiAffinity:
+  #     preferredDuringSchedulingIgnoredDuringExecution:
+  #     # this term works when the nodes have the label named region
+  #     - weight: 10
+  #       podAffinityTerm:
+  #         labelSelector:
+  #           matchLabels:
+  #             app.kubernetes.io/instance: <release name>
+  #             app.kubernetes.io/component: "pd"
+  #         topologyKey: "region"
+  #         namespaces:
+  #         - <helm namespace>
+  #     # this term works when the nodes have the label named zone
+  #     - weight: 20
+  #       podAffinityTerm:
+  #         labelSelector:
+  #           matchLabels:
+  #             app.kubernetes.io/instance: <release name>
+  #             app.kubernetes.io/component: "pd"
+  #         topologyKey: "zone"
+  #         namespaces:
+  #         - <helm namespace>
+  #     # this term works when the nodes have the label named rack
+  #     - weight: 40
+  #       podAffinityTerm:
+  #         labelSelector:
+  #           matchLabels:
+  #             app.kubernetes.io/instance: <release name>
+  #             app.kubernetes.io/component: "pd"
+  #         topologyKey: "rack"
+  #         namespaces:
+  #         - <helm namespace>
+  #     # this term works when the nodes have the label named kubernetes.io/hostname
+  #     - weight: 80
+  #       podAffinityTerm:
+  #         labelSelector:
+  #           matchLabels:
+  #             app.kubernetes.io/instance: <release name>
+  #             app.kubernetes.io/component: "pd"
+  #         topologyKey: "kubernetes.io/hostname"
+  #         namespaces:
+  #         - <helm namespace>
+  #   nodeAffinity:
+  #     requiredDuringSchedulingIgnoredDuringExecution:
+  #       nodeSelectorTerms:
+  #       - matchExpressions:
+  #         - key: "kind"
+  #           operator: In
+  #           values:
+  #           - "pd"
+
+  ## nodeSelector ensures pods are only scheduled to nodes that have each of the indicated key-value pairs as labels
+  ## ref: https://kubernetes.io/docs/concepts/configuration/assign-pod-node/#nodeselector
   nodeSelector: {}
-  # kind: pd
-  # # zone is comma separated availability zone list
-  # zone: cn-bj1-01,cn-bj1-02
-  # # region is comma separated region list
-  # region: cn-bj1
-  # Tolerations are applied to pods, and allow pods to schedule onto nodes with matching taints.
-  # refer to https://kubernetes.io/docs/concepts/configuration/taint-and-toleration
+
+  ## Tolerations are applied to pods, and allow pods to schedule onto nodes with matching taints.
+  ## refer to https://kubernetes.io/docs/concepts/configuration/taint-and-toleration
   tolerations: []
   # - key: node-role
   #   operator: Equal
   #   value: tidb
   #   effect: "NoSchedule"
@@ -117,10 +173,18 @@ tikv:
   #   cpu: 12000m
   #   memory: 24Gi
   storage: 10Gi
+
+  ## affinity defines tikv scheduling rules; it is empty by default.
+  ## Please read the affinity document before setting your scheduling rules:
+  ## ref: https://kubernetes.io/docs/concepts/configuration/assign-pod-node/#affinity-and-anti-affinity
+  affinity: {}
+
+  ## nodeSelector ensures pods are only scheduled to nodes that have each of the indicated key-value pairs as labels
+  ## ref: https://kubernetes.io/docs/concepts/configuration/assign-pod-node/#nodeselector
   nodeSelector: {}
-  # kind: tikv
-  # zone: cn-bj1-01,cn-bj1-02
-  # region: cn-bj1
+
+  ## Tolerations are applied to pods, and allow pods to schedule onto nodes with matching taints.
+  ## refer to https://kubernetes.io/docs/concepts/configuration/taint-and-toleration
   tolerations: []
   # - key: node-role
   #   operator: Equal
   #   value: tidb
   #   effect: "NoSchedule"
@@ -196,10 +260,19 @@ tidb:
   requests: {}
   #   cpu: 12000m
   #   memory: 12Gi
+
+
+  ## affinity defines tidb scheduling rules; it is empty by default.
+  ## Please read the affinity document before setting your scheduling rules:
+  ## ref: https://kubernetes.io/docs/concepts/configuration/assign-pod-node/#affinity-and-anti-affinity
+  affinity: {}
+
+  ## nodeSelector ensures pods are only scheduled to nodes that have each of the indicated key-value pairs as labels
+  ## ref: https://kubernetes.io/docs/concepts/configuration/assign-pod-node/#nodeselector
   nodeSelector: {}
-  # kind: tidb
-  # zone: cn-bj1-01,cn-bj1-02
-  # region: cn-bj1
+
+  ## Tolerations are applied to pods, and allow pods to schedule onto nodes with matching taints.
+  ## refer to https://kubernetes.io/docs/concepts/configuration/taint-and-toleration
   tolerations: []
   # - key: node-role
   #   operator: Equal
diff --git a/docs/operation-guide.md b/docs/operation-guide.md
index e35ea2f42d..6a46c81211 100644
--- a/docs/operation-guide.md
+++ b/docs/operation-guide.md
@@ -11,21 +11,70 @@ $ namespace="tidb"
 
 > **Note:** The rest of the document will use `values.yaml` to reference `charts/tidb-cluster/values.yaml`
 
+## Configuration
+
+TiDB Operator uses `values.yaml` as the TiDB cluster configuration file. It provides a default basic configuration that you can use directly for a quick deployment, but if you have specific configuration requirements, or for a production deployment, you need to adjust the variables in `values.yaml` manually.
+
+* Resource setting
+
+    * CPU & Memory
+
+        The default deployment doesn't set CPU and memory requests or limits for any of the pods. These settings allow the TiDB cluster to run on a small Kubernetes cluster such as DinD or the default GKE cluster for testing. But for a production deployment, you will likely need to adjust the CPU, memory and storage resources according to the [recommendations](https://pingcap.com/docs/dev/how-to/deploy/hardware-recommendations/#software-and-hardware-recommendations).
+
+        The resource limits should be equal to or larger than the resource requests; it is suggested to set the limit and request equal to get [`Guaranteed` QoS](https://kubernetes.io/docs/tasks/configure-pod-container/quality-service-pod/#create-a-pod-that-gets-assigned-a-qos-class-of-guaranteed).
+
+    * Storage
+
+        The variables `pd.storageClassName` and `tikv.storageClassName` in `values.yaml` set the `StorageClass` used by PD and TiKV; they default to `local-storage` with a minimal size.
+
+        If you don't want to use the default `StorageClass`, or if your Kubernetes cluster does not support the `local-storage` class, execute the following command to find the available `StorageClass`es and select the ones you want to provide to the TiDB cluster.
+
+        ```shell
+        $ kubectl get sc
+        ```
+
+* Disaster Tolerance setting
+
+    TiDB is a distributed database. Disaster tolerance means that when any physical node fails, not only does the TiDB server stay available, but the data also remains complete and available.
+
+    How do we guarantee the disaster tolerance of a TiDB cluster on Kubernetes?
+
+    We mainly solve the problem through the scheduling of both services and data.
+
+    * Disaster Tolerance of TiDB instance
+
+        TiDB Operator provides an extended scheduler to guarantee the disaster tolerance of PD/TiKV/TiDB instances at the host level.
+        The TiDB cluster uses this extended scheduler as its default scheduler; you can find the setting in the `schedulerName` variable of `values.yaml`.
+
+        On the other hand, use the `PodAntiAffinity` term of `affinity` to ensure disaster tolerance at the other topology levels (e.g. rack, zone, region);
+        refer to the doc [pod affinity & anti-affinity](https://kubernetes.io/docs/concepts/configuration/assign-pod-node/#inter-pod-affinity-and-anti-affinity-beta-feature). Moreover, `values.yaml` also provides a typical disaster tolerance setting example in the comments of `pd.affinity`.
+
+    * Disaster Tolerance of data
+
+        Disaster tolerance of data is guaranteed by the TiDB cluster itself. The only work the Operator needs to do is collect topology info from specific labels of the Kubernetes nodes where the TiKV Pods run; PD then schedules data replicas automatically according to this topology info.
+        Because the current TiDB Operator can only recognize some specific labels, the node topology info can only be set with the following labels:
+
+        * `region`: the region where the node is located
+        * `zone`: the zone where the node is located
+        * `rack`: the rack where the node is located
+        * `kubernetes.io/hostname`: the hostname of the node
+
+        You need to attach the topology info to the Kubernetes nodes with the following command:
+        ```shell
+        # Not all labels are required
+        $ kubectl label node <node-name> region=<region> zone=<zone> rack=<rack> kubernetes.io/hostname=<host>
+        ```
+
+For other settings, the variables in `values.yaml` are self-explanatory with comments. You can modify them according to your needs before installing the charts.
+
 ## Deploy TiDB cluster
 
-After TiDB Operator and Helm are deployed correctly, TiDB cluster can be deployed using following command:
+After TiDB Operator and Helm are deployed correctly and the configuration is completed, the TiDB cluster can be deployed with the following commands:
 
 ```shell
 $ helm install charts/tidb-cluster --name=${releaseName} --namespace=${namespace}
 $ kubectl get po -n ${namespace} -l app.kubernetes.io/instance=${releaseName}
 ```
 
-The default deployment doesn't set CPU and memory requests or limits for any of the pods, and the storage used is `local-storage` with minimal size. These settings can make TiDB cluster run on a small Kubernetes cluster like DinD or the default GKE cluster for testing. But for production deployment, you would likely to adjust the cpu, memory and storage resources according to the [recommendations](https://github.com/pingcap/docs/blob/master/op-guide/recommendation.md).
-
-The resource limits should be equal or bigger than the resource requests, it is suggested to set limit and request equal to get [`Guaranteed` QoS]( https://kubernetes.io/docs/tasks/configure-pod-container/quality-service-pod/#create-a-pod-that-gets-assigned-a-qos-class-of-guaranteed).
-
-For other settings, the variables in `values.yaml` are self-explanatory with comments. You can modify them according to your need before installing the charts.
-
 ## Access TiDB cluster
 
 By default TiDB service is exposed using [`NodePort`](https://kubernetes.io/docs/concepts/services-networking/service/#nodeport). You can modify it to `ClusterIP` which will disable access from outside of the cluster. Or modify it to [`LoadBalancer`](https://kubernetes.io/docs/concepts/services-networking/service/#loadbalancer) if the underlining Kubernetes supports this kind of service.
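For illustration, a minimal `pd.affinity` override in a custom values file might look like the sketch below; it condenses the commented example in `values.yaml` to a single host-level anti-affinity term plus the node-affinity constraint. The release name `demo`, the namespace `tidb`, and the `kind=pd` node label are assumptions mirroring the placeholders used above, not values defined by this patch.

```yaml
pd:
  affinity:
    podAntiAffinity:
      preferredDuringSchedulingIgnoredDuringExecution:
      # Prefer spreading PD pods across hosts (kubernetes.io/hostname topology);
      # weight 80 follows the commented example so host-level spreading dominates.
      - weight: 80
        podAffinityTerm:
          labelSelector:
            matchLabels:
              app.kubernetes.io/instance: demo        # assumed Helm release name
              app.kubernetes.io/component: "pd"
          topologyKey: "kubernetes.io/hostname"
          namespaces:
          - tidb                                      # assumed Helm namespace
    nodeAffinity:
      # Only schedule PD onto nodes that have been labeled kind=pd (assumed label).
      requiredDuringSchedulingIgnoredDuringExecution:
        nodeSelectorTerms:
        - matchExpressions:
          - key: "kind"
            operator: In
            values:
            - "pd"
```

Additional weighted terms for `region`, `zone`, and `rack` can be added in the same shape when the nodes carry those labels, as in the full example in the chart's `values.yaml` comments.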
diff --git a/pkg/apis/pingcap.com/v1alpha1/types.go b/pkg/apis/pingcap.com/v1alpha1/types.go index 26f5f9f86d..419d1cd973 100644 --- a/pkg/apis/pingcap.com/v1alpha1/types.go +++ b/pkg/apis/pingcap.com/v1alpha1/types.go @@ -15,10 +15,9 @@ package v1alpha1 import ( apps "k8s.io/api/apps/v1beta1" + corev1 "k8s.io/api/core/v1" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apimachinery/pkg/types" - - corev1 "k8s.io/api/core/v1" ) const ( @@ -107,27 +106,27 @@ type TidbClusterStatus struct { // PDSpec contains details of PD member type PDSpec struct { ContainerSpec - Replicas int32 `json:"replicas"` - NodeSelector map[string]string `json:"nodeSelector,omitempty"` - NodeSelectorRequired bool `json:"nodeSelectorRequired,omitempty"` - StorageClassName string `json:"storageClassName,omitempty"` - Tolerations []corev1.Toleration `json:"tolerations,omitempty"` - Annotations map[string]string `json:"annotations,omitempty"` + Replicas int32 `json:"replicas"` + Affinity *corev1.Affinity `json:"affinity,omitempty"` + NodeSelector map[string]string `json:"nodeSelector,omitempty"` + StorageClassName string `json:"storageClassName,omitempty"` + Tolerations []corev1.Toleration `json:"tolerations,omitempty"` + Annotations map[string]string `json:"annotations,omitempty"` } // TiDBSpec contains details of PD member type TiDBSpec struct { ContainerSpec - Replicas int32 `json:"replicas"` - NodeSelector map[string]string `json:"nodeSelector,omitempty"` - NodeSelectorRequired bool `json:"nodeSelectorRequired,omitempty"` - StorageClassName string `json:"storageClassName,omitempty"` - Tolerations []corev1.Toleration `json:"tolerations,omitempty"` - Annotations map[string]string `json:"annotations,omitempty"` - BinlogEnabled bool `json:"binlogEnabled,omitempty"` - MaxFailoverCount int32 `json:"maxFailoverCount,omitempty"` - SeparateSlowLog bool `json:"separateSlowLog,omitempty"` - SlowLogTailer TiDBSlowLogTailerSpec `json:"slowLogTailer,omitempty"` + Replicas int32 `json:"replicas"` + Affinity *corev1.Affinity `json:"affinity,omitempty"` + NodeSelector map[string]string `json:"nodeSelector,omitempty"` + StorageClassName string `json:"storageClassName,omitempty"` + Tolerations []corev1.Toleration `json:"tolerations,omitempty"` + Annotations map[string]string `json:"annotations,omitempty"` + BinlogEnabled bool `json:"binlogEnabled,omitempty"` + MaxFailoverCount int32 `json:"maxFailoverCount,omitempty"` + SeparateSlowLog bool `json:"separateSlowLog,omitempty"` + SlowLogTailer TiDBSlowLogTailerSpec `json:"slowLogTailer,omitempty"` } // TiDBSlowLogTailerSpec represents an optional log tailer sidecar with TiDB @@ -138,12 +137,12 @@ type TiDBSlowLogTailerSpec struct { // TiKVSpec contains details of PD member type TiKVSpec struct { ContainerSpec - Replicas int32 `json:"replicas"` - NodeSelector map[string]string `json:"nodeSelector,omitempty"` - NodeSelectorRequired bool `json:"nodeSelectorRequired,omitempty"` - StorageClassName string `json:"storageClassName,omitempty"` - Tolerations []corev1.Toleration `json:"tolerations,omitempty"` - Annotations map[string]string `json:"annotations,omitempty"` + Replicas int32 `json:"replicas"` + Affinity *corev1.Affinity `json:"affinity,omitempty"` + NodeSelector map[string]string `json:"nodeSelector,omitempty"` + StorageClassName string `json:"storageClassName,omitempty"` + Tolerations []corev1.Toleration `json:"tolerations,omitempty"` + Annotations map[string]string `json:"annotations,omitempty"` } // TiKVPromGatewaySpec runs as a sidecar with TiKVSpec diff --git 
a/pkg/apis/pingcap.com/v1alpha1/zz_generated.deepcopy.go b/pkg/apis/pingcap.com/v1alpha1/zz_generated.deepcopy.go index 9304c82205..1bf3b6d605 100644 --- a/pkg/apis/pingcap.com/v1alpha1/zz_generated.deepcopy.go +++ b/pkg/apis/pingcap.com/v1alpha1/zz_generated.deepcopy.go @@ -89,6 +89,11 @@ func (in *PDMember) DeepCopy() *PDMember { func (in *PDSpec) DeepCopyInto(out *PDSpec) { *out = *in in.ContainerSpec.DeepCopyInto(&out.ContainerSpec) + if in.Affinity != nil { + in, out := &in.Affinity, &out.Affinity + *out = new(v1.Affinity) + (*in).DeepCopyInto(*out) + } if in.NodeSelector != nil { in, out := &in.NodeSelector, &out.NodeSelector *out = make(map[string]string, len(*in)) @@ -245,6 +250,11 @@ func (in *TiDBSlowLogTailerSpec) DeepCopy() *TiDBSlowLogTailerSpec { func (in *TiDBSpec) DeepCopyInto(out *TiDBSpec) { *out = *in in.ContainerSpec.DeepCopyInto(&out.ContainerSpec) + if in.Affinity != nil { + in, out := &in.Affinity, &out.Affinity + *out = new(v1.Affinity) + (*in).DeepCopyInto(*out) + } if in.NodeSelector != nil { in, out := &in.NodeSelector, &out.NodeSelector *out = make(map[string]string, len(*in)) @@ -352,6 +362,11 @@ func (in *TiKVPromGatewaySpec) DeepCopy() *TiKVPromGatewaySpec { func (in *TiKVSpec) DeepCopyInto(out *TiKVSpec) { *out = *in in.ContainerSpec.DeepCopyInto(&out.ContainerSpec) + if in.Affinity != nil { + in, out := &in.Affinity, &out.Affinity + *out = new(v1.Affinity) + (*in).DeepCopyInto(*out) + } if in.NodeSelector != nil { in, out := &in.NodeSelector, &out.NodeSelector *out = make(map[string]string, len(*in)) diff --git a/pkg/manager/member/pd_member_manager.go b/pkg/manager/member/pd_member_manager.go index b2a7edb6e8..d975ebec8d 100644 --- a/pkg/manager/member/pd_member_manager.go +++ b/pkg/manager/member/pd_member_manager.go @@ -477,12 +477,8 @@ func (pmm *pdMemberManager) getNewPDSetForTidbCluster(tc *v1alpha1.TidbCluster) }, Spec: corev1.PodSpec{ SchedulerName: tc.Spec.SchedulerName, - Affinity: util.AffinityForNodeSelector( - ns, - tc.Spec.PD.NodeSelectorRequired, - label.New().Instance(instanceName).PD(), - tc.Spec.PD.NodeSelector, - ), + Affinity: tc.Spec.PD.Affinity, + NodeSelector: tc.Spec.PD.NodeSelector, Containers: []corev1.Container{ { Name: v1alpha1.PDMemberType.String(), diff --git a/pkg/manager/member/tidb_member_manager.go b/pkg/manager/member/tidb_member_manager.go index 412b79d50f..aaead2d5a4 100644 --- a/pkg/manager/member/tidb_member_manager.go +++ b/pkg/manager/member/tidb_member_manager.go @@ -348,12 +348,8 @@ func (tmm *tidbMemberManager) getNewTiDBSetForTidbCluster(tc *v1alpha1.TidbClust }, Spec: corev1.PodSpec{ SchedulerName: tc.Spec.SchedulerName, - Affinity: util.AffinityForNodeSelector( - ns, - tc.Spec.TiDB.NodeSelectorRequired, - label.New().Instance(instanceName).TiDB(), - tc.Spec.TiDB.NodeSelector, - ), + Affinity: tc.Spec.TiDB.Affinity, + NodeSelector: tc.Spec.TiDB.NodeSelector, Containers: containers, RestartPolicy: corev1.RestartPolicyAlways, Tolerations: tc.Spec.TiDB.Tolerations, diff --git a/pkg/manager/member/tikv_member_manager.go b/pkg/manager/member/tikv_member_manager.go index 624dff9a75..3546b9bf82 100644 --- a/pkg/manager/member/tikv_member_manager.go +++ b/pkg/manager/member/tikv_member_manager.go @@ -336,12 +336,8 @@ func (tkmm *tikvMemberManager) getNewSetForTidbCluster(tc *v1alpha1.TidbCluster) }, Spec: corev1.PodSpec{ SchedulerName: tc.Spec.SchedulerName, - Affinity: util.AffinityForNodeSelector( - ns, - tc.Spec.TiKV.NodeSelectorRequired, - tikvLabel, - tc.Spec.TiKV.NodeSelector, - ), + Affinity: 
tc.Spec.TiKV.Affinity, + NodeSelector: tc.Spec.TiKV.NodeSelector, Containers: []corev1.Container{ { Name: v1alpha1.TiKVMemberType.String(), diff --git a/pkg/util/util.go b/pkg/util/util.go index acb5a03c0f..ba196f362c 100644 --- a/pkg/util/util.go +++ b/pkg/util/util.go @@ -15,7 +15,6 @@ package util import ( "fmt" - "sort" "strconv" "strings" @@ -23,127 +22,8 @@ import ( "github.com/pingcap/tidb-operator/pkg/apis/pingcap.com/v1alpha1" corev1 "k8s.io/api/core/v1" "k8s.io/apimachinery/pkg/api/resource" - metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" - "k8s.io/kubernetes/pkg/kubelet/apis" ) -var ( - // weight is in range 1-100 - topologySchedulingWeight = map[string]int32{ - "region": 10, - "zone": 20, - "rack": 40, - apis.LabelHostname: 80, - } -) - -// AntiAffinityForPod creates a PodAntiAffinity with antiLabels -func AntiAffinityForPod(namespace string, antiLabels map[string]string) *corev1.PodAntiAffinity { - keys := []string{} - for key := range topologySchedulingWeight { - keys = append(keys, key) - } - sort.Strings(keys) // we must use sorted selector, otherwise affinity may vary causing new statefulset generated and pod recreated - terms := []corev1.WeightedPodAffinityTerm{} - for _, key := range keys { - term := corev1.WeightedPodAffinityTerm{ - Weight: topologySchedulingWeight[key], - PodAffinityTerm: corev1.PodAffinityTerm{ - LabelSelector: &metav1.LabelSelector{MatchLabels: antiLabels}, - TopologyKey: key, - Namespaces: []string{namespace}, - }, - } - terms = append(terms, term) - } - return &corev1.PodAntiAffinity{ - PreferredDuringSchedulingIgnoredDuringExecution: terms, - } -} - -// AffinityForNodeSelector creates an Affinity for NodeSelector -// Externally we use NodeSelector for simplicity, -// while internally we convert it to affinity which can express complex scheduling rules -func AffinityForNodeSelector(namespace string, required bool, antiLabels, selector map[string]string) *corev1.Affinity { - if selector == nil { - return nil - } - affinity := &corev1.Affinity{} - if antiLabels != nil { - affinity.PodAntiAffinity = AntiAffinityForPod(namespace, antiLabels) - } - - keys := []string{} - for key := range selector { - keys = append(keys, key) - } - sort.Strings(keys) // we must use sorted selector, otherwise affinity may vary causing new statefulset generated and pod recreated - - requiredTerms := []corev1.NodeSelectorTerm{} - if required { // all nodeSelectors are required - var exps []corev1.NodeSelectorRequirement - for _, key := range keys { - requirement := corev1.NodeSelectorRequirement{ - Key: key, - Operator: corev1.NodeSelectorOpIn, - Values: strings.Split(selector[key], ","), - } - // NodeSelectorRequirement in the same MatchExpressions are ANDed otherwise ORed - exps = append(exps, requirement) - } - requiredTerms = append(requiredTerms, corev1.NodeSelectorTerm{MatchExpressions: exps}) - affinity.NodeAffinity = &corev1.NodeAffinity{ - RequiredDuringSchedulingIgnoredDuringExecution: &corev1.NodeSelector{ - NodeSelectorTerms: requiredTerms, - }, - } - return affinity - } - - preferredTerms := []corev1.PreferredSchedulingTerm{} - exps := []corev1.NodeSelectorRequirement{} - for _, key := range keys { - if selector[key] == "" { - continue - } - values := strings.Split(selector[key], ",") - // region,zone,rack,host are preferred labels, others are must match labels - if weight, ok := topologySchedulingWeight[key]; ok { - t := corev1.PreferredSchedulingTerm{ - Weight: weight, - Preference: corev1.NodeSelectorTerm{ - MatchExpressions: 
[]corev1.NodeSelectorRequirement{ - { - Key: key, - Operator: corev1.NodeSelectorOpIn, - Values: values, - }, - }, - }, - } - preferredTerms = append(preferredTerms, t) - } else { - requirement := corev1.NodeSelectorRequirement{ - Key: key, - Operator: corev1.NodeSelectorOpIn, - Values: values, - } - // NodeSelectorRequirement in the same MatchExpressions are ANDed otherwise ORed - exps = append(exps, requirement) - } - } - requiredTerms = append(requiredTerms, corev1.NodeSelectorTerm{MatchExpressions: exps}) - - affinity.NodeAffinity = &corev1.NodeAffinity{ - RequiredDuringSchedulingIgnoredDuringExecution: &corev1.NodeSelector{ - NodeSelectorTerms: requiredTerms, - }, - PreferredDuringSchedulingIgnoredDuringExecution: preferredTerms, - } - - return affinity -} - // ResourceRequirement creates ResourceRequirements for MemberSpec // Optionally pass in a default value func ResourceRequirement(spec v1alpha1.ContainerSpec, defaultRequests ...corev1.ResourceRequirements) corev1.ResourceRequirements { diff --git a/pkg/util/utils_test.go b/pkg/util/utils_test.go index 2163883401..77101bd5d6 100644 --- a/pkg/util/utils_test.go +++ b/pkg/util/utils_test.go @@ -20,204 +20,8 @@ import ( "github.com/pingcap/tidb-operator/pkg/apis/pingcap.com/v1alpha1" corev1 "k8s.io/api/core/v1" "k8s.io/apimachinery/pkg/api/resource" - metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" - "k8s.io/kubernetes/pkg/kubelet/apis" ) -func TestAffinityForNodeSelector(t *testing.T) { - g := NewGomegaWithT(t) - type testcase struct { - name string - required bool - antiLabels map[string]string - selector map[string]string - expectFn func(*GomegaWithT, *corev1.Affinity) - } - - antiLabels := map[string]string{"region": "region1", "zone": "zone1", "rack": "rack1", apis.LabelHostname: "host1"} - testFn := func(test *testcase, t *testing.T) { - t.Log(test.name) - test.expectFn(g, AffinityForNodeSelector(metav1.NamespaceDefault, test.required, test.antiLabels, test.selector)) - } - - tests := []testcase{ - { - name: "selector is nil", - required: false, - antiLabels: nil, - selector: nil, - expectFn: func(g *GomegaWithT, affinity *corev1.Affinity) { - g.Expect(affinity).To(BeNil()) - }, - }, - { - name: "required, antiLabels is nil", - required: true, - antiLabels: nil, - selector: map[string]string{"a": "a1,a2,a3", "b": "b1"}, - expectFn: func(g *GomegaWithT, affinity *corev1.Affinity) { - affi := &corev1.Affinity{ - NodeAffinity: &corev1.NodeAffinity{ - RequiredDuringSchedulingIgnoredDuringExecution: &corev1.NodeSelector{ - NodeSelectorTerms: []corev1.NodeSelectorTerm{ - { - MatchExpressions: []corev1.NodeSelectorRequirement{ - { - Key: "a", - Operator: corev1.NodeSelectorOpIn, - Values: []string{"a1", "a2", "a3"}, - }, - { - Key: "b", - Operator: corev1.NodeSelectorOpIn, - Values: []string{"b1"}, - }, - }, - }, - }, - }, - }, - } - g.Expect(affinity).To(Equal(affi)) - }, - }, - { - name: "required, antiLabels is not nil", - required: true, - antiLabels: antiLabels, - selector: map[string]string{"a": "a1,a2,a3", "b": "b1"}, - expectFn: func(g *GomegaWithT, affinity *corev1.Affinity) { - affi := &corev1.Affinity{ - NodeAffinity: &corev1.NodeAffinity{ - RequiredDuringSchedulingIgnoredDuringExecution: &corev1.NodeSelector{ - NodeSelectorTerms: []corev1.NodeSelectorTerm{ - { - MatchExpressions: []corev1.NodeSelectorRequirement{ - { - Key: "a", - Operator: corev1.NodeSelectorOpIn, - Values: []string{"a1", "a2", "a3"}, - }, - { - Key: "b", - Operator: corev1.NodeSelectorOpIn, - Values: []string{"b1"}, - }, - }, - }, - }, - }, - }, - 
PodAntiAffinity: &corev1.PodAntiAffinity{ - PreferredDuringSchedulingIgnoredDuringExecution: []corev1.WeightedPodAffinityTerm{ - { - Weight: 80, - PodAffinityTerm: corev1.PodAffinityTerm{ - LabelSelector: &metav1.LabelSelector{MatchLabels: antiLabels}, - TopologyKey: apis.LabelHostname, - Namespaces: []string{metav1.NamespaceDefault}, - }, - }, - { - Weight: 40, - PodAffinityTerm: corev1.PodAffinityTerm{ - LabelSelector: &metav1.LabelSelector{MatchLabels: antiLabels}, - TopologyKey: "rack", - Namespaces: []string{metav1.NamespaceDefault}, - }, - }, - { - Weight: 10, - PodAffinityTerm: corev1.PodAffinityTerm{ - LabelSelector: &metav1.LabelSelector{MatchLabels: antiLabels}, - TopologyKey: "region", - Namespaces: []string{metav1.NamespaceDefault}, - }, - }, - { - Weight: 20, - PodAffinityTerm: corev1.PodAffinityTerm{ - LabelSelector: &metav1.LabelSelector{MatchLabels: antiLabels}, - TopologyKey: "zone", - Namespaces: []string{metav1.NamespaceDefault}, - }, - }, - }, - }, - } - g.Expect(affinity).To(Equal(affi)) - }, - }, - { - name: "not required", - required: false, - antiLabels: nil, - selector: map[string]string{ - "region": "region1", - "zone": "zone1,zone2", - "rack": "", - "a": "a1,a2,a3", - "b": "b1", - }, - expectFn: func(g *GomegaWithT, affinity *corev1.Affinity) { - affi := &corev1.Affinity{ - NodeAffinity: &corev1.NodeAffinity{ - RequiredDuringSchedulingIgnoredDuringExecution: &corev1.NodeSelector{ - NodeSelectorTerms: []corev1.NodeSelectorTerm{ - { - MatchExpressions: []corev1.NodeSelectorRequirement{ - { - Key: "a", - Operator: corev1.NodeSelectorOpIn, - Values: []string{"a1", "a2", "a3"}, - }, - { - Key: "b", - Operator: corev1.NodeSelectorOpIn, - Values: []string{"b1"}, - }, - }, - }, - }, - }, - PreferredDuringSchedulingIgnoredDuringExecution: []corev1.PreferredSchedulingTerm{ - { - Weight: 10, - Preference: corev1.NodeSelectorTerm{ - MatchExpressions: []corev1.NodeSelectorRequirement{ - { - Key: "region", - Operator: corev1.NodeSelectorOpIn, - Values: []string{"region1"}, - }, - }, - }, - }, - { - Weight: 20, - Preference: corev1.NodeSelectorTerm{ - MatchExpressions: []corev1.NodeSelectorRequirement{ - { - Key: "zone", - Operator: corev1.NodeSelectorOpIn, - Values: []string{"zone1", "zone2"}, - }, - }, - }, - }, - }, - }, - } - g.Expect(affinity).To(Equal(affi)) - }, - }, - } - - for i := range tests { - testFn(&tests[i], t) - } -} - func TestResourceRequirement(t *testing.T) { g := NewGomegaWithT(t) type testcase struct {