diff --git a/docs/api-references/docs.md b/docs/api-references/docs.md index e029104f29..4986ec7b57 100644 --- a/docs/api-references/docs.md +++ b/docs/api-references/docs.md @@ -2756,21 +2756,13 @@ If not set, the default ScaleOutIntervalSeconds will be set to 300

metrics
- -[]Kubernetes autoscaling/v2beta2.MetricSpec + +[]CustomMetric (Optional) -

metrics contains the specifications for which to use to calculate the -desired replica count (the maximum replica count across all metrics will -be used). The desired replica count is calculated multiplying the -ratio between the target value and the current value by the current -number of pods. Ergo, metrics used must decrease as the pod count is -increased, and vice-versa. See the individual metric source types for -more information about how each type of metric must respond. -If not set, the default metric will be set to 80% average CPU utilization.

@@ -3613,6 +3605,77 @@ CrdKind +

CustomMetric

+

+(Appears on: +BasicAutoScalerSpec) +

+

+

+ + + + + + + + + + + + + + + + + + + + + +
FieldDescription
+MetricSpec
+ + +Kubernetes autoscaling/v2beta2.MetricSpec + + +
+

+(Members of MetricSpec are embedded into this type.) +

+(Optional) +

metrics contains the specifications for which to use to calculate the +desired replica count (the maximum replica count across all metrics will +be used). The desired replica count is calculated multiplying the +ratio between the target value and the current value by the current +number of pods. Ergo, metrics used must decrease as the pod count is +increased, and vice-versa. See the individual metric source types for +more information about how each type of metric must respond. +If not set, the auto-scaling won’t happen.

+
+LeastStoragePressurePeriodSeconds
+ +int64 + +
+(Optional) +

LeastStoragePressurePeriodSeconds is only for the storage auto-scaling case, when the resource name in the metricSpec +is Storage. When the storage metrics indicate pressure, the Operator waits for +LeastStoragePressurePeriodSeconds before it is allowed to scale out. +If not set, the default value is 300

+
+leastRemainAvailableStoragePercent
+ +int64 + +
+(Optional) +

LeastRemainAvailableStoragePercent indicates the least remaining available storage percent compared to +the capacity storage. If the available storage is lower than the capacity storage * LeastRemainAvailableStoragePercent, +the storage status becomes storage pressure and the cluster is ready to be scaled out. +LeastRemainAvailableStoragePercent should be between 5 and 90. If not set, the default value is 10

+
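For context, here is a minimal sketch (not part of this patch) of how a `CustomMetric` list could be populated in Go. It mirrors the constructions used by the new unit and e2e tests later in this diff; the 80% utilization target, the 300-second period, and the 10-percent threshold are just the documented defaults, and the helper function name is invented for illustration.

```go
package main

import (
	"fmt"

	"github.com/pingcap/tidb-operator/pkg/apis/pingcap/v1alpha1"
	autoscalingv2beta2 "k8s.io/api/autoscaling/v2beta2"
	corev1 "k8s.io/api/core/v1"
	"k8s.io/utils/pointer"
)

// exampleTiKVMetrics is a hypothetical helper showing one CPU metric and one
// storage metric expressed with the new CustomMetric type.
func exampleTiKVMetrics() []v1alpha1.CustomMetric {
	return []v1alpha1.CustomMetric{
		{
			// CPU metric: the embedded autoscaling/v2beta2.MetricSpec keeps its old shape.
			MetricSpec: autoscalingv2beta2.MetricSpec{
				Type: autoscalingv2beta2.ResourceMetricSourceType,
				Resource: &autoscalingv2beta2.ResourceMetricSource{
					Name: corev1.ResourceCPU,
					Target: autoscalingv2beta2.MetricTarget{
						AverageUtilization: pointer.Int32Ptr(80),
					},
				},
			},
		},
		{
			// Storage metric: the two new fields only take effect when the
			// resource name is storage.
			MetricSpec: autoscalingv2beta2.MetricSpec{
				Type: autoscalingv2beta2.ResourceMetricSourceType,
				Resource: &autoscalingv2beta2.ResourceMetricSource{
					Name: corev1.ResourceStorage,
				},
			},
			LeastStoragePressurePeriodSeconds:  pointer.Int64Ptr(300), // documented default
			LeastRemainAvailableStoragePercent: pointer.Int64Ptr(10),  // documented default
		},
	}
}

func main() {
	fmt.Println(len(exampleTiKVMetrics()))
}
```

Because `MetricSpec` is embedded inline, existing CPU-based specs keep their shape while the two storage-only knobs ride alongside them.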

DashboardConfig

(Appears on: @@ -4931,6 +4994,7 @@ string +(Optional)

CurrentValue indicates the value calculated in the last auto-scaling reconciliation

@@ -4942,9 +5006,26 @@ string +(Optional)

TargetValue indicates the threshold value for this metric in auto-scaling

+ + +StorageMetricsStatus
+ + +StorageMetricsStatus + + + + +

+(Members of StorageMetricsStatus are embedded into this type.) +

+(Optional) + +

MonitorComponentAccessor

@@ -8813,6 +8894,86 @@ More info: StorageMetricsStatus +

+(Appears on: +MetricsStatus) +

+

+

StorageMetricsStatus describes the storage metrics status in the last auto-scaling reconciliation

+

+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
FieldDescription
+storagePressure
+ +bool + +
+(Optional) +

StoragePressure indicates whether the storage is under pressure

+
+storagePressureStartTime
+ + +Kubernetes meta/v1.Time + + +
+(Optional) +

StoragePressureStartTime indicates the timestamp when StoragePressure first became true from false or nil

+
+availableStorage
+ +string + +
+(Optional) +
+capacityStorage
+ +string + +
+(Optional) +
+baselineAvailableStorage
+ +string + +
+

BaselineAvailableStorage indicates the baseline for the available storage size. +This is calculated as the capacity storage size * the storage auto-scaling baseline percent value. +If the AvailableStorage is less than the BaselineAvailableStorage, the database is under StoragePressure. +optional

+
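The fields above are produced by the storage-pressure rule introduced in calculate/storage.go and tikv_autoscaler.go further down in this diff. A condensed, self-contained sketch of that rule, assuming plain uint64 byte counts and illustrative sizes (the function names here are not from the patch):

```go
package main

import (
	"fmt"
	"time"
)

// underStoragePressure reports pressure when the available bytes drop below
// capacity * leastRemainAvailableStoragePercent / 100 (the baseline).
func underStoragePressure(capacity, available, leastRemainAvailableStoragePercent uint64) bool {
	baseline := (capacity / 100) * leastRemainAvailableStoragePercent
	return available < baseline
}

// ableToScaleOut requires the pressure to have lasted at least
// leastStoragePressurePeriodSeconds before a scale-out is allowed.
func ableToScaleOut(pressureStart, now time.Time, leastStoragePressurePeriodSeconds int64) bool {
	return now.Sub(pressureStart).Seconds() >= float64(leastStoragePressurePeriodSeconds)
}

func main() {
	capacity := uint64(100 * 1000 * 1000 * 1000) // 100 GB total
	available := uint64(8 * 1000 * 1000 * 1000)  // 8 GB left
	fmt.Println(underStoragePressure(capacity, available, 10)) // true: 8 GB < 10 GB baseline

	start := time.Now().Add(-6 * time.Minute)
	fmt.Println(ableToScaleOut(start, time.Now(), 300)) // true: pressure has lasted > 300s
}
```

Note that the real code also clears StoragePressureStartTime once the pressure goes away, so the waiting period restarts on the next pressure episode.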

StorageProvider

(Appears on: diff --git a/manifests/crd.yaml b/manifests/crd.yaml index 7fa7abf382..5b9303197e 100644 --- a/manifests/crd.yaml +++ b/manifests/crd.yaml @@ -13485,16 +13485,25 @@ spec: metrics: items: properties: + availableStorage: + type: string + baselineAvailableStorage: + type: string + capacityStorage: + type: string currentValue: type: string name: type: string + storagePressure: + type: boolean + storagePressureStartTime: + format: date-time + type: string thresholdValue: type: string required: - name - - currentValue - - thresholdValue type: object type: array recommendedReplicas: @@ -13514,16 +13523,25 @@ spec: metrics: items: properties: + availableStorage: + type: string + baselineAvailableStorage: + type: string + capacityStorage: + type: string currentValue: type: string name: type: string + storagePressure: + type: boolean + storagePressureStartTime: + format: date-time + type: string thresholdValue: type: string required: - name - - currentValue - - thresholdValue type: object type: array recommendedReplicas: diff --git a/pkg/apis/pingcap/v1alpha1/openapi_generated.go b/pkg/apis/pingcap/v1alpha1/openapi_generated.go index a536c63ed3..990e490ec1 100644 --- a/pkg/apis/pingcap/v1alpha1/openapi_generated.go +++ b/pkg/apis/pingcap/v1alpha1/openapi_generated.go @@ -91,6 +91,7 @@ func GetOpenAPIDefinitions(ref common.ReferenceCallback) map[string]common.OpenA "github.com/pingcap/tidb-operator/pkg/apis/pingcap/v1alpha1.Status": schema_pkg_apis_pingcap_v1alpha1_Status(ref), "github.com/pingcap/tidb-operator/pkg/apis/pingcap/v1alpha1.StmtSummary": schema_pkg_apis_pingcap_v1alpha1_StmtSummary(ref), "github.com/pingcap/tidb-operator/pkg/apis/pingcap/v1alpha1.StorageClaim": schema_pkg_apis_pingcap_v1alpha1_StorageClaim(ref), + "github.com/pingcap/tidb-operator/pkg/apis/pingcap/v1alpha1.StorageMetricsStatus": schema_pkg_apis_pingcap_v1alpha1_StorageMetricsStatus(ref), "github.com/pingcap/tidb-operator/pkg/apis/pingcap/v1alpha1.StorageProvider": schema_pkg_apis_pingcap_v1alpha1_StorageProvider(ref), "github.com/pingcap/tidb-operator/pkg/apis/pingcap/v1alpha1.TiCDCConfig": schema_pkg_apis_pingcap_v1alpha1_TiCDCConfig(ref), "github.com/pingcap/tidb-operator/pkg/apis/pingcap/v1alpha1.TiCDCSpec": schema_pkg_apis_pingcap_v1alpha1_TiCDCSpec(ref), @@ -899,12 +900,11 @@ func schema_pkg_apis_pingcap_v1alpha1_BasicAutoScalerSpec(ref common.ReferenceCa }, "metrics": { SchemaProps: spec.SchemaProps{ - Description: "metrics contains the specifications for which to use to calculate the desired replica count (the maximum replica count across all metrics will be used). The desired replica count is calculated multiplying the ratio between the target value and the current value by the current number of pods. Ergo, metrics used must decrease as the pod count is increased, and vice-versa. See the individual metric source types for more information about how each type of metric must respond. 
If not set, the default metric will be set to 80% average CPU utilization.", - Type: []string{"array"}, + Type: []string{"array"}, Items: &spec.SchemaOrArray{ Schema: &spec.Schema{ SchemaProps: spec.SchemaProps{ - Ref: ref("k8s.io/api/autoscaling/v2beta2.MetricSpec"), + Ref: ref("github.com/pingcap/tidb-operator/pkg/apis/pingcap/v1alpha1.CustomMetric"), }, }, }, @@ -928,7 +928,7 @@ func schema_pkg_apis_pingcap_v1alpha1_BasicAutoScalerSpec(ref common.ReferenceCa }, }, Dependencies: []string{ - "github.com/pingcap/tidb-operator/pkg/apis/pingcap/v1alpha1.ExternalEndpoint", "k8s.io/api/autoscaling/v2beta2.MetricSpec"}, + "github.com/pingcap/tidb-operator/pkg/apis/pingcap/v1alpha1.CustomMetric", "github.com/pingcap/tidb-operator/pkg/apis/pingcap/v1alpha1.ExternalEndpoint"}, } } @@ -2015,10 +2015,44 @@ func schema_pkg_apis_pingcap_v1alpha1_MetricsStatus(ref common.ReferenceCallback Format: "", }, }, + "storagePressure": { + SchemaProps: spec.SchemaProps{ + Description: "StoragePressure indicates whether storage under pressure", + Type: []string{"boolean"}, + Format: "", + }, + }, + "storagePressureStartTime": { + SchemaProps: spec.SchemaProps{ + Description: "StoragePressureStartTime indicates the timestamp of the StoragePressure fist become true from false or nil", + Ref: ref("k8s.io/apimachinery/pkg/apis/meta/v1.Time"), + }, + }, + "availableStorage": { + SchemaProps: spec.SchemaProps{ + Type: []string{"string"}, + Format: "", + }, + }, + "capacityStorage": { + SchemaProps: spec.SchemaProps{ + Type: []string{"string"}, + Format: "", + }, + }, + "baselineAvailableStorage": { + SchemaProps: spec.SchemaProps{ + Description: "BaselineAvailableStorage indicates the baseline for available storage size. This is calculated by the capacity storage size * storage auto-scaling baseline percent value If the AvailableStorage is less than the BaselineAvailableStorage, the database is under StoragePressure optional", + Type: []string{"string"}, + Format: "", + }, + }, }, - Required: []string{"name", "currentValue", "thresholdValue"}, + Required: []string{"name"}, }, }, + Dependencies: []string{ + "k8s.io/apimachinery/pkg/apis/meta/v1.Time"}, } } @@ -4317,6 +4351,53 @@ func schema_pkg_apis_pingcap_v1alpha1_StorageClaim(ref common.ReferenceCallback) } } +func schema_pkg_apis_pingcap_v1alpha1_StorageMetricsStatus(ref common.ReferenceCallback) common.OpenAPIDefinition { + return common.OpenAPIDefinition{ + Schema: spec.Schema{ + SchemaProps: spec.SchemaProps{ + Description: "StorageMetricsStatus describe the storage metrics status in the last auto-scaling reconciliation", + Type: []string{"object"}, + Properties: map[string]spec.Schema{ + "storagePressure": { + SchemaProps: spec.SchemaProps{ + Description: "StoragePressure indicates whether storage under pressure", + Type: []string{"boolean"}, + Format: "", + }, + }, + "storagePressureStartTime": { + SchemaProps: spec.SchemaProps{ + Description: "StoragePressureStartTime indicates the timestamp of the StoragePressure fist become true from false or nil", + Ref: ref("k8s.io/apimachinery/pkg/apis/meta/v1.Time"), + }, + }, + "availableStorage": { + SchemaProps: spec.SchemaProps{ + Type: []string{"string"}, + Format: "", + }, + }, + "capacityStorage": { + SchemaProps: spec.SchemaProps{ + Type: []string{"string"}, + Format: "", + }, + }, + "baselineAvailableStorage": { + SchemaProps: spec.SchemaProps{ + Description: "BaselineAvailableStorage indicates the baseline for available storage size. 
This is calculated by the capacity storage size * storage auto-scaling baseline percent value If the AvailableStorage is less than the BaselineAvailableStorage, the database is under StoragePressure optional", + Type: []string{"string"}, + Format: "", + }, + }, + }, + }, + }, + Dependencies: []string{ + "k8s.io/apimachinery/pkg/apis/meta/v1.Time"}, + } +} + func schema_pkg_apis_pingcap_v1alpha1_StorageProvider(ref common.ReferenceCallback) common.OpenAPIDefinition { return common.OpenAPIDefinition{ Schema: spec.Schema{ @@ -8762,12 +8843,11 @@ func schema_pkg_apis_pingcap_v1alpha1_TidbAutoScalerSpec(ref common.ReferenceCal }, "metrics": { SchemaProps: spec.SchemaProps{ - Description: "metrics contains the specifications for which to use to calculate the desired replica count (the maximum replica count across all metrics will be used). The desired replica count is calculated multiplying the ratio between the target value and the current value by the current number of pods. Ergo, metrics used must decrease as the pod count is increased, and vice-versa. See the individual metric source types for more information about how each type of metric must respond. If not set, the default metric will be set to 80% average CPU utilization.", - Type: []string{"array"}, + Type: []string{"array"}, Items: &spec.SchemaOrArray{ Schema: &spec.Schema{ SchemaProps: spec.SchemaProps{ - Ref: ref("k8s.io/api/autoscaling/v2beta2.MetricSpec"), + Ref: ref("github.com/pingcap/tidb-operator/pkg/apis/pingcap/v1alpha1.CustomMetric"), }, }, }, @@ -8791,7 +8871,7 @@ func schema_pkg_apis_pingcap_v1alpha1_TidbAutoScalerSpec(ref common.ReferenceCal }, }, Dependencies: []string{ - "github.com/pingcap/tidb-operator/pkg/apis/pingcap/v1alpha1.ExternalEndpoint", "k8s.io/api/autoscaling/v2beta2.MetricSpec"}, + "github.com/pingcap/tidb-operator/pkg/apis/pingcap/v1alpha1.CustomMetric", "github.com/pingcap/tidb-operator/pkg/apis/pingcap/v1alpha1.ExternalEndpoint"}, } } @@ -9900,12 +9980,11 @@ func schema_pkg_apis_pingcap_v1alpha1_TikvAutoScalerSpec(ref common.ReferenceCal }, "metrics": { SchemaProps: spec.SchemaProps{ - Description: "metrics contains the specifications for which to use to calculate the desired replica count (the maximum replica count across all metrics will be used). The desired replica count is calculated multiplying the ratio between the target value and the current value by the current number of pods. Ergo, metrics used must decrease as the pod count is increased, and vice-versa. See the individual metric source types for more information about how each type of metric must respond. 
If not set, the default metric will be set to 80% average CPU utilization.", - Type: []string{"array"}, + Type: []string{"array"}, Items: &spec.SchemaOrArray{ Schema: &spec.Schema{ SchemaProps: spec.SchemaProps{ - Ref: ref("k8s.io/api/autoscaling/v2beta2.MetricSpec"), + Ref: ref("github.com/pingcap/tidb-operator/pkg/apis/pingcap/v1alpha1.CustomMetric"), }, }, }, @@ -9929,7 +10008,7 @@ func schema_pkg_apis_pingcap_v1alpha1_TikvAutoScalerSpec(ref common.ReferenceCal }, }, Dependencies: []string{ - "github.com/pingcap/tidb-operator/pkg/apis/pingcap/v1alpha1.ExternalEndpoint", "k8s.io/api/autoscaling/v2beta2.MetricSpec"}, + "github.com/pingcap/tidb-operator/pkg/apis/pingcap/v1alpha1.CustomMetric", "github.com/pingcap/tidb-operator/pkg/apis/pingcap/v1alpha1.ExternalEndpoint"}, } } diff --git a/pkg/apis/pingcap/v1alpha1/tidbclusterautoscaler_types.go b/pkg/apis/pingcap/v1alpha1/tidbclusterautoscaler_types.go index 825967de4a..f8a7d0dc89 100644 --- a/pkg/apis/pingcap/v1alpha1/tidbclusterautoscaler_types.go +++ b/pkg/apis/pingcap/v1alpha1/tidbclusterautoscaler_types.go @@ -115,6 +115,19 @@ type BasicAutoScalerSpec struct { // +optional ScaleOutIntervalSeconds *int32 `json:"scaleOutIntervalSeconds,omitempty"` + // +optional + Metrics []CustomMetric `json:"metrics,omitempty"` + + // MetricsTimeDuration describe the Time duration to be queried in the Prometheus + // +optional + MetricsTimeDuration *string `json:"metricsTimeDuration,omitempty"` + // ExternalEndpoint makes the auto-scaler controller able to query the external service + // to fetch the recommended replicas for TiKV/TiDB + // +optional + ExternalEndpoint *ExternalEndpoint `json:"externalEndpoint,omitempty"` +} + +type CustomMetric struct { // metrics contains the specifications for which to use to calculate the // desired replica count (the maximum replica count across all metrics will // be used). The desired replica count is calculated multiplying the @@ -122,17 +135,21 @@ type BasicAutoScalerSpec struct { // number of pods. Ergo, metrics used must decrease as the pod count is // increased, and vice-versa. See the individual metric source types for // more information about how each type of metric must respond. - // If not set, the default metric will be set to 80% average CPU utilization. + // If not set, the auto-scaling won't happen. // +optional - Metrics []v2beta2.MetricSpec `json:"metrics,omitempty"` - - // MetricsTimeDuration describe the Time duration to be queried in the Prometheus + v2beta2.MetricSpec `json:",inline"` + // LeastStoragePressurePeriodSeconds is only for the storage auto-scaling case when the resource name in the metricSpec + // is `Storage`. When the Storage metrics meet the pressure, Operator would wait + // LeastStoragePressurePeriodSeconds duration then able to scale out. + // If not set, the default value is `300` // +optional - MetricsTimeDuration *string `json:"metricsTimeDuration,omitempty"` - // ExternalEndpoint makes the auto-scaler controller able to query the external service - // to fetch the recommended replicas for TiKV/TiDB + LeastStoragePressurePeriodSeconds *int64 `json:"leastStoragePressurePeriodSeconds,omitempty"` + // LeastRemainAvailableStoragePercent indicates the least remaining available storage percent compare to + // the capacity storage. If the available storage is lower than the capacity storage * LeastRemainAvailableStoragePercent, + // the storage status will become storage pressure and ready to be scaled out. + // LeastRemainAvailableStoragePercent should between 5 and 90. 
If note set, the default value would be 10 // +optional - ExternalEndpoint *ExternalEndpoint `json:"externalEndpoint,omitempty"` + LeastRemainAvailableStoragePercent *int64 `json:"leastRemainAvailableStoragePercent,omitempty"` } // +k8s:openapi-gen=true @@ -196,9 +213,33 @@ type MetricsStatus struct { // Name indicates the metrics name Name string `json:"name"` // CurrentValue indicates the value calculated in the last auto-scaling reconciliation - CurrentValue string `json:"currentValue"` + // +optional + CurrentValue *string `json:"currentValue,omitempty"` // TargetValue indicates the threshold value for this metrics in auto-scaling - ThresholdValue string `json:"thresholdValue"` + // +optional + ThresholdValue *string `json:"thresholdValue,omitempty"` + // +optional + StorageMetricsStatus `json:",inline"` +} + +// +k8s:openapi-gen=true +// StorageMetricsStatus describe the storage metrics status in the last auto-scaling reconciliation +type StorageMetricsStatus struct { + // StoragePressure indicates whether storage under pressure + // +optional + StoragePressure *bool `json:"storagePressure,omitempty"` + // StoragePressureStartTime indicates the timestamp of the StoragePressure fist become true from false or nil + // +optional + StoragePressureStartTime *metav1.Time `json:"storagePressureStartTime,omitempty"` + // +optional + AvailableStorage *string `json:"availableStorage,omitempty"` + // +optional + CapacityStorage *string `json:"capacityStorage,omitempty"` + // BaselineAvailableStorage indicates the baseline for available storage size. + // This is calculated by the capacity storage size * storage auto-scaling baseline percent value + // If the AvailableStorage is less than the BaselineAvailableStorage, the database is under StoragePressure + // optional + BaselineAvailableStorage *string `json:"baselineAvailableStorage,omitempty"` } // +k8s:openapi-gen=true diff --git a/pkg/apis/pingcap/v1alpha1/zz_generated.deepcopy.go b/pkg/apis/pingcap/v1alpha1/zz_generated.deepcopy.go index 01554c87b9..555c36b4bb 100644 --- a/pkg/apis/pingcap/v1alpha1/zz_generated.deepcopy.go +++ b/pkg/apis/pingcap/v1alpha1/zz_generated.deepcopy.go @@ -21,7 +21,6 @@ import ( time "time" appsv1 "k8s.io/api/apps/v1" - v2beta2 "k8s.io/api/autoscaling/v2beta2" v1 "k8s.io/api/core/v1" extensionsv1beta1 "k8s.io/api/extensions/v1beta1" v1beta1 "k8s.io/apiextensions-apiserver/pkg/apis/apiextensions/v1beta1" @@ -370,7 +369,7 @@ func (in *BasicAutoScalerSpec) DeepCopyInto(out *BasicAutoScalerSpec) { } if in.Metrics != nil { in, out := &in.Metrics, &out.Metrics - *out = make([]v2beta2.MetricSpec, len(*in)) + *out = make([]CustomMetric, len(*in)) for i := range *in { (*in)[i].DeepCopyInto(&(*out)[i]) } @@ -404,7 +403,9 @@ func (in *BasicAutoScalerStatus) DeepCopyInto(out *BasicAutoScalerStatus) { if in.MetricsStatusList != nil { in, out := &in.MetricsStatusList, &out.MetricsStatusList *out = make([]MetricsStatus, len(*in)) - copy(*out, *in) + for i := range *in { + (*in)[i].DeepCopyInto(&(*out)[i]) + } } if in.LastAutoScalingTimestamp != nil { in, out := &in.LastAutoScalingTimestamp, &out.LastAutoScalingTimestamp @@ -791,6 +792,33 @@ func (in *CrdKinds) DeepCopy() *CrdKinds { return out } +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. 
+func (in *CustomMetric) DeepCopyInto(out *CustomMetric) { + *out = *in + in.MetricSpec.DeepCopyInto(&out.MetricSpec) + if in.LeastStoragePressurePeriodSeconds != nil { + in, out := &in.LeastStoragePressurePeriodSeconds, &out.LeastStoragePressurePeriodSeconds + *out = new(int64) + **out = **in + } + if in.LeastRemainAvailableStoragePercent != nil { + in, out := &in.LeastRemainAvailableStoragePercent, &out.LeastRemainAvailableStoragePercent + *out = new(int64) + **out = **in + } + return +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new CustomMetric. +func (in *CustomMetric) DeepCopy() *CustomMetric { + if in == nil { + return nil + } + out := new(CustomMetric) + in.DeepCopyInto(out) + return out +} + // DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. func (in *DashboardConfig) DeepCopyInto(out *DashboardConfig) { *out = *in @@ -1726,6 +1754,17 @@ func (in *MasterKeyKMSConfig) DeepCopy() *MasterKeyKMSConfig { // DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. func (in *MetricsStatus) DeepCopyInto(out *MetricsStatus) { *out = *in + if in.CurrentValue != nil { + in, out := &in.CurrentValue, &out.CurrentValue + *out = new(string) + **out = **in + } + if in.ThresholdValue != nil { + in, out := &in.ThresholdValue, &out.ThresholdValue + *out = new(string) + **out = **in + } + in.StorageMetricsStatus.DeepCopyInto(&out.StorageMetricsStatus) return } @@ -3639,6 +3678,46 @@ func (in *StorageClaim) DeepCopy() *StorageClaim { return out } +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *StorageMetricsStatus) DeepCopyInto(out *StorageMetricsStatus) { + *out = *in + if in.StoragePressure != nil { + in, out := &in.StoragePressure, &out.StoragePressure + *out = new(bool) + **out = **in + } + if in.StoragePressureStartTime != nil { + in, out := &in.StoragePressureStartTime, &out.StoragePressureStartTime + *out = (*in).DeepCopy() + } + if in.AvailableStorage != nil { + in, out := &in.AvailableStorage, &out.AvailableStorage + *out = new(string) + **out = **in + } + if in.CapacityStorage != nil { + in, out := &in.CapacityStorage, &out.CapacityStorage + *out = new(string) + **out = **in + } + if in.BaselineAvailableStorage != nil { + in, out := &in.BaselineAvailableStorage, &out.BaselineAvailableStorage + *out = new(string) + **out = **in + } + return +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new StorageMetricsStatus. +func (in *StorageMetricsStatus) DeepCopy() *StorageMetricsStatus { + if in == nil { + return nil + } + out := new(StorageMetricsStatus) + in.DeepCopyInto(out) + return out +} + // DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. 
func (in *StorageProvider) DeepCopyInto(out *StorageProvider) { *out = *in diff --git a/pkg/autoscaler/autoscaler/autoscaler_manager.go b/pkg/autoscaler/autoscaler/autoscaler_manager.go index 1ce1d19fe9..951242087b 100644 --- a/pkg/autoscaler/autoscaler/autoscaler_manager.go +++ b/pkg/autoscaler/autoscaler/autoscaler_manager.go @@ -16,7 +16,6 @@ package autoscaler import ( "encoding/json" "fmt" - "strconv" "strings" "time" @@ -147,47 +146,22 @@ func (am *autoScalerManager) updateAutoScaling(oldTc *v1alpha1.TidbCluster, if tac.Annotations == nil { tac.Annotations = map[string]string{} } - f := func(key string) (*time.Time, error) { - v, ok := tac.Annotations[key] - if ok { - ts, err := strconv.ParseInt(v, 10, 64) - if err != nil { - klog.Errorf("failed to convert label[%s] key to int64, err:%v", key, err) - return nil, err - } - t := time.Unix(ts, 0) - return &t, nil - } - return nil, nil - } - - tac.Annotations[label.AnnLastSyncingTimestamp] = fmt.Sprintf("%d", time.Now().Unix()) - + now := time.Now() + tac.Annotations[label.AnnLastSyncingTimestamp] = fmt.Sprintf("%d", now.Unix()) if tac.Spec.TiKV != nil { if oldTc.Status.TiKV.StatefulSet != nil { tac.Status.TiKV.CurrentReplicas = oldTc.Status.TiKV.StatefulSet.CurrentReplicas } - lastTimestamp, err := f(label.AnnTiKVLastAutoScalingTimestamp) - if err != nil { - return err - } - if lastTimestamp != nil { - tac.Status.TiKV.LastAutoScalingTimestamp = &metav1.Time{Time: *lastTimestamp} - } + tac.Status.TiKV.LastAutoScalingTimestamp = &metav1.Time{Time: now} } else { tac.Status.TiKV = nil } + if tac.Spec.TiDB != nil { if oldTc.Status.TiDB.StatefulSet != nil { tac.Status.TiDB.CurrentReplicas = oldTc.Status.TiDB.StatefulSet.CurrentReplicas } - lastTimestamp, err := f(label.AnnTiDBLastAutoScalingTimestamp) - if err != nil { - return err - } - if lastTimestamp != nil { - tac.Status.TiDB.LastAutoScalingTimestamp = &metav1.Time{Time: *lastTimestamp} - } + tac.Status.TiDB.LastAutoScalingTimestamp = &metav1.Time{Time: now} } else { tac.Status.TiDB = nil } diff --git a/pkg/autoscaler/autoscaler/calculate/calculate.go b/pkg/autoscaler/autoscaler/calculate/calculate.go index 6509712e2b..9194c492c1 100644 --- a/pkg/autoscaler/autoscaler/calculate/calculate.go +++ b/pkg/autoscaler/autoscaler/calculate/calculate.go @@ -24,12 +24,12 @@ import ( "github.com/pingcap/tidb-operator/pkg/apis/pingcap/v1alpha1" promClient "github.com/prometheus/client_golang/api" - autoscalingv2beta2 "k8s.io/api/autoscaling/v2beta2" "k8s.io/apimachinery/pkg/util/sets" "k8s.io/klog" ) const ( + TikvSumStorageMetricsPattern = `sum(tikv_store_size_bytes{cluster="%s", type="%s"}) by (cluster)` TikvSumCpuMetricsPattern = `sum(increase(tikv_thread_cpu_seconds_total{cluster="%s"}[%s])) by (instance)` TidbSumCpuMetricsPattern = `sum(increase(process_cpu_seconds_total{cluster="%s",job="tidb"}[%s])) by (instance)` InvalidTacMetricConfigureMsg = "tac[%s/%s] metric configuration invalid" @@ -44,7 +44,6 @@ type SingleQuery struct { Timestamp int64 Quary string Instances []string - Metric autoscalingv2beta2.MetricSpec } func queryMetricsFromPrometheus(tac *v1alpha1.TidbClusterAutoScaler, client promClient.Client, sq *SingleQuery, resp *Response) error { @@ -92,6 +91,7 @@ func sumForEachInstance(instances []string, resp *Response) (float64, error) { if len(resp.Data.Result) < 1 { return 0, fmt.Errorf("metrics Response return zero info") } + for _, r := range resp.Data.Result { if s.Has(r.Metric.Instance) { v, err := strconv.ParseFloat(r.Value[1].(string), 64) diff --git 
a/pkg/autoscaler/autoscaler/calculate/cpu.go b/pkg/autoscaler/autoscaler/calculate/cpu.go index 2d803fe205..36e763f925 100644 --- a/pkg/autoscaler/autoscaler/calculate/cpu.go +++ b/pkg/autoscaler/autoscaler/calculate/cpu.go @@ -20,7 +20,9 @@ import ( "github.com/pingcap/tidb-operator/pkg/apis/pingcap/v1alpha1" promClient "github.com/prometheus/client_golang/api" appsv1 "k8s.io/api/apps/v1" + autoscalingv2beta2 "k8s.io/api/autoscaling/v2beta2" corev1 "k8s.io/api/core/v1" + "k8s.io/utils/pointer" ) const ( @@ -29,10 +31,8 @@ const ( //TODO: create issue to explain how auto-scaling algorithm based on cpu metrics work func CalculateRecomendedReplicasByCpuCosts(tac *v1alpha1.TidbClusterAutoScaler, sq *SingleQuery, sts *appsv1.StatefulSet, - client promClient.Client, memberType v1alpha1.MemberType, duration time.Duration) (int32, error) { - metric := sq.Metric + client promClient.Client, memberType v1alpha1.MemberType, duration time.Duration, metric autoscalingv2beta2.MetricSpec) (int32, error) { instances := sq.Instances - if metric.Resource == nil || metric.Resource.Target.AverageUtilization == nil { return -1, fmt.Errorf(InvalidTacMetricConfigureMsg, tac.Namespace, tac.Name) } @@ -66,9 +66,9 @@ func CalculateRecomendedReplicasByCpuCosts(tac *v1alpha1.TidbClusterAutoScaler, return -1, err } metrics := v1alpha1.MetricsStatus{ - Name: string(MetricTypeCPU), - CurrentValue: fmt.Sprintf("%v", cpuSecsTotal), - ThresholdValue: fmt.Sprintf("%v", expectedCpuSecsTotal), + Name: string(corev1.ResourceCPU), + CurrentValue: pointer.StringPtr(fmt.Sprintf("%v", cpuSecsTotal)), + ThresholdValue: pointer.StringPtr(fmt.Sprintf("%v", expectedCpuSecsTotal)), } if memberType == v1alpha1.TiKVMemberType { addMetricsStatusIntoMetricsStatusList(metrics, &tac.Status.TiKV.BasicAutoScalerStatus) diff --git a/pkg/autoscaler/autoscaler/calculate/storage.go b/pkg/autoscaler/autoscaler/calculate/storage.go new file mode 100644 index 0000000000..effea4667a --- /dev/null +++ b/pkg/autoscaler/autoscaler/calculate/storage.go @@ -0,0 +1,140 @@ +// Copyright 2020 PingCAP, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +package calculate + +import ( + "fmt" + "strconv" + "time" + + "github.com/pingcap/tidb-operator/pkg/apis/pingcap/v1alpha1" + promClient "github.com/prometheus/client_golang/api" + corev1 "k8s.io/api/core/v1" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/utils/pointer" +) + +func CalculateWhetherStoragePressure(tac *v1alpha1.TidbClusterAutoScaler, capacitySq, availableSq *SingleQuery, + client promClient.Client, metric v1alpha1.CustomMetric) (bool, error) { + if metric.Resource == nil || + metric.Resource.Name != corev1.ResourceStorage || + metric.LeastRemainAvailableStoragePercent == nil { + return false, fmt.Errorf("tac[%s/%s] didn't set storage metric correctly", tac.Namespace, tac.Name) + } + + // query total available storage size + resp := &Response{} + err := queryMetricsFromPrometheus(tac, client, availableSq, resp) + if err != nil { + return false, err + } + var availableSize uint64 + for _, r := range resp.Data.Result { + if r.Metric.Cluster == tac.Spec.Cluster.Name { + availableSize, err = strconv.ParseUint(r.Value[1].(string), 10, 64) + if err != nil { + return false, err + } + } + } + + // query total capacity storage size + resp = &Response{} + err = queryMetricsFromPrometheus(tac, client, capacitySq, resp) + if err != nil { + return false, err + } + var capacitySize uint64 + for _, r := range resp.Data.Result { + if r.Metric.Cluster == tac.Spec.Cluster.Name { + capacitySize, err = strconv.ParseUint(r.Value[1].(string), 10, 64) + if err != nil { + return false, err + } + } + } + v := *metric.LeastRemainAvailableStoragePercent + baselineAvailableSize := (capacitySize / 100) * uint64(v) + storagePressure := false + if availableSize < baselineAvailableSize { + storagePressure = true + } + + var newStatus, oldStatus *v1alpha1.MetricsStatus + for _, m := range tac.Status.TiKV.MetricsStatusList { + if m.Name == string(corev1.ResourceStorage) { + oldStatus = &m + break + } + } + storageMetrics := v1alpha1.StorageMetricsStatus{ + StoragePressure: pointer.BoolPtr(storagePressure), + AvailableStorage: pointer.StringPtr(byteCountDecimal(availableSize)), + CapacityStorage: pointer.StringPtr(byteCountDecimal(capacitySize)), + BaselineAvailableStorage: pointer.StringPtr(byteCountDecimal(baselineAvailableSize)), + } + if oldStatus != nil { + oldStatus.StoragePressure = storageMetrics.StoragePressure + oldStatus.AvailableStorage = storageMetrics.AvailableStorage + oldStatus.CapacityStorage = storageMetrics.CapacityStorage + oldStatus.BaselineAvailableStorage = storageMetrics.BaselineAvailableStorage + newStatus = oldStatus + } else { + newStatus = &v1alpha1.MetricsStatus{ + Name: string(corev1.ResourceStorage), + StorageMetricsStatus: storageMetrics, + } + } + + if storagePressure { + if !isStoragePressureStartTimeRecordAlready(tac.Status) { + newStatus.StorageMetricsStatus.StoragePressureStartTime = &metav1.Time{Time: time.Now()} + } + } else { + newStatus.StoragePressureStartTime = nil + } + addMetricsStatusIntoMetricsStatusList(*newStatus, &tac.Status.TiKV.BasicAutoScalerStatus) + return storagePressure, nil +} + +// TODO: add unit test +func isStoragePressureStartTimeRecordAlready(tacStatus v1alpha1.TidbClusterAutoSclaerStatus) bool { + if tacStatus.TiKV == nil { + return false + } + if len(tacStatus.TiKV.MetricsStatusList) < 1 { + return false + } + for _, metricsStatus := range tacStatus.TiKV.MetricsStatusList { + if metricsStatus.Name == "storage" { + if metricsStatus.StoragePressureStartTime != nil { + return true + } + } + } + return false +} + +func 
byteCountDecimal(b uint64) string { + const unit = 1000 + if b < unit { + return fmt.Sprintf("%d B", b) + } + div, exp := int64(unit), 0 + for n := b / unit; n >= unit; n /= unit { + div *= unit + exp++ + } + return fmt.Sprintf("%.1f %cB", float64(b)/float64(div), "kMGTPE"[exp]) +} diff --git a/pkg/autoscaler/autoscaler/calculate/util.go b/pkg/autoscaler/autoscaler/calculate/util.go index 1523ba74e3..40d586219e 100644 --- a/pkg/autoscaler/autoscaler/calculate/util.go +++ b/pkg/autoscaler/autoscaler/calculate/util.go @@ -22,18 +22,10 @@ import ( corev1 "k8s.io/api/core/v1" ) -// MetricType describe the current Supported Metric Type to calculate the recommended Replicas -type MetricType string - -const ( - MetricTypeCPU MetricType = "cpu" - //metricTypeQPS MetricType = "qps" -) - // currently, we only choose one metrics to be computed. // If there exists several metrics, we tend to choose ResourceMetricSourceType metric -func FilterMetrics(metrics []autoscalingv2beta2.MetricSpec, name corev1.ResourceName) []autoscalingv2beta2.MetricSpec { - var list []autoscalingv2beta2.MetricSpec +func FilterMetrics(metrics []v1alpha1.CustomMetric, name corev1.ResourceName) []v1alpha1.CustomMetric { + var list []v1alpha1.CustomMetric for _, m := range metrics { if m.Type == autoscalingv2beta2.ResourceMetricSourceType && m.Resource != nil && m.Resource.Name == name { list = append(list, m) @@ -43,14 +35,6 @@ func FilterMetrics(metrics []autoscalingv2beta2.MetricSpec, name corev1.Resource return list } -// genMetricType return the supported MetricType in Operator by kubernetes auto-scaling MetricType -func GenMetricType(tac *v1alpha1.TidbClusterAutoScaler, metric autoscalingv2beta2.MetricSpec) (MetricType, error) { - if metric.Type == autoscalingv2beta2.ResourceMetricSourceType && metric.Resource != nil && metric.Resource.Name == corev1.ResourceCPU { - return MetricTypeCPU, nil - } - return "", fmt.Errorf(InvalidTacMetricConfigureMsg, tac.Namespace, tac.Name) -} - // filterContainer is to filter the specific container from the given statefulset(tidb/tikv) func filterContainer(tac *v1alpha1.TidbClusterAutoScaler, sts *appsv1.StatefulSet, containerName string) (*corev1.Container, error) { for _, c := range sts.Spec.Template.Spec.Containers { diff --git a/pkg/autoscaler/autoscaler/tidb_autoscaler.go b/pkg/autoscaler/autoscaler/tidb_autoscaler.go index 6a40d4ae18..83c79e0bcb 100644 --- a/pkg/autoscaler/autoscaler/tidb_autoscaler.go +++ b/pkg/autoscaler/autoscaler/tidb_autoscaler.go @@ -108,10 +108,9 @@ func calculateTidbMetrics(tac *v1alpha1.TidbClusterAutoScaler, sts *appsv1.State Endpoint: ep, Timestamp: time.Now().Unix(), Instances: instances, - Metric: metrics[0], Quary: fmt.Sprintf(calculate.TidbSumCpuMetricsPattern, tac.Spec.Cluster.Name, *tac.Spec.TiDB.MetricsTimeDuration), } - return calculate.CalculateRecomendedReplicasByCpuCosts(tac, sq, sts, client, v1alpha1.TiDBMemberType, duration) + return calculate.CalculateRecomendedReplicasByCpuCosts(tac, sq, sts, client, v1alpha1.TiDBMemberType, duration, metrics[0].MetricSpec) } return -1, fmt.Errorf(calculate.InvalidTacMetricConfigureMsg, tac.Namespace, tac.Name) } diff --git a/pkg/autoscaler/autoscaler/tikv_autoscaler.go b/pkg/autoscaler/autoscaler/tikv_autoscaler.go index 9c48094eea..95989499e5 100644 --- a/pkg/autoscaler/autoscaler/tikv_autoscaler.go +++ b/pkg/autoscaler/autoscaler/tikv_autoscaler.go @@ -20,11 +20,13 @@ import ( "github.com/pingcap/advanced-statefulset/client/apis/apps/v1/helper" "github.com/pingcap/tidb-operator/pkg/apis/pingcap/v1alpha1" 
"github.com/pingcap/tidb-operator/pkg/autoscaler/autoscaler/calculate" + "github.com/pingcap/tidb-operator/pkg/controller" "github.com/pingcap/tidb-operator/pkg/label" operatorUtils "github.com/pingcap/tidb-operator/pkg/util" promClient "github.com/prometheus/client_golang/api" appsv1 "k8s.io/api/apps/v1" corev1 "k8s.io/api/core/v1" + "k8s.io/klog" ) func (am *autoScalerManager) syncTiKV(tc *v1alpha1.TidbCluster, tac *v1alpha1.TidbClusterAutoScaler) error { @@ -69,25 +71,83 @@ func calculateTiKVMetrics(tac *v1alpha1.TidbClusterAutoScaler, tc *v1alpha1.Tidb if err != nil { return err } + if len(tac.Spec.TiKV.Metrics) < 1 { + klog.V(4).Infof("tac[%s/%s] have no setting, skip auto-scaling", tac.Namespace, tac.Name) + return nil + } + + metrics := calculate.FilterMetrics(tac.Spec.TiKV.Metrics, corev1.ResourceStorage) + if len(metrics) > 0 { + now := time.Now().Unix() + capacitySq := &calculate.SingleQuery{ + Endpoint: ep, + Timestamp: now, + Instances: instances, + Quary: fmt.Sprintf(calculate.TikvSumStorageMetricsPattern, tac.Spec.Cluster.Name, "capacity"), + } + availableSq := &calculate.SingleQuery{ + Endpoint: ep, + Timestamp: now, + Instances: instances, + Quary: fmt.Sprintf(calculate.TikvSumStorageMetricsPattern, tac.Spec.Cluster.Name, "available"), + } + return calculateTiKVStorageMetrics(tac, tc, capacitySq, availableSq, client, metrics[0]) + } // check CPU - metrics := calculate.FilterMetrics(tac.Spec.TiKV.Metrics, corev1.ResourceCPU) + metrics = calculate.FilterMetrics(tac.Spec.TiKV.Metrics, corev1.ResourceCPU) if len(metrics) > 0 { sq := &calculate.SingleQuery{ Endpoint: ep, Timestamp: time.Now().Unix(), Instances: instances, - Metric: metrics[0], Quary: fmt.Sprintf(calculate.TikvSumCpuMetricsPattern, tac.Spec.Cluster.Name, *tac.Spec.TiKV.MetricsTimeDuration), } - return calculateTiKVCPUMetrics(tac, tc, sts, sq, client, duration) + return calculateTiKVCPUMetrics(tac, tc, sts, sq, client, duration, metrics[0]) } + + // none metrics selected, end auto-scaling return nil } -func calculateTiKVCPUMetrics(tac *v1alpha1.TidbClusterAutoScaler, tc *v1alpha1.TidbCluster, sts *appsv1.StatefulSet, sq *calculate.SingleQuery, client promClient.Client, duration time.Duration) error { +func calculateTiKVStorageMetrics(tac *v1alpha1.TidbClusterAutoScaler, tc *v1alpha1.TidbCluster, + capSq, avaSq *calculate.SingleQuery, client promClient.Client, metric v1alpha1.CustomMetric) error { + if tc.Spec.TiKV.Replicas >= tac.Spec.TiKV.MaxReplicas { + klog.V(4).Infof("tac[%s/%s]'s tikv won't scale out by storage pressure due to maxReplicas", tac.Namespace, tac.Name) + return nil + } + intervalSeconds := tac.Spec.TiKV.ScaleOutIntervalSeconds + ableToScale, err := checkTiKVAutoScalingInterval(tac, *intervalSeconds) + if err != nil { + return err + } + if !ableToScale { + klog.Infof("tac[%s/%s]'s tikv won't scale out by storage pressure due to scale-out cool-down interval", tac.Namespace, tac.Name) + return nil + } + storagePressure, err := calculate.CalculateWhetherStoragePressure(tac, capSq, avaSq, client, metric) + if err != nil { + return err + } + if !storagePressure { + return nil + } + ableToScale, err = checkWhetherAbleToScaleDueToStorage(tac, metric, time.Now(), controller.ResyncDuration) + if err != nil { + return err + } + if !ableToScale { + return nil + } + currentReplicas := tc.Spec.TiKV.Replicas + targetReplicas := currentReplicas + 1 + return updateTacIfTiKVScale(tc, tac, targetReplicas) +} + +func calculateTiKVCPUMetrics(tac *v1alpha1.TidbClusterAutoScaler, tc *v1alpha1.TidbCluster, sts 
*appsv1.StatefulSet, + sq *calculate.SingleQuery, client promClient.Client, duration time.Duration, metric v1alpha1.CustomMetric) error { - targetReplicas, err := calculate.CalculateRecomendedReplicasByCpuCosts(tac, sq, sts, client, v1alpha1.TiKVMemberType, duration) + targetReplicas, err := calculate.CalculateRecomendedReplicasByCpuCosts(tac, sq, sts, client, v1alpha1.TiKVMemberType, duration, metric.MetricSpec) if err != nil { return err } @@ -96,14 +156,25 @@ func calculateTiKVCPUMetrics(tac *v1alpha1.TidbClusterAutoScaler, tc *v1alpha1.T return nil } currentReplicas := int32(len(sq.Instances)) - err = syncTiKVAfterCalculated(tc, tac, currentReplicas, targetReplicas) + intervalSeconds := tac.Spec.TiKV.ScaleInIntervalSeconds + ableToScale, err := checkTiKVAutoScalingInterval(tac, *intervalSeconds) + if err != nil { + return err + } + if !ableToScale { + return nil + } + err = updateTacIfTiKVScale(tc, tac, targetReplicas) if err != nil { return err } return addAnnotationMarkIfScaleOutDueToCPUMetrics(tc, currentReplicas, targetReplicas, sts) } -func checkTiKVAutoScaling(tac *v1alpha1.TidbClusterAutoScaler, intervalSeconds int32) (bool, error) { +// checkTiKVAutoScalingInterval check the each 2 auto-scaling interval depends on the scaling-in and scaling-out +// Note that for the storage scaling, we will check scale-out interval before we start to scraping metrics, +// and for the cpu scaling, we will check scale-in/scale-out interval after we finish calculating metrics. +func checkTiKVAutoScalingInterval(tac *v1alpha1.TidbClusterAutoScaler, intervalSeconds int32) (bool, error) { if tac.Annotations == nil { tac.Annotations = map[string]string{} } @@ -117,28 +188,35 @@ func checkTiKVAutoScaling(tac *v1alpha1.TidbClusterAutoScaler, intervalSeconds i return true, nil } -// syncTiKVAfterCalculated would check the Consecutive count to avoid jitter, and it would also check the interval -// duration between each auto-scaling. If either of them is not meet, the auto-scaling would be rejected. -// If the auto-scaling is permitted, the timestamp would be recorded and the Consecutive count would be zeroed. -// The currentReplicas of TiKV calculated in auto-scaling is the count of the StateUp TiKV instance, so we need to -// add the number of other state tikv instance replicas when we update the TidbCluster.Spec.TiKV.Replicas -func syncTiKVAfterCalculated(tc *v1alpha1.TidbCluster, tac *v1alpha1.TidbClusterAutoScaler, currentReplicas, recommendedReplicas int32) error { - intervalSeconds := tac.Spec.TiKV.ScaleInIntervalSeconds - if recommendedReplicas > currentReplicas { - intervalSeconds = tac.Spec.TiKV.ScaleOutIntervalSeconds +// checkWhetherAbleToScaleDueToStorage will check whether the storage pressure status have been existed for as least +// LeastStoragePressurePeriodSeconds duration. If not, the operator would wait next round to check again. 
+func checkWhetherAbleToScaleDueToStorage(tac *v1alpha1.TidbClusterAutoScaler, metric v1alpha1.CustomMetric, now time.Time, resyncDuration time.Duration) (bool, error) { + if metric.LeastStoragePressurePeriodSeconds == nil { + return false, fmt.Errorf("tac[%s/%s]'s leastStoragePressurePeriodSeconds must be setted before scale out in storage", tac.Namespace, tac.Name) } - ableToScale, err := checkTiKVAutoScaling(tac, *intervalSeconds) - if err != nil { - return err + if tac.Status.TiKV.LastAutoScalingTimestamp == nil { + return false, fmt.Errorf("tac[%s/%s]'s tikv status LastAutoScalingTimestamp haven't been set", tac.Namespace, tac.Name) } - if !ableToScale { - return nil + if now.Sub(tac.Status.TiKV.LastAutoScalingTimestamp.Time) > 3*resyncDuration { + klog.Infof("tac[%s/%s]'s tikv status LastAutoScalingTimestamp timeout", tac.Namespace, tac.Name) + return false, nil + } + for _, m := range tac.Status.TiKV.MetricsStatusList { + if m.Name == string(corev1.ResourceStorage) { + if m.StoragePressure == nil || m.StoragePressureStartTime == nil { + return false, nil + } + x := now.Sub(m.StoragePressureStartTime.Time).Seconds() + if x >= float64(*metric.LeastStoragePressurePeriodSeconds) { + return true, nil + } + } } - return updateTcTiKVIfScale(tc, tac, recommendedReplicas) + return false, nil } -// we record the auto-scaling out slot for tikv, in order to add special hot labels when they are created -func updateTcTiKVIfScale(tc *v1alpha1.TidbCluster, tac *v1alpha1.TidbClusterAutoScaler, recommendedReplicas int32) error { +// updateTacIfTiKVScale update the tac status and syncing annotations if tikv scale-in/out +func updateTacIfTiKVScale(tc *v1alpha1.TidbCluster, tac *v1alpha1.TidbClusterAutoScaler, recommendedReplicas int32) error { tac.Annotations[label.AnnTiKVLastAutoScalingTimestamp] = fmt.Sprintf("%d", time.Now().Unix()) tc.Spec.TiKV.Replicas = recommendedReplicas tac.Status.TiKV.RecommendedReplicas = recommendedReplicas diff --git a/pkg/autoscaler/autoscaler/tikv_autoscaler_test.go b/pkg/autoscaler/autoscaler/tikv_autoscaler_test.go new file mode 100644 index 0000000000..f904256202 --- /dev/null +++ b/pkg/autoscaler/autoscaler/tikv_autoscaler_test.go @@ -0,0 +1,149 @@ +// Copyright 2020 PingCAP, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// See the License for the specific language governing permissions and +// limitations under the License. + +package autoscaler + +import ( + "testing" + "time" + + . 
"github.com/onsi/gomega" + "github.com/pingcap/tidb-operator/pkg/apis/pingcap/v1alpha1" + autoscaling "k8s.io/api/autoscaling/v2beta2" + corev1 "k8s.io/api/core/v1" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/utils/pointer" +) + +func TestCheckWhetherAbleToScaleDueToStorage(t *testing.T) { + g := NewGomegaWithT(t) + now := time.Now() + tac1 := newStorageMetricTac() + tac1.Status = v1alpha1.TidbClusterAutoSclaerStatus{ + TiKV: &v1alpha1.TikvAutoScalerStatus{ + BasicAutoScalerStatus: v1alpha1.BasicAutoScalerStatus{ + MetricsStatusList: []v1alpha1.MetricsStatus{ + { + Name: string(corev1.ResourceStorage), + StorageMetricsStatus: v1alpha1.StorageMetricsStatus{ + StoragePressure: pointer.BoolPtr(true), + StoragePressureStartTime: &metav1.Time{ + Time: now.Add(-2 * time.Minute), + }, + }, + }, + }, + LastAutoScalingTimestamp: &metav1.Time{ + Time: now.Add(-10 * time.Second), + }, + }, + }, + } + tac2 := newStorageMetricTac() + tac2.Status = v1alpha1.TidbClusterAutoSclaerStatus{ + TiKV: &v1alpha1.TikvAutoScalerStatus{ + BasicAutoScalerStatus: v1alpha1.BasicAutoScalerStatus{ + MetricsStatusList: []v1alpha1.MetricsStatus{ + { + Name: string(corev1.ResourceStorage), + StorageMetricsStatus: v1alpha1.StorageMetricsStatus{ + StoragePressure: pointer.BoolPtr(true), + StoragePressureStartTime: &metav1.Time{ + Time: now.Add(-20 * time.Second), + }, + }, + }, + }, + LastAutoScalingTimestamp: &metav1.Time{ + Time: now.Add(-10 * time.Second), + }, + }, + }, + } + tac3 := newStorageMetricTac() + tac3.Status = v1alpha1.TidbClusterAutoSclaerStatus{ + TiKV: &v1alpha1.TikvAutoScalerStatus{ + BasicAutoScalerStatus: v1alpha1.BasicAutoScalerStatus{ + MetricsStatusList: []v1alpha1.MetricsStatus{ + { + Name: string(corev1.ResourceStorage), + StorageMetricsStatus: v1alpha1.StorageMetricsStatus{ + StoragePressure: pointer.BoolPtr(true), + StoragePressureStartTime: &metav1.Time{ + Time: now.Add(-2 * time.Minute), + }, + }, + }, + }, + LastAutoScalingTimestamp: &metav1.Time{ + Time: now.Add(-10 * time.Minute), + }, + }, + }, + } + testcases := []struct { + name string + tac *v1alpha1.TidbClusterAutoScaler + metric v1alpha1.CustomMetric + expectedResult bool + }{ + { + name: "experienced disk pressure for enough time", + tac: tac1, + metric: v1alpha1.CustomMetric{ + LeastStoragePressurePeriodSeconds: pointer.Int64Ptr(60), + }, + expectedResult: true, + }, + { + name: "haven't experienced disk pressure for enough time", + tac: tac2, + metric: v1alpha1.CustomMetric{ + LeastStoragePressurePeriodSeconds: pointer.Int64Ptr(60), + }, + expectedResult: false, + }, + { + name: "last syncing time is stale", + tac: tac3, + metric: v1alpha1.CustomMetric{ + LeastStoragePressurePeriodSeconds: pointer.Int64Ptr(60), + }, + expectedResult: false, + }, + } + + for _, testcase := range testcases { + t.Run(testcase.name, func(t *testing.T) { + result, err := checkWhetherAbleToScaleDueToStorage(testcase.tac, testcase.metric, now, 30*time.Second) + g.Expect(err).Should(BeNil()) + g.Expect(result).Should(Equal(testcase.expectedResult)) + }) + } +} + +func newStorageMetricTac() *v1alpha1.TidbClusterAutoScaler { + tac := newTidbClusterAutoScaler() + tac.Spec.TiKV.Metrics = []v1alpha1.CustomMetric{ + { + LeastStoragePressurePeriodSeconds: pointer.Int64Ptr(60), + MetricSpec: autoscaling.MetricSpec{ + Type: autoscaling.ResourceMetricSourceType, + Resource: &autoscaling.ResourceMetricSource{ + Name: corev1.ResourceStorage, + }, + }, + }, + } + return tac +} diff --git a/pkg/autoscaler/autoscaler/util.go 
b/pkg/autoscaler/autoscaler/util.go index 1f753b4adf..b33013c3a6 100644 --- a/pkg/autoscaler/autoscaler/util.go +++ b/pkg/autoscaler/autoscaler/util.go @@ -22,21 +22,10 @@ import ( "github.com/pingcap/tidb-operator/pkg/label" operatorUtils "github.com/pingcap/tidb-operator/pkg/util" appsv1 "k8s.io/api/apps/v1" - autoscalingv2beta2 "k8s.io/api/autoscaling/v2beta2" corev1 "k8s.io/api/core/v1" "k8s.io/utils/pointer" ) -var defaultMetricSpec = autoscalingv2beta2.MetricSpec{ - Type: autoscalingv2beta2.ResourceMetricSourceType, - Resource: &autoscalingv2beta2.ResourceMetricSource{ - Name: corev1.ResourceCPU, - Target: autoscalingv2beta2.MetricTarget{ - AverageUtilization: pointer.Int32Ptr(80), - }, - }, -} - // checkStsAutoScalingPrerequisites would check the sts status to ensure wouldn't happen during // upgrading, scaling func checkStsAutoScalingPrerequisites(set *appsv1.StatefulSet) bool { @@ -132,13 +121,21 @@ func defaultTAC(tac *v1alpha1.TidbClusterAutoScaler) { } // If ExternalEndpoint is not provided, we would set default metrics if tac.Spec.TiKV.ExternalEndpoint == nil { - if len(tac.Spec.TiKV.Metrics) == 0 { - tac.Spec.TiKV.Metrics = append(tac.Spec.TiKV.Metrics, defaultMetricSpec) - } if tac.Spec.TiKV.MetricsTimeDuration == nil { tac.Spec.TiKV.MetricsTimeDuration = pointer.StringPtr("3m") } } + for id, m := range tac.Spec.TiKV.Metrics { + if m.Resource != nil && m.Resource.Name == corev1.ResourceStorage { + if m.LeastStoragePressurePeriodSeconds == nil { + m.LeastStoragePressurePeriodSeconds = pointer.Int64Ptr(300) + } + if m.LeastRemainAvailableStoragePercent == nil { + m.LeastRemainAvailableStoragePercent = pointer.Int64Ptr(10) + } + tac.Spec.TiKV.Metrics[id] = m + } + } } if tac.Spec.TiDB != nil { @@ -152,9 +149,6 @@ func defaultTAC(tac *v1alpha1.TidbClusterAutoScaler) { tac.Spec.TiDB.ScaleInIntervalSeconds = pointer.Int32Ptr(500) } if tac.Spec.TiDB.ExternalEndpoint == nil { - if len(tac.Spec.TiDB.Metrics) == 0 { - tac.Spec.TiDB.Metrics = append(tac.Spec.TiDB.Metrics, defaultMetricSpec) - } if tac.Spec.TiDB.MetricsTimeDuration == nil { tac.Spec.TiDB.MetricsTimeDuration = pointer.StringPtr("3m") } @@ -177,3 +171,17 @@ func genMetricsEndpoint(tac *v1alpha1.TidbClusterAutoScaler) (string, error) { } return fmt.Sprintf("http://%s-prometheus.%s.svc:9090", tac.Spec.Monitor.Name, tac.Spec.Monitor.Namespace), nil } + +func emptyStorageMetricsStatus(tac *v1alpha1.TidbClusterAutoScaler) { + for id, m := range tac.Status.TiKV.MetricsStatusList { + if m.Name == string(corev1.ResourceStorage) { + m.StoragePressure = nil + m.StoragePressureStartTime = nil + m.CapacityStorage = nil + m.AvailableStorage = nil + m.BaselineAvailableStorage = nil + tac.Status.TiKV.MetricsStatusList[id] = m + return + } + } +} diff --git a/pkg/autoscaler/autoscaler/util_test.go b/pkg/autoscaler/autoscaler/util_test.go index a0a9b8b3a6..980721d3d3 100644 --- a/pkg/autoscaler/autoscaler/util_test.go +++ b/pkg/autoscaler/autoscaler/util_test.go @@ -22,7 +22,6 @@ import ( "github.com/pingcap/tidb-operator/pkg/apis/pingcap/v1alpha1" "github.com/pingcap/tidb-operator/pkg/label" appsv1 "k8s.io/api/apps/v1" - autoscalingv2beta2 "k8s.io/api/autoscaling/v2beta2" "k8s.io/utils/pointer" ) @@ -259,13 +258,12 @@ func TestDefaultTac(t *testing.T) { tac := newTidbClusterAutoScaler() tac.Spec.TiDB = nil tac.Spec.TiKV.MinReplicas = nil - tac.Spec.TiKV.Metrics = []autoscalingv2beta2.MetricSpec{} + tac.Spec.TiKV.Metrics = []v1alpha1.CustomMetric{} tac.Spec.TiKV.MetricsTimeDuration = nil tac.Spec.TiKV.ScaleOutIntervalSeconds = nil 
tac.Spec.TiKV.ScaleInIntervalSeconds = nil defaultTAC(tac) g.Expect(*tac.Spec.TiKV.MinReplicas).Should(Equal(int32(1))) - g.Expect(len(tac.Spec.TiKV.Metrics)).Should(Equal(1)) g.Expect(*tac.Spec.TiKV.MetricsTimeDuration).Should(Equal("3m")) g.Expect(*tac.Spec.TiKV.ScaleOutIntervalSeconds).Should(Equal(int32(300))) g.Expect(*tac.Spec.TiKV.ScaleInIntervalSeconds).Should(Equal(int32(500))) @@ -273,13 +271,12 @@ func TestDefaultTac(t *testing.T) { tac = newTidbClusterAutoScaler() tac.Spec.TiKV = nil tac.Spec.TiDB.MinReplicas = nil - tac.Spec.TiDB.Metrics = []autoscalingv2beta2.MetricSpec{} + tac.Spec.TiDB.Metrics = []v1alpha1.CustomMetric{} tac.Spec.TiDB.MetricsTimeDuration = nil tac.Spec.TiDB.ScaleOutIntervalSeconds = nil tac.Spec.TiDB.ScaleInIntervalSeconds = nil defaultTAC(tac) g.Expect(*tac.Spec.TiDB.MinReplicas).Should(Equal(int32(1))) - g.Expect(len(tac.Spec.TiDB.Metrics)).Should(Equal(1)) g.Expect(*tac.Spec.TiDB.MetricsTimeDuration).Should(Equal("3m")) g.Expect(*tac.Spec.TiDB.ScaleOutIntervalSeconds).Should(Equal(int32(300))) g.Expect(*tac.Spec.TiDB.ScaleInIntervalSeconds).Should(Equal(int32(500))) diff --git a/pkg/label/label.go b/pkg/label/label.go index 5b3253304a..57ea057ce3 100644 --- a/pkg/label/label.go +++ b/pkg/label/label.go @@ -109,10 +109,6 @@ const ( AnnTiDBLastAutoScalingTimestamp = "tidb.tidb.pingcap.com/last-autoscaling-timestamp" // AnnTiKVLastAutoScalingTimestamp is annotation key of tidbclusterto which ordinal is created by tikv auto-scaling AnnTiKVLastAutoScalingTimestamp = "tikv.tidb.pingcap.com/last-autoscaling-timestamp" - - // AnnTiKVReadyToScaleTimestamp records timestamp when tikv ready to scale - AnnTiKVReadyToScaleTimestamp = "tikv.tidb.pingcap.com/ready-to-scale-timestamp" - // AnnLastSyncingTimestamp records last sync timestamp AnnLastSyncingTimestamp = "tidb.pingcap.com/last-syncing-timestamp" diff --git a/tests/e2e/tidbcluster/serial.go b/tests/e2e/tidbcluster/serial.go index 038ddddf2b..e5acb9c8e8 100644 --- a/tests/e2e/tidbcluster/serial.go +++ b/tests/e2e/tidbcluster/serial.go @@ -483,12 +483,14 @@ var _ = ginkgo.Describe("[tidb-operator][Serial]", func() { err = mock.SetPrometheusResponse(monitor.Name, monitor.Namespace, mp, fw) framework.ExpectNoError(err, "set tikv mock metrics error") - var defaultMetricSpec = autoscalingv2beta2.MetricSpec{ - Type: autoscalingv2beta2.ResourceMetricSourceType, - Resource: &autoscalingv2beta2.ResourceMetricSource{ - Name: corev1.ResourceCPU, - Target: autoscalingv2beta2.MetricTarget{ - AverageUtilization: pointer.Int32Ptr(80), + var defaultMetricSpec = v1alpha1.CustomMetric{ + MetricSpec: autoscalingv2beta2.MetricSpec{ + Type: autoscalingv2beta2.ResourceMetricSourceType, + Resource: &autoscalingv2beta2.ResourceMetricSource{ + Name: corev1.ResourceCPU, + Target: autoscalingv2beta2.MetricTarget{ + AverageUtilization: pointer.Int32Ptr(80), + }, }, }, } @@ -502,7 +504,7 @@ var _ = ginkgo.Describe("[tidb-operator][Serial]", func() { ScaleInIntervalSeconds: pointer.Int32Ptr(100), }, } - tac.Spec.TiKV.Metrics = []autoscalingv2beta2.MetricSpec{} + tac.Spec.TiKV.Metrics = []v1alpha1.CustomMetric{} tac.Spec.TiKV.Metrics = append(tac.Spec.TiKV.Metrics, defaultMetricSpec) _, err = cli.PingcapV1alpha1().TidbClusterAutoScalers(ns).Create(tac) @@ -711,7 +713,7 @@ var _ = ginkgo.Describe("[tidb-operator][Serial]", func() { ScaleInIntervalSeconds: pointer.Int32Ptr(100), }, } - tac.Spec.TiDB.Metrics = []autoscalingv2beta2.MetricSpec{} + tac.Spec.TiDB.Metrics = []v1alpha1.CustomMetric{} tac.Spec.TiDB.Metrics = 
append(tac.Spec.TiDB.Metrics, defaultMetricSpec) _, err = cli.PingcapV1alpha1().TidbClusterAutoScalers(ns).Update(tac) framework.ExpectNoError(err, "Update TidbMonitorClusterAutoScaler error")