diff --git a/cmd/crane-agent/app/agent.go b/cmd/crane-agent/app/agent.go index 378a99bae..8fd1da2b7 100644 --- a/cmd/crane-agent/app/agent.go +++ b/cmd/crane-agent/app/agent.go @@ -99,7 +99,7 @@ func Run(ctx context.Context, opts *options.Options) error { actionInformer := craneInformerFactory.Ensurance().V1alpha1().AvoidanceActions() tspInformer := craneInformerFactory.Prediction().V1alpha1().TimeSeriesPredictions() - newAgent, err := agent.NewAgent(ctx, hostname, opts.RuntimeEndpoint, kubeClient, craneClient, + newAgent, err := agent.NewAgent(ctx, hostname, opts.RuntimeEndpoint, opts.CgroupDriver, kubeClient, craneClient, podInformer, nodeInformer, nepInformer, actionInformer, tspInformer, opts.NodeResourceReserved, opts.Ifaces, healthCheck, opts.CollectInterval) if err != nil { diff --git a/cmd/crane-agent/app/options/option.go b/cmd/crane-agent/app/options/option.go index 5818fdd06..205a40d97 100644 --- a/cmd/crane-agent/app/options/option.go +++ b/cmd/crane-agent/app/options/option.go @@ -13,6 +13,8 @@ type Options struct { HostnameOverride string // RuntimeEndpoint is the endpoint of runtime RuntimeEndpoint string + // driver that the kubelet uses to manipulate cgroups on the host (cgroupfs or systemd) + CgroupDriver string // Is debug/pprof endpoint enabled EnableProfiling bool // BindAddr is the address the endpoint binds to. @@ -45,6 +47,7 @@ func (o *Options) Validate() error { func (o *Options) AddFlags(flags *pflag.FlagSet) { flags.StringVar(&o.HostnameOverride, "hostname-override", "", "Which is the name of k8s node be used to filtered.") flags.StringVar(&o.RuntimeEndpoint, "runtime-endpoint", "", "The runtime endpoint docker: unix:///var/run/dockershim.sock, containerd: unix:///run/containerd/containerd.sock, cri-o: unix:///run/crio/crio.sock, k3s: unix:///run/k3s/containerd/containerd.sock.") + flags.StringVar(&o.CgroupDriver, "cgroup-driver", "cgroupfs", "Driver that the kubelet uses to manipulate cgroups on the host. Possible values: 'cgroupfs', 'systemd'. 
Default to 'cgroupfs'") flags.Bool("enable-profiling", false, "Is debug/pprof endpoint enabled, default: false") flags.StringVar(&o.BindAddr, "bind-address", "0.0.0.0:8081", "The address the agent binds to for metrics, health-check and pprof, default: 0.0.0.0:8081.") flags.DurationVar(&o.CollectInterval, "collect-interval", 10*time.Second, "Period for the state collector to collect metrics, default: 10s") diff --git a/cmd/craned/app/manager.go b/cmd/craned/app/manager.go index ee37d1fb8..44a2c33df 100644 --- a/cmd/craned/app/manager.go +++ b/cmd/craned/app/manager.go @@ -106,20 +106,20 @@ func Run(ctx context.Context, opts *options.Options) error { return err } // initialize data sources and predictor - realtimeDataSources, histroyDataSources, _ := initializationDataSource(mgr, opts) - predictorMgr := initializationPredictorManager(opts, realtimeDataSources, histroyDataSources) - - initializationScheme() - initializationWebhooks(mgr, opts) - initializationControllers(ctx, mgr, opts, predictorMgr, histroyDataSources[providers.PrometheusDataSource]) - // initialization custom collector metrics - initializationMetricCollector(mgr) + realtimeDataSources, histroyDataSources, _ := initDataSources(mgr, opts) + predictorMgr := initPredictorManager(opts, realtimeDataSources, histroyDataSources) + + initScheme() + initWebhooks(mgr, opts) + initControllers(ctx, mgr, opts, predictorMgr, histroyDataSources[providers.PrometheusDataSource]) + // initialize custom collector metrics + initMetricCollector(mgr) runAll(ctx, mgr, predictorMgr, opts) return nil } -func initializationScheme() { +func initScheme() { utilruntime.Must(clientgoscheme.AddToScheme(scheme)) if utilfeature.DefaultFeatureGate.Enabled(features.CraneAutoscaling) { utilruntime.Must(autoscalingapi.AddToScheme(scheme)) @@ -135,12 +135,12 @@ func initializationScheme() { } } -func initializationMetricCollector(mgr ctrl.Manager) { +func initMetricCollector(mgr ctrl.Manager) { // register as prometheus metric collector metrics.CustomCollectorRegister(metrics.NewTspMetricCollector(mgr.GetClient())) } -func initializationWebhooks(mgr ctrl.Manager, opts *options.Options) { +func initWebhooks(mgr ctrl.Manager, opts *options.Options) { if !opts.WebhookConfig.Enabled { return } @@ -159,7 +159,7 @@ func initializationWebhooks(mgr ctrl.Manager, opts *options.Options) { } } -func initializationDataSource(mgr ctrl.Manager, opts *options.Options) (map[providers.DataSourceType]providers.RealTime, map[providers.DataSourceType]providers.History, map[providers.DataSourceType]providers.Interface) { +func initDataSources(mgr ctrl.Manager, opts *options.Options) (map[providers.DataSourceType]providers.RealTime, map[providers.DataSourceType]providers.History, map[providers.DataSourceType]providers.Interface) { realtimeDataSources := make(map[providers.DataSourceType]providers.RealTime) historyDataSources := make(map[providers.DataSourceType]providers.History) hybridDataSources := make(map[providers.DataSourceType]providers.Interface) @@ -193,12 +193,12 @@ func initializationDataSource(mgr ctrl.Manager, opts *options.Options) (map[prov return realtimeDataSources, historyDataSources, hybridDataSources } -func initializationPredictorManager(opts *options.Options, realtimeDataSources map[providers.DataSourceType]providers.RealTime, historyDataSources map[providers.DataSourceType]providers.History) predictor.Manager { +func initPredictorManager(opts *options.Options, realtimeDataSources map[providers.DataSourceType]providers.RealTime, historyDataSources 
map[providers.DataSourceType]providers.History) predictor.Manager { return predictor.NewManager(realtimeDataSources, historyDataSources, predictor.DefaultPredictorsConfig(opts.AlgorithmModelConfig)) } -// initializationControllers setup controllers with manager -func initializationControllers(ctx context.Context, mgr ctrl.Manager, opts *options.Options, predictorMgr predictor.Manager, historyDataSource providers.History) { +// initControllers setup controllers with manager +func initControllers(ctx context.Context, mgr ctrl.Manager, opts *options.Options, predictorMgr predictor.Manager, historyDataSource providers.History) { discoveryClientSet, err := discovery.NewDiscoveryClientForConfig(mgr.GetConfig()) if err != nil { klog.Exit(err, "Unable to create discover client") diff --git a/deploy/craned/deployment.yaml b/deploy/craned/deployment.yaml index 036141cd2..8a43734f5 100644 --- a/deploy/craned/deployment.yaml +++ b/deploy/craned/deployment.yaml @@ -101,18 +101,18 @@ data: - targets: [] properties: resource.cpu-request-percentile: "0.98" - ehpa.deployment-min-replicas: "1" - ehpa.statefulset-min-replicas: "1" - ehpa.workload-min-replicas: "1" - ehpa.pod-min-ready-seconds: "30" - ehpa.pod-available-ratio: "0.5" - ehpa.default-min-replicas: "2" - ehpa.max-replicas-factor: "3" - ehpa.min-cpu-usage-threshold: "10" - ehpa.fluctuation-threshold: "1.5" - ehpa.min-cpu-target-utilization: "30" - ehpa.max-cpu-target-utilization: "75" - ehpa.reference-hpa: "true" + replicas.workload-min-replicas: "3" + replicas.pod-min-ready-seconds: "30" + replicas.pod-available-ratio: "0.5" + replicas.default-min-replicas: "3" + replicas.max-replicas-factor: "3" + replicas.min-cpu-usage-threshold: "1" + replicas.fluctuation-threshold: "1.5" + replicas.min-cpu-target-utilization: "30" + replicas.max-cpu-target-utilization: "75" + replicas.cpu-target-utilization: "50" + replicas.cpu-percentile: "95" + replicas.reference-hpa: "true" --- apiVersion: v1 diff --git a/deploy/manifests/analysis.crane.io_analytics.yaml b/deploy/manifests/analysis.crane.io_analytics.yaml index 8d3f9feb7..79a122453 100644 --- a/deploy/manifests/analysis.crane.io_analytics.yaml +++ b/deploy/manifests/analysis.crane.io_analytics.yaml @@ -57,6 +57,11 @@ spec: format: int64 type: integer type: object + config: + additionalProperties: + type: string + description: Override Recommendation configs + type: object resourceSelectors: description: ResourceSelector indicates how to select resources(e.g. a set of Deployments) for an Analytics. @@ -210,8 +215,8 @@ spec: format: date-time type: string recommendations: - description: Recommendations is a list of pointers to recommendations - that are updated by this analytics. + description: Recommendations is a list of RecommendationMission that + run parallel. items: properties: apiVersion: @@ -233,6 +238,14 @@ spec: kind: description: 'Kind of the referent. More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#types-kinds' type: string + lastStartTime: + description: LastStartTime is last time we start a recommendation + mission. + format: date-time + type: string + message: + description: Message presents the running message for this mission + type: string name: description: 'Name of the referent. 
More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names' type: string diff --git a/deploy/manifests/autoscaling.crane.io_effectivehorizontalpodautoscalers.yaml b/deploy/manifests/autoscaling.crane.io_effectivehorizontalpodautoscalers.yaml index 9b425d026..6cd43ecc5 100644 --- a/deploy/manifests/autoscaling.crane.io_effectivehorizontalpodautoscalers.yaml +++ b/deploy/manifests/autoscaling.crane.io_effectivehorizontalpodautoscalers.yaml @@ -785,6 +785,8 @@ spec: type: string sampleInterval: type: string + targetUtilization: + type: string type: object type: object predictionWindowSeconds: diff --git a/deploy/manifests/ensurance.crane.io_avoidanceactions.yaml b/deploy/manifests/ensurance.crane.io_avoidanceactions.yaml index 83d9abcf3..c1f3d0fa1 100644 --- a/deploy/manifests/ensurance.crane.io_avoidanceactions.yaml +++ b/deploy/manifests/ensurance.crane.io_avoidanceactions.yaml @@ -13,6 +13,8 @@ spec: kind: AvoidanceAction listKind: AvoidanceActionList plural: avoidanceactions + shortNames: + - avoid singular: avoidanceaction scope: Cluster versions: diff --git a/deploy/manifests/ensurance.crane.io_nodeqosensurancepolicies.yaml b/deploy/manifests/ensurance.crane.io_nodeqosensurancepolicies.yaml index 113cdda79..e546abb12 100644 --- a/deploy/manifests/ensurance.crane.io_nodeqosensurancepolicies.yaml +++ b/deploy/manifests/ensurance.crane.io_nodeqosensurancepolicies.yaml @@ -13,6 +13,8 @@ spec: kind: NodeQOSEnsurancePolicy listKind: NodeQOSEnsurancePolicyList plural: nodeqosensurancepolicies + shortNames: + - nep singular: nodeqosensurancepolicy scope: Cluster versions: @@ -95,7 +97,8 @@ spec: type: integer type: object timeoutSeconds: - description: TimeoutSeconds is the timeout for request + description: TimeoutSeconds is the timeout for request. Defaults + to 0, no timeout forever. format: int32 type: integer type: object diff --git a/deploy/manifests/ensurance.crane.io_podqosensurancepolicies.yaml b/deploy/manifests/ensurance.crane.io_podqosensurancepolicies.yaml index 443cb2659..8c32945af 100644 --- a/deploy/manifests/ensurance.crane.io_podqosensurancepolicies.yaml +++ b/deploy/manifests/ensurance.crane.io_podqosensurancepolicies.yaml @@ -13,6 +13,8 @@ spec: kind: PodQOSEnsurancePolicy listKind: PodQOSEnsurancePolicyList plural: podqosensurancepolicies + shortNames: + - qep singular: podqosensurancepolicy scope: Namespaced versions: @@ -182,12 +184,9 @@ spec: required: - port type: object - initialDelaySeconds: - description: Init delay time for handler Defaults to 5 - format: int32 - type: integer timeoutSeconds: - description: Timeout for request. Defaults to 0, instead not timeout + description: TimeoutSeconds is the timeout for request. 
Defaults + to 0, no timeout forever format: int32 type: integer type: object diff --git a/deploy/manifests/prediction.crane.io_clusternodepredictions.yaml b/deploy/manifests/prediction.crane.io_clusternodepredictions.yaml index 7b3ea91ce..c70c3d87d 100644 --- a/deploy/manifests/prediction.crane.io_clusternodepredictions.yaml +++ b/deploy/manifests/prediction.crane.io_clusternodepredictions.yaml @@ -140,6 +140,8 @@ spec: type: string sampleInterval: type: string + targetUtilization: + type: string type: object type: object expressionQuery: diff --git a/deploy/manifests/prediction.crane.io_timeseriespredictions.yaml b/deploy/manifests/prediction.crane.io_timeseriespredictions.yaml index 20c70d29b..f28621ed3 100644 --- a/deploy/manifests/prediction.crane.io_timeseriespredictions.yaml +++ b/deploy/manifests/prediction.crane.io_timeseriespredictions.yaml @@ -145,6 +145,8 @@ spec: type: string sampleInterval: type: string + targetUtilization: + type: string type: object type: object expressionQuery: diff --git a/docs/assets/util.css b/docs/assets/util.css index b9531d171..baa19fd26 100644 --- a/docs/assets/util.css +++ b/docs/assets/util.css @@ -5,3 +5,7 @@ .arithmatex { font-size: 0.85rem; } + +foreignObject > div { + font-size: 0.85rem; +} \ No newline at end of file diff --git a/docs/images/crane-keda-ali-compare-cron.png b/docs/images/crane-keda-ali-compare-cron.png new file mode 100644 index 000000000..e00a06b8f Binary files /dev/null and b/docs/images/crane-keda-ali-compare-cron.png differ diff --git a/docs/roadmaps/roadmap-1h-2022.md b/docs/roadmaps/roadmap-1h-2022.md index aba63646a..7cc225d96 100644 --- a/docs/roadmaps/roadmap-1h-2022.md +++ b/docs/roadmaps/roadmap-1h-2022.md @@ -18,15 +18,17 @@ Please let us know if you have urgent needs which are not presented in the plan. - Node QoS Ensurance for Mem - Prediction with CPU, Memory, and Business Metrics - Scalability to support 1K TSP and 1K EPA -### 0.4.0 [April] -- EVPA support -- Dynamic Scheduler +### 0.4.0 [released] - UI to support EPA. ### 0.5.0 [May] -- HPC open source -- Node & Pod QoS Ensurance for DiskIO and Network -- Prediction with DiskIO, Network +- Resource and Replicas Recommendation +- Load-aware Scheduler ### 0.6.0 [June] - Scalability to support 3k TSP and 3k EPA -- Application Portrait -- SLO based Application QoS for CPU and Mem +- Algorithm and QoS Documentation +- EHPA grafana dashboard +### 0.7.0 [July] +- Support apiservice router for multiple metric adapters +- Prediction with Business Metrics +### 0.8.0 [August] +- Algorithm estimate notebook diff --git a/docs/tutorials/dynamic-scheduler-plugin.zh.md b/docs/tutorials/dynamic-scheduler-plugin.zh.md new file mode 100644 index 000000000..3f12cfee3 --- /dev/null +++ b/docs/tutorials/dynamic-scheduler-plugin.zh.md @@ -0,0 +1,39 @@ +# Dynamic Scheduler:负载感知调度器插件 + +## 介绍 +kubernetes 的原生调度器只能通过资源请求来调度 pod,这很容易造成一系列负载不均的问题: + +- 对于某些节点,实际负载与资源请求相差不大,这会导致很大概率出现稳定性问题。 +- 对于其他节点来说,实际负载远小于资源请求,这将导致资源的巨大浪费。 + +为了解决这些问题,动态调度器根据实际的节点利用率构建了一个简单但高效的模型,并过滤掉那些负载高的节点来平衡集群。 + +## 设计细节 + +### 架构 +![](./../images/dynamic-scheduler-plugin.png) + + +如上图,动态调度器依赖于`Prometheus`和`Node-exporter`收集和汇总指标数据,它由两个组件组成: + +!!! note "Note" + `Node-annotator` 目前是 `Crane-scheduler-controller`的一个模块. 
+ +- `Node-annotator`定期从 Prometheus 拉取数据,并以注释的形式在节点上用时间戳标记它们。 +- `Dynamic plugin`直接从节点的注释中读取负载数据,过滤并基于简单的算法对候选节点进行评分。 + +### 调度策略 +动态调度器提供了一个默认值[调度策略](../deploy/manifests/policy.yaml)并支持用户自定义策略。默认策略依赖于以下指标: + +- `cpu_usage_avg_5m` +- `cpu_usage_max_avg_1h` +- `cpu_usage_max_avg_1d` +- `mem_usage_avg_5m` +- `mem_usage_max_avg_1h` +- `mem_usage_max_avg_1d` + +在调度的`Filter`阶段,如果该节点的实际使用率大于上述任一指标的阈值,则该节点将被过滤。而在`Score`阶段,最终得分是这些指标值的加权和。 + +### Hot Value + +在生产集群中,可能会频繁出现调度热点,因为创建 Pod 后节点的负载不能立即增加。因此,我们定义了一个额外的指标,名为`Hot Value`,表示节点最近几次的调度频率。并且节点的最终优先级是最终得分减去`Hot Value`。 diff --git a/docs/tutorials/using-effective-hpa-to-scaling-with-effectiveness.md b/docs/tutorials/using-effective-hpa-to-scaling-with-effectiveness.md index eb40ed648..b0c5770e9 100644 --- a/docs/tutorials/using-effective-hpa-to-scaling-with-effectiveness.md +++ b/docs/tutorials/using-effective-hpa-to-scaling-with-effectiveness.md @@ -136,7 +136,7 @@ Let's take one use case that using EffectiveHorizontalPodAutoscaler in productio We did a profiling on the load history of one application in production and replayed it in staging environment. With the same application, we leverage both EffectiveHorizontalPodAutoscaler and HorizontalPodAutoscaler to manage the scale and compare the result. From the red line in below chart, we can see its actual total cpu usage is high at ~8am, ~12pm, ~8pm and low in midnight. The green line shows the prediction cpu usage trend. -![craen-ehpa-metrics-chart](../images/crane-ehpa-metrics-chart.png) +![crane-ehpa-metrics-chart](../images/crane-ehpa-metrics-chart.png) Below is the comparison result between EffectiveHorizontalPodAutoscaler and HorizontalPodAutoscaler. The red line is the replica number generated by HorizontalPodAutoscaler and the green line is the result from EffectiveHorizontalPodAutoscaler. ![crane-ehpa-metrics-replicas-chart](../images/crane-ehpa-replicas-chart.png) @@ -195,6 +195,229 @@ status: ``` +### Cron-based autoscaling +EffectiveHorizontalPodAutoscaler supports cron based autoscaling. + +Besides based on monitoring metrics, sometimes there are differences between holiday and weekdays in workload traffic, and a simple prediction algorithm may not work relatively well. Then you can make up for the lack of prediction by setting the weekend cron to have a larger number of replicas. + +For some non-web traffic applications, for example, some applications do not need to work on weekends, and then want to reduce the workload replicas to 1, you can also configure cron to reduce the cost for your service. + +Following are cron main fields in the ehpa spec: + + - CronSpec: You can set multiple cron autoscaling configurations, cron cycle can set the start time and end time of the cycle, and the number of replicas of the workload can be continuously guaranteed to the set target value within the time range. + - Name: cron identifier + - TargetReplicas: the target number of replicas of the workload in this cron time range. + - Start: The start time of the cron, in the standard linux crontab format + - End: the end time of the cron, in the standard linux crontab format + + +Current cron autoscaling capabilities from some manufacturers and communities have some shortcomings. + +1. The cron capability is provided separately, has no global view of autoscaling, poor compatibility with HPA, and conflicts with other scale trigger. +2. 
The semantics and behavior of their cron do not match user expectations well and can be very difficult to understand in use, which easily misleads users and leads to autoscaling failures.
+
+The following figure compares the current EHPA cron autoscaling implementation with other cron capabilities.
+
+![crane-keda-ali-compare-cron.png](../images/crane-keda-ali-compare-cron.png)
+
+
+To address the above issues, the cron autoscaling implemented by EHPA is designed to be compatible with HPA: cron acts as one HPA metric that works on the workload object together with the other metrics. Configuring cron is also very simple: when only cron is configured, the workload is not scaled at all outside the active time ranges.
+
+
+#### Cron working without other metrics
+If you have no other metrics configured, you can configure just the cron itself to drive scaling.
+```yaml
+apiVersion: autoscaling.crane.io/v1alpha1
+kind: EffectiveHorizontalPodAutoscaler
+metadata:
+  name: php-apache-local
+spec:
+  # ScaleTargetRef is the reference to the workload that should be scaled.
+  scaleTargetRef:
+    apiVersion: apps/v1
+    kind: Deployment
+    name: php-apache
+  minReplicas: 1        # MinReplicas is the lower limit of replicas that the autoscaler can scale the target down to.
+  maxReplicas: 100      # MaxReplicas is the upper limit of replicas that the autoscaler can scale the target up to.
+  scaleStrategy: Auto   # ScaleStrategy indicates the strategy for scaling the target; the value can be "Auto" or "Manual".
+  # It is better to set crons that cover one complete time period such as one day or one week.
+  # Below is a one-day cron schedule:
+  #(targetReplicas)
+  #80 -------- --------- ----------
+  # | | | | | |
+  #10 ------------ ----- -------- ----------
+  #(time) 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23
+  # A Local timezone means the timezone of the server (or container) that craned runs in. For example, if craned starts in the UTC timezone, then it is UTC; if it starts in Asia/Shanghai, then it is Asia/Shanghai.
+  crons:
+  - name: "cron1"
+    timezone: "Local"
+    description: "scale down"
+    start: "0 0 ? * *"
+    end: "0 6 ? * *"
+    targetReplicas: 10
+  - name: "cron2"
+    timezone: "Local"
+    description: "scale up"
+    start: "0 6 ? * *"
+    end: "0 9 ? * *"
+    targetReplicas: 80
+  - name: "cron3"
+    timezone: "Local"
+    description: "scale down"
+    start: "00 9 ? * *"
+    end: "00 11 ? * *"
+    targetReplicas: 10
+  - name: "cron4"
+    timezone: "Local"
+    description: "scale up"
+    start: "00 11 ? * *"
+    end: "00 14 ? * *"
+    targetReplicas: 80
+  - name: "cron5"
+    timezone: "Local"
+    description: "scale down"
+    start: "00 14 ? * *"
+    end: "00 17 ? * *"
+    targetReplicas: 10
+  - name: "cron6"
+    timezone: "Local"
+    description: "scale up"
+    start: "00 17 ? * *"
+    end: "00 20 ? * *"
+    targetReplicas: 80
+  - name: "cron7"
+    timezone: "Local"
+    description: "scale down"
+    start: "00 20 ? * *"
+    end: "00 00 ? * *"
+    targetReplicas: 10
+```
+
+CronSpec has the following fields.
+
+* **name** defines the name of the cron; cron names must be unique within the same EHPA.
+* **description** gives a detailed description of the cron. It can be empty.
+* **timezone** defines the timezone in which crane schedules the cron. If unspecified, it defaults to `UTC`. You can set it to `Local`, which uses the timezone of the container that the crane service runs in; a named zone such as `America/Los_Angeles` also works. 
+* **start** defines the cron start schedule, in crontab format. See https://en.wikipedia.org/wiki/Cron
+* **end** defines the cron end schedule, in crontab format. See https://en.wikipedia.org/wiki/Cron
+* **targetReplicas** defines the target replicas to scale the workload to while the cron is active, i.e. while the current time is between start and end.
+
+The spec above keeps the workload at the following replicas for each hour of every day.
+```
+ #80 -------- --------- ----------
+ # | | | | | |
+ #1 ------------ ----- -------- ----------
+ #(time) 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23
+```
+
+Remember **not to set a start time that is after the end time**. For example, if you set the following:
+```
+  crons:
+  - name: "cron2"
+    timezone: "Local"
+    description: "scale up"
+    start: "0 9 ? * *"
+    end: "0 6 ? * *"
+    targetReplicas: 80
+```
+This is not valid because the start is always later than the end. In that case the HPA controller simply keeps the workload's current desired replicas, which means the original replica count is preserved.
+
+
+#### Horizontal scaling process
+The cron-driven scaling process has six steps:
+
+1. EffectiveHPAController creates a HorizontalPodAutoscaler whose spec is injected with external cron metrics.
+2. HPAController reads the cron external metrics from KubeApiServer.
+3. KubeApiServer forwards the requests to MetricAdapter and MetricServer.
+4. MetricAdapter finds the cron scaler for the target HPA and detects whether the cron scaler is active, i.e. whether the current time is between the cron start and end schedule. If it is, it returns the `TargetReplicas` specified in the `CronSpec`.
+5. HPAController calculates the results of all metrics and proposes new desired replicas for the target by selecting the largest one.
+6. HPAController scales the target through the Scale API.
+
+
+When using EHPA, users can configure only the cron metric, so that EHPA acts as a pure cron HPA.
+
+Multiple crons of one EHPA are transformed into one external metric. HPA fetches this external cron metric and calculates the target replicas when reconciling, and it selects the largest proposed replica count across all metrics to scale the workload.
+
+
+
+#### Cron working with other metrics together
+
+EffectiveHorizontalPodAutoscaler is compatible with HorizontalPodAutoscaler (which is built into Kubernetes). So if you configure metrics for HPA such as cpu or memory, the HPA will scale by the real-time metrics it observes.
+
+With EHPA, users can configure CronMetric, PredictionMetric and OriginalMetric at the same time.
+
+**We highly recommend configuring metrics of all dimensions. They represent the cron replicas, the prior predicted replicas, and the posterior observed replicas.**
+
+This is a powerful feature, because HPA always picks the largest replica count calculated from all dimensional metrics, which guarantees your workload's QoS: when you configure the three types of autoscaling at the same time and the replicas calculated from the real observed metric are the largest, HPA uses that maximum, even if the replicas calculated from the prediction metric happen to be smaller for some unexpected reason. So you don't need to worry about QoS.
+
+
+#### Mechanism
+When the metrics adapter handles an external cron metric request, it performs the following steps.
+
+``` mermaid
+graph LR
+    A[Start] --> B{Active Cron?};
+    B -->|Yes| C(largest targetReplicas) --> F;
+    B -->|No| D{Work together with other metrics?};
+    D -->|Yes| G(minimum replicas) --> F;
+    D -->|No| H(current replicas) --> F;
+    F[Result workload replicas];
+```
+
+1. 
When no cron is active, there are two cases:
+
+   - If no other HPA metrics work together with the cron, return the workload's current replicas to keep the original desired replicas.
+   - If other HPA metrics work together with the cron, return the minimum value to remove the cron's impact on the other metrics. In this case the cron should not return the workload's original desired replicas, because other metrics may want to scale the workload in; since the HPA controller selects the maximum replicas computed across all metrics (this is the hard-coded default HPA policy), an inactive cron would otherwise still influence the result. Returning the minimum value removes the cron's effect while it is not active.
+
+
+2. When at least one cron is active, use the largest targetReplicas among the active crons. As a rule, more than one cron should not be active in the same time period; that is not a best practice.
+
+HPA gets the cron external metric value and then computes the replicas by itself (an illustrative sketch of this decision flow is given at the end of this page).
+
+#### Use Case
+
+Suppose you need to keep the workload at its minimum replicas at midnight, so you configure a cron. You also need the HPA to scale on the real-time metrics observed by the metrics server. Finally, you configure a prediction-driven metric to scale up early and scale down late based on the prediction.
+
+```yaml
+apiVersion: autoscaling.crane.io/v1alpha1
+kind: EffectiveHorizontalPodAutoscaler
+metadata:
+  name: php-apache-multi-dimensions
+spec:
+  # ScaleTargetRef is the reference to the workload that should be scaled.
+  scaleTargetRef:
+    apiVersion: apps/v1
+    kind: Deployment
+    name: php-apache
+  minReplicas: 1        # MinReplicas is the lower limit of replicas that the autoscaler can scale the target down to.
+  maxReplicas: 100      # MaxReplicas is the upper limit of replicas that the autoscaler can scale the target up to.
+  scaleStrategy: Auto   # ScaleStrategy indicates the strategy for scaling the target; the value can be "Auto" or "Manual".
+  # Metrics contains the specifications used to calculate the desired replica count.
+  metrics:
+  - type: Resource
+    resource:
+      name: cpu
+      target:
+        type: Utilization
+        averageUtilization: 50
+  # Prediction defines configurations for predicting resources.
+  # If unspecified, prediction is disabled by default.
+  prediction:
+    predictionWindowSeconds: 3600   # PredictionWindowSeconds is the time window in which to predict metrics in the future.
+    predictionAlgorithm:
+      algorithmType: dsp
+      dsp:
+        sampleInterval: "60s"
+        historyLength: "3d"
+  crons:
+  - name: "cron1"
+    description: "scale up"
+    start: "0 0 ? * 6"
+    end: "00 23 ? * 0"
+    targetReplicas: 100
+```
+
+
 ## FAQ
 
 ### error: unable to get metric crane_pod_cpu_usage
@@ -214,5 +437,6 @@ When checking the status for EffectiveHorizontalPodAutoscaler, you may see this
 reason: Not all workload's cpu metric are predictable, if predict your workload failed, it will show above errors.
 
 solution:
+
 - Just waiting. the Prediction algorithm need more time, you can see `DSP` section to know more about this algorithm.
 - EffectiveHorizontalPodAutoscaler have a protection mechanism when prediction failed, it will use the actual cpu utilization to do autoscaling. 
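+
+#### Sketch: cron metric decision flow
+
+For illustration only, the following Go sketch walks through the decision flow described in the Mechanism section above. It is not crane's actual implementation: the `cronSchedule` type, the hour-based activity check and the hard-coded minimum of 1 are hypothetical simplifications (the real MetricAdapter evaluates crontab schedules and serves the value through the external metrics API).
+
+```go
+package main
+
+import (
+	"fmt"
+	"time"
+)
+
+// cronSchedule is a simplified, hypothetical stand-in for one CronSpec entry;
+// only an hour window is kept here instead of real crontab expressions.
+type cronSchedule struct {
+	name           string
+	startHour      int // inclusive
+	endHour        int // exclusive
+	targetReplicas int32
+}
+
+// active reports whether the current time falls inside the cron's window.
+func (c cronSchedule) active(now time.Time) bool {
+	h := now.Hour()
+	return h >= c.startHour && h < c.endHour
+}
+
+// desiredCronReplicas mirrors the flow above: an active cron returns its
+// largest targetReplicas; an inactive cron returns the minimum value when it
+// works together with other metrics, and the current replicas otherwise.
+func desiredCronReplicas(crons []cronSchedule, hasOtherMetrics bool, currentReplicas int32, now time.Time) int32 {
+	var largest int32
+	activeFound := false
+	for _, c := range crons {
+		if !c.active(now) {
+			continue
+		}
+		activeFound = true
+		if c.targetReplicas > largest {
+			largest = c.targetReplicas
+		}
+	}
+	if activeFound {
+		return largest
+	}
+	if hasOtherMetrics {
+		return 1 // stand-in for the minimum value, so the other metrics decide
+	}
+	return currentReplicas
+}
+
+func main() {
+	crons := []cronSchedule{
+		{name: "scale-down", startHour: 0, endHour: 6, targetReplicas: 10},
+		{name: "scale-up", startHour: 6, endHour: 9, targetReplicas: 80},
+	}
+	fmt.Println(desiredCronReplicas(crons, true, 5, time.Now()))
+}
+```
+
+Because HPA takes the maximum across all metrics, returning the minimum for an inactive cron is what keeps it from overriding the other metrics, while returning the current replicas when cron is the only metric keeps the workload unchanged.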
diff --git a/docs/tutorials/using-effective-hpa-to-scaling-with-effectiveness.zh.md b/docs/tutorials/using-effective-hpa-to-scaling-with-effectiveness.zh.md index 0347177ab..6fbcf39ea 100644 --- a/docs/tutorials/using-effective-hpa-to-scaling-with-effectiveness.zh.md +++ b/docs/tutorials/using-effective-hpa-to-scaling-with-effectiveness.zh.md @@ -196,6 +196,248 @@ status: ``` +### Cron-based autoscaling +EffectiveHorizontalPodAutoscaler 支持基于 cron 的自动缩放。 + +除了基于监控指标,有时节假日和工作日的工作负载流量存在差异,简单的预测算法可能效果不佳。然后可以通过设置周末 cron 来支持更大数量的副本来弥补预测的不足。 + +对于一些非 web 流量的应用,比如一些应用不需要在周末使用,可以把工作负载的副本数减少到 1,也可以配置 cron 来降低你的服务成本。 + +以下是 `EHPA Spec` 中的 cron 主要字段: + +- `CronSpec`:可以设置多个 cron 自动伸缩配置,cron cycle 可以设置循环的开始时间和结束时间,并且工作负载的副本数可以在时间范围内持续保持为设定的目标值。 +- `Name`:cron 标识符 +- `TargetReplicas`:此 cron 时间范围内工作负载的目标副本数。 +- `Start`:cron 的开始时间,标准 linux crontab 格式 +- `End`:cron 的结束时间,标准 linux crontab 格式 + + +一些云厂商和社区当前的 cron 自动缩放功能存在一些缺点。 + +1. cron 能力单独提供,没有在全局视图中进行自动缩放,与 HPA 兼容性差,与其他缩放触发器冲突。 +2. cron 的语义和行为不是很匹配,使用时甚至很难理解,很容易误导用户,导致自动伸缩失败。 + +下图显示了当前 EHPA cron 自动伸缩实现与其他 cron 能力的对比。 + +![crane-keda-ali-compare-cron.png](../images/crane-keda-ali-compare-cron.png) + + +针对以上问题,EHPA 实现的 cron autoscaling 是在与 HPA 兼容的基础上设计的,cron 作为 HPA 的一个指标,与其他指标一起作用于工作负载。 + +另外,cron 的设置也很简单。单独配置 cron 时,不在活动时间范围内时,不会对工作负载执行缩放。 + + +#### Cron working without other metrics +假设你没有配置其他指标,你只需配置 cron 本身即可工作。 +```yaml +apiVersion: autoscaling.crane.io/v1alpha1 +kind: EffectiveHorizontalPodAutoscaler +metadata: + name: php-apache-local +spec: + # ScaleTargetRef 关联到需扩缩容的工作负载 + scaleTargetRef: + apiVersion: apps/v1 + kind: Deployment + name: php-apache + minReplicas: 1 # MinReplicas : autoscaler 缩放的最低副本数 + maxReplicas: 100 # MaxReplicas : autoscaler 缩放的最大副本数 + scaleStrategy: Auto # ScaleStrategy : 缩放工作负载时候,所采用的策略。可选值为 "Auto" "Manual" + # 最好将Cron Scheduling设置为一个完整的时间周期,例如: 一天,一周 + # 下面是一天的Cron Scheduling + #(targetReplicas) + #80 -------- --------- ---------- + # | | | | | | + #10 ------------ ----- -------- ---------- + #(time) 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 + #本地时区(timezone: "Local")意味着您使用运行Craned所在的服务器(或者可能是容器)的时区。例如,当Craned 是以UTC时区开始,那么它就是UTC。如果一开始是Asia/Shanghai,那么它就是Asia/Shanghai。 + crons: + - name: "cron1" + timezone: "Local" + description: "scale down" + start: "0 0 ? * *" + end: "0 6 ? * *" + targetReplicas: 10 + - name: "cron2" + timezone: "Local" + description: "scale up" + start: "0 6 ? * *" + end: "0 9 ? * *" + targetReplicas: 80 + - name: "cron3" + timezone: "Local" + description: "scale down" + start: "00 9 ? * *" + end: "00 11 ? * *" + targetReplicas: 10 + - name: "cron4" + timezone: "Local" + description: "scale up" + start: "00 11 ? * *" + end: "00 14 ? * *" + targetReplicas: 80 + - name: "cron5" + timezone: "Local" + description: "scale down" + start: "00 14 ? * *" + end: "00 17 ? * *" + targetReplicas: 10 + - name: "cron6" + timezone: "Local" + description: "scale up" + start: "00 17 ? * *" + end: "00 20 ? * *" + targetReplicas: 80 + - name: "cron7" + timezone: "Local" + description: "scale down" + start: "00 20 ? * *" + end: "00 00 ? 
* *" + targetReplicas: 10 +``` + +CronSpec 具有以下字段: + +* **name** 定义了 cron 的名字,cron 名字在同一个 Ehpa 中必须是唯一的 +* **description** 定义 cron 的详细描述。它可以是空的。 +* **timezone** 定义Crane所要调度的 cron 时区。如果未指定,则默认使用`UTC`时区。你可以将它设置为`Local`,这将使用正在运行的Crane容器所在的时区。其实,你定义`America/Los_Angeles`也是可以的。 +* **start** 定义 cron 开始调度的时间,是 crontab 格式。参考 [wiki-Cron](https://en.wikipedia.org/wiki/Cron) +* **end** 定义 cron 结束调度的时间,是 crontab 格式。参考 [wiki-Cron](https://en.wikipedia.org/wiki/Cron) +* **targetReplicas** 定义目标副本在 cron 处于活动状态时要扩展的工作负载,这意味着目标副本数介于开始时间和结束时间之间生效。 + +以上YAML定义,意味着一天当中,工作负载在每小时所需要保持的副本数。工作负载将会每天按照该规则执行。 + +``` + #80 -------- --------- ---------- + # | | | | | | + #1 ------------ ----- -------- ---------- + #(time) 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 +``` + +记住**不要设置开始时间在结束时间之后**。 + +例如,当你设置以下内容时: +``` + crons: + - name: "cron2" + timezone: "Local" + description: "scale up" + start: "0 9 ? * *" + end: "0 6 ? * *" + targetReplicas: 80 +``` +以上无效,因为开始总是晚于结束。 + +HPA 控制器始终根据工作负载所描述的副本数进行扩展,这意味着保留原有副本数不变。 + + +#### Horizontal scaling process + +cron 驱动和扩展过程有六个步骤: + +1. `EffectiveHPAController` 创建 `HorizontalPodAutoscaler`,它被注入到`spec`中的`external cron metrics`中。 +2. `HPAController` 从 `KubeApiServer` 读取 `external cron metrics` +3. `KubeApiServer` 将请求转发给 `MetricAdapter` 和 `MetricServer` +4. `MetricAdapter` 找到目标 hpa 的 `cron scaler`,并检测 `cron scaler` 是否处于活动状态。这意味着当前时间介于 cron 开始和结束计划时间之间。它将返回`TargetReplicas`中定义的`CronSpec`。 +5. `HPAController` 计算所有 metrics 结果,并通过选择最大的一个为目标副本数。并由此创建一个新的`scale replicas`。 +6. `HPAController` 使用 `Scale Api` 缩放目标 + + +使用 EHPA 时,用户可以只配置 cron metric,让 EHPA 用作 cron hpa。 + +一个 EHPA 的多个 crons 将转换为一个`external metrics`。 + +HPA 将获取`external metrics`并在协调时计算目标副本。当存在多个指标的工作负载时,HPA 将选择最大的副本数来扩展。 + + + +#### Cron working with other metrics together + +`EffectiveHorizontalPodAutoscaler` 兼容 `HorizontalPodAutoscaler`(内置在 kubernetes)。因此,如果你为 HPA 配置了指标,例如 cpu 或内存,那么 HPA 将根据它观察到的实时指标对副本数进行扩展。 + +通过 EHPA,用户可以同时配置 `CronMetric`、`PredictionMetric`、`OriginalMetric`。 + +**我们强烈建议你配置所有维度的指标。它们分别代表 cron 副本、先前预测的副本、后观察的副本。** + +这是一个强大的功能。因为 HPA 总是选择由所有维度`metrics`计算的最大副本进行扩展。 + +这将保证你工作负载的 QoS,当你同时配置三种类型的自动缩放时,根据实际观察到的指标计算的副本最大,然后它将使用最大的一个。 + +尽管由于某些意想不到的原因,导致由`PredictionMetric`计算的副本更小。因此,你不必担心 QoS。 + + +#### Mechanism +当`metrics adapter`处理`external cron metrics`请求时,`metrics adapter`将执行以下步骤。 + +``` mermaid +graph LR + A[Start] --> B{Active Cron?}; + B -->|Yes| C(largest targetReplicas) --> F; + B -->|No| D{Work together with other metrics?}; + D -->|Yes| G(minimum replicas) --> F; + D -->|No| H(current replicas) --> F; + F[Result workload replicas]; +``` + + +1. 没有活跃的cron,有两种情况: + + - 没有其他 hpa 指标与 cron 一起使用,然后返回当前工作负载副本以保留原始所需的副本 + - 当其他 hpa 指标与 cron 一起使用,将会返回最小值以消除cron对其他指标的影响。当 cron 与其他指标一起工作时,它不应该返回工作负载的原始副本数,因为可能有其他指标想要缩小工作负载的副本数。`HPA Controller`选择由所有指标计算的最大副本(这是硬代码中的 hpa 默认策略),cron 会影响 hpa。所以我们应该在 cron 不活动时移除 cron 效果,它应该返回最小值。 + + +2. 
有活跃的cron。我们使用`cron spec`中指定的最大目标副本。基本上,在同一时间段内不应有超过一个活跃的 cron,这不是最佳实践。 + +HPA 将获取`cron external metrics`,然后它会自行计算副本数。 + +#### Use Case + +当你需要在午夜将工作负载副本数保持在最低限度,根据该需求配置了 cron。 + +你需要 HPA 来获取指标服务器观察到的真实指标,以根据实时观察到的指标进行扩展。 + +最后,你配置一个`prediction-driven metric`,通过预测方式提前扩大规模并在末期缩小规模。 + +```yaml +apiVersion: autoscaling.crane.io/v1alpha1 +kind: EffectiveHorizontalPodAutoscaler +metadata: + name: php-apache-multi-dimensions +spec: + # ScaleTargetRef 关联到需扩缩容的工作负载 + scaleTargetRef: + apiVersion: apps/v1 + kind: Deployment + name: php-apache + minReplicas: 1 # MinReplicas : 缩放的最小副本数 + maxReplicas: 100 # MaxReplicas : 缩放的最大副本数 + scaleStrategy: Auto # ScaleStrategy : 缩放工作负载时候,所采用的策略。可选值为 "Auto" "Manual" + # Metrics 包含了用于计算所需副本数的指标。 + metrics: + - type: Resource + resource: + name: cpu + target: + type: Utilization + averageUtilization: 50 + # Prediction 的配置定义了需要预测的资源 + # 若不配置,则默认不启动 prediction + prediction: + predictionWindowSeconds: 3600 # PredictionWindowSeconds 是预测未来指标的时间窗口。 + predictionAlgorithm: + algorithmType: dsp + dsp: + sampleInterval: "60s" + historyLength: "3d" + crons: + - name: "cron1" + description: "scale up" + start: "0 0 ? * 6" + end: "00 23 ? * 0" + targetReplicas: 100 +``` + + ## 常见问题 ### 错误: unable to get metric crane_pod_cpu_usage @@ -217,4 +459,4 @@ status: 解决方案: - 等一段时间再看。预测算法 `DSP` 需要一定时间的数据才能进行预测。希望了解算法细节的可以查看算法的文档。 -- EffectiveHorizontalPodAutoscaler 提供一种保护机制,当预测失效时依然能通过实际的 CPU 使用率工作。 \ No newline at end of file +- EffectiveHorizontalPodAutoscaler 提供一种保护机制,当预测失效时依然能通过实际的 CPU 使用率工作。 diff --git a/docs/tutorials/using-qos-ensurance.zh.md b/docs/tutorials/using-qos-ensurance.zh.md new file mode 100644 index 000000000..f5f927b08 --- /dev/null +++ b/docs/tutorials/using-qos-ensurance.zh.md @@ -0,0 +1,182 @@ +# Qos Ensurance +Qos Ensurance 保证了运行在 Kubernetes 上的 Pod 的稳定性。 +当较高优先级的 Pod 受到资源竞争的影响时,Disable Schedule、Throttle以及Evict 将应用于低优先级的 Pod。 + + +## Qos Ensurance 架构 +Qos ensurance 的架构如下图所示。它包含三个模块。 + +1. `State Collector`:定期收集指标 +2. `Anomaly Analyzer`:使用收集指标,以分析节点是否发生异常 +3. `Action Executor`:执行回避动作,包括 Disable Scheduling、Throttle 和 Eviction。 + +![crane-qos-enurance](../images/crane-qos-ensurance.png) + +主要流程: + +1. `State Collector` 从 kube-apiserver 同步策略。 +2. 如果策略发生更改,`State Collector`会更新指标收集规则。 +3. `State Collector`定期收集指标。 +4. `State Collector`将指标传输到`Anomaly Analyzer`。 +5. `Anomaly Analyzer`对所有规则进行范围分析,以分析达到的回避阈值或恢复阈值。 +6. `Anomaly Analyzer`合并分析结果并通知`Action Executor`执行回避动作。 +7. `Action Executor`根据分析结果执行动作。 + +## Disable Scheduling + +定义 `AvoidanceAction` 和 `NodeQOSEnsurancePolicy`。 + +当节点 CPU 使用率触发回避阈值时,将该节点设置为禁用调度。 + + +示例 YAML 如下所示: + +```yaml title="AvoidanceAction" +apiVersion: ensurance.crane.io/v1alpha1 +kind: AvoidanceAction +metadata: + labels: + app: system + name: disablescheduling +spec: + description: disable schedule new pods to the node + coolDownSeconds: 300 #(1) +``` + +1. 节点从禁止调度状态到正常状态的最小等待时间 + +```yaml title="NodeQOSEnsurancePolicy" +apiVersion: ensurance.crane.io/v1alpha1 +kind: NodeQOSEnsurancePolicy +metadata: + name: "waterline1" + labels: + app: "system" +spec: + nodeQualityProbe: + timeoutSeconds: 10 + nodeLocalGet: + localCacheTTLSeconds: 60 + objectiveEnsurances: + - name: "cpu-usage" + avoidanceThreshold: 2 #(1) + restoreThreshold: 2 #(2) + actionName: "disablescheduling" #(3) + strategy: "None" #(4) + metricRule: + name: "cpu_total_usage" #(5) + value: 4000 #(6) +``` + +1. 当达到阈值并持续多次,那么我们认为规则被触发 +2. 当阈值未达到并继续多次, 那么我们认为规则已恢复 +3. 关联到 AvoidanceAction 名称 +4. 动作的策略,你可以将其设置为“预览”以不实际执行 +5. 指标名称 +6. 
指标的阈值 + +请观看视频以了解更多`Disable Scheduling`的细节。 + + + +## Throttle + +定义 `AvoidanceAction` 和 `NodeQOSEnsurancePolicy`。 + +当节点 CPU 使用率触发回避阈值时,将执行节点的`Throttle Action`。 + +示例 YAML 如下所示: + +```yaml title="AvoidanceAction" +apiVersion: ensurance.crane.io/v1alpha1 +kind: AvoidanceAction +metadata: + name: throttle + labels: + app: system +spec: + coolDownSeconds: 300 + throttle: + cpuThrottle: + minCPURatio: 10 #(1) + stepCPURatio: 10 #(2) + description: "throttle low priority pods" +``` + +1. CPU 配额的最小比例,如果 pod 被限制低于这个比例,就会被设置为这个。 +2. 该配置设置给`Throttle Action`。它将在每个触发的回避动作中减少这个 CPU 配额占比。它会在每个恢复动作中增加这个 CPU 配额占比。 + +```yaml title="NodeQOSEnsurancePolicy" +apiVersion: ensurance.crane.io/v1alpha1 +kind: NodeQOSEnsurancePolicy +metadata: + name: "waterline2" + labels: + app: "system" +spec: + nodeQualityProbe: + timeoutSeconds: 10 + nodeLocalGet: + localCacheTTLSeconds: 60 + objectiveEnsurances: + - name: "cpu-usage" + avoidanceThreshold: 2 + restoredThreshold: 2 + actionName: "throttle" + strategy: "None" + metricRule: + name: "cpu_total_usage" + value: 6000 +``` + +## Eviction + +下面的 YAML 是另一种情况,当节点 CPU 使用率触发阈值时,节点上的低优先级 pod 将被驱逐。 + +```yaml title="AvoidanceAction" +apiVersion: ensurance.crane.io/v1alpha1 +kind: AvoidanceAction +metadata: + name: eviction + labels: + app: system +spec: + coolDownSeconds: 300 + eviction: + terminationGracePeriodSeconds: 30 #(1) + description: "evict low priority pods" +``` + +1. pod 需要优雅终止的持续时间(以秒为单位)。 + +```yaml title="NodeQOSEnsurancePolicy" +apiVersion: ensurance.crane.io/v1alpha1 +kind: NodeQOSEnsurancePolicy +metadata: + name: "waterline3" + labels: + app: "system" +spec: + nodeQualityProbe: + timeoutSeconds: 10 + nodeLocalGet: + localCacheTTLSeconds: 60 + objectiveEnsurances: + - name: "cpu-usage" + avoidanceThreshold: 2 + restoreThreshold: 2 + actionName: "evict" + strategy: "Preview" #(1) + metricRule: + name: "cpu_total_usage" + value: 6000 +``` + +1. 回避动作策略。当设置为`Preview`时,将不会被实际执行 + +## Supported Metrics + +Name | Description +---------|------------- +cpu_total_usage | node cpu usage +cpu_total_utilization | node cpu utilization diff --git a/docs/tutorials/using-time-series-prediction.zh.md b/docs/tutorials/using-time-series-prediction.zh.md new file mode 100644 index 000000000..f7e35488b --- /dev/null +++ b/docs/tutorials/using-time-series-prediction.zh.md @@ -0,0 +1,110 @@ +# TimeSeriesPrediction + +## Overview +Knowing the future makes things easier for us. 
+ +--- + +许多业务在时间序列上天然存在周期性的,尤其是对于那些直接或间接为“人”服务的业务。这种周期性是由人们日常活动的规律性决定的。例如,人们习惯于中午和晚上点外卖;早晚总有交通高峰;即使是搜索等模式不那么明显的服务,夜间的请求量也远低于白天时间。对于这类业务相关的应用来说,从过去几天的历史数据中推断出次日的指标,或者从上周一的数据中推断出下周一的访问量是很自然的想法。通过预测未来 24 小时内的指标或流量模式,我们可以更好地管理我们的应用程序实例,稳定我们的系统,同时降低成本。 + +`TimeSeriesPrediction` 被用于预测 Kubernetes 对象指标。它基于 `PredictionCore` 进行预测。 + + +## Features +`TimeSeriesPrediction` 的示例 yaml 如下所示: + +```yaml title="TimeSeriesPrediction" +apiVersion: prediction.crane.io/v1alpha1 +kind: TimeSeriesPrediction +metadata: + name: node-resource-percentile + namespace: default +spec: + targetRef: + kind: Node + name: 192.168.56.166 + predictionWindowSeconds: 600 + predictionMetrics: + - resourceIdentifier: node-cpu + type: ResourceQuery + resourceQuery: cpu + algorithm: + algorithmType: "percentile" + percentile: + sampleInterval: "1m" + minSampleWeight: "1.0" + histogram: + maxValue: "10000.0" + epsilon: "1e-10" + halfLife: "12h" + bucketSize: "10" + firstBucketSize: "40" + bucketSizeGrowthRatio: "1.5" + - resourceIdentifier: node-mem + type: ResourceQuery + resourceQuery: memory + algorithm: + algorithmType: "percentile" + percentile: + sampleInterval: "1m" + minSampleWeight: "1.0" + histogram: + maxValue: "1000000.0" + epsilon: "1e-10" + halfLife: "12h" + bucketSize: "10" + firstBucketSize: "40" + bucketSizeGrowthRatio: "1.5" +``` + +* `spec.targetRef` 定义了对 Kubernetes 对象的引用,包括 Node 或其他工作负载,例如 Deployment。 +* `spec.predictionMetrics` 定义了关于 `spec.targetRef` 的指标。 +* `spec.predictionWindowSeconds` 是预测时间序列持续时间。`TimeSeriesPredictionController` 将轮换 `spec.Status` 中的预测数据,以供消费者使用预测的时间序列数据。 + +## Prediction Metrics +```yaml title="TimeSeriesPrediction" +apiVersion: prediction.crane.io/v1alpha1 +kind: TimeSeriesPrediction +metadata: + name: node-resource-percentile + namespace: default +spec: + predictionMetrics: + - resourceIdentifier: node-cpu + type: ResourceQuery + resourceQuery: cpu + algorithm: + algorithmType: "percentile" + percentile: + sampleInterval: "1m" + minSampleWeight: "1.0" + histogram: + maxValue: "10000.0" + epsilon: "1e-10" + halfLife: "12h" + bucketSize: "10" + firstBucketSize: "40" + bucketSizeGrowthRatio: "1.5" +``` + +### Metric Type + +现在我们只支持 `prometheus` 作为数据源。我们定义`MetricType`与数据源进行结合。但是现在可能有些数据源不支持 `MetricType`。 + +指标查询有以下三种类型: + + - `ResourceQuery`是 kubernetes 内置的资源指标,例如 cpu 或 memory。Crane目前只支持 CPU 和内存。 + - `RawQuery`是通过 DSL 的查询,比如 prometheus 查询语句。现在已支持 Prometheus 。 + - `ExpressionQuery`是一个表达式查询。 + + +### Algorithm +`Algorithm`定义算法类型和参数来预测指标。现在有两种算法: + + - `dsp`是一种预测时间序列的算法,它基于 FFT(快速傅里叶变换),擅长预测一些具有季节性和周期的时间序列。 + - `percentile`是一种估计时间序列,并找到代表过去时间序列的推荐值的算法,它基于指数衰减权重直方图统计。它是用来估计一个时间序列的,它不擅长预测一个时间序列,虽然`percentile`可以输出一个时间序列的预测数据,但是都是一样的值。**所以如果你想预测一个时间序列,dsp 是一个更好的选择。** + + +#### dsp params + +#### percentile params diff --git a/examples/config_set.yaml b/examples/config_set.yaml index debf4260e..11f96febf 100644 --- a/examples/config_set.yaml +++ b/examples/config_set.yaml @@ -4,15 +4,15 @@ configs: - targets: [] properties: resource.cpu-request-percentile: "0.98" - ehpa.deployment-min-replicas: "1" - ehpa.statefulset-min-replicas: "1" - ehpa.workload-min-replicas: "1" - ehpa.pod-min-ready-seconds: "30" - ehpa.pod-available-ratio: "0.5" - ehpa.default-min-replicas: "2" - ehpa.max-replicas-factor: "3" - ehpa.min-cpu-usage-threshold: "10" - ehpa.fluctuation-threshold: "3" - ehpa.min-cpu-target-utilization: "30" - ehpa.max-cpu-target-utilization: "75" - ehpa.reference-hpa: "true" \ No newline at end of file + replicas.workload-min-replicas: "3" + replicas.pod-min-ready-seconds: "30" + 
replicas.pod-available-ratio: "0.5" + replicas.default-min-replicas: "3" + replicas.max-replicas-factor: "3" + replicas.min-cpu-usage-threshold: "1" + replicas.fluctuation-threshold: "1.5" + replicas.min-cpu-target-utilization: "30" + replicas.max-cpu-target-utilization: "75" + replicas.cpu-target-utilization: "50" + replicas.cpu-percentile: "95" + replicas.reference-hpa: "true" \ No newline at end of file diff --git a/examples/ensurance/disablescheduling-when-ext-cpu-total-distribute.yaml b/examples/ensurance/disablescheduling-when-ext-cpu-total-distribute.yaml new file mode 100644 index 000000000..02308db96 --- /dev/null +++ b/examples/ensurance/disablescheduling-when-ext-cpu-total-distribute.yaml @@ -0,0 +1,20 @@ +apiVersion: ensurance.crane.io/v1alpha1 +kind: NodeQOSEnsurancePolicy +metadata: + name: "disablescheduling-when-ext-cpu-total-distribute" + labels: + app: "system" +spec: + nodeQualityProbe: + timeoutSeconds: 10 + nodeLocalGet: + localCacheTTLSeconds: 60 + objectiveEnsurances: + - name: "ext_cpu_total_distribute" + avoidanceThreshold: 2 + restoreThreshold: 2 + actionName: "disablescheduling" + strategy: "None" + metricRule: + name: "ext_cpu_total_distribute" + value: 110 \ No newline at end of file diff --git a/go.mod b/go.mod index 9a5b1a7aa..f18108b85 100644 --- a/go.mod +++ b/go.mod @@ -4,7 +4,7 @@ go 1.17 require ( github.com/go-echarts/go-echarts/v2 v2.2.4 - github.com/gocrane/api v0.4.1-0.20220507041258-d376db2b4ad4 + github.com/gocrane/api v0.4.1-0.20220520134105-09d430d903ac github.com/google/cadvisor v0.39.2 github.com/mjibson/go-dsp v0.0.0-20180508042940-11479a337f12 github.com/prometheus/client_golang v1.11.0 @@ -46,10 +46,12 @@ require ( ) require ( + github.com/JeffAshton/win_pdh v0.0.0-20161109143554-76bb4ee9f0ab // indirect github.com/Microsoft/go-winio v0.5.1 // indirect github.com/NYTimes/gziphandler v1.1.1 // indirect github.com/PuerkitoBio/purell v1.1.1 // indirect github.com/PuerkitoBio/urlesc v0.0.0-20170810143723-de5bf2ad4578 // indirect + github.com/aws/aws-sdk-go v1.38.49 // indirect github.com/beorn7/perks v1.0.1 // indirect github.com/bits-and-blooms/bitset v1.2.0 // indirect github.com/blang/semver v3.5.1+incompatible // indirect @@ -57,6 +59,7 @@ require ( github.com/cilium/ebpf v0.6.2 // indirect github.com/containerd/console v1.0.2 // indirect github.com/containerd/containerd v1.4.4 // indirect + github.com/containerd/ttrpc v1.0.2 // indirect github.com/coreos/go-semver v0.3.0 // indirect github.com/coreos/go-systemd/v22 v22.3.2 // indirect github.com/cyphar/filepath-securejoin v0.2.2 // indirect @@ -92,6 +95,7 @@ require ( github.com/grpc-ecosystem/grpc-gateway v1.16.0 // indirect github.com/imdario/mergo v0.3.12 // indirect github.com/inconshreveable/mousetrap v1.0.0 // indirect + github.com/jmespath/go-jmespath v0.4.0 // indirect github.com/josharian/intern v1.0.0 // indirect github.com/karrick/godirwalk v1.16.1 // indirect github.com/leodido/go-urn v1.2.0 // indirect @@ -99,6 +103,7 @@ require ( github.com/matttproud/golang_protobuf_extensions v1.0.2-0.20181231171920-c182affec369 // indirect github.com/mindprince/gonvml v0.0.0-20190828220739-9ebdce4bb989 // indirect github.com/mistifyio/go-zfs v2.1.2-0.20190413222219-f784269be439+incompatible // indirect + github.com/moby/spdystream v0.2.0 // indirect github.com/moby/sys/mountinfo v0.4.1 // indirect github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd // indirect github.com/modern-go/reflect2 v1.0.2 // indirect @@ -141,6 +146,7 @@ require ( go.uber.org/zap v1.19.0 // indirect 
golang.org/x/text v0.3.7 // indirect golang.org/x/time v0.0.0-20210723032227-1f47c861a9ac // indirect + golang.org/x/xerrors v0.0.0-20200804184101-5ec99f83aff1 // indirect gomodules.xyz/jsonpatch/v2 v2.2.0 // indirect google.golang.org/appengine v1.6.7 // indirect google.golang.org/protobuf v1.27.1 // indirect @@ -150,6 +156,11 @@ require ( gopkg.in/yaml.v2 v2.4.0 // indirect gopkg.in/yaml.v3 v3.0.0-20210107192922-496545a6307b // indirect k8s.io/apiextensions-apiserver v0.22.2 // indirect + k8s.io/cloud-provider v0.22.3 // indirect + k8s.io/component-helpers v0.22.3 // indirect + k8s.io/kube-scheduler v0.0.0 // indirect + k8s.io/kubelet v0.0.0 // indirect + k8s.io/mount-utils v0.22.3 // indirect k8s.io/utils v0.0.0-20210819203725-bdf08cb9a70a // indirect sigs.k8s.io/apiserver-network-proxy/konnectivity-client v0.0.22 // indirect sigs.k8s.io/structured-merge-diff/v4 v4.1.2 // indirect diff --git a/go.sum b/go.sum index e4ee71bad..d7eaf7871 100644 --- a/go.sum +++ b/go.sum @@ -61,6 +61,7 @@ github.com/BurntSushi/toml v0.3.1 h1:WXkYYl6Yr3qBf1K79EBnL4mak0OimBfB0XUf9Vl28OQ github.com/BurntSushi/toml v0.3.1/go.mod h1:xHWCNGjB5oqiDr8zfno3MHue2Ht5sIBksp03qcyfWMU= github.com/BurntSushi/xgb v0.0.0-20160522181843-27f122750802/go.mod h1:IVnqGOEym/WlBOVXweHU+Q+/VP0lqqI8lqeDx9IjBqo= github.com/GoogleCloudPlatform/k8s-cloud-provider v0.0.0-20200415212048-7901bc822317/go.mod h1:DF8FZRxMHMGv/vP2lQP6h+dYzzjpuRn24VeRiYn3qjQ= +github.com/JeffAshton/win_pdh v0.0.0-20161109143554-76bb4ee9f0ab h1:UKkYhof1njT1/xq4SEg5z+VpTgjmNeHwPGRQl7takDI= github.com/JeffAshton/win_pdh v0.0.0-20161109143554-76bb4ee9f0ab/go.mod h1:3VYc5hodBMJ5+l/7J4xAyMeuM2PNuepvHlGs8yilUCA= github.com/MakeNowJust/heredoc v0.0.0-20170808103936-bb23615498cd/go.mod h1:64YHyfSL2R96J44Nlwm39UHepQbyR5q10x7iYa1ks2E= github.com/Microsoft/go-winio v0.4.15-0.20190919025122-fc70bd9a86b5/go.mod h1:tTuCMEN+UleMWgg9dVx4Hu52b1bJo+59jBh3ajtinzw= @@ -92,6 +93,7 @@ github.com/armon/go-radix v0.0.0-20180808171621-7fddfc383310/go.mod h1:ufUuZ+zHj github.com/asaskevich/govalidator v0.0.0-20190424111038-f61b66f89f4a/go.mod h1:lB+ZfQJz7igIIfQNfa7Ml4HSf2uFQQRzpGGRXenZAgY= github.com/auth0/go-jwt-middleware v1.0.1/go.mod h1:YSeUX3z6+TF2H+7padiEqNJ73Zy9vXW72U//IgN0BIM= github.com/aws/aws-sdk-go v1.35.24/go.mod h1:tlPOdRjfxPBpNIwqDj61rmsnA85v9jc0Ps9+muhnW+k= +github.com/aws/aws-sdk-go v1.38.49 h1:E31vxjCe6a5I+mJLmUGaZobiWmg9KdWaud9IfceYeYQ= github.com/aws/aws-sdk-go v1.38.49/go.mod h1:hcU610XS61/+aQV88ixoOzUoG7v3b31pl2zKMmprdro= github.com/benbjohnson/clock v1.0.3/go.mod h1:bGMdMPoPVvcYyt1gHDf4J2KE153Yf9BuiUKYMaxlTDM= github.com/benbjohnson/clock v1.1.0 h1:Q92kusRqC1XV2MjkWETPvjJVqKetz1OzxZB7mHJLju8= @@ -161,6 +163,7 @@ github.com/containerd/ttrpc v0.0.0-20190828154514-0e0f228740de/go.mod h1:PvCDdDG github.com/containerd/ttrpc v1.0.2 h1:2/O3oTZN36q2xRolk0a2WWGgh7/Vf/liElg5hFYLX9U= github.com/containerd/ttrpc v1.0.2/go.mod h1:UAxOpgT9ziI0gJrmKvgcZivgxOp8iFPSk8httJEt98Y= github.com/containerd/typeurl v0.0.0-20180627222232-a93fcdb778cd/go.mod h1:Cm3kwCdlkCfMSHURc+r6fwoGH6/F1hH3S4sg0rLFWPc= +github.com/containerd/typeurl v1.0.1 h1:PvuK4E3D5S5q6IqsPDCy928FhP0LUIGcmZ/Yhgp5Djw= github.com/containerd/typeurl v1.0.1/go.mod h1:TB1hUtrpaiO88KEK56ijojHS1+NeF0izUACaJW2mdXg= github.com/containernetworking/cni v0.8.1/go.mod h1:LGwApLUm2FpoOfxTDEeq8T9ipbpZ61X79hmU3w8FmsY= github.com/coredns/caddy v1.1.0/go.mod h1:A6ntJQlAWuQfFlsd9hvigKbo2WS0VUs2l1e2F+BawD4= @@ -206,6 +209,7 @@ github.com/docker/go-units v0.4.0/go.mod h1:fgPhTUdO+D/Jk86RDLlptpiXQzgHJF7gydDD github.com/docopt/docopt-go 
v0.0.0-20180111231733-ee0de3bc6815/go.mod h1:WwZ+bS3ebgob9U8Nd0kOddGdZWjyMGR8Wziv+TBNwSE= github.com/dustin/go-humanize v1.0.0 h1:VSnTsYCnlFHaM2/igO1h6X3HA71jcobQuxemgkq4zYo= github.com/dustin/go-humanize v1.0.0/go.mod h1:HtrtbFcZ19U5GC7JDqmcUSB87Iq5E25KnS6fMYU6eOk= +github.com/elazarl/goproxy v0.0.0-20180725130230-947c36da3153 h1:yUdfgN0XgIJw7foRItutHYUIhlcKzcSf5vDpdhQAKTc= github.com/elazarl/goproxy v0.0.0-20180725130230-947c36da3153/go.mod h1:/Zj4wYkgs4iZTTu3o/KG3Itv/qCCa8VVMlb3i9OVuzc= github.com/emicklei/go-restful v0.0.0-20170410110728-ff4f55a20633/go.mod h1:otzb+WCGbkyDHkqmQmT5YD2WR4BBwUdeQoFo8l/7tVs= github.com/emicklei/go-restful v2.9.5+incompatible/go.mod h1:otzb+WCGbkyDHkqmQmT5YD2WR4BBwUdeQoFo8l/7tVs= @@ -306,12 +310,10 @@ github.com/gobwas/pool v0.2.1 h1:xfeeEhW7pwmX8nuLVlqbzVc7udMDrwetjEv+TZIz1og= github.com/gobwas/pool v0.2.1/go.mod h1:q8bcK0KcYlCgd9e7WYLm9LpyS+YeLd8JVDW6WezmKEw= github.com/gobwas/ws v1.1.0-rc.5 h1:QOAag7FoBaBYYHRqzqkhhd8fq5RTubvI4v3Ft/gDVVQ= github.com/gobwas/ws v1.1.0-rc.5/go.mod h1:nzvNcVha5eUziGrbxFCo6qFIojQHjJV5cLYIbezhfL0= -github.com/gocrane/api v0.3.0 h1:ziH+zYQy/shiqQ6yskMs67e+bQ9WmPp8eCVhLW85NFQ= -github.com/gocrane/api v0.3.0/go.mod h1:GxI+t9AW8+NsHkz2JkPBIJN//9eLUjTZl1ScYAbXMbk= -github.com/gocrane/api v0.4.0 h1:1IWP3gbkp3T4kX68w4+PfqUr4Cb/gaJrihLYg6aKOLY= -github.com/gocrane/api v0.4.0/go.mod h1:GxI+t9AW8+NsHkz2JkPBIJN//9eLUjTZl1ScYAbXMbk= github.com/gocrane/api v0.4.1-0.20220507041258-d376db2b4ad4 h1:vGDg3G6y661KAlhjf/8/r8JCjaIi6aV8szCP+MZRU3Y= github.com/gocrane/api v0.4.1-0.20220507041258-d376db2b4ad4/go.mod h1:GxI+t9AW8+NsHkz2JkPBIJN//9eLUjTZl1ScYAbXMbk= +github.com/gocrane/api v0.4.1-0.20220520134105-09d430d903ac h1:lBKVVOA4del0Plj80PCE+nglxaJxaXanCv5N6a3laVY= +github.com/gocrane/api v0.4.1-0.20220520134105-09d430d903ac/go.mod h1:GxI+t9AW8+NsHkz2JkPBIJN//9eLUjTZl1ScYAbXMbk= github.com/godbus/dbus/v5 v5.0.3/go.mod h1:xhWf0FNVPg57R7Z0UbKHbJfkEywrmjJnf7w5xrFpKfA= github.com/godbus/dbus/v5 v5.0.4 h1:9349emZab16e7zQvpmsbtjc18ykshndd8y2PG3sgJbA= github.com/godbus/dbus/v5 v5.0.4/go.mod h1:xhWf0FNVPg57R7Z0UbKHbJfkEywrmjJnf7w5xrFpKfA= @@ -460,7 +462,9 @@ github.com/inconshreveable/mousetrap v1.0.0 h1:Z8tu5sraLXCXIcARxBp/8cbvlwVa7Z1NH github.com/inconshreveable/mousetrap v1.0.0/go.mod h1:PxqpIevigyE2G7u3NXJIT2ANytuPF1OarO4DADm73n8= github.com/ishidawataru/sctp v0.0.0-20190723014705-7c296d48a2b5/go.mod h1:DM4VvS+hD/kDi1U1QsX2fnZowwBhqD0Dk3bRPKF/Oc8= github.com/jessevdk/go-flags v1.4.0/go.mod h1:4FA24M0QyGHXBuZZK/XkWh8h0e1EYbRYJSGM75WSRxI= +github.com/jmespath/go-jmespath v0.4.0 h1:BEgLn5cpjn8UN1mAw4NjwDrS35OdebyEtFe+9YPoQUg= github.com/jmespath/go-jmespath v0.4.0/go.mod h1:T8mJZnbsbmF+m6zOOFylbeCJqk5+pHWvzYPziyZiYoo= +github.com/jmespath/go-jmespath/internal/testify v1.5.1 h1:shLQSRRSCCPj3f2gpwzGwWFoC7ycTf1rcQZHOlsJ6N8= github.com/jmespath/go-jmespath/internal/testify v1.5.1/go.mod h1:L3OGu8Wl2/fWfCI6z80xFu9LTZmf1ZRjMHUOPmWr69U= github.com/jonboulle/clockwork v0.1.0/go.mod h1:Ii8DK3G1RaLaWxj9trq07+26W01tbo22gdxWY5EU2bo= github.com/jonboulle/clockwork v0.2.2 h1:UOGuzwb1PwsrDAObMuhUnj0p5ULPj8V/xJ7Kx9qUBdQ= @@ -543,6 +547,7 @@ github.com/mitchellh/mapstructure v1.1.2/go.mod h1:FVVH3fgwuzCH5S8UJGiWEs2h04kUh github.com/mjibson/go-dsp v0.0.0-20180508042940-11479a337f12 h1:dd7vnTDfjtwCETZDrRe+GPYNLA1jBtbZeyfyE8eZCyk= github.com/mjibson/go-dsp v0.0.0-20180508042940-11479a337f12/go.mod h1:i/KKcxEWEO8Yyl11DYafRPKOPVYTrhxiTRigjtEEXZU= github.com/moby/ipvs v1.0.1/go.mod h1:2pngiyseZbIKXNv7hsKj3O9UEz30c53MT9005gt2hxQ= +github.com/moby/spdystream v0.2.0 
h1:cjW1zVyyoiM0T7b6UoySUFqzXMoqRckQtXwGPiBhOM8= github.com/moby/spdystream v0.2.0/go.mod h1:f7i0iNDQJ059oMTcWxx8MA/zKFIuD/lY+0GqbN2Wy8c= github.com/moby/sys/mountinfo v0.4.1 h1:1O+1cHA1aujwEwwVMa2Xm2l+gIpUHyd3+D+d7LZh1kM= github.com/moby/sys/mountinfo v0.4.1/go.mod h1:rEr8tzG/lsIZHBtN/JjGG+LMYx9eXgW2JI+6q0qou+A= @@ -1261,11 +1266,13 @@ k8s.io/autoscaler/vertical-pod-autoscaler v0.10.0/go.mod h1:sOm4JDB+0ZrrqFuscsbW k8s.io/cli-runtime v0.22.3/go.mod h1:um6JvCxV9Hrhq0zCUxcqYoY7/wF64g6IYgOViI8sg6Q= k8s.io/client-go v0.22.3 h1:6onkOSc+YNdwq5zXE0wFXicq64rrym+mXwHu/CPVGO4= k8s.io/client-go v0.22.3/go.mod h1:ElDjYf8gvZsKDYexmsmnMQ0DYO8W9RwBjfQ1PI53yow= +k8s.io/cloud-provider v0.22.3 h1:ZsWdB0WmyjKlE901EM14BuSvnN+QPGrCGjcfDc+b5NI= k8s.io/cloud-provider v0.22.3/go.mod h1:GsKMR5EnNH4zcfkEvOxBPEZVuRvadVRkZvGqYxxBvO4= k8s.io/cluster-bootstrap v0.22.3/go.mod h1:FVBAeGJ/T6QbNgGb7DX98FCjExJnNLsRXtGRMjEQ26I= k8s.io/code-generator v0.22.3/go.mod h1:eV77Y09IopzeXOJzndrDyCI88UBok2h6WxAlBwpxa+o= k8s.io/component-base v0.22.3 h1:/+hryAW03u3FpJQww+GSMsArJNUbGjH66lrgxaRynLU= k8s.io/component-base v0.22.3/go.mod h1:kuybv1miLCMoOk3ebrqF93GbQHQx6W2287FC0YEQY6s= +k8s.io/component-helpers v0.22.3 h1:08tn+T8HnjRTwDP2ErIBhHGvPcYJf5zWaWW83golHWc= k8s.io/component-helpers v0.22.3/go.mod h1:7OVySVH5elhHKuJKUOxZEfpT1Bm3ChmBQZHmuFfbGHk= k8s.io/controller-manager v0.22.3/go.mod h1:4cvQGMvYf6IpTY08/NigEiI5UrN/cbtOe5e5WepYmcQ= k8s.io/cri-api v0.22.3 h1:6C6Af3BooYbmZzZydibKgyJvZK1MRJQ/sSsvjunos2o= @@ -1285,14 +1292,17 @@ k8s.io/kube-openapi v0.0.0-20210421082810-95288971da7e/go.mod h1:vHXdDvt9+2spS2R k8s.io/kube-openapi v0.0.0-20210817084001-7fbd8d59e5b8 h1:Xxl9TLJ30BJ1pGWfGZnqbpww2rwOt3RAzbSz+omQGtg= k8s.io/kube-openapi v0.0.0-20210817084001-7fbd8d59e5b8/go.mod h1:foAE7XkrXQ1Qo2eWsW/iWksptrVdbl6t+vscSdmmGjk= k8s.io/kube-proxy v0.22.3/go.mod h1:9ta1U8GKKo6by981sN/L6MhFJzPWxMdfh7plVPH1I2s= +k8s.io/kube-scheduler v0.22.3 h1:VDd3zuXPUNTT1WSt/s+1Pk4lnfBNcVdfoijHv0MnV/4= k8s.io/kube-scheduler v0.22.3/go.mod h1:jVLHSttd8cSejBLOeiWE+g8etA6XdOBGiR8tI577OhU= k8s.io/kubectl v0.22.3/go.mod h1:gcpQHPOx+Jke9Og6Li7YxR/ZuaOtFUeJw7xHH617tHs= +k8s.io/kubelet v0.22.3 h1:C21Kg66Zzvc21uJITEPg4stGMcSZsR1JB+7+6Uwm8zs= k8s.io/kubelet v0.22.3/go.mod h1:9nUZNGUigU2uAIm7kgf8BsvYDI9KjIE5nt9+yI1+p7w= k8s.io/kubernetes v1.22.3 h1:/eFfR5S2Vxn0t9kcLVAZXQFloKMkklWQIf5e0hFbzlA= k8s.io/kubernetes v1.22.3/go.mod h1:Snea7fgIObGgHmLbUJ3OgjGEr5bjj16iEdp5oHS6eS8= k8s.io/legacy-cloud-providers v0.22.3/go.mod h1:eEOOaRtP2PuCVkjZvuTPa6ZgyPpzJkCVqpE3YtuArLQ= k8s.io/metrics v0.22.3 h1:G4EGLIcm9CSlpLRXKjIJiZqM/l45xasz2BOiK4qJCNo= k8s.io/metrics v0.22.3/go.mod h1:HbLFLRKtXzoC/6tHLQAlO9AeOBXZp2eB6SsgkbujoNI= +k8s.io/mount-utils v0.22.3 h1:3BIJ5lgA4qIhQ2/YhpCs9VMDP+/CiBGKZ8N+zeHFCWE= k8s.io/mount-utils v0.22.3/go.mod h1:dHl6c2P60T5LHUnZxVslyly9EDCMzvhtISO5aY+Z4sk= k8s.io/pod-security-admission v0.22.3/go.mod h1:xtkf/UhVWICokQLSDvD+8plfGkTQW4VTJvnixVWCeWk= k8s.io/sample-apiserver v0.22.3/go.mod h1:HuEOdD/pT5R7gKNr2REb62uabZaJuFZyY3wUd86nFCA= diff --git a/mkdocs.yml b/mkdocs.yml index 106166207..7512365b7 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -58,7 +58,11 @@ markdown_extensions: linenums_style: pymdownx-inline - pymdownx.inlinehilite - pymdownx.snippets - - pymdownx.superfences + - pymdownx.superfences: + custom_fences: + - name: mermaid + class: mermaid + format: !!python/name:pymdownx.superfences.fence_code_format - pymdownx.tabbed: alternate_style: true - pymdownx.arithmatex: diff --git a/pkg/agent/agent.go b/pkg/agent/agent.go index e8b3778d7..237faba04 100644 --- 
a/pkg/agent/agent.go +++ b/pkg/agent/agent.go @@ -53,7 +53,7 @@ type Agent struct { } func NewAgent(ctx context.Context, - nodeName, runtimeEndpoint string, + nodeName, runtimeEndpoint, cgroupDriver string, kubeClient *kubernetes.Clientset, craneClient *craneclientset.Clientset, podInformer coreinformers.PodInformer, @@ -77,7 +77,7 @@ func NewAgent(ctx context.Context, } utilruntime.Must(ensuranceapi.AddToScheme(scheme.Scheme)) - cadvisorManager := cadvisor.NewCadvisorManager() + cadvisorManager := cadvisor.NewCadvisorManager(cgroupDriver) exclusiveCPUSet := cm.DefaultExclusiveCPUSet if craneCpuSetManager := utilfeature.DefaultFeatureGate.Enabled(features.CraneCpuSetManager); craneCpuSetManager { cpuManager := cm.NewAdvancedCpuManager(podInformer, runtimeEndpoint, cadvisorManager) diff --git a/pkg/controller/analytics/analytics_controller.go b/pkg/controller/analytics/analytics_controller.go index a1e6dbc7c..716324f9a 100644 --- a/pkg/controller/analytics/analytics_controller.go +++ b/pkg/controller/analytics/analytics_controller.go @@ -3,6 +3,7 @@ package analytics import ( "context" "fmt" + "reflect" "strings" "sync" "time" @@ -25,7 +26,9 @@ import ( "k8s.io/client-go/tools/record" "k8s.io/klog/v2" ctrl "sigs.k8s.io/controller-runtime" + "sigs.k8s.io/controller-runtime/pkg/builder" "sigs.k8s.io/controller-runtime/pkg/client" + "sigs.k8s.io/controller-runtime/pkg/predicate" "sigs.k8s.io/yaml" analysisv1alph1 "github.com/gocrane/api/analysis/v1alpha1" @@ -77,10 +80,22 @@ func (c *Controller) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.Resu return ctrl.Result{}, nil } - shouldAnalytics := c.ShouldAnalytics(analytics) - if !shouldAnalytics { - klog.V(4).Infof("Nothing happens for Analytics %s", req.NamespacedName) - return ctrl.Result{}, nil + lastUpdateTime := analytics.Status.LastUpdateTime + if analytics.Spec.CompletionStrategy.CompletionStrategyType == analysisv1alph1.CompletionStrategyOnce { + if lastUpdateTime != nil { + // This is a one-off analytics task which has been completed. + return ctrl.Result{}, nil + } + } else { + if lastUpdateTime != nil { + planingTime := lastUpdateTime.Add(time.Duration(*analytics.Spec.CompletionStrategy.PeriodSeconds) * time.Second) + now := time.Now() + if now.Before(planingTime) { + return ctrl.Result{ + RequeueAfter: planingTime.Sub(now), + }, nil + } + } } finished := c.DoAnalytics(ctx, analytics) @@ -89,36 +104,17 @@ func (c *Controller) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.Resu if analytics.Spec.CompletionStrategy.PeriodSeconds != nil { d := time.Second * time.Duration(*analytics.Spec.CompletionStrategy.PeriodSeconds) klog.V(4).InfoS("Will re-sync", "after", d) + // Arrange for next round. 
return ctrl.Result{ RequeueAfter: d, }, nil } } + klog.V(6).Infof("Analytics not finished, continue to do it.") return ctrl.Result{RequeueAfter: time.Second * 1}, nil } -// ShouldAnalytics decide if we need do analytics according to status -func (c *Controller) ShouldAnalytics(analytics *analysisv1alph1.Analytics) bool { - lastUpdateTime := analytics.Status.LastUpdateTime - - if analytics.Spec.CompletionStrategy.CompletionStrategyType == analysisv1alph1.CompletionStrategyOnce { - if lastUpdateTime != nil { - // already finish analytics - return false - } - } else { - if lastUpdateTime != nil { - planingTime := lastUpdateTime.Add(time.Duration(*analytics.Spec.CompletionStrategy.PeriodSeconds) * time.Second) - if time.Now().Before(planingTime) { - return false - } - } - } - - return true -} - func (c *Controller) DoAnalytics(ctx context.Context, analytics *analysisv1alph1.Analytics) bool { newStatus := analytics.Status.DeepCopy() @@ -149,7 +145,7 @@ func (c *Controller) DoAnalytics(ctx context.Context, analytics *analysisv1alph1 } if currMissions == nil { - // create recommendation Missions for this round + // create recommendation missions for this round for _, id := range identities { currMissions = append(currMissions, analysisv1alph1.RecommendationMission{ TargetRef: corev1.ObjectReference{Kind: id.Kind, APIVersion: id.APIVersion, Namespace: id.Namespace, Name: id.Name}, @@ -157,14 +153,15 @@ func (c *Controller) DoAnalytics(ctx context.Context, analytics *analysisv1alph1 } } + if currMissions == nil { + klog.Infof("No missions, return.") + return true + } + var currRecommendations []*analysisv1alph1.Recommendation labelSet := labels.Set{} labelSet[known.AnalyticsUidLabel] = string(analytics.UID) - if analytics.Namespace == known.CraneSystemNamespace { - currRecommendations, err = c.recommLister.List(labels.SelectorFromSet(labelSet)) - } else { - currRecommendations, err = c.recommLister.Recommendations(analytics.Namespace).List(labels.SelectorFromSet(labelSet)) - } + currRecommendations, err = c.recommLister.Recommendations(analytics.Namespace).List(labels.SelectorFromSet(labelSet)) if err != nil { c.Recorder.Event(analytics, corev1.EventTypeNormal, "FailedSelectResource", err.Error()) msg := fmt.Sprintf("Failed to get recomendations, Analytics %s error %v", klog.KObj(analytics), err) @@ -182,14 +179,14 @@ func (c *Controller) DoAnalytics(ctx context.Context, analytics *analysisv1alph1 } maxConcurrency := 10 - executeIndex := -1 + executionIndex := -1 var concurrency int for index, mission := range currMissions { if mission.LastStartTime != nil { continue } - if executeIndex == -1 { - executeIndex = index + if executionIndex == -1 { + executionIndex = index } if concurrency < maxConcurrency { concurrency++ @@ -198,45 +195,44 @@ func (c *Controller) DoAnalytics(ctx context.Context, analytics *analysisv1alph1 wg := sync.WaitGroup{} wg.Add(concurrency) - for index := range currMissions { - if index < executeIndex || index >= concurrency+executeIndex { - continue - } - - var existRecommendation *analysisv1alph1.Recommendation + for index := executionIndex; index < len(currMissions) && index < concurrency+executionIndex; index++ { + var existingRecommendation *analysisv1alph1.Recommendation for _, r := range currRecommendations { - if currMissions[index].UID == r.UID { - existRecommendation = r + if reflect.DeepEqual(currMissions[index].TargetRef, r.Spec.TargetRef) { + existingRecommendation = r + break } } - go c.ExecuteMission(ctx, &wg, analytics, identities, &currMissions[index], 
existRecommendation, timeNow) + go c.ExecuteMission(ctx, &wg, analytics, identities, &currMissions[index], existingRecommendation, timeNow) } wg.Wait() finished := false - if executeIndex+concurrency == len(currMissions) { + if executionIndex+concurrency == len(currMissions) { finished = true } if finished { newStatus.LastUpdateTime = &timeNow - // clean orphan recommendation + // clean orphan recommendations for _, recommendation := range currRecommendations { exist := false for _, mission := range currMissions { if recommendation.UID == mission.UID { exist = true + break } } if !exist { - klog.Infof("Deleting recommendation %s.", klog.KObj(recommendation)) err = c.Client.Delete(ctx, recommendation) if err != nil { - klog.Errorf("Delete recommendation %s failed: %v", klog.KObj(recommendation), err) + klog.ErrorS(err, "Failed to delete recommendation.", "recommendation", klog.KObj(recommendation)) + } else { + klog.Infof("Deleted orphan recommendation %v.", klog.KObj(recommendation)) } } } @@ -301,7 +297,7 @@ func (c *Controller) SetupWithManager(mgr ctrl.Manager) error { c.K8SVersion = version.MustParseGeneric(serverVersion.GitVersion) return ctrl.NewControllerManagedBy(mgr). - For(&analysisv1alph1.Analytics{}). + For(&analysisv1alph1.Analytics{}, builder.WithPredicates(predicate.GenerationChangedPredicate{})). Complete(c) } @@ -393,7 +389,7 @@ func (c *Controller) GetIdentities(ctx context.Context, analytics *analysisv1alp return identities, nil } -func (c *Controller) ExecuteMission(ctx context.Context, wg *sync.WaitGroup, analytics *analysisv1alph1.Analytics, identities map[string]ObjectIdentity, mission *analysisv1alph1.RecommendationMission, existRecommendation *analysisv1alph1.Recommendation, timeNow metav1.Time) { +func (c *Controller) ExecuteMission(ctx context.Context, wg *sync.WaitGroup, analytics *analysisv1alph1.Analytics, identities map[string]ObjectIdentity, mission *analysisv1alph1.RecommendationMission, existingRecommendation *analysisv1alph1.Recommendation, timeNow metav1.Time) { defer func() { mission.LastStartTime = &timeNow klog.Infof("Mission message: %s", mission.Message) @@ -406,7 +402,7 @@ func (c *Controller) ExecuteMission(ctx context.Context, wg *sync.WaitGroup, ana mission.Message = fmt.Sprintf("Failed to get identity, key %s. 
", k) return } else { - recommendation := existRecommendation + recommendation := existingRecommendation if recommendation == nil { recommendation = c.CreateRecommendationObject(ctx, analytics, mission.TargetRef, id) } @@ -424,31 +420,22 @@ func (c *Controller) ExecuteMission(ctx context.Context, wg *sync.WaitGroup, ana } var value string - if proposed.ResourceRequest != nil { - valueBytes, err := yaml.Marshal(proposed.ResourceRequest) - if err != nil { - mission.Message = err.Error() - return - } - value = string(valueBytes) - } else if proposed.EffectiveHPA != nil { - valueBytes, err := yaml.Marshal(proposed.EffectiveHPA) - if err != nil { - mission.Message = err.Error() - return - } - value = string(valueBytes) + valueBytes, err := yaml.Marshal(proposed) + if err != nil { + mission.Message = err.Error() + return } + value = string(valueBytes) recommendation.Status.RecommendedValue = value - if existRecommendation != nil { + if existingRecommendation != nil { klog.Infof("Update recommendation %s", klog.KObj(recommendation)) if err := c.Update(ctx, recommendation); err != nil { mission.Message = fmt.Sprintf("Failed to create recommendation %s: %v", klog.KObj(recommendation), err) return } - klog.Infof("Successful to update Recommendation %s", klog.KObj(recommendation)) + klog.Infof("Successfully to update Recommendation %s", klog.KObj(recommendation)) } else { klog.Infof("Create recommendation %s", klog.KObj(recommendation)) if err := c.Create(ctx, recommendation); err != nil { @@ -456,7 +443,7 @@ func (c *Controller) ExecuteMission(ctx context.Context, wg *sync.WaitGroup, ana return } - klog.Infof("Successful to create Recommendation %s", klog.KObj(recommendation)) + klog.Infof("Successfully to create Recommendation %s", klog.KObj(recommendation)) } mission.Message = "Success" diff --git a/pkg/controller/ehpa/effective_hpa_controller.go b/pkg/controller/ehpa/effective_hpa_controller.go index efc3a5d3c..27272d015 100644 --- a/pkg/controller/ehpa/effective_hpa_controller.go +++ b/pkg/controller/ehpa/effective_hpa_controller.go @@ -88,7 +88,7 @@ func (c *EffectiveHPAController) Reconcile(ctx context.Context, req ctrl.Request setPredictionCondition(newStatus, prediction.Status.Conditions) } - hpa, err := c.ReconcileHPA(ctx, ehpa, substitute) + hpa, err := c.ReconcileHPA(ctx, ehpa, substitute, newStatus) if err != nil { setCondition(newStatus, autoscalingapi.Ready, metav1.ConditionFalse, "FailedReconcileHPA", err.Error()) c.UpdateStatus(ctx, ehpa, newStatus) @@ -206,8 +206,8 @@ func setCondition(status *autoscalingapi.EffectiveHorizontalPodAutoscalerStatus, func RecordMetrics(ehpa *autoscalingapi.EffectiveHorizontalPodAutoscaler) { if ehpa.Status.ExpectReplicas != nil { labels := map[string]string{ - "resourceName": klog.KObj(ehpa).String(), - "strategy": string(ehpa.Spec.ScaleStrategy), + "namespace": ehpa.Namespace, + "name": ehpa.Name, } metrics.EHPAReplicas.With(labels).Set(float64(*ehpa.Status.ExpectReplicas)) } diff --git a/pkg/controller/ehpa/hpa.go b/pkg/controller/ehpa/hpa.go index d9247eb47..19e36dbc0 100644 --- a/pkg/controller/ehpa/hpa.go +++ b/pkg/controller/ehpa/hpa.go @@ -22,7 +22,7 @@ import ( "github.com/gocrane/crane/pkg/utils" ) -func (c *EffectiveHPAController) ReconcileHPA(ctx context.Context, ehpa *autoscalingapi.EffectiveHorizontalPodAutoscaler, substitute *autoscalingapi.Substitute) (*autoscalingv2.HorizontalPodAutoscaler, error) { +func (c *EffectiveHPAController) ReconcileHPA(ctx context.Context, ehpa *autoscalingapi.EffectiveHorizontalPodAutoscaler, substitute 
*autoscalingapi.Substitute, status *autoscalingapi.EffectiveHorizontalPodAutoscalerStatus) (*autoscalingv2.HorizontalPodAutoscaler, error) { hpaList := &autoscalingv2.HorizontalPodAutoscalerList{} opts := []client.ListOption{ client.MatchingLabels(map[string]string{known.EffectiveHorizontalPodAutoscalerUidLabel: string(ehpa.UID)}), @@ -30,17 +30,17 @@ func (c *EffectiveHPAController) ReconcileHPA(ctx context.Context, ehpa *autosca err := c.Client.List(ctx, hpaList, opts...) if err != nil { if errors.IsNotFound(err) { - return c.CreateHPA(ctx, ehpa, substitute) + return c.CreateHPA(ctx, ehpa, substitute, status) } else { c.Recorder.Event(ehpa, v1.EventTypeNormal, "FailedGetHPA", err.Error()) klog.Error("Failed to get HPA, ehpa %s error %v", klog.KObj(ehpa), err) return nil, err } } else if len(hpaList.Items) == 0 { - return c.CreateHPA(ctx, ehpa, substitute) + return c.CreateHPA(ctx, ehpa, substitute, status) } - return c.UpdateHPAIfNeed(ctx, ehpa, &hpaList.Items[0], substitute) + return c.UpdateHPAIfNeed(ctx, ehpa, &hpaList.Items[0], substitute, status) } func (c *EffectiveHPAController) GetHPA(ctx context.Context, ehpa *autoscalingapi.EffectiveHorizontalPodAutoscaler) (*autoscalingv2.HorizontalPodAutoscaler, error) { @@ -58,8 +58,8 @@ func (c *EffectiveHPAController) GetHPA(ctx context.Context, ehpa *autoscalingap return &hpaList.Items[0], nil } -func (c *EffectiveHPAController) CreateHPA(ctx context.Context, ehpa *autoscalingapi.EffectiveHorizontalPodAutoscaler, substitute *autoscalingapi.Substitute) (*autoscalingv2.HorizontalPodAutoscaler, error) { - hpa, err := c.NewHPAObject(ctx, ehpa, substitute) +func (c *EffectiveHPAController) CreateHPA(ctx context.Context, ehpa *autoscalingapi.EffectiveHorizontalPodAutoscaler, substitute *autoscalingapi.Substitute, status *autoscalingapi.EffectiveHorizontalPodAutoscalerStatus) (*autoscalingv2.HorizontalPodAutoscaler, error) { + hpa, err := c.NewHPAObject(ctx, ehpa, substitute, status) if err != nil { c.Recorder.Event(ehpa, v1.EventTypeNormal, "FailedCreateHPAObject", err.Error()) klog.Errorf("Failed to create object, HorizontalPodAutoscaler %s error %v", klog.KObj(hpa), err) @@ -79,8 +79,8 @@ func (c *EffectiveHPAController) CreateHPA(ctx context.Context, ehpa *autoscalin return hpa, nil } -func (c *EffectiveHPAController) NewHPAObject(ctx context.Context, ehpa *autoscalingapi.EffectiveHorizontalPodAutoscaler, substitute *autoscalingapi.Substitute) (*autoscalingv2.HorizontalPodAutoscaler, error) { - metrics, err := c.GetHPAMetrics(ctx, ehpa) +func (c *EffectiveHPAController) NewHPAObject(ctx context.Context, ehpa *autoscalingapi.EffectiveHorizontalPodAutoscaler, substitute *autoscalingapi.Substitute, status *autoscalingapi.EffectiveHorizontalPodAutoscalerStatus) (*autoscalingv2.HorizontalPodAutoscaler, error) { + metrics, err := c.GetHPAMetrics(ctx, ehpa, status) if err != nil { return nil, err } @@ -135,9 +135,9 @@ func (c *EffectiveHPAController) NewHPAObject(ctx context.Context, ehpa *autosca return hpa, nil } -func (c *EffectiveHPAController) UpdateHPAIfNeed(ctx context.Context, ehpa *autoscalingapi.EffectiveHorizontalPodAutoscaler, hpaExist *autoscalingv2.HorizontalPodAutoscaler, substitute *autoscalingapi.Substitute) (*autoscalingv2.HorizontalPodAutoscaler, error) { +func (c *EffectiveHPAController) UpdateHPAIfNeed(ctx context.Context, ehpa *autoscalingapi.EffectiveHorizontalPodAutoscaler, hpaExist *autoscalingv2.HorizontalPodAutoscaler, substitute *autoscalingapi.Substitute, status *autoscalingapi.EffectiveHorizontalPodAutoscalerStatus) 
(*autoscalingv2.HorizontalPodAutoscaler, error) { var needUpdate bool - hpa, err := c.NewHPAObject(ctx, ehpa, substitute) + hpa, err := c.NewHPAObject(ctx, ehpa, substitute, status) if err != nil { c.Recorder.Event(ehpa, v1.EventTypeNormal, "FailedCreateHPAObject", err.Error()) klog.Errorf("Failed to create object, HorizontalPodAutoscaler %s error %v", klog.KObj(hpa), err) @@ -173,14 +173,14 @@ func (c *EffectiveHPAController) UpdateHPAIfNeed(ctx context.Context, ehpa *auto } // GetHPAMetrics loop metricSpec in EffectiveHorizontalPodAutoscaler and generate metricSpec for HPA -func (c *EffectiveHPAController) GetHPAMetrics(ctx context.Context, ehpa *autoscalingapi.EffectiveHorizontalPodAutoscaler) ([]autoscalingv2.MetricSpec, error) { +func (c *EffectiveHPAController) GetHPAMetrics(ctx context.Context, ehpa *autoscalingapi.EffectiveHorizontalPodAutoscaler, status *autoscalingapi.EffectiveHorizontalPodAutoscalerStatus) ([]autoscalingv2.MetricSpec, error) { var metrics []autoscalingv2.MetricSpec for _, metric := range ehpa.Spec.Metrics { copyMetric := metric.DeepCopy() metrics = append(metrics, *copyMetric) } - if utils.IsEHPAPredictionEnabled(ehpa) { + if utils.IsEHPAPredictionEnabled(ehpa) && isPredictionReady(status) { var customMetricsForPrediction []autoscalingv2.MetricSpec for _, metric := range metrics { @@ -223,12 +223,17 @@ func (c *EffectiveHPAController) GetHPAMetrics(ctx context.Context, ehpa *autosc return nil, fmt.Errorf("no pods returns from scale object. ") } - requests, err := utils.CalculatePodRequests(pods, metric.Resource.Name) + availablePods := utils.GetAvailablePods(pods) + if len(availablePods) == 0 { + return nil, fmt.Errorf("failed to get available pods. ") + } + + requests, err := utils.CalculatePodRequests(availablePods, metric.Resource.Name) if err != nil { return nil, err } - averageValue := int64((float64(requests) * float64(*metric.Resource.Target.AverageUtilization) / 100) / float64(len(pods))) + averageValue := int64((float64(requests) * float64(*metric.Resource.Target.AverageUtilization) / 100) / float64(len(availablePods))) customMetric.Target.AverageValue = resource.NewMilliQuantity(averageValue, resource.DecimalSI) } else { customMetric.Target.AverageValue = metric.Resource.Target.AverageValue diff --git a/pkg/controller/ehpa/hpa_event_handler.go b/pkg/controller/ehpa/hpa_event_handler.go index 51c9ec58a..19bd67f9e 100644 --- a/pkg/controller/ehpa/hpa_event_handler.go +++ b/pkg/controller/ehpa/hpa_event_handler.go @@ -1,7 +1,6 @@ package ehpa import ( - "fmt" "strings" autoscalingv2 "k8s.io/api/autoscaling/v2beta2" @@ -43,15 +42,10 @@ func (h *hpaEventHandler) Update(evt event.UpdateEvent, q workqueue.RateLimiting scaleType = "ehpa" } - direction := "Down" - if newHpa.Status.DesiredReplicas > oldHpa.Status.DesiredReplicas { - direction = "Up" - } - labels := map[string]string{ - "resourceName": fmt.Sprintf("%s/%s", newHpa.Namespace, newHpa.Name), - "type": scaleType, - "direction": direction, + "namespace": newHpa.Namespace, + "name": newHpa.Name, + "type": scaleType, } metrics.HPAScaleCount.With(labels).Inc() diff --git a/pkg/controller/ehpa/hpa_observer_controller.go b/pkg/controller/ehpa/hpa_observer_controller.go index 08918b8a3..90e44a0a2 100644 --- a/pkg/controller/ehpa/hpa_observer_controller.go +++ b/pkg/controller/ehpa/hpa_observer_controller.go @@ -34,7 +34,8 @@ func (c *HPAObserverController) Reconcile(ctx context.Context, req ctrl.Request) } labels := map[string]string{ - "resourceName": klog.KObj(hpa).String(), + "namespace": hpa.Namespace, + 
"name": hpa.Name, } metrics.HPAReplicas.With(labels).Set(float64(hpa.Status.DesiredReplicas)) diff --git a/pkg/controller/ehpa/predict.go b/pkg/controller/ehpa/predict.go index cb4d3fc69..f8ee0cac2 100644 --- a/pkg/controller/ehpa/predict.go +++ b/pkg/controller/ehpa/predict.go @@ -163,3 +163,13 @@ func setPredictionCondition(status *autoscalingapi.EffectiveHorizontalPodAutosca } } } + +func isPredictionReady(status *autoscalingapi.EffectiveHorizontalPodAutoscalerStatus) bool { + for _, cond := range status.Conditions { + if cond.Type == string(autoscalingapi.PredictionReady) && cond.Status == metav1.ConditionTrue { + return true + } + } + + return false +} diff --git a/pkg/controller/ehpa/substitute_controller.go b/pkg/controller/ehpa/substitute_controller.go index 7352a479d..66c905061 100644 --- a/pkg/controller/ehpa/substitute_controller.go +++ b/pkg/controller/ehpa/substitute_controller.go @@ -56,6 +56,19 @@ func (c *SubstituteController) Reconcile(ctx context.Context, req ctrl.Request) Replicas: substitute.Spec.Replicas, } + if substitute.Spec.Replicas != scale.Status.Replicas { + substitute.Spec.Replicas = scale.Status.Replicas + + err := c.Update(ctx, substitute) + if err != nil { + c.Recorder.Event(substitute, v1.EventTypeNormal, "FailedUpdateSubstitute", err.Error()) + klog.Errorf("Failed to update Substitute %s, error %v", klog.KObj(substitute), err) + return ctrl.Result{}, err + } + + klog.Infof("Update Substitute successful, Substitute %s", klog.KObj(substitute)) + } + if !equality.Semantic.DeepEqual(&substitute.Status, &newStatus) { substitute.Status = newStatus err := c.Status().Update(ctx, substitute) diff --git a/pkg/controller/recommendation/updater.go b/pkg/controller/recommendation/updater.go index 93b74fce2..32c6614e3 100644 --- a/pkg/controller/recommendation/updater.go +++ b/pkg/controller/recommendation/updater.go @@ -23,22 +23,12 @@ import ( ) func (c *Controller) UpdateRecommendation(ctx context.Context, recommendation *analysisapi.Recommendation) (bool, error) { - var proposedEHPA recommendtypes.EffectiveHorizontalPodAutoscalerRecommendation - var proposedResource recommendtypes.ProposedRecommendation + var proposedRecommendation recommendtypes.ProposedRecommendation needUpdate := false - if recommendation.Spec.Type == analysisapi.AnalysisTypeResource { - err := yaml.Unmarshal([]byte(recommendation.Status.RecommendedValue), &proposedResource) - if err != nil { - return false, err - } - } - - if recommendation.Spec.Type == analysisapi.AnalysisTypeHPA { - err := yaml.Unmarshal([]byte(recommendation.Status.RecommendedValue), &proposedEHPA) - if err != nil { - return false, err - } + err := yaml.Unmarshal([]byte(recommendation.Status.RecommendedValue), &proposedRecommendation) + if err != nil { + return false, err } if recommendation.Spec.AdoptionType == analysisapi.AdoptionTypeStatus { @@ -48,7 +38,7 @@ func (c *Controller) UpdateRecommendation(ctx context.Context, recommendation *a unstructed := &unstructured.Unstructured{} unstructed.SetAPIVersion(recommendation.Spec.TargetRef.APIVersion) unstructed.SetKind(recommendation.Spec.TargetRef.Kind) - err := c.Client.Get(ctx, client.ObjectKey{Name: recommendation.Spec.TargetRef.Name, Namespace: recommendation.Spec.TargetRef.Namespace}, unstructed) + err = c.Client.Get(ctx, client.ObjectKey{Name: recommendation.Spec.TargetRef.Name, Namespace: recommendation.Spec.TargetRef.Namespace}, unstructed) if err != nil { return false, fmt.Errorf("get target object failed: %v. 
", err) } @@ -61,14 +51,40 @@ func (c *Controller) UpdateRecommendation(ctx context.Context, recommendation *a switch recommendation.Spec.Type { case analysisapi.AnalysisTypeResource: - if annotation[known.ResourceRecommendationValueAnnotation] != recommendation.Status.RecommendedValue { - annotation[known.ResourceRecommendationValueAnnotation] = recommendation.Status.RecommendedValue - needUpdate = true + if proposedRecommendation.ResourceRequest != nil { + resourceValue, err := yaml.Marshal(proposedRecommendation.ResourceRequest) + if err != nil { + return false, fmt.Errorf("marshal ResourceRequest failed: %v. ", err) + } + + if annotation[known.ResourceRecommendationValueAnnotation] != string(resourceValue) { + annotation[known.ResourceRecommendationValueAnnotation] = string(resourceValue) + needUpdate = true + } } - case analysisapi.AnalysisTypeHPA: - if annotation[known.HPARecommendationValueAnnotation] != recommendation.Status.RecommendedValue { - annotation[known.HPARecommendationValueAnnotation] = recommendation.Status.RecommendedValue - needUpdate = true + case analysisapi.AnalysisTypeReplicas: + if proposedRecommendation.ReplicasRecommendation != nil { + replicasValue, err := yaml.Marshal(proposedRecommendation.ReplicasRecommendation) + if err != nil { + return false, fmt.Errorf("marshal ReplicasRecommendation failed: %v. ", err) + } + + if annotation[known.ReplicasRecommendationValueAnnotation] != string(replicasValue) { + annotation[known.ReplicasRecommendationValueAnnotation] = string(replicasValue) + needUpdate = true + } + } + + if proposedRecommendation.EffectiveHPA != nil { + ehpaValue, err := yaml.Marshal(proposedRecommendation.EffectiveHPA) + if err != nil { + return false, fmt.Errorf("marshal EffectiveHPA failed: %v. ", err) + } + + if annotation[known.HPARecommendationValueAnnotation] != string(ehpaValue) { + annotation[known.HPARecommendationValueAnnotation] = string(ehpaValue) + needUpdate = true + } } } @@ -83,7 +99,7 @@ func (c *Controller) UpdateRecommendation(ctx context.Context, recommendation *a // Only support Auto Type for EHPA recommendation if recommendation.Spec.AdoptionType == analysisapi.AdoptionTypeAuto { - if recommendation.Spec.Type == analysisapi.AnalysisTypeHPA { + if recommendation.Spec.Type == analysisapi.AnalysisTypeReplicas && proposedRecommendation.EffectiveHPA != nil { ehpa, err := utils.GetEHPAFromScaleTarget(ctx, c.Client, recommendation.Spec.TargetRef.Namespace, recommendation.Spec.TargetRef) if err != nil { return false, fmt.Errorf("get EHPA from target failed: %v. ", err) @@ -95,11 +111,11 @@ func (c *Controller) UpdateRecommendation(ctx context.Context, recommendation *a Name: recommendation.Spec.TargetRef.Name, }, Spec: autoscalingapi.EffectiveHorizontalPodAutoscalerSpec{ - MinReplicas: proposedEHPA.MinReplicas, - MaxReplicas: *proposedEHPA.MaxReplicas, - Metrics: proposedEHPA.Metrics, + MinReplicas: proposedRecommendation.EffectiveHPA.MinReplicas, + MaxReplicas: *proposedRecommendation.EffectiveHPA.MaxReplicas, + Metrics: proposedRecommendation.EffectiveHPA.Metrics, ScaleStrategy: autoscalingapi.ScaleStrategyPreview, - Prediction: proposedEHPA.Prediction, + Prediction: proposedRecommendation.EffectiveHPA.Prediction, ScaleTargetRef: autoscalingv2.CrossVersionObjectReference{ Kind: recommendation.Spec.TargetRef.Kind, APIVersion: recommendation.Spec.TargetRef.APIVersion, @@ -119,9 +135,9 @@ func (c *Controller) UpdateRecommendation(ctx context.Context, recommendation *a // if user change it, we don't want to override it. 
// The reason for Prediction is the same. ehpaUpdate := ehpa.DeepCopy() - ehpaUpdate.Spec.MinReplicas = proposedEHPA.MinReplicas - ehpaUpdate.Spec.MaxReplicas = *proposedEHPA.MaxReplicas - ehpaUpdate.Spec.Metrics = proposedEHPA.Metrics + ehpaUpdate.Spec.MinReplicas = proposedRecommendation.EffectiveHPA.MinReplicas + ehpaUpdate.Spec.MaxReplicas = *proposedRecommendation.EffectiveHPA.MaxReplicas + ehpaUpdate.Spec.Metrics = proposedRecommendation.EffectiveHPA.Metrics if !equality.Semantic.DeepEqual(&ehpaUpdate.Spec, &ehpa.Spec) { if err = c.Client.Update(ctx, ehpaUpdate); err != nil { diff --git a/pkg/controller/timeseriesprediction/time_series_prediction_controller.go b/pkg/controller/timeseriesprediction/time_series_prediction_controller.go index 119987cfd..fe82607de 100644 --- a/pkg/controller/timeseriesprediction/time_series_prediction_controller.go +++ b/pkg/controller/timeseriesprediction/time_series_prediction_controller.go @@ -14,7 +14,9 @@ import ( "k8s.io/client-go/tools/record" "k8s.io/klog/v2" ctrl "sigs.k8s.io/controller-runtime" + "sigs.k8s.io/controller-runtime/pkg/builder" "sigs.k8s.io/controller-runtime/pkg/client" + "sigs.k8s.io/controller-runtime/pkg/predicate" predictionapi "github.com/gocrane/api/prediction/v1alpha1" predictormgr "github.com/gocrane/crane/pkg/predictor" @@ -90,7 +92,7 @@ func (tc *Controller) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.Res // SetupWithManager creates a controller and register to controller manager. func (tc *Controller) SetupWithManager(mgr ctrl.Manager) error { return ctrl.NewControllerManagedBy(mgr). - For(&predictionapi.TimeSeriesPrediction{}). + For(&predictionapi.TimeSeriesPrediction{}, builder.WithPredicates(predicate.GenerationChangedPredicate{})). Complete(tc) } diff --git a/pkg/ensurance/collector/cadvisor/cadvisor_linux.go b/pkg/ensurance/collector/cadvisor/cadvisor_linux.go index 4caf2c042..2c3fe7805 100644 --- a/pkg/ensurance/collector/cadvisor/cadvisor_linux.go +++ b/pkg/ensurance/collector/cadvisor/cadvisor_linux.go @@ -46,9 +46,14 @@ type CadvisorCollector struct { } type CadvisorManager struct { + cgroupDriver string cmanager.Manager } +func (m *CadvisorManager) GetCgroupDriver() string { + return m.cgroupDriver +} + var _ Manager = new(CadvisorManager) func NewCadvisorCollector(podLister corelisters.PodLister, manager Manager) *CadvisorCollector { @@ -59,7 +64,7 @@ func NewCadvisorCollector(podLister corelisters.PodLister, manager Manager) *Cad return &c } -func NewCadvisorManager() Manager { +func NewCadvisorManager(cgroupDriver string) Manager { var includedMetrics = cadvisorcontainer.MetricSet{ cadvisorcontainer.CpuUsageMetrics: struct{}{}, cadvisorcontainer.ProcessSchedulerMetrics: struct{}{}, @@ -71,7 +76,7 @@ func NewCadvisorManager() Manager { sysfs := csysfs.NewRealSysFs() maxHousekeepingConfig := cmanager.HouskeepingConfig{Interval: &maxHousekeepingInterval, AllowDynamic: &allowDynamic} - m, err := cmanager.New(memCache, sysfs, maxHousekeepingConfig, includedMetrics, http.DefaultClient, []string{utils.CgroupKubePods}, "") + m, err := cmanager.New(memCache, sysfs, maxHousekeepingConfig, includedMetrics, http.DefaultClient, []string{"/" + utils.CgroupKubePods}, "") if err != nil { klog.Errorf("Failed to create cadvisor manager start: %v", err) return nil @@ -83,7 +88,8 @@ func NewCadvisorManager() Manager { } return &CadvisorManager{ - m, + cgroupDriver: cgroupDriver, + Manager: m, } } @@ -109,7 +115,7 @@ func (c *CadvisorCollector) Collect() (map[string][]common.TimeSeries, error) { var stateMap = 
make(map[string][]common.TimeSeries) for _, pod := range allPods { var now = time.Now() - containers, err := c.Manager.GetContainerInfoV2(types.GetCgroupPath(pod), cadvisorapiv2.RequestOptions{ + containers, err := c.Manager.GetContainerInfoV2(types.GetCgroupPath(pod, c.Manager.GetCgroupDriver()), cadvisorapiv2.RequestOptions{ IdType: cadvisorapiv2.TypeName, Count: 1, Recursive: true, diff --git a/pkg/ensurance/collector/cadvisor/cadvisor_unsupported.go b/pkg/ensurance/collector/cadvisor/cadvisor_unsupported.go index 9b20d808e..5582bd34e 100644 --- a/pkg/ensurance/collector/cadvisor/cadvisor_unsupported.go +++ b/pkg/ensurance/collector/cadvisor/cadvisor_unsupported.go @@ -22,10 +22,12 @@ type CadvisorCollectorUnsupport struct { type CadvisorManagerUnsupport struct{} -func NewCadvisorManager() Manager { +func NewCadvisorManager(_ string) Manager { return &CadvisorManagerUnsupport{} } +var _ Manager = new(CadvisorManagerUnsupport) + func NewCadvisorCollector(_ corelisters.PodLister, manager Manager) *CadvisorCollectorUnsupport { return &CadvisorCollectorUnsupport{} } @@ -54,6 +56,10 @@ func (m *CadvisorManagerUnsupport) GetMachineInfo() (*info.MachineInfo, error) { return nil, errUnsupported } +func (m *CadvisorManagerUnsupport) GetCgroupDriver() string { + return "" +} + func CheckMetricNameExist(name string) bool { return false } diff --git a/pkg/ensurance/collector/cadvisor/types.go b/pkg/ensurance/collector/cadvisor/types.go index 471f0b397..b98fbb809 100644 --- a/pkg/ensurance/collector/cadvisor/types.go +++ b/pkg/ensurance/collector/cadvisor/types.go @@ -9,4 +9,5 @@ type Manager interface { GetContainerInfoV2(containerName string, options cadvisorapiv2.RequestOptions) (map[string]cadvisorapiv2.ContainerInfo, error) GetContainerInfo(containerName string, query *info.ContainerInfoRequest) (*info.ContainerInfo, error) GetMachineInfo() (*info.MachineInfo, error) + GetCgroupDriver() string } diff --git a/pkg/ensurance/collector/collector.go b/pkg/ensurance/collector/collector.go index 3024caac7..b7967e65c 100644 --- a/pkg/ensurance/collector/collector.go +++ b/pkg/ensurance/collector/collector.go @@ -14,6 +14,7 @@ import ( "github.com/gocrane/crane/pkg/common" "github.com/gocrane/crane/pkg/ensurance/collector/cadvisor" "github.com/gocrane/crane/pkg/ensurance/collector/nodelocal" + "github.com/gocrane/crane/pkg/ensurance/collector/noderesource" "github.com/gocrane/crane/pkg/ensurance/collector/types" "github.com/gocrane/crane/pkg/features" "github.com/gocrane/crane/pkg/known" @@ -179,6 +180,15 @@ func (s *StateCollector) UpdateCollectors() { if _, exists := s.collectors.Load(types.CadvisorCollectorType); !exists { s.collectors.Store(types.CadvisorCollectorType, cadvisor.NewCadvisorCollector(s.podLister, s.GetCadvisorManager())) } + + if nodeResourceGate := utilfeature.DefaultFeatureGate.Enabled(features.CraneNodeResource); nodeResourceGate { + if _, exists := s.collectors.Load(types.NodeResourceCollectorType); !exists { + c := noderesource.NewNodeResourceCollector(s.nodeName, s.nodeLister, s.podLister) + if c != nil { + s.collectors.Store(types.NodeResourceCollectorType, c) + } + } + } break } diff --git a/pkg/ensurance/collector/noderesource/noderesource.go b/pkg/ensurance/collector/noderesource/noderesource.go new file mode 100644 index 000000000..1f508dce8 --- /dev/null +++ b/pkg/ensurance/collector/noderesource/noderesource.go @@ -0,0 +1,63 @@ +package noderesource + +import ( + "fmt" + "time" + + corev1 "k8s.io/api/core/v1" + "k8s.io/apimachinery/pkg/api/resource" + 
"k8s.io/apimachinery/pkg/labels" + v1 "k8s.io/client-go/listers/core/v1" + "k8s.io/klog/v2" + + "github.com/gocrane/crane/pkg/common" + "github.com/gocrane/crane/pkg/ensurance/collector/types" + "github.com/gocrane/crane/pkg/utils" +) + +type NodeResource struct { + nodeName string + nodeLister v1.NodeLister + podLister v1.PodLister +} + +func NewNodeResourceCollector(nodeName string, nodeLister v1.NodeLister, podLister v1.PodLister) *NodeResource { + klog.V(4).Infof("NodeResourceCollector create") + return &NodeResource{ + nodeName: nodeName, + nodeLister: nodeLister, + podLister: podLister, + } +} + +func (n *NodeResource) GetType() types.CollectType { + return types.NodeResourceCollectorType +} + +func (n *NodeResource) Collect() (map[string][]common.TimeSeries, error) { + klog.V(6).Infof("NodeResourceCollector Collect") + node, err := n.nodeLister.Get(n.nodeName) + if err != nil { + return nil, err + } + pods, err := n.podLister.List(labels.Everything()) + if err != nil { + return nil, err + } + + allExtCpu := node.Status.Allocatable.Name(corev1.ResourceName(fmt.Sprintf(utils.ExtResourcePrefixFormat, corev1.ResourceCPU.String())), resource.DecimalSI).MilliValue() + var distributeExtCpu int64 = 0 + for _, pod := range pods { + for _, container := range pod.Spec.Containers { + if quantity, ok := container.Resources.Requests[corev1.ResourceName(fmt.Sprintf(utils.ExtResourcePrefixFormat, corev1.ResourceCPU.String()))]; ok { + distributeExtCpu += quantity.MilliValue() + } + } + } + klog.V(4).Infof("allExtCpu: %d, distributeExtCpu: %d", allExtCpu, distributeExtCpu) + return map[string][]common.TimeSeries{string(types.MetricNameExtCpuTotalDistribute): {{Samples: []common.Sample{{Value: (float64(distributeExtCpu) / float64(allExtCpu)) * 100, Timestamp: time.Now().Unix()}}}}}, nil +} + +func (n *NodeResource) Stop() error { + return nil +} diff --git a/pkg/ensurance/collector/types/types.go b/pkg/ensurance/collector/types/types.go index d26c6fd89..540a5507c 100644 --- a/pkg/ensurance/collector/types/types.go +++ b/pkg/ensurance/collector/types/types.go @@ -4,6 +4,7 @@ import ( "strings" v1 "k8s.io/api/core/v1" + "k8s.io/kubernetes/pkg/kubelet/cm" "github.com/gocrane/crane/pkg/utils" ) @@ -15,6 +16,7 @@ const ( CadvisorCollectorType CollectType = "cadvisor" EbpfCollectorType CollectType = "ebpf" MetricsServerCollectorType CollectType = "metrics-server" + NodeResourceCollectorType CollectType = "node-resource" ) type MetricName string @@ -51,20 +53,29 @@ const ( MetricNameContainerSchedRunQueueTime MetricName = "container_sched_run_queue_time" MetricNameExtResContainerCpuTotalUsage MetricName = "ext_res_container_cpu_total_usage" + MetricNameExtCpuTotalDistribute MetricName = "ext_cpu_total_distribute" ) -func GetCgroupPath(p *v1.Pod) string { - var pathArrays = []string{utils.CgroupKubePods} - +func GetCgroupPath(p *v1.Pod, cgroupDriver string) string { + cgroupName := GetCgroupName(p) + switch cgroupDriver { + case "stytemd": + return cgroupName.ToSystemd() + case "cgroupfs": + return cgroupName.ToCgroupfs() + default: + return "" + } +} +func GetCgroupName(p *v1.Pod) cm.CgroupName { switch p.Status.QOSClass { case v1.PodQOSGuaranteed: - pathArrays = append(pathArrays, utils.CgroupPodPrefix+string(p.UID)) + return cm.NewCgroupName(cm.RootCgroupName, utils.CgroupKubePods, cm.GetPodCgroupNameSuffix(p.UID)) case v1.PodQOSBurstable: - pathArrays = append(pathArrays, strings.ToLower(string(v1.PodQOSBurstable)), utils.CgroupPodPrefix+string(p.UID)) + return cm.NewCgroupName(cm.RootCgroupName, 
utils.CgroupKubePods, strings.ToLower(string(v1.PodQOSBurstable)), cm.GetPodCgroupNameSuffix(p.UID)) case v1.PodQOSBestEffort: - pathArrays = append(pathArrays, strings.ToLower(string(v1.PodQOSBestEffort)), utils.CgroupPodPrefix+string(p.UID)) + return cm.NewCgroupName(cm.RootCgroupName, utils.CgroupKubePods, strings.ToLower(string(v1.PodQOSBestEffort)), cm.GetPodCgroupNameSuffix(p.UID)) default: - return "" + return cm.RootCgroupName } - return strings.Join(pathArrays, "/") } diff --git a/pkg/known/annotation.go b/pkg/known/annotation.go index 46d64da5c..e80e9329d 100644 --- a/pkg/known/annotation.go +++ b/pkg/known/annotation.go @@ -2,6 +2,7 @@ package known const ( HPARecommendationValueAnnotation = "analysis.crane.io/hpa-recommendation" + ReplicasRecommendationValueAnnotation = "analysis.crane.io/replicas-recommendation" ResourceRecommendationValueAnnotation = "analysis.crane.io/resource-recommendation" ) diff --git a/pkg/metricprovider/custom_metric_provider.go b/pkg/metricprovider/custom_metric_provider.go index c8d1e9959..31cdb3a5b 100644 --- a/pkg/metricprovider/custom_metric_provider.go +++ b/pkg/metricprovider/custom_metric_provider.go @@ -14,7 +14,6 @@ import ( "k8s.io/apimachinery/pkg/labels" "k8s.io/apimachinery/pkg/runtime/schema" "k8s.io/apimachinery/pkg/types" - "k8s.io/apimachinery/pkg/util/sets" "k8s.io/client-go/tools/record" "k8s.io/klog/v2" "k8s.io/metrics/pkg/apis/custom_metrics" @@ -24,6 +23,7 @@ import ( predictionapi "github.com/gocrane/api/prediction/v1alpha1" "github.com/gocrane/crane/pkg/known" + "github.com/gocrane/crane/pkg/utils" ) type metricValue struct { @@ -87,9 +87,9 @@ func (p *CustomMetricProvider) GetMetricBySelector(ctx context.Context, namespac return nil, err } - readyPods := GetReadyPods(pods) - if len(readyPods) == 0 { - return nil, fmt.Errorf("failed to get ready pods. ") + availablePods := utils.GetAvailablePods(pods) + if len(availablePods) == 0 { + return nil, fmt.Errorf("failed to get available pods. 
") } isPredicting := false @@ -122,6 +122,7 @@ func (p *CustomMetricProvider) GetMetricBySelector(ctx context.Context, namespac timestampStart := time.Now() timestampEnd := timestampStart.Add(time.Duration(prediction.Spec.PredictionWindowSeconds) * time.Second) largestMetricValue := &metricValue{} + hasValidSample := false for _, v := range timeSeries.Samples { // exclude values that not in time range if v.Timestamp < timestampStart.Unix() || v.Timestamp > timestampEnd.Unix() { @@ -133,21 +134,26 @@ func (p *CustomMetricProvider) GetMetricBySelector(ctx context.Context, namespac return nil, fmt.Errorf("failed to parse value to float: %v ", err) } if valueFloat > largestMetricValue.value { + hasValidSample = true largestMetricValue.value = valueFloat largestMetricValue.timestamp = v.Timestamp } } - averageValue := int64(math.Round(largestMetricValue.value * 1000 / float64(len(readyPods)))) + if !hasValidSample { + return nil, fmt.Errorf("TimeSeries is outdated, metric name %s", info.Metric) + } + + averageValue := int64(math.Round(largestMetricValue.value * 1000 / float64(len(availablePods)))) klog.Infof("Provide custom metric %s average value %f.", info.Metric, float64(averageValue)/1000) - for name := range readyPods { + for _, pod := range availablePods { metric := custom_metrics.MetricValue{ DescribedObject: custom_metrics.ObjectReference{ APIVersion: "v1", Kind: "Pod", - Name: name, + Name: pod.Name, Namespace: namespace, }, Metric: custom_metrics.MetricIdentifier{ @@ -250,16 +256,3 @@ func (p *CustomMetricProvider) GetPods(ctx context.Context, namespace string, se return podList.Items, nil } - -// GetReadyPods return a set with ready pod names -func GetReadyPods(pods []v1.Pod) sets.String { - readyPods := sets.String{} - - for _, pod := range pods { - if pod.DeletionTimestamp != nil || pod.Status.Phase != v1.PodRunning { - continue - } - readyPods.Insert(pod.Name) - } - return readyPods -} diff --git a/pkg/metrics/analysis.go b/pkg/metrics/analysis.go new file mode 100644 index 000000000..79356b882 --- /dev/null +++ b/pkg/metrics/analysis.go @@ -0,0 +1,22 @@ +package metrics + +import ( + "github.com/prometheus/client_golang/prometheus" + "sigs.k8s.io/controller-runtime/pkg/metrics" +) + +var ( + ResourceRecommendation = prometheus.NewGaugeVec( + prometheus.GaugeOpts{ + Namespace: "crane", + Subsystem: "analysis", + Name: "resource_recommendation", + Help: "The containers' CPU/Memory recommended value", + }, + []string{"apiversion", "owner_kind", "namespace", "owner_name", "container", "resource"}, + ) +) + +func init() { + metrics.Registry.MustRegister(ResourceRecommendation) +} diff --git a/pkg/metrics/autoscaling.go b/pkg/metrics/autoscaling.go index dd9362d1b..7c9a7c6bf 100644 --- a/pkg/metrics/autoscaling.go +++ b/pkg/metrics/autoscaling.go @@ -13,7 +13,7 @@ var ( Name: "hpa_replicas", Help: "Replicas for HPA", }, - []string{"resourceName"}, + []string{"namespace", "name"}, ) EHPAReplicas = prometheus.NewGaugeVec( prometheus.GaugeOpts{ @@ -22,7 +22,7 @@ var ( Name: "effective_hpa_replicas", Help: "Replicas for Effective HPA", }, - []string{"resourceName", "strategy"}, + []string{"namespace", "name"}, ) HPAScaleCount = prometheus.NewGaugeVec( prometheus.GaugeOpts{ @@ -31,7 +31,7 @@ var ( Name: "hpa_scale_count", Help: "Scale count for HPA", }, - []string{"resourceName", "type", "direction"}, + []string{"namespace", "name", "type"}, ) OOMCount = prometheus.NewCounterVec( prometheus.CounterOpts{ diff --git a/pkg/metrics/ensuarance.go b/pkg/metrics/ensuarance.go index 
0fb4a925f..2d0b662e2 100644 --- a/pkg/metrics/ensuarance.go +++ b/pkg/metrics/ensuarance.go @@ -26,6 +26,10 @@ const ( ExecutorErrorTotal = "executor_error_total" ExecutorEvictTotal = "executor_evict_total" PodResourceErrorTotal = "pod_resource_error_total" + + NodeCpuCannotBeReclaimedSeconds = "node_cpu_cannot_be_reclaimed_seconds" + NodeResourceRecommended = "node_resource_recommended" + NodeResourceRecommendedFrom = "node_resource_recommended_from" ) type StepLabel string @@ -42,15 +46,18 @@ const ( // Step for pod resource manager StepGetPeriod StepLabel = "getPeriod" StepUpdateQuota StepLabel = "updateQuota" + + StepGetExtResourceRecommended StepLabel = "getExtResourceRecommended" ) type SubComponent string const ( - SubComponentSchedule SubComponent = "schedule" - SubComponentThrottle SubComponent = "throttle" - SubComponentEvict SubComponent = "evict" - SubComponentPodResource SubComponent = "pod-resource-manager" + SubComponentSchedule SubComponent = "schedule" + SubComponentThrottle SubComponent = "throttle" + SubComponentEvict SubComponent = "evict" + SubComponentPodResource SubComponent = "pod-resource-manager" + SubComponentNodeResource SubComponent = "node-resource-manager" ) type AnalyzeType string @@ -180,6 +187,39 @@ var ( StabilityLevel: k8smetrics.ALPHA, }, []string{"subcomponent", "step"}, ) + + // LastActivity records the last activity time of each steps + nodeCpuCannotBeReclaimedSeconds = k8smetrics.NewGaugeVec( + &k8smetrics.GaugeOpts{ + Namespace: CraneNamespace, + Subsystem: CraneAgentSubsystem, + Name: NodeCpuCannotBeReclaimedSeconds, + Help: "The cpu seconds that cannot be reclaimed.", + StabilityLevel: k8smetrics.ALPHA, + }, []string{}, + ) + + //NodeResourceRecommended + nodeResourceRecommended = k8smetrics.NewGaugeVec( + &k8smetrics.GaugeOpts{ + Namespace: CraneNamespace, + Subsystem: CraneAgentSubsystem, + Name: NodeResourceRecommended, + Help: "The value of recommendation.", + StabilityLevel: k8smetrics.ALPHA, + }, []string{"subcomponent", "step", "resourceName"}, + ) + + //NodeResourceRecommended + nodeResourceRecommendedFrom = k8smetrics.NewGaugeVec( + &k8smetrics.GaugeOpts{ + Namespace: CraneNamespace, + Subsystem: CraneAgentSubsystem, + Name: NodeResourceRecommendedFrom, + Help: "Where the recommended values come from. 
(tsp: 1, local: 0)", + StabilityLevel: k8smetrics.ALPHA, + }, []string{"subcomponent", "step", "resourceName"}, + ) ) var registerCraneAgentMetricsOnce sync.Once @@ -195,6 +235,9 @@ func RegisterCraneAgent() { legacyregistry.MustRegister(executorStatusCounts) legacyregistry.MustRegister(executorErrorCounts) legacyregistry.MustRegister(executorEvictCounts) + legacyregistry.MustRegister(nodeCpuCannotBeReclaimedSeconds) + legacyregistry.MustRegister(nodeResourceRecommended) + legacyregistry.MustRegister(nodeResourceRecommendedFrom) }) } @@ -258,3 +301,23 @@ func PodResourceUpdateErrorCounterInc(subComponent SubComponent, stepName StepLa func ExecutorEvictCountsInc() { executorEvictCounts.Inc() } + +func UpdateNodeCpuCannotBeReclaimedSeconds(value float64) { + nodeCpuCannotBeReclaimedSeconds.With(prometheus.Labels{}).Set(value) +} + +func UpdateNodeResourceRecommendedValue(subComponent SubComponent, stepName StepLabel, resourceName string, from string, value float64) { + nodeResourceRecommended.With(prometheus.Labels{"subcomponent": string(subComponent), "step": string(stepName), "resourceName": resourceName}).Set(value) + switch from { + case "tsp": + UpdateNodeResourceRecommendedFromValue(subComponent, stepName, resourceName, 1) + case "local": + UpdateNodeResourceRecommendedFromValue(subComponent, stepName, resourceName, 0) + default: + UpdateNodeResourceRecommendedFromValue(subComponent, stepName, resourceName, -1) + } +} + +func UpdateNodeResourceRecommendedFromValue(subComponent SubComponent, stepName StepLabel, resourceName string, value float64) { + nodeResourceRecommendedFrom.With(prometheus.Labels{"subcomponent": string(subComponent), "step": string(stepName), "resourceName": resourceName}).Set(value) +} diff --git a/pkg/prediction/percentile/config.go b/pkg/prediction/percentile/config.go index 591fc333a..c09943ff5 100644 --- a/pkg/prediction/percentile/config.go +++ b/pkg/prediction/percentile/config.go @@ -14,8 +14,9 @@ import ( ) var defaultMinSampleWeight float64 = 1e-5 -var defaultMarginFraction float64 = .25 +var defaultMarginFraction float64 = 0.0 var defaultPercentile float64 = .99 +var defaultTargetUtilization float64 = 1.0 var defaultHistogramOptions, _ = vpa.NewLinearHistogramOptions(100.0, 0.1, 1e-10) var defaultInternalConfig = internalConfig{ @@ -26,6 +27,7 @@ var defaultInternalConfig = internalConfig{ marginFraction: defaultMarginFraction, percentile: defaultPercentile, histogramOptions: defaultHistogramOptions, + targetUtilization: defaultTargetUtilization, historyLength: time.Hour * 24 * 7, } @@ -38,12 +40,13 @@ type internalConfig struct { minSampleWeight float64 marginFraction float64 percentile float64 + targetUtilization float64 initMode config.ModelInitMode } func (c *internalConfig) String() string { - return fmt.Sprintf("{aggregated: %v, historyLength: %v, sampleInterval: %v, histogramDecayHalfLife: %v, minSampleWeight: %v, marginFraction: %v, percentile: %v}", - c.aggregated, c.historyLength, c.sampleInterval, c.histogramDecayHalfLife, c.minSampleWeight, c.marginFraction, c.percentile) + return fmt.Sprintf("{aggregated: %v, historyLength: %v, sampleInterval: %v, histogramDecayHalfLife: %v, minSampleWeight: %v, marginFraction: %v, percentile: %v, targetUtilization: %v}", + c.aggregated, c.historyLength, c.sampleInterval, c.histogramDecayHalfLife, c.minSampleWeight, c.marginFraction, c.percentile, c.targetUtilization) } // todo: later better to refine the algorithm params to a map not a struct to get more extendability, @@ -131,6 +134,11 @@ func 
makeInternalConfig(p *v1alpha1.Percentile, initMode *config.ModelInitMode) return nil, err } + targetUtilization, err := utils.ParseFloat(p.TargetUtilization, defaultTargetUtilization) + if err != nil { + return nil, err + } + // default use history mode := config.ModelInitModeHistory if initMode != nil { @@ -146,6 +154,7 @@ func makeInternalConfig(p *v1alpha1.Percentile, initMode *config.ModelInitMode) minSampleWeight: minSampleWeight, marginFraction: marginFraction, percentile: percentile, + targetUtilization: targetUtilization, } klog.InfoS("Made an internal config.", "internalConfig", c) diff --git a/pkg/prediction/percentile/estimator.go b/pkg/prediction/percentile/estimator.go index fb89926c9..a871a45d0 100644 --- a/pkg/prediction/percentile/estimator.go +++ b/pkg/prediction/percentile/estimator.go @@ -17,6 +17,11 @@ type marginEstimator struct { baseEstimator Estimator } +type targetUtilizationEstimator struct { + targetUtilization float64 + baseEstimator Estimator +} + func NewPercentileEstimator(percentile float64) Estimator { return &percentileEstimator{percentile} } @@ -25,6 +30,10 @@ func WithMargin(marginFraction float64, baseEstimator Estimator) Estimator { return &marginEstimator{marginFraction, baseEstimator} } +func WithTargetUtilization(targetUtilization float64, baseEstimator Estimator) Estimator { + return &targetUtilizationEstimator{targetUtilization, baseEstimator} +} + func (e *percentileEstimator) GetEstimation(h vpa.Histogram) float64 { return h.Percentile(e.percentile) } @@ -32,3 +41,7 @@ func (e *percentileEstimator) GetEstimation(h vpa.Histogram) float64 { func (e *marginEstimator) GetEstimation(h vpa.Histogram) float64 { return e.baseEstimator.GetEstimation(h) * (1 + e.marginFraction) } + +func (e *targetUtilizationEstimator) GetEstimation(h vpa.Histogram) float64 { + return e.baseEstimator.GetEstimation(h) / e.targetUtilization +} diff --git a/pkg/prediction/percentile/prediction.go b/pkg/prediction/percentile/prediction.go index 91b3cb225..12bef1d16 100644 --- a/pkg/prediction/percentile/prediction.go +++ b/pkg/prediction/percentile/prediction.go @@ -16,6 +16,7 @@ import ( ) var _ prediction.Interface = &percentilePrediction{} +var keyAll = "__all__" type percentilePrediction struct { prediction.GenericPrediction @@ -25,7 +26,7 @@ type percentilePrediction struct { stopChMap sync.Map } -func (p *percentilePrediction) QueryPredictionStatus(ctx context.Context, metricNamer metricnaming.MetricNamer) (prediction.Status, error) { +func (p *percentilePrediction) QueryPredictionStatus(_ context.Context, metricNamer metricnaming.MetricNamer) (prediction.Status, error) { _, status := p.a.GetSignals(metricNamer.BuildUniqueKey()) return status, nil } @@ -57,6 +58,46 @@ func generateSamplesFromWindow(value float64, start time.Time, end time.Time, st return result } +func (p *percentilePrediction) getPredictedValuesFromSignals(queryExpr string, signals map[string]*aggregateSignal) []*common.TimeSeries { + var predictedTimeSeriesList []*common.TimeSeries + + cfg := p.a.GetConfig(queryExpr) + estimator := NewPercentileEstimator(cfg.percentile) + estimator = WithMargin(cfg.marginFraction, estimator) + estimator = WithTargetUtilization(cfg.targetUtilization, estimator) + now := time.Now().Unix() + + if cfg.aggregated { + signal := signals[keyAll] + if signal != nil { + sample := common.Sample{ + Value: estimator.GetEstimation(signal.histogram), + Timestamp: now, + } + predictedTimeSeriesList = append(predictedTimeSeriesList, &common.TimeSeries{ + Labels: nil, + Samples: 
[]common.Sample{sample}, + }) + } + } else { + for key, signal := range signals { + if key == keyAll { + continue + } + sample := common.Sample{ + Value: estimator.GetEstimation(signal.histogram), + Timestamp: now, + } + predictedTimeSeriesList = append(predictedTimeSeriesList, &common.TimeSeries{ + Labels: signal.labels, + Samples: []common.Sample{sample}, + }) + } + } + + return predictedTimeSeriesList +} + func (p *percentilePrediction) getPredictedValues(ctx context.Context, namer metricnaming.MetricNamer) []*common.TimeSeries { var predictedTimeSeriesList []*common.TimeSeries @@ -71,42 +112,7 @@ func (p *percentilePrediction) getPredictedValues(ctx context.Context, namer met return predictedTimeSeriesList } if signals != nil && status == prediction.StatusReady { - cfg := p.a.GetConfig(queryExpr) - estimator := NewPercentileEstimator(cfg.percentile) - estimator = WithMargin(cfg.marginFraction, estimator) - now := time.Now().Unix() - - if cfg.aggregated { - key := "__all__" - signal := signals[key] - if signal == nil { - return nil - } - sample := common.Sample{ - Value: estimator.GetEstimation(signal.histogram), - Timestamp: now, - } - predictedTimeSeriesList = append(predictedTimeSeriesList, &common.TimeSeries{ - Labels: nil, - Samples: []common.Sample{sample}, - }) - return predictedTimeSeriesList - } else { - for key, signal := range signals { - if key == "__all__" { - continue - } - sample := common.Sample{ - Value: estimator.GetEstimation(signal.histogram), - Timestamp: now, - } - predictedTimeSeriesList = append(predictedTimeSeriesList, &common.TimeSeries{ - Labels: signal.labels, - Samples: []common.Sample{sample}, - }) - } - return predictedTimeSeriesList - } + return p.getPredictedValuesFromSignals(queryExpr, signals) } select { case <-ctx.Done(): @@ -127,52 +133,15 @@ func (p *percentilePrediction) QueryRealtimePredictedValues(ctx context.Context, return p.getPredictedValues(ctx, namer), nil } -// QueryRealtimePredictedValuesOnce once task, it is only called once then caller will delete the query after call, but this query maybe used by other callers, -// so when there has already registered this namer query, we get the estimated value from the model directly. -// when there is no this namer query in state, we fetch history data to recover the histogram model then get the estimated value by a stateless function as data processing way. -func (p *percentilePrediction) QueryRealtimePredictedValuesOnce(ctx context.Context, namer metricnaming.MetricNamer, config config.Config) ([]*common.TimeSeries, error) { - var predictedTimeSeriesList []*common.TimeSeries - +// QueryRealtimePredictedValuesOnce is a one-off task, and the query will be deleted by the caller after the call. However, this query maybe has already been used by other callers, +// if so, we get the estimated value from the model directly. +// When the query is not found, we fetch the history data to build the histogram, and then get the estimated value by a stateless function as data processing way. 
+func (p *percentilePrediction) QueryRealtimePredictedValuesOnce(_ context.Context, namer metricnaming.MetricNamer, config config.Config) ([]*common.TimeSeries, error) { queryExpr := namer.BuildUniqueKey() signals, status := p.a.GetSignals(queryExpr) if signals != nil && status == prediction.StatusReady { - cfg := p.a.GetConfig(queryExpr) - estimator := NewPercentileEstimator(cfg.percentile) - estimator = WithMargin(cfg.marginFraction, estimator) - now := time.Now().Unix() - - if cfg.aggregated { - key := "__all__" - signal := signals[key] - if signal == nil { - return nil, fmt.Errorf("no signal key %v found", key) - } - sample := common.Sample{ - Value: estimator.GetEstimation(signal.histogram), - Timestamp: now, - } - predictedTimeSeriesList = append(predictedTimeSeriesList, &common.TimeSeries{ - Labels: nil, - Samples: []common.Sample{sample}, - }) - return predictedTimeSeriesList, nil - } else { - for key, signal := range signals { - if key == "__all__" { - continue - } - sample := common.Sample{ - Value: estimator.GetEstimation(signal.histogram), - Timestamp: now, - } - predictedTimeSeriesList = append(predictedTimeSeriesList, &common.TimeSeries{ - Labels: signal.labels, - Samples: []common.Sample{sample}, - }) - } - return predictedTimeSeriesList, nil - } + return p.getPredictedValuesFromSignals(queryExpr, signals), nil } else { // namer metric query is firstly registered by this caller // we first fetch history data to construct the histogram model, then get estimation. @@ -183,7 +152,6 @@ func (p *percentilePrediction) QueryRealtimePredictedValuesOnce(ctx context.Cont // process is a stateless function to get estimation of a metric series by constructing a histogram then get estimation data. func (p *percentilePrediction) process(namer metricnaming.MetricNamer, config config.Config) ([]*common.TimeSeries, error) { - var predictedTimeSeriesList []*common.TimeSeries var historyTimeSeriesList []*common.TimeSeries var err error queryExpr := namer.BuildUniqueKey() @@ -200,7 +168,6 @@ func (p *percentilePrediction) process(namer metricnaming.MetricNamer, config co } signals := map[string]*aggregateSignal{} - keyAll := "__all__" if cfg.aggregated { signal := newAggregateSignal(cfg) for _, ts := range historyTimeSeriesList { @@ -226,40 +193,7 @@ func (p *percentilePrediction) process(namer metricnaming.MetricNamer, config co } } - estimator := NewPercentileEstimator(cfg.percentile) - estimator = WithMargin(cfg.marginFraction, estimator) - now := time.Now().Unix() - - if cfg.aggregated { - signal := signals[keyAll] - if signal == nil { - return nil, fmt.Errorf("no signal key %v found", keyAll) - } - sample := common.Sample{ - Value: estimator.GetEstimation(signal.histogram), - Timestamp: now, - } - predictedTimeSeriesList = append(predictedTimeSeriesList, &common.TimeSeries{ - Labels: nil, - Samples: []common.Sample{sample}, - }) - return predictedTimeSeriesList, nil - } else { - for key, signal := range signals { - if key == "__all__" { - continue - } - sample := common.Sample{ - Value: estimator.GetEstimation(signal.histogram), - Timestamp: now, - } - predictedTimeSeriesList = append(predictedTimeSeriesList, &common.TimeSeries{ - Labels: signal.labels, - Samples: []common.Sample{sample}, - }) - } - return predictedTimeSeriesList, nil - } + return p.getPredictedValuesFromSignals(queryExpr, signals), nil } func NewPrediction(realtimeProvider providers.RealTime, historyProvider providers.History) prediction.Interface { @@ -296,9 +230,6 @@ func (p *percentilePrediction) Run(stopCh <-chan 
struct{}) { // We just init the signal and setting the status // we start the real time model updating directly. but there is a window time for each metricNamer in the algorithm config to ready status c := p.a.GetConfig(QueryExpr) - if c == nil { - c = &defaultInternalConfig - } var initError error switch c.initMode { @@ -397,7 +328,7 @@ func (p *percentilePrediction) initByRealTimeProvider(namer metricnaming.MetricN if cfg.aggregated { signal := newAggregateSignal(cfg) - p.a.SetSignalWithStatus(queryExpr, "__all__", signal, prediction.StatusInitializing) + p.a.SetSignalWithStatus(queryExpr, keyAll, signal, prediction.StatusInitializing) } else { signals := map[string]*aggregateSignal{} p.a.SetSignalsWithStatus(queryExpr, signals, prediction.StatusInitializing) @@ -406,8 +337,8 @@ func (p *percentilePrediction) initByRealTimeProvider(namer metricnaming.MetricN // todo: // nolint:unused -func (p *percentilePrediction) initByCheckPoint(namer metricnaming.MetricNamer) error { - return fmt.Errorf("Do not support checkpoint now") +func (p *percentilePrediction) initByCheckPoint(_ metricnaming.MetricNamer) error { + return fmt.Errorf("checkpoint not supported") } func (p *percentilePrediction) initFromHistory(namer metricnaming.MetricNamer) error { @@ -430,7 +361,7 @@ func (p *percentilePrediction) initFromHistory(namer metricnaming.MetricNamer) e signal.addSample(t, s.Value) } } - p.a.SetSignal(queryExpr, "__all__", signal) + p.a.SetSignal(queryExpr, keyAll, signal) } else { signals := map[string]*aggregateSignal{} for _, ts := range historyTimeSeriesList { @@ -463,8 +394,7 @@ func (p *percentilePrediction) addSamples(namer metricnaming.MetricNamer) { c := p.a.GetConfig(queryExpr) if c.aggregated { - key := "__all__" - signal := p.a.GetSignal(queryExpr, key) + signal := p.a.GetSignal(queryExpr, keyAll) if signal == nil { return } @@ -475,7 +405,7 @@ func (p *percentilePrediction) addSamples(namer metricnaming.MetricNamer) { // it is not a time dimension, but we use N samples of different container instances of the workload to represent the N intervals samples for _, ts := range latestTimeSeriesList { if len(ts.Samples) < 1 { - klog.V(4).InfoS("Sample not found.", "key", key) + klog.V(4).InfoS("Sample not found.", "key", keyAll) continue } sample := ts.Samples[len(ts.Samples)-1] @@ -488,7 +418,7 @@ func (p *percentilePrediction) addSamples(namer metricnaming.MetricNamer) { // LazyTraining: directly accumulating data from real time metric provider until the data is enough // Checkpoint: directly recover the model from a checkpoint, and then updating the model until accumulated data is enough if signal.GetAggregationWindowLength() >= c.historyLength { - p.a.SetSignalStatus(queryExpr, key, prediction.StatusReady) + p.a.SetSignalStatus(queryExpr, keyAll, prediction.StatusReady) } klog.V(6).InfoS("Sample added.", "sampleValue", sample.Value, "sampleTime", sampleTime, "queryExpr", queryExpr, "history", c.historyLength, "aggregationWindowLength", signal.GetAggregationWindowLength()) diff --git a/pkg/recommend/advisor/advisor.go b/pkg/recommend/advisor/advisor.go index 993f6a9ce..c444b5318 100644 --- a/pkg/recommend/advisor/advisor.go +++ b/pkg/recommend/advisor/advisor.go @@ -22,9 +22,9 @@ func NewAdvisors(ctx *types.Context) (advisors []Advisor) { Context: ctx, }, } - case analysisapi.AnalysisTypeHPA: + case analysisapi.AnalysisTypeReplicas: advisors = []Advisor{ - &EHPAAdvisor{ + &ReplicasAdvisor{ Context: ctx, }, } diff --git a/pkg/recommend/advisor/ehpa.go b/pkg/recommend/advisor/replicas.go 
similarity index 72% rename from pkg/recommend/advisor/ehpa.go rename to pkg/recommend/advisor/replicas.go index 5f209b374..09115a2bb 100644 --- a/pkg/recommend/advisor/ehpa.go +++ b/pkg/recommend/advisor/replicas.go @@ -25,19 +25,19 @@ import ( "github.com/gocrane/crane/pkg/utils" ) -var _ Advisor = &EHPAAdvisor{} +var _ Advisor = &ReplicasAdvisor{} -type EHPAAdvisor struct { +type ReplicasAdvisor struct { *types.Context } -func (a *EHPAAdvisor) Advise(proposed *types.ProposedRecommendation) error { +func (a *ReplicasAdvisor) Advise(proposed *types.ProposedRecommendation) error { p := a.PredictorMgr.GetPredictor(predictionapi.AlgorithmTypeDSP) if p == nil { return fmt.Errorf("predictor %v not found", predictionapi.AlgorithmTypeDSP) } - predictableEnabled, err := strconv.ParseBool(a.Context.ConfigProperties["ehpa.predictable"]) + predictableEnabled, err := strconv.ParseBool(a.Context.ConfigProperties["replicas.predictable"]) if err != nil { predictableEnabled = false } @@ -57,14 +57,14 @@ func (a *EHPAAdvisor) Advise(proposed *types.ProposedRecommendation) error { if err := metricNamer.Validate(); err != nil { return err } - klog.V(4).Infof("EHPAAdvisor CpuQuery %s Recommendation %s", metricNamer.BuildUniqueKey(), klog.KObj(a.Recommendation)) + klog.V(4).Infof("ReplicasAdvisor CpuQuery %s Recommendation %s", metricNamer.BuildUniqueKey(), klog.KObj(a.Recommendation)) timeNow := time.Now() tsList, err := a.DataSource.QueryTimeSeries(metricNamer, timeNow.Add(-time.Hour*24*7), timeNow, time.Minute) if err != nil { - return fmt.Errorf("EHPAAdvisor query historic metrics failed: %v ", err) + return fmt.Errorf("ReplicasAdvisor query historic metrics failed: %v ", err) } if len(tsList) != 1 { - return fmt.Errorf("EHPAAdvisor query historic metrics data is unexpected, List length is %d ", len(tsList)) + return fmt.Errorf("ReplicasAdvisor query historic metrics data is unexpected, List length is %d ", len(tsList)) } predictable := true @@ -75,17 +75,17 @@ func (a *EHPAAdvisor) Advise(proposed *types.ProposedRecommendation) error { timeNow, timeNow.Add(time.Hour*24*7)) if err != nil { - klog.Warningf("EHPAAdvisor query predicted time series failed: %v ", err) + klog.Warningf("ReplicasAdvisor query predicted time series failed: %v ", err) predictable = false } if len(tsListPrediction) != 1 { - klog.Warningf("EHPAAdvisor prediction metrics data is unexpected, List length is %d ", len(tsListPrediction)) + klog.Warningf("ReplicasAdvisor prediction metrics data is unexpected, List length is %d ", len(tsListPrediction)) predictable = false } if predictableEnabled && !predictable { - return fmt.Errorf("EHPAAdvisor cannot predict target: %v ", err) + return fmt.Errorf("ReplicasAdvisor cannot predict target: %v ", err) } var cpuMax float64 @@ -107,34 +107,39 @@ func (a *EHPAAdvisor) Advise(proposed *types.ProposedRecommendation) error { } } + cpuPercentile, err := strconv.ParseFloat(a.Context.ConfigProperties["replicas.cpu-percentile"], 64) + if err != nil { + return fmt.Errorf("ReplicasAdvisor parse replicas.cpu-percentile failed: %v", err) + } + err = a.checkMinCpuUsageThreshold(cpuMax) if err != nil { - return fmt.Errorf("EHPAAdvisor checkMinCpuUsageThreshold failed: %v", err) + return fmt.Errorf("ReplicasAdvisor checkMinCpuUsageThreshold failed: %v", err) } medianMin, medianMax, err := a.minMaxMedians(tsList) if err != nil { - return fmt.Errorf("EHPAAdvisor minMaxMedians failed: %v", err) + return fmt.Errorf("ReplicasAdvisor minMaxMedians failed: %v", err) } err = a.checkFluctuation(medianMin, medianMax) 
if err != nil { - return fmt.Errorf("EHPAAdvisor checkFluctuation failed: %v", err) + return fmt.Errorf("ReplicasAdvisor checkFluctuation failed: %v", err) } targetUtilization, requestTotal, err := a.proposeTargetUtilization() if err != nil { - return fmt.Errorf("EHPAAdvisor proposeTargetUtilization failed: %v", err) + return fmt.Errorf("ReplicasAdvisor proposeTargetUtilization failed: %v", err) } minReplicas, err := a.proposeMinReplicas(medianMin, requestTotal) if err != nil { - return fmt.Errorf("EHPAAdvisor proposeMinReplicas failed: %v", err) + return fmt.Errorf("ReplicasAdvisor proposeMinReplicas failed: %v", err) } - maxReplicas, err := a.proposeMaxReplicas(cpuUsages, targetUtilization, minReplicas) + maxReplicas, err := a.proposeMaxReplicas(cpuUsages, cpuPercentile, targetUtilization, minReplicas) if err != nil { - return fmt.Errorf("EHPAAdvisor proposeMaxReplicas failed: %v", err) + return fmt.Errorf("ReplicasAdvisor proposeMaxReplicas failed: %v", err) } defaultPredictionWindow := int32(3600) @@ -165,7 +170,7 @@ func (a *EHPAAdvisor) Advise(proposed *types.ProposedRecommendation) error { } } - referenceHpa, err := strconv.ParseBool(a.Context.ConfigProperties["ehpa.reference-hpa"]) + referenceHpa, err := strconv.ParseBool(a.Context.ConfigProperties["replicas.reference-hpa"]) if err != nil { referenceHpa = false } @@ -182,30 +187,35 @@ func (a *EHPAAdvisor) Advise(proposed *types.ProposedRecommendation) error { } } + replicasRecommendation := &types.ReplicasRecommendation{ + Replicas: &minReplicas, + } + proposed.EffectiveHPA = proposedEHPA + proposed.ReplicasRecommendation = replicasRecommendation return nil } -func (a *EHPAAdvisor) Name() string { - return "EHPAAdvisor" +func (a *ReplicasAdvisor) Name() string { + return "ReplicasAdvisor" } -// checkMinCpuUsageThreshold check if the max cpu for target is reach to ehpa.min-cpu-usage-threshold -func (a *EHPAAdvisor) checkMinCpuUsageThreshold(cpuMax float64) error { - minCpuUsageThreshold, err := strconv.ParseFloat(a.Context.ConfigProperties["ehpa.min-cpu-usage-threshold"], 64) +// checkMinCpuUsageThreshold check if the max cpu for target is reach to replicas.min-cpu-usage-threshold +func (a *ReplicasAdvisor) checkMinCpuUsageThreshold(cpuMax float64) error { + minCpuUsageThreshold, err := strconv.ParseFloat(a.Context.ConfigProperties["replicas.min-cpu-usage-threshold"], 64) if err != nil { return err } - klog.V(4).Infof("EHPAAdvisor checkMinCpuUsageThreshold, cpuMax %f threshold %f", cpuMax, minCpuUsageThreshold) + klog.V(4).Infof("ReplicasAdvisor checkMinCpuUsageThreshold, cpuMax %f threshold %f", cpuMax, minCpuUsageThreshold) if cpuMax < minCpuUsageThreshold { - return fmt.Errorf("target cpuusage %f is under ehpa.min-cpu-usage-threshold %f. ", cpuMax, minCpuUsageThreshold) + return fmt.Errorf("target cpuusage %f is under replicas.min-cpu-usage-threshold %f. 
", cpuMax, minCpuUsageThreshold) } return nil } -func (a *EHPAAdvisor) minMaxMedians(predictionTs []*common.TimeSeries) (float64, float64, error) { +func (a *ReplicasAdvisor) minMaxMedians(predictionTs []*common.TimeSeries) (float64, float64, error) { // aggregate with time's hour cpuUsagePredictionMap := make(map[int][]float64) for _, sample := range predictionTs[0].Samples { @@ -241,14 +251,14 @@ func (a *EHPAAdvisor) minMaxMedians(predictionTs []*common.TimeSeries) (float64, } } - klog.V(4).Infof("EHPAAdvisor minMaxMedians medianMax %f, medianMin %f, medianUsages %v", medianMax, medianMin, medianUsages) + klog.V(4).Infof("ReplicasAdvisor minMaxMedians medianMax %f, medianMin %f, medianUsages %v", medianMax, medianMin, medianUsages) return medianMin, medianMax, nil } -// checkFluctuation check if the time series fluctuation is reach to ehpa.fluctuation-threshold -func (a *EHPAAdvisor) checkFluctuation(medianMin, medianMax float64) error { - fluctuationThreshold, err := strconv.ParseFloat(a.Context.ConfigProperties["ehpa.fluctuation-threshold"], 64) +// checkFluctuation check if the time series fluctuation is reach to replicas.fluctuation-threshold +func (a *ReplicasAdvisor) checkFluctuation(medianMin, medianMax float64) error { + fluctuationThreshold, err := strconv.ParseFloat(a.Context.ConfigProperties["replicas.fluctuation-threshold"], 64) if err != nil { return err } @@ -259,7 +269,7 @@ func (a *EHPAAdvisor) checkFluctuation(medianMin, medianMax float64) error { fluctuation := medianMax / medianMin if fluctuation < fluctuationThreshold { - return fmt.Errorf("target cpu fluctuation %f is under ehpa.fluctuation-threshold %f. ", fluctuation, fluctuationThreshold) + return fmt.Errorf("target cpu fluctuation %f is under replicas.fluctuation-threshold %f. ", fluctuation, fluctuationThreshold) } return nil @@ -268,13 +278,13 @@ func (a *EHPAAdvisor) checkFluctuation(medianMin, medianMax float64) error { // proposeTargetUtilization use the 99 percentile cpu usage to propose target utilization, // since we think if pod have reach the top usage before, maybe this is a suitable target to running. // Considering too high or too low utilization are both invalid, we will be capping target utilization finally. 
-func (a *EHPAAdvisor) proposeTargetUtilization() (int32, int64, error) { - minCpuTargetUtilization, err := strconv.ParseInt(a.Context.ConfigProperties["ehpa.min-cpu-target-utilization"], 10, 32) +func (a *ReplicasAdvisor) proposeTargetUtilization() (int32, int64, error) { + minCpuTargetUtilization, err := strconv.ParseInt(a.Context.ConfigProperties["replicas.min-cpu-target-utilization"], 10, 32) if err != nil { return 0, 0, err } - maxCpuTargetUtilization, err := strconv.ParseInt(a.Context.ConfigProperties["ehpa.max-cpu-target-utilization"], 10, 32) + maxCpuTargetUtilization, err := strconv.ParseInt(a.Context.ConfigProperties["replicas.max-cpu-target-utilization"], 10, 32) if err != nil { return 0, 0, err } @@ -306,7 +316,7 @@ func (a *EHPAAdvisor) proposeTargetUtilization() (int32, int64, error) { return 0, 0, err } - klog.V(4).Infof("EHPAAdvisor propose targetUtilization, cpuUsage %f requestsPod %d", cpuUsage, requestTotal) + klog.V(4).Infof("ReplicasAdvisor propose targetUtilization, cpuUsage %f requestsPod %d", cpuUsage, requestTotal) targetUtilization := int32(math.Ceil((cpuUsage * 1000 / float64(requestTotal)) * 100)) // capping @@ -322,14 +332,14 @@ func (a *EHPAAdvisor) proposeTargetUtilization() (int32, int64, error) { return targetUtilization, requestTotal, nil } -// proposeMinReplicas calculate min replicas based on ehpa.default-min-replicas -func (a *EHPAAdvisor) proposeMinReplicas(medianMin float64, requestTotal int64) (int32, error) { - defaultMinReplicas, err := strconv.ParseInt(a.Context.ConfigProperties["ehpa.default-min-replicas"], 10, 32) +// proposeMinReplicas calculate min replicas based on replicas.default-min-replicas +func (a *ReplicasAdvisor) proposeMinReplicas(workloadCpu float64, requestTotal int64) (int32, error) { + defaultMinReplicas, err := strconv.ParseInt(a.Context.ConfigProperties["replicas.default-min-replicas"], 10, 32) if err != nil { return 0, err } - maxCpuTargetUtilization, err := strconv.ParseInt(a.Context.ConfigProperties["ehpa.max-cpu-target-utilization"], 10, 32) + targetUtilization, err := strconv.ParseInt(a.Context.ConfigProperties["replicas.cpu-target-utilization"], 10, 32) if err != nil { return 0, err } @@ -341,7 +351,7 @@ func (a *EHPAAdvisor) proposeMinReplicas(medianMin float64, requestTotal int64) minReplicas = 1 } - min := int32(math.Ceil(medianMin / (float64(maxCpuTargetUtilization) / 100. * float64(requestTotal) / 1000.))) + min := int32(math.Ceil(workloadCpu / (float64(targetUtilization) / 100. * float64(requestTotal) / 1000.))) if min > minReplicas { minReplicas = min } @@ -350,13 +360,13 @@ func (a *EHPAAdvisor) proposeMinReplicas(medianMin float64, requestTotal int64) } // proposeMaxReplicas use max cpu usage to compare with target pod cpu usage to get the max replicas. 
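// Editor's illustration (not part of this patch; all numbers are assumed): a worked example of the
// max-replicas formula applied in proposeMaxReplicas below.
// Assume p95thCpu = 2.0 cores of total workload usage, requestsPod = 500 (milli-cores requested per pod),
// targetUtilization = 50 (%), and maxReplicasFactor = 3. Then
//   calcMaxReplicas = (2.0 * 100 * 1000 * 3) / (500 * 50) = 24
// About 8 replicas would keep average usage at the 50% target; the factor of 3 adds headroom, so 24 is proposed as the maximum.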
-func (a *EHPAAdvisor) proposeMaxReplicas(cpuUsages []float64, targetUtilization int32, minReplicas int32) (int32, error) { - maxReplicasFactor, err := strconv.ParseFloat(a.Context.ConfigProperties["ehpa.max-replicas-factor"], 64) +func (a *ReplicasAdvisor) proposeMaxReplicas(cpuUsages []float64, cpuPercentile float64, targetUtilization int32, minReplicas int32) (int32, error) { + maxReplicasFactor, err := strconv.ParseFloat(a.Context.ConfigProperties["replicas.max-replicas-factor"], 64) if err != nil { return 0, err } // use percentile to deburring data - p95thCpu, err := stats.Percentile(cpuUsages, 95) + p95thCpu, err := stats.Percentile(cpuUsages, cpuPercentile) if err != nil { return 0, err } @@ -365,7 +375,7 @@ func (a *EHPAAdvisor) proposeMaxReplicas(cpuUsages []float64, targetUtilization return 0, err } - klog.V(4).Infof("EHPAAdvisor proposeMaxReplicas, p95thCpu %f requestsPod %d targetUtilization %d", p95thCpu, requestsPod, targetUtilization) + klog.V(4).Infof("ReplicasAdvisor proposeMaxReplicas, p95thCpu %f requestsPod %d targetUtilization %d", p95thCpu, requestsPod, targetUtilization) // request * targetUtilization is the target average cpu usage, use total p95thCpu to divide, we can get the expect max replicas. calcMaxReplicas := (p95thCpu * 100 * 1000 * maxReplicasFactor) / float64(int32(requestsPod)*targetUtilization) @@ -384,11 +394,7 @@ func getPredictionCpuConfig() *config.Config { DSP: &predictionapi.DSP{ SampleInterval: "1m", HistoryLength: "5d", - Estimators: predictionapi.Estimators{ - FFTEstimators: []*predictionapi.FFTEstimator{ - {MarginFraction: "0.05", LowAmplitudeThreshold: "1.0", HighFrequencyThreshold: "0.05"}, - }, - }, + Estimators: predictionapi.Estimators{}, }, } } diff --git a/pkg/recommend/advisor/ehpa_test.go b/pkg/recommend/advisor/replicas_test.go similarity index 92% rename from pkg/recommend/advisor/ehpa_test.go rename to pkg/recommend/advisor/replicas_test.go index 8d3861013..aae10f0a2 100644 --- a/pkg/recommend/advisor/ehpa_test.go +++ b/pkg/recommend/advisor/replicas_test.go @@ -14,7 +14,7 @@ import ( ) func TestCheckFluctuation(t *testing.T) { - a := &EHPAAdvisor{ + a := &ReplicasAdvisor{ Context: &types.Context{ ConfigProperties: map[string]string{}, }, @@ -53,7 +53,7 @@ func TestCheckFluctuation(t *testing.T) { medianMin, medianMax, _ := a.minMaxMedians(tsList) for _, test := range tests { - a.Context.ConfigProperties["ehpa.fluctuation-threshold"] = test.threshold + a.Context.ConfigProperties["replicas.fluctuation-threshold"] = test.threshold err := a.checkFluctuation(medianMin, medianMax) if err != nil && !test.expectError { t.Errorf("Failed to checkFluctuation: %v", err) @@ -63,10 +63,10 @@ func TestCheckFluctuation(t *testing.T) { func TestProposeMaxReplicas(t *testing.T) { rand.Seed(time.Now().UnixNano()) - a := &EHPAAdvisor{ + a := &ReplicasAdvisor{ Context: &types.Context{ ConfigProperties: map[string]string{ - "ehpa.max-replicas-factor": "3", + "replicas.max-replicas-factor": "3", }, PodTemplate: &corev1.PodTemplateSpec{ ObjectMeta: metav1.ObjectMeta{ @@ -128,7 +128,7 @@ func TestProposeMaxReplicas(t *testing.T) { } for _, test := range tests { - maxReplicas, err := a.proposeMaxReplicas(cpuUsages, test.targetUtilization, test.minReplicas) + maxReplicas, err := a.proposeMaxReplicas(cpuUsages, 95, test.targetUtilization, test.minReplicas) if err != nil { t.Errorf("Failed to checkFluctuation: %v", err) } diff --git a/pkg/recommend/advisor/resource_request.go b/pkg/recommend/advisor/resource_request.go index c5022b418..d26f5b560 100644 --- 
a/pkg/recommend/advisor/resource_request.go +++ b/pkg/recommend/advisor/resource_request.go @@ -12,6 +12,7 @@ import ( "github.com/gocrane/crane/pkg/metricnaming" "github.com/gocrane/crane/pkg/metricquery" + "github.com/gocrane/crane/pkg/metrics" "github.com/gocrane/crane/pkg/prediction/config" "github.com/gocrane/crane/pkg/recommend/types" "github.com/gocrane/crane/pkg/utils" @@ -40,18 +41,23 @@ func makeCpuConfig(props map[string]string) *config.Config { if !exists { marginFraction = "0.15" } - + targetUtilization, exists := props["resource.cpu-target-utilization"] + if !exists { + targetUtilization = "1.0" + } historyLength, exists := props["resource.cpu-model-history-length"] if !exists { historyLength = "168h" } + return &config.Config{ Percentile: &predictionapi.Percentile{ - Aggregated: true, - HistoryLength: historyLength, - SampleInterval: sampleInterval, - MarginFraction: marginFraction, - Percentile: percentile, + Aggregated: true, + HistoryLength: historyLength, + SampleInterval: sampleInterval, + MarginFraction: marginFraction, + TargetUtilization: targetUtilization, + Percentile: percentile, Histogram: predictionapi.HistogramConfig{ HalfLife: "24h", BucketSize: "0.1", @@ -74,7 +80,10 @@ func makeMemConfig(props map[string]string) *config.Config { if !exists { marginFraction = "0.15" } - + targetUtilization, exists := props["resource.mem-target-utilization"] + if !exists { + targetUtilization = "1.0" + } historyLength, exists := props["resource.mem-model-history-length"] if !exists { historyLength = "168h" @@ -82,11 +91,12 @@ func makeMemConfig(props map[string]string) *config.Config { return &config.Config{ Percentile: &predictionapi.Percentile{ - Aggregated: true, - HistoryLength: historyLength, - SampleInterval: sampleInterval, - MarginFraction: marginFraction, - Percentile: percentile, + Aggregated: true, + HistoryLength: historyLength, + SampleInterval: sampleInterval, + MarginFraction: marginFraction, + Percentile: percentile, + TargetUtilization: targetUtilization, Histogram: predictionapi.HistogramConfig{ HalfLife: "48h", BucketSize: "104857600", @@ -129,7 +139,10 @@ func (a *ResourceRequestAdvisor) Advise(proposed *types.ProposedRecommendation) return fmt.Errorf("no value retured for queryExpr: %s", metricNamer.BuildUniqueKey()) } v := int64(tsList[0].Samples[0].Value * 1000) - cr.Target[corev1.ResourceCPU] = resource.NewMilliQuantity(v, resource.DecimalSI).String() + q := resource.NewMilliQuantity(v, resource.DecimalSI) + cr.Target[corev1.ResourceCPU] = q.String() + // export recommended values as prom metrics + a.recordResourceRecommendation(c.Name, corev1.ResourceCPU, q) metricNamer = ResourceToContainerMetricNamer(namespace, a.Recommendation.Spec.TargetRef.Name, c.Name, corev1.ResourceMemory, caller) klog.V(6).Infof("Memory query for resource request recommendation: %s", metricNamer.BuildUniqueKey()) @@ -142,7 +155,10 @@ func (a *ResourceRequestAdvisor) Advise(proposed *types.ProposedRecommendation) return fmt.Errorf("no value retured for queryExpr: %s", metricNamer.BuildUniqueKey()) } v = int64(tsList[0].Samples[0].Value) - cr.Target[corev1.ResourceMemory] = resource.NewQuantity(v, resource.BinarySI).String() + q = resource.NewQuantity(v, resource.BinarySI) + cr.Target[corev1.ResourceMemory] = q.String() + // export recommended values as prom metrics + a.recordResourceRecommendation(c.Name, corev1.ResourceMemory, q) r.Containers = append(r.Containers, cr) } @@ -151,6 +167,23 @@ func (a *ResourceRequestAdvisor) Advise(proposed *types.ProposedRecommendation) return 
nil } +func (a *ResourceRequestAdvisor) recordResourceRecommendation(containerName string, resName corev1.ResourceName, quantity *resource.Quantity) { + labels := map[string]string{ + "apiversion": a.Recommendation.Spec.TargetRef.APIVersion, + "owner_kind": a.Recommendation.Spec.TargetRef.Kind, + "namespace": a.Recommendation.Spec.TargetRef.Namespace, + "owner_name": a.Recommendation.Spec.TargetRef.Name, + "container": containerName, + "resource": resName.String(), + } + switch resName { + case corev1.ResourceCPU: + metrics.ResourceRecommendation.With(labels).Set(float64(quantity.MilliValue()) / 1000.) + case corev1.ResourceMemory: + metrics.ResourceRecommendation.With(labels).Set(float64(quantity.Value())) + } +} + func (a *ResourceRequestAdvisor) Name() string { return "ResourceRequestAdvisor" } diff --git a/pkg/recommend/inspector/inspector.go b/pkg/recommend/inspector/inspector.go index af3faa075..b261adb86 100644 --- a/pkg/recommend/inspector/inspector.go +++ b/pkg/recommend/inspector/inspector.go @@ -22,7 +22,7 @@ func NewInspectors(ctx *types.Context) []Inspector { if ctx.Pods != nil { inspectors = append(inspectors, &ResourceRequestInspector{Context: ctx}) } - case analysisapi.AnalysisTypeHPA: + case analysisapi.AnalysisTypeReplicas: if ctx.Scale != nil { inspector := &WorkloadInspector{ Context: ctx, diff --git a/pkg/recommend/inspector/workload.go b/pkg/recommend/inspector/workload.go index b34361fbc..ca9c405da 100644 --- a/pkg/recommend/inspector/workload.go +++ b/pkg/recommend/inspector/workload.go @@ -12,29 +12,11 @@ type WorkloadInspector struct { } func (i *WorkloadInspector) Inspect() error { - deploymentMinReplicas, err := strconv.ParseInt(i.Context.ConfigProperties["ehpa.deployment-min-replicas"], 10, 32) + workloadMinReplicas, err := strconv.ParseInt(i.Context.ConfigProperties["replicas.workload-min-replicas"], 10, 32) if err != nil { return err } - statefulsetMinReplicas, err := strconv.ParseInt(i.Context.ConfigProperties["ehpa.statefulset-min-replicas"], 10, 32) - if err != nil { - return err - } - - workloadMinReplicas, err := strconv.ParseInt(i.Context.ConfigProperties["ehpa.workload-min-replicas"], 10, 32) - if err != nil { - return err - } - - if i.Context.Deployment != nil && *i.Context.Deployment.Spec.Replicas < int32(deploymentMinReplicas) { - return fmt.Errorf("deployment replicas %d should be larger than %d ", *i.Context.Deployment.Spec.Replicas, int32(deploymentMinReplicas)) - } - - if i.Context.StatefulSet != nil && *i.Context.StatefulSet.Spec.Replicas < int32(statefulsetMinReplicas) { - return fmt.Errorf("statefulSet replicas %d should be larger than %d ", *i.Context.StatefulSet.Spec.Replicas, int32(statefulsetMinReplicas)) - } - if i.Context.Scale != nil && i.Context.Scale.Spec.Replicas < int32(workloadMinReplicas) { return fmt.Errorf("workload replicas %d should be larger than %d ", i.Context.Scale.Spec.Replicas, int32(workloadMinReplicas)) } diff --git a/pkg/recommend/inspector/workload_pods.go b/pkg/recommend/inspector/workload_pods.go index 7981fdc3f..4956ffbbf 100644 --- a/pkg/recommend/inspector/workload_pods.go +++ b/pkg/recommend/inspector/workload_pods.go @@ -21,12 +21,12 @@ func (i *WorkloadPodsInspector) Inspect() error { return fmt.Errorf("existing pods should be larger than 0 ") } - podMinReadySeconds, err := strconv.ParseInt(i.Context.ConfigProperties["ehpa.pod-min-ready-seconds"], 10, 32) + podMinReadySeconds, err := strconv.ParseInt(i.Context.ConfigProperties["replicas.pod-min-ready-seconds"], 10, 32) if err != nil { return err } - 
podAvailableRatio, err := strconv.ParseFloat(i.Context.ConfigProperties["ehpa.pod-available-ratio"], 64) + podAvailableRatio, err := strconv.ParseFloat(i.Context.ConfigProperties["replicas.pod-available-ratio"], 64) if err != nil { return err } diff --git a/pkg/recommend/recommender.go b/pkg/recommend/recommender.go index 03bc06bbb..7e4bc9c58 100644 --- a/pkg/recommend/recommender.go +++ b/pkg/recommend/recommender.go @@ -129,22 +129,6 @@ func GetContext(kubeClient client.Client, restMapper meta.RESTMapper, return nil, err } - if recommendation.Spec.TargetRef.Kind == "Deployment" && mapping.GroupVersionKind.Group == "apps" { - var deployment appsv1.Deployment - if err := runtime.DefaultUnstructuredConverter.FromUnstructured(unstructured.UnstructuredContent(), &deployment); err != nil { - return nil, err - } - c.Deployment = &deployment - } - - if recommendation.Spec.TargetRef.Kind == "StatefulSet" && mapping.GroupVersionKind.Group == "apps" { - var statefulSet appsv1.StatefulSet - if err := runtime.DefaultUnstructuredConverter.FromUnstructured(unstructured.UnstructuredContent(), &statefulSet); err != nil { - return nil, err - } - c.StatefulSet = &statefulSet - } - var pods []corev1.Pod if recommendation.Spec.TargetRef.Kind != "DaemonSet" { pods, err = utils.GetPodsFromScale(kubeClient, scale) @@ -160,7 +144,7 @@ func GetContext(kubeClient client.Client, restMapper meta.RESTMapper, return nil, err } - if recommendation.Spec.Type == analysisapi.AnalysisTypeHPA { + if recommendation.Spec.Type == analysisapi.AnalysisTypeReplicas { c.PodTemplate, err = utils.GetPodTemplate(context.TODO(), recommendation.Spec.TargetRef.Namespace, recommendation.Spec.TargetRef.Name, diff --git a/pkg/recommend/types/types.go b/pkg/recommend/types/types.go index d51c073bf..a9c0af536 100644 --- a/pkg/recommend/types/types.go +++ b/pkg/recommend/types/types.go @@ -22,8 +22,6 @@ type Context struct { Recommendation *analysisapi.Recommendation Scale *autoscalingapiv1.Scale RestMapping *meta.RESTMapping - Deployment *appsv1.Deployment - StatefulSet *appsv1.StatefulSet DaemonSet *appsv1.DaemonSet Pods []corev1.Pod PodTemplate *corev1.PodTemplateSpec @@ -33,11 +31,18 @@ type Context struct { // ProposedRecommendation is the result for one recommendation type ProposedRecommendation struct { - // EffectiveHPA is the proposed recommendation for type HPA - EffectiveHPA *EffectiveHorizontalPodAutoscalerRecommendation + // EffectiveHPA is the proposed recommendation for type Replicas + EffectiveHPA *EffectiveHorizontalPodAutoscalerRecommendation `json:"effectiveHPA,omitempty"` + + // ReplicasRecommendation is the proposed replicas for type Replicas + ReplicasRecommendation *ReplicasRecommendation `json:"replicasRecommendation,omitempty"` // ResourceRequest is the proposed recommendation for type Resource - ResourceRequest *ResourceRequestRecommendation + ResourceRequest *ResourceRequestRecommendation `json:"resourceRequest,omitempty"` +} + +type ReplicasRecommendation struct { + Replicas *int32 `json:"replicas,omitempty"` } type EffectiveHorizontalPodAutoscalerRecommendation struct { diff --git a/pkg/resource/node_resource_manager.go b/pkg/resource/node_resource_manager.go index e6e7d31d7..7aeb56675 100644 --- a/pkg/resource/node_resource_manager.go +++ b/pkg/resource/node_resource_manager.go @@ -228,6 +228,7 @@ func (o *NodeResourceManager) BuildNodeStatus(node *v1.Node) map[v1.ResourceName if nextRecommendation < 0 { nextRecommendation = 0 } + metrics.UpdateNodeResourceRecommendedValue(metrics.SubComponentNodeResource, 
metrics.StepGetExtResourceRecommended, string(resourceName), resourceFrom, nextRecommendation) extResourceName := fmt.Sprintf(utils.ExtResourcePrefixFormat, string(resourceName)) resValue, exists := node.Status.Capacity[v1.ResourceName(extResourceName)] if exists && resValue.Value() != 0 && @@ -335,7 +336,9 @@ func (o *NodeResourceManager) GetCpuCoreCanNotBeReclaimedFromLocal() float64 { // 1. Exclusive tethered CPU cannot be reclaimed even if the free part is free, so add the exclusive CPUIdle to the CanNotBeReclaimed CPU // 2. The CPU used by extRes-container needs to be reclaimed, otherwise it will be double-counted due to the allotted mechanism of k8s, so the extResContainerCpuUsageTotal is subtracted from the CanNotBeReclaimedCpu - return nodeCpuUsageTotal + exclusiveCPUIdle - extResContainerCpuUsageTotal + nodeCpuCannotBeReclaimedSeconds := nodeCpuUsageTotal + exclusiveCPUIdle - extResContainerCpuUsageTotal + metrics.UpdateNodeCpuCannotBeReclaimedSeconds(nodeCpuCannotBeReclaimedSeconds) + return nodeCpuCannotBeReclaimedSeconds } func getReserveResourcePercentFromNodeAnnotations(annotations map[string]string, resourceName string) (float64, bool) { diff --git a/pkg/server/store/mock_store.go b/pkg/server/store/mock_store.go index 60b00ae01..fe00b3a50 100644 --- a/pkg/server/store/mock_store.go +++ b/pkg/server/store/mock_store.go @@ -129,6 +129,21 @@ func (mr *MockClusterStoreMockRecorder) ListClusters(arg0 interface{}) *gomock.C return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "ListClusters", reflect.TypeOf((*MockClusterStore)(nil).ListClusters), arg0) } +// ListNamespaces mocks base method. +func (m *MockClusterStore) ListNamespaces(arg0 context.Context, arg1 string) (*NamespaceList, error) { + m.ctrl.T.Helper() + ret := m.ctrl.Call(m, "ListNamespaces", arg0, arg1) + ret0, _ := ret[0].(*NamespaceList) + ret1, _ := ret[1].(error) + return ret0, ret1 +} + +// ListNamespaces indicates an expected call of ListNamespaces. +func (mr *MockClusterStoreMockRecorder) ListNamespaces(arg0, arg1 interface{}) *gomock.Call { + mr.mock.ctrl.T.Helper() + return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "ListNamespaces", reflect.TypeOf((*MockClusterStore)(nil).ListNamespaces), arg0, arg1) +} + // UpdateCluster mocks base method. func (m *MockClusterStore) UpdateCluster(arg0 context.Context, arg1 *Cluster) error { m.ctrl.T.Helper() @@ -142,18 +157,3 @@ func (mr *MockClusterStoreMockRecorder) UpdateCluster(arg0, arg1 interface{}) *g mr.mock.ctrl.T.Helper() return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "UpdateCluster", reflect.TypeOf((*MockClusterStore)(nil).UpdateCluster), arg0, arg1) } - -// ListNamespaces mocks base method. -func (m *MockClusterStore) ListNamespaces(arg0 context.Context, arg1 string) (*NamespaceList, error) { - m.ctrl.T.Helper() - ret := m.ctrl.Call(m, "ListNamespaces", arg0, arg1) - ret0, _ := ret[0].(*NamespaceList) - ret1, _ := ret[1].(error) - return ret0, ret1 -} - -// ListNamespaces indicates an expected call of GetCluster. 
-func (mr *MockClusterStoreMockRecorder) ListNamespaces(arg0, arg1 interface{}) *gomock.Call { - mr.mock.ctrl.T.Helper() - return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "GetCluster", reflect.TypeOf((*MockClusterStore)(nil).GetCluster), arg0, arg1) -} diff --git a/pkg/utils/pod.go b/pkg/utils/pod.go index 123aa84a7..0dfa7e334 100644 --- a/pkg/utils/pod.go +++ b/pkg/utils/pod.go @@ -21,6 +21,19 @@ const ( ExtResourcePrefixFormat = "gocrane.io/%s" ) +// GetAvailablePods returns the subset of pods that pass the IsPodAvailable check +func GetAvailablePods(pods []v1.Pod) []v1.Pod { + var availablePods []v1.Pod + timeNow := metav1.Now() + + for _, pod := range pods { + if IsPodAvailable(&pod, 30, timeNow) { + availablePods = append(availablePods, pod) + } + } + return availablePods +} + // IsPodAvailable returns true if a pod is available; false otherwise. // copied from k8s.io/kubernetes/pkg/api/v1/pod.go func IsPodAvailable(pod *v1.Pod, minReadySeconds int32, now metav1.Time) bool { @@ -37,8 +50,11 @@ func IsPodAvailable(pod *v1.Pod, minReadySeconds int32, now metav1.Time) bool { } // IsPodReady returns true if a pod is ready; false otherwise. -// copied from k8s.io/kubernetes/pkg/api/v1/pod.go +// copied from k8s.io/kubernetes/pkg/api/v1/pod.go and modified func IsPodReady(pod *v1.Pod) bool { + if pod.DeletionTimestamp != nil || pod.Status.Phase != v1.PodRunning { + return false + } condition := GetPodReadyCondition(pod.Status) return condition != nil && condition.Status == v1.ConditionTrue } diff --git a/pkg/utils/ref.go b/pkg/utils/ref.go index 2882a37f0..1222fc246 100644 --- a/pkg/utils/ref.go +++ b/pkg/utils/ref.go @@ -8,7 +8,7 @@ import ( ) const ( - CgroupKubePods = "/kubepods" + CgroupKubePods = "kubepods" CgroupPodPrefix = "pod" )
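Note on the final hunk: CgroupKubePods becomes the relative name "kubepods" instead of an absolute path. The sketch below is an editor's illustration only; the helper names are hypothetical and not part of this repository. It shows how a relative base can be joined under a cgroup mount point, using the common convention that the systemd cgroup driver names the top-level group "kubepods.slice" while cgroupfs uses a plain directory.

// Illustrative sketch only; kubePodsPath is a hypothetical helper, not an API of this repo.
package main

import (
	"fmt"
	"path/filepath"
)

const cgroupKubePods = "kubepods" // relative base, mirroring the constant above

// kubePodsPath joins the relative kubepods base under a cgroup mount point.
// Under the systemd driver the top-level group is conventionally the
// "kubepods.slice" unit; under cgroupfs it is a plain directory.
func kubePodsPath(cgroupDriver, mountPoint string) string {
	if cgroupDriver == "systemd" {
		return filepath.Join(mountPoint, cgroupKubePods+".slice")
	}
	return filepath.Join(mountPoint, cgroupKubePods)
}

func main() {
	fmt.Println(kubePodsPath("cgroupfs", "/sys/fs/cgroup/cpu")) // /sys/fs/cgroup/cpu/kubepods
	fmt.Println(kubePodsPath("systemd", "/sys/fs/cgroup/cpu"))  // /sys/fs/cgroup/cpu/kubepods.slice
}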