From 67712fc4608fd506b0bf849065d7037f3f2acca5 Mon Sep 17 00:00:00 2001
From: kaiyuechen
Date: Thu, 15 Sep 2022 15:47:05 +0800
Subject: [PATCH 1/3] Add eviction watermark on node memory usage

---
 .../elastic-pod-qos.yaml                      | 13 ++++
 .../eviction-action.yaml                      |  9 +++
 .../evict-on-mem-usage-total/pod.yaml         | 56 +++++++++++++++
 .../evict-on-mem-usage-total/waterline.yaml   | 18 +++++
 .../collector/cadvisor/cadvisor_linux.go      |  5 +-
 pkg/ensurance/executor/mem_usage.go           | 70 +++++++++++++++++++
 pkg/ensurance/executor/podinfo/pod_info.go    | 12 +++-
 pkg/ensurance/executor/sort/cpu_usage_sort.go | 10 +--
 .../executor/sort/cpu_usage_sort_test.go      | 14 ++--
 .../executor/sort/mem_metrics_sort.go         |  9 ---
 pkg/ensurance/executor/sort/mem_usage_sort.go | 36 ++++++++++
 11 files changed, 227 insertions(+), 25 deletions(-)
 create mode 100644 examples/ensurance/evict-on-mem-usage-total/elastic-pod-qos.yaml
 create mode 100644 examples/ensurance/evict-on-mem-usage-total/eviction-action.yaml
 create mode 100644 examples/ensurance/evict-on-mem-usage-total/pod.yaml
 create mode 100644 examples/ensurance/evict-on-mem-usage-total/waterline.yaml
 create mode 100644 pkg/ensurance/executor/mem_usage.go
 delete mode 100644 pkg/ensurance/executor/sort/mem_metrics_sort.go
 create mode 100644 pkg/ensurance/executor/sort/mem_usage_sort.go

diff --git a/examples/ensurance/evict-on-mem-usage-total/elastic-pod-qos.yaml b/examples/ensurance/evict-on-mem-usage-total/elastic-pod-qos.yaml
new file mode 100644
index 000000000..29ca6bcd9
--- /dev/null
+++ b/examples/ensurance/evict-on-mem-usage-total/elastic-pod-qos.yaml
@@ -0,0 +1,13 @@
+apiVersion: ensurance.crane.io/v1alpha1
+kind: PodQOS
+metadata:
+  name: all-elastic-pods
+spec:
+  allowedActions:
+    - eviction
+  resourceQOS:
+    cpuQOS:
+      cpuPriority: 7
+  labelSelector:
+    matchLabels:
+      preemptible_job: "true"
\ No newline at end of file
diff --git a/examples/ensurance/evict-on-mem-usage-total/eviction-action.yaml b/examples/ensurance/evict-on-mem-usage-total/eviction-action.yaml
new file mode 100644
index 000000000..c9e715fec
--- /dev/null
+++ b/examples/ensurance/evict-on-mem-usage-total/eviction-action.yaml
@@ -0,0 +1,9 @@
+apiVersion: ensurance.crane.io/v1alpha1
+kind: AvoidanceAction
+metadata:
+  name: eviction
+spec:
+  coolDownSeconds: 300
+  description: evict low priority pods
+  eviction:
+    terminationGracePeriodSeconds: 30
\ No newline at end of file
diff --git a/examples/ensurance/evict-on-mem-usage-total/pod.yaml b/examples/ensurance/evict-on-mem-usage-total/pod.yaml
new file mode 100644
index 000000000..6b1c81e58
--- /dev/null
+++ b/examples/ensurance/evict-on-mem-usage-total/pod.yaml
@@ -0,0 +1,56 @@
+apiVersion: v1
+kind: Pod
+metadata:
+  name: low-pi6
+  labels:
+    k8s-app: low
+    preemptible_job: "true"
+spec:
+  containers:
+    - image: ccr.ccs.tencentyun.com/tkeimages/stress-ng:v0.12.09
+      command:
+        - stress-ng
+        - --vm-hang
+        - "3600"
+        - --vm
+        - "2"
+        - --vm-bytes
+        - "2G"
+      name: stress
+      volumeMounts:
+        - mountPath: /data
+          name: data
+  volumes:
+    - hostPath:
+        path: /data/dd
+        type: DirectoryOrCreate
+      name: data
+---
+apiVersion: v1
+kind: Pod
+metadata:
+  name: low-pi2
+  labels:
+    k8s-app: low
+    preemptible_job: "true"
+spec:
+  containers:
+    - image: ccr.ccs.tencentyun.com/tkeimages/stress-ng:v0.12.09
+      command:
+        - stress-ng
+        - --vm-hang
+        - "3600"
+        - --vm
+        - "2"
+        - --vm-bytes
+        - "3.5G"
+      name: stress
+      volumeMounts:
+        - mountPath: /data
+          name: data
+  volumes:
+    - hostPath:
+        path: /data/dd
+        type: DirectoryOrCreate
+      name: data
+
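A note on the example workloads: the two stress pods above pin roughly 2G + 3.5G = 5.5G of memory between them, which is more than the 5000000000-byte (~5 GB) memory_total_usage watermark declared in waterline.yaml below, so the eviction rule has real pressure to act on once both pods are running.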
diff --git a/examples/ensurance/evict-on-mem-usage-total/waterline.yaml b/examples/ensurance/evict-on-mem-usage-total/waterline.yaml
new file mode 100644
index 000000000..ce7fe777c
--- /dev/null
+++ b/examples/ensurance/evict-on-mem-usage-total/waterline.yaml
@@ -0,0 +1,18 @@
+apiVersion: ensurance.crane.io/v1alpha1
+kind: NodeQOS
+metadata:
+  name: eviction-on-high-cpu-usage-percent
+spec:
+  nodeQualityProbe:
+    nodeLocalGet:
+      localCacheTTLSeconds: 60
+    timeoutSeconds: 10
+  rules:
+    - actionName: eviction
+      avoidanceThreshold: 2
+      metricRule:
+        name: memory_total_usage
+        value: 5000000000
+      name: cpu-usage-percent
+      restoreThreshold: 2
+      strategy: None
\ No newline at end of file
diff --git a/pkg/ensurance/collector/cadvisor/cadvisor_linux.go b/pkg/ensurance/collector/cadvisor/cadvisor_linux.go
index 30ee23966..d8c2c36ed 100644
--- a/pkg/ensurance/collector/cadvisor/cadvisor_linux.go
+++ b/pkg/ensurance/collector/cadvisor/cadvisor_linux.go
@@ -159,11 +159,12 @@ func (c *CadvisorCollector) Collect() (map[string][]common.TimeSeries, error) {
 		}
 
 		if hasExtMemRes {
-			extResMemUse += float64(v.Stats[0].Memory.Usage)
+			extResMemUse += float64(v.Stats[0].Memory.WorkingSet)
 		}
 
 		var containerLabels = GetContainerLabels(pod, containerId, containerName, hasExtCpuRes)
-		addSampleToStateMap(types.MetricNameContainerMemTotalUsage, composeSample(containerLabels, float64(v.Stats[0].Memory.Usage), now), stateMap)
+		addSampleToStateMap(types.MetricNameContainerMemTotalUsage, composeSample(containerLabels, float64(v.Stats[0].Memory.WorkingSet), now), stateMap)
+		klog.V(6).Infof("Pod: %s, containerName: %s, key %s, container_mem_total_usage %#v", klog.KObj(pod), containerName, key, float64(v.Stats[0].Memory.WorkingSet))
 
 		if state, ok := c.latestContainersStates[key]; ok {
 			klog.V(6).Infof("For key %s, LatestContainersStates exist", key)
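The switch from Memory.Usage to Memory.WorkingSet above is the substantive change in this hunk: cadvisor's usage figure includes reclaimable page cache, while the working set subtracts inactive file-backed pages, so watermark decisions track real memory pressure rather than cache. A minimal sketch of the relationship, assuming cadvisor's definition (working set = usage minus inactive file pages, floored at zero):

package main

import "fmt"

// workingSet mirrors the derivation cadvisor uses: drop inactive
// file-backed pages from total usage and floor the result at zero.
func workingSet(usageBytes, inactiveFileBytes uint64) uint64 {
	if inactiveFileBytes > usageBytes {
		return 0
	}
	return usageBytes - inactiveFileBytes
}

func main() {
	// 4 GiB charged to the cgroup, 1.5 GiB of it reclaimable page cache:
	// only ~2.5 GiB counts toward the eviction watermark.
	fmt.Println(workingSet(4<<30, 3<<29)) // 2684354560
}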
diff --git a/pkg/ensurance/executor/mem_usage.go b/pkg/ensurance/executor/mem_usage.go
new file mode 100644
index 000000000..af6c6e5c2
--- /dev/null
+++ b/pkg/ensurance/executor/mem_usage.go
@@ -0,0 +1,70 @@
+package executor
+
+import (
+	"sync"
+
+	"k8s.io/klog/v2"
+
+	"github.com/gocrane/crane/pkg/ensurance/executor/podinfo"
+	"github.com/gocrane/crane/pkg/ensurance/executor/sort"
+	"github.com/gocrane/crane/pkg/metrics"
+	"github.com/gocrane/crane/pkg/utils"
+)
+
+func init() {
+	registerMetricMap(memUsage)
+}
+
+var memUsage = metric{
+	Name:           MemUsage,
+	ActionPriority: 5,
+	Sortable:       true,
+	SortFunc:       sort.MemUsageSort,
+
+	Throttleable:       false,
+	ThrottleQuantified: false,
+	ThrottleFunc:       nil,
+	RestoreFunc:        nil,
+
+	Evictable:       true,
+	EvictQuantified: true,
+	EvictFunc:       memUsageEvictPod,
+}
+
+func memUsageEvictPod(wg *sync.WaitGroup, ctx *ExecuteContext, index int, totalReleasedResource *ReleaseResource, EvictPods EvictPods) (errPodKeys []string, released ReleaseResource) {
+	wg.Add(1)
+
+	// Calculate release resources
+	released = releaseMemUsage(EvictPods[index])
+	totalReleasedResource.Add(released)
+
+	go func(evictPod podinfo.PodContext) {
+		defer wg.Done()
+
+		pod, err := ctx.PodLister.Pods(evictPod.Key.Namespace).Get(evictPod.Key.Name)
+		if err != nil {
+			errPodKeys = append(errPodKeys, "not found ", evictPod.Key.String())
+			return
+		}
+		klog.Warningf("Evicting pod %v", evictPod.Key)
+		err = utils.EvictPodWithGracePeriod(ctx.Client, pod, evictPod.DeletionGracePeriodSeconds)
+		if err != nil {
+			errPodKeys = append(errPodKeys, "evict failed ", evictPod.Key.String())
+			klog.Warningf("Failed to evict pod %s: %v", evictPod.Key.String(), err)
+			return
+		}
+		metrics.ExecutorEvictCountsInc()
+
+		klog.Warningf("Pod %s is evicted", klog.KObj(pod))
+	}(EvictPods[index])
+	return
+}
+
+func releaseMemUsage(pod podinfo.PodContext) ReleaseResource {
+	if pod.ActionType == podinfo.Evict {
+		return ReleaseResource{
+			MemUsage: pod.PodMemUsage,
+		}
+	}
+	return ReleaseResource{}
+}
diff --git a/pkg/ensurance/executor/podinfo/pod_info.go b/pkg/ensurance/executor/podinfo/pod_info.go
index 011e8fae0..e46bdca6e 100644
--- a/pkg/ensurance/executor/podinfo/pod_info.go
+++ b/pkg/ensurance/executor/podinfo/pod_info.go
@@ -85,10 +85,14 @@ type PodContext struct {
 	StartTime                  *metav1.Time
 	DeletionGracePeriodSeconds *int32
 
-	ElasticCPU int64
+	ElasticCPULimit int64
+	ElasticMemLimit int64
+
 	PodCPUUsage, PodCPUShare, PodCPUQuota, PodCPUPeriod float64
 	ContainerCPUUsages, ContainerCPUShares, ContainerCPUQuotas, ContainerCPUPeriods []ContainerState
 
+	PodMemUsage float64
+
 	ActionType  ActionType
 	CPUThrottle CPURatio
 	Executed    bool
@@ -124,7 +128,11 @@ func BuildPodActionContext(pod *v1.Pod, stateMap map[string][]common.TimeSeries,
 	podContext.PodCPUShare, podContext.ContainerCPUShares = GetPodUsage(string(stypes.MetricNameContainerCpuLimit), stateMap, pod)
 	podContext.PodCPUQuota, podContext.ContainerCPUQuotas = GetPodUsage(string(stypes.MetricNameContainerCpuQuota), stateMap, pod)
 	podContext.PodCPUPeriod, podContext.ContainerCPUPeriods = GetPodUsage(string(stypes.MetricNameContainerCpuPeriod), stateMap, pod)
-	podContext.ElasticCPU = utils.GetElasticResourceLimit(pod, v1.ResourceCPU)
+	podContext.ElasticCPULimit = utils.GetElasticResourceLimit(pod, v1.ResourceCPU)
+
+	podContext.ElasticMemLimit = utils.GetElasticResourceLimit(pod, v1.ResourceMemory)
+	podContext.PodMemUsage, _ = GetPodUsage(string(stypes.MetricNameContainerMemTotalUsage), stateMap, pod)
+
 	podContext.StartTime = pod.Status.StartTime
 
 	if action.Spec.Throttle != nil {
diff --git a/pkg/ensurance/executor/sort/cpu_usage_sort.go b/pkg/ensurance/executor/sort/cpu_usage_sort.go
index ef840db64..da68a8a72 100644
--- a/pkg/ensurance/executor/sort/cpu_usage_sort.go
+++ b/pkg/ensurance/executor/sort/cpu_usage_sort.go
@@ -13,12 +13,12 @@ func CpuUsageSort(pods []podinfo.PodContext) {
 // CompareElasticCPU compares the partition of extcpu usage to extcpu limit
 func CompareElasticCPU(p1, p2 podinfo.PodContext) int32 {
 	// if both pod don't use ext resource, then return
-	if p1.ElasticCPU == 0 && p2.ElasticCPU == 0 {
+	if p1.ElasticCPULimit == 0 && p2.ElasticCPULimit == 0 {
 		return 0
 	}
 
-	p1Ratio := p1.PodCPUUsage / float64(p1.ElasticCPU)
-	p2Ratio := p2.PodCPUUsage / float64(p2.ElasticCPU)
+	p1Ratio := p1.PodCPUUsage / float64(p1.ElasticCPULimit)
+	p2Ratio := p2.PodCPUUsage / float64(p2.ElasticCPULimit)
 
 	return utils.CmpFloat(p1Ratio, p2Ratio)
 }
@@ -30,8 +30,8 @@ func CompareCPUUsage(p1, p2 podinfo.PodContext) int32 {
 
 // UseElasticCPU compares pod by using ext resource whether
 func UseElasticCPU(p1, p2 podinfo.PodContext) int32 {
-	use1 := utils.Bool2Uint(p1.ElasticCPU != 0)
-	use2 := utils.Bool2Uint(p2.ElasticCPU != 0)
+	use1 := utils.Bool2Uint(p1.ElasticCPULimit != 0)
+	use2 := utils.Bool2Uint(p2.ElasticCPULimit != 0)
 
 	return int32(use2 - use1)
 }
diff --git a/pkg/ensurance/executor/sort/cpu_usage_sort_test.go b/pkg/ensurance/executor/sort/cpu_usage_sort_test.go
index be113c1bd..c1914b8b0 100644
--- a/pkg/ensurance/executor/sort/cpu_usage_sort_test.go
+++ b/pkg/ensurance/executor/sort/cpu_usage_sort_test.go
@@ -17,14 +17,14 @@ func TestCpuUsageSorter(t *testing.T) {
 	// orderedBy(UseElasticCPU, ComparePodQOSClass, ComparePriority, CompareCPUUsage, CompareElasticCPU, CompareRunningTime).Sort(pods)
 	pods := []podinfo.PodContext{
 		{
-			Key:        types.NamespacedName{Name: "elastic-cpu-2"},
-			ElasticCPU: 2,
-			QOSClass:   v1.PodQOSBestEffort,
+			Key:             types.NamespacedName{Name: "elastic-cpu-2"},
+			ElasticCPULimit: 2,
+			QOSClass:        v1.PodQOSBestEffort,
 		},
 		{
-			Key:        types.NamespacedName{Name: "elastic-cpu-4"},
-			ElasticCPU: 4,
-			QOSClass:   v1.PodQOSBestEffort,
+			Key:             types.NamespacedName{Name: "elastic-cpu-4"},
+			ElasticCPULimit: 4,
+			QOSClass:        v1.PodQOSBestEffort,
 		},
 		{
 			Key: types.NamespacedName{Name: "cpu-1"},
@@ -77,6 +77,6 @@ func TestCpuUsageSorter(t *testing.T) {
 	CpuUsageSort(pods)
 	t.Logf("sorted pods:")
 	for _, p := range pods {
-		t.Logf("key %s, useElasticCPU %v, qosClass %s, priority %d, usage %f, elasticCPUUsage %d, startTime %v", p.Key, (p.ElasticCPU != 0), p.QOSClass, p.Priority, p.PodCPUUsage, p.ElasticCPU, p.StartTime)
+		t.Logf("key %s, useElasticCPU %v, qosClass %s, priority %d, usage %f, elasticCPUUsage %d, startTime %v", p.Key, (p.ElasticCPULimit != 0), p.QOSClass, p.Priority, p.PodCPUUsage, p.ElasticCPULimit, p.StartTime)
 	}
 }
diff --git a/pkg/ensurance/executor/sort/mem_metrics_sort.go b/pkg/ensurance/executor/sort/mem_metrics_sort.go
deleted file mode 100644
index 9d1ae74eb..000000000
--- a/pkg/ensurance/executor/sort/mem_metrics_sort.go
+++ /dev/null
@@ -1,9 +0,0 @@
-package sort
-
-import "github.com/gocrane/crane/pkg/ensurance/executor/podinfo"
-
-// Todo: Memory metrics related sort func need to be filled
-
-func MemMetricsSorter(pods []podinfo.PodContext) {
-	orderedBy(ComparePriority, ComparePodQOSClass, CompareRunningTime).Sort(pods)
-}
diff --git a/pkg/ensurance/executor/sort/mem_usage_sort.go b/pkg/ensurance/executor/sort/mem_usage_sort.go
new file mode 100644
index 000000000..eb377e980
--- /dev/null
+++ b/pkg/ensurance/executor/sort/mem_usage_sort.go
@@ -0,0 +1,36 @@
+package sort
+
+import (
+	"github.com/gocrane/crane/pkg/ensurance/executor/podinfo"
+	"github.com/gocrane/crane/pkg/utils"
+)
+
+func MemUsageSort(pods []podinfo.PodContext) {
+	orderedBy(UseElasticMem, ComparePriority, ComparePodQOSClass, CompareMemUsage, CompareElasticMem, CompareRunningTime).Sort(pods)
+}
+
+// UseElasticMem sorts pods that use elastic memory resources ahead of those that do not
+func UseElasticMem(p1, p2 podinfo.PodContext) int32 {
+	use1 := utils.Bool2Uint(p1.ElasticMemLimit != 0)
+	use2 := utils.Bool2Uint(p2.ElasticMemLimit != 0)
+
+	return int32(use2 - use1)
+}
+
+// CompareMemUsage compares pods by absolute memory usage
+func CompareMemUsage(p1, p2 podinfo.PodContext) int32 {
+	return utils.CmpFloat(p2.PodMemUsage, p1.PodMemUsage)
+}
+
+// CompareElasticMem compares the ratio of elastic memory usage to the elastic memory limit
+func CompareElasticMem(p1, p2 podinfo.PodContext) int32 {
+	// if neither pod uses elastic memory, this key is a tie
+	if p1.ElasticMemLimit == 0 && p2.ElasticMemLimit == 0 {
+		return 0
+	}
+
+	p1Ratio := p1.PodMemUsage / float64(p1.ElasticMemLimit)
+	p2Ratio := p2.PodMemUsage / float64(p2.ElasticMemLimit)
+
+	return utils.CmpFloat(p1Ratio, p2Ratio)
+}
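For readers new to this sort package: orderedBy chains comparators so that a later key is consulted only when every earlier key reports a tie, which is presumably why MemUsageSort puts UseElasticMem first and CompareRunningTime last. Note also that MemUsageSort consults ComparePriority before ComparePodQOSClass, the reverse of CpuUsageSort's order; since tie-break order decides which pod is evicted first, that divergence deserves a second look. A self-contained sketch of the same multi-key pattern (sortBy and the pod struct are illustrative stand-ins, not the crane implementation):

package main

import (
	"fmt"
	"sort"
)

type pod struct {
	name     string
	priority int
	memBytes float64
}

// cmp returns negative, zero, or positive; in this sketch, positive
// means the first pod sorts earlier.
type cmp func(a, b pod) int

// sortBy applies each comparator in order; a later key is consulted only
// when every earlier key reports a tie.
func sortBy(pods []pod, keys ...cmp) {
	sort.SliceStable(pods, func(i, j int) bool {
		for _, k := range keys {
			if r := k(pods[i], pods[j]); r != 0 {
				return r > 0
			}
		}
		return false
	})
}

func main() {
	pods := []pod{{"a", 1, 2e9}, {"b", 0, 3e9}, {"c", 0, 1e9}}
	// Lower priority sorts first (cheaper to evict) ...
	byPriority := func(a, b pod) int { return b.priority - a.priority }
	// ... then higher memory usage first among equals.
	byMem := func(a, b pod) int {
		switch {
		case a.memBytes > b.memBytes:
			return 1
		case a.memBytes < b.memBytes:
			return -1
		}
		return 0
	}
	sortBy(pods, byPriority, byMem)
	fmt.Println(pods) // [{b 0 3e+09} {c 0 1e+09} {a 1 2e+09}]
}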
From f388a54a4097c7ee82e13a1942751ec49bb2103d Mon Sep 17 00:00:00 2001
From: kaiyuechen
Date: Thu, 15 Sep 2022 18:41:42 +0800
Subject: [PATCH 2/3] Add eviction watermark on node memory utilization percent

---
 .../{waterline.yaml => watermark.yaml}        |  0
 .../elastic-pod-qos.yaml                      | 13 ++++
 .../eviction-action.yaml                      |  9 +++
 .../evict-on-mem-usage-percent/pod.yaml       | 56 +++++++++++++++
 .../evict-on-mem-usage-percent/watermark.yaml | 18 +++++
 .../{waterline.yaml => watermark.yaml}        |  2 +-
 pkg/ensurance/collector/nodelocal/memory.go   |  1 +
 pkg/ensurance/collector/types/types.go        |  1 +
 pkg/ensurance/executor/mem_usage_percent.go   | 70 +++++++++++++++
 pkg/ensurance/executor/watermark.go           | 13 +++-
 10 files changed, 181 insertions(+), 2 deletions(-)
 rename examples/ensurance/evict-on-cpu-usage-percent/{waterline.yaml => watermark.yaml} (100%)
 create mode 100644 examples/ensurance/evict-on-mem-usage-percent/elastic-pod-qos.yaml
 create mode 100644 examples/ensurance/evict-on-mem-usage-percent/eviction-action.yaml
 create mode 100644 examples/ensurance/evict-on-mem-usage-percent/pod.yaml
 create mode 100644 examples/ensurance/evict-on-mem-usage-percent/watermark.yaml
 rename examples/ensurance/evict-on-mem-usage-total/{waterline.yaml => watermark.yaml} (89%)
 create mode 100644 pkg/ensurance/executor/mem_usage_percent.go

diff --git a/examples/ensurance/evict-on-cpu-usage-percent/waterline.yaml b/examples/ensurance/evict-on-cpu-usage-percent/watermark.yaml
similarity index 100%
rename from examples/ensurance/evict-on-cpu-usage-percent/waterline.yaml
rename to examples/ensurance/evict-on-cpu-usage-percent/watermark.yaml
diff --git a/examples/ensurance/evict-on-mem-usage-percent/elastic-pod-qos.yaml b/examples/ensurance/evict-on-mem-usage-percent/elastic-pod-qos.yaml
new file mode 100644
index 000000000..29ca6bcd9
--- /dev/null
+++ b/examples/ensurance/evict-on-mem-usage-percent/elastic-pod-qos.yaml
@@ -0,0 +1,13 @@
+apiVersion: ensurance.crane.io/v1alpha1
+kind: PodQOS
+metadata:
+  name: all-elastic-pods
+spec:
+  allowedActions:
+    - eviction
+  resourceQOS:
+    cpuQOS:
+      cpuPriority: 7
+  labelSelector:
+    matchLabels:
+      preemptible_job: "true"
\ No newline at end of file
diff --git a/examples/ensurance/evict-on-mem-usage-percent/eviction-action.yaml b/examples/ensurance/evict-on-mem-usage-percent/eviction-action.yaml
new file mode 100644
index 000000000..c9e715fec
--- /dev/null
+++ b/examples/ensurance/evict-on-mem-usage-percent/eviction-action.yaml
@@ -0,0 +1,9 @@
+apiVersion: ensurance.crane.io/v1alpha1
+kind: AvoidanceAction
+metadata:
+  name: eviction
+spec:
+  coolDownSeconds: 300
+  description: evict low priority pods
+  eviction:
+    terminationGracePeriodSeconds: 30
\ No newline at end of file
diff --git a/examples/ensurance/evict-on-mem-usage-percent/pod.yaml b/examples/ensurance/evict-on-mem-usage-percent/pod.yaml
new file mode 100644
index 000000000..6b1c81e58
--- /dev/null
+++ b/examples/ensurance/evict-on-mem-usage-percent/pod.yaml
@@ -0,0 +1,56 @@
+apiVersion: v1
+kind: Pod
+metadata:
+  name: low-pi6
+  labels:
+    k8s-app: low
+    preemptible_job: "true"
+spec:
+  containers:
+    - image: ccr.ccs.tencentyun.com/tkeimages/stress-ng:v0.12.09
+      command:
+        - stress-ng
+        - --vm-hang
+        - "3600"
+        - --vm
+        - "2"
+        - --vm-bytes
+        - "2G"
+      name: stress
+      volumeMounts:
+        - mountPath: /data
+          name: data
+  volumes:
+    - hostPath:
+        path: /data/dd
+        type: DirectoryOrCreate
+      name: data
+---
+apiVersion: v1
+kind: Pod
+metadata:
+  name: low-pi2
+  labels:
+    k8s-app: low
+    preemptible_job: "true"
+spec:
+  containers:
+    - image: ccr.ccs.tencentyun.com/tkeimages/stress-ng:v0.12.09
+      command:
+        - stress-ng
+        - --vm-hang
+        - "3600"
+        - --vm
+        - "2"
+        - --vm-bytes
+        - "3.5G"
+      name: stress
+      volumeMounts:
+        - mountPath: /data
+          name: data
+  volumes:
+    - hostPath:
+        path: /data/dd
+        type: DirectoryOrCreate
+      name: data
+
diff --git a/examples/ensurance/evict-on-mem-usage-percent/watermark.yaml b/examples/ensurance/evict-on-mem-usage-percent/watermark.yaml
new file mode 100644
index 000000000..a519b89c1
--- /dev/null
+++ b/examples/ensurance/evict-on-mem-usage-percent/watermark.yaml
@@ -0,0 +1,18 @@
+apiVersion: ensurance.crane.io/v1alpha1
+kind: NodeQOS
+metadata:
+  name: eviction-on-high-mem-usage-percent
+spec:
+  nodeQualityProbe:
+    nodeLocalGet:
+      localCacheTTLSeconds: 60
+    timeoutSeconds: 10
+  rules:
+    - actionName: eviction
+      avoidanceThreshold: 2
+      metricRule:
+        name: memory_total_utilization
+        value: 50
+      name: cpu-usage-percent
+      restoreThreshold: 2
+      strategy: None
\ No newline at end of file
diff --git a/examples/ensurance/evict-on-mem-usage-total/waterline.yaml b/examples/ensurance/evict-on-mem-usage-total/watermark.yaml
similarity index 89%
rename from examples/ensurance/evict-on-mem-usage-total/waterline.yaml
rename to examples/ensurance/evict-on-mem-usage-total/watermark.yaml
index ce7fe777c..85ae98d2b 100644
--- a/examples/ensurance/evict-on-mem-usage-total/waterline.yaml
+++ b/examples/ensurance/evict-on-mem-usage-total/watermark.yaml
@@ -1,7 +1,7 @@
 apiVersion: ensurance.crane.io/v1alpha1
 kind: NodeQOS
 metadata:
-  name: eviction-on-high-cpu-usage-percent
+  name: eviction-on-high-mem-usage
 spec:
   nodeQualityProbe:
     nodeLocalGet:
diff --git a/pkg/ensurance/collector/nodelocal/memory.go b/pkg/ensurance/collector/nodelocal/memory.go
index 177a4b02f..aa2fc7c99 100644
--- a/pkg/ensurance/collector/nodelocal/memory.go
+++ b/pkg/ensurance/collector/nodelocal/memory.go
@@ -39,6 +39,7 @@ func collectMemory(_ *nodeLocalContext) (map[string][]common.TimeSeries, error)
 	var data = make(map[string][]common.TimeSeries, 2)
 	data[string(types.MetricNameMemoryTotalUsage)] = []common.TimeSeries{{Samples: []common.Sample{{Value: float64(usage), Timestamp: now.Unix()}}}}
 	data[string(types.MetricNameMemoryTotalUtilization)] = []common.TimeSeries{{Samples: []common.Sample{{Value: usagePercent, Timestamp: now.Unix()}}}}
+	data[string(types.MetricNameMemoryTotal)] = []common.TimeSeries{{Samples: []common.Sample{{Value: float64(stat.Total), Timestamp: now.Unix()}}}}
 
 	return data, nil
 }
diff --git a/pkg/ensurance/collector/types/types.go b/pkg/ensurance/collector/types/types.go
index 0e0cb70e2..1e5025b89 100644
--- a/pkg/ensurance/collector/types/types.go
+++ b/pkg/ensurance/collector/types/types.go
@@ -25,6 +25,7 @@ const (
 
 	MetricNameMemoryTotalUsage       MetricName = "memory_total_usage"
 	MetricNameMemoryTotalUtilization MetricName = "memory_total_utilization"
+	MetricNameMemoryTotal            MetricName = "memory_total"
 
 	MetricDiskReadKiBPS  MetricName = "disk_read_kibps"
 	MetricDiskWriteKiBPS MetricName = "disk_write_kibps"
diff --git a/pkg/ensurance/executor/mem_usage_percent.go b/pkg/ensurance/executor/mem_usage_percent.go
new file mode 100644
index 000000000..cd0d92a2a
--- /dev/null
+++ b/pkg/ensurance/executor/mem_usage_percent.go
@@ -0,0 +1,70 @@
+package executor
+
+import (
+	"sync"
+
+	"k8s.io/klog/v2"
+
+	"github.com/gocrane/crane/pkg/ensurance/executor/podinfo"
+	"github.com/gocrane/crane/pkg/ensurance/executor/sort"
+	"github.com/gocrane/crane/pkg/metrics"
+	"github.com/gocrane/crane/pkg/utils"
+)
+
+func init() {
+	registerMetricMap(memUsagePercent)
+}
+
+var memUsagePercent = metric{
+	Name:           MemUsagePercent,
+	ActionPriority: 5,
+	Sortable:       true,
+	SortFunc:       sort.MemUsageSort,
+
+	Throttleable:       false,
+	ThrottleQuantified: false,
+	ThrottleFunc:       nil,
+	RestoreFunc:        nil,
+
+	Evictable:       true,
+	EvictQuantified: true,
+	EvictFunc:       memUsagePercentEvictPod,
+}
+
+func memUsagePercentEvictPod(wg *sync.WaitGroup, ctx *ExecuteContext, index int, totalReleasedResource *ReleaseResource, EvictPods EvictPods) (errPodKeys []string, released ReleaseResource) {
+	wg.Add(1)
+
+	// Calculate release resources
+	released = releaseMemUsagePercent(EvictPods[index])
+	totalReleasedResource.Add(released)
+
+	go func(evictPod podinfo.PodContext) {
+		defer wg.Done()
+
+		pod, err := ctx.PodLister.Pods(evictPod.Key.Namespace).Get(evictPod.Key.Name)
+		if err != nil {
+			errPodKeys = append(errPodKeys, "not found ", evictPod.Key.String())
+			return
+		}
+		klog.Warningf("Evicting pod %v", evictPod.Key)
+		err = utils.EvictPodWithGracePeriod(ctx.Client, pod, evictPod.DeletionGracePeriodSeconds)
+		if err != nil {
+			errPodKeys = append(errPodKeys, "evict failed ", evictPod.Key.String())
+			klog.Warningf("Failed to evict pod %s: %v", evictPod.Key.String(), err)
+			return
+		}
+		metrics.ExecutorEvictCountsInc()
+
+		klog.Warningf("Pod %s is evicted", klog.KObj(pod))
+	}(EvictPods[index])
+	return
+}
+
+func releaseMemUsagePercent(pod podinfo.PodContext) ReleaseResource {
+	if pod.ActionType == podinfo.Evict {
+		return ReleaseResource{
+			MemUsagePercent: pod.PodMemUsage,
+		}
+	}
+	return ReleaseResource{}
+}
diff --git a/pkg/ensurance/executor/watermark.go b/pkg/ensurance/executor/watermark.go
index cbafe8738..5200fb8ba 100644
--- a/pkg/ensurance/executor/watermark.go
+++ b/pkg/ensurance/executor/watermark.go
@@ -19,6 +19,7 @@ const (
 	CpuUsage        = WatermarkMetric(types.MetricNameCpuTotalUsage)
 	CpuUsagePercent = WatermarkMetric(types.MetricNameCpuTotalUtilization)
 	MemUsage        = WatermarkMetric(types.MetricNameMemoryTotalUsage)
+	MemUsagePercent = WatermarkMetric(types.MetricNameMemoryTotalUtilization)
 )
 
 const (
@@ -171,8 +172,18 @@ func calculateGaps(stateMap map[string][]common.TimeSeries,
 			} else {
 				cpuPercentToUsage := (1 + executeExcessPercent) * (maxUsed - float64(evictWatermark.PopSmallest().Value())) * cpuCoreNums[0].Samples[0].Value * 1000 / types.MaxPercentage
 				result[m.Name] = cpuPercentToUsage
-				klog.V(6).Infof("maxUsed is %f, watermark is %f, cpuPercentToUsageGap is %f", maxUsed, float64(evictWatermark.PopSmallest().Value()), cpuPercentToUsage)
+				klog.V(6).Infof("cpuPercent maxUsed is %f, watermark is %f, cpuPercentToUsageGap is %f", maxUsed, float64(evictWatermark.PopSmallest().Value()), cpuPercentToUsage)
 			}
+		} else if m.Name == MemUsagePercent {
+			totalMem, ok := stateMap[string(types.MetricNameMemoryTotal)]
+			if !ok {
+				klog.Warningf("Can't get MetricNameMemoryTotal")
+			} else {
+				memPercentToUsage := (1 + executeExcessPercent) * (maxUsed - float64(evictWatermark.PopSmallest().Value())) * totalMem[0].Samples[0].Value / types.MaxPercentage
+				result[m.Name] = memPercentToUsage
+				klog.V(6).Infof("memPercent maxUsed is %f, watermark is %f, memPercentToUsageGap is %f", maxUsed, float64(evictWatermark.PopSmallest().Value()), memPercentToUsage)
+			}
+
 		} else {
 			result[m.Name] = (1 + executeExcessPercent) * (maxUsed - float64(evictWatermark.PopSmallest().Value()))
 		}
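The new MemUsagePercent branch above converts a utilization-percent gap into bytes before storing it in the result map, so everything downstream of calculateGaps, including ReleaseResource.MemUsagePercent (which releaseMemUsagePercent feeds with PodMemUsage), stays in bytes. A worked example under assumed numbers (the constants are hypothetical, chosen only to exercise the formula):

package main

import "fmt"

func main() {
	// gap = (1 + excess) * (maxUsedPercent - watermarkPercent) * totalMemBytes / 100,
	// mirroring memPercentToUsage in calculateGaps.
	const (
		excess           = 0.1                // executeExcessPercent
		maxUsedPercent   = 60.0               // observed memory_total_utilization
		watermarkPercent = 50.0               // NodeQOS metricRule value
		totalMemBytes    = float64(8 << 30)   // 8 GiB node
	)
	gap := (1 + excess) * (maxUsedPercent - watermarkPercent) * totalMemBytes / 100
	fmt.Printf("release %.0f bytes (~%.0f MB)\n", gap, gap/1e6) // release 944892605 bytes (~945 MB)
}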
From 042d169545fa8462f93dbfbeacb5fe6b0b824fb5 Mon Sep 17 00:00:00 2001
From: kaiyuechen
Date: Sat, 8 Oct 2022 19:56:06 +0800
Subject: [PATCH 3/3] Fix watermark examples and code for cpu/mem usage percent

---
 .../elastic-pod-qos.yaml                      |  3 -
 .../evict-on-cpu-usage-percent/pod.yaml       |  4 +-
 .../evict-on-cpu-usage-total/be-rules.yaml    |  5 --
 .../elastic-pod-qos.yaml                      |  3 -
 .../evict-on-mem-usage-percent/pod.yaml       | 22 +----
 .../evict-on-mem-usage-percent/watermark.yaml |  2 +-
 .../elastic-pod-qos.yaml                      |  3 -
 .../evict-on-mem-usage-total/pod.yaml         | 22 +----
 .../evict-on-mem-usage-total/watermark.yaml   |  4 +-
 .../collector/cadvisor/cadvisor_linux.go      |  3 +-
 .../executor/sort/mem_usage_sort_test.go      | 82 +++++++++++++++++++
 pkg/utils/pod.go                              |  4 +-
 12 files changed, 97 insertions(+), 60 deletions(-)
 create mode 100644 pkg/ensurance/executor/sort/mem_usage_sort_test.go

diff --git a/examples/ensurance/evict-on-cpu-usage-percent/elastic-pod-qos.yaml b/examples/ensurance/evict-on-cpu-usage-percent/elastic-pod-qos.yaml
index 29ca6bcd9..0901404fb 100644
--- a/examples/ensurance/evict-on-cpu-usage-percent/elastic-pod-qos.yaml
+++ b/examples/ensurance/evict-on-cpu-usage-percent/elastic-pod-qos.yaml
@@ -5,9 +5,6 @@ metadata:
 spec:
   allowedActions:
     - eviction
-  resourceQOS:
-    cpuQOS:
-      cpuPriority: 7
   labelSelector:
     matchLabels:
       preemptible_job: "true"
\ No newline at end of file
diff --git a/examples/ensurance/evict-on-cpu-usage-percent/pod.yaml b/examples/ensurance/evict-on-cpu-usage-percent/pod.yaml
index ef73009c0..c47082f6e 100644
--- a/examples/ensurance/evict-on-cpu-usage-percent/pod.yaml
+++ b/examples/ensurance/evict-on-cpu-usage-percent/pod.yaml
@@ -8,12 +8,12 @@ spec:
   containers:
     - command:
-        - stress-nga
+        - stress-ng
         - -c
        - "2"
         - --cpu-method
         - cpuid
-      image: ccr.ccs.tencentyun.com/tkeimages/stress-ng:v0.12.09.ln
+      image: docker.io/gocrane/stress-ng:v0.12.09
      imagePullPolicy: IfNotPresent
       name: low
       resources:
diff --git a/examples/ensurance/evict-on-cpu-usage-total/be-rules.yaml b/examples/ensurance/evict-on-cpu-usage-total/be-rules.yaml
index a03f0f2e6..16313a9a3 100644
--- a/examples/ensurance/evict-on-cpu-usage-total/be-rules.yaml
+++ b/examples/ensurance/evict-on-cpu-usage-total/be-rules.yaml
@@ -5,11 +5,6 @@ metadata:
 spec:
   allowedActions:
     - eviction
-  resourceQOS:
-    cpuQOS:
-      cpuPriority: 7
-    htIsolation:
-      enable: false
   scopeSelector:
     matchExpressions:
       - operator: In
diff --git a/examples/ensurance/evict-on-mem-usage-percent/elastic-pod-qos.yaml b/examples/ensurance/evict-on-mem-usage-percent/elastic-pod-qos.yaml
index 29ca6bcd9..0901404fb 100644
--- a/examples/ensurance/evict-on-mem-usage-percent/elastic-pod-qos.yaml
+++ b/examples/ensurance/evict-on-mem-usage-percent/elastic-pod-qos.yaml
@@ -5,9 +5,6 @@ metadata:
 spec:
   allowedActions:
     - eviction
-  resourceQOS:
-    cpuQOS:
-      cpuPriority: 7
   labelSelector:
     matchLabels:
       preemptible_job: "true"
\ No newline at end of file
diff --git a/examples/ensurance/evict-on-mem-usage-percent/pod.yaml b/examples/ensurance/evict-on-mem-usage-percent/pod.yaml
index 6b1c81e58..58e201595 100644
--- a/examples/ensurance/evict-on-mem-usage-percent/pod.yaml
+++ b/examples/ensurance/evict-on-mem-usage-percent/pod.yaml
@@ -1,13 +1,13 @@
 apiVersion: v1
 kind: Pod
 metadata:
-  name: low-pi6
+  name: low-pi1
   labels:
     k8s-app: low
     preemptible_job: "true"
 spec:
   containers:
-    - image: ccr.ccs.tencentyun.com/tkeimages/stress-ng:v0.12.09
+    - image: docker.io/gocrane/stress-ng:v0.12.09
       command:
         - stress-ng
         - --vm-hang
@@ -17,14 +17,6 @@ spec:
         - --vm-bytes
         - "2G"
       name: stress
-      volumeMounts:
-        - mountPath: /data
-          name: data
-  volumes:
-    - hostPath:
-        path: /data/dd
-        type: DirectoryOrCreate
-      name: data
 ---
 apiVersion: v1
 kind: Pod
@@ -35,7 +27,7 @@ metadata:
     preemptible_job: "true"
 spec:
   containers:
-    - image: ccr.ccs.tencentyun.com/tkeimages/stress-ng:v0.12.09
+    - image: docker.io/gocrane/stress-ng:v0.12.09
       command:
         - stress-ng
        - --vm-hang
@@ -45,12 +37,4 @@ spec:
         - --vm-bytes
         - "3.5G"
       name: stress
-      volumeMounts:
-        - mountPath: /data
-          name: data
-  volumes:
-    - hostPath:
-        path: /data/dd
-        type: DirectoryOrCreate
-      name: data
 
diff --git a/examples/ensurance/evict-on-mem-usage-percent/watermark.yaml b/examples/ensurance/evict-on-mem-usage-percent/watermark.yaml
index a519b89c1..71b78be0f 100644
--- a/examples/ensurance/evict-on-mem-usage-percent/watermark.yaml
+++ b/examples/ensurance/evict-on-mem-usage-percent/watermark.yaml
@@ -13,6 +13,6 @@ spec:
       metricRule:
         name: memory_total_utilization
         value: 50
-      name: cpu-usage-percent
+      name: mem-usage-percent
       restoreThreshold: 2
       strategy: None
\ No newline at end of file
diff --git a/examples/ensurance/evict-on-mem-usage-total/elastic-pod-qos.yaml b/examples/ensurance/evict-on-mem-usage-total/elastic-pod-qos.yaml
index 29ca6bcd9..0901404fb 100644
--- a/examples/ensurance/evict-on-mem-usage-total/elastic-pod-qos.yaml
+++ b/examples/ensurance/evict-on-mem-usage-total/elastic-pod-qos.yaml
@@ -5,9 +5,6 @@ metadata:
 spec:
   allowedActions:
     - eviction
-  resourceQOS:
-    cpuQOS:
-      cpuPriority: 7
   labelSelector:
     matchLabels:
       preemptible_job: "true"
\ No newline at end of file
diff --git a/examples/ensurance/evict-on-mem-usage-total/pod.yaml b/examples/ensurance/evict-on-mem-usage-total/pod.yaml
index 6b1c81e58..58e201595 100644
--- a/examples/ensurance/evict-on-mem-usage-total/pod.yaml
+++ b/examples/ensurance/evict-on-mem-usage-total/pod.yaml
@@ -1,13 +1,13 @@
 apiVersion: v1
 kind: Pod
 metadata:
-  name: low-pi6
+  name: low-pi1
   labels:
     k8s-app: low
     preemptible_job: "true"
 spec:
   containers:
-    - image: ccr.ccs.tencentyun.com/tkeimages/stress-ng:v0.12.09
+    - image: docker.io/gocrane/stress-ng:v0.12.09
       command:
         - stress-ng
         - --vm-hang
@@ -17,14 +17,6 @@ spec:
         - --vm-bytes
         - "2G"
       name: stress
-      volumeMounts:
-        - mountPath: /data
-          name: data
-  volumes:
-    - hostPath:
-        path: /data/dd
-        type: DirectoryOrCreate
-      name: data
 ---
 apiVersion: v1
 kind: Pod
@@ -35,7 +27,7 @@ metadata:
     preemptible_job: "true"
 spec:
   containers:
-    - image: ccr.ccs.tencentyun.com/tkeimages/stress-ng:v0.12.09
+    - image: docker.io/gocrane/stress-ng:v0.12.09
       command:
         - stress-ng
         - --vm-hang
@@ -45,12 +37,4 @@ spec:
         - --vm-bytes
         - "3.5G"
       name: stress
-      volumeMounts:
-        - mountPath: /data
-          name: data
-  volumes:
-    - hostPath:
-        path: /data/dd
-        type: DirectoryOrCreate
-      name: data
 
diff --git a/examples/ensurance/evict-on-mem-usage-total/watermark.yaml b/examples/ensurance/evict-on-mem-usage-total/watermark.yaml
index 85ae98d2b..750e40950 100644
--- a/examples/ensurance/evict-on-mem-usage-total/watermark.yaml
+++ b/examples/ensurance/evict-on-mem-usage-total/watermark.yaml
@@ -12,7 +12,7 @@ spec:
       avoidanceThreshold: 2
       metricRule:
         name: memory_total_usage
-        value: 5000000000
-      name: cpu-usage-percent
+        value: 5000000000 # 5 GB (5 * 10^9 bytes)
+      name: mem-usage
       restoreThreshold: 2
       strategy: None
\ No newline at end of file
diff --git a/pkg/ensurance/collector/cadvisor/cadvisor_linux.go b/pkg/ensurance/collector/cadvisor/cadvisor_linux.go
index d8c2c36ed..fef292b6c 100644
--- a/pkg/ensurance/collector/cadvisor/cadvisor_linux.go
+++ b/pkg/ensurance/collector/cadvisor/cadvisor_linux.go
@@ -162,7 +162,8 @@ func (c *CadvisorCollector) Collect() (map[string][]common.TimeSeries, error) {
 			extResMemUse += float64(v.Stats[0].Memory.WorkingSet)
 		}
 
-		var containerLabels = GetContainerLabels(pod, containerId, containerName, hasExtCpuRes)
+		hasExtRes := hasExtCpuRes || hasExtMemRes
+		var containerLabels = GetContainerLabels(pod, containerId, containerName, hasExtRes)
 		addSampleToStateMap(types.MetricNameContainerMemTotalUsage, composeSample(containerLabels, float64(v.Stats[0].Memory.WorkingSet), now), stateMap)
 		klog.V(6).Infof("Pod: %s, containerName: %s, key %s, container_mem_total_usage %#v", klog.KObj(pod), containerName, key, float64(v.Stats[0].Memory.WorkingSet))
diff --git a/pkg/ensurance/executor/sort/mem_usage_sort_test.go b/pkg/ensurance/executor/sort/mem_usage_sort_test.go
new file mode 100644
index 000000000..a0483324b
--- /dev/null
+++ b/pkg/ensurance/executor/sort/mem_usage_sort_test.go
@@ -0,0 +1,82 @@
+package sort
+
+import (
+	"testing"
+	"time"
+
+	v1 "k8s.io/api/core/v1"
+	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
+	"k8s.io/apimachinery/pkg/types"
+
+	"github.com/gocrane/crane/pkg/ensurance/executor/podinfo"
+)
+
+func TestMemUsageSorter(t *testing.T) {
+	now := metav1.NewTime(time.Unix(1000, 0).UTC())
+	later := metav1.NewTime(time.Unix(2000, 0).UTC())
+
+	pods := []podinfo.PodContext{
+		{
+			Key:             types.NamespacedName{Name: "elastic-mem-2"},
+			ElasticMemLimit: 2,
+			QOSClass:        v1.PodQOSBestEffort,
+		},
+		{
+			Key:             types.NamespacedName{Name: "elastic-mem-4"},
+			ElasticMemLimit: 4,
+			QOSClass:        v1.PodQOSBestEffort,
+		},
+		{
+			Key:         types.NamespacedName{Name: "mem-1"},
+			PodMemUsage: 1,
+			QOSClass:    v1.PodQOSGuaranteed,
+		},
+		{
+			Key:         types.NamespacedName{Name: "mem-2"},
+			PodMemUsage: 2,
+			QOSClass:    v1.PodQOSBurstable,
+		},
+		{
+			Key:         types.NamespacedName{Name: "guarantee-1"},
+			PodMemUsage: 1,
+			QOSClass:    v1.PodQOSGuaranteed,
+		},
+		{
+			Key:         types.NamespacedName{Name: "burstable-2"},
+			PodMemUsage: 1,
+			QOSClass:    v1.PodQOSBurstable,
+		},
+		{
+			Key:         types.NamespacedName{Name: "priority-2"},
+			Priority:    2,
+			PodMemUsage: 1,
+			QOSClass:    v1.PodQOSBurstable,
+		},
+		{
+			Key:         types.NamespacedName{Name: "priority-2-2"},
+			Priority:    2,
+			PodMemUsage: 2,
+			QOSClass:    v1.PodQOSBurstable,
+		},
+		{
+			Key:      types.NamespacedName{Name: "priority-1"},
+			Priority: 1,
+			QOSClass: v1.PodQOSBurstable,
+		},
+		{
+			Key:       types.NamespacedName{Name: "time-1"},
+			StartTime: &now,
+			QOSClass:  v1.PodQOSGuaranteed,
+		},
+		{
+			Key:       types.NamespacedName{Name: "time-2"},
+			StartTime: &later,
+			QOSClass:  v1.PodQOSGuaranteed,
+		},
+	}
+	MemUsageSort(pods)
+	t.Logf("sorted pods:")
+	for _, p := range pods {
+		t.Logf("key %s, useElasticMem %v, qosClass %s, priority %d, usage %f, elasticMemUsage %d, startTime %v", p.Key, (p.ElasticMemLimit != 0), p.QOSClass, p.Priority, p.PodMemUsage, p.ElasticMemLimit, p.StartTime)
+	}
+}
diff --git a/pkg/utils/pod.go b/pkg/utils/pod.go
index fca7c5302..22d40547e 100644
--- a/pkg/utils/pod.go
+++ b/pkg/utils/pod.go
@@ -205,8 +205,8 @@ func GetContainerNameFromPod(pod *v1.Pod, containerId string) string {
 			if len(strList) > 0 {
 				klog.V(6).Infof("cri-containerd is %s ", "cri-containerd-"+strList[len(strList)-1]+".scope")
 				klog.V(6).Infof("containerid is %s", containerId)
-				if "cri-containerd-"+strList[len(strList)-1]+".scope" == containerId {
-					klog.V(6).Infof("111111111")
+				containerIdFromPod := fmt.Sprintf("cri-containerd-%s.scope", strList[len(strList)-1])
+				if containerIdFromPod == containerId {
 					return v.Name
 				}
 			}
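One observation on the new test: like TestCpuUsageSorter, TestMemUsageSorter only logs the sorted order, so a regression in the comparator chain would not fail CI. A possible follow-up, sketched for the same test file (TestMemUsageSortElasticFirst is a hypothetical addition, not part of the patch), asserts the one property the chain guarantees unconditionally: pods consuming elastic memory sort ahead of everyone else.

// Assumes the imports already present in mem_usage_sort_test.go.
func TestMemUsageSortElasticFirst(t *testing.T) {
	pods := []podinfo.PodContext{
		{Key: types.NamespacedName{Name: "plain"}, PodMemUsage: 2, QOSClass: v1.PodQOSBurstable},
		{Key: types.NamespacedName{Name: "elastic"}, ElasticMemLimit: 4, QOSClass: v1.PodQOSBestEffort},
	}
	MemUsageSort(pods)
	// UseElasticMem is the first comparator, so the elastic pod must lead.
	if pods[0].Key.Name != "elastic" {
		t.Fatalf("expected elastic-memory pod first, got %s", pods[0].Key.Name)
	}
}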