From 20e5bd8935c57c3ce9362d3c39d07ea8963ab122 Mon Sep 17 00:00:00 2001 From: kaiyuechen Date: Thu, 1 Sep 2022 15:52:31 +0800 Subject: [PATCH 1/2] Add extend memory in nodeResourceManager;Collect memory info for pod and container;Fix cadvisor can't get container info err --- go.mod | 13 +- go.sum | 22 +--- pkg/agent/agent.go | 25 ++-- pkg/ensurance/analyzer/analyzer.go | 4 + .../collector/cadvisor/cadvisor_linux.go | 27 +++-- pkg/ensurance/collector/types/types.go | 37 +----- pkg/metrics/ensuarance.go | 16 +++ pkg/resource/node_resource_manager.go | 63 ++++++++-- pkg/utils/cgroup.go | 111 ++++++++++++++++++ pkg/utils/pod.go | 48 +++++++- 10 files changed, 270 insertions(+), 96 deletions(-) create mode 100644 pkg/utils/cgroup.go diff --git a/go.mod b/go.mod index 876e1287b..554bcb2be 100644 --- a/go.mod +++ b/go.mod @@ -5,7 +5,7 @@ go 1.17 require ( github.com/evanphx/json-patch v4.11.0+incompatible github.com/go-echarts/go-echarts/v2 v2.2.4 - github.com/gocrane/api v0.7.1-0.20220819080332-e4c0d60e812d + github.com/gocrane/api v0.7.1-0.20220906050113-0f331eb419b0 github.com/google/cadvisor v0.39.2 github.com/mjibson/go-dsp v0.0.0-20180508042940-11479a337f12 github.com/prometheus/client_golang v1.11.0 @@ -48,12 +48,10 @@ require ( ) require ( - github.com/JeffAshton/win_pdh v0.0.0-20161109143554-76bb4ee9f0ab // indirect github.com/Microsoft/go-winio v0.5.1 // indirect github.com/NYTimes/gziphandler v1.1.1 // indirect github.com/PuerkitoBio/purell v1.1.1 // indirect github.com/PuerkitoBio/urlesc v0.0.0-20170810143723-de5bf2ad4578 // indirect - github.com/aws/aws-sdk-go v1.38.49 // indirect github.com/beorn7/perks v1.0.1 // indirect github.com/bits-and-blooms/bitset v1.2.0 // indirect github.com/blang/semver v3.5.1+incompatible // indirect @@ -61,7 +59,6 @@ require ( github.com/cilium/ebpf v0.6.2 // indirect github.com/containerd/console v1.0.2 // indirect github.com/containerd/containerd v1.4.4 // indirect - github.com/containerd/ttrpc v1.0.2 // indirect 
github.com/coreos/go-semver v0.3.0 // indirect github.com/coreos/go-systemd/v22 v22.3.2 // indirect github.com/cyphar/filepath-securejoin v0.2.2 // indirect @@ -96,7 +93,6 @@ require ( github.com/grpc-ecosystem/grpc-gateway v1.16.0 // indirect github.com/imdario/mergo v0.3.12 // indirect github.com/inconshreveable/mousetrap v1.0.0 // indirect - github.com/jmespath/go-jmespath v0.4.0 // indirect github.com/josharian/intern v1.0.0 // indirect github.com/karrick/godirwalk v1.16.1 // indirect github.com/leodido/go-urn v1.2.0 // indirect @@ -104,7 +100,6 @@ require ( github.com/matttproud/golang_protobuf_extensions v1.0.2-0.20181231171920-c182affec369 // indirect github.com/mindprince/gonvml v0.0.0-20190828220739-9ebdce4bb989 // indirect github.com/mistifyio/go-zfs v2.1.2-0.20190413222219-f784269be439+incompatible // indirect - github.com/moby/spdystream v0.2.0 // indirect github.com/moby/sys/mountinfo v0.4.1 // indirect github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd // indirect github.com/modern-go/reflect2 v1.0.2 // indirect @@ -147,7 +142,6 @@ require ( go.uber.org/zap v1.19.0 // indirect golang.org/x/text v0.3.7 // indirect golang.org/x/time v0.0.0-20210723032227-1f47c861a9ac // indirect - golang.org/x/xerrors v0.0.0-20200804184101-5ec99f83aff1 // indirect gomodules.xyz/jsonpatch/v2 v2.2.0 // indirect google.golang.org/appengine v1.6.7 // indirect gopkg.in/inf.v0 v0.9.1 // indirect @@ -156,11 +150,6 @@ require ( gopkg.in/yaml.v2 v2.4.0 // indirect gopkg.in/yaml.v3 v3.0.0-20210107192922-496545a6307b // indirect k8s.io/apiextensions-apiserver v0.22.2 // indirect - k8s.io/cloud-provider v0.22.3 // indirect - k8s.io/component-helpers v0.22.3 // indirect - k8s.io/kube-scheduler v0.0.0 // indirect - k8s.io/kubelet v0.0.0 // indirect - k8s.io/mount-utils v0.22.3 // indirect k8s.io/utils v0.0.0-20210819203725-bdf08cb9a70a // indirect sigs.k8s.io/apiserver-network-proxy/konnectivity-client v0.0.22 // indirect sigs.k8s.io/structured-merge-diff/v4 v4.1.2 // 
indirect diff --git a/go.sum b/go.sum index 01a1117fb..45da8a363 100644 --- a/go.sum +++ b/go.sum @@ -61,7 +61,6 @@ github.com/BurntSushi/toml v0.3.1 h1:WXkYYl6Yr3qBf1K79EBnL4mak0OimBfB0XUf9Vl28OQ github.com/BurntSushi/toml v0.3.1/go.mod h1:xHWCNGjB5oqiDr8zfno3MHue2Ht5sIBksp03qcyfWMU= github.com/BurntSushi/xgb v0.0.0-20160522181843-27f122750802/go.mod h1:IVnqGOEym/WlBOVXweHU+Q+/VP0lqqI8lqeDx9IjBqo= github.com/GoogleCloudPlatform/k8s-cloud-provider v0.0.0-20200415212048-7901bc822317/go.mod h1:DF8FZRxMHMGv/vP2lQP6h+dYzzjpuRn24VeRiYn3qjQ= -github.com/JeffAshton/win_pdh v0.0.0-20161109143554-76bb4ee9f0ab h1:UKkYhof1njT1/xq4SEg5z+VpTgjmNeHwPGRQl7takDI= github.com/JeffAshton/win_pdh v0.0.0-20161109143554-76bb4ee9f0ab/go.mod h1:3VYc5hodBMJ5+l/7J4xAyMeuM2PNuepvHlGs8yilUCA= github.com/MakeNowJust/heredoc v0.0.0-20170808103936-bb23615498cd/go.mod h1:64YHyfSL2R96J44Nlwm39UHepQbyR5q10x7iYa1ks2E= github.com/Microsoft/go-winio v0.4.15-0.20190919025122-fc70bd9a86b5/go.mod h1:tTuCMEN+UleMWgg9dVx4Hu52b1bJo+59jBh3ajtinzw= @@ -93,7 +92,6 @@ github.com/armon/go-radix v0.0.0-20180808171621-7fddfc383310/go.mod h1:ufUuZ+zHj github.com/asaskevich/govalidator v0.0.0-20190424111038-f61b66f89f4a/go.mod h1:lB+ZfQJz7igIIfQNfa7Ml4HSf2uFQQRzpGGRXenZAgY= github.com/auth0/go-jwt-middleware v1.0.1/go.mod h1:YSeUX3z6+TF2H+7padiEqNJ73Zy9vXW72U//IgN0BIM= github.com/aws/aws-sdk-go v1.35.24/go.mod h1:tlPOdRjfxPBpNIwqDj61rmsnA85v9jc0Ps9+muhnW+k= -github.com/aws/aws-sdk-go v1.38.49 h1:E31vxjCe6a5I+mJLmUGaZobiWmg9KdWaud9IfceYeYQ= github.com/aws/aws-sdk-go v1.38.49/go.mod h1:hcU610XS61/+aQV88ixoOzUoG7v3b31pl2zKMmprdro= github.com/benbjohnson/clock v1.0.3/go.mod h1:bGMdMPoPVvcYyt1gHDf4J2KE153Yf9BuiUKYMaxlTDM= github.com/benbjohnson/clock v1.1.0 h1:Q92kusRqC1XV2MjkWETPvjJVqKetz1OzxZB7mHJLju8= @@ -163,7 +161,6 @@ github.com/containerd/ttrpc v0.0.0-20190828154514-0e0f228740de/go.mod h1:PvCDdDG github.com/containerd/ttrpc v1.0.2 h1:2/O3oTZN36q2xRolk0a2WWGgh7/Vf/liElg5hFYLX9U= github.com/containerd/ttrpc 
v1.0.2/go.mod h1:UAxOpgT9ziI0gJrmKvgcZivgxOp8iFPSk8httJEt98Y= github.com/containerd/typeurl v0.0.0-20180627222232-a93fcdb778cd/go.mod h1:Cm3kwCdlkCfMSHURc+r6fwoGH6/F1hH3S4sg0rLFWPc= -github.com/containerd/typeurl v1.0.1 h1:PvuK4E3D5S5q6IqsPDCy928FhP0LUIGcmZ/Yhgp5Djw= github.com/containerd/typeurl v1.0.1/go.mod h1:TB1hUtrpaiO88KEK56ijojHS1+NeF0izUACaJW2mdXg= github.com/containernetworking/cni v0.8.1/go.mod h1:LGwApLUm2FpoOfxTDEeq8T9ipbpZ61X79hmU3w8FmsY= github.com/coredns/caddy v1.1.0/go.mod h1:A6ntJQlAWuQfFlsd9hvigKbo2WS0VUs2l1e2F+BawD4= @@ -209,7 +206,6 @@ github.com/docker/go-units v0.4.0/go.mod h1:fgPhTUdO+D/Jk86RDLlptpiXQzgHJF7gydDD github.com/docopt/docopt-go v0.0.0-20180111231733-ee0de3bc6815/go.mod h1:WwZ+bS3ebgob9U8Nd0kOddGdZWjyMGR8Wziv+TBNwSE= github.com/dustin/go-humanize v1.0.0 h1:VSnTsYCnlFHaM2/igO1h6X3HA71jcobQuxemgkq4zYo= github.com/dustin/go-humanize v1.0.0/go.mod h1:HtrtbFcZ19U5GC7JDqmcUSB87Iq5E25KnS6fMYU6eOk= -github.com/elazarl/goproxy v0.0.0-20180725130230-947c36da3153 h1:yUdfgN0XgIJw7foRItutHYUIhlcKzcSf5vDpdhQAKTc= github.com/elazarl/goproxy v0.0.0-20180725130230-947c36da3153/go.mod h1:/Zj4wYkgs4iZTTu3o/KG3Itv/qCCa8VVMlb3i9OVuzc= github.com/emicklei/go-restful v0.0.0-20170410110728-ff4f55a20633/go.mod h1:otzb+WCGbkyDHkqmQmT5YD2WR4BBwUdeQoFo8l/7tVs= github.com/emicklei/go-restful v2.9.5+incompatible/go.mod h1:otzb+WCGbkyDHkqmQmT5YD2WR4BBwUdeQoFo8l/7tVs= @@ -310,14 +306,8 @@ github.com/gobwas/pool v0.2.1 h1:xfeeEhW7pwmX8nuLVlqbzVc7udMDrwetjEv+TZIz1og= github.com/gobwas/pool v0.2.1/go.mod h1:q8bcK0KcYlCgd9e7WYLm9LpyS+YeLd8JVDW6WezmKEw= github.com/gobwas/ws v1.1.0-rc.5 h1:QOAag7FoBaBYYHRqzqkhhd8fq5RTubvI4v3Ft/gDVVQ= github.com/gobwas/ws v1.1.0-rc.5/go.mod h1:nzvNcVha5eUziGrbxFCo6qFIojQHjJV5cLYIbezhfL0= -github.com/gocrane/api v0.6.1-0.20220809112454-68f0199a774e h1:pIocbZM7LchSMG7XBbfD9K+Im7zZtMZjVU7paVJOv6I= -github.com/gocrane/api v0.6.1-0.20220809112454-68f0199a774e/go.mod h1:GxI+t9AW8+NsHkz2JkPBIJN//9eLUjTZl1ScYAbXMbk= -github.com/gocrane/api 
v0.6.1-0.20220812033255-887f4b4e7d8b h1:ELyVltbne39izU2XaFrgJtqnhdeV+hBt+JBKooN7N4w= -github.com/gocrane/api v0.6.1-0.20220812033255-887f4b4e7d8b/go.mod h1:GxI+t9AW8+NsHkz2JkPBIJN//9eLUjTZl1ScYAbXMbk= -github.com/gocrane/api v0.7.0 h1:EIvr5KKHby1PXZI6wB+Ac+D2BCvc0qiK5VpHWQVwnxg= -github.com/gocrane/api v0.7.0/go.mod h1:GxI+t9AW8+NsHkz2JkPBIJN//9eLUjTZl1ScYAbXMbk= -github.com/gocrane/api v0.7.1-0.20220819080332-e4c0d60e812d h1:qqPrNx1AETykgX80aWAmna/eQMDVWnUdSemWlfaZUNM= -github.com/gocrane/api v0.7.1-0.20220819080332-e4c0d60e812d/go.mod h1:GxI+t9AW8+NsHkz2JkPBIJN//9eLUjTZl1ScYAbXMbk= +github.com/gocrane/api v0.7.1-0.20220906050113-0f331eb419b0 h1:IIHNT4bDsuBJq9JHHoQhUOrtE5Ec2Ug/Om8s8WQD8ws= +github.com/gocrane/api v0.7.1-0.20220906050113-0f331eb419b0/go.mod h1:GxI+t9AW8+NsHkz2JkPBIJN//9eLUjTZl1ScYAbXMbk= github.com/godbus/dbus/v5 v5.0.3/go.mod h1:xhWf0FNVPg57R7Z0UbKHbJfkEywrmjJnf7w5xrFpKfA= github.com/godbus/dbus/v5 v5.0.4 h1:9349emZab16e7zQvpmsbtjc18ykshndd8y2PG3sgJbA= github.com/godbus/dbus/v5 v5.0.4/go.mod h1:xhWf0FNVPg57R7Z0UbKHbJfkEywrmjJnf7w5xrFpKfA= @@ -466,9 +456,7 @@ github.com/inconshreveable/mousetrap v1.0.0 h1:Z8tu5sraLXCXIcARxBp/8cbvlwVa7Z1NH github.com/inconshreveable/mousetrap v1.0.0/go.mod h1:PxqpIevigyE2G7u3NXJIT2ANytuPF1OarO4DADm73n8= github.com/ishidawataru/sctp v0.0.0-20190723014705-7c296d48a2b5/go.mod h1:DM4VvS+hD/kDi1U1QsX2fnZowwBhqD0Dk3bRPKF/Oc8= github.com/jessevdk/go-flags v1.4.0/go.mod h1:4FA24M0QyGHXBuZZK/XkWh8h0e1EYbRYJSGM75WSRxI= -github.com/jmespath/go-jmespath v0.4.0 h1:BEgLn5cpjn8UN1mAw4NjwDrS35OdebyEtFe+9YPoQUg= github.com/jmespath/go-jmespath v0.4.0/go.mod h1:T8mJZnbsbmF+m6zOOFylbeCJqk5+pHWvzYPziyZiYoo= -github.com/jmespath/go-jmespath/internal/testify v1.5.1 h1:shLQSRRSCCPj3f2gpwzGwWFoC7ycTf1rcQZHOlsJ6N8= github.com/jmespath/go-jmespath/internal/testify v1.5.1/go.mod h1:L3OGu8Wl2/fWfCI6z80xFu9LTZmf1ZRjMHUOPmWr69U= github.com/jonboulle/clockwork v0.1.0/go.mod h1:Ii8DK3G1RaLaWxj9trq07+26W01tbo22gdxWY5EU2bo= 
github.com/jonboulle/clockwork v0.2.2 h1:UOGuzwb1PwsrDAObMuhUnj0p5ULPj8V/xJ7Kx9qUBdQ= @@ -553,7 +541,6 @@ github.com/mitchellh/mapstructure v1.1.2/go.mod h1:FVVH3fgwuzCH5S8UJGiWEs2h04kUh github.com/mjibson/go-dsp v0.0.0-20180508042940-11479a337f12 h1:dd7vnTDfjtwCETZDrRe+GPYNLA1jBtbZeyfyE8eZCyk= github.com/mjibson/go-dsp v0.0.0-20180508042940-11479a337f12/go.mod h1:i/KKcxEWEO8Yyl11DYafRPKOPVYTrhxiTRigjtEEXZU= github.com/moby/ipvs v1.0.1/go.mod h1:2pngiyseZbIKXNv7hsKj3O9UEz30c53MT9005gt2hxQ= -github.com/moby/spdystream v0.2.0 h1:cjW1zVyyoiM0T7b6UoySUFqzXMoqRckQtXwGPiBhOM8= github.com/moby/spdystream v0.2.0/go.mod h1:f7i0iNDQJ059oMTcWxx8MA/zKFIuD/lY+0GqbN2Wy8c= github.com/moby/sys/mountinfo v0.4.1 h1:1O+1cHA1aujwEwwVMa2Xm2l+gIpUHyd3+D+d7LZh1kM= github.com/moby/sys/mountinfo v0.4.1/go.mod h1:rEr8tzG/lsIZHBtN/JjGG+LMYx9eXgW2JI+6q0qou+A= @@ -1272,13 +1259,11 @@ k8s.io/autoscaler/vertical-pod-autoscaler v0.10.0/go.mod h1:sOm4JDB+0ZrrqFuscsbW k8s.io/cli-runtime v0.22.3/go.mod h1:um6JvCxV9Hrhq0zCUxcqYoY7/wF64g6IYgOViI8sg6Q= k8s.io/client-go v0.22.3 h1:6onkOSc+YNdwq5zXE0wFXicq64rrym+mXwHu/CPVGO4= k8s.io/client-go v0.22.3/go.mod h1:ElDjYf8gvZsKDYexmsmnMQ0DYO8W9RwBjfQ1PI53yow= -k8s.io/cloud-provider v0.22.3 h1:ZsWdB0WmyjKlE901EM14BuSvnN+QPGrCGjcfDc+b5NI= k8s.io/cloud-provider v0.22.3/go.mod h1:GsKMR5EnNH4zcfkEvOxBPEZVuRvadVRkZvGqYxxBvO4= k8s.io/cluster-bootstrap v0.22.3/go.mod h1:FVBAeGJ/T6QbNgGb7DX98FCjExJnNLsRXtGRMjEQ26I= k8s.io/code-generator v0.22.3/go.mod h1:eV77Y09IopzeXOJzndrDyCI88UBok2h6WxAlBwpxa+o= k8s.io/component-base v0.22.3 h1:/+hryAW03u3FpJQww+GSMsArJNUbGjH66lrgxaRynLU= k8s.io/component-base v0.22.3/go.mod h1:kuybv1miLCMoOk3ebrqF93GbQHQx6W2287FC0YEQY6s= -k8s.io/component-helpers v0.22.3 h1:08tn+T8HnjRTwDP2ErIBhHGvPcYJf5zWaWW83golHWc= k8s.io/component-helpers v0.22.3/go.mod h1:7OVySVH5elhHKuJKUOxZEfpT1Bm3ChmBQZHmuFfbGHk= k8s.io/controller-manager v0.22.3/go.mod h1:4cvQGMvYf6IpTY08/NigEiI5UrN/cbtOe5e5WepYmcQ= k8s.io/cri-api v0.22.3 
h1:6C6Af3BooYbmZzZydibKgyJvZK1MRJQ/sSsvjunos2o= @@ -1298,17 +1283,14 @@ k8s.io/kube-openapi v0.0.0-20210421082810-95288971da7e/go.mod h1:vHXdDvt9+2spS2R k8s.io/kube-openapi v0.0.0-20210817084001-7fbd8d59e5b8 h1:Xxl9TLJ30BJ1pGWfGZnqbpww2rwOt3RAzbSz+omQGtg= k8s.io/kube-openapi v0.0.0-20210817084001-7fbd8d59e5b8/go.mod h1:foAE7XkrXQ1Qo2eWsW/iWksptrVdbl6t+vscSdmmGjk= k8s.io/kube-proxy v0.22.3/go.mod h1:9ta1U8GKKo6by981sN/L6MhFJzPWxMdfh7plVPH1I2s= -k8s.io/kube-scheduler v0.22.3 h1:VDd3zuXPUNTT1WSt/s+1Pk4lnfBNcVdfoijHv0MnV/4= k8s.io/kube-scheduler v0.22.3/go.mod h1:jVLHSttd8cSejBLOeiWE+g8etA6XdOBGiR8tI577OhU= k8s.io/kubectl v0.22.3/go.mod h1:gcpQHPOx+Jke9Og6Li7YxR/ZuaOtFUeJw7xHH617tHs= -k8s.io/kubelet v0.22.3 h1:C21Kg66Zzvc21uJITEPg4stGMcSZsR1JB+7+6Uwm8zs= k8s.io/kubelet v0.22.3/go.mod h1:9nUZNGUigU2uAIm7kgf8BsvYDI9KjIE5nt9+yI1+p7w= k8s.io/kubernetes v1.22.3 h1:/eFfR5S2Vxn0t9kcLVAZXQFloKMkklWQIf5e0hFbzlA= k8s.io/kubernetes v1.22.3/go.mod h1:Snea7fgIObGgHmLbUJ3OgjGEr5bjj16iEdp5oHS6eS8= k8s.io/legacy-cloud-providers v0.22.3/go.mod h1:eEOOaRtP2PuCVkjZvuTPa6ZgyPpzJkCVqpE3YtuArLQ= k8s.io/metrics v0.22.3 h1:G4EGLIcm9CSlpLRXKjIJiZqM/l45xasz2BOiK4qJCNo= k8s.io/metrics v0.22.3/go.mod h1:HbLFLRKtXzoC/6tHLQAlO9AeOBXZp2eB6SsgkbujoNI= -k8s.io/mount-utils v0.22.3 h1:3BIJ5lgA4qIhQ2/YhpCs9VMDP+/CiBGKZ8N+zeHFCWE= k8s.io/mount-utils v0.22.3/go.mod h1:dHl6c2P60T5LHUnZxVslyly9EDCMzvhtISO5aY+Z4sk= k8s.io/pod-security-admission v0.22.3/go.mod h1:xtkf/UhVWICokQLSDvD+8plfGkTQW4VTJvnixVWCeWk= k8s.io/sample-apiserver v0.22.3/go.mod h1:HuEOdD/pT5R7gKNr2REb62uabZaJuFZyY3wUd86nFCA= diff --git a/pkg/agent/agent.go b/pkg/agent/agent.go index e52017482..1e510dea7 100644 --- a/pkg/agent/agent.go +++ b/pkg/agent/agent.go @@ -94,10 +94,7 @@ func NewAgent(ctx context.Context, managers = appendManagerIfNotNil(managers, avoidanceManager) if nodeResource := utilfeature.DefaultFeatureGate.Enabled(features.CraneNodeResource); nodeResource { - tspName, err := agent.CreateNodeResourceTsp() - if err != nil { - 
return agent, err - } + tspName := agent.CreateNodeResourceTsp() nodeResourceManager, err := resource.NewNodeResourceManager(kubeClient, nodeName, nodeResourceReserved, tspName, nodeInformer, tspInformer, stateCollector.NodeResourceChann) if err != nil { return agent, err @@ -146,13 +143,13 @@ func getAgentName(nodeName string) string { return nodeName + "." + string(uuid.NewUUID()) } -func (a *Agent) CreateNodeResourceTsp() (string, error) { +func (a *Agent) CreateNodeResourceTsp() string { foundTsp := true tsp, err := a.craneClient.PredictionV1alpha1().TimeSeriesPredictions(resource.TspNamespace).Get(context.TODO(), a.GenerateNodeResourceTspName(), metav1.GetOptions{}) if err != nil { if !errors.IsNotFound(err) { klog.Errorf("Failed to get noderesource tsp : %v", err) - return "", err + return "" } foundTsp = false } @@ -169,14 +166,14 @@ func (a *Agent) CreateNodeResourceTsp() (string, error) { n, err := a.kubeClient.CoreV1().Nodes().Get(context.TODO(), a.nodeName, metav1.GetOptions{}) if err != nil { klog.Errorf("Failed to get node : %v", err) - return "", err + return "" } spec := v1alpha12.TimeSeriesPredictionSpec{} tpl, err := template.New("").Parse(config.Data["spec"]) if err != nil { klog.Errorf("Failed to convert spec template : %v", err) - return "", err + return "" } var buf bytes.Buffer //The k8s object is converted here to a json object in order to use lowercase letters in the template to take the node field, @@ -188,12 +185,12 @@ func (a *Agent) CreateNodeResourceTsp() (string, error) { err = tpl.Execute(&buf, data) if err != nil { klog.Errorf("Failed to convert spec template : %v", err) - return "", err + return "" } err = yaml.Unmarshal(buf.Bytes(), &spec) if err != nil { klog.Errorf("Failed to convert spec template : %v", err) - return "", err + return "" } gvk, _ := apiutil.GVKForObject(n, scheme.Scheme) @@ -206,14 +203,14 @@ func (a *Agent) CreateNodeResourceTsp() (string, error) { if foundTsp { klog.V(4).Infof("Discover the presence of old 
noderesource tsp and try to contrast the changes: %s", a.GenerateNodeResourceTspName()) if reflect.DeepEqual(tsp.Spec, spec) { - return a.GenerateNodeResourceTspName(), nil + return a.GenerateNodeResourceTspName() } klog.V(4).Infof("Discover the presence of old noderesource tsp and the Tsp rules have been changed: %s", a.GenerateNodeResourceTspName()) tsp.Spec = spec _, err := a.craneClient.PredictionV1alpha1().TimeSeriesPredictions(tsp.Namespace).Update(context.TODO(), tsp, metav1.UpdateOptions{}) if err != nil { klog.Errorf("Failed to update noderesource tsp %s : %v", a.GenerateNodeResourceTspName(), err) - return "", err + return "" } klog.V(4).Infof("The noderesource tsp is updated successfully: %s", a.GenerateNodeResourceTspName()) } else { @@ -226,12 +223,12 @@ func (a *Agent) CreateNodeResourceTsp() (string, error) { _, err = a.craneClient.PredictionV1alpha1().TimeSeriesPredictions(tsp.Namespace).Create(context.TODO(), tsp, metav1.CreateOptions{}) if err != nil { klog.Errorf("Failed to create noderesource tsp %s : %v", a.GenerateNodeResourceTspName(), err) - return "", err + return "" } klog.V(4).Infof("The noderesource tsp is created successfully: %s", a.GenerateNodeResourceTspName()) } - return a.GenerateNodeResourceTspName(), nil + return a.GenerateNodeResourceTspName() } func (a *Agent) DeleteNodeResourceTsp() error { diff --git a/pkg/ensurance/analyzer/analyzer.go b/pkg/ensurance/analyzer/analyzer.go index 6a9c25a0a..5c153c23d 100644 --- a/pkg/ensurance/analyzer/analyzer.go +++ b/pkg/ensurance/analyzer/analyzer.go @@ -194,6 +194,10 @@ func (s *AnomalyAnalyzer) Analyze(state map[string][]common.TimeSeries) { klog.V(6).Infof("Analyze actionContexts: %#v", actionContexts) //step 3 : merge + if len(actionContexts) == 0 { + return + } + avoidanceAction := s.merge(state, actionMap, actionContexts) if err != nil { klog.Errorf("Failed to merge actions, error: %v", err) diff --git a/pkg/ensurance/collector/cadvisor/cadvisor_linux.go 
b/pkg/ensurance/collector/cadvisor/cadvisor_linux.go index ad97e844b..2ea0ada93 100644 --- a/pkg/ensurance/collector/cadvisor/cadvisor_linux.go +++ b/pkg/ensurance/collector/cadvisor/cadvisor_linux.go @@ -113,11 +113,12 @@ func (c *CadvisorCollector) Collect() (map[string][]common.TimeSeries, error) { return nil, err } var extResCpuUse float64 = 0 + var extResMemUse float64 = 0 var stateMap = make(map[string][]common.TimeSeries) for _, pod := range allPods { var now = time.Now() - containers, err := c.Manager.GetContainerInfoV2(types.GetCgroupPath(pod, c.Manager.GetCgroupDriver()), cadvisorapiv2.RequestOptions{ + containers, err := c.Manager.GetContainerInfoV2(utils.GetCgroupPath(pod, c.Manager.GetCgroupDriver()), cadvisorapiv2.RequestOptions{ IdType: cadvisorapiv2.TypeName, Count: 1, Recursive: true, @@ -144,7 +145,8 @@ func (c *CadvisorCollector) Collect() (map[string][]common.TimeSeries, error) { continue } - _, hasExtRes := utils.GetContainerExtCpuResFromPod(pod, containerName) + _, hasExtCpuRes := utils.GetContainerExtCpuResFromPod(pod, containerName) + _, hasExtMemRes := utils.GetContainerExtMemResFromPod(pod, containerName) // In the GetContainerInfoV2 not collect the cpu quota and period // We used GetContainerInfo instead @@ -156,15 +158,22 @@ func (c *CadvisorCollector) Collect() (map[string][]common.TimeSeries, error) { continue } + if hasExtMemRes { + extResMemUse += float64(v.Stats[0].Memory.Usage) + } + + var containerLabels = GetContainerLabels(pod, containerId, containerName, hasExtCpuRes) + addSampleToStateMap(types.MetricNameContainerMemTotalUsage, composeSample(containerLabels, float64(v.Stats[0].Memory.Usage), now), stateMap) + if state, ok := c.latestContainersStates[key]; ok { - klog.V(8).Infof("For key %s, LatestContainersStates exist", key) - var containerLabels = GetContainerLabels(pod, containerId, containerName, hasExtRes) + klog.V(6).Infof("For key %s, LatestContainersStates exist", key) cpuUsageSample, schedRunqueueTime := 
caculateCPUUsage(&v, &state) + if cpuUsageSample == 0 && schedRunqueueTime == 0 || math.IsNaN(cpuUsageSample) { continue } - if hasExtRes { + if hasExtCpuRes { extResCpuUse += cpuUsageSample } addSampleToStateMap(types.MetricNameContainerCpuTotalUsage, composeSample(containerLabels, cpuUsageSample, now), stateMap) @@ -173,24 +182,26 @@ func (c *CadvisorCollector) Collect() (map[string][]common.TimeSeries, error) { addSampleToStateMap(types.MetricNameContainerCpuQuota, composeSample(containerLabels, float64(containerInfoV1.Spec.Cpu.Quota), now), stateMap) addSampleToStateMap(types.MetricNameContainerCpuPeriod, composeSample(containerLabels, float64(containerInfoV1.Spec.Cpu.Period), now), stateMap) - klog.V(8).Infof("Pod: %s, containerName: %s, key %s, scheduler run queue time %.2f, container_cpu_total_usage %#v", klog.KObj(pod), containerName, key, schedRunqueueTime, cpuUsageSample) + klog.V(6).Infof("Pod: %s, containerName: %s, key %s, scheduler run queue time %.2f, container_cpu_total_usage %#v", klog.KObj(pod), containerName, key, schedRunqueueTime, cpuUsageSample) } containerStates[key] = ContainerState{stat: v, timestamp: now} } } addSampleToStateMap(types.MetricNameExtResContainerCpuTotalUsage, composeSample(make([]common.Label, 0), extResCpuUse, time.Now()), stateMap) + addSampleToStateMap(types.MetricNameExtResContainerMemTotalUsage, composeSample(make([]common.Label, 0), extResMemUse, time.Now()), stateMap) + klog.V(6).Infof("ext_res_container_mem_total_usage is %f, ext_res_container_cpu_total_usage is %f", extResMemUse, extResCpuUse) c.latestContainersStates = containerStates return stateMap, nil } -func composeSample(labels []common.Label, cpuUsageSample float64, sampleTime time.Time) common.TimeSeries { +func composeSample(labels []common.Label, UsageSample float64, sampleTime time.Time) common.TimeSeries { return common.TimeSeries{ Labels: labels, Samples: []common.Sample{ { - Value: cpuUsageSample, + Value: UsageSample, Timestamp: sampleTime.Unix(), }, 
}, diff --git a/pkg/ensurance/collector/types/types.go b/pkg/ensurance/collector/types/types.go index a29cfdc39..681b0367b 100644 --- a/pkg/ensurance/collector/types/types.go +++ b/pkg/ensurance/collector/types/types.go @@ -1,14 +1,5 @@ package types -import ( - "strings" - - v1 "k8s.io/api/core/v1" - "k8s.io/kubernetes/pkg/kubelet/cm" - - "github.com/gocrane/crane/pkg/utils" -) - type CollectType string const ( @@ -55,29 +46,7 @@ const ( MetricNameExtResContainerCpuTotalUsage MetricName = "ext_res_container_cpu_total_usage" MetricNameExtCpuTotalDistribute MetricName = "ext_cpu_total_distribute" -) - -func GetCgroupPath(p *v1.Pod, cgroupDriver string) string { - cgroupName := GetCgroupName(p) - switch cgroupDriver { - case "systemd": - return cgroupName.ToSystemd() - case "cgroupfs": - return cgroupName.ToCgroupfs() - default: - return "" - } -} -func GetCgroupName(p *v1.Pod) cm.CgroupName { - switch p.Status.QOSClass { - case v1.PodQOSGuaranteed: - return cm.NewCgroupName(cm.RootCgroupName, utils.CgroupKubePods, cm.GetPodCgroupNameSuffix(p.UID)) - case v1.PodQOSBurstable: - return cm.NewCgroupName(cm.RootCgroupName, utils.CgroupKubePods, strings.ToLower(string(v1.PodQOSBurstable)), cm.GetPodCgroupNameSuffix(p.UID)) - case v1.PodQOSBestEffort: - return cm.NewCgroupName(cm.RootCgroupName, utils.CgroupKubePods, strings.ToLower(string(v1.PodQOSBestEffort)), cm.GetPodCgroupNameSuffix(p.UID)) - default: - return cm.RootCgroupName - } -} + MetricNameContainerMemTotalUsage MetricName = "container_mem_total_usage" + MetricNameExtResContainerMemTotalUsage MetricName = "ext_res_container_mem_total_usage" +) diff --git a/pkg/metrics/ensuarance.go b/pkg/metrics/ensuarance.go index e008c5f38..f3ad338be 100644 --- a/pkg/metrics/ensuarance.go +++ b/pkg/metrics/ensuarance.go @@ -28,6 +28,7 @@ const ( PodResourceErrorTotal = "pod_resource_error_total" NodeCpuCannotBeReclaimedSeconds = "node_cpu_cannot_be_reclaimed_seconds" + NodeMemCannotBeReclaimedSeconds = 
"node_mem_cannot_be_reclaimed_seconds" NodeResourceRecommended = "node_resource_recommended" NodeResourceRecommendedFrom = "node_resource_recommended_from" ) @@ -199,6 +200,17 @@ var ( }, []string{}, ) + // LastActivity records the last activity time of each steps + nodeMemCannotBeReclaimedSeconds = k8smetrics.NewGaugeVec( + &k8smetrics.GaugeOpts{ + Namespace: CraneNamespace, + Subsystem: CraneAgentSubsystem, + Name: NodeMemCannotBeReclaimedSeconds, + Help: "The mem seconds that cannot be reclaimed.", + StabilityLevel: k8smetrics.ALPHA, + }, []string{}, + ) + //NodeResourceRecommended nodeResourceRecommended = k8smetrics.NewGaugeVec( &k8smetrics.GaugeOpts{ @@ -306,6 +318,10 @@ func UpdateNodeCpuCannotBeReclaimedSeconds(value float64) { nodeCpuCannotBeReclaimedSeconds.With(prometheus.Labels{}).Set(value) } +func UpdateNodeMemCannotBeReclaimedSeconds(value float64) { + nodeMemCannotBeReclaimedSeconds.With(prometheus.Labels{}).Set(value) +} + func UpdateNodeResourceRecommendedValue(subComponent SubComponent, stepName StepLabel, resourceName string, from string, value float64) { nodeResourceRecommended.With(prometheus.Labels{"subcomponent": string(subComponent), "step": string(stepName), "resourceName": resourceName}).Set(value) switch from { diff --git a/pkg/resource/node_resource_manager.go b/pkg/resource/node_resource_manager.go index 2995ccb9f..e2f3a91bd 100644 --- a/pkg/resource/node_resource_manager.go +++ b/pkg/resource/node_resource_manager.go @@ -194,14 +194,21 @@ func (o *NodeResourceManager) FindTargetNode(tsp *predictionapi.TimeSeriesPredic func (o *NodeResourceManager) BuildNodeStatus(node *v1.Node) map[string]int64 { tspCanNotBeReclaimedResource := o.GetCanNotBeReclaimedResourceFromTsp(node) localCanNotBeReclaimedResource := o.GetCanNotBeReclaimedResourceFromLocal() + reserveCpuPercent := o.reserveResource.CpuPercent if nodeReserveCpuPercent, ok := getReserveResourcePercentFromNodeAnnotations(node.GetAnnotations(), v1.ResourceCPU.String()); ok { 
reserveCpuPercent = &nodeReserveCpuPercent } + reserveMemPercent := o.reserveResource.MemPercent + if nodeReserveMemPercent, ok := getReserveResourcePercentFromNodeAnnotations(node.GetAnnotations(), v1.ResourceMemory.String()); ok { + reserveMemPercent = &nodeReserveMemPercent + } + extResourceFrom := map[string]int64{} for resourceName, value := range tspCanNotBeReclaimedResource { + klog.V(6).Infof("resourcename is %s", resourceName) resourceFrom := "tsp" maxUsage := value if localCanNotBeReclaimedResource[resourceName] > maxUsage { @@ -212,14 +219,19 @@ func (o *NodeResourceManager) BuildNodeStatus(node *v1.Node) map[string]int64 { var nextRecommendation float64 switch resourceName { case v1.ResourceCPU: - if reserveCpuPercent != nil { + if *reserveCpuPercent != 0 { nextRecommendation = float64(node.Status.Allocatable.Cpu().Value()) - float64(node.Status.Allocatable.Cpu().Value())*(*reserveCpuPercent) - maxUsage/1000 } else { nextRecommendation = float64(node.Status.Allocatable.Cpu().Value()) - maxUsage/1000 } case v1.ResourceMemory: // unit of memory in prometheus is in Ki, need to be converted to byte - nextRecommendation = float64(node.Status.Allocatable.Memory().Value()) - (maxUsage * 1000) + if *reserveMemPercent != 0 { + nextRecommendation = float64(node.Status.Allocatable.Memory().Value()) - float64(node.Status.Allocatable.Memory().Value())*(*reserveMemPercent) - maxUsage/1000 + } else { + klog.V(6).Infof("allocatable mem is %d, maxusage is %f", node.Status.Allocatable.Memory().Value(), maxUsage) + nextRecommendation = float64(node.Status.Allocatable.Memory().Value()) - maxUsage + } default: continue } @@ -234,10 +246,18 @@ func (o *NodeResourceManager) BuildNodeStatus(node *v1.Node) map[string]int64 { nextRecommendation)/float64(resValue.Value()) <= MinDeltaRatio { continue } - node.Status.Capacity[v1.ResourceName(extResourceName)] = - *resource.NewQuantity(int64(nextRecommendation), resource.DecimalSI) - 
node.Status.Allocatable[v1.ResourceName(extResourceName)] = - *resource.NewQuantity(int64(nextRecommendation), resource.DecimalSI) + switch resourceName { + case v1.ResourceCPU: + node.Status.Capacity[v1.ResourceName(extResourceName)] = + *resource.NewQuantity(int64(nextRecommendation), resource.DecimalSI) + node.Status.Allocatable[v1.ResourceName(extResourceName)] = + *resource.NewQuantity(int64(nextRecommendation), resource.DecimalSI) + case v1.ResourceMemory: + node.Status.Capacity[v1.ResourceName(extResourceName)] = + *resource.NewQuantity(int64(nextRecommendation), resource.BinarySI) + node.Status.Allocatable[v1.ResourceName(extResourceName)] = + *resource.NewQuantity(int64(nextRecommendation), resource.BinarySI) + } extResourceFrom[resourceFrom+"-"+resourceName.String()] = int64(nextRecommendation) } @@ -297,8 +317,37 @@ func (o *NodeResourceManager) GetCanNotBeReclaimedResourceFromTsp(node *v1.Node) func (o *NodeResourceManager) GetCanNotBeReclaimedResourceFromLocal() map[v1.ResourceName]float64 { return map[v1.ResourceName]float64{ v1.ResourceCPU: o.GetCpuCoreCanNotBeReclaimedFromLocal(), - v1.ResourceMemory: 0, + v1.ResourceMemory: o.GetMemCanNotBeReclaimedFromLocal(), + } +} + +func (o *NodeResourceManager) GetMemCanNotBeReclaimedFromLocal() float64 { + var memUsageTotal float64 + memUsage, ok := o.state[string(types.MetricNameMemoryTotalUsage)] + if ok { + memUsageTotal = memUsage[0].Samples[0].Value + klog.V(4).Infof("%s: %f", types.MetricNameMemoryTotalUsage, memUsageTotal) + + } else { + klog.V(4).Infof("Can't get %s from NodeResourceManager local state", types.MetricNameMemoryTotalUsage) } + + var extResContainerMemUsageTotal float64 = 0 + extResContainerCpuUsageTotalTimeSeries, ok := o.state[string(types.MetricNameExtResContainerMemTotalUsage)] + if ok { + extResContainerMemUsageTotal = extResContainerCpuUsageTotalTimeSeries[0].Samples[0].Value + } else { + klog.V(4).Infof("Can't get %s from NodeResourceManager local state", 
types.MetricNameExtResContainerCpuTotalUsage) + } + + klog.V(6).Infof("nodeMemUsageTotal: %f, extResContainerMemUsageTotal: %f", memUsageTotal, extResContainerMemUsageTotal) + + // 1. Exclusive tethered CPU cannot be reclaimed even if the free part is free, so add the exclusive CPUIdle to the CanNotBeReclaimed CPU + // 2. The CPU used by extRes-container needs to be reclaimed, otherwise it will be double-counted due to the allotted mechanism of k8s, so the extResContainerCpuUsageTotal is subtracted from the CanNotBeReclaimedCpu + nodeMemCannotBeReclaimedSeconds := memUsageTotal - extResContainerMemUsageTotal + + metrics.UpdateNodeMemCannotBeReclaimedSeconds(nodeMemCannotBeReclaimedSeconds) + return nodeMemCannotBeReclaimedSeconds } func (o *NodeResourceManager) GetCpuCoreCanNotBeReclaimedFromLocal() float64 { diff --git a/pkg/utils/cgroup.go b/pkg/utils/cgroup.go new file mode 100644 index 000000000..7b1b1d10e --- /dev/null +++ b/pkg/utils/cgroup.go @@ -0,0 +1,111 @@ +package utils + +import ( + "fmt" + "path" + "strings" + + v1 "k8s.io/api/core/v1" + "k8s.io/apimachinery/pkg/types" +) + +func (cgroupName CgroupName) ToCgroupfs() string { + return "/" + path.Join(cgroupName...) 
+} + +func GetCgroupPath(p *v1.Pod, cgroupDriver string) string { + cgroupName := GetCgroupName(p) + switch cgroupDriver { + case "systemd": + return cgroupName.ToSystemd() + case "cgroupfs": + return cgroupName.ToCgroupfs() + default: + return "" + } +} + +var RootCgroupName = CgroupName([]string{}) + +func GetCgroupName(p *v1.Pod) CgroupName { + switch p.Status.QOSClass { + case v1.PodQOSGuaranteed: + return NewCgroupName(RootCgroupName, CgroupKubePods, GetPodCgroupNameSuffix(p.UID)) + case v1.PodQOSBurstable: + return NewCgroupName(RootCgroupName, CgroupKubePods, strings.ToLower(string(v1.PodQOSBurstable)), GetPodCgroupNameSuffix(p.UID)) + case v1.PodQOSBestEffort: + return NewCgroupName(RootCgroupName, CgroupKubePods, strings.ToLower(string(v1.PodQOSBestEffort)), GetPodCgroupNameSuffix(p.UID)) + default: + return RootCgroupName + } +} + +const ( + podCgroupNamePrefix = "pod" +) + +func GetPodCgroupNameSuffix(podUID types.UID) string { + return podCgroupNamePrefix + string(podUID) +} + +type CgroupName []string + +func NewCgroupName(base CgroupName, components ...string) CgroupName { + return append(append([]string{}, base...), components...) +} + +// systemdSuffix is the cgroup name suffix for systemd +const systemdSuffix string = ".slice" + +func (cgroupName CgroupName) ToSystemd() string { + if len(cgroupName) == 0 || (len(cgroupName) == 1 && cgroupName[0] == "") { + return "/" + } + newparts := []string{} + for _, part := range cgroupName { + part = escapeSystemdCgroupName(part) + newparts = append(newparts, part) + } + + result, err := ExpandSlice(strings.Join(newparts, "-") + systemdSuffix) + if err != nil { + // Should never happen... 
+ panic(fmt.Errorf("error converting cgroup name [%v] to systemd format: %v", cgroupName, err)) + } + return result +} + +func escapeSystemdCgroupName(part string) string { + return strings.Replace(part, "-", "_", -1) +} + +func ExpandSlice(slice string) (string, error) { + suffix := ".slice" + // Name has to end with ".slice", but can't be just ".slice". + if len(slice) < len(suffix) || !strings.HasSuffix(slice, suffix) { + return "", fmt.Errorf("invalid slice name: %s", slice) + } + + // Path-separators are not allowed. + if strings.Contains(slice, "/") { + return "", fmt.Errorf("invalid slice name: %s", slice) + } + + var path, prefix string + sliceName := strings.TrimSuffix(slice, suffix) + // if input was -.slice, we should just return root now + if sliceName == "-" { + return "/", nil + } + for _, component := range strings.Split(sliceName, "-") { + // test--a.slice isn't permitted, nor is -test.slice. + if component == "" { + return "", fmt.Errorf("invalid slice name: %s", slice) + } + + // Append the component to the path and to the prefix. 
+ path += "/" + prefix + component + suffix + prefix += component + "-" + } + return path, nil +} diff --git a/pkg/utils/pod.go b/pkg/utils/pod.go index a6505e204..047c37494 100644 --- a/pkg/utils/pod.go +++ b/pkg/utils/pod.go @@ -157,6 +157,30 @@ func GetExtCpuRes(container v1.Container) (resource.Quantity, bool) { return val, true } } + + for res, val := range container.Resources.Requests { + if strings.HasPrefix(res.String(), fmt.Sprintf(ExtResourcePrefixFormat, v1.ResourceCPU)) && val.Value() != 0 { + return val, true + } + } + + return resource.Quantity{}, false +} + +// GetExtMemRes get container's gocrane.io/memory usage +func GetExtMemRes(container v1.Container) (resource.Quantity, bool) { + for res, val := range container.Resources.Limits { + if strings.HasPrefix(res.String(), fmt.Sprintf(ExtResourcePrefixFormat, v1.ResourceMemory)) && val.Value() != 0 { + return val, true + } + } + + for res, val := range container.Resources.Requests { + if strings.HasPrefix(res.String(), fmt.Sprintf(ExtResourcePrefixFormat, v1.ResourceMemory)) && val.Value() != 0 { + return val, true + } + } + return resource.Quantity{}, false } @@ -165,6 +189,7 @@ func GetContainerNameFromPod(pod *v1.Pod, containerId string) string { return "" } + // for docker for _, v := range pod.Status.ContainerStatuses { strList := strings.Split(v.ContainerID, "//") if len(strList) > 0 { @@ -174,6 +199,18 @@ func GetContainerNameFromPod(pod *v1.Pod, containerId string) string { } } + // for containerd + for _, v := range pod.Status.ContainerStatuses { + strList := strings.Split(v.ContainerID, "//") + if len(strList) > 0 { + klog.V(6).Infof("cri-containerd is %s ", "cri-containerd-"+strList[len(strList)-1]+".scope") + klog.V(6).Infof("containerid is %s", containerId) + if "cri-containerd-"+strList[len(strList)-1]+".scope" == containerId { + klog.V(6).Infof("matched containerd container %s in pod %s", v.Name, pod.Name) + return v.Name + } + } + } return "" } @@ -189,7 +226,7 @@ func GetContainerFromPod(pod *v1.Pod, containerName string) 
*v1.Container { return nil } -// GetExtCpuRes get container's gocrane.io/cpu usage +// GetContainerExtCpuResFromPod get container's gocrane.io/cpu usage func GetContainerExtCpuResFromPod(pod *v1.Pod, containerName string) (resource.Quantity, bool) { c := GetContainerFromPod(pod, containerName) if c == nil { @@ -198,6 +235,15 @@ func GetContainerExtCpuResFromPod(pod *v1.Pod, containerName string) (resource.Q return GetExtCpuRes(*c) } +// GetContainerExtMemResFromPod get container's gocrane.io/memory usage +func GetContainerExtMemResFromPod(pod *v1.Pod, containerName string) (resource.Quantity, bool) { + c := GetContainerFromPod(pod, containerName) + if c == nil { + return resource.Quantity{}, false + } + return GetExtMemRes(*c) +} + func GetContainerStatus(pod *v1.Pod, container v1.Container) v1.ContainerState { for _, cs := range pod.Status.ContainerStatuses { if cs.Name == container.Name { From 729c46c777debb555455babf964939f358bc4e18 Mon Sep 17 00:00:00 2001 From: kaiyuechen Date: Thu, 15 Sep 2022 10:49:50 +0800 Subject: [PATCH 2/2] Watermark of cpu usage percent --- .../elastic-pod-qos.yaml | 13 ++++ .../eviction-action.yaml | 9 +++ .../evict-on-cpu-usage-percent/pod.yaml | 33 +++++++++ .../evict-on-cpu-usage-percent/waterline.yaml | 18 +++++ pkg/ensurance/analyzer/analyzer.go | 1 - pkg/ensurance/collector/nodelocal/cpu.go | 1 + pkg/ensurance/collector/types/types.go | 1 + pkg/ensurance/executor/cpu_usage.go | 4 +- pkg/ensurance/executor/cpu_usage_percent.go | 70 +++++++++++++++++++ pkg/ensurance/executor/evict.go | 3 +- pkg/ensurance/executor/watermark.go | 18 ++++- 11 files changed, 163 insertions(+), 8 deletions(-) create mode 100644 examples/ensurance/evict-on-cpu-usage-percent/elastic-pod-qos.yaml create mode 100644 examples/ensurance/evict-on-cpu-usage-percent/eviction-action.yaml create mode 100644 examples/ensurance/evict-on-cpu-usage-percent/pod.yaml create mode 100644 examples/ensurance/evict-on-cpu-usage-percent/waterline.yaml create mode 100644 
pkg/ensurance/executor/cpu_usage_percent.go diff --git a/examples/ensurance/evict-on-cpu-usage-percent/elastic-pod-qos.yaml b/examples/ensurance/evict-on-cpu-usage-percent/elastic-pod-qos.yaml new file mode 100644 index 000000000..29ca6bcd9 --- /dev/null +++ b/examples/ensurance/evict-on-cpu-usage-percent/elastic-pod-qos.yaml @@ -0,0 +1,13 @@ +apiVersion: ensurance.crane.io/v1alpha1 +kind: PodQOS +metadata: + name: all-elastic-pods +spec: + allowedActions: + - eviction + resourceQOS: + cpuQOS: + cpuPriority: 7 + labelSelector: + matchLabels: + preemptible_job: "true" \ No newline at end of file diff --git a/examples/ensurance/evict-on-cpu-usage-percent/eviction-action.yaml b/examples/ensurance/evict-on-cpu-usage-percent/eviction-action.yaml new file mode 100644 index 000000000..c9e715fec --- /dev/null +++ b/examples/ensurance/evict-on-cpu-usage-percent/eviction-action.yaml @@ -0,0 +1,9 @@ +apiVersion: ensurance.crane.io/v1alpha1 +kind: AvoidanceAction +metadata: + name: eviction +spec: + coolDownSeconds: 300 + description: evict low priority pods + eviction: + terminationGracePeriodSeconds: 30 \ No newline at end of file diff --git a/examples/ensurance/evict-on-cpu-usage-percent/pod.yaml b/examples/ensurance/evict-on-cpu-usage-percent/pod.yaml new file mode 100644 index 000000000..ef73009c0 --- /dev/null +++ b/examples/ensurance/evict-on-cpu-usage-percent/pod.yaml @@ -0,0 +1,33 @@ +apiVersion: v1 +kind: Pod +metadata: + name: low + labels: + k8s-app: low + preemptible_job: "true" +spec: + containers: + - command: + - stress-ng + - -c + - "2" + - --cpu-method + - cpuid + image: ccr.ccs.tencentyun.com/tkeimages/stress-ng:v0.12.09.ln + imagePullPolicy: IfNotPresent + name: low + resources: + limits: + gocrane.io/cpu: "2" + gocrane.io/memory: "2000Mi" + requests: + gocrane.io/cpu: "2" + gocrane.io/memory: "2000Mi" + terminationMessagePath: /dev/termination-log + terminationMessagePolicy: File + dnsPolicy: ClusterFirst + restartPolicy: Always + schedulerName: 
default-scheduler + serviceAccount: default + serviceAccountName: default + terminationGracePeriodSeconds: 30 \ No newline at end of file diff --git a/examples/ensurance/evict-on-cpu-usage-percent/waterline.yaml b/examples/ensurance/evict-on-cpu-usage-percent/waterline.yaml new file mode 100644 index 000000000..874a07907 --- /dev/null +++ b/examples/ensurance/evict-on-cpu-usage-percent/waterline.yaml @@ -0,0 +1,18 @@ +apiVersion: ensurance.crane.io/v1alpha1 +kind: NodeQOS +metadata: + name: eviction-on-high-cpu-usage-percent +spec: + nodeQualityProbe: + nodeLocalGet: + localCacheTTLSeconds: 60 + timeoutSeconds: 10 + rules: + - actionName: eviction + avoidanceThreshold: 2 + metricRule: + name: cpu_total_utilization + value: 50 + name: cpu-usage-percent + restoreThreshold: 2 + strategy: None \ No newline at end of file diff --git a/pkg/ensurance/analyzer/analyzer.go b/pkg/ensurance/analyzer/analyzer.go index 5c153c23d..519431660 100644 --- a/pkg/ensurance/analyzer/analyzer.go +++ b/pkg/ensurance/analyzer/analyzer.go @@ -475,7 +475,6 @@ func (s *AnomalyAnalyzer) getEvictPods(triggered bool, action *ensuranceapi.Avoi } for _, pod := range filteredPods { evictPods = append(evictPods, podinfo.BuildPodActionContext(pod, stateMap, action, podinfo.Evict)) - } } return evictPods diff --git a/pkg/ensurance/collector/nodelocal/cpu.go b/pkg/ensurance/collector/nodelocal/cpu.go index 8fa7446c1..a685d5af4 100644 --- a/pkg/ensurance/collector/nodelocal/cpu.go +++ b/pkg/ensurance/collector/nodelocal/cpu.go @@ -109,6 +109,7 @@ func collectCPU(nodeLocalContext *nodeLocalContext) (map[string][]common.TimeSer data[string(types.MetricNameCpuTotalUsage)] = []common.TimeSeries{{Samples: []common.Sample{{Value: usageCore, Timestamp: now.Unix()}}}} data[string(types.MetricNameCpuTotalUtilization)] = []common.TimeSeries{{Samples: []common.Sample{{Value: usagePercent, Timestamp: now.Unix()}}}} data[string(types.MetricNameExclusiveCPUIdle)] = []common.TimeSeries{{Samples: 
[]common.Sample{{Value: exclusiveCPUIdle, Timestamp: now.Unix()}}}} + data[string(types.MetricNameCpuCoreNumbers)] = []common.TimeSeries{{Samples: []common.Sample{{Value: float64(nodeState.cpuCoreNumbers), Timestamp: now.Unix()}}}} return data, nil } diff --git a/pkg/ensurance/collector/types/types.go b/pkg/ensurance/collector/types/types.go index 681b0367b..d841cc235 100644 --- a/pkg/ensurance/collector/types/types.go +++ b/pkg/ensurance/collector/types/types.go @@ -18,6 +18,7 @@ const ( MetricNameCpuLoad1Min MetricName = "cpu_load_1_min" MetricNameCpuLoad5Min MetricName = "cpu_load_5_min" MetricNameCpuLoad15Min MetricName = "cpu_load_15_min" + MetricNameCpuCoreNumbers MetricName = "cpu_core_numbers" MetricNameExclusiveCPUIdle MetricName = "exclusive_cpu_idle" diff --git a/pkg/ensurance/executor/cpu_usage.go b/pkg/ensurance/executor/cpu_usage.go index 65bd1f0cf..828148462 100644 --- a/pkg/ensurance/executor/cpu_usage.go +++ b/pkg/ensurance/executor/cpu_usage.go @@ -31,7 +31,7 @@ var cpuUsage = metric{ Evictable: true, EvictQuantified: true, - EvictFunc: evictPod, + EvictFunc: cpuUsageEvictPod, } func throttleOnePodCpu(ctx *ExecuteContext, index int, ThrottleDownPods ThrottlePods, totalReleasedResource *ReleaseResource) (errPodKeys []string, released ReleaseResource) { @@ -198,7 +198,7 @@ func restoreOnePodCpu(ctx *ExecuteContext, index int, ThrottleUpPods ThrottlePod return } -func evictPod(wg *sync.WaitGroup, ctx *ExecuteContext, index int, totalReleasedResource *ReleaseResource, EvictPods EvictPods) (errPodKeys []string, released ReleaseResource) { +func cpuUsageEvictPod(wg *sync.WaitGroup, ctx *ExecuteContext, index int, totalReleasedResource *ReleaseResource, EvictPods EvictPods) (errPodKeys []string, released ReleaseResource) { wg.Add(1) // Calculate release resources diff --git a/pkg/ensurance/executor/cpu_usage_percent.go b/pkg/ensurance/executor/cpu_usage_percent.go new file mode 100644 index 000000000..e7bd18cb0 --- /dev/null +++ 
b/pkg/ensurance/executor/cpu_usage_percent.go @@ -0,0 +1,70 @@ +package executor + +import ( + "sync" + + "k8s.io/klog/v2" + + "github.com/gocrane/crane/pkg/ensurance/executor/podinfo" + "github.com/gocrane/crane/pkg/ensurance/executor/sort" + "github.com/gocrane/crane/pkg/metrics" + "github.com/gocrane/crane/pkg/utils" +) + +func init() { + registerMetricMap(cpuUsagePercent) +} + +var cpuUsagePercent = metric{ + Name: CpuUsagePercent, + ActionPriority: 5, + Sortable: true, + SortFunc: sort.CpuUsageSort, + + Throttleable: false, + ThrottleQuantified: false, + ThrottleFunc: nil, + RestoreFunc: nil, + + Evictable: true, + EvictQuantified: true, + EvictFunc: cpuUsagePercentEvictPod, +} + +func cpuUsagePercentEvictPod(wg *sync.WaitGroup, ctx *ExecuteContext, index int, totalReleasedResource *ReleaseResource, EvictPods EvictPods) (errPodKeys []string, released ReleaseResource) { + wg.Add(1) + + // Calculate release resources + released = releaseCPUUsagePercent(EvictPods[index]) + totalReleasedResource.Add(released) + + go func(evictPod podinfo.PodContext) { + defer wg.Done() + + pod, err := ctx.PodLister.Pods(evictPod.Key.Namespace).Get(evictPod.Key.Name) + if err != nil { + errPodKeys = append(errPodKeys, "not found ", evictPod.Key.String()) + return + } + klog.Warningf("Evicting pod %v", evictPod.Key) + err = utils.EvictPodWithGracePeriod(ctx.Client, pod, evictPod.DeletionGracePeriodSeconds) + if err != nil { + errPodKeys = append(errPodKeys, "evict failed ", evictPod.Key.String()) + klog.Warningf("Failed to evict pod %s: %v", evictPod.Key.String(), err) + return + } + metrics.ExecutorEvictCountsInc() + + klog.Warningf("Pod %s is evicted", klog.KObj(pod)) + }(EvictPods[index]) + return +} + +func releaseCPUUsagePercent(pod podinfo.PodContext) ReleaseResource { + if pod.ActionType == podinfo.Evict { + return ReleaseResource{ + CpuUsagePercent: pod.PodCPUUsage * CpuQuotaCoefficient, + } + } + return ReleaseResource{} +} diff --git a/pkg/ensurance/executor/evict.go 
b/pkg/ensurance/executor/evict.go index 984bb2b76..350c08b60 100644 --- a/pkg/ensurance/executor/evict.go +++ b/pkg/ensurance/executor/evict.go @@ -92,12 +92,11 @@ func (e *EvictExecutor) Avoid(ctx *ExecuteContext) error { execsort.GeneralSorter(e.EvictPods) } - klog.V(6).Info("After sort, the sequence to evict is ") for _, pc := range e.EvictPods { klog.V(6).Info(pc.Key.String()) } for !ctx.ToBeEvict.TargetGapsRemoved(m) { - klog.V(2).Infof("For metric %s, there is more gap to watermarks: %f of %s", m, ctx.ToBeEvict[m], m) + klog.V(2).Infof("For metric %s, there is gap %f to watermarks %s", m, ctx.ToBeEvict[m], m) if podinfo.ContainsNoExecutedPod(e.EvictPods) { index := podinfo.GetFirstPendingPod(e.EvictPods) errKeys, released = metricMap[m].EvictFunc(&wg, ctx, index, &totalReleased, e.EvictPods) diff --git a/pkg/ensurance/executor/watermark.go b/pkg/ensurance/executor/watermark.go index 5ad7fe8a8..cbafe8738 100644 --- a/pkg/ensurance/executor/watermark.go +++ b/pkg/ensurance/executor/watermark.go @@ -16,8 +16,9 @@ type WatermarkMetric string // Be consistent with metrics in collector/types/types.go const ( - CpuUsage = WatermarkMetric(types.MetricNameCpuTotalUsage) - MemUsage = WatermarkMetric(types.MetricNameMemoryTotalUsage) + CpuUsage = WatermarkMetric(types.MetricNameCpuTotalUsage) + CpuUsagePercent = WatermarkMetric(types.MetricNameCpuTotalUtilization) + MemUsage = WatermarkMetric(types.MetricNameMemoryTotalUsage) ) const ( @@ -163,7 +164,18 @@ func calculateGaps(stateMap map[string][]common.TimeSeries, delete(result, m.Name) } else { klog.V(6).Infof("BuildEvictWatermarkGap: For metrics %+v, maxUsed is %f, watermark is %f", m, maxUsed, float64(evictWatermark.PopSmallest().Value())) - result[m.Name] = (1 + executeExcessPercent) * (maxUsed - float64(evictWatermark.PopSmallest().Value())) + if m.Name == CpuUsagePercent { + cpuCoreNums, ok := stateMap[string(types.MetricNameCpuCoreNumbers)] + if !ok { + klog.Warningf("Can't get MetricNameCpuCoreNumbers") + } 
else { + cpuPercentToUsage := (1 + executeExcessPercent) * (maxUsed - float64(evictWatermark.PopSmallest().Value())) * cpuCoreNums[0].Samples[0].Value * 1000 / types.MaxPercentage + result[m.Name] = cpuPercentToUsage + klog.V(6).Infof("maxUsed is %f, watermark is %f, cpuPercentToUsageGap is %f", maxUsed, float64(evictWatermark.PopSmallest().Value()), cpuPercentToUsage) + } + } else { + result[m.Name] = (1 + executeExcessPercent) * (maxUsed - float64(evictWatermark.PopSmallest().Value())) + } } } } else if throttleExecutor != nil {