From 586c9822957c3e0f7af09b4417712f57b42b161f Mon Sep 17 00:00:00 2001 From: Swati Sehgal Date: Fri, 25 Jun 2021 20:24:36 +0100 Subject: [PATCH] Updates made based on first round of review Signed-off-by: Swati Sehgal --- keps/prod-readiness/sig-scheduling/2044.yaml | 3 + .../README.md | 849 +++++++++++++----- .../kep.yaml | 54 +- 3 files changed, 640 insertions(+), 266 deletions(-) create mode 100644 keps/prod-readiness/sig-scheduling/2044.yaml diff --git a/keps/prod-readiness/sig-scheduling/2044.yaml b/keps/prod-readiness/sig-scheduling/2044.yaml new file mode 100644 index 00000000000..cf3acacf7e1 --- /dev/null +++ b/keps/prod-readiness/sig-scheduling/2044.yaml @@ -0,0 +1,3 @@ +kep-number: 2044 +alpha: + approver: "@wojtek-t" diff --git a/keps/sig-scheduling/2044-topology-awareness-in-kube-scheduler/README.md b/keps/sig-scheduling/2044-topology-awareness-in-kube-scheduler/README.md index 3744a775ea1..d50ab638c6b 100644 --- a/keps/sig-scheduling/2044-topology-awareness-in-kube-scheduler/README.md +++ b/keps/sig-scheduling/2044-topology-awareness-in-kube-scheduler/README.md @@ -1,32 +1,84 @@ # Topology awareness in Kube-scheduler +- [Release Signoff Checklist](#release-signoff-checklist) - [Summary](#summary) - [Motivation](#motivation) - - [Goals](#goals) - - [Non-Goals](#non-goals) +- [Goals](#goals) +- [Non-Goals](#non-goals) - [Proposal](#proposal) - - [Changes to the API](changes-to-the-api) - - [Scheduler Plugin implementation details](#scheduler-plugin-implementation-details) - - [Description of the Scheduling Algorithm](#description-of-the-scheduling-algorithm) -- [Alternative Solution](#alternative-solution) - - [Exporter Daemon Implementation Details](#exporter-daemon-implementation-details) - - [Topology format](#topology-format) - - [CRD API](#crd-api) + - [User Stories](#user-stories) + - [Risks and Mitigations](#risks-and-mitigations) +- [Design Details](#design-details) + - [Design Consideration](#design-consideration) + - [Changes to the API](#changes-to-the-api) - [Plugin implementation details](#plugin-implementation-details) - - [Topology information in the NodeResourceTopologyMatch plugin](#topology-information-in-the-noderesourcetopologymatch-plugin) - - [Description of the Algorithm](#description-of-the-algorithm) - - [Accessing NodeResourceTopology CRD](#accessing-noderesourcetopology-crd) -- [Use cases](#use-cases) -- [Known limitations](#known-limitations) -- [Test plans](#test-plans) -- [Graduation criteria](#graduation-criteria) + - [Description of the Algorithm](#description-of-the-algorithm) + - [Test Plan](#test-plan) + - [Graduation Criteria](#graduation-criteria) + - [Alpha](#alpha) + - [Beta](#beta) + - [GA](#ga) + - [Upgrade / Downgrade Strategy](#upgrade--downgrade-strategy) + - [Version Skew Strategy](#version-skew-strategy) - [Production Readiness Review Questionnaire](#production-readiness-review-questionnaire) - - [Feature enablement and rollback](#feature-enablement-and-rollback) + - [Feature Enablement and Rollback](#feature-enablement-and-rollback) + - [Rollout, Upgrade and Rollback Planning](#rollout-upgrade-and-rollback-planning) + - [Monitoring Requirements](#monitoring-requirements) + - [Dependencies](#dependencies) + - [Scalability](#scalability) + - [Troubleshooting](#troubleshooting) - [Implementation history](#implementation-history) +- [Drawbacks](#drawbacks) +- [Alternatives](#alternatives) + - [Enable this capability out-of-tree](#enable-this-capability-out-of-tree) + - [1:1 worker pod to node 
assignment](#11-worker-pod-to-node-assignment) +- [Infrastructure Needed](#infrastructure-needed) -# Summary +## Release Signoff Checklist + + + +Items marked with (R) are required *prior to targeting to a milestone / release*. + +- [ ] (R) Enhancement issue in release milestone, which links to KEP dir in [kubernetes/enhancements] (not the initial KEP PR) +- [ ] (R) KEP approvers have approved the KEP status as `implementable` +- [ ] (R) Design details are appropriately documented +- [ ] (R) Test plan is in place, giving consideration to SIG Architecture and SIG Testing input (including test refactors) + - [ ] e2e Tests for all Beta API Operations (endpoints) + - [ ] (R) Ensure GA e2e tests for meet requirements for [Conformance Tests](https://github.com/kubernetes/community/blob/master/contributors/devel/sig-architecture/conformance-tests.md) + - [ ] (R) Minimum Two Week Window for GA e2e tests to prove flake free +- [ ] (R) Graduation criteria is in place + - [ ] (R) [all GA Endpoints](https://github.com/kubernetes/community/pull/1806) must be hit by [Conformance Tests](https://github.com/kubernetes/community/blob/master/contributors/devel/sig-architecture/conformance-tests.md) +- [ ] (R) Production readiness review completed +- [ ] (R) Production readiness review approved +- [ ] "Implementation History" section is up-to-date for milestone +- [ ] User-facing documentation has been created in [kubernetes/website], for publication to [kubernetes.io] +- [ ] Supporting documentation—e.g., additional design documents, links to mailing list discussions/SIG meetings, relevant PRs/issues, release notes + + + +[kubernetes.io]: https://kubernetes.io/ +[kubernetes/enhancements]: https://git.k8s.io/enhancements +[kubernetes/kubernetes]: https://git.k8s.io/kubernetes +[kubernetes/website]: https://git.k8s.io/website + +## Summary Kubernetes clusters composed of nodes with complex hardware topology are becoming more prevalent. [Topology Manager](https://kubernetes.io/docs/tasks/administer-cluster/topology-manager/) was @@ -41,9 +93,14 @@ further context on how runaway pods are created because the scheduler is topolog In order to address this issue, scheduler needs to choose a node considering resource availability along with underlying resource topology and Topology Manager policy on the worker node. -This document describes behaviour of the Kubernetes Scheduler which takes worker node topology into account. +This enhancement proposes changes to make kube-scheduler aware of node NUMA topology when making scheduling decisions -# Motivation +The changes/artifacts proposed as part of this KEP are: +1. A new scheduler plugin that makes topology-aware placement decisions +2. A new resource object, `NodeResourceTopology` to communicate NUMA status between kubelet and kube-scheduler +3. Kubelet changes to populate `NodeResourceTopology` + +## Motivation After Topology Manager was introduced, the problem of launching pod in the cluster where worker nodes have different NUMA topology and different amount of resources in that topology became @@ -58,39 +115,84 @@ informed scheduling decision. This KEP describes how it would be implemented. ## Goals -- Make scheduling process more precise when we have NUMA topology on the -worker node. -- Enhance the node object to capture topology information which can be referred to -by the scheduler. +- Make scheduling process more precise when we have NUMA topology on the worker node. 
 ## Non-Goals
 
 - Change the PodSpec to allow requesting a specific node topology manager policy
-- This Proposal requires exposing NUMA topology information. This KEP doesn't
-describe how to expose all necessary information it just declare what kind of
-information is necessary.
-- Changes to the TopologyManager and its policies.
+- Changes to the Topology Manager and its policies.
+- API changes to the operating system or external components, as all the required
+  information is already available to Kubelet on Linux flavours that support NUMA nodes.
+- Enabling Windows nodes to support CPU Manager or Memory Manager and, by extension,
+  Topology-aware scheduling in general.
 
-# Proposal
+## Proposal
 
 Kube-scheduler plugin will be moved from kuberntes-sigs/scheduler-plugin (or out-of-tree)
-plugin into the main tree as a built-in plugin. This plugin implements a simplified version of Topology Manager and hence is different from original topology manager algorithm. Plugin would
-be disabled by default and when enabled would check for the ability to run pod only in case of single-numa-node policy on the node, since it is the most strict policy, it implies that the launch on the node with other existing policies will be successful if the condition for single-numa-node policy passed for the worker node.
+plugin into the main tree as a built-in plugin. This plugin implements a simplified version
+of Topology Manager and hence differs from the original Topology Manager algorithm. The plugin
+would be disabled by default and, when enabled, would check the ability to run a pod only when
+the single-numa-node Topology Manager policy is configured on the node. Since this is the
+strictest policy, a pod that satisfies it will also launch successfully on nodes configured
+with any of the other policies.
 
-To work, this plugin requires topology information of the available resource on the worker nodes.
+To work, this plugin requires topology information of the available resources for each NUMA cell on worker nodes.
 
 Kubelet will be responsible for collecting all necessary resource information of the pods,
 based on allocatable resources on the node and allocated resources to pods. The NUMA nodes
-would be represented as Zones in Kubelet and the NodeResourceTopology would capture the
-resource information at a zone level granularity.
+would be represented as NUMA cells in Kubelet and the NodeResourceTopology would capture the
+resource information at a NUMA cell level granularity.
 
 Once the information is captured in the NodeResourceTopology API, the scheduler can refer
 to it like it refers to Node Capacity and Allocatable while making a Topology-aware Scheduling
 decision.
 
+### User Stories
+
+As a Kubernetes cluster operator managing a cluster with multiple bare metal worker nodes with NUMA
+topology and Topology Manager enabled, I want the scheduler to be Topology-aware in order to ensure that
+pods are only placed on nodes where the requested resources can be aligned by Topology Manager according to its policy.
+The scheduler shouldn't send a pod to a node where kubelet will reject it with a "Topology Affinity Error".
+Such rejections lead to runaway pod creation when the pod is part of a Deployment or ReplicaSet, as the associated
+controller notices the pod failure and keeps creating replacement pods.
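+
+To make this concrete, the sketch below shows roughly what a per-node NodeResourceTopology object could look like
+for a two-cell worker node. It is illustrative only: the `apiVersion`, the policy string, and the exact serialization
+of `cells` are assumptions here and are pinned down by the API definition in [Changes to the API](#changes-to-the-api).
+
+```yaml
+# Hypothetical NodeResourceTopology for worker node "worker-1" (illustrative sketch,
+# not the final serialization). One object is published per worker node and is
+# named after that node.
+apiVersion: node.k8s.io/v1        # placeholder group/version
+kind: NodeResourceTopology
+metadata:
+  name: worker-1                  # same name as the worker node
+topologyPolicies: ["single-numa-node"]
+cells:
+  node-0:                         # NUMA cell 0
+    name: node-0
+    type: Node
+    costs:
+      - name: node-0
+        value: 10
+      - name: node-1
+        value: 20
+    resources:
+      - name: cpu
+        capacity: "4"
+        allocatable: "4"
+      - name: example.com/deviceA
+        capacity: "2"
+        allocatable: "2"
+  node-1:                         # NUMA cell 1
+    name: node-1
+    type: Node
+    costs:
+      - name: node-0
+        value: 20
+      - name: node-1
+        value: 10
+    resources:
+      - name: cpu
+        capacity: "4"
+        allocatable: "2"
+```
+
+With such an object, the scheduler can see that a pod requesting four exclusive CPUs plus one `example.com/deviceA`
+can only be aligned on cell `node-0` of this node, something plain Node allocatable would not reveal.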
+
+
+### Risks and Mitigations
+
+Topology Manager on the worker node knows the exact resources, and the NUMA cells from which they are allocated, for each pod,
+but this node resource topology information reaches the Topology-aware scheduler plugin with some latency: the scheduler does
+not see the actual NUMA-level availability until kubelet has re-evaluated the available resources per NUMA cell, which
+could still lead to scheduling of pods to nodes where they won't be admitted by Topology Manager.
+
+This can be mitigated if kube-scheduler provides a hint of which NUMA cell a pod should be assigned to and Topology Manager on the
+worker node takes that hint into account.
+
+## Design Details
+
+- Add a new flag in Kubelet, `ExposeNodeResourceTopology`, in the kubelet config (or the command line argument `expose-noderesourcetopology`), which allows
+  the user to specify whether Kubelet should compute and expose resource hardware topology information.
+- The `ExposeNodeResourceTopology` flag received from the kubelet config/command line args is propagated to the Container Manager.
+- Based on the resources allocated to a pod, the topology associated with those resources is evaluated and populated as part of the NodeResourceTopology.
+- Kubelet will collect information about resources allocated to running pods along with their topology; based on allocatable resources of the node and the
+  resources consumed by pods, it will populate the available resources with the associated topology information in NodeResourceTopology, where a
+  NodeResourceTopology instance represents a worker node.
+  The name of the NodeResourceTopology instance is the name of the worker node.
+- A new in-tree scheduler plugin `NodeResourceTopologyMatch` is created that makes topology-aware placement decisions. It implements a simplified version of Topology
+  Manager as part of the Filter extension point to filter out nodes that are not suitable for the workload, based on the resource request and the obtained
+  NodeResourceTopology information corresponding to that worker node. A Score extension point that determines a score based on a configurable strategy
+  would be added as a Beta feature.
+- The scheduler plugin `NodeResourceTopologyMatch` would be disabled by default and, when enabled, would check the ability to run a pod only
+  when the single-numa-node Topology Manager policy is configured on the node. Since this is the strictest policy, a pod that satisfies it
+  will also launch successfully on nodes configured with any of the other policies.
+
+
+### Design Consideration
 
-## Changes to the API
+By default Kubelet would be responsible for populating the NodeResourceTopology instances, but a design consideration is to architect the
+solution in a way that allows a future extension where a cluster administrator can disable this behaviour and introduce a custom external agent to fill in the NodeResourceTopology data.
 
-Code responsible for working with NodeResourceTopology API will be placed in the stagingit g directory
+### Changes to the API
+
+Code responsible for working with NodeResourceTopology API will be placed in the staging directory
 at path staging/src/k8s.io/api/node/v1/types.go.
```go @@ -103,18 +205,28 @@ type NodeResourceTopology struct { metav1.TypeMeta `json:",inline"` metav1.ObjectMeta `json:"metadata,omitempty"` - TopologyPolicy []string `json:"topologyPolicies"` - Zones ZoneMap `json:"zones"` + TopologyPolicies []string `json:"topologyPolicies"` + Cells map[string]Cell `json:"cells"` } -// Zone is the spec for a NodeResourceTopology resource -type Zone struct { +// +k8s:deepcopy-gen:interfaces=k8s.io/apimachinery/pkg/runtime.Object + +// NodeResourceTopologyList is a list of NodeResourceTopology resources +type NodeResourceTopologyList struct { + metav1.TypeMeta `json:",inline"` + metav1.ListMeta `json:"metadata"` + + Items []NodeResourceTopology `json:"items"` +} + +// Cell is the spec for a NodeResourceTopology resource +type Cell struct { Name string `json:"name"` Type string `json:"type"` Parent string `json:"parent,omitempty"` - Costs CostList `json:"costs,omitempty"` - Attributes AttributeList `json:"attributes,omitempty"` - Resources ResourceInfoList `json:"resources,omitempty"` + Costs []CostInfo `json:"costs,omitempty"` + Attributes []AttributeInfo `json:"attributes,omitempty"` + Resources []ResourceInfo `json:"resources,omitempty"` } type ResourceInfo struct { @@ -123,294 +235,533 @@ type ResourceInfo struct { Capacity string `json:"capacity"` } -type ZoneList []Zone -type ResourceInfoList []ResourceInfo - type CostInfo struct { Name string `json:"name"` - Value int `json:"value"` + Value int `json:"value"` } type AttributeInfo struct { Name string `json:"name"` Value string `json:"value"` } +// Kubelet writes to NodeResourceTopology +// and scheduler plugin reads from it +// Real world example of how these fields are populated is as follows: +// Cells: +// Name: node-1 +// Type: Node +// Costs: +// Name: node-0 +// Value: 20 +// Name: node-1 +// Value: 10 +// Attributes: +// Name: performance-profile +// Value: high-performance-profile +// Resources: +// Name: example.com/deviceB +// Allocatable: 2 +// Capacity: 2 +// Name: example.com/deviceA +// Allocatable: 2 +// Capacity: 2 +// Name: cpu +// Allocatable: 4 +// Capacity: 4 -type CostList []CostInfo -type AttributeList []AttributeInfo +``` -// +k8s:deepcopy-gen:interfaces=k8s.io/apimachinery/pkg/runtime.Object -// NodeResourceTopologyList is a list of NodeResourceTopology resources -type NodeResourceTopologyList struct { - metav1.TypeMeta `json:",inline"` - metav1.ListMeta `json:"metadata"` +### Plugin implementation details - Items []NodeResourceTopology `json:"items"` -} +### Description of the Algorithm -``` +The algorithm which of the scheduler plugin is as follows: + +1. At the filter extension point of the plugin, the QoS class of the pod is determined, in case it is a best effort pod or the + Topology Manager Policy configured on the node is not single-numa-node, the node is not considered for scheduling +1. The Topology Manager Scope is determined. +1. While interating through the containers of a pod + * A bitmask is created where each bit corresponds to a NUMA cell and are all the bits are set. If the resources cannot be aligned on the NUMA cell, + the bit should be unset. + * For each resource requested in a container, a new resourceBitmask is created to determined which NUMA cell is a good fit for each resource + 1. If requested resource cannot be found on a node, it is unset as available NUMA cell + 1. If an unknown resource has 0 quantity, the NUMA cell should be left set. + * The following checks are performed: + 1. 
Add NUMA cell to the resourceBitmask if resource is cpu and it's not guaranteed QoS, since cpu will flow + 1. Add NUMA cell to the resourceBitmask if resource is memory and it's not guaranteed QoS, since memory will flow + 1. Add NUMA cell to the resourceBitmask if zero quantity for non existing resource was requested + 1. otherwise check amount of resources + * Once the resourceBitMark is determined it is ANDed with the cummulative bitmask +4. If resources cannot be aligned from the same NUMA cell for a container, alignment cannot be achieved for the entire pod and the resource cannot be + aligned in case of the pod under consideration. Such a pod is returned with a Status Unschedulable + +### Test Plan +It would be ensured that the components developed or modified for this feature can be easily tested. -Where TopologyPolicy may have following values: none, best-effort, restricted, single-numa-node. -The current policies of TopologyManager can't coexist together at the same time, but in future such kind of policies could appear. -For example we can have policy for HyperThreading and it can live with NUMA policies. +* Unit Tests -To use these policy names both in kube-scheduler and in kubelet, string constants of these labels should be moved from pkg/kubelet/cm/topologymanager/ and pkg/kubelet/apis/config/types.go to pkg/apis/core/types.go a one single place. +Unit test for scheduler plugin (pkg/scheduler/framework/plugins/noderesources/node_resource_topology_match.go) +pkg/scheduler/framework/plugins/noderesources/node_resource_topology_match_test.go which test the plugin. -## Plugin implementation details +Separate tests for changes to Kubelet will also should be implemented. -### Description of the Algorithm +* Integration Tests + * Default configuration (this plugin is disabled) + * no side effect on basic scheduling flow (and performance) -The algorithm which implements single-numa-node policy is following: + * Enable this plugin + * basic workflow of this feature works (decision by scheduler is admitted by kubelet) + * basic negative path of this feature works (decision by scheduler is rejected by kubelet) -```go - if qos == v1.PodQOSBestEffort { - return nil - } - - zeroQuantity := resource.MustParse("0") - for _, container := range containers { - bitmask := bm.NewEmptyBitMask() - bitmask.Fill() - for resource, quantity := range container.Resources.Requests { - resourceBitmask := bm.NewEmptyBitMask() - for _, numaNode := range zones { - numaQuantity, ok := numaNode.Resources[resource] - // if can't find requested resource on the node - skip (don't set it as available NUMA node) - // if unfound resource has 0 quantity probably this numa node can be considered - if !ok && quantity.Cmp(zeroQuantity) != 0{ - continue - } - // Check for the following: - // 1. set numa node as possible node if resource is memory or Hugepages (until memory manager will not be merged and - // memory will not be provided in CRD - // 2. set numa node as possible node if resource is cpu and it's not guaranteed QoS, since cpu will flow - // 3. set numa node as possible node if zero quantity for non existing resource was requested (TODO check topology manaager behaviour) - // 4. 
otherwise check amount of resources - if resource == v1.ResourceMemory || - strings.HasPrefix(string(resource), string(v1.ResourceHugePagesPrefix)) || - resource == v1.ResourceCPU && qos != v1.PodQOSGuaranteed || - quantity.Cmp(zeroQuantity) == 0 || - numaQuantity.Cmp(quantity) >= 0 { - resourceBitmask.Add(numaNode.NUMAID) - } - } - bitmask.And(resourceBitmask) - } - if bitmask.IsEmpty() { - // definitely we can't align container, so we can't align a pod - return framework.NewStatus(framework.Unschedulable, fmt.Sprintf("Can't align container: %s", container.Name)) - } - } - return nil -} -``` +* End-to-end tests +Integration and End-to-end would Implementation of it does not constitute a difficulty, but requires appropriate multi-numa hardware for comprehensive testing of this feature. Comprehensive E2E testing of this would be done in order to graduate this feature from Alpha to Beta. -# Alternative Solution -Enable an external daemon to expose resource information along with NUMA topology of a node as a -[CRD][1]. One way of doing this is to enhance Node Feature Discovery [daemon](https://github.com/kubernetes-sigs/node-feature-discovery) or a standalone component like [Resource Topology Exporter](https://github.com/k8stopologyawareschedwg/resource-topology-exporter) that runs on each node in the cluster as a daemonset and collect resources allocated to running pods along with associated topology (NUMA nodes) and provides information of the available resources (with numa node granularity) through a CRD instance created per node. The CRs created -per node are then later used by the scheduler to identify which topology policy is enabled and make a Topology aware placement decision. -# Exporter Daemon Implementation Details -Podresources interface of the kubelet is described in +### Graduation Criteria -[pkg/kubelet/apis/podresources/v1/api.proto](https://github.com/kubernetes/kubernetes/blob/master/pkg/kubelet/apis/podresources/v1/api.proto) +#### Alpha -it is available for every process on the worker node by -unix domain socket situated by the following path: +- [ ] Introducing `NodeResourceTopology` API to faciliatate communication between kubelet and kube-scheduler +- [ ] A new scheduler plugin `NodeResourceTopologyMatch` that makes topology-aware placement decisions + - [ ] Implementation of Filter extension point +- [ ] Kubelet changes to populate `NodeResourceTopology` +- [ ] Unit tests and integration tests from [Test plans](#test-plans). -```go -filepath.Join(kl.getRootDir(), config.DefaultKubeletPodResourcesDirName) -``` +#### Beta + +- [ ] Implementation of Score extension point +- [ ] Add node E2E tests. +- [ ] Provide beta-level documentation. -it could be used to collect used resources on the worker node and to evaluate -its NUMA assignment (by device id). +#### GA -Podresources can be used to obtain initial information on resources of the worker node. +- Add Conformance Tests +- More rigorous testing—e.g., downgrade tests and scalability tests -```proto -syntax = "proto3"; +### Upgrade / Downgrade Strategy -package v1; + -message AllocatableResourcesRequest {} +No changes are required on upgrade to maintain previous behaviour. 
-// AvailableResourcesResponses contains informations about all the devices known by the kubelet -message AllocatableResourcesResponse { - repeated ContainerDevices devices = 1; - repeated int64 cpu_ids = 2; -} +It is possible to downgrade kubelet on a node that was is using this capability by simply +disabling the `ExposeNodeResourceTopology` feature gate and disabling the `NodeResourceTopologyMatch` +scheduler plugin in case this plugin was enabled in the KubeScheduler configuration. -// ListPodResourcesRequest is the request made to the PodResources service -message ListPodResourcesRequest {} +### Version Skew Strategy -// ListPodResourcesResponse is the response returned by List function -message ListPodResourcesResponse { - repeated PodResources pod_resources = 1; -} + -// ContainerResources contains information about the resources assigned to a container -message ContainerResources { - string name = 1; - repeated ContainerDevices devices = 2; - repeated int64 cpu_ids = 3; -} +Feature flag will apply to kubelet only, so version skew strategy is N/A. +This feature involves changes in the Kube-Scheduler and Kubelet. In case an older version of +Kubelet is used with the updated scheduler plugin -// Topology describes hardware topology of the resource -message TopologyInfo { - repeated NUMANode nodes = 1; -} +In case an older version of Kubelet is used with an updated version of Scheduler, +the scheduler plugin even if enabled should behave as as it does without the introduction of +the plugin. -// NUMA representation of NUMA node -message NUMANode { - int64 ID = 1; -} +## Production Readiness Review Questionnaire -// ContainerDevices contains information about the devices assigned to a container -message ContainerDevices { - string resource_name = 1; - repeated string device_ids = 2; - TopologyInfo topology = 3; -} + -Available resources with topology of the node should be stored in CRD. Format of the topology described -[in this document][1]. +### Feature Enablement and Rollback -The daemon which runs outside of the kubelet will collect all necessary information on running pods, based on allocatable resources of the node and consumed resources by pods it will provide available resources in CRD, where one CRD instance represents one worker node. The name of the CRD instance is the name of the worker node. + -## CRD API +###### How can this feature be enabled / disabled in a live cluster? -Format of the topology is described [in this document](https://docs.google.com/document/d/12kj3fK8boNuPNqob6F_pPU9ZTaNEnPGaXEooW1Cilwg/edit). + -[Code][3] responsible for working with NodeResourceTopology CRD API will be placed in the staging directory at path staging/src/k8s.io/noderesourcetopology-api. +- [X] Feature gate (also fill in values in `kep.yaml`) + - Feature gate name: `ExposeNodeResourceTopology` + - Components depending on the feature gate: kubelet +- [X] Enable Scheduler scheduler plugin `NodeResourceTopologyMatch` in the KubeScheduler config + - Describe the mechanism: + - Will enabling / disabling the feature require downtime of the control + plane? Yes, Feature gate must be set on kubelet start. To disable, kubelet must be + restarted. Hence, there would be brief control component downtime on a + given node. + - Will enabling / disabling the feature require downtime or reprovisioning + of a node? (Do not assume `Dynamic Kubelet Config` feature is enabled). + See above; disabling would require brief node downtime. 
-At the time of writing this KEP, the CRD API is stored in Topology-aware Scheduling github organization in [noderesourcetopology-api](https://github.com/k8stopologyawareschedwg/noderesourcetopology-api) +###### Does enabling the feature change any default behavior? -## Plugin implementation details + -Since topology of the node is stored in the CRD, kube-scheduler subscribes for updates of appropriate CRD type. Kube-scheduler uses informers generated with the name NodeTopologyInformer. NodeTopologyInformer runs in NodeResourceTopologyMatch plugin. +Yes, Kubelet will collect information about resources allocated to running pods along with their topology, based on allocatable resources of the node and consumed resources by pods it will populate available resources with the associated topology information in NodeResourceTopology for the node. +In case a workload cannot be aligned on a NUMA cell where Toplogy Manager Policy is configured as single-numa-node NUMA, the chances of scheduling that workload +to such a node should be significantly reduced. -### Topology information in the NodeResourceTopologyMatch plugin +###### Can the feature be disabled once it has been enabled (i.e. can we roll back the enablement)? -Once NodeResourceTopology is received NodeResourceTopologyMatch plugin keeps it in its own state of type NodeTopologyMap. This state is used every time when scheduler needs to make a decision based on node topology. + -In order to allow the scheduler (deployed as a pod) to access NodeResourceTopology CRD instances, ClusterRole and ClusterRoleBinding would have to be configured as below: +Yes, disabling the feature gate in Kubelet and disabling the scheduler plugin shuts down the feature completely. -``` yaml -apiVersion: rbac.authorization.k8s.io/v1 -kind: ClusterRole -metadata: - name: noderesourcetopology-handler -rules: -- apiGroups: ["topology.node.k8s.io"] - resources: ["noderesourcetopologies"] - verbs: ["*"] -- apiGroups: ["rbac.authorization.k8s.io"] - resources: ["*"] - verbs: ["*"] ---- -apiVersion: rbac.authorization.k8s.io/v1 -kind: ClusterRoleBinding -metadata: - name: handle-noderesourcetopology -subjects: -- kind: ServiceAccount - name: noderesourcetopology-account - namespace: default -roleRef: - kind: ClusterRole - name: noderesourcetopology-handler - apiGroup: rbac.authorization.k8s.io ---- -apiVersion: v1 -kind: ServiceAccount -metadata: - name: noderesourcetopology-account -``` +###### What happens if we reenable the feature if it was previously rolled back? +No changes. -`serviceAccountName: noderesourcetopology-account` would have to be added to the manifest file of the scheduler deployment file. +###### Are there any tests for feature enablement/disablement? -# Use cases + -Numbers of kubernetes worker nodes on bare metal with NUMA topology. TopologyManager feature gate enabled on the nodes. In this configuration, the operator does not want that in the case of an unsatisfactory host topology, it should be re-scheduled for launch, but wants the scheduling to be successful the first time. +Specific e2e test will be added to demonstrate that the default behaviour is preserved when the feature gate and scheduler plugin is disabled, or when the feature is not used (2 separate tests) + +### Rollout, Upgrade and Rollback Planning -# Known limitations + -Kube-scheduler makes an assumption about current resource usage on the worker node, since kube-scheduler knows which pod assigned to node. This assumption makes right after kube-scheduler choose a node. 
But in case of scheduling with NUMA topology only TopologyManager on the worker node knows exact NUMA node used by pod, this information about NUMA node delivers to kube-scheduler with latency. In this case kube-scheduler will not know actual NUMA topology until topology exporter will send it back. It could be mitigated if kube-scheduler in proposed plugin will add a hint on which NUMA id pod could be assigned, further Topology Manager on the worker node may take it into account. +###### How can a rollout or rollback fail? Can it impact already running workloads? -# Test plans + -* Unit Tests +###### What specific metrics should inform a rollback? -Unit test for scheduler plugin (pkg/scheduler/framework/plugins/noderesources/node_resource_topology_match.go) -pkg/scheduler/framework/plugins/noderesources/node_resource_topology_match_test.go which test the plugin. + -Separate tests for changes to Kubelet will also should be implemented. +###### Were upgrade and rollback tested? Was the upgrade->downgrade->upgrade path tested? -* Integration Tests - * Default configuration (this plugin is disabled) - * no side effect on basic scheduling flow (and performance) + + +###### Is the rollout accompanied by any deprecations and/or removals of features, APIs, fields of API types, flags, etc.? - * Enable this plugin - * basic workflow of this feature works (decision by scheduler is admitted by kubelet) - * basic negative path of this feature works (decision by scheduler is rejected by kubelet) + -* End-to-end tests +### Monitoring Requirements -Integration and End-to-end would Implementation of it does not constitute a difficulty, but requires appropriate multi-numa hardware for comprehensive testing of this feature. Comprehensive E2E testing of this would be done in order to graduate this feature from Alpha to Beta. + -# Graduation criteria +###### How can an operator determine if the feature is in use by workloads? -* Alpha (v1.23) + -Following changes are required: -- [ ] Introducing a Topolgy information as part of Node API -- [ ] New `kube scheduler plugin` NodeResourceTopologyMatch. - - [ ] Implementation of Filter -- [ ] Unit tests and integration tests from [Test plans](#test-plans). +###### How can someone using this feature know that it is working for their instance? -* Beta -- [ ] Add node E2E tests. -- [ ] Provide beta-level documentation. + + +- [ ] Events + - Event Reason: +- [ ] API .status + - Condition name: + - Other field: +- [ ] Other (treat as last resort) + - Details: + +###### What are the reasonable SLOs (Service Level Objectives) for the enhancement? + + + +###### What are the SLIs (Service Level Indicators) an operator can use to determine the health of the service? + + + +- [ ] Metrics + - Metric name: + - [Optional] Aggregation method: + - Components exposing the metric: +- [ ] Other (treat as last resort) + - Details: + +###### Are there any missing metrics that would be useful to have to improve observability of this feature? + + + +### Dependencies + + + +###### Does this feature depend on any specific services running in the cluster? + + + +No. + +### Scalability + + + +###### Will enabling / using this feature result in any new API calls? + + +Yes, the scheduler will be accessing the NodeResourceTopology instance corresponding to the node + +###### Will enabling / using this feature result in introducing new API types? 
+ + + +Yes, NodeResourceTopology API is introduced and an instance per node would be created to be referenced by the scheduler plugin + +###### Will enabling / using this feature result in any new calls to the cloud provider? + + + +No. -# Production Readiness Review Questionnaire +###### Will enabling / using this feature result in increasing size or count of the existing API objects? -# TBD - + +Yes, there would NodeResourceTopology instances equal to the number of nodes in the cluster. -# Implementation history +###### Will enabling / using this feature result in increasing time taken by any operations covered by existing SLIs/SLOs? + + + +###### Will enabling / using this feature result in non-negligible increase of resource usage (CPU, RAM, disk, IO, ...) in any components? + + + +### Troubleshooting + + + +###### How does this feature react if the API server and/or etcd is unavailable? + +###### What are other known failure modes? + + + +###### What steps should be taken if SLOs are not being met to determine the problem? + +## Implementation history - 2021-06-10: Initial KEP sent out for review, including Summary, Motivation, Proposal, Test plans and Graduation criteria. +- 2021-07-02: Updated version after first round of reviews + + +## Drawbacks + +Topology Manager on the worker node knows exact resources and their NUMA node allocated to pods but the and node resource +topology information is delivered to Topology aware scheduler plugin with latency meaning that the scheduler will not know +actual NUMA topology until the information of the available resources at a NUMA node level is evaluated in the kubelet which +could still lead to scheduling of pods to nodes where they won't be admitted by Topology Manager. + +## Alternatives + + +### Enable this capability out-of-tree +1. An external daemon to expose resource information along with NUMA topology of a node as a [CRD][1]. + + The daemon runs on each node in the cluster as a daemonset and collects resources allocated to running pods along with associated topology (NUMA nodes) and provides information of the available resources (with numa node granularity) through a CRD instance created per node. [Enhancing](https://github.com/kubernetes-sigs/node-feature-discovery/issues/333) Node Feature Discovery [daemon](https://github.com/kubernetes-sigs/node-feature-discovery) or implementing standalone component like [Resource Topology Exporter](https://github.com/k8stopologyawareschedwg/resource-topology-exporter) would allow the information to be exposed as CRs corresponding to the node. The name of the CR is same as the name of the worker node. The daemon would use Podresources interface of the kubelet is described in [pkg/kubelet/apis/podresources/v1/api.proto](https://github.com/kubernetes/kubernetes/blob/master/pkg/kubelet/apis/podresources/v1/api.proto) to collect used resources on a worker node along with their NUMA assignment to account the available resources on each NUMA node + + +2. Out-of-tree scheduler plugin + + An out-of-tree [Node Resource Topology](https://github.com/kubernetes-sigs/scheduler-plugins/blob/master/pkg/noderesourcetopology/README.md) scheduler plugin would use the CRs exposed by the exporter daemon to make a Topology aware placement decision. + +Cons: +1. [Code][2] responsible for working with NodeResourceTopology CRD API is stored separately in Topology-aware Scheduling github organization in [noderesourcetopology-api](https://github.com/k8stopologyawareschedwg/noderesourcetopology-api) +2. 
The implementation where the exporter uses PodResource API means that the API endpoints such as List and GetAllocatableResources require the daemon to consume the endpoints periodically. Because of this, in case of a huge influx of +pods and polling interval being large, the accounting might not happen correctly leading to the scenario we are trying to avoid scheduling pods on nodes where kubelet will reject it with "Topology Affinity Error" and end up. A watch endpoint in podresource API might solve this issue. + + +### 1:1 worker pod to node assignment + +So apart from kubelet and daemonsets, the pod will take the whole node and the application is responsible for forking processes and assign them to NUMA cells. + +Cons: +Topology Manager can deal with the alignment of resources at [container and pod scope](https://kubernetes.io/docs/tasks/administer-cluster/topology-manager/#topology-manager-scopes). So if we have a pod with multiple containers, a process running inside the containers can be assigned to a NUMA cell. But +it is not possible in case there are multiple processes running inside the same container even if the application is smart and is capacble of forking processes +there is not way in Topology Manager in Kubelet to perform assignment of resource from the same NUMA cell for a process level. + +## Infrastructure Needed + +Hardware with Multi-NUMA systems for e2e tests + [1]: https://docs.google.com/document/d/12kj3fK8boNuPNqob6F_pPU9ZTaNEnPGaXEooW1Cilwg/edit -[2]: https://github.com/kubernetes-sigs/node-feature-discovery -[3]: https://github.com/kubernetes/noderesourcetopology-api \ No newline at end of file +[2]: https://github.com/kubernetes/noderesourcetopology-api diff --git a/keps/sig-scheduling/2044-topology-awareness-in-kube-scheduler/kep.yaml b/keps/sig-scheduling/2044-topology-awareness-in-kube-scheduler/kep.yaml index c6f26fb6ad1..9181a439a76 100644 --- a/keps/sig-scheduling/2044-topology-awareness-in-kube-scheduler/kep.yaml +++ b/keps/sig-scheduling/2044-topology-awareness-in-kube-scheduler/kep.yaml @@ -1,4 +1,3 @@ ---- title: Topology awareness in Kube-scheduler kep-number: 2044 authors: @@ -8,34 +7,55 @@ owning-sig: sig-scheduling participating-sigs: - sig-node - sig-scheduling +status: implementable +creation-date: 2021-06-10 +last-updated: 2021-07-02 reviewers: - "@ahg-g" - "@alculquicondor" - "@huang-wei" - "@derekwaynecarr" - - "@dchen1107" + - "@mrunalp" + - "@rphillips" + - "@ehashman" - "@klueska" approvers: - - approvers: - - "@sig-node-leads" - "@sig-scheduling-leads" -editor: TBD + +##### WARNING !!! ###### +# prr-approvers has been moved to its own location +# You should create your own in keps/prod-readiness +# Please make a copy of keps/prod-readiness/template/nnnn.yaml +# to keps/prod-readiness/sig-xxxxx/00000.yaml (replace with kep number) +#prr-approvers: + +see-also: + - "https://github.com/kubernetes/enhancements/pull/1870" + - "https://github.com/kubernetes/enhancements/pull/1858" + +# The target maturity stage in the current dev cycle for this KEP. +stage: alpha + +# The most recent milestone for which work toward delivery of this KEP has been +# done. This can be the current (upcoming) milestone, if it is being actively +# worked on. +latest-milestone: "v1.23" + +# The milestone at which this feature was, or is targeted to be, at each stage. 
milestone: alpha: "v1.23" beta: "v1.24" stable: "v1.26" -# TODO: # The following PRR answers are required at alpha release # List the feature gate name and the components for which it must be enabled -# feature-gates: -# - name: "" -# components: -# - kubelet -# - kube-scheduler -creation-date: 2021-06-10 -last-updated: 2021-06-10 -status: implementable -see-also: - - "https://github.com/kubernetes/enhancements/pull/1870" - - "https://github.com/kubernetes/enhancements/pull/1858" \ No newline at end of file +feature-gates: + - name: ExposeNodeResourceTopology + components: + - kubelet + +disable-supported: true + +# The following PRR answers are required at beta release +# metrics: +# - my_feature_metric