koordinator-sh · koordinator-bot · Nov 14, 2023 · Oct 19, 2023
diff --git a/apis/slo/v1alpha1/nodeslo_types.go b/apis/slo/v1alpha1/nodeslo_types.go
@@ -26,9 +26,28 @@ import (
 // CPUQOS enables cpu qos features.
 type CPUQOS struct {
 	// group identity value for pods, default = 0
+	// NOTE: It takes effect if cpuPolicy = "groupIdentity".
 	GroupIdentity *int64 `json:"groupIdentity,omitempty" validate:"omitempty,min=-1,max=2"`
+	// cpu.idle value for pods, default = 0.
+	// `1` means using SCHED_IDLE.
+	// CGroup Idle (introduced since mainline Linux 5.15): https://lore.kernel.org/lkml/162971078674.25758.15464079371945307825.tip-bot2@tip-bot2/#r
+	// NOTE: It takes effect if cpuPolicy = "coreSched".
+	SchedIdle *int64 `json:"schedIdle,omitempty" validate:"omitempty,min=0,max=1"`
+	// whether pods of the QoS class can expel the cgroup idle pods at the SMT-level. default = false
+	// If set to true, pods of this QoS will use a dedicated core sched group for noise clean with the SchedIdle pods.
+	// NOTE: It takes effect if cpuPolicy = "coreSched".
+	CoreExpeller *bool `json:"coreExpeller,omitempty"`
 }
 
+type CPUQOSPolicy string
+
+const (
+	// CPUQOSPolicyGroupIdentity indicates the Group Identity is applied to ensure the CPU QoS.
+	CPUQOSPolicyGroupIdentity CPUQOSPolicy = "groupIdentity"
+	// CPUQOSPolicyCoreSched indicates the Linux Core Scheduling and CGroup Idle is applied to ensure the CPU QoS.
+	CPUQOSPolicyCoreSched CPUQOSPolicy = "coreSched"
+)
+
 // MemoryQOS enables memory qos features.
 type MemoryQOS struct {
 	// memcg qos
@@ -185,7 +204,15 @@ type ResourceQOS struct {
 	ResctrlQOS *ResctrlQOSCfg `json:"resctrlQOS,omitempty"`
 }
 
+type ResourceQOSPolicies struct {
+	// applied policy for the CPU QoS, default = "groupIdentity"
+	CPUPolicy *CPUQOSPolicy `json:"cpuPolicy,omitempty"`
+}
+
 type ResourceQOSStrategy struct {
+	// Policies of pod QoS.
+	Policies *ResourceQOSPolicies `json:"policies,omitempty"`
+
 	// ResourceQOS for LSR pods.
 	LSRClass *ResourceQOS `json:"lsrClass,omitempty"`
 

diff --git a/apis/slo/v1alpha1/pod.go b/apis/slo/v1alpha1/pod.go
@@ -20,6 +20,7 @@ import (
 	"encoding/json"
 
 	corev1 "k8s.io/api/core/v1"
+	"k8s.io/utils/pointer"
 
 	apiext "github.com/koordinator-sh/koordinator/apis/extension"
 )
@@ -64,3 +65,40 @@ func GetPodMemoryQoSConfig(pod *corev1.Pod) (*PodMemoryQOSConfig, error) {
 	}
 	return &cfg, nil
 }
+
+const (
+	// AnnotationCoreSchedGroupID is the annotation key of the group ID of the Linux Core Scheduling.
+	// Value should be a valid UUID or the none value "0".
+	// When the value is a valid UUID, pods with that group ID and the same CoreExpeller status on the node will be
+	// assigned to the same core sched cookie.
+	// When the value is the none value "0", pod will be reset to the default core sched cookie `0`.
+	// When the annotation is missing but the node-level strategy enables the core sched, the pod will be assigned an
+	// internal group according to the pod's UID.
+	//
+	// Core Sched: https://docs.kernel.org/admin-guide/hw-vuln/core-scheduling.html
+	// When the Core Sched is enabled, pods with different core sched group IDs take different core sched cookies,
+	// which means they will not run on the same SMT core at the same time. If a pod sets the core sched disabled,
+	// it takes the default core sched cookie (0) and will also be force-idled instead of running on the same SMT
+	// core concurrently with the core-sched-enabled pods. In addition, the CoreExpeller configured in ResourceQOS
+	// also gives the pods a cookie separate from pods of other QoS classes by adding a suffix to the group ID.
+	// So pods of different QoS classes take different cookies when their CoreExpeller statuses differ, even if
+	// their group IDs are the same.
+	AnnotationCoreSchedGroupID = apiext.DomainPrefix + "core-sched-group-id"
+
+	// CoreSchedGroupIDNone is the none value of the core sched group ID which indicates the core sched is disabled for
+	// the pod. The pod will be reset to the system-default cookie `0`.
+	CoreSchedGroupIDNone = "0"
+)
+
+// GetCoreSchedGroupID gets the core sched group ID from the pod annotations. It returns the group ID and whether the
+// pod explicitly disables the core sched (the ID is "0"); it returns ("", nil) when the annotation is not set.
+func GetCoreSchedGroupID(annotations map[string]string) (string, *bool) {
+	if annotations == nil {
+		return "", nil
+	}
+	value, ok := annotations[AnnotationCoreSchedGroupID]
+	if !ok {
+		return "", nil
+	}
+	return value, pointer.Bool(value == CoreSchedGroupIDNone)
+}
diff --git a/apis/slo/v1alpha1/zz_generated.deepcopy.go b/apis/slo/v1alpha1/zz_generated.deepcopy.go
diff --git a/config/crd/bases/slo.koordinator.sh_nodeslos.yaml b/config/crd/bases/slo.koordinator.sh_nodeslos.yaml
@@ -176,12 +176,26 @@ spec:
                       cpuQOS:
                         description: CPUQOSCfg stores node-level config of cpu qos
                         properties:
+                          coreExpeller:
+                            description: 'whether pods of the QoS class can expel
+                              the cgroup idle pods at the SMT-level. default = false
+                              If set to true, pods of this QoS will use a dedicated
+                              core sched group for noise clean with the SchedIdle
+                              pods. NOTE: It takes effect if cpuPolicy = "coreSched".'
+                            type: boolean
                           enable:
                             description: Enable indicates whether the cpu qos is enabled.
                             type: boolean
                           groupIdentity:
-                            description: group identity value for pods, default =
-                              0
+                            description: 'group identity value for pods, default =
+                              0 NOTE: It takes effect if cpuPolicy = "groupIdentity".'
+                            format: int64
+                            type: integer
+                          schedIdle:
+                            description: 'cpu.idle value for pods, default = 0. `1`
+                              means using SCHED_IDLE. CGroup Idle (introduced since
+                              mainline Linux 5.15): https://lore.kernel.org/lkml/162971078674.25758.15464079371945307825.tip-bot2@tip-bot2/#r
+                              NOTE: It takes effect if cpuPolicy = "coreSched".'
                             format: int64
                             type: integer
                         type: object
@@ -387,12 +401,26 @@ spec:
                       cpuQOS:
                         description: CPUQOSCfg stores node-level config of cpu qos
                         properties:
+                          coreExpeller:
+                            description: 'whether pods of the QoS class can expel
+                              the cgroup idle pods at the SMT-level. default = false
+                              If set to true, pods of this QoS will use a dedicated
+                              core sched group for noise clean with the SchedIdle
+                              pods. NOTE: It takes effect if cpuPolicy = "coreSched".'
+                            type: boolean
                           enable:
                             description: Enable indicates whether the cpu qos is enabled.
                             type: boolean
                           groupIdentity:
-                            description: group identity value for pods, default =
-                              0
+                            description: 'group identity value for pods, default =
+                              0 NOTE: It takes effect if cpuPolicy = "groupIdentity".'
+                            format: int64
+                            type: integer
+                          schedIdle:
+                            description: 'cpu.idle value for pods, default = 0. `1`
+                              means using SCHED_IDLE. CGroup Idle (introduced since
+                              mainline Linux 5.15): https://lore.kernel.org/lkml/162971078674.25758.15464079371945307825.tip-bot2@tip-bot2/#r
+                              NOTE: It takes effect if cpuPolicy = "coreSched".'
                             format: int64
                             type: integer
                         type: object
@@ -598,12 +626,26 @@ spec:
                       cpuQOS:
                         description: CPUQOSCfg stores node-level config of cpu qos
                         properties:
+                          coreExpeller:
+                            description: 'whether pods of the QoS class can expel
+                              the cgroup idle pods at the SMT-level. default = false
+                              If set to true, pods of this QoS will use a dedicated
+                              core sched group for noise clean with the SchedIdle
+                              pods. NOTE: It takes effect if cpuPolicy = "coreSched".'
+                            type: boolean
                           enable:
                             description: Enable indicates whether the cpu qos is enabled.
                             type: boolean
                           groupIdentity:
-                            description: group identity value for pods, default =
-                              0
+                            description: 'group identity value for pods, default =
+                              0 NOTE: It takes effect if cpuPolicy = "groupIdentity".'
+                            format: int64
+                            type: integer
+                          schedIdle:
+                            description: 'cpu.idle value for pods, default = 0. `1`
+                              means using SCHED_IDLE. CGroup Idle (introduced since
+                              mainline Linux 5.15): https://lore.kernel.org/lkml/162971078674.25758.15464079371945307825.tip-bot2@tip-bot2/#r
+                              NOTE: It takes effect if cpuPolicy = "coreSched".'
                             format: int64
                             type: integer
                         type: object
@@ -809,12 +851,26 @@ spec:
                       cpuQOS:
                         description: CPUQOSCfg stores node-level config of cpu qos
                         properties:
+                          coreExpeller:
+                            description: 'whether pods of the QoS class can expel
+                              the cgroup idle pods at the SMT-level. default = false
+                              If set to true, pods of this QoS will use a dedicated
+                              core sched group for noise clean with the SchedIdle
+                              pods. NOTE: It takes effect if cpuPolicy = "coreSched".'
+                            type: boolean
                           enable:
                             description: Enable indicates whether the cpu qos is enabled.
                             type: boolean
                           groupIdentity:
-                            description: group identity value for pods, default =
-                              0
+                            description: 'group identity value for pods, default =
+                              0 NOTE: It takes effect if cpuPolicy = "groupIdentity".'
+                            format: int64
+                            type: integer
+                          schedIdle:
+                            description: 'cpu.idle value for pods, default = 0. `1`
+                              means using SCHED_IDLE. CGroup Idle (introduced since
+                              mainline Linux 5.15): https://lore.kernel.org/lkml/162971078674.25758.15464079371945307825.tip-bot2@tip-bot2/#r
+                              NOTE: It takes effect if cpuPolicy = "coreSched".'
                             format: int64
                             type: integer
                         type: object
@@ -948,6 +1004,13 @@ spec:
                             type: integer
                         type: object
                     type: object
+                  policies:
+                    description: Policies of pod QoS.
+                    properties:
+                      cpuPolicy:
+                        description: applied policy for the CPU QoS, default = "groupIdentity"
+                        type: string
+                    type: object
                   systemClass:
                     description: ResourceQOS for system pods
                     properties:
@@ -1020,12 +1083,26 @@ spec:
                       cpuQOS:
                         description: CPUQOSCfg stores node-level config of cpu qos
                         properties:
+                          coreExpeller:
+                            description: 'whether pods of the QoS class can expel
+                              the cgroup idle pods at the SMT-level. default = false
+                              If set to true, pods of this QoS will use a dedicated
+                              core sched group for noise clean with the SchedIdle
+                              pods. NOTE: It takes effect if cpuPolicy = "coreSched".'
+                            type: boolean
                           enable:
                             description: Enable indicates whether the cpu qos is enabled.
                             type: boolean
                           groupIdentity:
-                            description: group identity value for pods, default =
-                              0
+                            description: 'group identity value for pods, default =
+                              0 NOTE: It takes effect if cpuPolicy = "groupIdentity".'
+                            format: int64
+                            type: integer
+                          schedIdle:
+                            description: 'cpu.idle value for pods, default = 0. `1`
+                              means using SCHED_IDLE. CGroup Idle (introduced since
+                              mainline Linux 5.15): https://lore.kernel.org/lkml/162971078674.25758.15464079371945307825.tip-bot2@tip-bot2/#r
+                              NOTE: It takes effect if cpuPolicy = "coreSched".'
                             format: int64
                             type: integer
                         type: object

diff --git a/pkg/koordlet/runtimehooks/hooks/groupidentity/rule.go b/pkg/koordlet/runtimehooks/hooks/groupidentity/rule.go
@@ -76,16 +76,31 @@ func (r *bvtRule) getHostQOSBvtValue(qosClass ext.QoSClass) int64 {
 
 func (b *bvtPlugin) parseRule(mergedNodeSLOIf interface{}) (bool, error) {
 	mergedNodeSLO := mergedNodeSLOIf.(*slov1alpha1.NodeSLOSpec)
+	qosStrategy := mergedNodeSLO.ResourceQOSStrategy
 
-	// check if bvt is enabled
-	enable := *mergedNodeSLO.ResourceQOSStrategy.LSRClass.CPUQOS.Enable ||
-		*mergedNodeSLO.ResourceQOSStrategy.LSClass.CPUQOS.Enable ||
-		*mergedNodeSLO.ResourceQOSStrategy.BEClass.CPUQOS.Enable
+	// the group identity policy is the default: it applies when no CPU policy is specified or the policy is empty
+	isPolicyGroupIdentity := qosStrategy.Policies == nil || qosStrategy.Policies.CPUPolicy == nil ||
+		len(*qosStrategy.Policies.CPUPolicy) <= 0 || *qosStrategy.Policies.CPUPolicy == slov1alpha1.CPUQOSPolicyGroupIdentity
+	// check if bvt (group identity) is enabled
+	lsrEnabled := isPolicyGroupIdentity && *qosStrategy.LSRClass.CPUQOS.Enable
+	lsEnabled := isPolicyGroupIdentity && *qosStrategy.LSClass.CPUQOS.Enable
+	beEnabled := isPolicyGroupIdentity && *qosStrategy.BEClass.CPUQOS.Enable
 
 	// setting pod rule by qos config
-	lsrValue := *mergedNodeSLO.ResourceQOSStrategy.LSRClass.CPUQOS.CPUQOS.GroupIdentity
-	lsValue := *mergedNodeSLO.ResourceQOSStrategy.LSClass.CPUQOS.GroupIdentity
-	beValue := *mergedNodeSLO.ResourceQOSStrategy.BEClass.CPUQOS.GroupIdentity
+	// Group Identity should be reset if the CPU QoS is disabled (already merged in states informer) or the CPU QoS
+	// policy is not "groupIdentity".
+	lsrValue := *sloconfig.NoneCPUQOS().GroupIdentity
+	if lsrEnabled {
+		lsrValue = *qosStrategy.LSRClass.CPUQOS.GroupIdentity
+	}
+	lsValue := *sloconfig.NoneCPUQOS().GroupIdentity
+	if lsEnabled {
+		lsValue = *qosStrategy.LSClass.CPUQOS.GroupIdentity
+	}
+	beValue := *sloconfig.NoneCPUQOS().GroupIdentity
+	if beEnabled {
+		beValue = *qosStrategy.BEClass.CPUQOS.GroupIdentity
+	}
 
 	// setting besteffort according to BE
 	besteffortDirVal := beValue
@@ -95,18 +110,18 @@ func (b *bvtPlugin) parseRule(mergedNodeSLOIf interface{}) (bool, error) {
 	burstableDirVal := lsValue
 	burstablePodVal := lsValue
 
-	// NOTICE guaranteed root dir must set as 0 until kernel supported
+	// NOTE: guaranteed root dir must be set to 0 until the kernel supports it
 	guaranteedDirVal := *sloconfig.NoneCPUQOS().GroupIdentity
 	// setting guaranteed pod enabled if LS or LSR enabled
 	guaranteedPodVal := *sloconfig.NoneCPUQOS().GroupIdentity
-	if *mergedNodeSLO.ResourceQOSStrategy.LSRClass.CPUQOS.Enable {
+	if lsrEnabled {
 		guaranteedPodVal = lsrValue
-	} else if *mergedNodeSLO.ResourceQOSStrategy.LSClass.CPUQOS.Enable {
+	} else if lsEnabled {
 		guaranteedPodVal = lsValue
 	}
 
 	newRule := &bvtRule{
-		enable: enable,
+		enable: lsrEnabled || lsEnabled || beEnabled,
 		podQOSParams: map[ext.QoSClass]int64{
 			ext.QoSLSE: lsrValue,
 			ext.QoSLSR: lsrValue,