Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

apis: add core sched apis #1720

Merged
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
27 changes: 27 additions & 0 deletions apis/slo/v1alpha1/nodeslo_types.go
Original file line number Diff line number Diff line change
Expand Up @@ -26,9 +26,28 @@ import (
// CPUQOS enables cpu qos features.
// Which fields take effect depends on the configured cpuPolicy: GroupIdentity applies when
// cpuPolicy = "groupIdentity", while SchedIdle and CoreExpeller apply when cpuPolicy = "coreSched".
type CPUQOS struct {
// group identity value for pods, default = 0
// NOTE: It takes effect if cpuPolicy = "groupIdentity".
GroupIdentity *int64 `json:"groupIdentity,omitempty" validate:"omitempty,min=-1,max=2"`
// cpu.idle value for pods, default = 0.
// `1` means using SCHED_IDLE.
// CGroup Idle (introduced since mainline Linux 5.15): https://lore.kernel.org/lkml/162971078674.25758.15464079371945307825.tip-bot2@tip-bot2/#r
// NOTE: It takes effect if cpuPolicy = "coreSched".
SchedIdle *int64 `json:"schedIdle,omitempty" validate:"omitempty,min=0,max=1"`
// whether pods of the QoS class can expel the cgroup idle pods at the SMT-level. default = false
// If set to true, pods of this QoS will use a dedicated core sched group for noise clean with the SchedIdle pods.
// NOTE: It takes effect if cpuPolicy = "coreSched".
CoreExpeller *bool `json:"coreExpeller,omitempty"`
}

// CPUQOSPolicy is the name of the policy applied to ensure the CPU QoS.
// The values declared here are "groupIdentity" and "coreSched".
type CPUQOSPolicy string

const (
// CPUQOSPolicyGroupIdentity indicates the Group Identity is applied to ensure the CPU QoS.
CPUQOSPolicyGroupIdentity CPUQOSPolicy = "groupIdentity"
// CPUQOSPolicyCoreSched indicates the Linux Core Scheduling and CGroup Idle is applied to ensure the CPU QoS.
CPUQOSPolicyCoreSched CPUQOSPolicy = "coreSched"
)

// MemoryQOS enables memory qos features.
type MemoryQOS struct {
// memcg qos
Expand Down Expand Up @@ -185,7 +204,15 @@ type ResourceQOS struct {
ResctrlQOS *ResctrlQOSCfg `json:"resctrlQOS,omitempty"`
}

// ResourceQOSPolicies selects which policy is applied for each kind of pod QoS.
// Currently it only carries the policy for the CPU QoS.
type ResourceQOSPolicies struct {
// applied policy for the CPU QoS, default = "groupIdentity"
CPUPolicy *CPUQOSPolicy `json:"cpuPolicy,omitempty"`
}

type ResourceQOSStrategy struct {
// Policies of pod QoS.
Policies *ResourceQOSPolicies `json:"policies,omitempty"`

// ResourceQOS for LSR pods.
LSRClass *ResourceQOS `json:"lsrClass,omitempty"`

Expand Down
38 changes: 38 additions & 0 deletions apis/slo/v1alpha1/pod.go
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@ import (
"encoding/json"

corev1 "k8s.io/api/core/v1"
"k8s.io/utils/pointer"

apiext "github.com/koordinator-sh/koordinator/apis/extension"
)
Expand Down Expand Up @@ -64,3 +65,40 @@ func GetPodMemoryQoSConfig(pod *corev1.Pod) (*PodMemoryQOSConfig, error) {
}
return &cfg, nil
}

const (
// AnnotationCoreSchedGroupID is the annotation key of the group ID of the Linux Core Scheduling.
// The value should be a valid UUID or the none value "0".
// When the value is a valid UUID, pods on the node with that group ID and the same CoreExpelled status will be
// assigned to the same core sched cookie.
// When the value is the none value "0", the pod will be reset to the default core sched cookie `0`.
// When the annotation is missing but the node-level strategy enables the core sched, the pod will be assigned an
// internal group according to the pod's UID.
//
// Core Sched: https://docs.kernel.org/admin-guide/hw-vuln/core-scheduling.html
// When the Core Sched is enabled, pods with different core sched group IDs will not run on the same SMT
// core at the same time, which means they will take different core sched cookies. If a pod sets the core sched
// disabled, it will take the default core sched cookie (0) and will also be force-idled to run on the same SMT core
// concurrently with the core-sched-enabled pods. In addition, the CoreExpelled configured in ResourceQOS also
// separates the cookie of the pods from pods of other QoS classes by adding a suffix to the group ID. So the pods
// of different QoS classes will take different cookies when their CoreExpelled statuses differ, even if their
// group IDs are the same.
AnnotationCoreSchedGroupID = apiext.DomainPrefix + "core-sched-group-id"

// CoreSchedGroupIDNone is the none value of the core sched group ID, which indicates that the core sched is
// disabled for the pod. The pod will be reset to the system-default cookie `0`.
CoreSchedGroupIDNone = "0"
)

// GetCoreSchedGroupID looks up the core sched group ID in the given pod annotations.
// When the AnnotationCoreSchedGroupID key is absent (including a nil map), it returns an empty group ID and a nil
// pointer. Otherwise it returns the annotation value together with a non-nil pointer that is true only when the
// value equals CoreSchedGroupIDNone, i.e. the pod explicitly disables the core sched.
func GetCoreSchedGroupID(annotations map[string]string) (string, *bool) {
	// Reading from a nil map is safe and reports the key as absent, so nil and missing share one path.
	groupID, found := annotations[AnnotationCoreSchedGroupID]
	if found {
		isDisabled := groupID == CoreSchedGroupIDNone
		return groupID, pointer.Bool(isDisabled)
	}
	return "", nil
}
35 changes: 35 additions & 0 deletions apis/slo/v1alpha1/zz_generated.deepcopy.go

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

97 changes: 87 additions & 10 deletions config/crd/bases/slo.koordinator.sh_nodeslos.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -176,12 +176,26 @@ spec:
cpuQOS:
description: CPUQOSCfg stores node-level config of cpu qos
properties:
coreExpeller:
description: 'whether pods of the QoS class can expel
the cgroup idle pods at the SMT-level. default = false
If set to true, pods of this QoS will use a dedicated
core sched group for noise clean with the SchedIdle
pods. NOTE: It takes effect if cpuPolicy = "coreSched".'
type: boolean
enable:
description: Enable indicates whether the cpu qos is enabled.
type: boolean
groupIdentity:
description: group identity value for pods, default =
0
description: 'group identity value for pods, default =
0 NOTE: It takes effect if cpuPolicy = "groupIdentity".'
format: int64
type: integer
schedIdle:
description: 'cpu.idle value for pods, default = 0. `1`
means using SCHED_IDLE. CGroup Idle (introduced since
mainline Linux 5.15): https://lore.kernel.org/lkml/162971078674.25758.15464079371945307825.tip-bot2@tip-bot2/#r
NOTE: It takes effect if cpuPolicy = "coreSched".'
format: int64
type: integer
type: object
Expand Down Expand Up @@ -387,12 +401,26 @@ spec:
cpuQOS:
description: CPUQOSCfg stores node-level config of cpu qos
properties:
coreExpeller:
description: 'whether pods of the QoS class can expel
the cgroup idle pods at the SMT-level. default = false
If set to true, pods of this QoS will use a dedicated
core sched group for noise clean with the SchedIdle
pods. NOTE: It takes effect if cpuPolicy = "coreSched".'
type: boolean
enable:
description: Enable indicates whether the cpu qos is enabled.
type: boolean
groupIdentity:
description: group identity value for pods, default =
0
description: 'group identity value for pods, default =
0 NOTE: It takes effect if cpuPolicy = "groupIdentity".'
format: int64
type: integer
schedIdle:
description: 'cpu.idle value for pods, default = 0. `1`
means using SCHED_IDLE. CGroup Idle (introduced since
mainline Linux 5.15): https://lore.kernel.org/lkml/162971078674.25758.15464079371945307825.tip-bot2@tip-bot2/#r
NOTE: It takes effect if cpuPolicy = "coreSched".'
format: int64
type: integer
type: object
Expand Down Expand Up @@ -598,12 +626,26 @@ spec:
cpuQOS:
description: CPUQOSCfg stores node-level config of cpu qos
properties:
coreExpeller:
description: 'whether pods of the QoS class can expel
the cgroup idle pods at the SMT-level. default = false
If set to true, pods of this QoS will use a dedicated
core sched group for noise clean with the SchedIdle
pods. NOTE: It takes effect if cpuPolicy = "coreSched".'
type: boolean
enable:
description: Enable indicates whether the cpu qos is enabled.
type: boolean
groupIdentity:
description: group identity value for pods, default =
0
description: 'group identity value for pods, default =
0 NOTE: It takes effect if cpuPolicy = "groupIdentity".'
format: int64
type: integer
schedIdle:
description: 'cpu.idle value for pods, default = 0. `1`
means using SCHED_IDLE. CGroup Idle (introduced since
mainline Linux 5.15): https://lore.kernel.org/lkml/162971078674.25758.15464079371945307825.tip-bot2@tip-bot2/#r
NOTE: It takes effect if cpuPolicy = "coreSched".'
format: int64
type: integer
type: object
Expand Down Expand Up @@ -809,12 +851,26 @@ spec:
cpuQOS:
description: CPUQOSCfg stores node-level config of cpu qos
properties:
coreExpeller:
description: 'whether pods of the QoS class can expel
the cgroup idle pods at the SMT-level. default = false
If set to true, pods of this QoS will use a dedicated
core sched group for noise clean with the SchedIdle
pods. NOTE: It takes effect if cpuPolicy = "coreSched".'
type: boolean
enable:
description: Enable indicates whether the cpu qos is enabled.
type: boolean
groupIdentity:
description: group identity value for pods, default =
0
description: 'group identity value for pods, default =
0 NOTE: It takes effect if cpuPolicy = "groupIdentity".'
format: int64
type: integer
schedIdle:
description: 'cpu.idle value for pods, default = 0. `1`
means using SCHED_IDLE. CGroup Idle (introduced since
mainline Linux 5.15): https://lore.kernel.org/lkml/162971078674.25758.15464079371945307825.tip-bot2@tip-bot2/#r
NOTE: It takes effect if cpuPolicy = "coreSched".'
format: int64
type: integer
type: object
Expand Down Expand Up @@ -948,6 +1004,13 @@ spec:
type: integer
type: object
type: object
policies:
description: Policies of pod QoS.
properties:
cpuPolicy:
description: applied policy for the CPU QoS, default = "groupIdentity"
type: string
type: object
systemClass:
description: ResourceQOS for system pods
properties:
Expand Down Expand Up @@ -1020,12 +1083,26 @@ spec:
cpuQOS:
description: CPUQOSCfg stores node-level config of cpu qos
properties:
coreExpeller:
description: 'whether pods of the QoS class can expel
the cgroup idle pods at the SMT-level. default = false
If set to true, pods of this QoS will use a dedicated
core sched group for noise clean with the SchedIdle
pods. NOTE: It takes effect if cpuPolicy = "coreSched".'
type: boolean
enable:
description: Enable indicates whether the cpu qos is enabled.
type: boolean
groupIdentity:
description: group identity value for pods, default =
0
description: 'group identity value for pods, default =
0 NOTE: It takes effect if cpuPolicy = "groupIdentity".'
format: int64
type: integer
schedIdle:
description: 'cpu.idle value for pods, default = 0. `1`
means using SCHED_IDLE. CGroup Idle (introduced since
mainline Linux 5.15): https://lore.kernel.org/lkml/162971078674.25758.15464079371945307825.tip-bot2@tip-bot2/#r
NOTE: It takes effect if cpuPolicy = "coreSched".'
format: int64
type: integer
type: object
Expand Down
37 changes: 26 additions & 11 deletions pkg/koordlet/runtimehooks/hooks/groupidentity/rule.go
Original file line number Diff line number Diff line change
Expand Up @@ -76,16 +76,31 @@ func (r *bvtRule) getHostQOSBvtValue(qosClass ext.QoSClass) int64 {

func (b *bvtPlugin) parseRule(mergedNodeSLOIf interface{}) (bool, error) {
mergedNodeSLO := mergedNodeSLOIf.(*slov1alpha1.NodeSLOSpec)
qosStrategy := mergedNodeSLO.ResourceQOSStrategy

// check if bvt is enabled
enable := *mergedNodeSLO.ResourceQOSStrategy.LSRClass.CPUQOS.Enable ||
*mergedNodeSLO.ResourceQOSStrategy.LSClass.CPUQOS.Enable ||
*mergedNodeSLO.ResourceQOSStrategy.BEClass.CPUQOS.Enable
// default policy enables
isPolicyGroupIdentity := qosStrategy.Policies == nil || qosStrategy.Policies.CPUPolicy == nil ||
len(*qosStrategy.Policies.CPUPolicy) <= 0 || *qosStrategy.Policies.CPUPolicy == slov1alpha1.CPUQOSPolicyGroupIdentity
// check if bvt (group identity) is enabled
lsrEnabled := isPolicyGroupIdentity && *qosStrategy.LSRClass.CPUQOS.Enable
lsEnabled := isPolicyGroupIdentity && *qosStrategy.LSClass.CPUQOS.Enable
beEnabled := isPolicyGroupIdentity && *qosStrategy.BEClass.CPUQOS.Enable

// setting pod rule by qos config
lsrValue := *mergedNodeSLO.ResourceQOSStrategy.LSRClass.CPUQOS.CPUQOS.GroupIdentity
lsValue := *mergedNodeSLO.ResourceQOSStrategy.LSClass.CPUQOS.GroupIdentity
beValue := *mergedNodeSLO.ResourceQOSStrategy.BEClass.CPUQOS.GroupIdentity
// Group Identity should be reset if the CPU QOS disables (already merged in states informer) or the CPU QoS policy
// is not "groupIdentity".
lsrValue := *sloconfig.NoneCPUQOS().GroupIdentity
if lsrEnabled {
lsrValue = *qosStrategy.LSRClass.CPUQOS.GroupIdentity
}
lsValue := *sloconfig.NoneCPUQOS().GroupIdentity
if lsEnabled {
lsValue = *qosStrategy.LSClass.CPUQOS.GroupIdentity
}
beValue := *sloconfig.NoneCPUQOS().GroupIdentity
if beEnabled {
beValue = *qosStrategy.BEClass.CPUQOS.GroupIdentity
}

// setting besteffort according to BE
besteffortDirVal := beValue
Expand All @@ -95,18 +110,18 @@ func (b *bvtPlugin) parseRule(mergedNodeSLOIf interface{}) (bool, error) {
burstableDirVal := lsValue
burstablePodVal := lsValue

// NOTICE guaranteed root dir must set as 0 until kernel supported
// NOTE: guaranteed root dir must set as 0 until kernel supported
guaranteedDirVal := *sloconfig.NoneCPUQOS().GroupIdentity
// setting guaranteed pod enabled if LS or LSR enabled
guaranteedPodVal := *sloconfig.NoneCPUQOS().GroupIdentity
if *mergedNodeSLO.ResourceQOSStrategy.LSRClass.CPUQOS.Enable {
if lsrEnabled {
guaranteedPodVal = lsrValue
} else if *mergedNodeSLO.ResourceQOSStrategy.LSClass.CPUQOS.Enable {
} else if lsEnabled {
guaranteedPodVal = lsValue
}

newRule := &bvtRule{
enable: enable,
enable: lsrEnabled || lsEnabled || beEnabled,
podQOSParams: map[ext.QoSClass]int64{
ext.QoSLSE: lsrValue,
ext.QoSLSR: lsrValue,
Expand Down
Loading