Skip to content

Commit

Permalink
fix(qrm): support topology checking switch for normal share cores pods
Browse files Browse the repository at this point in the history
  • Loading branch information
nightmeng committed Oct 11, 2024
1 parent 2eef2df commit 2dc8849
Show file tree
Hide file tree
Showing 4 changed files with 64 additions and 52 deletions.
33 changes: 19 additions & 14 deletions cmd/katalyst-agent/app/options/qrm/memory_plugin.go
Original file line number Diff line number Diff line change
Expand Up @@ -25,14 +25,15 @@ import (
)

type MemoryOptions struct {
PolicyName string
ReservedMemoryGB uint64
SkipMemoryStateCorruption bool
EnableSettingMemoryMigrate bool
EnableMemoryAdvisor bool
ExtraControlKnobConfigFile string
EnableOOMPriority bool
OOMPriorityPinnedMapAbsPath string
PolicyName string
ReservedMemoryGB uint64
SkipMemoryStateCorruption bool
EnableSettingMemoryMigrate bool
EnableMemoryAdvisor bool
ExtraControlKnobConfigFile string
EnableOOMPriority bool
OOMPriorityPinnedMapAbsPath string
EnableNormalShareCoresTopologyCheck bool

SockMemOptions
LogCacheOptions
Expand Down Expand Up @@ -65,12 +66,13 @@ type LogCacheOptions struct {

func NewMemoryOptions() *MemoryOptions {
return &MemoryOptions{
PolicyName: "dynamic",
ReservedMemoryGB: 0,
SkipMemoryStateCorruption: false,
EnableSettingMemoryMigrate: false,
EnableMemoryAdvisor: false,
EnableOOMPriority: false,
PolicyName: "dynamic",
ReservedMemoryGB: 0,
SkipMemoryStateCorruption: false,
EnableSettingMemoryMigrate: false,
EnableMemoryAdvisor: false,
EnableOOMPriority: false,
EnableNormalShareCoresTopologyCheck: true,
SockMemOptions: SockMemOptions{
EnableSettingSockMem: false,
SetGlobalTCPMemRatio: 20, // default: 20% * {host total memory}
Expand Down Expand Up @@ -105,6 +107,8 @@ func (o *MemoryOptions) AddFlags(fss *cliflag.NamedFlagSets) {
o.ExtraControlKnobConfigFile, "the absolute path of extra control knob config file")
fs.BoolVar(&o.EnableOOMPriority, "enable-oom-priority",
o.EnableOOMPriority, "if set true, we will enable oom priority enhancement")
fs.BoolVar(&o.EnableNormalShareCoresTopologyCheck, "enable-normal-share-cores-topology-check",
o.EnableNormalShareCoresTopologyCheck, "enable the topology check for normal shares cores pods")
fs.StringVar(&o.OOMPriorityPinnedMapAbsPath, "oom-priority-pinned-bpf-map-path",
o.OOMPriorityPinnedMapAbsPath, "the absolute path of oom priority pinned bpf map")
fs.BoolVar(&o.EnableSettingSockMem, "enable-setting-sockmem",
Expand Down Expand Up @@ -137,6 +141,7 @@ func (o *MemoryOptions) ApplyTo(conf *qrmconfig.MemoryQRMPluginConfig) error {
conf.EnableMemoryAdvisor = o.EnableMemoryAdvisor
conf.ExtraControlKnobConfigFile = o.ExtraControlKnobConfigFile
conf.EnableOOMPriority = o.EnableOOMPriority
conf.EnableNormalShareCoresTopologyCheck = o.EnableNormalShareCoresTopologyCheck
conf.OOMPriorityPinnedMapAbsPath = o.OOMPriorityPinnedMapAbsPath
conf.EnableSettingSockMem = o.EnableSettingSockMem
conf.SetGlobalTCPMemRatio = o.SetGlobalTCPMemRatio
Expand Down
53 changes: 28 additions & 25 deletions pkg/agent/qrm-plugins/memory/dynamicpolicy/policy.go
Original file line number Diff line number Diff line change
Expand Up @@ -160,6 +160,8 @@ type DynamicPolicy struct {

enableEvictingLogCache bool
logCacheEvictionManager logcache.Manager

enableNormalShareCoresTopologyCheck bool
}

func NewDynamicPolicy(agentCtx *agent.GenericContext, conf *config.Configuration,
Expand Down Expand Up @@ -199,31 +201,32 @@ func NewDynamicPolicy(agentCtx *agent.GenericContext, conf *config.Configuration
})

policyImplement := &DynamicPolicy{
topology: agentCtx.CPUTopology,
qosConfig: conf.QoSConfiguration,
emitter: wrappedEmitter,
metaServer: agentCtx.MetaServer,
state: stateImpl,
stopCh: make(chan struct{}),
migratingMemory: make(map[string]map[string]bool),
residualHitMap: make(map[string]int64),
enhancementHandlers: make(util.ResourceEnhancementHandlerMap),
extraStateFileAbsPath: conf.ExtraStateFileAbsPath,
name: fmt.Sprintf("%s_%s", agentName, memconsts.MemoryResourcePluginPolicyNameDynamic),
podDebugAnnoKeys: conf.PodDebugAnnoKeys,
podAnnotationKeptKeys: conf.PodAnnotationKeptKeys,
podLabelKeptKeys: conf.PodLabelKeptKeys,
asyncWorkers: asyncworker.NewAsyncWorkers(memoryPluginAsyncWorkersName, wrappedEmitter),
defaultAsyncLimitedWorkers: asyncworker.NewAsyncLimitedWorkers(memoryPluginAsyncWorkersName, defaultAsyncWorkLimit, wrappedEmitter),
enableSettingMemoryMigrate: conf.EnableSettingMemoryMigrate,
enableSettingSockMem: conf.EnableSettingSockMem,
enableMemoryAdvisor: conf.EnableMemoryAdvisor,
memoryAdvisorSocketAbsPath: conf.MemoryAdvisorSocketAbsPath,
memoryPluginSocketAbsPath: conf.MemoryPluginSocketAbsPath,
extraControlKnobConfigs: extraControlKnobConfigs, // [TODO]: support modifying extraControlKnobConfigs by KCC
enableOOMPriority: conf.EnableOOMPriority,
oomPriorityMapPinnedPath: conf.OOMPriorityPinnedMapAbsPath,
enableEvictingLogCache: conf.EnableEvictingLogCache,
topology: agentCtx.CPUTopology,
qosConfig: conf.QoSConfiguration,
emitter: wrappedEmitter,
metaServer: agentCtx.MetaServer,
state: stateImpl,
stopCh: make(chan struct{}),
migratingMemory: make(map[string]map[string]bool),
residualHitMap: make(map[string]int64),
enhancementHandlers: make(util.ResourceEnhancementHandlerMap),
extraStateFileAbsPath: conf.ExtraStateFileAbsPath,
name: fmt.Sprintf("%s_%s", agentName, memconsts.MemoryResourcePluginPolicyNameDynamic),
podDebugAnnoKeys: conf.PodDebugAnnoKeys,
podAnnotationKeptKeys: conf.PodAnnotationKeptKeys,
podLabelKeptKeys: conf.PodLabelKeptKeys,
asyncWorkers: asyncworker.NewAsyncWorkers(memoryPluginAsyncWorkersName, wrappedEmitter),
defaultAsyncLimitedWorkers: asyncworker.NewAsyncLimitedWorkers(memoryPluginAsyncWorkersName, defaultAsyncWorkLimit, wrappedEmitter),
enableSettingMemoryMigrate: conf.EnableSettingMemoryMigrate,
enableSettingSockMem: conf.EnableSettingSockMem,
enableMemoryAdvisor: conf.EnableMemoryAdvisor,
memoryAdvisorSocketAbsPath: conf.MemoryAdvisorSocketAbsPath,
memoryPluginSocketAbsPath: conf.MemoryPluginSocketAbsPath,
extraControlKnobConfigs: extraControlKnobConfigs, // [TODO]: support modifying extraControlKnobConfigs by KCC
enableOOMPriority: conf.EnableOOMPriority,
oomPriorityMapPinnedPath: conf.OOMPriorityPinnedMapAbsPath,
enableEvictingLogCache: conf.EnableEvictingLogCache,
enableNormalShareCoresTopologyCheck: conf.EnableNormalShareCoresTopologyCheck,
}

policyImplement.allocationHandlers = map[string]util.AllocationHandler{
Expand Down
28 changes: 15 additions & 13 deletions pkg/agent/qrm-plugins/memory/dynamicpolicy/policy_hint_handlers.go
Original file line number Diff line number Diff line change
Expand Up @@ -48,20 +48,22 @@ func (p *DynamicPolicy) sharedCoresHintHandler(ctx context.Context,

// TODO: support sidecar containers following the main container for normal share cores in the future
if req.ContainerType == pluginapi.ContainerType_MAIN {
ok, err := p.checkNormalShareCoresResource(req)
if err != nil {
general.Errorf("failed to check share cores resource: %q", err)
return nil, fmt.Errorf("failed to check share cores resource: %q", err)
}
if p.enableNormalShareCoresTopologyCheck {
ok, err := p.checkNormalShareCoresResource(req)
if err != nil {
general.Errorf("failed to check share cores resource: %q", err)
return nil, fmt.Errorf("failed to check share cores resource: %q", err)
}

if !ok {
_ = p.emitter.StoreInt64(util.MetricNameShareCoresNoEnoughResourceFailed, 1, metrics.MetricTypeNameCount, metrics.ConvertMapToTags(map[string]string{
"resource": v1.ResourceMemory.String(),
"podNamespace": req.PodNamespace,
"podName": req.PodName,
"containerName": req.ContainerName,
})...)
return nil, errNoAvailableMemoryHints
if !ok {
_ = p.emitter.StoreInt64(util.MetricNameShareCoresNoEnoughResourceFailed, 1, metrics.MetricTypeNameCount, metrics.ConvertMapToTags(map[string]string{
"resource": v1.ResourceMemory.String(),
"podNamespace": req.PodNamespace,
"podName": req.PodName,
"containerName": req.ContainerName,
})...)
return nil, errNoAvailableMemoryHints
}
}
}

Expand Down
2 changes: 2 additions & 0 deletions pkg/config/agent/qrm/memory_plugin.go
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,8 @@ type MemoryQRMPluginConfig struct {
EnableOOMPriority bool
// OOMPriorityPinnedMapAbsPath: the absolute path of oom priority pinned bpf map
OOMPriorityPinnedMapAbsPath string
// EnableNormalShareCoresTopologyCheck: enable the topology check for normal share cores pods
EnableNormalShareCoresTopologyCheck bool

// SockMemQRMPluginConfig: the configuration for sockmem limitation in cgroup and host level
SockMemQRMPluginConfig
Expand Down

0 comments on commit 2dc8849

Please sign in to comment.