Skip to content

Commit

Permalink
feat(agent): refactor eviction/reporter healthz rule
Browse files Browse the repository at this point in the history
  • Loading branch information
zzzzhhb committed Jun 7, 2024
1 parent 11f15e8 commit c3d1a2c
Show file tree
Hide file tree
Showing 6 changed files with 32 additions and 9 deletions.
4 changes: 2 additions & 2 deletions pkg/agent/evictionmanager/manager.go
Original file line number Diff line number Diff line change
Expand Up @@ -249,12 +249,12 @@ func (m *EvictionManger) sync(ctx context.Context) {
general.Infof(" currently, there are %v candidate pods", len(pods))
_ = m.emitter.StoreInt64(MetricsNameCandidatePodCNT, int64(len(pods)), metrics.MetricTypeNameRaw)

errList := make([]error, 0)
collector, collectErr := m.collectEvictionResult(pods)
if collectErr != nil {
errList = append(errList, collectErr)
general.Infof("collect eviction result error:%v", collectErr)
}

errList := make([]error, 0)
evictErr := m.doEvict(collector.getSoftEvictPods(), collector.getForceEvictPods())
if evictErr != nil {
errList = append(errList, evictErr)
Expand Down
3 changes: 3 additions & 0 deletions pkg/agent/evictionmanager/plugin/memory/helper.go
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@ package memory

import (
"strconv"
"time"

v1 "k8s.io/api/core/v1"

Expand All @@ -36,6 +37,8 @@ const (
actionNoop = iota
actionReclaimedEviction
actionEviction

healthCheckTimeout = 1 * time.Minute
)

// control-related variables
Expand Down
17 changes: 14 additions & 3 deletions pkg/agent/evictionmanager/plugin/memory/numa_pressure.go
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@ import (
"time"

v1 "k8s.io/api/core/v1"
"k8s.io/apimachinery/pkg/util/errors"
"k8s.io/client-go/tools/events"

pluginapi "github.com/kubewharf/katalyst-api/pkg/protocol/evictionplugin/v1alpha1"
Expand Down Expand Up @@ -80,6 +81,7 @@ type NumaMemoryPressurePlugin struct {
}

func (n *NumaMemoryPressurePlugin) Start() {
general.RegisterHeartbeatCheck(EvictionPluginNameNumaMemoryPressure, healthCheckTimeout, general.HealthzCheckStateNotReady, healthCheckTimeout)
return
}

Expand All @@ -92,6 +94,11 @@ func (n *NumaMemoryPressurePlugin) Name() string {
}

func (n *NumaMemoryPressurePlugin) ThresholdMet(_ context.Context) (*pluginapi.ThresholdMetResponse, error) {
var err error
defer func() {
_ = general.UpdateHealthzStateByError(EvictionPluginNameNumaMemoryPressure, err)
}()

resp := &pluginapi.ThresholdMetResponse{
MetType: pluginapi.ThresholdMetType_NOT_MET,
}
Expand All @@ -100,18 +107,19 @@ func (n *NumaMemoryPressurePlugin) ThresholdMet(_ context.Context) (*pluginapi.T
return resp, nil
}

n.detectNumaPressures()
err = n.detectNumaPressures()
if n.isUnderNumaPressure {
resp = &pluginapi.ThresholdMetResponse{
MetType: pluginapi.ThresholdMetType_HARD_MET,
EvictionScope: EvictionScopeNumaMemory,
}
}

return resp, nil
return resp, err
}

func (n *NumaMemoryPressurePlugin) detectNumaPressures() {
func (n *NumaMemoryPressurePlugin) detectNumaPressures() error {
var errList []error
n.isUnderNumaPressure = false
for _, numaID := range n.metaServer.CPUDetails.NUMANodes().ToSliceNoSortInt() {
n.numaActionMap[numaID] = actionNoop
Expand All @@ -120,9 +128,12 @@ func (n *NumaMemoryPressurePlugin) detectNumaPressures() {
}

if err := n.detectNumaWatermarkPressure(numaID); err != nil {
errList = append(errList, err)
continue
}
}

return errors.NewAggregate(errList)
}

func (n *NumaMemoryPressurePlugin) detectNumaWatermarkPressure(numaID int) error {
Expand Down
5 changes: 2 additions & 3 deletions pkg/agent/evictionmanager/plugin/memory/system_pressure.go
Original file line number Diff line number Diff line change
Expand Up @@ -46,7 +46,6 @@ const (
EvictionPluginNameSystemMemoryPressure = "system-memory-pressure-eviction-plugin"
EvictionScopeSystemMemory = "SystemMemory"
evictionConditionMemoryPressure = "MemoryPressure"
systemMemoryPressureHealthCheck = "system_memory_pressure_eviction_detect"
syncTolerationTurns = 3
)

Expand Down Expand Up @@ -102,7 +101,7 @@ func (s *SystemPressureEvictionPlugin) Name() string {
}

func (s *SystemPressureEvictionPlugin) Start() {
general.RegisterHeartbeatCheck(systemMemoryPressureHealthCheck, syncTolerationTurns*s.syncPeriod,
general.RegisterHeartbeatCheck(EvictionPluginNameSystemMemoryPressure, syncTolerationTurns*s.syncPeriod,
general.HealthzCheckStateNotReady, syncTolerationTurns*s.syncPeriod)
go wait.UntilWithContext(context.TODO(), s.detectSystemPressures, s.syncPeriod)
}
Expand Down Expand Up @@ -144,7 +143,7 @@ func (s *SystemPressureEvictionPlugin) detectSystemPressures(_ context.Context)
defer s.Unlock()
var err error
defer func() {
_ = general.UpdateHealthzStateByError(systemMemoryPressureHealthCheck, err)
_ = general.UpdateHealthzStateByError(EvictionPluginNameSystemMemoryPressure, err)
}()

s.isUnderSystemPressure = false
Expand Down
4 changes: 4 additions & 0 deletions pkg/agent/evictionmanager/plugin/resource/resources.go
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@ import (
"context"
"fmt"
"sort"
"time"

v1 "k8s.io/api/core/v1"
"k8s.io/apimachinery/pkg/util/sets"
Expand All @@ -36,6 +37,8 @@ const (
MetricsNamePodCount = "pod_count"
MetricsNamePodResource = "pod_resource"
MetricsNameGetResourceEmpty = "get_resource_empty"

defaultHealthCheckTimeout = 1 * time.Minute
)

type ResourcesGetter func(ctx context.Context) (v1.ResourceList, error)
Expand Down Expand Up @@ -94,6 +97,7 @@ func (b *ResourcesEvictionPlugin) Name() string {
}

func (b *ResourcesEvictionPlugin) Start() {
general.RegisterHeartbeatCheck(ReclaimedResourcesEvictionPluginName, defaultHealthCheckTimeout, general.HealthzCheckStateNotReady, defaultHealthCheckTimeout)
return
}

Expand Down
8 changes: 7 additions & 1 deletion pkg/agent/resourcemanager/fetcher/manager.go
Original file line number Diff line number Diff line change
Expand Up @@ -54,6 +54,9 @@ const (
metricsNameGetContentCost = "reporter_get_content_cost"
metricsNameGetContentPluginCost = "reporter_get_content_plugin_cost"
metricsNameGenericSyncCost = "reporter_generic_sync_cost"

reporterPushContentHealthCheckName = "reporter_push_content"
healthCheckAutoRecoverPeriod = 1 * time.Minute
)

// ReporterPluginManager is used to manage in-tree or out-tree reporter plugin registrations and
Expand Down Expand Up @@ -216,6 +219,7 @@ func (m *ReporterPluginManager) DeRegisterPlugin(pluginName string) {

// Run start the reporter plugin manager
func (m *ReporterPluginManager) Run(ctx context.Context) {
general.RegisterReportCheck(reporterManagerCheckpoint, healthCheckAutoRecoverPeriod)
go wait.UntilWithContext(ctx, m.syncFunc, m.reconcilePeriod)

klog.Infof("reporter plugin manager started")
Expand Down Expand Up @@ -304,7 +308,9 @@ func (m *ReporterPluginManager) pushContents(ctx context.Context, reportResponse
klog.Errorf("writing checkpoint encountered %v", err)
}

return m.reporter.PushContents(ctx, reportResponses)
err := m.reporter.PushContents(ctx, reportResponses)
_ = general.UpdateHealthzStateByError(reporterPushContentHealthCheckName, err)
return err
}

// genericSync periodically calls the Get function to obtain content changes
Expand Down

0 comments on commit c3d1a2c

Please sign in to comment.