Merge pull request #165 from cloudpilot-ai/cherry-pick-158-release-0.1
Cherry pick PR(158)/fix: calculate the overhead correctly
jwcesign authored Dec 13, 2024
2 parents b700bf6 + 107f1b6 commit 3e02f3e
Showing 2 changed files with 58 additions and 43 deletions.
39 changes: 1 addition & 38 deletions pkg/providers/instancetype/instancetype.go
@@ -29,7 +29,6 @@ import (
"github.com/patrickmn/go-cache"
"github.com/samber/lo"
corev1 "k8s.io/api/core/v1"
"k8s.io/apimachinery/pkg/api/resource"
"k8s.io/apimachinery/pkg/util/sets"
"sigs.k8s.io/controller-runtime/pkg/client"
"sigs.k8s.io/controller-runtime/pkg/log"
@@ -179,11 +178,6 @@ func (p *DefaultProvider) List(ctx context.Context, kc *v1alpha1.KubeletConfigur
return nil, fmt.Errorf("failed to get cluster CNI: %w", err)
}

nodeResourceOverhead, err := p.nodeOverhead(ctx)
if err != nil {
return nil, fmt.Errorf("failed to get node resource overhead: %w", err)
}

result := lo.Map(p.instanceTypesInfo, func(i *ecsclient.DescribeInstanceTypesResponseBodyInstanceTypesInstanceType, _ int) *cloudprovider.InstanceType {
zoneData := lo.Map(allZones.UnsortedList(), func(zoneID string, _ int) ZoneData {
if !p.instanceTypesOfferings[lo.FromPtr(i.InstanceTypeId)].Has(zoneID) || !vSwitchsZones.Has(zoneID) {
@@ -203,7 +197,7 @@ func (p *DefaultProvider) List(ctx context.Context, kc *v1alpha1.KubeletConfigur
// so that Karpenter is able to cache the set of InstanceTypes based on values that alter the set of instance types
// !!! Important !!!
offers := p.createOfferings(ctx, *i.InstanceTypeId, zoneData)
return NewInstanceType(ctx, nodeResourceOverhead, i, kc, p.region, nodeClass.Spec.SystemDisk, offers, clusterCNI)
return NewInstanceType(ctx, i, kc, p.region, nodeClass.Spec.SystemDisk, offers, clusterCNI)
})

// Filter out nil values
@@ -213,37 +207,6 @@ func (p *DefaultProvider) List(ctx context.Context, kc *v1alpha1.KubeletConfigur
return result, nil
}

func (p *DefaultProvider) nodeOverhead(ctx context.Context) (corev1.ResourceList, error) {
var nodes corev1.NodeList
if err := p.kubeClient.List(ctx, &nodes); err != nil {
return corev1.ResourceList{}, err
}

// We do not sure how to calculate the overhead of the node, let's just use the maximum possible
// To avoid some loop node creation
maxCPUOverHead := int64(0)
maxMemoryOverHead := int64(0)
for _, node := range nodes.Items {
capacity := node.Status.Capacity
allocatable := node.Status.Allocatable

cpuOverHead := capacity.Cpu().MilliValue() - allocatable.Cpu().MilliValue()
memoryOverHead := capacity.Memory().Value() - allocatable.Memory().Value()

if cpuOverHead > maxCPUOverHead {
maxCPUOverHead = cpuOverHead
}
if memoryOverHead > maxMemoryOverHead {
maxMemoryOverHead = memoryOverHead
}
}

return corev1.ResourceList{
corev1.ResourceCPU: *resource.NewMilliQuantity(maxCPUOverHead, resource.DecimalSI),
corev1.ResourceMemory: *resource.NewQuantity(maxMemoryOverHead, resource.DecimalSI),
}, nil
}

func (p *DefaultProvider) UpdateInstanceTypes(ctx context.Context) error {
// DO NOT REMOVE THIS LOCK ----------------------------------------------------------------------------
// We lock here so that multiple callers to getInstanceTypesOfferings do not result in cache misses and multiple
62 changes: 57 additions & 5 deletions pkg/providers/instancetype/types.go
@@ -57,7 +57,52 @@ type ZoneData struct {
Available bool
}

func NewInstanceType(ctx context.Context, overhead corev1.ResourceList,
func calculateResourceOverhead(pods, cpuM, memoryMi int64) corev1.ResourceList {
// referring to: https://help.aliyun.com/zh/ack/ack-managed-and-ack-dedicated/user-guide/resource-reservation-policy#0f5ffe176df7q
// CPU overhead calculation
cpuOverHead := calculateCPUOverhead(cpuM)

// TODO: In a real environment the actual reservation is consistently about 200MiB
// larger than what this formula predicts, so add an extra 200MiB on top.
// Memory overhead: min(11*pods + 255, memoryMi*0.25)
memoryOverHead := int64(math.Min(float64(11*pods+255), float64(memoryMi)*0.25)) + 200

return corev1.ResourceList{
corev1.ResourceCPU: *resource.NewMilliQuantity(cpuOverHead, resource.DecimalSI),
corev1.ResourceMemory: *resources.Quantity(fmt.Sprintf("%dMi", memoryOverHead)),
}
}

// thresholds defines CPU overhead thresholds and their corresponding percentages
var thresholds = [...]struct {
cores int64
overhead float64
}{
{1000, 0.06},
{2000, 0.01},
{3000, 0.005},
{4000, 0.005},
}

func calculateCPUOverhead(cpuM int64) int64 {
var cpuOverHead int64

// Calculate overhead for each threshold
for _, t := range thresholds {
if cpuM >= t.cores {
cpuOverHead += int64(1000 * t.overhead)
}
}

// Additional overhead for CPU > 4 cores (0.25%)
if cpuM > 4000 {
cpuOverHead += int64(float64(cpuM-4000) * 0.0025)
}

return cpuOverHead
}

func NewInstanceType(ctx context.Context,
info *ecsclient.DescribeInstanceTypesResponseBodyInstanceTypesInstanceType,
kc *v1alpha1.KubeletConfiguration, region string, systemDisk *v1alpha1.SystemDisk,
offerings cloudprovider.Offerings, clusterCNI string) *cloudprovider.InstanceType {
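
The tiered figures above follow the ACK reservation policy linked in the comment: 6% of the first core, 1% of the second, 0.5% each of the third and fourth, and 0.25% of any capacity beyond four cores; the memory side reserves min(11*pods + 255, 25% of memory) MiB plus the 200MiB adjustment noted in the TODO. As a rough, self-contained illustration (the helper names and instance shapes below are hypothetical, not part of this change), the formula works out like this:

package main

import (
	"fmt"
	"math"
)

// cpuOverheadMilli mirrors calculateCPUOverhead: tiered percentages of the
// first four cores, plus 0.25% of every millicore beyond four cores.
func cpuOverheadMilli(cpuM int64) int64 {
	tiers := []struct {
		cores    int64
		overhead float64
	}{{1000, 0.06}, {2000, 0.01}, {3000, 0.005}, {4000, 0.005}}

	var out int64
	for _, t := range tiers {
		if cpuM >= t.cores {
			out += int64(1000 * t.overhead)
		}
	}
	if cpuM > 4000 {
		out += int64(float64(cpuM-4000) * 0.0025)
	}
	return out
}

// memoryOverheadMi mirrors the memory half of calculateResourceOverhead:
// min(11*pods + 255, 25% of memory) plus the 200MiB adjustment from the TODO.
func memoryOverheadMi(pods, memoryMi int64) int64 {
	return int64(math.Min(float64(11*pods+255), float64(memoryMi)*0.25)) + 200
}

func main() {
	// Hypothetical instance shapes, purely to illustrate the formula.
	fmt.Println(cpuOverheadMilli(2000), memoryOverheadMi(110, 4096))   // 70 1224
	fmt.Println(cpuOverheadMilli(4000), memoryOverheadMi(110, 8192))   // 80 1665
	fmt.Println(cpuOverheadMilli(16000), memoryOverheadMi(110, 32768)) // 110 1665
}

For a 16-vCPU, 32GiB instance with 110 pods this reserves 110m of CPU and 1665MiB of memory as KubeReserved, which Karpenter subtracts from capacity when computing allocatable.
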
@@ -71,12 +116,15 @@ func NewInstanceType(ctx context.Context, overhead corev1.ResourceList,
Offerings: offerings,
Capacity: computeCapacity(ctx, info, kc.MaxPods, kc.PodsPerCore, systemDisk, clusterCNI),
Overhead: &cloudprovider.InstanceTypeOverhead{
// Follow overhead will be merged, so we can set only one overhead totally
KubeReserved: overhead,
KubeReserved: corev1.ResourceList{},
SystemReserved: corev1.ResourceList{},
EvictionThreshold: corev1.ResourceList{},
},
}

// KubeReserved, SystemReserved, and EvictionThreshold are merged into a single total overhead, so setting only KubeReserved is enough
it.Overhead.KubeReserved = calculateResourceOverhead(it.Capacity.Pods().Value(),
it.Capacity.Cpu().MilliValue(), extractMemory(info).Value()/MiBByteRatio)
if it.Requirements.Compatible(scheduling.NewRequirements(scheduling.NewRequirement(corev1.LabelOSStable, corev1.NodeSelectorOpIn, string(corev1.Windows)))) == nil {
it.Capacity[v1alpha1.ResourcePrivateIPv4Address] = *privateIPv4Address(info)
}
@@ -206,9 +254,13 @@ func cpu(info *ecsclient.DescribeInstanceTypesResponseBodyInstanceTypesInstanceT
return resources.Quantity(fmt.Sprint(*info.CpuCoreCount))
}

func memory(ctx context.Context, info *ecsclient.DescribeInstanceTypesResponseBodyInstanceTypesInstanceType) *resource.Quantity {
func extractMemory(info *ecsclient.DescribeInstanceTypesResponseBodyInstanceTypesInstanceType) *resource.Quantity {
sizeInGib := tea.Float32Value(info.MemorySize)
mem := resources.Quantity(fmt.Sprintf("%fGi", sizeInGib))
return resources.Quantity(fmt.Sprintf("%fGi", sizeInGib))
}

func memory(ctx context.Context, info *ecsclient.DescribeInstanceTypesResponseBodyInstanceTypesInstanceType) *resource.Quantity {
mem := extractMemory(info)
if mem.IsZero() {
return mem
}
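
In the NewInstanceType hunk above, extractMemory(info).Value() is divided by MiBByteRatio so that calculateResourceOverhead receives the instance memory in MiB. A minimal sketch of that conversion, assuming MiBByteRatio is 1024*1024 and substituting resource.MustParse for the repository's resources.Quantity helper:

package main

import (
	"fmt"

	"k8s.io/apimachinery/pkg/api/resource"
)

// Assumed value; the constant's definition is not part of this diff.
const MiBByteRatio = 1024 * 1024

func main() {
	// extractMemory formats the ECS MemorySize (a float32 count of GiB)
	// into a quantity string such as "8.000000Gi".
	mem := resource.MustParse(fmt.Sprintf("%fGi", float32(8.0)))

	fmt.Println(mem.Value())                // 8589934592 bytes
	fmt.Println(mem.Value() / MiBByteRatio) // 8192, the MiB value passed to calculateResourceOverhead
}
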
