From 90ddb290813a494224a5a3e95a8c79861734a60e Mon Sep 17 00:00:00 2001 From: z1ens Date: Thu, 15 Aug 2024 14:48:18 +0200 Subject: [PATCH] Node scope and cluster scope score added, fix docs, filter out completed pods, add explanation of the scores Signed-off-by: z1ens --- resource-usage-collect-addon/README.md | 61 ++++-- .../pkg/addon/agent/agent.go | 35 +++- .../pkg/addon/agent/calculate.go | 193 +++++++++++++----- .../pkg/addon/agent/calculate_test.go | 87 ++++---- 4 files changed, 245 insertions(+), 131 deletions(-) diff --git a/resource-usage-collect-addon/README.md b/resource-usage-collect-addon/README.md index 0daf482..af934ec 100644 --- a/resource-usage-collect-addon/README.md +++ b/resource-usage-collect-addon/README.md @@ -1,25 +1,39 @@ # Resource usage collect addon + ## Background -Open-Cluster-Management has already supported [extensible placement scheduling](https://github.com/open-cluster-management-io/enhancements/blob/main/enhancements/sig-architecture/32-extensiblescheduling/32-extensiblescheduling.md), which allow users to use [addonplacementscore](https://github.com/open-cluster-management-io/enhancements/blob/main/enhancements/sig-architecture/32-extensiblescheduling/32-extensiblescheduling.md#addonplacementscore-api) to select clusters under certain conditions. -The basic idea of `addonPlacementScore` is that, the addon agent, which is installed on the managed cluster, collect information about the managed cluster, and calculate a score. These scores can be used when selecting or comparing multiple clusters. -With the rapid advancement of artificial intelligence, an increasing number of developers need to schedule and plan workloads based on available resources to achieve better performance and save resources. +With the rapid advancement of artificial intelligence, an increasing number of developers need to schedule and plan AI/ML workloads based on available resources to achieve optimal performance and resource efficiency. + + +Open-Cluster-Management (OCM) has already implemented `Placement` and supports [extensible placement scheduling](https://github.com/open-cluster-management-io/enhancements/blob/main/enhancements/sig-architecture/32-extensiblescheduling/32-extensiblescheduling.md), which allows for advanced, customizable workload scheduling across clusters. The key components are: + +- `Placement`: Enables the dynamic selection of a set of `ManagedClusters` within one or more `ManagedClusterSets` to facilitate multi-cluster scheduling. +- `AddOnPlacementScore`: An API introduced by `Placement` to support scheduling based on customized scores. + +The `resource-usage-addon` is developed with `AddonTemplate` and operates within this framework: +- Once installed on the hub cluster, the addon deploys an agent on each managed cluster. +- Agent pods on the managed clusters collect resource usage data and calculate a corresponding score. +- These scores are then used by `Placement` to inform cluster selection, ensuring workloads are deployed on clusters with the most appropriate available resources. + +This repository, developed as part of [Google Summer of Code 2024](https://github.com/open-cluster-management-io/ocm/issues/369), introduces enhancements to the `resource-usage-addon`, including new support for scheduling based on GPU and TPU resource availability. +This update is particularly valuable for developers seeking to optimize AI/ML workloads across multiple clusters.
-This repository mainly introduce an addon which collects the resource usage information in the managed clusters and calculate `addonPlacementScore`, users could select clusters based on the score using a `placement`. -A possible use case could be: As a developer, I want to deploy my work on the cluster who has the most GPU resources available. This addon is developed using `addonTemplate`. -More details about: -- Extensible scheduling, please refer to [Extend the multicluster scheduling capabilities with placement](https://open-cluster-management.io/scenarios/extend-multicluster-scheduling-capabilities/) -- Add-on, please refer to [What-is-an-addon](https://open-cluster-management.io/concepts/addon/#what-is-an-add-on) -- Placement, please refer to [What-is-a-placement](https://open-cluster-management.io/concepts/placement/#select-clusters-in-managedclusterset) -- Addon template, please refer to [Enhancement:addontemplate](https://github.com/open-cluster-management-io/enhancements/tree/main/enhancements/sig-architecture/82-addon-template) +REF: +- [GSoC 2024: Scheduling AI workload among multiple clusters #369](https://github.com/open-cluster-management-io/ocm/issues/369) +- [Extend the multicluster scheduling capabilities with placement](https://open-cluster-management.io/scenarios/extend-multicluster-scheduling-capabilities/) +- [What-is-an-addon](https://open-cluster-management.io/concepts/addon/#what-is-an-add-on) +- [What-is-a-placement](https://open-cluster-management.io/concepts/placement/#select-clusters-in-managedclusterset) +- [Enhancement:addontemplate](https://github.com/open-cluster-management-io/enhancements/tree/main/enhancements/sig-architecture/82-addon-template) # Quickstart ## Prerequisite -1. Follow the instructions on [OCM official website](https://open-cluster-management.io/getting-started/quick-start/) install`clusteradm` command-line tool and set up a hub (manager) cluster and two managed clusters. If using a different kubernetes distribution, follow the instructions in [Set-hub-and-managed-cluster](https://open-cluster-management.io/getting-started/quick-start/#setup-hub-and-managed-cluster). +1. Follow the instructions on the [OCM official website](https://open-cluster-management.io/getting-started/quick-start/) to install the `clusteradm` command-line tool and set up a hub (manager) cluster with two managed clusters. + If you prefer a different Kubernetes distribution, follow the instructions in [Set-hub-and-managed-cluster](https://open-cluster-management.io/getting-started/quick-start/#setup-hub-and-managed-cluster). + 2. Command-line tool `kubectl` installed. + 3. [Docker](https://www.docker.com/) installed. ## Deploy @@ -54,7 +68,7 @@ make deploy If deployed successfully: -On the hub cluster, you can see the `addonTemplate`, and check the `managedClusterAddon` status. +On the hub cluster, you can see the `AddonTemplate` and check the `ManagedClusterAddon` status. ```bash $ kubectl get addontemplate NAME ADDON NAME resource-usage-collect resource-usage-collect $ kubectl get managedclusteraddon -A NAMESPACE NAME AVAILABLE DEGRADED PROGRESSING cluster1 resource-usage-collect True False cluster2 resource-usage-collect True False ``` -After a short while,on the hub cluster, `addonPlacementScore` for each managed cluster will be generated. +After a short while, an `AddOnPlacementScore` for each managed cluster will be generated on the hub cluster.
```bash $ kubectl config use kind-hub $ kubectl get addonplacementscore -A NAMESPACE NAME AGE cluster1 resource-usage-score 3m23s cluster2 resource-usage-score 3m24s ``` +### Resource Scoring Strategies + +#### Node Scope Score +- Node Scope Score: Indicates the resources available on the node with the most free capacity in the cluster, aiding in selecting the best node for resource-intensive workloads. +- Code Representation: Represented as `cpuNodeAvailable`, `gpuNodeAvailable`, etc., indicating available CPU and GPU resources on specific nodes. + +#### Example Use Scenario: +- Scenario: Suppose you have a cluster with three nodes: Node A with 2 available GPUs, Node B with 4 available GPUs, and Node C with 6 available GPUs. You need to deploy a job that requires 1 GPU. +- Scheduling Strategies: Using the Node Scope Score, specifically `gpuNodeAvailable`, a bin-packing strategy would prefer the node with the lowest `gpuNodeAvailable` that still fits the job, and therefore place the job on Node A. This keeps Nodes B and C more available for future jobs that may require more resources, minimizes fragmentation, and ensures that larger jobs can be accommodated later. + +#### Cluster Scope Score +- Cluster Scope Score: Reflects the total available resources across the entire cluster, helping to determine if the cluster can support additional workloads. +- Code Representation: Represented as `cpuClusterAvailable`, `gpuClusterAvailable`, etc., aggregating available resources across all nodes in the cluster. + +#### Example Use Scenario: +- Scenario: Consider a multi-cluster environment where Cluster X has 10 available GPUs across all nodes, Cluster Y has 6 available GPUs, and Cluster Z has 8 available GPUs. You need to deploy two jobs: the first requires 3 GPUs, and the second requires 4 GPUs. +- Scheduling Strategies: Using the Cluster Scope Score, specifically `gpuClusterAvailable`, the scheduler would place the first job on Cluster X because it has the most available GPU resources. Cluster X's score then becomes lower, so the scheduler deploys the second job on Cluster Z. This spreads workloads out, maximizing resource utilization across clusters and avoiding overloading a single cluster. Both scoring computations are illustrated in the Go sketch after this diff. ### Use Placement to select clusters Consider this example use case: As a developer, I want to select a cluster with the most available GPU resources and deploy a job on it. @@ -118,4 +149,4 @@ make undeploy ### Troubleshoot 1. If `make deploy` does not work, there may be an auto-generated `kustomization_tmp.yaml.tmp` file; delete it and rerun the command. -Also make sure you are under hub cluster context, check the `kustomization.yaml` file, delete the part under `configMapGenerator`(if there is one exists). + Also make sure you are under the hub cluster context, check the `kustomization.yaml` file, and delete the part under `configMapGenerator` (if one exists).
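To make the two scoring strategies above concrete, here is a minimal, self-contained Go sketch of the raw availability figures behind `gpuNodeAvailable` and `gpuClusterAvailable`. It is an illustration only: the node names and GPU counts come from the README scenarios, not from the agent's real data sources.

```go
package main

import "fmt"

func main() {
	// Free GPUs per node, mirroring the Node Scope scenario above.
	nodeGPUs := map[string]float64{"nodeA": 2, "nodeB": 4, "nodeC": 6}

	// Node scope: take the single node with the most available GPUs
	// (what calculateMaxAvailableNode does for real nodes).
	var gpuNodeAvailable float64
	for _, free := range nodeGPUs {
		if free > gpuNodeAvailable {
			gpuNodeAvailable = free
		}
	}

	// Cluster scope: sum the available GPUs across all nodes
	// (what calculateClusterAvailable does for real nodes).
	var gpuClusterAvailable float64
	for _, free := range nodeGPUs {
		gpuClusterAvailable += free
	}

	fmt.Printf("gpuNodeAvailable=%v, gpuClusterAvailable=%v\n",
		gpuNodeAvailable, gpuClusterAvailable) // prints 6 and 12
}
```

In the agent itself, the per-node availability is allocatable capacity minus the requests of unfinished pods, and each raw figure is normalized to a [-100, 100] score before being published, as the `calculate.go` changes below show.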
diff --git a/resource-usage-collect-addon/pkg/addon/agent/agent.go b/resource-usage-collect-addon/pkg/addon/agent/agent.go index a215228..d5f1415 100644 --- a/resource-usage-collect-addon/pkg/addon/agent/agent.go +++ b/resource-usage-collect-addon/pkg/addon/agent/agent.go @@ -158,26 +158,43 @@ func newAgentController( func (c *agentController) sync(ctx context.Context, syncCtx factory.SyncContext) error { score := NewScore(c.nodeInformer, c.podInformer) - cpuScore, memScore, gpuScore, tpuScore, err := score.calculateScore() + cpuNodeScore, memNodeScore, gpuNodeScore, tpuNodeScore, err := score.calculateNodeScore() + if err != nil { + return err + } + cpuClusterScore, memClusterScore, gpuClusterScore, tpuClusterScore, err := score.calculateClusterScopeScore() if err != nil { return err } items := []apiv1alpha2.AddOnPlacementScoreItem{ { - Name: "cpuAvailable", - Value: cpuScore, + Name: "cpuNodeAvailable", + Value: cpuNodeScore, }, { - Name: "memAvailable", - Value: memScore, + Name: "cpuClusterAvailable", + Value: cpuClusterScore, }, { - Name: "gpuAvailable", - Value: gpuScore, + Name: "memNodeAvailable", + Value: memNodeScore, }, { - Name: "tpuAvailable", - Value: tpuScore, + Name: "memClusterAvailable", + Value: memClusterScore, + }, + { + Name: "gpuNodeAvailable", + Value: gpuNodeScore, + }, + { + Name: "gpuClusterAvailable", + Value: gpuClusterScore, + }, + { + Name: "tpuNodeAvailable", + Value: tpuNodeScore, + }, + { + Name: "tpuClusterAvailable", + Value: tpuClusterScore, }, } diff --git a/resource-usage-collect-addon/pkg/addon/agent/calculate.go b/resource-usage-collect-addon/pkg/addon/agent/calculate.go index 73c2817..754c911 100644 --- a/resource-usage-collect-addon/pkg/addon/agent/calculate.go +++ b/resource-usage-collect-addon/pkg/addon/agent/calculate.go @@ -46,75 +46,140 @@ func NewScore(nodeInformer corev1informers.NodeInformer, podInformer corev1infor } } -func (s *Score) calculateScore() (cpuScore int32, memScore int32, gpuScore int32, tpuScore int32, err error) { - cpuAlloc, cpuNode, err := s.calculateClusterAllocatable(string(clusterv1.ResourceCPU)) +// Calculate the available resources in the node scope; the node with the maximum available resources is used to calculate the score. +func (s *Score) calculateNodeScore() (cpuScore int32, memScore int32, gpuScore int32, tpuScore int32, err error) { + // Get the amount of resources available on the node with the largest actual available CPU resources. + cpuAvailable, _, err := s.calculateMaxAvailableNode(string(clusterv1.ResourceCPU)) if err != nil { return 0, 0, 0, 0, err } - memAlloc, memNode, err := s.calculateClusterAllocatable(string(clusterv1.ResourceMemory)) + // Get the amount of resources available on the node with the largest actual available Memory resources. + memAvailable, _, err := s.calculateMaxAvailableNode(string(clusterv1.ResourceMemory)) if err != nil { return 0, 0, 0, 0, err } - gpuAlloc, gpuNode, err := s.calculateClusterAllocatable(ResourceGPU) + // Get the amount of resources available on the node with the largest actual available GPU resources. + gpuAvailable, _, err := s.calculateMaxAvailableNode(ResourceGPU) if err != nil { return 0, 0, 0, 0, err } - tpuAlloc, tpuNode, err := s.calculateClusterAllocatable(ResourceTPU) + // Get the amount of resources available on the node with the largest actual available TPU resources.
+ tpuAvailable, _, err := s.calculateMaxAvailableNode(ResourceTPU) if err != nil { return 0, 0, 0, 0, err } + // Use the amount of available resources directly to generate scores. + return s.normalizeScore("node", cpuAvailable, memAvailable, gpuAvailable, tpuAvailable) +} - cpuUsage, err := s.calculateNodeResourceUsage(cpuNode, string(v1.ResourceCPU)) +// Calculate the available resources in the cluster scope and return four scores for CPU, Memory, GPU, and TPU. +func (s *Score) calculateClusterScopeScore() (cpuScore int32, memScore int32, gpuScore int32, tpuScore int32, err error) { + // Get the total available CPU resources across the cluster. + cpuAvailable, err := s.calculateClusterAvailable(string(clusterv1.ResourceCPU)) if err != nil { return 0, 0, 0, 0, err } - memUsage, err := s.calculateNodeResourceUsage(memNode, string(v1.ResourceMemory)) + + // Get the total available Memory resources across the cluster. + memAvailable, err := s.calculateClusterAvailable(string(clusterv1.ResourceMemory)) if err != nil { return 0, 0, 0, 0, err } - gpuUsage, err := s.calculateNodeResourceUsage(gpuNode, ResourceGPU) + + // Get the total available GPU resources across the cluster. + gpuAvailable, err := s.calculateClusterAvailable(ResourceGPU) if err != nil { return 0, 0, 0, 0, err } - tpuUsage, err := s.calculateNodeResourceUsage(tpuNode, ResourceTPU) + + // Get the total available TPU resources across the cluster. + tpuAvailable, err := s.calculateClusterAvailable(ResourceTPU) if err != nil { return 0, 0, 0, 0, err } - return s.normalizeScore(cpuAlloc, cpuUsage, memAlloc, memUsage, gpuAlloc, gpuUsage, tpuAlloc, tpuUsage) + // Normalize and return the scores based on available resources. + return s.normalizeScore("cluster", cpuAvailable, memAvailable, gpuAvailable, tpuAvailable) +} + +// Calculate the total available resources in the cluster scope. +func (s *Score) calculateClusterAvailable(resourceName string) (float64, error) { + nodes, err := s.nodeLister.List(labels.Everything()) + if err != nil { + return 0, err + } + + var totalAllocatable float64 + var totalUsage float64 + + for _, node := range nodes { + if node.Spec.Unschedulable { + continue + } + + // Accumulate allocatable resources from all nodes. + alloc, exists := node.Status.Allocatable[v1.ResourceName(resourceName)] + if exists { + totalAllocatable += alloc.AsApproximateFloat64() + } + + // Calculate the resource usage for this node. + usage, err := s.calculateNodeResourceUsage(node.Name, resourceName) + if err != nil { + return 0, err + } + totalUsage += usage + } + + // Calculate available resources. + available := totalAllocatable - totalUsage + return available, nil } -func (s *Score) normalizeScore(cpuAlloc, cpuUsage, memAlloc, memUsage, gpuAlloc, gpuUsage, tpuAlloc, tpuUsage float64) (cpuScore int32, memScore int32, gpuScore int32, tpuScore int32, err error) { - klog.Infof("cpuAlloc = %v, cpuUsage = %v, memAlloc = %v, memUsage = %v, gpuAlloc = %v, gpuUsage = %v, tpuAlloc = %v, tpuUsage = %v", cpuAlloc, cpuUsage, memAlloc, memUsage, gpuAlloc, gpuUsage, tpuAlloc, tpuUsage) - availableCpu := cpuAlloc - cpuUsage +// Normalize the raw availability figures into scores with the ScoreNormalizer. +func (s *Score) normalizeScore(scope string, cpuAvailable, memAvailable, gpuAvailable, tpuAvailable float64) (cpuScore int32, memScore int32, gpuScore int32, tpuScore int32, err error) { + // The scope parameter identifies whether the scores are calculated at the "cluster" or "node" scope.
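+ // Note: each normalizer below is assumed to map an available amount linearly from its [MIN, MAX] range onto a [-100, 100] score, clamping values outside that range, i.e. score = 200*(available-MIN)/(MAX-MIN) - 100. For example (assuming MINCPUCOUNT = 0 and MAXCPUCOUNT = 100), cpuAvailable = 40 yields 200*40/100 - 100 = -20, matching the expectations in calculate_test.go.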
+ klog.Infof("[%s] cpuAvailable = %v, memAvailable = %v, gpuAvailable = %v, tpuAvailable = %v", scope, cpuAvailable, memAvailable, gpuAvailable, tpuAvailable) + cpuScoreNormalizer := clustersdkv1alpha1.NewScoreNormalizer(MINCPUCOUNT, MAXCPUCOUNT) - cpuScore, err = cpuScoreNormalizer.Normalize(availableCpu) + cpuScore, err = cpuScoreNormalizer.Normalize(cpuAvailable) + if err != nil { + return 0, 0, 0, 0, err + } - availableMem := (memAlloc - memUsage) / (1024 * 1024) // MB + availableMem := memAvailable / 1024 * 1024 // MB memScoreNormalizer := clustersdkv1alpha1.NewScoreNormalizer(MINMEMCOUNT, MAXMEMCOUNT) memScore, err = memScoreNormalizer.Normalize(availableMem) + if err != nil { + return 0, 0, 0, 0, err + } - availableGpu := gpuAlloc - gpuUsage gpuScoreNormalizer := clustersdkv1alpha1.NewScoreNormalizer(MINGPUCOUNT, MAXGPUCOUNT) - gpuScore, err = gpuScoreNormalizer.Normalize(availableGpu) + gpuScore, err = gpuScoreNormalizer.Normalize(gpuAvailable) + if err != nil { + return 0, 0, 0, 0, err + } - availableTpu := tpuAlloc - tpuUsage tpuScoreNormalizer := clustersdkv1alpha1.NewScoreNormalizer(MINTPUCOUNT, MAXTPUCOUNT) - tpuScore, err = tpuScoreNormalizer.Normalize(availableTpu) + tpuScore, err = tpuScoreNormalizer.Normalize(tpuAvailable) + if err != nil { + return 0, 0, 0, 0, err + } - klog.Infof("cpuScore = %v, memScore = %v, gpuScore = %v, tpuScore = %v", cpuScore, memScore, gpuScore, tpuScore) + klog.Infof("[%s] cpuScore = %v, memScore = %v, gpuScore = %v, tpuScore = %v", scope, cpuScore, memScore, gpuScore, tpuScore) return cpuScore, memScore, gpuScore, tpuScore, nil } -// Iterate every node, find the node with maximum allocatable resource, return the number and node name. -func (s *Score) calculateClusterAllocatable(resourceName string) (float64, string, error) { +// Find the node in the cluster that has the maximum available resources. +func (s *Score) calculateMaxAvailableNode(resourceName string) (float64, string, error) { + // Get the list of all Nodes, nodes, err := s.nodeLister.List(labels.Everything()) if err != nil { return 0, "", err } - - var maxAllocatable float64 + var maxAvailable float64 var maxNodeName string + // Iterate every node, calculate its available resources amount. for _, node := range nodes { if node.Spec.Unschedulable { continue @@ -123,43 +188,26 @@ func (s *Score) calculateClusterAllocatable(resourceName string) (float64, strin if !exists { continue } - klog.Infof("Node: %s, Allocatable %s: %f", node.Name, resourceName, alloc.AsApproximateFloat64()) - if alloc.AsApproximateFloat64() > maxAllocatable { - maxAllocatable = alloc.AsApproximateFloat64() - maxNodeName = node.Name - } - } - klog.Infof("Max allocatable %s: %f on node: %s", resourceName, maxAllocatable, maxNodeName) - return maxAllocatable, maxNodeName, nil -} - -func (s *Score) getRequestForResource(resource string, requests *v1.ResourceList, nonZero bool) float64 { - if requests == nil { - return 0 - } - switch resource { - case string(v1.ResourceCPU): - // Override if un-set, but not if explicitly set to zero - if _, found := (*requests)[v1.ResourceCPU]; !found && nonZero { - return 100 - } - return requests.Cpu().AsApproximateFloat64() - case string(v1.ResourceMemory): - // Override if un-set, but not if explicitly set to zero - if _, found := (*requests)[v1.ResourceMemory]; !found && nonZero { - return 200 * 1024 * 1024 + // Get the resource usage on this node. 
+ usage, err := s.calculateNodeResourceUsage(node.Name, resourceName) + if err != nil { + return 0, "", err } - return requests.Memory().AsApproximateFloat64() - default: - quantity, found := (*requests)[v1.ResourceName(resource)] - if !found { - return 0 + // Calculate the actual amount of resources available. + available := alloc.AsApproximateFloat64() - usage + // Keep track of the node with the maximum available resources. + if available > maxAvailable { + maxAvailable = available + maxNodeName = node.Name } - return quantity.AsApproximateFloat64() } + klog.Infof("Max available %s: %f on node: %s", resourceName, maxAvailable, maxNodeName) + return maxAvailable, maxNodeName, nil } +// Calculate the actual usage of a specific resource (e.g., GPU) by unfinished Pods on a given node. func (s *Score) calculateNodeResourceUsage(nodeName string, resourceName string) (float64, error) { + // Get the list of all Pods. list, err := s.podLister.List(labels.Everything()) if err != nil { return 0, err } @@ -167,15 +215,24 @@ func (s *Score) calculateNodeResourceUsage(nodeName string, resourceName string) var podRequest float64 for _, pod := range list { + // Only count Pods scheduled to this node. if pod.Spec.NodeName != nodeName { continue } + + // Skip completed or terminating Pods, since their resources have been released. + if pod.Status.Phase == v1.PodSucceeded || pod.Status.Phase == v1.PodFailed || pod.DeletionTimestamp != nil { + continue + } + + // Calculate resource requests for each container in the Pod. for i := range pod.Spec.Containers { container := &pod.Spec.Containers[i] value := s.getRequestForResource(resourceName, &container.Resources.Requests, !s.useRequested) podRequest += value } + // Calculate resource requests for each init container. for i := range pod.Spec.InitContainers { initContainer := &pod.Spec.InitContainers[i] value := s.getRequestForResource(resourceName, &initContainer.Resources.Requests, !s.useRequested) @@ -193,3 +250,29 @@ func (s *Score) calculateNodeResourceUsage(nodeName string, resourceName string) } return podRequest, nil } + +func (s *Score) getRequestForResource(resource string, requests *v1.ResourceList, nonZero bool) float64 { + if requests == nil { + return 0 + } + switch resource { + case string(v1.ResourceCPU): + // Default to 100m (0.1 core) if un-set, but not if explicitly set to zero; request values here are measured in cores. + if _, found := (*requests)[v1.ResourceCPU]; !found && nonZero { + return 0.1 + } + return requests.Cpu().AsApproximateFloat64() + case string(v1.ResourceMemory): + // Default to 200MB if un-set, but not if explicitly set to zero. + if _, found := (*requests)[v1.ResourceMemory]; !found && nonZero { + return 200 * 1024 * 1024 + } + return requests.Memory().AsApproximateFloat64() + default: + quantity, found := (*requests)[v1.ResourceName(resource)] + if !found { + return 0 + } + return quantity.AsApproximateFloat64() + } +} diff --git a/resource-usage-collect-addon/pkg/addon/agent/calculate_test.go b/resource-usage-collect-addon/pkg/addon/agent/calculate_test.go index 5f60250..065d425 100644 --- a/resource-usage-collect-addon/pkg/addon/agent/calculate_test.go +++ b/resource-usage-collect-addon/pkg/addon/agent/calculate_test.go @@ -12,63 +12,47 @@ import ( "k8s.io/client-go/kubernetes/fake" ) -// Test normalize Score function. +// Test the normalizeScore function.
func TestNormalizeValue(t *testing.T) { cases := []struct { name string - cpuAlloc float64 - cpuUsage float64 - memAlloc float64 - memUsage float64 - gpuAlloc float64 - gpuUsage float64 - tpuAlloc float64 - tpuUsage float64 + cpuAvailable float64 + memAvailable float64 + gpuAvailable float64 + tpuAvailable float64 expectCPUScore int32 expectMemScore int32 expectGPUScore int32 expectTPUScore int32 }{ { - name: "usage < alloc", - cpuAlloc: 70, - cpuUsage: 30, - memAlloc: 1024 * 1024 * 1024 * 1024, - memUsage: 1024 * 1024 * 1024 * 500, - gpuAlloc: 8, - gpuUsage: 4, - tpuAlloc: 5, - tpuUsage: 4, - expectCPUScore: -20, - expectMemScore: 100, - expectGPUScore: -60, - expectTPUScore: -90, - }, - { - name: "usage = alloc", - cpuAlloc: 70, - cpuUsage: 70, - memAlloc: 1024 * 1024 * 1024, - memUsage: 1024 * 1024 * 1024, - gpuAlloc: 8, - gpuUsage: 8, - tpuAlloc: 10, - tpuUsage: 10, + name: "usage = alloc", // cpuAvailable, gpuAvailable, etc. are all 0. + cpuAvailable: 0, + memAvailable: 0, + gpuAvailable: 0, + tpuAvailable: 0, expectCPUScore: -100, expectMemScore: -100, expectGPUScore: -100, expectTPUScore: -100, }, { - name: "usage > alloc", - cpuAlloc: 70, - cpuUsage: 80, - memAlloc: 1024 * 1024 * 1024 * 1024, - memUsage: 1024 * 1024 * 1024 * 1025, - gpuAlloc: 8, - gpuUsage: 10, - tpuAlloc: 6, - tpuUsage: 12, + name: "usage < alloc", // cpuAvailable, gpuAvailable, etc. are all positive. + cpuAvailable: 40, + memAvailable: 524 * 1024 * 1024 * 1024, + gpuAvailable: 2, + tpuAvailable: 1, + expectCPUScore: -20, + expectMemScore: 100, + expectGPUScore: -80, + expectTPUScore: -90, + }, + { + name: "usage > alloc", // cpuAvailable, gpuAvailable, etc. are all negative. + cpuAvailable: -10, + memAvailable: -1024 * 1024 * 1024, + gpuAvailable: -10, + tpuAvailable: -10, expectCPUScore: -100, expectMemScore: -100, expectGPUScore: -100, @@ -79,7 +63,7 @@ func TestNormalizeValue(t *testing.T) { for _, c := range cases { t.Run(c.name, func(t *testing.T) { score := Score{} - cpuScore, memScore, gpuScore, tpuScore, err := score.normalizeScore(c.cpuAlloc, c.cpuUsage, c.memAlloc, c.memUsage, c.gpuAlloc, c.gpuUsage, c.tpuAlloc, c.tpuUsage) + cpuScore, memScore, gpuScore, tpuScore, err := score.normalizeScore("testScope", c.cpuAvailable, c.memAvailable, c.gpuAvailable, c.tpuAvailable) require.NoError(t, err) assert.Equal(t, c.expectCPUScore, cpuScore) assert.Equal(t, c.expectMemScore, memScore) @@ -89,7 +73,7 @@ func TestNormalizeValue(t *testing.T) { } } -// Test calculateClusterAllocatable and calculateNodeResourceUsage +// Test the calculation of available resources across the cluster and on specific nodes. func TestCalculateClusterResources(t *testing.T) { // Create testing nodes and pods. node1 := &corev1.Node{ @@ -152,18 +136,17 @@ func TestCalculateClusterResources(t *testing.T) { s := NewScore(nodeInformer, podInformer) - // Test calculateClusterAllocatable - gpuAlloc, nodeName, err := s.calculateClusterAllocatable(ResourceGPU) + // Test calculateClusterAvailable for GPUs + totalGPUAvailable, err := s.calculateClusterAvailable(ResourceGPU) require.NoError(t, err) - // Expect node2 has 8 GPUs. - assert.Equal(t, float64(8), gpuAlloc) - assert.Equal(t, "node2", nodeName) + // The cluster should have 12 GPUs available (6 from node1 + 6 from node2 after deducting 2 used by testPod).
+ assert.Equal(t, float64(12), totalGPUAvailable) - // Test calculateNodeResourceUsage - gpuUsage, err := s.calculateNodeResourceUsage(nodeName, ResourceGPU) + // Test calculateNodeResourceUsage for node2 + gpuUsage, err := s.calculateNodeResourceUsage("node2", ResourceGPU) require.NoError(t, err) - // Expect testPod use 2 GPUs. + // Expect testPod on node2 to use 2 GPUs. assert.Equal(t, float64(2), gpuUsage) }
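The reworked `TestCalculateClusterResources` no longer exercises `calculateMaxAvailableNode` directly. A complementary test could follow the same fake-client/informer pattern; the sketch below is a suggestion only, with fixture values assumed to mirror the existing ones (node1 with 6 allocatable GPUs, node2 with 8, and one running pod on node2 requesting 2) and the imports already present in `calculate_test.go` (`corev1`, `metav1`, `resource`, `informers`, `fake`, `require`, `assert`):

```go
// A possible companion test for calculateMaxAvailableNode (fixture values assumed).
func TestCalculateMaxAvailableNode(t *testing.T) {
	newNode := func(name, gpus string) *corev1.Node {
		return &corev1.Node{
			ObjectMeta: metav1.ObjectMeta{Name: name},
			Status: corev1.NodeStatus{
				Allocatable: corev1.ResourceList{
					corev1.ResourceName(ResourceGPU): resource.MustParse(gpus),
				},
			},
		}
	}
	testPod := &corev1.Pod{
		ObjectMeta: metav1.ObjectMeta{Name: "gpu-pod", Namespace: "default"},
		Spec: corev1.PodSpec{
			NodeName: "node2",
			Containers: []corev1.Container{{
				Name: "worker",
				Resources: corev1.ResourceRequirements{
					Requests: corev1.ResourceList{
						corev1.ResourceName(ResourceGPU): resource.MustParse("2"),
					},
				},
			}},
		},
		Status: corev1.PodStatus{Phase: corev1.PodRunning},
	}

	// Build informers backed by a fake clientset and seed their stores.
	kubeClient := fake.NewSimpleClientset()
	factory := informers.NewSharedInformerFactory(kubeClient, 0)
	nodeInformer := factory.Core().V1().Nodes()
	podInformer := factory.Core().V1().Pods()
	require.NoError(t, nodeInformer.Informer().GetStore().Add(newNode("node1", "6")))
	require.NoError(t, nodeInformer.Informer().GetStore().Add(newNode("node2", "8")))
	require.NoError(t, podInformer.Informer().GetStore().Add(testPod))

	s := NewScore(nodeInformer, podInformer)

	maxAvailable, _, err := s.calculateMaxAvailableNode(ResourceGPU)
	require.NoError(t, err)
	// node1 has 6 GPUs free and node2 has 8 - 2 = 6, so the maximum is 6.
	// The winning node name is not asserted because the two nodes tie and
	// lister ordering is not guaranteed.
	assert.Equal(t, float64(6), maxAvailable)
}
```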