diff --git a/pkg/api/helper/helpers.go b/pkg/api/helper/helpers.go index e181d4a249706..636d1dda0ea57 100644 --- a/pkg/api/helper/helpers.go +++ b/pkg/api/helper/helpers.go @@ -31,6 +31,30 @@ import ( "k8s.io/kubernetes/pkg/api" ) +// IsHugePageResourceName returns true if the resource name has the huge page +// resource prefix. +func IsHugePageResourceName(name api.ResourceName) bool { + return strings.HasPrefix(string(name), api.ResourceHugePagesPrefix) +} + +// HugePageResourceName returns a ResourceName with the canonical hugepage +// prefix prepended for the specified page size. The page size is converted +// to its canonical representation. +func HugePageResourceName(pageSize resource.Quantity) api.ResourceName { + return api.ResourceName(fmt.Sprintf("%s%s", api.ResourceHugePagesPrefix, pageSize.String())) +} + +// HugePageSizeFromResourceName returns the page size for the specified huge page +// resource name. If the specified input is not a valid huge page resource name +// an error is returned. +func HugePageSizeFromResourceName(name api.ResourceName) (resource.Quantity, error) { + if !IsHugePageResourceName(name) { + return resource.Quantity{}, fmt.Errorf("resource name: %s is not valid hugepage name", name) + } + pageSize := strings.TrimPrefix(string(name), api.ResourceHugePagesPrefix) + return resource.ParseQuantity(pageSize) +} + // NonConvertibleFields iterates over the provided map and filters out all but // any keys with the "non-convertible.kubernetes.io" prefix. func NonConvertibleFields(annotations map[string]string) map[string]string { @@ -113,7 +137,7 @@ var standardContainerResources = sets.NewString( // IsStandardContainerResourceName returns true if the container can make a resource request // for the specified resource func IsStandardContainerResourceName(str string) bool { - return standardContainerResources.Has(str) + return standardContainerResources.Has(str) || IsHugePageResourceName(api.ResourceName(str)) } // IsExtendedResourceName returns true if the resource name is not in the @@ -153,6 +177,7 @@ var overcommitBlacklist = sets.NewString(string(api.ResourceNvidiaGPU)) // namespace and not blacklisted. 
func IsOvercommitAllowed(name api.ResourceName) bool { return IsDefaultNamespaceResource(name) && + !IsHugePageResourceName(name) && !overcommitBlacklist.Has(string(name)) } @@ -220,7 +245,7 @@ var standardResources = sets.NewString( // IsStandardResourceName returns true if the resource is known to the system func IsStandardResourceName(str string) bool { - return standardResources.Has(str) + return standardResources.Has(str) || IsHugePageResourceName(api.ResourceName(str)) } var integerResources = sets.NewString( diff --git a/pkg/api/helper/helpers_test.go b/pkg/api/helper/helpers_test.go index f14f50d638047..7631b8bba0dc2 100644 --- a/pkg/api/helper/helpers_test.go +++ b/pkg/api/helper/helpers_test.go @@ -58,10 +58,28 @@ func TestIsStandardResource(t *testing.T) { {"disk", false}, {"blah", false}, {"x.y.z", false}, + {"hugepages-2Mi", true}, } for i, tc := range testCases { if IsStandardResourceName(tc.input) != tc.output { - t.Errorf("case[%d], expected: %t, got: %t", i, tc.output, !tc.output) + t.Errorf("case[%d], input: %s, expected: %t, got: %t", i, tc.input, tc.output, !tc.output) + } + } +} + +func TestIsStandardContainerResource(t *testing.T) { + testCases := []struct { + input string + output bool + }{ + {"cpu", true}, + {"memory", true}, + {"disk", false}, + {"hugepages-2Mi", true}, + } + for i, tc := range testCases { + if IsStandardContainerResourceName(tc.input) != tc.output { + t.Errorf("case[%d], input: %s, expected: %t, got: %t", i, tc.input, tc.output, !tc.output) } } } @@ -353,3 +371,120 @@ func TestGetNodeAffinityFromAnnotations(t *testing.T) { } } } + +func TestIsHugePageResourceName(t *testing.T) { + testCases := []struct { + name api.ResourceName + result bool + }{ + { + name: api.ResourceName("hugepages-2Mi"), + result: true, + }, + { + name: api.ResourceName("hugepages-1Gi"), + result: true, + }, + { + name: api.ResourceName("cpu"), + result: false, + }, + { + name: api.ResourceName("memory"), + result: false, + }, + } + for _, testCase := range testCases { + if testCase.result != IsHugePageResourceName(testCase.name) { + t.Errorf("resource: %v expected result: %v", testCase.name, testCase.result) + } + } +} + +func TestHugePageResourceName(t *testing.T) { + testCases := []struct { + pageSize resource.Quantity + name api.ResourceName + }{ + { + pageSize: resource.MustParse("2Mi"), + name: api.ResourceName("hugepages-2Mi"), + }, + { + pageSize: resource.MustParse("1Gi"), + name: api.ResourceName("hugepages-1Gi"), + }, + { + // verify we do not regress our canonical representation + pageSize: *resource.NewQuantity(int64(2097152), resource.BinarySI), + name: api.ResourceName("hugepages-2Mi"), + }, + } + for _, testCase := range testCases { + if result := HugePageResourceName(testCase.pageSize); result != testCase.name { + t.Errorf("pageSize: %v, expected: %v, but got: %v", testCase.pageSize.String(), testCase.name, result.String()) + } + } +} + +func TestHugePageSizeFromResourceName(t *testing.T) { + testCases := []struct { + name api.ResourceName + expectErr bool + pageSize resource.Quantity + }{ + { + name: api.ResourceName("hugepages-2Mi"), + pageSize: resource.MustParse("2Mi"), + expectErr: false, + }, + { + name: api.ResourceName("hugepages-1Gi"), + pageSize: resource.MustParse("1Gi"), + expectErr: false, + }, + { + name: api.ResourceName("hugepages-bad"), + expectErr: true, + }, + } + for _, testCase := range testCases { + value, err := HugePageSizeFromResourceName(testCase.name) + if testCase.expectErr && err == nil { + t.Errorf("Expected an error for %v", 
testCase.name) + } else if !testCase.expectErr && err != nil { + t.Errorf("Unexpected error for %v, got %v", testCase.name, err) + } else if testCase.pageSize.Value() != value.Value() { + t.Errorf("Unexpected pageSize for resource %v got %v", testCase.name, value.String()) + } + } +} + +func TestIsOvercommitAllowed(t *testing.T) { + testCases := []struct { + name api.ResourceName + allowed bool + }{ + { + name: api.ResourceCPU, + allowed: true, + }, + { + name: api.ResourceMemory, + allowed: true, + }, + { + name: api.ResourceNvidiaGPU, + allowed: false, + }, + { + name: HugePageResourceName(resource.MustParse("2Mi")), + allowed: false, + }, + } + for _, testCase := range testCases { + if testCase.allowed != IsOvercommitAllowed(testCase.name) { + t.Errorf("Unexpected result for %v", testCase.name) + } + } +} diff --git a/pkg/api/helper/qos/BUILD b/pkg/api/helper/qos/BUILD index 63b594dc911cc..1b2f763a17e1b 100644 --- a/pkg/api/helper/qos/BUILD +++ b/pkg/api/helper/qos/BUILD @@ -10,6 +10,7 @@ go_library( srcs = ["qos.go"], deps = [ "//pkg/api:go_default_library", + "//pkg/api/helper:go_default_library", "//vendor/k8s.io/apimachinery/pkg/api/resource:go_default_library", "//vendor/k8s.io/apimachinery/pkg/util/sets:go_default_library", ], diff --git a/pkg/api/helper/qos/qos.go b/pkg/api/helper/qos/qos.go index 6edd0e1b25a9a..cc58bde0c3177 100644 --- a/pkg/api/helper/qos/qos.go +++ b/pkg/api/helper/qos/qos.go @@ -22,10 +22,13 @@ import ( "k8s.io/apimachinery/pkg/api/resource" "k8s.io/apimachinery/pkg/util/sets" "k8s.io/kubernetes/pkg/api" + "k8s.io/kubernetes/pkg/api/helper" ) -// supportedComputeResources is the list of compute resources for with QoS is supported. -var supportedQoSComputeResources = sets.NewString(string(api.ResourceCPU), string(api.ResourceMemory)) +func isSupportedQoSComputeResource(name api.ResourceName) bool { + supportedQoSComputeResources := sets.NewString(string(api.ResourceCPU), string(api.ResourceMemory)) + return supportedQoSComputeResources.Has(string(name)) || helper.IsHugePageResourceName(name) +} // GetPodQOS returns the QoS class of a pod. // A pod is besteffort if none of its containers have specified any requests or limits. @@ -39,7 +42,7 @@ func GetPodQOS(pod *api.Pod) api.PodQOSClass { for _, container := range pod.Spec.Containers { // process requests for name, quantity := range container.Resources.Requests { - if !supportedQoSComputeResources.Has(string(name)) { + if !isSupportedQoSComputeResource(name) { continue } if quantity.Cmp(zeroQuantity) == 1 { @@ -55,7 +58,7 @@ func GetPodQOS(pod *api.Pod) api.PodQOSClass { // process limits qosLimitsFound := sets.NewString() for name, quantity := range container.Resources.Limits { - if !supportedQoSComputeResources.Has(string(name)) { + if !isSupportedQoSComputeResource(name) { continue } if quantity.Cmp(zeroQuantity) == 1 { @@ -70,7 +73,7 @@ func GetPodQOS(pod *api.Pod) api.PodQOSClass { } } - if len(qosLimitsFound) != len(supportedQoSComputeResources) { + if !qosLimitsFound.HasAll(string(api.ResourceMemory), string(api.ResourceCPU)) { isGuaranteed = false } } diff --git a/pkg/api/types.go b/pkg/api/types.go index 089b20aa8d028..c6e57efe47112 100644 --- a/pkg/api/types.go +++ b/pkg/api/types.go @@ -3339,6 +3339,8 @@ const ( ResourceOpaqueIntPrefix = "pod.alpha.kubernetes.io/opaque-int-resource-" // Default namespace prefix. ResourceDefaultNamespacePrefix = "kubernetes.io/" + // Name prefix for huge page resources (alpha). 
+ ResourceHugePagesPrefix = "hugepages-" ) // ResourceList is a set of (resource name, quantity) pairs. diff --git a/pkg/api/v1/helper/BUILD b/pkg/api/v1/helper/BUILD index 6377e7e182816..f7e8849de73c4 100644 --- a/pkg/api/v1/helper/BUILD +++ b/pkg/api/v1/helper/BUILD @@ -24,6 +24,7 @@ go_library( deps = [ "//pkg/api/helper:go_default_library", "//vendor/k8s.io/api/core/v1:go_default_library", + "//vendor/k8s.io/apimachinery/pkg/api/resource:go_default_library", "//vendor/k8s.io/apimachinery/pkg/labels:go_default_library", "//vendor/k8s.io/apimachinery/pkg/selection:go_default_library", "//vendor/k8s.io/apimachinery/pkg/util/sets:go_default_library", diff --git a/pkg/api/v1/helper/helpers.go b/pkg/api/v1/helper/helpers.go index 7a05c6da99ab4..0e6a48d816381 100644 --- a/pkg/api/v1/helper/helpers.go +++ b/pkg/api/v1/helper/helpers.go @@ -22,6 +22,7 @@ import ( "strings" "k8s.io/api/core/v1" + "k8s.io/apimachinery/pkg/api/resource" "k8s.io/apimachinery/pkg/labels" "k8s.io/apimachinery/pkg/selection" "k8s.io/apimachinery/pkg/util/sets" @@ -41,6 +42,31 @@ func IsExtendedResourceName(name v1.ResourceName) bool { func IsDefaultNamespaceResource(name v1.ResourceName) bool { return !strings.Contains(string(name), "/") || strings.Contains(string(name), v1.ResourceDefaultNamespacePrefix) + +} + +// IsHugePageResourceName returns true if the resource name has the huge page +// resource prefix. +func IsHugePageResourceName(name v1.ResourceName) bool { + return strings.HasPrefix(string(name), v1.ResourceHugePagesPrefix) +} + +// HugePageResourceName returns a ResourceName with the canonical hugepage +// prefix prepended for the specified page size. The page size is converted +// to its canonical representation. +func HugePageResourceName(pageSize resource.Quantity) v1.ResourceName { + return v1.ResourceName(fmt.Sprintf("%s%s", v1.ResourceHugePagesPrefix, pageSize.String())) +} + +// HugePageSizeFromResourceName returns the page size for the specified huge page +// resource name. If the specified input is not a valid huge page resource name +// an error is returned. +func HugePageSizeFromResourceName(name v1.ResourceName) (resource.Quantity, error) { + if !IsHugePageResourceName(name) { + return resource.Quantity{}, fmt.Errorf("resource name: %s is not valid hugepage name", name) + } + pageSize := strings.TrimPrefix(string(name), v1.ResourceHugePagesPrefix) + return resource.ParseQuantity(pageSize) } // IsOpaqueIntResourceName returns true if the resource name has the opaque diff --git a/pkg/api/v1/helper/qos/BUILD b/pkg/api/v1/helper/qos/BUILD index e41f01c36b936..65eaa353ab327 100644 --- a/pkg/api/v1/helper/qos/BUILD +++ b/pkg/api/v1/helper/qos/BUILD @@ -24,6 +24,7 @@ go_library( name = "go_default_library", srcs = ["qos.go"], deps = [ + "//pkg/api/v1/helper:go_default_library", "//vendor/k8s.io/api/core/v1:go_default_library", "//vendor/k8s.io/apimachinery/pkg/api/resource:go_default_library", "//vendor/k8s.io/apimachinery/pkg/util/sets:go_default_library", diff --git a/pkg/api/v1/helper/qos/qos.go b/pkg/api/v1/helper/qos/qos.go index c4cb17d224557..2fc7f22856a0f 100644 --- a/pkg/api/v1/helper/qos/qos.go +++ b/pkg/api/v1/helper/qos/qos.go @@ -20,12 +20,16 @@ import ( "k8s.io/api/core/v1" "k8s.io/apimachinery/pkg/api/resource" "k8s.io/apimachinery/pkg/util/sets" + v1helper "k8s.io/kubernetes/pkg/api/v1/helper" ) // QOSList is a set of (resource name, QoS class) pairs. 
type QOSList map[v1.ResourceName]v1.PodQOSClass -var supportedQoSComputeResources = sets.NewString(string(v1.ResourceCPU), string(v1.ResourceMemory)) +func isSupportedQoSComputeResource(name v1.ResourceName) bool { + supportedQoSComputeResources := sets.NewString(string(v1.ResourceCPU), string(v1.ResourceMemory)) + return supportedQoSComputeResources.Has(string(name)) || v1helper.IsHugePageResourceName(name) +} // GetPodQOS returns the QoS class of a pod. // A pod is besteffort if none of its containers have specified any requests or limits. @@ -39,7 +43,7 @@ func GetPodQOS(pod *v1.Pod) v1.PodQOSClass { for _, container := range pod.Spec.Containers { // process requests for name, quantity := range container.Resources.Requests { - if !supportedQoSComputeResources.Has(string(name)) { + if !isSupportedQoSComputeResource(name) { continue } if quantity.Cmp(zeroQuantity) == 1 { @@ -55,7 +59,7 @@ func GetPodQOS(pod *v1.Pod) v1.PodQOSClass { // process limits qosLimitsFound := sets.NewString() for name, quantity := range container.Resources.Limits { - if !supportedQoSComputeResources.Has(string(name)) { + if !isSupportedQoSComputeResource(name) { continue } if quantity.Cmp(zeroQuantity) == 1 { @@ -70,7 +74,7 @@ func GetPodQOS(pod *v1.Pod) v1.PodQOSClass { } } - if len(qosLimitsFound) != len(supportedQoSComputeResources) { + if !qosLimitsFound.HasAll(string(v1.ResourceMemory), string(v1.ResourceCPU)) { isGuaranteed = false } } diff --git a/pkg/api/v1/helper/qos/qos_test.go b/pkg/api/v1/helper/qos/qos_test.go index 78cf94da3ee96..a48387c3ded9e 100644 --- a/pkg/api/v1/helper/qos/qos_test.go +++ b/pkg/api/v1/helper/qos/qos_test.go @@ -130,6 +130,12 @@ func TestGetPodQOS(t *testing.T) { }), expected: v1.PodQOSBurstable, }, + { + pod: newPod("burstable-hugepages", []v1.Container{ + newContainer("burstable", addResource("hugepages-2Mi", "1Gi", getResourceList("0", "0")), addResource("hugepages-2Mi", "1Gi", getResourceList("0", "0"))), + }), + expected: v1.PodQOSBurstable, + }, } for id, testCase := range testCases { if actual := GetPodQOS(testCase.pod); testCase.expected != actual { @@ -141,7 +147,7 @@ func TestGetPodQOS(t *testing.T) { k8sv1.Convert_v1_Pod_To_api_Pod(testCase.pod, &pod, nil) if actual := qos.GetPodQOS(&pod); api.PodQOSClass(testCase.expected) != actual { - t.Errorf("[%d]: invalid qos pod %s, expected: %s, actual: %s", id, testCase.pod.Name, testCase.expected, actual) + t.Errorf("[%d]: conversion invalid qos pod %s, expected: %s, actual: %s", id, testCase.pod.Name, testCase.expected, actual) } } } diff --git a/pkg/api/validation/validation.go b/pkg/api/validation/validation.go index c484f1244315f..aac45af683bdc 100644 --- a/pkg/api/validation/validation.go +++ b/pkg/api/validation/validation.go @@ -2402,6 +2402,28 @@ func ValidateTolerations(tolerations []api.Toleration, fldPath *field.Path) fiel return allErrors } +func toResourceNames(resources api.ResourceList) []api.ResourceName { + result := []api.ResourceName{} + for resourceName := range resources { + result = append(result, resourceName) + } + return result +} + +func toSet(resourceNames []api.ResourceName) sets.String { + result := sets.NewString() + for _, resourceName := range resourceNames { + result.Insert(string(resourceName)) + } + return result +} + +func toContainerResourcesSet(ctr *api.Container) sets.String { + resourceNames := toResourceNames(ctr.Resources.Requests) + resourceNames = append(resourceNames, toResourceNames(ctr.Resources.Limits)...) 
+ return toSet(resourceNames) +} + // validateContainersOnlyForPod does additional validation for containers on a pod versus a pod template // it only does additive validation of fields not covered in validateContainers func validateContainersOnlyForPod(containers []api.Container, fldPath *field.Path) field.ErrorList { @@ -2429,6 +2451,21 @@ func ValidatePod(pod *api.Pod) field.ErrorList { allErrs = append(allErrs, validateContainersOnlyForPod(pod.Spec.Containers, specPath.Child("containers"))...) allErrs = append(allErrs, validateContainersOnlyForPod(pod.Spec.InitContainers, specPath.Child("initContainers"))...) + if utilfeature.DefaultFeatureGate.Enabled(features.HugePages) { + hugePageResources := sets.NewString() + for i := range pod.Spec.Containers { + resourceSet := toContainerResourcesSet(&pod.Spec.Containers[i]) + for resourceStr := range resourceSet { + if v1helper.IsHugePageResourceName(v1.ResourceName(resourceStr)) { + hugePageResources.Insert(resourceStr) + } + } + } + if len(hugePageResources) > 1 { + allErrs = append(allErrs, field.Invalid(specPath, hugePageResources, "must use a single hugepage size in a pod spec")) + } + } + return allErrs } @@ -3445,7 +3482,10 @@ func ValidateNode(node *api.Node) field.ErrorList { allErrs = append(allErrs, validateNodeTaints(node.Spec.Taints, fldPath.Child("taints"))...) } - // Only validate spec. All status fields are optional and can be updated later. + // Only validate spec. + // All status fields are optional and can be updated later. + // That said, if specified, we need to ensure they are valid. + allErrs = append(allErrs, ValidateNodeResources(node)...) // external ID is required. if len(node.Spec.ExternalID) == 0 { @@ -3461,6 +3501,38 @@ func ValidateNode(node *api.Node) field.ErrorList { return allErrs } +// ValidateNodeResources is used to make sure a node has valid capacity and allocatable values. +func ValidateNodeResources(node *api.Node) field.ErrorList { + allErrs := field.ErrorList{} + // Validate resource quantities in capacity. + hugePageSizes := sets.NewString() + for k, v := range node.Status.Capacity { + resPath := field.NewPath("status", "capacity", string(k)) + allErrs = append(allErrs, ValidateResourceQuantityValue(string(k), v, resPath)...) + // track any huge page size that has a positive value + if helper.IsHugePageResourceName(k) && v.Value() > int64(0) { + hugePageSizes.Insert(string(k)) + } + if len(hugePageSizes) > 1 { + allErrs = append(allErrs, field.Invalid(resPath, v, "may not have pre-allocated hugepages for multiple page sizes")) + } + } + // Validate resource quantities in allocatable. + hugePageSizes = sets.NewString() + for k, v := range node.Status.Allocatable { + resPath := field.NewPath("status", "allocatable", string(k)) + allErrs = append(allErrs, ValidateResourceQuantityValue(string(k), v, resPath)...) + // track any huge page size that has a positive value + if helper.IsHugePageResourceName(k) && v.Value() > int64(0) { + hugePageSizes.Insert(string(k)) + } + if len(hugePageSizes) > 1 { + allErrs = append(allErrs, field.Invalid(resPath, v, "may not have pre-allocated hugepages for multiple page sizes")) + } + } + return allErrs +} + // ValidateNodeUpdate tests to make sure a node update can be applied. Modifies oldNode. 
func ValidateNodeUpdate(node, oldNode *api.Node) field.ErrorList { fldPath := field.NewPath("metadata") @@ -3473,16 +3545,7 @@ func ValidateNodeUpdate(node, oldNode *api.Node) field.ErrorList { // allErrs = append(allErrs, field.Invalid("status", node.Status, "must be empty")) // } - // Validate resource quantities in capacity. - for k, v := range node.Status.Capacity { - resPath := field.NewPath("status", "capacity", string(k)) - allErrs = append(allErrs, ValidateResourceQuantityValue(string(k), v, resPath)...) - } - // Validate resource quantities in allocatable. - for k, v := range node.Status.Allocatable { - resPath := field.NewPath("status", "allocatable", string(k)) - allErrs = append(allErrs, ValidateResourceQuantityValue(string(k), v, resPath)...) - } + allErrs = append(allErrs, ValidateNodeResources(node)...) // Validate no duplicate addresses in node status. addresses := make(map[api.NodeAddress]bool) @@ -3925,6 +3988,10 @@ func ValidateResourceRequirements(requirements *api.ResourceRequirements, fldPat if resourceName == api.ResourceEphemeralStorage && !utilfeature.DefaultFeatureGate.Enabled(features.LocalStorageCapacityIsolation) { allErrs = append(allErrs, field.Forbidden(limPath, "ResourceEphemeralStorage field disabled by feature-gate for ResourceRequirements")) } + if helper.IsHugePageResourceName(resourceName) && !utilfeature.DefaultFeatureGate.Enabled(features.HugePages) { + allErrs = append(allErrs, field.Forbidden(limPath, fmt.Sprintf("%s field disabled by feature-gate for ResourceRequirements", resourceName))) + } + } for resourceName, quantity := range requirements.Requests { fldPath := reqPath.Key(string(resourceName)) diff --git a/pkg/api/validation/validation_test.go b/pkg/api/validation/validation_test.go index 26b5aa061e5b1..05491bcbaed55 100644 --- a/pkg/api/validation/validation_test.go +++ b/pkg/api/validation/validation_test.go @@ -2759,6 +2759,106 @@ func TestValidateVolumes(t *testing.T) { } } +func TestAlphaHugePagesIsolation(t *testing.T) { + successCases := []api.Pod{ + { // Basic fields. + ObjectMeta: metav1.ObjectMeta{Name: "123", Namespace: "ns"}, + Spec: api.PodSpec{ + Containers: []api.Container{ + { + Name: "ctr", Image: "image", ImagePullPolicy: "IfNotPresent", TerminationMessagePolicy: "File", + Resources: api.ResourceRequirements{ + Requests: api.ResourceList{ + api.ResourceName("hugepages-2Mi"): resource.MustParse("1Gi"), + }, + Limits: api.ResourceList{ + api.ResourceName("hugepages-2Mi"): resource.MustParse("1Gi"), + }, + }, + }, + }, + RestartPolicy: api.RestartPolicyAlways, + DNSPolicy: api.DNSClusterFirst, + }, + }, + } + failureCases := []api.Pod{ + { // Basic fields. + ObjectMeta: metav1.ObjectMeta{Name: "hugepages-shared", Namespace: "ns"}, + Spec: api.PodSpec{ + Containers: []api.Container{ + { + Name: "ctr", Image: "image", ImagePullPolicy: "IfNotPresent", TerminationMessagePolicy: "File", + Resources: api.ResourceRequirements{ + Requests: api.ResourceList{ + api.ResourceName("hugepages-2Mi"): resource.MustParse("1Gi"), + }, + Limits: api.ResourceList{ + api.ResourceName("hugepages-2Mi"): resource.MustParse("2Gi"), + }, + }, + }, + }, + RestartPolicy: api.RestartPolicyAlways, + DNSPolicy: api.DNSClusterFirst, + }, + }, + { // Basic fields. 
+ ObjectMeta: metav1.ObjectMeta{Name: "hugepages-multiple", Namespace: "ns"}, + Spec: api.PodSpec{ + Containers: []api.Container{ + { + Name: "ctr", Image: "image", ImagePullPolicy: "IfNotPresent", TerminationMessagePolicy: "File", + Resources: api.ResourceRequirements{ + Requests: api.ResourceList{ + api.ResourceName("hugepages-2Mi"): resource.MustParse("1Gi"), + api.ResourceName("hugepages-1Gi"): resource.MustParse("2Gi"), + }, + Limits: api.ResourceList{ + api.ResourceName("hugepages-2Mi"): resource.MustParse("1Gi"), + api.ResourceName("hugepages-1Gi"): resource.MustParse("2Gi"), + }, + }, + }, + }, + RestartPolicy: api.RestartPolicyAlways, + DNSPolicy: api.DNSClusterFirst, + }, + }, + } + // Enable alpha feature HugePages + err := utilfeature.DefaultFeatureGate.Set("HugePages=true") + if err != nil { + t.Errorf("Failed to enable feature gate for HugePages: %v", err) + return + } + for i := range successCases { + pod := &successCases[i] + if errs := ValidatePod(pod); len(errs) != 0 { + t.Errorf("Unexpected error for case[%d], err: %v", i, errs) + } + } + for i := range failureCases { + pod := &failureCases[i] + if errs := ValidatePod(pod); len(errs) == 0 { + t.Errorf("Expected error for case[%d], pod: %v", i, pod.Name) + } + } + // Disable alpha feature HugePages + err = utilfeature.DefaultFeatureGate.Set("HugePages=false") + if err != nil { + t.Errorf("Failed to disable feature gate for HugePages: %v", err) + return + } + // Disable alpha feature HugePages and ensure all success cases fail + for i := range successCases { + pod := &successCases[i] + if errs := ValidatePod(pod); len(errs) == 0 { + t.Errorf("Expected error for case[%d], pod: %v", i, pod.Name) + } + } +} + func TestAlphaLocalStorageCapacityIsolation(t *testing.T) { testCases := []api.VolumeSource{ @@ -7838,6 +7938,8 @@ func TestValidateNode(t *testing.T) { api.ResourceName(api.ResourceCPU): resource.MustParse("10"), api.ResourceName(api.ResourceMemory): resource.MustParse("10G"), api.ResourceName("my.org/gpu"): resource.MustParse("10"), + api.ResourceName("hugepages-2Mi"): resource.MustParse("10Gi"), + api.ResourceName("hugepages-1Gi"): resource.MustParse("0"), }, }, Spec: api.NodeSpec{ @@ -8119,6 +8221,27 @@ func TestValidateNode(t *testing.T) { ExternalID: "external", }, }, + "multiple-pre-allocated-hugepages": { + ObjectMeta: metav1.ObjectMeta{ + Name: "abc", + Labels: validSelector, + }, + Status: api.NodeStatus{ + Addresses: []api.NodeAddress{ + {Type: api.NodeExternalIP, Address: "something"}, + }, + Capacity: api.ResourceList{ + api.ResourceName(api.ResourceCPU): resource.MustParse("10"), + api.ResourceName(api.ResourceMemory): resource.MustParse("10G"), + api.ResourceName("my.org/gpu"): resource.MustParse("10"), + api.ResourceName("hugepages-2Mi"): resource.MustParse("10Gi"), + api.ResourceName("hugepages-1Gi"): resource.MustParse("10Gi"), + }, + }, + Spec: api.NodeSpec{ + ExternalID: "external", + }, + }, } for k, v := range errorCases { errs := ValidateNode(&v) diff --git a/pkg/features/kube_features.go b/pkg/features/kube_features.go index a83f5197d8f5d..711588e80eb5e 100644 --- a/pkg/features/kube_features.go +++ b/pkg/features/kube_features.go @@ -157,6 +157,12 @@ const ( // // Alternative container-level CPU affinity policies. 
CPUManager utilfeature.Feature = "CPUManager" + + // owner: @derekwaynecarr + // alpha: v1.8 + // + // Enable pods to consume pre-allocated huge pages of varying page sizes + HugePages utilfeature.Feature = "HugePages" ) func init() { @@ -180,6 +186,7 @@ var defaultKubernetesFeatureGates = map[utilfeature.Feature]utilfeature.FeatureS RotateKubeletClientCertificate: {Default: true, PreRelease: utilfeature.Beta}, PersistentLocalVolumes: {Default: false, PreRelease: utilfeature.Alpha}, LocalStorageCapacityIsolation: {Default: false, PreRelease: utilfeature.Alpha}, + HugePages: {Default: false, PreRelease: utilfeature.Alpha}, DebugContainers: {Default: false, PreRelease: utilfeature.Alpha}, PodPriority: {Default: false, PreRelease: utilfeature.Alpha}, EnableEquivalenceClassCache: {Default: false, PreRelease: utilfeature.Alpha}, diff --git a/pkg/kubelet/cadvisor/BUILD b/pkg/kubelet/cadvisor/BUILD index 15cea0e8d7222..989103b82a309 100644 --- a/pkg/kubelet/cadvisor/BUILD +++ b/pkg/kubelet/cadvisor/BUILD @@ -23,11 +23,14 @@ go_library( "//conditions:default": [], }), deps = [ + "//pkg/api/v1/helper:go_default_library", + "//pkg/features:go_default_library", "//vendor/github.com/google/cadvisor/events:go_default_library", "//vendor/github.com/google/cadvisor/info/v1:go_default_library", "//vendor/github.com/google/cadvisor/info/v2:go_default_library", "//vendor/k8s.io/api/core/v1:go_default_library", "//vendor/k8s.io/apimachinery/pkg/api/resource:go_default_library", + "//vendor/k8s.io/apiserver/pkg/util/feature:go_default_library", ] + select({ "@io_bazel_rules_go//go/platform:linux_amd64": [ "//pkg/kubelet/types:go_default_library", diff --git a/pkg/kubelet/cadvisor/util.go b/pkg/kubelet/cadvisor/util.go index a95a8fb19e29a..dd4d7c4b1fda9 100644 --- a/pkg/kubelet/cadvisor/util.go +++ b/pkg/kubelet/cadvisor/util.go @@ -21,6 +21,9 @@ import ( cadvisorapi2 "github.com/google/cadvisor/info/v2" "k8s.io/api/core/v1" "k8s.io/apimachinery/pkg/api/resource" + utilfeature "k8s.io/apiserver/pkg/util/feature" + v1helper "k8s.io/kubernetes/pkg/api/v1/helper" + "k8s.io/kubernetes/pkg/features" ) func CapacityFromMachineInfo(info *cadvisorapi.MachineInfo) v1.ResourceList { @@ -32,6 +35,17 @@ func CapacityFromMachineInfo(info *cadvisorapi.MachineInfo) v1.ResourceList { int64(info.MemoryCapacity), resource.BinarySI), } + + // if huge pages are enabled, we report them as a schedulable resource on the node + if utilfeature.DefaultFeatureGate.Enabled(features.HugePages) { + for _, hugepagesInfo := range info.HugePages { + pageSizeBytes := int64(hugepagesInfo.PageSize * 1024) + hugePagesBytes := pageSizeBytes * int64(hugepagesInfo.NumPages) + pageSizeQuantity := resource.NewQuantity(pageSizeBytes, resource.BinarySI) + c[v1helper.HugePageResourceName(*pageSizeQuantity)] = *resource.NewQuantity(hugePagesBytes, resource.BinarySI) + } + } + return c } diff --git a/pkg/kubelet/cm/BUILD b/pkg/kubelet/cm/BUILD index 992eafd9c9b4b..187cc3a0ecc22 100644 --- a/pkg/kubelet/cm/BUILD +++ b/pkg/kubelet/cm/BUILD @@ -52,6 +52,7 @@ go_library( ] + select({ "@io_bazel_rules_go//go/platform:linux_amd64": [ "//pkg/api:go_default_library", + "//pkg/api/v1/helper:go_default_library", "//pkg/api/v1/helper/qos:go_default_library", "//pkg/api/v1/resource:go_default_library", "//pkg/kubelet/cm/util:go_default_library", @@ -63,6 +64,7 @@ go_library( "//pkg/util/procfs:go_default_library", "//pkg/util/sysctl:go_default_library", "//pkg/util/version:go_default_library", + "//vendor/github.com/docker/go-units:go_default_library", 
"//vendor/github.com/opencontainers/runc/libcontainer/cgroups:go_default_library", "//vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs:go_default_library", "//vendor/github.com/opencontainers/runc/libcontainer/cgroups/systemd:go_default_library", diff --git a/pkg/kubelet/cm/cgroup_manager_linux.go b/pkg/kubelet/cm/cgroup_manager_linux.go index c210dc2f60534..e24a955e10669 100644 --- a/pkg/kubelet/cm/cgroup_manager_linux.go +++ b/pkg/kubelet/cm/cgroup_manager_linux.go @@ -24,12 +24,16 @@ import ( "strings" "time" + units "github.com/docker/go-units" "github.com/golang/glog" libcontainercgroups "github.com/opencontainers/runc/libcontainer/cgroups" cgroupfs "github.com/opencontainers/runc/libcontainer/cgroups/fs" cgroupsystemd "github.com/opencontainers/runc/libcontainer/cgroups/systemd" libcontainerconfigs "github.com/opencontainers/runc/libcontainer/configs" + "k8s.io/apimachinery/pkg/util/sets" + utilfeature "k8s.io/apiserver/pkg/util/feature" + kubefeatures "k8s.io/kubernetes/pkg/features" "k8s.io/kubernetes/pkg/kubelet/metrics" ) @@ -43,6 +47,10 @@ const ( libcontainerSystemd libcontainerCgroupManagerType = "systemd" ) +// hugePageSizeList is useful for converting to the hugetlb canonical unit +// which is what is expected when interacting with libcontainer +var hugePageSizeList = []string{"B", "kB", "MB", "GB", "TB", "PB"} + // ConvertCgroupNameToSystemd converts the internal cgroup name to a systemd name. // For example, the name /Burstable/pod_123-456 becomes Burstable-pod_123_456.slice // If outputToCgroupFs is true, it expands the systemd name into the cgroupfs form. @@ -299,10 +307,16 @@ type subsystem interface { GetStats(path string, stats *libcontainercgroups.Stats) error } -// Cgroup subsystems we currently support -var supportedSubsystems = []subsystem{ - &cgroupfs.MemoryGroup{}, - &cgroupfs.CpuGroup{}, +// getSupportedSubsystems returns list of subsystems supported +func getSupportedSubsystems() []subsystem { + supportedSubsystems := []subsystem{ + &cgroupfs.MemoryGroup{}, + &cgroupfs.CpuGroup{}, + } + if utilfeature.DefaultFeatureGate.Enabled(kubefeatures.HugePages) { + supportedSubsystems = append(supportedSubsystems, &cgroupfs.HugetlbGroup{}) + } + return supportedSubsystems } // setSupportedSubsystems sets cgroup resource limits only on the supported @@ -315,7 +329,7 @@ var supportedSubsystems = []subsystem{ // but this is not possible with libcontainers Set() method // See https://github.com/opencontainers/runc/issues/932 func setSupportedSubsystems(cgroupConfig *libcontainerconfigs.Cgroup) error { - for _, sys := range supportedSubsystems { + for _, sys := range getSupportedSubsystems() { if _, ok := cgroupConfig.Paths[sys.Name()]; !ok { return fmt.Errorf("Failed to find subsystem mount for subsystem: %v", sys.Name()) } @@ -343,6 +357,30 @@ func (m *cgroupManagerImpl) toResources(resourceConfig *ResourceConfig) *libcont if resourceConfig.CpuPeriod != nil { resources.CpuPeriod = *resourceConfig.CpuPeriod } + + // if huge pages are enabled, we set them in libcontainer + if utilfeature.DefaultFeatureGate.Enabled(kubefeatures.HugePages) { + // for each page size enumerated, set that value + pageSizes := sets.NewString() + for pageSize, limit := range resourceConfig.HugePageLimit { + sizeString := units.CustomSize("%g%s", float64(pageSize), 1024.0, hugePageSizeList) + resources.HugetlbLimit = append(resources.HugetlbLimit, &libcontainerconfigs.HugepageLimit{ + Pagesize: sizeString, + Limit: uint64(limit), + }) + pageSizes.Insert(sizeString) + } + // for each 
page size omitted, limit to 0 + for _, pageSize := range cgroupfs.HugePageSizes { + if pageSizes.Has(pageSize) { + continue + } + resources.HugetlbLimit = append(resources.HugetlbLimit, &libcontainerconfigs.HugepageLimit{ + Pagesize: pageSize, + Limit: uint64(0), + }) + } + } return resources } @@ -502,7 +540,7 @@ func (m *cgroupManagerImpl) ReduceCPULimits(cgroupName CgroupName) error { func getStatsSupportedSubsystems(cgroupPaths map[string]string) (*libcontainercgroups.Stats, error) { stats := libcontainercgroups.NewStats() - for _, sys := range supportedSubsystems { + for _, sys := range getSupportedSubsystems() { if _, ok := cgroupPaths[sys.Name()]; !ok { return nil, fmt.Errorf("Failed to find subsystem mount for subsystem: %v", sys.Name()) } diff --git a/pkg/kubelet/cm/helpers_linux.go b/pkg/kubelet/cm/helpers_linux.go index 39f445a943134..d3f91a14def76 100644 --- a/pkg/kubelet/cm/helpers_linux.go +++ b/pkg/kubelet/cm/helpers_linux.go @@ -26,6 +26,7 @@ import ( libcontainercgroups "github.com/opencontainers/runc/libcontainer/cgroups" "k8s.io/api/core/v1" + v1helper "k8s.io/kubernetes/pkg/api/v1/helper" v1qos "k8s.io/kubernetes/pkg/api/v1/helper/qos" "k8s.io/kubernetes/pkg/api/v1/resource" ) @@ -83,6 +84,23 @@ func MilliCPUToShares(milliCPU int64) int64 { return shares } +// HugePageLimits converts the API representation to a map +// from huge page size (in bytes) to huge page limit (in bytes). +func HugePageLimits(resourceList v1.ResourceList) map[int64]int64 { + hugePageLimits := map[int64]int64{} + for k, v := range resourceList { + if v1helper.IsHugePageResourceName(k) { + pageSize, _ := v1helper.HugePageSizeFromResourceName(k) + if value, exists := hugePageLimits[pageSize.Value()]; exists { + hugePageLimits[pageSize.Value()] = value + v.Value() + } else { + hugePageLimits[pageSize.Value()] = v.Value() + } + } + } + return hugePageLimits +} + // ResourceConfigForPod takes the input pod and outputs the cgroup resource config. func ResourceConfigForPod(pod *v1.Pod) *ResourceConfig { // sum requests and limits. @@ -108,6 +126,8 @@ func ResourceConfigForPod(pod *v1.Pod) *ResourceConfig { // track if limits were applied for each resource. 
memoryLimitsDeclared := true cpuLimitsDeclared := true + // map hugepage pagesize (bytes) to limits (bytes) + hugePageLimits := map[int64]int64{} for _, container := range pod.Spec.Containers { if container.Resources.Limits.Cpu().IsZero() { cpuLimitsDeclared = false @@ -115,6 +135,14 @@ func ResourceConfigForPod(pod *v1.Pod) *ResourceConfig { if container.Resources.Limits.Memory().IsZero() { memoryLimitsDeclared = false } + containerHugePageLimits := HugePageLimits(container.Resources.Requests) + for k, v := range containerHugePageLimits { + if value, exists := hugePageLimits[k]; exists { + hugePageLimits[k] = value + v + } else { + hugePageLimits[k] = v + } + } } // determine the qos class @@ -140,6 +168,7 @@ func ResourceConfigForPod(pod *v1.Pod) *ResourceConfig { shares := int64(MinShares) result.CpuShares = &shares } + result.HugePageLimit = hugePageLimits return result } diff --git a/pkg/kubelet/cm/node_container_manager.go b/pkg/kubelet/cm/node_container_manager.go index a96f9c5403790..c9cb48ce55921 100644 --- a/pkg/kubelet/cm/node_container_manager.go +++ b/pkg/kubelet/cm/node_container_manager.go @@ -28,7 +28,9 @@ import ( "k8s.io/api/core/v1" "k8s.io/apimachinery/pkg/api/resource" "k8s.io/apimachinery/pkg/types" + utilfeature "k8s.io/apiserver/pkg/util/feature" "k8s.io/kubernetes/pkg/api" + kubefeatures "k8s.io/kubernetes/pkg/features" "k8s.io/kubernetes/pkg/kubelet/events" evictionapi "k8s.io/kubernetes/pkg/kubelet/eviction/api" ) @@ -154,6 +156,10 @@ func getCgroupConfig(rl v1.ResourceList) *ResourceConfig { val := MilliCPUToShares(q.MilliValue()) rc.CpuShares = &val } + if utilfeature.DefaultFeatureGate.Enabled(kubefeatures.HugePages) { + rc.HugePageLimit = HugePageLimits(rl) + } + return &rc } diff --git a/pkg/kubelet/cm/qos_container_manager_linux.go b/pkg/kubelet/cm/qos_container_manager_linux.go index 1e5ae93933469..dab2e0bf3ac9e 100644 --- a/pkg/kubelet/cm/qos_container_manager_linux.go +++ b/pkg/kubelet/cm/qos_container_manager_linux.go @@ -27,9 +27,13 @@ import ( "k8s.io/apimachinery/pkg/util/wait" + units "github.com/docker/go-units" + cgroupfs "github.com/opencontainers/runc/libcontainer/cgroups/fs" "k8s.io/api/core/v1" + utilfeature "k8s.io/apiserver/pkg/util/feature" v1qos "k8s.io/kubernetes/pkg/api/v1/helper/qos" "k8s.io/kubernetes/pkg/api/v1/resource" + kubefeatures "k8s.io/kubernetes/pkg/features" ) const ( @@ -100,11 +104,18 @@ func (m *qosContainerManagerImpl) Start(getNodeAllocatable func() v1.ResourceLis minShares := int64(MinShares) resourceParameters.CpuShares = &minShares } + // containerConfig object stores the cgroup specifications containerConfig := &CgroupConfig{ Name: absoluteContainerName, ResourceParameters: resourceParameters, } + + // for each enumerated huge page size, the qos tiers are unbounded + if utilfeature.DefaultFeatureGate.Enabled(kubefeatures.HugePages) { + m.setHugePagesUnbounded(containerConfig) + } + // check if it exists if !cm.Exists(absoluteContainerName) { if err := cm.Create(containerConfig); err != nil { @@ -138,6 +149,29 @@ func (m *qosContainerManagerImpl) Start(getNodeAllocatable func() v1.ResourceLis return nil } +// setHugePagesUnbounded ensures hugetlb is effectively unbounded +func (m *qosContainerManagerImpl) setHugePagesUnbounded(cgroupConfig *CgroupConfig) error { + hugePageLimit := map[int64]int64{} + for _, pageSize := range cgroupfs.HugePageSizes { + pageSizeBytes, err := units.RAMInBytes(pageSize) + if err != nil { + return err + } + hugePageLimit[pageSizeBytes] = int64(1 << 62) + } + 
cgroupConfig.ResourceParameters.HugePageLimit = hugePageLimit + return nil +} + +func (m *qosContainerManagerImpl) setHugePagesConfig(configs map[v1.PodQOSClass]*CgroupConfig) error { + for _, v := range configs { + if err := m.setHugePagesUnbounded(v); err != nil { + return err + } + } + return nil +} + func (m *qosContainerManagerImpl) setCPUCgroupConfig(configs map[v1.PodQOSClass]*CgroupConfig) error { pods := m.activePods() burstablePodCPURequest := int64(0) @@ -262,6 +296,13 @@ func (m *qosContainerManagerImpl) UpdateCgroups() error { return err } + // update the qos level cgroup settings for huge pages (ensure they remain unbounded) + if utilfeature.DefaultFeatureGate.Enabled(kubefeatures.HugePages) { + if err := m.setHugePagesConfig(qosConfigs); err != nil { + return err + } + } + for resource, percentReserve := range m.qosReserved { switch resource { case v1.ResourceMemory: diff --git a/pkg/kubelet/cm/types.go b/pkg/kubelet/cm/types.go index a2e8594262bd8..4419b0a3c72b1 100644 --- a/pkg/kubelet/cm/types.go +++ b/pkg/kubelet/cm/types.go @@ -31,6 +31,8 @@ type ResourceConfig struct { CpuQuota *int64 // CPU quota period. CpuPeriod *int64 + // HugePageLimit map from page size (in bytes) to limit (in bytes) + HugePageLimit map[int64]int64 } // CgroupName is the abstract name of a cgroup prior to any driver specific conversion. diff --git a/pkg/kubelet/kubelet_node_status.go b/pkg/kubelet/kubelet_node_status.go index 4cb35ba52a122..4817e6d68e815 100644 --- a/pkg/kubelet/kubelet_node_status.go +++ b/pkg/kubelet/kubelet_node_status.go @@ -630,6 +630,19 @@ func (kl *Kubelet) setNodeStatusMachineInfo(node *v1.Node) { } node.Status.Allocatable[k] = value } + // for every huge page reservation, we need to remove it from allocatable memory + for k, v := range node.Status.Capacity { + if v1helper.IsHugePageResourceName(k) { + allocatableMemory := node.Status.Allocatable[v1.ResourceMemory] + value := *(v.Copy()) + allocatableMemory.Sub(value) + if allocatableMemory.Sign() < 0 { + // Negative Allocatable resources don't make sense. + allocatableMemory.Set(0) + } + node.Status.Allocatable[v1.ResourceMemory] = allocatableMemory + } + } } // Set versioninfo for the node. diff --git a/plugin/pkg/scheduler/algorithm/predicates/predicates.go b/plugin/pkg/scheduler/algorithm/predicates/predicates.go index 427700b5300d8..92b5e4fe81cc1 100644 --- a/plugin/pkg/scheduler/algorithm/predicates/predicates.go +++ b/plugin/pkg/scheduler/algorithm/predicates/predicates.go @@ -509,6 +509,12 @@ func GetResourceRequest(pod *v1.Pod) *schedulercache.Resource { result.SetExtended(rName, value) } } + if v1helper.IsHugePageResourceName(rName) { + value := rQuantity.Value() + if value > result.HugePages[rName] { + result.SetHugePages(rName, value) + } + } } } } @@ -542,7 +548,12 @@ func PodFitsResources(pod *v1.Pod, meta interface{}, nodeInfo *schedulercache.No // We couldn't parse metadata - fallback to computing it. 
podRequest = GetResourceRequest(pod) } - if podRequest.MilliCPU == 0 && podRequest.Memory == 0 && podRequest.NvidiaGPU == 0 && podRequest.EphemeralStorage == 0 && len(podRequest.ExtendedResources) == 0 { + if podRequest.MilliCPU == 0 && + podRequest.Memory == 0 && + podRequest.NvidiaGPU == 0 && + podRequest.EphemeralStorage == 0 && + len(podRequest.ExtendedResources) == 0 && + len(podRequest.HugePages) == 0 { return len(predicateFails) == 0, predicateFails, nil } @@ -567,6 +578,12 @@ func PodFitsResources(pod *v1.Pod, meta interface{}, nodeInfo *schedulercache.No } } + for rName, rQuant := range podRequest.HugePages { + if allocatable.HugePages[rName] < rQuant+nodeInfo.RequestedResource().HugePages[rName] { + predicateFails = append(predicateFails, NewInsufficientResourceError(rName, podRequest.HugePages[rName], nodeInfo.RequestedResource().HugePages[rName], allocatable.HugePages[rName])) + } + } + if glog.V(10) { if len(predicateFails) == 0 { // We explicitly don't do glog.V(10).Infof() to avoid computing all the parameters if this is diff --git a/plugin/pkg/scheduler/algorithm/predicates/predicates_test.go b/plugin/pkg/scheduler/algorithm/predicates/predicates_test.go index 4f5c877a866b9..734ff26a78cc9 100644 --- a/plugin/pkg/scheduler/algorithm/predicates/predicates_test.go +++ b/plugin/pkg/scheduler/algorithm/predicates/predicates_test.go @@ -73,11 +73,12 @@ func (pvs FakePersistentVolumeInfo) GetPersistentVolumeInfo(pvID string) (*v1.Pe } var ( - opaqueResourceA = v1helper.OpaqueIntResourceName("AAA") - opaqueResourceB = v1helper.OpaqueIntResourceName("BBB") + opaqueResourceA = v1helper.OpaqueIntResourceName("AAA") + opaqueResourceB = v1helper.OpaqueIntResourceName("BBB") + hugePageResourceA = v1helper.HugePageResourceName(resource.MustParse("2Mi")) ) -func makeResources(milliCPU, memory, nvidiaGPUs, pods, opaqueA, storage int64) v1.NodeResources { +func makeResources(milliCPU, memory, nvidiaGPUs, pods, opaqueA, storage, hugePageA int64) v1.NodeResources { return v1.NodeResources{ Capacity: v1.ResourceList{ v1.ResourceCPU: *resource.NewMilliQuantity(milliCPU, resource.DecimalSI), @@ -86,11 +87,12 @@ func makeResources(milliCPU, memory, nvidiaGPUs, pods, opaqueA, storage int64) v v1.ResourceNvidiaGPU: *resource.NewQuantity(nvidiaGPUs, resource.DecimalSI), opaqueResourceA: *resource.NewQuantity(opaqueA, resource.DecimalSI), v1.ResourceEphemeralStorage: *resource.NewQuantity(storage, resource.BinarySI), + hugePageResourceA: *resource.NewQuantity(hugePageA, resource.BinarySI), }, } } -func makeAllocatableResources(milliCPU, memory, nvidiaGPUs, pods, opaqueA, storage int64) v1.ResourceList { +func makeAllocatableResources(milliCPU, memory, nvidiaGPUs, pods, opaqueA, storage, hugePageA int64) v1.ResourceList { return v1.ResourceList{ v1.ResourceCPU: *resource.NewMilliQuantity(milliCPU, resource.DecimalSI), v1.ResourceMemory: *resource.NewQuantity(memory, resource.BinarySI), @@ -98,6 +100,7 @@ func makeAllocatableResources(milliCPU, memory, nvidiaGPUs, pods, opaqueA, stora v1.ResourceNvidiaGPU: *resource.NewQuantity(nvidiaGPUs, resource.DecimalSI), opaqueResourceA: *resource.NewQuantity(opaqueA, resource.DecimalSI), v1.ResourceEphemeralStorage: *resource.NewQuantity(storage, resource.BinarySI), + hugePageResourceA: *resource.NewQuantity(hugePageA, resource.BinarySI), } } @@ -348,10 +351,38 @@ func TestPodFitsResources(t *testing.T) { test: "opaque resource allocatable enforced for unknown resource for init container", reasons: 
[]algorithm.PredicateFailureReason{NewInsufficientResourceError(opaqueResourceB, 1, 0, 0)}, }, + { + pod: newResourcePod( + schedulercache.Resource{MilliCPU: 1, Memory: 1, HugePages: map[v1.ResourceName]int64{hugePageResourceA: 10}}), + nodeInfo: schedulercache.NewNodeInfo( + newResourcePod(schedulercache.Resource{MilliCPU: 0, Memory: 0, HugePages: map[v1.ResourceName]int64{hugePageResourceA: 0}})), + fits: false, + test: "hugepages resource capacity enforced", + reasons: []algorithm.PredicateFailureReason{NewInsufficientResourceError(hugePageResourceA, 10, 0, 5)}, + }, + { + pod: newResourceInitPod(newResourcePod(schedulercache.Resource{}), + schedulercache.Resource{MilliCPU: 1, Memory: 1, HugePages: map[v1.ResourceName]int64{hugePageResourceA: 10}}), + nodeInfo: schedulercache.NewNodeInfo( + newResourcePod(schedulercache.Resource{MilliCPU: 0, Memory: 0, HugePages: map[v1.ResourceName]int64{hugePageResourceA: 0}})), + fits: false, + test: "hugepages resource capacity enforced for init container", + reasons: []algorithm.PredicateFailureReason{NewInsufficientResourceError(hugePageResourceA, 10, 0, 5)}, + }, + { + pod: newResourcePod( + schedulercache.Resource{MilliCPU: 1, Memory: 1, HugePages: map[v1.ResourceName]int64{hugePageResourceA: 3}}, + schedulercache.Resource{MilliCPU: 1, Memory: 1, HugePages: map[v1.ResourceName]int64{hugePageResourceA: 3}}), + nodeInfo: schedulercache.NewNodeInfo( + newResourcePod(schedulercache.Resource{MilliCPU: 0, Memory: 0, HugePages: map[v1.ResourceName]int64{hugePageResourceA: 2}})), + fits: false, + test: "hugepages resource allocatable enforced for multiple containers", + reasons: []algorithm.PredicateFailureReason{NewInsufficientResourceError(hugePageResourceA, 6, 2, 5)}, + }, } for _, test := range enoughPodsTests { - node := v1.Node{Status: v1.NodeStatus{Capacity: makeResources(10, 20, 0, 32, 5, 20).Capacity, Allocatable: makeAllocatableResources(10, 20, 0, 32, 5, 20)}} + node := v1.Node{Status: v1.NodeStatus{Capacity: makeResources(10, 20, 0, 32, 5, 20, 5).Capacity, Allocatable: makeAllocatableResources(10, 20, 0, 32, 5, 20, 5)}} test.nodeInfo.SetNode(&node) fits, reasons, err := PodFitsResources(test.pod, PredicateMetadata(test.pod, nil), test.nodeInfo) if err != nil { @@ -406,7 +437,7 @@ func TestPodFitsResources(t *testing.T) { }, } for _, test := range notEnoughPodsTests { - node := v1.Node{Status: v1.NodeStatus{Capacity: v1.ResourceList{}, Allocatable: makeAllocatableResources(10, 20, 0, 1, 0, 0)}} + node := v1.Node{Status: v1.NodeStatus{Capacity: v1.ResourceList{}, Allocatable: makeAllocatableResources(10, 20, 0, 1, 0, 0, 0)}} test.nodeInfo.SetNode(&node) fits, reasons, err := PodFitsResources(test.pod, PredicateMetadata(test.pod, nil), test.nodeInfo) if err != nil { @@ -464,7 +495,7 @@ func TestPodFitsResources(t *testing.T) { } for _, test := range storagePodsTests { - node := v1.Node{Status: v1.NodeStatus{Capacity: makeResources(10, 20, 0, 32, 5, 20).Capacity, Allocatable: makeAllocatableResources(10, 20, 0, 32, 5, 20)}} + node := v1.Node{Status: v1.NodeStatus{Capacity: makeResources(10, 20, 0, 32, 5, 20, 5).Capacity, Allocatable: makeAllocatableResources(10, 20, 0, 32, 5, 20, 5)}} test.nodeInfo.SetNode(&node) fits, reasons, err := PodFitsResources(test.pod, PredicateMetadata(test.pod, nil), test.nodeInfo) if err != nil { @@ -1889,7 +1920,7 @@ func TestRunGeneralPredicates(t *testing.T) { newResourcePod(schedulercache.Resource{MilliCPU: 9, Memory: 19})), node: &v1.Node{ ObjectMeta: metav1.ObjectMeta{Name: "machine1"}, - Status: 
v1.NodeStatus{Capacity: makeResources(10, 20, 0, 32, 0, 0).Capacity, Allocatable: makeAllocatableResources(10, 20, 0, 32, 0, 0)}, + Status: v1.NodeStatus{Capacity: makeResources(10, 20, 0, 32, 0, 0, 0).Capacity, Allocatable: makeAllocatableResources(10, 20, 0, 32, 0, 0, 0)}, }, fits: true, wErr: nil, @@ -1901,7 +1932,7 @@ func TestRunGeneralPredicates(t *testing.T) { newResourcePod(schedulercache.Resource{MilliCPU: 5, Memory: 19})), node: &v1.Node{ ObjectMeta: metav1.ObjectMeta{Name: "machine1"}, - Status: v1.NodeStatus{Capacity: makeResources(10, 20, 0, 32, 0, 0).Capacity, Allocatable: makeAllocatableResources(10, 20, 0, 32, 0, 0)}, + Status: v1.NodeStatus{Capacity: makeResources(10, 20, 0, 32, 0, 0, 0).Capacity, Allocatable: makeAllocatableResources(10, 20, 0, 32, 0, 0, 0)}, }, fits: false, wErr: nil, @@ -1915,7 +1946,7 @@ func TestRunGeneralPredicates(t *testing.T) { pod: &v1.Pod{}, nodeInfo: schedulercache.NewNodeInfo( newResourcePod(schedulercache.Resource{MilliCPU: 9, Memory: 19})), - node: &v1.Node{Status: v1.NodeStatus{Capacity: makeResources(10, 20, 1, 32, 0, 0).Capacity, Allocatable: makeAllocatableResources(10, 20, 1, 32, 0, 0)}}, + node: &v1.Node{Status: v1.NodeStatus{Capacity: makeResources(10, 20, 1, 32, 0, 0, 0).Capacity, Allocatable: makeAllocatableResources(10, 20, 1, 32, 0, 0, 0)}}, fits: true, wErr: nil, test: "no resources/port/host requested always fits on GPU machine", @@ -1924,7 +1955,7 @@ func TestRunGeneralPredicates(t *testing.T) { pod: newResourcePod(schedulercache.Resource{MilliCPU: 3, Memory: 1, NvidiaGPU: 1}), nodeInfo: schedulercache.NewNodeInfo( newResourcePod(schedulercache.Resource{MilliCPU: 5, Memory: 10, NvidiaGPU: 1})), - node: &v1.Node{Status: v1.NodeStatus{Capacity: makeResources(10, 20, 1, 32, 0, 0).Capacity, Allocatable: makeAllocatableResources(10, 20, 1, 32, 0, 0)}}, + node: &v1.Node{Status: v1.NodeStatus{Capacity: makeResources(10, 20, 1, 32, 0, 0, 0).Capacity, Allocatable: makeAllocatableResources(10, 20, 1, 32, 0, 0, 0)}}, fits: false, wErr: nil, reasons: []algorithm.PredicateFailureReason{NewInsufficientResourceError(v1.ResourceNvidiaGPU, 1, 1, 1)}, @@ -1934,7 +1965,7 @@ func TestRunGeneralPredicates(t *testing.T) { pod: newResourcePod(schedulercache.Resource{MilliCPU: 3, Memory: 1, NvidiaGPU: 1}), nodeInfo: schedulercache.NewNodeInfo( newResourcePod(schedulercache.Resource{MilliCPU: 5, Memory: 10, NvidiaGPU: 0})), - node: &v1.Node{Status: v1.NodeStatus{Capacity: makeResources(10, 20, 1, 32, 0, 0).Capacity, Allocatable: makeAllocatableResources(10, 20, 1, 32, 0, 0)}}, + node: &v1.Node{Status: v1.NodeStatus{Capacity: makeResources(10, 20, 1, 32, 0, 0, 0).Capacity, Allocatable: makeAllocatableResources(10, 20, 1, 32, 0, 0, 0)}}, fits: true, wErr: nil, test: "enough GPU resource", @@ -1948,7 +1979,7 @@ func TestRunGeneralPredicates(t *testing.T) { nodeInfo: schedulercache.NewNodeInfo(), node: &v1.Node{ ObjectMeta: metav1.ObjectMeta{Name: "machine1"}, - Status: v1.NodeStatus{Capacity: makeResources(10, 20, 0, 32, 0, 0).Capacity, Allocatable: makeAllocatableResources(10, 20, 0, 32, 0, 0)}, + Status: v1.NodeStatus{Capacity: makeResources(10, 20, 0, 32, 0, 0, 0).Capacity, Allocatable: makeAllocatableResources(10, 20, 0, 32, 0, 0, 0)}, }, fits: false, wErr: nil, @@ -1960,7 +1991,7 @@ func TestRunGeneralPredicates(t *testing.T) { nodeInfo: schedulercache.NewNodeInfo(newPodWithPort(123)), node: &v1.Node{ ObjectMeta: metav1.ObjectMeta{Name: "machine1"}, - Status: v1.NodeStatus{Capacity: makeResources(10, 20, 0, 32, 0, 0).Capacity, Allocatable: 
makeAllocatableResources(10, 20, 0, 32, 0, 0)}, + Status: v1.NodeStatus{Capacity: makeResources(10, 20, 0, 32, 0, 0, 0).Capacity, Allocatable: makeAllocatableResources(10, 20, 0, 32, 0, 0, 0)}, }, fits: false, wErr: nil, @@ -3252,7 +3283,7 @@ func TestPodSchedulesOnNodeWithMemoryPressureCondition(t *testing.T) { ImagePullPolicy: "Always", // at least one requirement -> burstable pod Resources: v1.ResourceRequirements{ - Requests: makeAllocatableResources(100, 100, 100, 100, 0, 0), + Requests: makeAllocatableResources(100, 100, 100, 100, 0, 0, 0), }, }, }, diff --git a/plugin/pkg/scheduler/schedulercache/node_info.go b/plugin/pkg/scheduler/schedulercache/node_info.go index dd3f8206b0908..0ec8a94daa974 100644 --- a/plugin/pkg/scheduler/schedulercache/node_info.go +++ b/plugin/pkg/scheduler/schedulercache/node_info.go @@ -72,6 +72,7 @@ type Resource struct { // explicitly as int, to avoid conversions and improve performance. AllowedPodNumber int ExtendedResources map[v1.ResourceName]int64 + HugePages map[v1.ResourceName]int64 } // New creates a Resource from ResourceList @@ -103,6 +104,9 @@ func (r *Resource) Add(rl v1.ResourceList) { if v1helper.IsExtendedResourceName(rName) { r.AddExtended(rName, rQuant.Value()) } + if v1helper.IsHugePageResourceName(rName) { + r.AddHugePages(rName, rQuant.Value()) + } } } } @@ -118,6 +122,9 @@ func (r *Resource) ResourceList() v1.ResourceList { for rName, rQuant := range r.ExtendedResources { result[rName] = *resource.NewQuantity(rQuant, resource.DecimalSI) } + for rName, rQuant := range r.HugePages { + result[rName] = *resource.NewQuantity(rQuant, resource.BinarySI) + } return result } @@ -135,6 +142,12 @@ func (r *Resource) Clone() *Resource { res.ExtendedResources[k] = v } } + if r.HugePages != nil { + res.HugePages = make(map[v1.ResourceName]int64) + for k, v := range r.HugePages { + res.HugePages[k] = v + } + } return res } @@ -150,6 +163,18 @@ func (r *Resource) SetExtended(name v1.ResourceName, quantity int64) { r.ExtendedResources[name] = quantity } +func (r *Resource) AddHugePages(name v1.ResourceName, quantity int64) { + r.SetHugePages(name, r.HugePages[name]+quantity) +} + +func (r *Resource) SetHugePages(name v1.ResourceName, quantity int64) { + // Lazily allocate hugepages resource map. + if r.HugePages == nil { + r.HugePages = map[v1.ResourceName]int64{} + } + r.HugePages[name] = quantity +} + // NewNodeInfo returns a ready to use empty NodeInfo object. // If any pods are given in arguments, their information will be aggregated in // the returned object. 
@@ -307,6 +332,12 @@ func (n *NodeInfo) addPod(pod *v1.Pod) { for rName, rQuant := range res.ExtendedResources { n.requestedResource.ExtendedResources[rName] += rQuant } + if n.requestedResource.HugePages == nil && len(res.HugePages) > 0 { + n.requestedResource.HugePages = map[v1.ResourceName]int64{} + } + for rName, rQuant := range res.HugePages { + n.requestedResource.HugePages[rName] += rQuant + } n.nonzeroRequest.MilliCPU += non0_cpu n.nonzeroRequest.Memory += non0_mem n.pods = append(n.pods, pod) @@ -362,6 +393,12 @@ func (n *NodeInfo) removePod(pod *v1.Pod) error { for rName, rQuant := range res.ExtendedResources { n.requestedResource.ExtendedResources[rName] -= rQuant } + if len(res.HugePages) > 0 && n.requestedResource.HugePages == nil { + n.requestedResource.HugePages = map[v1.ResourceName]int64{} + } + for rName, rQuant := range res.HugePages { + n.requestedResource.HugePages[rName] -= rQuant + } n.nonzeroRequest.MilliCPU -= non0_cpu n.nonzeroRequest.Memory -= non0_mem diff --git a/staging/src/k8s.io/api/core/v1/types.go b/staging/src/k8s.io/api/core/v1/types.go index 4520cc6d5b09b..391736885d33e 100644 --- a/staging/src/k8s.io/api/core/v1/types.go +++ b/staging/src/k8s.io/api/core/v1/types.go @@ -3759,6 +3759,8 @@ const ( ResourceOpaqueIntPrefix = "pod.alpha.kubernetes.io/opaque-int-resource-" // Default namespace prefix. ResourceDefaultNamespacePrefix = "kubernetes.io/" + // Name prefix for huge page resources (alpha). + ResourceHugePagesPrefix = "hugepages-" ) // ResourceList is a set of (resource name, quantity) pairs.
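
The diff above is easier to follow with a few illustrative sketches. First, the resource-naming helpers: a minimal example (not part of the change) assuming the `pkg/api/v1/helper` package introduced above, showing the round trip between a page size and its canonical resource name.

```go
// Illustrative sketch only; assumes the v1 helper package added in this diff.
package main

import (
	"fmt"

	"k8s.io/api/core/v1"
	"k8s.io/apimachinery/pkg/api/resource"
	v1helper "k8s.io/kubernetes/pkg/api/v1/helper"
)

func main() {
	// A 2Mi page size maps to the canonical resource name "hugepages-2Mi".
	pageSize := resource.MustParse("2Mi")
	name := v1helper.HugePageResourceName(pageSize)
	fmt.Println(name) // hugepages-2Mi

	// The page size can be recovered from any valid hugepage resource name.
	if v1helper.IsHugePageResourceName(name) {
		size, err := v1helper.HugePageSizeFromResourceName(name)
		fmt.Println(size.Value(), err) // 2097152 <nil>
	}

	// Non-hugepage names are rejected with an error.
	_, err := v1helper.HugePageSizeFromResourceName(v1.ResourceName("memory"))
	fmt.Println(err != nil) // true
}
```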
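
Second, node capacity discovery: the `CapacityFromMachineInfo` change reports each pre-allocated page size as a schedulable resource when the HugePages gate is enabled. A hedged sketch of that arithmetic, with hypothetical numbers standing in for the cadvisor machine info (cadvisor reports the page size in KiB):

```go
// Illustrative sketch only; 512 pre-allocated 2Mi pages are hypothetical values.
package main

import (
	"fmt"

	"k8s.io/api/core/v1"
	"k8s.io/apimachinery/pkg/api/resource"
	v1helper "k8s.io/kubernetes/pkg/api/v1/helper"
)

func main() {
	pageSizeKiB := uint64(2048) // cadvisor reports huge page size in KiB
	numPages := uint64(512)     // number of pre-allocated pages on the node

	pageSizeBytes := int64(pageSizeKiB * 1024)
	hugePagesBytes := pageSizeBytes * int64(numPages)

	capacity := v1.ResourceList{}
	pageSizeQuantity := resource.NewQuantity(pageSizeBytes, resource.BinarySI)
	capacity[v1helper.HugePageResourceName(*pageSizeQuantity)] = *resource.NewQuantity(hugePagesBytes, resource.BinarySI)

	for name, quantity := range capacity {
		fmt.Printf("%s: %s\n", name, quantity.String()) // hugepages-2Mi: 1Gi
	}
}
```

As the kubelet_node_status.go hunk shows, the same capacity is then subtracted from allocatable memory, so the hugepage reservation is not double-counted as general-purpose memory.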
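
Finally, the cgroup side: `toResources` converts each page size in bytes into the pagesize string libcontainer's hugetlb controller expects, using `units.CustomSize` with a base of 1024. A small sketch of just that conversion:

```go
// Illustrative sketch only; mirrors the conversion in cgroup_manager_linux.go.
package main

import (
	"fmt"

	units "github.com/docker/go-units"
)

// hugePageSizeList matches the suffixes used for the hugetlb canonical unit.
var hugePageSizeList = []string{"B", "kB", "MB", "GB", "TB", "PB"}

func main() {
	pageSizeBytes := int64(2 * 1024 * 1024) // a 2Mi huge page
	sizeString := units.CustomSize("%g%s", float64(pageSizeBytes), 1024.0, hugePageSizeList)
	fmt.Println(sizeString) // 2MB, i.e. the hugetlb.2MB.limit_in_bytes cgroup file
}
```

Page sizes a pod does not request are explicitly limited to 0, while the QoS-level cgroups are kept effectively unbounded (1 << 62 bytes), matching the hunks above.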