Merge pull request #57 from chatwork/karpenter
Add karpenter test
mumoshu authored Jul 2, 2024
2 parents 7d37666 + 2a3a2e6 commit b5a5b3c
Showing 3 changed files with 250 additions and 1 deletion.
164 changes: 164 additions & 0 deletions cmd/cluster-autoscaler/karpenter_test.go
@@ -0,0 +1,164 @@
package clusterautoscaler

import (
	"context"
	"os"
	"path/filepath"
	"testing"
	"text/template"
	"time"

	"github.com/chatwork/kibertas/cmd"
	"github.com/chatwork/kibertas/util/notify"
	"github.com/stretchr/testify/require"
	"k8s.io/apimachinery/pkg/util/wait"

	"github.com/mumoshu/testkit"
	"github.com/sirupsen/logrus"
)

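// TestKarpenterScaleUpFromNonZero provisions an EKS cluster with Terraform via
// testkit, installs Karpenter through Helm, applies the NodePool and
// EC2NodeClass from testdata/karpenter.yaml, and then verifies that the
// cluster-autoscaler check scales the cluster up by one node, that the extra
// node is removed again, and that a second scale-up also succeeds.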
func TestKarpenterScaleUpFromNonZero(t *testing.T) {
	if testing.Short() {
		t.Skip("Skipping test in short mode.")
	}

	vpcID := os.Getenv("VPC_ID")
	if vpcID == "" {
		t.Skip("VPC_ID is not set")
	}

	amdAMIID := os.Getenv("AMD_AMI_ID")
	if amdAMIID == "" {
		t.Skip("AMD_AMI_ID is not set")
	}

	appName := "sample-for-scale"

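	// Provision the test infrastructure. The Terraform workspace under
	// testdata/terraform creates the EKS cluster, the spot node group, the node
	// IAM role, and the Karpenter interruption queue; RetainResourcesOnFailure
	// keeps the resources around when the test fails.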
	h := testkit.New(t,
		testkit.Providers(
			&testkit.TerraformProvider{
				WorkspacePath: "testdata/terraform",
				Vars: map[string]string{
					"autoscaler_name":               "karpenter",
					"prefix":                        "kibertas-ca",
					"region":                        "ap-northeast-1",
					"vpc_id":                        vpcID,
					"capacity_type":                 "SPOT",
					"node_template_app_label_value": appName,
				},
			},
			&testkit.KubectlProvider{},
		),
		testkit.RetainResourcesOnFailure(),
	)

	kc := h.KubernetesCluster(t)

	k := testkit.NewKubernetes(kc.KubeconfigPath)
	testkit.PollUntil(t, func() bool {
		return len(k.ListReadyNodeNames(t)) > 1
	}, 20*time.Second)

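	// Install Karpenter from the public OCI chart, pointing it at the cluster
	// and interruption queue created by Terraform, with fixed controller
	// resources and a control-plane toleration.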
	helm := testkit.NewHelm(kc.KubeconfigPath)

	clusterautoscalerNs := "default"
	helm.UpgradeOrInstall(t, "karpenter", "oci://public.ecr.aws/karpenter/karpenter", func(hc *testkit.HelmConfig) {
		hc.Values = map[string]interface{}{
			"settings": map[string]interface{}{
				// The cluster name is kibertas-ca-cluster because we pass prefix=kibertas-ca
				// to the testkit constructor above and the terraform main.tf names the
				// cluster prefix + "-cluster".
				"clusterName":       "kibertas-ca-cluster",
				"interruptionQueue": "kibertas-ca-cluster",
			},
			"controller": map[string]interface{}{
				"resources": map[string]interface{}{
					"requests": map[string]interface{}{
						"cpu":    "1",
						"memory": "1Gi",
					},
					"limits": map[string]interface{}{
						"cpu":    "1",
						"memory": "1Gi",
					},
				},
			},
			// Without this toleration, Karpenter could be scheduled onto the spot
			// nodes it manages, which would prevent the scale-down.
			"tolerations": []map[string]interface{}{
				{
					"key":    "node-role.kubernetes.io/control-plane",
					"effect": "NoSchedule",
				},
			},
		}

		hc.Namespace = clusterautoscalerNs
	})

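	// Render testdata/karpenter.yaml (a NodePool plus an EC2NodeClass) with the
	// cluster name, AMI ID, and node role used by the Terraform workspace, then
	// apply it; the manifest is deleted again when the test succeeds.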
	kubectl := testkit.NewKubectl(kc.KubeconfigPath)

	tmpKarpenterYamlPath := filepath.Join(t.TempDir(), "karpenter.yaml")
	tmpl, err := template.New("karpenter.yaml").ParseFiles("testdata/karpenter.yaml")
	require.NoError(t, err)
	{
		f, err := os.Create(tmpKarpenterYamlPath)
		require.NoError(t, err)
		defer f.Close()

		require.NoError(t, tmpl.ExecuteTemplate(f, "karpenter.yaml", map[string]string{
			"ClusterName": "kibertas-ca-cluster",
			"AmdAmiId":    amdAMIID,
			"RoleName":    "kibertas-ca-node",
		}))
	}

	t.Cleanup(func() {
		if !t.Failed() {
			kubectl.Capture(t, "delete", "-f", tmpKarpenterYamlPath)
		}
	})
	kubectl.Capture(t, "apply", "-f", tmpKarpenterYamlPath)

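	// These environment variables configure the cluster-autoscaler checker
	// constructed below: RESOURCE_NAME names the sample workload it scales, and
	// NODE_LABEL_KEY/NODE_LABEL_VALUE select the spot-capacity nodes that the
	// Karpenter NodePool provisions.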
	os.Setenv("RESOURCE_NAME", appName)
	os.Setenv("KUBECONFIG", kc.KubeconfigPath)
	// os.Setenv("NODE_LABEL_VALUE", "ON_DEMAND")
	os.Setenv("NODE_LABEL_KEY", "karpenter.sh/capacity-type")
	os.Setenv("NODE_LABEL_VALUE", "spot")

	logger := func() *logrus.Entry {
		return logrus.NewEntry(logrus.New())
	}
	chatwork := &notify.Chatwork{
		Logger: logger,
	}
	checker := cmd.NewChecker(context.Background(), false, logger, chatwork, "test", 7*time.Minute)
	clusterautoscaler, err := NewClusterAutoscaler(checker)
	if err != nil {
		t.Fatalf("NewClusterAutoscaler: %s", err)
	}

	if clusterautoscaler == nil {
		t.Fatal("Expected clusterautoscaler instance, got nil")
	}

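	// Record the number of ready nodes before the first check so the scale-up
	// and scale-down assertions below can compare against it.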
	initialNodes := len(k.ListReadyNodeNames(t))

	// Scale up by 1
	require.NoError(t, clusterautoscaler.Check())
	require.NoError(t, wait.PollUntilContextTimeout(context.Background(), 5*time.Second, 5*time.Minute, false, func(ctx context.Context) (bool, error) {
		nodes := k.ListReadyNodeNames(t)
		return len(nodes) == initialNodes+1, nil
	}))

	// Scale the Karpenter nodes back to zero (i.e. the original number of nodes)
	require.NoError(t, wait.PollUntilContextTimeout(context.Background(), 5*time.Second, 10*time.Minute, false, func(ctx context.Context) (bool, error) {
		nodes := k.ListReadyNodeNames(t)
		return len(nodes) == initialNodes, nil
	}))

	// Scale up by 1 (again)
	require.NoError(t, clusterautoscaler.Check())
	require.NoError(t, wait.PollUntilContextTimeout(context.Background(), 5*time.Second, 8*time.Minute, false, func(ctx context.Context) (bool, error) {
		nodes := k.ListReadyNodeNames(t)
		return len(nodes) == initialNodes+1, nil
	}))
}
54 changes: 54 additions & 0 deletions cmd/cluster-autoscaler/testdata/karpenter.yaml
@@ -0,0 +1,54 @@
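# The NodePool below limits Karpenter to small spot instances in the c/m/r
# instance categories; the EC2NodeClass discovers subnets and security groups
# via the karpenter.sh/discovery tag and pins the AMI passed in by the test.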
apiVersion: karpenter.sh/v1beta1
kind: NodePool
metadata:
  name: default
spec:
  template:
    # metadata:
    #   labels:
    #     karpenter.sh/capacity-type: spot
    spec:
      requirements:
        # - key: kubernetes.io/arch
        #   operator: In
        #   values: ["amd64"]
        # - key: kubernetes.io/os
        #   operator: In
        #   values: ["linux"]
        - key: karpenter.sh/capacity-type
          operator: In
          values: ["spot"]
        - key: karpenter.k8s.aws/instance-category
          operator: In
          values: ["c", "m", "r"]
        - key: karpenter.k8s.aws/instance-size
          operator: In
          values: ["nano", "micro", "small", "medium"]
        # - key: karpenter.k8s.aws/instance-family
        #   operator: In
        #   values: ["t4"]
      nodeClassRef:
        apiVersion: karpenter.k8s.aws/v1beta1
        kind: EC2NodeClass
        name: default
  limits:
    cpu: 1000
  disruption:
    consolidationPolicy: WhenUnderutilized
    expireAfter: 720h # 30 * 24h = 720h
---
apiVersion: karpenter.k8s.aws/v1beta1
kind: EC2NodeClass
metadata:
  name: default
spec:
  amiFamily: AL2 # Amazon Linux 2
  role: "{{ .RoleName }}" # node IAM role name, rendered by the test
  subnetSelectorTerms:
    - tags:
        karpenter.sh/discovery: "{{ .ClusterName }}" # the cluster name, rendered by the test
  securityGroupSelectorTerms:
    - tags:
        karpenter.sh/discovery: "{{ .ClusterName }}" # the cluster name, rendered by the test
  amiSelectorTerms:
    - id: "{{ .AmdAmiId }}"
33 changes: 32 additions & 1 deletion cmd/cluster-autoscaler/testdata/terraform/main.tf
@@ -9,6 +9,7 @@
// terraform plan -var vpc_id=$VPC_ID -var region=ap-northeast-1 -var prefix=kibertas-ca -var capacity_type=SPOT -var node_template_app_label_value=sample-for-scale
// terraform apply -var vpc_id=$VPC_ID -var region=ap-northeast-1 -var prefix=kibertas-ca -var capacity_type=SPOT -var node_template_app_label_value=sample-for-scale


terraform {
  required_providers {
    aws = {
@@ -99,6 +100,11 @@ resource "aws_eks_node_group" "spot" {
    max_size = 3
    min_size = 0
  }
  // This is translated by EKS into the eks.amazonaws.com/capacityType=[SPOT|ON_DEMAND]
  // node label.
  // The label must match the karpenter.sh/capacity-type label in the NodePool.
  // Otherwise karpenter will complain with a message like:
  // {"level":"ERROR","time":"*snip*","logger":"controller","message":"could not schedule pod","commit":"490ef94","controller":"provisioner","Pod":{"name":"sample-for-scale-68fcbd98cc-gs7g8","namespace":"cluster-autoscaler-test-20240619-pwpm2"},"error":"incompatible with nodepool \"default\", daemonset overhead={\"cpu\":\"150m\",\"pods\":\"2\"}, incompatible requirements, label \"eks.amazonaws.com/capacityType\" does not have known values"}
  capacity_type  = var.capacity_type
  instance_types = ["t3.large"]
  labels = {
@@ -108,6 +114,7 @@
    "k8s.io/cluster-autoscaler/node-template/label/app" : var.node_template_app_label_value,
    "k8s.io/cluster-autoscaler/enabled" = "true"
    "k8s.io/cluster-autoscaler/${aws_eks_cluster.cluster.name}" = "owned"
    "eks.amazonaws.com/capacityType" = "SPOT"
  }
}

@@ -217,7 +224,7 @@ locals {
      {
        "Effect": "Allow",
        "Action": "iam:PassRole",
        "Resource": "${aws_iam_role.node.arn}",
        "Sid": "PassNodeIAMRole"
      },
      {
@@ -287,6 +294,16 @@ locals {
        "Effect": "Allow",
        "Resource": "*",
        "Action": "iam:GetInstanceProfile"
      },
      {
        "Sid": "AllowInterruptionQueueActions",
        "Effect": "Allow",
        "Resource": "${aws_sqs_queue.karpenter_interruption_queue.arn}",
        "Action": [
          "sqs:DeleteMessage",
          "sqs:GetQueueUrl",
          "sqs:ReceiveMessage"
        ]
      }
    ],
    "Version": "2012-10-17"
@@ -355,6 +372,12 @@ resource "aws_security_group" "cluster" {
    protocol    = "-1"
    cidr_blocks = ["0.0.0.0/0"]
  }
  tags = {
    // Must match EC2NodeClass securityGroupSelectorTerms.tags
    // Otherwise karpenter complains with a message like:
    // {"level":"ERROR","time":"*snip*","logger":"controller","message":"failed listing instance types for default","commit":"490ef94","controller":"disruption","error":"no subnets found"}
    "karpenter.sh/discovery" = local.cluster_name
  }
}

data "aws_availability_zones" "available" {
@@ -367,4 +390,12 @@ resource "aws_subnet" "public" {
  cidr_block = "${cidrsubnet(data.aws_vpc.vpc.cidr_block, 4, 10+count.index)}"
  availability_zone = data.aws_availability_zones.available.names[count.index%length(data.aws_availability_zones.available.names)]
  map_public_ip_on_launch = true
  tags = {
    // Must match EC2NodeClass subnetSelectorTerms.tags
    "karpenter.sh/discovery" = local.cluster_name
  }
}

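// Karpenter's interruption queue. Its name equals the cluster name
// ("kibertas-ca-cluster" in this test), which must match
// settings.interruptionQueue in the Helm values set by karpenter_test.go.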
resource "aws_sqs_queue" "karpenter_interruption_queue" {
  name = "${local.cluster_name}"
}
