Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat(test/quick): Add test to detect io_uring hangs with npm install #589

Open
wants to merge 3 commits into
base: main
Choose a base branch
from
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
136 changes: 136 additions & 0 deletions test/cases/quick/io_uring_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,136 @@
//go:build e2e

package quick

import (
"context"
"log"
"testing"
"time"

"github.com/aws/aws-k8s-tester/internal/e2e"

corev1 "k8s.io/api/core/v1"
"k8s.io/apimachinery/pkg/api/resource"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"

"sigs.k8s.io/e2e-framework/klient/k8s"
"sigs.k8s.io/e2e-framework/klient/wait"
"sigs.k8s.io/e2e-framework/pkg/envconf"
"sigs.k8s.io/e2e-framework/pkg/features"
)

func TestNpmInstallWithCPULimits(t *testing.T) {
feat := features.New("npm-install-cpu-limits").
WithLabel("suite", "quick").
Setup(func(ctx context.Context, t *testing.T, cfg *envconf.Config) context.Context {
log.Println("[Setup] Verifying ARM64 nodes...")
var nodeList corev1.NodeList
if err := cfg.Client().Resources().List(ctx, &nodeList); err != nil {
t.Fatalf("Failed to list nodes: %v", err)
}

hasArm64Node := false
for _, node := range nodeList.Items {
arch := node.Labels["kubernetes.io/arch"]
t.Logf("Node: %s, Architecture: %s", node.Name, arch)
if arch == "arm64" {
hasArm64Node = true
break
}
}

if !hasArm64Node {
t.Fatal("No ARM64 nodes found in cluster")
}
return ctx
}).
Assess("Pod can successfully run npm install with CPU limits", func(ctx context.Context, t *testing.T, cfg *envconf.Config) context.Context {
podName := "npm-install-test"
podNS := "default"

pod := &corev1.Pod{
ObjectMeta: metav1.ObjectMeta{
Name: podName,
Namespace: podNS,
Labels: map[string]string{
"app": "npm-install-test",
},
},
Spec: corev1.PodSpec{
NodeSelector: map[string]string{
"kubernetes.io/arch": "arm64",
},
Containers: []corev1.Container{
{
Name: "test-container",
Image: "ubuntu:noble-20241015",
Command: []string{"/bin/sh", "-c"},
Args: []string{`
set -x
echo "[Test] Starting npm installation test..."
mkdir asd &&
cd asd &&
apt-get update &&
apt-get install -y npm nodejs &&
echo "[Test] Starting npm install express..."
npm install express --loglevel verbose || exit 1
echo "[Test] npm install completed successfully"
`},
Resources: corev1.ResourceRequirements{
Limits: corev1.ResourceList{
corev1.ResourceCPU: resource.MustParse("500m"),
corev1.ResourceMemory: resource.MustParse("1Gi"),
},
Requests: corev1.ResourceList{
corev1.ResourceCPU: resource.MustParse("500m"),
corev1.ResourceMemory: resource.MustParse("1Gi"),
},
},
},
},
RestartPolicy: corev1.RestartPolicyNever,
},
}

if err := cfg.Client().Resources().Create(ctx, pod); err != nil {
t.Fatalf("[Assess] Failed to create pod: %v", err)
}

log.Printf("[Assess] Waiting up to 5 minutes for pod %s to complete...", podName)
err := wait.For(
e2e.NewConditionExtension(cfg.Client().Resources()).ResourceMatch(pod, func(object k8s.Object) bool {
pod := object.(*corev1.Pod)
return pod.Status.Phase == corev1.PodSucceeded
}),
wait.WithTimeout(5*time.Minute),
)
if err != nil {
t.Logf("[Assess] Pod did not complete successfully: %v", err)
e2e.PrintDaemonSetPodLogs(t, ctx, cfg.Client().RESTConfig(), podNS, "app=npm-install-test")
t.Fatal("Pod did not complete within 5 minutes - possible io_uring hang detected")
}

log.Printf("[Assess] Pod %s completed successfully", podName)
return ctx
}).
Teardown(func(ctx context.Context, t *testing.T, cfg *envconf.Config) context.Context {
podName := "npm-install-test"
podNS := "default"

t.Logf("[Teardown] Cleaning up pod %s/%s...", podNS, podName)
pod := &corev1.Pod{
ObjectMeta: metav1.ObjectMeta{
Name: podName,
Namespace: podNS,
},
}
if err := cfg.Client().Resources().Delete(ctx, pod); err != nil {
t.Logf("[Teardown] Failed to delete pod: %v", err)
}
return ctx
}).
Feature()

testenv.Test(t, feat)
}