Change to 2x2x2 topology and remove idle node behavior
Signed-off-by: Ryan O'Leary <ryanaoleary@google.com>
ryanaoleary committed Aug 9, 2024
1 parent 0bab892 commit 0c6bb58
Showing 1 changed file with 25 additions and 28 deletions.
ray-operator/test/e2eautoscaler/raycluster_autoscaler_test.go (53 changes: 25 additions & 28 deletions)
@@ -160,7 +160,7 @@ func TestRayClusterAutoscalerWithFakeSingleHostTPU(t *testing.T) {
WithMaxReplicas(3).
WithNumOfHosts(1).
WithGroupName("tpu-group").
WithRayStartParams(map[string]string{"num-cpus": "1", "resources": `"{\"TPU\": 4}"`}).
WithRayStartParams(map[string]string{"resources": `"{\"TPU\": 4}"`}).
WithTemplate(workerPodTemplateApplyConfiguration()))
rayClusterAC := rayv1ac.RayCluster("ray-cluster", namespace.Name).
WithSpec(apply(rayClusterSpecAC, mountConfigMap[rayv1ac.RayClusterSpecApplyConfiguration](scripts, "/home/ray/test_scripts")))
@@ -179,7 +179,7 @@ func TestRayClusterAutoscalerWithFakeSingleHostTPU(t *testing.T) {
test.T().Logf("Found head pod %s/%s", headPod.Namespace, headPod.Name)

// Create a detached tpu actor, and 1 worker in the multi-host "tpu-group" should be created.
- ExecPodCmd(test, headPod, common.RayHeadContainer, []string{"python", "/home/ray/test_scripts/create_detached_actor.py", "tpu_actor", "--custom-resource-name=\"TPU\"", "--num-custom-resources=4"})
+ ExecPodCmd(test, headPod, common.RayHeadContainer, []string{"python", "/home/ray/test_scripts/create_detached_actor.py", "tpu_actor", "--custom-resource-name=TPU", "--num-custom-resources=4"})
test.Eventually(RayCluster(test, rayCluster.Namespace, rayCluster.Name), TestTimeoutMedium).
Should(WithTransform(RayClusterDesiredWorkerReplicas, Equal(int32(1))))

@@ -206,21 +206,29 @@ func TestRayClusterAutoscalerWithFakeMultiHostTPU(t *testing.T) {
test.Expect(err).NotTo(HaveOccurred())
test.T().Logf("Created ConfigMap %s/%s successfully", scripts.Namespace, scripts.Name)

+ // Set 'replicaIndex' label that would be set by the GKE Ray TPU webhook. This is used to scale
+ // down entire multi-host replicas atomically.
+ replicaIndexLabel := map[string]string{
+ "replicaIndex": "tpu-group-0",
+ }
+ podTemplate := workerPodTemplateApplyConfiguration().WithLabels(replicaIndexLabel)
+ minRayVersion := "2.32.0" // Multi-host autoscaling support starts in this version.
+
test.T().Run("Create a RayCluster with autoscaling enabled", func(_ *testing.T) {
rayClusterSpecAC := rayv1ac.RayClusterSpec().
WithEnableInTreeAutoscaling(true).
- WithRayVersion(GetRayVersion()).
+ WithRayVersion(minRayVersion).
WithHeadGroupSpec(rayv1ac.HeadGroupSpec().
WithRayStartParams(map[string]string{"num-cpus": "0"}).
WithTemplate(headPodTemplateApplyConfiguration())).
WithWorkerGroupSpecs(rayv1ac.WorkerGroupSpec().
WithReplicas(0).
WithMinReplicas(0).
WithMaxReplicas(3).
- WithNumOfHosts(4).
+ WithNumOfHosts(2).
WithGroupName("tpu-group").
- WithRayStartParams(map[string]string{"num-cpus": "1", "resources": `"{\"TPU\": 4}"`}).
- WithTemplate(workerPodTemplateApplyConfiguration()))
+ WithRayStartParams(map[string]string{"resources": `"{\"TPU\": 4}"`}).
+ WithTemplate(podTemplate))
rayClusterAC := rayv1ac.RayCluster("ray-cluster", namespace.Name).
WithSpec(apply(rayClusterSpecAC, mountConfigMap[rayv1ac.RayClusterSpecApplyConfiguration](scripts, "/home/ray/test_scripts")))

@@ -237,36 +245,26 @@ func TestRayClusterAutoscalerWithFakeMultiHostTPU(t *testing.T) {
headPod := GetHeadPod(test, rayCluster)
test.T().Logf("Found head pod %s/%s", headPod.Namespace, headPod.Name)

- // Create a detached tpu actor, and 4 workers in the multi-host "tpu-group" should be created.
- ExecPodCmd(test, headPod, common.RayHeadContainer, []string{"python", "/home/ray/test_scripts/create_detached_actor.py", "tpu_actor_1", "--custom-resource-name=\"TPU\"", "--num-custom-resources=4"})
+ // Create a detached TPU actor, and 1 multi-host replica with 2 TPU workers should be created.
+ ExecPodCmd(test, headPod, common.RayHeadContainer, []string{"python", "/home/ray/test_scripts/create_detached_actor.py", "tpu_actor_1", "--custom-resource-name=TPU", "--num-custom-resources=4"})
test.Eventually(RayCluster(test, rayCluster.Namespace, rayCluster.Name), TestTimeoutMedium).
Should(WithTransform(RayClusterDesiredWorkerReplicas, Equal(int32(1))))

- // Each TPU multi-host replica should have 4 workers, so we check for 4 pods in 'tpu-group'.
- test.Expect(GetGroupPods(test, rayCluster, "tpu-group")).To(HaveLen(4))
-
- // Each TPU multi-host worker should have a task or actor scheduled on it, therefore we create 3 more detached actors
- // to run on each node in the multi-host TPU worker group.
- ExecPodCmd(test, headPod, common.RayHeadContainer, []string{"python", "/home/ray/test_scripts/create_detached_actor.py", "tpu_actor_2", "--custom-resource-name=\"TPU\"", "--num-custom-resources=4"})
- ExecPodCmd(test, headPod, common.RayHeadContainer, []string{"python", "/home/ray/test_scripts/create_detached_actor.py", "tpu_actor_3", "--custom-resource-name=\"TPU\"", "--num-custom-resources=4"})
- ExecPodCmd(test, headPod, common.RayHeadContainer, []string{"python", "/home/ray/test_scripts/create_detached_actor.py", "tpu_actor_4", "--custom-resource-name=\"TPU\"", "--num-custom-resources=4"})
-
- // Each new TPU detached actor should get scheduled to an existing scaled-up worker, so we check that there are still 4 pods in 'tpu-group'.
- test.Expect(GetGroupPods(test, rayCluster, "tpu-group")).To(HaveLen(4))
+ // Each TPU multi-host replica should have NumOfHosts workers, so we check for 2 pods in 'tpu-group'.
+ test.Expect(GetGroupPods(test, rayCluster, "tpu-group")).To(HaveLen(2))

- // Terminating one TPU detached actor will result in the Ray node becoming idle, causing Ray to scale down the entire multi-host
- // worker group. A new multi-host worker group will then be scaled back up since the remaining detached actors are running.
- ExecPodCmd(test, headPod, common.RayHeadContainer, []string{"python", "/home/ray/test_scripts/terminate_detached_actor.py", "tpu_actor_1"})
+ // Each TPU multi-host worker should have a task or actor scheduled on it, therefore we create another detached actor
+ // to run on the second node in the multi-host TPU worker group.
+ ExecPodCmd(test, headPod, common.RayHeadContainer, []string{"python", "/home/ray/test_scripts/create_detached_actor.py", "tpu_actor_2", "--custom-resource-name=TPU", "--num-custom-resources=4"})
test.Eventually(RayCluster(test, rayCluster.Namespace, rayCluster.Name), TestTimeoutMedium).
- Should(WithTransform(RayClusterDesiredWorkerReplicas, Equal(int32(0))))
+ Should(WithTransform(RayClusterDesiredWorkerReplicas, Equal(int32(1))))
+ test.Expect(GetGroupPods(test, rayCluster, "tpu-group")).To(HaveLen(2))

- // Terminate the remaining 3 TPU detached actors, and the worker group should be deleted.
+ // Terminate the TPU detached actors, and the multi-host replica should be scaled down.
ExecPodCmd(test, headPod, common.RayHeadContainer, []string{"python", "/home/ray/test_scripts/terminate_detached_actor.py", "tpu_actor_1"})
ExecPodCmd(test, headPod, common.RayHeadContainer, []string{"python", "/home/ray/test_scripts/terminate_detached_actor.py", "tpu_actor_2"})
- ExecPodCmd(test, headPod, common.RayHeadContainer, []string{"python", "/home/ray/test_scripts/terminate_detached_actor.py", "tpu_actor_3"})
- ExecPodCmd(test, headPod, common.RayHeadContainer, []string{"python", "/home/ray/test_scripts/terminate_detached_actor.py", "tpu_actor_4"})
test.Eventually(RayCluster(test, rayCluster.Namespace, rayCluster.Name), TestTimeoutMedium).
Should(WithTransform(RayClusterDesiredWorkerReplicas, Equal(int32(0))))
test.Expect(GetGroupPods(test, rayCluster, "tpu-group")).To(HaveLen(0))
})
}

@@ -285,7 +283,6 @@ func TestRayClusterAutoscalerWithCustomResource(t *testing.T) {

test.T().Run("Create a RayCluster with autoscaling enabled", func(_ *testing.T) {
groupName := "custom-resource-group"
-
rayClusterSpecAC := rayv1ac.RayClusterSpec().
WithEnableInTreeAutoscaling(true).
WithRayVersion(GetRayVersion()).
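
The detached actor helpers invoked above (create_detached_actor.py and terminate_detached_actor.py) are mounted into the head pod from the test ConfigMap and are not part of this diff. Below is a minimal sketch of what the create script plausibly does, assuming only the flag names visible in the ExecPodCmd calls; the script actually shipped with the tests may differ. The idea is that a detached actor reserves the fake "TPU" custom resource, which forces the autoscaler to scale up a worker (or an entire multi-host replica when numOfHosts > 1) that advertises that resource.

# create_detached_actor.py (hypothetical sketch, not the script shipped in the test ConfigMap)
import argparse

import ray

@ray.remote
class Actor:
    pass

parser = argparse.ArgumentParser()
parser.add_argument("name")
parser.add_argument("--custom-resource-name", default="TPU")
parser.add_argument("--num-custom-resources", type=int, default=4)
args = parser.parse_args()

ray.init()
# Reserving the custom resource keeps the scaled-up worker non-idle; the worker
# advertises the resource through rayStartParams, e.g. resources='{"TPU": 4}'.
Actor.options(
    name=args.name,
    lifetime="detached",
    resources={args.custom_resource_name: args.num_custom_resources},
).remote()

The terminate helper presumably just looks up the named actor and kills it, e.g. ray.kill(ray.get_actor(name)), releasing the resource so the autoscaler can scale the multi-host replica back down once no detached actors remain on it.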
