test: working Planner scaling test -- 2P1D

hhzhang16 · hhzhang16 · commit 16c865e9d30e · 2025-08-28T08:20:38.000-07:00
Signed-off-by: Hannah Zhang <hannahz@nvidia.com>
diff --git a/tests/planner/README.md b/tests/planner/README.md
@@ -133,7 +133,7 @@ The fourth plot, similar to the third plot, shows the actual decode throughput,
 
 ## Scaling Tests
 
-This directory contains comprehensive tests for validating the SLA planner's scaling behavior. The tests validate both the replica calculation logic and end-to-end scaling behavior.
+This directory contains comprehensive tests for validating the SLA planner's scaling behavior. The tests validate both the replica calculation logic and end-to-end scaling behavior. The end-to-end scaling test drives load in graduated phases of increasing request rate rather than replaying dataset files, because this approach proved more reliable for generating metrics and triggering scaling.
 
 ### Test Types
 
@@ -166,13 +166,22 @@ To save results to `tests/planner/e2e_scaling_results` instead of `/tmp`:
 ./run_scaling_test.sh --save-results
 ```
 
-### Test Scenario
+**E2E Test Deployment Management:**
+- If no deployment exists: creates, tests, and cleans up deployment
+- If deployment exists: uses existing deployment and preserves it
+- Perfect for development workflows where you want to keep deployments running between tests
 
-The main test scenario validates scaling for **H200 with 1P1D configuration**:
-- **Phase 1**: 12 req/s (maintains 1P1D)
-- **Phase 2**: 24 req/s (scales to 2P1D - 2 prefill workers, 1 decode worker)
-- **ISL/OSL**: 3000/150 tokens
-- **Capacity**: ~15 req/s/gpu based on interpolator analysis
+**Test Scenario**
+
+The main test scenario validates prefill scaling on H200, from a 1P1D configuration to 2P1D (2 prefill workers, 1 decode worker):
+
+- **Phase 1**: 8 req/s for 90s (baseline - maintains 1P1D)
+- **Phase 2**: 15 req/s for 120s (moderate load - maintains 1P1D)
+- **Phase 3**: 25 req/s for 180s (scaling trigger - scales to 2P1D)
+- **ISL/OSL**: 4000/150 tokens (optimized for prefill bottleneck)
+- **Transition delay**: 30s between phases
+- **Total test duration**: ~7.5 minutes (390s of load plus two 30s transitions), plus additional time to observe scaling
+- **Smart cleanup**: Only removes deployment if test created it (preserves existing deployments)
 
 ### Prerequisites for E2E Tests
 
diff --git a/tests/planner/run_scaling_test.sh b/tests/planner/run_scaling_test.sh
@@ -100,7 +100,7 @@ check_existing_deployment() {
         # Check if the DynamoGraphDeployment is ready
         local status=$(kubectl get dynamographdeployment "$DEPLOYMENT_NAME" -n "$NAMESPACE" -o jsonpath='{.status.state}')
         if [ "$status" = "successful" ]; then
-            # Check if frontend pod is running (main indicator)
+            # Check if frontend pod is running
             if kubectl get pods -n "$NAMESPACE" -l "nvidia.com/dynamo-component-type=frontend,nvidia.com/dynamo-namespace=vllm-disagg-planner" --field-selector=status.phase=Running | grep -q .; then
                 log_success "Existing deployment is ready"
                 return 0
@@ -135,7 +135,6 @@ deploy_planner() {
         exit 1
     fi
 
-    # Wait for DynamoGraphDeployment to be processed
     log_info "Waiting for DynamoGraphDeployment to be processed..."
     if kubectl wait --for=condition=Ready dynamographdeployment/"$DEPLOYMENT_NAME" -n "$NAMESPACE" --timeout=600s; then
         log_success "DynamoGraphDeployment is ready"
@@ -144,10 +143,8 @@ deploy_planner() {
         exit 1
     fi
 
-    # Wait for pods to be running (this may take a while for image pulls)
     log_info "Waiting for pods to be running (this may take several minutes for image pulls)..."
 
-    # Wait for frontend pod (main component we need for testing)
     log_info "Waiting for frontend pod..."
     if kubectl wait --for=condition=Ready pod -l "nvidia.com/dynamo-component-type=frontend,nvidia.com/dynamo-namespace=vllm-disagg-planner" -n "$NAMESPACE" --timeout=900s; then
         log_success "Frontend pod is ready"
@@ -156,12 +153,10 @@ deploy_planner() {
         exit 1
     fi
 
-    # Wait a bit more for all pods to be fully running
     log_info "Waiting for all pods to be running..."
     sleep 30
 }
 
-# Setup port forwarding
 setup_port_forward() {
     log_info "Setting up port forwarding..."
 
@@ -172,10 +167,8 @@ setup_port_forward() {
         sleep 2
     fi
 
-    # Start port forwarding to frontend service directly
     local frontend_service="vllm-disagg-planner-frontend"
 
-    # Check if the frontend service exists
     if ! kubectl get service "$frontend_service" -n "$NAMESPACE" &> /dev/null; then
         log_error "Frontend service '$frontend_service' not found"
         return 1
@@ -185,7 +178,6 @@ setup_port_forward() {
     kubectl port-forward service/"$frontend_service" "$LOCAL_PORT:$FRONTEND_PORT" -n "$NAMESPACE" >/dev/null 2>&1 &
     PORT_FORWARD_PID=$!
 
-    # Wait for port forwarding to be established
     log_info "Waiting for port forwarding to be established..."
     for i in {1..30}; do
         if curl -s http://localhost:$LOCAL_PORT/health &> /dev/null; then
@@ -199,7 +191,6 @@ setup_port_forward() {
     return 1
 }
 
-# Clean up port forwarding
 cleanup_port_forward() {
     if [ ! -z "$PORT_FORWARD_PID" ]; then
         log_info "Cleaning up port forwarding..."
@@ -208,21 +199,18 @@ cleanup_port_forward() {
     fi
 }
 
-# Clean up deployment
 cleanup_deployment() {
     log_info "Cleaning up deployment..."
     kubectl delete -f "$YAML_FILE" -n "$NAMESPACE" --ignore-not-found
 
-    # Wait for cleanup to complete
     log_info "Waiting for cleanup to complete..."
     kubectl wait --for=delete dynamographdeployment/"$DEPLOYMENT_NAME" -n "$NAMESPACE" --timeout=120s || true
 
     log_info "Cleanup complete"
 }
 
-# Run the scaling test
 run_test() {
-    log_info "Running scaling test (12 req/s -> 24 req/s)..."
+    log_info "Running scaling test (graduated 8->15->25 req/s)..."
 
     local python_cmd="python3"
     if ! command -v python3 &> /dev/null; then
@@ -244,9 +232,7 @@ run_test() {
     fi
 }
 
-# Main function
 main() {
-    # Parse arguments
     while [[ $# -gt 0 ]]; do
         case $1 in
             --namespace)
@@ -260,7 +246,7 @@ main() {
             --help)
                 echo "Usage: $0 [--namespace NS] [--save-results]"
                 echo ""
-                echo "Run SLA planner scaling test (hardcoded 12 req/s -> 24 req/s scenario)"
+                echo "Run SLA planner scaling test (graduated 8->15->25 req/s prefill scaling)"
                 echo ""
                 echo "Options:"
                 echo "  --namespace NS    Kubernetes namespace (default: default)"
@@ -278,33 +264,33 @@ main() {
 
     log_info "SLA Planner Scaling Test"
     log_info "Namespace: $NAMESPACE"
-    log_info "Scenario: 12 req/s -> 24 req/s (1P1D -> 2P1D)"
+    log_info "Scenario: Graduated 8->15->25 req/s (1P1D -> 2P1D prefill scaling, ISL=4000/OSL=150)"
 
-    # Check prerequisites
     check_prerequisites
 
-    # Setup trap for cleanup
     trap cleanup_port_forward EXIT
 
     # Check if we need to deploy
+    local deployed_by_us=false
     if ! check_existing_deployment; then
         deploy_planner
+        deployed_by_us=true
     fi
 
-    # Setup port forwarding
     if ! setup_port_forward; then
         log_error "Failed to setup port forwarding"
         exit 1
     fi
 
-    # Run the test
     local test_result=0
     if ! run_test; then
         test_result=1
     fi
 
-    # Always cleanup deployment
-    # cleanup_deployment
+    # Only cleanup deployment if we deployed it
+    if [ "$deployed_by_us" = true ]; then
+        cleanup_deployment
+    fi
 
     if [ $test_result -eq 0 ]; then
         log_success "Test completed successfully!"
@@ -315,5 +301,4 @@ main() {
     exit $test_result
 }
 
-# Run main function with all arguments
 main "$@"
diff --git a/tests/planner/utils/load_generator.py b/tests/planner/utils/load_generator.py