Skip to content

Commit 16c865e

Browse files
committed
test: working Planner scaling test -- 2P1D
Signed-off-by: Hannah Zhang <hannahz@nvidia.com>
1 parent e294e5a commit 16c865e

File tree

3 files changed

+143
-135
lines changed

3 files changed

+143
-135
lines changed

tests/planner/README.md

Lines changed: 16 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -133,7 +133,7 @@ The fourth plot, similar to the third plot, shows the actual decode throughput,
133133
134134
## Scaling Tests
135135
136-
This directory contains comprehensive tests for validating the SLA planner's scaling behavior. The tests validate both the replica calculation logic and end-to-end scaling behavior.
136+
This directory contains comprehensive tests for validating the SLA planner's scaling behavior. The tests validate both the replica calculation logic and end-to-end scaling behavior. The scaling test uses a graduated load approach rather than dataset files, as it proved more reliable for metric generation and scaling triggers.
137137
138138
### Test Types
139139
@@ -166,13 +166,22 @@ To save results to `tests/planner/e2e_scaling_results` instead of `/tmp`:
166166
./run_scaling_test.sh --save-results
167167
```
168168
169-
### Test Scenario
169+
**E2E Test Deployment Management:**
170+
- If no deployment exists: creates the deployment, runs the test, and cleans the deployment up afterwards
171+
- If deployment exists: uses existing deployment and preserves it
172+
- Perfect for development workflows where you want to keep deployments running between tests
170173
171-
The main test scenario validates scaling for **H200 with 1P1D configuration**:
172-
- **Phase 1**: 12 req/s (maintains 1P1D)
173-
- **Phase 2**: 24 req/s (scales to 2P1D - 2 prefill workers, 1 decode worker)
174-
- **ISL/OSL**: 3000/150 tokens
175-
- **Capacity**: ~15 req/s/gpu based on interpolator analysis
174+
**Test Scenario**
175+
176+
The main test scenario validates prefill scaling on H200, starting from a 1P1D configuration and scaling up to 2P1D (2 prefill workers, 1 decode worker):
177+
178+
- **Phase 1**: 8 req/s for 90s (baseline - maintains 1P1D)
179+
- **Phase 2**: 15 req/s for 120s (moderate load - maintains 1P1D)
180+
- **Phase 3**: 25 req/s for 180s (scaling trigger - scales to 2P1D)
181+
- **ISL/OSL**: 4000/150 tokens (optimized for prefill bottleneck)
182+
- **Transition delay**: 30s between phases
183+
- **Total test duration**: ~7.5 minutes (90s + 120s + 180s of load, plus two 30s phase transitions) + scaling observation
184+
- **Smart cleanup**: Only removes deployment if test created it (preserves existing deployments)
176185
177186
### Prerequisites for E2E Tests
178187

tests/planner/run_scaling_test.sh

Lines changed: 10 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -100,7 +100,7 @@ check_existing_deployment() {
100100
# Check if the DynamoGraphDeployment is ready
101101
local status=$(kubectl get dynamographdeployment "$DEPLOYMENT_NAME" -n "$NAMESPACE" -o jsonpath='{.status.state}')
102102
if [ "$status" = "successful" ]; then
103-
# Check if frontend pod is running (main indicator)
103+
# Check if frontend pod is running
104104
if kubectl get pods -n "$NAMESPACE" -l "nvidia.com/dynamo-component-type=frontend,nvidia.com/dynamo-namespace=vllm-disagg-planner" --field-selector=status.phase=Running | grep -q .; then
105105
log_success "Existing deployment is ready"
106106
return 0
@@ -135,7 +135,6 @@ deploy_planner() {
135135
exit 1
136136
fi
137137

138-
# Wait for DynamoGraphDeployment to be processed
139138
log_info "Waiting for DynamoGraphDeployment to be processed..."
140139
if kubectl wait --for=condition=Ready dynamographdeployment/"$DEPLOYMENT_NAME" -n "$NAMESPACE" --timeout=600s; then
141140
log_success "DynamoGraphDeployment is ready"
@@ -144,10 +143,8 @@ deploy_planner() {
144143
exit 1
145144
fi
146145

147-
# Wait for pods to be running (this may take a while for image pulls)
148146
log_info "Waiting for pods to be running (this may take several minutes for image pulls)..."
149147

150-
# Wait for frontend pod (main component we need for testing)
151148
log_info "Waiting for frontend pod..."
152149
if kubectl wait --for=condition=Ready pod -l "nvidia.com/dynamo-component-type=frontend,nvidia.com/dynamo-namespace=vllm-disagg-planner" -n "$NAMESPACE" --timeout=900s; then
153150
log_success "Frontend pod is ready"
@@ -156,12 +153,10 @@ deploy_planner() {
156153
exit 1
157154
fi
158155

159-
# Wait a bit more for all pods to be fully running
160156
log_info "Waiting for all pods to be running..."
161157
sleep 30
162158
}
163159

164-
# Setup port forwarding
165160
setup_port_forward() {
166161
log_info "Setting up port forwarding..."
167162

@@ -172,10 +167,8 @@ setup_port_forward() {
172167
sleep 2
173168
fi
174169

175-
# Start port forwarding to frontend service directly
176170
local frontend_service="vllm-disagg-planner-frontend"
177171

178-
# Check if the frontend service exists
179172
if ! kubectl get service "$frontend_service" -n "$NAMESPACE" &> /dev/null; then
180173
log_error "Frontend service '$frontend_service' not found"
181174
return 1
@@ -185,7 +178,6 @@ setup_port_forward() {
185178
kubectl port-forward service/"$frontend_service" "$LOCAL_PORT:$FRONTEND_PORT" -n "$NAMESPACE" >/dev/null 2>&1 &
186179
PORT_FORWARD_PID=$!
187180

188-
# Wait for port forwarding to be established
189181
log_info "Waiting for port forwarding to be established..."
190182
for i in {1..30}; do
191183
if curl -s http://localhost:$LOCAL_PORT/health &> /dev/null; then
@@ -199,7 +191,6 @@ setup_port_forward() {
199191
return 1
200192
}
201193

202-
# Clean up port forwarding
203194
cleanup_port_forward() {
204195
if [ ! -z "$PORT_FORWARD_PID" ]; then
205196
log_info "Cleaning up port forwarding..."
@@ -208,21 +199,18 @@ cleanup_port_forward() {
208199
fi
209200
}
210201

211-
# Clean up deployment
212202
cleanup_deployment() {
213203
log_info "Cleaning up deployment..."
214204
kubectl delete -f "$YAML_FILE" -n "$NAMESPACE" --ignore-not-found
215205

216-
# Wait for cleanup to complete
217206
log_info "Waiting for cleanup to complete..."
218207
kubectl wait --for=delete dynamographdeployment/"$DEPLOYMENT_NAME" -n "$NAMESPACE" --timeout=120s || true
219208

220209
log_info "Cleanup complete"
221210
}
222211

223-
# Run the scaling test
224212
run_test() {
225-
log_info "Running scaling test (12 req/s -> 24 req/s)..."
213+
log_info "Running scaling test (graduated 8->15->25 req/s)..."
226214

227215
local python_cmd="python3"
228216
if ! command -v python3 &> /dev/null; then
@@ -244,9 +232,7 @@ run_test() {
244232
fi
245233
}
246234

247-
# Main function
248235
main() {
249-
# Parse arguments
250236
while [[ $# -gt 0 ]]; do
251237
case $1 in
252238
--namespace)
@@ -260,7 +246,7 @@ main() {
260246
--help)
261247
echo "Usage: $0 [--namespace NS] [--save-results]"
262248
echo ""
263-
echo "Run SLA planner scaling test (hardcoded 12 req/s -> 24 req/s scenario)"
249+
echo "Run SLA planner scaling test (graduated 8->15->25 req/s prefill scaling)"
264250
echo ""
265251
echo "Options:"
266252
echo " --namespace NS Kubernetes namespace (default: default)"
@@ -278,33 +264,33 @@ main() {
278264

279265
log_info "SLA Planner Scaling Test"
280266
log_info "Namespace: $NAMESPACE"
281-
log_info "Scenario: 12 req/s -> 24 req/s (1P1D -> 2P1D)"
267+
log_info "Scenario: Graduated 8->15->25 req/s (1P1D -> 2P1D prefill scaling, ISL=4000/OSL=150)"
282268

283-
# Check prerequisites
284269
check_prerequisites
285270

286-
# Setup trap for cleanup
287271
trap cleanup_port_forward EXIT
288272

289273
# Check if we need to deploy
274+
local deployed_by_us=false
290275
if ! check_existing_deployment; then
291276
deploy_planner
277+
deployed_by_us=true
292278
fi
293279

294-
# Setup port forwarding
295280
if ! setup_port_forward; then
296281
log_error "Failed to setup port forwarding"
297282
exit 1
298283
fi
299284

300-
# Run the test
301285
local test_result=0
302286
if ! run_test; then
303287
test_result=1
304288
fi
305289

306-
# Always cleanup deployment
307-
# cleanup_deployment
290+
# Only cleanup deployment if we deployed it
291+
if [ "$deployed_by_us" = true ]; then
292+
cleanup_deployment
293+
fi
308294

309295
if [ $test_result -eq 0 ]; then
310296
log_success "Test completed successfully!"
@@ -315,5 +301,4 @@ main() {
315301
exit $test_result
316302
}
317303

318-
# Run main function with all arguments
319304
main "$@"

0 commit comments

Comments
 (0)