Skip to content

Commit ce2168c

Browse files
QiliangCui authored and jinzhen-lin committed
[TPU][Test] Divide TPU v1 Test into 2 parts. (vllm-project#21431)
Signed-off-by: Jinzhen Lin <linjinzhen@hotmail.com>
1 parent 7d12e27 commit ce2168c

File tree

2 files changed

+166
-12
lines changed

2 files changed

+166
-12
lines changed
Lines changed: 166 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,166 @@
1+
#!/bin/bash
2+
3+
set -xu
4+
5+
6+
#######################################
# Force-remove the containers this script may have left behind.
# "tpu-test" is the name given to the test container by the docker run
# further down; "vllm-tpu" is presumably a leftover name from other runs
# (TODO confirm).
# Arguments: none
# Returns:   0 always; a missing container is not treated as an error
#######################################
remove_docker_container() {
  local container
  for container in tpu-test vllm-tpu; do
    # Best effort: `docker rm` fails when the container does not exist.
    docker rm -f "$container" || true
  done
}
10+
11+
# Always remove the test containers when this script exits, however it exits.
trap remove_docker_container EXIT

# Remove the container that might not be cleaned up in the previous run.
remove_docker_container

# Build the docker image.
# The script does not run under `set -e`, so the build result is checked
# explicitly: continuing after a failed build would run the tests against
# whatever image is still tagged vllm-tpu from an earlier build.
if ! docker build -f docker/Dockerfile.tpu -t vllm-tpu .; then
  echo "Failed to build the vllm-tpu Docker image; aborting." >&2
  exit 1
fi
18+
19+
# Set up cleanup.
20+
#######################################
# Free Docker disk space when the filesystem that holds Docker's root
# directory is more than 70% full.
# Globals:   none (all working variables are local)
# Arguments: none
# Outputs:   progress messages on stdout, diagnostics on stderr
# Returns:   0 normally; exits the whole script with status 1 when Docker's
#            root directory cannot be determined
#######################################
cleanup_docker() {
  local docker_root disk_usage
  local -r threshold=70

  # Get Docker's root directory.
  docker_root=$(docker info -f '{{.DockerRootDir}}')
  if [ -z "$docker_root" ]; then
    echo "Failed to determine Docker root directory." >&2
    exit 1
  fi
  echo "Docker root directory: $docker_root"

  # Usage percentage (5th column) of the filesystem holding the Docker root.
  # `df -P` selects the POSIX output format, which keeps every filesystem on
  # a single line so the column position is stable.
  disk_usage=$(df -P "$docker_root" | awk 'NR == 2 { sub(/%/, "", $5); print $5 }')
  if ! [[ "$disk_usage" =~ ^[0-9]+$ ]]; then
    # Without a number the comparison below would error out and then claim
    # usage is below the threshold; report the real problem instead.
    echo "Could not determine disk usage for $docker_root; skipping Docker cleanup." >&2
    return 0
  fi

  if [ "$disk_usage" -gt "$threshold" ]; then
    echo "Disk usage is above $threshold%. Cleaning up Docker images and volumes..."
    # Remove dangling images (those that are not tagged and not used by any container).
    docker image prune -f
    # Remove unused volumes / force the system prune for old images as well.
    docker volume prune -f && docker system prune --force --filter "until=72h" --all
    echo "Docker images and volumes cleanup completed."
  else
    echo "Disk usage is below $threshold%. No cleanup needed."
  fi
}
43+
cleanup_docker

# For HF_TOKEN.
# xtrace is paused while the token is loaded so that `set -x` never echoes
# its value into the CI log; the previous xtrace state is restored afterwards.
case "$-" in
  *x*) xtrace_was_on=1 ;;
  *) xtrace_was_on=0 ;;
esac
{ set +x; } 2>/dev/null
source /etc/environment
# Abort if the token is still unset, matching the unbound-variable failure the
# "$HF_TOKEN" expansion would raise under `set -u`.
: "${HF_TOKEN?HF_TOKEN is not set after sourcing /etc/environment}"
# Exported so that `docker run -e HF_TOKEN` can take the value from the
# environment instead of from the (traced) command line.
export HF_TOKEN
if [ "$xtrace_was_on" -eq 1 ]; then
  set -x
fi

# Run the test suites inside a container named tpu-test created from the
# vllm-tpu image. The single-quoted script installs the Python test
# dependencies, runs each pytest target through run_and_track_test, and exits
# with 1 if any of them failed (0 otherwise).
# This must remain the last command of this section: the code that follows
# reads its exit status from $?.
docker run --privileged --net host --shm-size=16G -it \
  -e HF_TOKEN --name tpu-test \
  vllm-tpu /bin/bash -c '
set -e # Exit immediately if a command exits with a non-zero status.
set -u # Treat unset variables as an error.

echo "--- Starting script inside Docker container ---"

# Create results directory
RESULTS_DIR=$(mktemp -d)
# If mktemp fails, set -e will cause the script to exit.
echo "Results will be stored in: $RESULTS_DIR"

# Install dependencies
echo "--- Installing Python dependencies ---"
python3 -m pip install --progress-bar off git+https://github.com/thuml/depyf.git \
    && python3 -m pip install --progress-bar off pytest pytest-asyncio tpu-info \
    && python3 -m pip install --progress-bar off lm_eval[api]==0.4.4 \
    && python3 -m pip install --progress-bar off hf-transfer
echo "--- Python dependencies installed ---"
export VLLM_USE_V1=1
export VLLM_XLA_CHECK_RECOMPILATION=1
export VLLM_XLA_CACHE_PATH=
echo "Using VLLM V1"

echo "--- Hardware Information ---"
# tpu-info
echo "--- Starting Tests ---"
# From here on a failing command must not abort the script, so that every
# test is attempted and failures are only recorded.
set +e
overall_script_exit_code=0

# --- Test Definitions ---
# If a test fails, this function will print logs and will not cause the main script to exit.
# Arguments: $1 test number, $2 test name, $3 command line to evaluate.
# Returns the exit code of the evaluated command.
run_test() {
    local test_num=$1
    local test_name=$2
    local test_command=$3
    local log_file="$RESULTS_DIR/test_${test_num}.log"
    local actual_exit_code

    echo "--- TEST_$test_num: Running $test_name ---"

    # Execute the test command.
    # stdout and stderr are both shown live and appended to the per-test log.
    eval "$test_command" > >(tee -a "$log_file") 2> >(tee -a "$log_file" >&2)
    actual_exit_code=$?

    echo "TEST_${test_num}_COMMAND_EXIT_CODE: $actual_exit_code" # This goes to main log
    echo "TEST_${test_num}_COMMAND_EXIT_CODE: $actual_exit_code" >> "$log_file" # Also to per-test log

    if [ "$actual_exit_code" -ne 0 ]; then
        echo "TEST_$test_num ($test_name) FAILED with exit code $actual_exit_code." >&2
        echo "--- Log for failed TEST_$test_num ($test_name) ---" >&2
        if [ -f "$log_file" ]; then
            cat "$log_file" >&2
        else
            echo "Log file $log_file not found for TEST_$test_num ($test_name)." >&2
        fi
        echo "--- End of log for TEST_$test_num ($test_name) ---" >&2
        return "$actual_exit_code" # Return the failure code
    else
        echo "TEST_$test_num ($test_name) PASSED."
        return 0 # Return success
    fi
}

# Helper function to call run_test and update the overall script exit code
run_and_track_test() {
    local test_num_arg="$1"
    local test_name_arg="$2"
    local test_command_arg="$3"

    # Run the test
    run_test "$test_num_arg" "$test_name_arg" "$test_command_arg"
    local test_specific_exit_code=$?

    # If the test failed, set the overall script exit code to 1
    if [ "$test_specific_exit_code" -ne 0 ]; then
        # No need for extra echo here, run_test already logged the failure.
        overall_script_exit_code=1
    fi
}

# --- Actual Test Execution ---
run_and_track_test 1 "test_struct_output_generate.py" \
    "HF_HUB_DISABLE_XET=1 python3 -m pytest -s -v /workspace/vllm/tests/v1/entrypoints/llm/test_struct_output_generate.py -k \"not test_structured_output_with_reasoning_matrices\""
run_and_track_test 2 "test_moe_pallas.py" \
    "python3 -m pytest -s -v /workspace/vllm/tests/tpu/test_moe_pallas.py"
run_and_track_test 3 "test_lora.py" \
    "VLLM_XLA_CHECK_RECOMPILATION=0 python3 -m pytest -s -v /workspace/vllm/tests/tpu/lora/test_lora.py"
run_and_track_test 4 "test_tpu_qkv_linear.py" \
    "python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_tpu_qkv_linear.py"
run_and_track_test 5 "test_spmd_model_weight_loading.py" \
    "python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_spmd_model_weight_loading.py"
run_and_track_test 6 "test_kv_cache_update_kernel.py" \
    "python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_kv_cache_update_kernel.py"

# After all tests have been attempted, exit with the overall status.
if [ "$overall_script_exit_code" -ne 0 ]; then
    echo "--- One or more tests FAILED. Overall script exiting with failure code 1. ---"
else
    echo "--- All tests have completed and PASSED. Overall script exiting with success code 0. ---"
fi
exit "$overall_script_exit_code"
' # IMPORTANT: This is the closing single quote for the bash -c "..." command. Ensure it is present and correct.
152+
153+
# Capture the exit code of the docker run command
154+
DOCKER_RUN_EXIT_CODE=$?

# Finish with the status recorded above. Container removal is handled by the
# EXIT trap, so nothing else needs to be cleaned up here.
if [ "$DOCKER_RUN_EXIT_CODE" -eq 0 ]; then
  echo "Docker run command completed successfully."
  exit 0
fi

echo "Docker run command failed with exit code $DOCKER_RUN_EXIT_CODE."
exit "$DOCKER_RUN_EXIT_CODE"
165+
# TODO: This test fails because it uses RANDOM_SEED sampling
166+
# pytest -v -s /workspace/vllm/tests/tpu/test_custom_dispatcher.py \

.buildkite/scripts/hardware_ci/run-tpu-v1-test.sh

Lines changed: 0 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -150,18 +150,6 @@ run_and_track_test 9 "test_multimodal.py" \
150150
"python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_multimodal.py"
151151
run_and_track_test 10 "test_pallas.py" \
152152
"python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_pallas.py"
153-
run_and_track_test 11 "test_struct_output_generate.py" \
154-
"HF_HUB_DISABLE_XET=1 python3 -m pytest -s -v /workspace/vllm/tests/v1/entrypoints/llm/test_struct_output_generate.py -k \"not test_structured_output_with_reasoning_matrices\""
155-
run_and_track_test 12 "test_moe_pallas.py" \
156-
"python3 -m pytest -s -v /workspace/vllm/tests/tpu/test_moe_pallas.py"
157-
run_and_track_test 13 "test_lora.py" \
158-
"VLLM_XLA_CHECK_RECOMPILATION=0 python3 -m pytest -s -v /workspace/vllm/tests/tpu/lora/test_lora.py"
159-
run_and_track_test 14 "test_tpu_qkv_linear.py" \
160-
"python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_tpu_qkv_linear.py"
161-
run_and_track_test 15 "test_spmd_model_weight_loading.py" \
162-
"python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_spmd_model_weight_loading.py"
163-
run_and_track_test 16 "test_kv_cache_update_kernel.py" \
164-
"python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_kv_cache_update_kernel.py"
165153
166154
# After all tests have been attempted, exit with the overall status.
167155
if [ "$overall_script_exit_code" -ne 0 ]; then

0 commit comments

Comments
 (0)