|
#!/bin/bash

# Shell options used for the whole script:
#   xtrace  - print every command before it is executed
#   nounset - treat the expansion of an unset variable as an error
set -o xtrace -o nounset
| 4 | + |
| 5 | + |
# Force-remove the containers named tpu-test and vllm-tpu.
# Every removal is best-effort: a failing `docker rm` is ignored, so the
# function always returns 0.
remove_docker_container() {
  local container_name
  for container_name in tpu-test vllm-tpu; do
    docker rm -f "$container_name" || true
  done
}
| 10 | + |
# Remove the test containers whenever the script exits, whatever the reason.
trap remove_docker_container EXIT

# Remove the container that might not be cleaned up in the previous run.
remove_docker_container

# Build the docker image.
# The script does not run with `set -e`, so the build result is checked
# explicitly: carrying on after a failed build would run the tests against
# whatever stale vllm-tpu image is still present on the host.
if ! docker build -f docker/Dockerfile.tpu -t vllm-tpu .; then
  echo "Failed to build the vllm-tpu docker image." >&2
  exit 1
fi
| 18 | + |
#######################################
# Prune Docker images and volumes when the filesystem that holds Docker's
# root directory is more than 70% full.
# Globals:   none
# Arguments: none
# Outputs:   progress messages on stdout, errors and warnings on stderr
# Returns:   0 after cleaning or skipping; exits the script with status 1
#            when Docker's root directory cannot be determined.
#######################################
cleanup_docker() {
  local docker_root disk_usage
  local -r threshold=70

  # Get Docker's root directory
  docker_root=$(docker info -f '{{.DockerRootDir}}')
  if [ -z "$docker_root" ]; then
    echo "Failed to determine Docker root directory." >&2
    exit 1
  fi
  echo "Docker root directory: $docker_root"

  # Check disk usage of the filesystem where Docker's root directory is located.
  # `df -P` keeps each filesystem on a single line (no wrapping of long device
  # names), so the fifth column of the last data line is the usage percentage.
  disk_usage=$(df -P "$docker_root" \
    | awk 'NR > 1 { usage = $5 } END { sub(/%/, "", usage); print usage }')

  # Skip the cleanup, with an accurate message, when the value is not a
  # plain number instead of letting the numeric comparison below error out.
  case "$disk_usage" in
    '' | *[!0-9]*)
      echo "Could not determine disk usage for $docker_root. Skipping cleanup." >&2
      return 0
      ;;
  esac

  if [ "$disk_usage" -gt "$threshold" ]; then
    echo "Disk usage is above $threshold%. Cleaning up Docker images and volumes..."
    # Remove dangling images (those that are not tagged and not used by any container)
    docker image prune -f
    # Remove unused volumes / force the system prune for old images as well.
    docker volume prune -f && docker system prune --force --filter "until=72h" --all
    echo "Docker images and volumes cleanup completed."
  else
    echo "Disk usage is below $threshold%. No cleanup needed."
  fi
}
cleanup_docker

# For HF_TOKEN.
# Command tracing is paused while the token is loaded so that `set -x` does
# not print the secret into the log.
{ set +x; } 2>/dev/null
source /etc/environment
# Fail fast (as `set -u` would on "$HF_TOKEN") if the token is not defined.
: "${HF_TOKEN?HF_TOKEN is not set after sourcing /etc/environment}"
# Export it so that `docker run -e HF_TOKEN` can pass it on by name; the value
# then never appears on the traced command line.
export HF_TOKEN
set -x

# Run the TPU test suite inside a fresh container named tpu-test.
# The single-quoted string below is the script executed in the container, so
# it must not contain any single quote. docker run has to stay the last
# command of this section: the code that follows reads its exit status.
docker run --privileged --net host --shm-size=16G -it \
    -e HF_TOKEN --name tpu-test \
    vllm-tpu /bin/bash -c '
set -e # Exit immediately if a command exits with a non-zero status.
set -u # Treat unset variables as an error.

echo "--- Starting script inside Docker container ---"

# Create results directory
RESULTS_DIR=$(mktemp -d)
# If mktemp fails, set -e will cause the script to exit.
echo "Results will be stored in: $RESULTS_DIR"

# Install dependencies
echo "--- Installing Python dependencies ---"
# Each install is a separate command on purpose: set -e ignores a failure of
# every command of an && list except the last one, so a chained install could
# fail without stopping the script.
python3 -m pip install --progress-bar off git+https://github.com/thuml/depyf.git
python3 -m pip install --progress-bar off pytest pytest-asyncio tpu-info
# Quoted so that the brackets are never expanded as a glob pattern.
python3 -m pip install --progress-bar off "lm_eval[api]==0.4.4"
python3 -m pip install --progress-bar off hf-transfer
echo "--- Python dependencies installed ---"
export VLLM_USE_V1=1
export VLLM_XLA_CHECK_RECOMPILATION=1
export VLLM_XLA_CACHE_PATH=
echo "Using VLLM V1"

echo "--- Hardware Information ---"
# tpu-info
echo "--- Starting Tests ---"
# From here on a failing command must not abort the script: every test is
# attempted and failures are collected in overall_script_exit_code.
set +e
overall_script_exit_code=0

# --- Test Definitions ---
# Run one test command and log its output.
# If a test fails, this function will print logs and will not cause the main script to exit.
# Arguments: $1 - test number, $2 - test name, $3 - command line to evaluate
# Returns:   the exit code of the test command
run_test() {
    local test_num=$1
    local test_name=$2
    local test_command=$3
    local log_file="$RESULTS_DIR/test_${test_num}.log"
    local actual_exit_code

    echo "--- TEST_$test_num: Running $test_name ---"

    # Execute the test command.
    # stdout and stderr keep flowing to the console while both are also
    # appended to the per-test log file.
    eval "$test_command" > >(tee -a "$log_file") 2> >(tee -a "$log_file" >&2)
    actual_exit_code=$?

    echo "TEST_${test_num}_COMMAND_EXIT_CODE: $actual_exit_code" # This goes to main log
    echo "TEST_${test_num}_COMMAND_EXIT_CODE: $actual_exit_code" >> "$log_file" # Also to per-test log

    if [ "$actual_exit_code" -ne 0 ]; then
        echo "TEST_$test_num ($test_name) FAILED with exit code $actual_exit_code." >&2
        echo "--- Log for failed TEST_$test_num ($test_name) ---" >&2
        if [ -f "$log_file" ]; then
            cat "$log_file" >&2
        else
            echo "Log file $log_file not found for TEST_$test_num ($test_name)." >&2
        fi
        echo "--- End of log for TEST_$test_num ($test_name) ---" >&2
        return "$actual_exit_code" # Return the failure code
    else
        echo "TEST_$test_num ($test_name) PASSED."
        return 0 # Return success
    fi
}

# Helper function to call run_test and update the overall script exit code
# Arguments: same as run_test
run_and_track_test() {
    local test_num_arg="$1"
    local test_name_arg="$2"
    local test_command_arg="$3"

    # Run the test
    run_test "$test_num_arg" "$test_name_arg" "$test_command_arg"
    local test_specific_exit_code=$?

    # If the test failed, set the overall script exit code to 1
    if [ "$test_specific_exit_code" -ne 0 ]; then
        # No need for extra echo here, run_test already logged the failure.
        overall_script_exit_code=1
    fi
}

# --- Actual Test Execution ---
run_and_track_test 1 "test_struct_output_generate.py" \
    "HF_HUB_DISABLE_XET=1 python3 -m pytest -s -v /workspace/vllm/tests/v1/entrypoints/llm/test_struct_output_generate.py -k \"not test_structured_output_with_reasoning_matrices\""
run_and_track_test 2 "test_moe_pallas.py" \
    "python3 -m pytest -s -v /workspace/vllm/tests/tpu/test_moe_pallas.py"
run_and_track_test 3 "test_lora.py" \
    "VLLM_XLA_CHECK_RECOMPILATION=0 python3 -m pytest -s -v /workspace/vllm/tests/tpu/lora/test_lora.py"
run_and_track_test 4 "test_tpu_qkv_linear.py" \
    "python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_tpu_qkv_linear.py"
run_and_track_test 5 "test_spmd_model_weight_loading.py" \
    "python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_spmd_model_weight_loading.py"
run_and_track_test 6 "test_kv_cache_update_kernel.py" \
    "python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_kv_cache_update_kernel.py"

# After all tests have been attempted, exit with the overall status.
if [ "$overall_script_exit_code" -ne 0 ]; then
    echo "--- One or more tests FAILED. Overall script exiting with failure code 1. ---"
else
    echo "--- All tests have completed and PASSED. Overall script exiting with success code 0. ---"
fi
exit "$overall_script_exit_code"
' # IMPORTANT: This is the closing single quote for the bash -c "..." command. Ensure it is present and correct.
| 152 | + |
# Remember how the docker run command finished; this has to be the first
# command after it so that $? still holds its exit status.
DOCKER_RUN_EXIT_CODE=$?

# The EXIT trap takes care of the cleanup.
# Finish the main script with the same status as the docker run command.
if [ "$DOCKER_RUN_EXIT_CODE" -eq 0 ]; then
  echo "Docker run command completed successfully."
  exit 0
fi

echo "Docker run command failed with exit code $DOCKER_RUN_EXIT_CODE."
exit "$DOCKER_RUN_EXIT_CODE"
# TODO: The test below is disabled: it fails because it uses RANDOM_SEED sampling.
# pytest -v -s /workspace/vllm/tests/tpu/test_custom_dispatcher.py
0 commit comments