 # 3. Set variables (ALL REQUIRED)
 # BASE: the directory that contains your vllm repo
 # MODEL: the model served by vllm
+# SYSTEM: the hardware, either TPU or GPU; "get best profile" may not be supported on other systems.
 # TP: degree of tensor parallelism
 # DOWNLOAD_DIR: directory to download and load model weights.
 # INPUT_LEN: request input length
 TAG=$(date +"%Y_%m_%d_%H_%M")
 BASE=""
 MODEL="meta-llama/Llama-3.1-8B-Instruct"
+SYSTEM="TPU"
 TP=1
 DOWNLOAD_DIR=""
 INPUT_LEN=4000
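For orientation, a filled-in variable block might look like the sketch below; the BASE and DOWNLOAD_DIR values are placeholders, not values from this commit.

BASE="$HOME/workspace"             # placeholder; must contain $BASE/vllm
MODEL="meta-llama/Llama-3.1-8B-Instruct"
SYSTEM="TPU"                       # or "GPU"
TP=1
DOWNLOAD_DIR="/tmp/model-weights"  # placeholder
INPUT_LEN=4000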
@@ -45,12 +47,15 @@ NUM_BATCHED_TOKENS_LIST="512 1024 2048 4096"
 
 LOG_FOLDER="$BASE/auto-benchmark/$TAG"
 RESULT="$LOG_FOLDER/result.txt"
+PROFILE_PATH="$LOG_FOLDER/profile"
 
 echo "result file: $RESULT"
 echo "model: $MODEL"
 
 rm -rf $LOG_FOLDER
+rm -rf $PROFILE_PATH
 mkdir -p $LOG_FOLDER
+mkdir -p $PROFILE_PATH
 
 cd "$BASE/vllm"
 
@@ -70,10 +75,11 @@ start_server() {
     local max_num_seqs=$2
     local max_num_batched_tokens=$3
     local vllm_log=$4
+    local profile_dir=$5
 
     pkill -f vllm
 
-    VLLM_USE_V1=1 VLLM_SERVER_DEV_MODE=1 vllm serve $MODEL \
+    VLLM_USE_V1=1 VLLM_SERVER_DEV_MODE=1 VLLM_TORCH_PROFILER_DIR=$profile_dir vllm serve $MODEL \
         --disable-log-requests \
         --port 8004 \
         --gpu-memory-utilization $gpu_memory_utilization \
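Setting VLLM_TORCH_PROFILER_DIR makes the server write profiler traces into the per-sweep directory passed as the new fifth argument. The benchmark client's --profile flag (added further down) triggers a capture around each run; a capture can also be driven by hand, assuming the server started by this script is up on port 8004:

# Hedged sketch of a manual capture; these endpoints are served by vLLM's
# OpenAI-compatible server when a profiler directory is configured.
curl -X POST http://0.0.0.0:8004/start_profile
# ... drive requests against the server here ...
curl -X POST http://0.0.0.0:8004/stop_profile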
@@ -105,19 +111,37 @@ start_server() {
     fi
 }
 
+update_best_profile() {
+    local profile_dir=$1
+    local profile_index=$2
+    sorted_paths=($(find "$profile_dir" -maxdepth 1 -not -path "$profile_dir" | sort))
+    selected_profile_file=
+    if [[ "$SYSTEM" == "TPU" ]]; then
+        selected_profile_file="${sorted_paths[$profile_index]}/*.xplane.pb"
+    fi
+    if [[ "$SYSTEM" == "GPU" ]]; then
+        selected_profile_file="${sorted_paths[$profile_index]}"
+    fi
+    rm -f $PROFILE_PATH/*
+    cp $selected_profile_file $PROFILE_PATH
+}
+
 run_benchmark() {
     local max_num_seqs=$1
     local max_num_batched_tokens=$2
     local gpu_memory_utilization=$3
     echo "max_num_seq: $max_num_seqs, max_num_batched_tokens: $max_num_batched_tokens"
     local vllm_log="$LOG_FOLDER/vllm_log_${max_num_seqs}_${max_num_batched_tokens}.txt"
+    local profile_dir="$LOG_FOLDER/profile_${max_num_seqs}_${max_num_batched_tokens}"
     echo "vllm_log: $vllm_log"
     echo
     rm -f $vllm_log
+    mkdir -p $profile_dir
     pkill -f vllm
+    local profile_index=0
 
     echo "starting server..."
-    start_server $gpu_memory_utilization $max_num_seqs $max_num_batched_tokens $vllm_log
+    start_server $gpu_memory_utilization $max_num_seqs $max_num_batched_tokens $vllm_log $profile_dir
     result=$?
     if [[ "$result" -eq 1 ]]; then
         echo "server failed to start. gpu_memory_utilization:$gpu_memory_utilization, max_num_seqs:$max_num_seqs, max_num_batched_tokens: $max_num_batched_tokens"
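update_best_profile, added above, lists the artifacts that accumulate in a sweep's profile directory (sorted, so the order matches run order), picks the profile_index-th entry, and swaps it into $PROFILE_PATH, so that directory always holds only the trace of the best run seen so far. A hypothetical call, following the directory naming pattern built in run_benchmark:

# Hypothetical example: select the trace of run #3 of the
# max_num_seqs=128, max_num_batched_tokens=2048 sweep on TPU.
update_best_profile "$LOG_FOLDER/profile_128_2048/plugins/profile" 3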
@@ -144,7 +168,8 @@ run_benchmark() {
         --goodput e2el:$MAX_LATENCY_ALLOWED_MS \
         --num-prompts 1000 \
         --random-prefix-len $prefix_len \
-        --port 8004 &> "$bm_log"
+        --port 8004 \
+        --profile &> "$bm_log"
     throughput=$(grep "Request throughput (req/s):" "$bm_log" | sed 's/[^0-9.]//g')
     e2el=$(grep "P99 E2EL (ms):" "$bm_log" | awk '{print $NF}')
     goodput=$(grep "Request goodput (req/s):" "$bm_log" | sed 's/[^0-9.]//g')
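These greps assume the benchmark log carries the fixed summary lines printed by vLLM's serving benchmark, with sed stripping everything except digits and dots. For example:

# A line "Request throughput (req/s): 12.34" reduces to the bare number:
echo "Request throughput (req/s): 12.34" | sed 's/[^0-9.]//g'   # prints 12.34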
@@ -158,6 +183,7 @@ run_benchmark() {
     # start from request_rate = int(throughput) + 1
     request_rate=$((${throughput%.*} + 1))
     while ((request_rate > 0)); do
+        profile_index=$((profile_index + 1))
         # clear prefix cache
         curl -X POST http://0.0.0.0:8004/reset_prefix_cache
         sleep 5
@@ -195,6 +221,12 @@ run_benchmark() {
             best_max_num_seqs=$max_num_seqs
             best_num_batched_tokens=$max_num_batched_tokens
             best_goodput=$goodput
+            if [[ "$SYSTEM" == "TPU" ]]; then
+                update_best_profile "$profile_dir/plugins/profile" $profile_index
+            fi
+            if [[ "$SYSTEM" == "GPU" ]]; then
+                update_best_profile "$profile_dir" $profile_index
+            fi
         fi
     else
         echo "max_num_seqs: $max_num_seqs, max_num_batched_tokens: $max_num_batched_tokens does not meet latency requirement ${MAX_LATENCY_ALLOWED_MS}"
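The two branches reflect where each profiler writes: on TPU the XLA profiler nests runs under plugins/profile and stores .xplane.pb files, while on GPU the torch profiler drops trace files directly into the directory. Roughly (the exact layout is an assumption, not spelled out in the commit):

# TPU:  $profile_dir/plugins/profile/<run>/<host>.xplane.pb
# GPU:  $profile_dir/<trace file written by the torch profiler>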
@@ -239,6 +271,6 @@ for num_seqs in "${num_seqs_list[@]}"; do
     done
 done
 echo "finish permutations"
-echo "best_max_num_seqs: $best_max_num_seqs, best_num_batched_tokens: $best_num_batched_tokens, best_throughput: $best_throughput"
-echo "best_max_num_seqs: $best_max_num_seqs, best_num_batched_tokens: $best_num_batched_tokens, best_throughput: $best_throughput" >> "$RESULT"
+echo "best_max_num_seqs: $best_max_num_seqs, best_num_batched_tokens: $best_num_batched_tokens, best_throughput: $best_throughput, profile saved in: $PROFILE_PATH"
+echo "best_max_num_seqs: $best_max_num_seqs, best_num_batched_tokens: $best_num_batched_tokens, best_throughput: $best_throughput, profile saved in: $PROFILE_PATH" >> "$RESULT"
 
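After the sweep, $PROFILE_PATH holds only the trace of the winning configuration. A hedged note on inspecting it (not part of the script):

# The saved artifacts are plain files:
ls "$PROFILE_PATH"
# TPU: *.xplane.pb, readable with TensorBoard's profile plugin.
# GPU: a torch profiler trace, viewable in Perfetto or chrome://tracing.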