use new scheduling policy
blinkbear committed Dec 29, 2024
1 parent 8c000af commit e593acd
Showing 15 changed files with 1,429 additions and 1,177 deletions.
6 changes: 3 additions & 3 deletions benchmarks/1_serving_benchmark.sh
@@ -18,7 +18,7 @@ model_name="meta-llama/Llama-2-13b-chat-hf"
# model_name="EleutherAI/gpt-neox-20b"
# model_name="facebook/opt-6.7b"
dataset_name="sharegpt"
-dataset_path="/root/v1/vllm/dataset/ShareGPT_V3_unfiltered_cleaned_split.json"
+dataset_path="/root/vllm/dataset/ShareGPT_V3_unfiltered_cleaned_split.json"
result_dir="${pwd}/result"
# scheduler_policy=(fcfs)
# swap_policies=(full)
@@ -28,7 +28,7 @@ declare -a scheduler_swap_policies
# scheduler_swap_policies[0]="tfittradeoff partial"
scheduler_swap_policies[1]="fcfs full"
# scheduler_swap_policies[2]="las full"
-# scheduler_swap_policies[3]="tfittradeoff full"
+scheduler_swap_policies[3]="tfittradeoff full"
# scheduler_swap_policies[4]="sjf full"
# scheduler_swap_policies[5]="srjf full"
# scheduler_swap_policies[3]="sjmlfq full"
@@ -38,7 +38,7 @@ scheduler_swap_policies[1]="fcfs full"

preemption_mode="swap"
gpu_memory_utilization=0.9 # 0.5, 0.7, 0.9
-max_num_seqs=512
+max_num_seqs=384
# max_num_seqs=1024
swap_space=80
# swap_space=32
12 changes: 5 additions & 7 deletions benchmarks/benchmark_serving.py
@@ -107,11 +107,9 @@ def sample_sharegpt_requests(
prompt_len_list.append(prompt_len)
output_len = len(completion_token_ids)
-if output_len < 256:
-    continue
# # if prompt_len < 0:
# if prompt_len > 16:
# # # Prune too short sequences.
# # continue
# continue
# if output_len < 10 or prompt_len > 256:
# # Prune too long sequences.
# continue
@@ -124,9 +122,9 @@
# # Prune too short sequences.
# continue

-# if prompt_len < 10 or output_len < 10:
-# # Prune too short sequences.
-# continue
+if prompt_len < 10 or output_len < 10:
+    # Prune too short sequences.
+    continue
# if prompt_len > 1024 or prompt_len + output_len > 2048:
# # Prune too long sequences.
# continue
86 changes: 86 additions & 0 deletions benchmarks/motivation_benchmark.sh
@@ -0,0 +1,86 @@
# Read the current counter value
COUNTER_FILE=".counter.txt"
if [ -f "$COUNTER_FILE" ]; then
    COUNTER=$(cat $COUNTER_FILE)
else
    COUNTER=0
fi
# Increment the counter
COUNTER=$((COUNTER + 1))
# Write the new counter value back to the file
echo $COUNTER >$COUNTER_FILE

# start vllm server
pwd=`pwd`
# model_name="meta-llama/Llama-2-70b-chat-hf"
model_name="meta-llama/Llama-2-13b-chat-hf"
# model_name="mistralai/Mistral-7B-Instruct-v0.1" # 32000
# model_name="EleutherAI/gpt-neox-20b"
# model_name="facebook/opt-6.7b"
dataset_name="sharegpt"
dataset_path="/root/vllm/dataset/ShareGPT_V3_unfiltered_cleaned_split.json"
result_dir="${pwd}/result"

declare -a scheduler_swap_policies
scheduler_swap_policies[1]="fcfs full"


preemption_mode="swap"
gpu_memory_utilization=0.9 # 0.5, 0.7, 0.9
max_num_seqs=512
# max_num_seqs=1024
swap_space=80
# swap_space=32
max_tokens=4096
# max_tokens=4096
iter_theshold=15
max_serving_time=86400 # 86400
request_duration=300 # 1
num_shared_blocks=0

declare -a request_rates

# random request rates

request_rates=(20)
swap_out_partial_rates=(0.5)
waiting_iter_base=(0.1)
gpu_devices=0

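# Sweep every combination of waiting-iter base, swap-out partial rate, and scheduler/swap policy.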
for waiting_iter in "${waiting_iter_base[@]}"; do
    for swap_out_partial_rate in "${swap_out_partial_rates[@]}"; do
        for scheduler_swap_policy in "${scheduler_swap_policies[@]}"; do
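            # Split the "<scheduler_policy> <swap_policy>" pair (e.g. "fcfs full") into its two fields.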
            element=(${scheduler_swap_policy})
            policy=${element[0]}
            swap_policy=${element[1]}

            CUDA_VISIBLE_DEVICES=$gpu_devices taskset -c 28-29 python3 -m vllm.entrypoints.openai.api_server \
                --model $model_name --swap-space $swap_space --preemption-mode $preemption_mode --scheduler-policy $policy \
                --enable-chunked-prefill --max-num-batched-tokens $max_tokens --iter-threshold $iter_theshold --max-num-seqs $max_num_seqs --swap-out-tokens-policy $swap_policy --swap-out-partial-rate $swap_out_partial_rate --execution-budget $iter_theshold --enforce-eager \
                --tensor-parallel-size 1 --num-shared-blocks $num_shared_blocks --gpu-memory-utilization $gpu_memory_utilization --disable-sliding-window --waiting-iter-base $waiting_iter --disable-log-requests --max-serving-time $max_serving_time >api_server_${policy}_${swap_policy}.log 2>&1 &
            pid=$!
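            # The API server runs in the background; its PID is kept so it can be killed once the sweep finishes.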

            for request_rate in "${request_rates[@]}"; do
                for i in {0..0}; do
                    taskset -c 30-49 python3 benchmark_serving.py --execution-counter $COUNTER --dataset-path $dataset_path \
                        --dataset-name $dataset_name --request-rate $request_rate \
                        --num-prompts 3000 --request-duration $request_duration --sharegpt-output-len 2000 --model $model_name --scheduler-policy $policy \
                        --save-result --result-dir $result_dir \
                        --metadata swap_space=$swap_space preemption_mode=$preemption_mode \
                        scheduler_policy=$policy gpu_memory_utilization=$gpu_memory_utilization \
                        max_num_seqs=$max_num_seqs max_tokens=$max_tokens swap_policy=$swap_policy \
                        iter_theshold=$iter_theshold swap_out_partial_rate=$swap_out_partial_rate waiting_iter_base=$waiting_iter \
                        >> benchmark-${policy}.log 2>&1

                    sleep 5
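                    # Parse this run's serving log and write the summarized results into the result directory.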
                    python3 parse_log.py --policy $policy --swap-policy $swap_policy --result-dir $result_dir \
                        --execution-counter $COUNTER --request-rate $request_rate \
                        --swap-out-partial-rate $swap_out_partial_rate --model $model_name
                done
                sleep 120
            done
            kill $pid
            sleep 5
        done
    done
done
1,063 changes: 1,063 additions & 0 deletions benchmarks/result/analysis/motivation_analysis.ipynb

Large diffs are not rendered by default.

924 changes: 53 additions & 871 deletions benchmarks/result/analysis/result_analysis_1.ipynb

Large diffs are not rendered by default.

Binary file added benchmarks/result/analysis/swap_time_cov.pdf
Binary file not shown.
Binary file added benchmarks/result/analysis/transfer_time.pdf
Binary file not shown.
1 change: 1 addition & 0 deletions rebuild_vllm.sh
@@ -6,3 +6,4 @@ pip uninstall -y vllm-flash-attn
clear
cd benchmarks
bash 1_serving_benchmark.sh
+# bash motivation_benchmark.sh
2 changes: 1 addition & 1 deletion vllm/attention/backends/xformers.py
@@ -286,7 +286,7 @@ def forward(

        assert query.shape[0] == num_prefill_tokens
        assert decode_query.shape[0] == num_decode_tokens

        if prefill_meta := attn_metadata.prefill_metadata:
            # Prompt run.
            if kv_cache is None or prefill_meta.block_tables.numel() == 0: