use new scheduling policy
blinkbear committed Dec 29, 2024
1 parent 8c000af commit e593acd
Showing 15 changed files with 1,429 additions and 1,177 deletions.
6 changes: 3 additions & 3 deletions benchmarks/1_serving_benchmark.sh
@@ -18,7 +18,7 @@ model_name="meta-llama/Llama-2-13b-chat-hf"
# model_name="EleutherAI/gpt-neox-20b"
# model_name="facebook/opt-6.7b"
dataset_name="sharegpt"
-dataset_path="/root/v1/vllm/dataset/ShareGPT_V3_unfiltered_cleaned_split.json"
+dataset_path="/root/vllm/dataset/ShareGPT_V3_unfiltered_cleaned_split.json"
result_dir="${pwd}/result"
# scheduler_policy=(fcfs)
# swap_policies=(full)
@@ -28,7 +28,7 @@ declare -a scheduler_swap_policies
# scheduler_swap_policies[0]="tfittradeoff partial"
scheduler_swap_policies[1]="fcfs full"
# scheduler_swap_policies[2]="las full"
-# scheduler_swap_policies[3]="tfittradeoff full"
+scheduler_swap_policies[3]="tfittradeoff full"
# scheduler_swap_policies[4]="sjf full"
# scheduler_swap_policies[5]="srjf full"
# scheduler_swap_policies[3]="sjmlfq full"
@@ -38,7 +38,7 @@ scheduler_swap_policies[1]="fcfs full"

preemption_mode="swap"
gpu_memory_utilization=0.9 # 0.5, 0.7, 0.9
-max_num_seqs=512
+max_num_seqs=384
# max_num_seqs=1024
swap_space=80
# swap_space=32
12 changes: 5 additions & 7 deletions benchmarks/benchmark_serving.py
@@ -107,11 +107,9 @@ def sample_sharegpt_requests(
prompt_len_list.append(prompt_len)
output_len = len(completion_token_ids)
-if output_len < 256:
-    continue
# # if prompt_len < 0:
# if prompt_len > 16:
# # # Prune too short sequences.
# # continue
# continue
# if output_len < 10 or prompt_len > 256:
# # Prune too long sequences.
# continue
@@ -124,9 +122,9 @@
# # Prune too short sequences.
# continue

-# if prompt_len < 10 or output_len < 10:
-# # Prune too short sequences.
-# continue
+if prompt_len < 10 or output_len < 10:
+    # Prune too short sequences.
+    continue
# if prompt_len > 1024 or prompt_len + output_len > 2048:
# # Prune too long sequences.
# continue
86 changes: 86 additions & 0 deletions benchmarks/motivation_benchmark.sh
@@ -0,0 +1,86 @@
# Read the current counter value
COUNTER_FILE=".counter.txt"
if [ -f "$COUNTER_FILE" ]; then
    COUNTER=$(cat $COUNTER_FILE)
else
    COUNTER=0
fi
# Increment the counter
COUNTER=$((COUNTER + 1))
# Write the new counter value back to the file
echo $COUNTER >$COUNTER_FILE

# start vllm server
pwd=`pwd`
# model_name="meta-llama/Llama-2-70b-chat-hf"
model_name="meta-llama/Llama-2-13b-chat-hf"
# model_name="mistralai/Mistral-7B-Instruct-v0.1" # 32000
# model_name="EleutherAI/gpt-neox-20b"
# model_name="facebook/opt-6.7b"
dataset_name="sharegpt"
dataset_path="/root/vllm/dataset/ShareGPT_V3_unfiltered_cleaned_split.json"
result_dir="${pwd}/result"

declare -a scheduler_swap_policies
scheduler_swap_policies[1]="fcfs full"


preemption_mode="swap"
gpu_memory_utilization=0.9 # 0.5, 0.7, 0.9
max_num_seqs=512
# max_num_seqs=1024
swap_space=80
# swap_space=32
max_tokens=4096
# max_tokens=4096
iter_theshold=15
max_serving_time=86400 # 86400
request_duration=300 # 1
num_shared_blocks=0

declare -a request_rates

# random request rates

request_rates=(20)
swap_out_partial_rates=(0.5)
waiting_iter_base=(0.1)
gpu_devices=0

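# Sweep every combination of waiting-iter base, swap-out partial rate, and scheduler/swap policy.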
for waiting_iter in "${waiting_iter_base[@]}"; do
    for swap_out_partial_rate in "${swap_out_partial_rates[@]}"; do
        for scheduler_swap_policy in "${scheduler_swap_policies[@]}"; do
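            # Split the "<scheduler_policy> <swap_policy>" pair (e.g. "fcfs full") into its two fields.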
            element=(${scheduler_swap_policy})
            policy=${element[0]}
            swap_policy=${element[1]}

            CUDA_VISIBLE_DEVICES=$gpu_devices taskset -c 28-29 python3 -m vllm.entrypoints.openai.api_server \
                --model $model_name --swap-space $swap_space --preemption-mode $preemption_mode --scheduler-policy $policy \
                --enable-chunked-prefill --max-num-batched-tokens $max_tokens --iter-threshold $iter_theshold --max-num-seqs $max_num_seqs --swap-out-tokens-policy $swap_policy --swap-out-partial-rate $swap_out_partial_rate --execution-budget $iter_theshold --enforce-eager \
                --tensor-parallel-size 1 --num-shared-blocks $num_shared_blocks --gpu-memory-utilization $gpu_memory_utilization --disable-sliding-window --waiting-iter-base $waiting_iter --disable-log-requests --max-serving-time $max_serving_time >api_server_${policy}_${swap_policy}.log 2>&1 &
            pid=$!
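            # The API server runs in the background; its PID is kept so it can be killed once the sweep finishes.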

            for request_rate in "${request_rates[@]}"; do
                for i in {0..0}; do
                    taskset -c 30-49 python3 benchmark_serving.py --execution-counter $COUNTER --dataset-path $dataset_path \
                        --dataset-name $dataset_name --request-rate $request_rate \
                        --num-prompts 3000 --request-duration $request_duration --sharegpt-output-len 2000 --model $model_name --scheduler-policy $policy \
                        --save-result --result-dir $result_dir \
                        --metadata swap_space=$swap_space preemption_mode=$preemption_mode \
                        scheduler_policy=$policy gpu_memory_utilization=$gpu_memory_utilization \
                        max_num_seqs=$max_num_seqs max_tokens=$max_tokens swap_policy=$swap_policy \
                        iter_theshold=$iter_theshold swap_out_partial_rate=$swap_out_partial_rate waiting_iter_base=$waiting_iter \
                        >> benchmark-${policy}.log 2>&1

                    sleep 5
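                    # Parse this run's serving log and write the summarized results into the result directory.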
                    python3 parse_log.py --policy $policy --swap-policy $swap_policy --result-dir $result_dir \
                        --execution-counter $COUNTER --request-rate $request_rate \
                        --swap-out-partial-rate $swap_out_partial_rate --model $model_name
                done
                sleep 120
            done
            kill $pid
            sleep 5
        done
    done
done
1,063 changes: 1,063 additions & 0 deletions benchmarks/result/analysis/motivation_analysis.ipynb

Large diffs are not rendered by default.

924 changes: 53 additions & 871 deletions benchmarks/result/analysis/result_analysis_1.ipynb

Large diffs are not rendered by default.

Binary file added benchmarks/result/analysis/swap_time_cov.pdf
Binary file not shown.
Binary file added benchmarks/result/analysis/transfer_time.pdf
Binary file not shown.
1 change: 1 addition & 0 deletions rebuild_vllm.sh
@@ -6,3 +6,4 @@ pip uninstall -y vllm-flash-attn
clear
cd benchmarks
bash 1_serving_benchmark.sh
+# bash motivation_benchmark.sh
2 changes: 1 addition & 1 deletion vllm/attention/backends/xformers.py
@@ -286,7 +286,7 @@ def forward(

        assert query.shape[0] == num_prefill_tokens
        assert decode_query.shape[0] == num_decode_tokens

        if prefill_meta := attn_metadata.prefill_metadata:
            # Prompt run.
            if kv_cache is None or prefill_meta.block_tables.numel() == 0: