[Core] implement disaggregated prefilling via KV cache transfer #6170

Closed
wants to merge 158 commits

Changes from all commits (158 commits)
6fc14d4
add idealized disagg prefill benchmark
KuntaiDu Jul 6, 2024
69d1514
add main
KuntaiDu Jul 6, 2024
2bc8e79
fix typo
KuntaiDu Jul 6, 2024
3ea715d
use mkdir -p to avoid error
KuntaiDu Jul 6, 2024
3656f8a
fix bug
KuntaiDu Jul 6, 2024
f8cb6fc
disable log request from vllm server, and mute curl
KuntaiDu Jul 6, 2024
d4b23c0
add disaggregated prefilling benchmark
KuntaiDu Jul 6, 2024
a942663
do not launch 2 vllm instance
KuntaiDu Jul 7, 2024
540d362
reduce # of prompt to half
KuntaiDu Jul 7, 2024
4b0a7ff
reduce input len by 1
KuntaiDu Jul 7, 2024
2989656
adjust filename
KuntaiDu Jul 7, 2024
69f729c
create 4x sonnet
KuntaiDu Jul 7, 2024
43e1e5e
adjust setup
KuntaiDu Jul 7, 2024
29a7b88
add benchmark
KuntaiDu Jul 7, 2024
4d31316
allow prefix input len == sonnet input len
KuntaiDu Jul 8, 2024
4e336fc
add parameter sweeping
KuntaiDu Jul 8, 2024
2770c61
aadjust firmat
KuntaiDu Jul 8, 2024
80061d2
rename script
KuntaiDu Jul 8, 2024
8c0a9dc
align naming
KuntaiDu Jul 8, 2024
7d84965
adjust qps
KuntaiDu Jul 9, 2024
5ac5249
adjust swap range
KuntaiDu Jul 9, 2024
8f25985
remove results
KuntaiDu Jul 9, 2024
2363fa0
adjust benchmark results so that there are 150 output tokens by defau…
KuntaiDu Jul 9, 2024
3db988c
add example usage for disaggregated prefill
KuntaiDu Jul 17, 2024
00e46de
add environment variable for disaggregated prefill
KuntaiDu Jul 17, 2024
de434d9
add a new distributed group for disaggregated prefill NCCL communication
KuntaiDu Jul 17, 2024
f157f6b
only inflate the world size inside parallel_state.py
KuntaiDu Jul 18, 2024
de82c3c
add more log information
KuntaiDu Jul 18, 2024
69ce0e0
specify vllm port
KuntaiDu Jul 18, 2024
e3dc2e9
avoid switching to unused ports in disaggregated prefilling
KuntaiDu Jul 18, 2024
f164aa7
Merge branch 'main' into kuntai-disagg
KuntaiDu Jul 18, 2024
18fe19c
adjust parallel state to include _DISAGG distributed group
KuntaiDu Jul 18, 2024
94cadb8
offset global rank for decoding instances
KuntaiDu Jul 18, 2024
ded5d92
adjust naming: use prefill and decode instead of prefilling and decoding
KuntaiDu Jul 18, 2024
709ae05
adjust the example: let the decode process in foreground for debugging
KuntaiDu Jul 18, 2024
2ab44d4
adjust logger format
KuntaiDu Jul 18, 2024
2213881
test if the P2P cache stucks when no disaggregated prefilling
KuntaiDu Jul 18, 2024
544f5cb
let decode instance sleep, to avoid generating P2P cache simultaneously
KuntaiDu Jul 18, 2024
04d319a
continue disaggregated prefill debugging
KuntaiDu Jul 18, 2024
2e0f02c
offset world group for decoding instance
KuntaiDu Jul 18, 2024
fd5f115
a syntax fix
KuntaiDu Jul 18, 2024
8d90e6a
bug fix
KuntaiDu Jul 18, 2024
a9474a7
specify the source of get_open_port
KuntaiDu Jul 18, 2024
701b087
document why specifying the source of get_open_port
KuntaiDu Jul 18, 2024
fa5d71f
add VLLM_TRACE_FUNCTION to track the call stack
KuntaiDu Jul 19, 2024
e2faede
fix customadapter bug
KuntaiDu Jul 19, 2024
76b6c5e
add parallel state logs for debugging
KuntaiDu Jul 19, 2024
cb6d6a5
add sleep when initializing parallel state
KuntaiDu Jul 19, 2024
fe8fb47
only log when rank%4==0
KuntaiDu Jul 19, 2024
cc89bfb
only log when rank%4==0
KuntaiDu Jul 19, 2024
531bdf3
bug fix
KuntaiDu Jul 19, 2024
1804656
also only log when rank=4 in custom all reduce
KuntaiDu Jul 19, 2024
81c8640
add debuging statement around broadcast
KuntaiDu Jul 19, 2024
5ba142c
debug init_world_group
KuntaiDu Jul 19, 2024
cc939cf
put the log inside a text file
KuntaiDu Jul 19, 2024
8ac9266
init DISAGG first
KuntaiDu Jul 19, 2024
58849fa
init DISAGG before global
KuntaiDu Jul 19, 2024
08797e2
put it behind world_size
KuntaiDu Jul 19, 2024
4ff4cd6
add more debug information in pynccl
KuntaiDu Jul 19, 2024
b09e4e6
typo fix
KuntaiDu Jul 19, 2024
583de97
more debug
KuntaiDu Jul 19, 2024
74bcfff
more debug info
KuntaiDu Jul 19, 2024
2175825
put every output
KuntaiDu Jul 19, 2024
3e07770
remove unnecessary sleep
KuntaiDu Jul 19, 2024
a22e5cd
add sucess statement
KuntaiDu Jul 19, 2024
2c0c27d
add debug statement
KuntaiDu Jul 19, 2024
a783787
log rank in success message
KuntaiDu Jul 19, 2024
79f0b06
sleep based on rank to avoid message overlapping
KuntaiDu Jul 19, 2024
b17f20f
increase torch debug level
KuntaiDu Jul 19, 2024
025f209
sleep
KuntaiDu Jul 19, 2024
32292f1
set gloo debugging level to trace
KuntaiDu Jul 19, 2024
389fb24
reduce debugging commands
KuntaiDu Jul 19, 2024
1b38b29
avoid initializing NCCL first
KuntaiDu Jul 19, 2024
bb8c08a
check
KuntaiDu Jul 19, 2024
25a7cf3
locate the hanging line
KuntaiDu Jul 19, 2024
999bd72
add rank to CPU group
KuntaiDu Jul 19, 2024
3428ea6
narrow case
KuntaiDu Jul 19, 2024
91e3ed2
bug fix: need to align the distributed groups between prefill and dec…
KuntaiDu Jul 20, 2024
3dd2275
add disaggregated prefilling for flashinfer
KuntaiDu Jul 23, 2024
2b13f3c
adjust comments
KuntaiDu Jul 23, 2024
8c3f209
add logging for send and recv
KuntaiDu Jul 23, 2024
c6a5e57
turn off chunked prefill to use flashinfer kernel
KuntaiDu Jul 23, 2024
b3c47f3
confirm which backend is being used
KuntaiDu Jul 23, 2024
f05540c
remove debugging from parallel_state, its too much...
KuntaiDu Jul 23, 2024
eb96fe7
add disagg prefill for flash attn backend
KuntaiDu Jul 23, 2024
09d5588
edit flash attn to assign prefill_meta first
KuntaiDu Jul 23, 2024
43077e7
use print instead of attn
KuntaiDu Jul 23, 2024
f716737
make data contiguous
KuntaiDu Jul 23, 2024
0d07251
add more debug message
KuntaiDu Jul 23, 2024
2177737
turn on logging
KuntaiDu Jul 23, 2024
a293bd0
more debug prints in flash_attn
KuntaiDu Jul 23, 2024
cc7f646
remove enforce eager
KuntaiDu Jul 23, 2024
68f3d16
adjust printing order in flash attn
KuntaiDu Jul 23, 2024
21a61b9
avoid sending & receiving output tensor during profile run
KuntaiDu Jul 23, 2024
691cad7
also log the device
KuntaiDu Jul 23, 2024
c057f19
adjust implementation
KuntaiDu Jul 23, 2024
82b73bb
finish adjustment
KuntaiDu Jul 23, 2024
6db1d48
fall back to original flashinfer
KuntaiDu Jul 23, 2024
9e53071
Merge branch 'vllm-project:main' into kuntai-disagg
KuntaiDu Jul 23, 2024
dbaade7
add space
KuntaiDu Jul 23, 2024
f572db8
clean config.py
KuntaiDu Jul 23, 2024
9ebf3ad
keep flashattn implementation
KuntaiDu Jul 23, 2024
67b1c2e
commit changes that will be merged
KuntaiDu Jul 23, 2024
4acad6a
Merge branch 'kuntai-disagg' of https://github.com/KuntaiDu/vllm into…
KuntaiDu Jul 23, 2024
3abca47
revert custom allreduce changes
KuntaiDu Jul 23, 2024
0ce251b
remove debug logs from the file
KuntaiDu Jul 23, 2024
1f3ac2b
revert changes to prefix_caching_block --- unnecessary
KuntaiDu Jul 23, 2024
c93bf33
revert changes
KuntaiDu Jul 23, 2024
8dcaf43
fix typos
KuntaiDu Jul 23, 2024
4d83813
add example usage to disaggregated prefill
KuntaiDu Jul 23, 2024
11c3ace
can only use print instead of log.debug...
KuntaiDu Jul 23, 2024
0bd0cc9
kill vllm instance after run
KuntaiDu Jul 23, 2024
39973bb
add proxy server for disaggregated prefilling
KuntaiDu Jul 24, 2024
13a6d12
update disagg proxy server
KuntaiDu Jul 24, 2024
81cad25
add debug message for proxy server
KuntaiDu Jul 24, 2024
198931b
fix bug
KuntaiDu Jul 24, 2024
7412767
increase nccl buff size
KuntaiDu Jul 24, 2024
bd6f41b
increase nccl buffer size
KuntaiDu Jul 24, 2024
20f9de1
add debug flag
KuntaiDu Jul 24, 2024
11850d5
reduce gpu memory usage
KuntaiDu Jul 24, 2024
d6ad9bd
fix syntax bug
KuntaiDu Jul 24, 2024
57dd656
temporarily lift up nccl buffer size for send and recv
KuntaiDu Jul 24, 2024
9379fbb
reduce nccl buffer size and see if bug fixed
KuntaiDu Jul 24, 2024
c23d841
fix
KuntaiDu Jul 24, 2024
7fc62b4
add debug info -- see which layer the prefill instance got stuck
KuntaiDu Jul 24, 2024
e542366
remove nccl debug -- it is too loud
KuntaiDu Jul 24, 2024
e9f7dc2
change buffer size only for disagg communicator
KuntaiDu Jul 24, 2024
18ded4c
disable nccl debug
KuntaiDu Jul 24, 2024
e814f82
use isend and irecv
KuntaiDu Jul 24, 2024
a3399b3
try to increase the buffer size
KuntaiDu Jul 24, 2024
5e18bd7
Merge branch 'main' into kuntai-disagg
KuntaiDu Jul 30, 2024
e4e60d9
bug fix, now disaggregated prefill should work as expected
KuntaiDu Jul 31, 2024
87fbfae
add proxy server
KuntaiDu Jul 31, 2024
fa664c0
startr slow -- using pp=1 and tp=1
KuntaiDu Aug 1, 2024
6bf7583
adjust the API
KuntaiDu Aug 1, 2024
6aad5cc
support batch size >1
KuntaiDu Aug 2, 2024
e934286
update model runner
KuntaiDu Aug 2, 2024
b68435a
move group coordinator to a separate file, move disagg implementation…
KuntaiDu Aug 4, 2024
e54f7a3
no need to send during attention
KuntaiDu Aug 4, 2024
23c9949
debug tp
KuntaiDu Aug 4, 2024
87cb78b
resolve conflicts
KuntaiDu Aug 4, 2024
06a526a
Fix several bugs: tensor device placement, misc performance optimizat…
KuntaiDu Aug 5, 2024
34e6bb3
remove useless comments
KuntaiDu Aug 5, 2024
55bf3bf
update disaggregated prefill example
KuntaiDu Aug 5, 2024
b525510
add disaggregated prefill overhead benchmark
KuntaiDu Aug 6, 2024
ee6a6ec
change disagg prefill proxy server to support non-streaming case
KuntaiDu Aug 7, 2024
f3cc91d
avoid detokenizing the first token in prefill instance -- for shorter…
KuntaiDu Aug 7, 2024
0582265
add failure test cases --- try switching to another machine
KuntaiDu Aug 7, 2024
89d4ca4
update
KuntaiDu Aug 7, 2024
9f4dba2
remove debugging information
KuntaiDu Aug 8, 2024
aa55883
avoid broadcast by finding seqlen inside the attn metadata
KuntaiDu Aug 9, 2024
95df023
update examples
KuntaiDu Aug 9, 2024
d92223a
support pipeline parallel
KuntaiDu Aug 9, 2024
a8c202c
update benchmark --- compare chunked prefill w.r.t. disagg prefill
KuntaiDu Aug 10, 2024
310f3a3
mute round_robin_proxy -- too loud
KuntaiDu Aug 10, 2024
118aab1
bug fix: racing conditions, and rare cases where input hash is not ca…
KuntaiDu Aug 10, 2024
96d38b4
add visualization script
KuntaiDu Aug 11, 2024
3fc0c5c
fix bug: when KV transfer fails, do not return hidden state
KuntaiDu Aug 11, 2024
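
Taken together, the commits above implement the following mechanism: the prefill and decode vLLM instances join an extra distributed group dedicated to disaggregated prefill, and once the prefill instance finishes a request it pushes each layer's KV cache to the decode instance over that group using non-blocking NCCL send/recv. The snippet below is only a rough sketch of that transfer step, with assumed function names and tensor layouts; it is not the API introduced by this PR.

import torch
import torch.distributed as dist
from typing import List

def send_kv_caches(kv_caches: List[torch.Tensor], dst_rank: int, group) -> None:
    # Prefill side: make each layer's KV cache contiguous before sending
    # (mirroring the "make data contiguous" commit above), post one
    # non-blocking send per layer, then wait for all sends to complete.
    bufs = [kv.contiguous() for kv in kv_caches]
    handles = [dist.isend(buf, dst=dst_rank, group=group) for buf in bufs]
    for handle in handles:
        handle.wait()

def recv_kv_caches(kv_caches: List[torch.Tensor], src_rank: int, group) -> None:
    # Decode side: receive each layer directly into its pre-allocated
    # KV cache tensor, then wait for all transfers to finish.
    handles = [dist.irecv(kv, src=src_rank, group=group) for kv in kv_caches]
    for handle in handles:
        handle.wait()

The file diffs below cover the benchmark scripts added alongside the core implementation.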
6 changes: 3 additions & 3 deletions benchmarks/benchmark_serving.py
@@ -125,9 +125,9 @@ def sample_sonnet_requests(
     prefix_len: int,
     tokenizer: PreTrainedTokenizerBase,
 ) -> List[Tuple[str, str, int, int]]:
-    assert (
-        input_len > prefix_len
-    ), "'args.sonnet-input-len' must be greater than 'args.prefix-input-len'."
+    assert input_len >= prefix_len, (
+        "'args.sonnet-input-len' must be greater than or equal to "
+        "'args.prefix-input-len'.")
 
     # Load the dataset.
     with open(dataset_path) as f:
48 changes: 48 additions & 0 deletions benchmarks/disagg_benchmarks/analyze_benchmark_results.py
@@ -0,0 +1,48 @@

import argparse
import json
import yaml
import os
from pathlib import Path

def load(path):

    with open(str(path), 'r') as f:
        return json.loads(f.read())

def main(args):

    results = Path(args.results_folder)

    chunk = load(results / "chunked_prefill_tp4.json")
    prefill = load(results / "disagg_prefill_tp4.json")
    decode = load(results / "disagg_decode_tp4.json")

    ttft_ratio = chunk["mean_ttft_ms"] / prefill["mean_ttft_ms"]
    itl_ratio = chunk["mean_itl_ms"] / decode["mean_itl_ms"]
    prefill_decode_ratio = prefill["mean_ttft_ms"] / (decode["mean_itl_ms"] * args.output_len)

    with open(results / args.output_file, 'a') as f:
        f.write(yaml.dump([{
            'qps': args.qps,
            'output_len': args.output_len,
            'prefill_decode_ratio': prefill_decode_ratio,
            'ttft_ratio': ttft_ratio,
            'itl_ratio': itl_ratio,
            "chunk_ttft": chunk["mean_ttft_ms"],
            "chunk_itl": chunk["mean_itl_ms"],
            "disagg_ttft": prefill["mean_ttft_ms"],
            "disagg_itl": decode["mean_itl_ms"]
        }]))


if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Analyze benchmark results")
    parser.add_argument("--results-folder", required=True, help="Path to the results folder")
    parser.add_argument("--output-len", type=int, required=True, help="Target output length")
    parser.add_argument("--qps", type=int, required=True, help="Target QPS")
    parser.add_argument("--output-file", type=str, default="chunk_vs_disagg.yaml")

    args = parser.parse_args()
    main(args)
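
For reference, a typical invocation of this analyzer (assuming ./results already contains the three JSON files named above) would be: python3 analyze_benchmark_results.py --results-folder ./results --qps 2 --output-len 150. Each run appends one YAML record with the TTFT and ITL ratios to chunk_vs_disagg.yaml (or the file passed via --output-file) inside the results folder.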

148 changes: 148 additions & 0 deletions benchmarks/disagg_benchmarks/disagg_overhead_benchmark.sh
@@ -0,0 +1,148 @@
#!/bin/bash

# Requirement: 8x H100 GPUs.


# Purpose: measure the overhead of the disaggregated prefill implementation
# (KV cache transfer plus bookkeeping).
# Model: meta-llama/Meta-Llama-3.1-70B-Instruct
# Query: 2048 input tokens, 1 output token, QPS 1, 50 requests (defaults below)
# Resource: 8x H100
# Method: launch one prefill instance (tp=4) and one decode instance (tp=4),
# then send the same requests first to the prefill instance and afterwards to
# the decode instance. Because the decode instance has already received the
# KV caches from the prefill instance, the TTFT of the second run approximates
# the overhead of the disaggregated prefill implementation.

set -ex

kill_gpu_processes() {
# kill all processes on GPU.
pkill pt_main_thread
sleep 10

# remove vllm config file
rm -rf ~/.config/vllm

# Print the GPU memory usage
# so that we know if all GPU processes are killed.
gpu_memory_usage=$(nvidia-smi --query-gpu=memory.used --format=csv,noheader,nounits -i 0)
# The memory usage should be 0 MB.
echo "GPU 0 Memory Usage: $gpu_memory_usage MB"
}

wait_for_server() {
# wait for vllm server to start
# return 1 if vllm server crashes
local port=$1
timeout 1200 bash -c "
until curl -s localhost:${port}/v1/completions > /dev/null; do
sleep 1
done" && return 0 || return 1
}


benchmark() {

export VLLM_LOGGING_LEVEL=DEBUG
export VLLM_HOST_IP=$(hostname -I | awk '{print $1}')
export VLLM_PORT=12345

# compare chunked prefill with disaggregated prefill

results_folder="./results"
model="meta-llama/Meta-Llama-3.1-70B-Instruct"
dataset_name="sonnet"
dataset_path="../sonnet_4x.txt"
num_prompts=50
qps=$1
prefix_len=50
input_len=2048
output_len=$2

# large model
VLLM_RPC_PORT=5570 VLLM_DISAGG_PREFILL_ROLE=prefill CUDA_VISIBLE_DEVICES=0,1,2,3 python3 \
-m vllm.entrypoints.openai.api_server \
--model $model \
--port 8100 \
-tp 4 \
--max-model-len 30000 \
--gpu-memory-utilization 0.8 &
VLLM_RPC_PORT=5580 VLLM_DISAGG_PREFILL_ROLE=decode CUDA_VISIBLE_DEVICES=4,5,6,7 python3 \
-m vllm.entrypoints.openai.api_server \
--model $model \
--port 8200 \
-tp 4 \
--max-model-len 30000 \
--gpu-memory-utilization 0.8 &

wait_for_server 8100
wait_for_server 8200

# let the prefill instance finish prefill
python3 ../benchmark_serving.py \
--backend vllm \
--model $model \
--dataset-name $dataset_name \
--dataset-path $dataset_path \
--sonnet-input-len $input_len \
--sonnet-output-len $output_len \
--sonnet-prefix-len $prefix_len \
--num-prompts $num_prompts \
--port 8100 \
--save-result \
--result-dir $results_folder \
--result-filename disagg_prefill_2xtp4.json \
--request-rate $qps


# send the request to decode.
# The TTFT of this command will be the overhead of disagg prefill impl.
python3 ../benchmark_serving.py \
--backend vllm \
--model $model \
--dataset-name $dataset_name \
--dataset-path $dataset_path \
--sonnet-input-len $input_len \
--sonnet-output-len $output_len \
--sonnet-prefix-len $prefix_len \
--num-prompts $num_prompts \
--port 8200 \
--save-result \
--result-dir $results_folder \
--result-filename disagg_prefill_2xtp4.json \
--request-rate $qps
kill_gpu_processes

}


main() {

(which wget && which curl) || (apt-get update && apt-get install -y wget curl)
(which jq) || (apt-get -y install jq)
(which socat) || (apt-get -y install socat)

pip install quart httpx

cd "$(dirname "$0")"

cd ..
# create sonnet_4x.txt
echo "" > sonnet_4x.txt
for _ in {1..4}
do
cat sonnet.txt >> sonnet_4x.txt
done
cd disagg_benchmarks

rm -rf results
mkdir results

default_qps=1
default_output_len=1
benchmark $default_qps $default_output_len

}


main "$@"
172 changes: 172 additions & 0 deletions benchmarks/disagg_benchmarks/disagg_performance_benchmark.sh
@@ -0,0 +1,172 @@
#!/bin/bash

# Requirement: 8x H100 GPUs.


# Model: meta-llama/Meta-Llama-3.1-70B-Instruct
# Query: 2048 input tokens, 150 output tokens, QPS 2/4/6/8, 400 requests
# Resource: 8x H100
# Approaches compared:
# 1. Chunked prefill: 2 vllm instances with tp=4, load-balanced by a
#    round-robin proxy
# 2. Disaggregated prefill: 1 prefill instance (tp=4) and 1 decode instance
#    (tp=4), served through disagg_prefill_proxy_server.py
# Both setups are benchmarked through a proxy listening on port 8000.

set -ex

kill_gpu_processes() {
# kill all processes on GPU.
pkill -f pt_main_thread
pkill -f python3
pkill -f round_robin_proxy.sh
ps -e | grep pt_main_thread | awk '{print $1}' | xargs kill -9
for port in 8000 8100 8200; do lsof -t -i:$port | xargs -r kill -9; done
sleep 1
}

wait_for_server() {
# wait for vllm server to start
# return 1 if vllm server crashes
local port=$1
timeout 1200 bash -c "
until curl -s localhost:${port}/v1/completions > /dev/null; do
sleep 1
done" && return 0 || return 1
}


launch_chunked_prefill() {
model="meta-llama/Meta-Llama-3.1-70B-Instruct"
# chunked prefill
VLLM_RPC_PORT=5570 CUDA_VISIBLE_DEVICES=0,1,2,3 python3 \
-m vllm.entrypoints.openai.api_server \
--model $model \
--port 8100 \
-tp 4 \
--max-model-len 30000 \
--disable-log-stats \
--disable-log-requests \
--enable-chunked-prefill \
--gpu-memory-utilization 0.8 &
VLLM_RPC_PORT=5580 CUDA_VISIBLE_DEVICES=4,5,6,7 python3 \
-m vllm.entrypoints.openai.api_server \
--model $model \
--port 8200 \
-tp 4 \
--max-model-len 30000 \
--disable-log-stats \
--disable-log-requests \
--enable-chunked-prefill \
--gpu-memory-utilization 0.8 &
wait_for_server 8100
wait_for_server 8200
bash round_robin_proxy.sh &
sleep 1
}


launch_disagg_prefill() {
model="meta-llama/Meta-Llama-3.1-70B-Instruct"
# disagg prefill
VLLM_PORT=12345 VLLM_RPC_PORT=5570 VLLM_DISAGG_PREFILL_ROLE=prefill CUDA_VISIBLE_DEVICES=0,1,2,3 python3 \
-m vllm.entrypoints.openai.api_server \
--model $model \
--port 8100 \
-tp 4 \
--max-model-len 30000 \
--disable-log-stats \
--disable-log-requests \
--gpu-memory-utilization 0.8 &
VLLM_PORT=12345 VLLM_RPC_PORT=5580 VLLM_DISAGG_PREFILL_ROLE=decode CUDA_VISIBLE_DEVICES=4,5,6,7 python3 \
-m vllm.entrypoints.openai.api_server \
--model $model \
--port 8200 \
-tp 4 \
--max-model-len 30000 \
--disable-log-stats \
--disable-log-requests \
--gpu-memory-utilization 0.8 &
wait_for_server 8100
wait_for_server 8200
python3 disagg_prefill_proxy_server.py &
sleep 1
}


benchmark() {
results_folder="./results"
model="meta-llama/Meta-Llama-3.1-70B-Instruct"
dataset_name="sonnet"
dataset_path="../sonnet_4x.txt"
num_prompts=400
qps=$1
prefix_len=50
input_len=2048
output_len=$2
tag=$3

python3 ../benchmark_serving.py \
--backend vllm \
--model $model \
--dataset-name $dataset_name \
--dataset-path $dataset_path \
--sonnet-input-len $input_len \
--sonnet-output-len $output_len \
--sonnet-prefix-len $prefix_len \
--num-prompts $num_prompts \
--port 8000 \
--save-result \
--result-dir $results_folder \
--result-filename $tag-qps-$qps.json \
--request-rate $qps

sleep 2

}


main() {

(which wget && which curl) || (apt-get update && apt-get install -y wget curl)
(which jq) || (apt-get -y install jq)
(which socat) || (apt-get -y install socat)

pip install quart httpx

cd "$(dirname "$0")"

cd ..
# create sonnet_4x.txt so that we can sample 2048 tokens for input
echo "" > sonnet_4x.txt
for _ in {1..4}
do
cat sonnet.txt >> sonnet_4x.txt
done
cd disagg_benchmarks

rm -rf results
mkdir results

default_qps=10
default_output_len=150

export VLLM_LOGGING_LEVEL=DEBUG
export VLLM_HOST_IP=$(hostname -I | awk '{print $1}')

launch_chunked_prefill
for qps in 2 4 6 8; do
benchmark $qps $default_output_len chunked_prefill
done
kill_gpu_processes

launch_disagg_prefill
for qps in 2 4 6 8; do
benchmark $qps $default_output_len disagg_prefill
done
kill_gpu_processes

}


main "$@"