From 4ad8545839f3f5adc544baa8f62c56f5509540ba Mon Sep 17 00:00:00 2001
From: Huy Do
Date: Wed, 1 Oct 2025 16:22:46 -0700
Subject: [PATCH] Run latency and throughput benchmarks for Qwen3 and Gemma3

Signed-off-by: Huy Do
---
 .github/workflows/vllm-benchmark.yml          |  4 ++--
 .../benchmarks/cuda/latency-tests.json        | 22 +++++++++++++++++
 .../benchmarks/cuda/throughput-tests.json     | 24 +++++++++++++++++++
 3 files changed, 48 insertions(+), 2 deletions(-)

diff --git a/.github/workflows/vllm-benchmark.yml b/.github/workflows/vllm-benchmark.yml
index 32098f64..d4202ed3 100644
--- a/.github/workflows/vllm-benchmark.yml
+++ b/.github/workflows/vllm-benchmark.yml
@@ -2,8 +2,8 @@ name: vLLM Benchmark
 
 on:
   schedule:
-    # Run every 6 hours
-    - cron: '0 */6 * * *'
+    # Run every 12 hours
+    - cron: '0 */12 * * *'
   workflow_dispatch:
     inputs:
       vllm_branch:
diff --git a/vllm-benchmarks/benchmarks/cuda/latency-tests.json b/vllm-benchmarks/benchmarks/cuda/latency-tests.json
index 47f021a4..719b4339 100644
--- a/vllm-benchmarks/benchmarks/cuda/latency-tests.json
+++ b/vllm-benchmarks/benchmarks/cuda/latency-tests.json
@@ -105,5 +105,27 @@
             "num_iters": 15,
             "max_model_len": 8192
         }
+    },
+    {
+        "test_name": "latency_gemma_3_27b_it_tp8",
+        "parameters": {
+            "model": "google/gemma-3-27b-it",
+            "tensor_parallel_size": 8,
+            "load_format": "dummy",
+            "num_iters_warmup": 5,
+            "num_iters": 15,
+            "max_model_len": 8192
+        }
+    },
+    {
+        "test_name": "latency_qwen3_30b_a3b_tp8",
+        "parameters": {
+            "model": "Qwen/Qwen3-30B-A3B",
+            "tensor_parallel_size": 8,
+            "load_format": "dummy",
+            "num_iters_warmup": 5,
+            "num_iters": 15,
+            "max_model_len": 8192
+        }
     }
 ]
diff --git a/vllm-benchmarks/benchmarks/cuda/throughput-tests.json b/vllm-benchmarks/benchmarks/cuda/throughput-tests.json
index 0b8c7cf8..9ff9cdad 100644
--- a/vllm-benchmarks/benchmarks/cuda/throughput-tests.json
+++ b/vllm-benchmarks/benchmarks/cuda/throughput-tests.json
@@ -115,5 +115,29 @@
             "backend": "vllm",
             "max_model_len": 8192
         }
+    },
+    {
+        "test_name": "throughput_gemma_3_27b_it_tp8",
+        "parameters": {
+            "model": "google/gemma-3-27b-it",
+            "tensor_parallel_size": 8,
+            "load_format": "dummy",
+            "dataset": "./ShareGPT_V3_unfiltered_cleaned_split.json",
+            "num_prompts": 200,
+            "backend": "vllm",
+            "max_model_len": 8192
+        }
+    },
+    {
+        "test_name": "throughput_qwen3_30b_a3b_tp8",
+        "parameters": {
+            "model": "Qwen/Qwen3-30B-A3B",
+            "tensor_parallel_size": 8,
+            "load_format": "dummy",
+            "dataset": "./ShareGPT_V3_unfiltered_cleaned_split.json",
+            "num_prompts": 200,
+            "backend": "vllm",
+            "max_model_len": 8192
+        }
     }
 ]