Skip to content

Commit e77e5df

Browse files
committed
Install torchao
1 parent 2bc5a58 commit e77e5df

File tree

4 files changed

+189
-6
lines changed

4 files changed

+189
-6
lines changed

.github/workflows/vllm-benchmark.yml

Lines changed: 3 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -292,6 +292,9 @@ jobs:
292292
-w /tmp/workspace \
293293
"${DOCKER_IMAGE}"
294294
)
295+
if [[ "${DEVICE_NAME}" == "cuda" ]]; then
296+
docker exec -t "${container_name}" bash -c "uv pip install --system --pre torchao --index-url https://download.pytorch.org/whl/nightly/cu128"
297+
fi
295298
docker exec -t "${container_name}" bash -c "cd vllm-benchmarks/vllm && bash .buildkite/nightly-benchmarks/scripts/run-performance-benchmarks.sh"
296299
297300
- name: Authenticate with AWS

vllm-benchmarks/benchmarks/cuda/latency-tests.json

Lines changed: 6 additions & 6 deletions
Original file line number | Diff line number | Diff line change
@@ -140,7 +140,7 @@
140140
}
141141
},
142142
{
143-
"test_name": "latency_gemma3_12b_it_fp8",
143+
"test_name": "latency_gemma3_12b_it_fp8_torchao",
144144
"parameters": {
145145
"model": "pytorch/gemma-3-12b-it-FP8",
146146
"load_format": "dummy",
@@ -149,7 +149,7 @@
149149
}
150150
},
151151
{
152-
"test_name": "latency_gemma3_12b_it_int4",
152+
"test_name": "latency_gemma3_12b_it_int4_torchao",
153153
"parameters": {
154154
"model": "pytorch/gemma-3-12b-it-INT4",
155155
"load_format": "dummy",
@@ -158,7 +158,7 @@
158158
}
159159
},
160160
{
161-
"test_name": "latency_gemma3_12b_it_awq_int4",
161+
"test_name": "latency_gemma3_12b_it_awq_int4_torchao",
162162
"parameters": {
163163
"model": "pytorch/gemma-3-12b-it-AWQ-INT4",
164164
"load_format": "dummy",
@@ -167,7 +167,7 @@
167167
}
168168
},
169169
{
170-
"test_name": "latency_gemma3_27b_it_fp8",
170+
"test_name": "latency_gemma3_27b_it_fp8_torchao",
171171
"parameters": {
172172
"model": "pytorch/gemma-3-27b-it-FP8",
173173
"load_format": "dummy",
@@ -176,7 +176,7 @@
176176
}
177177
},
178178
{
179-
"test_name": "latency_gemma3_27b_it_int4",
179+
"test_name": "latency_gemma3_27b_it_int4_torchao",
180180
"parameters": {
181181
"model": "pytorch/gemma-3-27b-it-INT4",
182182
"load_format": "dummy",
@@ -185,7 +185,7 @@
185185
}
186186
},
187187
{
188-
"test_name": "latency_gemma3_27b_it_awq_int4",
188+
"test_name": "latency_gemma3_27b_it_awq_int4_torchao",
189189
"parameters": {
190190
"model": "pytorch/gemma-3-27b-it-AWQ-INT4",
191191
"load_format": "dummy",

vllm-benchmarks/benchmarks/cuda/serving-tests.json

Lines changed: 120 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -552,5 +552,125 @@
552552
"random_input_len": 5250,
553553
"random_output_len": 8250
554554
}
555+
},
556+
{
557+
"test_name": "serving_gemma3_12b_it_fp8_torchao",
558+
"qps_list": [1, 4, 16, "inf"],
559+
"server_parameters": {
560+
"model": "pytorch/gemma-3-12b-it-FP8",
561+
"tensor_parallel_size": 1,
562+
"swap_space": 16,
563+
"disable_log_stats": "",
564+
"disable_log_requests": "",
565+
"load_format": "dummy"
566+
},
567+
"client_parameters": {
568+
"model": "pytorch/gemma-3-12b-it-FP8",
569+
"backend": "vllm",
570+
"dataset_name": "random",
571+
"num_prompts": 200,
572+
"random_input_len": 1024,
573+
"random_output_len": 2048
574+
}
575+
},
576+
{
577+
"test_name": "serving_gemma3_12b_it_int4_torchao",
578+
"qps_list": [1, 4, 16, "inf"],
579+
"server_parameters": {
580+
"model": "pytorch/gemma-3-12b-it-INT4",
581+
"tensor_parallel_size": 1,
582+
"swap_space": 16,
583+
"disable_log_stats": "",
584+
"disable_log_requests": "",
585+
"load_format": "dummy"
586+
},
587+
"client_parameters": {
588+
"model": "pytorch/gemma-3-12b-it-INT4",
589+
"backend": "vllm",
590+
"dataset_name": "random",
591+
"num_prompts": 200,
592+
"random_input_len": 1024,
593+
"random_output_len": 2048
594+
}
595+
},
596+
{
597+
"test_name": "serving_gemma3_12b_it_awq_int4_torchao",
598+
"qps_list": [1, 4, 16, "inf"],
599+
"server_parameters": {
600+
"model": "pytorch/gemma-3-12b-it-AWQ-INT4",
601+
"tensor_parallel_size": 1,
602+
"swap_space": 16,
603+
"disable_log_stats": "",
604+
"disable_log_requests": "",
605+
"load_format": "dummy"
606+
},
607+
"client_parameters": {
608+
"model": "pytorch/gemma-3-12b-it-AWQ-INT4",
609+
"backend": "vllm",
610+
"dataset_name": "random",
611+
"num_prompts": 200,
612+
"random_input_len": 1024,
613+
"random_output_len": 2048
614+
}
615+
},
616+
{
617+
"test_name": "serving_gemma3_27b_it_fp8_torchao",
618+
"qps_list": [1, 4, 16, "inf"],
619+
"server_parameters": {
620+
"model": "pytorch/gemma-3-27b-it-FP8",
621+
"tensor_parallel_size": 1,
622+
"swap_space": 16,
623+
"disable_log_stats": "",
624+
"disable_log_requests": "",
625+
"load_format": "dummy"
626+
},
627+
"client_parameters": {
628+
"model": "pytorch/gemma-3-27b-it-FP8",
629+
"backend": "vllm",
630+
"dataset_name": "random",
631+
"num_prompts": 200,
632+
"random_input_len": 1024,
633+
"random_output_len": 2048
634+
}
635+
},
636+
{
637+
"test_name": "serving_gemma3_27b_it_int4_torchao",
638+
"qps_list": [1, 4, 16, "inf"],
639+
"server_parameters": {
640+
"model": "pytorch/gemma-3-27b-it-INT4",
641+
"tensor_parallel_size": 1,
642+
"swap_space": 16,
643+
"disable_log_stats": "",
644+
"disable_log_requests": "",
645+
"load_format": "dummy"
646+
},
647+
"client_parameters": {
648+
"model": "pytorch/gemma-3-27b-it-INT4",
649+
"backend": "vllm",
650+
"dataset_name": "random",
651+
"num_prompts": 200,
652+
"random_input_len": 1024,
653+
"random_output_len": 2048
654+
}
655+
},
656+
{
657+
"test_name": "serving_gemma3_27b_it_awq_int4_torchao",
658+
"qps_list": [1, 4, 16, "inf"],
659+
"server_parameters": {
660+
"model": "pytorch/gemma-3-27b-it-AWQ-INT4",
661+
"tensor_parallel_size": 1,
662+
"swap_space": 16,
663+
"disable_log_stats": "",
664+
"disable_log_requests": "",
665+
"load_format": "dummy"
666+
},
667+
"client_parameters": {
668+
"model": "pytorch/gemma-3-27b-it-AWQ-INT4",
669+
"backend": "vllm",
670+
"dataset_name": "random",
671+
"num_prompts": 200,
672+
"random_input_len": 1024,
673+
"random_output_len": 2048
674+
}
555675
}
556676
]

vllm-benchmarks/benchmarks/cuda/throughput-tests.json

Lines changed: 60 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -151,5 +151,65 @@
151151
"backend": "vllm",
152152
"max_model_len": 8192
153153
}
154+
},
155+
{
156+
"test_name": "throughput_gemma3_12b_it_fp8_torchao",
157+
"parameters": {
158+
"model": "pytorch/gemma-3-12b-it-FP8",
159+
"load_format": "dummy",
160+
"dataset": "./ShareGPT_V3_unfiltered_cleaned_split.json",
161+
"num_prompts": 200,
162+
"backend": "vllm"
163+
}
164+
},
165+
{
166+
"test_name": "throughput_gemma3_12b_it_int4_torchao",
167+
"parameters": {
168+
"model": "pytorch/gemma-3-12b-it-INT4",
169+
"load_format": "dummy",
170+
"dataset": "./ShareGPT_V3_unfiltered_cleaned_split.json",
171+
"num_prompts": 200,
172+
"backend": "vllm"
173+
}
174+
},
175+
{
176+
"test_name": "throughput_gemma3_12b_it_awq_int4_torchao",
177+
"parameters": {
178+
"model": "pytorch/gemma-3-12b-it-AWQ-INT4",
179+
"load_format": "dummy",
180+
"dataset": "./ShareGPT_V3_unfiltered_cleaned_split.json",
181+
"num_prompts": 200,
182+
"backend": "vllm"
183+
}
184+
},
185+
{
186+
"test_name": "throughput_gemma3_27b_it_fp8_torchao",
187+
"parameters": {
188+
"model": "pytorch/gemma-3-27b-it-FP8",
189+
"load_format": "dummy",
190+
"dataset": "./ShareGPT_V3_unfiltered_cleaned_split.json",
191+
"num_prompts": 200,
192+
"backend": "vllm"
193+
}
194+
},
195+
{
196+
"test_name": "throughput_gemma3_27b_it_int4_torchao",
197+
"parameters": {
198+
"model": "pytorch/gemma-3-27b-it-INT4",
199+
"load_format": "dummy",
200+
"dataset": "./ShareGPT_V3_unfiltered_cleaned_split.json",
201+
"num_prompts": 200,
202+
"backend": "vllm"
203+
}
204+
},
205+
{
206+
"test_name": "throughput_gemma3_27b_it_awq_int4_torchao",
207+
"parameters": {
208+
"model": "pytorch/gemma-3-27b-it-AWQ-INT4",
209+
"load_format": "dummy",
210+
"dataset": "./ShareGPT_V3_unfiltered_cleaned_split.json",
211+
"num_prompts": 200,
212+
"backend": "vllm"
213+
}
154214
}
155215
]

0 commit comments

Comments (0)