Skip to content

Commit 6b26c58

Browse files
committed
Add torchao quantization param
1 parent e77e5df commit 6b26c58

File tree

3 files changed

+24
-0
lines changed

3 files changed

+24
-0
lines changed

vllm-benchmarks/benchmarks/cuda/latency-tests.json

Lines changed: 6 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -143,6 +143,7 @@
143143
"test_name": "latency_gemma3_12b_it_fp8_torchao",
144144
"parameters": {
145145
"model": "pytorch/gemma-3-12b-it-FP8",
146+
"quantization": "torchao",
146147
"load_format": "dummy",
147148
"num_iters_warmup": 5,
148149
"num_iters": 15
@@ -152,6 +153,7 @@
152153
"test_name": "latency_gemma3_12b_it_int4_torchao",
153154
"parameters": {
154155
"model": "pytorch/gemma-3-12b-it-INT4",
156+
"quantization": "torchao",
155157
"load_format": "dummy",
156158
"num_iters_warmup": 5,
157159
"num_iters": 15
@@ -161,6 +163,7 @@
161163
"test_name": "latency_gemma3_12b_it_awq_int4_torchao",
162164
"parameters": {
163165
"model": "pytorch/gemma-3-12b-it-AWQ-INT4",
166+
"quantization": "torchao",
164167
"load_format": "dummy",
165168
"num_iters_warmup": 5,
166169
"num_iters": 15
@@ -170,6 +173,7 @@
170173
"test_name": "latency_gemma3_27b_it_fp8_torchao",
171174
"parameters": {
172175
"model": "pytorch/gemma-3-27b-it-FP8",
176+
"quantization": "torchao",
173177
"load_format": "dummy",
174178
"num_iters_warmup": 5,
175179
"num_iters": 15
@@ -179,6 +183,7 @@
179183
"test_name": "latency_gemma3_27b_it_int4_torchao",
180184
"parameters": {
181185
"model": "pytorch/gemma-3-27b-it-INT4",
186+
"quantization": "torchao",
182187
"load_format": "dummy",
183188
"num_iters_warmup": 5,
184189
"num_iters": 15
@@ -188,6 +193,7 @@
188193
"test_name": "latency_gemma3_27b_it_awq_int4_torchao",
189194
"parameters": {
190195
"model": "pytorch/gemma-3-27b-it-AWQ-INT4",
196+
"quantization": "torchao",
191197
"load_format": "dummy",
192198
"num_iters_warmup": 5,
193199
"num_iters": 15

vllm-benchmarks/benchmarks/cuda/serving-tests.json

Lines changed: 12 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -558,6 +558,8 @@
558558
"qps_list": [1, 4, 16, "inf"],
559559
"server_parameters": {
560560
"model": "pytorch/gemma-3-12b-it-FP8",
561+
"tokenizer": "google/gemma-2-12b-it",
562+
"quantization": "torchao",
561563
"tensor_parallel_size": 1,
562564
"swap_space": 16,
563565
"disable_log_stats": "",
@@ -578,6 +580,8 @@
578580
"qps_list": [1, 4, 16, "inf"],
579581
"server_parameters": {
580582
"model": "pytorch/gemma-3-12b-it-INT4",
583+
"tokenizer": "google/gemma-2-12b-it",
584+
"quantization": "torchao",
581585
"tensor_parallel_size": 1,
582586
"swap_space": 16,
583587
"disable_log_stats": "",
@@ -598,6 +602,8 @@
598602
"qps_list": [1, 4, 16, "inf"],
599603
"server_parameters": {
600604
"model": "pytorch/gemma-3-12b-it-AWQ-INT4",
605+
"tokenizer": "google/gemma-2-12b-it",
606+
"quantization": "torchao",
601607
"tensor_parallel_size": 1,
602608
"swap_space": 16,
603609
"disable_log_stats": "",
@@ -618,6 +624,8 @@
618624
"qps_list": [1, 4, 16, "inf"],
619625
"server_parameters": {
620626
"model": "pytorch/gemma-3-27b-it-FP8",
627+
"tokenizer": "google/gemma-2-27b-it",
628+
"quantization": "torchao",
621629
"tensor_parallel_size": 1,
622630
"swap_space": 16,
623631
"disable_log_stats": "",
@@ -638,6 +646,8 @@
638646
"qps_list": [1, 4, 16, "inf"],
639647
"server_parameters": {
640648
"model": "pytorch/gemma-3-27b-it-INT4",
649+
"tokenizer": "google/gemma-2-27b-it",
650+
"quantization": "torchao",
641651
"tensor_parallel_size": 1,
642652
"swap_space": 16,
643653
"disable_log_stats": "",
@@ -658,6 +668,8 @@
658668
"qps_list": [1, 4, 16, "inf"],
659669
"server_parameters": {
660670
"model": "pytorch/gemma-3-27b-it-AWQ-INT4",
671+
"tokenizer": "google/gemma-2-27b-it",
672+
"quantization": "torchao",
661673
"tensor_parallel_size": 1,
662674
"swap_space": 16,
663675
"disable_log_stats": "",

vllm-benchmarks/benchmarks/cuda/throughput-tests.json

Lines changed: 6 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -156,6 +156,7 @@
156156
"test_name": "throughput_gemma3_12b_it_fp8_torchao",
157157
"parameters": {
158158
"model": "pytorch/gemma-3-12b-it-FP8",
159+
"quantization": "torchao",
159160
"load_format": "dummy",
160161
"dataset": "./ShareGPT_V3_unfiltered_cleaned_split.json",
161162
"num_prompts": 200,
@@ -166,6 +167,7 @@
166167
"test_name": "throughput_gemma3_12b_it_int4_torchao",
167168
"parameters": {
168169
"model": "pytorch/gemma-3-12b-it-INT4",
170+
"quantization": "torchao",
169171
"load_format": "dummy",
170172
"dataset": "./ShareGPT_V3_unfiltered_cleaned_split.json",
171173
"num_prompts": 200,
@@ -176,6 +178,7 @@
176178
"test_name": "throughput_gemma3_12b_it_awq_int4_torchao",
177179
"parameters": {
178180
"model": "pytorch/gemma-3-12b-it-AWQ-INT4",
181+
"quantization": "torchao",
179182
"load_format": "dummy",
180183
"dataset": "./ShareGPT_V3_unfiltered_cleaned_split.json",
181184
"num_prompts": 200,
@@ -186,6 +189,7 @@
186189
"test_name": "throughput_gemma3_27b_it_fp8_torchao",
187190
"parameters": {
188191
"model": "pytorch/gemma-3-27b-it-FP8",
192+
"quantization": "torchao",
189193
"load_format": "dummy",
190194
"dataset": "./ShareGPT_V3_unfiltered_cleaned_split.json",
191195
"num_prompts": 200,
@@ -196,6 +200,7 @@
196200
"test_name": "throughput_gemma3_27b_it_int4_torchao",
197201
"parameters": {
198202
"model": "pytorch/gemma-3-27b-it-INT4",
203+
"quantization": "torchao",
199204
"load_format": "dummy",
200205
"dataset": "./ShareGPT_V3_unfiltered_cleaned_split.json",
201206
"num_prompts": 200,
@@ -206,6 +211,7 @@
206211
"test_name": "throughput_gemma3_27b_it_awq_int4_torchao",
207212
"parameters": {
208213
"model": "pytorch/gemma-3-27b-it-AWQ-INT4",
214+
"quantization": "torchao",
209215
"load_format": "dummy",
210216
"dataset": "./ShareGPT_V3_unfiltered_cleaned_split.json",
211217
"num_prompts": 200,

0 commit comments

Comments (0)