Skip to content

Commit 6b26c58

Browse files
committed
Add torchao quantization param
1 parent e77e5df commit 6b26c58

File tree

3 files changed

+24
-0
lines changed

3 files changed

+24
-0
lines changed

vllm-benchmarks/benchmarks/cuda/latency-tests.json

Lines changed: 6 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -143,6 +143,7 @@
143143
"test_name": "latency_gemma3_12b_it_fp8_torchao",
144144
"parameters": {
145145
"model": "pytorch/gemma-3-12b-it-FP8",
146+
"quantization": "torchao",
146147
"load_format": "dummy",
147148
"num_iters_warmup": 5,
148149
"num_iters": 15
@@ -152,6 +153,7 @@
152153
"test_name": "latency_gemma3_12b_it_int4_torchao",
153154
"parameters": {
154155
"model": "pytorch/gemma-3-12b-it-INT4",
156+
"quantization": "torchao",
155157
"load_format": "dummy",
156158
"num_iters_warmup": 5,
157159
"num_iters": 15
@@ -161,6 +163,7 @@
161163
"test_name": "latency_gemma3_12b_it_awq_int4_torchao",
162164
"parameters": {
163165
"model": "pytorch/gemma-3-12b-it-AWQ-INT4",
166+
"quantization": "torchao",
164167
"load_format": "dummy",
165168
"num_iters_warmup": 5,
166169
"num_iters": 15
@@ -170,6 +173,7 @@
170173
"test_name": "latency_gemma3_27b_it_fp8_torchao",
171174
"parameters": {
172175
"model": "pytorch/gemma-3-27b-it-FP8",
176+
"quantization": "torchao",
173177
"load_format": "dummy",
174178
"num_iters_warmup": 5,
175179
"num_iters": 15
@@ -179,6 +183,7 @@
179183
"test_name": "latency_gemma3_27b_it_int4_torchao",
180184
"parameters": {
181185
"model": "pytorch/gemma-3-27b-it-INT4",
186+
"quantization": "torchao",
182187
"load_format": "dummy",
183188
"num_iters_warmup": 5,
184189
"num_iters": 15
@@ -188,6 +193,7 @@
188193
"test_name": "latency_gemma3_27b_it_awq_int4_torchao",
189194
"parameters": {
190195
"model": "pytorch/gemma-3-27b-it-AWQ-INT4",
196+
"quantization": "torchao",
191197
"load_format": "dummy",
192198
"num_iters_warmup": 5,
193199
"num_iters": 15

vllm-benchmarks/benchmarks/cuda/serving-tests.json

Lines changed: 12 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -558,6 +558,8 @@
558558
"qps_list": [1, 4, 16, "inf"],
559559
"server_parameters": {
560560
"model": "pytorch/gemma-3-12b-it-FP8",
561+
"tokenizer": "google/gemma-2-12b-it",
562+
"quantization": "torchao",
561563
"tensor_parallel_size": 1,
562564
"swap_space": 16,
563565
"disable_log_stats": "",
@@ -578,6 +580,8 @@
578580
"qps_list": [1, 4, 16, "inf"],
579581
"server_parameters": {
580582
"model": "pytorch/gemma-3-12b-it-INT4",
583+
"tokenizer": "google/gemma-2-12b-it",
584+
"quantization": "torchao",
581585
"tensor_parallel_size": 1,
582586
"swap_space": 16,
583587
"disable_log_stats": "",
@@ -598,6 +602,8 @@
598602
"qps_list": [1, 4, 16, "inf"],
599603
"server_parameters": {
600604
"model": "pytorch/gemma-3-12b-it-AWQ-INT4",
605+
"tokenizer": "google/gemma-2-12b-it",
606+
"quantization": "torchao",
601607
"tensor_parallel_size": 1,
602608
"swap_space": 16,
603609
"disable_log_stats": "",
@@ -618,6 +624,8 @@
618624
"qps_list": [1, 4, 16, "inf"],
619625
"server_parameters": {
620626
"model": "pytorch/gemma-3-27b-it-FP8",
627+
"tokenizer": "google/gemma-2-27b-it",
628+
"quantization": "torchao",
621629
"tensor_parallel_size": 1,
622630
"swap_space": 16,
623631
"disable_log_stats": "",
@@ -638,6 +646,8 @@
638646
"qps_list": [1, 4, 16, "inf"],
639647
"server_parameters": {
640648
"model": "pytorch/gemma-3-27b-it-INT4",
649+
"tokenizer": "google/gemma-2-27b-it",
650+
"quantization": "torchao",
641651
"tensor_parallel_size": 1,
642652
"swap_space": 16,
643653
"disable_log_stats": "",
@@ -658,6 +668,8 @@
658668
"qps_list": [1, 4, 16, "inf"],
659669
"server_parameters": {
660670
"model": "pytorch/gemma-3-27b-it-AWQ-INT4",
671+
"tokenizer": "google/gemma-2-27b-it",
672+
"quantization": "torchao",
661673
"tensor_parallel_size": 1,
662674
"swap_space": 16,
663675
"disable_log_stats": "",

vllm-benchmarks/benchmarks/cuda/throughput-tests.json

Lines changed: 6 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -156,6 +156,7 @@
156156
"test_name": "throughput_gemma3_12b_it_fp8_torchao",
157157
"parameters": {
158158
"model": "pytorch/gemma-3-12b-it-FP8",
159+
"quantization": "torchao",
159160
"load_format": "dummy",
160161
"dataset": "./ShareGPT_V3_unfiltered_cleaned_split.json",
161162
"num_prompts": 200,
@@ -166,6 +167,7 @@
166167
"test_name": "throughput_gemma3_12b_it_int4_torchao",
167168
"parameters": {
168169
"model": "pytorch/gemma-3-12b-it-INT4",
170+
"quantization": "torchao",
169171
"load_format": "dummy",
170172
"dataset": "./ShareGPT_V3_unfiltered_cleaned_split.json",
171173
"num_prompts": 200,
@@ -176,6 +178,7 @@
176178
"test_name": "throughput_gemma3_12b_it_awq_int4_torchao",
177179
"parameters": {
178180
"model": "pytorch/gemma-3-12b-it-AWQ-INT4",
181+
"quantization": "torchao",
179182
"load_format": "dummy",
180183
"dataset": "./ShareGPT_V3_unfiltered_cleaned_split.json",
181184
"num_prompts": 200,
@@ -186,6 +189,7 @@
186189
"test_name": "throughput_gemma3_27b_it_fp8_torchao",
187190
"parameters": {
188191
"model": "pytorch/gemma-3-27b-it-FP8",
192+
"quantization": "torchao",
189193
"load_format": "dummy",
190194
"dataset": "./ShareGPT_V3_unfiltered_cleaned_split.json",
191195
"num_prompts": 200,
@@ -196,6 +200,7 @@
196200
"test_name": "throughput_gemma3_27b_it_int4_torchao",
197201
"parameters": {
198202
"model": "pytorch/gemma-3-27b-it-INT4",
203+
"quantization": "torchao",
199204
"load_format": "dummy",
200205
"dataset": "./ShareGPT_V3_unfiltered_cleaned_split.json",
201206
"num_prompts": 200,
@@ -206,6 +211,7 @@
206211
"test_name": "throughput_gemma3_27b_it_awq_int4_torchao",
207212
"parameters": {
208213
"model": "pytorch/gemma-3-27b-it-AWQ-INT4",
214+
"quantization": "torchao",
209215
"load_format": "dummy",
210216
"dataset": "./ShareGPT_V3_unfiltered_cleaned_split.json",
211217
"num_prompts": 200,

0 commit comments

Comments (0)