From 930c65bb4a0b04e6e2810d4b6265ab83bac59c34 Mon Sep 17 00:00:00 2001
From: padg9912
Date: Tue, 30 Sep 2025 04:04:16 -0400
Subject: [PATCH 1/5] Add INT8 check for Blackwell GPUs

Blackwell architecture (SM100+) doesn't support INT8 quantization.
Added validation in can_implement() to catch this early with a clear
error message directing users to FP8 or older GPU architectures.

Also updated error message in scaled_mm_helper.hpp for consistency
and added docs warning about the limitation.

Fixes #20221

Signed-off-by: padg9912
---
 .../quantization/cutlass_w8a8/c3x/scaled_mm_helper.hpp |  5 ++++-
 docs/features/quantization/int8.md                     |  6 +++++-
 .../layers/quantization/kernels/scaled_mm/cutlass.py   | 10 ++++++++++
 3 files changed, 19 insertions(+), 2 deletions(-)

diff --git a/csrc/quantization/cutlass_w8a8/c3x/scaled_mm_helper.hpp b/csrc/quantization/cutlass_w8a8/c3x/scaled_mm_helper.hpp
index 3af59267bd60..86a4ff3a5179 100644
--- a/csrc/quantization/cutlass_w8a8/c3x/scaled_mm_helper.hpp
+++ b/csrc/quantization/cutlass_w8a8/c3x/scaled_mm_helper.hpp
@@ -25,7 +25,10 @@ void dispatch_scaled_mm(torch::Tensor& c, torch::Tensor const& a,
       if constexpr (!std::is_same_v<Int8Func, std::nullptr_t>) {
         int8_func(c, a, b, a_scales, b_scales, bias);
       } else {
-        TORCH_CHECK(false, "Int8 not supported for this architecture");
+        int32_t version_num = get_sm_version_num();
+        TORCH_CHECK(false,
+            "Int8 not supported on SM", version_num,
+            ". Use FP8 quantization instead, or run on older arch (SM < 100).");
       }
     }
   } else {
diff --git a/docs/features/quantization/int8.md b/docs/features/quantization/int8.md
index 247d0cbdd3f1..1acf91e0b50c 100644
--- a/docs/features/quantization/int8.md
+++ b/docs/features/quantization/int8.md
@@ -6,7 +6,11 @@ This quantization method is particularly useful for reducing model size while ma
 Please visit the HF collection of [quantized INT8 checkpoints of popular LLMs ready to use with vLLM](https://huggingface.co/collections/neuralmagic/int8-llms-for-vllm-668ec32c049dca0369816415).

 !!! note
-    INT8 computation is supported on NVIDIA GPUs with compute capability > 7.5 (Turing, Ampere, Ada Lovelace, Hopper, Blackwell).
+    INT8 computation is supported on NVIDIA GPUs with compute capability > 7.5 (Turing, Ampere, Ada Lovelace, Hopper).
+
+!!! warning
+    **Blackwell GPU Limitation**: INT8 is not supported on compute capability >= 100 (e.g., RTX 6000 Blackwell). 
+    Use [FP8 quantization](fp8.md) instead, or run on Hopper/Ada/Ampere architectures.

 ## Prerequisites
diff --git a/vllm/model_executor/layers/quantization/kernels/scaled_mm/cutlass.py b/vllm/model_executor/layers/quantization/kernels/scaled_mm/cutlass.py
index 2f982f96b0d0..727e86aa60f9 100644
--- a/vllm/model_executor/layers/quantization/kernels/scaled_mm/cutlass.py
+++ b/vllm/model_executor/layers/quantization/kernels/scaled_mm/cutlass.py
@@ -28,6 +28,16 @@ def can_implement(
         if not current_platform.is_cuda():
             return False, "CutlassScaledMM requires running on CUDA."

+        # Blackwell doesn't support INT8
+        capability = current_platform.get_device_capability()
+        if capability is not None:
+            major, _ = capability
+            compute_cap = major * 10 + (_ if _ < 10 else 0)
+            if compute_cap >= 100 and c.weight_dtype == torch.int8:
+                return False, (
+                    f"INT8 not supported on SM{compute_cap}. "
+                    f"Use FP8 quantization or older GPU architecture.")
+
         return True, None

     def process_weights_after_loading(self, layer: torch.nn.Module) -> None:

From 7bfb572477891a3888feb4c6c91ffe3037fd28db Mon Sep 17 00:00:00 2001
From: padg9912
Date: Tue, 30 Sep 2025 04:15:50 -0400
Subject: [PATCH 2/5] Use capability.to_int() for consistency

Use the existing to_int() method instead of manually calculating
compute capability. More consistent with rest of codebase.

Signed-off-by: padg9912
---
 .../layers/quantization/kernels/scaled_mm/cutlass.py | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/vllm/model_executor/layers/quantization/kernels/scaled_mm/cutlass.py b/vllm/model_executor/layers/quantization/kernels/scaled_mm/cutlass.py
index 727e86aa60f9..a6cbd3695c77 100644
--- a/vllm/model_executor/layers/quantization/kernels/scaled_mm/cutlass.py
+++ b/vllm/model_executor/layers/quantization/kernels/scaled_mm/cutlass.py
@@ -31,8 +31,7 @@ def can_implement(
         # Blackwell doesn't support INT8
         capability = current_platform.get_device_capability()
         if capability is not None:
-            major, _ = capability
-            compute_cap = major * 10 + (_ if _ < 10 else 0)
+            compute_cap = capability.to_int()
             if compute_cap >= 100 and c.weight_dtype == torch.int8:
                 return False, (
                     f"INT8 not supported on SM{compute_cap}. "
                     f"Use FP8 quantization or older GPU architecture.")

From daaec73b2c93aaa503c7171b1cbbeed38fbf9628 Mon Sep 17 00:00:00 2001
From: padg9912
Date: Tue, 30 Sep 2025 04:19:18 -0400
Subject: [PATCH 3/5] Fix pre-commit issues

- Remove weight_dtype check (attribute doesn't exist in config)
- Fix markdown trailing space
- Error handling is done at kernel level instead

Signed-off-by: padg9912
---
 docs/features/quantization/int8.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docs/features/quantization/int8.md b/docs/features/quantization/int8.md
index 1acf91e0b50c..af3650e701ad 100644
--- a/docs/features/quantization/int8.md
+++ b/docs/features/quantization/int8.md
@@ -9,7 +9,7 @@ Please visit the HF collection of [quantized INT8 checkpoints of popular LLMs re
     INT8 computation is supported on NVIDIA GPUs with compute capability > 7.5 (Turing, Ampere, Ada Lovelace, Hopper).

 !!! warning
-    **Blackwell GPU Limitation**: INT8 is not supported on compute capability >= 100 (e.g., RTX 6000 Blackwell). 
+    **Blackwell GPU Limitation**: INT8 is not supported on compute capability >= 100 (e.g., RTX 6000 Blackwell).
     Use [FP8 quantization](fp8.md) instead, or run on Hopper/Ada/Ampere architectures.

 ## Prerequisites

From 151ea664de02baa5aaf584895f23243d7e9a127e Mon Sep 17 00:00:00 2001
From: padg9912
Date: Tue, 30 Sep 2025 04:27:31 -0400
Subject: [PATCH 4/5] Remove invalid weight_dtype check

Config doesn't have weight_dtype attribute. Error checking is properly
handled at kernel level in scaled_mm_helper.hpp

Signed-off-by: padg9912
---
 .../layers/quantization/kernels/scaled_mm/cutlass.py | 9 ---------
 1 file changed, 9 deletions(-)

diff --git a/vllm/model_executor/layers/quantization/kernels/scaled_mm/cutlass.py b/vllm/model_executor/layers/quantization/kernels/scaled_mm/cutlass.py
index a6cbd3695c77..2f982f96b0d0 100644
--- a/vllm/model_executor/layers/quantization/kernels/scaled_mm/cutlass.py
+++ b/vllm/model_executor/layers/quantization/kernels/scaled_mm/cutlass.py
@@ -28,15 +28,6 @@ def can_implement(
         if not current_platform.is_cuda():
             return False, "CutlassScaledMM requires running on CUDA."

-        # Blackwell doesn't support INT8
-        capability = current_platform.get_device_capability()
-        if capability is not None:
-            compute_cap = capability.to_int()
-            if compute_cap >= 100 and c.weight_dtype == torch.int8:
-                return False, (
-                    f"INT8 not supported on SM{compute_cap}. "
-                    f"Use FP8 quantization or older GPU architecture.")
-
         return True, None

     def process_weights_after_loading(self, layer: torch.nn.Module) -> None:

From dd0a7b008ba76d759a88b2d61d10f49dea2b4539 Mon Sep 17 00:00:00 2001
From: padg9912
Date: Tue, 30 Sep 2025 04:34:11 -0400
Subject: [PATCH 5/5] Apply clang-format to C++ code

Signed-off-by: padg9912
---
 csrc/quantization/cutlass_w8a8/c3x/scaled_mm_helper.hpp | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/csrc/quantization/cutlass_w8a8/c3x/scaled_mm_helper.hpp b/csrc/quantization/cutlass_w8a8/c3x/scaled_mm_helper.hpp
index 86a4ff3a5179..2204a49257b0 100644
--- a/csrc/quantization/cutlass_w8a8/c3x/scaled_mm_helper.hpp
+++ b/csrc/quantization/cutlass_w8a8/c3x/scaled_mm_helper.hpp
@@ -26,9 +26,9 @@ void dispatch_scaled_mm(torch::Tensor& c, torch::Tensor const& a,
       int8_func(c, a, b, a_scales, b_scales, bias);
     } else {
       int32_t version_num = get_sm_version_num();
-      TORCH_CHECK(false,
-          "Int8 not supported on SM", version_num,
-          ". Use FP8 quantization instead, or run on older arch (SM < 100).");
+      TORCH_CHECK(
+          false, "Int8 not supported on SM", version_num,
+          ". Use FP8 quantization instead, or run on older arch (SM < 100).");
     }
   } else {
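For reference, after this series the Blackwell INT8 gate lives only at the kernel level (scaled_mm_helper.hpp): dispatch fails once the SM version reported by get_sm_version_num() is 100 or higher. The sketch below is a minimal, standalone illustration of that comparison, assuming the usual encoding of compute capability (major, minor) as the integer major * 10 + minor (the same mapping capability.to_int() was used for in patch 2); the function names here are illustrative and not part of vLLM.

# Illustrative sketch of the SM >= 100 gate; not vLLM code.
def sm_version(major: int, minor: int) -> int:
    # Compute capability (major, minor) as an SM integer,
    # e.g. (7, 5) -> 75 Turing, (9, 0) -> 90 Hopper, (10, 0) -> 100 Blackwell.
    assert 0 <= minor < 10
    return major * 10 + minor


def cutlass_int8_supported(major: int, minor: int) -> bool:
    # INT8 scaled-mm kernels are only dispatched below SM100;
    # SM100+ (Blackwell) should use FP8 quantization instead.
    return sm_version(major, minor) < 100


if __name__ == "__main__":
    for cap in [(7, 5), (8, 0), (8, 9), (9, 0), (10, 0), (12, 0)]:
        verdict = "INT8 ok" if cutlass_int8_supported(*cap) else "no INT8, use FP8"
        print(f"SM{sm_version(*cap)}: {verdict}")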