From 930c65bb4a0b04e6e2810d4b6265ab83bac59c34 Mon Sep 17 00:00:00 2001
From: padg9912
Date: Tue, 30 Sep 2025 04:04:16 -0400
Subject: [PATCH 1/5] Add INT8 check for Blackwell GPUs

Blackwell architecture (SM100+) doesn't support INT8 quantization.
Added validation in can_implement() to catch this early with a clear
error message directing users to FP8 or older GPU architectures.

Also updated error message in scaled_mm_helper.hpp for consistency
and added docs warning about the limitation.

Fixes #20221

Signed-off-by: padg9912
---
 .../quantization/cutlass_w8a8/c3x/scaled_mm_helper.hpp |  5 ++++-
 docs/features/quantization/int8.md                     |  6 +++++-
 .../layers/quantization/kernels/scaled_mm/cutlass.py   | 10 ++++++++++
 3 files changed, 19 insertions(+), 2 deletions(-)

diff --git a/csrc/quantization/cutlass_w8a8/c3x/scaled_mm_helper.hpp b/csrc/quantization/cutlass_w8a8/c3x/scaled_mm_helper.hpp
index 3af59267bd60..86a4ff3a5179 100644
--- a/csrc/quantization/cutlass_w8a8/c3x/scaled_mm_helper.hpp
+++ b/csrc/quantization/cutlass_w8a8/c3x/scaled_mm_helper.hpp
@@ -25,7 +25,10 @@ void dispatch_scaled_mm(torch::Tensor& c, torch::Tensor const& a,
       if constexpr (!std::is_same_v<Int8Func, std::nullptr_t>) {
         int8_func(c, a, b, a_scales, b_scales, bias);
       } else {
-        TORCH_CHECK(false, "Int8 not supported for this architecture");
+        int32_t version_num = get_sm_version_num();
+        TORCH_CHECK(false,
+            "Int8 not supported on SM", version_num,
+            ". Use FP8 quantization instead, or run on older arch (SM < 100).");
       }
     }
   } else {
diff --git a/docs/features/quantization/int8.md b/docs/features/quantization/int8.md
index 247d0cbdd3f1..1acf91e0b50c 100644
--- a/docs/features/quantization/int8.md
+++ b/docs/features/quantization/int8.md
@@ -6,7 +6,11 @@ This quantization method is particularly useful for reducing model size while ma
 Please visit the HF collection of [quantized INT8 checkpoints of popular LLMs ready to use with vLLM](https://huggingface.co/collections/neuralmagic/int8-llms-for-vllm-668ec32c049dca0369816415).

 !!! note
-    INT8 computation is supported on NVIDIA GPUs with compute capability > 7.5 (Turing, Ampere, Ada Lovelace, Hopper, Blackwell).
+    INT8 computation is supported on NVIDIA GPUs with compute capability > 7.5 (Turing, Ampere, Ada Lovelace, Hopper).
+
+!!! warning
+    **Blackwell GPU Limitation**: INT8 is not supported on compute capability >= 100 (e.g., RTX 6000 Blackwell). 
+    Use [FP8 quantization](fp8.md) instead, or run on Hopper/Ada/Ampere architectures.

 ## Prerequisites
diff --git a/vllm/model_executor/layers/quantization/kernels/scaled_mm/cutlass.py b/vllm/model_executor/layers/quantization/kernels/scaled_mm/cutlass.py
index 2f982f96b0d0..727e86aa60f9 100644
--- a/vllm/model_executor/layers/quantization/kernels/scaled_mm/cutlass.py
+++ b/vllm/model_executor/layers/quantization/kernels/scaled_mm/cutlass.py
@@ -28,6 +28,16 @@ def can_implement(
         if not current_platform.is_cuda():
             return False, "CutlassScaledMM requires running on CUDA."

+        # Blackwell doesn't support INT8
+        capability = current_platform.get_device_capability()
+        if capability is not None:
+            major, _ = capability
+            compute_cap = major * 10 + (_ if _ < 10 else 0)
+            if compute_cap >= 100 and c.weight_dtype == torch.int8:
+                return False, (
+                    f"INT8 not supported on SM{compute_cap}. "
+                    f"Use FP8 quantization or older GPU architecture.")
+
         return True, None

     def process_weights_after_loading(self, layer: torch.nn.Module) -> None:

From 7bfb572477891a3888feb4c6c91ffe3037fd28db Mon Sep 17 00:00:00 2001
From: padg9912
Date: Tue, 30 Sep 2025 04:15:50 -0400
Subject: [PATCH 2/5] Use capability.to_int() for consistency

Use the existing to_int() method instead of manually calculating
compute capability. More consistent with rest of codebase.

Signed-off-by: padg9912
---
 .../layers/quantization/kernels/scaled_mm/cutlass.py | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/vllm/model_executor/layers/quantization/kernels/scaled_mm/cutlass.py b/vllm/model_executor/layers/quantization/kernels/scaled_mm/cutlass.py
index 727e86aa60f9..a6cbd3695c77 100644
--- a/vllm/model_executor/layers/quantization/kernels/scaled_mm/cutlass.py
+++ b/vllm/model_executor/layers/quantization/kernels/scaled_mm/cutlass.py
@@ -31,8 +31,7 @@ def can_implement(
         # Blackwell doesn't support INT8
         capability = current_platform.get_device_capability()
         if capability is not None:
-            major, _ = capability
-            compute_cap = major * 10 + (_ if _ < 10 else 0)
+            compute_cap = capability.to_int()
             if compute_cap >= 100 and c.weight_dtype == torch.int8:
                 return False, (
                     f"INT8 not supported on SM{compute_cap}. "
                     f"Use FP8 quantization or older GPU architecture.")

From daaec73b2c93aaa503c7171b1cbbeed38fbf9628 Mon Sep 17 00:00:00 2001
From: padg9912
Date: Tue, 30 Sep 2025 04:19:18 -0400
Subject: [PATCH 3/5] Fix pre-commit issues

- Remove weight_dtype check (attribute doesn't exist in config)
- Fix markdown trailing space
- Error handling is done at kernel level instead

Signed-off-by: padg9912
---
 docs/features/quantization/int8.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docs/features/quantization/int8.md b/docs/features/quantization/int8.md
index 1acf91e0b50c..af3650e701ad 100644
--- a/docs/features/quantization/int8.md
+++ b/docs/features/quantization/int8.md
@@ -9,7 +9,7 @@ Please visit the HF collection of [quantized INT8 checkpoints of popular LLMs re
     INT8 computation is supported on NVIDIA GPUs with compute capability > 7.5 (Turing, Ampere, Ada Lovelace, Hopper).

 !!! warning
-    **Blackwell GPU Limitation**: INT8 is not supported on compute capability >= 100 (e.g., RTX 6000 Blackwell). 
+    **Blackwell GPU Limitation**: INT8 is not supported on compute capability >= 100 (e.g., RTX 6000 Blackwell).
     Use [FP8 quantization](fp8.md) instead, or run on Hopper/Ada/Ampere architectures.

 ## Prerequisites

From 151ea664de02baa5aaf584895f23243d7e9a127e Mon Sep 17 00:00:00 2001
From: padg9912
Date: Tue, 30 Sep 2025 04:27:31 -0400
Subject: [PATCH 4/5] Remove invalid weight_dtype check

Config doesn't have weight_dtype attribute. Error checking is properly
handled at kernel level in scaled_mm_helper.hpp

Signed-off-by: padg9912
---
 .../layers/quantization/kernels/scaled_mm/cutlass.py | 9 ---------
 1 file changed, 9 deletions(-)

diff --git a/vllm/model_executor/layers/quantization/kernels/scaled_mm/cutlass.py b/vllm/model_executor/layers/quantization/kernels/scaled_mm/cutlass.py
index a6cbd3695c77..2f982f96b0d0 100644
--- a/vllm/model_executor/layers/quantization/kernels/scaled_mm/cutlass.py
+++ b/vllm/model_executor/layers/quantization/kernels/scaled_mm/cutlass.py
@@ -28,15 +28,6 @@ def can_implement(
         if not current_platform.is_cuda():
             return False, "CutlassScaledMM requires running on CUDA."

-        # Blackwell doesn't support INT8
-        capability = current_platform.get_device_capability()
-        if capability is not None:
-            compute_cap = capability.to_int()
-            if compute_cap >= 100 and c.weight_dtype == torch.int8:
-                return False, (
-                    f"INT8 not supported on SM{compute_cap}. "
-                    f"Use FP8 quantization or older GPU architecture.")
-
         return True, None

     def process_weights_after_loading(self, layer: torch.nn.Module) -> None:

From dd0a7b008ba76d759a88b2d61d10f49dea2b4539 Mon Sep 17 00:00:00 2001
From: padg9912
Date: Tue, 30 Sep 2025 04:34:11 -0400
Subject: [PATCH 5/5] Apply clang-format to C++ code

Signed-off-by: padg9912
---
 csrc/quantization/cutlass_w8a8/c3x/scaled_mm_helper.hpp | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/csrc/quantization/cutlass_w8a8/c3x/scaled_mm_helper.hpp b/csrc/quantization/cutlass_w8a8/c3x/scaled_mm_helper.hpp
index 86a4ff3a5179..2204a49257b0 100644
--- a/csrc/quantization/cutlass_w8a8/c3x/scaled_mm_helper.hpp
+++ b/csrc/quantization/cutlass_w8a8/c3x/scaled_mm_helper.hpp
@@ -26,9 +26,9 @@ void dispatch_scaled_mm(torch::Tensor& c, torch::Tensor const& a,
       int8_func(c, a, b, a_scales, b_scales, bias);
     } else {
       int32_t version_num = get_sm_version_num();
-      TORCH_CHECK(false,
-          "Int8 not supported on SM", version_num,
-          ". Use FP8 quantization instead, or run on older arch (SM < 100).");
+      TORCH_CHECK(
+          false, "Int8 not supported on SM", version_num,
+          ". Use FP8 quantization instead, or run on older arch (SM < 100).");
     }
   } else {
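For reference, after this series the Blackwell INT8 gate lives only at the kernel level (scaled_mm_helper.hpp): dispatch fails once the SM version reported by get_sm_version_num() is 100 or higher. The sketch below is a minimal, standalone illustration of that comparison, assuming the usual encoding of compute capability (major, minor) as the integer major * 10 + minor (the same mapping capability.to_int() was used for in patch 2); the function names here are illustrative and not part of vLLM.

# Illustrative sketch of the SM >= 100 gate; not vLLM code.
def sm_version(major: int, minor: int) -> int:
    # Compute capability (major, minor) as an SM integer,
    # e.g. (7, 5) -> 75 Turing, (9, 0) -> 90 Hopper, (10, 0) -> 100 Blackwell.
    assert 0 <= minor < 10
    return major * 10 + minor


def cutlass_int8_supported(major: int, minor: int) -> bool:
    # INT8 scaled-mm kernels are only dispatched below SM100;
    # SM100+ (Blackwell) should use FP8 quantization instead.
    return sm_version(major, minor) < 100


if __name__ == "__main__":
    for cap in [(7, 5), (8, 0), (8, 9), (9, 0), (10, 0), (12, 0)]:
        verdict = "INT8 ok" if cutlass_int8_supported(*cap) else "no INT8, use FP8"
        print(f"SM{sm_version(*cap)}: {verdict}")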