
Commit b231d21

sergachev authored and Google-ML-Automation committed
PR #33794: [GPU] Support int4 in cuDNN GEMM fusions.
Imported from GitHub PR #33794

📝 Summary of Changes
Support int4 in cuDNN GEMM fusions.

🎯 Justification
Accelerates some int4 GEMM fusions (under the flag xla_gpu_cudnn_gemm_fusion_level).

🚀 Kind of Contribution
⚡️ Performance Improvement

📊 Benchmark (for Performance Improvements)
> Please measure and include speedups for one of the public HLOs in `compiler/xla/tools/benchmarks/hlo/`.

These do not use int4.

🧪 Unit Tests: yes
🧪 Execution Tests: yes

Copybara import of the project:

-- e1b8dc7 by Ilia Sergachev <isergachev@nvidia.com>:

[GPU] Support int4 in cuDNN GEMM fusions.

Merging this change closes #33794

FUTURE_COPYBARA_INTEGRATE_REVIEW=#33794 from openxla:cudnn_gemm_int4 e1b8dc7
PiperOrigin-RevId: 830894321
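For context on the gating flag mentioned above: cuDNN GEMM fusions are opted into per level through the xla_gpu_cudnn_gemm_fusion_level debug option. Below is a minimal sketch of setting it programmatically; the helper function and the concrete level value 2 are illustrative assumptions, not part of this commit.

#include "xla/xla.pb.h"  // xla::DebugOptions

// Hypothetical helper: builds DebugOptions with cuDNN GEMM fusions enabled.
xla::DebugOptions MakeDebugOptionsWithCudnnGemmFusions() {
  xla::DebugOptions opts;
  // Non-zero levels route eligible GEMM fusions to the cuDNN backend; the
  // level chosen here (2) is an assumption for illustration only.
  opts.set_xla_gpu_cudnn_gemm_fusion_level(2);
  return opts;
}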
1 parent 9da6117 · commit b231d21

File tree

4 files changed: +36 additions, −7 deletions


xla/backends/gpu/codegen/BUILD
Lines changed: 0 additions & 1 deletion

@@ -104,7 +104,6 @@ xla_test(
         "//xla/stream_executor:dnn",
         "//xla/stream_executor:stream_executor_h",
         "//xla/stream_executor:stream_executor_memory_allocator",
-        "//xla/tsl/lib/core:status_test_util",
         "@com_google_absl//absl/status:status_matchers",
         "@com_google_absl//absl/status:statusor",
         "@com_google_absl//absl/strings",

xla/backends/gpu/codegen/cudnn_test.cc
Lines changed: 28 additions & 6 deletions

@@ -46,7 +46,6 @@ limitations under the License.
 #include "xla/stream_executor/dnn.h"
 #include "xla/stream_executor/stream_executor.h"
 #include "xla/stream_executor/stream_executor_memory_allocator.h"
-#include "xla/tsl/lib/core/status_test_util.h"
 #include "xla/xla.pb.h"
 #include "xla/xla_data.pb.h"
 #include "tsl/platform/env.h"
@@ -59,8 +58,6 @@ namespace xla {
 namespace gpu {
 namespace {
 
-using ::tsl::testing::IsOkAndHolds;
-
 class CuDnnFusionTest : public GpuCodegenTest {
  public:
   DebugOptions GetDebugOptionsForTest() const override {
@@ -80,12 +77,14 @@ class CuDnnFusionTest : public GpuCodegenTest {
     return get_cuda_cc().IsAtLeastAmpere() &&
            GetDnnVersionInfoOrDefault(executor).major_version() >= 9;
   }
-  bool IsAtLeastCuDnn91() {
+  bool IsAtLeastCuDnnVersion(int major, int minor) {
     se::StreamExecutor* executor = backend().default_stream_executor();
     const se::dnn::VersionInfo version = GetDnnVersionInfoOrDefault(executor);
-    return (version.major_version() == 9 && version.minor_version() >= 1) ||
-           version.major_version() > 9;
+    return (version.major_version() == major &&
+            version.minor_version() >= minor) ||
+           version.major_version() > major;
   }
+  bool IsAtLeastCuDnn91() { return IsAtLeastCuDnnVersion(9, 1); }
 
  protected:
   void SetUp() override {
@@ -457,6 +456,29 @@ ENTRY e {
                             ErrorSpec{/*aabs=*/1e-6, /*arel=*/1e-6}));
 }
 
+TEST_F(CuDnnFusionExecutionTest, DotS4BF16ExecutesCorrectly) {
+  if (!IsAtLeastCuDnnVersion(9, 12)) {
+    GTEST_SKIP() << "This test case requires cuDNN 9.12+.";
+  }
+  EXPECT_TRUE(RunAndCompare(R"(
+f {
+  a = s4[3,128,128] parameter(0)
+  c = bf16[3,128,128] convert(a)
+  b = bf16[3,128,128] parameter(1)
+  d = bf16[3,128,128] dot(c, b),
+    lhs_batch_dims={0}, rhs_batch_dims={0},
+    lhs_contracting_dims={2}, rhs_contracting_dims={1}
+}
+
+e {
+  a = s4[3,128,128] parameter(0)
+  b = bf16[3,128,128] parameter(1)
+  f = bf16[3,128,128] fusion(a, b), kind=kCustom, calls=f,
+    backend_config={"fusion_backend_config": {kind: "__cudnn$fusion"}}
+})",
+                            ErrorSpec{/*aabs=*/1e-6, /*arel=*/1e-6}));
+}
+
 TEST_F(CuDnnFusionExecutionTest, DotF32WithOutputSubtractionExecutesCorrectly) {
   EXPECT_TRUE(RunAndCompare(R"(
 fusion1 {
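The IsAtLeastCuDnnVersion refactor above generalizes the previously hard-coded 9.1 comparison so the new test can gate on cuDNN 9.12. A standalone C++ sketch of the same predicate, using a toy struct in place of se::dnn::VersionInfo (an assumption so the snippet compiles on its own):

#include <cassert>

// Toy stand-in for se::dnn::VersionInfo (illustrative, not the real type).
struct VersionInfo {
  int major;
  int minor;
};

// Mirrors IsAtLeastCuDnnVersion from the diff: true when the runtime
// version is at least major.minor.
bool IsAtLeastVersion(const VersionInfo& v, int major, int minor) {
  return (v.major == major && v.minor >= minor) || v.major > major;
}

int main() {
  assert(IsAtLeastVersion({9, 12}, 9, 12));   // exact match passes
  assert(IsAtLeastVersion({10, 0}, 9, 12));   // later major passes
  assert(!IsAtLeastVersion({9, 11}, 9, 12));  // older minor fails
  return 0;
}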

xla/hlo/translate/hlo_to_mhlo/attribute_importer.cc
Lines changed: 6 additions & 0 deletions

@@ -199,6 +199,12 @@ mlir::stablehlo::DotAlgorithmAttr ConvertDotAlgorithm(
       numPrimitiveOperations = 6;
       break;
     }
+    case PrecisionConfig::ALG_DOT_BF16_BF16_F32_X9: {
+      lhs = rhs = builder->getBF16Type();
+      accum = builder->getF32Type();
+      numPrimitiveOperations = 9;
+      break;
+    }
     case PrecisionConfig::ALG_DOT_TF32_TF32_F32: {
       lhs = rhs = builder->getTF32Type();
       accum = builder->getF32Type();
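Aside on the algorithm name: ALG_DOT_BF16_BF16_F32_X9 denotes emulating an f32 dot with 9 bf16 primitive operations: each f32 operand is split into three bf16 components and all 3 × 3 = 9 cross products are accumulated in f32, which is why numPrimitiveOperations is 9 above. A scalar C++ sketch of the idea (splitting by truncation is an illustrative choice here; real implementations operate on whole tensors):

#include <cstdint>
#include <cstdio>
#include <cstring>

// Truncate an f32 to a bf16-representable value by zeroing the low 16
// mantissa bits (round-toward-zero; real splitters may round to nearest).
static float TruncateToBf16(float x) {
  std::uint32_t bits;
  std::memcpy(&bits, &x, sizeof(bits));
  bits &= 0xFFFF0000u;
  std::memcpy(&x, &bits, sizeof(bits));
  return x;
}

// Split x into three bf16-representable parts with x ~= p[0] + p[1] + p[2].
static void SplitX3(float x, float p[3]) {
  p[0] = TruncateToBf16(x);
  p[1] = TruncateToBf16(x - p[0]);
  p[2] = TruncateToBf16(x - p[0] - p[1]);
}

// Approximates a * b using the 9 cross products of the two splits,
// accumulated in f32 -- the scalar analogue of the X9 dot algorithm.
float MulBf16x9(float a, float b) {
  float pa[3], pb[3], acc = 0.0f;
  SplitX3(a, pa);
  SplitX3(b, pb);
  for (int i = 0; i < 3; ++i)
    for (int j = 0; j < 3; ++j) acc += pa[i] * pb[j];
  return acc;
}

int main() {
  float a = 1.234567f, b = 7.654321f;
  std::printf("exact f32: %.9g  bf16x9: %.9g\n", a * b, MulBf16x9(a, b));
  return 0;
}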

xla/service/gpu/transforms/cudnn_fusion_compiler.cc
Lines changed: 2 additions & 0 deletions

@@ -149,6 +149,8 @@ inline std::optional<fe::DataType_t> ToCudnnDataType(const PrimitiveType type) {
       return t::BFLOAT16;
     case PrimitiveType::S32:
       return t::INT32;
+    case PrimitiveType::S4:
+      return t::INT4;
     case PrimitiveType::S8:
       return t::INT8;
     case PrimitiveType::PRED:
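This mapping is what lets the fusion compiler describe s4 operands to the cuDNN graph API as INT4 tensors; types with no mapping cause the fusion to be rejected rather than compiled. A self-contained sketch of the same switch pattern, where the enums stand in for XLA's PrimitiveType and the cuDNN frontend's fe::DataType_t (both assumptions so the snippet compiles on its own):

#include <optional>

// Stand-ins for XLA's PrimitiveType and the cuDNN frontend's fe::DataType_t.
enum class PrimitiveType { S4, S8, S32, BF16, PRED };
enum class DnnDataType { INT4, INT8, INT32, BFLOAT16, BOOLEAN };

// Mirrors the extended ToCudnnDataType: unsupported types map to nullopt,
// which callers can treat as "not compilable by the cuDNN backend".
std::optional<DnnDataType> ToCudnnDataType(PrimitiveType type) {
  switch (type) {
    case PrimitiveType::S4:   return DnnDataType::INT4;  // added by this commit
    case PrimitiveType::S8:   return DnnDataType::INT8;
    case PrimitiveType::S32:  return DnnDataType::INT32;
    case PrimitiveType::BF16: return DnnDataType::BFLOAT16;
    case PrimitiveType::PRED: return DnnDataType::BOOLEAN;
  }
  return std::nullopt;
}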
