
Commit 9a7ec49

【Inference Optimize】Support setting environment variables to enable stream_k (#74317)
1 parent 28be650 commit 9a7ec49

File tree: 5 files changed, +160 -5 lines changed

paddle/phi/kernels/fusion/cutlass/cutlass_extensions/ft_gemm_configs.h

Lines changed: 2 additions & 2 deletions
@@ -75,10 +75,10 @@ enum class SplitKStyle {
   SPLIT_K_SERIAL,
   // SPLIT_K_PARALLEL // Not supported yet
 };
-
+// NOTE: (changwenbin) split_k_serial is turned on by default here.
 struct CutlassGemmConfig {
   CutlassTileConfig tile_config = CutlassTileConfig::ChooseWithHeuristic;
-  SplitKStyle split_k_style = SplitKStyle::NO_SPLIT_K;
+  SplitKStyle split_k_style = SplitKStyle::SPLIT_K_SERIAL;
   int split_k_factor = -1;
   int stages = -1;
 };

paddle/phi/kernels/fusion/cutlass/cutlass_extensions/gemm/kernel/fpA_intB_gemm.h

Lines changed: 10 additions & 0 deletions
@@ -76,6 +76,16 @@ struct GemmFpAIntB {
   using LayoutC = typename Mma::LayoutC;
   using ElementScale = typename Mma::IteratorA::Element;

+  // NOTE: (changwenbin) Currently only A row major and B column major are
+  // supported. Other cases have not been verified yet.
+
+#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 750)
+  static_assert(
+      platform::is_same<LayoutA, layout::RowMajor>::value &&
+          platform::is_same<LayoutB, layout::ColumnMajor>::value,
+      "A must be row major and B must be col major in cuda_arch >= sm75");
+#endif
+
   static ComplexTransform const kTransformA = Mma::kTransformA;
   static ComplexTransform const kTransformB = Mma::kTransformA;

paddle/phi/kernels/fusion/cutlass/cutlass_extensions/gemm/kernel/fpA_intB_gemm_split_k.h

Lines changed: 6 additions & 2 deletions
@@ -259,7 +259,9 @@ struct GemmFpAIntBSplitK {
             ? device_sms
             : fast_min(args.avail_sms, device_sms);

-    // Initialize the block mapping structure
+    static_assert(WarpCount::kK == 1, "WarpCount::kK should always == 1");
+    // NOTE: (changwenbin) Adapt cutlass upgraded to version 3.8.0
+    // Initialize the block mapping structure
     block_mapping = ThreadblockSwizzle(
         args.mode,
         args.problem_size,
@@ -271,7 +273,9 @@
         cutlass::sizeof_bits<ElementA>::value,
         cutlass::sizeof_bits<ElementB>::value,
         cutlass::sizeof_bits<ElementC>::value,
-        ThreadblockShape::kK / (WarpCount::kK * InstructionShape::kK));
+        ThreadblockShape::kK /
+            (WarpCount::kK *
+             InstructionShape::kK));  // epilogue_acc_fragments_
   }

   /// Returns the workspace size (in bytes) needed for these parameters

paddle/phi/kernels/fusion/cutlass/cutlass_kernels/cutlass_heuristic.h

Lines changed: 9 additions & 1 deletion
@@ -120,9 +120,17 @@ static std::vector<CutlassGemmConfig> get_candidate_configs(
   if (is_moe) {
     max_stages = 5;
   }
+  // NOTE: (changwenbin)
+  // Support enabling stream_k by setting the environment
+  // variable `export CUTLASS_GEMM_STREAM_K=1`.
+  SplitKStyle env_split_k = SplitKStyle::NO_SPLIT_K;
+  const char* env_stream_k = std::getenv("CUTLASS_GEMM_STREAM_K");
+  if (env_stream_k != nullptr) {
+    env_split_k = SplitKStyle::SPLIT_K_SERIAL;
+  }
   for (const auto& tile_config : tiles) {
     for (int stages = min_stages; stages <= max_stages; ++stages) {
-      CutlassGemmConfig config{tile_config, SplitKStyle::NO_SPLIT_K, 1, stages};
+      CutlassGemmConfig config{tile_config, env_split_k, 1, stages};
       candidate_configs.push_back(config);
     }
   }
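With the heuristic change above, serial split-k (stream_k) candidate configs are only generated when CUTLASS_GEMM_STREAM_K is present in the environment. A minimal usage sketch, assuming the variable is set before the first weight-only GEMM runs and that paddle.nn.quant is imported as Q, mirroring the new test file below; shapes, dtype, and algo here are illustrative only:

import math
import os

# Assumption: std::getenv reads the process environment when the candidate
# configs are generated, so the variable must be set before the first call.
os.environ["CUTLASS_GEMM_STREAM_K"] = "1"

import paddle
import paddle.nn.quant as Q  # alias as in the test file

x = paddle.rand(shape=(128, 8192), dtype='float16') / math.sqrt(8192)
weight = paddle.rand(shape=(8192, 8192), dtype='float16') / math.sqrt(8192)

quant_weight, quant_scale = Q.weight_quantize(x=weight.cuda(), algo='weight_only_int8')
out = Q.weight_only_linear(
    x=x,
    weight=quant_weight,
    weight_scale=quant_scale,
    weight_dtype='int8',
)

Equivalently, exporting CUTLASS_GEMM_STREAM_K=1 in the shell before launching the process has the same effect; when the variable is unset, candidate configs keep NO_SPLIT_K as before.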

test/quantization/test_weight_only_linear.py

Lines changed: 133 additions & 0 deletions
@@ -925,5 +925,138 @@ def test_weightonly_linear_backward(
         )


+@unittest.skipIf(
+    not core.is_compiled_with_cuda()
+    or get_cuda_version() < 11020
+    or paddle.device.cuda.get_device_capability()[0] < 8,
+    "quantized_matmul requires CUDA >= 11.2 and CUDA_ARCH >= 8",
+)
+class WeightOnlyLinear_stream_k_TestCase(unittest.TestCase):
+
+    def test_weightonly_linear_backward_int4(self):
+        def test_weightonly_linear_backward(
+            self, algo='weight_only_int4', weight_dtype='int4'
+        ):
+            x = (
+                paddle.rand(shape=(128, 8192), dtype='float16')
+                * 1
+                / math.sqrt(8192)
+            )
+            x.stop_gradient = False
+            quant_x = copy.deepcopy(x)
+            quant_x.stop_gradient = False
+            weight = (
+                paddle.rand(shape=(8192, 8192), dtype='float16')
+                * 1
+                / math.sqrt(8192)
+            )
+
+            quant_weight, quant_scale = Q.weight_quantize(
+                x=weight.cuda(), algo=algo
+            )
+
+            quant_out = Q.weight_only_linear(
+                x=quant_x,
+                weight=quant_weight,
+                weight_scale=quant_scale,
+                weight_dtype=weight_dtype,
+            )
+
+        test_weightonly_linear_backward(self)
+
+    def test_weightonly_linear_backward_int4_bf16(self):
+        def test_weightonly_linear_backward(
+            self, algo='weight_only_int4', weight_dtype='int4'
+        ):
+            x = (
+                paddle.rand(shape=(128, 8192), dtype='bfloat16')
+                * 1
+                / math.sqrt(8192)
+            )
+            x.stop_gradient = False
+            quant_x = copy.deepcopy(x)
+            quant_x.stop_gradient = False
+            weight = (
+                paddle.rand(shape=(8192, 8192), dtype='bfloat16')
+                * 1
+                / math.sqrt(8192)
+            )
+
+            quant_weight, quant_scale = Q.weight_quantize(
+                x=weight.cuda(), algo=algo
+            )
+
+            quant_out = Q.weight_only_linear(
+                x=quant_x,
+                weight=quant_weight,
+                weight_scale=quant_scale,
+                weight_dtype=weight_dtype,
+            )
+
+        test_weightonly_linear_backward(self)
+
+    def test_weightonly_linear_backward_int8(self):
+        def test_weightonly_linear_backward(
+            self, algo='weight_only_int8', weight_dtype='int8'
+        ):
+            x = (
+                paddle.rand(shape=(128, 8192), dtype='float16')
+                * 1
+                / math.sqrt(8192)
+            )
+            x.stop_gradient = False
+            quant_x = copy.deepcopy(x)
+            quant_x.stop_gradient = False
+            weight = (
+                paddle.rand(shape=(8192, 8192), dtype='float16')
+                * 1
+                / math.sqrt(8192)
+            )
+
+            quant_weight, quant_scale = Q.weight_quantize(
+                x=weight.cuda(), algo=algo
+            )
+
+            quant_out = Q.weight_only_linear(
+                x=quant_x,
+                weight=quant_weight,
+                weight_scale=quant_scale,
+                weight_dtype=weight_dtype,
+            )
+
+        test_weightonly_linear_backward(self)
+
+    def test_weightonly_linear_backward_int8_bf16(self):
+        def test_weightonly_linear_backward(
+            self, algo='weight_only_int8', weight_dtype='int8'
+        ):
+            x = (
+                paddle.rand(shape=(128, 8192), dtype='bfloat16')
+                * 1
+                / math.sqrt(8192)
+            )
+            x.stop_gradient = False
+            quant_x = copy.deepcopy(x)
+            quant_x.stop_gradient = False
+            weight = (
+                paddle.rand(shape=(8192, 8192), dtype='bfloat16')
+                * 1
+                / math.sqrt(8192)
+            )
+
+            quant_weight, quant_scale = Q.weight_quantize(
+                x=weight.cuda(), algo=algo
+            )
+
+            quant_out = Q.weight_only_linear(
+                x=quant_x,
+                weight=quant_weight,
+                weight_scale=quant_scale,
+                weight_dtype=weight_dtype,
+            )
+
+        test_weightonly_linear_backward(self)
+
+
 if __name__ == '__main__':
     unittest.main()
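The new WeightOnlyLinear_stream_k_TestCase exercises the serial split-k path only when the environment variable is exported before the process starts the GEMMs. A sketch of driving just this class with the flag set, assuming the test module can be imported from the repository checkout (otherwise add test/quantization to sys.path first):

import os
import sys
import unittest

os.environ["CUTLASS_GEMM_STREAM_K"] = "1"  # must precede the first weight-only GEMM

# Hypothetical path handling; adjust to your checkout layout.
sys.path.insert(0, "test/quantization")
from test_weight_only_linear import WeightOnlyLinear_stream_k_TestCase

suite = unittest.TestLoader().loadTestsFromTestCase(WeightOnlyLinear_stream_k_TestCase)
unittest.TextTestRunner(verbosity=2).run(suite)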
