[inference] Support wint4 groupwise with cutlass gemm #60422

Merged
93 changes: 81 additions & 12 deletions paddle/phi/kernels/fusion/cutlass/cutlass_extensions/arch/mma.h
@@ -1,18 +1,34 @@
/*
 * Copyright (c) 2020-2023, NVIDIA CORPORATION. All rights reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
/***************************************************************************************************
 * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights
 * reserved. SPDX-License-Identifier: BSD-3-Clause
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice,
 *    this list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 *    this list of conditions and the following disclaimer in the documentation
 *    and/or other materials provided with the distribution.
 *
 * 3. Neither the name of the copyright holder nor the names of its
 *    contributors may be used to endorse or promote products derived from
 *    this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 *
 **************************************************************************************************/

/* Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.

@@ -42,5 +58,58 @@ namespace arch {
// Tag which triggers an MMA that dequantizes interleaved B to the type of A.
struct OpMultiplyAddDequantizeInterleavedBToA;

/*
Below we have extra tags to signal what kind of dequantization we want to do
(per col, scale only fine grained, finegrained with zero). This still lets us
use the existing template infrastructure (incl. that in CUTLASS). However, we
split out the template below into OpMultiplyAddDequantizeInterleavedBToA along
with the quantization op before instantiating the GEMM pieces.

Note that this is somewhat of a hack, but it SIGNIFICANTLY reduces the amount
of code we need to duplicate.
*/
struct OpMultiplyAddDequantizeInterleavedBToA_percol_scale;
struct OpMultiplyAddDequantizeInterleavedBToA_fine_grained_scale;

// The default just forwards the original operator
template <typename MmaOp, bool FineGrained>
struct TagOperator {
using TaggedOperator = MmaOp;
};

// Specializations below attach more information to the operator
template <>
struct TagOperator<OpMultiplyAddDequantizeInterleavedBToA, false> {
using TaggedOperator = OpMultiplyAddDequantizeInterleavedBToA_percol_scale;
};

template <>
struct TagOperator<OpMultiplyAddDequantizeInterleavedBToA, true> {
using TaggedOperator =
OpMultiplyAddDequantizeInterleavedBToA_fine_grained_scale;
};

// Here we instantiate some structs to "detag" the tagged operator, splitting it
// back into the original operator plus the extra information. If no extra info
// was tagged, per-column scaling is assumed as the default.
template <typename TaggedMmaOp>
struct DetagOperator {
using Operator = TaggedMmaOp;
static constexpr bool FineGrained = false;
};

template <>
struct DetagOperator<OpMultiplyAddDequantizeInterleavedBToA_percol_scale> {
using Operator = OpMultiplyAddDequantizeInterleavedBToA;
static constexpr bool FineGrained = false;
};

template <>
struct DetagOperator<
OpMultiplyAddDequantizeInterleavedBToA_fine_grained_scale> {
using Operator = OpMultiplyAddDequantizeInterleavedBToA;
static constexpr bool FineGrained = true;
};

} // namespace arch
} // namespace cutlass
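
A quick, illustrative round trip of the tag/detag helpers above (not part of the PR; it only assumes this header is reachable on the include path):

```cpp
// Illustrative only: tagging an operator as fine-grained and then detagging it
// recovers the original operator plus the FineGrained flag.
#include <type_traits>

#include "paddle/phi/kernels/fusion/cutlass/cutlass_extensions/arch/mma.h"

using cutlass::arch::DetagOperator;
using cutlass::arch::OpMultiplyAddDequantizeInterleavedBToA;
using cutlass::arch::TagOperator;

// Tagging with FineGrained = true yields the fine-grained scale tag.
using Tagged = TagOperator<OpMultiplyAddDequantizeInterleavedBToA,
                           true>::TaggedOperator;
static_assert(
    std::is_same<
        Tagged,
        cutlass::arch::
            OpMultiplyAddDequantizeInterleavedBToA_fine_grained_scale>::value,
    "expected the fine-grained tag");

// Detagging splits the tag back into the original operator + the flag.
using Detagged = DetagOperator<Tagged>;
static_assert(std::is_same<Detagged::Operator,
                           OpMultiplyAddDequantizeInterleavedBToA>::value,
              "detag should recover the original operator");
static_assert(Detagged::FineGrained, "the fine-grained flag should survive");
```

The pass-through primary template is what lets untagged operators keep working unchanged, which is how the comment above avoids duplicating the surrounding template infrastructure.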
@@ -61,7 +61,9 @@ enum class CutlassTileConfig {

// configs for large M in encoder
CtaShape128x256x64_WarpShape64x64x64,
// CtaShape256x128x64_WarpShape64x64x64

// configs for finegrained
CtaShape256x128x64_WarpShape64x64x64,
};

enum class SplitKStyle {
@@ -56,8 +56,10 @@ template <typename Mma_, ///! Threadblock-scoped matrix multiply-accumulate
typename KernelArch, ///! The Architecture this kernel is compiled
/// for. Used since SIMT kernels lose top-level
/// arch.
bool SplitKSerial ///! If true, code supporting split-K via serial
bool SplitKSerial, ///! If true, code supporting split-K via serial
/// reduction is enabled.
bool Finegrained ///! If true, finegrained mode is enabled.
/// Currently only groupwise is supported.
>
struct GemmFpAIntB {
using Mma = Mma_;
@@ -103,6 +105,7 @@ struct GemmFpAIntB {
/// Parameters structure
struct Arguments : UniversalArgumentsBase {
cutlass::gemm::GemmCoord problem_size;
int group_size;
typename Mma::IteratorA::TensorRef ref_A;
typename Mma::IteratorB::TensorRef ref_B;
typename Mma::IteratorScale::TensorRef ref_scale;
@@ -125,6 +128,7 @@ struct GemmFpAIntB {

CUTLASS_HOST_DEVICE
Arguments(cutlass::gemm::GemmCoord const& problem_size,
int group_size,
typename Mma::IteratorA::TensorRef ref_A,
typename Mma::IteratorB::TensorRef ref_B,
typename Mma::IteratorScale::TensorRef ref_scale,
@@ -143,6 +147,7 @@ struct GemmFpAIntB {
problem_size,
/*serial_split_k_factor=*/serial_split_k_factor,
/*batch_stride_D=*/0),
group_size(group_size),
ref_A(ref_A),
ref_B(ref_B),
ref_scale(ref_scale),
@@ -181,6 +186,7 @@ struct GemmFpAIntB {
int const* gather_A_indices;
int const* gather_B_indices;
int const* scatter_D_indices;
int group_size;

//
// Methods
@@ -192,6 +198,7 @@ struct GemmFpAIntB {
CUTLASS_HOST_DEVICE
Params(Arguments const& args, int device_sms, int sm_occupancy)
: ParamsBase(args, device_sms, sm_occupancy),
group_size(args.group_size),
params_A(args.ref_A.layout()),
ref_A(args.ref_A),
params_B(args.ref_B.layout()),
@@ -276,6 +283,52 @@ struct GemmFpAIntB {
return Status::kSuccess;
}

// Initializes the fine grained scale+bias iterator. Needed since the fine
// grained iterator has a different constructor signature than a regular
// cutlass iterator

template <typename IteratorScale, bool FineGrained>
struct initialize_scale {
CUTLASS_DEVICE static IteratorScale apply(
typename IteratorScale::Params const& params,
typename IteratorScale::Pointer pointer_scale,
typename IteratorScale::TensorCoord extent,
int thread_id,
typename IteratorScale::TensorCoord const& threadblock_offset,
int group_size);
};

template <typename IteratorScale>
struct initialize_scale<IteratorScale, true> {
CUTLASS_DEVICE static IteratorScale apply(
typename IteratorScale::Params const& params,
typename IteratorScale::Pointer pointer_scale,
typename IteratorScale::TensorCoord extent,
int thread_id,
typename IteratorScale::TensorCoord const& threadblock_offset,
int group_size) {
return IteratorScale(params,
pointer_scale,
extent,
thread_id,
threadblock_offset,
group_size);
}
};

template <typename IteratorScale>
struct initialize_scale<IteratorScale, false> {
CUTLASS_DEVICE static IteratorScale apply(
typename IteratorScale::Params const& params,
typename IteratorScale::Pointer pointer_scale,
typename IteratorScale::TensorCoord extent,
int thread_id,
typename IteratorScale::TensorCoord const& threadblock_offset,
int group_size) {
return IteratorScale(
params, pointer_scale, extent, thread_id, threadblock_offset);
}
};
static size_t get_extra_workspace_size(
Arguments const& args, cutlass::gemm::GemmCoord const& grid_tiled_shape) {
return 0;
@@ -335,8 +388,12 @@ struct GemmFpAIntB {
threadblock_tile_offset.k() * params.gemm_k_size * kInterleave,
threadblock_tile_offset.n() * Mma::Shape::kN / kInterleave};

typename MatrixCoord::Index fg_row_offset =
threadblock_tile_offset.k() * params.gemm_k_size / 64;
typename MatrixCoord::Index scale_row_offset =
Finegrained == true ? fg_row_offset : 0;
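// With fine-grained (groupwise) quantization, every group of 64 rows of B
// along K shares one row of scales, so a threadblock's starting scale row is
// its starting K offset divided by the group size (hard-coded to 64 here);
// per-column mode keeps the single scale row at offset 0.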
cutlass::MatrixCoord tb_offset_scale{
0, threadblock_tile_offset.n() * Mma::Shape::kN};
scale_row_offset, threadblock_tile_offset.n() * Mma::Shape::kN};

// Problem size is a function of threadblock index in the K dimension
int problem_size_k =
@@ -368,11 +425,16 @@ struct GemmFpAIntB {
tb_offset_B,
params.gather_B_indices);

typename Mma::IteratorScale iterator_scale(params.params_scale,
params.ref_scale.data(),
{1, params.problem_size.n()},
thread_idx,
tb_offset_scale);
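// Fine-grained mode reads problem_size_k / 64 scale rows (one per group of
// 64), while per-column mode reads a single row; initialize_scale then invokes
// whichever Mma::IteratorScale constructor matches the chosen mode.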
typename MatrixCoord::Index scale_row_extent =
Finegrained == true ? problem_size_k / 64 : 1;
typename Mma::IteratorScale iterator_scale =
initialize_scale<typename Mma::IteratorScale, Finegrained>::apply(
params.params_scale,
params.ref_scale.data(),
{scale_row_extent, params.problem_size.n()},
thread_idx,
tb_offset_scale,
params.group_size);

// Broadcast the warp_id computed by lane 0 to ensure dependent code
// is compiled as warp-uniform.
@@ -383,7 +445,11 @@ struct GemmFpAIntB {
// Main loop
//
// Construct thread-scoped matrix multiply
Mma mma(shared_storage.main_loop, thread_idx, warp_idx, lane_idx);
Mma mma(shared_storage.main_loop,
params.group_size,
thread_idx,
warp_idx,
lane_idx);

typename Mma::FragmentC accumulators;

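The initialize_scale helper above exists only because the fine-grained scale iterator takes extra arguments (a multi-row extent and group_size) that the per-column iterator does not. Below is a standalone sketch of the same compile-time dispatch, using hypothetical iterator types that are not taken from the PR:

```cpp
#include <cstdio>

// Hypothetical per-column scale iterator: one row of N scales.
struct PerColScaleIter {
  PerColScaleIter(const float* p, int n) : ptr(p), cols(n), rows(1) {}
  const float* ptr;
  int cols;
  int rows;
};

// Hypothetical groupwise scale iterator: one row of scales per group along K.
struct GroupwiseScaleIter {
  GroupwiseScaleIter(const float* p, int rows, int n, int group_size)
      : ptr(p), cols(n), rows(rows), group(group_size) {}
  const float* ptr;
  int cols;
  int rows;
  int group;
};

// A bool template parameter selects which constructor signature is called,
// so the code that builds the iterator can stay uniform for both modes.
template <typename Iter, bool FineGrained>
struct MakeScaleIter;

template <typename Iter>
struct MakeScaleIter<Iter, false> {
  static Iter apply(const float* p, int rows, int n, int group_size) {
    (void)rows;
    (void)group_size;  // ignored in per-column mode
    return Iter(p, n);
  }
};

template <typename Iter>
struct MakeScaleIter<Iter, true> {
  static Iter apply(const float* p, int rows, int n, int group_size) {
    return Iter(p, rows, n, group_size);  // forward the groupwise extras
  }
};

int main() {
  static float scales[64 * 128] = {};  // e.g. K = 4096, group_size = 64, N = 128
  auto per_col = MakeScaleIter<PerColScaleIter, false>::apply(scales, 1, 128, -1);
  auto grouped = MakeScaleIter<GroupwiseScaleIter, true>::apply(scales, 64, 128, 64);
  std::printf("per-col rows: %d, groupwise rows: %d\n", per_col.rows, grouped.rows);
  return 0;
}
```

Partial specialization on the bool keeps the call site identical in both modes; only the selected specialization knows which constructor signature to use.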
@@ -847,7 +847,7 @@ struct GemmFpAIntBSplitK {
// static_assert(print_type<Mma::>());

// Perform this tile's range of multiply-accumulate (MAC) iterations
Mma mma(shared_storage.main_loop, thread_idx, warp_idx, lane_idx);
Mma mma(shared_storage.main_loop, -1, thread_idx, warp_idx, lane_idx);
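// Note: -1 is passed for the new group_size argument here, presumably as a
// sentinel meaning no groupwise scales; in this diff the fine-grained
// (groupwise) path is only wired into the non-split-K kernel above.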

mma(tile_work.k_iters_remaining,
accumulator_tile,