
Commit

cutlass3.0
humingqing authored and humingqing committed Jan 2, 2024
1 parent ef35202 commit 9293d34
Showing 47 changed files with 3,401 additions and 3,619 deletions.
5 changes: 3 additions & 2 deletions cmake/cuda.cmake
@@ -290,10 +290,10 @@ set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} ${NVCC_FLAGS_EXTRA}")
 message(STATUS "NVCC_FLAGS_EXTRA: ${NVCC_FLAGS_EXTRA}, NVCC_ARCH_BIN: ${NVCC_ARCH_BIN}")

 # Set C++14 support
-set(CUDA_PROPAGATE_HOST_FLAGS OFF)
+set(CUDA_PROPAGATE_HOST_FLAGS ON)
 # Release/Debug flags set by cmake. Such as -O3 -g -DNDEBUG etc.
 # So, don't set these flags here.
-set(CMAKE_CUDA_STANDARD 14)
+set(CMAKE_CUDA_STANDARD 17)

 # (Note) For Windows, if /W[1-4] is deleted, /W1 will be added by default and conflict with -w
 # So replace /W[1-4] with /W0
@@ -321,6 +321,7 @@ if(WIN32)
 endforeach()
 endif()
 endif()
+message(STATUS "CMAKE_CUDA_FLAGS: ${CMAKE_CUDA_FLAGS}")

 mark_as_advanced(CUDA_BUILD_CUBIN CUDA_BUILD_EMULATION CUDA_VERBOSE_BUILD)
 mark_as_advanced(CUDA_SDK_ROOT_DIR CUDA_SEPARABLE_COMPILATION)
4 changes: 2 additions & 2 deletions cmake/external/cutlass.cmake
@@ -16,9 +16,9 @@ include(ExternalProject)

 set(CUTLASS_PREFIX_DIR ${THIRD_PARTY_PATH}/cutlass)
 set(CUTLASS_REPOSITORY https://github.com/NVIDIA/cutlass.git)
-set(CUTLASS_TAG v2.11.0)
+set(CUTLASS_TAG v3.3.0)

-set(CUTLASS_SOURCE_DIR ${PADDLE_SOURCE_DIR}/third_party/cutlass)
+set(CUTLASS_SOURCE_DIR ${THIRD_PARTY_PATH}/cutlass/src/extern_cutlass)
 include_directories("${CUTLASS_SOURCE_DIR}/")
 include_directories("${CUTLASS_SOURCE_DIR}/include/")
 include_directories("${CUTLASS_SOURCE_DIR}/tools/util/include/")
4 changes: 2 additions & 2 deletions cmake/external/flashattn.cmake
@@ -19,14 +19,14 @@ add_definitions(-DPADDLE_WITH_FLASHATTN)
 set(FLASHATTN_PREFIX_DIR ${THIRD_PARTY_PATH}/flashattn)
 set(FLASHATTN_SOURCE_SUBDIR csrc)
 set(FLASHATTN_INSTALL_DIR ${THIRD_PARTY_PATH}/install/flashattn)
-set(SOURCE_DIR ${PADDLE_SOURCE_DIR}/third_party/flashattn)
+set(SOURCE_DIR ${THIRD_PARTY_PATH}/flashattn/src/extern_flashattn)
 #set(FLASHATTN_TAG 0598fa245bbfb8c4462002600864518c0e37e714)
 set(FLASHATTN_TAG 705e8c69fe1511aa6abd4bfea493f24e119193ee)
 set(FLASHATTN_INCLUDE_DIR
   "${FLASHATTN_INSTALL_DIR}/include"
   CACHE PATH "flash-attn Directory" FORCE)
 set(FLASHATTN_LIB_DIR
-  "${FLASHATTN_INSTALL_DIR}/lib"
+  "${FLASHATTN_INSTALL_DIR}/lib"
   CACHE PATH "flash-attn Library Directory" FORCE)

 if(WIN32)
2 changes: 1 addition & 1 deletion cmake/flags.cmake
@@ -35,7 +35,7 @@ endfunction()

 checkcompilercxx14flag()
 if(NOT WIN32)
-  set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++14")
+  set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++17")
 else()
   set(CMAKE_CXX_STANDARD 14)
 endif()
3 changes: 1 addition & 2 deletions paddle/phi/core/dense_tensor.h
@@ -216,8 +216,7 @@ class DenseTensor : public TensorBase,
 };

 protected:
-  std::shared_ptr<InplaceVersion> inplace_version_counter_{
-      std::make_shared<InplaceVersion>()};
+  std::shared_ptr<InplaceVersion> inplace_version_counter_ = std::make_shared<InplaceVersion>();

 /* @jim19930609: This is a hack
 In general, it is badly designed to fuse MKLDNN-specific objects into a
46 changes: 23 additions & 23 deletions paddle/phi/kernels/CMakeLists.txt
@@ -137,27 +137,27 @@ endif()
 file(GLOB kernel_xpu "xpu/*.cc" "selected_rows/xpu/*.cc" "fusion/xpu/*.cc")

 if(WITH_CUTLASS)
-  add_definitions("-DPADDLE_WITH_MEMORY_EFFICIENT_ATTENTION") # for memory_efficient_attention.h
-  execute_process(
-    COMMAND
-      ${PYTHON_EXECUTABLE}
-      ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/fusion/cutlass/memory_efficient_attention/generate_kernels.py
-      --cuda_arch "${NVCC_ARCH_BIN}"
-    RESULT_VARIABLE memory_efficient_attention_gen_res)
-
-  execute_process(
-    COMMAND
-      ${PYTHON_EXECUTABLE}
-      ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/fusion/cutlass/memory_efficient_attention/generate_variable_forward_kernels.py
-      --cuda_arch "${NVCC_ARCH_BIN}"
-    RESULT_VARIABLE memory_efficient_attention_gen_res)
-
-  if(NOT memory_efficient_attention_gen_res EQUAL 0)
-    message(
-      FATAL_ERROR
-        "The memory efficient attention kernel generation errors with NVCC_ARCH_BIN=${NVCC_ARCH_BIN}"
-    )
-  endif()
+  # add_definitions("-DPADDLE_WITH_MEMORY_EFFICIENT_ATTENTION") # for memory_efficient_attention.h
+  # execute_process(
+  #   COMMAND
+  #     ${PYTHON_EXECUTABLE}
+  #     ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/fusion/cutlass/memory_efficient_attention/generate_kernels.py
+  #     --cuda_arch "${NVCC_ARCH_BIN}"
+  #   RESULT_VARIABLE memory_efficient_attention_gen_res)
+
+  # execute_process(
+  #   COMMAND
+  #     ${PYTHON_EXECUTABLE}
+  #     ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/fusion/cutlass/memory_efficient_attention/generate_variable_forward_kernels.py
+  #     --cuda_arch "${NVCC_ARCH_BIN}"
+  #   RESULT_VARIABLE memory_efficient_attention_gen_res)
+
+  # if(NOT memory_efficient_attention_gen_res EQUAL 0)
+  #   message(
+  #     FATAL_ERROR
+  #       "The memory efficient attention kernel generation errors with NVCC_ARCH_BIN=${NVCC_ARCH_BIN}"
+  #   )
+  # endif()

 execute_process(
   COMMAND
@@ -183,8 +183,8 @@ if(WITH_CUTLASS)
   GLOB cutlass_cu
   RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}"
   "fusion/cutlass/*.cu"
-  "fusion/cutlass/memory_efficient_attention/autogen/impl/*.cu"
-  "fusion/cutlass/memory_efficient_attention/autogen_variable/impl/*.cu"
+  # "fusion/cutlass/memory_efficient_attention/autogen/impl/*.cu"
+  # "fusion/cutlass/memory_efficient_attention/autogen_variable/impl/*.cu"
   "fusion/cutlass/cutlass_kernels/*.cu"
   "fusion/cutlass/cutlass_kernels/fpA_intB_gemm/autogen/*.cu"
   "fusion/cutlass/cutlass_kernels/fpA_intB_gemm/*.cu"
71 changes: 71 additions & 0 deletions paddle/phi/kernels/fusion/cutlass/cutlass_extensions/arch/mma.h
@@ -33,6 +33,7 @@ limitations under the License. */
 */

 #pragma once
+#include "paddle/phi/kernels/fusion/cutlass/cutlass_extensions/weight_only_quant_op.h"

 /////////////////////////////////////////////////////////////////////////////////////////////////

@@ -42,5 +42,75 @@ namespace arch {
 // Tag which triggers MMA which will trigger
 struct OpMultiplyAddDequantizeInterleavedBToA;

+/*
+  Below we have extra tags to signal what kind of dequantization we want to do
+  (per col, scale only fine grained, finegrained with zero). This still lets us
+  use the existing template infrastructure (incl. that in CUTLASS). However, we
+  split out the template below into OpMultiplyAddDequantizeInterleavedBToA along
+  with the quantization op before instantiating the GEMM pieces.
+  Note that this is somewhat of a hack, but it SIGNIFICANTLY reduces the amount
+  of code we need to duplicate.
+*/
+struct OpMultiplyAddDequantizeInterleavedBToA_percol_scale;
+struct OpMultiplyAddDequantizeInterleavedBToA_fine_scale;
+struct OpMultiplyAddDequantizeInterleavedBToA_fine_scalebias;
+
+// The default just forwards the original operator
+template <typename MmaOp, WeightOnlyQuantOp QuantOp_>
+struct TagOperator {
+  using TaggedOperator = MmaOp;
+};
+
+// Specializations below attach more information to the operator
+template <>
+struct TagOperator<OpMultiplyAddDequantizeInterleavedBToA,
+                   WeightOnlyQuantOp::PER_COLUMN_SCALE_ONLY> {
+  using TaggedOperator = OpMultiplyAddDequantizeInterleavedBToA_percol_scale;
+};
+
+template <>
+struct TagOperator<OpMultiplyAddDequantizeInterleavedBToA,
+                   WeightOnlyQuantOp::FINEGRAINED_SCALE_ONLY> {
+  using TaggedOperator = OpMultiplyAddDequantizeInterleavedBToA_fine_scale;
+};
+
+template <>
+struct TagOperator<OpMultiplyAddDequantizeInterleavedBToA,
+                   WeightOnlyQuantOp::FINEGRAINED_SCALE_AND_ZEROS> {
+  using TaggedOperator = OpMultiplyAddDequantizeInterleavedBToA_fine_scalebias;
+};
+
+// Here we instantiate some structs to "detag" the tagged operator. It splits it
+// back to the original operator + the extra information. If no extra info was
+// tagged, the dequant op defaults to per-column scaling.
+template <typename TaggedMmaOp>
+struct DetagOperator {
+  using Operator = TaggedMmaOp;
+  static constexpr WeightOnlyQuantOp QuantOp =
+      WeightOnlyQuantOp::PER_COLUMN_SCALE_ONLY;
+};
+
+template <>
+struct DetagOperator<OpMultiplyAddDequantizeInterleavedBToA_percol_scale> {
+  using Operator = OpMultiplyAddDequantizeInterleavedBToA;
+  static constexpr WeightOnlyQuantOp QuantOp =
+      WeightOnlyQuantOp::PER_COLUMN_SCALE_ONLY;
+};
+
+template <>
+struct DetagOperator<OpMultiplyAddDequantizeInterleavedBToA_fine_scale> {
+  using Operator = OpMultiplyAddDequantizeInterleavedBToA;
+  static constexpr WeightOnlyQuantOp QuantOp =
+      WeightOnlyQuantOp::FINEGRAINED_SCALE_ONLY;
+};
+
+template <>
+struct DetagOperator<OpMultiplyAddDequantizeInterleavedBToA_fine_scalebias> {
+  using Operator = OpMultiplyAddDequantizeInterleavedBToA;
+  static constexpr WeightOnlyQuantOp QuantOp =
+      WeightOnlyQuantOp::FINEGRAINED_SCALE_AND_ZEROS;
+};
+
 } // namespace arch
 } // namespace cutlass
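For orientation, the tag/detag pair above is meant to round-trip: a GEMM template tags its MMA operator with a quantization mode, threads the single tagged type through the shared template machinery, and detags it when instantiating the actual kernel pieces. Below is a minimal sketch of that round trip; it is not part of the commit, and it assumes only the declarations shown above plus the WeightOnlyQuantOp enum from weight_only_quant_op.h (assumed here to be declared in namespace cutlass). The std::is_same_v check also leans on the C++17 bump made elsewhere in this commit.

// Sketch only -- exercises TagOperator/DetagOperator from the diff above.
#include <type_traits>

using cutlass::arch::DetagOperator;
using cutlass::arch::OpMultiplyAddDequantizeInterleavedBToA;
using cutlass::arch::TagOperator;
using cutlass::WeightOnlyQuantOp;  // assumption: enum lives in namespace cutlass

// Tag the plain operator with a fine-grained, scale-only quantization mode.
using Tagged =
    TagOperator<OpMultiplyAddDequantizeInterleavedBToA,
                WeightOnlyQuantOp::FINEGRAINED_SCALE_ONLY>::TaggedOperator;

// Detagging recovers the original operator...
static_assert(std::is_same_v<DetagOperator<Tagged>::Operator,
                             OpMultiplyAddDequantizeInterleavedBToA>);
// ...and the quantization mode that was tagged onto it.
static_assert(DetagOperator<Tagged>::QuantOp ==
              WeightOnlyQuantOp::FINEGRAINED_SCALE_ONLY);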
@@ -67,35 +67,4 @@ inline int compute_occupancy_for_kernel() {
   return max_active_blocks;
 }

-template <typename GemmKernel>
-inline int compute_occupancy_for_kernel2() {
-  int smem_size = static_cast<int>(sizeof(typename GemmKernel::SharedStorage));
-
-  if (smem_size > (48 << 10)) {
-    cudaError_t status =
-        cudaFuncSetAttribute(cutlass::Kernel2<GemmKernel>,
-                             cudaFuncAttributeMaxDynamicSharedMemorySize,
-                             smem_size);
-    if (status == cudaError::cudaErrorInvalidValue) {
-      // Clear the error bit since we can ignore this.
-      // This should mean that smem_size >
-      // cudaDevAttrMaxSharedMemoryPerBlockOptin. In that case, we return an
-      // occupancy of 0. This will cause the heuristic to ignore this
-      // configuration.
-      status = cudaGetLastError();
-      return 0;
-    }
-    check_cuda_error(status);
-  }
-
-  int max_active_blocks = -1;
-  check_cuda_error(cudaOccupancyMaxActiveBlocksPerMultiprocessor(
-      &max_active_blocks,
-      cutlass::Kernel2<GemmKernel>,
-      GemmKernel::kThreadCount,
-      smem_size));
-
-  return max_active_blocks;
-}
-
 } // namespace phi

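The deleted compute_occupancy_for_kernel2 helper followed the standard CUDA occupancy-probe pattern, which the surviving compute_occupancy_for_kernel still uses: opt in to dynamic shared memory beyond the 48 KiB static limit, treat cudaErrorInvalidValue as "this configuration cannot run on this device" (occupancy 0), then query blocks per SM. A standalone sketch of that pattern follows, with an illustrative kernel and no CUTLASS or phi dependencies:

#include <cstdio>
#include <cuda_runtime.h>

// Stand-in for cutlass::Kernel2<GemmKernel>; illustrative only.
__global__ void dummy_kernel() {}

// Returns blocks per SM for the kernel at a given dynamic smem size, or 0 if
// the device cannot grant that much shared memory (so a tile-selection
// heuristic can skip the configuration).
int probe_occupancy(int smem_size, int threads_per_block) {
  if (smem_size > (48 << 10)) {
    // Beyond 48 KiB the kernel must opt in to the larger carve-out explicitly.
    cudaError_t status = cudaFuncSetAttribute(
        dummy_kernel, cudaFuncAttributeMaxDynamicSharedMemorySize, smem_size);
    if (status == cudaErrorInvalidValue) {
      (void)cudaGetLastError();  // clear the sticky error and skip this config
      return 0;
    }
  }
  int max_active_blocks = -1;
  cudaOccupancyMaxActiveBlocksPerMultiprocessor(
      &max_active_blocks, dummy_kernel, threads_per_block, smem_size);
  return max_active_blocks;
}

int main() {
  printf("blocks/SM at 64 KiB smem, 128 threads: %d\n",
         probe_occupancy(64 << 10, 128));
  return 0;
}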