
Commit

cutlass3.0
humingqing authored and humingqing committed Jan 2, 2024
1 parent ef35202 commit 9293d34
Showing 47 changed files with 3,401 additions and 3,619 deletions.
5 changes: 3 additions & 2 deletions cmake/cuda.cmake
@@ -290,10 +290,10 @@ set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} ${NVCC_FLAGS_EXTRA}")
 message(STATUS "NVCC_FLAGS_EXTRA: ${NVCC_FLAGS_EXTRA}, NVCC_ARCH_BIN: ${NVCC_ARCH_BIN}")

 # Set C++14 support
-set(CUDA_PROPAGATE_HOST_FLAGS OFF)
+set(CUDA_PROPAGATE_HOST_FLAGS ON)
 # Release/Debug flags set by cmake. Such as -O3 -g -DNDEBUG etc.
 # So, don't set these flags here.
-set(CMAKE_CUDA_STANDARD 14)
+set(CMAKE_CUDA_STANDARD 17)

 # (Note) For Windows, if /W[1-4] is deleted, /W1 will be added by default and conflict with -w
 # So replace /W[1-4] with /W0
@@ -321,6 +321,7 @@ if(WIN32)
 endforeach()
 endif()
 endif()
+message(STATUS "CMAKE_CUDA_FLAGS: ${CMAKE_CUDA_FLAGS}")

 mark_as_advanced(CUDA_BUILD_CUBIN CUDA_BUILD_EMULATION CUDA_VERBOSE_BUILD)
 mark_as_advanced(CUDA_SDK_ROOT_DIR CUDA_SEPARABLE_COMPILATION)
4 changes: 2 additions & 2 deletions cmake/external/cutlass.cmake
@@ -16,9 +16,9 @@ include(ExternalProject)

 set(CUTLASS_PREFIX_DIR ${THIRD_PARTY_PATH}/cutlass)
 set(CUTLASS_REPOSITORY https://github.com/NVIDIA/cutlass.git)
-set(CUTLASS_TAG v2.11.0)
+set(CUTLASS_TAG v3.3.0)

-set(CUTLASS_SOURCE_DIR ${PADDLE_SOURCE_DIR}/third_party/cutlass)
+set(CUTLASS_SOURCE_DIR ${THIRD_PARTY_PATH}/cutlass/src/extern_cutlass)
 include_directories("${CUTLASS_SOURCE_DIR}/")
 include_directories("${CUTLASS_SOURCE_DIR}/include/")
 include_directories("${CUTLASS_SOURCE_DIR}/tools/util/include/")
4 changes: 2 additions & 2 deletions cmake/external/flashattn.cmake
@@ -19,14 +19,14 @@ add_definitions(-DPADDLE_WITH_FLASHATTN)
 set(FLASHATTN_PREFIX_DIR ${THIRD_PARTY_PATH}/flashattn)
 set(FLASHATTN_SOURCE_SUBDIR csrc)
 set(FLASHATTN_INSTALL_DIR ${THIRD_PARTY_PATH}/install/flashattn)
-set(SOURCE_DIR ${PADDLE_SOURCE_DIR}/third_party/flashattn)
+set(SOURCE_DIR ${THIRD_PARTY_PATH}/flashattn/src/extern_flashattn)
 #set(FLASHATTN_TAG 0598fa245bbfb8c4462002600864518c0e37e714)
 set(FLASHATTN_TAG 705e8c69fe1511aa6abd4bfea493f24e119193ee)
 set(FLASHATTN_INCLUDE_DIR
   "${FLASHATTN_INSTALL_DIR}/include"
   CACHE PATH "flash-attn Directory" FORCE)
 set(FLASHATTN_LIB_DIR
-  "${FLASHATTN_INSTALL_DIR}/lib"
+  "${FLASHATTN_INSTALL_DIR}/lib"
   CACHE PATH "flash-attn Library Directory" FORCE)

 if(WIN32)
2 changes: 1 addition & 1 deletion cmake/flags.cmake
@@ -35,7 +35,7 @@ endfunction()

 checkcompilercxx14flag()
 if(NOT WIN32)
-  set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++14")
+  set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++17")
 else()
   set(CMAKE_CXX_STANDARD 14)
 endif()
3 changes: 1 addition & 2 deletions paddle/phi/core/dense_tensor.h
@@ -216,8 +216,7 @@ class DenseTensor : public TensorBase,
 };

 protected:
-  std::shared_ptr<InplaceVersion> inplace_version_counter_{
-      std::make_shared<InplaceVersion>()};
+  std::shared_ptr<InplaceVersion> inplace_version_counter_ = std::make_shared<InplaceVersion>();

 /* @jim19930609: This is a hack
 In general, it is badly designed to fuse MKLDNN-specific objects into a
46 changes: 23 additions & 23 deletions paddle/phi/kernels/CMakeLists.txt
@@ -137,27 +137,27 @@ endif()
 file(GLOB kernel_xpu "xpu/*.cc" "selected_rows/xpu/*.cc" "fusion/xpu/*.cc")

 if(WITH_CUTLASS)
-  add_definitions("-DPADDLE_WITH_MEMORY_EFFICIENT_ATTENTION") # for memory_efficient_attention.h
-  execute_process(
-    COMMAND
-      ${PYTHON_EXECUTABLE}
-      ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/fusion/cutlass/memory_efficient_attention/generate_kernels.py
-      --cuda_arch "${NVCC_ARCH_BIN}"
-    RESULT_VARIABLE memory_efficient_attention_gen_res)
-
-  execute_process(
-    COMMAND
-      ${PYTHON_EXECUTABLE}
-      ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/fusion/cutlass/memory_efficient_attention/generate_variable_forward_kernels.py
-      --cuda_arch "${NVCC_ARCH_BIN}"
-    RESULT_VARIABLE memory_efficient_attention_gen_res)
-
-  if(NOT memory_efficient_attention_gen_res EQUAL 0)
-    message(
-      FATAL_ERROR
-        "The memory efficient attention kernel generation errors with NVCC_ARCH_BIN=${NVCC_ARCH_BIN}"
-    )
-  endif()
+  # add_definitions("-DPADDLE_WITH_MEMORY_EFFICIENT_ATTENTION") # for memory_efficient_attention.h
+  # execute_process(
+  #   COMMAND
+  #     ${PYTHON_EXECUTABLE}
+  #     ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/fusion/cutlass/memory_efficient_attention/generate_kernels.py
+  #     --cuda_arch "${NVCC_ARCH_BIN}"
+  #   RESULT_VARIABLE memory_efficient_attention_gen_res)
+
+  # execute_process(
+  #   COMMAND
+  #     ${PYTHON_EXECUTABLE}
+  #     ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/fusion/cutlass/memory_efficient_attention/generate_variable_forward_kernels.py
+  #     --cuda_arch "${NVCC_ARCH_BIN}"
+  #   RESULT_VARIABLE memory_efficient_attention_gen_res)
+
+  # if(NOT memory_efficient_attention_gen_res EQUAL 0)
+  #   message(
+  #     FATAL_ERROR
+  #       "The memory efficient attention kernel generation errors with NVCC_ARCH_BIN=${NVCC_ARCH_BIN}"
+  #   )
+  # endif()

 execute_process(
   COMMAND
@@ -183,8 +183,8 @@ if(WITH_CUTLASS)
   GLOB cutlass_cu
   RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}"
   "fusion/cutlass/*.cu"
-  "fusion/cutlass/memory_efficient_attention/autogen/impl/*.cu"
-  "fusion/cutlass/memory_efficient_attention/autogen_variable/impl/*.cu"
+  # "fusion/cutlass/memory_efficient_attention/autogen/impl/*.cu"
+  # "fusion/cutlass/memory_efficient_attention/autogen_variable/impl/*.cu"
   "fusion/cutlass/cutlass_kernels/*.cu"
   "fusion/cutlass/cutlass_kernels/fpA_intB_gemm/autogen/*.cu"
   "fusion/cutlass/cutlass_kernels/fpA_intB_gemm/*.cu"
71 changes: 71 additions & 0 deletions paddle/phi/kernels/fusion/cutlass/cutlass_extensions/arch/mma.h
@@ -33,6 +33,7 @@ limitations under the License. */
 */

 #pragma once
+#include "paddle/phi/kernels/fusion/cutlass/cutlass_extensions/weight_only_quant_op.h"

 /////////////////////////////////////////////////////////////////////////////////////////////////

@@ -42,5 +42,75 @@ namespace arch {
 // Tag which triggers MMA which will trigger
 struct OpMultiplyAddDequantizeInterleavedBToA;

+/*
+  Below we have extra tags to signal what kind of dequantization we want to do
+  (per col, scale only fine grained, finegrained with zero). This still lets us
+  use the existing template infrastructure (incl. that in CUTLASS). However, we
+  split out the template below into OpMultiplyAddDequantizeInterleavedBToA along
+  with the quantization op before instantiating the GEMM pieces.
+  Note that this is somewhat of a hack, but it SIGNIFICANTLY reduces the amount
+  of code we need to duplicate.
+*/
+struct OpMultiplyAddDequantizeInterleavedBToA_percol_scale;
+struct OpMultiplyAddDequantizeInterleavedBToA_fine_scale;
+struct OpMultiplyAddDequantizeInterleavedBToA_fine_scalebias;
+
+// The default just forwards the original operator
+template <typename MmaOp, WeightOnlyQuantOp QuantOp_>
+struct TagOperator {
+  using TaggedOperator = MmaOp;
+};
+
+// Specializations below attach more information to the operator
+template <>
+struct TagOperator<OpMultiplyAddDequantizeInterleavedBToA,
+                   WeightOnlyQuantOp::PER_COLUMN_SCALE_ONLY> {
+  using TaggedOperator = OpMultiplyAddDequantizeInterleavedBToA_percol_scale;
+};
+
+template <>
+struct TagOperator<OpMultiplyAddDequantizeInterleavedBToA,
+                   WeightOnlyQuantOp::FINEGRAINED_SCALE_ONLY> {
+  using TaggedOperator = OpMultiplyAddDequantizeInterleavedBToA_fine_scale;
+};
+
+template <>
+struct TagOperator<OpMultiplyAddDequantizeInterleavedBToA,
+                   WeightOnlyQuantOp::FINEGRAINED_SCALE_AND_ZEROS> {
+  using TaggedOperator = OpMultiplyAddDequantizeInterleavedBToA_fine_scalebias;
+};
+
+// Here we instantiate some structs to "detag" the tagged operator. It splits it
+// back to the original operator + the extra information. If no extra info was
+// tagged, the dequant op defaults to per-column scaling.
+template <typename TaggedMmaOp>
+struct DetagOperator {
+  using Operator = TaggedMmaOp;
+  static constexpr WeightOnlyQuantOp QuantOp =
+      WeightOnlyQuantOp::PER_COLUMN_SCALE_ONLY;
+};
+
+template <>
+struct DetagOperator<OpMultiplyAddDequantizeInterleavedBToA_percol_scale> {
+  using Operator = OpMultiplyAddDequantizeInterleavedBToA;
+  static constexpr WeightOnlyQuantOp QuantOp =
+      WeightOnlyQuantOp::PER_COLUMN_SCALE_ONLY;
+};
+
+template <>
+struct DetagOperator<OpMultiplyAddDequantizeInterleavedBToA_fine_scale> {
+  using Operator = OpMultiplyAddDequantizeInterleavedBToA;
+  static constexpr WeightOnlyQuantOp QuantOp =
+      WeightOnlyQuantOp::FINEGRAINED_SCALE_ONLY;
+};
+
+template <>
+struct DetagOperator<OpMultiplyAddDequantizeInterleavedBToA_fine_scalebias> {
+  using Operator = OpMultiplyAddDequantizeInterleavedBToA;
+  static constexpr WeightOnlyQuantOp QuantOp =
+      WeightOnlyQuantOp::FINEGRAINED_SCALE_AND_ZEROS;
+};
+
 } // namespace arch
 } // namespace cutlass
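For orientation, the tag/detag pair above is meant to round-trip: a GEMM template tags its MMA operator with a quantization mode, threads the single tagged type through the shared template machinery, and detags it when instantiating the actual kernel pieces. Below is a minimal sketch of that round trip; it is not part of the commit, and it assumes only the declarations shown above plus the WeightOnlyQuantOp enum from weight_only_quant_op.h (assumed here to be declared in namespace cutlass). The std::is_same_v check also leans on the C++17 bump made elsewhere in this commit.

// Sketch only -- exercises TagOperator/DetagOperator from the diff above.
#include <type_traits>

using cutlass::arch::DetagOperator;
using cutlass::arch::OpMultiplyAddDequantizeInterleavedBToA;
using cutlass::arch::TagOperator;
using cutlass::WeightOnlyQuantOp;  // assumption: enum lives in namespace cutlass

// Tag the plain operator with a fine-grained, scale-only quantization mode.
using Tagged =
    TagOperator<OpMultiplyAddDequantizeInterleavedBToA,
                WeightOnlyQuantOp::FINEGRAINED_SCALE_ONLY>::TaggedOperator;

// Detagging recovers the original operator...
static_assert(std::is_same_v<DetagOperator<Tagged>::Operator,
                             OpMultiplyAddDequantizeInterleavedBToA>);
// ...and the quantization mode that was tagged onto it.
static_assert(DetagOperator<Tagged>::QuantOp ==
              WeightOnlyQuantOp::FINEGRAINED_SCALE_ONLY);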
@@ -67,35 +67,4 @@ inline int compute_occupancy_for_kernel() {
   return max_active_blocks;
 }

-template <typename GemmKernel>
-inline int compute_occupancy_for_kernel2() {
-  int smem_size = static_cast<int>(sizeof(typename GemmKernel::SharedStorage));
-
-  if (smem_size > (48 << 10)) {
-    cudaError_t status =
-        cudaFuncSetAttribute(cutlass::Kernel2<GemmKernel>,
-                             cudaFuncAttributeMaxDynamicSharedMemorySize,
-                             smem_size);
-    if (status == cudaError::cudaErrorInvalidValue) {
-      // Clear the error bit since we can ignore this.
-      // This should mean that smem_size >
-      // cudaDevAttrMaxSharedMemoryPerBlockOptin. In that case, we return an
-      // occupancy of 0. This will cause the heuristic to ignore this
-      // configuration.
-      status = cudaGetLastError();
-      return 0;
-    }
-    check_cuda_error(status);
-  }
-
-  int max_active_blocks = -1;
-  check_cuda_error(cudaOccupancyMaxActiveBlocksPerMultiprocessor(
-      &max_active_blocks,
-      cutlass::Kernel2<GemmKernel>,
-      GemmKernel::kThreadCount,
-      smem_size));
-
-  return max_active_blocks;
-}
-
 } // namespace phi

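The deleted compute_occupancy_for_kernel2 helper followed the standard CUDA occupancy-probe pattern, which the surviving compute_occupancy_for_kernel still uses: opt in to dynamic shared memory beyond the 48 KiB static limit, treat cudaErrorInvalidValue as "this configuration cannot run on this device" (occupancy 0), then query blocks per SM. A standalone sketch of that pattern follows, with an illustrative kernel and no CUTLASS or phi dependencies:

#include <cstdio>
#include <cuda_runtime.h>

// Stand-in for cutlass::Kernel2<GemmKernel>; illustrative only.
__global__ void dummy_kernel() {}

// Returns blocks per SM for the kernel at a given dynamic smem size, or 0 if
// the device cannot grant that much shared memory (so a tile-selection
// heuristic can skip the configuration).
int probe_occupancy(int smem_size, int threads_per_block) {
  if (smem_size > (48 << 10)) {
    // Beyond 48 KiB the kernel must opt in to the larger carve-out explicitly.
    cudaError_t status = cudaFuncSetAttribute(
        dummy_kernel, cudaFuncAttributeMaxDynamicSharedMemorySize, smem_size);
    if (status == cudaErrorInvalidValue) {
      (void)cudaGetLastError();  // clear the sticky error and skip this config
      return 0;
    }
  }
  int max_active_blocks = -1;
  cudaOccupancyMaxActiveBlocksPerMultiprocessor(
      &max_active_blocks, dummy_kernel, threads_per_block, smem_size);
  return max_active_blocks;
}

int main() {
  printf("blocks/SM at 64 KiB smem, 128 threads: %d\n",
         probe_occupancy(64 << 10, 128));
  return 0;
}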