Optimization for AlphaFold2 model (Dao-AILab#4)
* add for alpha_fold2

* add some extra settings

* fix some bugs

* fix some changes

* fix some bugs 2nd

* Add another initialization of Gmem_tile_qkv and Gmem_tile_o

* add some compensation for try..catch

* fix mistake in flash_attn_fwd

* commit for code style and bug check

* fix some bugs for flash_attn_with_bias-mask

* add more print for pointer debug

* add some bug test cases.

* backward function

* fix bugs

* make some changes for backward

* Fix compiling error.

* comment out all printf debug output

* comment out all printf debug output and fix interface error

* comment out all printf debug output and fix interface error, fix typo

* remove all printf

* split files

* remove useless debug code

* split fwd and bwd execution function

* split fwd and bwd execution function

* remove useless code

* remove useless code

* remove useless code, 3rd time

* remove useless code, 4th time

* Fix compiling error.

* Remove const.
JamesLim-sy authored May 8, 2023
1 parent 5ff4bbf commit 209f02b
Showing 27 changed files with 3,033 additions and 13 deletions.
1 change: 1 addition & 0 deletions csrc/flash_attn/CMakeLists.txt
@@ -22,6 +22,7 @@ add_library(flashattn SHARED
${SOURCES_CU}
${SOURCES_CPP}
flash_attn.cpp
flash_attn_with_bias_mask.cpp
)

target_compile_options(flashattn PRIVATE $<$<COMPILE_LANGUAGE:CUDA>:
3 changes: 2 additions & 1 deletion csrc/flash_attn/flash_attn.cpp
@@ -26,6 +26,7 @@
*
******************************************************************************/

#include "flash_attn.h"
#include "fmha.h"
#include "utils.h"
#include "cuda_utils.h"
@@ -62,7 +63,7 @@ extern "C" {

static thread_local std::unique_ptr<char[]> flash_attn_err_msg;

static void flash_attn_set_error(const char *msg) {
void flash_attn_set_error(const char *msg) {
    if (msg == nullptr || *msg == '\0') {
        msg = "unknown error";
    }
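Making flash_attn_set_error non-static here (and declaring it in flash_attn.h below) lets the new flash_attn_with_bias_mask.cpp translation unit record failures in the same thread-local buffer that callers read back through flash_attn_error(). A minimal producer-side sketch, assuming only the declarations added in this commit; the check_cuda helper name is illustrative, not part of the change:

#include <cuda_runtime.h>
#include "flash_attn.h"

// Store the CUDA error string in the shared thread-local message and signal
// failure to the caller (a sketch of how a kernel-launching translation unit
// might use the newly exported flash_attn_set_error()).
static bool check_cuda(cudaError_t status) {
    if (status != cudaSuccess) {
        flash_attn_set_error(cudaGetErrorString(status));
        return false;
    }
    return true;
}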
73 changes: 73 additions & 0 deletions csrc/flash_attn/flash_attn.h
@@ -70,6 +70,79 @@ bool flash_attn_bwd(
    uint64_t offset
);

bool flash_attn_fwd_with_bias_and_mask(
    const void *q, // total_q x num_heads x head_size, total_q := \sum_{i=0}^{b} s_i
    const void *k, // total_k x num_heads x head_size, total_k := \sum_{i=0}^{b} s_i
    const void *v, // total_k x num_heads x head_size, total_k := \sum_{i=0}^{b} s_i
    void *out, // total_q x num_heads x head_size, total_q := \sum_{i=0}^{b} s_i
    const int32_t *cu_seqlens_q, // int32, batch_size+1, starting offset of each sequence
    const int32_t *cu_seqlens_k, // int32, batch_size+1, starting offset of each sequence
    const int total_q,
    const int total_k,
    const int batch_size,
    const int num_heads,
    const int head_size,
    const int max_seqlen_q_,
    const int max_seqlen_k_,
    const float p_dropout,
    const float softmax_scale,
    const bool zero_tensors,
    const bool is_causal,
    const bool is_bf16,
    const int num_splits, // SMs per attention matrix, can be 1
    void *softmax_lse_ptr, // softmax log_sum_exp
    void *softmax_ptr,
    void *workspace_ptr,
    uint64_t *workspace_size,
    cudaStream_t stream,
    uint64_t seed,
    uint64_t offset,
    const void *attn_mask,
    const void *attn_bias,
    const int64_t* mask_dims,
    const int64_t* bias_dims
);

bool flash_attn_bwd_with_bias_and_mask(
    const void *q, // total_q x num_heads x head_size, total_q := \sum_{i=0}^{b} s_i
    const void *k, // total_k x num_heads x head_size, total_k := \sum_{i=0}^{b} s_i
    const void *v, // total_k x num_heads x head_size, total_k := \sum_{i=0}^{b} s_i
    void *dq, // total_q x num_heads x head_size, total_q := \sum_{i=0}^{b} s_i
    void *dk, // total_k x num_heads x head_size, total_k := \sum_{i=0}^{b} s_i
    void *dv, // total_k x num_heads x head_size, total_k := \sum_{i=0}^{b} s_i
    const void *out, // total_q x num_heads x head_size, total_q := \sum_{i=0}^{b} s_i
    const void *dout, // total_q x num_heads x head_size
    const int32_t *cu_seqlens_q, // int32, batch_size+1
    const int32_t *cu_seqlens_k, // int32, batch_size+1
    const int total_q,
    const int total_k,
    const int batch_size,
    const int num_heads,
    const int head_size,
    const int max_seqlen_q_,
    const int max_seqlen_k_,
    const float p_dropout,
    const float softmax_scale,
    const bool zero_tensors,
    const bool is_causal,
    const bool is_bf16,
    const int num_splits,
    const void *softmax_lse_ptr,
    void *dsoftmax_ptr,
    void *dbias_ptr,
    void *workspace_ptr,
    uint64_t *workspace_size,
    cudaStream_t stream,
    uint64_t seed,
    uint64_t offset,
    const void* attn_mask,
    const void* attn_bias,
    const int64_t* mask_dims,
    const int64_t* bias_dims
);

void flash_attn_set_error(const char *msg);

const char *flash_attn_error();

#ifdef __cplusplus
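The declaration above, flash_attn_fwd_with_bias_and_mask, is the new C entry point for the forward pass with an additive attention bias and a mask, as used by AlphaFold2-style models. Below is a hedged caller-side sketch: the wrapper name, the default flag values, and the treatment of workspace_ptr/workspace_size are illustrative assumptions, not part of the commit.

#include <cmath>
#include <cstdint>
#include <cstdio>
#include <cuda_runtime.h>
#include "flash_attn.h"

// Illustrative wrapper: forwards pre-allocated device buffers to the new
// forward entry point with dropout disabled and num_splits = 1, and reports
// failures via flash_attn_error().
bool run_fwd_with_bias_and_mask(
        const void *q, const void *k, const void *v, void *out,
        const int32_t *cu_seqlens_q, const int32_t *cu_seqlens_k,
        int total_q, int total_k, int batch_size, int num_heads, int head_size,
        int max_seqlen_q, int max_seqlen_k,
        void *softmax_lse, const void *attn_mask, const void *attn_bias,
        const int64_t *mask_dims, const int64_t *bias_dims,
        void *workspace, uint64_t *workspace_size, cudaStream_t stream) {
    // 1/sqrt(head_size) is the conventional scaling; the API leaves it to the caller.
    const float softmax_scale = 1.0f / std::sqrt(static_cast<float>(head_size));
    const bool ok = flash_attn_fwd_with_bias_and_mask(
        q, k, v, out, cu_seqlens_q, cu_seqlens_k,
        total_q, total_k, batch_size, num_heads, head_size,
        max_seqlen_q, max_seqlen_k,
        /*p_dropout=*/0.0f, softmax_scale,
        /*zero_tensors=*/false, /*is_causal=*/false, /*is_bf16=*/false,
        /*num_splits=*/1, softmax_lse,
        /*softmax_ptr=*/nullptr,  // assumed optional when the full softmax is not returned
        workspace, workspace_size, stream,
        /*seed=*/0, /*offset=*/0,
        attn_mask, attn_bias, mask_dims, bias_dims);
    if (!ok) {
        std::fprintf(stderr, "flash_attn fwd error: %s\n", flash_attn_error());
    }
    return ok;
}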

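A matching sketch for flash_attn_bwd_with_bias_and_mask, again with illustrative defaults: it assumes the softmax_lse buffer written by the forward call is passed back in, and that dq/dk/dv (and dbias, when a bias gradient is wanted) are pre-allocated to the shapes noted in the declaration.

#include <cmath>
#include <cstdint>
#include <cstdio>
#include <cuda_runtime.h>
#include "flash_attn.h"

// Illustrative wrapper around the backward entry point: gradients dq/dk/dv
// (and optionally dbias) are written in place; failures are reported through
// flash_attn_error().
bool run_bwd_with_bias_and_mask(
        const void *q, const void *k, const void *v,
        void *dq, void *dk, void *dv,
        const void *out, const void *dout,
        const int32_t *cu_seqlens_q, const int32_t *cu_seqlens_k,
        int total_q, int total_k, int batch_size, int num_heads, int head_size,
        int max_seqlen_q, int max_seqlen_k,
        const void *softmax_lse, void *dsoftmax, void *dbias,
        const void *attn_mask, const void *attn_bias,
        const int64_t *mask_dims, const int64_t *bias_dims,
        void *workspace, uint64_t *workspace_size, cudaStream_t stream) {
    // The scale must match the one used in the forward pass.
    const float softmax_scale = 1.0f / std::sqrt(static_cast<float>(head_size));
    const bool ok = flash_attn_bwd_with_bias_and_mask(
        q, k, v, dq, dk, dv, out, dout,
        cu_seqlens_q, cu_seqlens_k,
        total_q, total_k, batch_size, num_heads, head_size,
        max_seqlen_q, max_seqlen_k,
        /*p_dropout=*/0.0f, softmax_scale,
        /*zero_tensors=*/false, /*is_causal=*/false, /*is_bf16=*/false,
        /*num_splits=*/1, softmax_lse, dsoftmax, dbias,
        workspace, workspace_size, stream,
        /*seed=*/0, /*offset=*/0,
        attn_mask, attn_bias, mask_dims, bias_dims);
    if (!ok) {
        std::fprintf(stderr, "flash_attn bwd error: %s\n", flash_attn_error());
    }
    return ok;
}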