
Commit 0ffdc94

Merge remote-tracking branch 'upstream/main' into lwilkinson/upstream-sync-4
Signed-off-by: Lucas Wilkinson <lwilkinson@neuralmagic.com>
2 parents d637d89 + d836a6b commit 0ffdc94

File tree: 64 files changed, +485 −217 lines


csrc/cutlass

Submodule cutlass updated 349 files

flash_attn/modules/mha.py

Lines changed: 10 additions & 37 deletions
@@ -23,9 +23,9 @@
 flash_attn_with_kvcache = None

 try:
-    from flash_attn.ops.fused_dense import ColumnParallelLinear, FusedDense, RowParallelLinear
+    from flash_attn.ops.fused_dense import ColumnParallelLinear, RowParallelLinear
 except ImportError:
-    FusedDense, ColumnParallelLinear, RowParallelLinear = None, None, None
+    ColumnParallelLinear, RowParallelLinear = None, None, None

 try:
     from flash_attn.layers.rotary import RotaryEmbedding
@@ -341,13 +341,6 @@ def forward(self, q, kv, causal=None, key_padding_mask=None):
         return output


-class LinearResidual(nn.Linear):
-    """Wrap nn.Linear to return the residual as well. For compatibility with FusedDense."""
-
-    def forward(self, input: torch.Tensor) -> torch.Tensor:
-        return super().forward(input), input
-
-
 def _update_kv_cache(kv, inference_params, layer_idx):
     """kv: (batch_size, seqlen, 2, nheads, head_dim) or (batch_size, 1, 2, nheads, head_dim)"""
     # Pre-allocate memory for key-values for inference.
@@ -452,13 +445,6 @@ def __init__(
                 device=device,
             )

-        if fused_bias_fc and FusedDense is None:
-            raise ImportError("fused_dense is not installed")
-        linear_cls = nn.Linear if not fused_bias_fc else FusedDense
-        linear_resid_cls = (
-            LinearResidual if not fused_bias_fc else partial(FusedDense, return_residual=True)
-        )
-        wqkv_cls = linear_cls if not self.return_residual else linear_resid_cls
         inner_attn_cls = (
             partial(FlashSelfAttention, alibi_slopes=alibi_slopes, window_size=window_size)
             if use_flash_attn
@@ -470,10 +456,10 @@ def __init__(
             else CrossAttention
         )
         if not self.cross_attn:
-            self.Wqkv = wqkv_cls(embed_dim, qkv_dim, bias=qkv_proj_bias, **factory_kwargs)
+            self.Wqkv = nn.Linear(embed_dim, qkv_dim, bias=qkv_proj_bias, **factory_kwargs)
         else:
-            self.Wq = linear_cls(embed_dim, embed_dim, bias=qkv_proj_bias, **factory_kwargs)
-            self.Wkv = wqkv_cls(embed_dim, kv_dim, bias=qkv_proj_bias, **factory_kwargs)
+            self.Wq = nn.Linear(embed_dim, embed_dim, bias=qkv_proj_bias, **factory_kwargs)
+            self.Wkv = nn.Linear(embed_dim, kv_dim, bias=qkv_proj_bias, **factory_kwargs)
         if self.dwconv:
             if self.num_heads_kv == self.num_heads:
                 self.dwconv_qkv = nn.Conv1d(
@@ -492,7 +478,7 @@ def __init__(
         self.inner_cross_attn = inner_cross_attn_cls(
             causal=causal, softmax_scale=softmax_scale, attention_dropout=dropout
         )
-        self.out_proj = linear_cls(embed_dim, embed_dim, bias=out_proj_bias, **factory_kwargs)
+        self.out_proj = nn.Linear(embed_dim, embed_dim, bias=out_proj_bias, **factory_kwargs)

     def allocate_inference_cache(self, batch_size, max_seqlen, dtype=None):
         dtype = self.out_proj.weight.dtype if dtype is None else dtype
@@ -646,10 +632,7 @@ def forward(
         batch, seqlen = x.shape[:2]
         if not self.cross_attn and self.num_heads_kv == self.num_heads:
             assert x_kv is None and mixer_subset is None
-            if not self.return_residual:
-                qkv = self.Wqkv(x)
-            else:
-                qkv, x = self.Wqkv(x)
+            qkv = self.Wqkv(x)
             if self.dwconv:
                 qkv = rearrange(
                     self.dwconv_qkv(rearrange(qkv, "b s d -> b d s"))[..., :-2], "b d s -> b s d"
@@ -680,21 +663,11 @@ def forward(
                 )
         else:
             if self.cross_attn:
-                if not self.return_residual:
-                    q = self.Wq(x if mixer_subset is None else x[:, mixer_subset])
-                    kv = self.Wkv(x_kv if x_kv is not None else x)
-                else:
-                    if x_kv is not None:
-                        kv, x_kv = self.Wkv(x_kv)
-                    else:
-                        kv, x = self.Wkv(x)
-                    q = self.Wq(x if mixer_subset is None else x[:, mixer_subset])
+                q = self.Wq(x if mixer_subset is None else x[:, mixer_subset])
+                kv = self.Wkv(x_kv if x_kv is not None else x)
             else:
                 assert self.num_heads_kv != self.num_heads
-                if not self.return_residual:
-                    qkv = self.Wqkv(x)
-                else:
-                    qkv, x = self.Wqkv(x)
+                qkv = self.Wqkv(x)
                 q = qkv[..., : self.num_heads * self.head_dim]
                 kv = qkv[..., self.num_heads * self.head_dim :]
                 q = rearrange(q, "... (h d) -> ... h d", d=self.head_dim)
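
For orientation, a minimal sketch (hypothetical sizes, not part of the diff) of the projection path this change keeps: Wqkv is now always a plain nn.Linear, and its output is sliced into q / kv exactly as in the forward() hunk above.

    import torch
    import torch.nn as nn
    from einops import rearrange

    # Hypothetical illustrative dimensions.
    embed_dim, num_heads, num_heads_kv, head_dim = 256, 8, 2, 32
    qkv_dim = head_dim * (num_heads + 2 * num_heads_kv)

    Wqkv = nn.Linear(embed_dim, qkv_dim, bias=True)   # plain Linear instead of FusedDense / LinearResidual
    x = torch.randn(4, 16, embed_dim)                 # (batch, seqlen, embed_dim)
    qkv = Wqkv(x)
    q = qkv[..., : num_heads * head_dim]
    kv = qkv[..., num_heads * head_dim :]
    q = rearrange(q, "... (h d) -> ... h d", d=head_dim)
    kv = rearrange(kv, "... (two hkv d) -> ... two hkv d", two=2, d=head_dim)
    print(q.shape, kv.shape)                          # (4, 16, 8, 32) and (4, 16, 2, 2, 32)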

flash_attn/ops/fused_dense.py

Lines changed: 1 addition & 1 deletion
@@ -11,9 +11,9 @@
 import torch.nn as nn
 import torch.nn.functional as F
 from torch import Tensor
-from torch.cuda.amp import custom_bwd, custom_fwd
 from torch.distributed import ProcessGroup

+from flash_attn.utils.torch import custom_fwd, custom_bwd
 from flash_attn.ops.activations import gelu_bwd, relu_bwd, sqrelu_bwd, sqrelu_fwd
 from flash_attn.utils.distributed import (
     all_gather_raw,

flash_attn/ops/triton/layer_norm.py

Lines changed: 8 additions & 4 deletions
@@ -10,11 +10,13 @@

 import torch
 import torch.nn.functional as F
-from torch.cuda.amp import custom_fwd, custom_bwd

 import triton
 import triton.language as tl

+from flash_attn.utils.torch import custom_fwd, custom_bwd
+
+
 def triton_autotune_configs():
     # Return configs with a valid warp count for the current device
     configs=[]
@@ -635,7 +637,9 @@ def _layer_norm_bwd(
     BLOCK_N = min(MAX_FUSED_SIZE, triton.next_power_of_2(N))
     if N > BLOCK_N:
         raise RuntimeError("This layer norm doesn't support feature dim >= 64KB.")
-    sm_count = torch.cuda.get_device_properties(x.device).multi_processor_count
+    # Increasing the multiple (e.g. 8) will allow more thread blocks to be launched and hide the
+    # latency of the gmem reads/writes, but will increase the time of summing up dw / db.
+    sm_count = torch.cuda.get_device_properties(x.device).multi_processor_count * 8
     _dw = torch.empty((sm_count, N), dtype=torch.float32, device=weight.device)
     _db = (
         torch.empty((sm_count, N), dtype=torch.float32, device=bias.device)
@@ -1018,12 +1022,12 @@ def forward(
             norm_bias,
             eps,
             residual,
-            out_dtype=None if not torch.is_autocast_enabled() else torch.get_autocast_gpu_dtype(),
+            out_dtype=None if not torch.is_autocast_enabled() else torch.get_autocast_dtype("cuda"),
             residual_dtype=residual_dtype,
             is_rms_norm=is_rms_norm,
         )
         y = y.reshape(x_shape_og)
-        dtype = torch.get_autocast_gpu_dtype() if torch.is_autocast_enabled() else y.dtype
+        dtype = torch.get_autocast_dtype("cuda") if torch.is_autocast_enabled() else y.dtype
         linear_weight = linear_weight.to(dtype)
         linear_bias = linear_bias.to(dtype) if linear_bias is not None else None
         out = F.linear(y.to(linear_weight.dtype), linear_weight, linear_bias)
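
Side note on the autocast change above: a small compatibility sketch (not part of the diff) of the device-generic query that replaces the deprecated torch.get_autocast_gpu_dtype(); it mirrors the out_dtype expression in the hunk.

    import torch

    def autocast_out_dtype():
        # Same expression as the out_dtype argument in the hunk above.
        return None if not torch.is_autocast_enabled() else torch.get_autocast_dtype("cuda")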

flash_attn/ops/triton/mlp.py

Lines changed: 1 addition & 1 deletion
@@ -4,8 +4,8 @@
 import torch
 import torch.nn as nn
 import torch.nn.functional as F
-from torch.cuda.amp import custom_bwd, custom_fwd

+from flash_attn.utils.torch import custom_fwd, custom_bwd
 from flash_attn.ops.activations import sqrelu_bwd, sqrelu_fwd
 from flash_attn.ops.triton.linear import triton_dgrad_act, triton_linear_act

flash_attn/ops/triton/rotary.py

Lines changed: 4 additions & 3 deletions
@@ -38,8 +38,8 @@ def rotary_kernel(
     BLOCK_M: tl.constexpr,
 ):
     pid_m = tl.program_id(axis=0)
-    pid_batch = tl.program_id(axis=1)
-    pid_head = tl.program_id(axis=2)
+    pid_head = tl.program_id(axis=1)
+    pid_batch = tl.program_id(axis=2)
     rotary_dim_half = rotary_dim // 2

     if not IS_VARLEN:
@@ -193,7 +193,7 @@ def apply_rotary(
         if rotary_dim <= 32
         else (64 if rotary_dim <= 64 else (128 if rotary_dim <= 128 else 256))
     )
-    grid = lambda META: (triton.cdiv(seqlen, META["BLOCK_M"]), batch, nheads)  # noqa
+    grid = lambda META: (triton.cdiv(seqlen, META["BLOCK_M"]), nheads, batch)  # noqa
     BLOCK_M = 4 if interleaved else (8 if rotary_dim <= 128 else 4)

     # Need this, otherwise Triton tries to launch from cuda:0 and we get
@@ -223,5 +223,6 @@
         interleaved,
         conjugate,
         BLOCK_M,
+        num_warps=2 if rotary_dim <= 64 else 4,
     )
     return output
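
A short sketch (illustrative values only) of the launch-grid convention after this reordering: axis 0 indexes seqlen blocks, axis 1 the head, and axis 2 the batch, matching the tl.program_id reads in rotary_kernel above.

    import triton

    seqlen, nheads, batch, BLOCK_M = 4096, 16, 2, 8
    grid = (triton.cdiv(seqlen, BLOCK_M), nheads, batch)   # (512, 16, 2)
    # Inside the kernel: program_id(0) -> seqlen block, program_id(1) -> head, program_id(2) -> batch.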

flash_attn/utils/torch.py

Lines changed: 21 additions & 0 deletions
@@ -0,0 +1,21 @@
+import torch
+from typing import Callable
+
+
+def custom_amp_decorator(dec: Callable, cuda_amp_deprecated: bool):
+    def decorator(*args, **kwargs):
+        if cuda_amp_deprecated:
+            kwargs["device_type"] = "cuda"
+        return dec(*args, **kwargs)
+    return decorator
+
+
+if hasattr(torch.amp, "custom_fwd"):  # type: ignore[attr-defined]
+    deprecated = True
+    from torch.amp import custom_fwd, custom_bwd  # type: ignore[attr-defined]
+else:
+    deprecated = False
+    from torch.cuda.amp import custom_fwd, custom_bwd
+
+custom_fwd = custom_amp_decorator(custom_fwd, deprecated)
+custom_bwd = custom_amp_decorator(custom_bwd, deprecated)
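
A usage sketch for this new shim (the _ScaledMatmul op below is hypothetical): applied bare, the wrapped decorators behave like torch.cuda.amp.custom_fwd/custom_bwd on older PyTorch and like torch.amp.custom_fwd/custom_bwd with device_type="cuda" on releases where the torch.cuda.amp variants are deprecated, which is how the other files in this commit consume them.

    import torch
    from flash_attn.utils.torch import custom_fwd, custom_bwd

    class _ScaledMatmul(torch.autograd.Function):
        @staticmethod
        @custom_fwd
        def forward(ctx, x, weight):
            # custom_fwd / custom_bwd keep autocast state consistent across forward and backward.
            ctx.save_for_backward(x, weight)
            return x @ weight.t()

        @staticmethod
        @custom_bwd
        def backward(ctx, grad_out):
            x, weight = ctx.saved_tensors
            # 2-D inputs assumed for brevity.
            return grad_out @ weight, grad_out.t() @ x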

hopper/benchmark_mla_decode.py

Lines changed: 11 additions & 10 deletions
@@ -36,15 +36,15 @@

 use_bench_cudagraph = False

-attn_variants = ["mha", "gqa", "mqa", "mla"]
-for attn_variant in attn_variants:
-# for attn_variant in attn_variants[3:]:
-    nheads_kv = nheads_q if attn_variant == "mha" else (max(nheads_q // 8, 1) if attn_variant == "gqa" else 1)
-    headdim = 64 if attn_variant == "mla" else 128
-    headdim_v = 512 if attn_variant == "mla" else headdim
-    has_qv = headdim == 64 and headdim_v == 512
+attn_variants = ["mha", "gqa", "mqa", "mla", "gla"]
+# for attn_variant in attn_variants:
+for attn_variant in attn_variants[3:5]:
+    nheads_kv = nheads_q if attn_variant == "mha" else (max(nheads_q // 8, 1) if attn_variant == "gqa" else (1 if attn_variant == "mla" else 2))
+    headdim = 64 if attn_variant in ["mla", "gla"] else 128
+    headdim_v = 512 if attn_variant == "mla" else (256 if attn_variant == "gla" else headdim)
+    has_qv = headdim == 64 and headdim_v > 64
     # page_size = None
-    page_size = 64 if attn_variant == "mla" else 128
+    page_size = 64 if attn_variant in ["mla", "gla"] else 128

     should_run_flashmla = attn_variant == "mla" and page_size == 64 and flash_mla_with_kvcache is not None

@@ -60,7 +60,7 @@
     print(f"\n{attn_variant.upper()}, nheads_q = {nheads_q}, nheads_kv = {nheads_kv}, headdim = {headdim}, headdim_v = {headdim_v}, page_size = {page_size}")

     for seqlen in [s * 1024 for s in [1, 2, 4, 8, 16, 32, 64]]:
-    # for seqlen in [s * 1024 for s in [1]]:
+    # for seqlen in [s * 1024 for s in [8]]:
         cache_seqlens = torch.tensor([seqlen] * batch_size, device=device, dtype=torch.int)
         num_splits = 0
         q = torch.randn(batch_size, seqlen_q, nheads_q, headdim, dtype=dtype, device=device)
@@ -84,6 +84,7 @@
             cache_seqlens, q.dtype, headdim_v=headdim_v, page_size=page_size, causal=True
         )
         # scheduler_metadata = None
+        # breakpoint()
         fn0 = lambda: flash_attn_with_kvcache(q, k_cache, v_cache, cache_seqlens=cache_seqlens, num_splits=num_splits, qv=qv, page_table=page_table, causal=True, scheduler_metadata=scheduler_metadata)
         time.sleep(1)  # to avoid power throttling
         # Time in ms
@@ -109,7 +110,7 @@
                 t1 = do_bench_cudagraph(fn1, rep=10)

         total_seqlen = seqlen * batch_size if cache_seqlens is None else cache_seqlens.sum().item()
-        mem_io = total_seqlen * nheads_kv * (headdim + headdim_v) * 2 + q.numel() * 2 + (qv.numel() * 2 if has_qv else 0) + q.numel() * headdim_v // headdim * 2  # last time is for the output
+        mem_io = total_seqlen * nheads_kv * (headdim + headdim_v) * 2 + q.numel() * 2 + (qv.numel() * 2 if has_qv else 0) + q.numel() * headdim_v // headdim * 2  # last term is for the output
         flops = seqlen_q * total_seqlen * nheads_q * (headdim + headdim_v * (2 if has_qv else 1)) * 2
         ideal_h100_time_mem = mem_io / 3.35e12 * 1e6
         ideal_h100_time_flop = flops / 989e12 * 1e6
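
For reference, a worked sketch (assuming a hypothetical nheads_q = 16) of the shapes the benchmark now sweeps for the two variants it actually iterates over (attn_variants[3:5]), mirroring the expressions in the hunk above.

    nheads_q = 16
    for attn_variant in ["mla", "gla"]:
        nheads_kv = 1 if attn_variant == "mla" else 2
        headdim = 64                                       # both variants use headdim 64
        headdim_v = 512 if attn_variant == "mla" else 256
        page_size = 64
        print(attn_variant, nheads_kv, headdim, headdim_v, page_size)
    # mla 1 64 512 64
    # gla 2 64 256 64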

hopper/flash.h

Lines changed: 1 addition & 0 deletions
@@ -112,6 +112,7 @@ struct Flash_fwd_params : public Qkv_params {
     // The cos and sin matrices for rotary embedding.
     void * __restrict__ rotary_cos_ptr;
     void * __restrict__ rotary_sin_ptr;
+    int *__restrict__ seqlens_rotary;

     // The indices to index into the KV cache.
     int * __restrict__ kv_batch_idx;

hopper/flash_api.cpp

Lines changed: 31 additions & 9 deletions
@@ -272,10 +272,11 @@ void run_mha_fwd(Flash_fwd_params &params, cudaStream_t stream) {
     if (params.is_bf16) {
         #ifndef FLASHATTENTION_DISABLE_HDIM64
         if (params.d <= 64) {
-            if (params.dv > 64 && Arch == 90) {
+            if (params.dv > 256 && Arch == 90) {
                 return run_mha_fwd_<Arch, cutlass::bfloat16_t, 64, 512, Split, PagedKVNonTMA, Has_softcap, PackGQA>(params, stream);
-            }
-            else {
+            } else if (params.dv > 64 && Arch == 90) {
+                return run_mha_fwd_<Arch, cutlass::bfloat16_t, 64, 256, Split, PagedKVNonTMA, Has_softcap, PackGQA>(params, stream);
+            } else {
                 return run_mha_fwd_<Arch, cutlass::bfloat16_t, 64, 64, Split, PagedKVNonTMA, Has_softcap, PackGQA>(params, stream);
             }
         }
@@ -302,10 +303,11 @@ void run_mha_fwd(Flash_fwd_params &params, cudaStream_t stream) {
         #ifndef FLASHATTENTION_DISABLE_FP16
         #ifndef FLASHATTENTION_DISABLE_HDIM64
         if (params.d <= 64) {
-            if (params.dv > 64 && Arch == 90) {
+            if (params.dv > 256 && Arch == 90) {
                 return run_mha_fwd_<Arch, cutlass::half_t, 64, 512, Split, PagedKVNonTMA, Has_softcap, PackGQA>(params, stream);
-            }
-            else {
+            } else if (params.dv > 64 && Arch == 90) {
+                return run_mha_fwd_<Arch, cutlass::half_t, 64, 256, Split, PagedKVNonTMA, Has_softcap, PackGQA>(params, stream);
+            } else {
                 return run_mha_fwd_<Arch, cutlass::half_t, 64, 64, Split, PagedKVNonTMA, Has_softcap, PackGQA>(params, stream);
             }
         }
@@ -490,6 +492,15 @@ inline int round_up_headdim(int head_size) {
     return 256;
 }

+inline int round_up_headdimv(int head_size) {
+    if (head_size <= 64) { return 64; }
+    if (head_size <= 96) { return 96; }
+    if (head_size <= 128) { return 128; }
+    if (head_size <= 192) { return 192; }
+    if (head_size <= 256) { return 256; }
+    return 512;
+}
+
 // Only applicable to the case where seqused_k (i.e. cache_seqlens) is available
 at::Tensor
 mha_fwd_get_scheduler_metadata(
@@ -534,7 +545,7 @@ mha_fwd_get_scheduler_metadata(
     params.d = headdim;
     params.dv = headdim_v;
     params.d_rounded = round_up_headdim(headdim);
-    params.dv_rounded = round_up_headdim(headdim_v);
+    params.dv_rounded = headdim_v == headdim ? params.d_rounded : round_up_headdimv(headdim_v);
     params.seqlen_knew = max_seqlen_k_new;

     bool const is_varlen_q = cu_seqlens_q_.has_value();
@@ -640,6 +651,7 @@ mha_fwd(at::Tensor &q, // (b, s_q, h, d) or (total_q, h, d) if there is cu_seq
         std::optional<const at::Tensor> &leftpad_k_, // b
         std::optional<const at::Tensor> &rotary_cos_, // seqlen_ro x (rotary_dim / 2)
         std::optional<const at::Tensor> &rotary_sin_, // seqlen_ro x (rotary_dim / 2)
+        std::optional<const at::Tensor> &seqlens_rotary_, // b
         std::optional<at::Tensor> &q_descale_, // (b, h_k), not (b, h)
         std::optional<at::Tensor> &k_descale_, // (b, h_k)
         std::optional<at::Tensor> &v_descale_, // (b, h_k)
@@ -823,7 +835,7 @@ mha_fwd(at::Tensor &q, // (b, s_q, h, d) or (total_q, h, d) if there is cu_seq

     auto round_multiple = [](int x, int m) { return (x + m - 1) / m * m; };
     int const head_size_rounded = round_up_headdim(head_size);
-    int const head_size_v_rounded = round_up_headdim(head_size_v);
+    int const head_size_v_rounded = head_size_v == head_size ? head_size_rounded : round_up_headdimv(head_size_v);
     int const seqlen_q_rounded = round_multiple(seqlen_q, 128);
     int const seqlen_k_rounded = round_multiple(seqlen_k, 128);

@@ -1001,6 +1013,13 @@ mha_fwd(at::Tensor &q, // (b, s_q, h, d) or (total_q, h, d) if there is cu_seq
         params.rotary_cos_ptr = rotary_cos.data_ptr();
         params.rotary_sin_ptr = rotary_sin.data_ptr();
         params.is_rotary_interleaved = is_rotary_interleaved;
+        if (seqlens_rotary_.has_value()) {
+            at::Tensor seqlens_rotary = seqlens_rotary_.value();
+            CHECK_DEVICE(seqlens_rotary); CHECK_CONTIGUOUS(seqlens_rotary);
+            TORCH_CHECK(seqlens_rotary.dtype() == torch::kInt32, "seqlens_rotary must have dtype torch.int32");
+            CHECK_SHAPE(seqlens_rotary, batch_size);
+            params.seqlens_rotary = seqlens_rotary.data_ptr<int>();
+        }
     } else {
         params.rotary_dim = 0;
     }
@@ -1104,7 +1123,11 @@ mha_fwd(at::Tensor &q, // (b, s_q, h, d) or (total_q, h, d) if there is cu_seq
             // params.b = 1;
             // params.seqlen_q = total_q;
             // }
+            // This will zero out the semaphore if needed
             run_mha_fwd_combine(params, stream, true /*enable_pdl*/);
+        } else if (scheduler_needs_semaphore && params.skip_scheduler_metadata_computation) {
+            // need to zero out the semaphore in this case
+            tile_count_semaphore.index({torch::indexing::Slice(0, 1)}).zero_();
         }
     } else if (total_q > 0 && num_heads_k > 0) {
         // If seqlen_k == 0, then we have an empty tensor. We need to set the output to 0.
@@ -1492,7 +1515,6 @@ mha_combine(const at::Tensor &out_partial, // num_splits x batch_size x
     const int seqlen = sizes[2];
     const int num_heads = sizes[3];
     const int head_size_og = sizes[4];
-    TORCH_CHECK(head_size_og <= 512, "FlashAttention combine only supports head dimension at most 512");
     TORCH_CHECK(num_splits <= 256, "FlashAttention combine only supports num_splits at most 256");

     CHECK_SHAPE(out_partial, num_splits, batch_size, seqlen, num_heads, head_size_og);
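
To make the new rounding rule concrete, here is a Python mirror (a sketch only, not the C++ API) of round_up_headdimv and the dv_rounded selection from the hunks above; for example, headdim = 64 with headdim_v = 512 now rounds dv to 512 rather than being capped at 256 by round_up_headdim.

    def round_up_headdimv(head_size: int) -> int:
        for bound in (64, 96, 128, 192, 256):
            if head_size <= bound:
                return bound
        return 512

    def dv_rounded(headdim: int, headdim_v: int, d_rounded: int) -> int:
        # dv reuses the already-rounded d when the head dims match.
        return d_rounded if headdim_v == headdim else round_up_headdimv(headdim_v)

    assert round_up_headdimv(512) == 512
    assert dv_rounded(64, 512, 64) == 512
    assert dv_rounded(128, 128, 128) == 128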
