Dev attention[SiliconFlow] #236

Open · wants to merge 12 commits into master
48 changes: 48 additions & 0 deletions benchmark/test_attention_perf.py
@@ -0,0 +1,48 @@
from typing import Generator

import torch

from .performance_utils import Benchmark


class AttentionBenchmark(Benchmark):
    """
    benchmark for attention
    """

    def __init__(self, *args, input_fn, **kwargs):
        super().__init__(*args, **kwargs)
        self.input_fn = input_fn

    def get_input_iter(self, cur_dtype) -> Generator:
        for seq_len in [1024, 2048, 3072, 4096]:
            yield from self.input_fn(cur_dtype, seq_len)


def test_perf_scaled_dot_product_attention():
    def scaled_dot_product_attention_kwargs(dtype, seq_len):
        num_heads = 8
        head_size = 128
        batch = 4

        query = torch.randn(
            (batch, num_heads, seq_len, head_size), device="cuda", dtype=dtype
        )
        key = torch.randn(
            (batch, num_heads, seq_len, head_size), device="cuda", dtype=dtype
        )
        value = torch.randn(
            (batch, num_heads, seq_len, head_size), device="cuda", dtype=dtype
        )
        yield query, key, value, None, 0.0, True

    bench = AttentionBenchmark(
        op_name="scaled_dot_product_attention",
        input_fn=scaled_dot_product_attention_kwargs,
        torch_op=torch.nn.functional.scaled_dot_product_attention,
        dtypes=[
            # torch.float32,
            torch.float16,
        ],
    )
    bench.run()
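
Note (not part of the diff): the tuple yielded by scaled_dot_product_attention_kwargs maps positionally onto torch.nn.functional.scaled_dot_product_attention. A minimal sketch of the equivalent direct call, with illustrative shapes matching the benchmark, would be:

import torch
import torch.nn.functional as F

# Shapes mirror the benchmark inputs: (batch, num_heads, seq_len, head_size).
query = torch.randn(4, 8, 1024, 128, device="cuda", dtype=torch.float16)
key = torch.randn(4, 8, 1024, 128, device="cuda", dtype=torch.float16)
value = torch.randn(4, 8, 1024, 128, device="cuda", dtype=torch.float16)

# The yielded tuple (query, key, value, None, 0.0, True) corresponds to
# attn_mask=None, dropout_p=0.0, is_causal=True.
out = F.scaled_dot_product_attention(
    query, key, value, attn_mask=None, dropout_p=0.0, is_causal=True
)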
1 change: 1 addition & 0 deletions src/flag_gems/__init__.py
@@ -159,6 +159,7 @@ def enable(lib=aten_lib):
lib.impl("repeat_interleave.self_int", repeat_interleave_self_int, "CUDA")
lib.impl("vstack", vstack, "CUDA")
lib.impl("repeat_interleave.Tensor", repeat_interleave_tensor, "CUDA")
lib.impl("scaled_dot_product_attention", scaled_dot_product_attention, "CUDA")
lib.impl("repeat_interleave.self_Tensor", repeat_interleave_self_tensor, "CUDA")


2 changes: 2 additions & 0 deletions src/flag_gems/ops/__init__.py
@@ -6,6 +6,7 @@
from .any import any, any_dim, any_dims
from .arange import arange, arange_start
from .argmax import argmax
from .attention import scaled_dot_product_attention
from .bitwise_and import (
bitwise_and_scalar,
bitwise_and_scalar_tensor,
@@ -247,5 +248,6 @@
"repeat_interleave_self_int",
"vstack",
"repeat_interleave_tensor",
"scaled_dot_product_attention",
"repeat_interleave_self_tensor",
]
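
Note (not part of the diff): with the export and registration above in place, the intended usage is to enable flag_gems and then call the stock PyTorch API. A brief sketch, assuming the enable() entry point shown in src/flag_gems/__init__.py:

import torch
import flag_gems

flag_gems.enable()  # installs the CUDA overrides, including scaled_dot_product_attention

q = torch.randn(4, 8, 1024, 128, device="cuda", dtype=torch.float16)
out = torch.nn.functional.scaled_dot_product_attention(
    q, torch.randn_like(q), torch.randn_like(q), is_causal=True
)
# `out` is now produced by the flag_gems attention implementation.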