     AttentionMetadata,
     AttentionType,
 )
-from vllm.attention.ops.chunked_prefill_paged_decode import chunked_prefill_paged_decode
-from vllm.attention.ops.paged_attn import PagedAttention
 from ibm_triton_lib.kernels import unified_attention
 from vllm.logger import init_logger
 from vllm.platforms import current_platform
-from vllm.v1.attention.backends.flash_attn import FlashAttentionMetadata
 from vllm.v1.attention.backends.utils import (
     AttentionMetadataBuilder,
     CommonAttentionMetadata,
@@ -72,6 +69,8 @@ class TritonAttentionMetadata:
 
     num_actual_tokens: int  # Number of tokens excluding padding.
     max_query_len: int
+    avg_query_len: int
+    avg_seq_len: int
     query_start_loc: torch.Tensor
     max_seq_len: int
     seq_lens: torch.Tensor
@@ -97,6 +96,8 @@ class LocalAttentionMetadata:
         local_block_table: torch.Tensor
         local_max_query_len: int
         local_max_seq_len: int
+        local_avg_query_len: int
+        local_avg_seq_len: int
         local_scheduler_metadata: Optional[torch.Tensor]
 
     local_attn_metadata: Optional[LocalAttentionMetadata] = None
@@ -139,6 +140,9 @@ def build(
         block_table = self.block_table
         block_table_tensor = block_table.get_device_tensor()[:num_reqs]
 
+        avg_seq_len = int(self.runner.seq_lens_np[:num_reqs].mean())
+        avg_query_len = int(self.runner.query_start_loc_np[num_reqs] / num_reqs)
+
         block_table.slot_mapping[:num_actual_tokens].copy_(
             block_table.slot_mapping_cpu[:num_actual_tokens], non_blocking=True
         )
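
A note on the two averages introduced above: seq_lens_np holds one sequence length per request, so a plain mean over the first num_reqs entries gives avg_seq_len, while query_start_loc_np is a cumulative-sum array whose entry at index num_reqs is the total number of scheduled query tokens, so dividing it by num_reqs yields the mean query length. A minimal sketch of that arithmetic with made-up values (not the real runner buffers):

import numpy as np

# Hypothetical batch of 3 requests.
num_reqs = 3
seq_lens_np = np.array([7, 12, 5])            # per-request KV sequence lengths
query_start_loc_np = np.array([0, 3, 5, 9])   # cumulative query-token offsets

avg_seq_len = int(seq_lens_np[:num_reqs].mean())              # (7 + 12 + 5) / 3 -> 8
avg_query_len = int(query_start_loc_np[num_reqs] / num_reqs)  # 9 tokens / 3 -> 3
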
@@ -170,14 +174,18 @@ def build(
                 self.runner.device, non_blocking=True
             )
             local_max_query_len = seqlens_q_local_np.max()
+            local_avg_query_len = int(seqlens_q_local_np[num_reqs] / num_reqs)
             local_max_seq_len = virt_k_seqlens_np.max()
+            local_avg_seq_len = int(virt_k_seqlens_np[num_reqs] / num_reqs)
 
             local_attn_metadata = TritonAttentionMetadata.LocalAttentionMetadata(
                 local_query_start_loc=local_query_start_loc,
                 local_seqused_k=local_seqused_k,
                 local_block_table=virt_block_table_tensor,
                 local_max_query_len=local_max_query_len,
                 local_max_seq_len=local_max_seq_len,
+                local_avg_query_len=local_avg_query_len,
+                local_avg_seq_len=local_avg_seq_len,
                 local_scheduler_metadata=None,
             )
 
@@ -213,6 +221,8 @@ def build(
             suffix_kv_lens=suffix_kv_lens,
             local_attn_metadata=local_attn_metadata,
             prefix_scheduler_metadata=prefix_scheduler_metadata,
+            avg_query_len=avg_query_len,
+            avg_seq_len=avg_seq_len,
         )
         return attn_metadata
 
@@ -227,10 +237,22 @@ class TritonAttentionBackend(AttentionBackend):
 
     accept_output_buffer: bool = True
 
-    @staticmethod
-    def get_supported_head_sizes() -> list[int]:
+    @classmethod
+    def get_supported_head_sizes(cls) -> list[int]:
         return [32, 64, 96, 128, 160, 192, 224, 256]
 
+    @classmethod
+    def validate_head_size(cls, head_size: int) -> None:
+        supported_head_sizes = cls.get_supported_head_sizes()
+        if head_size not in supported_head_sizes:
+            attn_type = cls.__name__.removesuffix("Backend")
+            raise ValueError(
+                f"Head size {head_size} is not supported by {attn_type}. "
+                f"Supported head sizes are: {supported_head_sizes}. "
+                "Set VLLM_ATTENTION_BACKEND=FLEX_ATTENTION to use "
+                "FlexAttention backend which supports all head sizes."
+            )
+
     @staticmethod
     def get_name() -> str:
         return "TRITON_ATTN_VLLM_V1"
@@ -304,12 +326,7 @@ def __init__(
 
         self.num_queries_per_kv = self.num_heads // self.num_kv_heads
 
-        support_head_sizes = TritonAttentionBackend.get_supported_head_sizes()
-        if head_size not in support_head_sizes:
-            raise ValueError(
-                f"Head size {head_size} is not supported by TritonAttention. "
-                f"Supported head sizes are: {support_head_sizes}."
-            )
+        TritonAttentionBackend.validate_head_size(head_size)
 
         if attn_type != AttentionType.DECODER:
             raise NotImplementedError(
@@ -331,7 +348,7 @@ def forward(
         key: torch.Tensor,
         value: torch.Tensor,
         kv_cache: torch.Tensor,
-        attn_metadata: FlashAttentionMetadata,
+        attn_metadata: TritonAttentionMetadata,
         output: Optional[torch.Tensor] = None,
         output_scale: Optional[torch.Tensor] = None,
     ) -> torch.Tensor:
@@ -369,41 +386,23 @@ def forward(
         # Whenever making a change in this method, please benchmark the
         # performance to make sure it does not introduce any overhead.
 
-        use_prefill_decode_attn = self.force_prefill_decode_attn
         num_actual_tokens = attn_metadata.num_actual_tokens
 
-        if use_prefill_decode_attn:
-            key_cache, value_cache = PagedAttention.split_kv_cache(
-                kv_cache, self.num_kv_heads, self.head_size
-            )
-        else:
-            key_cache, value_cache = kv_cache.unbind(0)
+        key_cache, value_cache = kv_cache.unbind(0)
 
         if self.kv_sharing_target_layer_name is None:
             # Reshape the input keys and values and store them in the cache.
             # Skip this if sharing KV cache with an earlier attention layer.
-            if use_prefill_decode_attn:
-                PagedAttention.write_to_paged_cache(
-                    key,
-                    value,
-                    key_cache,
-                    value_cache,
-                    attn_metadata.slot_mapping,
-                    self.kv_cache_dtype,
-                    layer._k_scale,
-                    layer._v_scale,
-                )
-            else:
-                torch.ops._C_cache_ops.reshape_and_cache_flash(
-                    key,
-                    value,
-                    key_cache,
-                    value_cache,
-                    attn_metadata.slot_mapping,
-                    self.kv_cache_dtype,
-                    layer._k_scale,
-                    layer._v_scale,
-                )
+            torch.ops._C_cache_ops.reshape_and_cache_flash(
+                key,
+                value,
+                key_cache,
+                value_cache,
+                attn_metadata.slot_mapping,
+                self.kv_cache_dtype,
+                layer._k_scale,
+                layer._v_scale,
+            )
 
         if self.kv_cache_dtype.startswith("fp8"):
             key_cache = key_cache.view(self.fp8_dtype)
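
For context on the consolidated cache-write path above: in vLLM v1 the KV cache handed to this backend is a single tensor whose leading dimension stacks the key and value caches, so unbind(0) simply splits it into two views, and reshape_and_cache_flash then scatters the new key/value tokens into those views at the block positions given by slot_mapping. A rough sketch of the layout this assumes (shapes are illustrative, not taken from this diff):

import torch

# Illustrative sizes; real values come from the cache config and model.
num_blocks, block_size, num_kv_heads, head_size = 4, 16, 2, 128

# Index 0 is the key cache, index 1 the value cache.
kv_cache = torch.zeros(2, num_blocks, block_size, num_kv_heads, head_size)
key_cache, value_cache = kv_cache.unbind(0)  # two views, no copy
assert key_cache.shape == (num_blocks, block_size, num_kv_heads, head_size)
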
@@ -433,56 +432,39 @@ def forward(
             max_seqlen_q = local_metadata.local_max_query_len
             max_seqlen_k = local_metadata.local_max_seq_len
             block_table = local_metadata.local_block_table
+            avg_seqlen_q = local_metadata.local_avg_query_len
+            avg_seqlen_k = local_metadata.local_avg_seq_len
         else:
             cu_seqlens_q = attn_metadata.query_start_loc
             seqused_k = attn_metadata.seq_lens
             max_seqlen_q = attn_metadata.max_query_len
             max_seqlen_k = attn_metadata.max_seq_len
             block_table = attn_metadata.block_table
-
-        if use_prefill_decode_attn:
-            # Compute attention and update output up to `num_actual_tokens`.
-            chunked_prefill_paged_decode(
-                query=query[:num_actual_tokens],
-                key=key[:num_actual_tokens],
-                value=value[:num_actual_tokens],
-                output=output[:num_actual_tokens],
-                kv_cache_dtype=self.kv_cache_dtype,
-                key_cache=key_cache,
-                value_cache=value_cache,
-                block_table=block_table,
-                query_start_loc=cu_seqlens_q,
-                seq_lens=seqused_k,
-                max_seq_len=max_seqlen_k,
-                max_query_len=max_seqlen_q,
-                k_scale=layer._k_scale,
-                v_scale=layer._v_scale,
-                alibi_slopes=self.alibi_slopes,
-                sliding_window=self.sliding_window[0],
-                sm_scale=self.scale,
-            )
-
-        else:
-            descale_shape = (cu_seqlens_q.shape[0] - 1, key.shape[1])
-
-            unified_attention(
-                q=query[:num_actual_tokens],
-                k=key_cache,
-                v=value_cache,
-                out=output[:num_actual_tokens],
-                cu_seqlens_q=cu_seqlens_q,
-                max_seqlen_q=max_seqlen_q,
-                seqused_k=seqused_k,
-                max_seqlen_k=max_seqlen_k,
-                softmax_scale=self.scale,
-                causal=True,
-                alibi_slopes=self.alibi_slopes,
-                window_size=self.sliding_window,
-                block_table=block_table,
-                softcap=self.logits_soft_cap,
-                q_descale=None,  # Not supported
-                k_descale=layer._k_scale.expand(descale_shape),
-                v_descale=layer._v_scale.expand(descale_shape),
-            )
+            avg_seqlen_q = attn_metadata.avg_query_len
+            avg_seqlen_k = attn_metadata.avg_seq_len
+
+        descale_shape = (cu_seqlens_q.shape[0] - 1, key.shape[1])
+
+        unified_attention(
+            q=query[:num_actual_tokens],
+            k=key_cache,
+            v=value_cache,
+            out=output[:num_actual_tokens],
+            cu_seqlens_q=cu_seqlens_q,
+            max_seqlen_q=max_seqlen_q,
+            seqused_k=seqused_k,
+            max_seqlen_k=max_seqlen_k,
+            avg_seqlen_q=avg_seqlen_q,
+            avg_seqlen_k=avg_seqlen_k,
+            softmax_scale=self.scale,
+            causal=True,
+            alibi_slopes=self.alibi_slopes,
+            window_size=self.sliding_window,
+            block_table=block_table,
+            softcap=self.logits_soft_cap,
+            q_descale=None,  # Not supported
+            k_descale=layer._k_scale.expand(descale_shape),
+            v_descale=layer._v_scale.expand(descale_shape),
+        )
 
         return output
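
One detail of the rewritten call worth spelling out: descale_shape works out to (num_reqs, num_kv_heads), since cu_seqlens_q has num_reqs + 1 entries and key is laid out as (num_tokens, num_kv_heads, head_size); the per-tensor k/v scales are then broadcast to that shape with expand, which returns a view rather than copying. A small sketch with illustrative sizes:

import torch

num_reqs, num_kv_heads, head_size = 3, 2, 128
cu_seqlens_q = torch.tensor([0, 3, 5, 9])            # num_reqs + 1 entries
key = torch.zeros(9, num_kv_heads, head_size)        # (num_tokens, num_kv_heads, head_size)

descale_shape = (cu_seqlens_q.shape[0] - 1, key.shape[1])  # -> (3, 2)
k_scale = torch.tensor(1.0)                          # stand-in for layer._k_scale
k_descale = k_scale.expand(descale_shape)            # broadcast view, no copy
assert k_descale.shape == (num_reqs, num_kv_heads)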