From a989ce08420f86344a9ee3be41bd8ab03447be6e Mon Sep 17 00:00:00 2001 From: vllmellm Date: Fri, 16 May 2025 10:19:58 +0000 Subject: [PATCH 01/10] use single AITER fmoe module Signed-off-by: vllmellm --- .../layers/fused_moe/rocm_aiter_fused_moe.py | 332 +++++------------- 1 file changed, 82 insertions(+), 250 deletions(-) diff --git a/vllm/model_executor/layers/fused_moe/rocm_aiter_fused_moe.py b/vllm/model_executor/layers/fused_moe/rocm_aiter_fused_moe.py index a92081862bfa..31964b709e8a 100644 --- a/vllm/model_executor/layers/fused_moe/rocm_aiter_fused_moe.py +++ b/vllm/model_executor/layers/fused_moe/rocm_aiter_fused_moe.py @@ -1,6 +1,7 @@ # SPDX-License-Identifier: Apache-2.0 from functools import cache from typing import Optional +from enum import IntEnum import torch @@ -8,6 +9,12 @@ from vllm.platforms import current_platform from vllm.utils import direct_register_custom_op +class QuantMethod(IntEnum): + NO = 0 # a16w16 + PER_TENSOR = 1 # w8a8 (pre_Tensor) + PER_TOKEN = 2 # w8a8/w8a4 (per_Token) + BLOCK_1X128 = 3 # block quantized w8a8 (per_1x128) + BLOCK_128x128 = 4 # block quantized w8a8 (per_128x128) @cache def is_rocm_aiter_moe_enabled() -> bool: @@ -68,163 +75,6 @@ def rocm_aiter_asm_moe_tkw1_fake( activation_str: str = "silu") -> torch.Tensor: return torch.empty_like(hidden_states) - -def rocm_aiter_fmoe_fp8_blockscale_g1u1_impl( - topk_ids: torch.Tensor, - topk_weights: torch.Tensor, - hidden_states_dtype: torch.dtype, - expert_mask: torch.Tensor, - a1: torch.Tensor, - w1: torch.Tensor, - w2: torch.Tensor, - w1_scale: torch.Tensor, - w2_scale: torch.Tensor, - a1_scale: torch.Tensor, - block_shape: list[int], - smooth_scale: Optional[torch.Tensor] = None) -> torch.Tensor: - from aiter import fmoe_fp8_blockscale_g1u1 - from aiter.fused_moe_bf16_asm import moe_sorting_ck - - topk = topk_ids.shape[1] - model_dim = w1.shape[-1] - local_E = E = w1.shape[0] - if expert_mask is not None: - E = expert_mask.numel() - - ( - sorted_token_ids, - sorted_weight_buf, - sorted_expert_ids, - num_valid_ids, - out_asm, - ) = moe_sorting_ck(topk_ids, - topk_weights, - E, - model_dim, - hidden_states_dtype, - expert_mask=expert_mask) - - fmoe_fp8_blockscale_g1u1(out_asm, a1, w1, w2, sorted_token_ids, - sorted_weight_buf, sorted_expert_ids, - num_valid_ids, topk, - a1_scale.t().contiguous(), - w1_scale.view(local_E, -1), - w2_scale.view(local_E, - -1), *block_shape, smooth_scale) - - return out_asm - - -def rocm_aiter_fmoe_fp8_blockscale_g1u1_fake( - topk_ids: torch.Tensor, - topk_weights: torch.Tensor, - hidden_states_dtype: torch.dtype, - expert_mask: torch.Tensor, - a1: torch.Tensor, - w1: torch.Tensor, - w2: torch.Tensor, - w1_scale: torch.Tensor, - w2_scale: torch.Tensor, - a1_scale: torch.Tensor, - block_shape: list[int], - smooth_scale: Optional[torch.Tensor] = None) -> torch.Tensor: - - return torch.empty_like(a1, dtype=hidden_states_dtype) - - -def rocm_aiter_asm_moe_impl(hidden_states: torch.Tensor, - w1: torch.Tensor, - w2: torch.Tensor, - topk_weights: torch.Tensor, - topk_ids: torch.Tensor, - fc1_scale: Optional[torch.Tensor] = None, - fc2_scale: Optional[torch.Tensor] = None, - fc1_smooth_scale: Optional[torch.Tensor] = None, - fc2_smooth_scale: Optional[torch.Tensor] = None, - a16: bool = False, - activation: str = "silu") -> torch.Tensor: - import aiter.fused_moe_bf16_asm as rocm_aiter_asm_fmoe - from aiter import ActivationType - - assert activation in ["silu", "gelu"], "The given activation:" \ - f" {activation}" \ - " is not supported in" \ - " AITER." 
- if activation == "silu": - aiter_activation = ActivationType.Silu - else: - aiter_activation = ActivationType.Gelu - - return rocm_aiter_asm_fmoe.asm_moe(hidden_states=hidden_states, - w1=w1, - w2=w2, - topk_weight=topk_weights, - topk_ids=topk_ids, - fc1_scale=fc1_scale, - fc2_scale=fc2_scale, - fc1_smooth_scale=fc1_smooth_scale, - fc2_smooth_scale=fc2_smooth_scale, - a16=a16, - activation=aiter_activation) - - -def rocm_aiter_asm_moe_fake(hidden_states: torch.Tensor, - w1: torch.Tensor, - w2: torch.Tensor, - topk_weights: torch.Tensor, - topk_ids: torch.Tensor, - fc1_scale: Optional[torch.Tensor] = None, - fc2_scale: Optional[torch.Tensor] = None, - fc1_smooth_scale: Optional[torch.Tensor] = None, - fc2_smooth_scale: Optional[torch.Tensor] = None, - a16: bool = False, - activation: str = "silu") -> torch.Tensor: - return torch.empty_like(hidden_states) - - -def rocm_aiter_ck_moe_2stages_impl( - hidden_states: torch.Tensor, - w1: torch.Tensor, - w2: torch.Tensor, - topk_weights: torch.Tensor, - topk_ids: torch.Tensor, - fc1_scale: Optional[torch.Tensor] = None, - fc2_scale: Optional[torch.Tensor] = None, - a1_scale: Optional[torch.Tensor] = None, - a2_scale: Optional[torch.Tensor] = None, - block_size: Optional[list[int]] = None, - expert_mask: Optional[torch.Tensor] = None, -) -> torch.Tensor: - from aiter.fused_moe_bf16_asm import ck_moe_2stages - return ck_moe_2stages(a1=hidden_states, - w1=w1, - w2=w2, - topk_weight=topk_weights, - topk_ids=topk_ids, - fc1_scale=fc1_scale, - fc2_scale=fc2_scale, - a1_scale=a1_scale, - a2_scale=a2_scale, - block_size=block_size, - expert_mask=expert_mask) - - -def rocm_aiter_ck_moe_2stages_fake( - hidden_states: torch.Tensor, - w1: torch.Tensor, - w2: torch.Tensor, - topk_weights: torch.Tensor, - topk_ids: torch.Tensor, - fc1_scale: Optional[torch.Tensor] = None, - fc2_scale: Optional[torch.Tensor] = None, - a1_scale: Optional[torch.Tensor] = None, - a2_scale: Optional[torch.Tensor] = None, - block_size: Optional[list[int]] = None, - expert_mask: Optional[torch.Tensor] = None, -) -> torch.Tensor: - return torch.empty_like(hidden_states) - - def rocm_aiter_topk_softmax_impl(topk_weights: torch.Tensor, topk_indices: torch.Tensor, token_expert_indices: torch.Tensor, @@ -274,6 +124,49 @@ def rocm_aiter_biased_grouped_topk_fake( pass +def rocm_aiter_fused_moe_impl( + hidden_states: torch.Tensor, + w1: torch.Tensor, + w2:torch.Tensor, + topk_weight:torch.Tensor, + topk_ids:torch.Tensor, + expert_mask: Optional[torch.Tensor]=None, # EP + activation_str: str="silu", + quant_method: int = QuantMethod.NO.value, + doweight_stage1: bool=False, + w1_scale: Optional[torch.Tensor]=None, + w2_scale: Optional[torch.Tensor]=None, + a1_scale: Optional[torch.Tensor]=None, + a2_scale: Optional[torch.Tensor]=None, +) -> torch.Tensor: + from aiter.fused_moe import fused_moe + from aiter import ActivationType + from aiter import QuantType + + activation = \ + ActivationType.Gelu if activation_str == "gelu" else ActivationType.Silu + quant_type = QuantType(quant_method) + + return fused_moe(hidden_states, w1, w2, topk_weight, topk_ids, expert_mask, activation, quant_type, doweight_stage1, w1_scale, w2_scale, a1_scale, a2_scale) + +def rocm_aiter_fused_moe_fake( + hidden_states: torch.Tensor, + w1: torch.Tensor, + w2:torch.Tensor, + topk_weight:torch.Tensor, + topk_ids:torch.Tensor, + expert_mask: Optional[torch.Tensor]=None, + activation_str: str="silu", + quant_method: int=QuantMethod.NO.value, + doweight_stage1: bool=False, + w1_scale: Optional[torch.Tensor]=None, + 
w2_scale: Optional[torch.Tensor]=None, + a1_scale: Optional[torch.Tensor]=None, + a2_scale: Optional[torch.Tensor]=None, +) -> torch.Tensor: + return torch.empty_like(hidden_states) + + if current_platform.is_rocm(): direct_register_custom_op( @@ -285,26 +178,10 @@ def rocm_aiter_biased_grouped_topk_fake( ) direct_register_custom_op( - op_name="rocm_aiter_fmoe_fp8_blockscale_g1u1", - op_func=rocm_aiter_fmoe_fp8_blockscale_g1u1_impl, + op_name="rocm_aiter_fused_moe", + op_func=rocm_aiter_fused_moe_impl, mutates_args=[], - fake_impl=rocm_aiter_fmoe_fp8_blockscale_g1u1_fake, - dispatch_key=current_platform.dispatch_key, - ) - - direct_register_custom_op( - op_name="rocm_aiter_asm_moe", - op_func=rocm_aiter_asm_moe_impl, - mutates_args=[], - fake_impl=rocm_aiter_asm_moe_fake, - dispatch_key=current_platform.dispatch_key, - ) - - direct_register_custom_op( - op_name="rocm_aiter_ck_moe_2stages", - op_func=rocm_aiter_ck_moe_2stages_impl, - mutates_args=[], - fake_impl=rocm_aiter_ck_moe_2stages_fake, + fake_impl=rocm_aiter_fused_moe_fake, dispatch_key=current_platform.dispatch_key, ) @@ -324,7 +201,6 @@ def rocm_aiter_biased_grouped_topk_fake( dispatch_key=current_platform.dispatch_key, ) - def rocm_aiter_biased_group_topk( hidden_states: torch.Tensor, gating_output: torch.Tensor, @@ -373,32 +249,12 @@ def rocm_aiter_fused_experts( a2_scale: Optional[torch.Tensor] = None, block_shape: Optional[list[int]] = None) -> torch.Tensor: - from vllm.model_executor.layers.quantization.utils.fp8_utils import ( - per_token_group_quant_fp8) - # All AITER Fused MoE kernels are expecting the following datatypes topk_weights = topk_weights.to(torch.float32) topk_ids = topk_ids.to(torch.int32) - # w8a8 block-scaled - if block_shape is not None and use_fp8_w8a8: - assert not apply_router_weight_on_input, ( - "apply_router_weight_on_input is not supported for block scaled moe" - ) - assert w1_scale is not None - assert w2_scale is not None - - # The default block sizes are 128 in AITER. - block_shape = [128, 128] if block_shape is None else block_shape - - a1, a1_scale = per_token_group_quant_fp8(hidden_states, block_shape[1]) - - return torch.ops.vllm.rocm_aiter_fmoe_fp8_blockscale_g1u1( - topk_ids, topk_weights, hidden_states.dtype, None, a1, w1, w2, - w1_scale, w2_scale, a1_scale, block_shape, None) - # w8a8 per-channel quantization - elif per_channel_quant and apply_router_weight_on_input and use_fp8_w8a8: + if per_channel_quant and apply_router_weight_on_input and use_fp8_w8a8: # AITER tkw1 kernel for FP8 models with `apply_router_weight_on_input` # This applies topk_weights on the GEMM output of the first FC layer # rather than the second FC. 
@@ -423,58 +279,34 @@ def rocm_aiter_fused_experts( expert_mask=None, activation_str=activation) - # w8a8 per-tensor activation per-tensor weight - elif use_fp8_w8a8: - assert not apply_router_weight_on_input, ( - "apply_router_weight_on_input is not supported for fp8_w8a8") - - # - faster static per-tensor-activation static per-tensor-weight - # fp8 quantization w8a8 - if a1_scale is not None and a2_scale is not None: - return torch.ops.vllm.rocm_aiter_ck_moe_2stages( - hidden_states=hidden_states, - w1=w1, - w2=w2, - topk_weights=topk_weights, - topk_ids=topk_ids, - fc1_scale=w1_scale, - fc2_scale=w2_scale, - a1_scale=a1_scale, - a2_scale=a2_scale) - - # - fallback static per-tensor-activation static per-tensor-weight - # fp8 quantization w8a8 - # - dynamic per-tensor activation static per-tensor-weight - # fp8 quantization w8a8 - return torch.ops.vllm.rocm_aiter_asm_moe(hidden_states=hidden_states, - w1=w1, - w2=w2, - topk_weights=topk_weights, - topk_ids=topk_ids, - fc1_scale=w1_scale, - fc2_scale=w2_scale, - fc1_smooth_scale=None, - fc2_smooth_scale=None, - a16=False, - activation=activation) - if apply_router_weight_on_input: - assert (topk_weights.dim() == 2 - ), "`topk_weights` should be in shape (num_tokens, topk)" - _, topk = topk_weights.shape - assert ( - topk == 1 - ), "Only support topk=1 when `apply_router_weight_on_input` is True" - - hidden_states = hidden_states * topk_weights.to(hidden_states.dtype) - topk_ids = topk_ids.to(torch.int32) - topk_weights = torch.ones_like(topk_weights, dtype=torch.float32) - - return torch.ops.vllm.rocm_aiter_ck_moe_2stages( - hidden_states=hidden_states, - w1=w1, - w2=w2, - topk_weights=topk_weights, - topk_ids=topk_ids) + else: + quant_method = QuantMethod.NO.value + + # w8a8 block-scaled + if block_shape is not None and use_fp8_w8a8: + assert not apply_router_weight_on_input, ( + "apply_router_weight_on_input is not supported for block scaled moe" + ) + assert w1_scale is not None + assert w2_scale is not None + quant_method = QuantMethod.BLOCK_128x128.value + elif use_fp8_w8a8: + quant_method = QuantMethod.PER_TOKEN.value + + if apply_router_weight_on_input: + assert (topk_weights.dim() == 2 + ), "`topk_weights` should be in shape (num_tokens, topk)" + _, topk = topk_weights.shape + assert ( + topk == 1 + ), "Only support topk=1 when `apply_router_weight_on_input` is True" + + hidden_states = hidden_states * topk_weights.to(hidden_states.dtype) + topk_ids = topk_ids.to(torch.int32) + topk_weights = torch.ones_like(topk_weights, dtype=torch.float32) + + return torch.ops.vllm.rocm_aiter_fused_moe(hidden_states, w1, w2, topk_weights, topk_ids, quant_method=quant_method, activation_str=activation, w1_scale=w1_scale, w2_scale=w2_scale, a1_scale=a1_scale, a2_scale=a2_scale) + def rocm_aiter_topk_softmax(topk_weights: torch.Tensor, From da0622c4f07622eb8a37de4f631278866aca4335 Mon Sep 17 00:00:00 2001 From: vllmellm Date: Fri, 16 May 2025 11:15:33 +0000 Subject: [PATCH 02/10] bugfixes: remove the weight scale expension and set correct layout size Signed-off-by: vllmellm --- vllm/model_executor/layers/fused_moe/layer.py | 3 +-- .../layers/fused_moe/rocm_aiter_fused_moe.py | 2 ++ vllm/model_executor/layers/quantization/fp8.py | 13 +------------ 3 files changed, 4 insertions(+), 14 deletions(-) diff --git a/vllm/model_executor/layers/fused_moe/layer.py b/vllm/model_executor/layers/fused_moe/layer.py index f1cb77f64eae..c90c0e03fc19 100644 --- a/vllm/model_executor/layers/fused_moe/layer.py +++ b/vllm/model_executor/layers/fused_moe/layer.py @@ 
-376,10 +376,9 @@ def process_weights_after_loading(self, layer: torch.nn.Module) -> None: shuffle_weights) if self.rocm_aiter_moe_enabled: - # use 2stage ck moe layout shuffled_w13, shuffled_w2 = shuffle_weights(layer.w13_weight.data, layer.w2_weight.data, - layout=(32, 32)) + layout=(16, 16)) layer.w13_weight.data = shuffled_w13 layer.w2_weight.data = shuffled_w2 diff --git a/vllm/model_executor/layers/fused_moe/rocm_aiter_fused_moe.py b/vllm/model_executor/layers/fused_moe/rocm_aiter_fused_moe.py index 31964b709e8a..034b65e26843 100644 --- a/vllm/model_executor/layers/fused_moe/rocm_aiter_fused_moe.py +++ b/vllm/model_executor/layers/fused_moe/rocm_aiter_fused_moe.py @@ -292,6 +292,8 @@ def rocm_aiter_fused_experts( quant_method = QuantMethod.BLOCK_128x128.value elif use_fp8_w8a8: quant_method = QuantMethod.PER_TOKEN.value + if a1_scale is not None and a2_scale is not None: + quant_method = QuantMethod.PER_TENSOR.value if apply_router_weight_on_input: assert (topk_weights.dim() == 2 diff --git a/vllm/model_executor/layers/quantization/fp8.py b/vllm/model_executor/layers/quantization/fp8.py index f4cdc3db1a0d..e5fa08fd222c 100644 --- a/vllm/model_executor/layers/quantization/fp8.py +++ b/vllm/model_executor/layers/quantization/fp8.py @@ -762,20 +762,9 @@ def process_weights_after_loading(self, layer: Module) -> None: start += shard_size if self.rocm_aiter_moe_enabled: - # reshaping weights is required for aiter moe kernel. - expansion_dims = [ - layer.w13_weight.shape[1], layer.w2_weight.shape[1] - ] - max_w13_scales, w2_scales = expand_weights( - max_w13_scales, - layer.w2_weight_scale.data, - expansion_dims=expansion_dims) - layer.w2_weight_scale = torch.nn.Parameter( - w2_scales.contiguous(), requires_grad=False) - shuffled_w13, shuffled_w2 = shuffle_weights(layer.w13_weight, layer.w2_weight, - layout=(32, 32)) + layout=(16, 16)) layer.w13_weight = torch.nn.Parameter(shuffled_w13, requires_grad=False) From 350bf88eb0a7535bb897ece65bd6eda55d59a9c7 Mon Sep 17 00:00:00 2001 From: vllmellm Date: Fri, 16 May 2025 11:53:35 +0000 Subject: [PATCH 03/10] clean code in aiter fmoe module Signed-off-by: vllmellm --- .../layers/fused_moe/rocm_aiter_fused_moe.py | 114 +++++++++++------- 1 file changed, 70 insertions(+), 44 deletions(-) diff --git a/vllm/model_executor/layers/fused_moe/rocm_aiter_fused_moe.py b/vllm/model_executor/layers/fused_moe/rocm_aiter_fused_moe.py index 034b65e26843..0e22b8111bed 100644 --- a/vllm/model_executor/layers/fused_moe/rocm_aiter_fused_moe.py +++ b/vllm/model_executor/layers/fused_moe/rocm_aiter_fused_moe.py @@ -1,7 +1,7 @@ # SPDX-License-Identifier: Apache-2.0 +from enum import IntEnum from functools import cache from typing import Optional -from enum import IntEnum import torch @@ -9,12 +9,23 @@ from vllm.platforms import current_platform from vllm.utils import direct_register_custom_op + class QuantMethod(IntEnum): - NO = 0 # a16w16 - PER_TENSOR = 1 # w8a8 (pre_Tensor) + # This allows interfacing with AITER QuantType Enum + # without importing the QuantType from AITER globally + NO = 0 # a16w16 + PER_TENSOR = 1 # w8a8 (pre_Tensor) PER_TOKEN = 2 # w8a8/w8a4 (per_Token) - BLOCK_1X128 = 3 # block quantized w8a8 (per_1x128) - BLOCK_128x128 = 4 # block quantized w8a8 (per_128x128) + BLOCK_1X128 = 3 # block quantized w8a8 (per_1x128) + BLOCK_128x128 = 4 # block quantized w8a8 (per_128x128) + + +class ActivationMethod(IntEnum): + # This allows interfacing with AITER ActivationType enum + # without importing the ActivationType enum from AITER globally + SILU = 0 + GELU = 
1 + @cache def is_rocm_aiter_moe_enabled() -> bool: @@ -36,13 +47,12 @@ def rocm_aiter_asm_moe_tkw1_impl( a16: bool = False, per_tensor_quant_scale: Optional[torch.Tensor] = None, expert_mask: Optional[torch.Tensor] = None, - activation_str: str = "silu") -> torch.Tensor: + activation_method: int = ActivationMethod.SILU.value) -> torch.Tensor: from aiter import ActivationType from aiter.fused_moe_bf16_asm import asm_moe_tkw1 - activation = \ - ActivationType.Gelu if activation_str == "gelu" else ActivationType.Silu + activation = ActivationType(activation_method) return asm_moe_tkw1(hidden_states, w1, @@ -72,9 +82,10 @@ def rocm_aiter_asm_moe_tkw1_fake( a16: bool = False, per_tensor_quant_scale: Optional[torch.Tensor] = None, expert_mask: Optional[torch.Tensor] = None, - activation_str: str = "silu") -> torch.Tensor: + activation_method: int = ActivationMethod.SILU.value) -> torch.Tensor: return torch.empty_like(hidden_states) + def rocm_aiter_topk_softmax_impl(topk_weights: torch.Tensor, topk_indices: torch.Tensor, token_expert_indices: torch.Tensor, @@ -127,42 +138,43 @@ def rocm_aiter_biased_grouped_topk_fake( def rocm_aiter_fused_moe_impl( hidden_states: torch.Tensor, w1: torch.Tensor, - w2:torch.Tensor, - topk_weight:torch.Tensor, - topk_ids:torch.Tensor, - expert_mask: Optional[torch.Tensor]=None, # EP - activation_str: str="silu", + w2: torch.Tensor, + topk_weight: torch.Tensor, + topk_ids: torch.Tensor, + expert_mask: Optional[torch.Tensor] = None, + activation_method: int = ActivationMethod.SILU.value, quant_method: int = QuantMethod.NO.value, - doweight_stage1: bool=False, - w1_scale: Optional[torch.Tensor]=None, - w2_scale: Optional[torch.Tensor]=None, - a1_scale: Optional[torch.Tensor]=None, - a2_scale: Optional[torch.Tensor]=None, + doweight_stage1: bool = False, + w1_scale: Optional[torch.Tensor] = None, + w2_scale: Optional[torch.Tensor] = None, + a1_scale: Optional[torch.Tensor] = None, + a2_scale: Optional[torch.Tensor] = None, ) -> torch.Tensor: + from aiter import ActivationType, QuantType from aiter.fused_moe import fused_moe - from aiter import ActivationType - from aiter import QuantType - activation = \ - ActivationType.Gelu if activation_str == "gelu" else ActivationType.Silu + activation = ActivationType(activation_method) quant_type = QuantType(quant_method) - return fused_moe(hidden_states, w1, w2, topk_weight, topk_ids, expert_mask, activation, quant_type, doweight_stage1, w1_scale, w2_scale, a1_scale, a2_scale) - + return fused_moe(hidden_states, w1, w2, topk_weight, topk_ids, expert_mask, + activation, quant_type, doweight_stage1, w1_scale, + w2_scale, a1_scale, a2_scale) + + def rocm_aiter_fused_moe_fake( hidden_states: torch.Tensor, w1: torch.Tensor, - w2:torch.Tensor, - topk_weight:torch.Tensor, - topk_ids:torch.Tensor, - expert_mask: Optional[torch.Tensor]=None, - activation_str: str="silu", - quant_method: int=QuantMethod.NO.value, - doweight_stage1: bool=False, - w1_scale: Optional[torch.Tensor]=None, - w2_scale: Optional[torch.Tensor]=None, - a1_scale: Optional[torch.Tensor]=None, - a2_scale: Optional[torch.Tensor]=None, + w2: torch.Tensor, + topk_weight: torch.Tensor, + topk_ids: torch.Tensor, + expert_mask: Optional[torch.Tensor] = None, + activation_method: int = ActivationMethod.SILU.value, + quant_method: int = QuantMethod.NO.value, + doweight_stage1: bool = False, + w1_scale: Optional[torch.Tensor] = None, + w2_scale: Optional[torch.Tensor] = None, + a1_scale: Optional[torch.Tensor] = None, + a2_scale: Optional[torch.Tensor] = None, ) -> 
torch.Tensor: return torch.empty_like(hidden_states) @@ -201,6 +213,7 @@ def rocm_aiter_fused_moe_fake( dispatch_key=current_platform.dispatch_key, ) + def rocm_aiter_biased_group_topk( hidden_states: torch.Tensor, gating_output: torch.Tensor, @@ -249,6 +262,8 @@ def rocm_aiter_fused_experts( a2_scale: Optional[torch.Tensor] = None, block_shape: Optional[list[int]] = None) -> torch.Tensor: + activation_method = (ActivationMethod.SILU + if activation == "silu" else ActivationMethod.GELU) # All AITER Fused MoE kernels are expecting the following datatypes topk_weights = topk_weights.to(torch.float32) topk_ids = topk_ids.to(torch.int32) @@ -277,16 +292,16 @@ def rocm_aiter_fused_experts( a16=False, per_tensor_quant_scale=None, expert_mask=None, - activation_str=activation) + activation_method=activation_method) else: quant_method = QuantMethod.NO.value - + # w8a8 block-scaled if block_shape is not None and use_fp8_w8a8: assert not apply_router_weight_on_input, ( - "apply_router_weight_on_input is not supported for block scaled moe" - ) + "apply_router_weight_on_input is\ + not supported for block scaled moe") assert w1_scale is not None assert w2_scale is not None quant_method = QuantMethod.BLOCK_128x128.value @@ -303,12 +318,23 @@ def rocm_aiter_fused_experts( topk == 1 ), "Only support topk=1 when `apply_router_weight_on_input` is True" - hidden_states = hidden_states * topk_weights.to(hidden_states.dtype) + hidden_states = hidden_states * topk_weights.to( + hidden_states.dtype) topk_ids = topk_ids.to(torch.int32) topk_weights = torch.ones_like(topk_weights, dtype=torch.float32) - return torch.ops.vllm.rocm_aiter_fused_moe(hidden_states, w1, w2, topk_weights, topk_ids, quant_method=quant_method, activation_str=activation, w1_scale=w1_scale, w2_scale=w2_scale, a1_scale=a1_scale, a2_scale=a2_scale) - + return torch.ops.vllm.rocm_aiter_fused_moe( + hidden_states, + w1, + w2, + topk_weights, + topk_ids, + quant_method=quant_method, + activation_method=activation_method, + w1_scale=w1_scale, + w2_scale=w2_scale, + a1_scale=a1_scale, + a2_scale=a2_scale) def rocm_aiter_topk_softmax(topk_weights: torch.Tensor, @@ -358,4 +384,4 @@ def expand_weights(*tensors: torch.Tensor, return tuple( tensor.unsqueeze(-1).unsqueeze(-1).expand((-1, dim, -1)) - for tensor, dim in zip(tensors, expansion_dims)) \ No newline at end of file + for tensor, dim in zip(tensors, expansion_dims)) From 2b40c7294b914ad3e46bed3a7576dfe322011769 Mon Sep 17 00:00:00 2001 From: vllmellm Date: Fri, 16 May 2025 17:53:39 +0000 Subject: [PATCH 04/10] udpate docker file for new AITER package Signed-off-by: vllmellm --- docker/Dockerfile.rocm_base | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docker/Dockerfile.rocm_base b/docker/Dockerfile.rocm_base index 222b9c158e5e..45efcbde698b 100644 --- a/docker/Dockerfile.rocm_base +++ b/docker/Dockerfile.rocm_base @@ -12,7 +12,7 @@ ARG PYTORCH_REPO="https://github.com/pytorch/pytorch.git" ARG PYTORCH_VISION_REPO="https://github.com/pytorch/vision.git" ARG FA_BRANCH="1a7f4dfa" ARG FA_REPO="https://github.com/Dao-AILab/flash-attention.git" -ARG AITER_BRANCH="5a77249" +ARG AITER_BRANCH="c1debd8" ARG AITER_REPO="https://github.com/ROCm/aiter.git" FROM ${BASE_IMAGE} AS base From 85a7151d180de411d9593f7831a5f1d8c437685f Mon Sep 17 00:00:00 2001 From: vllmellm Date: Mon, 19 May 2025 02:39:42 +0000 Subject: [PATCH 05/10] update shuffle weights documentation. 
Signed-off-by: vllmellm --- vllm/model_executor/layers/fused_moe/layer.py | 5 ++--- .../layers/fused_moe/rocm_aiter_fused_moe.py | 13 ++++++++++--- .../compressed_tensors/compressed_tensors_moe.py | 5 ++--- vllm/model_executor/layers/quantization/fp8.py | 9 +++------ 4 files changed, 17 insertions(+), 15 deletions(-) diff --git a/vllm/model_executor/layers/fused_moe/layer.py b/vllm/model_executor/layers/fused_moe/layer.py index c90c0e03fc19..220bf110a310 100644 --- a/vllm/model_executor/layers/fused_moe/layer.py +++ b/vllm/model_executor/layers/fused_moe/layer.py @@ -376,9 +376,8 @@ def process_weights_after_loading(self, layer: torch.nn.Module) -> None: shuffle_weights) if self.rocm_aiter_moe_enabled: - shuffled_w13, shuffled_w2 = shuffle_weights(layer.w13_weight.data, - layer.w2_weight.data, - layout=(16, 16)) + shuffled_w13, shuffled_w2 = shuffle_weights( + layer.w13_weight.data, layer.w2_weight.data) layer.w13_weight.data = shuffled_w13 layer.w2_weight.data = shuffled_w2 diff --git a/vllm/model_executor/layers/fused_moe/rocm_aiter_fused_moe.py b/vllm/model_executor/layers/fused_moe/rocm_aiter_fused_moe.py index 0e22b8111bed..c3d2fefd1399 100644 --- a/vllm/model_executor/layers/fused_moe/rocm_aiter_fused_moe.py +++ b/vllm/model_executor/layers/fused_moe/rocm_aiter_fused_moe.py @@ -348,14 +348,21 @@ def rocm_aiter_topk_softmax(topk_weights: torch.Tensor, return topk_weights, topk_indices -def shuffle_weights(*tensors: torch.Tensor, - layout: tuple[int, int]) -> tuple[torch.Tensor, ...]: +def shuffle_weights( + *tensors: torch.Tensor, layout: tuple[int, int] = (16, 16) +) -> tuple[torch.Tensor, ...]: """ Applies shuffle_weight function from AITER to each input tensor and returns them. + + Rearranges (shuffles) the input tensor/s + into a specified block layout for optimized computation. Args: - *tensors: Variable number of torch.Tensor objects. + *tensors: Variable number of torch.Tensor objects. + layout: A pair of integers specifying the + block sizes used to divide the tensors during shuffling. + Default is (16, 16). Returns: A Tuple of shuffled tensors. diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py index fa0067c44802..9241ceeb4db2 100644 --- a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py +++ b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py @@ -286,9 +286,8 @@ def process_weights_after_loading(self, layer: torch.nn.Module) -> None: rocm_aiter_fused_experts, shuffle_weights) # reshaping weights is required for aiter moe kernel. - shuffled_w13, shuffled_w2 = shuffle_weights(layer.w13_weight.data, - layer.w2_weight.data, - layout=(16, 16)) + shuffled_w13, shuffled_w2 = shuffle_weights( + layer.w13_weight.data, layer.w2_weight.data) layer.w13_weight = torch.nn.Parameter(shuffled_w13, requires_grad=False) diff --git a/vllm/model_executor/layers/quantization/fp8.py b/vllm/model_executor/layers/quantization/fp8.py index e5fa08fd222c..546a6144a257 100644 --- a/vllm/model_executor/layers/quantization/fp8.py +++ b/vllm/model_executor/layers/quantization/fp8.py @@ -629,9 +629,7 @@ def process_weights_after_loading(self, layer: Module) -> None: if self.rocm_aiter_moe_enabled: # reshaping weights is required for aiter moe kernel. 
shuffled_w13, shuffled_w2 = shuffle_weights( - layer.w13_weight.data, - layer.w2_weight.data, - layout=(16, 16)) + layer.w13_weight.data, layer.w2_weight.data) layer.w13_weight = torch.nn.Parameter(shuffled_w13, requires_grad=False) @@ -688,9 +686,8 @@ def process_weights_after_loading(self, layer: Module) -> None: layer.w2_weight_scale = torch.nn.Parameter( w2_scales.contiguous(), requires_grad=False) - shuffled_w13, shuffled_w2 = shuffle_weights(layer.w13_weight, - layer.w2_weight, - layout=(16, 16)) + shuffled_w13, shuffled_w2 = shuffle_weights( + layer.w13_weight, layer.w2_weight) layer.w13_weight = torch.nn.Parameter(shuffled_w13, requires_grad=False) From 33e36d6c85451ab57738cf7bdc0de2ab6ee66474 Mon Sep 17 00:00:00 2001 From: vllmellm Date: Thu, 22 May 2025 05:20:13 +0000 Subject: [PATCH 06/10] only enable per tensor quantization for fp8 w8a8 Signed-off-by: vllmellm --- .../layers/fused_moe/rocm_aiter_fused_moe.py | 35 +++++-------------- .../model_executor/layers/quantization/fp8.py | 13 +------ 2 files changed, 10 insertions(+), 38 deletions(-) diff --git a/vllm/model_executor/layers/fused_moe/rocm_aiter_fused_moe.py b/vllm/model_executor/layers/fused_moe/rocm_aiter_fused_moe.py index c3d2fefd1399..49df3e3219d1 100644 --- a/vllm/model_executor/layers/fused_moe/rocm_aiter_fused_moe.py +++ b/vllm/model_executor/layers/fused_moe/rocm_aiter_fused_moe.py @@ -12,7 +12,12 @@ class QuantMethod(IntEnum): # This allows interfacing with AITER QuantType Enum - # without importing the QuantType from AITER globally + # without importing the QuantType from AITER globally. + + # Note that these quantization methods are + # supported in AITER package. However, + # not all are used in this module. + NO = 0 # a16w16 PER_TENSOR = 1 # w8a8 (pre_Tensor) PER_TOKEN = 2 # w8a8/w8a4 (per_Token) @@ -22,7 +27,7 @@ class QuantMethod(IntEnum): class ActivationMethod(IntEnum): # This allows interfacing with AITER ActivationType enum - # without importing the ActivationType enum from AITER globally + # without importing the ActivationType enum from AITER globally. SILU = 0 GELU = 1 @@ -306,9 +311,8 @@ def rocm_aiter_fused_experts( assert w2_scale is not None quant_method = QuantMethod.BLOCK_128x128.value elif use_fp8_w8a8: - quant_method = QuantMethod.PER_TOKEN.value - if a1_scale is not None and a2_scale is not None: - quant_method = QuantMethod.PER_TENSOR.value + # Currently only per tensor quantization method is enabled. + quant_method = QuantMethod.PER_TENSOR.value if apply_router_weight_on_input: assert (topk_weights.dim() == 2 @@ -371,24 +375,3 @@ def shuffle_weights( return tuple(shuffle_weight(tensor, layout=layout) for tensor in tensors) - -def expand_weights(*tensors: torch.Tensor, - expansion_dims: list[int]) -> tuple[torch.Tensor, ...]: - """ - Expands the dimensions of input tensors. - - Args: - *tensors: A variable number of torch.Tensor objects. - expansion_dims: A list of expansion dimensions - corresponding to each tensor. - - Returns: - A Tuple of tensors with expanded dimensions. - """ - - assert len(tensors) == len(expansion_dims), \ - "Number of tensors must match the number of expansion dimensions." 
- - return tuple( - tensor.unsqueeze(-1).unsqueeze(-1).expand((-1, dim, -1)) - for tensor, dim in zip(tensors, expansion_dims)) diff --git a/vllm/model_executor/layers/quantization/fp8.py b/vllm/model_executor/layers/quantization/fp8.py index 546a6144a257..e04d2bf5a298 100644 --- a/vllm/model_executor/layers/quantization/fp8.py +++ b/vllm/model_executor/layers/quantization/fp8.py @@ -597,7 +597,7 @@ def create_weights(self, layer: Module, num_experts: int, hidden_size: int, def process_weights_after_loading(self, layer: Module) -> None: # Lazy import to avoid importing triton too early. from vllm.model_executor.layers.fused_moe.rocm_aiter_fused_moe import ( - expand_weights, is_rocm_aiter_moe_enabled, shuffle_weights) + is_rocm_aiter_moe_enabled, shuffle_weights) self.rocm_aiter_moe_enabled = is_rocm_aiter_moe_enabled() @@ -675,17 +675,6 @@ def process_weights_after_loading(self, layer: Module) -> None: requires_grad=False) if self.rocm_aiter_moe_enabled: # reshaping weights is required for aiter moe kernel. - w13_scales, w2_scales = expand_weights( - layer.w13_weight_scale.data, - layer.w2_weight_scale.data, - expansion_dims=[ - layer.w13_weight.shape[1], layer.w2_weight.shape[1] - ]) - layer.w13_weight_scale = torch.nn.Parameter( - w13_scales.contiguous(), requires_grad=False) - layer.w2_weight_scale = torch.nn.Parameter( - w2_scales.contiguous(), requires_grad=False) - shuffled_w13, shuffled_w2 = shuffle_weights( layer.w13_weight, layer.w2_weight) From aa9e31de91307763d983cc49b18d12a225bd730c Mon Sep 17 00:00:00 2001 From: vllmellm Date: Thu, 22 May 2025 07:21:43 +0000 Subject: [PATCH 07/10] fix precommit error Signed-off-by: vllmellm --- vllm/model_executor/layers/fused_moe/rocm_aiter_fused_moe.py | 1 - 1 file changed, 1 deletion(-) diff --git a/vllm/model_executor/layers/fused_moe/rocm_aiter_fused_moe.py b/vllm/model_executor/layers/fused_moe/rocm_aiter_fused_moe.py index 49df3e3219d1..b46a1606636a 100644 --- a/vllm/model_executor/layers/fused_moe/rocm_aiter_fused_moe.py +++ b/vllm/model_executor/layers/fused_moe/rocm_aiter_fused_moe.py @@ -374,4 +374,3 @@ def shuffle_weights( from aiter.ops.shuffle import shuffle_weight return tuple(shuffle_weight(tensor, layout=layout) for tensor in tensors) - From ee9506fdea7ef5519b13db18cee2ac1bee63e9f9 Mon Sep 17 00:00:00 2001 From: vllmellm Date: Thu, 22 May 2025 07:34:33 +0000 Subject: [PATCH 08/10] fix precommit error Signed-off-by: vllmellm --- vllm/model_executor/layers/fused_moe/rocm_aiter_fused_moe.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/vllm/model_executor/layers/fused_moe/rocm_aiter_fused_moe.py b/vllm/model_executor/layers/fused_moe/rocm_aiter_fused_moe.py index b46a1606636a..be158a328a60 100644 --- a/vllm/model_executor/layers/fused_moe/rocm_aiter_fused_moe.py +++ b/vllm/model_executor/layers/fused_moe/rocm_aiter_fused_moe.py @@ -14,8 +14,8 @@ class QuantMethod(IntEnum): # This allows interfacing with AITER QuantType Enum # without importing the QuantType from AITER globally. - # Note that these quantization methods are - # supported in AITER package. However, + # Note that these quantization methods are + # supported in AITER package. However, # not all are used in this module. 
NO = 0 # a16w16 From 55e11f7b21e1444860cc4fc346f3a3ab25904649 Mon Sep 17 00:00:00 2001 From: vllmellm Date: Thu, 22 May 2025 14:31:34 +0000 Subject: [PATCH 09/10] avoid unnecessary multiplication of hidden_states and topk_weights instead pass the operation to be done in the kernel api Signed-off-by: vllmellm --- .../layers/fused_moe/rocm_aiter_fused_moe.py | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/vllm/model_executor/layers/fused_moe/rocm_aiter_fused_moe.py b/vllm/model_executor/layers/fused_moe/rocm_aiter_fused_moe.py index be158a328a60..10b61fcda176 100644 --- a/vllm/model_executor/layers/fused_moe/rocm_aiter_fused_moe.py +++ b/vllm/model_executor/layers/fused_moe/rocm_aiter_fused_moe.py @@ -322,11 +322,6 @@ def rocm_aiter_fused_experts( topk == 1 ), "Only support topk=1 when `apply_router_weight_on_input` is True" - hidden_states = hidden_states * topk_weights.to( - hidden_states.dtype) - topk_ids = topk_ids.to(torch.int32) - topk_weights = torch.ones_like(topk_weights, dtype=torch.float32) - return torch.ops.vllm.rocm_aiter_fused_moe( hidden_states, w1, @@ -338,7 +333,8 @@ def rocm_aiter_fused_experts( w1_scale=w1_scale, w2_scale=w2_scale, a1_scale=a1_scale, - a2_scale=a2_scale) + a2_scale=a2_scale, + doweight_stage1=apply_router_weight_on_input) def rocm_aiter_topk_softmax(topk_weights: torch.Tensor, From 730fb7785d1975fa42c87016728084572946c347 Mon Sep 17 00:00:00 2001 From: vllmellm Date: Fri, 23 May 2025 08:23:34 +0000 Subject: [PATCH 10/10] remove unnecessary layout argument Signed-off-by: vllmellm --- vllm/model_executor/layers/quantization/fp8.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/vllm/model_executor/layers/quantization/fp8.py b/vllm/model_executor/layers/quantization/fp8.py index e04d2bf5a298..cb12c78e5a3a 100644 --- a/vllm/model_executor/layers/quantization/fp8.py +++ b/vllm/model_executor/layers/quantization/fp8.py @@ -748,9 +748,8 @@ def process_weights_after_loading(self, layer: Module) -> None: start += shard_size if self.rocm_aiter_moe_enabled: - shuffled_w13, shuffled_w2 = shuffle_weights(layer.w13_weight, - layer.w2_weight, - layout=(16, 16)) + shuffled_w13, shuffled_w2 = shuffle_weights( + layer.w13_weight, layer.w2_weight) layer.w13_weight = torch.nn.Parameter(shuffled_w13, requires_grad=False)
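
A minimal, self-contained sketch of the quantization dispatch this series converges on. The QuantMethod values and the branch structure are copied from the diff above (PATCH 01 and PATCH 06); the standalone select_quant_method helper and the assertions at the bottom are illustrative only and do not exist in the patched module, and the tkw1 per-channel path is omitted because it is dispatched before this branch in rocm_aiter_fused_experts.

from enum import IntEnum
from typing import Optional


class QuantMethod(IntEnum):
    # Mirrors the enum added in PATCH 01: the integer values line up with
    # AITER's QuantType so plain ints can cross the custom-op boundary.
    NO = 0             # a16w16
    PER_TENSOR = 1     # w8a8 (per_Tensor)
    PER_TOKEN = 2      # w8a8/w8a4 (per_Token)
    BLOCK_1X128 = 3    # block-quantized w8a8 (per_1x128)
    BLOCK_128x128 = 4  # block-quantized w8a8 (per_128x128)


def select_quant_method(use_fp8_w8a8: bool,
                        block_shape: Optional[list[int]],
                        apply_router_weight_on_input: bool) -> QuantMethod:
    """Hypothetical helper mirroring rocm_aiter_fused_experts after
    PATCH 06: unquantized by default, 128x128 block scales when a block
    shape is given, otherwise per-tensor FP8."""
    if not use_fp8_w8a8:
        return QuantMethod.NO
    if block_shape is not None:
        assert not apply_router_weight_on_input, (
            "apply_router_weight_on_input is not supported for "
            "block-scaled MoE")
        return QuantMethod.BLOCK_128x128
    # Per PATCH 06, only per-tensor FP8 is enabled on this path for now.
    return QuantMethod.PER_TENSOR


if __name__ == "__main__":
    assert select_quant_method(True, [128, 128],
                               False) == QuantMethod.BLOCK_128x128
    assert select_quant_method(True, None, False) == QuantMethod.PER_TENSOR
    assert select_quant_method(False, None, True) == QuantMethod.NO

The resulting value is passed as quant_method= to the single torch.ops.vllm.rocm_aiter_fused_moe op, which converts it back to AITER's QuantType inside rocm_aiter_fused_moe_impl.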