From a989ce08420f86344a9ee3be41bd8ab03447be6e Mon Sep 17 00:00:00 2001 From: vllmellm Date: Fri, 16 May 2025 10:19:58 +0000 Subject: [PATCH 01/10] use single AITER fmoe module Signed-off-by: vllmellm --- .../layers/fused_moe/rocm_aiter_fused_moe.py | 332 +++++------------- 1 file changed, 82 insertions(+), 250 deletions(-) diff --git a/vllm/model_executor/layers/fused_moe/rocm_aiter_fused_moe.py b/vllm/model_executor/layers/fused_moe/rocm_aiter_fused_moe.py index a92081862bfa..31964b709e8a 100644 --- a/vllm/model_executor/layers/fused_moe/rocm_aiter_fused_moe.py +++ b/vllm/model_executor/layers/fused_moe/rocm_aiter_fused_moe.py @@ -1,6 +1,7 @@ # SPDX-License-Identifier: Apache-2.0 from functools import cache from typing import Optional +from enum import IntEnum import torch @@ -8,6 +9,12 @@ from vllm.platforms import current_platform from vllm.utils import direct_register_custom_op +class QuantMethod(IntEnum): + NO = 0 # a16w16 + PER_TENSOR = 1 # w8a8 (pre_Tensor) + PER_TOKEN = 2 # w8a8/w8a4 (per_Token) + BLOCK_1X128 = 3 # block quantized w8a8 (per_1x128) + BLOCK_128x128 = 4 # block quantized w8a8 (per_128x128) @cache def is_rocm_aiter_moe_enabled() -> bool: @@ -68,163 +75,6 @@ def rocm_aiter_asm_moe_tkw1_fake( activation_str: str = "silu") -> torch.Tensor: return torch.empty_like(hidden_states) - -def rocm_aiter_fmoe_fp8_blockscale_g1u1_impl( - topk_ids: torch.Tensor, - topk_weights: torch.Tensor, - hidden_states_dtype: torch.dtype, - expert_mask: torch.Tensor, - a1: torch.Tensor, - w1: torch.Tensor, - w2: torch.Tensor, - w1_scale: torch.Tensor, - w2_scale: torch.Tensor, - a1_scale: torch.Tensor, - block_shape: list[int], - smooth_scale: Optional[torch.Tensor] = None) -> torch.Tensor: - from aiter import fmoe_fp8_blockscale_g1u1 - from aiter.fused_moe_bf16_asm import moe_sorting_ck - - topk = topk_ids.shape[1] - model_dim = w1.shape[-1] - local_E = E = w1.shape[0] - if expert_mask is not None: - E = expert_mask.numel() - - ( - sorted_token_ids, - sorted_weight_buf, - sorted_expert_ids, - num_valid_ids, - out_asm, - ) = moe_sorting_ck(topk_ids, - topk_weights, - E, - model_dim, - hidden_states_dtype, - expert_mask=expert_mask) - - fmoe_fp8_blockscale_g1u1(out_asm, a1, w1, w2, sorted_token_ids, - sorted_weight_buf, sorted_expert_ids, - num_valid_ids, topk, - a1_scale.t().contiguous(), - w1_scale.view(local_E, -1), - w2_scale.view(local_E, - -1), *block_shape, smooth_scale) - - return out_asm - - -def rocm_aiter_fmoe_fp8_blockscale_g1u1_fake( - topk_ids: torch.Tensor, - topk_weights: torch.Tensor, - hidden_states_dtype: torch.dtype, - expert_mask: torch.Tensor, - a1: torch.Tensor, - w1: torch.Tensor, - w2: torch.Tensor, - w1_scale: torch.Tensor, - w2_scale: torch.Tensor, - a1_scale: torch.Tensor, - block_shape: list[int], - smooth_scale: Optional[torch.Tensor] = None) -> torch.Tensor: - - return torch.empty_like(a1, dtype=hidden_states_dtype) - - -def rocm_aiter_asm_moe_impl(hidden_states: torch.Tensor, - w1: torch.Tensor, - w2: torch.Tensor, - topk_weights: torch.Tensor, - topk_ids: torch.Tensor, - fc1_scale: Optional[torch.Tensor] = None, - fc2_scale: Optional[torch.Tensor] = None, - fc1_smooth_scale: Optional[torch.Tensor] = None, - fc2_smooth_scale: Optional[torch.Tensor] = None, - a16: bool = False, - activation: str = "silu") -> torch.Tensor: - import aiter.fused_moe_bf16_asm as rocm_aiter_asm_fmoe - from aiter import ActivationType - - assert activation in ["silu", "gelu"], "The given activation:" \ - f" {activation}" \ - " is not supported in" \ - " AITER." 
- if activation == "silu": - aiter_activation = ActivationType.Silu - else: - aiter_activation = ActivationType.Gelu - - return rocm_aiter_asm_fmoe.asm_moe(hidden_states=hidden_states, - w1=w1, - w2=w2, - topk_weight=topk_weights, - topk_ids=topk_ids, - fc1_scale=fc1_scale, - fc2_scale=fc2_scale, - fc1_smooth_scale=fc1_smooth_scale, - fc2_smooth_scale=fc2_smooth_scale, - a16=a16, - activation=aiter_activation) - - -def rocm_aiter_asm_moe_fake(hidden_states: torch.Tensor, - w1: torch.Tensor, - w2: torch.Tensor, - topk_weights: torch.Tensor, - topk_ids: torch.Tensor, - fc1_scale: Optional[torch.Tensor] = None, - fc2_scale: Optional[torch.Tensor] = None, - fc1_smooth_scale: Optional[torch.Tensor] = None, - fc2_smooth_scale: Optional[torch.Tensor] = None, - a16: bool = False, - activation: str = "silu") -> torch.Tensor: - return torch.empty_like(hidden_states) - - -def rocm_aiter_ck_moe_2stages_impl( - hidden_states: torch.Tensor, - w1: torch.Tensor, - w2: torch.Tensor, - topk_weights: torch.Tensor, - topk_ids: torch.Tensor, - fc1_scale: Optional[torch.Tensor] = None, - fc2_scale: Optional[torch.Tensor] = None, - a1_scale: Optional[torch.Tensor] = None, - a2_scale: Optional[torch.Tensor] = None, - block_size: Optional[list[int]] = None, - expert_mask: Optional[torch.Tensor] = None, -) -> torch.Tensor: - from aiter.fused_moe_bf16_asm import ck_moe_2stages - return ck_moe_2stages(a1=hidden_states, - w1=w1, - w2=w2, - topk_weight=topk_weights, - topk_ids=topk_ids, - fc1_scale=fc1_scale, - fc2_scale=fc2_scale, - a1_scale=a1_scale, - a2_scale=a2_scale, - block_size=block_size, - expert_mask=expert_mask) - - -def rocm_aiter_ck_moe_2stages_fake( - hidden_states: torch.Tensor, - w1: torch.Tensor, - w2: torch.Tensor, - topk_weights: torch.Tensor, - topk_ids: torch.Tensor, - fc1_scale: Optional[torch.Tensor] = None, - fc2_scale: Optional[torch.Tensor] = None, - a1_scale: Optional[torch.Tensor] = None, - a2_scale: Optional[torch.Tensor] = None, - block_size: Optional[list[int]] = None, - expert_mask: Optional[torch.Tensor] = None, -) -> torch.Tensor: - return torch.empty_like(hidden_states) - - def rocm_aiter_topk_softmax_impl(topk_weights: torch.Tensor, topk_indices: torch.Tensor, token_expert_indices: torch.Tensor, @@ -274,6 +124,49 @@ def rocm_aiter_biased_grouped_topk_fake( pass +def rocm_aiter_fused_moe_impl( + hidden_states: torch.Tensor, + w1: torch.Tensor, + w2:torch.Tensor, + topk_weight:torch.Tensor, + topk_ids:torch.Tensor, + expert_mask: Optional[torch.Tensor]=None, # EP + activation_str: str="silu", + quant_method: int = QuantMethod.NO.value, + doweight_stage1: bool=False, + w1_scale: Optional[torch.Tensor]=None, + w2_scale: Optional[torch.Tensor]=None, + a1_scale: Optional[torch.Tensor]=None, + a2_scale: Optional[torch.Tensor]=None, +) -> torch.Tensor: + from aiter.fused_moe import fused_moe + from aiter import ActivationType + from aiter import QuantType + + activation = \ + ActivationType.Gelu if activation_str == "gelu" else ActivationType.Silu + quant_type = QuantType(quant_method) + + return fused_moe(hidden_states, w1, w2, topk_weight, topk_ids, expert_mask, activation, quant_type, doweight_stage1, w1_scale, w2_scale, a1_scale, a2_scale) + +def rocm_aiter_fused_moe_fake( + hidden_states: torch.Tensor, + w1: torch.Tensor, + w2:torch.Tensor, + topk_weight:torch.Tensor, + topk_ids:torch.Tensor, + expert_mask: Optional[torch.Tensor]=None, + activation_str: str="silu", + quant_method: int=QuantMethod.NO.value, + doweight_stage1: bool=False, + w1_scale: Optional[torch.Tensor]=None, + 
w2_scale: Optional[torch.Tensor]=None, + a1_scale: Optional[torch.Tensor]=None, + a2_scale: Optional[torch.Tensor]=None, +) -> torch.Tensor: + return torch.empty_like(hidden_states) + + if current_platform.is_rocm(): direct_register_custom_op( @@ -285,26 +178,10 @@ def rocm_aiter_biased_grouped_topk_fake( ) direct_register_custom_op( - op_name="rocm_aiter_fmoe_fp8_blockscale_g1u1", - op_func=rocm_aiter_fmoe_fp8_blockscale_g1u1_impl, + op_name="rocm_aiter_fused_moe", + op_func=rocm_aiter_fused_moe_impl, mutates_args=[], - fake_impl=rocm_aiter_fmoe_fp8_blockscale_g1u1_fake, - dispatch_key=current_platform.dispatch_key, - ) - - direct_register_custom_op( - op_name="rocm_aiter_asm_moe", - op_func=rocm_aiter_asm_moe_impl, - mutates_args=[], - fake_impl=rocm_aiter_asm_moe_fake, - dispatch_key=current_platform.dispatch_key, - ) - - direct_register_custom_op( - op_name="rocm_aiter_ck_moe_2stages", - op_func=rocm_aiter_ck_moe_2stages_impl, - mutates_args=[], - fake_impl=rocm_aiter_ck_moe_2stages_fake, + fake_impl=rocm_aiter_fused_moe_fake, dispatch_key=current_platform.dispatch_key, ) @@ -324,7 +201,6 @@ def rocm_aiter_biased_grouped_topk_fake( dispatch_key=current_platform.dispatch_key, ) - def rocm_aiter_biased_group_topk( hidden_states: torch.Tensor, gating_output: torch.Tensor, @@ -373,32 +249,12 @@ def rocm_aiter_fused_experts( a2_scale: Optional[torch.Tensor] = None, block_shape: Optional[list[int]] = None) -> torch.Tensor: - from vllm.model_executor.layers.quantization.utils.fp8_utils import ( - per_token_group_quant_fp8) - # All AITER Fused MoE kernels are expecting the following datatypes topk_weights = topk_weights.to(torch.float32) topk_ids = topk_ids.to(torch.int32) - # w8a8 block-scaled - if block_shape is not None and use_fp8_w8a8: - assert not apply_router_weight_on_input, ( - "apply_router_weight_on_input is not supported for block scaled moe" - ) - assert w1_scale is not None - assert w2_scale is not None - - # The default block sizes are 128 in AITER. - block_shape = [128, 128] if block_shape is None else block_shape - - a1, a1_scale = per_token_group_quant_fp8(hidden_states, block_shape[1]) - - return torch.ops.vllm.rocm_aiter_fmoe_fp8_blockscale_g1u1( - topk_ids, topk_weights, hidden_states.dtype, None, a1, w1, w2, - w1_scale, w2_scale, a1_scale, block_shape, None) - # w8a8 per-channel quantization - elif per_channel_quant and apply_router_weight_on_input and use_fp8_w8a8: + if per_channel_quant and apply_router_weight_on_input and use_fp8_w8a8: # AITER tkw1 kernel for FP8 models with `apply_router_weight_on_input` # This applies topk_weights on the GEMM output of the first FC layer # rather than the second FC. 
@@ -423,58 +279,34 @@ def rocm_aiter_fused_experts( expert_mask=None, activation_str=activation) - # w8a8 per-tensor activation per-tensor weight - elif use_fp8_w8a8: - assert not apply_router_weight_on_input, ( - "apply_router_weight_on_input is not supported for fp8_w8a8") - - # - faster static per-tensor-activation static per-tensor-weight - # fp8 quantization w8a8 - if a1_scale is not None and a2_scale is not None: - return torch.ops.vllm.rocm_aiter_ck_moe_2stages( - hidden_states=hidden_states, - w1=w1, - w2=w2, - topk_weights=topk_weights, - topk_ids=topk_ids, - fc1_scale=w1_scale, - fc2_scale=w2_scale, - a1_scale=a1_scale, - a2_scale=a2_scale) - - # - fallback static per-tensor-activation static per-tensor-weight - # fp8 quantization w8a8 - # - dynamic per-tensor activation static per-tensor-weight - # fp8 quantization w8a8 - return torch.ops.vllm.rocm_aiter_asm_moe(hidden_states=hidden_states, - w1=w1, - w2=w2, - topk_weights=topk_weights, - topk_ids=topk_ids, - fc1_scale=w1_scale, - fc2_scale=w2_scale, - fc1_smooth_scale=None, - fc2_smooth_scale=None, - a16=False, - activation=activation) - if apply_router_weight_on_input: - assert (topk_weights.dim() == 2 - ), "`topk_weights` should be in shape (num_tokens, topk)" - _, topk = topk_weights.shape - assert ( - topk == 1 - ), "Only support topk=1 when `apply_router_weight_on_input` is True" - - hidden_states = hidden_states * topk_weights.to(hidden_states.dtype) - topk_ids = topk_ids.to(torch.int32) - topk_weights = torch.ones_like(topk_weights, dtype=torch.float32) - - return torch.ops.vllm.rocm_aiter_ck_moe_2stages( - hidden_states=hidden_states, - w1=w1, - w2=w2, - topk_weights=topk_weights, - topk_ids=topk_ids) + else: + quant_method = QuantMethod.NO.value + + # w8a8 block-scaled + if block_shape is not None and use_fp8_w8a8: + assert not apply_router_weight_on_input, ( + "apply_router_weight_on_input is not supported for block scaled moe" + ) + assert w1_scale is not None + assert w2_scale is not None + quant_method = QuantMethod.BLOCK_128x128.value + elif use_fp8_w8a8: + quant_method = QuantMethod.PER_TOKEN.value + + if apply_router_weight_on_input: + assert (topk_weights.dim() == 2 + ), "`topk_weights` should be in shape (num_tokens, topk)" + _, topk = topk_weights.shape + assert ( + topk == 1 + ), "Only support topk=1 when `apply_router_weight_on_input` is True" + + hidden_states = hidden_states * topk_weights.to(hidden_states.dtype) + topk_ids = topk_ids.to(torch.int32) + topk_weights = torch.ones_like(topk_weights, dtype=torch.float32) + + return torch.ops.vllm.rocm_aiter_fused_moe(hidden_states, w1, w2, topk_weights, topk_ids, quant_method=quant_method, activation_str=activation, w1_scale=w1_scale, w2_scale=w2_scale, a1_scale=a1_scale, a2_scale=a2_scale) + def rocm_aiter_topk_softmax(topk_weights: torch.Tensor, From da0622c4f07622eb8a37de4f631278866aca4335 Mon Sep 17 00:00:00 2001 From: vllmellm Date: Fri, 16 May 2025 11:15:33 +0000 Subject: [PATCH 02/10] bugfixes: remove the weight scale expension and set correct layout size Signed-off-by: vllmellm --- vllm/model_executor/layers/fused_moe/layer.py | 3 +-- .../layers/fused_moe/rocm_aiter_fused_moe.py | 2 ++ vllm/model_executor/layers/quantization/fp8.py | 13 +------------ 3 files changed, 4 insertions(+), 14 deletions(-) diff --git a/vllm/model_executor/layers/fused_moe/layer.py b/vllm/model_executor/layers/fused_moe/layer.py index f1cb77f64eae..c90c0e03fc19 100644 --- a/vllm/model_executor/layers/fused_moe/layer.py +++ b/vllm/model_executor/layers/fused_moe/layer.py @@ 
-376,10 +376,9 @@ def process_weights_after_loading(self, layer: torch.nn.Module) -> None: shuffle_weights) if self.rocm_aiter_moe_enabled: - # use 2stage ck moe layout shuffled_w13, shuffled_w2 = shuffle_weights(layer.w13_weight.data, layer.w2_weight.data, - layout=(32, 32)) + layout=(16, 16)) layer.w13_weight.data = shuffled_w13 layer.w2_weight.data = shuffled_w2 diff --git a/vllm/model_executor/layers/fused_moe/rocm_aiter_fused_moe.py b/vllm/model_executor/layers/fused_moe/rocm_aiter_fused_moe.py index 31964b709e8a..034b65e26843 100644 --- a/vllm/model_executor/layers/fused_moe/rocm_aiter_fused_moe.py +++ b/vllm/model_executor/layers/fused_moe/rocm_aiter_fused_moe.py @@ -292,6 +292,8 @@ def rocm_aiter_fused_experts( quant_method = QuantMethod.BLOCK_128x128.value elif use_fp8_w8a8: quant_method = QuantMethod.PER_TOKEN.value + if a1_scale is not None and a2_scale is not None: + quant_method = QuantMethod.PER_TENSOR.value if apply_router_weight_on_input: assert (topk_weights.dim() == 2 diff --git a/vllm/model_executor/layers/quantization/fp8.py b/vllm/model_executor/layers/quantization/fp8.py index f4cdc3db1a0d..e5fa08fd222c 100644 --- a/vllm/model_executor/layers/quantization/fp8.py +++ b/vllm/model_executor/layers/quantization/fp8.py @@ -762,20 +762,9 @@ def process_weights_after_loading(self, layer: Module) -> None: start += shard_size if self.rocm_aiter_moe_enabled: - # reshaping weights is required for aiter moe kernel. - expansion_dims = [ - layer.w13_weight.shape[1], layer.w2_weight.shape[1] - ] - max_w13_scales, w2_scales = expand_weights( - max_w13_scales, - layer.w2_weight_scale.data, - expansion_dims=expansion_dims) - layer.w2_weight_scale = torch.nn.Parameter( - w2_scales.contiguous(), requires_grad=False) - shuffled_w13, shuffled_w2 = shuffle_weights(layer.w13_weight, layer.w2_weight, - layout=(32, 32)) + layout=(16, 16)) layer.w13_weight = torch.nn.Parameter(shuffled_w13, requires_grad=False) From 350bf88eb0a7535bb897ece65bd6eda55d59a9c7 Mon Sep 17 00:00:00 2001 From: vllmellm Date: Fri, 16 May 2025 11:53:35 +0000 Subject: [PATCH 03/10] clean code in aiter fmoe module Signed-off-by: vllmellm --- .../layers/fused_moe/rocm_aiter_fused_moe.py | 114 +++++++++++------- 1 file changed, 70 insertions(+), 44 deletions(-) diff --git a/vllm/model_executor/layers/fused_moe/rocm_aiter_fused_moe.py b/vllm/model_executor/layers/fused_moe/rocm_aiter_fused_moe.py index 034b65e26843..0e22b8111bed 100644 --- a/vllm/model_executor/layers/fused_moe/rocm_aiter_fused_moe.py +++ b/vllm/model_executor/layers/fused_moe/rocm_aiter_fused_moe.py @@ -1,7 +1,7 @@ # SPDX-License-Identifier: Apache-2.0 +from enum import IntEnum from functools import cache from typing import Optional -from enum import IntEnum import torch @@ -9,12 +9,23 @@ from vllm.platforms import current_platform from vllm.utils import direct_register_custom_op + class QuantMethod(IntEnum): - NO = 0 # a16w16 - PER_TENSOR = 1 # w8a8 (pre_Tensor) + # This allows interfacing with AITER QuantType Enum + # without importing the QuantType from AITER globally + NO = 0 # a16w16 + PER_TENSOR = 1 # w8a8 (pre_Tensor) PER_TOKEN = 2 # w8a8/w8a4 (per_Token) - BLOCK_1X128 = 3 # block quantized w8a8 (per_1x128) - BLOCK_128x128 = 4 # block quantized w8a8 (per_128x128) + BLOCK_1X128 = 3 # block quantized w8a8 (per_1x128) + BLOCK_128x128 = 4 # block quantized w8a8 (per_128x128) + + +class ActivationMethod(IntEnum): + # This allows interfacing with AITER ActivationType enum + # without importing the ActivationType enum from AITER globally + SILU = 0 + GELU = 
1 + @cache def is_rocm_aiter_moe_enabled() -> bool: @@ -36,13 +47,12 @@ def rocm_aiter_asm_moe_tkw1_impl( a16: bool = False, per_tensor_quant_scale: Optional[torch.Tensor] = None, expert_mask: Optional[torch.Tensor] = None, - activation_str: str = "silu") -> torch.Tensor: + activation_method: int = ActivationMethod.SILU.value) -> torch.Tensor: from aiter import ActivationType from aiter.fused_moe_bf16_asm import asm_moe_tkw1 - activation = \ - ActivationType.Gelu if activation_str == "gelu" else ActivationType.Silu + activation = ActivationType(activation_method) return asm_moe_tkw1(hidden_states, w1, @@ -72,9 +82,10 @@ def rocm_aiter_asm_moe_tkw1_fake( a16: bool = False, per_tensor_quant_scale: Optional[torch.Tensor] = None, expert_mask: Optional[torch.Tensor] = None, - activation_str: str = "silu") -> torch.Tensor: + activation_method: int = ActivationMethod.SILU.value) -> torch.Tensor: return torch.empty_like(hidden_states) + def rocm_aiter_topk_softmax_impl(topk_weights: torch.Tensor, topk_indices: torch.Tensor, token_expert_indices: torch.Tensor, @@ -127,42 +138,43 @@ def rocm_aiter_biased_grouped_topk_fake( def rocm_aiter_fused_moe_impl( hidden_states: torch.Tensor, w1: torch.Tensor, - w2:torch.Tensor, - topk_weight:torch.Tensor, - topk_ids:torch.Tensor, - expert_mask: Optional[torch.Tensor]=None, # EP - activation_str: str="silu", + w2: torch.Tensor, + topk_weight: torch.Tensor, + topk_ids: torch.Tensor, + expert_mask: Optional[torch.Tensor] = None, + activation_method: int = ActivationMethod.SILU.value, quant_method: int = QuantMethod.NO.value, - doweight_stage1: bool=False, - w1_scale: Optional[torch.Tensor]=None, - w2_scale: Optional[torch.Tensor]=None, - a1_scale: Optional[torch.Tensor]=None, - a2_scale: Optional[torch.Tensor]=None, + doweight_stage1: bool = False, + w1_scale: Optional[torch.Tensor] = None, + w2_scale: Optional[torch.Tensor] = None, + a1_scale: Optional[torch.Tensor] = None, + a2_scale: Optional[torch.Tensor] = None, ) -> torch.Tensor: + from aiter import ActivationType, QuantType from aiter.fused_moe import fused_moe - from aiter import ActivationType - from aiter import QuantType - activation = \ - ActivationType.Gelu if activation_str == "gelu" else ActivationType.Silu + activation = ActivationType(activation_method) quant_type = QuantType(quant_method) - return fused_moe(hidden_states, w1, w2, topk_weight, topk_ids, expert_mask, activation, quant_type, doweight_stage1, w1_scale, w2_scale, a1_scale, a2_scale) - + return fused_moe(hidden_states, w1, w2, topk_weight, topk_ids, expert_mask, + activation, quant_type, doweight_stage1, w1_scale, + w2_scale, a1_scale, a2_scale) + + def rocm_aiter_fused_moe_fake( hidden_states: torch.Tensor, w1: torch.Tensor, - w2:torch.Tensor, - topk_weight:torch.Tensor, - topk_ids:torch.Tensor, - expert_mask: Optional[torch.Tensor]=None, - activation_str: str="silu", - quant_method: int=QuantMethod.NO.value, - doweight_stage1: bool=False, - w1_scale: Optional[torch.Tensor]=None, - w2_scale: Optional[torch.Tensor]=None, - a1_scale: Optional[torch.Tensor]=None, - a2_scale: Optional[torch.Tensor]=None, + w2: torch.Tensor, + topk_weight: torch.Tensor, + topk_ids: torch.Tensor, + expert_mask: Optional[torch.Tensor] = None, + activation_method: int = ActivationMethod.SILU.value, + quant_method: int = QuantMethod.NO.value, + doweight_stage1: bool = False, + w1_scale: Optional[torch.Tensor] = None, + w2_scale: Optional[torch.Tensor] = None, + a1_scale: Optional[torch.Tensor] = None, + a2_scale: Optional[torch.Tensor] = None, ) -> 
torch.Tensor: return torch.empty_like(hidden_states) @@ -201,6 +213,7 @@ def rocm_aiter_fused_moe_fake( dispatch_key=current_platform.dispatch_key, ) + def rocm_aiter_biased_group_topk( hidden_states: torch.Tensor, gating_output: torch.Tensor, @@ -249,6 +262,8 @@ def rocm_aiter_fused_experts( a2_scale: Optional[torch.Tensor] = None, block_shape: Optional[list[int]] = None) -> torch.Tensor: + activation_method = (ActivationMethod.SILU + if activation == "silu" else ActivationMethod.GELU) # All AITER Fused MoE kernels are expecting the following datatypes topk_weights = topk_weights.to(torch.float32) topk_ids = topk_ids.to(torch.int32) @@ -277,16 +292,16 @@ def rocm_aiter_fused_experts( a16=False, per_tensor_quant_scale=None, expert_mask=None, - activation_str=activation) + activation_method=activation_method) else: quant_method = QuantMethod.NO.value - + # w8a8 block-scaled if block_shape is not None and use_fp8_w8a8: assert not apply_router_weight_on_input, ( - "apply_router_weight_on_input is not supported for block scaled moe" - ) + "apply_router_weight_on_input is\ + not supported for block scaled moe") assert w1_scale is not None assert w2_scale is not None quant_method = QuantMethod.BLOCK_128x128.value @@ -303,12 +318,23 @@ def rocm_aiter_fused_experts( topk == 1 ), "Only support topk=1 when `apply_router_weight_on_input` is True" - hidden_states = hidden_states * topk_weights.to(hidden_states.dtype) + hidden_states = hidden_states * topk_weights.to( + hidden_states.dtype) topk_ids = topk_ids.to(torch.int32) topk_weights = torch.ones_like(topk_weights, dtype=torch.float32) - return torch.ops.vllm.rocm_aiter_fused_moe(hidden_states, w1, w2, topk_weights, topk_ids, quant_method=quant_method, activation_str=activation, w1_scale=w1_scale, w2_scale=w2_scale, a1_scale=a1_scale, a2_scale=a2_scale) - + return torch.ops.vllm.rocm_aiter_fused_moe( + hidden_states, + w1, + w2, + topk_weights, + topk_ids, + quant_method=quant_method, + activation_method=activation_method, + w1_scale=w1_scale, + w2_scale=w2_scale, + a1_scale=a1_scale, + a2_scale=a2_scale) def rocm_aiter_topk_softmax(topk_weights: torch.Tensor, @@ -358,4 +384,4 @@ def expand_weights(*tensors: torch.Tensor, return tuple( tensor.unsqueeze(-1).unsqueeze(-1).expand((-1, dim, -1)) - for tensor, dim in zip(tensors, expansion_dims)) \ No newline at end of file + for tensor, dim in zip(tensors, expansion_dims)) From 2b40c7294b914ad3e46bed3a7576dfe322011769 Mon Sep 17 00:00:00 2001 From: vllmellm Date: Fri, 16 May 2025 17:53:39 +0000 Subject: [PATCH 04/10] udpate docker file for new AITER package Signed-off-by: vllmellm --- docker/Dockerfile.rocm_base | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docker/Dockerfile.rocm_base b/docker/Dockerfile.rocm_base index 222b9c158e5e..45efcbde698b 100644 --- a/docker/Dockerfile.rocm_base +++ b/docker/Dockerfile.rocm_base @@ -12,7 +12,7 @@ ARG PYTORCH_REPO="https://github.com/pytorch/pytorch.git" ARG PYTORCH_VISION_REPO="https://github.com/pytorch/vision.git" ARG FA_BRANCH="1a7f4dfa" ARG FA_REPO="https://github.com/Dao-AILab/flash-attention.git" -ARG AITER_BRANCH="5a77249" +ARG AITER_BRANCH="c1debd8" ARG AITER_REPO="https://github.com/ROCm/aiter.git" FROM ${BASE_IMAGE} AS base From 85a7151d180de411d9593f7831a5f1d8c437685f Mon Sep 17 00:00:00 2001 From: vllmellm Date: Mon, 19 May 2025 02:39:42 +0000 Subject: [PATCH 05/10] update shuffle weights documentation. 
Signed-off-by: vllmellm --- vllm/model_executor/layers/fused_moe/layer.py | 5 ++--- .../layers/fused_moe/rocm_aiter_fused_moe.py | 13 ++++++++++--- .../compressed_tensors/compressed_tensors_moe.py | 5 ++--- vllm/model_executor/layers/quantization/fp8.py | 9 +++------ 4 files changed, 17 insertions(+), 15 deletions(-) diff --git a/vllm/model_executor/layers/fused_moe/layer.py b/vllm/model_executor/layers/fused_moe/layer.py index c90c0e03fc19..220bf110a310 100644 --- a/vllm/model_executor/layers/fused_moe/layer.py +++ b/vllm/model_executor/layers/fused_moe/layer.py @@ -376,9 +376,8 @@ def process_weights_after_loading(self, layer: torch.nn.Module) -> None: shuffle_weights) if self.rocm_aiter_moe_enabled: - shuffled_w13, shuffled_w2 = shuffle_weights(layer.w13_weight.data, - layer.w2_weight.data, - layout=(16, 16)) + shuffled_w13, shuffled_w2 = shuffle_weights( + layer.w13_weight.data, layer.w2_weight.data) layer.w13_weight.data = shuffled_w13 layer.w2_weight.data = shuffled_w2 diff --git a/vllm/model_executor/layers/fused_moe/rocm_aiter_fused_moe.py b/vllm/model_executor/layers/fused_moe/rocm_aiter_fused_moe.py index 0e22b8111bed..c3d2fefd1399 100644 --- a/vllm/model_executor/layers/fused_moe/rocm_aiter_fused_moe.py +++ b/vllm/model_executor/layers/fused_moe/rocm_aiter_fused_moe.py @@ -348,14 +348,21 @@ def rocm_aiter_topk_softmax(topk_weights: torch.Tensor, return topk_weights, topk_indices -def shuffle_weights(*tensors: torch.Tensor, - layout: tuple[int, int]) -> tuple[torch.Tensor, ...]: +def shuffle_weights( + *tensors: torch.Tensor, layout: tuple[int, int] = (16, 16) +) -> tuple[torch.Tensor, ...]: """ Applies shuffle_weight function from AITER to each input tensor and returns them. + + Rearranges (shuffles) the input tensor/s + into a specified block layout for optimized computation. Args: - *tensors: Variable number of torch.Tensor objects. + *tensors: Variable number of torch.Tensor objects. + layout: A pair of integers specifying the + block sizes used to divide the tensors during shuffling. + Default is (16, 16). Returns: A Tuple of shuffled tensors. diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py index fa0067c44802..9241ceeb4db2 100644 --- a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py +++ b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py @@ -286,9 +286,8 @@ def process_weights_after_loading(self, layer: torch.nn.Module) -> None: rocm_aiter_fused_experts, shuffle_weights) # reshaping weights is required for aiter moe kernel. - shuffled_w13, shuffled_w2 = shuffle_weights(layer.w13_weight.data, - layer.w2_weight.data, - layout=(16, 16)) + shuffled_w13, shuffled_w2 = shuffle_weights( + layer.w13_weight.data, layer.w2_weight.data) layer.w13_weight = torch.nn.Parameter(shuffled_w13, requires_grad=False) diff --git a/vllm/model_executor/layers/quantization/fp8.py b/vllm/model_executor/layers/quantization/fp8.py index e5fa08fd222c..546a6144a257 100644 --- a/vllm/model_executor/layers/quantization/fp8.py +++ b/vllm/model_executor/layers/quantization/fp8.py @@ -629,9 +629,7 @@ def process_weights_after_loading(self, layer: Module) -> None: if self.rocm_aiter_moe_enabled: # reshaping weights is required for aiter moe kernel. 
shuffled_w13, shuffled_w2 = shuffle_weights( - layer.w13_weight.data, - layer.w2_weight.data, - layout=(16, 16)) + layer.w13_weight.data, layer.w2_weight.data) layer.w13_weight = torch.nn.Parameter(shuffled_w13, requires_grad=False) @@ -688,9 +686,8 @@ def process_weights_after_loading(self, layer: Module) -> None: layer.w2_weight_scale = torch.nn.Parameter( w2_scales.contiguous(), requires_grad=False) - shuffled_w13, shuffled_w2 = shuffle_weights(layer.w13_weight, - layer.w2_weight, - layout=(16, 16)) + shuffled_w13, shuffled_w2 = shuffle_weights( + layer.w13_weight, layer.w2_weight) layer.w13_weight = torch.nn.Parameter(shuffled_w13, requires_grad=False) From 33e36d6c85451ab57738cf7bdc0de2ab6ee66474 Mon Sep 17 00:00:00 2001 From: vllmellm Date: Thu, 22 May 2025 05:20:13 +0000 Subject: [PATCH 06/10] only enable per tensor quantization for fp8 w8a8 Signed-off-by: vllmellm --- .../layers/fused_moe/rocm_aiter_fused_moe.py | 35 +++++-------------- .../model_executor/layers/quantization/fp8.py | 13 +------ 2 files changed, 10 insertions(+), 38 deletions(-) diff --git a/vllm/model_executor/layers/fused_moe/rocm_aiter_fused_moe.py b/vllm/model_executor/layers/fused_moe/rocm_aiter_fused_moe.py index c3d2fefd1399..49df3e3219d1 100644 --- a/vllm/model_executor/layers/fused_moe/rocm_aiter_fused_moe.py +++ b/vllm/model_executor/layers/fused_moe/rocm_aiter_fused_moe.py @@ -12,7 +12,12 @@ class QuantMethod(IntEnum): # This allows interfacing with AITER QuantType Enum - # without importing the QuantType from AITER globally + # without importing the QuantType from AITER globally. + + # Note that these quantization methods are + # supported in AITER package. However, + # not all are used in this module. + NO = 0 # a16w16 PER_TENSOR = 1 # w8a8 (pre_Tensor) PER_TOKEN = 2 # w8a8/w8a4 (per_Token) @@ -22,7 +27,7 @@ class QuantMethod(IntEnum): class ActivationMethod(IntEnum): # This allows interfacing with AITER ActivationType enum - # without importing the ActivationType enum from AITER globally + # without importing the ActivationType enum from AITER globally. SILU = 0 GELU = 1 @@ -306,9 +311,8 @@ def rocm_aiter_fused_experts( assert w2_scale is not None quant_method = QuantMethod.BLOCK_128x128.value elif use_fp8_w8a8: - quant_method = QuantMethod.PER_TOKEN.value - if a1_scale is not None and a2_scale is not None: - quant_method = QuantMethod.PER_TENSOR.value + # Currently only per tensor quantization method is enabled. + quant_method = QuantMethod.PER_TENSOR.value if apply_router_weight_on_input: assert (topk_weights.dim() == 2 @@ -371,24 +375,3 @@ def shuffle_weights( return tuple(shuffle_weight(tensor, layout=layout) for tensor in tensors) - -def expand_weights(*tensors: torch.Tensor, - expansion_dims: list[int]) -> tuple[torch.Tensor, ...]: - """ - Expands the dimensions of input tensors. - - Args: - *tensors: A variable number of torch.Tensor objects. - expansion_dims: A list of expansion dimensions - corresponding to each tensor. - - Returns: - A Tuple of tensors with expanded dimensions. - """ - - assert len(tensors) == len(expansion_dims), \ - "Number of tensors must match the number of expansion dimensions." 
- - return tuple( - tensor.unsqueeze(-1).unsqueeze(-1).expand((-1, dim, -1)) - for tensor, dim in zip(tensors, expansion_dims)) diff --git a/vllm/model_executor/layers/quantization/fp8.py b/vllm/model_executor/layers/quantization/fp8.py index 546a6144a257..e04d2bf5a298 100644 --- a/vllm/model_executor/layers/quantization/fp8.py +++ b/vllm/model_executor/layers/quantization/fp8.py @@ -597,7 +597,7 @@ def create_weights(self, layer: Module, num_experts: int, hidden_size: int, def process_weights_after_loading(self, layer: Module) -> None: # Lazy import to avoid importing triton too early. from vllm.model_executor.layers.fused_moe.rocm_aiter_fused_moe import ( - expand_weights, is_rocm_aiter_moe_enabled, shuffle_weights) + is_rocm_aiter_moe_enabled, shuffle_weights) self.rocm_aiter_moe_enabled = is_rocm_aiter_moe_enabled() @@ -675,17 +675,6 @@ def process_weights_after_loading(self, layer: Module) -> None: requires_grad=False) if self.rocm_aiter_moe_enabled: # reshaping weights is required for aiter moe kernel. - w13_scales, w2_scales = expand_weights( - layer.w13_weight_scale.data, - layer.w2_weight_scale.data, - expansion_dims=[ - layer.w13_weight.shape[1], layer.w2_weight.shape[1] - ]) - layer.w13_weight_scale = torch.nn.Parameter( - w13_scales.contiguous(), requires_grad=False) - layer.w2_weight_scale = torch.nn.Parameter( - w2_scales.contiguous(), requires_grad=False) - shuffled_w13, shuffled_w2 = shuffle_weights( layer.w13_weight, layer.w2_weight) From aa9e31de91307763d983cc49b18d12a225bd730c Mon Sep 17 00:00:00 2001 From: vllmellm Date: Thu, 22 May 2025 07:21:43 +0000 Subject: [PATCH 07/10] fix precommit error Signed-off-by: vllmellm --- vllm/model_executor/layers/fused_moe/rocm_aiter_fused_moe.py | 1 - 1 file changed, 1 deletion(-) diff --git a/vllm/model_executor/layers/fused_moe/rocm_aiter_fused_moe.py b/vllm/model_executor/layers/fused_moe/rocm_aiter_fused_moe.py index 49df3e3219d1..b46a1606636a 100644 --- a/vllm/model_executor/layers/fused_moe/rocm_aiter_fused_moe.py +++ b/vllm/model_executor/layers/fused_moe/rocm_aiter_fused_moe.py @@ -374,4 +374,3 @@ def shuffle_weights( from aiter.ops.shuffle import shuffle_weight return tuple(shuffle_weight(tensor, layout=layout) for tensor in tensors) - From ee9506fdea7ef5519b13db18cee2ac1bee63e9f9 Mon Sep 17 00:00:00 2001 From: vllmellm Date: Thu, 22 May 2025 07:34:33 +0000 Subject: [PATCH 08/10] fix precommit error Signed-off-by: vllmellm --- vllm/model_executor/layers/fused_moe/rocm_aiter_fused_moe.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/vllm/model_executor/layers/fused_moe/rocm_aiter_fused_moe.py b/vllm/model_executor/layers/fused_moe/rocm_aiter_fused_moe.py index b46a1606636a..be158a328a60 100644 --- a/vllm/model_executor/layers/fused_moe/rocm_aiter_fused_moe.py +++ b/vllm/model_executor/layers/fused_moe/rocm_aiter_fused_moe.py @@ -14,8 +14,8 @@ class QuantMethod(IntEnum): # This allows interfacing with AITER QuantType Enum # without importing the QuantType from AITER globally. - # Note that these quantization methods are - # supported in AITER package. However, + # Note that these quantization methods are + # supported in AITER package. However, # not all are used in this module. 
NO = 0 # a16w16 From 55e11f7b21e1444860cc4fc346f3a3ab25904649 Mon Sep 17 00:00:00 2001 From: vllmellm Date: Thu, 22 May 2025 14:31:34 +0000 Subject: [PATCH 09/10] avoid unnecessary multiplication of hidden_states and topk_weights instead pass the operation to be done in the kernel api Signed-off-by: vllmellm --- .../layers/fused_moe/rocm_aiter_fused_moe.py | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/vllm/model_executor/layers/fused_moe/rocm_aiter_fused_moe.py b/vllm/model_executor/layers/fused_moe/rocm_aiter_fused_moe.py index be158a328a60..10b61fcda176 100644 --- a/vllm/model_executor/layers/fused_moe/rocm_aiter_fused_moe.py +++ b/vllm/model_executor/layers/fused_moe/rocm_aiter_fused_moe.py @@ -322,11 +322,6 @@ def rocm_aiter_fused_experts( topk == 1 ), "Only support topk=1 when `apply_router_weight_on_input` is True" - hidden_states = hidden_states * topk_weights.to( - hidden_states.dtype) - topk_ids = topk_ids.to(torch.int32) - topk_weights = torch.ones_like(topk_weights, dtype=torch.float32) - return torch.ops.vllm.rocm_aiter_fused_moe( hidden_states, w1, @@ -338,7 +333,8 @@ def rocm_aiter_fused_experts( w1_scale=w1_scale, w2_scale=w2_scale, a1_scale=a1_scale, - a2_scale=a2_scale) + a2_scale=a2_scale, + doweight_stage1=apply_router_weight_on_input) def rocm_aiter_topk_softmax(topk_weights: torch.Tensor, From 730fb7785d1975fa42c87016728084572946c347 Mon Sep 17 00:00:00 2001 From: vllmellm Date: Fri, 23 May 2025 08:23:34 +0000 Subject: [PATCH 10/10] remove unnecessary layout argument Signed-off-by: vllmellm --- vllm/model_executor/layers/quantization/fp8.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/vllm/model_executor/layers/quantization/fp8.py b/vllm/model_executor/layers/quantization/fp8.py index e04d2bf5a298..cb12c78e5a3a 100644 --- a/vllm/model_executor/layers/quantization/fp8.py +++ b/vllm/model_executor/layers/quantization/fp8.py @@ -748,9 +748,8 @@ def process_weights_after_loading(self, layer: Module) -> None: start += shard_size if self.rocm_aiter_moe_enabled: - shuffled_w13, shuffled_w2 = shuffle_weights(layer.w13_weight, - layer.w2_weight, - layout=(16, 16)) + shuffled_w13, shuffled_w2 = shuffle_weights( + layer.w13_weight, layer.w2_weight) layer.w13_weight = torch.nn.Parameter(shuffled_w13, requires_grad=False)
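
A minimal, self-contained sketch of the quantization dispatch this series converges on. The QuantMethod values and the branch structure are copied from the diff above (PATCH 01 and PATCH 06); the standalone select_quant_method helper and the assertions at the bottom are illustrative only and do not exist in the patched module, and the tkw1 per-channel path is omitted because it is dispatched before this branch in rocm_aiter_fused_experts.

from enum import IntEnum
from typing import Optional


class QuantMethod(IntEnum):
    # Mirrors the enum added in PATCH 01: the integer values line up with
    # AITER's QuantType so plain ints can cross the custom-op boundary.
    NO = 0             # a16w16
    PER_TENSOR = 1     # w8a8 (per_Tensor)
    PER_TOKEN = 2      # w8a8/w8a4 (per_Token)
    BLOCK_1X128 = 3    # block-quantized w8a8 (per_1x128)
    BLOCK_128x128 = 4  # block-quantized w8a8 (per_128x128)


def select_quant_method(use_fp8_w8a8: bool,
                        block_shape: Optional[list[int]],
                        apply_router_weight_on_input: bool) -> QuantMethod:
    """Hypothetical helper mirroring rocm_aiter_fused_experts after
    PATCH 06: unquantized by default, 128x128 block scales when a block
    shape is given, otherwise per-tensor FP8."""
    if not use_fp8_w8a8:
        return QuantMethod.NO
    if block_shape is not None:
        assert not apply_router_weight_on_input, (
            "apply_router_weight_on_input is not supported for "
            "block-scaled MoE")
        return QuantMethod.BLOCK_128x128
    # Per PATCH 06, only per-tensor FP8 is enabled on this path for now.
    return QuantMethod.PER_TENSOR


if __name__ == "__main__":
    assert select_quant_method(True, [128, 128],
                               False) == QuantMethod.BLOCK_128x128
    assert select_quant_method(True, None, False) == QuantMethod.PER_TENSOR
    assert select_quant_method(False, None, True) == QuantMethod.NO

The resulting value is passed as quant_method= to the single torch.ops.vllm.rocm_aiter_fused_moe op, which converts it back to AITER's QuantType inside rocm_aiter_fused_moe_impl.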