
Commit 8a72a9c

tlrmchlsmth authored and bnellnm committed

Clean up diff

Signed-off-by: Tyler Michael Smith <tyler@neuralmagic.com>
Signed-off-by: Bill Nell <bnell@redhat.com>
1 parent 549a9fe commit 8a72a9c

File tree

3 files changed, +36 -55 lines changed


vllm/cuda_graph_utils.py

Whitespace-only changes.

vllm/model_executor/layers/fused_moe/fused_moe.py

Lines changed: 5 additions & 22 deletions
@@ -535,16 +535,7 @@ def invoke_fused_moe_kernel(A: torch.Tensor,
     grid = lambda META: (triton.cdiv(EM, META['BLOCK_SIZE_M']) * triton.cdiv(
         B.shape[1], META['BLOCK_SIZE_N']), )
 
-    if use_dg:
-        assert use_fp8_w8a8
-        # Note: we never apply the topk_weights here since it requires
-        # unpermuting and resizing the output. This goes against the
-        # existing interface as the `mul_routed_weight` argument is
-        # ignored. The weights are applied in _moe_unpermute.
-        dg.m_grouped_gemm_fp8_fp8_bf16_nt_contiguous(
-            (A, A_scale), (B, B_scale), C, expert_ids)
-
-    elif (use_int8_w8a16 or use_int4_w4a16) and \
+    if (use_int8_w8a16 or use_int4_w4a16) and \
             block_shape is not None and block_shape[1] > 0:
         assert B_scale is not None and B_scale.ndim == 3
         assert B_zp is None or B_zp.ndim == 3
@@ -848,7 +839,6 @@ def try_get_optimal_moe_config(
     M: int,
     is_marlin: bool = False,
     block_shape: Optional[List[int]] = None,
-    use_deep_gemm: bool = False,
 ):
     from vllm.model_executor.layers.fused_moe import get_config
     override_config = get_config()
@@ -871,11 +861,6 @@ def try_get_optimal_moe_config(
         # Else use the default config
         config = get_default_config(M, E, N, w1_shape[2], top_k, dtype,
                                     is_marlin, block_shape)
-
-    # Enforce DeepGemm M blocking no matter what the config says.
-    if use_deep_gemm:
-        config['BLOCK_SIZE_M'] = dg.get_m_alignment_for_contiguous_layout()
-
     return config
 
 
@@ -1048,14 +1033,13 @@ def inplace_fused_experts(hidden_states: torch.Tensor,
                           w2_zp: Optional[torch.Tensor] = None,
                           a1_scale: Optional[torch.Tensor] = None,
                           a2_scale: Optional[torch.Tensor] = None,
-                          block_shape: Optional[List[int]] = None,
-                          allow_deep_gemm: bool = False) -> None:
+                          block_shape: Optional[List[int]] = None) -> None:
     fused_experts_impl(hidden_states, w1, w2, topk_weights, topk_ids, True,
                        activation, apply_router_weight_on_input, use_fp8_w8a8,
                        use_int8_w8a8, use_int8_w8a16, use_int4_w4a16,
                        per_channel_quant, global_num_experts, expert_map,
                        w1_scale, w2_scale, w1_zp, w2_zp, a1_scale, a2_scale,
-                       block_shape, allow_deep_gemm)
+                       block_shape)
 
 
 def inplace_fused_experts_fake(
@@ -1489,7 +1473,6 @@ def fused_moe(
     a1_scale: Optional[torch.Tensor] = None,
     a2_scale: Optional[torch.Tensor] = None,
     block_shape: Optional[List[int]] = None,
-    allow_deep_gemm: bool = True,
 ) -> torch.Tensor:
     """
     This function computes a Mixture of Experts (MoE) layer using two sets of
@@ -1523,8 +1506,8 @@ def fused_moe(
         Defaults to False.
     - global_num_experts (int): The total number of experts in the global
        expert space.
-    - expert_map (Optional[torch.Tensor]): A tensor mapping expert indices
-        from the global expert space to the local expert space of the expert
+    - expert_map (Optional[torch.Tensor]): A tensor mapping expert indices
+        from the global expert space to the local expert space of the expert
        parallel shard.
     - w1_scale (Optional[torch.Tensor]): Optional scale to be used for
        w1.
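
For context, here is a minimal, hypothetical call-site sketch (not part of this commit) showing how the trimmed-down entry point is used once the DeepGEMM arguments are gone. The tensor shapes and dtypes below are made up, and it assumes the public fused_moe function keeps its hidden_states / w1 / w2 / gating_output / topk / renormalize arguments:

# Hypothetical call site, for illustration only: after this commit the
# allow_deep_gemm / use_deep_gemm keyword arguments no longer exist, so
# callers simply pass the remaining options.
import torch

from vllm.model_executor.layers.fused_moe.fused_moe import fused_moe

E, topk, hidden, inter = 8, 2, 128, 256  # small, made-up sizes
hidden_states = torch.randn(4, hidden, dtype=torch.bfloat16, device="cuda")
w1 = torch.randn(E, 2 * inter, hidden, dtype=torch.bfloat16, device="cuda")
w2 = torch.randn(E, hidden, inter, dtype=torch.bfloat16, device="cuda")
gating_output = torch.randn(4, E, dtype=torch.float32, device="cuda")

out = fused_moe(hidden_states, w1, w2, gating_output, topk, renormalize=True)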

vllm/model_executor/layers/fused_moe/layer.py

Lines changed: 31 additions & 33 deletions
@@ -1,16 +1,15 @@
 # SPDX-License-Identifier: Apache-2.0
 
 from abc import abstractmethod
+from dataclasses import dataclass
 from enum import Enum
 from typing import Callable, List, Optional, Tuple
-from dataclasses import dataclass
 
+import pplx_kernels as pplx
 import torch
 import torch.nn.functional as F
 from torch.nn.parameter import UninitializedParameter
 
-import pplx_kernels as pplx
-
 import vllm.envs as envs
 from vllm.config import get_current_vllm_config
 from vllm.distributed import (get_dp_group, get_ep_group,
@@ -47,6 +46,7 @@
 
 MOE_DP_CHUNK_SIZE = 256
 
+
 # Adapted from pplx-kernels tests/all_to_all_utils.py
 @dataclass
 class MoEConfig:
@@ -64,6 +64,7 @@ class MoEConfig:
     out_dtype: torch.dtype = torch.bfloat16
     block_size: int = 128
 
+
 class FusedMoeWeightScaleSupported(Enum):
     TENSOR = "tensor"
     CHANNEL = "channel"
@@ -100,26 +101,14 @@ def apply(
     ) -> torch.Tensor:
         raise NotImplementedError
 
+
+#TODO: Every change in this class is a broken hack!!
 @CustomOp.register("unquantized_fused_moe")
 class UnquantizedFusedMoEMethod(FusedMoEMethodBase, CustomOp):
     """MoE method without quantization."""
-    def __init__(self, moe: MoEConfig):
-        self.all_to_all = pplx.AllToAll(
-            max_num_tokens=MOE_DP_CHUNK_SIZE // moe.dp_size,
-            num_experts=moe.num_experts,
-            experts_per_token=moe.experts_per_token,
-            rank=moe.ep_rank,
-            world_size=moe.ep_size,
-            dp_size=moe.dp_size,
-            hidden_dim=moe.hidden_dim,
-            hidden_dim_bytes=moe.hidden_dim * moe.in_dtype.itemsize,
-            hidden_dim_scale_bytes=0,
-        )
-
 
-    def __init__(self):
+    def __init__(self, moe: MoEConfig):
         super().__init__()
-
         self.rocm_aiter_moe_enabled = is_rocm_aiter_moe_enabled()
         if self.rocm_aiter_moe_enabled:
             from .rocm_aiter_fused_moe import rocm_aiter_fused_experts
@@ -903,7 +892,7 @@ def forward(self, hidden_states: torch.Tensor,
                                 self.layer_name)
 
     def forward_impl_chunked(self, full_hidden_states: torch.Tensor,
-                            full_router_logits: torch.Tensor):
+                             full_router_logits: torch.Tensor):
         max_tokens_across_dp = get_forward_context(
         ).dp_metadata.max_tokens_across_dp
         cu_tokens_across_dp_cpu = get_forward_context(
@@ -919,21 +908,23 @@ def forward_impl_chunked(self, full_hidden_states: torch.Tensor,
 
         num_tokens_remaining_across_dp = num_tokens_across_dp
         chunk_start = 0
-        chunk_end = min(moe_dp_chunk_size_per_rank, full_hidden_states.shape[0])
+        chunk_end = min(moe_dp_chunk_size_per_rank,
+                        full_hidden_states.shape[0])
         full_final_hidden_states = torch.empty_like(full_hidden_states)
 
         for _ in range(0, max_tokens_across_dp, moe_dp_chunk_size_per_rank):
-            hidden_states = full_hidden_states[chunk_start:chunk_end,:]
-            router_logits = full_router_logits[chunk_start:chunk_end,:]
+            hidden_states = full_hidden_states[chunk_start:chunk_end, :]
+            router_logits = full_router_logits[chunk_start:chunk_end, :]
 
             cu_tokens_across_dp_this_iter = torch.cumsum(
-                num_tokens_remaining_across_dp.clamp(max=moe_dp_chunk_size_per_rank),
+                num_tokens_remaining_across_dp.clamp(
+                    max=moe_dp_chunk_size_per_rank),
                 dim=0)
 
-            hidden_states = self.naive_multicast(hidden_states,
-                                                 cu_tokens_across_dp_this_iter)
-            router_logits = self.naive_multicast(router_logits,
-                                                 cu_tokens_across_dp_this_iter)
+            hidden_states = self.naive_multicast(
+                hidden_states, cu_tokens_across_dp_this_iter)
+            router_logits = self.naive_multicast(
+                router_logits, cu_tokens_across_dp_this_iter)
 
             # Matrix multiply.
             final_hidden_states = self.quant_method.apply(
@@ -954,7 +945,8 @@ def forward_impl_chunked(self, full_hidden_states: torch.Tensor,
             )
 
             if self.dp_size > 1:
-                start = 0 if self.dp_rank == 0 else cu_tokens_across_dp_this_iter[self.dp_rank-1]
+                start = 0 if self.dp_rank == 0 else cu_tokens_across_dp_this_iter[
+                    self.dp_rank - 1]
                 end = cu_tokens_across_dp_this_iter[self.dp_rank]
 
                 all_hidden_states = get_dp_group().all_reduce(
@@ -963,20 +955,26 @@ def forward_impl_chunked(self, full_hidden_states: torch.Tensor,
 
             if self.reduce_results and (self.tp_size > 1 or self.ep_size > 1):
                 # Default set to False. (May have to add shared expert outputs.)
-                final_hidden_states = tensor_model_parallel_all_reduce(final_hidden_states)
+                final_hidden_states = tensor_model_parallel_all_reduce(
+                    final_hidden_states)
 
-            full_final_hidden_states[chunk_start:chunk_end, :].copy_(final_hidden_states)
+            full_final_hidden_states[chunk_start:chunk_end, :].copy_(
+                final_hidden_states)
 
             # Update bounds
-            num_tokens_remaining_across_dp = torch.clamp(num_tokens_remaining_across_dp - moe_dp_chunk_size_per_rank, min=0)
+            num_tokens_remaining_across_dp = torch.clamp(
+                num_tokens_remaining_across_dp - moe_dp_chunk_size_per_rank,
+                min=0)
+
             def update_chunk_bound(x: int):
-                return min(x + moe_dp_chunk_size_per_rank, full_hidden_states.shape[0])
+                return min(x + moe_dp_chunk_size_per_rank,
+                           full_hidden_states.shape[0])
+
             chunk_start = update_chunk_bound(chunk_start)
             chunk_end = update_chunk_bound(chunk_end)
 
         return full_final_hidden_states
 
-
     def forward_impl(self, hidden_states: torch.Tensor,
                      router_logits: torch.Tensor):
         assert self.quant_method is not None
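
For readers following the reformatted chunked loop above, here is a standalone sketch (made-up sizes, no vllm dependencies) of the bookkeeping forward_impl_chunked performs per iteration: advance chunk_start / chunk_end by at most moe_dp_chunk_size_per_rank tokens and clamp the per-rank remaining-token counts used to build the cumulative offsets:

# Standalone sketch of the chunk bookkeeping only; the real method also
# multicasts the chunk, runs the fused MoE kernel, and reduces the results.
import torch

moe_dp_chunk_size_per_rank = 4
num_tokens_across_dp = torch.tensor([10, 3])     # tokens on each DP rank
max_tokens_across_dp = int(num_tokens_across_dp.max())
num_local_tokens = int(num_tokens_across_dp[0])  # pretend we are DP rank 0

num_tokens_remaining_across_dp = num_tokens_across_dp.clone()
chunk_start = 0
chunk_end = min(moe_dp_chunk_size_per_rank, num_local_tokens)

for _ in range(0, max_tokens_across_dp, moe_dp_chunk_size_per_rank):
    # Per-iteration cumulative token offsets, capped at the chunk size.
    cu_tokens_this_iter = torch.cumsum(
        num_tokens_remaining_across_dp.clamp(max=moe_dp_chunk_size_per_rank),
        dim=0)
    print(chunk_start, chunk_end, cu_tokens_this_iter.tolist())

    # Update bounds for the next iteration.
    num_tokens_remaining_across_dp = torch.clamp(
        num_tokens_remaining_across_dp - moe_dp_chunk_size_per_rank, min=0)

    def update_chunk_bound(x: int) -> int:
        return min(x + moe_dp_chunk_size_per_rank, num_local_tokens)

    chunk_start = update_chunk_bound(chunk_start)
    chunk_end = update_chunk_bound(chunk_end)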
