@@ -281,10 +281,6 @@ def get_fused_moe_quant_config(
     ) -> FusedMoEQuantConfig | None:
         raise NotImplementedError

-    @property
-    def using_modular_kernel(self) -> bool:
-        return False
-
     @property
     def supports_eplb(self) -> bool:
         return False
@@ -337,10 +333,6 @@ def __init__(
         self.load_state_dict(old_moe_method.state_dict())
         logger.debug("Swapping out %s", self.old_method_name)

-    @property
-    def using_modular_kernel(self) -> bool:
-        return True
-
     @property
     def supports_eplb(self) -> bool:
         return self._supports_eplb
@@ -1378,13 +1370,12 @@ def __init__(

    # Note: init_prepare_finalize should only be called by
    # prepare_communication_buffer_for_model.
+   # This is called after all weight loading and post-processing, so it
+   # should be safe to swap out the quant_method.
    def init_prepare_finalize(self) -> None:
        mk = self.quant_method.init_prepare_finalize(self)
        if mk is not None:
-           new_quant_method = FusedMoEModularMethod(self.quant_method, mk)
-           if isinstance(self.quant_method, torch.nn.Module):
-               self.set_submodule(self.quant_method.name, new_quant_method)
-           self.quant_method = new_quant_method
+           self.quant_method = FusedMoEModularMethod(self.quant_method, mk)

    @property
    def shared_experts(self) -> torch.nn.Module | None:
@@ -2114,7 +2105,7 @@ def must_reduce_shared_expert_outputs(self) -> bool:
        """
        assert self.quant_method is not None
        return (
-           self.quant_method.fused_experts is not None
+           isinstance(self.quant_method, FusedMoEModularMethod)
            and self.quant_method.fused_experts.output_is_reduced()
        )

@@ -2228,7 +2219,7 @@ def process_chunk(chunk_start, chunk_end, skip_result_store=False):
        # If there are shared experts but we are not using a modular kernel,
        # the shared experts must be called here
        if (
-           not isinstance(self.quant_method.fused_experts, FusedMoEModularKernel)
+           not isinstance(self.quant_method, FusedMoEModularMethod)
            and self.shared_experts is not None
        ):
            shared_output = self.shared_experts(staged_hidden_states)
@@ -2333,14 +2324,14 @@ def forward_impl(
        if self.use_dp_chunking:
            return self.forward_impl_chunked(hidden_states, router_logits)

-       do_naive_dispatch_combine: bool = (
-           self.dp_size > 1 and not self.quant_method.using_modular_kernel
+       do_naive_dispatch_combine: bool = self.dp_size > 1 and not isinstance(
+           self.quant_method, FusedMoEModularMethod
        )

        # If there are shared experts but we are not using a modular kernel, the
        # shared experts must be called here
        if (
-           not isinstance(self.quant_method.fused_experts, FusedMoEModularKernel)
+           not isinstance(self.quant_method, FusedMoEModularMethod)
            and self.shared_experts is not None
        ):
            shared_output = self.shared_experts(hidden_states)