
Commit 1d40f7f

fix inplace
Signed-off-by: Bill Nell <bnell@redhat.com>
1 parent ea3ecbf commit 1d40f7f

File tree

4 files changed: +30 -1 lines changed


vllm/model_executor/layers/fused_moe/layer.py

Lines changed: 18 additions & 1 deletion
```diff
@@ -289,6 +289,10 @@ def get_fused_moe_quant_config(
     def supports_eplb(self) -> bool:
         return False
 
+    @property
+    def allow_inplace(self) -> bool:
+        return False
+
     @abstractmethod
     def apply(
         self,
```
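
This hunk gives the MoE method base class a conservative `allow_inplace` default that backends can override. A minimal sketch of the pattern, using made-up class names rather than the actual vLLM classes:

```python
# Sketch only: ExampleMoEMethodBase / ExampleTritonMoEMethod are illustrative
# names, not the vLLM classes touched by this commit.
from abc import ABC, abstractmethod

import torch


class ExampleMoEMethodBase(ABC):
    @property
    def allow_inplace(self) -> bool:
        # Conservative default: assume the fused kernel must not overwrite
        # its input activations.
        return False

    @abstractmethod
    def apply(self, x: torch.Tensor) -> torch.Tensor: ...


class ExampleTritonMoEMethod(ExampleMoEMethodBase):
    @property
    def allow_inplace(self) -> bool:
        # A backend whose kernel can safely reuse the input buffer opts in
        # by overriding the property, as the later hunks do.
        return True

    def apply(self, x: torch.Tensor) -> torch.Tensor:
        return x  # stand-in for the real fused-experts call
```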
```diff
@@ -333,6 +337,7 @@ def __init__(
         self.disable_expert_map = not fused_experts.supports_expert_map()
         self.old_method_name = old_moe_method.__class__.__name__
         self._supports_eplb = old_moe_method.supports_eplb
+        self._allow_inplace = old_moe_method.allow_inplace
         if isinstance(old_moe_method, torch.nn.Module):
             self.load_state_dict(old_moe_method.state_dict())
         logger.debug("Swapping out %s", self.old_method_name)
```
```diff
@@ -341,6 +346,10 @@ def __init__(
     def supports_eplb(self) -> bool:
         return self._supports_eplb
 
+    @property
+    def allow_inplace(self) -> bool:
+        return self._allow_inplace
+
     def create_weights(
         self,
         layer: torch.nn.Module,
```
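
Together with the `__init__` hunk above, this makes the swapped-in method simply forward the wrapped method's capability. A compressed sketch of that delegation, with a placeholder class name:

```python
# Placeholder name; only the delegation pattern mirrors the hunks above.
class ExampleSwappedMoEMethod:
    def __init__(self, old_moe_method) -> None:
        # Capture the wrapped method's capabilities at construction time so
        # later calls can honor them.
        self._supports_eplb = old_moe_method.supports_eplb
        self._allow_inplace = old_moe_method.allow_inplace

    @property
    def supports_eplb(self) -> bool:
        return self._supports_eplb

    @property
    def allow_inplace(self) -> bool:
        return self._allow_inplace
```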
```diff
@@ -426,7 +435,7 @@ def apply(
             w2=layer.w2_weight,
             topk_weights=topk_weights,
             topk_ids=topk_ids,
-            inplace=True,  # TODO(bnell): make sure this is handled properly
+            inplace=self.allow_inplace,
             activation=activation,
             global_num_experts=global_num_experts,
             apply_router_weight_on_input=apply_router_weight_on_input,
```
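
Instead of hard-coding `inplace=True`, the call now follows the wrapped method's declaration. A rough illustration of what the flag controls, using a toy stand-in rather than the vLLM kernel:

```python
import torch


def fake_fused_experts(hidden_states: torch.Tensor, inplace: bool) -> torch.Tensor:
    # Toy stand-in: with inplace=True the "kernel" writes its result back into
    # the caller's buffer; otherwise it allocates a fresh output tensor.
    out = hidden_states if inplace else torch.empty_like(hidden_states)
    out.copy_(hidden_states * 2.0)  # pretend expert computation
    return out


x = torch.randn(4, 8)
y = fake_fused_experts(x, inplace=False)
assert y.data_ptr() != x.data_ptr()  # input buffer left intact for the caller
z = fake_fused_experts(x, inplace=True)
assert z.data_ptr() == x.data_ptr()  # output aliases the input buffer
```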
```diff
@@ -496,6 +505,14 @@ def __init__(self, moe: FusedMoEConfig):
             )
             self.flashinfer_cutlass_moe = None  # type: ignore
 
+    @property
+    def supports_eplb(self) -> bool:
+        return True
+
+    @property
+    def allow_inplace(self) -> bool:
+        return True
+
     def maybe_make_prepare_finalize(self) -> FusedMoEPrepareAndFinalize | None:
         if self.rocm_aiter_moe_enabled:
             return None
```

vllm/model_executor/layers/quantization/fp8.py

Lines changed: 4 additions & 0 deletions
```diff
@@ -1179,6 +1179,10 @@ def get_fused_moe_quant_config(
     def supports_eplb(self) -> bool:
         return True
 
+    @property
+    def allow_inplace(self) -> bool:
+        return True
+
     def apply(
         self,
         layer: torch.nn.Module,
```

vllm/model_executor/layers/quantization/mxfp4.py

Lines changed: 4 additions & 0 deletions
```diff
@@ -812,6 +812,10 @@ def select_gemm_impl(
         else:
             return OAITritonExperts(self.moe_quant_config)
 
+    @property
+    def allow_inplace(self) -> bool:
+        return True
+
     def apply(
         self,
         layer: torch.nn.Module,
```

vllm/model_executor/layers/quantization/quark/quark_moe.py

Lines changed: 4 additions & 0 deletions
```diff
@@ -596,6 +596,10 @@ def get_fused_moe_quant_config(
             block_shape=None,
         )
 
+    @property
+    def allow_inplace(self) -> bool:
+        return True
+
     def apply(
         self,
         layer: torch.nn.Module,
```
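
The fp8, mxfp4, and quark hunks all apply the same one-property recipe. A hedged usage sketch of the resulting contract, with a stand-in class rather than any real vLLM backend:

```python
import torch


class ExampleQuantMoEMethod:
    """Stand-in for a backend that opts in, mirroring the hunks above."""

    @property
    def allow_inplace(self) -> bool:
        return True

    def apply(self, hidden_states: torch.Tensor) -> torch.Tensor:
        # Caller-facing contract: only reuse the input buffer when the backend
        # has declared that doing so is safe.
        out = hidden_states if self.allow_inplace else hidden_states.clone()
        out.mul_(2.0)  # pretend expert computation
        return out


method = ExampleQuantMoEMethod()
x = torch.ones(2, 4)
y = method.apply(x)
assert y.data_ptr() == x.data_ptr()  # in-place path taken
```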
