
Commit 4b23bfe

tmp 2 works

1 parent 1a50d1b · commit 4b23bfe

4 files changed: +50 −19 lines changed


examples/offline_inference/basic/basic.py

Lines changed: 7 additions & 1 deletion
@@ -16,7 +16,13 @@
 
 def main():
     # Create an LLM.
-    llm = LLM(model="deepseek-ai/DeepSeek-R1-0528", tensor_parallel_size=8)
+    # llm = LLM(model="deepseek-ai/DeepSeek-R1-0528", tensor_parallel_size=8)
+    llm = LLM(
+        model="nvidia/DeepSeek-R1-FP4",
+        tensor_parallel_size=8,
+        quantization="modelopt_fp4",
+    )
+
     # Generate texts from the prompts.
     # The output is a list of RequestOutput objects
     # that contain the prompt, generated text, and other information.
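
For reference, a minimal self-contained sketch of the offline-inference flow that the edited basic.py exercises with the FP4 checkpoint. The prompt and sampling parameters below are illustrative, not the ones defined in the example file:

from vllm import LLM, SamplingParams

def run_fp4_example():
    # Illustrative sampling parameters; basic.py defines its own.
    sampling_params = SamplingParams(temperature=0.8, top_p=0.95)
    llm = LLM(
        model="nvidia/DeepSeek-R1-FP4",
        tensor_parallel_size=8,
        quantization="modelopt_fp4",
    )
    outputs = llm.generate(["Hello, my name is"], sampling_params)
    for output in outputs:
        print(output.prompt, output.outputs[0].text)

if __name__ == "__main__":
    run_fp4_example()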

vllm/model_executor/layers/fused_moe/layer.py

Lines changed: 22 additions & 12 deletions
@@ -1039,7 +1039,7 @@ def __init__(
     ):
         super().__init__()
 
-        self.se_stream = torch.cuda.Stream()
+        self.shared_experts_stream = torch.cuda.Stream()
 
         if params_dtype is None:
             params_dtype = torch.get_default_dtype()
@@ -1278,6 +1278,10 @@ def __init__(
     def shared_experts(self) -> Optional[torch.nn.Module]:
         return None
 
+    @property
+    def gate(self) -> Optional[torch.nn.Module]:
+        return None
+
     @property
     def tp_size(self):
         return self.moe_parallel_config.tp_size
@@ -2114,8 +2118,8 @@ def process_chunk(chunk_start, chunk_end, skip_result_store=False):
                 and self.shared_experts is not None
             ):
                 current_stream = torch.cuda.current_stream()
-                self.se_stream.wait_stream(current_stream)
-                with torch.cuda.stream(self.se_stream):
+                self.shared_experts_stream.wait_stream(current_stream)
+                with torch.cuda.stream(self.shared_experts_stream):
                     shared_output = self.shared_experts(staged_hidden_states)
 
             else:
@@ -2148,7 +2152,7 @@ def process_chunk(chunk_start, chunk_end, skip_result_store=False):
                 assert not isinstance(final_hidden_states, tuple)
                 assert self.shared_experts is not None
 
-                current_stream.wait_stream(self.se_stream)
+                current_stream.wait_stream(self.shared_experts_stream)
 
                 final_hidden_states = (
                     shared_output,
@@ -2219,6 +2223,16 @@ def forward_impl(
 
         self.ensure_moe_quant_config()
 
+        use_explicit_se = (
+            not isinstance(self.quant_method.fused_experts, FusedMoEModularKernel)
+            and self.shared_experts is not None
+        )
+        if use_explicit_se:
+            current_stream = torch.cuda.current_stream()
+            self.shared_experts_stream.wait_stream(current_stream)
+
+            router_logits, _ = self.gate(hidden_states)
+
         # Route to the chunked forward path using the FlashInfer Cutlass kernel
         # only when data parallelism (DP) is enabled.
         _use_flashinfer_cutlass_kernels = (
@@ -2240,13 +2254,8 @@ def forward_impl(
 
         # If there are shared experts but we are not using a modular kernel, the
        # shared experts must be called here
-        if (
-            not isinstance(self.quant_method.fused_experts, FusedMoEModularKernel)
-            and self.shared_experts is not None
-        ):
-            current_stream = torch.cuda.current_stream()
-            self.se_stream.wait_stream(current_stream)
-            with torch.cuda.stream(self.se_stream):
+        if use_explicit_se:
+            with torch.cuda.stream(self.shared_experts_stream):
                 shared_output = self.shared_experts(hidden_states)
         else:
             shared_output = None
@@ -2292,7 +2301,8 @@ def forward_impl(
             assert not isinstance(final_hidden_states, tuple)
             assert self.shared_experts is not None
 
-            current_stream.wait_stream(self.se_stream)
+            current_stream = torch.cuda.current_stream()
+            current_stream.wait_stream(self.shared_experts_stream)
 
             final_hidden_states = (
                 shared_output,
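
Across these hunks the side stream is renamed from se_stream to shared_experts_stream and the shared-experts launch is hoisted behind a use_explicit_se flag, but the underlying pattern is a fork/join on CUDA streams: start the shared experts on a dedicated stream, keep the routed experts on the current stream, and synchronize before combining the outputs. A minimal standalone sketch of that pattern in plain PyTorch (shared_experts, routed_experts, and overlapped_moe_forward are placeholders for this sketch, not vLLM APIs; a CUDA device is assumed):

import torch

def overlapped_moe_forward(shared_experts, routed_experts, hidden_states,
                           shared_experts_stream):
    current_stream = torch.cuda.current_stream()
    # The side stream must not read hidden_states before the current stream
    # has finished producing it.
    shared_experts_stream.wait_stream(current_stream)
    with torch.cuda.stream(shared_experts_stream):
        shared_output = shared_experts(hidden_states)
    # The routed experts run concurrently on the current stream.
    routed_output = routed_experts(hidden_states)
    # Join: block the current stream until the shared-experts work is done.
    current_stream.wait_stream(shared_experts_stream)
    return shared_output, routed_output

In the diff, the join is the current_stream.wait_stream(self.shared_experts_stream) call that runs just before shared_output is packed into final_hidden_states.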

vllm/model_executor/layers/shared_fused_moe/shared_fused_moe.py

Lines changed: 6 additions & 0 deletions
@@ -19,17 +19,23 @@ class SharedFusedMoE(FusedMoE):
     def __init__(
         self,
         shared_experts: torch.nn.Module,
+        gate: torch.nn.Module,
         use_overlapped: bool = True,
         **kwargs,
     ):
         super().__init__(**kwargs)
         self._shared_experts = shared_experts
+        self._gate = gate
         self.use_overlapped = use_overlapped
 
     @property
     def shared_experts(self) -> Optional[torch.nn.Module]:
         return self._shared_experts if self.use_overlapped else None
 
+    @property
+    def gate(self) -> Optional[torch.nn.Module]:
+        return self._gate if self.use_overlapped else None
+
     def forward(
         self,
         hidden_states: torch.Tensor,
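
The new gate property mirrors the existing shared_experts property: the wrapped modules are exposed to the base FusedMoE only when overlapped execution is enabled, so a None return tells the base class to fall back to the non-overlapped path. A toy, self-contained illustration of that toggle (ToyOverlappedMoE is made up for this sketch, not a vLLM class):

from typing import Optional
import torch

class ToyOverlappedMoE(torch.nn.Module):
    def __init__(self, shared_experts: torch.nn.Module, gate: torch.nn.Module,
                 use_overlapped: bool = True):
        super().__init__()
        self._shared_experts = shared_experts
        self._gate = gate
        self.use_overlapped = use_overlapped

    @property
    def shared_experts(self) -> Optional[torch.nn.Module]:
        return self._shared_experts if self.use_overlapped else None

    @property
    def gate(self) -> Optional[torch.nn.Module]:
        return self._gate if self.use_overlapped else None

# With overlap disabled, both properties report None and the caller
# is expected to run the gate and shared experts itself.
layer = ToyOverlappedMoE(torch.nn.Identity(), torch.nn.Identity(), use_overlapped=False)
print(layer.shared_experts, layer.gate)  # None None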

vllm/model_executor/models/deepseek_v2.py

Lines changed: 15 additions & 6 deletions
@@ -205,6 +205,8 @@ def __init__(
         )
 
         if config.n_shared_experts is None:
+            self.use_shared_fused_moe = False
+
             self.experts = FusedMoE(
                 num_experts=config.n_routed_experts,
                 top_k=config.num_experts_per_tok,
@@ -227,6 +229,8 @@ def __init__(
             )
             self.shared_experts = None
         else:
+            self.use_shared_fused_moe = True
+
             intermediate_size = config.moe_intermediate_size * config.n_shared_experts
 
             self.shared_experts = DeepseekV2MLP(
@@ -241,6 +245,7 @@ def __init__(
 
             self.experts = SharedFusedMoE(
                 shared_experts=self.shared_experts,
+                gate=self.gate,
                 num_experts=config.n_routed_experts,
                 top_k=config.num_experts_per_tok,
                 hidden_size=config.hidden_size,
@@ -272,12 +277,16 @@ def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
         if self.is_sequence_parallel:
             hidden_states = sequence_parallel_chunk(hidden_states)
 
-        # router_logits: (num_tokens, n_experts)
-        router_logits, _ = self.gate(hidden_states)
-
-        fused_moe_out = self.experts(
-            hidden_states=hidden_states, router_logits=router_logits
-        )
+        if self.use_shared_fused_moe:
+            fused_moe_out = self.experts(
+                hidden_states=hidden_states, router_logits=hidden_states
+            )
+        else:
+            # router_logits: (num_tokens, n_experts)
+            router_logits, _ = self.gate(hidden_states)
+            fused_moe_out = self.experts(
+                hidden_states=hidden_states, router_logits=router_logits
+            )
 
         if self.shared_experts is not None:
             shared_output, final_hidden_states = fused_moe_out
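
When use_shared_fused_moe is set, the MoE block no longer computes router logits itself: it passes hidden_states through the router_logits argument and lets the fused layer call the gate it was handed (see the forward_impl hunk in layer.py above). For readers unfamiliar with the gate, a toy sketch of what router_logits, _ = self.gate(hidden_states) computes: a linear projection from the hidden size to one logit per routed expert. ToyGate below is illustrative, not the model's actual gate module; the second return value only mimics the (output, bias) tuple that vLLM's linear layers return:

import torch

class ToyGate(torch.nn.Module):
    def __init__(self, hidden_size: int, n_routed_experts: int):
        super().__init__()
        self.weight = torch.nn.Parameter(
            torch.randn(n_routed_experts, hidden_size) * 0.02
        )

    def forward(self, hidden_states: torch.Tensor):
        # router_logits: (num_tokens, n_experts); None stands in for the bias.
        return torch.nn.functional.linear(hidden_states, self.weight), None

gate = ToyGate(hidden_size=16, n_routed_experts=4)
router_logits, _ = gate(torch.randn(8, 16))
print(router_logits.shape)  # torch.Size([8, 4])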
