Skip to content

Commit 1a50d1b

Browse files
committed
tmp 1 works
1 parent 0c824fc commit 1a50d1b

File tree

2 files changed

+19
-3
lines changed
  • examples/offline_inference/basic/basic.py
  • vllm/model_executor/layers/fused_moe/layer.py

2 files changed

+19
-3
lines changed

examples/offline_inference/basic/basic.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,7 @@
1616

1717
def main():
1818
# Create an LLM.
19-
llm = LLM(model="facebook/opt-125m")
19+
llm = LLM(model="deepseek-ai/DeepSeek-R1-0528", tensor_parallel_size=8)
2020
# Generate texts from the prompts.
2121
# The output is a list of RequestOutput objects
2222
# that contain the prompt, generated text, and other information.

vllm/model_executor/layers/fused_moe/layer.py

Lines changed: 18 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1038,6 +1038,9 @@ def __init__(
10381038
expert_mapping: Optional[list[tuple[str, str, int, str]]] = None,
10391039
):
10401040
super().__init__()
1041+
1042+
self.se_stream = torch.cuda.Stream()
1043+
10411044
if params_dtype is None:
10421045
params_dtype = torch.get_default_dtype()
10431046
self.params_dtype = params_dtype
@@ -2110,7 +2113,11 @@ def process_chunk(chunk_start, chunk_end, skip_result_store=False):
21102113
not isinstance(self.quant_method.fused_experts, FusedMoEModularKernel)
21112114
and self.shared_experts is not None
21122115
):
2113-
shared_output = self.shared_experts(staged_hidden_states)
2116+
current_stream = torch.cuda.current_stream()
2117+
self.se_stream.wait_stream(current_stream)
2118+
with torch.cuda.stream(self.se_stream):
2119+
shared_output = self.shared_experts(staged_hidden_states)
2120+
21142121
else:
21152122
shared_output = None
21162123

@@ -2140,6 +2147,9 @@ def process_chunk(chunk_start, chunk_end, skip_result_store=False):
21402147
if shared_output is not None:
21412148
assert not isinstance(final_hidden_states, tuple)
21422149
assert self.shared_experts is not None
2150+
2151+
current_stream.wait_stream(self.se_stream)
2152+
21432153
final_hidden_states = (
21442154
shared_output,
21452155
final_hidden_states,
@@ -2234,7 +2244,10 @@ def forward_impl(
22342244
not isinstance(self.quant_method.fused_experts, FusedMoEModularKernel)
22352245
and self.shared_experts is not None
22362246
):
2237-
shared_output = self.shared_experts(hidden_states)
2247+
current_stream = torch.cuda.current_stream()
2248+
self.se_stream.wait_stream(current_stream)
2249+
with torch.cuda.stream(self.se_stream):
2250+
shared_output = self.shared_experts(hidden_states)
22382251
else:
22392252
shared_output = None
22402253

@@ -2278,6 +2291,9 @@ def forward_impl(
22782291
if shared_output is not None:
22792292
assert not isinstance(final_hidden_states, tuple)
22802293
assert self.shared_experts is not None
2294+
2295+
current_stream.wait_stream(self.se_stream)
2296+
22812297
final_hidden_states = (
22822298
shared_output,
22832299
final_hidden_states,

0 commit comments

Comments (0)