Commit 43f5388

[fix]: reduced dependency on vllm for dbo
1 parent 68070f1 commit 43f5388

File tree

7 files changed: +112 -89 lines changed

vllm_ascend/attention/mla_v1.py

Lines changed: 31 additions & 18 deletions
@@ -803,31 +803,44 @@ def forward(
             # FIX: aicore move/copy should be also placed on the comm stream in dbo,
             # otherwise it may affect the accuracy or disturb the overlap of next stage
             # TODO: use an elegant way here to avoid it
-            output_prefill = self._forward_prefill(
-                prefill_q, prefill_k_c_normed, prefill_k_pe, kv_cache,
-                attn_metadata)
-            from vllm.multistream.context import get_multistream_comm_context
+            from vllm_ascend.multistream.context import get_multistream_comm_context
             current_ms_metadata = get_multistream_comm_context()
-            if current_ms_metadata is not None:
+            if current_ms_metadata is None:
+                output[num_decode_tokens:] = self._forward_prefill(
+                    prefill_q, prefill_k_c_normed, prefill_k_pe, kv_cache,
+                    attn_metadata)
+            else:
+                current_ms_metadata.before_comm_event.record()
                 with torch.npu.stream(current_ms_metadata.comm_stream):
-                    output[num_decode_tokens:] = output_prefill
+                    current_ms_metadata.before_comm_event.wait()
+                    output[num_decode_tokens:] = self._forward_prefill(
+                        prefill_q, prefill_k_c_normed, prefill_k_pe, kv_cache,
+                        attn_metadata)
                     current_ms_metadata.after_comm_event.record()
-            else:
-                output[num_decode_tokens:] = output_prefill
+
+
+
         if has_decode:
             if self.running_in_graph:
                 return self._forward_decode(decode_ql_nope, decode_q_pe,
                                             decode_k_nope, decode_k_pe,
                                             kv_cache, attn_metadata)
             else:
-                from vllm.multistream.context import get_multistream_comm_context
-                current_ms_metadata = get_multistream_comm_context()
-                output_decode = self._forward_decode(
-                    decode_ql_nope, decode_q_pe, decode_k_nope, decode_k_pe,
-                    kv_cache, attn_metadata)
-                if current_ms_metadata is not None:
-                    with torch.npu.stream(current_ms_metadata.comm_stream):
-                        output[:num_decode_tokens] = output_decode
-                else:
-                    output[:num_decode_tokens] = output_decode
+
+                from vllm_ascend.multistream.context import get_multistream_comm_context
+                current_ms_metadata = get_multistream_comm_context()
+                if current_ms_metadata is None:
+                    output[:num_decode_tokens] = self._forward_decode(
+                        decode_ql_nope, decode_q_pe, decode_k_nope, decode_k_pe,
+                        kv_cache, attn_metadata)
+                else:
+                    current_ms_metadata.before_comm_event.record()
+                    with torch.npu.stream(current_ms_metadata.comm_stream):
+                        current_ms_metadata.before_comm_event.wait()
+                        output[:num_decode_tokens] = self._forward_decode(
+                            decode_ql_nope, decode_q_pe, decode_k_nope, decode_k_pe,
+                            kv_cache, attn_metadata)
+                        current_ms_metadata.after_comm_event.record()
+
+
         return output_padded
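Note: the rewritten branches compute attention directly on the multistream comm stream and order it with a record/wait event pair, instead of computing on the default stream and only copying the result on the comm stream. A minimal sketch of that record/wait ordering, written against torch.cuda streams as a stand-in for the torch.npu API used in the diff (names, shapes, and the helper below are illustrative, not from the repo):

import torch

def run_on_comm_stream(fn, out, sl, comm_stream, before_evt, after_evt):
    # Record a point on the current (default) stream that the comm-stream work depends on.
    before_evt.record()
    with torch.cuda.stream(comm_stream):
        # Block the comm stream until the default stream reaches the recorded point.
        before_evt.wait()
        out[sl] = fn()
        # Mark completion so a later stage can wait on this event before reading `out`.
        after_evt.record()

if torch.cuda.is_available():
    comm_stream = torch.cuda.Stream()
    before_evt, after_evt = torch.cuda.Event(), torch.cuda.Event()
    x = torch.randn(8, 4, device="cuda")
    out = torch.empty(8, 4, device="cuda")
    run_on_comm_stream(lambda: x[4:] * 2, out, slice(4, None),
                       comm_stream, before_evt, after_evt)
    torch.cuda.synchronize()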

vllm_ascend/envs.py

Lines changed: 2 additions & 0 deletions
@@ -66,6 +66,8 @@
     lambda: os.getenv("C_COMPILER", None),
     "VLLM_VERSION":
     lambda: os.getenv("VLLM_VERSION", None),
+    "VLLM_ENABLE_MS":
+    lambda: bool(int(os.getenv("VLLM_ENABLE_MS", '0'))),
 }

 # end-env-vars-definition
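The new flag follows the same lazy-lambda pattern as the neighboring entries: the value is read from the environment at access time and the '0'/'1' string is turned into a boolean. A small standalone sketch of that parsing (the dict name here is illustrative, not the repo's accessor):

import os

env_variables = {
    # Same lazy pattern as vllm_ascend/envs.py: evaluated when the entry is called.
    "VLLM_ENABLE_MS": lambda: bool(int(os.getenv("VLLM_ENABLE_MS", "0"))),
}

os.environ["VLLM_ENABLE_MS"] = "1"   # e.g. exported before launching the server
assert env_variables["VLLM_ENABLE_MS"]() is True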

vllm_ascend/models/deepseek_v2.py

Lines changed: 33 additions & 15 deletions
@@ -72,11 +72,12 @@
 from vllm_ascend.multistream.context import (set_multistream_context,get_multistream_layer_context,
                                              advance_step_multistream_layer_context, get_multistream_comm_context)
 from vllm_ascend.multistream.layers import (MultiStreamPreTransformerLayer, MultiStreamPostTransformerLayer)
-from vllm_ascend.multistream.metadata import make_multistream_metadata_ds, MultiStreamStepMetadata
+from vllm_ascend.multistream.metadata import make_multistream_metadata_ds, MultiStreamStepMetadata, MultiStreamConfig
 from vllm_ascend.multistream.base import MSEventKey
 from vllm_ascend.multistream.ms_split import compute_split_seq_index

 VLLM_ENABLE_MC2: bool = envs_ascend.VLLM_ENABLE_MC2
+VLLM_ENABLE_MS: bool = envs_ascend.VLLM_ENABLE_MS


 class CustomDeepseekV2MLP(nn.Module):

@@ -305,8 +306,10 @@ def _forward_ms_op_tp_allreduce(
             dist.all_gather(list(chunk_hidden_states), hidden_states,
                             self.tp_group)
             final_hidden_states = torch.cat(chunk_hidden_states, dim=0)
-            if num_tokens < self.tp_size:
-                final_hidden_states = final_hidden_states[:num_tokens]
+            #if num_tokens < self.tp_size:
+            #    final_hidden_states = final_hidden_states[:num_tokens]
+            if num_tokens > 0:
+                final_hidden_states = final_hidden_states[:-num_tokens]
         else:
             final_hidden_states = hidden_states

@@ -641,6 +644,10 @@ def _forward_ms_layer(
             )

             with set_multistream_context(context, i):
+                context = get_forward_context()
+                layer_index, ms_metadata, attn_metadata = get_multistream_layer_context()
+                context.attn_metadata = attn_metadata[i]
+
                 # input layernorm
                 hidden_states[i], residual[i] = self._forward_ms_op_input_layernorm(hidden_states[i], residual[i])
                 # attention and tp allreducea

@@ -664,7 +671,7 @@ def _forward_ms_layer(

             num_token, hidden_dim = hidden_states[i].shape
             hidden_states[i] = hidden_states[i].view(-1, hidden_dim)
-            num_tokens.append(num_token)
+            #num_tokens.append(num_token)
             hidden_dims.append(hidden_dim)
             if self.mlp.n_shared_experts is not None:
                 # TODO: we can move shared expert computation into next block if reduce results is false

@@ -686,13 +693,20 @@ def _forward_ms_layer(
                 enable_force_load_balance = False

             if self.mlp.tp_size > 1:
-                if num_tokens[i] < self.mlp.tp_size:
-                    target_size = self.mlp.tp_size
-                    new_hidden_states = torch.empty([target_size, hidden_dims[i]],
-                                                    dtype=hidden_states[i].dtype,
-                                                    device=hidden_states[i].device)
-                    new_hidden_states[:num_tokens[i]] = hidden_states[i]
-                    hidden_states[i] = new_hidden_states
+                #if num_tokens[i] < self.mlp.tp_size:
+                #    target_size = self.mlp.tp_size
+                #    new_hidden_states = torch.empty([target_size, hidden_dims[i]],
+                #                                    dtype=hidden_states[i].dtype,
+                #                                    device=hidden_states[i].device)
+                #    new_hidden_states[:num_tokens[i]] = hidden_states[i]
+                #    hidden_states[i] = new_hidden_states
+                num_token, _ = hidden_states[i].shape
+                padded_num_tokens = (self.mlp.tp_size -
+                                     num_token % self.mlp.tp_size) % self.mlp.tp_size
+                if padded_num_tokens > 0:
+                    hidden_states[i] = nn.functional.pad(hidden_states[i],
+                                                         (0, 0, 0, padded_num_tokens))
+                num_tokens.append(padded_num_tokens)
                 chunk_hidden_state = torch.tensor_split(hidden_states[i],
                                                         self.mlp.tp_size,
                                                         dim=0)

@@ -713,7 +727,7 @@ def _forward_ms_layer(
             if VLLM_ENABLE_MC2 and not is_prefill:
                 ...

-            hidden_states[i] = self.mlp.experts._forward_ms_fused_moe_comp(hidden_states[i], router_logits[i], is_prefill, real_top_k, enable_force_load_balance)
+            hidden_states[i] = self.mlp.experts._forward_ms_fused_moe_comp(local_hidden_states, router_logits[i], is_prefill, real_top_k, enable_force_load_balance)

             if VLLM_ENABLE_MC2 and not is_prefill:
                 ...

@@ -847,7 +861,10 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
                 ["hidden_states", "residual"], config.hidden_size))

         # tbo related members
-        self.multistream_config = vllm_config.model_config.multistream_config
+        if VLLM_ENABLE_MS:
+            self.multistream_config = MultiStreamConfig()
+        else:
+            self.multistream_config = None
         self.use_mla = model_config.use_mla
         self.multistream_metadata = make_multistream_metadata_ds(
             start_layer=self.start_layer + self.first_k_dense_replace,

@@ -929,13 +946,14 @@ def can_run_ms(self):
             return False
         num_microbatchs = self.multistream_config.num_micro_batches
         # check whether there is a dp rank that not use dual batch
-        if dp_metadata is not None:
+        '''if dp_metadata is not None:
             for i in range(num_microbatchs):
                 cu_tokens = dp_metadata.cu_dbo_tokens_across_dp_cpu[i]
                 if torch.any(cu_tokens == 0).item():
                     return False
         [token_index, seq_index] = compute_split_seq_index(attn_metadata.query_lens,
-                attn_metadata.attn_state, attn_metadata.num_decode_tokens)
+            attn_metadata.attn_state, attn_metadata.num_decode_tokens)
+        '''
         if token_index == 0 or seq_index == 0 or seq_index == len(attn_metadata.query_lens):
             return False
         # check whether the total tokens exceed the threshold
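The padding change above replaces the old "pad only when tokens < tp_size" branch: the token dimension is now padded up to the next multiple of tp_size so torch.tensor_split yields equally sized chunks for the all-gather, and num_tokens records the pad amount (not the token count), which is why _forward_ms_op_tp_allreduce now trims the last num_tokens rows. A worked example of that arithmetic, with tp_size and shapes chosen purely for illustration:

import torch
import torch.nn as nn

tp_size = 4
hidden_states = torch.randn(10, 16)                            # 10 tokens, hidden dim 16
num_token = hidden_states.shape[0]
padded_num_tokens = (tp_size - num_token % tp_size) % tp_size  # -> 2
if padded_num_tokens > 0:
    # Pad rows at the end so the token count becomes a multiple of tp_size.
    hidden_states = nn.functional.pad(hidden_states, (0, 0, 0, padded_num_tokens))
chunks = torch.tensor_split(hidden_states, tp_size, dim=0)
assert all(c.shape[0] == 3 for c in chunks)                    # equal 3-token chunks

# After the all-gather/concat, the padding rows are sliced off again, mirroring
# the new `if num_tokens > 0: final_hidden_states[:-num_tokens]` branch.
final_hidden_states = torch.cat(chunks, dim=0)
if padded_num_tokens > 0:
    final_hidden_states = final_hidden_states[:-padded_num_tokens]
assert final_hidden_states.shape[0] == 10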

vllm_ascend/multistream/metadata.py

Lines changed: 41 additions & 48 deletions
@@ -2,7 +2,6 @@
 import torch
 from typing import Dict, List, Optional, Union, Tuple
 from vllm.sequence import IntermediateTensors
-from vllm.config import MultiStreamConfig
 from .base import MSAttentionMetadataSplitConfig, MSEventKey
 from vllm.attention.backends.abstract import AttentionMetadata

@@ -31,57 +30,21 @@ def split_micro_batches_tensors(input_tensors, split_index: int, keys: List[str]
         return [micro_batches_pre, micro_batches_post]
     else:
         raise NotImplementedError
-def make_multistream_metadata(
-    start_layer: int,
-    end_layer: int,
-    causal_lm: bool = True,
-    multistream_config: Optional[MultiStreamConfig] = None,
-):
-    if multistream_config is None:
-        return None
-    return MultiStreamMetadata(
-        calculate_stream=torch.npu.current_stream(),
-        communicate_stream=torch.npu.Stream(),
-        start_layer=start_layer,
-        end_layer=end_layer,
-        multistream_config=multistream_config,
-        event_keys=[MSEventKey.ATTN_COM_FINISH, MSEventKey.ATTN_AR_FINISH,
-                    MSEventKey.FFN_COM_FINISH, MSEventKey.FFN_AR_FINISH],
-        causal_lm=causal_lm,
-    )
-def make_multistream_metadata_ds(
-    start_layer: int,
-    end_layer: int,
-    causal_lm: bool = True,
-    multistream_config: Optional[MultiStreamConfig] = None,
-):
-    if multistream_config is None:
-        return None
-    event_keylist = [
-        MSEventKey.ATTN_COM_FINISH,
-        MSEventKey.ATTN_AR_FINISH,
-        MSEventKey.FFN_COM_FINISH,
-        MSEventKey.FFN_AR_FINISH,
-        MSEventKey.MOE_BEFORE_COMM,
-        MSEventKey.MOE_AFTER_COMM,
-        MSEventKey.MOE_SE_COMM_FINISH,
-        MSEventKey.MOE_SE_COMP_FINISH,
-        MSEventKey.MOE_GATE_FINISH,
-    ]
-    return MultiStreamMetadata(
-        calculate_stream=torch.npu.current_stream(),
-        communicate_stream=torch.npu.Stream(),
-        start_layer=start_layer,
-        end_layer=end_layer,
-        multistream_config=multistream_config,
-        event_keys=event_keylist,
-        causal_lm=causal_lm,
-    )
+
 @dataclass
 class MultiStreamStepMetadata:
     comm_stream: torch.npu.Stream = None
     before_comm_event: torch.npu.Event = None
     after_comm_event: torch.npu.Event = None
+
+@dataclass
+class MultiStreamConfig:
+    """Controls the behavior of multi-stream models."""
+    min_total_tokens_to_split: int = 256
+    min_prefill_tokens_to_split: int = 64
+    num_micro_batches: int = 2
+    imbalance_ratio: float = 0.1
+
 class MultiStreamMetadata:
     # direct stream
     calculate_stream = None

@@ -157,4 +120,34 @@ def merge_micro_batches(self,
                 batch.append(None)
             else:
                 batch.append(torch.cat(tensors, dim=0))
-        return batch
+        return batch
+
+
+def make_multistream_metadata_ds(
+    start_layer: int,
+    end_layer: int,
+    causal_lm: bool = True,
+    multistream_config: Optional[MultiStreamConfig] = None,
+):
+    if multistream_config is None:
+        return None
+    event_keylist = [
+        MSEventKey.ATTN_COM_FINISH,
+        MSEventKey.ATTN_AR_FINISH,
+        MSEventKey.FFN_COM_FINISH,
+        MSEventKey.FFN_AR_FINISH,
+        MSEventKey.MOE_BEFORE_COMM,
+        MSEventKey.MOE_AFTER_COMM,
+        MSEventKey.MOE_SE_COMM_FINISH,
+        MSEventKey.MOE_SE_COMP_FINISH,
+        MSEventKey.MOE_GATE_FINISH,
+    ]
+    return MultiStreamMetadata(
+        calculate_stream=torch.npu.current_stream(),
+        communicate_stream=torch.npu.Stream(),
+        start_layer=start_layer,
+        end_layer=end_layer,
+        multistream_config=multistream_config,
+        event_keys=event_keylist,
+        causal_lm=causal_lm,
+    )
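With MultiStreamConfig no longer imported from vllm.config, the model builds the config locally (when VLLM_ENABLE_MS is set) and hands it to the factory, which returns None when dbo is disabled. A minimal standalone sketch of that wiring, with MultiStreamMetadata construction stubbed out because it requires NPU streams (layer numbers and the stand-in return value are illustrative):

from dataclasses import dataclass
from typing import Optional

@dataclass
class MultiStreamConfig:
    """Controls the behavior of multi-stream models."""
    min_total_tokens_to_split: int = 256
    min_prefill_tokens_to_split: int = 64
    num_micro_batches: int = 2
    imbalance_ratio: float = 0.1

def make_multistream_metadata_ds(start_layer: int, end_layer: int, causal_lm: bool = True,
                                 multistream_config: Optional[MultiStreamConfig] = None):
    if multistream_config is None:
        return None                      # dbo disabled: callers fall back to the single-stream path
    return {"start_layer": start_layer, "end_layer": end_layer,
            "config": multistream_config}  # stand-in for MultiStreamMetadata(...)

VLLM_ENABLE_MS = True                    # i.e. VLLM_ENABLE_MS=1 in the environment
multistream_config = MultiStreamConfig() if VLLM_ENABLE_MS else None
metadata = make_multistream_metadata_ds(start_layer=3, end_layer=61,
                                        multistream_config=multistream_config)
assert metadata is not None and metadata["config"].num_micro_batches == 2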

vllm_ascend/multistream/ms_split.py

Lines changed: 5 additions & 6 deletions
@@ -9,7 +9,7 @@ def compute_split_seq_index(
     num_tokens: int,
     imbalance_ratio: float = 0.1,
 )->Optional[list[int]]:
-    if attn_state == AscendAttentionState.PrefillOnly or attn_state == AscendAttentionState.ChunkedPrefill:
+    if attn_state != AscendAttentionState.DecodeOnly:
         assert query_lens is not None
         total_tokens = sum(query_lens)
         # the first index in last split

@@ -28,11 +28,10 @@ def compute_split_seq_index(
         # TODO: split tokens in seq
         else :
             return [0, 0]
-    elif attn_state == AscendAttentionState.DecodeOnly:
+    else:
         tokens = num_tokens // 2
         return [tokens, tokens]
-    else:
-        return [0, 0]
+
 def split_attn_tensor_type(
     input_tensor: torch.Tensor,
     index: int,

@@ -69,10 +68,10 @@ def model_input_split_v1_mla_attn(
     seq_lens = attn_metadata.prefill.seq_lens if attn_metadata.num_prefills>0 else attn_metadata.decode.seq_lens
     [seq_lens_pre, seq_lens_post] = split_attn_tensor_type(seq_lens,seq_index)

-    if attn_metadata.attn_state == AscendAttentionState.PrefillOnly:
+    if attn_metadata.attn_state == AscendAttentionState.PrefillNoCache or attn_metadata.attn_state == AscendAttentionState.PrefillCacheHit:
         # the attn_mla kernel in torch npu only accept 128*128 attn mask
         attn_mask_pre = attn_mask_post = attn_metadata.attn_mask
-        attn_state_pre = attn_state_post = AscendAttentionState.PrefillOnly
+        attn_state_pre = attn_state_post = attn_metadata.attn_state
     elif attn_metadata.attn_state == AscendAttentionState.DecodeOnly:
         # should be none in decode only state
         attn_mask_pre = attn_mask_post = attn_metadata.attn_mask
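After this change every non-decode state goes through the sequence-boundary search, and only DecodeOnly takes the simple halving path. A toy sketch of the DecodeOnly split and how the resulting seq_index could be used to cut per-sequence tensors (values are made up; split_attn_tensor_type internals are not shown in the diff):

import torch

def compute_split_seq_index_decode_only(num_tokens: int) -> list:
    # DecodeOnly: each request contributes one token, so split the batch in half.
    tokens = num_tokens // 2
    return [tokens, tokens]

token_index, seq_index = compute_split_seq_index_decode_only(8)
seq_lens = torch.ones(8, dtype=torch.int32)          # one token per decoding sequence
seq_lens_pre, seq_lens_post = seq_lens[:seq_index], seq_lens[seq_index:]
assert token_index == 4 and seq_lens_pre.numel() == 4 and seq_lens_post.numel() == 4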

vllm_ascend/ops/fused_moe.py

Lines changed: 0 additions & 1 deletion
@@ -37,7 +37,6 @@
 from vllm.model_executor.layers.quantization.base_config import (
     QuantizationConfig, QuantizeMethodBase)

-from vllm.multistream.context import set_multistream_context,get_multistream_comm_context
 from vllm_ascend.multistream.base import MSEventKey
 from vllm_ascend.multistream.metadata import MultiStreamStepMetadata, MultiStreamMetadata
 import vllm_ascend.envs as envs_ascend

vllm_ascend/worker/model_runner_v1.py

Lines changed: 0 additions & 1 deletion
@@ -633,7 +633,6 @@ def _process_reqs(
         # Run forward pass
         with set_forward_context(attn_metadata,
                                  self.vllm_config,
-                                 query_lens=self.query_lens,
                                  num_tokens=num_input_tokens):
             model_kwargs = {}
             if self.enable_torchair_graph_mode:
