 from vllm_ascend.multistream.layers import (MultiStreamPostTransformerLayer,
                                             MultiStreamPreTransformerLayer)
 from vllm_ascend.multistream.metadata import (MultiStreamConfig,
+                                              MultiStreamMetadata,
                                               MultiStreamStepMetadata,
                                               make_multistream_metadata_ds)
 from vllm_ascend.multistream.ms_split import compute_split_seq_index
@@ -698,13 +699,12 @@ def _forward_ms_layer(
         shared_outputs = []
         router_logits = []
         chunk_hidden_states = []
-        ''' block 1 : attention
-        block 2 : attn tp communication, currently we switch to the comm stream
-        in tensor_model_parallel_all_reduce;
-        the attn computation of microbatch 1 can be overlapped with the moe
-        communication in the previous layer, and the attn computation of microbatch
-        2 can be overlapped with the attn communication of microbatch 1
-        '''
+
+        # block 1 : attention
+        # block 2 : attn tp communication
+        # the attn computation of microbatch 1 can be overlapped with the moe
+        # communication in the previous layer, and the attn computation of microbatch 2
+        # can be overlapped with the attn communication of microbatch 1
         for i in range(num_micro_batchs):
             # wait last layer moe finishing communication
             ms_metadata.try_wait_event(layer_index - 1, i,
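The overlap described in the comment above rests on a record/wait handshake between the default (compute) stream and a dedicated communication stream. Below is a minimal, hypothetical sketch of that handshake, shown with torch.cuda streams as a stand-in for the torch.npu streams used in this file (the Stream/Event APIs are analogous); none of the tensors or variable names come from this PR.

```python
import torch

if torch.cuda.is_available():
    comm_stream = torch.cuda.Stream()
    before_comm = torch.cuda.Event()   # "compute for this micro-batch is done"
    after_comm = torch.cuda.Event()    # "communication for this micro-batch is done"

    x = torch.randn(1024, 1024, device="cuda")

    # compute for micro-batch 0 on the default stream
    y = x @ x
    before_comm.record()               # mark the end of the compute kernel

    # "communication" for micro-batch 0 on the comm stream; a scalar multiply
    # stands in for tensor_model_parallel_all_reduce
    with torch.cuda.stream(comm_stream):
        before_comm.wait()             # comm stream waits for the compute to finish
        y_comm = y * 1.0
        after_comm.record()

    # compute for micro-batch 1 is issued on the default stream immediately;
    # it overlaps the comm-stream work because it does not depend on y_comm
    z = x @ x

    # any consumer of y_comm must wait first, like try_wait_event above
    torch.cuda.current_stream().wait_event(after_comm)
    out = y_comm + z
```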
@@ -731,10 +731,10 @@ def _forward_ms_layer(
                 hidden_states[i], residual[i] = self._forward_ms_op_attn(
                     positions[i], hidden_states[i], residual[i], kv_cache,
                     attn_metadata[i])
-        ''' block 3 : shared experts
-        if there is an allreduce ops in shared expert, we can overlap it with the computation of the
-        shared expert for next microbatch or moe gating
-        '''
+
+        # block 3 : shared experts
+        # if there is an allreduce op in the shared experts, we can overlap it with the
+        # computation of the shared experts for the next microbatch or the moe gating
         for i in range(num_micro_batchs):
             ms_metadata.try_wait_event(layer_index, i,
                                        MSEventKey.ATTN_AR_FINISH)
@@ -763,7 +763,6 @@ def _forward_ms_layer(
 
         # block 4 : moe
         for i in range(num_micro_batchs):
-            #ms_metadata.try_wait_event(layer_index, i, MSEventKey.MOE_SE_COMM_FINISH)
             # when profile runs, force experts to load balanced tokens
             # to avoid high memory consumption on a single rank.
             # TODO: need a better flag to indicate whether in profile run or not.
@@ -776,13 +775,6 @@ def _forward_ms_layer(
                 enable_force_load_balance = False
 
             if self.mlp.tp_size > 1:
-                #if num_tokens[i] < self.mlp.tp_size:
-                #    target_size = self.mlp.tp_size
-                #    new_hidden_states = torch.empty([target_size, hidden_dims[i]],
-                #                                    dtype=hidden_states[i].dtype,
-                #                                    device=hidden_states[i].device)
-                #    new_hidden_states[:num_tokens[i]] = hidden_states[i]
-                #    hidden_states[i] = new_hidden_states
                 num_token, _ = hidden_states[i].shape
                 padded_num_tokens = (self.mlp.tp_size - num_token %
                                      self.mlp.tp_size) % self.mlp.tp_size
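The retained lines compute how many rows must be appended so the token dimension becomes a multiple of tp_size and can be split evenly across tensor-parallel ranks. The standalone sketch below reproduces that arithmetic with made-up sizes (tp_size and the tensor shape are illustrative, not taken from this PR).

```python
import torch
import torch.nn.functional as F

tp_size = 4
hidden = torch.randn(10, 16)                # 10 tokens, hidden size 16

num_token = hidden.shape[0]
# same formula as in the diff: rounds num_token up to the next multiple of tp_size
padded_num_tokens = (tp_size - num_token % tp_size) % tp_size   # 2 here
if padded_num_tokens > 0:
    # append zero rows at the end of the token dimension
    hidden = F.pad(hidden, (0, 0, 0, padded_num_tokens))

# every rank now receives an equal chunk of (10 + 2) / 4 = 3 tokens
chunks = torch.tensor_split(hidden, tp_size, dim=0)
assert all(c.shape[0] == (num_token + padded_num_tokens) // tp_size for c in chunks)
```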
@@ -805,18 +797,12 @@ def _forward_ms_layer(
             else:
                 real_top_k = self.mlp.experts.top_k
 
-            if VLLM_ENABLE_MC2 and not is_prefill:
-                ...
-
             hidden_states[i] = self.mlp.experts._forward_ms_fused_moe_comp(
                 local_hidden_states, router_logits[i], is_prefill, real_top_k,
                 enable_force_load_balance)
 
-            if VLLM_ENABLE_MC2 and not is_prefill:
-                ...
-            ''' the following kernels will be submitted to the comm stream to overlap the computation of the
-            moe computation of next microbatch and the attn computation of next layer
-            '''
+            # the following kernels are submitted to the comm stream so that they overlap
+            # with the moe computation of the next microbatch and the attn computation of the next layer
             context = MultiStreamStepMetadata(
                 comm_stream=ms_metadata.communicate_stream,
                 before_comm_event=ms_metadata.ms_events[layer_index][i][
@@ -826,15 +812,14 @@ def _forward_ms_layer(
             )
             context.before_comm_event.record()
             with torch.npu.stream(ms_metadata.communicate_stream):
-                #with set_multistream_context(context, i):
                 context.before_comm_event.wait()
                 if self.mlp.experts.reduce_results and (
                         self.mlp.experts.tp_size > 1
                         or self.mlp.experts.ep_size > 1):
                     hidden_states[i] = tensor_model_parallel_all_reduce(
                         hidden_states[i])
                 context.after_comm_event.record()
-            # check here
+
             hidden_states[
                 i] = hidden_states[i] * self.mlp.routed_scaling_factor
             context = MultiStreamStepMetadata(
@@ -959,21 +944,19 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
                 ["hidden_states", "residual"], config.hidden_size))
 
         # tbo related members
-        self.multistream_config: Optional[MultiStreamConfig] = None
         if VLLM_ENABLE_DBO:
+            self.use_mla = model_config.use_mla
             self.multistream_config = MultiStreamConfig()
-
-            self.use_mla = model_config.use_mla
-            self.multistream_metadata = make_multistream_metadata_ds(
-                start_layer=self.start_layer + self.first_k_dense_replace,
-                end_layer=self.end_layer,
-                causal_lm=getattr(config, "causal_lm", True),
-                multistream_config=self.multistream_config,
-            )
-            self.ms_pre_layer = MultiStreamPreTransformerLayer(
-                self.multistream_metadata)
-            self.ms_post_layer = MultiStreamPostTransformerLayer(
-                self.multistream_metadata)
+            multistream_metadata = make_multistream_metadata_ds(
+                start_layer=self.start_layer + self.first_k_dense_replace,
+                end_layer=self.end_layer,
+                causal_lm=getattr(config, "causal_lm", True),
+                multistream_config=self.multistream_config,
+            )
+            self.ms_pre_layer = MultiStreamPreTransformerLayer(
+                multistream_metadata)
+            self.ms_post_layer = MultiStreamPostTransformerLayer(
+                multistream_metadata)
 
     def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor:
         return self.embed_tokens(input_ids)
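The hunk above builds the multistream machinery only when DBO is enabled, and the metadata no longer needs to live on the model because only the pre/post layers hold it. A simplified, hypothetical sketch of that shape (the classes and the plain-dict metadata below are stand-ins, not the real vllm-ascend types; only the attribute names mirror the diff):

```python
class _PreLayer:
    def __init__(self, metadata):
        self.metadata = metadata

class _PostLayer:
    def __init__(self, metadata):
        self.metadata = metadata

class _Model:
    def __init__(self, enable_dbo: bool, start_layer: int,
                 first_k_dense_replace: int, end_layer: int):
        self.start_layer = start_layer
        self.first_k_dense_replace = first_k_dense_replace
        self.end_layer = end_layer
        if enable_dbo:
            # metadata only covers the MoE layers: [start + dense, end)
            metadata = {"start": start_layer + first_k_dense_replace,
                        "end": end_layer}
            # consumed only by the pre/post layers, so it stays a local
            self.ms_pre_layer = _PreLayer(metadata)
            self.ms_post_layer = _PostLayer(metadata)
```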
@@ -998,11 +981,10 @@ def forward(
             hidden_states = intermediate_tensors["hidden_states"]
             residual = intermediate_tensors["residual"]
 
-        num_normal_layers = (self.first_k_dense_replace
-                             if self.multistream_config is not None
+        num_normal_layers = (self.first_k_dense_replace if VLLM_ENABLE_DBO
                              and self.can_run_ms() else self.end_layer -
                              self.start_layer)
-        # if we enable multistream/dbo, only process dense layers here
+
         for i in range(self.start_layer, self.start_layer + num_normal_layers):
             layer = self.layers[i]
             hidden_states, residual = layer(
@@ -1012,13 +994,15 @@ def forward(
                 attn_metadata)
 
         moe_start_layer = self.start_layer + num_normal_layers
-        hidden_states, residual = self._forward_ms_layers(
-            positions=positions,
-            hidden_states=hidden_states,
-            residual=residual,
-            moe_start_layer=moe_start_layer,
-            kv_caches=kv_caches,
-        )
+        if moe_start_layer != self.end_layer:
+            # if we enable multistream/dbo, process the sparse (MoE) layers here
+            hidden_states, residual = self._forward_ms_layers(
+                positions=positions,
+                hidden_states=hidden_states,
+                residual=residual,
+                moe_start_layer=moe_start_layer,
+                kv_caches=kv_caches,
+            )
 
         if not get_pp_group().is_last_rank:
             return IntermediateTensors({
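A worked example of the dense/MoE split that forward() now performs: with DBO enabled and can_run_ms() true, only the first_k_dense_replace dense layers run in the normal loop, and the remaining (MoE) layers go through _forward_ms_layers. The numbers below are illustrative DeepSeek-V2-style values, not read from this configuration.

```python
start_layer, end_layer = 0, 61
first_k_dense_replace = 3

dbo_active = True   # stands in for: VLLM_ENABLE_DBO and self.can_run_ms()
num_normal_layers = (first_k_dense_replace if dbo_active
                     else end_layer - start_layer)
moe_start_layer = start_layer + num_normal_layers

print(num_normal_layers)   # 3  -> layers 0..2 take the normal per-layer path
print(moe_start_layer)     # 3  -> layers 3..60 take _forward_ms_layers
# when DBO is off (or can_run_ms() fails), moe_start_layer == end_layer, so the
# new `if moe_start_layer != self.end_layer` guard skips the multistream path
```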
@@ -1045,11 +1029,8 @@ def can_run_ms(self):
         if token_index == 0 or seq_index == 0 or seq_index == len(
                 attn_metadata.query_lens):
             return False
-
-        if self.multistream_config is None:
-            return False
         # check whether the total tokens exceed the threshold
-        if attn_metadata.num_actual_tokens < self.multistream_config.min_total_tokens_to_split:
+        if self.multistream_config is None or attn_metadata.num_actual_tokens < self.multistream_config.min_total_tokens_to_split:
             return False
         return True
 
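The can_run_ms() checks above gate the multistream path: the proposed split must land strictly inside the batch, and the batch must carry enough tokens to pay for the extra synchronization. The helper below is a simplified, standalone restatement of those checks; the field names mirror the ones referenced in the diff, but the function itself and the threshold value are illustrative, not the real implementation.

```python
def can_split_into_microbatches(query_lens, num_actual_tokens,
                                token_index, seq_index,
                                min_total_tokens_to_split=256):
    # the split must fall strictly inside the batch on both axes
    if token_index == 0 or seq_index == 0 or seq_index == len(query_lens):
        return False
    # too few tokens: overlapping would not amortize the event/stream overhead
    if num_actual_tokens < min_total_tokens_to_split:
        return False
    return True

# example: 4 sequences, split proposed after the 2nd sequence / 300th token
print(can_split_into_microbatches([128, 172, 96, 104], 500,
                                  token_index=300, seq_index=2))  # True
```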