
Commit 3b313b7

[CI] fix

Signed-off-by: zzhx1 <zzh_201018@outlook.com>
1 parent 531544a commit 3b313b7

File tree: 6 files changed, +32 -32 lines


vllm_ascend/ascend_config.py

Lines changed: 1 addition & 2 deletions
@@ -58,15 +58,14 @@ def __init__(self, vllm_config):
                 vllm_config.parallel_config.tensor_parallel_size == 1
             ), "lmhead_tensor_parallel_size is only supported in the pure DP scenario"
             assert (
-                self.torchair_graph_config.enabled == True
+                self.torchair_graph_config.enabled
             ), "lmhead_tensor_parallel_size is only supported in graph mode"
 
         self.enable_shared_expert_dp = additional_config.get(
             "enable_shared_expert_dp", True
         ) and not self.torchair_graph_config.enabled and vllm_config.parallel_config.enable_expert_parallel
 
 
-
 class TorchairGraphConfig:
     """
     Configuration Object for torchair_graph_config from additional_config
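The tightened assertion above makes the preconditions of the LM-head tensor-parallel path explicit: pure data parallelism (tensor_parallel_size == 1) and torchair graph mode enabled. A minimal sketch of an additional_config that would satisfy both checks, assuming lmhead_tensor_parallel_size and the torchair_graph_config "enabled" flag are read from this dict as the surrounding code suggests; the values are illustrative, not taken from this commit.

# Illustrative sketch only: keys inferred from ascend_config.py, values assumed.
additional_config = {
    "lmhead_tensor_parallel_size": 8,            # triggers the asserts above
    "torchair_graph_config": {"enabled": True},  # graph mode must be on
}
# The asserts additionally require the pure-DP case, i.e.
# vllm_config.parallel_config.tensor_parallel_size == 1.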

vllm_ascend/models/deepseek_mtp.py

Lines changed: 9 additions & 7 deletions
@@ -25,20 +25,22 @@
 from vllm.attention.backends.abstract import AttentionMetadata
 from vllm.config import CacheConfig, ModelConfig, VllmConfig
 from vllm.model_executor.layers.layernorm import RMSNorm
-from vllm.model_executor.layers.logits_processor import LogitsProcessor
 from vllm.model_executor.layers.quantization import QuantizationConfig
 from vllm.model_executor.layers.sampler import get_sampler
-from vllm.model_executor.layers.vocab_parallel_embedding import (
-    ParallelLMHead, VocabParallelEmbedding)
+from vllm.model_executor.layers.vocab_parallel_embedding import \
+    VocabParallelEmbedding
 from vllm.model_executor.models.deepseek_mtp import (
     DeepSeekMTP, DeepSeekMultiTokenPredictor, DeepSeekMultiTokenPredictorLayer,
     SharedHead)
 from vllm.model_executor.models.utils import maybe_prefix
 from vllm.model_executor.sampling_metadata import SamplingMetadata
 from vllm.sequence import IntermediateTensors
 
+from vllm_ascend.ops.vocab_parallel_embedding import (CustomLogitsProcessor,
+                                                      CustomParallelLMHead)
+
 from .deepseek_v2 import CustomDeepseekV2DecoderLayer
-from vllm_ascend.ops.vocab_parallel_embedding import CustomLogitsProcessor, CustomParallelLMHead
+
 
 class CustomDeepSeekShareHead(SharedHead):
 
@@ -49,9 +51,9 @@ def __init__(self,
         nn.Module.__init__(self)
         self.norm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
         self.head = CustomParallelLMHead(config.vocab_size,
-                     config.hidden_size,
-                     quant_config=quant_config,
-                     prefix=maybe_prefix(prefix, "head"))
+                                         config.hidden_size,
+                                         quant_config=quant_config,
+                                         prefix=maybe_prefix(prefix, "head"))
 
 
 class CustomDeepSeekMultiTokenPredictorLayer(DeepSeekMultiTokenPredictorLayer):

vllm_ascend/ops/vocab_parallel_embedding.py

Lines changed: 0 additions & 1 deletion
@@ -269,7 +269,6 @@ def _get_logits(
         else:
             gathered_hidden_states = hidden_states
 
-        # Compute logits using quantized matrix multiplication
         local_logits = lm_head.quant_method.apply(lm_head,
                                                   gathered_hidden_states,
                                                   bias=embedding_bias)

vllm_ascend/utils.py

Lines changed: 1 addition & 3 deletions
@@ -512,6 +512,4 @@ def get_ascend_soc_version():
 
 
 def _enable_lmhead_tp() -> bool:
-    if get_ascend_config().lmhead_tensor_parallel_size is not None:
-        return True
-    return False
+    return get_ascend_config().lmhead_tensor_parallel_size is not None
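The rewritten helper returns the comparison directly and is behaviorally identical to the removed if/return pair. A trivial equivalence check, using a hypothetical stand-in for the object returned by get_ascend_config():

# Hypothetical stand-in config object, only for illustrating the refactor.
class _FakeAscendConfig:
    lmhead_tensor_parallel_size = None

def _enable_lmhead_tp_old(cfg) -> bool:
    if cfg.lmhead_tensor_parallel_size is not None:
        return True
    return False

def _enable_lmhead_tp_new(cfg) -> bool:
    return cfg.lmhead_tensor_parallel_size is not None

cfg = _FakeAscendConfig()
assert _enable_lmhead_tp_old(cfg) == _enable_lmhead_tp_new(cfg)  # both False
cfg.lmhead_tensor_parallel_size = 4
assert _enable_lmhead_tp_old(cfg) == _enable_lmhead_tp_new(cfg)  # both True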

vllm_ascend/worker/model_runner_v1.py

Lines changed: 17 additions & 16 deletions
@@ -1320,14 +1320,14 @@ def _process_reqs(
         if self.use_aux_hidden_state_outputs:
             hidden_states, aux_hidden_states = hidden_states
 
-        if _enable_lmhead_tp(): #
+        if _enable_lmhead_tp():
             if not with_prefill:
                 max_num_reqs_across_dp = padded_num_tokens_across_dp
             else:
                 max_num_reqs_across_dp = self.max_num_reqs
-            sample_indices = nn.functional.pad(
-                sample_indices,
-                (0, max_num_reqs_across_dp - sample_indices.shape[0]))
+            logits_indices = nn.functional.pad(
+                logits_indices,
+                (0, max_num_reqs_across_dp - logits_indices.shape[0]))
 
         return (attn_metadata, hidden_states, spec_decode_metadata, positions,
                 total_num_scheduled_tokens, logits_indices, aux_hidden_states,
@@ -1656,14 +1656,14 @@ def execute_model(
             # Sample the next token and get logprobs if needed.
             sampling_metadata = self.input_batch.sampling_metadata
             if spec_decode_metadata is None:
-                if _enable_lmhead_tp():
+                if _enable_lmhead_tp() and logits is not None:
                     logits = logits[:self.input_batch.num_reqs]
                 sampler_output = self.sampler(
                     logits=logits,
                     sampling_metadata=sampling_metadata,
                 )
             else:
-                if _enable_lmhead_tp():
+                if _enable_lmhead_tp() and logits is not None:
                     logits = logits[:len(spec_decode_metadata.logits_indices)]
                 # When indexing with a tensor (bonus_logits_indices), PyTorch
                 # creates a new tensor with separate storage from the original
@@ -1952,16 +1952,16 @@ def _dummy_run(
                     with_prefill, is_torchair_compile, input_ids, positions,
                     attn_metadata, num_tokens, intermediate_tensors,
                     inputs_embeds)
-
+
             if _enable_lmhead_tp() and not self.in_profile_run:
-                    if not with_prefill:
-                        max_num_reqs_across_dp = num_reqs
-                    else:
-                        max_num_reqs_across_dp = max_num_reqs
-                    dummy_indices = torch.zeros(max_num_reqs_across_dp,
-                                                device=hidden_states.device,
-                                                dtype=torch.int32)
-                    model.compute_logits(hidden_states[dummy_indices], None)
+                if not with_prefill:
+                    max_num_reqs_across_dp = num_reqs
+                else:
+                    max_num_reqs_across_dp = max_num_reqs
+                dummy_indices = torch.zeros(max_num_reqs_across_dp,
+                                            device=hidden_states.device,
+                                            dtype=torch.int32)
+                self.model.compute_logits(hidden_states[dummy_indices], None)
 
             if self.speculative_config and self.speculative_config.method == "deepseek_mtp":
                 assert isinstance(self.drafter, MtpProposer)
@@ -1979,7 +1979,8 @@ def _dummy_run(
                 dummy_indices = torch.zeros(max_num_reqs_across_dp,
                                             device=hidden_states.device,
                                             dtype=torch.int32)
-                model.compute_logits(hidden_states[dummy_indices], None)
+                self.model.compute_logits(hidden_states[dummy_indices],
+                                          None)
 
         return hidden_states
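The padding changes above keep the logits computation uniform across data-parallel ranks when the LM head is tensor-parallel: each rank right-pads its logits_indices up to a common length before compute_logits, then truncates the resulting logits back to its real request count before sampling. A standalone sketch of that pad-then-truncate pattern with illustrative sizes (not code from this commit):

import torch
import torch.nn as nn

num_reqs = 3                   # requests on this rank (assumed)
max_num_reqs_across_dp = 8     # padded size agreed across DP ranks (assumed)

logits_indices = torch.tensor([5, 11, 17])
hidden_states = torch.randn(32, 16)    # [num_tokens, hidden_size]
lm_head_weight = torch.randn(100, 16)  # [vocab_size, hidden_size]

# Right-pad the index tensor so every rank gathers the same number of rows;
# the padded entries gather row 0 and are discarded later.
logits_indices = nn.functional.pad(
    logits_indices, (0, max_num_reqs_across_dp - logits_indices.shape[0]))

# Uniform-shape logits computation (stand-in for model.compute_logits).
logits = hidden_states[logits_indices] @ lm_head_weight.t()

# Drop the padded rows before sampling, mirroring logits[:num_reqs] above.
logits = logits[:num_reqs]
assert logits.shape == (num_reqs, 100)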

vllm_ascend/worker/mtp_proposer_v1.py

Lines changed: 4 additions & 3 deletions
@@ -19,6 +19,7 @@
 from vllm_ascend.models.deepseek_mtp import CustomDeepSeekMTP
 from vllm_ascend.utils import ProfileExecuteDuration, _enable_lmhead_tp
 
+
 class MtpProposer:
 
     def __init__(
@@ -224,16 +225,16 @@ def propose(
                 previous_hidden_states=self.
                 hidden_states[:num_input_tokens],
                 kv_caches=self.runner.kv_caches[-1:])
-
+
         num_indices = last_token_indices.shape[0]
         if _enable_lmhead_tp():
             if not self.runner.with_prefill:
                 max_num_reqs_across_dp = num_input_tokens
             else:
                 max_num_reqs_across_dp = self.vllm_config.scheduler_config.max_num_seqs
             last_token_indices = nn.functional.pad(
-                 last_token_indices, (0, max_num_reqs_across_dp - num_indices))
-
+                last_token_indices, (0, max_num_reqs_across_dp - num_indices))
+
         sample_hidden_states = hidden_states[last_token_indices]
         logits = self.model.compute_logits(sample_hidden_states, None)
         if _enable_lmhead_tp() and num_indices < logits.shape[0]:
