
Commit e56f12c

Merge branch 'padded-spec' of https://github.com/JC-ut0/vllm-ascend into padded-spec
2 parents: 8eef0f9 + 48b296e

File tree

3 files changed: +27 -30 lines changed

tests/e2e/singlecard/spec_decode_v1/test_v1_mtp_correctness.py

Lines changed: 10 additions & 2 deletions
@@ -113,14 +113,22 @@ def test_mtp2_correctness_full_graph(
 ):
     mtp_correctness(sampling_config, model_name, 2, CUDAGraphMode.FULL)
 
+
 def test_mtp1_correctness_piecewise_graph_with_pad(
     sampling_config: SamplingParams,
     model_name: str,
 ):
-    mtp_correctness(sampling_config, model_name, 1, disable_padded_drafter_batch=False)
+    mtp_correctness(sampling_config,
+                    model_name,
+                    1,
+                    disable_padded_drafter_batch=False)
+
 
 def test_mtp2_correctness_piecewise_graph_with_pad(
     sampling_config: SamplingParams,
     model_name: str,
 ):
-    mtp_correctness(sampling_config, model_name, 2, disable_padded_drafter_batch=False)
+    mtp_correctness(sampling_config,
+                    model_name,
+                    2,
+                    disable_padded_drafter_batch=False)
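Note: these tests pin disable_padded_drafter_batch=False so the padded drafter-batch path stays enabled. As a rough sketch of what that flag reaches, assuming the helper forwards it into vLLM's speculative_config (the keys and prompt below are assumptions, not the actual body of mtp_correctness):

# Hypothetical sketch of the flag the tests exercise; the real plumbing
# lives inside mtp_correctness. The speculative_config keys are assumptions.
from vllm import LLM, SamplingParams

def mtp_generate(model_name: str, num_speculative_tokens: int):
    llm = LLM(
        model=model_name,
        speculative_config={
            "num_speculative_tokens": num_speculative_tokens,
            # False keeps the padded drafter-batch path enabled
            "disable_padded_drafter_batch": False,
        },
    )
    return llm.generate(["Hello, world"], SamplingParams(temperature=0.0))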

vllm_ascend/spec_decode/eagle_proposer.py

Lines changed: 15 additions & 26 deletions
@@ -32,27 +32,16 @@ def __init__(self,
                  device: torch.device,
                  runner=None):
         self.name = SpecDcodeType.EAGLE if vllm_config.speculative_config.method == "eagle" else SpecDcodeType.EAGLE3
-        self.device = device
         self.vllm_config = vllm_config
-        self.speculative_config = vllm_config.speculative_config
-        self.draft_model_config = self.speculative_config.draft_model_config
-        self.method = self.speculative_config.method
-
+        self.device = device
         self.runner = runner
-        self.dtype = vllm_config.model_config.dtype
-        self.max_model_len = vllm_config.model_config.max_model_len
-        self.block_size = vllm_config.cache_config.block_size
-        self.num_speculative_tokens = (
-            self.speculative_config.num_speculative_tokens)
-        self.max_num_tokens = (
-            vllm_config.scheduler_config.max_num_batched_tokens)
-        self.token_arange_np = np.arange(self.max_num_tokens)
 
         self.block_size = vllm_config.cache_config.block_size
         # We need to get the hidden size from the draft model config because
         # the draft model's hidden size can be different from the target model's
         # hidden size (e.g., Llama 3.3 70B).
-        self.hidden_size = self.draft_model_config.get_hidden_size()
+        self.hidden_size = vllm_config.speculative_config.draft_model_config.get_hidden_size(
+        )
 
         self.use_cuda_graph = (self.vllm_config.compilation_config.level
                                == CompilationLevel.PIECEWISE and
@@ -62,15 +51,18 @@ def __init__(self,
             self.vllm_config.compilation_config.cudagraph_capture_sizes))
 
         # persistent buffers for cuda graph
-        self.input_ids = torch.zeros(self.max_num_tokens,
-                                     dtype=torch.int32,
-                                     device=device)
-        self.positions = torch.zeros(self.max_num_tokens,
-                                     dtype=torch.int64,
-                                     device=device)
+        self.input_ids = torch.zeros(
+            self.vllm_config.scheduler_config.max_num_batched_tokens,
+            dtype=torch.int32,
+            device=device)
+        self.positions = torch.zeros(
+            self.vllm_config.scheduler_config.max_num_batched_tokens,
+            dtype=torch.int64,
+            device=device)
         self.hidden_states = torch.zeros(
-            (self.max_num_tokens, self.hidden_size),
-            dtype=self.dtype,
+            (self.vllm_config.scheduler_config.max_num_batched_tokens,
+             self.hidden_size),
+            dtype=self.vllm_config.model_config.dtype,
             device=device)
         # We need +1 here because the arange is used to set query_start_loc,
         # which has one more element than batch_size.
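Note: the refactor drops the cached self.max_num_tokens/self.dtype attributes and reads the config inline, but the allocation pattern is unchanged: persistent buffers are sized once at max_num_batched_tokens so graph capture replays against stable tensor addresses. A minimal standalone sketch of that pattern (the toy capacity and CPU device are assumptions; the proposer allocates on the accelerator device):

import torch

# Persistent-buffer pattern: allocate once at the maximum size, then fill
# only a prefix per batch so a captured graph always sees the same addresses.
max_num_batched_tokens = 8  # assumption: small toy capacity
device = torch.device("cpu")  # assumption: CPU stand-in for the NPU device

input_ids = torch.zeros(max_num_batched_tokens, dtype=torch.int32, device=device)
positions = torch.zeros(max_num_batched_tokens, dtype=torch.int64, device=device)

def stage_batch(token_ids: torch.Tensor, pos: torch.Tensor) -> None:
    n = token_ids.shape[0]
    input_ids[:n].copy_(token_ids)  # in-place copy keeps the buffer address stable
    positions[:n].copy_(pos)

stage_batch(torch.tensor([1, 2, 3], dtype=torch.int32),
            torch.tensor([0, 1, 2], dtype=torch.int64))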
@@ -406,17 +398,14 @@ def _propose(
         # [batch_size, max_num_blocks_per_req]
         block_table: torch.Tensor,
         sampling_metadata: SamplingMetadata,
-        last_token_indices: Optional[torch.Tensor],
     ) -> torch.Tensor:
         device = cu_num_tokens.device
         cu_num_tokens = cu_num_tokens.cpu()
         block_table = block_table.cpu()
         num_tokens = target_token_ids.shape[0]
         batch_size = next_token_ids.shape[0]
-        if last_token_indices is None:
-            last_token_indices = common_attn_metadata.query_start_loc[1:] - 1
+        last_token_indices = cu_num_tokens[1:] - 1
         target_positions = target_positions.cpu()
-
         if self.name == SpecDcodeType.EAGLE3:
             assert isinstance(self.model, Eagle3LlamaForCausalLM)
             target_hidden_states = self.model.combine_hidden_states(
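Note: with last_token_indices no longer a parameter, it is derived directly from the cumulative token counts. A small worked example of that index math:

import torch

# With per-request token counts [3, 2, 4], the cumulative offsets are
# [0, 3, 5, 9], so the last token of each request sits at
# cu_num_tokens[1:] - 1 = [2, 4, 8].
num_tokens_per_req = torch.tensor([3, 2, 4])
cu_num_tokens = torch.zeros(4, dtype=torch.int64)
cu_num_tokens[1:] = torch.cumsum(num_tokens_per_req, dim=0)
last_token_indices = cu_num_tokens[1:] - 1
assert last_token_indices.tolist() == [2, 4, 8]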

vllm_ascend/torchair/mtp_torchair_proposer.py

Lines changed: 2 additions & 2 deletions
@@ -203,7 +203,7 @@ def generate_token_ids(self,
             attn_metadata.slot_mapping[:num_scheduled_tokens],
         )
 
-        draft_token_ids = self._propose(
+        draft_token_ids = self._propose_torchair(
             target_token_ids=target_token_ids,
             target_positions=target_positions,
             target_hidden_states=target_hidden_states,
@@ -251,7 +251,7 @@ def _torchair_prepare_inputs(
 
         return cu_num_tokens, token_indices, target_token_ids, target_positions, target_hidden_states, target_slot_mapping
 
-    def _propose(
+    def _propose_torchair(
         self,
         # [num_tokens]
         target_token_ids: torch.Tensor,
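Note: the rename keeps the torchair proposer from sharing the _propose name while the eagle proposer's _propose signature changes in this same commit. A hypothetical illustration of the hazard being avoided, assuming the two proposers can end up in an override relationship (the class names below are placeholders, not the real vllm_ascend classes):

# Placeholder classes. If the torchair proposer overrode _propose while the
# parent narrowed its signature, shared callers would pass mismatched
# arguments to the override.
class BaseProposer:
    def _propose(self, target_token_ids):  # signature narrowed upstream
        return target_token_ids

class TorchairProposer(BaseProposer):
    # A distinct name sidesteps the override entirely; torchair callers
    # invoke it explicitly, as generate_token_ids now does.
    def _propose_torchair(self, target_token_ids, target_slot_mapping):
        return target_token_ids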
