 # SPDX-License-Identifier: Apache-2.0
+import os
 import torch
 import torch.nn as nn
 from vllm.attention.layer import Attention
 from vllm.config import (CompilationLevel, VllmConfig,
                          get_layers_from_vllm_config, set_current_vllm_config)
+from vllm.distributed.parallel_state import get_pp_group
 from vllm.forward_context import set_forward_context
 from vllm.logger import init_logger
-from vllm.model_executor.model_loader import get_model_loader
+from vllm.model_executor.model_loader import get_model
+from vllm.model_executor.models import supports_multimodal
 from vllm.model_executor.model_loader.utils import set_default_torch_dtype
 from vllm.model_executor.models import ModelRegistry
 from vllm.model_executor.models.llama_eagle3 import Eagle3LlamaForCausalLM
-from vllm_ascend.attention.attention_v1 import AscendMetadata
+from vllm_ascend.attention.attention_v1 import AscendMetadata, AscendAttentionState
 from vllm.v1.sample.metadata import SamplingMetadata
 from vllm_ascend.attention.attention import AttentionMaskBuilder

@@ -23,11 +26,13 @@ def __init__(
         self,
         vllm_config: VllmConfig,
         device: torch.device,
+        runner=None
     ):
         self.vllm_config = vllm_config
         self.speculative_config = vllm_config.speculative_config
         self.draft_model_config = self.speculative_config.draft_model_config
         self.method = self.speculative_config.method
+        self.runner = runner
         self.model_config = vllm_config.model_config
         self.dtype = vllm_config.model_config.dtype
         self.max_model_len = vllm_config.model_config.max_model_len
@@ -95,10 +100,13 @@ def propose(
         sampling_metadata: SamplingMetadata,

     ) -> torch.Tensor:
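+        # Keep a handle to the accelerator device; the bookkeeping tensors
+        # below are moved to CPU for host-side metadata computation.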
+        device = cu_num_tokens.device
+        cu_num_tokens = cu_num_tokens.cpu()
+        block_table = block_table.cpu()
         num_tokens = target_token_ids.shape[0]
         batch_size = next_token_ids.shape[0]
         last_token_indices = cu_num_tokens[1:] - 1
-
+        target_positions = target_positions.cpu()
         if self.method == "eagle3":
             assert isinstance(self.model, Eagle3LlamaForCausalLM)
             target_hidden_states = self.model.combine_hidden_states(
@@ -112,47 +120,29 @@ def propose(
         # E.g., [b1, b2, c1, c2, c3, c3] -> [a2, b2, b3, c2, c3, c4]
         self.input_ids[last_token_indices] = next_token_ids[0]

-
+        query_lens = cu_num_tokens[1:] - cu_num_tokens[:-1]
+        max_query_len = query_lens.max().item()
         # FA requires seq_len to have dtype int32.
         seq_lens = (target_positions[last_token_indices] + 1).int().to('cpu')

         # FIXME(woosuk): The below two ops cause synchronization. Optimize.
-        max_seq_len = seq_lens.max().item()
-        max_num_tokens = (cu_num_tokens[1:] - cu_num_tokens[:-1]).max().item()
-
-
-        # attn_mask = torch.zeros((20, 20), dtype=torch.bfloat16)
-        attn_mask = self._make_attention_mask(seq_lens=seq_lens,
-                                              query_lens=seq_lens,
-                                              position=target_positions,
-                                              )
-
-        attn_metadata = AscendMetadata(
+        # max_seq_len = seq_lens.max().item()
+        # max_num_tokens = (cu_num_tokens[1:] - cu_num_tokens[:-1]).max().item()
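+        # Delegate attention metadata construction to the runner's Ascend
+        # metadata builder instead of filling AscendMetadata by hand.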
+        attn_metadata = self.runner.attn_metadata_builder.build(
+            num_reqs=batch_size,
             num_actual_tokens=num_tokens,
-            max_query_len=max_num_tokens,
-            query_start_loc=cu_num_tokens,
-            max_seq_len=max_seq_len,
-            seq_lens=seq_lens,
-            query_lens=seq_lens,
-            block_table=block_table,
-            block_tables=block_table,
-            slot_mapping=target_slot_mapping,
-            # TODO(woosuk): Support cascade attention.
+            max_query_len=max_query_len,
             common_prefix_len=0,
         )
         if self.use_cuda_graph and \
                 num_tokens <= self.cudagraph_batch_sizes[-1]:
             num_input_tokens = self.vllm_config.pad_for_cudagraph(num_tokens)
         else:
             num_input_tokens = num_tokens
         # copy inputs to buffer for cudagraph
-        self.positions[:num_tokens] = target_positions
+        self.positions[:num_tokens] = target_positions.to(device)
         self.hidden_states[:num_tokens] = target_hidden_states
-        int_positions = int(target_positions[last_token_indices])
+        attn_metadata.block_tables = block_table.to(device)
         with set_forward_context(attn_metadata,
                                  self.vllm_config,
                                  num_tokens=num_input_tokens):
@@ -161,22 +151,22 @@ def propose(
                 positions=self.positions[:num_input_tokens],
                 hidden_states=self.hidden_states[:num_input_tokens],
             )
-        print(f"last_token_indices={last_token_indices}")
         sample_hidden_states = last_hidden_states[last_token_indices]
         logits = self.model.compute_logits(sample_hidden_states, None)
         draft_token_ids = logits.argmax(dim=-1)
-
+
         # Early exit if there is only one draft token to be generated.
         if self.num_speculative_tokens == 1:
             # [batch_size, 1]
             return draft_token_ids.view(-1, 1)

         # Generate the remaining draft tokens.
-        draft_token_ids_list = [draft_token_ids]
-        draft_token_ids_tensor = torch.zeros((self.num_speculative_tokens, *draft_token_ids.shape), dtype=draft_token_ids.dtype)
+        draft_token_ids_tensor = torch.zeros(
+            (self.num_speculative_tokens, *draft_token_ids.shape),
+            dtype=draft_token_ids.dtype)
         draft_token_ids_tensor[0] = draft_token_ids
-
-        positions = target_positions[last_token_indices]
+
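+        # Keep the per-request positions on CPU as int64; they are incremented
+        # and clamped on the host inside the speculative loop below.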
+        positions_cpu = target_positions[last_token_indices].cpu().to(torch.int64)
         hidden_states = hidden_states[last_token_indices]
         if self.use_cuda_graph and \
                 batch_size <= self.cudagraph_batch_sizes[-1]:
@@ -188,75 +178,73 @@ def propose(
         attn_metadata.query_start_loc = self.arange[:batch_size + 1]

         if self.num_speculative_tokens > 2:
-            raise ValueError("Speculative tokens > 2 are not yet supported.")
+            raise ValueError("Speculative tokens > 2 are not supported yet.")

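+        # The extra speculative steps below run through the chunked-prefill
+        # attention path.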
+        attn_metadata.attn_state = AscendAttentionState.ChunkedPrefill
         for now_speculative in range(self.num_speculative_tokens - 1):
             # Update the inputs.
             # cast to int32 is crucial when eagle model is compiled.
             # tensor.argmax() returns int64 by default.
-            input_ids = draft_token_ids_tensor[now_speculative]
-            # input_ids = draft_token_ids_list[-1]
-            # positions += 1
-            int_positions += 1
-            positions = torch.tensor([int_positions], dtype=torch.int64, device='npu:0')
+            input_ids = draft_token_ids_tensor[now_speculative].to(device)
+            positions_cpu += 1
+

             # NOTE(woosuk): We should handle the case where the draft model
             # generates tokens beyond the max model length. Since it is complex
             # to remove such requests from the batch, we keep them in the batch
             # but adjust the position ids and slot mappings to avoid the
             # out-of-range access during the model execution. The draft tokens
             # generated with this adjustment should be ignored.
-            exceeds_max_model_len = positions >= self.max_model_len
+            exceeds_max_model_len = positions_cpu >= self.max_model_len
             # print(f"exceeds_max_model_len={exceeds_max_model_len}")
             # Mask out the position ids that exceed the max model length.
             # Otherwise, we may get out-of-range error in RoPE.
-            clamped_positions = torch.where(exceeds_max_model_len, 0,
-                                            positions)
+            clamped_positions_cpu = torch.where(exceeds_max_model_len, 0,
+                                                positions_cpu)
+            clamped_positions = clamped_positions_cpu.to(device)

-            # Increment the sequence lengths.
-            attn_metadata.max_seq_len += 1
+            # TODO: Increment the sequence lengths.
             # attn_metadata.max_seq_len += 1
+
             attn_metadata.seq_lens += 1
-            # Consider max model length.
-            attn_metadata.max_seq_len = min(attn_metadata.max_seq_len,
-                                            self.max_model_len)
+            # TODO: Consider max model length.
+            # attn_metadata.max_seq_len = min(attn_metadata.max_seq_len,
+            #                                 self.max_model_len)
             # For the requests that exceed the max model length, we set the
-            # sequence length to 1 to minimize their overheads in attention.
-            attn_metadata.seq_lens.masked_fill_(exceeds_max_model_len.to('cpu'), 1)
-
-            # block_table_indices = (req_indices * self.max_num_blocks_per_req +
-            #                        positions_np // self.block_size)
-            # block_table_cpu = self.input_batch.block_table.get_cpu_tensor()
-            # block_numbers = block_table_cpu.flatten()[block_table_indices].numpy()
-            # block_offsets = positions_np % self.block_size
-            # np.add(block_numbers * self.block_size,
-            #        block_offsets,
-            #        out=self.slot_mapping_np[:total_num_scheduled_tokens])
+            # TODO: sequence length to 1 to minimize their overheads in attention.
+            # attn_metadata.seq_lens.masked_fill_(exceeds_max_model_len.to('cpu'), 1)
+
             # Compute the slot mapping.
-            block_numbers = clamped_positions // self.block_size
+            block_numbers = (clamped_positions_cpu // self.block_size)
             block_ids = block_table.gather(dim=1,
                                            index=block_numbers.view(-1, 1))
             block_ids = block_ids.view(-1)
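+            # A slot address is block_id * block_size + the token's offset
+            # within that block.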
-            attn_metadata.slot_mapping = (block_ids * self.block_size +
-                                          clamped_positions % self.block_size)
+            slot_mapping_cpu = (block_ids * self.block_size +
+                                clamped_positions_cpu % self.block_size)
+
+            # attn_metadata.slot_mapping = (block_ids * self.block_size +
+            #                               clamped_positions % self.block_size)
             # Mask out the slot mappings that exceed the max model length.
             # Otherwise, the KV cache will be inadvertently updated with the
             # padding tokens.
-            attn_metadata.slot_mapping.masked_fill_(exceeds_max_model_len,
+            # attn_metadata.slot_mapping.masked_fill_(exceeds_max_model_len,
+            #                                         PADDING_SLOT_ID)
+            slot_mapping_cpu.masked_fill_(exceeds_max_model_len,
                                           PADDING_SLOT_ID)
-
+            # NOTE: on Ascend, the slot mapping must be computed on CPU
+            attn_metadata.slot_mapping = slot_mapping_cpu.to(torch.int32).to(device)
             # attn_metadata.num_actual_tokens = attn_metadata.seq_lens
             # copy inputs to buffer for cudagraph
             self.input_ids[:batch_size] = input_ids
-            # self.input_ids[:batch_size] = input_ids
             self.positions[:batch_size] = clamped_positions
             self.hidden_states[:batch_size] = hidden_states
-
+            positions = positions_cpu.to(device)
             attn_mask = self._make_attention_mask(seq_lens=attn_metadata.seq_lens,
                                                   query_lens=attn_metadata.max_query_len,
-                                                  position=torch.tensor([int_positions], dtype=torch.int64, device='npu:0'),
+                                                  position=positions,
                                                   )
             attn_metadata.attn_mask = attn_mask
+            attn_metadata.block_tables = block_table.to(device)
             # Run the model.
             with set_forward_context(attn_metadata,
                                      self.vllm_config,
@@ -273,15 +261,12 @@ def propose(

             # TODO(wenlong): get more than one token for tree attention
             draft_token_ids = logits.argmax(dim=-1)
-            # for _id in range(len(old_draft_token_ids_list)):
-            #     draft_token_ids_list[_id] = old_draft_token_ids_list[_id]
-            # draft_token_ids_list.append(draft_token_ids)
-            draft_token_ids_tensor[now_speculative + 1] = draft_token_ids
+            draft_token_ids_tensor[now_speculative + 1] = draft_token_ids.cpu()


         # [batch_size, num_speculative_tokens]
-        # draft_token_ids = torch.stack(draft_token_ids_list, dim=1)
         draft_token_ids = draft_token_ids_tensor.swapaxes(0, 1)
+        print(f"draft_token_ids_tensor={draft_token_ids}")
         return draft_token_ids

     @staticmethod
@@ -326,42 +311,43 @@ def prepare_inputs(
         return cu_num_tokens, token_indices

     def load_model(self, target_model: nn.Module) -> None:
-        loader = get_model_loader(self.vllm_config.load_config)
-        target_layer_num = self.vllm_config.model_config.get_num_layers(
-            self.vllm_config.parallel_config)
+        draft_model_config = \
+            self.vllm_config.speculative_config.draft_model_config
         target_attn_layer_names = set(
             get_layers_from_vllm_config(self.vllm_config, Attention).keys())

-        draft_model_config = \
-            self.vllm_config.speculative_config.draft_model_config
-        # FIXME(lily): This does not handle with distributed inference.
-        target_device = self.vllm_config.device_config.device
-        # We need to set the vllm_config here to register attention
-        # layers in the forward context.
-        with set_default_torch_dtype(
-                draft_model_config.dtype), set_current_vllm_config(
-                    self.vllm_config):
-            draft_model_cls, arch = ModelRegistry.resolve_model_cls(
-                draft_model_config.architectures)
-            self.model = draft_model_cls(
-                vllm_config=self.vllm_config,
-                start_layer_id=target_layer_num).to(target_device)
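+        # Load the draft model through vLLM's generic model loader using the
+        # draft model config.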
+        self.model = get_model(vllm_config=self.vllm_config,
+                               model_config=draft_model_config)

         draft_attn_layer_names = (
             get_layers_from_vllm_config(self.vllm_config, Attention).keys() -
             target_attn_layer_names)
-        assert len(draft_attn_layer_names) == 1
+
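+        # Attention layers that belong only to the draft (EAGLE) head.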
+        self.attn_layer_names = list(draft_attn_layer_names)
         self.attn_layer_name = next(iter(draft_attn_layer_names))
-        loaded_weights = self.model.load_weights(
-            loader.get_all_weights(draft_model_config, self.model))
-        if self.vllm_config.speculative_config.method == "eagle3":
-            if "model.embed_tokens.weight" not in loaded_weights:
-                logger.info(
-                    "Loading EAGLE embedding weights from the target model.")
-                self.model.model.embed_tokens = target_model.model.embed_tokens
+        # share embed_tokens with the target model if needed
+        if get_pp_group().world_size == 1:
+            logger.info(
+                "The EAGLE head shares the same vocab embedding" \
+                " with the target model."
+            )
+            self.model.model.embed_tokens = target_model.model.embed_tokens
         else:
+            logger.info(
+                "Since PP > 1, the EAGLE head loaded its own vocab embedding" \
+                " weights instead of sharing them with the target model."
+            )
+
+        # share lm_head with the target model if needed
+        # some model definitions do not define lm_head explicitly
+        # and reuse embed_tokens for lm_head, e.g., CohereForCausalLM
+        if self.vllm_config.speculative_config.method != "eagle3" and \
+                hasattr(target_model, "lm_head"):
             logger.info("Loading EAGLE LM head weights from the target model.")
-            self.model.lm_head = target_model.lm_head
+            if supports_multimodal(target_model):
+                self.model.lm_head = target_model.get_language_model().lm_head
+            else:
+                self.model.lm_head = target_model.lm_head

     @torch.inference_mode()
     def dummy_run(