From 65d322b279165d53d4b73574e13987f658f5fa24 Mon Sep 17 00:00:00 2001 From: wangxiyuan Date: Sat, 19 Jul 2025 15:01:48 +0800 Subject: [PATCH] [CI] Fix broken CI Signed-off-by: wangxiyuan --- pyproject.toml | 2 - requirements.txt | 3 - vllm_ascend/ops/common_fused_moe.py | 37 ++++----- vllm_ascend/worker/model_runner_v1.py | 106 ++++++++++++++++++++------ vllm_ascend/worker/npu_input_batch.py | 31 ++++++-- vllm_ascend/worker/worker_v1.py | 3 + 6 files changed, 128 insertions(+), 54 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index d6cb55d1f2..6b16fc4928 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -19,7 +19,5 @@ requires = [ "msgpack", "quart", "numba", - # Remove after https://github.com/vllm-project/vllm-ascend/issues/1470 - "transformers==4.52.4", ] build-backend = "setuptools.build_meta" diff --git a/requirements.txt b/requirements.txt index f2a1c6ff20..375b55433e 100644 --- a/requirements.txt +++ b/requirements.txt @@ -25,6 +25,3 @@ numba --pre --extra-index-url https://mirrors.huaweicloud.com/ascend/repos/pypi torch-npu==2.5.1.post1.dev20250619 - -# Remove after https://github.com/vllm-project/vllm-ascend/issues/1470 -transformers==4.52.4 diff --git a/vllm_ascend/ops/common_fused_moe.py b/vllm_ascend/ops/common_fused_moe.py index cdd18bd6ae..e1b363cb91 100644 --- a/vllm_ascend/ops/common_fused_moe.py +++ b/vllm_ascend/ops/common_fused_moe.py @@ -40,23 +40,26 @@ def unquantized_fused_moe_init_func(self, *args, **kwargs): def forward_oot( - self, - layer: torch.nn.Module, - x: torch.Tensor, - use_grouped_topk: bool, - top_k: int, - router_logits: torch.Tensor, - renormalize: bool, - topk_group: Optional[int] = None, - num_expert_group: Optional[int] = None, - custom_routing_function: Optional[Callable] = None, - scoring_func: str = "softmax", - e_score_correction_bias: Optional[torch.Tensor] = None, - global_num_experts: Optional[int] = None, - expert_map: Optional[torch.Tensor] = None, - apply_router_weight_on_input: bool = False, - activation: str = "silu", -) -> torch.Tensor: + self, + layer: torch.nn.Module, + x: torch.Tensor, + use_grouped_topk: bool, + top_k: int, + router_logits: torch.Tensor, + renormalize: bool, + topk_group: Optional[int] = None, + num_expert_group: Optional[int] = None, + custom_routing_function: Optional[Callable] = None, + scoring_func: str = "softmax", + e_score_correction_bias: Optional[torch.Tensor] = None, + global_num_experts: Optional[int] = None, + expert_map: Optional[torch.Tensor] = None, + apply_router_weight_on_input: bool = False, + activation: str = "silu", + enable_eplb: bool = False, + expert_load_view: Optional[torch.Tensor] = None, + logical_to_physical_map: Optional[torch.Tensor] = None, + logical_replica_count: Optional[torch.Tensor] = None) -> torch.Tensor: if SELECT_GATING_TOPK_SOTFMAX_EXPERTS: topk_weights, topk_ids = select_gating_top_k_softmax_experts( diff --git a/vllm_ascend/worker/model_runner_v1.py b/vllm_ascend/worker/model_runner_v1.py index 4be7f47011..e027b7cb37 100644 --- a/vllm_ascend/worker/model_runner_v1.py +++ b/vllm_ascend/worker/model_runner_v1.py @@ -24,7 +24,7 @@ import weakref from contextlib import contextmanager, nullcontext from dataclasses import dataclass -from typing import TYPE_CHECKING, Dict, List, Optional, Union +from typing import TYPE_CHECKING, Dict, List, Optional, Union, cast, get_args import numpy as np import numpy.typing as npt @@ -45,7 +45,8 @@ from vllm.model_executor.layers.fused_moe import FusedMoE from vllm.model_executor.layers.rotary_embedding import MRotaryEmbedding 
from vllm.model_executor.model_loader import get_model -from vllm.model_executor.models.interfaces import has_step_pooler +from vllm.model_executor.models.interfaces_base import (VllmModelForPooling, + is_pooling_model) from vllm.multimodal import MULTIMODAL_REGISTRY from vllm.multimodal.inputs import MultiModalKwargs, PlaceholderRange from vllm.multimodal.utils import group_mm_inputs_by_modality @@ -88,8 +89,10 @@ from vllm_ascend.worker.npu_input_batch import CachedRequestState, InputBatch if vllm_version_is("0.9.2"): + from vllm.model_executor.models.interfaces import has_step_pooler from vllm.v1.utils import bind_kv_cache else: + from vllm.pooling_params import PoolingTask from vllm.v1.worker.utils import bind_kv_cache if TYPE_CHECKING: @@ -395,6 +398,7 @@ def _update_states(self, scheduler_output: "SchedulerOutput") -> None: for new_req_data in scheduler_output.scheduled_new_reqs: req_id = new_req_data.req_id sampling_params = new_req_data.sampling_params + pooling_params = new_req_data.pooling_params if sampling_params and \ sampling_params.sampling_type == SamplingType.RANDOM_SEED: generator = torch.Generator(device=self.device) @@ -402,6 +406,16 @@ def _update_states(self, scheduler_output: "SchedulerOutput") -> None: else: generator = None + if not vllm_version_is("0.9.2") and pooling_params: + assert pooling_params.task is not None, ( + "You did not set `task` in the API") + model = cast(VllmModelForPooling, self.model) + to_update = (model.pooler.get_pooling_updates( + pooling_params.task)) + assert to_update is not None, ( + f"{pooling_params.task=} is not supported by the model") + to_update.apply(pooling_params) + self.requests[req_id] = CachedRequestState( req_id=req_id, prompt_token_ids=new_req_data.prompt_token_ids, @@ -1729,26 +1743,59 @@ def _dummy_pooler_run( req_num_tokens = num_tokens // num_reqs - dummy_metadata = PoolingMetadata( - prompt_lens=torch.tensor([h.shape[0] for h in hidden_states_list], - device=self.device), - prompt_token_ids=torch.zeros((num_reqs, req_num_tokens), - dtype=torch.int32, - device=self.device), - pooling_params=[PoolingParams()] * num_reqs) - - try: - pooler_output = self.model.pooler(hidden_states=hidden_states_list, - pooling_metadata=dummy_metadata) - except RuntimeError as e: - if 'out of memory' in str(e): - raise RuntimeError( - "NPU out of memory occurred when warming up pooler with " - f"{num_reqs} dummy requests. Please try lowering " - "`max_num_seqs` or `gpu_memory_utilization` when " - "initializing the engine.") from e - else: - raise e + if vllm_version_is("0.9.2"): + dummy_metadata = PoolingMetadata( + prompt_lens=torch.tensor( + [h.shape[0] for h in hidden_states_list], + device=self.device), + prompt_token_ids=torch.zeros((num_reqs, req_num_tokens), + dtype=torch.int32, + device=self.device), + pooling_params=[PoolingParams()] * num_reqs) + try: + pooler_output = self.model.pooler( + hidden_states=hidden_states_list, + pooling_metadata=dummy_metadata) + except RuntimeError as e: + if 'out of memory' in str(e): + raise RuntimeError( + "NPU out of memory occurred when warming up pooler with " + f"{num_reqs} dummy requests. 
Please try lowering " + "`max_num_seqs` or `gpu_memory_utilization` when " + "initializing the engine.") from e + else: + raise e + else: + model = cast(VllmModelForPooling, self.model) + dummy_task = self.get_supported_pooling_tasks()[0] + dummy_pooling_params = PoolingParams(task=dummy_task) + + to_update = model.pooler.get_pooling_updates(dummy_task) + assert to_update is not None + to_update.apply(dummy_pooling_params) + + dummy_metadata = PoolingMetadata( + prompt_lens=torch.tensor( + [h.shape[0] for h in hidden_states_list], + device=self.device), + prompt_token_ids=torch.zeros((num_reqs, req_num_tokens), + dtype=torch.int32, + device=self.device), + pooling_params=[dummy_pooling_params] * num_reqs) + + try: + pooler_output = model.pooler(hidden_states=hidden_states_list, + pooling_metadata=dummy_metadata) + except RuntimeError as e: + if 'out of memory' in str(e): + raise RuntimeError( + "NPU out of memory occurred when warming up pooler with " + f"{num_reqs} dummy requests. Please try lowering " + "`max_num_seqs` or `gpu_memory_utilization` when " + "initializing the engine.") from e + else: + raise e + return pooler_output def load_model(self) -> None: @@ -1767,8 +1814,9 @@ def load_model(self) -> None: QKVParallelLinear, RowParallelLinear)): module.weight.data = torch_npu.npu_format_cast( module.weight.data, ACL_FORMAT_FRACTAL_NZ) - if has_step_pooler(self.model): - self.input_batch.logits_processing_needs_token_ids = True + + if vllm_version_is("0.9.2") and has_step_pooler(self.model): + self.input_batch.logits_processing_needs_token_ids_bool = True if self.drafter: logger.info("Loading drafter model...") if isinstance(self.drafter, EagleProposer): @@ -2379,3 +2427,13 @@ def select_torchair_padded_batch_size(self, batch_size: int): if batch_size <= padded_batch_size < selected_batch_size: selected_batch_size = padded_batch_size return selected_batch_size + + def get_supported_pooling_tasks(self): + model = self.get_model() + if not is_pooling_model(model): + return [] + + return [ + task for task in get_args(PoolingTask) + if model.pooler.get_pooling_updates(task) + ] diff --git a/vllm_ascend/worker/npu_input_batch.py b/vllm_ascend/worker/npu_input_batch.py index 40c1043fc3..5a695af65d 100644 --- a/vllm_ascend/worker/npu_input_batch.py +++ b/vllm_ascend/worker/npu_input_batch.py @@ -35,6 +35,8 @@ from vllm.v1.utils import copy_slice from vllm.v1.worker.block_table import MultiGroupBlockTable +from vllm_ascend.utils import vllm_version_is + _SAMPLING_EPS = 1e-5 @@ -83,7 +85,6 @@ def __init__( pin_memory: bool, vocab_size: int, block_sizes: list[int], # The block_size of each kv cache group - logits_processing_needs_token_ids: bool = False, is_spec_decode: bool = False, ): self.is_spec_decode = is_spec_decode @@ -93,8 +94,6 @@ def __init__( self.device = device self.pin_memory = pin_memory self.vocab_size = vocab_size - self.logits_processing_needs_token_ids = ( - logits_processing_needs_token_ids) self._req_ids: list[Optional[str]] = [] self.req_id_to_index: dict[str, int] = {} @@ -247,6 +246,11 @@ def __init__( # req_index -> bad_words_token_ids self.bad_words_token_ids: dict[int, list[list[int]]] = {} + if vllm_version_is("0.9.2"): + self.logits_processing_needs_token_ids_bool = False + else: + self.logits_processing_needs_token_ids = np.zeros(max_num_reqs, + dtype=bool) self.req_output_token_ids: list[Optional[list[int]]] = [] @@ -383,9 +387,15 @@ def add_request( if sampling_params.bad_words_token_ids: self.bad_words_token_ids[ req_index] = sampling_params.bad_words_token_ids 
- else: + elif vllm_version_is("0.9.2"): assert request.pooling_params is not None self.pooling_params[req_id] = request.pooling_params + elif pooling_params := request.pooling_params: + self.pooling_params[req_id] = pooling_params + self.logits_processing_needs_token_ids[req_index] = ( + pooling_params.requires_token_ids) + else: + raise NotImplementedError(request) # Add request lora ID if request.lora_request: @@ -614,10 +624,15 @@ def _make_sampling_metadata(self) -> SamplingMetadata: self.presence_penalties, num_reqs) copy_slice(self.repetition_penalties_cpu_tensor, self.repetition_penalties, num_reqs) - - needs_prompt_token_ids = (not self.no_penalties or - (self.num_reqs > 0 - and self.logits_processing_needs_token_ids)) + if vllm_version_is("0.9.2"): + needs_prompt_token_ids = ( + not self.no_penalties + or (self.num_reqs > 0 + and self.logits_processing_needs_token_ids_bool)) + else: + needs_prompt_token_ids = ( + not self.no_penalties + or self.logits_processing_needs_token_ids[:num_reqs].any()) if needs_prompt_token_ids: # The prompt tokens are used only for applying penalties or # step pooling during the sampling/pooling process. diff --git a/vllm_ascend/worker/worker_v1.py b/vllm_ascend/worker/worker_v1.py index df03d508e4..111be6646c 100644 --- a/vllm_ascend/worker/worker_v1.py +++ b/vllm_ascend/worker/worker_v1.py @@ -355,3 +355,6 @@ def _init_profiler(self): torch_profiler_trace_dir)) else: return None + + def get_supported_pooling_tasks(self): + return self.model_runner.get_supported_pooling_tasks()
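
A minimal usage sketch of the pooling-task flow this patch wires up for the non-0.9.2 vllm code path. It is illustrative only and not part of the applied diff: the helper names `supported_pooling_tasks` and `make_pooling_params` are hypothetical stand-ins, while `PoolingTask`, `PoolingParams`, `is_pooling_model`, and `pooler.get_pooling_updates()` are the interfaces the patch itself imports and calls.

# Illustrative sketch mirroring get_supported_pooling_tasks() in
# model_runner_v1.py and the per-request handling in _update_states().
from typing import get_args

from vllm.model_executor.models.interfaces_base import is_pooling_model
from vllm.pooling_params import PoolingParams, PoolingTask


def supported_pooling_tasks(model) -> list[str]:
    # Non-pooling models advertise no tasks at all.
    if not is_pooling_model(model):
        return []
    # A task counts as supported when the pooler returns an update
    # object for it, exactly as the new runner helper does.
    return [
        task for task in get_args(PoolingTask)
        if model.pooler.get_pooling_updates(task)
    ]


def make_pooling_params(model, task: str) -> PoolingParams:
    # Each pooling request must name a supported task; the pooler then
    # patches the params in place before they reach the input batch,
    # matching the assertions added to _update_states().
    params = PoolingParams(task=task)
    updates = model.pooler.get_pooling_updates(task)
    assert updates is not None, f"{task=} is not supported by the model"
    updates.apply(params)
    return params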