Commit 2b726d8

[CI] Fix broken CI (#1889)
1. vLLM commit vllm-project/vllm@45badd0 changed the pooling check logic, which broke vLLM Ascend.
2. vLLM commit vllm-project/vllm@3e04107 requires a newer version of transformers. The transformers version bug has since been fixed by vllm-project/vllm@e936e40, so it is now safe to remove the version limit.
3. vLLM commit vllm-project/vllm@2179372 added a new input `enable_eplb` to the FusedMoE ops.

This PR fixes the broken CI.

- vLLM version: v0.9.2
- vLLM main: vllm-project/vllm@6a971ed

Signed-off-by: wangxiyuan <wangxiyuan1007@gmail.com>
1 parent 2ee9046 commit 2b726d8
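
All three fixes lean on the same compatibility pattern that appears throughout the diffs below: code paths that differ between the released vLLM v0.9.2 and current vLLM main are gated on the installed version via the `vllm_version_is` helper imported from `vllm_ascend.utils`. The snippet below is only a minimal sketch of that idea; `vllm_version_is_sketch` is a hypothetical stand-in, not the real helper's implementation.

# Minimal sketch of the version-gating idiom used in this commit.
# `vllm_version_is_sketch` is a hypothetical stand-in for the real
# `vllm_version_is` helper in vllm_ascend.utils.
from importlib.metadata import PackageNotFoundError, version


def vllm_version_is_sketch(target: str) -> bool:
    try:
        return version("vllm") == target
    except PackageNotFoundError:
        return False


if vllm_version_is_sketch("0.9.2"):
    pass  # follow the v0.9.2 interfaces (has_step_pooler, a single bool flag)
else:
    pass  # follow vLLM main (PoolingTask, per-request flags, interfaces_base)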

File tree: 6 files changed (+128, -54 lines)

pyproject.toml

Lines changed: 0 additions & 2 deletions
@@ -19,7 +19,5 @@ requires = [
     "msgpack",
     "quart",
     "numba",
-    # Remove after https://github.com/vllm-project/vllm-ascend/issues/1470
-    "transformers==4.52.4",
 ]
 build-backend = "setuptools.build_meta"

requirements.txt

Lines changed: 0 additions & 3 deletions
@@ -25,6 +25,3 @@ numba
 --pre
 --extra-index-url https://mirrors.huaweicloud.com/ascend/repos/pypi
 torch-npu==2.5.1.post1.dev20250619
-
-# Remove after https://github.com/vllm-project/vllm-ascend/issues/1470
-transformers==4.52.4

vllm_ascend/ops/common_fused_moe.py

Lines changed: 20 additions & 17 deletions
@@ -40,23 +40,26 @@ def unquantized_fused_moe_init_func(self, *args, **kwargs):
 
 
 def forward_oot(
-    self,
-    layer: torch.nn.Module,
-    x: torch.Tensor,
-    use_grouped_topk: bool,
-    top_k: int,
-    router_logits: torch.Tensor,
-    renormalize: bool,
-    topk_group: Optional[int] = None,
-    num_expert_group: Optional[int] = None,
-    custom_routing_function: Optional[Callable] = None,
-    scoring_func: str = "softmax",
-    e_score_correction_bias: Optional[torch.Tensor] = None,
-    global_num_experts: Optional[int] = None,
-    expert_map: Optional[torch.Tensor] = None,
-    apply_router_weight_on_input: bool = False,
-    activation: str = "silu",
-) -> torch.Tensor:
+        self,
+        layer: torch.nn.Module,
+        x: torch.Tensor,
+        use_grouped_topk: bool,
+        top_k: int,
+        router_logits: torch.Tensor,
+        renormalize: bool,
+        topk_group: Optional[int] = None,
+        num_expert_group: Optional[int] = None,
+        custom_routing_function: Optional[Callable] = None,
+        scoring_func: str = "softmax",
+        e_score_correction_bias: Optional[torch.Tensor] = None,
+        global_num_experts: Optional[int] = None,
+        expert_map: Optional[torch.Tensor] = None,
+        apply_router_weight_on_input: bool = False,
+        activation: str = "silu",
+        enable_eplb: bool = False,
+        expert_load_view: Optional[torch.Tensor] = None,
+        logical_to_physical_map: Optional[torch.Tensor] = None,
+        logical_replica_count: Optional[torch.Tensor] = None) -> torch.Tensor:
 
     if SELECT_GATING_TOPK_SOTFMAX_EXPERTS:
         topk_weights, topk_ids = select_gating_top_k_softmax_experts(
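
To make the reason for the wider signature concrete: newer vLLM passes EPLB-related keyword arguments when it calls the MoE forward path, so an out-of-tree override that does not accept them fails with a TypeError. The self-contained sketch below (all names except the four EPLB parameters are hypothetical) shows how accepting the new arguments with defaults keeps both old and new call sites working; whether vLLM Ascend actually consumes the EPLB state is not shown in the hunk above.

from typing import Optional

import torch


class MoEOverrideSketch:
    """Hypothetical override illustrating the signature-compatibility idea."""

    def forward(self,
                x: torch.Tensor,
                top_k: int,
                enable_eplb: bool = False,
                expert_load_view: Optional[torch.Tensor] = None,
                logical_to_physical_map: Optional[torch.Tensor] = None,
                logical_replica_count: Optional[torch.Tensor] = None
                ) -> torch.Tensor:
        # The defaults keep pre-EPLB callers working; newer callers can pass
        # EPLB state, which this sketch simply accepts and ignores.
        return x / top_k


moe = MoEOverrideSketch()
moe.forward(torch.ones(4), top_k=2)                    # old-style call
moe.forward(torch.ones(4), top_k=2, enable_eplb=True)  # new-style call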

vllm_ascend/worker/model_runner_v1.py

Lines changed: 82 additions & 24 deletions
@@ -24,7 +24,7 @@
 import weakref
 from contextlib import contextmanager, nullcontext
 from dataclasses import dataclass
-from typing import TYPE_CHECKING, Dict, List, Optional, Union
+from typing import TYPE_CHECKING, Dict, List, Optional, Union, cast, get_args
 
 import numpy as np
 import numpy.typing as npt
@@ -45,7 +45,8 @@
 from vllm.model_executor.layers.fused_moe import FusedMoE
 from vllm.model_executor.layers.rotary_embedding import MRotaryEmbedding
 from vllm.model_executor.model_loader import get_model
-from vllm.model_executor.models.interfaces import has_step_pooler
+from vllm.model_executor.models.interfaces_base import (VllmModelForPooling,
+                                                        is_pooling_model)
 from vllm.multimodal import MULTIMODAL_REGISTRY
 from vllm.multimodal.inputs import MultiModalKwargs, PlaceholderRange
 from vllm.multimodal.utils import group_mm_inputs_by_modality
@@ -88,8 +89,10 @@
 from vllm_ascend.worker.npu_input_batch import CachedRequestState, InputBatch
 
 if vllm_version_is("0.9.2"):
+    from vllm.model_executor.models.interfaces import has_step_pooler
     from vllm.v1.utils import bind_kv_cache
 else:
+    from vllm.pooling_params import PoolingTask
     from vllm.v1.worker.utils import bind_kv_cache
 
 if TYPE_CHECKING:
@@ -395,13 +398,24 @@ def _update_states(self, scheduler_output: "SchedulerOutput") -> None:
         for new_req_data in scheduler_output.scheduled_new_reqs:
             req_id = new_req_data.req_id
             sampling_params = new_req_data.sampling_params
+            pooling_params = new_req_data.pooling_params
             if sampling_params and \
                 sampling_params.sampling_type == SamplingType.RANDOM_SEED:
                 generator = torch.Generator(device=self.device)
                 generator.manual_seed(sampling_params.seed)
             else:
                 generator = None
 
+            if not vllm_version_is("0.9.2") and pooling_params:
+                assert pooling_params.task is not None, (
+                    "You did not set `task` in the API")
+                model = cast(VllmModelForPooling, self.model)
+                to_update = (model.pooler.get_pooling_updates(
+                    pooling_params.task))
+                assert to_update is not None, (
+                    f"{pooling_params.task=} is not supported by the model")
+                to_update.apply(pooling_params)
+
             self.requests[req_id] = CachedRequestState(
                 req_id=req_id,
                 prompt_token_ids=new_req_data.prompt_token_ids,
@@ -1729,26 +1743,59 @@ def _dummy_pooler_run(
 
         req_num_tokens = num_tokens // num_reqs
 
-        dummy_metadata = PoolingMetadata(
-            prompt_lens=torch.tensor([h.shape[0] for h in hidden_states_list],
-                                     device=self.device),
-            prompt_token_ids=torch.zeros((num_reqs, req_num_tokens),
-                                         dtype=torch.int32,
-                                         device=self.device),
-            pooling_params=[PoolingParams()] * num_reqs)
-
-        try:
-            pooler_output = self.model.pooler(hidden_states=hidden_states_list,
-                                              pooling_metadata=dummy_metadata)
-        except RuntimeError as e:
-            if 'out of memory' in str(e):
-                raise RuntimeError(
-                    "NPU out of memory occurred when warming up pooler with "
-                    f"{num_reqs} dummy requests. Please try lowering "
-                    "`max_num_seqs` or `gpu_memory_utilization` when "
-                    "initializing the engine.") from e
-            else:
-                raise e
+        if vllm_version_is("0.9.2"):
+            dummy_metadata = PoolingMetadata(
+                prompt_lens=torch.tensor(
+                    [h.shape[0] for h in hidden_states_list],
+                    device=self.device),
+                prompt_token_ids=torch.zeros((num_reqs, req_num_tokens),
+                                             dtype=torch.int32,
+                                             device=self.device),
+                pooling_params=[PoolingParams()] * num_reqs)
+            try:
+                pooler_output = self.model.pooler(
+                    hidden_states=hidden_states_list,
+                    pooling_metadata=dummy_metadata)
+            except RuntimeError as e:
+                if 'out of memory' in str(e):
+                    raise RuntimeError(
+                        "NPU out of memory occurred when warming up pooler with "
+                        f"{num_reqs} dummy requests. Please try lowering "
+                        "`max_num_seqs` or `gpu_memory_utilization` when "
+                        "initializing the engine.") from e
+                else:
+                    raise e
+        else:
+            model = cast(VllmModelForPooling, self.model)
+            dummy_task = self.get_supported_pooling_tasks()[0]
+            dummy_pooling_params = PoolingParams(task=dummy_task)
+
+            to_update = model.pooler.get_pooling_updates(dummy_task)
+            assert to_update is not None
+            to_update.apply(dummy_pooling_params)
+
+            dummy_metadata = PoolingMetadata(
+                prompt_lens=torch.tensor(
+                    [h.shape[0] for h in hidden_states_list],
+                    device=self.device),
+                prompt_token_ids=torch.zeros((num_reqs, req_num_tokens),
+                                             dtype=torch.int32,
+                                             device=self.device),
+                pooling_params=[dummy_pooling_params] * num_reqs)
+
+            try:
+                pooler_output = model.pooler(hidden_states=hidden_states_list,
+                                             pooling_metadata=dummy_metadata)
+            except RuntimeError as e:
+                if 'out of memory' in str(e):
+                    raise RuntimeError(
+                        "NPU out of memory occurred when warming up pooler with "
+                        f"{num_reqs} dummy requests. Please try lowering "
+                        "`max_num_seqs` or `gpu_memory_utilization` when "
+                        "initializing the engine.") from e
+                else:
+                    raise e
+
         return pooler_output
 
     def load_model(self) -> None:
@@ -1767,8 +1814,9 @@ def load_model(self) -> None:
                                     QKVParallelLinear, RowParallelLinear)):
                     module.weight.data = torch_npu.npu_format_cast(
                         module.weight.data, ACL_FORMAT_FRACTAL_NZ)
-        if has_step_pooler(self.model):
-            self.input_batch.logits_processing_needs_token_ids = True
+
+        if vllm_version_is("0.9.2") and has_step_pooler(self.model):
+            self.input_batch.logits_processing_needs_token_ids_bool = True
         if self.drafter:
             logger.info("Loading drafter model...")
             if isinstance(self.drafter, EagleProposer):
@@ -2379,3 +2427,13 @@ def select_torchair_padded_batch_size(self, batch_size: int):
             if batch_size <= padded_batch_size < selected_batch_size:
                 selected_batch_size = padded_batch_size
         return selected_batch_size
+
+    def get_supported_pooling_tasks(self):
+        model = self.get_model()
+        if not is_pooling_model(model):
+            return []
+
+        return [
+            task for task in get_args(PoolingTask)
+            if model.pooler.get_pooling_updates(task)
+        ]
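
For readers unfamiliar with the new helper, `get_supported_pooling_tasks` leans on two things from newer vLLM: `PoolingTask` is a `Literal` type, so `typing.get_args` enumerates its members, and `pooler.get_pooling_updates(task)` returns something truthy only for supported tasks. Below is a self-contained sketch with hypothetical stand-ins (`PoolingTaskSketch`, `PoolerSketch`); the real task names and pooler API are defined by vLLM.

from typing import Literal, Optional, get_args

# Hypothetical stand-in for vLLM's PoolingTask Literal.
PoolingTaskSketch = Literal["encode", "embed", "classify", "score"]


class PoolerSketch:
    def get_pooling_updates(self, task: str) -> Optional[dict]:
        # Pretend only two tasks are supported; the real pooler returns a
        # pooling-updates object (or None) per task.
        return {"task": task} if task in ("embed", "classify") else None


def get_supported_pooling_tasks_sketch(pooler: PoolerSketch) -> list[str]:
    return [
        task for task in get_args(PoolingTaskSketch)
        if pooler.get_pooling_updates(task)
    ]


print(get_supported_pooling_tasks_sketch(PoolerSketch()))  # ['embed', 'classify']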

vllm_ascend/worker/npu_input_batch.py

Lines changed: 23 additions & 8 deletions
@@ -35,6 +35,8 @@
 from vllm.v1.utils import copy_slice
 from vllm.v1.worker.block_table import MultiGroupBlockTable
 
+from vllm_ascend.utils import vllm_version_is
+
 _SAMPLING_EPS = 1e-5
 
 
@@ -83,7 +85,6 @@ def __init__(
         pin_memory: bool,
         vocab_size: int,
         block_sizes: list[int],  # The block_size of each kv cache group
-        logits_processing_needs_token_ids: bool = False,
         is_spec_decode: bool = False,
     ):
         self.is_spec_decode = is_spec_decode
@@ -93,8 +94,6 @@
         self.device = device
         self.pin_memory = pin_memory
         self.vocab_size = vocab_size
-        self.logits_processing_needs_token_ids = (
-            logits_processing_needs_token_ids)
 
         self._req_ids: list[Optional[str]] = []
         self.req_id_to_index: dict[str, int] = {}
@@ -247,6 +246,11 @@ def __init__(
 
         # req_index -> bad_words_token_ids
         self.bad_words_token_ids: dict[int, list[list[int]]] = {}
+        if vllm_version_is("0.9.2"):
+            self.logits_processing_needs_token_ids_bool = False
+        else:
+            self.logits_processing_needs_token_ids = np.zeros(max_num_reqs,
+                                                              dtype=bool)
 
         self.req_output_token_ids: list[Optional[list[int]]] = []
 
@@ -383,9 +387,15 @@ def add_request(
             if sampling_params.bad_words_token_ids:
                 self.bad_words_token_ids[
                     req_index] = sampling_params.bad_words_token_ids
-        else:
+        elif vllm_version_is("0.9.2"):
             assert request.pooling_params is not None
             self.pooling_params[req_id] = request.pooling_params
+        elif pooling_params := request.pooling_params:
+            self.pooling_params[req_id] = pooling_params
+            self.logits_processing_needs_token_ids[req_index] = (
+                pooling_params.requires_token_ids)
+        else:
+            raise NotImplementedError(request)
 
         # Add request lora ID
         if request.lora_request:
@@ -614,10 +624,15 @@ def _make_sampling_metadata(self) -> SamplingMetadata:
                    self.presence_penalties, num_reqs)
         copy_slice(self.repetition_penalties_cpu_tensor,
                    self.repetition_penalties, num_reqs)
-
-        needs_prompt_token_ids = (not self.no_penalties or
-                                  (self.num_reqs > 0
-                                   and self.logits_processing_needs_token_ids))
+        if vllm_version_is("0.9.2"):
+            needs_prompt_token_ids = (
+                not self.no_penalties
+                or (self.num_reqs > 0
+                    and self.logits_processing_needs_token_ids_bool))
+        else:
+            needs_prompt_token_ids = (
+                not self.no_penalties
+                or self.logits_processing_needs_token_ids[:num_reqs].any())
         if needs_prompt_token_ids:
             # The prompt tokens are used only for applying penalties or
             # step pooling during the sampling/pooling process.
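
The input-batch change above replaces a single batch-wide flag (kept as `logits_processing_needs_token_ids_bool` on the v0.9.2 path) with a per-request boolean array that is filled in `add_request` and reduced over the active slice when building sampling metadata. A small numpy sketch of that reduction, with illustrative values:

import numpy as np

max_num_reqs = 8
num_reqs = 2
no_penalties = True

# Per-request flags; in the real code they come from
# pooling_params.requires_token_ids inside add_request().
logits_processing_needs_token_ids = np.zeros(max_num_reqs, dtype=bool)
logits_processing_needs_token_ids[1] = True

needs_prompt_token_ids = (
    not no_penalties
    or bool(logits_processing_needs_token_ids[:num_reqs].any()))
print(needs_prompt_token_ids)  # True: request 1 needs its prompt token ids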

vllm_ascend/worker/worker_v1.py

Lines changed: 3 additions & 0 deletions
@@ -355,3 +355,6 @@ def _init_profiler(self):
                 torch_profiler_trace_dir))
         else:
             return None
+
+    def get_supported_pooling_tasks(self):
+        return self.model_runner.get_supported_pooling_tasks()
