
Commit ec2a676

Yikun and lijiaojiao authored and committed
Support v0.10.1 (vllm-project#2584)
### What this PR does / why we need it?
This patch also supports v0.10.1.

### Does this PR introduce _any_ user-facing change?
No

### How was this patch tested?
- CI passed
- test 0.10.1: vllm-project#2583
- vLLM version: v0.10.1.1
- vLLM main: vllm-project/vllm@321938e

Signed-off-by: Yikun Jiang <yikunkero@gmail.com>
Signed-off-by: lijiaojiao <lijiaojiao990304@163.com>
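Every hunk in this commit applies the same gate: each existing `vllm_version_is("0.10.1.1")` check gains an `or vllm_version_is("0.10.1")` so that both patch releases of the 0.10.1 line follow the already-supported code path. A minimal sketch of the pattern as it appears in the import blocks below (the `_on_v0_10_1` alias is added here purely for illustration; the diff repeats the expression inline):

```python
# Sketch of the version gate this patch extends. vllm_version_is() comes from
# vllm_ascend.utils; the DraftTokenIds fallback mirrors the hunks in
# worker_v1.py and model_runner_v1.py.
from vllm_ascend.utils import vllm_version_is

# Both patch releases of the 0.10.1 line share the legacy branch.
_on_v0_10_1 = vllm_version_is("0.10.1.1") or vllm_version_is("0.10.1")

if not _on_v0_10_1:
    # Newer vLLM exposes DraftTokenIds; on the 0.10.1 line it does not exist.
    from vllm.v1.outputs import DraftTokenIds
else:
    DraftTokenIds = None
```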
1 parent 2d265c5 commit ec2a676

File tree

8 files changed: +40 -38 lines changed


tests/ut/core/test_scheduler.py
Lines changed: 11 additions & 11 deletions

@@ -21,7 +21,7 @@
 from vllm_ascend.core.scheduler import AscendScheduler
 from vllm_ascend.utils import vllm_version_is

-if not vllm_version_is("0.10.1.1"):
+if not (vllm_version_is("0.10.1.1") or vllm_version_is("0.10.1")):
     from vllm.v1.outputs import DraftTokenIds
 else:
     DraftTokenIds = None
@@ -78,7 +78,7 @@ def make_output(scheduler):
     }
     sampled_token_ids = [[1000]] * len(scheduler.running)
     logprobs = None
-    if vllm_version_is("0.10.1.1"):
+    if vllm_version_is("0.10.1.1") or vllm_version_is("0.10.1"):
         modelrunner_output = ModelRunnerOutput(
             req_ids=req_ids,
             req_id_to_index=req_id_to_index,
@@ -297,7 +297,7 @@ def test_stop_via_update_from_output(self):
         scheduler.running.append(req)
         req.status = RequestStatus.RUNNING

-        if vllm_version_is("0.10.1.1"):
+        if vllm_version_is("0.10.1.1") or vllm_version_is("0.10.1"):
             scheduler_output = SchedulerOutput(
                 scheduled_new_reqs=[],
                 scheduled_cached_reqs=[],
@@ -384,7 +384,7 @@ def test_stop_via_update_from_output(self):
         scheduler.running.append(req)
         req.status = RequestStatus.RUNNING

-        if vllm_version_is("0.10.1.1"):
+        if vllm_version_is("0.10.1.1") or vllm_version_is("0.10.1"):
             scheduler_output = SchedulerOutput(
                 scheduled_new_reqs=[],
                 scheduled_cached_reqs=[],
@@ -468,7 +468,7 @@ def test_stop_via_update_from_output(self):
         scheduler.running.append(req)
         req.status = RequestStatus.RUNNING

-        if vllm_version_is("0.10.1.1"):
+        if vllm_version_is("0.10.1.1") or vllm_version_is("0.10.1"):
             scheduler_output = SchedulerOutput(
                 scheduled_new_reqs=[],
                 scheduled_cached_reqs=[],
@@ -549,7 +549,7 @@ def test_stop_via_update_from_output(self):
         scheduler.requests[requests[0].request_id] = requests[0]
         scheduler.running.append(requests[0])

-        if vllm_version_is("0.10.1.1"):
+        if vllm_version_is("0.10.1.1") or vllm_version_is("0.10.1"):
             scheduler_output = SchedulerOutput(
                 scheduled_new_reqs=[],
                 scheduled_cached_reqs=[],
@@ -645,7 +645,7 @@ def test_schedule_concurrent_batches(self):
             512)

         # Model output of the first request.
-        if vllm_version_is("0.10.1.1"):
+        if vllm_version_is("0.10.1.1") or vllm_version_is("0.10.1"):
             model_runner_output = ModelRunnerOutput(
                 req_ids=[requests[0].request_id],
                 req_id_to_index={requests[0].request_id: 0},
@@ -671,7 +671,7 @@ def test_schedule_concurrent_batches(self):
         # request is still running.
         scheduler.schedule()
         # Model output of the second request.
-        if vllm_version_is("0.10.1.1"):
+        if vllm_version_is("0.10.1.1") or vllm_version_is("0.10.1"):
             model_runner_output = ModelRunnerOutput(
                 req_ids=[requests[1].request_id],
                 req_id_to_index={requests[1].request_id: 0},
@@ -739,7 +739,7 @@ def test_schedule_spec_decoding_stats(self):
             req_id = requests[i].request_id
             self.assertEqual(output.num_scheduled_tokens[req_id], 1)
             self.assertNotIn(req_id, output.scheduled_spec_decode_tokens)
-        if vllm_version_is("0.10.1.1"):
+        if vllm_version_is("0.10.1.1") or vllm_version_is("0.10.1"):
             model_runner_output = ModelRunnerOutput(
                 req_ids=req_ids,
                 req_id_to_index=req_to_index,
@@ -760,7 +760,7 @@ def test_schedule_spec_decoding_stats(self):

         engine_core_outputs = scheduler.update_from_output(
             output, model_runner_output)
-        if not vllm_version_is("0.10.1.1"):
+        if not (vllm_version_is("0.10.1.1") or vllm_version_is("0.10.1")):
             scheduler.update_draft_token_ids(draft_token_ids)

         for i in range(len(requests)):
@@ -797,7 +797,7 @@ def test_schedule_spec_decoding_stats(self):
             else:
                 self.assertNotIn(req_id,
                                  output.scheduled_spec_decode_tokens)
-        if vllm_version_is("0.10.1.1"):
+        if vllm_version_is("0.10.1.1") or vllm_version_is("0.10.1"):
             model_runner_output = ModelRunnerOutput(
                 req_ids=req_ids,
                 req_id_to_index=req_to_index,

tests/ut/kv_connector/utils.py
Lines changed: 1 addition & 1 deletion

@@ -200,7 +200,7 @@ def create_model_runner_output(
     kv_connector_output = KVConnectorOutput(finished_sending=finished_sending,
                                             finished_recving=finished_recving)
     extra_args = {"kv_connector_output": kv_connector_output}
-    if vllm_version_is("0.10.1.1"):
+    if vllm_version_is("0.10.1.1") or vllm_version_is("0.10.1"):
        model_runner_output = ModelRunnerOutput(
            req_ids=req_ids,
            req_id_to_index=req_id_to_index,

vllm_ascend/core/scheduler.py
Lines changed: 6 additions & 6 deletions

@@ -33,7 +33,7 @@

 from vllm_ascend.utils import vllm_version_is

-if vllm_version_is("0.10.1.1"):
+if vllm_version_is("0.10.1.1") or vllm_version_is("0.10.1"):
     from vllm.v1.core.kv_cache_manager import KVCacheBlocks
 else:
     KVCacheBlocks = None
@@ -66,7 +66,7 @@ def schedule(self) -> SchedulerOutput:
         scheduled_running_reqs: list[Request] = []
         preempted_reqs: list[Request] = []

-        if vllm_version_is("0.10.1.1"):
+        if vllm_version_is("0.10.1.1") or vllm_version_is("0.10.1"):
             req_to_new_block_ids: dict[str, list[int]] = {}
         else:
             req_to_new_blocks: dict[str, KVCacheBlocks] = {}
@@ -227,7 +227,7 @@ def skip_cur_request():

             if self.lora_config and request.lora_request:
                 scheduled_loras.add(request.lora_request.lora_int_id)
-            if vllm_version_is("0.10.1.1"):
+            if vllm_version_is("0.10.1.1") or vllm_version_is("0.10.1"):
                 req_to_new_block_ids[request.request_id] = (
                     self.kv_cache_manager.get_block_ids(request.request_id))
             else:
@@ -320,7 +320,7 @@ def skip_cur_request():
                 # Schedule the request.
                 scheduled_running_reqs.append(request)
                 self.scheduled_req_ids.add(request.request_id)
-                if vllm_version_is("0.10.1.1"):
+                if vllm_version_is("0.10.1.1") or vllm_version_is("0.10.1"):
                     req_to_new_block_ids[request.request_id] = (
                         new_blocks.get_block_ids())
                 else:
@@ -362,7 +362,7 @@ def skip_cur_request():
                     any_request, len(self.running)))

         # Construct the scheduler output.
-        if vllm_version_is("0.10.1.1"):
+        if vllm_version_is("0.10.1.1") or vllm_version_is("0.10.1"):
             new_reqs_data = [
                 NewRequestData.from_request(
                     req, req_to_new_block_ids[req.request_id])
@@ -385,7 +385,7 @@ def skip_cur_request():
                 req_to_new_blocks)
             scheduled_cached_reqs = cached_reqs_data

-        if vllm_version_is("0.10.1.1"):
+        if vllm_version_is("0.10.1.1") or vllm_version_is("0.10.1"):
             scheduler_output = SchedulerOutput(
                 scheduled_new_reqs=new_reqs_data,
                 scheduled_cached_reqs=scheduled_cached_reqs,
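The branches above mean the Ascend scheduler carries two different bookkeeping shapes depending on the vLLM line: plain block-id lists on 0.10.1/0.10.1.1, and `KVCacheBlocks` objects on newer vLLM. A simplified, hypothetical sketch of that split (stand-in types; `legacy` abbreviates the two version checks):

```python
# Hypothetical sketch of the bookkeeping split in AscendScheduler.schedule().
# NewBlocks stands in for vLLM's KVCacheBlocks; only the mapping shapes matter.
from typing import Dict, List


class NewBlocks:
    """Stand-in for the object the KV-cache manager hands back."""

    def __init__(self, block_ids: List[int]) -> None:
        self._block_ids = block_ids

    def get_block_ids(self) -> List[int]:
        return self._block_ids


def record_allocation(request_id: str, new_blocks: NewBlocks, legacy: bool,
                      req_to_new_block_ids: Dict[str, List[int]],
                      req_to_new_blocks: Dict[str, NewBlocks]) -> None:
    if legacy:
        # v0.10.1 / v0.10.1.1: SchedulerOutput is built from plain block-id lists.
        req_to_new_block_ids[request_id] = new_blocks.get_block_ids()
    else:
        # Newer vLLM: the block objects are passed through unchanged.
        req_to_new_blocks[request_id] = new_blocks
```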

vllm_ascend/models/qwen3_moe.py
Lines changed: 1 addition & 1 deletion

@@ -254,7 +254,7 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
         quant_config = vllm_config.quant_config

         parallel_config = vllm_config.parallel_config
-        if vllm_version_is("0.10.1.1"):
+        if vllm_version_is("0.10.1.1") or vllm_version_is("0.10.1"):
             self.num_redundant_experts = parallel_config.num_redundant_experts
         else:
             eplb_config = parallel_config.eplb_config

vllm_ascend/sample/sampler.py
Lines changed: 3 additions & 3 deletions

@@ -5,7 +5,7 @@

 from vllm_ascend.utils import is_310p, vllm_version_is

-if not vllm_version_is("0.10.1.1"):
+if not (vllm_version_is("0.10.1.1") or vllm_version_is("0.10.1")):
     from vllm.config import LogprobsMode
     DEFAULT_LOGPROBS_MODE = LogprobsMode.RAW_LOGPROBS
 else:
@@ -68,7 +68,7 @@ def _apply_top_k_top_p(
     def forward_native(self, logits, generators, k, p):
         """Override pytorch native implementation to torch_npu"""
         logits = self._apply_top_k_top_p(logits, k, p)
-        if not vllm_version_is("0.10.1.1"):
+        if not (vllm_version_is("0.10.1.1") or vllm_version_is("0.10.1")):

             logits_to_return = None
             if self.logprobs_mode == LogprobsMode.PROCESSED_LOGITS:
@@ -79,7 +79,7 @@ def forward_native(self, logits, generators, k, p):

         probs = logits.softmax(dim=-1, dtype=torch.float32)
         output = None
-        if vllm_version_is("0.10.1.1"):
+        if vllm_version_is("0.10.1.1") or vllm_version_is("0.10.1"):
             output = random_sample(probs, generators)
         else:
             output = (random_sample(probs, generators), logits_to_return)
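The sampler change follows the same split: on the 0.10.1 line `forward_native` returns only the sampled token ids, while newer vLLM expects a `(sampled, logits_to_return)` pair so processed logits can be surfaced when the logprobs mode requests them. A rough, self-contained sketch of the two return shapes (`torch.multinomial` stands in for vLLM's `random_sample` helper):

```python
# Rough sketch of the dual return contract in forward_native(); the real code
# uses vLLM's random_sample() and LogprobsMode, which are only mimicked here.
import torch


def sample(logits: torch.Tensor, legacy: bool, want_processed_logits: bool = False):
    probs = logits.softmax(dim=-1, dtype=torch.float32)
    sampled = torch.multinomial(probs, num_samples=1).squeeze(-1)
    if legacy:
        # v0.10.1 / v0.10.1.1 callers expect only the sampled token ids.
        return sampled
    # Newer vLLM also threads the (optionally processed) logits back out.
    logits_to_return = logits if want_processed_logits else None
    return sampled, logits_to_return


# Toy usage: two requests over a four-token vocabulary.
tokens = sample(torch.randn(2, 4), legacy=True)
tokens, extra = sample(torch.randn(2, 4), legacy=False)
```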

vllm_ascend/worker/model_runner_v1.py
Lines changed: 16 additions & 14 deletions

@@ -96,7 +96,7 @@
 from vllm_ascend.worker.mtp_proposer_v1 import MtpProposer
 from vllm_ascend.worker.npu_input_batch import CachedRequestState, InputBatch

-if not vllm_version_is("0.10.1.1"):
+if not (vllm_version_is("0.10.1.1") or vllm_version_is("0.10.1")):
     from vllm.v1.outputs import DraftTokenIds
 else:
     DraftTokenIds = None
@@ -384,7 +384,7 @@ def _update_states(self, scheduler_output: "SchedulerOutput") -> None:
         # Remove finished requests from the cached states.
         for req_id in scheduler_output.finished_req_ids:
             self.requests.pop(req_id, None)
-            if vllm_version_is("0.10.1.1"):
+            if vllm_version_is("0.10.1.1") or vllm_version_is("0.10.1"):
                 self.encoder_cache.pop(req_id, None)
         # Remove the finished requests from the persistent batch.
         # NOTE(woosuk): There could be an edge case where finished_req_ids and
@@ -394,7 +394,7 @@ def _update_states(self, scheduler_output: "SchedulerOutput") -> None:
         # and handling the second as a new request.
         for req_id in scheduler_output.finished_req_ids:
             self.input_batch.remove_request(req_id)
-        if vllm_version_is("0.10.1.1"):
+        if vllm_version_is("0.10.1.1") or vllm_version_is("0.10.1"):
             # Free the cached encoder outputs.
             for req_id, input_id in scheduler_output.free_encoder_input_ids:
                 encoder_outputs = self.encoder_cache.get(req_id)
@@ -455,9 +455,10 @@ def _update_states(self, scheduler_output: "SchedulerOutput") -> None:
                 lora_request=new_req_data.lora_request,
                 **({
                     "mm_hashes": new_req_data.mm_hashes
-                } if not vllm_version_is("0.10.1.1") else {
-                    "mm_hashes": None
-                }),
+                } if not (vllm_version_is("0.10.1.1")
+                          or vllm_version_is("0.10.1")) else {
+                    "mm_hashes": None
+                }),
             )

             # Only relevant for models using M-RoPE (e.g, Qwen2-VL)
@@ -893,13 +894,13 @@ def _execute_mm_encoder(self, scheduler_output: "SchedulerOutput"):

         # Batch the multi-modal inputs.
         mm_kwargs = list[MultiModalKwargsItem]()
-        if vllm_version_is("0.10.1.1"):
+        if vllm_version_is("0.10.1.1") or vllm_version_is("0.10.1"):
             req_ids_pos = list[tuple[str, int, PlaceholderRange]]()
         else:
             mm_hashes_pos = list[tuple[str, PlaceholderRange]]()
         for req_id, encoder_input_ids in scheduled_encoder_inputs.items():
             req_state = self.requests[req_id]
-            if vllm_version_is("0.10.1.1"):
+            if vllm_version_is("0.10.1.1") or vllm_version_is("0.10.1"):
                 for mm_input_id in encoder_input_ids:
                     mm_kwargs.append(req_state.mm_kwargs[mm_input_id])
                     req_ids_pos.append((req_id, mm_input_id,
@@ -942,7 +943,7 @@ def _execute_mm_encoder(self, scheduler_output: "SchedulerOutput"):

         for output in curr_group_outputs:
             encoder_outputs.append(output)
-        if vllm_version_is("0.10.1.1"):
+        if vllm_version_is("0.10.1.1") or vllm_version_is("0.10.1"):
             # Cache the encoder outputs.
             for (req_id, input_id, pos_info), output in zip(
                     req_ids_pos,
@@ -974,7 +975,7 @@ def _gather_mm_embeddings(
             req_state = self.requests[req_id]
             num_computed_tokens = req_state.num_computed_tokens
             mm_positions = req_state.mm_positions
-            if not vllm_version_is("0.10.1.1"):
+            if not (vllm_version_is("0.10.1.1") or vllm_version_is("0.10.1")):
                 mm_hashes = req_state.mm_hashes
             for i, pos_info in enumerate(mm_positions):
                 start_pos = pos_info.offset
@@ -993,7 +994,7 @@ def _gather_mm_embeddings(
                     continue

                 start_idx = max(num_computed_tokens - start_pos, 0)
-                if vllm_version_is("0.10.1.1"):
+                if vllm_version_is("0.10.1.1") or vllm_version_is("0.10.1"):
                     end_idx = min(
                         num_computed_tokens - start_pos + num_scheduled_tokens,
                         num_encoder_tokens)
@@ -1719,7 +1720,8 @@ def execute_model(
             logits = None
         else:
             if self.input_batch.pooling_params:
-                if vllm_version_is("0.10.1.1"):
+                if vllm_version_is("0.10.1.1") or vllm_version_is(
+                        "0.10.1"):
                     return self._pool_v010(
                         hidden_states,
                         scheduler_output.total_num_scheduled_tokens,
@@ -1867,7 +1869,7 @@ def execute_model(

         extra_args = ({"kv_connector_output": kv_connector_output})

-        if vllm_version_is("0.10.1.1"):
+        if vllm_version_is("0.10.1.1") or vllm_version_is("0.10.1"):
             model_runner_output = ModelRunnerOutput(
                 req_ids=self.input_batch.req_ids,
                 req_id_to_index=self.input_batch.req_id_to_index,
@@ -2191,7 +2193,7 @@ def _dummy_pooler_run_task(
         dummy_pooling_params = PoolingParams(task=task)
         to_update = model.pooler.get_pooling_updates(task)
         to_update.apply(dummy_pooling_params)
-        if vllm_version_is("0.10.1.1"):
+        if vllm_version_is("0.10.1.1") or vllm_version_is("0.10.1"):
             dummy_prompt_lens = torch.tensor(
                 [h.shape[0] for h in hidden_states_list],
                 device=self.device,
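One hunk worth calling out is the `CachedRequestState` construction, where the version check sits inside a conditional dict splat so the `mm_hashes` keyword switches between real hashes and `None`. A tiny standalone illustration of that idiom (`make_state` is hypothetical, not part of the codebase):

```python
# Standalone illustration of the conditional-kwargs idiom used when building
# CachedRequestState in _update_states(); make_state() is hypothetical.
def make_state(request_id: str, **kwargs) -> dict:
    return {"request_id": request_id, **kwargs}


# True on newer vLLM, i.e. not (vllm_version_is("0.10.1.1") or vllm_version_is("0.10.1")).
on_new_vllm = True

state = make_state(
    "req-0",
    **({
        "mm_hashes": ["hash-a", "hash-b"]
    } if on_new_vllm else {
        "mm_hashes": None
    }),
)
print(state)  # {'request_id': 'req-0', 'mm_hashes': ['hash-a', 'hash-b']}
```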

vllm_ascend/worker/npu_input_batch.py
Lines changed: 1 addition & 1 deletion

@@ -726,7 +726,7 @@ def pooling_metadata(self) -> PoolingMetadata:
         pooling_params = [
             self.pooling_params[req_id] for req_id in self.req_ids
         ]
-        if vllm_version_is("0.10.1.1"):
+        if vllm_version_is("0.10.1.1") or vllm_version_is("0.10.1"):
             return PoolingMetadata(
                 prompt_lens=torch.from_numpy(
                     self.num_prompt_tokens[:self.num_reqs]).to(self.device),

vllm_ascend/worker/worker_v1.py
Lines changed: 1 addition & 1 deletion

@@ -50,7 +50,7 @@
                               try_register_lib, vllm_version_is)
 from vllm_ascend.worker.model_runner_v1 import NPUModelRunner

-if not vllm_version_is("0.10.1.1"):
+if not (vllm_version_is("0.10.1.1") or vllm_version_is("0.10.1")):
     from vllm.v1.outputs import DraftTokenIds
 else:
     DraftTokenIds = None
