diff --git a/.github/workflows/vllm_ascend_test.yaml b/.github/workflows/vllm_ascend_test.yaml
index f1be62559a..2e16dff84c 100644
--- a/.github/workflows/vllm_ascend_test.yaml
+++ b/.github/workflows/vllm_ascend_test.yaml
@@ -81,7 +81,7 @@ jobs:
       VLLM_USE_MODELSCOPE: True
     strategy:
       matrix:
-        vllm_version: [v0.10.1.1, main]
+        vllm_version: [v0.10.1]
     steps:
       - name: Install packages
         run: |
@@ -137,7 +137,7 @@ jobs:
       max-parallel: 2
       matrix:
         os: [linux-aarch64-a2-1]
-        vllm_version: [v0.10.1.1, main]
+        vllm_version: [v0.10.1]
     name: singlecard e2e test
     runs-on: ${{ matrix.os }}
     container:
@@ -213,13 +213,11 @@ jobs:
           --ignore=tests/e2e/singlecard/spec_decode_v1/test_v1_spec_decode.py \
           --ignore=tests/e2e/singlecard/test_offline_inference_310p.py
   e2e-2-cards:
-    needs: [e2e]
-    if: ${{ needs.e2e.result == 'success' }}
     strategy:
       max-parallel: 2
       matrix:
         os: [linux-aarch64-a2-2]
-        vllm_version: [v0.10.1.1, main]
+        vllm_version: [v0.10.1]
     name: multicard e2e test
     runs-on: ${{ matrix.os }}
     container:
diff --git a/tests/ut/core/test_scheduler.py b/tests/ut/core/test_scheduler.py
index 5c793d1017..5a844b8969 100644
--- a/tests/ut/core/test_scheduler.py
+++ b/tests/ut/core/test_scheduler.py
@@ -21,7 +21,7 @@
 from vllm_ascend.core.scheduler import AscendScheduler
 from vllm_ascend.utils import vllm_version_is
 
-if not vllm_version_is("0.10.1.1"):
+if not (vllm_version_is("0.10.1.1") or vllm_version_is("0.10.1")):
     from vllm.v1.outputs import DraftTokenIds
 else:
     DraftTokenIds = None
@@ -78,7 +78,7 @@ def make_output(scheduler):
     }
     sampled_token_ids = [[1000]] * len(scheduler.running)
     logprobs = None
-    if vllm_version_is("0.10.1.1"):
+    if vllm_version_is("0.10.1.1") or vllm_version_is("0.10.1"):
         modelrunner_output = ModelRunnerOutput(
             req_ids=req_ids,
             req_id_to_index=req_id_to_index,
@@ -297,7 +297,7 @@ def test_stop_via_update_from_output(self):
         scheduler.running.append(req)
         req.status = RequestStatus.RUNNING
 
-        if vllm_version_is("0.10.1.1"):
+        if vllm_version_is("0.10.1.1") or vllm_version_is("0.10.1"):
             scheduler_output = SchedulerOutput(
                 scheduled_new_reqs=[],
                 scheduled_cached_reqs=[],
@@ -384,7 +384,7 @@ def test_stop_via_update_from_output(self):
         scheduler.running.append(req)
         req.status = RequestStatus.RUNNING
 
-        if vllm_version_is("0.10.1.1"):
+        if vllm_version_is("0.10.1.1") or vllm_version_is("0.10.1"):
             scheduler_output = SchedulerOutput(
                 scheduled_new_reqs=[],
                 scheduled_cached_reqs=[],
@@ -468,7 +468,7 @@ def test_stop_via_update_from_output(self):
         scheduler.running.append(req)
         req.status = RequestStatus.RUNNING
 
-        if vllm_version_is("0.10.1.1"):
+        if vllm_version_is("0.10.1.1") or vllm_version_is("0.10.1"):
             scheduler_output = SchedulerOutput(
                 scheduled_new_reqs=[],
                 scheduled_cached_reqs=[],
@@ -549,7 +549,7 @@ def test_stop_via_update_from_output(self):
         scheduler.requests[requests[0].request_id] = requests[0]
         scheduler.running.append(requests[0])
 
-        if vllm_version_is("0.10.1.1"):
+        if vllm_version_is("0.10.1.1") or vllm_version_is("0.10.1"):
             scheduler_output = SchedulerOutput(
                 scheduled_new_reqs=[],
                 scheduled_cached_reqs=[],
@@ -645,7 +645,7 @@ def test_schedule_concurrent_batches(self):
                          512)
 
         # Model output of the first request.
-        if vllm_version_is("0.10.1.1"):
+        if vllm_version_is("0.10.1.1") or vllm_version_is("0.10.1"):
            model_runner_output = ModelRunnerOutput(
                 req_ids=[requests[0].request_id],
                 req_id_to_index={requests[0].request_id: 0},
@@ -671,7 +671,7 @@ def test_schedule_concurrent_batches(self):
        # request is still running.
        scheduler.schedule()
        # Model output of the second request.
-        if vllm_version_is("0.10.1.1"):
+        if vllm_version_is("0.10.1.1") or vllm_version_is("0.10.1"):
            model_runner_output = ModelRunnerOutput(
                 req_ids=[requests[1].request_id],
                 req_id_to_index={requests[1].request_id: 0},
@@ -739,7 +739,7 @@ def test_schedule_spec_decoding_stats(self):
             req_id = requests[i].request_id
             self.assertEqual(output.num_scheduled_tokens[req_id], 1)
             self.assertNotIn(req_id, output.scheduled_spec_decode_tokens)
-        if vllm_version_is("0.10.1.1"):
+        if vllm_version_is("0.10.1.1") or vllm_version_is("0.10.1"):
            model_runner_output = ModelRunnerOutput(
                 req_ids=req_ids,
                 req_id_to_index=req_to_index,
@@ -760,7 +760,7 @@ def test_schedule_spec_decoding_stats(self):
         engine_core_outputs = scheduler.update_from_output(
             output, model_runner_output)
 
-        if not vllm_version_is("0.10.1.1"):
+        if not (vllm_version_is("0.10.1.1") or vllm_version_is("0.10.1")):
             scheduler.update_draft_token_ids(draft_token_ids)
 
         for i in range(len(requests)):
@@ -797,7 +797,7 @@ def test_schedule_spec_decoding_stats(self):
             else:
                 self.assertNotIn(req_id, output.scheduled_spec_decode_tokens)
 
-        if vllm_version_is("0.10.1.1"):
+        if vllm_version_is("0.10.1.1") or vllm_version_is("0.10.1"):
             model_runner_output = ModelRunnerOutput(
                 req_ids=req_ids,
                 req_id_to_index=req_to_index,
diff --git a/tests/ut/kv_connector/utils.py b/tests/ut/kv_connector/utils.py
index c2e0a1f955..cb51079227 100644
--- a/tests/ut/kv_connector/utils.py
+++ b/tests/ut/kv_connector/utils.py
@@ -200,7 +200,7 @@ def create_model_runner_output(
     kv_connector_output = KVConnectorOutput(finished_sending=finished_sending,
                                             finished_recving=finished_recving)
     extra_args = {"kv_connector_output": kv_connector_output}
-    if vllm_version_is("0.10.1.1"):
+    if vllm_version_is("0.10.1.1") or vllm_version_is("0.10.1"):
         model_runner_output = ModelRunnerOutput(
             req_ids=req_ids,
             req_id_to_index=req_id_to_index,
diff --git a/vllm_ascend/core/scheduler.py b/vllm_ascend/core/scheduler.py
index c9b9a648a2..e4fef281b7 100644
--- a/vllm_ascend/core/scheduler.py
+++ b/vllm_ascend/core/scheduler.py
@@ -33,7 +33,7 @@
 
 from vllm_ascend.utils import vllm_version_is
 
-if vllm_version_is("0.10.1.1"):
+if vllm_version_is("0.10.1.1") or vllm_version_is("0.10.1"):
     from vllm.v1.core.kv_cache_manager import KVCacheBlocks
 else:
     KVCacheBlocks = None
@@ -66,7 +66,7 @@ def schedule(self) -> SchedulerOutput:
         scheduled_running_reqs: list[Request] = []
         preempted_reqs: list[Request] = []
 
-        if vllm_version_is("0.10.1.1"):
+        if vllm_version_is("0.10.1.1") or vllm_version_is("0.10.1"):
             req_to_new_block_ids: dict[str, list[int]] = {}
         else:
             req_to_new_blocks: dict[str, KVCacheBlocks] = {}
@@ -227,7 +227,7 @@ def skip_cur_request():
             if self.lora_config and request.lora_request:
                 scheduled_loras.add(request.lora_request.lora_int_id)
 
-            if vllm_version_is("0.10.1.1"):
+            if vllm_version_is("0.10.1.1") or vllm_version_is("0.10.1"):
                 req_to_new_block_ids[request.request_id] = (
                     self.kv_cache_manager.get_block_ids(request.request_id))
             else:
@@ -320,7 +320,7 @@ def skip_cur_request():
             # Schedule the request.
             scheduled_running_reqs.append(request)
             self.scheduled_req_ids.add(request.request_id)
-            if vllm_version_is("0.10.1.1"):
+            if vllm_version_is("0.10.1.1") or vllm_version_is("0.10.1"):
                 req_to_new_block_ids[request.request_id] = (
                     new_blocks.get_block_ids())
             else:
@@ -362,7 +362,7 @@ def skip_cur_request():
                     any_request, len(self.running)))
 
         # Construct the scheduler output.
-        if vllm_version_is("0.10.1.1"):
+        if vllm_version_is("0.10.1.1") or vllm_version_is("0.10.1"):
             new_reqs_data = [
                 NewRequestData.from_request(
                     req, req_to_new_block_ids[req.request_id])
@@ -385,7 +385,7 @@ def skip_cur_request():
                 req_to_new_blocks)
         scheduled_cached_reqs = cached_reqs_data
 
-        if vllm_version_is("0.10.1.1"):
+        if vllm_version_is("0.10.1.1") or vllm_version_is("0.10.1"):
             scheduler_output = SchedulerOutput(
                 scheduled_new_reqs=new_reqs_data,
                 scheduled_cached_reqs=scheduled_cached_reqs,
diff --git a/vllm_ascend/models/qwen3_moe.py b/vllm_ascend/models/qwen3_moe.py
index 0df83772b8..2fa10f024b 100644
--- a/vllm_ascend/models/qwen3_moe.py
+++ b/vllm_ascend/models/qwen3_moe.py
@@ -254,7 +254,7 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
         quant_config = vllm_config.quant_config
         parallel_config = vllm_config.parallel_config
 
-        if vllm_version_is("0.10.1.1"):
+        if vllm_version_is("0.10.1.1") or vllm_version_is("0.10.1"):
             self.num_redundant_experts = parallel_config.num_redundant_experts
         else:
             eplb_config = parallel_config.eplb_config
diff --git a/vllm_ascend/sample/sampler.py b/vllm_ascend/sample/sampler.py
index d3f1ae9cea..b5a212a651 100644
--- a/vllm_ascend/sample/sampler.py
+++ b/vllm_ascend/sample/sampler.py
@@ -5,7 +5,7 @@
 
 from vllm_ascend.utils import is_310p, vllm_version_is
 
-if not vllm_version_is("0.10.1.1"):
+if not (vllm_version_is("0.10.1.1") or vllm_version_is("0.10.1")):
     from vllm.config import LogprobsMode
     DEFAULT_LOGPROBS_MODE = LogprobsMode.RAW_LOGPROBS
 else:
@@ -68,7 +68,7 @@ def _apply_top_k_top_p(
     def forward_native(self, logits, generators, k, p):
         """Override pytorch native implementation to torch_npu"""
         logits = self._apply_top_k_top_p(logits, k, p)
-        if not vllm_version_is("0.10.1.1"):
+        if not (vllm_version_is("0.10.1.1") or vllm_version_is("0.10.1")):
             logits_to_return = None
 
             if self.logprobs_mode == LogprobsMode.PROCESSED_LOGITS:
@@ -79,7 +79,7 @@ def forward_native(self, logits, generators, k, p):
         probs = logits.softmax(dim=-1, dtype=torch.float32)
 
         output = None
-        if vllm_version_is("0.10.1.1"):
+        if vllm_version_is("0.10.1.1") or vllm_version_is("0.10.1"):
             output = random_sample(probs, generators)
         else:
             output = (random_sample(probs, generators), logits_to_return)
diff --git a/vllm_ascend/worker/model_runner_v1.py b/vllm_ascend/worker/model_runner_v1.py
index fe7a9795af..468f59efc0 100644
--- a/vllm_ascend/worker/model_runner_v1.py
+++ b/vllm_ascend/worker/model_runner_v1.py
@@ -96,7 +96,7 @@
 from vllm_ascend.worker.mtp_proposer_v1 import MtpProposer
 from vllm_ascend.worker.npu_input_batch import CachedRequestState, InputBatch
 
-if not vllm_version_is("0.10.1.1"):
+if not (vllm_version_is("0.10.1.1") or vllm_version_is("0.10.1")):
     from vllm.v1.outputs import DraftTokenIds
 else:
     DraftTokenIds = None
@@ -384,7 +384,7 @@ def _update_states(self, scheduler_output: "SchedulerOutput") -> None:
         # Remove finished requests from the cached states.
         for req_id in scheduler_output.finished_req_ids:
             self.requests.pop(req_id, None)
-            if vllm_version_is("0.10.1.1"):
+            if vllm_version_is("0.10.1.1") or vllm_version_is("0.10.1"):
                 self.encoder_cache.pop(req_id, None)
         # Remove the finished requests from the persistent batch.
         # NOTE(woosuk): There could be an edge case where finished_req_ids and
@@ -394,7 +394,7 @@ def _update_states(self, scheduler_output: "SchedulerOutput") -> None:
         # and handling the second as a new request.
         for req_id in scheduler_output.finished_req_ids:
             self.input_batch.remove_request(req_id)
-        if vllm_version_is("0.10.1.1"):
+        if vllm_version_is("0.10.1.1") or vllm_version_is("0.10.1"):
             # Free the cached encoder outputs.
             for req_id, input_id in scheduler_output.free_encoder_input_ids:
                 encoder_outputs = self.encoder_cache.get(req_id)
@@ -455,9 +455,10 @@ def _update_states(self, scheduler_output: "SchedulerOutput") -> None:
                 lora_request=new_req_data.lora_request,
                 **({
                     "mm_hashes": new_req_data.mm_hashes
-                } if not vllm_version_is("0.10.1.1") else {
-                    "mm_hashes": None
-                }),
+                } if not (vllm_version_is("0.10.1.1")
+                          or vllm_version_is("0.10.1")) else {
+                    "mm_hashes": None
+                }),
             )
 
             # Only relevant for models using M-RoPE (e.g, Qwen2-VL)
@@ -893,13 +894,13 @@ def _execute_mm_encoder(self, scheduler_output: "SchedulerOutput"):
 
         # Batch the multi-modal inputs.
         mm_kwargs = list[MultiModalKwargsItem]()
-        if vllm_version_is("0.10.1.1"):
+        if vllm_version_is("0.10.1.1") or vllm_version_is("0.10.1"):
             req_ids_pos = list[tuple[str, int, PlaceholderRange]]()
         else:
             mm_hashes_pos = list[tuple[str, PlaceholderRange]]()
         for req_id, encoder_input_ids in scheduled_encoder_inputs.items():
             req_state = self.requests[req_id]
-            if vllm_version_is("0.10.1.1"):
+            if vllm_version_is("0.10.1.1") or vllm_version_is("0.10.1"):
                 for mm_input_id in encoder_input_ids:
                     mm_kwargs.append(req_state.mm_kwargs[mm_input_id])
                     req_ids_pos.append((req_id, mm_input_id,
@@ -942,7 +943,7 @@ def _execute_mm_encoder(self, scheduler_output: "SchedulerOutput"):
             for output in curr_group_outputs:
                 encoder_outputs.append(output)
 
-        if vllm_version_is("0.10.1.1"):
+        if vllm_version_is("0.10.1.1") or vllm_version_is("0.10.1"):
             # Cache the encoder outputs.
             for (req_id, input_id, pos_info), output in zip(
                     req_ids_pos,
@@ -974,7 +975,7 @@ def _gather_mm_embeddings(
             req_state = self.requests[req_id]
             num_computed_tokens = req_state.num_computed_tokens
             mm_positions = req_state.mm_positions
-            if not vllm_version_is("0.10.1.1"):
+            if not (vllm_version_is("0.10.1.1") or vllm_version_is("0.10.1")):
                 mm_hashes = req_state.mm_hashes
             for i, pos_info in enumerate(mm_positions):
                 start_pos = pos_info.offset
@@ -993,7 +994,7 @@ def _gather_mm_embeddings(
                     continue
 
                 start_idx = max(num_computed_tokens - start_pos, 0)
-                if vllm_version_is("0.10.1.1"):
+                if vllm_version_is("0.10.1.1") or vllm_version_is("0.10.1"):
                     end_idx = min(
                         num_computed_tokens - start_pos + num_scheduled_tokens,
                         num_encoder_tokens)
@@ -1719,7 +1720,8 @@ def execute_model(
                 logits = None
             else:
                 if self.input_batch.pooling_params:
-                    if vllm_version_is("0.10.1.1"):
+                    if vllm_version_is("0.10.1.1") or vllm_version_is(
+                            "0.10.1"):
                         return self._pool_v010(
                             hidden_states,
                             scheduler_output.total_num_scheduled_tokens,
@@ -1867,7 +1869,7 @@ def execute_model(
 
             extra_args = ({"kv_connector_output": kv_connector_output})
 
-            if vllm_version_is("0.10.1.1"):
+            if vllm_version_is("0.10.1.1") or vllm_version_is("0.10.1"):
                 model_runner_output = ModelRunnerOutput(
                     req_ids=self.input_batch.req_ids,
                     req_id_to_index=self.input_batch.req_id_to_index,
@@ -2191,7 +2193,7 @@ def _dummy_pooler_run_task(
         dummy_pooling_params = PoolingParams(task=task)
         to_update = model.pooler.get_pooling_updates(task)
         to_update.apply(dummy_pooling_params)
-        if vllm_version_is("0.10.1.1"):
+        if vllm_version_is("0.10.1.1") or vllm_version_is("0.10.1"):
             dummy_prompt_lens = torch.tensor(
                 [h.shape[0] for h in hidden_states_list],
                 device=self.device,
diff --git a/vllm_ascend/worker/npu_input_batch.py b/vllm_ascend/worker/npu_input_batch.py
index 7e1243ad63..cbd25a8635 100644
--- a/vllm_ascend/worker/npu_input_batch.py
+++ b/vllm_ascend/worker/npu_input_batch.py
@@ -726,7 +726,7 @@ def pooling_metadata(self) -> PoolingMetadata:
         pooling_params = [
             self.pooling_params[req_id] for req_id in self.req_ids
         ]
-        if vllm_version_is("0.10.1.1"):
+        if vllm_version_is("0.10.1.1") or vllm_version_is("0.10.1"):
             return PoolingMetadata(
                 prompt_lens=torch.from_numpy(
                     self.num_prompt_tokens[:self.num_reqs]).to(self.device),
diff --git a/vllm_ascend/worker/worker_v1.py b/vllm_ascend/worker/worker_v1.py
index 6c72f846a1..be3af073f2 100644
--- a/vllm_ascend/worker/worker_v1.py
+++ b/vllm_ascend/worker/worker_v1.py
@@ -50,7 +50,7 @@
                                try_register_lib, vllm_version_is)
 from vllm_ascend.worker.model_runner_v1 import NPUModelRunner
 
-if not vllm_version_is("0.10.1.1"):
+if not (vllm_version_is("0.10.1.1") or vllm_version_is("0.10.1")):
    from vllm.v1.outputs import DraftTokenIds
 else:
     DraftTokenIds = None
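
Note (reviewer sketch, not part of the diff): every version gate touched here repeats the same expression, vllm_version_is("0.10.1.1") or vllm_version_is("0.10.1"). If the set of pinned releases keeps growing, a small helper built on the existing vllm_ascend.utils.vllm_version_is could fold the repetition into one place. The helper name below is hypothetical and only assumes vllm_version_is keeps its current one-argument signature:

    # Hypothetical follow-up sketch; not introduced by this change.
    from vllm_ascend.utils import vllm_version_is

    # vLLM releases that this repo still handles via the older code paths.
    _LEGACY_VLLM_VERSIONS = ("0.10.1.1", "0.10.1")


    def vllm_version_is_legacy() -> bool:
        """Return True when the installed vLLM matches any pinned legacy version."""
        return any(vllm_version_is(v) for v in _LEGACY_VLLM_VERSIONS)

With such a helper, the import guards above would read "if not vllm_version_is_legacy():" instead of chaining or-ed checks, and adding the next pinned version would touch a single tuple rather than every call site.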