8 changes: 3 additions & 5 deletions .github/workflows/vllm_ascend_test.yaml
@@ -81,7 +81,7 @@ jobs:
VLLM_USE_MODELSCOPE: True
strategy:
matrix:
-vllm_version: [v0.10.1.1, main]
+vllm_version: [v0.10.1]
steps:
- name: Install packages
run: |
@@ -137,7 +137,7 @@ jobs:
max-parallel: 2
matrix:
os: [linux-aarch64-a2-1]
-vllm_version: [v0.10.1.1, main]
+vllm_version: [v0.10.1]
name: singlecard e2e test
runs-on: ${{ matrix.os }}
container:
@@ -213,13 +213,11 @@ jobs:
--ignore=tests/e2e/singlecard/spec_decode_v1/test_v1_spec_decode.py \
--ignore=tests/e2e/singlecard/test_offline_inference_310p.py
e2e-2-cards:
-needs: [e2e]
-if: ${{ needs.e2e.result == 'success' }}
strategy:
max-parallel: 2
matrix:
os: [linux-aarch64-a2-2]
-vllm_version: [v0.10.1.1, main]
+vllm_version: [v0.10.1]
name: multicard e2e test
runs-on: ${{ matrix.os }}
container:
22 changes: 11 additions & 11 deletions tests/ut/core/test_scheduler.py
@@ -21,7 +21,7 @@
from vllm_ascend.core.scheduler import AscendScheduler
from vllm_ascend.utils import vllm_version_is

-if not vllm_version_is("0.10.1.1"):
+if not (vllm_version_is("0.10.1.1") or vllm_version_is("0.10.1")):
from vllm.v1.outputs import DraftTokenIds
else:
DraftTokenIds = None
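
Editor's note: every hunk in this PR widens the same version gate. A minimal sketch of the presumed semantics, assuming `vllm_ascend.utils.vllm_version_is` is an exact string match against the installed `vllm.__version__` (the real helper may handle dev builds differently):

```python
# Minimal sketch, not the actual implementation: vllm_version_is is assumed
# to do an exact match against the installed vLLM release string.
import vllm


def vllm_version_is(target: str) -> bool:
    """Return True if the installed vLLM release is exactly `target`."""
    return vllm.__version__ == target


# With vLLM 0.10.1 installed:
#   vllm_version_is("0.10.1")    -> True
#   vllm_version_is("0.10.1.1")  -> False
# which is why every legacy gate in this diff becomes
#   vllm_version_is("0.10.1.1") or vllm_version_is("0.10.1")
```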
@@ -78,7 +78,7 @@ def make_output(scheduler):
}
sampled_token_ids = [[1000]] * len(scheduler.running)
logprobs = None
-if vllm_version_is("0.10.1.1"):
+if vllm_version_is("0.10.1.1") or vllm_version_is("0.10.1"):
modelrunner_output = ModelRunnerOutput(
req_ids=req_ids,
req_id_to_index=req_id_to_index,
@@ -297,7 +297,7 @@ def test_stop_via_update_from_output(self):
scheduler.running.append(req)
req.status = RequestStatus.RUNNING

-if vllm_version_is("0.10.1.1"):
+if vllm_version_is("0.10.1.1") or vllm_version_is("0.10.1"):
scheduler_output = SchedulerOutput(
scheduled_new_reqs=[],
scheduled_cached_reqs=[],
@@ -384,7 +384,7 @@ def test_stop_via_update_from_output(self):
scheduler.running.append(req)
req.status = RequestStatus.RUNNING

-if vllm_version_is("0.10.1.1"):
+if vllm_version_is("0.10.1.1") or vllm_version_is("0.10.1"):
scheduler_output = SchedulerOutput(
scheduled_new_reqs=[],
scheduled_cached_reqs=[],
@@ -468,7 +468,7 @@ def test_stop_via_update_from_output(self):
scheduler.running.append(req)
req.status = RequestStatus.RUNNING

-if vllm_version_is("0.10.1.1"):
+if vllm_version_is("0.10.1.1") or vllm_version_is("0.10.1"):
scheduler_output = SchedulerOutput(
scheduled_new_reqs=[],
scheduled_cached_reqs=[],
@@ -549,7 +549,7 @@ def test_stop_via_update_from_output(self):
scheduler.requests[requests[0].request_id] = requests[0]
scheduler.running.append(requests[0])

-if vllm_version_is("0.10.1.1"):
+if vllm_version_is("0.10.1.1") or vllm_version_is("0.10.1"):
scheduler_output = SchedulerOutput(
scheduled_new_reqs=[],
scheduled_cached_reqs=[],
@@ -645,7 +645,7 @@ def test_schedule_concurrent_batches(self):
512)

# Model output of the first request.
-if vllm_version_is("0.10.1.1"):
+if vllm_version_is("0.10.1.1") or vllm_version_is("0.10.1"):
model_runner_output = ModelRunnerOutput(
req_ids=[requests[0].request_id],
req_id_to_index={requests[0].request_id: 0},
@@ -671,7 +671,7 @@
# request is still running.
scheduler.schedule()
# Model output of the second request.
-if vllm_version_is("0.10.1.1"):
+if vllm_version_is("0.10.1.1") or vllm_version_is("0.10.1"):
model_runner_output = ModelRunnerOutput(
req_ids=[requests[1].request_id],
req_id_to_index={requests[1].request_id: 0},
@@ -739,7 +739,7 @@ def test_schedule_spec_decoding_stats(self):
req_id = requests[i].request_id
self.assertEqual(output.num_scheduled_tokens[req_id], 1)
self.assertNotIn(req_id, output.scheduled_spec_decode_tokens)
-if vllm_version_is("0.10.1.1"):
+if vllm_version_is("0.10.1.1") or vllm_version_is("0.10.1"):
model_runner_output = ModelRunnerOutput(
req_ids=req_ids,
req_id_to_index=req_to_index,
@@ -760,7 +760,7 @@ def test_schedule_spec_decoding_stats(self):

engine_core_outputs = scheduler.update_from_output(
output, model_runner_output)
-if not vllm_version_is("0.10.1.1"):
+if not (vllm_version_is("0.10.1.1") or vllm_version_is("0.10.1")):
scheduler.update_draft_token_ids(draft_token_ids)

for i in range(len(requests)):
@@ -797,7 +797,7 @@ def test_schedule_spec_decoding_stats(self):
else:
self.assertNotIn(req_id,
output.scheduled_spec_decode_tokens)
-if vllm_version_is("0.10.1.1"):
+if vllm_version_is("0.10.1.1") or vllm_version_is("0.10.1"):
model_runner_output = ModelRunnerOutput(
req_ids=req_ids,
req_id_to_index=req_to_index,
2 changes: 1 addition & 1 deletion tests/ut/kv_connector/utils.py
@@ -200,7 +200,7 @@ def create_model_runner_output(
kv_connector_output = KVConnectorOutput(finished_sending=finished_sending,
finished_recving=finished_recving)
extra_args = {"kv_connector_output": kv_connector_output}
-if vllm_version_is("0.10.1.1"):
+if vllm_version_is("0.10.1.1") or vllm_version_is("0.10.1"):
model_runner_output = ModelRunnerOutput(
req_ids=req_ids,
req_id_to_index=req_id_to_index,
12 changes: 6 additions & 6 deletions vllm_ascend/core/scheduler.py
@@ -33,7 +33,7 @@

from vllm_ascend.utils import vllm_version_is

-if vllm_version_is("0.10.1.1"):
+if vllm_version_is("0.10.1.1") or vllm_version_is("0.10.1"):
from vllm.v1.core.kv_cache_manager import KVCacheBlocks
else:
KVCacheBlocks = None
@@ -66,7 +66,7 @@ def schedule(self) -> SchedulerOutput:
scheduled_running_reqs: list[Request] = []
preempted_reqs: list[Request] = []

-if vllm_version_is("0.10.1.1"):
+if vllm_version_is("0.10.1.1") or vllm_version_is("0.10.1"):
req_to_new_block_ids: dict[str, list[int]] = {}
else:
req_to_new_blocks: dict[str, KVCacheBlocks] = {}
@@ -227,7 +227,7 @@ def skip_cur_request():

if self.lora_config and request.lora_request:
scheduled_loras.add(request.lora_request.lora_int_id)
-if vllm_version_is("0.10.1.1"):
+if vllm_version_is("0.10.1.1") or vllm_version_is("0.10.1"):
req_to_new_block_ids[request.request_id] = (
self.kv_cache_manager.get_block_ids(request.request_id))
else:
@@ -320,7 +320,7 @@ def skip_cur_request():
# Schedule the request.
scheduled_running_reqs.append(request)
self.scheduled_req_ids.add(request.request_id)
-if vllm_version_is("0.10.1.1"):
+if vllm_version_is("0.10.1.1") or vllm_version_is("0.10.1"):
req_to_new_block_ids[request.request_id] = (
new_blocks.get_block_ids())
else:
@@ -362,7 +362,7 @@ def skip_cur_request():
any_request, len(self.running)))

# Construct the scheduler output.
-if vllm_version_is("0.10.1.1"):
+if vllm_version_is("0.10.1.1") or vllm_version_is("0.10.1"):
new_reqs_data = [
NewRequestData.from_request(
req, req_to_new_block_ids[req.request_id])
@@ -385,7 +385,7 @@ def skip_cur_request():
req_to_new_blocks)
scheduled_cached_reqs = cached_reqs_data

-if vllm_version_is("0.10.1.1"):
+if vllm_version_is("0.10.1.1") or vllm_version_is("0.10.1"):
scheduler_output = SchedulerOutput(
scheduled_new_reqs=new_reqs_data,
scheduled_cached_reqs=scheduled_cached_reqs,
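
Editor's note: this two-release check now appears at a couple dozen call sites across the repo. A hypothetical consolidation (the helper name and constant below are illustrative, not part of this PR) would keep the next version bump to a one-line change:

```python
# Hypothetical helper, not part of this PR: collapses the repeated
# `vllm_version_is("0.10.1.1") or vllm_version_is("0.10.1")` chains.
from vllm_ascend.utils import vllm_version_is

# Releases that still need the legacy code paths touched in this PR.
LEGACY_VLLM_VERSIONS = ("0.10.1.1", "0.10.1")


def vllm_version_is_any(*versions: str) -> bool:
    """Return True if the installed vLLM matches any given release."""
    return any(vllm_version_is(v) for v in versions)


# Call sites would then read:
#   if vllm_version_is_any(*LEGACY_VLLM_VERSIONS):
#       ...  # legacy path (e.g. block-id lists instead of KVCacheBlocks)
#   else:
#       ...  # current vLLM main path
```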
2 changes: 1 addition & 1 deletion vllm_ascend/models/qwen3_moe.py
@@ -254,7 +254,7 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
quant_config = vllm_config.quant_config

parallel_config = vllm_config.parallel_config
-if vllm_version_is("0.10.1.1"):
+if vllm_version_is("0.10.1.1") or vllm_version_is("0.10.1"):
self.num_redundant_experts = parallel_config.num_redundant_experts
else:
eplb_config = parallel_config.eplb_config
6 changes: 3 additions & 3 deletions vllm_ascend/sample/sampler.py
@@ -5,7 +5,7 @@

from vllm_ascend.utils import is_310p, vllm_version_is

-if not vllm_version_is("0.10.1.1"):
+if not (vllm_version_is("0.10.1.1") or vllm_version_is("0.10.1")):
from vllm.config import LogprobsMode
DEFAULT_LOGPROBS_MODE = LogprobsMode.RAW_LOGPROBS
else:
@@ -68,7 +68,7 @@ def _apply_top_k_top_p(
def forward_native(self, logits, generators, k, p):
"""Override pytorch native implementation to torch_npu"""
logits = self._apply_top_k_top_p(logits, k, p)
-if not vllm_version_is("0.10.1.1"):
+if not (vllm_version_is("0.10.1.1") or vllm_version_is("0.10.1")):

logits_to_return = None
if self.logprobs_mode == LogprobsMode.PROCESSED_LOGITS:
@@ -79,7 +79,7 @@ def forward_native(self, logits, generators, k, p):

probs = logits.softmax(dim=-1, dtype=torch.float32)
output = None
-if vllm_version_is("0.10.1.1"):
+if vllm_version_is("0.10.1.1") or vllm_version_is("0.10.1"):
output = random_sample(probs, generators)
else:
output = (random_sample(probs, generators), logits_to_return)
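
Editor's note: the return contract of `forward_native` now differs by version. On the pinned 0.10.1/0.10.1.1 releases it returns only the sampled tokens; on newer vLLM it returns a `(token_ids, logits_to_return)` tuple, with the second element filled in per `logprobs_mode`. A caller-side sketch of normalizing the two shapes (the wrapper name is illustrative, not from this PR):

```python
# Illustrative wrapper, not part of this PR: normalizes the
# version-dependent return value of AscendSampler.forward_native.
from vllm_ascend.utils import vllm_version_is


def sample_with_logits(sampler, logits, generators, k, p):
    """Return (sampled_token_ids, logits_or_none) on any supported vLLM."""
    out = sampler.forward_native(logits, generators, k, p)
    if vllm_version_is("0.10.1.1") or vllm_version_is("0.10.1"):
        # Legacy contract: a bare tensor of sampled token ids.
        return out, None
    # Current contract: already a (token_ids, logits_to_return) tuple.
    return out
```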
30 changes: 16 additions & 14 deletions vllm_ascend/worker/model_runner_v1.py
@@ -96,7 +96,7 @@
from vllm_ascend.worker.mtp_proposer_v1 import MtpProposer
from vllm_ascend.worker.npu_input_batch import CachedRequestState, InputBatch

-if not vllm_version_is("0.10.1.1"):
+if not (vllm_version_is("0.10.1.1") or vllm_version_is("0.10.1")):
from vllm.v1.outputs import DraftTokenIds
else:
DraftTokenIds = None
@@ -384,7 +384,7 @@ def _update_states(self, scheduler_output: "SchedulerOutput") -> None:
# Remove finished requests from the cached states.
for req_id in scheduler_output.finished_req_ids:
self.requests.pop(req_id, None)
-if vllm_version_is("0.10.1.1"):
+if vllm_version_is("0.10.1.1") or vllm_version_is("0.10.1"):
self.encoder_cache.pop(req_id, None)
# Remove the finished requests from the persistent batch.
# NOTE(woosuk): There could be an edge case where finished_req_ids and
@@ -394,7 +394,7 @@ def _update_states(self, scheduler_output: "SchedulerOutput") -> None:
# and handling the second as a new request.
for req_id in scheduler_output.finished_req_ids:
self.input_batch.remove_request(req_id)
-if vllm_version_is("0.10.1.1"):
+if vllm_version_is("0.10.1.1") or vllm_version_is("0.10.1"):
# Free the cached encoder outputs.
for req_id, input_id in scheduler_output.free_encoder_input_ids:
encoder_outputs = self.encoder_cache.get(req_id)
@@ -455,9 +455,10 @@ def _update_states(self, scheduler_output: "SchedulerOutput") -> None:
lora_request=new_req_data.lora_request,
**({
"mm_hashes": new_req_data.mm_hashes
-} if not vllm_version_is("0.10.1.1") else {
-"mm_hashes": None
-}),
+} if not (vllm_version_is("0.10.1.1")
+or vllm_version_is("0.10.1")) else {
+"mm_hashes": None
+}),
)

# Only relevant for models using M-RoPE (e.g, Qwen2-VL)
@@ -893,13 +894,13 @@ def _execute_mm_encoder(self, scheduler_output: "SchedulerOutput"):

# Batch the multi-modal inputs.
mm_kwargs = list[MultiModalKwargsItem]()
-if vllm_version_is("0.10.1.1"):
+if vllm_version_is("0.10.1.1") or vllm_version_is("0.10.1"):
req_ids_pos = list[tuple[str, int, PlaceholderRange]]()
else:
mm_hashes_pos = list[tuple[str, PlaceholderRange]]()
for req_id, encoder_input_ids in scheduled_encoder_inputs.items():
req_state = self.requests[req_id]
-if vllm_version_is("0.10.1.1"):
+if vllm_version_is("0.10.1.1") or vllm_version_is("0.10.1"):
for mm_input_id in encoder_input_ids:
mm_kwargs.append(req_state.mm_kwargs[mm_input_id])
req_ids_pos.append((req_id, mm_input_id,
@@ -942,7 +943,7 @@ def _execute_mm_encoder(self, scheduler_output: "SchedulerOutput"):

for output in curr_group_outputs:
encoder_outputs.append(output)
-if vllm_version_is("0.10.1.1"):
+if vllm_version_is("0.10.1.1") or vllm_version_is("0.10.1"):
# Cache the encoder outputs.
for (req_id, input_id, pos_info), output in zip(
req_ids_pos,
@@ -974,7 +975,7 @@ def _gather_mm_embeddings(
req_state = self.requests[req_id]
num_computed_tokens = req_state.num_computed_tokens
mm_positions = req_state.mm_positions
-if not vllm_version_is("0.10.1.1"):
+if not (vllm_version_is("0.10.1.1") or vllm_version_is("0.10.1")):
mm_hashes = req_state.mm_hashes
for i, pos_info in enumerate(mm_positions):
start_pos = pos_info.offset
@@ -993,7 +994,7 @@ def _gather_mm_embeddings(
continue

start_idx = max(num_computed_tokens - start_pos, 0)
-if vllm_version_is("0.10.1.1"):
+if vllm_version_is("0.10.1.1") or vllm_version_is("0.10.1"):
end_idx = min(
num_computed_tokens - start_pos + num_scheduled_tokens,
num_encoder_tokens)
@@ -1719,7 +1720,7 @@ def execute_model(
logits = None
else:
if self.input_batch.pooling_params:
-if vllm_version_is("0.10.1.1"):
+if vllm_version_is("0.10.1.1") or vllm_version_is(
+"0.10.1"):
return self._pool_v010(
hidden_states,
scheduler_output.total_num_scheduled_tokens,
@@ -1867,7 +1869,7 @@ def execute_model(

extra_args = ({"kv_connector_output": kv_connector_output})

-if vllm_version_is("0.10.1.1"):
+if vllm_version_is("0.10.1.1") or vllm_version_is("0.10.1"):
model_runner_output = ModelRunnerOutput(
req_ids=self.input_batch.req_ids,
req_id_to_index=self.input_batch.req_id_to_index,
@@ -2191,7 +2193,7 @@ def _dummy_pooler_run_task(
dummy_pooling_params = PoolingParams(task=task)
to_update = model.pooler.get_pooling_updates(task)
to_update.apply(dummy_pooling_params)
-if vllm_version_is("0.10.1.1"):
+if vllm_version_is("0.10.1.1") or vllm_version_is("0.10.1"):
dummy_prompt_lens = torch.tensor(
[h.shape[0] for h in hidden_states_list],
device=self.device,
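
Editor's note: an alternative to gating every `ModelRunnerOutput(...)` construction would be feature-detecting the constructor once. A hypothetical shim, assuming `ModelRunnerOutput` is a dataclass in `vllm.v1.outputs` (verify before relying on this):

```python
# Hypothetical shim, not part of this PR: forwards only the keyword
# arguments that the installed vLLM's ModelRunnerOutput declares.
import dataclasses

from vllm.v1.outputs import ModelRunnerOutput


def make_model_runner_output(**kwargs) -> ModelRunnerOutput:
    accepted = {f.name for f in dataclasses.fields(ModelRunnerOutput)}
    return ModelRunnerOutput(
        **{k: v for k, v in kwargs.items() if k in accepted})
```

Silently dropping unknown keys can mask typos, which is a fair argument for the explicit per-version branches this PR uses instead.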
2 changes: 1 addition & 1 deletion vllm_ascend/worker/npu_input_batch.py
@@ -726,7 +726,7 @@ def pooling_metadata(self) -> PoolingMetadata:
pooling_params = [
self.pooling_params[req_id] for req_id in self.req_ids
]
-if vllm_version_is("0.10.1.1"):
+if vllm_version_is("0.10.1.1") or vllm_version_is("0.10.1"):
return PoolingMetadata(
prompt_lens=torch.from_numpy(
self.num_prompt_tokens[:self.num_reqs]).to(self.device),
2 changes: 1 addition & 1 deletion vllm_ascend/worker/worker_v1.py
@@ -50,7 +50,7 @@
try_register_lib, vllm_version_is)
from vllm_ascend.worker.model_runner_v1 import NPUModelRunner

-if not vllm_version_is("0.10.1.1"):
+if not (vllm_version_is("0.10.1.1") or vllm_version_is("0.10.1")):
from vllm.v1.outputs import DraftTokenIds
else:
DraftTokenIds = None