
Commit 1083b72

[CI] upgrade to vllm 0.9.0
Signed-off-by: wangxiyuan <wangxiyuan1007@gmail.com>
1 parent 01e3d59 commit 1083b72


15 files changed: +75, -148 lines


.github/workflows/vllm_ascend_test.yaml

Lines changed: 1 addition & 1 deletion
@@ -48,7 +48,7 @@ jobs:
       max-parallel: 2
       matrix:
         os: [linux-arm64-npu-1, linux-arm64-npu-4]
-        vllm_verison: [main, v0.8.5.post1]
+        vllm_verison: [main, v0.9.0]
     concurrency:
       group: >
         ${{

.github/workflows/vllm_ascend_test_pd.yaml

Lines changed: 1 addition & 1 deletion
@@ -35,7 +35,7 @@ jobs:
     if: ${{ github.event.label.name == 'module:pd' }}
     strategy:
       matrix:
-        vllm_verison: [v0.8.5.post1]
+        vllm_verison: [v0.9.0]
     name: vLLM Ascend test
     runs-on: linux-arm64-npu-static-8

Dockerfile

Lines changed: 1 addition & 1 deletion
@@ -37,7 +37,7 @@ RUN pip config set global.index-url ${PIP_INDEX_URL}

 # Install vLLM
 ARG VLLM_REPO=https://github.com/vllm-project/vllm.git
-ARG VLLM_TAG=v0.8.5.post1
+ARG VLLM_TAG=v0.9.0
 RUN git clone --depth 1 $VLLM_REPO --branch $VLLM_TAG /vllm-workspace/vllm
 # In x86, triton will be installed by vllm. But in Ascend, triton doesn't work correctly. we need to uninstall it.
 RUN VLLM_TARGET_DEVICE="empty" python3 -m pip install -v -e /vllm-workspace/vllm/ --extra-index https://download.pytorch.org/whl/cpu/ && \

Dockerfile.openEuler

Lines changed: 1 addition & 1 deletion
@@ -34,7 +34,7 @@ COPY . /vllm-workspace/vllm-ascend/

 # Install vLLM
 ARG VLLM_REPO=https://github.com/vllm-project/vllm.git
-ARG VLLM_TAG=v0.8.5.post1
+ARG VLLM_TAG=v0.9.0

 RUN git clone --depth 1 $VLLM_REPO --branch $VLLM_TAG /vllm-workspace/vllm
 # In x86, triton will be installed by vllm. But in Ascend, triton doesn't work correctly. we need to uninstall it.

tests/utils.py

Lines changed: 1 addition & 7 deletions
@@ -42,19 +42,13 @@
     init_distributed_environment)
 from vllm.engine.arg_utils import AsyncEngineArgs
 from vllm.entrypoints.openai.cli_args import make_arg_parser
+from vllm.model_executor.model_loader import get_model_loader
 from vllm.platforms import current_platform
 from vllm.transformers_utils.tokenizer import get_tokenizer
 from vllm.utils import FlexibleArgumentParser, GB_bytes, get_open_port

-from vllm_ascend.utils import vllm_version_is
-
 from .model_utils import TextTextLogprobs

-if vllm_version_is("0.8.5") or vllm_version_is("0.8.5.post1"):
-    from vllm.model_executor.model_loader.loader import get_model_loader  # type: ignore[import]  # isort: skip
-else:
-    from vllm.model_executor.model_loader import get_model_loader
-
 VLLM_PATH = Path(__file__).parent.parent
 """Path to root of the vLLM repository."""
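
With only vLLM v0.9.0 and main supported, the version-gated import above collapses into a single direct import of get_model_loader. For reference, a try/except fallback is another common way to bridge such an import move; the snippet below is a hedged sketch of that alternative pattern, not what this commit does. The two paths are the ones visible in the diff above.

# Sketch of an alternative compatibility pattern (NOT used by this commit).
try:
    # vLLM v0.9.0 / main: importable from the package directly.
    from vllm.model_executor.model_loader import get_model_loader
except ImportError:
    # Older vLLM (0.8.5 / 0.8.5.post1): lived in the loader submodule.
    from vllm.model_executor.model_loader.loader import get_model_loader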

vllm_ascend/attention/attention_v1.py

Lines changed: 5 additions & 9 deletions
@@ -30,7 +30,6 @@
 from vllm.v1.worker.gpu_input_batch import InputBatch

 from vllm_ascend.ops.attention import vanilla_chunked_prefill
-from vllm_ascend.utils import vllm_version_is


 class AscendAttentionBackend(AttentionBackend):
@@ -142,14 +141,11 @@ def reorder_batch(self, input_batch: "InputBatch",

     def build(self, num_reqs, num_actual_tokens, max_query_len,
               common_prefix_len):
-        if vllm_version_is("0.8.5") or vllm_version_is("0.8.5.post1"):
-            block_table = (self.runner.input_batch.block_table.
-                           get_device_tensor()[:num_reqs])
-        else:
-            block_table = self.runner.input_batch.block_table[
-                0].get_device_tensor()
-            block_table[:num_reqs, :self.runner.max_num_blocks_per_req] = (
-                block_table[:num_reqs])
+
+        block_table = self.runner.input_batch.block_table[0].get_device_tensor(
+        )
+        block_table[:num_reqs, :self.runner.max_num_blocks_per_req] = (
+            block_table[:num_reqs])

         query_lens = self.runner.query_lens
         seq_lens = self.runner.seq_lens_cpu[:num_reqs]
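
The functional change here is how the block table is fetched from the runner's input batch: vLLM v0.9.0 appears to expose a sequence of per-group block tables, so the first group is indexed with [0] before get_device_tensor() is called, whereas 0.8.5 exposed get_device_tensor() on the batch's single block table. A minimal toy sketch of that API shape follows, using hypothetical stand-in classes rather than vLLM's own types.

# Toy stand-ins, not vLLM classes: illustrate the access pattern the new code
# relies on (block_table[0].get_device_tensor()), under the assumption that
# v0.9.0 groups block tables per KV-cache group.
import torch


class ToyBlockTable:
    def __init__(self, data: torch.Tensor) -> None:
        self._data = data

    def get_device_tensor(self) -> torch.Tensor:
        # The real method returns the on-device tensor; the toy returns data.
        return self._data


class ToyGroupedBlockTable:
    """Hypothetical stand-in for a per-group block table container."""

    def __init__(self, tables: list) -> None:
        self._tables = tables

    def __getitem__(self, group_id: int) -> ToyBlockTable:
        return self._tables[group_id]


grouped = ToyGroupedBlockTable([ToyBlockTable(torch.arange(12).reshape(3, 4))])
block_table = grouped[0].get_device_tensor()  # v0.9.0-style access
print(block_table[:2])                        # rows for the first two requests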

vllm_ascend/attention/mla_v1.py

Lines changed: 5 additions & 9 deletions
@@ -16,7 +16,6 @@

 from vllm_ascend.attention.attention_v1 import AscendAttentionState
 from vllm_ascend.ops.attention import vanilla_chunked_prefill_mla
-from vllm_ascend.utils import vllm_version_is
 from vllm_ascend.worker.model_runner_v1 import NPUModelRunner

 if TYPE_CHECKING:
@@ -239,14 +238,11 @@ def build(self,
         # function. We should avoid GPU -> CPU sync as much as possible because
         # it blocks on all previous kernels.
         device = self.runner.device
-        if vllm_version_is("0.8.5") or vllm_version_is("0.8.5.post1"):
-            block_table = (self.runner.input_batch.block_table.
-                           get_device_tensor()[:num_reqs])
-        else:
-            block_table = self.runner.input_batch.block_table[
-                0].get_device_tensor()
-            block_table[:num_reqs, :self.runner.max_num_blocks_per_req] = (
-                block_table[:num_reqs])
+
+        block_table = self.runner.input_batch.block_table[0].get_device_tensor(
+        )
+        block_table[:num_reqs, :self.runner.max_num_blocks_per_req] = (
+            block_table[:num_reqs])
         slot_mapping = self.runner.slot_mapping_cpu[:num_actual_tokens].to(
             device, non_blocking=True)
         input_positions = self.runner.positions_cpu[:num_actual_tokens].to(
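
The surviving context around this hunk copies CPU-side metadata to the device with non_blocking=True, in line with the comment about avoiding GPU -> CPU syncs. As a general PyTorch note (not vllm-ascend code), such host-to-device copies only truly overlap with other work when the source tensor sits in pinned memory; a minimal sketch, shown with CUDA for portability:

# General PyTorch illustration, not vllm-ascend code.
import torch

device = "cuda" if torch.cuda.is_available() else "cpu"

# Pageable source: non_blocking=True may silently degrade to a blocking copy.
pageable = torch.randn(1024)
meta_sync = pageable.to(device, non_blocking=True)

# Pinned source: the copy is enqueued on the stream and the host keeps going.
pinned = torch.randn(1024, pin_memory=torch.cuda.is_available())
meta_async = pinned.to(device, non_blocking=True)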

vllm_ascend/ops/fused_moe.py

Lines changed: 37 additions & 69 deletions
@@ -26,18 +26,10 @@
     tensor_model_parallel_all_reduce)
 from vllm.distributed.parallel_state import get_dp_group
 from vllm.model_executor.layers.fused_moe.layer import (
-    FusedMoE, UnquantizedFusedMoEMethod, determine_expert_map)
-
-from vllm_ascend.utils import vllm_version_is
-
-if not (vllm_version_is("0.8.5") or vllm_version_is("0.8.5.post1")):
-    from vllm.model_executor.layers.fused_moe.layer import (
-        FusedMoEParallelConfig, MoEConfig)
-else:
-    MoEConfig = None
-
-from vllm.model_executor.layers.quantization.base_config import (
-    QuantizationConfig, QuantizeMethodBase)
+    FusedMoE, FusedMoEParallelConfig, MoEConfig, UnquantizedFusedMoEMethod,
+    determine_expert_map)
+from vllm.model_executor.layers.quantization.base_config import \
+    QuantizationConfig

 import vllm_ascend.envs as envs_ascend
 from vllm_ascend.distributed.parallel_state import get_ep_group, get_etp_group
@@ -587,10 +579,8 @@ def select_experts(
 class AscendUnquantizedFusedMoEMethod(UnquantizedFusedMoEMethod):

     def __init__(self, moe: MoEConfig = None):
-        if vllm_version_is("0.8.5") or vllm_version_is("0.8.5.post1"):
-            super().__init__()
-        else:
-            super().__init__(moe=moe)
+
+        super().__init__(moe=moe)
         vllm_config = get_current_vllm_config()

         ep_group = get_ep_group()
@@ -731,23 +721,16 @@ def __init__(
         params_dtype = torch.get_default_dtype()

         vllm_config = get_current_vllm_config()
-        if vllm_version_is("0.8.5") or vllm_version_is("0.8.5.post1"):
-            self.ep_size = get_ep_group().world_size
-            self.tp_size = get_etp_group().world_size
-            self.dp_size = (dp_size if dp_size is not None else
-                            get_dp_group().world_size)
-            self.dp_rank = (0 if self.dp_size == 1 else
-                            get_dp_group().rank_in_group)
-        else:
-            self.moe_parallel_config: FusedMoEParallelConfig = (
-                FusedMoEParallelConfig.make(
-                    tp_size_=(tp_size if tp_size is not None else
-                              get_tensor_model_parallel_world_size()),
-                    dp_size_=(dp_size if dp_size is not None else
-                              get_dp_group().world_size),
-                    vllm_parallel_config=vllm_config.parallel_config))

-            self.moe_parallel_config.ep_size = get_ep_group().world_size
+        self.moe_parallel_config: FusedMoEParallelConfig = (
+            FusedMoEParallelConfig.make(
+                tp_size_=(tp_size if tp_size is not None else
+                          get_tensor_model_parallel_world_size()),
+                dp_size_=(dp_size if dp_size is not None else
+                          get_dp_group().world_size),
+                vllm_parallel_config=vllm_config.parallel_config))
+
+        self.moe_parallel_config.ep_size = get_ep_group().world_size

         self.top_k = top_k
         self.num_experts = num_experts
@@ -772,54 +755,39 @@ def __init__(
             self.local_num_experts, self.expert_map = determine_expert_map(
                 self.ep_size,
                 get_ep_group().rank_in_group, self.global_num_experts)
-            if vllm_version_is("0.8.5") or vllm_version_is("0.8.5.post1"):
-                self.tp_rank = get_etp_group().rank_in_group
-                self.ep_rank = get_ep_group().rank_in_group
-            else:
-                self.moe_parallel_config.tp_rank = get_etp_group(
-                ).rank_in_group
-                self.moe_parallel_config.ep_rank = get_ep_group().rank_in_group
+
+            self.moe_parallel_config.tp_rank = get_etp_group().rank_in_group
+            self.moe_parallel_config.ep_rank = get_ep_group().rank_in_group

         else:
             # Adjust TP size for DP attention
             # haven't test its functionality yet, may remove in the future
-            if vllm_version_is("0.8.5") or vllm_version_is("0.8.5.post1"):
-                self.tp_rank = self.tp_size * self.dp_rank
-                self.ep_rank = 0
-                self.tp_size = self.tp_size * self.dp_size
-                self.ep_size = 1
-            else:
-                self.moe_parallel_config.tp_rank = self.tp_size * self.dp_rank
-                self.moe_parallel_config.ep_rank = 0
-                self.moe_parallel_config.tp_size = self.tp_size * self.dp_size
-                self.moe_parallel_config.ep_size = 1
+
+            self.moe_parallel_config.tp_rank = self.tp_size * self.dp_rank
+            self.moe_parallel_config.ep_rank = 0
+            self.moe_parallel_config.tp_size = self.tp_size * self.dp_size
+            self.moe_parallel_config.ep_size = 1

             self.local_num_experts, self.expert_map = (self.global_num_experts,
                                                        None)
         if self.scoring_func != "softmax" and not self.use_grouped_topk:
             raise ValueError("Only softmax scoring function is supported for "
                              "non-grouped topk.")
-        if vllm_version_is("0.8.5") or vllm_version_is("0.8.5.post1"):
-            if quant_config is None:
-                self.quant_method: Optional[QuantizeMethodBase] = (
-                    AscendUnquantizedFusedMoEMethod())
-            else:
-                self.quant_method = quant_config.get_quant_method(self, prefix)
-        else:
-            moe = MoEConfig(
-                num_experts=self.global_num_experts,
-                experts_per_token=top_k,
-                hidden_dim=hidden_size,
-                num_local_experts=self.local_num_experts,
-                moe_parallel_config=self.moe_parallel_config,
-                # TODO (bnell): this needs to be fixed for quantized types.
-                in_dtype=params_dtype,
-            )

-            if quant_config is None:
-                self.quant_method = AscendUnquantizedFusedMoEMethod(moe)
-            else:
-                self.quant_method = quant_config.get_quant_method(self, prefix)
+        moe = MoEConfig(
+            num_experts=self.global_num_experts,
+            experts_per_token=top_k,
+            hidden_dim=hidden_size,
+            num_local_experts=self.local_num_experts,
+            moe_parallel_config=self.moe_parallel_config,
+            # TODO (bnell): this needs to be fixed for quantized types.
+            in_dtype=params_dtype,
+        )
+
+        if quant_config is None:
+            self.quant_method = AscendUnquantizedFusedMoEMethod(moe)
+        else:
+            self.quant_method = quant_config.get_quant_method(self, prefix)

         assert self.quant_method is not None
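
With the 0.8.5 branches gone, the layer always builds a FusedMoEParallelConfig and MoEConfig from vLLM v0.9.0 and keeps using determine_expert_map to derive the local expert count and expert map for expert parallelism. The toy function below is a simplified stand-in, not vLLM's determine_expert_map, that illustrates what such a map conceptually contains.

# Simplified stand-in, NOT vLLM's implementation: each EP rank owns a
# contiguous slice of the global experts; the map takes a global expert id to
# a local slot on this rank, or -1 when the expert lives on another rank.
import torch


def toy_expert_map(ep_size: int, ep_rank: int, global_num_experts: int):
    local = global_num_experts // ep_size  # assume experts divide evenly
    start = ep_rank * local
    expert_map = torch.full((global_num_experts, ), -1, dtype=torch.int32)
    expert_map[start:start + local] = torch.arange(local, dtype=torch.int32)
    return local, expert_map


local_num_experts, expert_map = toy_expert_map(ep_size=4,
                                               ep_rank=1,
                                               global_num_experts=16)
print(local_num_experts)    # 4
print(expert_map.tolist())  # global experts 4..7 map to local 0..3, rest -1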

vllm_ascend/patch/__init__.py

Lines changed: 2 additions & 9 deletions
@@ -24,16 +24,9 @@
 # each worker's `__init__` function.
 #
 # Then in each kind of patch, there are three folders:
-# - patch_0_8_5: contains the patches applied when vllm version is 0.8.5.
+# - patch_0_9_0: contains the patches applied when vllm version is 0.9.0.
 # - patch_main: contains the patches applied when vllm version is main branch.
-# - patch_common: contains the patches applied in both 0.8.5 and main branch.
-#
-# In the future, with the vllm version upgrade, the new patch folder such as
-# patch_0_8_5, patch_0_8_6, etc. will be added to manage the patch for different
-# vllm version. And the patch_common will contain the patches applied in all the
-# vllm version.
-# Once the vllm version is too old that vllm-ascend will not support, the related
-# patch folder will be removed as well.
+# - patch_common: contains the patches applied in both 0.9.0 and main branch.
 #
 # Once a new patch is added in vllm-ascend, please add the patch description into this file as well.
 # ----------------------------------------------------------------------------------

vllm_ascend/patch/platform/__init__.py

Lines changed: 2 additions & 2 deletions
@@ -17,8 +17,8 @@
 from vllm_ascend.utils import vllm_version_is

 # Import specific patches for different versions
-if vllm_version_is("0.8.5") or vllm_version_is("0.8.5.post1"):
-    from vllm_ascend.patch.platform import patch_0_8_5  # noqa: F401
+if vllm_version_is("0.9.0"):
+    from vllm_ascend.patch.platform import patch_0_9_0  # noqa: F401
     from vllm_ascend.patch.platform import patch_common  # noqa: F401
 else:
     from vllm_ascend.patch.platform import patch_common  # noqa: F401
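
Patch selection keys off vllm_version_is, which this commit narrows from the two 0.8.5 releases to the single "0.9.0" string. The helper itself lives in vllm_ascend/utils.py and is not part of this diff; the sketch below shows one plausible implementation, so treat the exact comparison logic as an assumption.

# Plausible sketch only; the real vllm_version_is is defined in
# vllm_ascend/utils.py and is not shown in this commit.
from vllm import __version__ as _vllm_version


def vllm_version_is(target: str) -> bool:
    # Compare the installed vLLM version string against the pinned release.
    return _vllm_version == target


if vllm_version_is("0.9.0"):
    print("pinned release: apply patch_0_9_0 + patch_common")
else:
    print("main branch (or other): apply patch_common only")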
