
Commit d6172ec

[Misc] format patch to make the code clear
Signed-off-by: wangxiyuan <wangxiyuan1007@gmail.com>
1 parent 59e0250 commit d6172ec

File tree: 12 files changed, +50 -34 lines


tests/singlecard/spec_decode/test_spec_decode_worker.py

Lines changed: 0 additions & 3 deletions

@@ -589,7 +589,6 @@ def test_empty_input_batch(k: int, batch_size: int,
 
 @pytest.mark.parametrize("acceptance_sampler_method",
                          ["rejection_sampler", "typical_acceptance_sampler"])
-@pytest.mark.skip_global_cleanup
 def test_init_device(acceptance_sampler_method: str):
     """Verify SpecDecodeWorker invokes proposer/scorer worker init_device, as
     well as other GPU initialization.
@@ -646,7 +645,6 @@ def test_initialize_cache(acceptance_sampler_method):
 @pytest.mark.parametrize('draft_kv_size_bytes', [0, 2 * 2 * 768, 2 * 2 * 4096])
 @pytest.mark.parametrize("acceptance_sampler_method",
                          ["rejection_sampler", "typical_acceptance_sampler"])
-@pytest.mark.skip_global_cleanup
 def test_determine_num_available_blocks(available_gpu_blocks: int,
                                         available_cpu_blocks: int,
                                         target_cache_block_size_bytes: int,
@@ -685,7 +683,6 @@ def test_determine_num_available_blocks(available_gpu_blocks: int,
 @pytest.mark.parametrize('target_cache_block_size_bytes',
                          [2 * 2 * 4096, 2 * 2 * 8192])
 @pytest.mark.parametrize('draft_kv_size_bytes', [0, 2 * 2 * 768, 2 * 2 * 4096])
-@pytest.mark.skip_global_cleanup
 def test_split_num_cache_blocks_evenly(available_gpu_blocks: int,
                                        target_cache_block_size_bytes: int,
                                        draft_kv_size_bytes: int):

vllm_ascend/__init__.py

Lines changed: 1 addition & 2 deletions

@@ -18,10 +18,9 @@
 
 def register():
     """Register the NPU platform."""
-
     return "vllm_ascend.platform.NPUPlatform"
 
 
 def register_model():
-    from .models import register_model
+    from vllm_ascend.models import register_model
     register_model()
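
The two functions above are the hooks vllm discovers at startup through its plugin mechanism. A minimal sketch of how a package exposes them, assuming vllm's "vllm.platform_plugins" and "vllm.general_plugins" entry-point groups (the group names come from vllm's plugin loader, not from this diff):

# Hedged sketch of a plugin package's setup.py wiring; entry-point group
# names are assumptions based on vllm's plugin mechanism.
from setuptools import setup

setup(
    name="vllm_ascend",
    entry_points={
        # vllm calls register() and imports the returned platform class path.
        "vllm.platform_plugins": ["ascend = vllm_ascend:register"],
        # vllm calls register_model() so the custom model classes register.
        "vllm.general_plugins": [
            "ascend_enhanced_model = vllm_ascend:register_model",
        ],
    },
)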

vllm_ascend/models/__init__.py

Lines changed: 8 additions & 5 deletions

@@ -2,12 +2,15 @@
 
 
 def register_model():
-    from .deepseek_mtp import CustomDeepSeekMTP  # noqa: F401
-    from .deepseek_v2 import CustomDeepseekV2ForCausalLM  # noqa: F401
-    from .deepseek_v2 import CustomDeepseekV3ForCausalLM  # noqa: F401
-    from .qwen2_5_vl import \
+    from vllm_ascend.models.deepseek_mtp import CustomDeepSeekMTP  # noqa: F401
+    from vllm_ascend.models.deepseek_v2 import \
+        CustomDeepseekV2ForCausalLM  # noqa: F401
+    from vllm_ascend.models.deepseek_v2 import \
+        CustomDeepseekV3ForCausalLM  # noqa: F401
+    from vllm_ascend.models.qwen2_5_vl import \
         AscendQwen2_5_VLForConditionalGeneration  # noqa: F401
-    from .qwen2_vl import AscendQwen2VLForConditionalGeneration  # noqa: F401
+    from vllm_ascend.models.qwen2_vl import \
+        AscendQwen2VLForConditionalGeneration  # noqa: F401
 
     ModelRegistry.register_model(
         "DeepSeekMTPModel",

vllm_ascend/models/deepseek_mtp.py

Lines changed: 1 addition & 1 deletion

@@ -37,7 +37,7 @@
 from vllm.model_executor.sampling_metadata import SamplingMetadata
 from vllm.sequence import IntermediateTensors
 
-from .deepseek_v2 import CustomDeepseekV2DecoderLayer
+from vllm_ascend.models.deepseek_v2 import CustomDeepseekV2DecoderLayer
 
 
 class CustomDeepSeekMultiTokenPredictorLayer(DeepSeekMultiTokenPredictorLayer):

vllm_ascend/patch/__init__.py

Lines changed: 31 additions & 8 deletions

@@ -88,16 +88,39 @@
 #
 # * Worker Patch:
 # ===============
+# ** File: worker/patch_common/patch_utils.py **
+#    ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+#   1. `vllm.utils.direct_register_custom_op`
+#    Why:
+#       direct_register_custom_op requires pytorch version >= 2.7.0,
+#       but vllm-ascend only supports pytorch version 2.5.1
+#    How:
+#       Convert annotation type to typing type for 2.5.1 backward compatibility
+#    Related PR (if no, explain why):
+#       No related PR; this change is specific to vllm-ascend.
+#    Future Plan:
+#       Update pytorch and torch-npu to 2.7.0 in the future.
+# ** File: worker/patch_common/patch_cache_engine.py **
+#    ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+#   1. `vllm.worker.cache_engine.CacheEngine._allocate_kv_cache`
+#    Why:
+#       Add graph_mode optimization for kv cache allocation.
+#    How:
+#       If graph_mode is enabled, add layer_kv_cache_nope and layer_kv_cache_pe to the kv_cache.
+#    Related PR (if no, explain why):
+#       Need a PR to vllm to fix the issue.
+#    Future Plan:
+#       Revert it when the related PR is merged in vllm.
 # ** File: worker/patch_common/patch_metrics.py **
 #    ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-#   1. `vllm.spec_decode.metrics.AsyncMetricsCollector.maybe_collect_rejsample_metrics`
+#   1. `vllm.spec_decode.metrics.AsyncMetricsCollector._copy_rejsample_metrics_async`
 #    Why:
 #       There is hard-coded cuda logic (current_platform.is_cuda_alike()) in
-#       `AsyncMetricsCollector.maybe_collect_rejsample_metrics`
+#       `AsyncMetricsCollector._copy_rejsample_metrics_async`
 #    How:
 #       Change to use `current_platform.Event` to determine whether to return None
-#    Related PR (if no, explain why): 1. refused by vllm. 2. vllm doesn't support 3. prepare to submit....
-#       https://github.com/vllm-project/vllm/pull/14411
+#    Related PR (if no, explain why):
+#       Need a PR to vllm to fix the issue.
 #    Future Plan:
 #       Revert it when the related PR is merged in vllm.
 #
@@ -110,7 +133,7 @@
 #       However float32 is not supported in cann rope op, thus we keep this patch
 #    How:
 #       Removed the dtype convert operations in forward
-#    Related PR (if no, explain why): 1. refused by vllm. 2. vllm doesn't support 3. prepare to submit....
+#    Related PR (if no, explain why):
 #       NO, only for npu due to rope op.
 #    Future Plan:
 #       Keep this patch in vllm-ascend.
@@ -126,7 +149,7 @@
 #       - support attention metadata register to the set supported spec decode
 #       - offer an api in platform to determine whether spec decode is supported,
 #         and deprecate is_cuda_alike in it.
-#    Related PR (if no, explain why): 1. refused by vllm. 2. vllm doesn't support 3. prepare to submit....
+#    Related PR (if no, explain why):
 #       - https://github.com/vllm-project/vllm/pull/15195
 #       - https://github.com/vllm-project/vllm-ascend/pull/395
 #    Future Plan:
@@ -138,7 +161,7 @@
 #       vLLM `Remove Sampler from Model Code` so vllm-ascend needs to adapt to this change.
 #    How:
 #       Use vLLM 0.8.4 method to patch it.
-#    Related PR (if no, explain why): 1. refused by vllm. 2. vllm doesn't support 3. prepare to submit....
+#    Related PR (if no, explain why):
 #       - https://github.com/vllm-project/vllm/pull/15195
 #       - https://github.com/vllm-project/vllm-ascend/pull/395
 #    Future Plan:
@@ -153,7 +176,7 @@
 #       `FlashAttentionMetadata`
 #    How:
 #       ditto
-#    Related PR (if no, explain why): 1. refused by vllm. 2. vllm doesn't support 3. prepare to submit....
+#    Related PR (if no, explain why):
 #       - https://github.com/vllm-project/vllm/pull/15195
 #       - https://github.com/vllm-project/vllm-ascend/pull/395
 #    Future Plan:
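
The new patch_utils entry describes converting PEP 585 builtin-generic annotations (e.g. `list[torch.Tensor]`) into their `typing` equivalents so torch 2.5.1's schema inference can parse them. A hedged sketch of that idea; the helper name and wrapper are illustrative, not the actual patch:

import typing

import vllm.utils


def _convert_annotations(fn):
    # torch 2.5.1 cannot infer op schemas from builtin generics such as
    # list[torch.Tensor]; rewrite them to typing.List[...] and friends.
    mapping = {list: typing.List, dict: typing.Dict, tuple: typing.Tuple}
    for name, ann in list(fn.__annotations__.items()):
        origin = typing.get_origin(ann)
        if origin in mapping:
            fn.__annotations__[name] = mapping[origin][typing.get_args(ann)]
    return fn


_orig_register = vllm.utils.direct_register_custom_op


def patched_direct_register_custom_op(op_name, op_func, *args, **kwargs):
    # Rewrite the op's annotations before handing it to the stock registrar.
    return _orig_register(op_name, _convert_annotations(op_func), *args,
                          **kwargs)


vllm.utils.direct_register_custom_op = patched_direct_register_custom_op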

vllm_ascend/patch/worker/patch_common/__init__.py

Lines changed: 1 addition & 0 deletions

@@ -17,6 +17,7 @@
 # patch_utils should be the first import, because it will be used by other
 # patch files.
 import vllm_ascend.patch.worker.patch_common.patch_utils  # noqa isort:skip
+import vllm_ascend.patch.worker.patch_common.patch_cache_engine  # noqa
 import vllm_ascend.patch.worker.patch_common.patch_metrics  # noqa
 import vllm_ascend.patch.worker.patch_common.patch_minicpm  # noqa
 import vllm_ascend.patch.worker.patch_common.patch_multi_step_worker  # noqa

vllm_ascend/worker/cache_engine.py renamed to vllm_ascend/patch/worker/patch_common/patch_cache_engine.py

Lines changed: 0 additions & 2 deletions

@@ -1,7 +1,5 @@
 #
 # Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved.
-# This file is a part of the vllm-ascend project.
-# Adapted from vllm-project/vllm/vllm/worker/model_runner.py
 # Copyright 2023 The vLLM team.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
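
Per the patch notes above, this file overrides `CacheEngine._allocate_kv_cache` to split each layer's cache when graph mode is enabled. A heavily hedged sketch of the shape of such an override; only `layer_kv_cache_nope`/`layer_kv_cache_pe` come from the patch notes, while the flag name, dimensions, and tensor layout are illustrative assumptions:

import torch
from vllm.worker.cache_engine import CacheEngine

_orig_allocate_kv_cache = CacheEngine._allocate_kv_cache


def _allocate_kv_cache(self, num_blocks: int, device: str):
    # Assumed flag; the real patch reads its own config to detect graph mode.
    if not getattr(self.cache_config, "enable_graph_mode", False):
        return _orig_allocate_kv_cache(self, num_blocks, device)

    nope_dim, pe_dim = 512, 64  # illustrative MLA-style head-dim split
    kv_cache = []
    for _ in range(self.num_attention_layers):
        # Keep the rope ("pe") and non-rope ("nope") parts as separate
        # tensors so graph mode can address them independently.
        layer_kv_cache_nope = torch.zeros(
            (num_blocks, self.block_size, nope_dim),
            dtype=self.dtype, device=device)
        layer_kv_cache_pe = torch.zeros(
            (num_blocks, self.block_size, pe_dim),
            dtype=self.dtype, device=device)
        kv_cache.append((layer_kv_cache_nope, layer_kv_cache_pe))
    return kv_cache


CacheEngine._allocate_kv_cache = _allocate_kv_cache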

vllm_ascend/patch/worker/patch_common/patch_metrics.py

Lines changed: 0 additions & 4 deletions

@@ -15,13 +15,9 @@
 # limitations under the License.
 #
 
-from typing import Callable
-
 import torch
 from vllm.spec_decode.metrics import AsyncMetricsCollector
 
-Timer = Callable[[], float]
-
 
 def _copy_rejsample_metrics_async(self) -> torch.npu.Event:
     """

vllm_ascend/quantization/quant_config.py

Lines changed: 1 addition & 2 deletions

@@ -34,8 +34,7 @@
 from vllm.model_executor.utils import set_weight_attrs
 
 from vllm_ascend.ops.fused_moe import AscendUnquantizedFusedMoEMethod
-
-from .quantizer import AscendQuantizer
+from vllm_ascend.quantization.quantizer import AscendQuantizer
 
 
 @register_quantization_config("ascend")

vllm_ascend/quantization/quantizer.py

Lines changed: 6 additions & 5 deletions

@@ -22,11 +22,12 @@
 
 from vllm.logger import logger
 
-from .func_wrapper import (wrapper_load_model, wrapper_rmsnorm_forward_oot,
-                           wrapper_rmsnorm_init)
-from .w8a8 import AscendW8A8LinearMethod
-from .w8a8_dynamic import (AscendW8A8DynamicFusedMoEMethod,
-                           AscendW8A8DynamicLinearMethod)
+from vllm_ascend.quantization.func_wrapper import (wrapper_load_model,
+                                                   wrapper_rmsnorm_forward_oot,
+                                                   wrapper_rmsnorm_init)
+from vllm_ascend.quantization.w8a8 import AscendW8A8LinearMethod
+from vllm_ascend.quantization.w8a8_dynamic import (
+    AscendW8A8DynamicFusedMoEMethod, AscendW8A8DynamicLinearMethod)
 
 CUSTOMIZED_QUANTIZER_TYPE: List[str] = []
 
