Skip to content

Commit f0f6b9d — fix import vllm_version_is

Signed-off-by: Icey <1790571317@qq.com>
Parent: b6a9207

File tree

14 files changed: +64 −22 lines

.github/workflows/format_pr_body.yaml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -36,7 +36,7 @@ jobs:
3636

3737
- name: Get vLLM version
3838
run: |
39-
VLLM_COMMIT=17c540a993af88204ad1b78345c8a865cf58ce44
39+
VLLM_COMMIT=9fce7bee745230d61c60ad467966790553b0ba48
4040
echo "VLLM_COMMIT=https://github.com/vllm-project/vllm/commit/$VLLM_COMMIT" >> $GITHUB_ENV
4141
4242
- name: Checkout repository

.github/workflows/vllm_ascend_test.yaml

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -42,7 +42,7 @@ jobs:
4242
lint:
4343
uses: ./.github/workflows/pre-commit.yml
4444
with:
45-
vllm: 17c540a993af88204ad1b78345c8a865cf58ce44
45+
vllm: 9fce7bee745230d61c60ad467966790553b0ba48
4646

4747
changes:
4848
runs-on: ubuntu-latest
@@ -83,7 +83,7 @@ jobs:
8383
VLLM_USE_MODELSCOPE: True
8484
strategy:
8585
matrix:
86-
vllm_version: [17c540a993af88204ad1b78345c8a865cf58ce44, v0.11.0]
86+
vllm_version: [9fce7bee745230d61c60ad467966790553b0ba48, v0.11.0]
8787
steps:
8888
- name: Install packages
8989
run: |
@@ -146,7 +146,7 @@ jobs:
146146
name: e2e-light
147147
strategy:
148148
matrix:
149-
vllm_version: [17c540a993af88204ad1b78345c8a865cf58ce44, v0.11.0]
149+
vllm_version: [9fce7bee745230d61c60ad467966790553b0ba48, v0.11.0]
150150
# Note (yikun): If CI resource are limited we can split job into two chain jobs
151151
needs: [lint, changes]
152152
# only trigger e2e test after lint passed and the change is e2e related with pull request.

.github/workflows/vllm_ascend_test_full.yaml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -69,7 +69,7 @@ jobs:
6969
name: e2e-full
7070
strategy:
7171
matrix:
72-
vllm_version: [17c540a993af88204ad1b78345c8a865cf58ce44, v0.11.0]
72+
vllm_version: [9fce7bee745230d61c60ad467966790553b0ba48, v0.11.0]
7373
needs: [changes]
7474
if: ${{ needs.changes.outputs.e2e_tracker == 'true' }}
7575
uses: ./.github/workflows/_e2e_test.yaml

vllm_ascend/attention/attention_v1.py

Lines changed: 8 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -26,7 +26,7 @@
2626
AttentionLayer, AttentionType)
2727
from vllm.config import VllmConfig
2828
from vllm.forward_context import ForwardContext, get_forward_context
29-
from vllm.utils import cdiv, direct_register_custom_op
29+
from vllm.utils import cdiv
3030
from vllm.v1.attention.backends.utils import AttentionCGSupport
3131
from vllm.v1.core.sched.output import SchedulerOutput
3232
from vllm.v1.kv_cache_interface import AttentionSpec
@@ -38,10 +38,16 @@
3838
update_graph_params_workspaces)
3939
from vllm_ascend.ops.attention import vanilla_chunked_prefill
4040
from vllm_ascend.utils import (ACL_FORMAT_FRACTAL_NZ, aligned_16, is_310p,
41-
nd_to_nz_2d, nd_to_nz_spec, version_check)
41+
nd_to_nz_2d, nd_to_nz_spec, version_check,
42+
vllm_version_is)
4243

4344
from ..utils import weak_ref_tensors
4445

46+
if vllm_version_is("0.11.0"):
47+
from vllm.utils import direct_register_custom_op
48+
else:
49+
from vllm.utils.torch_utils import direct_register_custom_op
50+
4551

4652
class AscendAttentionBackend(AttentionBackend):
4753
accept_output_buffer: bool = True

vllm_ascend/models/deepseek_v3_2.py

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -67,7 +67,6 @@
6767
from vllm_ascend.models.layers.sfa import AscendSFAModules, Indexer
6868
from vllm_ascend.ops.common_fused_moe import AscendFusedMoE
6969
from vllm_ascend.ops.linear import AscendLinearBase
70-
7170
from vllm_ascend.utils import vllm_version_is
7271

7372
if vllm_version_is("0.11.0"):

vllm_ascend/models/layers/mla.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -28,17 +28,18 @@
2828
from vllm.config import CacheConfig, get_current_vllm_config
2929
from vllm.forward_context import ForwardContext, get_forward_context
3030
from vllm.model_executor.layers.quantization import QuantizationConfig
31-
from vllm.utils import direct_register_custom_op
3231

3332
from vllm_ascend.utils import vllm_version_is
3433

3534
if vllm_version_is("0.11.0"):
3635
from vllm.attention import Attention
3736
from vllm.model_executor.layers.mla import \
3837
MultiHeadLatentAttention as MultiHeadLatentAttentionWrapper
38+
from vllm.utils import direct_register_custom_op
3939
else:
4040
from vllm.attention.layer import MLAAttention
4141
from vllm.model_executor.layers.mla import MultiHeadLatentAttentionWrapper
42+
from vllm.utils.torch_utils import direct_register_custom_op
4243

4344

4445
@dataclass

vllm_ascend/models/layers/sfa.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -29,17 +29,18 @@
2929
from vllm.forward_context import ForwardContext, get_forward_context
3030
from vllm.model_executor.layers.linear import ReplicatedLinear
3131
from vllm.model_executor.layers.quantization import QuantizationConfig
32-
from vllm.utils import direct_register_custom_op
3332

3433
from vllm_ascend.utils import vllm_version_is
3534

3635
if vllm_version_is("0.11.0"):
3736
from vllm.attention import Attention
3837
from vllm.model_executor.layers.mla import \
3938
MultiHeadLatentAttention as MultiHeadLatentAttentionWrapper
39+
from vllm.utils import direct_register_custom_op
4040
else:
4141
from vllm.attention.layer import MLAAttention
4242
from vllm.model_executor.layers.mla import MultiHeadLatentAttentionWrapper
43+
from vllm.utils.torch_utils import direct_register_custom_op
4344

4445

4546
@dataclass

vllm_ascend/ops/register_custom_ops.py

Lines changed: 7 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -8,12 +8,17 @@
88
tensor_model_parallel_all_reduce,
99
tensor_model_parallel_reduce_scatter)
1010
from vllm.forward_context import get_forward_context
11-
from vllm.utils import direct_register_custom_op
1211

1312
import vllm_ascend.envs as envs_ascend
1413
from vllm_ascend.ascend_forward_context import MoECommType
1514
from vllm_ascend.ops.weight_prefetch import maybe_npu_prefetch
16-
from vllm_ascend.utils import npu_stream_switch, prefetch_stream
15+
from vllm_ascend.utils import (npu_stream_switch, prefetch_stream,
16+
vllm_version_is)
17+
18+
if vllm_version_is("0.11.0"):
19+
from vllm.utils import direct_register_custom_op
20+
else:
21+
from vllm.utils.torch_utils import direct_register_custom_op
1722

1823

1924
def _maybe_chunk_residual_impl(x: torch.Tensor,

vllm_ascend/patch/platform/patch_common/patch_mamba_config.py

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3,9 +3,16 @@
33
from vllm.logger import init_logger
44
from vllm.model_executor.models import ModelRegistry
55
from vllm.model_executor.models.config import MambaModelConfig
6-
from vllm.utils import STR_DTYPE_TO_TORCH_DTYPE, cdiv
6+
from vllm.utils import cdiv
77
from vllm.v1.kv_cache_interface import FullAttentionSpec, MambaSpec
88

9+
from vllm_ascend.utils import vllm_version_is
10+
11+
if vllm_version_is("0.11.0"):
12+
from vllm.utils import STR_DTYPE_TO_TORCH_DTYPE
13+
else:
14+
from vllm.utils.torch_utils import STR_DTYPE_TO_TORCH_DTYPE
15+
916

1017
@classmethod
1118
def verify_and_update_config(cls, vllm_config) -> None:

vllm_ascend/patch/worker/patch_common/patch_weight_loader.py

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,13 @@
33
from vllm.logger import init_logger
44
from vllm.model_executor.layers.linear import UnquantizedLinearMethod
55
from vllm.model_executor.utils import set_weight_attrs
6-
from vllm.utils import GiB_bytes
6+
7+
from vllm_ascend.utils import vllm_version_is
8+
9+
if vllm_version_is("0.11.0"):
10+
from vllm.utils import GiB_bytes
11+
else:
12+
from vllm.utils.mem_constants import GiB_bytes
713

814
logger = init_logger(__name__)
915

Comments (0)