2 changes: 1 addition & 1 deletion .github/workflows/format_pr_body.yaml
@@ -36,7 +36,7 @@ jobs:
 
       - name: Get vLLM version
         run: |
-          VLLM_COMMIT=17c540a993af88204ad1b78345c8a865cf58ce44
+          VLLM_COMMIT=c9461e05a4ed3557cfbf4b15ded1e26761cc39ca
           echo "VLLM_COMMIT=https://github.com/vllm-project/vllm/commit/$VLLM_COMMIT" >> $GITHUB_ENV
 
       - name: Checkout repository
6 changes: 3 additions & 3 deletions .github/workflows/vllm_ascend_test.yaml
@@ -42,7 +42,7 @@ jobs:
   lint:
     uses: ./.github/workflows/pre-commit.yml
     with:
-      vllm: 17c540a993af88204ad1b78345c8a865cf58ce44
+      vllm: c9461e05a4ed3557cfbf4b15ded1e26761cc39ca
 
   changes:
     runs-on: ubuntu-latest
@@ -83,7 +83,7 @@ jobs:
       VLLM_USE_MODELSCOPE: True
     strategy:
       matrix:
-        vllm_version: [17c540a993af88204ad1b78345c8a865cf58ce44, v0.11.0]
+        vllm_version: [c9461e05a4ed3557cfbf4b15ded1e26761cc39ca, v0.11.0]
     steps:
       - name: Install packages
         run: |
@@ -140,7 +140,7 @@ jobs:
     name: e2e-light
     strategy:
       matrix:
-        vllm_version: [17c540a993af88204ad1b78345c8a865cf58ce44, v0.11.0]
+        vllm_version: [c9461e05a4ed3557cfbf4b15ded1e26761cc39ca, v0.11.0]
     # Note (yikun): If CI resource are limited we can split job into two chain jobs
     needs: [lint, changes]
     # only trigger e2e test after lint passed and the change is e2e related with pull request.
2 changes: 1 addition & 1 deletion .github/workflows/vllm_ascend_test_full.yaml
@@ -69,7 +69,7 @@ jobs:
     name: e2e-full
     strategy:
       matrix:
-        vllm_version: [17c540a993af88204ad1b78345c8a865cf58ce44, v0.11.0]
+        vllm_version: [c9461e05a4ed3557cfbf4b15ded1e26761cc39ca, v0.11.0]
     needs: [changes]
     if: ${{ needs.changes.outputs.e2e_tracker == 'true' }}
     uses: ./.github/workflows/_e2e_test.yaml
6 changes: 5 additions & 1 deletion examples/offline_data_parallel.py
@@ -63,7 +63,11 @@
 from vllm import LLM, SamplingParams
 from vllm.distributed.parallel_state import ( # noqa E402
     destroy_distributed_environment, destroy_model_parallel)
-from vllm.utils import get_open_port
+from vllm_ascend.utils import vllm_version_is
+if vllm_version_is("0.11.0"):
+    from vllm.utils import get_open_port
+else:
+    from vllm.utils.network_utils import get_open_port
 
 os.environ["VLLM_USE_MODELSCOPE"] = "True"
 os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn"
10 changes: 8 additions & 2 deletions examples/offline_external_launcher.py
@@ -65,9 +65,15 @@
 import torch
 from vllm import LLM, SamplingParams
 from vllm.distributed.parallel_state import ( # noqa E402
-    destroy_distributed_environment, destroy_model_parallel, get_tp_group)
-from vllm.utils import get_open_port, GiB_bytes
+    destroy_distributed_environment, destroy_model_parallel, get_tp_group)
 from safetensors.torch import load_file
+from vllm_ascend.utils import vllm_version_is
+if vllm_version_is("0.11.0"):
+    from vllm.utils import GiB_bytes, get_open_port
+
+else:
+    from vllm.utils.mem_constants import GiB_bytes
+    from vllm.utils.network_utils import get_open_port
 
 os.environ["VLLM_USE_MODELSCOPE"] = "True"
 os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn"
6 changes: 5 additions & 1 deletion examples/offline_inference_sleep_mode_npu.py
@@ -20,7 +20,11 @@
 
 import torch
 from vllm import LLM, SamplingParams
-from vllm.utils import GiB_bytes
+from vllm_ascend.utils import vllm_version_is
+if vllm_version_is("0.11.0"):
+    from vllm.utils import GiB_bytes
+else:
+    from vllm.utils.mem_constants import GiB_bytes
 
 os.environ["VLLM_USE_MODELSCOPE"] = "True"
 os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn"
8 changes: 7 additions & 1 deletion examples/offline_weight_load.py
@@ -66,8 +66,14 @@
 from vllm import LLM, SamplingParams
 from vllm.distributed.parallel_state import ( # noqa E402
     destroy_distributed_environment, destroy_model_parallel, get_tp_group)
-from vllm.utils import get_open_port, GiB_bytes
 from safetensors.torch import load_file
+from vllm_ascend.utils import vllm_version_is
+if vllm_version_is("0.11.0"):
+    from vllm.utils import GiB_bytes, get_open_port
+
+else:
+    from vllm.utils.mem_constants import GiB_bytes
+    from vllm.utils.network_utils import get_open_port
 
 os.environ["VLLM_USE_MODELSCOPE"] = "True"
 os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn"
7 changes: 6 additions & 1 deletion tests/e2e/conftest.py
@@ -45,7 +45,6 @@
 from vllm.outputs import RequestOutput
 from vllm.platforms import current_platform
 from vllm.transformers_utils.utils import maybe_model_redirect
-from vllm.utils import get_open_port
 
 from tests.e2e.model_utils import (TokensTextLogprobs,
                                    TokensTextLogprobsPromptLogprobs)
@@ -55,6 +54,12 @@
 # we not explicitly patch here, some of them might be effectiveless
 # in pytest scenario
 from vllm_ascend.utils import adapt_patch # noqa E402
+from vllm_ascend.utils import vllm_version_is
+
+if vllm_version_is("0.11.0"):
+    from vllm.utils import get_open_port
+else:
+    from vllm.utils.network_utils import get_open_port
 
 adapt_patch(True)
 adapt_patch(False)
7 changes: 6 additions & 1 deletion tests/e2e/multicard/test_single_request_aclgraph.py
@@ -19,9 +19,14 @@
 
 import openai
 import pytest
-from vllm.utils import get_open_port
 
 from tests.e2e.conftest import RemoteOpenAIServer
+from vllm_ascend.utils import vllm_version_is
+
+if vllm_version_is("0.11.0"):
+    from vllm.utils import get_open_port
+else:
+    from vllm.utils.network_utils import get_open_port
 
 MODELS = [
     "Qwen/Qwen3-30B-A3B",
7 changes: 6 additions & 1 deletion tests/e2e/nightly/models/test_qwen3_32b.py
@@ -18,10 +18,15 @@
 
 import openai
 import pytest
-from vllm.utils import get_open_port
 
 from tests.e2e.conftest import RemoteOpenAIServer
 from tools.aisbench import run_aisbench_cases
+from vllm_ascend.utils import vllm_version_is
+
+if vllm_version_is("0.11.0"):
+    from vllm.utils import get_open_port
+else:
+    from vllm.utils.network_utils import get_open_port
 
 MODELS = [
     "Qwen/Qwen3-32B",
7 changes: 6 additions & 1 deletion tests/e2e/singlecard/test_camem.py
@@ -21,11 +21,16 @@
 
 import torch
 from vllm import SamplingParams
-from vllm.utils import GiB_bytes
 
 from tests.e2e.conftest import VllmRunner
 from tests.e2e.utils import fork_new_process_for_each_test
 from vllm_ascend.device_allocator.camem import CaMemAllocator
+from vllm_ascend.utils import vllm_version_is
+
+if vllm_version_is("0.11.0"):
+    from vllm.utils import GiB_bytes
+else:
+    from vllm.utils.mem_constants import GiB_bytes
 
 
 @fork_new_process_for_each_test
15 changes: 0 additions & 15 deletions tests/ut/core/test_schedule_config.py
@@ -78,21 +78,6 @@ def test_not_implemented_policy(self):
             str(context.exception),
         )
 
-    def test_not_implemented_send_delta_data(self):
-        with self.assertRaises(NotImplementedError) as context:
-            AscendSchedulerConfig.initialize_from_config(
-                self.basic_scheduler_config,
-                AscendSchedulerConfig(
-                    send_delta_data=True,
-                    max_num_batched_tokens=2048,
-                    max_model_len=2048,
-                ),
-            )
-        self.assertIn(
-            "currently AscendScheduler doesn't support send_delta_data",
-            str(context.exception),
-        )
-
     def test_no_override(self):
         ascend_config = AscendSchedulerConfig.initialize_from_config(
             self.basic_scheduler_config, {})
6 changes: 5 additions & 1 deletion tests/ut/core/test_scheduler.py
@@ -9,7 +9,6 @@
 from vllm.multimodal.inputs import (MultiModalFeatureSpec,
                                     MultiModalKwargsItem, PlaceholderRange)
 from vllm.sampling_params import SamplingParams
-from vllm.utils import sha256
 from vllm.v1.core.kv_cache_utils import (get_request_block_hasher,
                                          init_none_hash)
 from vllm.v1.core.sched.output import SchedulerOutput
@@ -24,6 +23,11 @@
 from vllm_ascend.core.scheduler_dynamic_batch import SchedulerDynamicBatch
 from vllm_ascend.utils import vllm_version_is
 
+if vllm_version_is("0.11.0"):
+    from vllm.utils import sha256
+else:
+    from vllm.utils.hashing import sha256
+
 EOS_TOKEN_ID = 50256
 MODEL = "Qwen3-0.6B"
 ENABLE_PREFIX_CACHING = None
8 changes: 7 additions & 1 deletion tests/ut/kv_connector/test_mooncake_connector.py
@@ -12,7 +12,13 @@
 
 import msgspec
 import zmq
-from vllm.utils import make_zmq_path
+
+from vllm_ascend.utils import vllm_version_is
+
+if vllm_version_is("0.11.0"):
+    from vllm.utils import make_zmq_path
+else:
+    from vllm.utils.network_utils import make_zmq_path
 
 fake_engine = types.ModuleType("mooncake.engine")
 fake_engine.TransferEngine = MagicMock() # type: ignore[attr-defined]
6 changes: 5 additions & 1 deletion tests/ut/kv_connector/utils.py
@@ -10,7 +10,6 @@
 from vllm import SamplingParams
 from vllm.config import (CacheConfig, DeviceConfig, KVTransferConfig,
                          ModelConfig, SchedulerConfig, VllmConfig)
-from vllm.utils import sha256
 from vllm.v1.core.kv_cache_utils import (get_request_block_hasher,
                                          init_none_hash)
 from vllm.v1.core.sched.scheduler import Scheduler
@@ -22,6 +21,11 @@
 
 from vllm_ascend.utils import vllm_version_is
 
+if vllm_version_is("0.11.0"):
+    from vllm.utils import sha256
+else:
+    from vllm.utils.hashing import sha256
+
 EOS_TOKEN_ID = 50256
 os.environ["VLLM_USE_V1"] = "1"
 
7 changes: 6 additions & 1 deletion tests/ut/model_loader/netloader/test_netloader.py
@@ -22,6 +22,7 @@
 from torch import nn
 
 from vllm_ascend.model_loader.netloader.netloader import ModelNetLoaderElastic
+from vllm_ascend.utils import vllm_version_is
 
 
 class DummyDeviceConfig:
@@ -173,7 +174,11 @@ def __exit__(self, a, b, c):
         "vllm_ascend.model_loader.netloader.netloader.process_weights_after_loading",
         lambda *a, **k: None)
     # patch get_ip
-    monkeypatch.setattr("vllm.utils.get_ip", lambda: "127.0.0.1")
+    if vllm_version_is("0.11.0"):
+        monkeypatch.setattr("vllm.utils.get_ip", lambda: "127.0.0.1")
+    else:
+        monkeypatch.setattr("vllm.utils.network_utils.get_ip",
+                            lambda: "127.0.0.1")
     # patch find_free_port
     monkeypatch.setattr(
         "vllm_ascend.model_loader.netloader.netloader.find_free_port",
7 changes: 6 additions & 1 deletion tests/ut/worker/test_input_batch.py
@@ -20,14 +20,19 @@
 
 import pytest
 import torch
 from vllm.sampling_params import SamplingParams
-from vllm.utils import make_tensor_with_pad
 from vllm.v1.pool.metadata import PoolingMetadata
 from vllm.v1.sample.logits_processor import LogitsProcessors
 from vllm.v1.sample.metadata import SamplingMetadata
 
+from vllm_ascend.utils import vllm_version_is
 from vllm_ascend.worker.block_table import BlockTable, MultiGroupBlockTable
 from vllm_ascend.worker.npu_input_batch import CachedRequestState, InputBatch
 
+if vllm_version_is("0.11.0"):
+    from vllm.utils import make_tensor_with_pad
+else:
+    from vllm.utils.torch_utils import make_tensor_with_pad
+
 VOCAB_SIZE = 1024
 NUM_OUTPUT_TOKENS = 20
 MAX_PROMPT_SIZE = 100
30 changes: 21 additions & 9 deletions tests/ut/worker/test_worker_v1.py
@@ -5,6 +5,7 @@
 from vllm.config import CacheConfig, ModelConfig, ParallelConfig, VllmConfig
 
 from tests.ut.base import TestBase
+from vllm_ascend.utils import vllm_version_is
 
 
 class TestNPUWorker(TestBase):
@@ -178,15 +179,26 @@ def test_init_npu_worker_with_custom_cache_dtype(
         # Create NPUWorker instance
         from vllm_ascend.worker.worker_v1 import NPUWorker
 
-        with patch("vllm.utils.STR_DTYPE_TO_TORCH_DTYPE",
-                   {"float32": torch.float32}):
-            worker = NPUWorker(
-                vllm_config=self.vllm_config_mock,
-                local_rank=self.local_rank,
-                rank=self.rank,
-                distributed_init_method=self.distributed_init_method,
-                is_driver_worker=self.is_driver_worker,
-            )
+        if vllm_version_is("0.11.0"):
+            with patch("vllm.utils.STR_DTYPE_TO_TORCH_DTYPE",
+                       {"float32": torch.float32}):
+                worker = NPUWorker(
+                    vllm_config=self.vllm_config_mock,
+                    local_rank=self.local_rank,
+                    rank=self.rank,
+                    distributed_init_method=self.distributed_init_method,
+                    is_driver_worker=self.is_driver_worker,
+                )
+        else:
+            with patch("vllm.utils.torch_utils.STR_DTYPE_TO_TORCH_DTYPE",
+                       {"float32": torch.float32}):
+                worker = NPUWorker(
+                    vllm_config=self.vllm_config_mock,
+                    local_rank=self.local_rank,
+                    rank=self.rank,
+                    distributed_init_method=self.distributed_init_method,
+                    is_driver_worker=self.is_driver_worker,
+                )
 
         # Verify cache_dtype is set to custom value
         self.assertEqual(worker.cache_dtype, torch.float32)
3 changes: 0 additions & 3 deletions vllm_ascend/core/schedule_config.py
@@ -99,9 +99,6 @@ def __post_init__(self, *args) -> None:
             raise NotImplementedError(
                 f"currently AscendScheduler only supports fcfs policy, got {self.policy}"
             )
-        if self.send_delta_data:
-            raise NotImplementedError(
-                "currently AscendScheduler doesn't support send_delta_data.")
         if getattr(self, "scheduler_delay_factor", 0) > 0:
             raise NotImplementedError(
                 "currently AscendScheduler doesn't support scheduler_delay_factor."
9 changes: 8 additions & 1 deletion vllm_ascend/distributed/cpu_offload_manager/metadata.py
@@ -9,11 +9,18 @@
 
 import vllm.envs as envs
 import zmq
 from vllm.config import KVTransferConfig, VllmConfig
-from vllm.utils import get_dtype_size, logger, make_zmq_socket
+from vllm.utils import logger
 from vllm.v1.kv_cache_interface import AttentionSpec
 
 from vllm_ascend.distributed.cpu_offload_manager.cpu_kv_cache_manager import \
     CPUKVCacheManager
+from vllm_ascend.utils import vllm_version_is
+
+if vllm_version_is("0.11.0"):
+    from vllm.utils import get_dtype_size, make_zmq_socket
+else:
+    from vllm.utils.network_utils import make_zmq_socket
+    from vllm.utils.torch_utils import get_dtype_size
 
 
 @dataclass
10 changes: 8 additions & 2 deletions vllm_ascend/distributed/llmdatadist_c_mgr_connector.py
@@ -25,19 +25,25 @@
 from vllm.distributed.parallel_state import (get_dcp_group, get_tp_group,
                                              get_world_group)
 from vllm.forward_context import ForwardContext
-from vllm.utils import get_ip, logger
+from vllm.utils import logger
 from vllm.v1.core.kv_cache_manager import KVCacheBlocks
 from vllm.v1.core.sched.output import SchedulerOutput
 from vllm.v1.request import Request, RequestStatus
 
 import vllm_ascend.envs as envs_ascend
 from vllm_ascend.utils import (AscendSocVersion, get_ascend_soc_version,
-                               prefill_context_parallel_enable)
+                               prefill_context_parallel_enable,
+                               vllm_version_is)
 
 if prefill_context_parallel_enable():
     from vllm.distributed.parallel_state import \
         get_prefill_context_model_parallel_rank
 
+if vllm_version_is("0.11.0"):
+    from vllm.utils import get_ip
+else:
+    from vllm.utils.network_utils import get_ip
+
 TORCH_DTYPE_TO_NPU_DTYPE = {
     torch.half: llm_datadist.DataType.DT_FLOAT16,
     torch.float16: llm_datadist.DataType.DT_FLOAT16,
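Note: every hunk in this PR applies the same compatibility pattern. Helpers that vLLM v0.11.0 still exports from the flat vllm.utils module live in topic-specific submodules on newer vLLM main, so imports are gated on the installed version. The sketch below is an illustration only, not part of the diff; it assumes vllm_ascend.utils.vllm_version_is("0.11.0") returns True exactly when vLLM v0.11.0 is installed, and the submodule paths are the ones shown in the hunks above.

# Minimal sketch of the version-gated import pattern used throughout this PR
# (illustration only; assumes vllm and vllm_ascend are installed).
from vllm_ascend.utils import vllm_version_is

if vllm_version_is("0.11.0"):
    # vLLM v0.11.0 still exports these helpers from the flat vllm.utils module.
    from vllm.utils import GiB_bytes, get_open_port, sha256
else:
    # Newer vLLM main splits vllm.utils into topic-specific submodules.
    from vllm.utils.hashing import sha256
    from vllm.utils.mem_constants import GiB_bytes
    from vllm.utils.network_utils import get_open_port

# Call sites stay unchanged regardless of which branch provided the symbols.
port = get_open_port()

Because only the import blocks are gated, the rest of each file is untouched and both vLLM versions remain supported from a single code path.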