diff --git a/.github/workflows/vllm_ascend_test.yaml b/.github/workflows/vllm_ascend_test.yaml index de085e4850..62c91de8cc 100644 --- a/.github/workflows/vllm_ascend_test.yaml +++ b/.github/workflows/vllm_ascend_test.yaml @@ -15,7 +15,7 @@ # This file is a part of the vllm-ascend project. # -name: 'e2e test / basic' +name: 'test' on: schedule: @@ -114,6 +114,56 @@ jobs: echo "::add-matcher::.github/workflows/matchers/mypy.json" tools/mypy.sh 1 ${{ matrix.python-version }} + ut: + needs: [lint] + name: unit test + if: ${{ needs.lint.result == 'success' }} + runs-on: ubuntu-latest + container: + image: m.daocloud.io/quay.io/ascend/cann:8.1.rc1-910b-ubuntu22.04-py3.10 + env: + VLLM_LOGGING_LEVEL: ERROR + VLLM_USE_MODELSCOPE: True + strategy: + matrix: + vllm_version: [main, v0.9.1] + steps: + - name: Install packages + run: | + apt-get update -y + apt-get install -y python3-pip git vim wget net-tools gcc g++ cmake libnuma-dev + + - name: Checkout vllm-project/vllm repo + uses: actions/checkout@v4 + with: + repository: vllm-project/vllm + ref: ${{ matrix.vllm_version }} + path: ./vllm-empty + + - name: Install vllm-project/vllm from source + working-directory: ./vllm-empty + run: | + VLLM_TARGET_DEVICE=empty python3 -m pip install . --extra-index https://download.pytorch.org/whl/cpu/ + python3 -m pip uninstall -y triton + + - name: Checkout vllm-project/vllm-ascend repo + uses: actions/checkout@v4 + + - name: Install vllm-project/vllm-ascend + run: | + export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/Ascend/ascend-toolkit/latest/x86_64-linux/devlib + python3 -m pip install -r requirements-dev.txt --extra-index https://download.pytorch.org/whl/cpu/ + python3 -m pip install -v . --extra-index https://download.pytorch.org/whl/cpu/ + + - name: Run unit test for V1 Engine + env: + VLLM_USE_V1: 1 + VLLM_WORKER_MULTIPROC_METHOD: spawn + TORCH_DEVICE_BACKEND_AUTOLOAD: 0 + run: | + export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/Ascend/ascend-toolkit/latest/x86_64-linux/devlib + pytest -sv tests/ut + e2e: needs: [lint] if: ${{ needs.lint.result == 'success' }} @@ -122,7 +172,7 @@ jobs: matrix: os: [linux-arm64-npu-1] vllm_version: [main, v0.9.1] - name: vLLM Ascend test + name: singlecard e2e test runs-on: ${{ matrix.os }} container: # TODO(yikun): Remove m.daocloud.io prefix when infra proxy ready @@ -168,53 +218,47 @@ jobs: pip install -r requirements-dev.txt pip install -v -e . - - name: Run vllm-project/vllm-ascend test for V1 Engine + - name: Run e2e test for V1 Engine env: VLLM_USE_V1: 1 VLLM_WORKER_MULTIPROC_METHOD: spawn VLLM_USE_MODELSCOPE: True run: | - pytest -sv tests/singlecard/test_offline_inference.py + pytest -sv tests/e2e/singlecard/test_offline_inference.py # TODO: switch hf to modelscope VLLM_USE_MODELSCOPE=False HF_ENDPOINT=https://hf-mirror.com \ - pytest -sv tests/singlecard/test_ilama_lora.py + pytest -sv tests/e2e/singlecard/test_ilama_lora.py # TODO(sss): guided decoding doesn't work, fix it later - # pytest -sv tests/singlecard/test_guided_decoding.py - # test_ascend_config.py should be ran separately because it will regenerate the global config many times. 
- pytest -sv tests/singlecard/test_ascend_config.py - pytest -sv tests/singlecard/test_camem.py - pytest -sv tests/singlecard/ \ - --ignore=tests/singlecard/test_offline_inference.py \ - --ignore=tests/singlecard/test_ilama_lora.py \ - --ignore=tests/singlecard/test_guided_decoding.py \ - --ignore=tests/singlecard/test_ascend_config.py \ - --ignore=tests/singlecard/test_camem.py + # pytest -sv tests/e2e/singlecard/test_guided_decoding.py + pytest -sv tests/e2e/singlecard/test_camem.py + pytest -sv tests/e2e/singlecard/ \ + --ignore=tests/e2e/singlecard/test_offline_inference.py \ + --ignore=tests/e2e/singlecard/test_ilama_lora.py \ + --ignore=tests/e2e/singlecard/test_guided_decoding.py \ + --ignore=tests/e2e/singlecard/test_camem.py - - name: Run vllm-project/vllm-ascend test on V0 engine + - name: Run e2e test on V0 engine if: ${{ github.event_name == 'schedule' }} env: VLLM_USE_V1: 0 VLLM_USE_MODELSCOPE: True run: | - pytest -sv tests/singlecard/test_offline_inference.py + pytest -sv tests/e2e/singlecard/test_offline_inference.py # TODO: switch hf to modelscope VLLM_USE_MODELSCOPE=False HF_ENDPOINT=https://hf-mirror.com \ - pytest -sv tests/singlecard/test_ilama_lora.py + pytest -sv tests/e2e/singlecard/test_ilama_lora.py # guided decoding doesn't work, fix it later - # pytest -sv tests/singlecard/test_guided_decoding.py - pytest -sv tests/singlecard/test_camem.py - # test_ascend_config.py should be ran separately because it will regenerate the global config many times. - pytest -sv tests/singlecard/test_ascend_config.py - pytest -sv tests/singlecard/test_prompt_embedding.py - pytest -sv tests/singlecard/ \ - --ignore=tests/singlecard/test_offline_inference.py \ - --ignore=tests/singlecard/test_ilama_lora.py \ - --ignore=tests/singlecard/test_guided_decoding.py \ - --ignore=tests/singlecard/test_camem.py \ - --ignore=tests/singlecard/test_ascend_config.py \ - --ignore=tests/singlecard/test_prompt_embedding.py \ - --ignore=tests/singlecard/core/test_ascend_scheduler.py \ - --ignore=tests/singlecard/core/test_ascend_scheduler_e2e.py + # pytest -sv tests/e2e/singlecard/test_guided_decoding.py + pytest -sv tests/e2e/singlecard/test_camem.py + pytest -sv tests/e2e/singlecard/test_prompt_embedding.py + pytest -sv tests/e2e/singlecard/ \ + --ignore=tests/e2e/singlecard/test_offline_inference.py \ + --ignore=tests/e2e/singlecard/test_ilama_lora.py \ + --ignore=tests/e2e/singlecard/test_guided_decoding.py \ + --ignore=tests/e2e/singlecard/test_camem.py \ + --ignore=tests/e2e/singlecard/test_prompt_embedding.py \ + --ignore=tests/e2e/singlecard/core/test_ascend_scheduler.py \ + --ignore=tests/e2e/singlecard/core/test_ascend_scheduler_e2e.py e2e-4-cards: needs: [e2e] @@ -224,7 +268,7 @@ jobs: matrix: os: [linux-arm64-npu-4] vllm_version: [main, v0.9.1] - name: vLLM Ascend test + name: multicard e2e test runs-on: ${{ matrix.os }} container: # TODO(yikun): Remove m.daocloud.io prefix when infra proxy ready @@ -279,14 +323,14 @@ jobs: run: | # TODO: switch hf to modelscope VLLM_USE_MODELSCOPE=False HF_ENDPOINT=https://hf-mirror.com \ - pytest -sv tests/multicard/test_ilama_lora_tp2.py - # Fixme: run VLLM_USE_MODELSCOPE=True pytest -sv tests/multicard/test_offline_inference_distributed.py will raise error. + pytest -sv tests/e2e/multicard/test_ilama_lora_tp2.py + # Fixme: run VLLM_USE_MODELSCOPE=True pytest -sv tests/e2e/multicard/test_offline_inference_distributed.py will raise error. # To avoid oom, we need to run the test in a single process. 
- pytest -sv tests/multicard/test_offline_inference_distributed.py::test_models_distributed_QwQ - pytest -sv tests/multicard/test_offline_inference_distributed.py::test_models_distributed_DeepSeek - pytest -sv tests/multicard/test_offline_inference_distributed.py::test_models_distributed_topk - pytest -sv tests/multicard/test_offline_inference_distributed.py::test_models_distributed_DeepSeek_W8A8 - pytest -sv tests/multicard/ --ignore=tests/multicard/test_ilama_lora_tp2.py --ignore=tests/multicard/test_offline_inference_distributed.py + pytest -sv tests/e2e/multicard/test_offline_inference_distributed.py::test_models_distributed_QwQ + pytest -sv tests/e2e/multicard/test_offline_inference_distributed.py::test_models_distributed_DeepSeek + pytest -sv tests/e2e/multicard/test_offline_inference_distributed.py::test_models_distributed_topk + pytest -sv tests/e2e/multicard/test_offline_inference_distributed.py::test_models_distributed_DeepSeek_W8A8 + pytest -sv tests/e2e/multicard/ --ignore=tests/e2e/multicard/test_ilama_lora_tp2.py --ignore=tests/e2e/multicard/test_offline_inference_distributed.py - name: Run vllm-project/vllm-ascend test on V0 engine if: ${{ github.event_name == 'schedule' }} @@ -296,11 +340,11 @@ jobs: run: | # TODO: switch hf to modelscope VLLM_USE_MODELSCOPE=False HF_ENDPOINT=https://hf-mirror.com \ - pytest -sv tests/multicard/test_ilama_lora_tp2.py - # Fixme: run VLLM_USE_MODELSCOPE=True pytest -sv tests/multicard/test_offline_inference_distributed.py will raise error. + pytest -sv tests/e2e/multicard/test_ilama_lora_tp2.py + # Fixme: run VLLM_USE_MODELSCOPE=True pytest -sv tests/e2e/multicard/test_offline_inference_distributed.py will raise error. # To avoid oom, we need to run the test in a single process. - pytest -sv tests/multicard/test_offline_inference_distributed.py::test_models_distributed_QwQ - pytest -sv tests/multicard/test_offline_inference_distributed.py::test_models_distributed_DeepSeek - pytest -sv tests/multicard/test_offline_inference_distributed.py::test_models_distributed_topk - pytest -sv tests/multicard/test_offline_inference_distributed.py::test_models_distributed_DeepSeek_W8A8 - pytest -sv tests/multicard/ --ignore=tests/multicard/test_ilama_lora_tp2.py --ignore=tests/multicard/test_offline_inference_distributed.py + pytest -sv tests/e2e/multicard/test_offline_inference_distributed.py::test_models_distributed_QwQ + pytest -sv tests/e2e/multicard/test_offline_inference_distributed.py::test_models_distributed_DeepSeek + pytest -sv tests/e2e/multicard/test_offline_inference_distributed.py::test_models_distributed_topk + pytest -sv tests/e2e/multicard/test_offline_inference_distributed.py::test_models_distributed_DeepSeek_W8A8 + pytest -sv tests/e2e/multicard/ --ignore=tests/e2e/multicard/test_ilama_lora_tp2.py --ignore=tests/e2e/multicard/test_offline_inference_distributed.py diff --git a/.github/workflows/vllm_ascend_test_long_term.yaml b/.github/workflows/vllm_ascend_test_long_term.yaml index c17200a8d1..46123590ab 100644 --- a/.github/workflows/vllm_ascend_test_long_term.yaml +++ b/.github/workflows/vllm_ascend_test_long_term.yaml @@ -96,12 +96,12 @@ jobs: run: | if [[ "${{ matrix.os }}" == "linux-arm64-npu-1" ]]; then # spec decode test - VLLM_USE_MODELSCOPE=True pytest -sv tests/long_term/spec_decode/e2e/test_v1_mtp_correctness.py + VLLM_USE_MODELSCOPE=True pytest -sv tests/e2e/long_term/spec_decode/e2e/test_v1_mtp_correctness.py # TODO: revert me when test_v1_spec_decode.py::test_ngram_correctness is fixed - # VLLM_USE_MODELSCOPE=True pytest -sv 
tests/long_term/spec_decode/e2e/test_v1_spec_decode.py - VLLM_USE_MODELSCOPE=True pytest -sv tests/long_term/spec_decode/e2e/test_mtp_correctness.py # it needs a clean process - pytest -sv tests/long_term/spec_decode --ignore=tests/long_term/spec_decode/e2e/test_mtp_correctness.py --ignore=tests/long_term/spec_decode/e2e/test_v1_spec_decode.py --ignore=tests/long_term/spec_decode/e2e/test_v1_mtp_correctness.py - pytest -sv tests/long_term/test_accuracy.py + # VLLM_USE_MODELSCOPE=True pytest -sv tests/e2e/long_term/spec_decode/e2e/test_v1_spec_decode.py + VLLM_USE_MODELSCOPE=True pytest -sv tests/e2e/long_term/spec_decode/e2e/test_mtp_correctness.py # it needs a clean process + pytest -sv tests/e2e/long_term/spec_decode --ignore=tests/e2e/long_term/spec_decode/e2e/test_mtp_correctness.py --ignore=tests/e2e/long_term/spec_decode/e2e/test_v1_spec_decode.py --ignore=tests/e2e/long_term/spec_decode/e2e/test_v1_mtp_correctness.py + pytest -sv tests/e2e/long_term/test_accuracy.py else - VLLM_USE_MODELSCOPE=True pytest -sv tests/long_term/test_deepseek_v2_lite_tp2_accuracy.py + VLLM_USE_MODELSCOPE=True pytest -sv tests/e2e/long_term/test_deepseek_v2_lite_tp2_accuracy.py fi diff --git a/format.sh b/format.sh index d8a04069a8..e49ac915b9 100755 --- a/format.sh +++ b/format.sh @@ -273,7 +273,7 @@ echo 'vllm-ascend isort: Done' # Clang-format section # Exclude some files for formatting because they are vendored CLANG_FORMAT_EXCLUDES=( - 'csrc/kernels/pos_encoding_kernels.cpp' 'csrc/kernels/advance_step.cpp' 'csrc/torch_binding.cpp' 'csrc/ops.h' + 'csrc/kernels/pos_encoding_kernels.cpp' 'csrc/kernels/advance_step.cpp' 'csrc/kernels/get_masked_input_and_mask_kernel.cpp' 'csrc/torch_binding.cpp' 'csrc/ops.h' ) # Format specified files with clang-format diff --git a/tests/long_term/spec_decode/__init__.py b/tests/e2e/long_term/spec_decode/__init__.py similarity index 100% rename from tests/long_term/spec_decode/__init__.py rename to tests/e2e/long_term/spec_decode/__init__.py diff --git a/tests/long_term/spec_decode/conftest.py b/tests/e2e/long_term/spec_decode/conftest.py similarity index 100% rename from tests/long_term/spec_decode/conftest.py rename to tests/e2e/long_term/spec_decode/conftest.py diff --git a/tests/long_term/spec_decode/e2e/__init__.py b/tests/e2e/long_term/spec_decode/e2e/__init__.py similarity index 100% rename from tests/long_term/spec_decode/e2e/__init__.py rename to tests/e2e/long_term/spec_decode/e2e/__init__.py diff --git a/tests/long_term/spec_decode/e2e/conftest.py b/tests/e2e/long_term/spec_decode/e2e/conftest.py similarity index 100% rename from tests/long_term/spec_decode/e2e/conftest.py rename to tests/e2e/long_term/spec_decode/e2e/conftest.py diff --git a/tests/long_term/spec_decode/e2e/test_medusa_correctness.py b/tests/e2e/long_term/spec_decode/e2e/test_medusa_correctness.py similarity index 99% rename from tests/long_term/spec_decode/e2e/test_medusa_correctness.py rename to tests/e2e/long_term/spec_decode/e2e/test_medusa_correctness.py index c88ee76fa9..e0c2efd7af 100644 --- a/tests/long_term/spec_decode/e2e/test_medusa_correctness.py +++ b/tests/e2e/long_term/spec_decode/e2e/test_medusa_correctness.py @@ -41,9 +41,9 @@ import pytest -from tests.long_term.spec_decode.e2e.conftest import \ +from tests.e2e.long_term.spec_decode.e2e.conftest import \ run_equality_correctness_test -from tests.long_term.spec_decode.utils import maybe_enable_chunked_prefill +from tests.e2e.long_term.spec_decode.utils import maybe_enable_chunked_prefill # main model # lmsys/vicuna-7b-v1.3 was 
to be used but it's causing diff --git a/tests/long_term/spec_decode/e2e/test_mlp_correctness.py b/tests/e2e/long_term/spec_decode/e2e/test_mlp_correctness.py similarity index 99% rename from tests/long_term/spec_decode/e2e/test_mlp_correctness.py rename to tests/e2e/long_term/spec_decode/e2e/test_mlp_correctness.py index ee4e7ccd7d..56db617755 100644 --- a/tests/long_term/spec_decode/e2e/test_mlp_correctness.py +++ b/tests/e2e/long_term/spec_decode/e2e/test_mlp_correctness.py @@ -41,9 +41,9 @@ from vllm.model_executor.layers.vocab_parallel_embedding import \ pad_vocab_size # noqa: F401 -from tests.long_term.spec_decode.e2e.conftest import \ +from tests.e2e.long_term.spec_decode.e2e.conftest import \ run_equality_correctness_test -from tests.long_term.spec_decode.utils import maybe_enable_chunked_prefill +from tests.e2e.long_term.spec_decode.utils import maybe_enable_chunked_prefill # main model MAIN_MODEL = "JackFram/llama-160m" diff --git a/tests/long_term/spec_decode/e2e/test_mtp_correctness.py b/tests/e2e/long_term/spec_decode/e2e/test_mtp_correctness.py similarity index 100% rename from tests/long_term/spec_decode/e2e/test_mtp_correctness.py rename to tests/e2e/long_term/spec_decode/e2e/test_mtp_correctness.py diff --git a/tests/long_term/spec_decode/e2e/test_ngram_correctness.py b/tests/e2e/long_term/spec_decode/e2e/test_ngram_correctness.py similarity index 99% rename from tests/long_term/spec_decode/e2e/test_ngram_correctness.py rename to tests/e2e/long_term/spec_decode/e2e/test_ngram_correctness.py index 55454732d5..b99187fe37 100644 --- a/tests/long_term/spec_decode/e2e/test_ngram_correctness.py +++ b/tests/e2e/long_term/spec_decode/e2e/test_ngram_correctness.py @@ -44,9 +44,9 @@ import pytest -from tests.long_term.spec_decode.e2e.conftest import \ +from tests.e2e.long_term.spec_decode.e2e.conftest import \ run_equality_correctness_test -from tests.long_term.spec_decode.utils import maybe_enable_chunked_prefill +from tests.e2e.long_term.spec_decode.utils import maybe_enable_chunked_prefill @pytest.mark.parametrize( diff --git a/tests/long_term/spec_decode/e2e/test_v1_mtp_correctness.py b/tests/e2e/long_term/spec_decode/e2e/test_v1_mtp_correctness.py similarity index 100% rename from tests/long_term/spec_decode/e2e/test_v1_mtp_correctness.py rename to tests/e2e/long_term/spec_decode/e2e/test_v1_mtp_correctness.py diff --git a/tests/long_term/spec_decode/e2e/test_v1_spec_decode.py b/tests/e2e/long_term/spec_decode/e2e/test_v1_spec_decode.py similarity index 100% rename from tests/long_term/spec_decode/e2e/test_v1_spec_decode.py rename to tests/e2e/long_term/spec_decode/e2e/test_v1_spec_decode.py diff --git a/tests/long_term/spec_decode/test_dynamic_spec_decode.py b/tests/e2e/long_term/spec_decode/test_dynamic_spec_decode.py similarity index 96% rename from tests/long_term/spec_decode/test_dynamic_spec_decode.py rename to tests/e2e/long_term/spec_decode/test_dynamic_spec_decode.py index f9656f5f4d..8e9480ea26 100644 --- a/tests/long_term/spec_decode/test_dynamic_spec_decode.py +++ b/tests/e2e/long_term/spec_decode/test_dynamic_spec_decode.py @@ -27,8 +27,8 @@ from vllm.spec_decode.spec_decode_worker import SpecDecodeWorker from vllm.spec_decode.top1_proposer import Top1Proposer -from tests.long_term.spec_decode.test_utils import mock_spec_decode_sampler -from tests.long_term.spec_decode.utils import create_batch, mock_worker +from tests.e2e.long_term.spec_decode.test_utils import mock_spec_decode_sampler +from tests.e2e.long_term.spec_decode.utils import create_batch, mock_worker 
@pytest.mark.parametrize('queue_size', [4]) diff --git a/tests/long_term/spec_decode/test_multi_step_worker.py b/tests/e2e/long_term/spec_decode/test_multi_step_worker.py similarity index 99% rename from tests/long_term/spec_decode/test_multi_step_worker.py rename to tests/e2e/long_term/spec_decode/test_multi_step_worker.py index b9c2e0cc3d..b3017a987e 100644 --- a/tests/long_term/spec_decode/test_multi_step_worker.py +++ b/tests/e2e/long_term/spec_decode/test_multi_step_worker.py @@ -29,7 +29,7 @@ from vllm.spec_decode.multi_step_worker import MultiStepWorker from vllm.spec_decode.top1_proposer import Top1Proposer -from tests.long_term.spec_decode.utils import ( +from tests.e2e.long_term.spec_decode.utils import ( assert_logprobs_dict_allclose, create_batch, create_seq_group_metadata_from_prompts, create_worker, patch_execute_model_with_seeds, zero_kv_cache) diff --git a/tests/long_term/spec_decode/test_ngram_worker.py b/tests/e2e/long_term/spec_decode/test_ngram_worker.py similarity index 99% rename from tests/long_term/spec_decode/test_ngram_worker.py rename to tests/e2e/long_term/spec_decode/test_ngram_worker.py index 1ad02bb1de..078a4d2bed 100644 --- a/tests/long_term/spec_decode/test_ngram_worker.py +++ b/tests/e2e/long_term/spec_decode/test_ngram_worker.py @@ -22,7 +22,7 @@ from vllm.spec_decode.ngram_worker import NGramWorker from vllm.spec_decode.top1_proposer import Top1Proposer -from tests.long_term.spec_decode.utils import ( +from tests.e2e.long_term.spec_decode.utils import ( create_seq_group_metadata_from_prompts, create_worker) diff --git a/tests/long_term/spec_decode/test_spec_decode_worker.py b/tests/e2e/long_term/spec_decode/test_spec_decode_worker.py similarity index 99% rename from tests/long_term/spec_decode/test_spec_decode_worker.py rename to tests/e2e/long_term/spec_decode/test_spec_decode_worker.py index b5abd1e123..94a1bcf1e7 100644 --- a/tests/long_term/spec_decode/test_spec_decode_worker.py +++ b/tests/e2e/long_term/spec_decode/test_spec_decode_worker.py @@ -35,10 +35,10 @@ from vllm.spec_decode.spec_decode_worker import (SpecDecodeWorker, split_num_cache_blocks_evenly) -from tests.long_term.spec_decode.test_utils import mock_spec_decode_sampler -from tests.long_term.spec_decode.utils import (create_batch, - create_sampler_output_list, - create_worker, mock_worker) +from tests.e2e.long_term.spec_decode.test_utils import mock_spec_decode_sampler +from tests.e2e.long_term.spec_decode.utils import (create_batch, + create_sampler_output_list, + create_worker, mock_worker) from vllm_ascend.worker.draft_model_runner import TP1DraftModelRunner from vllm_ascend.worker.worker import NPUWorker diff --git a/tests/long_term/spec_decode/test_utils.py b/tests/e2e/long_term/spec_decode/test_utils.py similarity index 100% rename from tests/long_term/spec_decode/test_utils.py rename to tests/e2e/long_term/spec_decode/test_utils.py diff --git a/tests/long_term/spec_decode/utils.py b/tests/e2e/long_term/spec_decode/utils.py similarity index 100% rename from tests/long_term/spec_decode/utils.py rename to tests/e2e/long_term/spec_decode/utils.py diff --git a/tests/long_term/test_accuracy.py b/tests/e2e/long_term/test_accuracy.py similarity index 100% rename from tests/long_term/test_accuracy.py rename to tests/e2e/long_term/test_accuracy.py diff --git a/tests/long_term/test_deepseek_v2_lite_tp2_accuracy.py b/tests/e2e/long_term/test_deepseek_v2_lite_tp2_accuracy.py similarity index 100% rename from tests/long_term/test_deepseek_v2_lite_tp2_accuracy.py rename to 
tests/e2e/long_term/test_deepseek_v2_lite_tp2_accuracy.py diff --git a/tests/multicard/test_dynamic_npugraph_batchsize.py b/tests/e2e/multicard/test_dynamic_npugraph_batchsize.py similarity index 100% rename from tests/multicard/test_dynamic_npugraph_batchsize.py rename to tests/e2e/multicard/test_dynamic_npugraph_batchsize.py diff --git a/tests/multicard/test_ilama_lora_tp2.py b/tests/e2e/multicard/test_ilama_lora_tp2.py similarity index 83% rename from tests/multicard/test_ilama_lora_tp2.py rename to tests/e2e/multicard/test_ilama_lora_tp2.py index e61ce250c8..e743141b7a 100644 --- a/tests/multicard/test_ilama_lora_tp2.py +++ b/tests/e2e/multicard/test_ilama_lora_tp2.py @@ -1,8 +1,8 @@ import pytest from tests.conftest import VllmRunner -from tests.singlecard.test_ilama_lora import (EXPECTED_LORA_OUTPUT, MODEL_PATH, - do_sample) +from tests.e2e.singlecard.test_ilama_lora import (EXPECTED_LORA_OUTPUT, + MODEL_PATH, do_sample) @pytest.mark.parametrize("distributed_executor_backend", ["mp"]) diff --git a/tests/multicard/test_offline_inference_distributed.py b/tests/e2e/multicard/test_offline_inference_distributed.py similarity index 100% rename from tests/multicard/test_offline_inference_distributed.py rename to tests/e2e/multicard/test_offline_inference_distributed.py diff --git a/tests/multicard/test_pyhccl_distributed.py b/tests/e2e/multicard/test_pyhccl_distributed.py similarity index 100% rename from tests/multicard/test_pyhccl_distributed.py rename to tests/e2e/multicard/test_pyhccl_distributed.py diff --git a/tests/multicard/test_torchair_graph_mode.py b/tests/e2e/multicard/test_torchair_graph_mode.py similarity index 100% rename from tests/multicard/test_torchair_graph_mode.py rename to tests/e2e/multicard/test_torchair_graph_mode.py diff --git a/tests/singlecard/__init__.py b/tests/e2e/singlecard/__init__.py similarity index 100% rename from tests/singlecard/__init__.py rename to tests/e2e/singlecard/__init__.py diff --git a/tests/singlecard/compile/__init__.py b/tests/e2e/singlecard/compile/__init__.py similarity index 100% rename from tests/singlecard/compile/__init__.py rename to tests/e2e/singlecard/compile/__init__.py diff --git a/tests/singlecard/compile/test_simple.py b/tests/e2e/singlecard/compile/test_simple.py similarity index 100% rename from tests/singlecard/compile/test_simple.py rename to tests/e2e/singlecard/compile/test_simple.py diff --git a/tests/singlecard/core/__init__.py b/tests/e2e/singlecard/core/__init__.py similarity index 100% rename from tests/singlecard/core/__init__.py rename to tests/e2e/singlecard/core/__init__.py diff --git a/tests/singlecard/core/test_ascend_scheduler.py b/tests/e2e/singlecard/core/test_ascend_scheduler.py similarity index 100% rename from tests/singlecard/core/test_ascend_scheduler.py rename to tests/e2e/singlecard/core/test_ascend_scheduler.py diff --git a/tests/singlecard/core/test_ascend_scheduler_e2e.py b/tests/e2e/singlecard/core/test_ascend_scheduler_e2e.py similarity index 100% rename from tests/singlecard/core/test_ascend_scheduler_e2e.py rename to tests/e2e/singlecard/core/test_ascend_scheduler_e2e.py diff --git a/tests/singlecard/ops/__init__.py b/tests/e2e/singlecard/ops/__init__.py similarity index 100% rename from tests/singlecard/ops/__init__.py rename to tests/e2e/singlecard/ops/__init__.py diff --git a/tests/singlecard/ops/test_fused_moe.py b/tests/e2e/singlecard/ops/test_fused_moe.py similarity index 100% rename from tests/singlecard/ops/test_fused_moe.py rename to tests/e2e/singlecard/ops/test_fused_moe.py diff 
--git a/tests/singlecard/ops/test_multi_step.py b/tests/e2e/singlecard/ops/test_multi_step.py similarity index 100% rename from tests/singlecard/ops/test_multi_step.py rename to tests/e2e/singlecard/ops/test_multi_step.py diff --git a/tests/singlecard/ops/test_rotary_embedding.py b/tests/e2e/singlecard/ops/test_rotary_embedding.py similarity index 100% rename from tests/singlecard/ops/test_rotary_embedding.py rename to tests/e2e/singlecard/ops/test_rotary_embedding.py diff --git a/tests/ops/test_vocabparallelembedding.py b/tests/e2e/singlecard/ops/test_vocabparallelembedding.py similarity index 100% rename from tests/ops/test_vocabparallelembedding.py rename to tests/e2e/singlecard/ops/test_vocabparallelembedding.py diff --git a/tests/singlecard/sample/__init__.py b/tests/e2e/singlecard/sample/__init__.py similarity index 100% rename from tests/singlecard/sample/__init__.py rename to tests/e2e/singlecard/sample/__init__.py diff --git a/tests/singlecard/sample/test_rejection_sampler.py b/tests/e2e/singlecard/sample/test_rejection_sampler.py similarity index 100% rename from tests/singlecard/sample/test_rejection_sampler.py rename to tests/e2e/singlecard/sample/test_rejection_sampler.py diff --git a/tests/singlecard/test_aclgraph.py b/tests/e2e/singlecard/test_aclgraph.py similarity index 100% rename from tests/singlecard/test_aclgraph.py rename to tests/e2e/singlecard/test_aclgraph.py diff --git a/tests/singlecard/test_camem.py b/tests/e2e/singlecard/test_camem.py similarity index 100% rename from tests/singlecard/test_camem.py rename to tests/e2e/singlecard/test_camem.py diff --git a/tests/singlecard/test_chunked.py b/tests/e2e/singlecard/test_chunked.py similarity index 100% rename from tests/singlecard/test_chunked.py rename to tests/e2e/singlecard/test_chunked.py diff --git a/tests/singlecard/test_guided_decoding.py b/tests/e2e/singlecard/test_guided_decoding.py similarity index 100% rename from tests/singlecard/test_guided_decoding.py rename to tests/e2e/singlecard/test_guided_decoding.py diff --git a/tests/singlecard/test_ilama_lora.py b/tests/e2e/singlecard/test_ilama_lora.py similarity index 100% rename from tests/singlecard/test_ilama_lora.py rename to tests/e2e/singlecard/test_ilama_lora.py diff --git a/tests/singlecard/test_offline_inference.py b/tests/e2e/singlecard/test_offline_inference.py similarity index 100% rename from tests/singlecard/test_offline_inference.py rename to tests/e2e/singlecard/test_offline_inference.py diff --git a/tests/singlecard/test_profile_execute_duration.py b/tests/e2e/singlecard/test_profile_execute_duration.py similarity index 100% rename from tests/singlecard/test_profile_execute_duration.py rename to tests/e2e/singlecard/test_profile_execute_duration.py diff --git a/tests/singlecard/test_prompt_embedding.py b/tests/e2e/singlecard/test_prompt_embedding.py similarity index 100% rename from tests/singlecard/test_prompt_embedding.py rename to tests/e2e/singlecard/test_prompt_embedding.py diff --git a/tests/singlecard/test_pyhccl.py b/tests/e2e/singlecard/test_pyhccl.py similarity index 100% rename from tests/singlecard/test_pyhccl.py rename to tests/e2e/singlecard/test_pyhccl.py diff --git a/tests/singlecard/test_sampler.py b/tests/e2e/singlecard/test_sampler.py similarity index 100% rename from tests/singlecard/test_sampler.py rename to tests/e2e/singlecard/test_sampler.py diff --git a/tests/singlecard/test_scheduler.py b/tests/e2e/singlecard/test_scheduler.py similarity index 100% rename from tests/singlecard/test_scheduler.py rename to 
tests/e2e/singlecard/test_scheduler.py diff --git a/tests/singlecard/test_ascend_config.py b/tests/singlecard/test_ascend_config.py deleted file mode 100644 index 63484d4a01..0000000000 --- a/tests/singlecard/test_ascend_config.py +++ /dev/null @@ -1,191 +0,0 @@ -# -# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved. -# This file is a part of the vllm-ascend project. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -import os - -import pytest - -from tests.conftest import VllmRunner -from vllm_ascend.ascend_config import (clear_ascend_config, get_ascend_config, - init_ascend_config) - - -def _clean_up_ascend_config(func): - - def wrapper(*args, **kwargs): - clear_ascend_config() - func(*args, **kwargs) - clear_ascend_config() - - return wrapper - - -@_clean_up_ascend_config -def test_run_without_ascend_config(): - with VllmRunner("facebook/opt-125m"): - ascend_config = get_ascend_config() - - assert not ascend_config.torchair_graph_config.enabled - assert not ascend_config.torchair_graph_config.use_cached_graph - assert ascend_config.torchair_graph_config.graph_batch_sizes == [] - assert not ascend_config.torchair_graph_config.graph_batch_sizes_init - assert not ascend_config.ascend_scheduler_config.enabled - assert ascend_config.expert_tensor_parallel_size == 0 - - -@_clean_up_ascend_config -def test_run_with_ascend_config(): - if os.getenv("VLLM_USE_V1") == "0": - pytest.skip("graph only works on v1") - - input_additional_config_1 = { - "torchair_graph_config": { - # torchair graph only works with deepseek. The e2e test should be added - # in multicard test with deepseek models. 
- "enabled": False, - "use_cached_graph": True, - "graph_batch_sizes": [1, 2, 4, 8], - "graph_batch_sizes_init": False, - "enable_multistream_moe": True, - "enable_multistream_mla": True, - }, - "ascend_scheduler_config": { - "enabled": True, - "enable_chunked_prefill": True, - }, - "expert_tensor_parallel_size": 1 - } - - # check passed with eager mode - with VllmRunner("facebook/opt-125m", - enforce_eager=True, - additional_config=input_additional_config_1): - ascend_config = get_ascend_config() - - assert not ascend_config.torchair_graph_config.enabled - assert ascend_config.torchair_graph_config.use_cached_graph - assert ascend_config.torchair_graph_config.graph_batch_sizes == [ - 1, 2, 4, 8 - ] - assert not ascend_config.torchair_graph_config.graph_batch_sizes_init - assert ascend_config.torchair_graph_config.enable_multistream_mla - assert ascend_config.torchair_graph_config.enable_multistream_moe - assert ascend_config.ascend_scheduler_config.enabled - assert ascend_config.ascend_scheduler_config.enable_chunked_prefill - assert ascend_config.expert_tensor_parallel_size == 1 - - -@_clean_up_ascend_config -def test_ascend_config_init_error(): - # ascend_config should be initialized first - with pytest.raises(RuntimeError): - _ = get_ascend_config() - - -@_clean_up_ascend_config -def test_ascend_config_load_error(): - if os.getenv("VLLM_USE_V1") == "0": - pytest.skip("graph only works on v1") - # graph_batch_sizes should be list. - with pytest.raises(TypeError): - input_additional_config_fake_1 = { - "torchair_graph_config": { - "graph_batch_sizes": "fake_size", - }, - } - with VllmRunner("facebook/opt-125m", - additional_config=input_additional_config_fake_1): - pass - - # graph_batch_sizes_init should not be True when graph_batch_sizes is not empty. - with pytest.raises(ValueError): - input_additional_config_fake_2 = { - "torchair_graph_config": { - "graph_batch_sizes": [1, 2, 4, 8], - "graph_batch_sizes_init": True, - }, - } - with VllmRunner("facebook/opt-125m", - additional_config=input_additional_config_fake_2): - pass - - # torchair graph only works with deepseek. 
- with pytest.raises(NotImplementedError): - input_additional_config_fake_2 = { - "torchair_graph_config": { - "enabled": True, - }, - } - with VllmRunner("facebook/opt-125m", - enforce_eager=False, - additional_config=input_additional_config_fake_2): - pass - - # torchair graph should not be enabled with eager mode - with pytest.raises(RuntimeError): - input_additional_config_fake_3 = { - "torchair_graph_config": { - "enabled": True, - }, - } - with VllmRunner("facebook/opt-125m", - enforce_eager=True, - additional_config=input_additional_config_fake_3): - pass - - -@_clean_up_ascend_config -def test_check_ascend_config_v0(): - if os.getenv("VLLM_USE_V1") == "1": - pytest.skip("graph only works on v1, this is the test for v0") - with pytest.raises(NotImplementedError): - input_additional_config_fake_1 = { - "torchair_graph_config": { - "enabled": True, - }, - } - with VllmRunner("facebook/opt-125m", - additional_config=input_additional_config_fake_1): - pass - - -@_clean_up_ascend_config -def test_ascend_config_refresh(): - from vllm.config import get_current_vllm_config - vllm_config = get_current_vllm_config() - # set additional_config with none - init_ascend_config(vllm_config) - - input_additional_config = { - "torchair_graph_config": { - "enabled": False, - "use_cached_graph": True, - "graph_batch_sizes": [1, 2, 4, 8], - "graph_batch_sizes_init": False, - }, - "refresh": True, - } - - # refresh ascend config - with VllmRunner("facebook/opt-125m", - additional_config=input_additional_config): - ascend_config = get_ascend_config() - - assert not ascend_config.torchair_graph_config.enabled - assert ascend_config.torchair_graph_config.use_cached_graph - assert ascend_config.torchair_graph_config.graph_batch_sizes == [ - 1, 2, 4, 8 - ] - assert not ascend_config.torchair_graph_config.graph_batch_sizes_init diff --git a/tests/ut/fake_weight/config.json b/tests/ut/fake_weight/config.json new file mode 100644 index 0000000000..b3fb716a30 --- /dev/null +++ b/tests/ut/fake_weight/config.json @@ -0,0 +1,28 @@ +{ + "_name_or_path": "facebook/opt-125m", + "activation_dropout": 0.0, + "activation_function": "relu", + "architectures": [ + "OPTForCausalLM" + ], + "attention_dropout": 0.0, + "bos_token_id": 2, + "do_layer_norm_before": true, + "dropout": 0.1, + "eos_token_id": 2, + "ffn_dim": 3072, + "hidden_size": 768, + "init_std": 0.02, + "layerdrop": 0.0, + "max_position_embeddings": 2048, + "model_type": "opt", + "num_attention_heads": 12, + "num_hidden_layers": 12, + "pad_token_id": 1, + "prefix": "", + "torch_dtype": "float16", + "transformers_version": "4.21.0.dev0", + "use_cache": true, + "vocab_size": 50272, + "word_embed_proj_dim": 768 +} diff --git a/tests/ut/test_ascend_config.py b/tests/ut/test_ascend_config.py new file mode 100644 index 0000000000..5ec4dd72cc --- /dev/null +++ b/tests/ut/test_ascend_config.py @@ -0,0 +1,244 @@ +import os +import unittest +from unittest import mock + +from transformers import PretrainedConfig +from vllm.config import ModelConfig, VllmConfig + +from vllm_ascend.ascend_config import (check_ascend_config, + clear_ascend_config, get_ascend_config, + init_ascend_config) + + +class TestAscendConfig(unittest.TestCase): + + @staticmethod + def _clean_up_ascend_config(func): + + def wrapper(*args, **kwargs): + clear_ascend_config() + func(*args, **kwargs) + clear_ascend_config() + + return wrapper + + @_clean_up_ascend_config + def test_init_ascend_config_without_additional_config(self): + test_vllm_config = VllmConfig() + # No additional config given, check the 
default value here. + ascend_config = init_ascend_config(test_vllm_config) + self.assertEqual(ascend_config.expert_tensor_parallel_size, 0) + self.assertIsNone(ascend_config.expert_map_path) + + torchair_graph_config = ascend_config.torchair_graph_config + self.assertFalse(torchair_graph_config.enabled) + self.assertFalse(torchair_graph_config.use_cached_graph) + self.assertEqual(torchair_graph_config.graph_batch_sizes, []) + self.assertFalse(torchair_graph_config.graph_batch_sizes_init) + self.assertFalse(torchair_graph_config.enable_multistream_mla) + self.assertFalse(torchair_graph_config.enable_multistream_moe) + self.assertTrue(torchair_graph_config.enable_view_optimize) + self.assertFalse(torchair_graph_config.enable_kv_nz) + + ascend_scheduler_config = ascend_config.ascend_scheduler_config + self.assertFalse(ascend_scheduler_config.enabled) + + @_clean_up_ascend_config + def test_init_ascend_config_with_additional_config(self): + test_vllm_config = VllmConfig() + test_vllm_config.additional_config = { + "torchair_graph_config": { + "enabled": True, + "use_cached_graph": True, + "graph_batch_sizes": [1, 2, 4], + "graph_batch_sizes_init": False, + "enable_multistream_mla": True, + "enable_multistream_moe": True, + "enable_view_optimize": True, + "enable_kv_nz": True + }, + "ascend_scheduler_config": { + "enabled": True + }, + "expert_tensor_parallel_size": 1, + "expert_map_path": "test_expert_map_path", + "refresh": True + } + ascend_config = init_ascend_config(test_vllm_config) + self.assertEqual(ascend_config.expert_tensor_parallel_size, 1) + self.assertEqual(ascend_config.expert_map_path, "test_expert_map_path") + + torchair_graph_config = ascend_config.torchair_graph_config + self.assertTrue(torchair_graph_config.enabled) + self.assertTrue(torchair_graph_config.use_cached_graph) + self.assertEqual(torchair_graph_config.graph_batch_sizes, [1, 2, 4]) + self.assertFalse(torchair_graph_config.graph_batch_sizes_init) + self.assertTrue(torchair_graph_config.enable_multistream_mla) + self.assertTrue(torchair_graph_config.enable_multistream_moe) + self.assertTrue(torchair_graph_config.enable_view_optimize) + self.assertTrue(torchair_graph_config.enable_kv_nz) + + ascend_scheduler_config = ascend_config.ascend_scheduler_config + self.assertTrue(ascend_scheduler_config.enabled) + + @_clean_up_ascend_config + def test_init_ascend_config_with_refresh(self): + test_vllm_config = VllmConfig() + ascend_config = init_ascend_config(test_vllm_config) + self.assertFalse(ascend_config.torchair_graph_config.enabled) + + test_vllm_config.additional_config = { + "torchair_graph_config": { + "enabled": True, + }, + } + ascend_config = init_ascend_config(test_vllm_config) + self.assertFalse(ascend_config.torchair_graph_config.enabled) + + test_vllm_config.additional_config = { + "torchair_graph_config": { + "enabled": True, + }, + "refresh": True, + } + ascend_config = init_ascend_config(test_vllm_config) + self.assertTrue(ascend_config.torchair_graph_config.enabled) + + @_clean_up_ascend_config + def test_init_ascend_config_with_wrong_input(self): + test_vllm_config = VllmConfig() + test_vllm_config.additional_config = { + "torchair_graph_config": { + "enabled": True, + "graph_batch_sizes": "fake_size", + }, + "refresh": True, + } + with self.assertRaises(TypeError): + init_ascend_config(test_vllm_config) + + test_vllm_config.additional_config = { + "torchair_graph_config": { + "enabled": False, + "graph_batch_sizes": [1, 2, 4, 8], + "graph_batch_sizes_init": True, + }, + "refresh": True, + } + with 
self.assertRaises(ValueError): + init_ascend_config(test_vllm_config) + + @_clean_up_ascend_config + def test_get_ascend_config(self): + test_vllm_config = VllmConfig() + ascend_config = init_ascend_config(test_vllm_config) + self.assertEqual(get_ascend_config(), ascend_config) + + @_clean_up_ascend_config + def test_get_ascend_config_without_init(self): + with self.assertRaises(RuntimeError): + get_ascend_config() + + @_clean_up_ascend_config + def test_clear_ascend_config(self): + test_vllm_config = VllmConfig() + ascend_config = init_ascend_config(test_vllm_config) + self.assertEqual(get_ascend_config(), ascend_config) + clear_ascend_config() + with self.assertRaises(RuntimeError): + get_ascend_config() + + @_clean_up_ascend_config + def test_check_ascend_config_pass(self): + test_vllm_config = VllmConfig() + init_ascend_config(test_vllm_config) + check_ascend_config(test_vllm_config, False) + + # For V1 engine + with mock.patch.dict(os.environ, {"VLLM_USE_V1": "1"}): + test_vllm_config.additional_config = { + "torchair_graph_config": { + "enabled": True, + }, + "refresh": True + } + init_ascend_config(test_vllm_config) + check_ascend_config(test_vllm_config, False) + + test_vllm_config.additional_config = { + "torchair_graph_config": { + "enabled": False, + }, + "refresh": True + } + init_ascend_config(test_vllm_config) + check_ascend_config(test_vllm_config, False) + + @_clean_up_ascend_config + def test_check_ascend_config_wrong_case(self): + test_vllm_config = VllmConfig() + # For V0 engine + with mock.patch.dict(os.environ, {"VLLM_USE_V1": "0"}): + with self.assertRaises(NotImplementedError): + test_vllm_config.additional_config = { + "torchair_graph_config": { + "enabled": True, + }, + "refresh": True + } + init_ascend_config(test_vllm_config) + check_ascend_config(test_vllm_config, False) + with self.assertRaises(NotImplementedError): + test_vllm_config.additional_config = { + "ascend_scheduler_config": { + "enabled": True, + }, + "refresh": True + } + init_ascend_config(test_vllm_config) + check_ascend_config(test_vllm_config, True) + # For V1 engine + with mock.patch.dict(os.environ, {"VLLM_USE_V1": "1"}): + # torchair + eager mode + with self.assertRaises(RuntimeError): + test_vllm_config.additional_config = { + "torchair_graph_config": { + "enabled": True, + }, + "refresh": True + } + init_ascend_config(test_vllm_config) + enforce_eager = True + check_ascend_config(test_vllm_config, enforce_eager) + # torchair + non deepseek model + with self.assertRaises(NotImplementedError): + test_vllm_config.additional_config = { + "torchair_graph_config": { + "enabled": True, + }, + "refresh": True + } + model_path = os.path.join(os.path.dirname(__file__), + "fake_weight") + fake_model_config = ModelConfig(model=model_path) + fake_model_config.hf_config = PretrainedConfig() + fake_model_config.hf_config.model_type = "llama" + test_vllm_config.model_config = fake_model_config + init_ascend_config(test_vllm_config) + check_ascend_config(test_vllm_config, False) + # aclgraph + deepseek model + with self.assertRaises(NotImplementedError): + test_vllm_config.additional_config = { + "torchair_graph_config": { + "enabled": False, + }, + "refresh": True + } + model_path = os.path.join(os.path.dirname(__file__), + "fake_weight") + fake_model_config = ModelConfig(model=model_path) + fake_model_config.hf_config = PretrainedConfig() + fake_model_config.hf_config.model_type = "deepseek" + test_vllm_config.model_config = fake_model_config + init_ascend_config(test_vllm_config) + 
check_ascend_config(test_vllm_config, False)
diff --git a/tests/ut/worker/test_worker_v1.py b/tests/ut/worker/test_worker_v1.py
new file mode 100644
index 0000000000..fdffa2a0fd
--- /dev/null
+++ b/tests/ut/worker/test_worker_v1.py
@@ -0,0 +1 @@
+# placeholder
diff --git a/vllm_ascend/ascend_config.py b/vllm_ascend/ascend_config.py
index defa7fd3d8..d8b87c6952 100644
--- a/vllm_ascend/ascend_config.py
+++ b/vllm_ascend/ascend_config.py
@@ -138,12 +138,6 @@ def check_ascend_config(vllm_config, enforce_eager):
     else:
         # torchair_graph case
         if ascend_config.torchair_graph_config.enabled:
-            # torchair_graph is not supported for V1 without mla currently.
-            if envs.VLLM_MLA_DISABLE:
-                logger.warning(
-                    "Torchair graph mode is still experimental and not supported for V1 without mla currently, "
-                    "it has been disabled automatically.")
-                ascend_config.torchair_graph_config.enabled = False
             # torchair_graph is supported for deepseek model only currently.
             if vllm_config.model_config:
                 model_type = vllm_config.model_config.hf_config.model_type
diff --git a/vllm_ascend/platform.py b/vllm_ascend/platform.py
index 912e375f70..9194ae9a16 100644
--- a/vllm_ascend/platform.py
+++ b/vllm_ascend/platform.py
@@ -164,6 +164,14 @@ def check_and_update_config(cls, vllm_config: VllmConfig) -> None:
         else:
             enforce_eager = getattr(model_config, "enforce_eager", False)
 
+        if ascend_config.torchair_graph_config.enabled and envs.VLLM_MLA_DISABLE:
+            # torchair_graph is not supported for V1 without mla currently.
+            logger.warning(
+                "Torchair graph mode is still experimental and not supported for V1 without mla currently, "
+                "falling back to eager mode.")
+            ascend_config.torchair_graph_config.enabled = False
+            enforce_eager = True
+
         check_ascend_config(vllm_config, enforce_eager)
 
         if enforce_eager or compilation_config.level == CompilationLevel.NO_COMPILATION:
diff --git a/vllm_ascend/utils.py b/vllm_ascend/utils.py
index f41dab4b94..e29290e73a 100644
--- a/vllm_ascend/utils.py
+++ b/vllm_ascend/utils.py
@@ -24,6 +24,7 @@
 from typing import TYPE_CHECKING, List, Tuple
 
 import torch
+import torch_npu  # noqa: F401
 import torchair  # type: ignore[import]  # noqa: F401
 from packaging.version import InvalidVersion, Version
 from torch_npu.npu.streams import Event
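
For reference, the new `ut` job can be reproduced off-CI roughly as follows. This is a minimal sketch, not part of the patch: it assumes the same cann:8.1.rc1-910b-ubuntu22.04-py3.10 container used by the job (which runs on ubuntu-latest, so no NPU is required), and the vllm-empty / vllm-ascend directory names simply mirror the job's checkout paths; adjust them to your layout.

# Match the job's container-level env.
export VLLM_USE_MODELSCOPE=True VLLM_LOGGING_LEVEL=ERROR

# Build vllm without device kernels, as the "Install vllm-project/vllm from source" step does.
cd vllm-empty
VLLM_TARGET_DEVICE=empty python3 -m pip install . --extra-index https://download.pytorch.org/whl/cpu/
python3 -m pip uninstall -y triton

# Install vllm-ascend with the CANN devlib on the library path.
cd ../vllm-ascend
export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/Ascend/ascend-toolkit/latest/x86_64-linux/devlib
python3 -m pip install -r requirements-dev.txt --extra-index https://download.pytorch.org/whl/cpu/
python3 -m pip install -v . --extra-index https://download.pytorch.org/whl/cpu/

# TORCH_DEVICE_BACKEND_AUTOLOAD=0 keeps torch from auto-loading the torch_npu backend,
# which is presumably what lets tests/ut run on a host without Ascend hardware.
VLLM_USE_V1=1 VLLM_WORKER_MULTIPROC_METHOD=spawn TORCH_DEVICE_BACKEND_AUTOLOAD=0 pytest -sv tests/ut

Note that the job's matrix builds against both vllm main and v0.9.1, so a local run should check out whichever vllm ref it intends to test.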