138 changes: 91 additions & 47 deletions .github/workflows/vllm_ascend_test.yaml
@@ -15,7 +15,7 @@
# This file is a part of the vllm-ascend project.
#

name: 'e2e test / basic'
name: 'test'

on:
schedule:
@@ -114,6 +114,56 @@ jobs:
echo "::add-matcher::.github/workflows/matchers/mypy.json"
tools/mypy.sh 1 ${{ matrix.python-version }}

ut:
needs: [lint]
name: unit test
if: ${{ needs.lint.result == 'success' }}
runs-on: ubuntu-latest
container:
image: m.daocloud.io/quay.io/ascend/cann:8.1.rc1-910b-ubuntu22.04-py3.10
env:
VLLM_LOGGING_LEVEL: ERROR
VLLM_USE_MODELSCOPE: True
strategy:
matrix:
vllm_version: [main, v0.9.1]
steps:
- name: Install packages
run: |
apt-get update -y
apt-get install -y python3-pip git vim wget net-tools gcc g++ cmake libnuma-dev

- name: Checkout vllm-project/vllm repo
uses: actions/checkout@v4
with:
repository: vllm-project/vllm
ref: ${{ matrix.vllm_version }}
path: ./vllm-empty

- name: Install vllm-project/vllm from source
working-directory: ./vllm-empty
run: |
VLLM_TARGET_DEVICE=empty python3 -m pip install . --extra-index https://download.pytorch.org/whl/cpu/
python3 -m pip uninstall -y triton

- name: Checkout vllm-project/vllm-ascend repo
uses: actions/checkout@v4

- name: Install vllm-project/vllm-ascend
run: |
export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/Ascend/ascend-toolkit/latest/x86_64-linux/devlib
python3 -m pip install -r requirements-dev.txt --extra-index https://download.pytorch.org/whl/cpu/
python3 -m pip install -v . --extra-index https://download.pytorch.org/whl/cpu/

- name: Run unit test for V1 Engine
env:
VLLM_USE_V1: 1
VLLM_WORKER_MULTIPROC_METHOD: spawn
TORCH_DEVICE_BACKEND_AUTOLOAD: 0
run: |
export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/Ascend/ascend-toolkit/latest/x86_64-linux/devlib
pytest -sv tests/ut
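
The ut job above runs the unit-test suite on a plain ubuntu-latest runner against both the vllm main branch and v0.9.1. A minimal sketch of reproducing it locally, assuming the same CANN 8.1.rc1 image (or an equivalent toolkit installed under /usr/local/Ascend) and the two repositories checked out as in the job:

export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/Ascend/ascend-toolkit/latest/x86_64-linux/devlib
# Build vllm with no device backend, then install vllm-ascend into the same environment.
VLLM_TARGET_DEVICE=empty python3 -m pip install ./vllm-empty --extra-index https://download.pytorch.org/whl/cpu/
python3 -m pip uninstall -y triton
python3 -m pip install -r requirements-dev.txt --extra-index https://download.pytorch.org/whl/cpu/
python3 -m pip install -v . --extra-index https://download.pytorch.org/whl/cpu/
# Mirror the job's environment variables and run the V1 unit tests.
VLLM_USE_V1=1 VLLM_WORKER_MULTIPROC_METHOD=spawn TORCH_DEVICE_BACKEND_AUTOLOAD=0 pytest -sv tests/ut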

e2e:
needs: [lint]
if: ${{ needs.lint.result == 'success' }}
@@ -122,7 +172,7 @@ jobs:
matrix:
os: [linux-arm64-npu-1]
vllm_version: [main, v0.9.1]
name: vLLM Ascend test
name: singlecard e2e test
runs-on: ${{ matrix.os }}
container:
# TODO(yikun): Remove m.daocloud.io prefix when infra proxy ready
@@ -168,53 +218,47 @@ jobs:
pip install -r requirements-dev.txt
pip install -v -e .

- name: Run vllm-project/vllm-ascend test for V1 Engine
- name: Run e2e test for V1 Engine
env:
VLLM_USE_V1: 1
VLLM_WORKER_MULTIPROC_METHOD: spawn
VLLM_USE_MODELSCOPE: True
run: |
pytest -sv tests/singlecard/test_offline_inference.py
pytest -sv tests/e2e/singlecard/test_offline_inference.py
# TODO: switch hf to modelscope
VLLM_USE_MODELSCOPE=False HF_ENDPOINT=https://hf-mirror.com \
pytest -sv tests/singlecard/test_ilama_lora.py
pytest -sv tests/e2e/singlecard/test_ilama_lora.py
# TODO(sss): guided decoding doesn't work, fix it later
# pytest -sv tests/singlecard/test_guided_decoding.py
# test_ascend_config.py should be ran separately because it will regenerate the global config many times.
pytest -sv tests/singlecard/test_ascend_config.py
pytest -sv tests/singlecard/test_camem.py
pytest -sv tests/singlecard/ \
--ignore=tests/singlecard/test_offline_inference.py \
--ignore=tests/singlecard/test_ilama_lora.py \
--ignore=tests/singlecard/test_guided_decoding.py \
--ignore=tests/singlecard/test_ascend_config.py \
--ignore=tests/singlecard/test_camem.py
# pytest -sv tests/e2e/singlecard/test_guided_decoding.py
pytest -sv tests/e2e/singlecard/test_camem.py
pytest -sv tests/e2e/singlecard/ \
--ignore=tests/e2e/singlecard/test_offline_inference.py \
--ignore=tests/e2e/singlecard/test_ilama_lora.py \
--ignore=tests/e2e/singlecard/test_guided_decoding.py \
--ignore=tests/e2e/singlecard/test_camem.py

- name: Run vllm-project/vllm-ascend test on V0 engine
- name: Run e2e test on V0 engine
if: ${{ github.event_name == 'schedule' }}
env:
VLLM_USE_V1: 0
VLLM_USE_MODELSCOPE: True
run: |
pytest -sv tests/singlecard/test_offline_inference.py
pytest -sv tests/e2e/singlecard/test_offline_inference.py
# TODO: switch hf to modelscope
VLLM_USE_MODELSCOPE=False HF_ENDPOINT=https://hf-mirror.com \
pytest -sv tests/singlecard/test_ilama_lora.py
pytest -sv tests/e2e/singlecard/test_ilama_lora.py
# guided decoding doesn't work, fix it later
# pytest -sv tests/singlecard/test_guided_decoding.py
pytest -sv tests/singlecard/test_camem.py
# test_ascend_config.py should be ran separately because it will regenerate the global config many times.
pytest -sv tests/singlecard/test_ascend_config.py
pytest -sv tests/singlecard/test_prompt_embedding.py
pytest -sv tests/singlecard/ \
--ignore=tests/singlecard/test_offline_inference.py \
--ignore=tests/singlecard/test_ilama_lora.py \
--ignore=tests/singlecard/test_guided_decoding.py \
--ignore=tests/singlecard/test_camem.py \
--ignore=tests/singlecard/test_ascend_config.py \
--ignore=tests/singlecard/test_prompt_embedding.py \
--ignore=tests/singlecard/core/test_ascend_scheduler.py \
--ignore=tests/singlecard/core/test_ascend_scheduler_e2e.py
# pytest -sv tests/e2e/singlecard/test_guided_decoding.py
pytest -sv tests/e2e/singlecard/test_camem.py
pytest -sv tests/e2e/singlecard/test_prompt_embedding.py
pytest -sv tests/e2e/singlecard/ \
--ignore=tests/e2e/singlecard/test_offline_inference.py \
--ignore=tests/e2e/singlecard/test_ilama_lora.py \
--ignore=tests/e2e/singlecard/test_guided_decoding.py \
--ignore=tests/e2e/singlecard/test_camem.py \
--ignore=tests/e2e/singlecard/test_prompt_embedding.py \
--ignore=tests/e2e/singlecard/core/test_ascend_scheduler.py \
--ignore=tests/e2e/singlecard/core/test_ascend_scheduler_e2e.py

e2e-4-cards:
needs: [e2e]
@@ -224,7 +268,7 @@
matrix:
os: [linux-arm64-npu-4]
vllm_version: [main, v0.9.1]
name: vLLM Ascend test
name: multicard e2e test
runs-on: ${{ matrix.os }}
container:
# TODO(yikun): Remove m.daocloud.io prefix when infra proxy ready
@@ -279,14 +323,14 @@
run: |
# TODO: switch hf to modelscope
VLLM_USE_MODELSCOPE=False HF_ENDPOINT=https://hf-mirror.com \
pytest -sv tests/multicard/test_ilama_lora_tp2.py
# Fixme: run VLLM_USE_MODELSCOPE=True pytest -sv tests/multicard/test_offline_inference_distributed.py will raise error.
pytest -sv tests/e2e/multicard/test_ilama_lora_tp2.py
# Fixme: run VLLM_USE_MODELSCOPE=True pytest -sv tests/e2e/multicard/test_offline_inference_distributed.py will raise error.
# To avoid oom, we need to run the test in a single process.
pytest -sv tests/multicard/test_offline_inference_distributed.py::test_models_distributed_QwQ
pytest -sv tests/multicard/test_offline_inference_distributed.py::test_models_distributed_DeepSeek
pytest -sv tests/multicard/test_offline_inference_distributed.py::test_models_distributed_topk
pytest -sv tests/multicard/test_offline_inference_distributed.py::test_models_distributed_DeepSeek_W8A8
pytest -sv tests/multicard/ --ignore=tests/multicard/test_ilama_lora_tp2.py --ignore=tests/multicard/test_offline_inference_distributed.py
pytest -sv tests/e2e/multicard/test_offline_inference_distributed.py::test_models_distributed_QwQ
pytest -sv tests/e2e/multicard/test_offline_inference_distributed.py::test_models_distributed_DeepSeek
pytest -sv tests/e2e/multicard/test_offline_inference_distributed.py::test_models_distributed_topk
pytest -sv tests/e2e/multicard/test_offline_inference_distributed.py::test_models_distributed_DeepSeek_W8A8
pytest -sv tests/e2e/multicard/ --ignore=tests/e2e/multicard/test_ilama_lora_tp2.py --ignore=tests/e2e/multicard/test_offline_inference_distributed.py

- name: Run vllm-project/vllm-ascend test on V0 engine
if: ${{ github.event_name == 'schedule' }}
@@ -296,11 +340,11 @@
run: |
# TODO: switch hf to modelscope
VLLM_USE_MODELSCOPE=False HF_ENDPOINT=https://hf-mirror.com \
pytest -sv tests/multicard/test_ilama_lora_tp2.py
# Fixme: run VLLM_USE_MODELSCOPE=True pytest -sv tests/multicard/test_offline_inference_distributed.py will raise error.
pytest -sv tests/e2e/multicard/test_ilama_lora_tp2.py
# Fixme: run VLLM_USE_MODELSCOPE=True pytest -sv tests/e2e/multicard/test_offline_inference_distributed.py will raise error.
# To avoid oom, we need to run the test in a single process.
pytest -sv tests/multicard/test_offline_inference_distributed.py::test_models_distributed_QwQ
pytest -sv tests/multicard/test_offline_inference_distributed.py::test_models_distributed_DeepSeek
pytest -sv tests/multicard/test_offline_inference_distributed.py::test_models_distributed_topk
pytest -sv tests/multicard/test_offline_inference_distributed.py::test_models_distributed_DeepSeek_W8A8
pytest -sv tests/multicard/ --ignore=tests/multicard/test_ilama_lora_tp2.py --ignore=tests/multicard/test_offline_inference_distributed.py
pytest -sv tests/e2e/multicard/test_offline_inference_distributed.py::test_models_distributed_QwQ
pytest -sv tests/e2e/multicard/test_offline_inference_distributed.py::test_models_distributed_DeepSeek
pytest -sv tests/e2e/multicard/test_offline_inference_distributed.py::test_models_distributed_topk
pytest -sv tests/e2e/multicard/test_offline_inference_distributed.py::test_models_distributed_DeepSeek_W8A8
pytest -sv tests/e2e/multicard/ --ignore=tests/e2e/multicard/test_ilama_lora_tp2.py --ignore=tests/e2e/multicard/test_offline_inference_distributed.py
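
With the suites relocated under tests/e2e/, an individual file can be exercised locally with the same environment toggles the workflow uses; a short sketch, assuming an Ascend NPU host matching the CI container:

# V1 engine, singlecard suite (paths and variables taken from the updated workflow):
VLLM_USE_V1=1 VLLM_WORKER_MULTIPROC_METHOD=spawn VLLM_USE_MODELSCOPE=True \
  pytest -sv tests/e2e/singlecard/test_offline_inference.py
# The scheduled V0 run of the same file:
VLLM_USE_V1=0 VLLM_USE_MODELSCOPE=True \
  pytest -sv tests/e2e/singlecard/test_offline_inference.py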
12 changes: 6 additions & 6 deletions .github/workflows/vllm_ascend_test_long_term.yaml
@@ -96,12 +96,12 @@ jobs:
run: |
if [[ "${{ matrix.os }}" == "linux-arm64-npu-1" ]]; then
# spec decode test
VLLM_USE_MODELSCOPE=True pytest -sv tests/long_term/spec_decode/e2e/test_v1_mtp_correctness.py
VLLM_USE_MODELSCOPE=True pytest -sv tests/e2e/long_term/spec_decode/e2e/test_v1_mtp_correctness.py
# TODO: revert me when test_v1_spec_decode.py::test_ngram_correctness is fixed
# VLLM_USE_MODELSCOPE=True pytest -sv tests/long_term/spec_decode/e2e/test_v1_spec_decode.py
VLLM_USE_MODELSCOPE=True pytest -sv tests/long_term/spec_decode/e2e/test_mtp_correctness.py # it needs a clean process
pytest -sv tests/long_term/spec_decode --ignore=tests/long_term/spec_decode/e2e/test_mtp_correctness.py --ignore=tests/long_term/spec_decode/e2e/test_v1_spec_decode.py --ignore=tests/long_term/spec_decode/e2e/test_v1_mtp_correctness.py
pytest -sv tests/long_term/test_accuracy.py
# VLLM_USE_MODELSCOPE=True pytest -sv tests/e2e/long_term/spec_decode/e2e/test_v1_spec_decode.py
VLLM_USE_MODELSCOPE=True pytest -sv tests/e2e/long_term/spec_decode/e2e/test_mtp_correctness.py # it needs a clean process
pytest -sv tests/e2e/long_term/spec_decode --ignore=tests/e2e/long_term/spec_decode/e2e/test_mtp_correctness.py --ignore=tests/e2e/long_term/spec_decode/e2e/test_v1_spec_decode.py --ignore=tests/e2e/long_term/spec_decode/e2e/test_v1_mtp_correctness.py
pytest -sv tests/e2e/long_term/test_accuracy.py
else
VLLM_USE_MODELSCOPE=True pytest -sv tests/long_term/test_deepseek_v2_lite_tp2_accuracy.py
VLLM_USE_MODELSCOPE=True pytest -sv tests/e2e/long_term/test_deepseek_v2_lite_tp2_accuracy.py
fi
2 changes: 1 addition & 1 deletion format.sh
@@ -273,7 +273,7 @@ echo 'vllm-ascend isort: Done'
# Clang-format section
# Exclude some files for formatting because they are vendored
CLANG_FORMAT_EXCLUDES=(
'csrc/kernels/pos_encoding_kernels.cpp' 'csrc/kernels/advance_step.cpp' 'csrc/torch_binding.cpp' 'csrc/ops.h'
'csrc/kernels/pos_encoding_kernels.cpp' 'csrc/kernels/advance_step.cpp' 'csrc/kernels/get_masked_input_and_mask_kernel.cpp' 'csrc/torch_binding.cpp' 'csrc/ops.h'
)

# Format specified files with clang-format
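The format.sh hunk only adds the new vendored kernel file to CLANG_FORMAT_EXCLUDES; the logic that consumes the array is truncated here. As a rough, hypothetical illustration of how such an exclude list is commonly applied (not necessarily the exact code in format.sh):

# Hypothetical sketch: format C/C++ sources under csrc/ while skipping the excluded, vendored files.
find csrc -name '*.cpp' -o -name '*.h' \
  | grep -vFf <(printf '%s\n' "${CLANG_FORMAT_EXCLUDES[@]}") \
  | xargs -r clang-format -i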
@@ -41,9 +41,9 @@

import pytest

from tests.long_term.spec_decode.e2e.conftest import \
from tests.e2e.long_term.spec_decode.e2e.conftest import \
run_equality_correctness_test
from tests.long_term.spec_decode.utils import maybe_enable_chunked_prefill
from tests.e2e.long_term.spec_decode.utils import maybe_enable_chunked_prefill

# main model
# lmsys/vicuna-7b-v1.3 was to be used but it's causing
@@ -41,9 +41,9 @@
from vllm.model_executor.layers.vocab_parallel_embedding import \
pad_vocab_size # noqa: F401

from tests.long_term.spec_decode.e2e.conftest import \
from tests.e2e.long_term.spec_decode.e2e.conftest import \
run_equality_correctness_test
from tests.long_term.spec_decode.utils import maybe_enable_chunked_prefill
from tests.e2e.long_term.spec_decode.utils import maybe_enable_chunked_prefill

# main model
MAIN_MODEL = "JackFram/llama-160m"
@@ -44,9 +44,9 @@

import pytest

from tests.long_term.spec_decode.e2e.conftest import \
from tests.e2e.long_term.spec_decode.e2e.conftest import \
run_equality_correctness_test
from tests.long_term.spec_decode.utils import maybe_enable_chunked_prefill
from tests.e2e.long_term.spec_decode.utils import maybe_enable_chunked_prefill


@pytest.mark.parametrize(
@@ -27,8 +27,8 @@
from vllm.spec_decode.spec_decode_worker import SpecDecodeWorker
from vllm.spec_decode.top1_proposer import Top1Proposer

from tests.long_term.spec_decode.test_utils import mock_spec_decode_sampler
from tests.long_term.spec_decode.utils import create_batch, mock_worker
from tests.e2e.long_term.spec_decode.test_utils import mock_spec_decode_sampler
from tests.e2e.long_term.spec_decode.utils import create_batch, mock_worker


@pytest.mark.parametrize('queue_size', [4])
@@ -29,7 +29,7 @@
from vllm.spec_decode.multi_step_worker import MultiStepWorker
from vllm.spec_decode.top1_proposer import Top1Proposer

from tests.long_term.spec_decode.utils import (
from tests.e2e.long_term.spec_decode.utils import (
assert_logprobs_dict_allclose, create_batch,
create_seq_group_metadata_from_prompts, create_worker,
patch_execute_model_with_seeds, zero_kv_cache)
@@ -22,7 +22,7 @@
from vllm.spec_decode.ngram_worker import NGramWorker
from vllm.spec_decode.top1_proposer import Top1Proposer

from tests.long_term.spec_decode.utils import (
from tests.e2e.long_term.spec_decode.utils import (
create_seq_group_metadata_from_prompts, create_worker)


@@ -35,10 +35,10 @@
from vllm.spec_decode.spec_decode_worker import (SpecDecodeWorker,
split_num_cache_blocks_evenly)

from tests.long_term.spec_decode.test_utils import mock_spec_decode_sampler
from tests.long_term.spec_decode.utils import (create_batch,
create_sampler_output_list,
create_worker, mock_worker)
from tests.e2e.long_term.spec_decode.test_utils import mock_spec_decode_sampler
from tests.e2e.long_term.spec_decode.utils import (create_batch,
create_sampler_output_list,
create_worker, mock_worker)
from vllm_ascend.worker.draft_model_runner import TP1DraftModelRunner
from vllm_ascend.worker.worker import NPUWorker

File renamed without changes.
@@ -1,8 +1,8 @@
import pytest

from tests.conftest import VllmRunner
from tests.singlecard.test_ilama_lora import (EXPECTED_LORA_OUTPUT, MODEL_PATH,
do_sample)
from tests.e2e.singlecard.test_ilama_lora import (EXPECTED_LORA_OUTPUT,
MODEL_PATH, do_sample)


@pytest.mark.parametrize("distributed_executor_backend", ["mp"])
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
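
The Python import updates above follow mechanically from moving tests/singlecard, tests/multicard, and tests/long_term under tests/e2e/. A hypothetical sketch of how such a move and rewrite could be scripted (not necessarily how this change was produced):

# Hypothetical: relocate the suites, then rewrite the dotted module paths used in imports.
mkdir -p tests/e2e
git mv tests/singlecard tests/multicard tests/long_term tests/e2e/
grep -rlE 'tests\.(singlecard|multicard|long_term)' tests \
  | xargs sed -i -E 's/tests\.(singlecard|multicard|long_term)/tests.e2e.\1/g'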