
Commit e95cc8b

add v0.8.3 test
Signed-off-by: wangli <wangli858794774@gmail.com>
1 parent: c55b224

2 files changed: 197 additions, 67 deletions


.github/workflows/vllm_ascend_test_main.yaml renamed to .github/workflows/vllm_ascend_test.yaml

Lines changed: 168 additions & 24 deletions
@@ -25,7 +25,7 @@ on:
     paths:
       - '*.txt'
       - '**/*.py'
-      - '.github/workflows/vllm_ascend_test_main.yaml'
+      - '.github/workflows/vllm_ascend_test.yaml'
       - '!docs/**'
       - 'pytest.ini'

@@ -72,7 +72,6 @@ jobs:
         uses: actions/checkout@v4
         with:
           repository: vllm-project/vllm
-          fetch-depth: 0
           path: ./vllm-empty

       - name: Install vllm-project/vllm from source
@@ -100,45 +99,195 @@ jobs:

           pip install /root/.cache/pta/torch_npu-2.5.1.dev20250320-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl

-      - name: Run vllm-project/vllm-ascend test
+      - name: Run vllm-project/vllm-ascend test on V0 engine
         env:
           VLLM_USE_V1: 0
           HF_ENDPOINT: https://hf-mirror.com
         run: |
           VLLM_USE_V1=0 pytest -sv -m 'not multinpu' tests

+      - name: Run vllm-project/vllm-ascend test for V1 Engine
+        env:
+          VLLM_USE_V1: 1
+          VLLM_WORKER_MULTIPROC_METHOD: spawn
+          HF_ENDPOINT: https://hf-mirror.com
+        run: |
+          pytest -sv -m 'not multinpu' tests
+
       - name: Run vllm-project/vllm test for V0 Engine
         env:
           VLLM_USE_V1: 0
           PYTORCH_NPU_ALLOC_CONF: max_split_size_mb:256
           HF_ENDPOINT: https://hf-mirror.com
         run: |
           pytest -sv
-
-      - name: Checkout to vllm 0.8.3
+
+  test-multinpu:
+    name: vLLM Ascend test (multi-npu)
+    runs-on: linux-arm64-npu-4
+    container:
+      image: ascendai/cann:8.0.0-910b-ubuntu22.04-py3.10
+    env:
+      HF_ENDPOINT: https://hf-mirror.com
+      HF_TOKEN: ${{ secrets.HF_TOKEN }}
+    steps:
+      - name: Check npu and CANN info
+        run: |
+          npu-smi info
+          cat /usr/local/Ascend/ascend-toolkit/latest/"$(uname -i)"-linux/ascend_toolkit_install.info
+
+      - name: Config mirrors
+        run: |
+          # sed -i 's|ports.ubuntu.com|mirrors.tuna.tsinghua.edu.cn|g' /etc/apt/sources.list
+          pip config set global.index-url https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple
+
+      - name: Install system dependencies
+        run: |
+          apt-get update -y
+          apt-get -y install git wget
+
+      - name: Config git
+        run: |
+          git config --global url."https://gh-proxy.test.osinfra.cn/https://github.com/".insteadOf https://github.com/
+
+      - name: Checkout vllm-project/vllm-ascend repo
+        uses: actions/checkout@v4
+
+      - name: Install dependencies
+        run: |
+          pip install -r requirements-dev.txt
+
+      - name: Checkout vllm-project/vllm repo
+        uses: actions/checkout@v4
+        with:
+          repository: vllm-project/vllm
+          path: ./vllm-empty
+
+      - name: Install vllm-project/vllm from source
+        working-directory: ./vllm-empty
+        run: |
+          VLLM_TARGET_DEVICE=empty pip install -e .
+
+      - name: Install vllm-project/vllm-ascend
+        run: |
+          pip install -r requirements-dev.txt
+          pip install -e .
+
+      - name: Install pta
+        run: |
+          if [ ! -d /root/.cache/pta ]; then
+            mkdir -p /root/.cache/pta
+          fi
+
+          if [ ! -f /root/.cache/pta/torch_npu-2.5.1.dev20250320-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl ]; then
+            cd /root/.cache/pta
+            rm -rf pytorch_v2.5.1_py310*
+            wget https://pytorch-package.obs.cn-north-4.myhuaweicloud.com/pta/Daily/v2.5.1/20250320.3/pytorch_v2.5.1_py310.tar.gz
+            tar -zxvf pytorch_v2.5.1_py310.tar.gz
+          fi
+
+          pip install /root/.cache/pta/torch_npu-2.5.1.dev20250320-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl
+      - name: Run vllm-project/vllm-ascend test on V0 engine
+        env:
+          VLLM_USE_V1: 0
+          HF_ENDPOINT: https://hf-mirror.com
+        run: |
+          VLLM_USE_V1=0 pytest -sv -m 'multinpu' tests
+
+      - name: Run vllm-project/vllm-ascend test for V1 Engine
+        env:
+          VLLM_USE_V1: 1
+          VLLM_WORKER_MULTIPROC_METHOD: spawn
+          HF_ENDPOINT: https://hf-mirror.com
+        run: |
+          pytest -sv -m 'multinpu' tests
+
+  test-singlenpu-v0.8.3:
+    name: vLLM Ascend test (single-npu)
+    runs-on: linux-arm64-npu-1 # actionlint-ignore: runner-label
+    container:
+      image: quay.io/ascend/cann:8.0.0-910b-ubuntu22.04-py3.10
+    steps:
+      - name: Check npu and CANN info
+        run: |
+          npu-smi info
+          cat /usr/local/Ascend/ascend-toolkit/latest/"$(uname -i)"-linux/ascend_toolkit_install.info
+
+      - name: Config mirrors
+        run: |
+          sed -i 's|ports.ubuntu.com|mirrors.tuna.tsinghua.edu.cn|g' /etc/apt/sources.list
+          pip config set global.index-url https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple
+          apt-get update -y
+          apt install git -y
+          git config --global url."https://gh-proxy.test.osinfra.cn/https://github.com/".insteadOf https://github.com/
+
+      - name: Checkout vllm-project/vllm-ascend repo
+        uses: actions/checkout@v4
+
+      - name: Install system dependencies
+        run: |
+          apt-get -y install `cat packages.txt`
+          apt-get -y install gcc g++ cmake libnuma-dev
+
+      - name: Checkout vllm-project/vllm repo
+        uses: actions/checkout@v4
+        with:
+          repository: vllm-project/vllm
+          ref: v0.8.3
+          path: ./vllm-empty
+
+      - name: Install vllm-project/vllm from source
         working-directory: ./vllm-empty
         run: |
-          git checkout v0.8.3
           VLLM_TARGET_DEVICE=empty pip install -e .

-      - name: Run vllm-project/vllm-ascend test with vllm v0.8.3
+      - name: Install vllm-project/vllm-ascend
+        run: |
+          pip install -r requirements-dev.txt
+          pip install -e .
+
+      - name: Install pta
+        run: |
+          if [ ! -d /root/.cache/pta ]; then
+            mkdir -p /root/.cache/pta
+          fi
+
+          if [ ! -f /root/.cache/pta/torch_npu-2.5.1.dev20250320-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl ]; then
+            cd /root/.cache/pta
+            rm -rf pytorch_v2.5.1_py310*
+            wget https://pytorch-package.obs.cn-north-4.myhuaweicloud.com/pta/Daily/v2.5.1/20250320.3/pytorch_v2.5.1_py310.tar.gz
+            tar -zxvf pytorch_v2.5.1_py310.tar.gz
+          fi
+
+          pip install /root/.cache/pta/torch_npu-2.5.1.dev20250320-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl
+
+      - name: Run vllm-project/vllm-ascend test on V0 engine
         env:
           VLLM_USE_V1: 0
           HF_ENDPOINT: https://hf-mirror.com
         run: |
           VLLM_USE_V1=0 pytest -sv -m 'not multinpu' tests
-
-      - name: Run vllm-project/vllm test for V0 Engine with vllm v0.8.3
+
+      - name: Run vllm-project/vllm-ascend test for V1 Engine
+        env:
+          VLLM_USE_V1: 1
+          VLLM_WORKER_MULTIPROC_METHOD: spawn
+          HF_ENDPOINT: https://hf-mirror.com
+        run: |
+          pytest -sv -m 'not multinpu' tests
+
+      - name: Run vllm-project/vllm test for V0 Engine
         env:
           VLLM_USE_V1: 0
           PYTORCH_NPU_ALLOC_CONF: max_split_size_mb:256
           HF_ENDPOINT: https://hf-mirror.com
         run: |
           pytest -sv
-
-  test-multinpu:
+
+  test-multinpu-v0.8.3:
     name: vLLM Ascend test (multi-npu)
-    runs-on: linux-arm64-npu-4
+    runs-on: linux-arm64-npu-4
+    needs: test-multinpu
     container:
       image: ascendai/cann:8.0.0-910b-ubuntu22.04-py3.10
     env:
@@ -175,7 +324,7 @@ jobs:
         uses: actions/checkout@v4
         with:
           repository: vllm-project/vllm
-          fetch-depth: 0
+          ref: v0.8.3
           path: ./vllm-empty

       - name: Install vllm-project/vllm from source
@@ -202,22 +351,17 @@ jobs:
           fi

           pip install /root/.cache/pta/torch_npu-2.5.1.dev20250320-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl
-      - name: Run vllm-project/vllm-ascend test
+      - name: Run vllm-project/vllm-ascend test on V0 engine
         env:
           VLLM_USE_V1: 0
-          PYTORCH_NPU_ALLOC_CONF: max_split_size_mb:256
+          HF_ENDPOINT: https://hf-mirror.com
         run: |
-          pytest -sv -m multinpu tests/
+          VLLM_USE_V1=0 pytest -sv -m 'multinpu' tests

-      - name: Checkout to vllm 0.8.3
-        working-directory: ./vllm-empty
-        run: |
-          git checkout v0.8.3
-          VLLM_TARGET_DEVICE=empty pip install -e .
-
-      - name: Run vllm-project/vllm-ascend test with vllm v0.8.3
+      - name: Run vllm-project/vllm-ascend test for V1 Engine
         env:
-          VLLM_USE_V1: 0
-          PYTORCH_NPU_ALLOC_CONF: max_split_size_mb:256
+          VLLM_USE_V1: 1
+          VLLM_WORKER_MULTIPROC_METHOD: spawn
+          HF_ENDPOINT: https://hf-mirror.com
         run: |
-          pytest -sv -m multinpu tests/
+          pytest -sv -m 'multinpu' tests

tests/test_offline_inference.py

Lines changed: 29 additions & 43 deletions
@@ -35,58 +35,44 @@


 @pytest.mark.parametrize("model", MODELS)
-@pytest.mark.parametrize("use_v1", ["1", "0"])
 @pytest.mark.parametrize("dtype", ["half", "float16"])
 @pytest.mark.parametrize("max_tokens", [5])
-def test_models(model: str, use_v1: str, dtype: str, max_tokens: int,
-                monkeypatch: pytest.MonkeyPatch) -> None:
-    with monkeypatch.context() as m:
-        m.setenv("VLLM_USE_V1", use_v1)
-        if use_v1 == '1':
-            m.setenv("VLLM_WORKER_MULTIPROC_METHOD", "spawn")
-        # 5042 tokens for gemma2
-        # gemma2 has alternating sliding window size of 4096
-        # we need a prompt with more than 4096 tokens to test the sliding window
-        prompt = "The following numbers of the sequence " + ", ".join(
-            str(i) for i in range(1024)) + " are:"
-        example_prompts = [prompt]
+def test_models(model: str, dtype: str, max_tokens: int) -> None:
+    # 5042 tokens for gemma2
+    # gemma2 has alternating sliding window size of 4096
+    # we need a prompt with more than 4096 tokens to test the sliding window
+    prompt = "The following numbers of the sequence " + ", ".join(
+        str(i) for i in range(1024)) + " are:"
+    example_prompts = [prompt]

-        with VllmRunner(model,
-                        max_model_len=8192,
-                        dtype=dtype,
-                        enforce_eager=False,
-                        gpu_memory_utilization=0.7) as vllm_model:
-            vllm_model.generate_greedy(example_prompts, max_tokens)
+    with VllmRunner(model,
+                    max_model_len=8192,
+                    dtype=dtype,
+                    enforce_eager=False,
+                    gpu_memory_utilization=0.7) as vllm_model:
+        vllm_model.generate_greedy(example_prompts, max_tokens)


-# Now our pvc reading speed is too slow
-# For faster testing, temporarily uncheck the support for testing large weight models on v1
 @pytest.mark.multinpu
-@pytest.mark.parametrize("use_v1", ["0"])
 @pytest.mark.parametrize("model, distributed_executor_backend", [
     ("Qwen/QwQ-32B", "mp"),
 ])
-def test_models_distributed(vllm_runner, use_v1: str, model: str,
-                            distributed_executor_backend: str,
-                            monkeypatch: pytest.MonkeyPatch) -> None:
-    with monkeypatch.context() as m:
-        m.setenv("VLLM_USE_V1", use_v1)
-        if use_v1 == '1':
-            m.setenv("VLLM_WORKER_MULTIPROC_METHOD", "spawn")
-        example_prompts = [
-            "vLLM is a high-throughput and memory-efficient inference and serving engine for LLMs.",
-            "Briefly describe the major milestones in the development of artificial intelligence from 1950 to 2020.",
-            "Compare and contrast artificial intelligence with human intelligence in terms of processing information.",
-        ]
-        dtype = "half"
-        max_tokens = 5
-        with vllm_runner(
-                model,
-                dtype=dtype,
-                tensor_parallel_size=4,
-                distributed_executor_backend=distributed_executor_backend,
-        ) as vllm_model:
-            vllm_model.generate_greedy(example_prompts, max_tokens)
+def test_models_distributed(vllm_runner, model: str,
+                            distributed_executor_backend: str) -> None:
+    example_prompts = [
+        "vLLM is a high-throughput and memory-efficient inference and serving engine for LLMs.",
+        "Briefly describe the major milestones in the development of artificial intelligence from 1950 to 2020.",
+        "Compare and contrast artificial intelligence with human intelligence in terms of processing information.",
+    ]
+    dtype = "half"
+    max_tokens = 5
+    with vllm_runner(
+            model,
+            dtype=dtype,
+            tensor_parallel_size=4,
+            distributed_executor_backend=distributed_executor_backend,
+    ) as vllm_model:
+        vllm_model.generate_greedy(example_prompts, max_tokens)


 if __name__ == "__main__":
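
Note on the `-m 'multinpu'` / `-m 'not multinpu'` selection used by the workflow steps above: pytest only recognizes the `multinpu` marker if it has been registered, which this repository presumably does in its pytest.ini (the workflow's `paths` filter watches that file). A minimal, purely illustrative conftest.py sketch of an equivalent registration, not taken from this commit:

    # conftest.py -- illustrative sketch only; the repository is assumed to
    # register the marker in pytest.ini rather than here.
    def pytest_configure(config):
        # Register the custom marker so that `pytest -m 'multinpu'` and
        # `pytest -m 'not multinpu'` select tests without unknown-marker warnings.
        config.addinivalue_line(
            "markers", "multinpu: tests that require more than one Ascend NPU")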
