Commit a89579a

Merge pull request #2 from vllm-project/main
Upstream sync from vllm-project/vllm
2 parents: 7414eb0 + cd4cfee

380 files changed: 19,206 additions, 6,530 deletions


.buildkite/nightly-benchmarks/nightly-annotation.md

Lines changed: 1 addition & 1 deletion

@@ -16,7 +16,7 @@ Please download the visualization scripts in the post
 - Download `nightly-benchmarks.zip`.
 - In the same folder, run the following code:

-```console
+```bash
 export HF_TOKEN=<your HF token>
 apt update
 apt install -y git

.buildkite/release-pipeline.yaml

Lines changed: 2 additions & 0 deletions

@@ -102,6 +102,7 @@ steps:
   commands:
     - "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
     - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg GIT_REPO_CHECK=1 --tag public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:$(buildkite-agent meta-data get release-version) --tag public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:latest --progress plain --target vllm-openai -f docker/Dockerfile.cpu ."
+    - "docker push public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:latest"
     - "docker push public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:$(buildkite-agent meta-data get release-version)"
   env:
     DOCKER_BUILDKIT: "1"

@@ -117,6 +118,7 @@ steps:
   commands:
     - "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
     - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg GIT_REPO_CHECK=1 --tag public.ecr.aws/q9t5s3a7/vllm-neuron-release-repo:$(buildkite-agent meta-data get release-version) --tag public.ecr.aws/q9t5s3a7/vllm-neuron-release-repo:latest --progress plain -f docker/Dockerfile.neuron ."
+    - "docker push public.ecr.aws/q9t5s3a7/vllm-neuron-release-repo:latest"
     - "docker push public.ecr.aws/q9t5s3a7/vllm-neuron-release-repo:$(buildkite-agent meta-data get release-version)"
   env:
     DOCKER_BUILDKIT: "1"
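
The pattern these two hunks complete: the image is built once with both a versioned tag and a moving `:latest` tag, and each tag is then pushed explicitly. A minimal standalone sketch of that pattern; the repository name is hypothetical, and in the real pipeline the version comes from `buildkite-agent meta-data get release-version`:

```bash
#!/usr/bin/env bash
# Sketch of the build-once, push-each-tag pattern above (assumed names).
set -euo pipefail

REPO=public.ecr.aws/example/vllm-release-repo   # hypothetical repository
VERSION=${VERSION:-0.0.1}                       # stubbed; the pipeline reads buildkite metadata

docker build --tag "$REPO:$VERSION" --tag "$REPO:latest" .
docker push "$REPO:latest"      # the newly added step: publish the moving tag
docker push "$REPO:$VERSION"    # publish the immutable release tag
```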

.buildkite/scripts/hardware_ci/run-neuron-test.sh

Lines changed: 2 additions & 1 deletion

@@ -54,10 +54,11 @@ docker run --rm -it --device=/dev/neuron0 --network bridge \
     --name "${container_name}" \
     ${image_name} \
     /bin/bash -c "
+        set -e; # Exit on first error
         python3 /workspace/vllm/examples/offline_inference/neuron.py;
         python3 -m pytest /workspace/vllm/tests/neuron/1_core/ -v --capture=tee-sys;
         for f in /workspace/vllm/tests/neuron/2_core/*.py; do
-            echo 'Running test file: '$f;
+            echo \"Running test file: \$f\";
             python3 -m pytest \$f -v --capture=tee-sys;
         done
     "

.buildkite/scripts/hardware_ci/run-tpu-v1-test.sh

Lines changed: 2 additions & 0 deletions

@@ -159,6 +159,8 @@ run_and_track_test 14 "test_tpu_qkv_linear.py" \
   "python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_tpu_qkv_linear.py"
 run_and_track_test 15 "test_spmd_model_weight_loading.py" \
   "python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_spmd_model_weight_loading.py"
+run_and_track_test 16 "test_kv_cache_update_kernel.py" \
+  "python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_kv_cache_update_kernel.py"

 # After all tests have been attempted, exit with the overall status.
 if [ "$overall_script_exit_code" -ne 0 ]; then

.buildkite/scripts/hardware_ci/run-xpu-test.sh

Lines changed: 1 addition & 0 deletions

@@ -28,4 +28,5 @@ docker run \
   sh -c '
     VLLM_USE_V1=0 python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m
     VLLM_USE_V1=0 python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m -tp 2
+    VLLM_USE_V1=1 python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager
   '

.buildkite/scripts/tpu/config_v6e_1.env

Lines changed: 2 additions & 2 deletions

@@ -4,8 +4,8 @@ CONTAINER_NAME=vllm-tpu

 # vllm config
 MODEL=meta-llama/Llama-3.1-8B-Instruct
-MAX_NUM_SEQS=512
-MAX_NUM_BATCHED_TOKENS=512
+MAX_NUM_SEQS=256
+MAX_NUM_BATCHED_TOKENS=1024
 TENSOR_PARALLEL_SIZE=1
 MAX_MODEL_LEN=2048
 DOWNLOAD_DIR=/mnt/disks/persist
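
Lowering MAX_NUM_SEQS while raising MAX_NUM_BATCHED_TOKENS trades concurrent sequences for a larger per-step token budget on the v6e-1 benchmark. How the harness consumes this file is not shown in the hunk; a plausible sketch is to source it and forward each variable to the matching vLLM server flag (the flags are standard vLLM options, the wiring is an assumption):

```bash
#!/usr/bin/env bash
# Plausible consumer of config_v6e_1.env (an assumption, not the harness code):
# source the file, then map each variable onto the matching vLLM flag.
set -euo pipefail

source .buildkite/scripts/tpu/config_v6e_1.env

vllm serve "$MODEL" \
    --max-num-seqs "$MAX_NUM_SEQS" \
    --max-num-batched-tokens "$MAX_NUM_BATCHED_TOKENS" \
    --tensor-parallel-size "$TENSOR_PARALLEL_SIZE" \
    --max-model-len "$MAX_MODEL_LEN" \
    --download-dir "$DOWNLOAD_DIR"
```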

.buildkite/scripts/tpu/docker_run_bm.sh

Lines changed: 1 addition & 1 deletion

@@ -68,7 +68,7 @@ docker run \

 echo "run script..."
 echo
-docker exec "$CONTAINER_NAME" /bin/bash -c ".buildkite/scripts/hardware_ci/run_bm.sh"
+docker exec "$CONTAINER_NAME" /bin/bash -c ".buildkite/scripts/tpu/run_bm.sh"

 echo "copy result back..."
 VLLM_LOG="$LOG_ROOT/$TEST_NAME"_vllm_log.txt

.buildkite/test-pipeline.yaml

Lines changed: 43 additions & 2 deletions

@@ -41,6 +41,16 @@ steps:
   # TODO: add `--strict` once warnings in docstrings are fixed
   - mkdocs build

+- label: Pytorch Nightly Dependency Override Check # 2min
+  # if this test fails, it means the nightly torch version is not compatible with some
+  # of the dependencies. Please check the error message and add the package to the whitelist
+  # in /vllm/tools/generate_nightly_torch_test.py
+  soft_fail: true
+  source_file_dependencies:
+  - requirements/nightly_torch_test.txt
+  commands:
+  - bash standalone_tests/pytorch_nightly_dependency.sh
+
 - label: Async Engine, Inputs, Utils, Worker Test # 24min
   mirror_hardwares: [amdexperimental]
   source_file_dependencies:

@@ -89,7 +99,7 @@ steps:
   - VLLM_TEST_ENABLE_ARTIFICIAL_PREEMPT=1 pytest -v -s basic_correctness/test_preemption.py

 - label: Chunked Prefill Test
-  mirror_hardwares: [amdexperimental]
+  mirror_hardwares: [amdexperimental, amdproduction]
   source_file_dependencies:
   - vllm/
   - tests/basic_correctness/test_chunked_prefill

@@ -168,6 +178,23 @@ steps:
   - VLLM_ALLOW_INSECURE_SERIALIZATION=1 RAY_DEDUP_LOGS=0 python3 rlhf_colocate.py
   - popd

+- label: EPLB Algorithm Test
+  working_dir: "/vllm-workspace/tests"
+  source_file_dependencies:
+  - vllm/distributed/eplb
+  - tests/distributed/test_eplb_algo.py
+  commands:
+  - pytest -v -s distributed/test_eplb_algo.py
+
+- label: EPLB Execution Test # 5min
+  working_dir: "/vllm-workspace/tests"
+  num_gpus: 4
+  source_file_dependencies:
+  - vllm/distributed/eplb
+  - tests/distributed/test_eplb_execute.py
+  commands:
+  - pytest -v -s distributed/test_eplb_execute.py
+
 - label: Metrics, Tracing Test # 10min
   mirror_hardwares: [amdexperimental, amdproduction]
   num_gpus: 2

@@ -271,6 +298,15 @@ steps:
   commands:
   - pytest -v -s prefix_caching

+
+- label: Platform Tests (CUDA)
+  mirror_hardwares: [amdexperimental]
+  source_file_dependencies:
+  - vllm/
+  - tests/cuda
+  commands:
+  - pytest -v -s cuda/test_cuda_context.py
+
 - label: Samplers Test # 36min
   mirror_hardwares: [amdexperimental]
   source_file_dependencies:

@@ -606,13 +642,18 @@ steps:
   - vllm/executor/
   - vllm/model_executor/models/
   - tests/distributed/
+  - tests/examples/offline_inference/data_parallel.py
   commands:
   - # the following commands are for the first node, with ip 192.168.10.10 (ray environment already set up)
   - VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py | grep 'Same node test passed'
+  - NUM_NODES=2 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_node_count.py | grep 'Node count test passed'
+  - python3 ../examples/offline_inference/data_parallel.py --dp-size=2 --tp-size=1 --node-size=2 --node-rank=0 --master-addr=192.168.10.10 --master-port=12345 --enforce-eager --trust-remote-code
   - VLLM_MULTI_NODE=1 pytest -v -s distributed/test_multi_node_assignment.py
   - VLLM_MULTI_NODE=1 pytest -v -s distributed/test_pipeline_parallel.py
   - # the following commands are for the second node, with ip 192.168.10.11 (ray environment already set up)
   - VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py | grep 'Same node test passed'
+  - NUM_NODES=2 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_node_count.py | grep 'Node count test passed'
+  - python3 ../examples/offline_inference/data_parallel.py --dp-size=2 --tp-size=1 --node-size=2 --node-rank=1 --master-addr=192.168.10.10 --master-port=12345 --enforce-eager --trust-remote-code

 - label: Distributed Tests (2 GPUs) # 40min
   mirror_hardwares: [amdexperimental]

@@ -736,7 +777,7 @@ steps:
   - bash weight_loading/run_model_weight_loading_test.sh -c weight_loading/models.txt

 - label: Weight Loading Multiple GPU Test - Large Models # optional
-  mirror_hardwares: [amdexperimental]
+  mirror_hardwares: [amdexperimental]
   working_dir: "/vllm-workspace/tests"
   num_gpus: 2
   gpu: a100
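
The new multi-node commands follow one pattern: both nodes run identical launch lines against the head node's rendezvous endpoint, differing only in `--node-rank`. Condensed from the commands above; the NODE_RANK variable is illustrative, as the pipeline hard-codes 0 and 1 per node:

```bash
#!/usr/bin/env bash
# Two-node launch pattern condensed from the pipeline commands above.
# Set NODE_RANK=0 on the head node (192.168.10.10) and NODE_RANK=1 on the other.
MASTER_ADDR=192.168.10.10
MASTER_PORT=12345
NODE_RANK=${NODE_RANK:-0}

# torchrun rendezvous: every node joins the same c10d endpoint.
NUM_NODES=2 torchrun --nnodes 2 --nproc-per-node=2 \
    --rdzv_backend=c10d --rdzv_endpoint="$MASTER_ADDR" \
    distributed/test_node_count.py

# Data-parallel example: flags are identical on both nodes except --node-rank.
python3 ../examples/offline_inference/data_parallel.py \
    --dp-size=2 --tp-size=1 --node-size=2 --node-rank="$NODE_RANK" \
    --master-addr="$MASTER_ADDR" --master-port="$MASTER_PORT" \
    --enforce-eager --trust-remote-code
```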

.github/CODEOWNERS

Lines changed: 4 additions & 0 deletions

@@ -18,6 +18,10 @@
 /vllm/entrypoints @aarnphm
 CMakeLists.txt @tlrmchlsmth

+# Any change to the VllmConfig can have a large user-facing impact,
+# so spam a lot of people
+/vllm/config.py @simon-mo @WoosukKwon @youkaichao @robertgshaw2-redhat @mgoin @tlrmchlsmth @houseroad @hmellor
+
 # vLLM V1
 /vllm/v1 @WoosukKwon @robertgshaw2-redhat @njhill @ywang96 @comaniac @alexm-redhat
 /vllm/v1/structured_output @mgoin @russellb @aarnphm

.github/mergify.yml

Lines changed: 14 additions & 1 deletion

@@ -45,6 +45,7 @@ pull_request_rules:
   - files~=^vllm/entrypoints/openai/tool_parsers/llama.*\.py
   - files~=^vllm/model_executor/models/.*llama.*\.py
   - files~=^vllm/transformers_utils/configs/.*llama.*\.py
+  - title~=(?i)llama
   actions:
     label:
       add:

@@ -65,6 +66,19 @@ pull_request_rules:
       add:
       - multi-modality

+- name: label-performance
+  description: Automatically apply performance label
+  conditions:
+  - or:
+    - files~=^benchmarks/
+    - files~=^vllm/benchmarks/
+    - files~=^tests/benchmarks/
+    - files~=^\.buildkite/nightly-benchmarks/
+  actions:
+    label:
+      add:
+      - performance
+
 - name: label-qwen
   description: Automatically apply qwen label
   conditions:

@@ -74,7 +88,6 @@ pull_request_rules:
   - files~=^vllm/model_executor/models/.*qwen.*\.py
   - files~=^vllm/reasoning/.*qwen.*\.py
   - title~=(?i)Qwen
-  - body~=(?i)Qwen
   actions:
     label:
       add:
