
Commit be404be

Merge pull request #497 from ROCm/upstream_merge_2025_03_31
Upstream merge 2025 03 31
2 parents 25070a1 + f264100, commit be404be

File tree

337 files changed (+15233, -5486 lines)


.buildkite/release-pipeline.yaml

Lines changed: 1 addition & 1 deletion

@@ -82,7 +82,7 @@ steps:
       queue: cpu_queue_postmerge
     commands:
       - "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
-      - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg GIT_REPO_CHECK=1 --tag public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:$(buildkite-agent meta-data get release-version) --tag public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:latest --progress plain -f Dockerfile.cpu ."
+      - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg GIT_REPO_CHECK=1 --tag public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:$(buildkite-agent meta-data get release-version) --tag public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:latest --progress plain --target vllm-openai -f Dockerfile.cpu ."
       - "docker push public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:$(buildkite-agent meta-data get release-version)"
     env:
       DOCKER_BUILDKIT: "1"
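The functional change above is the new --target vllm-openai flag. With a multi-stage Dockerfile.cpu, an untargeted build produces whichever stage happens to be defined last, so pinning the stage keeps the published image stable even if stages are added later. A minimal sketch of the same pattern (the vllm-cpu:local tag is hypothetical):

    # Build only the vllm-openai stage of the multi-stage Dockerfile.cpu;
    # without --target, BuildKit would build the last stage in the file.
    DOCKER_BUILDKIT=1 docker build --target vllm-openai -f Dockerfile.cpu -t vllm-cpu:local .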

.buildkite/run-amd-test.sh

Lines changed: 4 additions & 3 deletions

@@ -134,9 +134,10 @@ if [[ $commands == *"--shard-id="* ]]; then
   # assign shard-id for each shard
   commands_gpu=${commands//"--shard-id= "/"--shard-id=${GPU} "}
   echo "Shard ${GPU} commands:$commands_gpu"
+  echo "Render devices: $BUILDKITE_AGENT_META_DATA_RENDER_DEVICES"
   docker run \
-    --device /dev/kfd --device /dev/dri \
-    --network host \
+    --device /dev/kfd $BUILDKITE_AGENT_META_DATA_RENDER_DEVICES \
+    --network=host \
     --shm-size=16gb \
     --rm \
     -e HIP_VISIBLE_DEVICES="${GPU}" \
@@ -166,7 +167,7 @@ else
   echo "Render devices: $BUILDKITE_AGENT_META_DATA_RENDER_DEVICES"
   docker run \
     --device /dev/kfd $BUILDKITE_AGENT_META_DATA_RENDER_DEVICES \
-    --network host \
+    --network=host \
     --shm-size=16gb \
     --rm \
     -e HIP_VISIBLE_DEVICES=0 \
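The sharded branch now matches the non-sharded branch further down: instead of exposing all of /dev/dri with a blanket --device flag, the container receives only the render nodes the Buildkite agent advertises in its metadata. A hedged sketch of how that variable is expected to expand (the device paths are hypothetical):

    # Hypothetical value on a node exposing two render devices; the variable is
    # deliberately left unquoted in the script so it word-splits into separate
    # --device flags.
    BUILDKITE_AGENT_META_DATA_RENDER_DEVICES="--device /dev/dri/renderD128 --device /dev/dri/renderD129"
    docker run --rm --device /dev/kfd $BUILDKITE_AGENT_META_DATA_RENDER_DEVICES ubuntu:22.04 ls /dev/dri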

.buildkite/run-cpu-test.sh

Lines changed: 11 additions & 7 deletions

@@ -8,15 +8,19 @@ set -ex
 CORE_RANGE=${CORE_RANGE:-48-95}
 NUMA_NODE=${NUMA_NODE:-1}
 
-# Try building the docker image
-numactl -C "$CORE_RANGE" -N "$NUMA_NODE" docker build -t cpu-test-"$BUILDKITE_BUILD_NUMBER" -f Dockerfile.cpu .
-numactl -C "$CORE_RANGE" -N "$NUMA_NODE" docker build --build-arg VLLM_CPU_DISABLE_AVX512="true" -t cpu-test-"$BUILDKITE_BUILD_NUMBER"-avx2 -f Dockerfile.cpu .
-
 # Setup cleanup
-remove_docker_container() { set -e; docker rm -f cpu-test-"$BUILDKITE_BUILD_NUMBER"-"$NUMA_NODE" cpu-test-"$BUILDKITE_BUILD_NUMBER"-avx2-"$NUMA_NODE" || true; }
+remove_docker_container() {
+  set -e;
+  docker rm -f cpu-test-"$BUILDKITE_BUILD_NUMBER"-"$NUMA_NODE" cpu-test-"$BUILDKITE_BUILD_NUMBER"-avx2-"$NUMA_NODE" || true;
+  docker image rm cpu-test-"$BUILDKITE_BUILD_NUMBER" cpu-test-"$BUILDKITE_BUILD_NUMBER"-avx2 || true;
+}
 trap remove_docker_container EXIT
 remove_docker_container
 
+# Try building the docker image
+numactl -C "$CORE_RANGE" -N "$NUMA_NODE" docker build --tag cpu-test-"$BUILDKITE_BUILD_NUMBER" --target vllm-test -f Dockerfile.cpu .
+numactl -C "$CORE_RANGE" -N "$NUMA_NODE" docker build --build-arg VLLM_CPU_DISABLE_AVX512="true" --tag cpu-test-"$BUILDKITE_BUILD_NUMBER"-avx2 --target vllm-test -f Dockerfile.cpu .
+
 # Run the image, setting --shm-size=4g for tensor parallel.
 docker run -itd --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --cpuset-cpus="$CORE_RANGE" \
   --cpuset-mems="$NUMA_NODE" --privileged=true -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --shm-size=4g --name cpu-test-"$BUILDKITE_BUILD_NUMBER"-"$NUMA_NODE" cpu-test-"$BUILDKITE_BUILD_NUMBER"
@@ -36,8 +40,8 @@ function cpu_tests() {
   # Run basic model test
   docker exec cpu-test-"$BUILDKITE_BUILD_NUMBER"-"$NUMA_NODE" bash -c "
     set -e
-    pip install -r vllm/requirements/test.txt
-    pip install -r vllm/requirements/cpu.txt
+    pytest -v -s tests/kernels/test_cache.py -m cpu_model
+    pytest -v -s tests/kernels/test_mla_decode_cpu.py -m cpu_model
     pytest -v -s tests/models/decoder_only/language -m cpu_model
     pytest -v -s tests/models/embedding/language -m cpu_model
     pytest -v -s tests/models/encoder_decoder/language -m cpu_model
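Two changes of note here: the cleanup handler now removes the built images as well as the containers, and it is registered via trap before the build starts, so a failed docker build no longer leaves stale images behind for the next run. A minimal sketch of the trap-on-EXIT pattern, with hypothetical resource names:

    #!/bin/bash
    set -ex

    cleanup() {
      # Runs on any script exit, success or failure; "|| true" keeps the
      # handler from aborting when a resource was never created.
      docker rm -f demo-container || true
      docker image rm demo-image || true
    }
    trap cleanup EXIT
    cleanup   # also clear leftovers from an earlier, interrupted run

    docker build --tag demo-image .
    docker run -d --name demo-container demo-image sleep 60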

.buildkite/run-tpu-v1-test.sh

Lines changed: 6 additions & 3 deletions

@@ -22,17 +22,20 @@ docker run --privileged --net host --shm-size=16G -it \
     && export VLLM_USE_V1=1 \
     && export VLLM_XLA_CHECK_RECOMPILATION=1 \
     && echo TEST_1 \
-    && python3 /workspace/vllm/tests/tpu/test_compilation.py \
+    && pytest -v -s /workspace/vllm/tests/tpu/test_compilation.py \
     && echo TEST_2 \
     && pytest -v -s /workspace/vllm/tests/v1/tpu/test_basic.py \
     && echo TEST_3 \
     && pytest -v -s /workspace/vllm/tests/entrypoints/llm/test_accuracy.py::test_lm_eval_accuracy_v1_engine \
     && echo TEST_4 \
     && pytest -s -v /workspace/vllm/tests/tpu/test_quantization_accuracy.py \
     && echo TEST_5 \
-    && python3 /workspace/vllm/examples/offline_inference/tpu.py" \
+    && python3 /workspace/vllm/examples/offline_inference/tpu.py \
+    && echo TEST_6 \
+    && pytest -s -v /workspace/vllm/tests/v1/tpu/worker/test_tpu_model_runner.py \
+    && echo TEST_7 \
+    && pytest -s -v /workspace/vllm/tests/v1/tpu/test_sampler.py" \
 
 
 # TODO: This test fails because it uses RANDOM_SEED sampling
 # && VLLM_USE_V1=1 pytest -v -s /workspace/vllm/tests/tpu/test_custom_dispatcher.py \
-
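TEST_1 switches from running the file with python3 to running it under pytest. As a hedged illustration of the difference:

    # Invoking a test module directly executes only its top-level code (and a
    # __main__ block, if it has one); pytest collects every test_* function
    # and fails the job on any assertion error.
    python3 /workspace/vllm/tests/tpu/test_compilation.py
    pytest -v -s /workspace/vllm/tests/tpu/test_compilation.py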

.buildkite/test-pipeline.yaml

Lines changed: 10 additions & 6 deletions

@@ -138,21 +138,23 @@ steps:
   - examples/offline_inference/rlhf.py
   - examples/offline_inference/rlhf_colocate.py
   - tests/examples/offline_inference/data_parallel.py
+  - tests/v1/test_async_llm_dp.py
   commands:
   # test with tp=2 and external_dp=2
   - VLLM_USE_V1=0 torchrun --nproc-per-node=4 distributed/test_torchrun_example.py
   - torchrun --nproc-per-node=4 distributed/test_torchrun_example.py
   # test with internal dp
   - python3 ../examples/offline_inference/data_parallel.py
+  - TP_SIZE=2 DP_SIZE=2 pytest -v -s v1/test_async_llm_dp.py
   - pytest -v -s distributed/test_utils.py
   - pytest -v -s compile/test_basic_correctness.py
   - pytest -v -s distributed/test_pynccl.py
   - pytest -v -s spec_decode/e2e/test_integration_dist_tp4.py
   # TODO: create a dedicated test section for multi-GPU example tests
   # when we have multiple distributed example tests
   - pushd ../examples/offline_inference
-  - VLLM_ENABLE_V1_MULTIPROCESSING=0 python3 rlhf.py
-  - VLLM_ENABLE_V1_MULTIPROCESSING=0 RAY_DEDUP_LOGS=0 python3 rlhf_colocate.py
+  - python3 rlhf.py
+  - RAY_DEDUP_LOGS=0 python3 rlhf_colocate.py
   - popd
 
 - label: Metrics, Tracing Test # 10min
@@ -295,7 +297,7 @@ steps:
   source_file_dependencies:
   - vllm/lora
   - tests/lora
-  command: pytest -v -s lora --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT --ignore=lora/test_long_context.py --ignore=lora/test_chatglm3_tp.py --ignore=lora/test_llama_tp.py --ignore=lora/test_minicpmv_tp.py --ignore=lora/test_transfomers_model.py
+  command: pytest -v -s lora --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT --ignore=lora/test_chatglm3_tp.py --ignore=lora/test_llama_tp.py --ignore=lora/test_minicpmv_tp.py --ignore=lora/test_transfomers_model.py
   parallelism: 4
 
 - label: PyTorch Fullgraph Smoke Test # 9min
@@ -441,6 +443,7 @@ steps:
   - pytest -v -s models/encoder_decoder/audio_language -m core_model
   - pytest -v -s models/encoder_decoder/language -m core_model
   - pytest -v -s models/encoder_decoder/vision_language -m core_model
+  - pytest -v -s models/decoder_only/vision_language/test_interleaved.py
 
 - label: Multi-Modal Models Test (Extended) 1 # 48m
   optional: true
@@ -526,8 +529,11 @@ steps:
   - vllm/worker/worker.py
   - vllm/worker/model_runner.py
   - entrypoints/llm/test_collective_rpc.py
+  - tests/v1/test_async_llm_dp.py
+  - vllm/v1/engine/
   commands:
-  - VLLM_ENABLE_V1_MULTIPROCESSING=0 pytest -v -s entrypoints/llm/test_collective_rpc.py
+  - TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/test_async_llm_dp.py
+  - pytest -v -s entrypoints/llm/test_collective_rpc.py
   - pytest -v -s ./compile/test_basic_correctness.py
   - pytest -v -s ./compile/test_wrapper.py
   - VLLM_TEST_SAME_HOST=1 torchrun --nproc-per-node=4 distributed/test_same_node.py | grep 'Same node test passed'
@@ -604,8 +610,6 @@ steps:
   # FIXIT: find out which code initialize cuda before running the test
   # before the fix, we need to use spawn to test it
   - export VLLM_WORKER_MULTIPROC_METHOD=spawn
-  # This test runs llama 13B, so it is required to run on 4 GPUs.
-  - pytest -v -s -x lora/test_long_context.py
   # There is some Tensor Parallelism related processing logic in LoRA that
   # requires multi-GPU testing for validation.
   - pytest -v -s -x lora/test_chatglm3_tp.py
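The new tests/v1/test_async_llm_dp.py is wired into two sections with different parallel shapes, selected through environment variables (presumably read inside the test itself):

    # 4-GPU section: 2-way tensor parallelism x 2-way data parallelism
    TP_SIZE=2 DP_SIZE=2 pytest -v -s v1/test_async_llm_dp.py
    # 2-GPU section: data parallelism only
    TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/test_async_llm_dp.py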

.github/mergify.yml

Lines changed: 30 additions & 0 deletions

@@ -88,6 +88,36 @@ pull_request_rules:
       add:
         - v1
 
+- name: label-tpu
+  description: Automatically apply tpu label
+  # Keep this list in sync with `label-tpu-remove` conditions
+  conditions:
+    - or:
+      - files~=tpu.py
+      - files~=_tpu
+      - files~=tpu_
+      - files~=/tpu/
+      - files~=pallas
+  actions:
+    label:
+      add:
+        - tpu
+
+- name: label-tpu-remove
+  description: Automatically remove tpu label
+  # Keep this list in sync with `label-tpu` conditions
+  conditions:
+    - and:
+      - -files~=tpu.py
+      - -files~=_tpu
+      - -files~=tpu_
+      - -files~=/tpu/
+      - -files~=pallas
+  actions:
+    label:
+      remove:
+        - tpu
+
 - name: ping author on conflicts and add 'needs-rebase' label
   conditions:
     - conflict
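Mergify's files~= conditions are regular-expression matches against the changed paths, so the add rule fires when any file matches and the remove rule only when none do. A rough local approximation using grep -E (the branch range is hypothetical):

    # List changed files that would trigger the tpu label; grep exiting with
    # status 1 (no match) corresponds to the label-tpu-remove case.
    git diff --name-only origin/main...HEAD | grep -E 'tpu.py|_tpu|tpu_|/tpu/|pallas'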

CMakeLists.txt

Lines changed: 29 additions & 1 deletion

@@ -34,7 +34,7 @@ set(PYTHON_SUPPORTED_VERSIONS "3.9" "3.10" "3.11" "3.12")
 set(CUDA_SUPPORTED_ARCHS "7.0;7.2;7.5;8.0;8.6;8.7;8.9;9.0;10.0;10.1;12.0")
 
 # Supported AMD GPU architectures.
-set(HIP_SUPPORTED_ARCHS "gfx906;gfx908;gfx90a;gfx942;gfx1030;gfx1100;gfx1101;gfx1200;gfx1201")
+set(HIP_SUPPORTED_ARCHS "gfx906;gfx908;gfx90a;gfx942;gfx950;gfx1030;gfx1100;gfx1101;gfx1200;gfx1201")
 
 #
 # Supported/expected torch versions for CUDA/ROCm.
@@ -235,6 +235,7 @@ set(VLLM_EXT_SRC
   "csrc/activation_kernels.cu"
   "csrc/layernorm_kernels.cu"
   "csrc/layernorm_quant_kernels.cu"
+  "csrc/cuda_view.cu"
   "csrc/quantization/gptq/q_gemm.cu"
   "csrc/quantization/compressed_tensors/int8_quant_kernels.cu"
   "csrc/quantization/fp8/common.cu"
@@ -462,6 +463,33 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
     set(FP4_ARCHS)
   endif()
 
+  #
+  # CUTLASS MoE kernels
+
+  # The MoE kernel cutlass_moe_mm requires CUDA 12.3 or later (and only works
+  # on Hopper). get_cutlass_moe_mm_data should only be compiled if it's possible
+  # to compile MoE kernels that use its output.
+  cuda_archs_loose_intersection(SCALED_MM_ARCHS "9.0a;" "${CUDA_ARCHS}")
+  if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.3 AND SCALED_MM_ARCHS)
+    set(SRCS "csrc/quantization/cutlass_w8a8/moe/grouped_mm_c3x.cu"
+             "csrc/quantization/cutlass_w8a8/moe/moe_data.cu")
+    set_gencode_flags_for_srcs(
+      SRCS "${SRCS}"
+      CUDA_ARCHS "${SCALED_MM_ARCHS}")
+    list(APPEND VLLM_EXT_SRC "${SRCS}")
+    list(APPEND VLLM_GPU_FLAGS "-DENABLE_CUTLASS_MOE_SM90=1")
+    message(STATUS "Building grouped_mm_c3x for archs: ${SCALED_MM_ARCHS}")
+  else()
+    if (NOT ${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.3 AND SCALED_MM_ARCHS)
+      message(STATUS "Not building grouped_mm_c3x kernels as CUDA Compiler version is "
+                     "not >= 12.3, we recommend upgrading to CUDA 12.3 or later "
+                     "if you intend on running FP8 quantized MoE models on Hopper.")
+    else()
+      message(STATUS "Not building grouped_mm_c3x as no compatible archs found "
+                     "in CUDA target architectures")
+    endif()
+  endif()
+
   #
   # Machete kernels
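The new block builds the grouped_mm_c3x CUTLASS MoE kernels only when the CUDA compiler is at least 12.3 and a Hopper 9.0a arch is among the targets, and reports its decision with a status message either way. A quick sketch for checking which branch a local configure takes (assuming a configure that otherwise succeeds; the build directory name is arbitrary):

    # nvcc must report release 12.3 or newer for the MoE kernels to be built.
    nvcc --version | grep -oE 'release [0-9]+\.[0-9]+'
    # The configure log always mentions grouped_mm_c3x, stating whether it is
    # built and for which archs.
    cmake -S . -B build 2>&1 | grep grouped_mm_c3x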

Dockerfile.base_navi

Lines changed: 0 additions & 143 deletions
This file was deleted.
