
Commit 1095cff

Merge pull request #471 from ROCm/upstream_merge_25_03_10
Upstream merge 25 03 10
2 parents 3ee6551 + ff60bf3 commit 1095cff

377 files changed: +19052, -4390 lines changed


.buildkite/nightly-benchmarks/scripts/run-nightly-benchmarks.sh

Lines changed: 1 addition & 1 deletion
@@ -426,7 +426,7 @@ main() {
 
     pip install -U transformers
 
-    pip install -r requirements-dev.txt
+    pip install -r requirements/dev.txt
     which genai-perf
 
     # check storage
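Note: this is one instance of a repo-wide move of the requirements-*.txt files into a requirements/ directory. A minimal sketch of installing from the relocated files, assuming the new layout (the set of filenames shown is illustrative):

    # Install from the relocated requirements files (new requirements/ layout)
    pip install -r requirements/dev.txt    # was requirements-dev.txt
    pip install -r requirements/test.txt   # was requirements-test.txt
    pip install -r requirements/rocm.txt   # was requirements-rocm.txt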

.buildkite/run-amd-test.sh

Lines changed: 6 additions & 1 deletion
@@ -93,7 +93,12 @@ if [[ $commands == *" kernels "* ]]; then
     --ignore=kernels/test_rand.py \
     --ignore=kernels/test_sampler.py \
     --ignore=kernels/test_cascade_flash_attn.py \
-    --ignore=kernels/test_mamba_mixer2.py"
+    --ignore=kernels/test_mamba_mixer2.py \
+    --ignore=kernels/test_aqlm.py \
+    --ignore=kernels/test_machete_mm.py \
+    --ignore=kernels/test_mha_attn.py \
+    --ignore=kernels/test_block_fp8.py \
+    --ignore=kernels/test_permute_cols.py"
 fi
 
 #ignore certain Entrypoints tests
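Note: the hunk above only extends the --ignore list used for the ROCm kernels run. A minimal sketch of the resulting invocation, assuming the script splices these flags into its pytest command (the exact command assembly lives elsewhere in the script):

    # Run the kernels suite while skipping the tests excluded above
    pytest -v -s kernels \
        --ignore=kernels/test_mamba_mixer2.py \
        --ignore=kernels/test_aqlm.py \
        --ignore=kernels/test_machete_mm.py \
        --ignore=kernels/test_mha_attn.py \
        --ignore=kernels/test_block_fp8.py \
        --ignore=kernels/test_permute_cols.py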

.buildkite/run-cpu-test.sh

Lines changed: 1 addition & 1 deletion
@@ -35,7 +35,7 @@ function cpu_tests() {
   # Run basic model test
   docker exec cpu-test-"$BUILDKITE_BUILD_NUMBER"-"$NUMA_NODE" bash -c "
     set -e
-    pip install -r vllm/requirements-test.txt
+    pip install -r vllm/requirements/test.txt
     pytest -v -s tests/models/decoder_only/language -m cpu_model
     pytest -v -s tests/models/embedding/language -m cpu_model
     pytest -v -s tests/models/encoder_decoder/language -m cpu_model

.buildkite/test-pipeline.yaml

Lines changed: 8 additions & 3 deletions
@@ -35,7 +35,7 @@ steps:
   fast_check: true
   no_gpu: True
   commands:
-  - pip install -r requirements-docs.txt
+  - pip install -r ../../requirements/docs.txt
   - SPHINXOPTS=\"-W\" make html
   # Check API reference (if it fails, you may have missing mock imports)
   - grep \"sig sig-object py\" build/html/api/inference_params.html
@@ -78,6 +78,7 @@ steps:
   - tests/basic_correctness/test_preemption
   - tests/basic_correctness/test_cumem.py
   commands:
+  - export VLLM_WORKER_MULTIPROC_METHOD=spawn
   - pytest -v -s basic_correctness/test_cumem.py
   - pytest -v -s basic_correctness/test_basic_correctness.py
   - pytest -v -s basic_correctness/test_cpu_offload.py
@@ -115,6 +116,7 @@ steps:
   - tests/entrypoints/test_chat_utils
   - tests/entrypoints/offline_mode
   commands:
+  - export VLLM_WORKER_MULTIPROC_METHOD=spawn
   - pytest -v -s entrypoints/llm --ignore=entrypoints/llm/test_lazy_outlines.py --ignore=entrypoints/llm/test_generate.py --ignore=entrypoints/llm/test_generate_multiple_loras.py --ignore=entrypoints/llm/test_guided_generate.py --ignore=entrypoints/llm/test_collective_rpc.py
   - pytest -v -s entrypoints/llm/test_lazy_outlines.py # it needs a clean process
   - pytest -v -s entrypoints/llm/test_generate.py # it needs a clean process
@@ -146,8 +148,10 @@ steps:
   - pytest -v -s spec_decode/e2e/test_integration_dist_tp4.py
   # TODO: create a dedicated test section for multi-GPU example tests
   # when we have multiple distributed example tests
-  - python3 ../examples/offline_inference/rlhf.py
-  - RAY_DEDUP_LOGS=0 python3 ../examples/offline_inference/rlhf_colocate.py
+  - pushd ../examples/offline_inference
+  - python3 rlhf.py
+  - RAY_DEDUP_LOGS=0 python3 rlhf_colocate.py
+  - popd
 
 - label: Metrics, Tracing Test # 10min
   num_gpus: 2
@@ -204,6 +208,7 @@ steps:
   - VLLM_USE_V1=1 pytest -v -s v1/engine
   - VLLM_USE_V1=1 pytest -v -s v1/sample
   - VLLM_USE_V1=1 pytest -v -s v1/worker
+  - VLLM_USE_V1=1 pytest -v -s v1/structured_output
   - VLLM_USE_V1=1 pytest -v -s v1/test_stats.py
   - VLLM_USE_V1=1 pytest -v -s v1/test_utils.py
   # TODO: accuracy does not match, whether setting
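Note: two recurring changes in this file are exporting VLLM_WORKER_MULTIPROC_METHOD=spawn before the affected suites and running the RLHF examples from their own directory. A minimal sketch of reproducing the updated multi-GPU example step locally, assuming a working directory matching the pipeline's tests/ layout:

    # Mirror the updated pipeline step for the distributed RLHF examples
    export VLLM_WORKER_MULTIPROC_METHOD=spawn   # use spawn-based worker processes
    pushd ../examples/offline_inference
    python3 rlhf.py
    RAY_DEDUP_LOGS=0 python3 rlhf_colocate.py
    popd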

.github/mergify.yml

Lines changed: 15 additions & 0 deletions
@@ -36,6 +36,21 @@ pull_request_rules:
       add:
         - frontend
 
+- name: label-multi-modality
+  description: Automatically apply multi-modality label
+  conditions:
+    - or:
+      - files~=^vllm/multimodal/
+      - files~=^tests/multimodal/
+      - files~=^tests/models/multimodal/
+      - files~=^tests/models/*/audio_language/
+      - files~=^tests/models/*/vision_language/
+      - files=tests/models/test_vision.py
+  actions:
+    label:
+      add:
+        - multi-modality
+
 - name: label-structured-output
   description: Automatically apply structured-output label
   conditions:
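Note: the new rule labels pull requests whose changed files match the multimodal paths above. A rough local approximation of the files~= conditions, assuming an upstream/main remote ref and using grep -E in place of mergify's regex matching:

    # Approximate check of which changed files would trigger the new label
    git diff --name-only upstream/main... \
      | grep -E '^(vllm/multimodal/|tests/multimodal/|tests/models/multimodal/)' \
      && echo "would be labeled multi-modality"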

.github/workflows/scripts/build.sh

Lines changed: 1 addition & 1 deletion
@@ -5,7 +5,7 @@ python_executable=python3
 
 # Update paths
 # Install requirements
-$python_executable -m pip install -r requirements-rocm.txt
+$python_executable -m pip install -r requirements/rocm.txt
 
 # Limit the number of parallel jobs to avoid OOM
 export MAX_JOBS=1

.gitignore

Lines changed: 1 addition & 1 deletion
@@ -197,7 +197,7 @@ _build/
 hip_compat.h
 
 # Benchmark dataset
-benchmarks/*.json
+benchmarks/**/*.json
 
 # Linting
 actionlint
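Note: the broadened pattern also ignores JSON files in nested benchmark directories, since **/ matches zero or more directories. A quick check of the behavior (the file paths are hypothetical):

    # Verify the broadened ignore pattern against sample paths
    git check-ignore -v benchmarks/results.json          # matched before and after
    git check-ignore -v benchmarks/nightly/results.json  # matched only by **/*.json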

.pre-commit-config.yaml

Lines changed: 2 additions & 2 deletions
@@ -44,8 +44,8 @@ repos:
     rev: 0.6.2
     hooks:
     - id: pip-compile
-      args: [requirements-test.in, -o, requirements-test.txt]
-      files: ^requirements-test\.(in|txt)$
+      args: [requirements/test.in, -o, requirements/test.txt]
+      files: ^requirements/test\.(in|txt)$
 - repo: local
   hooks:
   - id: mypy-local
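Note: with the hook pointing at the new paths, the compiled test requirements are regenerated from requirements/test.in. A minimal way to run just this hook after editing the .in file, assuming pre-commit is installed locally:

    # Regenerate requirements/test.txt via the pip-compile hook only
    pre-commit run pip-compile --all-files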

.readthedocs.yaml

Lines changed: 1 addition & 1 deletion
@@ -18,4 +18,4 @@ formats: []
 # Optionally declare the Python requirements required to build your docs
 python:
   install:
-    - requirements: docs/requirements-docs.txt
+    - requirements: requirements/docs.txt

CMakeLists.txt

Lines changed: 54 additions & 26 deletions
@@ -31,7 +31,7 @@ set(ignoreMe "${VLLM_PYTHON_PATH}")
 set(PYTHON_SUPPORTED_VERSIONS "3.9" "3.10" "3.11" "3.12")
 
 # Supported NVIDIA architectures.
-set(CUDA_SUPPORTED_ARCHS "7.0;7.2;7.5;8.0;8.6;8.7;8.9;9.0")
+set(CUDA_SUPPORTED_ARCHS "7.0;7.2;7.5;8.0;8.6;8.7;8.9;9.0;10.0;10.1;12.0")
 
 # Supported AMD GPU architectures.
 set(HIP_SUPPORTED_ARCHS "gfx906;gfx908;gfx90a;gfx942;gfx1030;gfx1100;gfx1101;gfx1200;gfx1201")
@@ -312,7 +312,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
   # Only build Marlin kernels if we are building for at least some compatible archs.
   # Keep building Marlin for 9.0 as there are some group sizes and shapes that
   # are not supported by Machete yet.
-  cuda_archs_loose_intersection(MARLIN_ARCHS "8.0;8.6;8.7;8.9;9.0" "${CUDA_ARCHS}")
+  cuda_archs_loose_intersection(MARLIN_ARCHS "8.0;8.6;8.7;8.9;9.0;10.0;10.1;12.0" "${CUDA_ARCHS}")
   if (MARLIN_ARCHS)
     set(MARLIN_SRCS
       "csrc/quantization/fp8/fp8_marlin.cu"
@@ -334,7 +334,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
 
   # Only build AllSpark kernels if we are building for at least some compatible archs.
   cuda_archs_loose_intersection(ALLSPARK_ARCHS "8.0;8.6;8.7;8.9" "${CUDA_ARCHS}")
-  if (ALLSPARK_ARCHS)
+  if (${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.0 AND ALLSPARK_ARCHS)
     set(ALLSPARK_SRCS
       "csrc/quantization/gptq_allspark/allspark_repack.cu"
       "csrc/quantization/gptq_allspark/allspark_qgemm_w8a16.cu")
@@ -345,46 +345,74 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
     message(STATUS "Building AllSpark kernels for archs: ${ALLSPARK_ARCHS}")
   else()
     message(STATUS "Not building AllSpark kernels as no compatible archs found"
-                   " in CUDA target architectures")
+                   " in CUDA target architectures, or CUDA not >= 12.0")
   endif()
 
+
+  set(SCALED_MM_3X_ARCHS)
   # The cutlass_scaled_mm kernels for Hopper (c3x, i.e. CUTLASS 3.x) require
-  # CUDA 12.0 or later (and only work on Hopper, 9.0a for now).
-  cuda_archs_loose_intersection(SCALED_MM_3X_ARCHS "9.0a" "${CUDA_ARCHS}")
-  if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.0 AND SCALED_MM_3X_ARCHS)
+  # CUDA 12.0 or later
+  cuda_archs_loose_intersection(SCALED_MM_ARCHS "9.0a;" "${CUDA_ARCHS}")
+  if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.0 AND SCALED_MM_ARCHS)
     set(SRCS
-      "csrc/quantization/cutlass_w8a8/scaled_mm_c3x.cu"
+      "csrc/quantization/cutlass_w8a8/scaled_mm_c3x_sm90.cu"
       "csrc/quantization/cutlass_w8a8/c3x/scaled_mm_sm90_fp8.cu"
       "csrc/quantization/cutlass_w8a8/c3x/scaled_mm_sm90_int8.cu"
       "csrc/quantization/cutlass_w8a8/c3x/scaled_mm_azp_sm90_int8.cu"
       "csrc/quantization/cutlass_w8a8/c3x/scaled_mm_blockwise_sm90_fp8.cu")
     set_gencode_flags_for_srcs(
       SRCS "${SRCS}"
-      CUDA_ARCHS "${SCALED_MM_3X_ARCHS}")
+      CUDA_ARCHS "${SCALED_MM_ARCHS}")
     list(APPEND VLLM_EXT_SRC "${SRCS}")
-    list(APPEND VLLM_GPU_FLAGS "-DENABLE_SCALED_MM_C3X=1")
-    message(STATUS "Building scaled_mm_c3x for archs: ${SCALED_MM_3X_ARCHS}")
+    list(APPEND VLLM_GPU_FLAGS "-DENABLE_SCALED_MM_SM90=1")
+    # Let scaled_mm_c2x know it doesn't need to build these arches
+    list(APPEND SCALED_MM_3X_ARCHS "${SCALED_MM_ARCHS}")
+    message(STATUS "Building scaled_mm_c3x_sm90 for archs: ${SCALED_MM_ARCHS}")
   else()
-    if (NOT ${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.0 AND SCALED_MM_3X_ARCHS)
-      message(STATUS "Not building scaled_mm_c3x as CUDA Compiler version is "
+    if (NOT ${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.0 AND SCALED_MM_ARCHS)
+      message(STATUS "Not building scaled_mm_c3x_sm90 as CUDA Compiler version is "
                      "not >= 12.0, we recommend upgrading to CUDA 12.0 or "
                      "later if you intend on running FP8 quantized models on "
                      "Hopper.")
     else()
-      message(STATUS "Not building scaled_mm_c3x as no compatible archs found "
+      message(STATUS "Not building scaled_mm_c3x_sm90 as no compatible archs found "
                      "in CUDA target architectures")
     endif()
+  endif()
 
-  # clear SCALED_MM_3X_ARCHS so the scaled_mm_c2x kernels know we didn't
-  # build any 3x kernels
-  set(SCALED_MM_3X_ARCHS)
+  # The cutlass_scaled_mm kernels for Blackwell (c3x, i.e. CUTLASS 3.x) require
+  # CUDA 12.8 or later
+  cuda_archs_loose_intersection(SCALED_MM_ARCHS "10.0a;10.1a;12.0a" "${CUDA_ARCHS}")
+  if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.8 AND SCALED_MM_ARCHS)
+    set(SRCS
+      "csrc/quantization/cutlass_w8a8/scaled_mm_c3x_sm100.cu"
+      "csrc/quantization/cutlass_w8a8/c3x/scaled_mm_sm100_fp8.cu"
+    )
+    set_gencode_flags_for_srcs(
+      SRCS "${SRCS}"
+      CUDA_ARCHS "${SCALED_MM_ARCHS}")
+    list(APPEND VLLM_EXT_SRC "${SRCS}")
+    list(APPEND VLLM_GPU_FLAGS "-DENABLE_SCALED_MM_SM100=1")
+    # Let scaled_mm_c2x know it doesn't need to build these arches
+    list(APPEND SCALED_MM_3X_ARCHS "${SCALED_MM_ARCHS}")
+    message(STATUS "Building scaled_mm_c3x_sm100 for archs: ${SCALED_MM_ARCHS}")
+  else()
+    if (NOT ${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.8 AND SCALED_MM_ARCHS)
+      message(STATUS "Not building scaled_mm_c3x_sm100 as CUDA Compiler version is "
+                     "not >= 12.8, we recommend upgrading to CUDA 12.8 or "
+                     "later if you intend on running FP8 quantized models on "
+                     "Blackwell.")
+    else()
+      message(STATUS "Not building scaled_mm_c3x_100 as no compatible archs found "
+                     "in CUDA target architectures")
+    endif()
   endif()
 
   #
   # For the cutlass_scaled_mm kernels we want to build the c2x (CUTLASS 2.x)
   # kernels for the remaining archs that are not already built for 3x.
   cuda_archs_loose_intersection(SCALED_MM_2X_ARCHS
-    "7.5;8.0;8.6;8.7;8.9;9.0" "${CUDA_ARCHS}")
+    "7.5;8.0;8.6;8.7;8.9;9.0;10.0;10.1;12.0" "${CUDA_ARCHS}")
   # subtract out the archs that are already built for 3x
   list(REMOVE_ITEM SCALED_MM_2X_ARCHS ${SCALED_MM_3X_ARCHS})
   if (SCALED_MM_2X_ARCHS)
@@ -409,17 +437,17 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
   # 2:4 Sparse Kernels
 
   # The 2:4 sparse kernels cutlass_scaled_sparse_mm and cutlass_compressor
-  # require CUDA 12.2 or later (and only work on Hopper, 9.0a for now).
-  if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.2 AND SCALED_MM_3X_ARCHS)
+  # require CUDA 12.2 or later (and only work on Hopper and Blackwell).
+  if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.2 AND SCALED_MM_ARCHS)
     set(SRCS "csrc/sparse/cutlass/sparse_scaled_mm_c3x.cu")
     set_gencode_flags_for_srcs(
       SRCS "${SRCS}"
-      CUDA_ARCHS "${SCALED_MM_3X_ARCHS}")
+      CUDA_ARCHS "${SCALED_MM_ARCHS}")
     list(APPEND VLLM_EXT_SRC "${SRCS}")
     list(APPEND VLLM_GPU_FLAGS "-DENABLE_SPARSE_SCALED_MM_C3X=1")
-    message(STATUS "Building sparse_scaled_mm_c3x for archs: ${SCALED_MM_3X_ARCHS}")
+    message(STATUS "Building sparse_scaled_mm_c3x for archs: ${SCALED_MM_ARCHS}")
   else()
-    if (NOT ${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.2 AND SCALED_MM_3X_ARCHS)
+    if (NOT ${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.2 AND SCALED_MM_ARCHS)
       message(STATUS "Not building sparse_scaled_mm_c3x kernels as CUDA Compiler version is "
                      "not >= 12.2, we recommend upgrading to CUDA 12.2 or later "
                      "if you intend on running FP8 sparse quantized models on Hopper.")
@@ -434,8 +462,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
   if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.8 AND FP4_ARCHS)
     set(SRCS
       "csrc/quantization/fp4/nvfp4_quant_kernels.cu"
-      "csrc/quantization/fp4/nvfp4_scaled_mm_kernels.cu"
-    )
+      "csrc/quantization/fp4/nvfp4_scaled_mm_kernels.cu")
     set_gencode_flags_for_srcs(
       SRCS "${SRCS}"
       CUDA_ARCHS "${FP4_ARCHS}")
@@ -534,6 +561,7 @@ define_gpu_extension_target(
   COMPILE_FLAGS ${VLLM_GPU_FLAGS}
   ARCHITECTURES ${VLLM_GPU_ARCHES}
   INCLUDE_DIRECTORIES ${CUTLASS_INCLUDE_DIR}
+  INCLUDE_DIRECTORIES ${CUTLASS_TOOLS_UTIL_INCLUDE_DIR}
   USE_SABI 3
   WITH_SOABI)

@@ -557,7 +585,7 @@ set_gencode_flags_for_srcs(
   CUDA_ARCHS "${CUDA_ARCHS}")
 
 if(VLLM_GPU_LANG STREQUAL "CUDA")
-  cuda_archs_loose_intersection(MARLIN_MOE_ARCHS "8.0;8.6;8.7;8.9;9.0" "${CUDA_ARCHS}")
+  cuda_archs_loose_intersection(MARLIN_MOE_ARCHS "8.0;8.6;8.7;8.9;9.0;10.0;10.1;12.0" "${CUDA_ARCHS}")
   if (MARLIN_MOE_ARCHS)
     set(MARLIN_MOE_SRC
       "csrc/moe/marlin_kernels/marlin_moe_kernel.h"

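Note: these CMake changes add Blackwell-class architectures (10.0, 10.1, 12.0) to the supported, Marlin, and Marlin-MoE arch lists and split the CUTLASS 3.x scaled_mm build into an SM90 path (CUDA >= 12.0) and an SM100 path (CUDA >= 12.8). A minimal sketch of constraining a source build to the affected architectures, assuming the build honors TORCH_CUDA_ARCH_LIST as PyTorch-based builds typically do:

    # Build only the Hopper and Blackwell scaled_mm paths touched above
    # (a CUDA toolkit >= 12.8 is needed for the SM100 kernels)
    export TORCH_CUDA_ARCH_LIST="9.0a;10.0a"
    pip install -e . --no-build-isolation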