Skip to content

Commit b220766

Browse files
authored
Merge branch 'main' into feat/splade-sparse-embedding
2 parents 3276ca4 + c6187f5 commit b220766

File tree

417 files changed

+11231
-5663
lines changed

Some content is hidden

Large commits have some content hidden by default. Use the search box below to find content that may be hidden.

417 files changed

+11231
-5663
lines changed

.buildkite/nightly-benchmarks/scripts/run-performance-benchmarks.sh

Lines changed: 0 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -454,11 +454,6 @@ main() {
454454
fi
455455
check_hf_token
456456

457-
# Set to v1 to run v1 benchmark
458-
if [[ "${ENGINE_VERSION:-v0}" == "v1" ]]; then
459-
export VLLM_USE_V1=1
460-
fi
461-
462457
# dependencies
463458
(which wget && which curl) || (apt-get update && apt-get install -y wget curl)
464459
(which jq) || (apt-get update && apt-get -y install jq)

.buildkite/scripts/hardware_ci/run-tpu-v1-test-part2.sh

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -64,10 +64,9 @@ python3 -m pip install --progress-bar off git+https://github.com/thuml/depyf.git
6464
&& python3 -m pip install --progress-bar off "lm-eval @ git+https://github.com/EleutherAI/lm-evaluation-harness.git@206b7722158f58c35b7ffcd53b035fdbdda5126d" \
6565
&& python3 -m pip install --progress-bar off hf-transfer tblib==3.1.0
6666
echo "--- Python dependencies installed ---"
67-
export VLLM_USE_V1=1
67+
6868
export VLLM_XLA_CHECK_RECOMPILATION=1
6969
export VLLM_XLA_CACHE_PATH=
70-
echo "Using VLLM V1"
7170
7271
echo "--- Hardware Information ---"
7372
# tpu-info

.buildkite/scripts/hardware_ci/run-tpu-v1-test.sh

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -64,10 +64,9 @@ python3 -m pip install --progress-bar off git+https://github.com/thuml/depyf.git
6464
&& python3 -m pip install --progress-bar off "lm-eval @ git+https://github.com/EleutherAI/lm-evaluation-harness.git@206b7722158f58c35b7ffcd53b035fdbdda5126d" \
6565
&& python3 -m pip install --progress-bar off hf-transfer tblib==3.1.0
6666
echo "--- Python dependencies installed ---"
67-
export VLLM_USE_V1=1
67+
6868
export VLLM_XLA_CHECK_RECOMPILATION=1
6969
export VLLM_XLA_CACHE_PATH=
70-
echo "Using VLLM V1"
7170
7271
echo "--- Hardware Information ---"
7372
# tpu-info

.buildkite/scripts/tpu/quantized_v6e_1.env

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,6 @@ MAX_NUM_BATCHED_TOKENS=1024
99
TENSOR_PARALLEL_SIZE=1
1010
MAX_MODEL_LEN=2048
1111
DOWNLOAD_DIR=/mnt/disks/persist
12-
EXPECTED_THROUGHPUT=10.0
12+
EXPECTED_THROUGHPUT=8.7
1313
INPUT_LEN=1800
1414
OUTPUT_LEN=128

.buildkite/scripts/tpu/run_bm.sh

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -42,7 +42,7 @@ echo "lanching vllm..."
4242
echo "logging to $VLLM_LOG"
4343
echo
4444

45-
VLLM_USE_V1=1 vllm serve $MODEL \
45+
vllm serve $MODEL \
4646
--seed 42 \
4747
--max-num-seqs $MAX_NUM_SEQS \
4848
--max-num-batched-tokens $MAX_NUM_BATCHED_TOKENS \

.buildkite/test-pipeline.yaml

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -296,6 +296,7 @@ steps:
296296
- tests/v1
297297
commands:
298298
# split the test to avoid interference
299+
- pytest -v -s -m 'not cpu_test' v1/core
299300
- pytest -v -s v1/executor
300301
- pytest -v -s v1/kv_offload
301302
- pytest -v -s v1/sample
@@ -317,7 +318,7 @@ steps:
317318
no_gpu: true
318319
commands:
319320
# split the test to avoid interference
320-
- pytest -v -s v1/core
321+
- pytest -v -s -m 'cpu_test' v1/core
321322
- pytest -v -s v1/structured_output
322323
- pytest -v -s v1/test_serial_utils.py
323324
- pytest -v -s -m 'cpu_test' v1/kv_connector/unit
@@ -828,7 +829,7 @@ steps:
828829
- pytest -v -s tests/kernels/quantization/test_flashinfer_scaled_mm.py
829830
- pytest -v -s tests/kernels/quantization/test_flashinfer_nvfp4_scaled_mm.py
830831
- pytest -v -s tests/kernels/moe/test_nvfp4_moe.py
831-
- pytest -v -s tests/kernels/moe/test_mxfp4_moe.py
832+
- pytest -v -s tests/kernels/moe/test_ocp_mx_moe.py
832833
# Fusion
833834
- pytest -v -s tests/compile/test_fusion_all_reduce.py
834835
- pytest -v -s tests/compile/test_fusion_attn.py::test_attention_quant_pattern
@@ -867,7 +868,7 @@ steps:
867868
- pytest -s -v tests/quantization/test_blackwell_moe.py
868869

869870
- label: Blackwell LM Eval Small Models
870-
timeout_in_minutes: 75
871+
timeout_in_minutes: 120
871872
gpu: b200
872873
optional: true # run on nightlies
873874
source_file_dependencies:

.github/mergify.yml

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,8 @@ pull_request_rules:
1111
label:
1212
add:
1313
- documentation
14+
comment:
15+
message: "Documentation preview: https://vllm--{{number}}.org.readthedocs.build/en/{{number}}/"
1416

1517
- name: label-ci-build
1618
description: Automatically apply ci/build label

.pre-commit-config.yaml

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -55,11 +55,6 @@ repos:
5555
types_or: [python, pyi]
5656
require_serial: true
5757
additional_dependencies: [mypy==1.11.1, regex, types-cachetools, types-setuptools, types-PyYAML, types-requests, types-torch, pydantic]
58-
- id: mypy-3.9 # TODO: Use https://github.com/pre-commit/mirrors-mypy when mypy setup is less awkward
59-
name: Run mypy for Python 3.9
60-
entry: python tools/pre_commit/mypy.py 1 "3.9"
61-
<<: *mypy_common
62-
stages: [manual] # Only run in CI
6358
- id: mypy-3.10 # TODO: Use https://github.com/pre-commit/mirrors-mypy when mypy setup is less awkward
6459
name: Run mypy for Python 3.10
6560
entry: python tools/pre_commit/mypy.py 1 "3.10"
@@ -75,6 +70,11 @@ repos:
7570
entry: python tools/pre_commit/mypy.py 1 "3.12"
7671
<<: *mypy_common
7772
stages: [manual] # Only run in CI
73+
- id: mypy-3.13 # TODO: Use https://github.com/pre-commit/mirrors-mypy when mypy setup is less awkward
74+
name: Run mypy for Python 3.13
75+
entry: python tools/pre_commit/mypy.py 1 "3.13"
76+
<<: *mypy_common
77+
stages: [manual] # Only run in CI
7878
- id: shellcheck
7979
name: Lint shell scripts
8080
entry: tools/shellcheck.sh

CMakeLists.txt

Lines changed: 22 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -34,7 +34,7 @@ install(CODE "set(CMAKE_INSTALL_LOCAL_ONLY TRUE)" ALL_COMPONENTS)
3434
# Supported python versions. These versions will be searched in order, the
3535
# first match will be selected. These should be kept in sync with setup.py.
3636
#
37-
set(PYTHON_SUPPORTED_VERSIONS "3.9" "3.10" "3.11" "3.12" "3.13")
37+
set(PYTHON_SUPPORTED_VERSIONS "3.10" "3.11" "3.12" "3.13")
3838

3939
# Supported AMD GPU architectures.
4040
set(HIP_SUPPORTED_ARCHS "gfx906;gfx908;gfx90a;gfx942;gfx950;gfx1030;gfx1100;gfx1101;gfx1200;gfx1201;gfx1150;gfx1151")
@@ -269,8 +269,8 @@ set(VLLM_EXT_SRC
269269
"csrc/sampler.cu"
270270
"csrc/cuda_view.cu"
271271
"csrc/quantization/gptq/q_gemm.cu"
272-
"csrc/quantization/compressed_tensors/int8_quant_kernels.cu"
273-
"csrc/quantization/fp8/common.cu"
272+
"csrc/quantization/w8a8/int8/scaled_quant.cu"
273+
"csrc/quantization/w8a8/fp8/common.cu"
274274
"csrc/quantization/fused_kernels/fused_layernorm_dynamic_per_token_quant.cu"
275275
"csrc/quantization/gguf/gguf_kernel.cu"
276276
"csrc/quantization/activation_kernels.cu"
@@ -314,12 +314,13 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
314314
list(APPEND VLLM_EXT_SRC
315315
"csrc/quantization/awq/gemm_kernels.cu"
316316
"csrc/permute_cols.cu"
317-
"csrc/quantization/cutlass_w8a8/scaled_mm_entry.cu"
317+
"csrc/quantization/w8a8/cutlass/scaled_mm_entry.cu"
318318
"csrc/quantization/fp4/nvfp4_quant_entry.cu"
319319
"csrc/quantization/fp4/nvfp4_scaled_mm_entry.cu"
320320
"csrc/sparse/cutlass/sparse_scaled_mm_entry.cu"
321321
"csrc/cutlass_extensions/common.cpp"
322-
"csrc/quantization/fp8/per_token_group_quant.cu")
322+
"csrc/quantization/w8a8/fp8/per_token_group_quant.cu"
323+
"csrc/quantization/w8a8/int8/per_token_group_quant.cu")
323324

324325
set_gencode_flags_for_srcs(
325326
SRCS "${VLLM_EXT_SRC}"
@@ -423,11 +424,11 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
423424
cuda_archs_loose_intersection(SCALED_MM_ARCHS "9.0a;" "${CUDA_ARCHS}")
424425
if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.0 AND SCALED_MM_ARCHS)
425426
set(SRCS
426-
"csrc/quantization/cutlass_w8a8/scaled_mm_c3x_sm90.cu"
427-
"csrc/quantization/cutlass_w8a8/c3x/scaled_mm_sm90_fp8.cu"
428-
"csrc/quantization/cutlass_w8a8/c3x/scaled_mm_sm90_int8.cu"
429-
"csrc/quantization/cutlass_w8a8/c3x/scaled_mm_azp_sm90_int8.cu"
430-
"csrc/quantization/cutlass_w8a8/c3x/scaled_mm_blockwise_sm90_fp8.cu")
427+
"csrc/quantization/w8a8/cutlass/scaled_mm_c3x_sm90.cu"
428+
"csrc/quantization/w8a8/cutlass/c3x/scaled_mm_sm90_fp8.cu"
429+
"csrc/quantization/w8a8/cutlass/c3x/scaled_mm_sm90_int8.cu"
430+
"csrc/quantization/w8a8/cutlass/c3x/scaled_mm_azp_sm90_int8.cu"
431+
"csrc/quantization/w8a8/cutlass/c3x/scaled_mm_blockwise_sm90_fp8.cu")
431432
set_gencode_flags_for_srcs(
432433
SRCS "${SRCS}"
433434
CUDA_ARCHS "${SCALED_MM_ARCHS}")
@@ -458,9 +459,9 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
458459
endif()
459460
if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8 AND SCALED_MM_ARCHS)
460461
set(SRCS
461-
"csrc/quantization/cutlass_w8a8/scaled_mm_c3x_sm120.cu"
462-
"csrc/quantization/cutlass_w8a8/c3x/scaled_mm_sm120_fp8.cu"
463-
"csrc/quantization/cutlass_w8a8/c3x/scaled_mm_blockwise_sm120_fp8.cu"
462+
"csrc/quantization/w8a8/cutlass/scaled_mm_c3x_sm120.cu"
463+
"csrc/quantization/w8a8/cutlass/c3x/scaled_mm_sm120_fp8.cu"
464+
"csrc/quantization/w8a8/cutlass/c3x/scaled_mm_blockwise_sm120_fp8.cu"
464465
)
465466
set_gencode_flags_for_srcs(
466467
SRCS "${SRCS}"
@@ -492,9 +493,9 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
492493
endif()
493494
if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8 AND SCALED_MM_ARCHS)
494495
set(SRCS
495-
"csrc/quantization/cutlass_w8a8/scaled_mm_c3x_sm100.cu"
496-
"csrc/quantization/cutlass_w8a8/c3x/scaled_mm_sm100_fp8.cu"
497-
"csrc/quantization/cutlass_w8a8/c3x/scaled_mm_blockwise_sm100_fp8.cu"
496+
"csrc/quantization/w8a8/cutlass/scaled_mm_c3x_sm100.cu"
497+
"csrc/quantization/w8a8/cutlass/c3x/scaled_mm_sm100_fp8.cu"
498+
"csrc/quantization/w8a8/cutlass/c3x/scaled_mm_blockwise_sm100_fp8.cu"
498499
)
499500
set_gencode_flags_for_srcs(
500501
SRCS "${SRCS}"
@@ -525,7 +526,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
525526
# subtract out the archs that are already built for 3x
526527
list(REMOVE_ITEM SCALED_MM_2X_ARCHS ${SCALED_MM_3X_ARCHS})
527528
if (SCALED_MM_2X_ARCHS)
528-
set(SRCS "csrc/quantization/cutlass_w8a8/scaled_mm_c2x.cu")
529+
set(SRCS "csrc/quantization/w8a8/cutlass/scaled_mm_c2x.cu")
529530
set_gencode_flags_for_srcs(
530531
SRCS "${SRCS}"
531532
CUDA_ARCHS "${SCALED_MM_2X_ARCHS}")
@@ -648,7 +649,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
648649
# if it's possible to compile MoE kernels that use its output.
649650
cuda_archs_loose_intersection(SCALED_MM_ARCHS "9.0a" "${CUDA_ARCHS}")
650651
if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.3 AND SCALED_MM_ARCHS)
651-
set(SRCS "csrc/quantization/cutlass_w8a8/moe/grouped_mm_c3x_sm90.cu")
652+
set(SRCS "csrc/quantization/w8a8/cutlass/moe/grouped_mm_c3x_sm90.cu")
652653
set_gencode_flags_for_srcs(
653654
SRCS "${SRCS}"
654655
CUDA_ARCHS "${SCALED_MM_ARCHS}")
@@ -672,7 +673,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
672673
cuda_archs_loose_intersection(SCALED_MM_ARCHS "10.0a" "${CUDA_ARCHS}")
673674
endif()
674675
if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8 AND SCALED_MM_ARCHS)
675-
set(SRCS "csrc/quantization/cutlass_w8a8/moe/grouped_mm_c3x_sm100.cu")
676+
set(SRCS "csrc/quantization/w8a8/cutlass/moe/grouped_mm_c3x_sm100.cu")
676677
set_gencode_flags_for_srcs(
677678
SRCS "${SRCS}"
678679
CUDA_ARCHS "${SCALED_MM_ARCHS}")
@@ -697,7 +698,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
697698
cuda_archs_loose_intersection(CUTLASS_MOE_DATA_ARCHS "9.0a;10.0a;10.1a;10.3a;12.0a;12.1a" "${CUDA_ARCHS}")
698699
endif()
699700
if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.3 AND CUTLASS_MOE_DATA_ARCHS)
700-
set(SRCS "csrc/quantization/cutlass_w8a8/moe/moe_data.cu")
701+
set(SRCS "csrc/quantization/w8a8/cutlass/moe/moe_data.cu")
701702
set_gencode_flags_for_srcs(
702703
SRCS "${SRCS}"
703704
CUDA_ARCHS "${CUTLASS_MOE_DATA_ARCHS}")
@@ -720,7 +721,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
720721
cuda_archs_loose_intersection(SCALED_MM_ARCHS "10.0a;10.1a;10.3a;12.0a;12.1a" "${CUDA_ARCHS}")
721722
endif()
722723
if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8 AND SCALED_MM_ARCHS)
723-
set(SRCS "csrc/quantization/cutlass_w8a8/moe/blockwise_scaled_group_mm_sm100.cu")
724+
set(SRCS "csrc/quantization/w8a8/cutlass/moe/blockwise_scaled_group_mm_sm100.cu")
724725
set_gencode_flags_for_srcs(
725726
SRCS "${SRCS}"
726727
CUDA_ARCHS "${SCALED_MM_ARCHS}")

README.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -149,6 +149,7 @@ Compute Resources:
149149
- Trainy
150150
- UC Berkeley
151151
- UC San Diego
152+
- Volcengine
152153

153154
Slack Sponsor: Anyscale
154155

0 commit comments

Comments
 (0)