Skip to content

Commit 179ff0d

Browse files
authored
Merge branch 'main' into fix_bert_seq_cls
2 parents ed3dba4 + 8c54610 commit 179ff0d

File tree

157 files changed

+3043
-10587
lines changed

Some content is hidden

Large commits have some content hidden by default. Use the search box below to find content that may be hidden.

157 files changed

+3043
-10587
lines changed

.buildkite/nightly-benchmarks/nightly-descriptions.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,7 @@ This benchmark aims to:
88

99
Latest results: [results link](https://blog.vllm.ai/2024/09/05/perf-update.html), scroll to the end.
1010

11-
Latest reproduction guilde: [github issue link](https://github.com/vllm-project/vllm/issues/8176)
11+
Latest reproduction guide: [github issue link](https://github.com/vllm-project/vllm/issues/8176)
1212

1313
## Setup
1414

.buildkite/release-pipeline.yaml

Lines changed: 4 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -1,24 +1,22 @@
11
steps:
22
# aarch64 + CUDA builds. PyTorch 2.8 aarch64 + CUDA wheel is only available on CUDA 12.9
33
- label: "Build arm64 wheel - CUDA 12.9"
4+
depends_on: ~
45
id: build-wheel-arm64-cuda-12-9
56
agents:
67
queue: arm64_cpu_queue_postmerge
78
commands:
89
# #NOTE: torch_cuda_arch_list is derived from upstream PyTorch build files here:
910
# https://github.com/pytorch/pytorch/blob/main/.ci/aarch64_linux/aarch64_ci_build.sh#L7
10-
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.9.1 --build-arg torch_cuda_arch_list='8.7 9.0 10.0+PTX 12.0' --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ."
11+
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.9.1 --build-arg VLLM_MAIN_CUDA_VERSION=12.9 --build-arg torch_cuda_arch_list='8.7 9.0 10.0+PTX 12.0' --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ."
1112
- "mkdir artifacts"
1213
- "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
1314
- "bash .buildkite/scripts/upload-wheels.sh"
1415
env:
1516
DOCKER_BUILDKIT: "1"
1617

17-
- block: "Build CUDA 12.8 wheel"
18-
key: block-build-cu128-wheel
19-
2018
- label: "Build wheel - CUDA 12.8"
21-
depends_on: block-build-cu128-wheel
19+
depends_on: ~
2220
id: build-wheel-cuda-12-8
2321
agents:
2422
queue: cpu_queue_postmerge
@@ -30,12 +28,8 @@ steps:
3028
env:
3129
DOCKER_BUILDKIT: "1"
3230

33-
- block: "Build CUDA 12.6 wheel"
34-
key: block-build-cu126-wheel
35-
depends_on: ~
36-
3731
- label: "Build wheel - CUDA 12.6"
38-
depends_on: block-build-cu126-wheel
32+
depends_on: ~
3933
id: build-wheel-cuda-12-6
4034
agents:
4135
queue: cpu_queue_postmerge
@@ -102,8 +96,6 @@ steps:
10296
depends_on:
10397
- create-multi-arch-manifest
10498
- build-wheel-cuda-12-8
105-
- build-wheel-cuda-12-6
106-
- build-wheel-cuda-12-9
10799
id: annotate-release-workflow
108100
agents:
109101
queue: cpu_queue_postmerge

.buildkite/scripts/annotate-release.sh

Lines changed: 22 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -14,18 +14,33 @@ buildkite-agent annotate --style 'info' --context 'release-workflow' << EOF
1414
To download the wheel:
1515
\`\`\`
1616
aws s3 cp s3://vllm-wheels/${RELEASE_VERSION}/vllm-${RELEASE_VERSION}-cp38-abi3-manylinux1_x86_64.whl .
17+
aws s3 cp s3://vllm-wheels/${RELEASE_VERSION}/vllm-${RELEASE_VERSION}-cp38-abi3-manylinux2014_aarch64.whl .
18+
1719
aws s3 cp s3://vllm-wheels/${RELEASE_VERSION}+cu126/vllm-${RELEASE_VERSION}+cu126-cp38-abi3-manylinux1_x86_64.whl .
18-
aws s3 cp s3://vllm-wheels/${RELEASE_VERSION}+cu118/vllm-${RELEASE_VERSION}+cu118-cp38-abi3-manylinux1_x86_64.whl .
20+
aws s3 cp s3://vllm-wheels/${RELEASE_VERSION}+cu129/vllm-${RELEASE_VERSION}+cu129-cp38-abi3-manylinux1_x86_64.whl .
1921
\`\`\`
2022
2123
To download and upload the image:
2224
2325
\`\`\`
24-
docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}
25-
docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT} vllm/vllm-openai
26-
docker tag vllm/vllm-openai vllm/vllm-openai:latest
27-
docker tag vllm/vllm-openai vllm/vllm-openai:v${RELEASE_VERSION}
28-
docker push vllm/vllm-openai:latest
29-
docker push vllm/vllm-openai:v${RELEASE_VERSION}
26+
docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-x86_64
27+
docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-aarch64
28+
29+
docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-x86_64 vllm/vllm-openai:x86_64
30+
docker tag vllm/vllm-openai:x86_64 vllm/vllm-openai:latest-x86_64
31+
docker tag vllm/vllm-openai:x86_64 vllm/vllm-openai:v${RELEASE_VERSION}-x86_64
32+
docker push vllm/vllm-openai:latest-x86_64
33+
docker push vllm/vllm-openai:v${RELEASE_VERSION}-x86_64
34+
35+
docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-aarch64 vllm/vllm-openai:aarch64
36+
docker tag vllm/vllm-openai:aarch64 vllm/vllm-openai:latest-aarch64
37+
docker tag vllm/vllm-openai:aarch64 vllm/vllm-openai:v${RELEASE_VERSION}-aarch64
38+
docker push vllm/vllm-openai:latest-aarch64
39+
docker push vllm/vllm-openai:v${RELEASE_VERSION}-aarch64
40+
41+
docker manifest create vllm/vllm-openai:latest vllm/vllm-openai:latest-x86_64 vllm/vllm-openai:latest-aarch64 --amend
42+
docker manifest create vllm/vllm-openai:v${RELEASE_VERSION} vllm/vllm-openai:v${RELEASE_VERSION}-x86_64 vllm/vllm-openai:v${RELEASE_VERSION}-aarch64 --amend
43+
docker manifest push vllm/vllm-openai:latest
44+
docker manifest push vllm/vllm-openai:v${RELEASE_VERSION}
3045
\`\`\`
3146
EOF

.buildkite/scripts/hardware_ci/run-cpu-test.sh

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -66,7 +66,6 @@ function cpu_tests() {
6666
6767
pytest -x -v -s tests/models/language/pooling -m cpu_model
6868
pytest -x -v -s tests/models/multimodal/generation \
69-
--ignore=tests/models/multimodal/generation/test_mllama.py \
7069
--ignore=tests/models/multimodal/generation/test_pixtral.py \
7170
-m cpu_model"
7271

.buildkite/test-pipeline.yaml

Lines changed: 1 addition & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -394,6 +394,7 @@ steps:
394394
- pytest -v -s compile/test_async_tp.py
395395
- pytest -v -s compile/test_fusion_all_reduce.py
396396
- pytest -v -s compile/test_decorator.py
397+
- pytest -v -s compile/test_noop_elimination.py
397398

398399
- label: PyTorch Fullgraph Smoke Test # 15min
399400
timeout_in_minutes: 30
@@ -548,15 +549,6 @@ steps:
548549
commands: # LMEval+Transcription WER check
549550
- pytest -s entrypoints/openai/correctness/
550551

551-
- label: Encoder Decoder tests # 12min
552-
timeout_in_minutes: 20
553-
mirror_hardwares: [amdexperimental]
554-
source_file_dependencies:
555-
- vllm/
556-
- tests/encoder_decoder
557-
commands:
558-
- pytest -v -s encoder_decoder
559-
560552
- label: OpenAI-Compatible Tool Use # 23 min
561553
timeout_in_minutes: 35
562554
mirror_hardwares: [amdexperimental]

.coveragerc

Lines changed: 32 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,32 @@
1+
[run]
2+
source = vllm
3+
omit =
4+
*/tests/*
5+
*/test_*
6+
*/__pycache__/*
7+
*/build/*
8+
*/dist/*
9+
*/vllm.egg-info/*
10+
*/third_party/*
11+
*/examples/*
12+
*/benchmarks/*
13+
*/docs/*
14+
15+
[report]
16+
exclude_lines =
17+
pragma: no cover
18+
def __repr__
19+
if self.debug:
20+
if settings.DEBUG
21+
raise AssertionError
22+
raise NotImplementedError
23+
if 0:
24+
if __name__ == .__main__.:
25+
class .*\bProtocol\):
26+
@(abc\.)?abstractmethod
27+
28+
[html]
29+
directory = htmlcov
30+
31+
[xml]
32+
output = coverage.xml

.github/CODEOWNERS

Lines changed: 10 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2,17 +2,20 @@
22
# for more info about CODEOWNERS file
33

44
# This list covers the "core" components of vLLM that require careful review
5+
/vllm/attention @LucasWilkinson
56
/vllm/attention/backends/abstract.py @WoosukKwon @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill
67
/vllm/core @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill
78
/vllm/engine/llm_engine.py @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill
89
/vllm/executor/executor_base.py @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill @22quinn
910
/vllm/worker/worker_base.py @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill @22quinn
1011
/vllm/worker/worker.py @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill
12+
/vllm/model_executor/layers/fused_moe @mgoin
1113
/vllm/model_executor/layers/sampler.py @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill @NickLucche
1214
/vllm/model_executor/layers/quantization @mgoin @robertgshaw2-redhat @tlrmchlsmth @yewentao256
1315
/vllm/model_executor/layers/mamba @tdoublep
1416
/vllm/model_executor/model_loader @22quinn
1517
/vllm/multimodal @DarkLight1337 @ywang96 @NickLucche
18+
/vllm/v1/attention @LucasWilkinson
1619
/vllm/v1/sample @22quinn @houseroad
1720
/vllm/vllm_flash_attn @LucasWilkinson
1821
/vllm/lora @jeejeelee
@@ -30,6 +33,7 @@ CMakeLists.txt @tlrmchlsmth @LucasWilkinson
3033
/vllm/v1 @WoosukKwon @robertgshaw2-redhat @njhill @ywang96 @comaniac @alexm-redhat
3134
/vllm/v1/structured_output @mgoin @russellb @aarnphm @benchislett
3235
/vllm/v1/spec_decode @benchislett @luccafong
36+
/vllm/v1/attention/backends/flashinfer.py @mgoin
3337
/vllm/v1/attention/backends/triton_attn.py @tdoublep
3438
/vllm/v1/core @heheda12345
3539
/vllm/v1/kv_cache_interface.py @heheda12345
@@ -41,7 +45,8 @@ CMakeLists.txt @tlrmchlsmth @LucasWilkinson
4145
/tests/distributed/test_pipeline_parallel.py @youkaichao
4246
/tests/distributed/test_same_node.py @youkaichao
4347
/tests/entrypoints @DarkLight1337 @robertgshaw2-redhat @simon-mo @aarnphm @NickLucche
44-
/tests/kernels @tlrmchlsmth @WoosukKwon @yewentao256
48+
/tests/evals @mgoin
49+
/tests/kernels @mgoin @tlrmchlsmth @WoosukKwon @yewentao256
4550
/tests/models @DarkLight1337 @ywang96
4651
/tests/multimodal @DarkLight1337 @ywang96 @NickLucche
4752
/tests/prefix_caching @comaniac @KuntaiDu
@@ -101,4 +106,7 @@ mkdocs.yaml @hmellor
101106
/vllm/v1/worker/tpu* @NickLucche
102107
/vllm/platforms/tpu.py @NickLucche
103108
/vllm/v1/sample/tpu @NickLucche
104-
/vllm/tests/v1/tpu @NickLucche
109+
/vllm/tests/v1/tpu @NickLucche
110+
111+
# KVConnector installation files
112+
/requirements/kv_connectors.txt @NickLucche

CMakeLists.txt

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -783,6 +783,17 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
783783
endif()
784784
endif()
785785

786+
# Hadacore kernels
787+
cuda_archs_loose_intersection(HADACORE_ARCHS "8.0;8.9;9.0" "${CUDA_ARCHS}")
788+
if(HADACORE_ARCHS)
789+
set(SRCS "csrc/quantization/hadamard/hadacore/hadamard_transform_cuda.cu")
790+
set_gencode_flags_for_srcs(
791+
SRCS "${SRCS}"
792+
CUDA_ARCHS "${HADACORE_ARCHS}")
793+
list(APPEND VLLM_EXT_SRC "${SRCS}")
794+
message(STATUS "Building hadacore")
795+
endif()
796+
786797
# if CUDA endif
787798
endif()
788799

csrc/ops.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -347,6 +347,8 @@ std::tuple<int64_t, torch::Tensor> allocate_shared_buffer_and_handle(
347347
int64_t open_mem_handle(torch::Tensor& mem_handle);
348348
void free_shared_buffer(int64_t buffer);
349349

350+
torch::Tensor hadacore_transform(torch::Tensor& x, bool inplace);
351+
350352
#ifdef USE_ROCM
351353
fptr_t init_custom_qr(int64_t rank, int64_t world_size,
352354
std::optional<int64_t> qr_max_size = std::nullopt);

csrc/quantization/cutlass_w8a8/c3x/scaled_mm_blockwise_sm100_fp8_dispatch.cuh

Lines changed: 23 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -146,6 +146,7 @@ void cutlass_gemm_caller_blockwise(torch::Tensor& out, torch::Tensor const& a,
146146

147147
using ElementAB = typename Gemm::ElementAB;
148148
using ElementD = typename Gemm::ElementD;
149+
using ElementBlockScale = typename Gemm::ElementBlockScale;
149150

150151
int32_t m = a.size(0), n = b.size(1), k = a.size(1);
151152

@@ -166,26 +167,29 @@ void cutlass_gemm_caller_blockwise(torch::Tensor& out, torch::Tensor const& a,
166167
ScaleConfig::tile_atom_to_shape_SFB(make_shape(n, m, k, 1)) :
167168
ScaleConfig::tile_atom_to_shape_SFB(make_shape(m, n, k, 1));
168169

169-
auto a_ptr = static_cast<ElementAB*>(a.data_ptr());
170-
auto b_ptr = static_cast<ElementAB*>(b.data_ptr());
171-
auto a_scales_ptr = static_cast<float*>(a_scales.data_ptr());
172-
auto b_scales_ptr = static_cast<float*>(b_scales.data_ptr());
170+
auto a_ptr = static_cast<ElementAB const*>(a.data_ptr());
171+
auto b_ptr = static_cast<ElementAB const*>(b.data_ptr());
172+
auto a_scales_ptr = static_cast<ElementBlockScale const*>(a_scales.data_ptr());
173+
auto b_scales_ptr = static_cast<ElementBlockScale const*>(b_scales.data_ptr());
173174

174-
auto mainloop_args = [&](){
175-
// layout_SFA and layout_SFB cannot be swapped since they are deduced.
176-
if (swap_ab) {
177-
return typename GemmKernel::MainloopArguments{
178-
b_ptr, b_stride, a_ptr, a_stride,
179-
b_scales_ptr, layout_SFA, a_scales_ptr, layout_SFB
180-
};
181-
}
182-
else {
183-
return typename GemmKernel::MainloopArguments{
184-
a_ptr, a_stride, b_ptr, b_stride,
185-
a_scales_ptr, layout_SFA, b_scales_ptr, layout_SFB
186-
};
187-
}
188-
}();
175+
typename GemmKernel::MainloopArguments mainloop_args{};
176+
mainloop_args.layout_SFA = layout_SFA;
177+
mainloop_args.layout_SFB = layout_SFB;
178+
if (swap_ab) {
179+
mainloop_args.ptr_A = b_ptr;
180+
mainloop_args.dA = b_stride;
181+
mainloop_args.ptr_B = a_ptr;
182+
mainloop_args.dB = a_stride;
183+
mainloop_args.ptr_SFA = b_scales_ptr;
184+
mainloop_args.ptr_SFB = a_scales_ptr;
185+
} else {
186+
mainloop_args.ptr_A = a_ptr;
187+
mainloop_args.dA = a_stride;
188+
mainloop_args.ptr_B = b_ptr;
189+
mainloop_args.dB = b_stride;
190+
mainloop_args.ptr_SFA = a_scales_ptr;
191+
mainloop_args.ptr_SFB = b_scales_ptr;
192+
}
189193
auto prob_shape = swap_ab ? cute::make_shape(n, m, k, 1) : cute::make_shape(m, n, k, 1);
190194

191195
auto c_ptr = static_cast<ElementD*>(out.data_ptr());

0 commit comments

Comments
 (0)