vllm-project
diff --git a/‎.buildkite/nightly-benchmarks/nightly-descriptions.md‎
Lines changed: 1 addition & 1 deletion b/‎.buildkite/nightly-benchmarks/nightly-descriptions.md‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎.buildkite/release-pipeline.yaml‎
Lines changed: 4 additions & 12 deletions b/‎.buildkite/release-pipeline.yaml‎
Lines changed: 4 additions & 12 deletions
diff --git a/‎.buildkite/scripts/annotate-release.sh‎
Lines changed: 22 additions & 7 deletions b/‎.buildkite/scripts/annotate-release.sh‎
Lines changed: 22 additions & 7 deletions
diff --git a/‎.buildkite/scripts/hardware_ci/run-cpu-test.sh‎
Lines changed: 0 additions & 1 deletion b/‎.buildkite/scripts/hardware_ci/run-cpu-test.sh‎
Lines changed: 0 additions & 1 deletion
diff --git a/‎.buildkite/test-pipeline.yaml‎
Lines changed: 1 addition & 9 deletions b/‎.buildkite/test-pipeline.yaml‎
Lines changed: 1 addition & 9 deletions
diff --git a/‎.coveragerc‎
Lines changed: 32 additions & 0 deletions b/‎.coveragerc‎
Lines changed: 32 additions & 0 deletions
diff --git a/‎.github/CODEOWNERS‎
Lines changed: 10 additions & 2 deletions b/‎.github/CODEOWNERS‎
Lines changed: 10 additions & 2 deletions
diff --git a/‎CMakeLists.txt‎
Lines changed: 11 additions & 0 deletions b/‎CMakeLists.txt‎
Lines changed: 11 additions & 0 deletions
diff --git a/‎csrc/ops.h‎
Lines changed: 2 additions & 0 deletions b/‎csrc/ops.h‎
Lines changed: 2 additions & 0 deletions
diff --git a/‎csrc/quantization/cutlass_w8a8/c3x/scaled_mm_blockwise_sm100_fp8_dispatch.cuh‎
Lines changed: 23 additions & 19 deletions b/‎csrc/quantization/cutlass_w8a8/c3x/scaled_mm_blockwise_sm100_fp8_dispatch.cuh‎
Lines changed: 23 additions & 19 deletions
@@ -8,7 +8,7 @@ This benchmark aims to:
 
 Latest results: [results link](https://blog.vllm.ai/2024/09/05/perf-update.html), scroll to the end.
 
-Latest reproduction guilde: [github issue link](https://github.com/vllm-project/vllm/issues/8176)
+Latest reproduction guide: [github issue link](https://github.com/vllm-project/vllm/issues/8176)
 
 ## Setup
 
 
@@ -1,24 +1,22 @@
 steps:
   # aarch64 + CUDA builds. PyTorch 2.8 aarch64 + CUDA wheel is only available on CUDA 12.9
   - label: "Build arm64 wheel - CUDA 12.9"
+    depends_on: ~
     id: build-wheel-arm64-cuda-12-9
     agents:
       queue: arm64_cpu_queue_postmerge
     commands:
       # #NOTE: torch_cuda_arch_list is derived from upstream PyTorch build files here:
       # https://github.com/pytorch/pytorch/blob/main/.ci/aarch64_linux/aarch64_ci_build.sh#L7
-      - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.9.1 --build-arg torch_cuda_arch_list='8.7 9.0 10.0+PTX 12.0' --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ."
+      - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.9.1 --build-arg VLLM_MAIN_CUDA_VERSION=12.9 --build-arg torch_cuda_arch_list='8.7 9.0 10.0+PTX 12.0' --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ."
       - "mkdir artifacts"
       - "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
       - "bash .buildkite/scripts/upload-wheels.sh"
     env:
       DOCKER_BUILDKIT: "1"
 
-  - block: "Build CUDA 12.8 wheel"
-    key: block-build-cu128-wheel
-
   - label: "Build wheel - CUDA 12.8"
-    depends_on: block-build-cu128-wheel
+    depends_on: ~
     id: build-wheel-cuda-12-8
     agents:
       queue: cpu_queue_postmerge
@@ -30,12 +28,8 @@ steps:
     env:
       DOCKER_BUILDKIT: "1"
 
-  - block: "Build CUDA 12.6 wheel"
-    key: block-build-cu126-wheel
-    depends_on: ~
-
   - label: "Build wheel - CUDA 12.6"
-    depends_on: block-build-cu126-wheel
+    depends_on: ~
     id: build-wheel-cuda-12-6
     agents:
       queue: cpu_queue_postmerge
@@ -102,8 +96,6 @@ steps:
     depends_on:
       - create-multi-arch-manifest
       - build-wheel-cuda-12-8
-      - build-wheel-cuda-12-6
-      - build-wheel-cuda-12-9
     id: annotate-release-workflow
     agents:
       queue: cpu_queue_postmerge
 
@@ -14,18 +14,33 @@ buildkite-agent annotate --style 'info' --context 'release-workflow' << EOF
 To download the wheel:
 \`\`\`
 aws s3 cp s3://vllm-wheels/${RELEASE_VERSION}/vllm-${RELEASE_VERSION}-cp38-abi3-manylinux1_x86_64.whl .
+aws s3 cp s3://vllm-wheels/${RELEASE_VERSION}/vllm-${RELEASE_VERSION}-cp38-abi3-manylinux2014_aarch64.whl .
+
 aws s3 cp s3://vllm-wheels/${RELEASE_VERSION}+cu126/vllm-${RELEASE_VERSION}+cu126-cp38-abi3-manylinux1_x86_64.whl .
-aws s3 cp s3://vllm-wheels/${RELEASE_VERSION}+cu118/vllm-${RELEASE_VERSION}+cu118-cp38-abi3-manylinux1_x86_64.whl . 
+aws s3 cp s3://vllm-wheels/${RELEASE_VERSION}+cu129/vllm-${RELEASE_VERSION}+cu129-cp38-abi3-manylinux1_x86_64.whl .
 \`\`\`
 
 To download and upload the image:
 
 \`\`\`
-docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}
-docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT} vllm/vllm-openai
-docker tag vllm/vllm-openai vllm/vllm-openai:latest
-docker tag vllm/vllm-openai vllm/vllm-openai:v${RELEASE_VERSION}
-docker push vllm/vllm-openai:latest
-docker push vllm/vllm-openai:v${RELEASE_VERSION}
+docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-x86_64
+docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-aarch64
+
+docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-x86_64 vllm/vllm-openai:x86_64
+docker tag vllm/vllm-openai:x86_64 vllm/vllm-openai:latest-x86_64
+docker tag vllm/vllm-openai:x86_64 vllm/vllm-openai:v${RELEASE_VERSION}-x86_64
+docker push vllm/vllm-openai:latest-x86_64
+docker push vllm/vllm-openai:v${RELEASE_VERSION}-x86_64
+
+docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-aarch64 vllm/vllm-openai:aarch64
+docker tag vllm/vllm-openai:aarch64 vllm/vllm-openai:latest-aarch64
+docker tag vllm/vllm-openai:aarch64 vllm/vllm-openai:v${RELEASE_VERSION}-aarch64
+docker push vllm/vllm-openai:latest-aarch64
+docker push vllm/vllm-openai:v${RELEASE_VERSION}-aarch64
+
+docker manifest create vllm/vllm-openai:latest vllm/vllm-openai:latest-x86_64 vllm/vllm-openai:latest-aarch64 --amend
+docker manifest create vllm/vllm-openai:v${RELEASE_VERSION} vllm/vllm-openai:v${RELEASE_VERSION}-x86_64 vllm/vllm-openai:v${RELEASE_VERSION}-aarch64 --amend
+docker manifest push vllm/vllm-openai:latest
+docker manifest push vllm/vllm-openai:v${RELEASE_VERSION}
 \`\`\`
 EOF 
@@ -66,7 +66,6 @@ function cpu_tests() {
 
     pytest -x -v -s tests/models/language/pooling -m cpu_model
     pytest -x -v -s tests/models/multimodal/generation \
-                --ignore=tests/models/multimodal/generation/test_mllama.py \
                 --ignore=tests/models/multimodal/generation/test_pixtral.py \
                 -m cpu_model"
 
 
@@ -394,6 +394,7 @@ steps:
     - pytest -v -s compile/test_async_tp.py
     - pytest -v -s compile/test_fusion_all_reduce.py
     - pytest -v -s compile/test_decorator.py
+    - pytest -v -s compile/test_noop_elimination.py
 
 - label: PyTorch Fullgraph Smoke Test # 15min
   timeout_in_minutes: 30
@@ -548,15 +549,6 @@ steps:
   commands: # LMEval+Transcription WER check
   - pytest -s entrypoints/openai/correctness/
 
-- label: Encoder Decoder tests # 12min
-  timeout_in_minutes: 20
-  mirror_hardwares: [amdexperimental]
-  source_file_dependencies:
-  - vllm/
-  - tests/encoder_decoder
-  commands:
-    - pytest -v -s encoder_decoder
-
 - label: OpenAI-Compatible Tool Use # 23 min
   timeout_in_minutes: 35
   mirror_hardwares: [amdexperimental]
 
@@ -0,0 +1,32 @@
+[run]
+source = vllm
+omit =
+    */tests/*
+    */test_*
+    */__pycache__/*
+    */build/*
+    */dist/*
+    */vllm.egg-info/*
+    */third_party/*
+    */examples/*
+    */benchmarks/*
+    */docs/*
+
+[report]
+exclude_lines =
+    pragma: no cover
+    def __repr__
+    if self.debug:
+    if settings.DEBUG
+    raise AssertionError
+    raise NotImplementedError
+    if 0:
+    if __name__ == .__main__.:
+    class .*\bProtocol\):
+    @(abc\.)?abstractmethod
+
+[html]
+directory = htmlcov
+
+[xml]
+output = coverage.xml
@@ -2,17 +2,20 @@
 # for more info about CODEOWNERS file
 
 # This lists cover the "core" components of vLLM that require careful review
+/vllm/attention @LucasWilkinson
 /vllm/attention/backends/abstract.py @WoosukKwon @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill
 /vllm/core @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill
 /vllm/engine/llm_engine.py @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill
 /vllm/executor/executor_base.py @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill @22quinn
 /vllm/worker/worker_base.py @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill @22quinn
 /vllm/worker/worker.py @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill
+/vllm/model_executor/layers/fused_moe @mgoin
 /vllm/model_executor/layers/sampler.py @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill @NickLucche
 /vllm/model_executor/layers/quantization @mgoin @robertgshaw2-redhat @tlrmchlsmth @yewentao256
 /vllm/model_executor/layers/mamba @tdoublep
 /vllm/model_executor/model_loader @22quinn
 /vllm/multimodal @DarkLight1337 @ywang96 @NickLucche
+/vllm/v1/attention @LucasWilkinson
 /vllm/v1/sample @22quinn @houseroad
 /vllm/vllm_flash_attn @LucasWilkinson
 /vllm/lora @jeejeelee
@@ -30,6 +33,7 @@ CMakeLists.txt @tlrmchlsmth @LucasWilkinson
 /vllm/v1 @WoosukKwon @robertgshaw2-redhat @njhill @ywang96 @comaniac @alexm-redhat
 /vllm/v1/structured_output @mgoin @russellb @aarnphm @benchislett
 /vllm/v1/spec_decode @benchislett @luccafong
+/vllm/v1/attention/backends/flashinfer.py @mgoin
 /vllm/v1/attention/backends/triton_attn.py @tdoublep
 /vllm/v1/core @heheda12345
 /vllm/v1/kv_cache_interface.py @heheda12345
@@ -41,7 +45,8 @@ CMakeLists.txt @tlrmchlsmth @LucasWilkinson
 /tests/distributed/test_pipeline_parallel.py @youkaichao
 /tests/distributed/test_same_node.py @youkaichao
 /tests/entrypoints @DarkLight1337 @robertgshaw2-redhat @simon-mo @aarnphm @NickLucche
-/tests/kernels @tlrmchlsmth @WoosukKwon @yewentao256
+/tests/evals @mgoin
+/tests/kernels @mgoin @tlrmchlsmth @WoosukKwon @yewentao256
 /tests/models @DarkLight1337 @ywang96
 /tests/multimodal @DarkLight1337 @ywang96 @NickLucche
 /tests/prefix_caching @comaniac @KuntaiDu
@@ -101,4 +106,7 @@ mkdocs.yaml @hmellor
 /vllm/v1/worker/tpu* @NickLucche
 /vllm/platforms/tpu.py @NickLucche
 /vllm/v1/sample/tpu @NickLucche
-/vllm/tests/v1/tpu @NickLucche
+/vllm/tests/v1/tpu @NickLucche
+
+# KVConnector installation files
+/requirements/kv_connectors.txt @NickLucche
@@ -783,6 +783,17 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
     endif()
   endif()
 
+  # Hadacore kernels
+  cuda_archs_loose_intersection(HADACORE_ARCHS "8.0;8.9;9.0" "${CUDA_ARCHS}")
+  if(HADACORE_ARCHS)
+    set(SRCS "csrc/quantization/hadamard/hadacore/hadamard_transform_cuda.cu")
+    set_gencode_flags_for_srcs(
+      SRCS "${SRCS}"
+      CUDA_ARCHS "${HADACORE_ARCHS}")
+    list(APPEND VLLM_EXT_SRC "${SRCS}")
+    message(STATUS "Building hadacore")
+  endif()
+
 # if CUDA endif
 endif()
 
 
@@ -347,6 +347,8 @@ std::tuple<int64_t, torch::Tensor> allocate_shared_buffer_and_handle(
 int64_t open_mem_handle(torch::Tensor& mem_handle);
 void free_shared_buffer(int64_t buffer);
 
+torch::Tensor hadacore_transform(torch::Tensor& x, bool inplace);
+
 #ifdef USE_ROCM
 fptr_t init_custom_qr(int64_t rank, int64_t world_size,
                       std::optional<int64_t> qr_max_size = std::nullopt);
 
@@ -146,6 +146,7 @@ void cutlass_gemm_caller_blockwise(torch::Tensor& out, torch::Tensor const& a,
 
   using ElementAB = typename Gemm::ElementAB;
   using ElementD = typename Gemm::ElementD;
+  using ElementBlockScale = typename Gemm::ElementBlockScale;
 
   int32_t m = a.size(0), n = b.size(1), k = a.size(1);
 
@@ -166,26 +167,29 @@ void cutlass_gemm_caller_blockwise(torch::Tensor& out, torch::Tensor const& a,
       ScaleConfig::tile_atom_to_shape_SFB(make_shape(n, m, k, 1)) :
       ScaleConfig::tile_atom_to_shape_SFB(make_shape(m, n, k, 1));
 
-  auto a_ptr = static_cast<ElementAB*>(a.data_ptr());
-  auto b_ptr = static_cast<ElementAB*>(b.data_ptr());
-  auto a_scales_ptr = static_cast<float*>(a_scales.data_ptr());
-  auto b_scales_ptr = static_cast<float*>(b_scales.data_ptr());
+  auto a_ptr = static_cast<ElementAB const*>(a.data_ptr());
+  auto b_ptr = static_cast<ElementAB const*>(b.data_ptr());
+  auto a_scales_ptr = static_cast<ElementBlockScale const*>(a_scales.data_ptr());
+  auto b_scales_ptr = static_cast<ElementBlockScale const*>(b_scales.data_ptr());
 
-  auto mainloop_args = [&](){
-    // layout_SFA and layout_SFB cannot be swapped since they are deduced.
-    if (swap_ab) {
-      return typename GemmKernel::MainloopArguments{
-          b_ptr,        b_stride,   a_ptr,        a_stride,
-          b_scales_ptr, layout_SFA, a_scales_ptr, layout_SFB
-      };
-    }
-    else {
-      return typename GemmKernel::MainloopArguments{
-          a_ptr,        a_stride,   b_ptr,        b_stride,
-          a_scales_ptr, layout_SFA, b_scales_ptr, layout_SFB
-      };
-    }
-  }();
+  typename GemmKernel::MainloopArguments mainloop_args{};
+  mainloop_args.layout_SFA = layout_SFA;
+  mainloop_args.layout_SFB = layout_SFB;
+  if (swap_ab) {
+    mainloop_args.ptr_A = b_ptr;
+    mainloop_args.dA = b_stride;
+    mainloop_args.ptr_B = a_ptr;
+    mainloop_args.dB = a_stride;
+    mainloop_args.ptr_SFA = b_scales_ptr;
+    mainloop_args.ptr_SFB = a_scales_ptr;
+  } else {
+    mainloop_args.ptr_A = a_ptr;
+    mainloop_args.dA = a_stride;
+    mainloop_args.ptr_B = b_ptr;
+    mainloop_args.dB = b_stride;
+    mainloop_args.ptr_SFA = a_scales_ptr;
+    mainloop_args.ptr_SFB = b_scales_ptr;
+  }
   auto prob_shape = swap_ab ? cute::make_shape(n, m, k, 1) : cute::make_shape(m, n, k, 1);
 
   auto c_ptr = static_cast<ElementD*>(out.data_ptr());