
Commit ca377cf

Use CUDA 12.4 as default for release and nightly wheels (#12098)
1 parent: a31614e · commit: ca377cf

File tree

- .buildkite/release-pipeline.yaml
- .buildkite/upload-wheels.sh
- docs/source/getting_started/installation/gpu/cuda.inc.md
- setup.py

4 files changed: +25 -9 lines

.buildkite/release-pipeline.yaml

Lines changed: 12 additions & 1 deletion
@@ -1,4 +1,15 @@
 steps:
+- label: "Build wheel - CUDA 12.4"
+  agents:
+    queue: cpu_queue_postmerge
+  commands:
+    - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.4.0 --tag vllm-ci:build-image --target build --progress plain ."
+    - "mkdir artifacts"
+    - "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
+    - "bash .buildkite/upload-wheels.sh"
+  env:
+    DOCKER_BUILDKIT: "1"
+
 - label: "Build wheel - CUDA 12.1"
   agents:
     queue: cpu_queue_postmerge

@@ -37,7 +48,7 @@ steps:
     queue: cpu_queue_postmerge
   commands:
     - "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
-    - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.1.0 --tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT --target vllm-openai --progress plain ."
+    - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.4.0 --tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT --target vllm-openai --progress plain ."
     - "docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT"

 - label: "Build and publish TPU release image"

.buildkite/upload-wheels.sh

Lines changed: 8 additions & 2 deletions
@@ -50,8 +50,11 @@ aws s3 cp "$normal_wheel" "s3://vllm-wheels/$BUILDKITE_COMMIT/"
 if [[ $normal_wheel == *"cu118"* ]]; then
     # if $normal_wheel matches cu118, do not upload the index.html
     echo "Skipping index files for cu118 wheels"
+elif [[ $normal_wheel == *"cu121"* ]]; then
+    # if $normal_wheel matches cu121, do not upload the index.html
+    echo "Skipping index files for cu121 wheels"
 else
-    # only upload index.html for cu12 wheels (default wheels)
+    # only upload index.html for cu124 wheels (default wheels)
     aws s3 cp index.html "s3://vllm-wheels/$BUILDKITE_COMMIT/vllm/index.html"
     aws s3 cp "s3://vllm-wheels/nightly/index.html" "s3://vllm-wheels/$BUILDKITE_COMMIT/index.html"
 fi

@@ -63,8 +66,11 @@ aws s3 cp "$normal_wheel" "s3://vllm-wheels/nightly/"
 if [[ $normal_wheel == *"cu118"* ]]; then
     # if $normal_wheel matches cu118, do not upload the index.html
     echo "Skipping index files for cu118 wheels"
+elif [[ $normal_wheel == *"cu121"* ]]; then
+    # if $normal_wheel matches cu121, do not upload the index.html
+    echo "Skipping index files for cu121 wheels"
 else
-    # only upload index.html for cu12 wheels (default wheels)
+    # only upload index.html for cu124 wheels (default wheels)
     aws s3 cp index.html "s3://vllm-wheels/nightly/vllm/index.html"
 fi
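Both the per-commit and the nightly upload paths now treat cu118 and cu121 wheels as non-default and publish `index.html` only for the remaining (cu124) wheels. A Python rendering of that branch, with made-up wheel names, purely for illustration:

```python
# Python rendering of the bash branch above; the wheel names are made up.
def should_upload_index(wheel_name: str) -> bool:
    # index.html is uploaded only for the default (cu124) wheels;
    # cu118 and cu121 wheels are uploaded without index files.
    return "cu118" not in wheel_name and "cu121" not in wheel_name

assert not should_upload_index("vllm-0.0.0+cu118-cp38-abi3-linux_x86_64.whl")
assert not should_upload_index("vllm-0.0.0+cu121-cp38-abi3-linux_x86_64.whl")
assert should_upload_index("vllm-0.0.0-cp38-abi3-linux_x86_64.whl")
```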

docs/source/getting_started/installation/gpu/cuda.inc.md

Lines changed: 2 additions & 2 deletions
@@ -23,12 +23,12 @@ Therefore, it is recommended to install vLLM with a **fresh new** environment. I
 You can install vLLM using either `pip` or `uv pip`:

 ```console
-# Install vLLM with CUDA 12.1.
+# Install vLLM with CUDA 12.4.
 pip install vllm # If you are using pip.
 uv pip install vllm # If you are using uv.
 ```

-As of now, vLLM's binaries are compiled with CUDA 12.1 and public PyTorch release versions by default. We also provide vLLM binaries compiled with CUDA 11.8 and public PyTorch release versions:
+As of now, vLLM's binaries are compiled with CUDA 12.4 and public PyTorch release versions by default. We also provide vLLM binaries compiled with CUDA 12.1, 11.8, and public PyTorch release versions:

 ```console
 # Install vLLM with CUDA 11.8.
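To confirm which CUDA toolkit your installed PyTorch build targets (the same value `setup.py` reads via `torch.version.cuda` below), a quick check, assuming `torch` is installed:

```python
# Prints the CUDA version the installed PyTorch build was compiled
# against, e.g. "12.4" for the new default wheels (None on CPU-only builds).
import torch

print(torch.version.cuda)
```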

setup.py

Lines changed: 3 additions & 4 deletions
@@ -54,7 +54,7 @@ def load_module_from_path(module_name, path):
     # fallback to cpu
     VLLM_TARGET_DEVICE = "cpu"

-MAIN_CUDA_VERSION = "12.1"
+MAIN_CUDA_VERSION = "12.4"


 def is_sccache_available() -> bool:

@@ -571,9 +571,8 @@ def _read_requirements(filename: str) -> List[str]:
     cuda_major, cuda_minor = torch.version.cuda.split(".")
     modified_requirements = []
     for req in requirements:
-        if ("vllm-flash-attn" in req
-                and not (cuda_major == "12" and cuda_minor == "1")):
-            # vllm-flash-attn is built only for CUDA 12.1.
+        if ("vllm-flash-attn" in req and cuda_major != "12"):
+            # vllm-flash-attn is built only for CUDA 12.x.
             # Skip for other versions.
             continue
         modified_requirements.append(req)
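The relaxed condition keeps `vllm-flash-attn` for any CUDA 12.x build instead of exactly 12.1. A self-contained sketch of the loop, with made-up requirement strings:

```python
# Self-contained sketch of the filter above; the requirement strings and
# version pins are made up for illustration.
cuda_major, cuda_minor = "11", "8"  # e.g. from torch.version.cuda.split(".")

requirements = ["vllm-flash-attn==0.0.0", "ray>=2.9"]
modified_requirements = []
for req in requirements:
    if "vllm-flash-attn" in req and cuda_major != "12":
        # vllm-flash-attn wheels exist only for CUDA 12.x; skip otherwise.
        continue
    modified_requirements.append(req)

print(modified_requirements)  # ['ray>=2.9'] for a CUDA 11.8 build
```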
