
Commit c9d33c6

mgoin and yewentao256 authored
[UX] Add FlashInfer as default CUDA dependency (#26443)
Signed-off-by: mgoin <mgoin64@gmail.com>
Co-authored-by: Wentao Ye <44945378+yewentao256@users.noreply.github.com>
1 parent 2e54db4 commit c9d33c6

4 files changed: 20 additions, 72 deletions

docker/Dockerfile

Lines changed: 8 additions & 69 deletions
@@ -356,75 +356,14 @@ RUN --mount=type=bind,from=build,src=/workspace/dist,target=/vllm-workspace/dist
     uv pip install --system dist/*.whl --verbose \
         --extra-index-url ${PYTORCH_CUDA_INDEX_BASE_URL}/cu$(echo $CUDA_VERSION | cut -d. -f1,2 | tr -d '.')
 
-# If we need to build FlashInfer wheel before its release:
-# $ # Note we remove 7.0 from the arch list compared to the list below, since FlashInfer only supports sm75+
-# $ export TORCH_CUDA_ARCH_LIST='7.5 8.0 8.9 9.0a 10.0a 12.0'
-# $ git clone https://github.com/flashinfer-ai/flashinfer.git --recursive
-# $ cd flashinfer
-# $ git checkout v0.2.6.post1
-# $ python -m flashinfer.aot
-# $ python -m build --no-isolation --wheel
-# $ ls -la dist
-# -rw-rw-r-- 1 mgoin mgoin 205M Jun 9 18:03 flashinfer_python-0.2.6.post1-cp39-abi3-linux_x86_64.whl
-# $ # upload the wheel to a public location, e.g. https://wheels.vllm.ai/flashinfer/v0.2.6.post1/flashinfer_python-0.2.6.post1-cp39-abi3-linux_x86_64.whl
-
-# Install FlashInfer from source
-ARG FLASHINFER_GIT_REPO="https://github.com/flashinfer-ai/flashinfer.git"
-# Keep this in sync with "flashinfer" extra in setup.py
-ARG FLASHINFER_GIT_REF="v0.4.0"
-# Flag to control whether to compile FlashInfer AOT kernels
-# Set to "true" to enable AOT compilation:
-#   docker build --build-arg FLASHINFER_AOT_COMPILE=true ...
-ARG FLASHINFER_AOT_COMPILE=false
-RUN --mount=type=cache,target=/root/.cache/uv bash - <<'BASH'
-    . /etc/environment
-    git clone --depth 1 --recursive --shallow-submodules \
-        --branch ${FLASHINFER_GIT_REF} \
-        ${FLASHINFER_GIT_REPO} flashinfer
-    # Exclude CUDA arches for older versions (11.x and 12.0-12.7)
-    # TODO: Update this to allow setting TORCH_CUDA_ARCH_LIST as a build arg.
-    if [[ "${CUDA_VERSION}" == 11.* ]]; then
-        FI_TORCH_CUDA_ARCH_LIST="7.5 8.0 8.9"
-    elif [[ "${CUDA_VERSION}" == 12.[0-7]* ]]; then
-        FI_TORCH_CUDA_ARCH_LIST="7.5 8.0 8.9 9.0a"
-    else
-        # CUDA 12.8+ supports 10.0a and 12.0
-        FI_TORCH_CUDA_ARCH_LIST="7.5 8.0 8.9 9.0a 10.0a 12.0"
-    fi
-    pushd flashinfer
-    if [[ "${CUDA_VERSION}" == 12.8.* ]] && [ "$TARGETPLATFORM" = "linux/amd64" ] && [ "${FLASHINFER_GIT_REF}" = "v0.3.1" ]; then
-        # NOTE: To make new precompiled wheels, see tools/flashinfer-build.sh
-        echo "🏗️ Installing FlashInfer from pre-compiled wheel"
-        uv pip install --system https://wheels.vllm.ai/flashinfer-python/flashinfer_python-0.3.1-cp39-abi3-manylinux1_x86_64.whl \
-            --extra-index-url ${PYTORCH_CUDA_INDEX_BASE_URL}/cu$(echo $CUDA_VERSION | cut -d. -f1,2 | tr -d '.')
-        if [ "${FLASHINFER_AOT_COMPILE}" = "true" ]; then
-            # Download pre-compiled cubins
-            TORCH_CUDA_ARCH_LIST="${FI_TORCH_CUDA_ARCH_LIST}" \
-                python3 -m flashinfer --download-cubin || echo "WARNING: Failed to download flashinfer cubins."
-        fi
-    elif [ "${FLASHINFER_AOT_COMPILE}" = "true" ]; then
-        echo "🏗️ Installing FlashInfer with AOT compilation for arches: ${FI_TORCH_CUDA_ARCH_LIST}"
-        export FLASHINFER_CUDA_ARCH_LIST="${FI_TORCH_CUDA_ARCH_LIST}"
-        # HACK: We need these to run flashinfer.aot before installing flashinfer, get from the package in the future
-        uv pip install --system cuda-python==$(echo $CUDA_VERSION | cut -d. -f1,2) pynvml==$(echo $CUDA_VERSION | cut -d. -f1) nvidia-nvshmem-cu$(echo $CUDA_VERSION | cut -d. -f1)
-        # Build AOT kernels
-        TORCH_CUDA_ARCH_LIST="${FI_TORCH_CUDA_ARCH_LIST}" \
-            python3 -m flashinfer.aot
-        # Install with no-build-isolation since we already built AOT kernels
-        TORCH_CUDA_ARCH_LIST="${FI_TORCH_CUDA_ARCH_LIST}" \
-            uv pip install --system --no-build-isolation . \
-            --extra-index-url ${PYTORCH_CUDA_INDEX_BASE_URL}/cu$(echo $CUDA_VERSION | cut -d. -f1,2 | tr -d '.')
-        # Download pre-compiled cubins
-        TORCH_CUDA_ARCH_LIST="${FI_TORCH_CUDA_ARCH_LIST}" \
-            python3 -m flashinfer --download-cubin || echo "WARNING: Failed to download flashinfer cubins."
-    else
-        echo "🏗️ Installing FlashInfer without AOT compilation in JIT mode"
-        uv pip install --system . \
-            --extra-index-url ${PYTORCH_CUDA_INDEX_BASE_URL}/cu$(echo $CUDA_VERSION | cut -d. -f1,2 | tr -d '.')
-    fi
-    popd
-    rm -rf flashinfer
-BASH
+# Install FlashInfer pre-compiled kernel cache and binaries
+# https://docs.flashinfer.ai/installation.html
+RUN --mount=type=cache,target=/root/.cache/uv \
+    uv pip install --system flashinfer-cubin==0.4.0 \
+    && uv pip install --system flashinfer-jit-cache==0.4.0 \
+        --extra-index-url https://flashinfer.ai/whl/cu$(echo $CUDA_VERSION | cut -d. -f1,2 | tr -d '.') \
+    && flashinfer show-config
+
 COPY examples examples
 COPY benchmarks benchmarks
 COPY ./vllm/collect_env.py .
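The new RUN step replaces the per-architecture source build with two pre-compiled packages, flashinfer-cubin and flashinfer-jit-cache, where the jit-cache wheel is pulled from a CUDA-version-specific index. As an illustration only (not part of this commit), the shell pipeline `cut -d. -f1,2 | tr -d '.'` that derives that index suffix can be mirrored in Python; the helper name below is hypothetical:

# Hypothetical helper, mirroring the Dockerfile's
# cu$(echo $CUDA_VERSION | cut -d. -f1,2 | tr -d '.') suffix derivation.
def cuda_index_suffix(cuda_version: str) -> str:
    major, minor = cuda_version.split(".")[:2]
    return f"cu{major}{minor}"

# e.g. CUDA 12.8.1 -> extra index https://flashinfer.ai/whl/cu128
assert cuda_index_suffix("12.8.1") == "cu128"
assert cuda_index_suffix("13.0") == "cu130"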

requirements/cuda.txt

Lines changed: 2 additions & 0 deletions
@@ -11,3 +11,5 @@ torchaudio==2.8.0
 torchvision==0.23.0  # Required for phi3v processor. See https://github.com/pytorch/vision?tab=readme-ov-file#installation for corresponding version
 # https://github.com/facebookresearch/xformers/releases/tag/v0.0.32.post1
 xformers==0.0.32.post1; platform_system == 'Linux' and platform_machine == 'x86_64'  # Requires PyTorch >= 2.8
+# FlashInfer should be updated together with the Dockerfile
+flashinfer-python==0.4.0
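With flashinfer-python pinned in the base CUDA requirements, it is installed by default rather than through an extra. A minimal sketch (not from this commit) for confirming the resolved version at runtime, using only the standard library:

# Check that the pinned FlashInfer package was installed alongside vLLM.
from importlib.metadata import PackageNotFoundError, version

try:
    print("flashinfer-python:", version("flashinfer-python"))  # expect 0.4.0 per the pin
except PackageNotFoundError:
    print("flashinfer-python is not installed")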

setup.py

Lines changed: 1 addition & 2 deletions
@@ -714,8 +714,7 @@ def _read_requirements(filename: str) -> list[str]:
             "mistral_common[audio]",
         ],  # Required for audio processing
         "video": [],  # Kept for backwards compatibility
-        # FlashInfer should be updated together with the Dockerfile
-        "flashinfer": ["flashinfer-python==0.4.0"],
+        "flashinfer": [],  # Kept for backwards compatibility
         # Optional deps for AMD FP4 quantization support
         "petit-kernel": ["petit-kernel"],
     },
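The "flashinfer" extra stays registered but now resolves to no additional packages, so existing `pip install vllm[flashinfer]` commands keep working while the real dependency lives in requirements/cuda.txt. A small sketch (assumption: a vllm distribution is installed in the current environment) of how one might confirm the extra is still advertised:

# Check the installed vLLM distribution still advertises the (now empty) extra.
from importlib.metadata import metadata

extras = metadata("vllm").get_all("Provides-Extra") or []
print("flashinfer" in extras)  # True: kept for backwards compatibility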

vllm/utils/flashinfer.py

Lines changed: 9 additions & 1 deletion
@@ -12,6 +12,7 @@
 import importlib
 import importlib.util
 import os
+import shutil
 from typing import Any, Callable, NoReturn
 
 import requests
@@ -37,7 +38,14 @@ def has_flashinfer() -> bool:
     """Return ``True`` if FlashInfer is available."""
     # Use find_spec to check if the module exists without importing it
     # This avoids potential CUDA initialization side effects
-    return importlib.util.find_spec("flashinfer") is not None
+    if importlib.util.find_spec("flashinfer") is None:
+        logger.debug_once("FlashInfer unavailable since package was not found")
+        return False
+    # Also check if nvcc is available since it's required to JIT compile flashinfer
+    if shutil.which("nvcc") is None:
+        logger.debug_once("FlashInfer unavailable since nvcc was not found")
+        return False
+    return True
 
 
 def _missing(*_: Any, **__: Any) -> NoReturn:
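The updated has_flashinfer() now requires both the flashinfer package and a working nvcc on PATH, since any kernel missing from the pre-built cache must be JIT-compiled. A standalone sketch of the same two-step check, with a hypothetical function name and without vLLM's logger:

# Standalone illustration of the new availability check: package present AND nvcc on PATH.
import importlib.util
import shutil

def flashinfer_usable() -> bool:
    if importlib.util.find_spec("flashinfer") is None:
        return False  # package not installed
    return shutil.which("nvcc") is not None  # CUDA toolchain needed for JIT compilation

print("FlashInfer usable:", flashinfer_usable())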
