@@ -356,75 +356,14 @@ RUN --mount=type=bind,from=build,src=/workspace/dist,target=/vllm-workspace/dist
     uv pip install --system dist/*.whl --verbose \
         --extra-index-url ${PYTORCH_CUDA_INDEX_BASE_URL}/cu$(echo $CUDA_VERSION | cut -d. -f1,2 | tr -d '.' )
 
-# If we need to build FlashInfer wheel before its release:
-# $ # Note we remove 7.0 from the arch list compared to the list below, since FlashInfer only supports sm75+
-# $ export TORCH_CUDA_ARCH_LIST='7.5 8.0 8.9 9.0a 10.0a 12.0'
-# $ git clone https://github.com/flashinfer-ai/flashinfer.git --recursive
-# $ cd flashinfer
-# $ git checkout v0.2.6.post1
-# $ python -m flashinfer.aot
-# $ python -m build --no-isolation --wheel
-# $ ls -la dist
-# -rw-rw-r-- 1 mgoin mgoin 205M Jun 9 18:03 flashinfer_python-0.2.6.post1-cp39-abi3-linux_x86_64.whl
-# $ # upload the wheel to a public location, e.g. https://wheels.vllm.ai/flashinfer/v0.2.6.post1/flashinfer_python-0.2.6.post1-cp39-abi3-linux_x86_64.whl
-
-# Install FlashInfer from source
-ARG FLASHINFER_GIT_REPO="https://github.com/flashinfer-ai/flashinfer.git"
-# Keep this in sync with "flashinfer" extra in setup.py
-ARG FLASHINFER_GIT_REF="v0.4.0"
-# Flag to control whether to compile FlashInfer AOT kernels
-# Set to "true" to enable AOT compilation:
-#   docker build --build-arg FLASHINFER_AOT_COMPILE=true ...
-ARG FLASHINFER_AOT_COMPILE=false
-RUN --mount=type=cache,target=/root/.cache/uv bash - <<'BASH'
-    . /etc/environment
-    git clone --depth 1 --recursive --shallow-submodules \
-        --branch ${FLASHINFER_GIT_REF} \
-        ${FLASHINFER_GIT_REPO} flashinfer
-    # Exclude CUDA arches for older versions (11.x and 12.0-12.7)
-    # TODO: Update this to allow setting TORCH_CUDA_ARCH_LIST as a build arg.
-    if [[ "${CUDA_VERSION}" == 11.* ]]; then
-        FI_TORCH_CUDA_ARCH_LIST="7.5 8.0 8.9"
-    elif [[ "${CUDA_VERSION}" == 12.[0-7]* ]]; then
-        FI_TORCH_CUDA_ARCH_LIST="7.5 8.0 8.9 9.0a"
-    else
-        # CUDA 12.8+ supports 10.0a and 12.0
-        FI_TORCH_CUDA_ARCH_LIST="7.5 8.0 8.9 9.0a 10.0a 12.0"
-    fi
-    pushd flashinfer
-    if [[ "${CUDA_VERSION}" == 12.8.* ]] && [ "$TARGETPLATFORM" = "linux/amd64" ] && [ "${FLASHINFER_GIT_REF}" = "v0.3.1" ]; then
-        # NOTE: To make new precompiled wheels, see tools/flashinfer-build.sh
-        echo "🏗️ Installing FlashInfer from pre-compiled wheel"
-        uv pip install --system https://wheels.vllm.ai/flashinfer-python/flashinfer_python-0.3.1-cp39-abi3-manylinux1_x86_64.whl \
-            --extra-index-url ${PYTORCH_CUDA_INDEX_BASE_URL}/cu$(echo $CUDA_VERSION | cut -d. -f1,2 | tr -d '.' )
-        if [ "${FLASHINFER_AOT_COMPILE}" = "true" ]; then
-            # Download pre-compiled cubins
-            TORCH_CUDA_ARCH_LIST="${FI_TORCH_CUDA_ARCH_LIST}" \
-                python3 -m flashinfer --download-cubin || echo "WARNING: Failed to download flashinfer cubins."
-        fi
-    elif [ "${FLASHINFER_AOT_COMPILE}" = "true" ]; then
-        echo "🏗️ Installing FlashInfer with AOT compilation for arches: ${FI_TORCH_CUDA_ARCH_LIST}"
-        export FLASHINFER_CUDA_ARCH_LIST="${FI_TORCH_CUDA_ARCH_LIST}"
-        # HACK: We need these to run flashinfer.aot before installing flashinfer, get from the package in the future
-        uv pip install --system cuda-python==$(echo $CUDA_VERSION | cut -d. -f1,2) pynvml==$(echo $CUDA_VERSION | cut -d. -f1) nvidia-nvshmem-cu$(echo $CUDA_VERSION | cut -d. -f1)
-        # Build AOT kernels
-        TORCH_CUDA_ARCH_LIST="${FI_TORCH_CUDA_ARCH_LIST}" \
-            python3 -m flashinfer.aot
-        # Install with no-build-isolation since we already built AOT kernels
-        TORCH_CUDA_ARCH_LIST="${FI_TORCH_CUDA_ARCH_LIST}" \
-            uv pip install --system --no-build-isolation . \
-            --extra-index-url ${PYTORCH_CUDA_INDEX_BASE_URL}/cu$(echo $CUDA_VERSION | cut -d. -f1,2 | tr -d '.' )
-        # Download pre-compiled cubins
-        TORCH_CUDA_ARCH_LIST="${FI_TORCH_CUDA_ARCH_LIST}" \
-            python3 -m flashinfer --download-cubin || echo "WARNING: Failed to download flashinfer cubins."
-    else
-        echo "🏗️ Installing FlashInfer without AOT compilation in JIT mode"
-        uv pip install --system . \
-            --extra-index-url ${PYTORCH_CUDA_INDEX_BASE_URL}/cu$(echo $CUDA_VERSION | cut -d. -f1,2 | tr -d '.' )
-    fi
-    popd
-    rm -rf flashinfer
-BASH
+# Install FlashInfer pre-compiled kernel cache and binaries
+# https://docs.flashinfer.ai/installation.html
+RUN --mount=type=cache,target=/root/.cache/uv \
+    uv pip install --system flashinfer-cubin==0.4.0 \
+    && uv pip install --system flashinfer-jit-cache==0.4.0 \
+        --extra-index-url https://flashinfer.ai/whl/cu$(echo $CUDA_VERSION | cut -d. -f1,2 | tr -d '.' ) \
+    && flashinfer show-config
+
 COPY examples examples
 COPY benchmarks benchmarks
 COPY ./vllm/collect_env.py .
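
For reference, a minimal sketch of how the new FlashInfer index URL suffix is derived and how the resulting install can be sanity-checked. The shell pipeline is the same one the Dockerfile uses; the image tag vllm-openai:dev is only an assumed example, and `flashinfer show-config` is the command the new RUN step itself invokes.

    # CUDA_VERSION=12.8.1 -> "12.8" -> "128", giving https://flashinfer.ai/whl/cu128
    echo 12.8.1 | cut -d. -f1,2 | tr -d '.'

    # Sanity-check the pre-compiled kernel cache inside a built image
    # (assumed tag; override the entrypoint since the image normally starts the API server)
    docker run --rm --gpus all --entrypoint bash vllm-openai:dev -c "flashinfer show-config"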