Skip to content

Commit abd9c46

Browse files
committed
bump flashinfer to v0.4.0rc4
Signed-off-by: elvischenv <219235043+elvischenv@users.noreply.github.com>
1 parent 4dbdf4a commit abd9c46

File tree

4 files changed

+10
-7
lines changed

4 files changed

+10
-7
lines changed

docker/Dockerfile

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,7 @@ ARG PYTHON_VERSION=3.12
1515
# Example:
1616
# docker build --build-arg BUILD_BASE_IMAGE=registry.acme.org/mirror/nvidia/cuda:${CUDA_VERSION}-devel-ubuntu20.04
1717

18-
# Important: We build with an old version of Ubuntu to maintain broad
18+
# Important: We build with an old version of Ubuntu to maintain broad
1919
# compatibility with other Linux OSes. The main reason for this is that the
2020
# glibc version is baked into the distro, and binaries built with one glibc
2121
# version are not backwards compatible with OSes that use an earlier version.
@@ -371,7 +371,7 @@ RUN --mount=type=bind,from=build,src=/workspace/dist,target=/vllm-workspace/dist
371371
# Install FlashInfer from source
372372
ARG FLASHINFER_GIT_REPO="https://github.com/flashinfer-ai/flashinfer.git"
373373
# Keep this in sync with "flashinfer" extra in setup.py
374-
ARG FLASHINFER_GIT_REF="v0.3.1"
374+
ARG FLASHINFER_GIT_REF="v0.4.0rc4"
375375
# Flag to control whether to compile FlashInfer AOT kernels
376376
# Set to "true" to enable AOT compilation:
377377
# docker build --build-arg FLASHINFER_AOT_COMPILE=true ...
@@ -392,7 +392,7 @@ RUN --mount=type=cache,target=/root/.cache/uv bash - <<'BASH'
392392
FI_TORCH_CUDA_ARCH_LIST="7.5 8.0 8.9 9.0a 10.0a 12.0"
393393
fi
394394
pushd flashinfer
395-
if [[ "${CUDA_VERSION}" == 12.8.* ]] && [ "$TARGETPLATFORM" = "linux/amd64" ]; then
395+
if [[ "${CUDA_VERSION}" == 12.8.* ]] && [ "$TARGETPLATFORM" = "linux/amd64" ] && [[ "${FLASHINFER_GIT_REF}" == "v0.3.1" ]]; then
396396
# NOTE: To make new precompiled wheels, see tools/flashinfer-build.sh
397397
echo "🏗️ Installing FlashInfer from pre-compiled wheel"
398398
uv pip install --system https://wheels.vllm.ai/flashinfer-python/flashinfer_python-0.3.1-cp39-abi3-manylinux1_x86_64.whl \

docker/Dockerfile.nightly_torch

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -246,15 +246,15 @@ RUN pip install setuptools==75.6.0 packaging==23.2 ninja==1.11.1.3 build==1.2.2.
246246

247247

248248
# build flashinfer for torch nightly from source around 10 mins
249-
# release version: v0.3.1
249+
# release version: v0.4.0rc4
250250
# todo(elainewy): cache flashinfer build result for faster build
251251
ENV CCACHE_DIR=/root/.cache/ccache
252252
RUN --mount=type=cache,target=/root/.cache/ccache \
253253
--mount=type=cache,target=/root/.cache/uv \
254254
echo "git clone flashinfer..." \
255255
&& git clone --recursive https://github.com/flashinfer-ai/flashinfer.git \
256256
&& cd flashinfer \
257-
&& git checkout v0.3.1 \
257+
&& git checkout v0.4.0rc4 \
258258
&& git submodule update --init --recursive \
259259
&& echo "finish git clone flashinfer..." \
260260
&& rm -rf build \

setup.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -715,7 +715,7 @@ def _read_requirements(filename: str) -> list[str]:
715715
], # Required for audio processing
716716
"video": [], # Kept for backwards compatibility
717717
# FlashInfer should be updated together with the Dockerfile
718-
"flashinfer": ["flashinfer-python==0.3.1"],
718+
"flashinfer": ["flashinfer-python==0.4.0rc4"],
719719
# Optional deps for AMD FP4 quantization support
720720
"petit-kernel": ["petit-kernel"],
721721
},

vllm/v1/attention/backends/flashinfer.py

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1182,7 +1182,7 @@ def fast_plan_decode(
11821182
qo_indptr_host = _get_range_buf(batch_size + 1, "cpu")
11831183

11841184
try:
1185-
# Make sure we pass exactly 15 arguments for tensor core version
1185+
# Make sure we pass exactly 18 arguments for tensor core version
11861186
self._plan_info = self._cached_module.plan(
11871187
self._float_workspace_buffer,
11881188
self._int_workspace_buffer,
@@ -1199,6 +1199,9 @@ def fast_plan_decode(
11991199
head_dim,
12001200
head_dim,
12011201
False, # causal
1202+
window_left,
1203+
-1,
1204+
False,
12021205
)
12031206
except Exception as e:
12041207
raise RuntimeError(f"Error in tensor core plan: {e}") from e

0 commit comments

Comments (0)