2 changes: 1 addition & 1 deletion docker/Dockerfile
@@ -381,7 +381,7 @@ RUN --mount=type=bind,from=build,src=/workspace/dist,target=/vllm-workspace/dist
 # Install FlashInfer from source
 ARG FLASHINFER_GIT_REPO="https://github.com/flashinfer-ai/flashinfer.git"
 # Keep this in sync with "flashinfer" extra in setup.py
-ARG FLASHINFER_GIT_REF="v0.3.1"
+ARG FLASHINFER_GIT_REF="v0.4.0rc1"
 # Flag to control whether to compile FlashInfer AOT kernels
 # Set to "true" to enable AOT compilation:
 # docker build --build-arg FLASHINFER_AOT_COMPILE=true ...
4 changes: 2 additions & 2 deletions docker/Dockerfile.nightly_torch
@@ -246,15 +246,15 @@ RUN pip install setuptools==75.6.0 packaging==23.2 ninja==1.11.1.3 build==1.2.2.
 
 
 # build flashinfer for torch nightly from source around 10 mins
-# release version: v0.3.1
+# release version: v0.4.0rc1
 # todo(elainewy): cache flashinfer build result for faster build
 ENV CCACHE_DIR=/root/.cache/ccache
 RUN --mount=type=cache,target=/root/.cache/ccache \
     --mount=type=cache,target=/root/.cache/uv \
     echo "git clone flashinfer..." \
     && git clone --recursive https://github.com/flashinfer-ai/flashinfer.git \
     && cd flashinfer \
-    && git checkout v0.3.1 \
+    && git checkout v0.4.0rc1 \
     && git submodule update --init --recursive \
     && echo "finish git clone flashinfer..." \
     && rm -rf build \
2 changes: 1 addition & 1 deletion setup.py
@@ -662,7 +662,7 @@ def _read_requirements(filename: str) -> list[str]:
         "mistral_common[audio]"],  # Required for audio processing
     "video": [],  # Kept for backwards compatibility
     # FlashInfer should be updated together with the Dockerfile
-    "flashinfer": ["flashinfer-python==0.3.1"],
+    "flashinfer": ["flashinfer-python==0.4.0rc1"],
     # Optional deps for AMD FP4 quantization support
     "petit-kernel": ["petit-kernel"],
 },
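One background note on this pin, not part of the diff itself: 0.4.0rc1 is a pre-release, and Python packaging tools exclude pre-releases from ordinary version ranges unless they are pinned exactly or explicitly opted into. A small sketch using the `packaging` library (assumed installed; it implements the same version semantics pip uses) illustrates why the exact `==0.4.0rc1` pin is needed:

```python
from packaging.specifiers import SpecifierSet
from packaging.version import Version

v = Version("0.4.0rc1")

# rc1 marks this as a pre-release under PEP 440 versioning rules.
print(v.is_prerelease)                                   # True

# A plain range silently skips pre-releases...
print(SpecifierSet(">=0.3.1").contains(v))               # False

# ...but an exact pin matches them, as does explicit opt-in.
print(SpecifierSet("==0.4.0rc1").contains(v))            # True
print(SpecifierSet(">=0.3.1").contains(v, prereleases=True))  # True
```

This is why the extra in setup.py pins the exact release-candidate version rather than using a lower-bound specifier.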
5 changes: 4 additions & 1 deletion vllm/v1/attention/backends/flashinfer.py
@@ -1085,7 +1085,7 @@ def fast_plan_decode(
     qo_indptr_host = _get_range_buf(batch_size + 1, "cpu")
 
     try:
-        # Make sure we pass exactly 15 arguments for tensor core version
+        # Make sure we pass exactly 18 arguments for tensor core version
Contributor review comment (critical):
While you've correctly updated the internal _cached_module.plan call for the new flashinfer version, the corresponding public self.plan method call (at lines 1025-1043) seems to have been missed. This public method is used for the initial warm-up when CUDA graphs are enabled.

If the public plan API also changed (which is highly likely given the internal API change), this will cause a TypeError at runtime during the warm-up. Please update the call at lines 1025-1043 to include any new arguments. Based on the changes to the internal call, it's likely that arguments such as window_right and allow_fp16_qk_reduction need to be added.
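The failure mode this comment describes, a callee gaining extra positional parameters between releases so that a stale call site raises `TypeError` at runtime, can be caught early with a generic arity check. The sketch below is hedged illustration, not vLLM or FlashInfer code; `plan_old` and `plan_new` are hypothetical stand-ins for the 15- and 18-argument variants:

```python
import inspect


def accepts_positionally(fn, n: int) -> bool:
    """Return True if fn can be called with exactly n positional arguments."""
    params = [
        p for p in inspect.signature(fn).parameters.values()
        if p.kind in (p.POSITIONAL_ONLY, p.POSITIONAL_OR_KEYWORD)
    ]
    required = sum(1 for p in params if p.default is p.empty)
    return required <= n <= len(params)


# Hypothetical stand-ins for the old and new plan() signatures:
def plan_old(workspace, indptr, batch_size): ...


def plan_new(workspace, indptr, batch_size,
             window_left, window_right, allow_fp16_qk_reduction): ...
```

Running such a check once at startup (e.g. against the cached module's `plan` before the warm-up call) would turn the runtime `TypeError` into an explicit version-mismatch error when the library is upgraded without updating every call site.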

 self._plan_info = self._cached_module.plan(
     self._float_workspace_buffer,
     self._int_workspace_buffer,
@@ -1102,6 +1102,9 @@
     head_dim,
     head_dim,
     False,  # causal
+    window_left,
+    -1,
+    False,
 )
 except Exception as e:
     raise RuntimeError(f"Error in tensor core plan: {e}") from e