diff --git a/docker/Dockerfile b/docker/Dockerfile
index 034f73736ca7..9bbb3b5c6e22 100644
--- a/docker/Dockerfile
+++ b/docker/Dockerfile
@@ -381,7 +381,7 @@ RUN --mount=type=bind,from=build,src=/workspace/dist,target=/vllm-workspace/dist
 # Install FlashInfer from source
 ARG FLASHINFER_GIT_REPO="https://github.com/flashinfer-ai/flashinfer.git"
 # Keep this in sync with "flashinfer" extra in setup.py
-ARG FLASHINFER_GIT_REF="v0.3.1"
+ARG FLASHINFER_GIT_REF="v0.4.0rc1"
 # Flag to control whether to compile FlashInfer AOT kernels
 # Set to "true" to enable AOT compilation:
 #   docker build --build-arg FLASHINFER_AOT_COMPILE=true ...
diff --git a/docker/Dockerfile.nightly_torch b/docker/Dockerfile.nightly_torch
index ae12ed0f7cab..53933604aa04 100644
--- a/docker/Dockerfile.nightly_torch
+++ b/docker/Dockerfile.nightly_torch
@@ -246,7 +246,7 @@ RUN pip install setuptools==75.6.0 packaging==23.2 ninja==1.11.1.3 build==1.2.2.
 
 # build flashinfer for torch nightly from source around 10 mins
-# release version: v0.3.1
+# release version: v0.4.0rc1
 # todo(elainewy): cache flashinfer build result for faster build
 ENV CCACHE_DIR=/root/.cache/ccache
 RUN --mount=type=cache,target=/root/.cache/ccache \
@@ -254,7 +254,7 @@ RUN --mount=type=cache,target=/root/.cache/ccache \
     echo "git clone flashinfer..." \
     && git clone --recursive https://github.com/flashinfer-ai/flashinfer.git \
     && cd flashinfer \
-    && git checkout v0.3.1 \
+    && git checkout v0.4.0rc1 \
     && git submodule update --init --recursive \
     && echo "finish git clone flashinfer..." \
     && rm -rf build \
diff --git a/setup.py b/setup.py
index e4c40d22b928..6987a0ba3800 100644
--- a/setup.py
+++ b/setup.py
@@ -662,7 +662,7 @@ def _read_requirements(filename: str) -> list[str]:
                   "mistral_common[audio]"],  # Required for audio processing
         "video": [],  # Kept for backwards compatibility
         # FlashInfer should be updated together with the Dockerfile
-        "flashinfer": ["flashinfer-python==0.3.1"],
+        "flashinfer": ["flashinfer-python==0.4.0rc1"],
         # Optional deps for AMD FP4 quantization support
         "petit-kernel": ["petit-kernel"],
     },
diff --git a/vllm/v1/attention/backends/flashinfer.py b/vllm/v1/attention/backends/flashinfer.py
index cb092aa74e7f..8056ecf196e5 100755
--- a/vllm/v1/attention/backends/flashinfer.py
+++ b/vllm/v1/attention/backends/flashinfer.py
@@ -1085,7 +1085,7 @@ def fast_plan_decode(
     qo_indptr_host = _get_range_buf(batch_size + 1, "cpu")
 
     try:
-        # Make sure we pass exactly 15 arguments for tensor core version
+        # Make sure we pass exactly 18 arguments for tensor core version
         self._plan_info = self._cached_module.plan(
             self._float_workspace_buffer,
             self._int_workspace_buffer,
@@ -1102,6 +1102,9 @@ def fast_plan_decode(
             head_dim,
             head_dim,
             False,  # causal
+            window_left,
+            -1,
+            False,
         )
     except Exception as e:
         raise RuntimeError(f"Error in tensor core plan: {e}") from e
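
The last hunk is the only behavioral change: FlashInfer v0.4.0rc1 extends the tensor-core `plan()` call from 15 to 18 positional arguments, appending `window_left` and two trailing values after the `causal` flag. Because the Dockerfiles and the `flashinfer` extra pin that exact release, the call site can pass the new form unconditionally. The sketch below is not part of this PR; it only illustrates how the two call shapes could be gated on the installed version if both had to coexist, and the meaning of the two trailing values (`-1`, `False`) is an assumption taken from the hunk above rather than documented FlashInfer parameter names.

```python
# Hypothetical sketch: build the positional argument list for the cached
# module's plan() depending on the installed FlashInfer version.
from packaging.version import Version

import flashinfer


def plan_args(base_args: list, window_left: int) -> list:
    """Return plan() arguments; `base_args` is the 15-argument list used
    with FlashInfer v0.3.1, ending with the `causal` flag."""
    args = list(base_args)
    if Version(flashinfer.__version__) >= Version("0.4.0rc1"):
        # v0.4.0rc1 expects 18 arguments: the original 15 plus the
        # sliding-window extent and two trailing values, mirrored here
        # as -1 and False exactly as in the diff above.
        args += [window_left, -1, False]
    return args
```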