From a7bd5621fca9f87999fb55a28ab8f1dae945d873 Mon Sep 17 00:00:00 2001
From: elvischenv <219235043+elvischenv@users.noreply.github.com>
Date: Tue, 7 Oct 2025 01:06:02 +0000
Subject: [PATCH 1/5] bump flashinfer to v0.4.0rc4

Signed-off-by: elvischenv <219235043+elvischenv@users.noreply.github.com>
---
 docker/Dockerfile                        | 6 +++---
 docker/Dockerfile.nightly_torch          | 4 ++--
 setup.py                                 | 2 +-
 vllm/v1/attention/backends/flashinfer.py | 5 ++++-
 4 files changed, 10 insertions(+), 7 deletions(-)

diff --git a/docker/Dockerfile b/docker/Dockerfile
index f9df931e73b1..973c570c2f6d 100644
--- a/docker/Dockerfile
+++ b/docker/Dockerfile
@@ -15,7 +15,7 @@ ARG PYTHON_VERSION=3.12
 
 # Example:
 # docker build --build-arg BUILD_BASE_IMAGE=registry.acme.org/mirror/nvidia/cuda:${CUDA_VERSION}-devel-ubuntu20.04
-# Important: We build with an old version of Ubuntu to maintain broad 
+# Important: We build with an old version of Ubuntu to maintain broad
 # compatibility with other Linux OSes. The main reason for this is that the
 # glibc version is baked into the distro, and binaries built with one glibc
 # version are not backwards compatible with OSes that use an earlier version.
@@ -371,7 +371,7 @@ RUN --mount=type=bind,from=build,src=/workspace/dist,target=/vllm-workspace/dist
 # Install FlashInfer from source
 ARG FLASHINFER_GIT_REPO="https://github.com/flashinfer-ai/flashinfer.git"
 # Keep this in sync with "flashinfer" extra in setup.py
-ARG FLASHINFER_GIT_REF="v0.3.1"
+ARG FLASHINFER_GIT_REF="v0.4.0rc4"
 # Flag to control whether to compile FlashInfer AOT kernels
 # Set to "true" to enable AOT compilation:
 # docker build --build-arg FLASHINFER_AOT_COMPILE=true ...
@@ -392,7 +392,7 @@ RUN --mount=type=cache,target=/root/.cache/uv bash - <<'BASH'
         FI_TORCH_CUDA_ARCH_LIST="7.5 8.0 8.9 9.0a 10.0a 12.0"
     fi
     pushd flashinfer
-    if [[ "${CUDA_VERSION}" == 12.8.* ]] && [ "$TARGETPLATFORM" = "linux/amd64" ]; then
+    if [[ "${CUDA_VERSION}" == 12.8.* ]] && [ "$TARGETPLATFORM" = "linux/amd64" ] && [ "${FLASHINFER_GIT_REF}" = "v0.3.1" ]; then
         # NOTE: To make new precompiled wheels, see tools/flashinfer-build.sh
         echo "🏗️ Installing FlashInfer from pre-compiled wheel"
         uv pip install --system https://wheels.vllm.ai/flashinfer-python/flashinfer_python-0.3.1-cp39-abi3-manylinux1_x86_64.whl \
diff --git a/docker/Dockerfile.nightly_torch b/docker/Dockerfile.nightly_torch
index 6a9c3fa7dbed..db63ad8ed90d 100644
--- a/docker/Dockerfile.nightly_torch
+++ b/docker/Dockerfile.nightly_torch
@@ -246,7 +246,7 @@ RUN pip install setuptools==75.6.0 packaging==23.2 ninja==1.11.1.3 build==1.2.2.
 
 
 # build flashinfer for torch nightly from source around 10 mins
-# release version: v0.3.1
+# release version: v0.4.0rc4
 # todo(elainewy): cache flashinfer build result for faster build
 ENV CCACHE_DIR=/root/.cache/ccache
 RUN --mount=type=cache,target=/root/.cache/ccache \
@@ -254,7 +254,7 @@ RUN --mount=type=cache,target=/root/.cache/ccache \
     echo "git clone flashinfer..." \
     && git clone --recursive https://github.com/flashinfer-ai/flashinfer.git \
     && cd flashinfer \
-    && git checkout v0.3.1 \
+    && git checkout v0.4.0rc4 \
     && git submodule update --init --recursive \
     && echo "finish git clone flashinfer..." \
     && rm -rf build \
diff --git a/setup.py b/setup.py
index 53c460d2c5b8..08d29f8412d2 100644
--- a/setup.py
+++ b/setup.py
@@ -715,7 +715,7 @@ def _read_requirements(filename: str) -> list[str]:
         ],  # Required for audio processing
         "video": [],  # Kept for backwards compatibility
         # FlashInfer should be updated together with the Dockerfile
-        "flashinfer": ["flashinfer-python==0.3.1"],
+        "flashinfer": ["flashinfer-python==0.4.0rc4"],
         # Optional deps for AMD FP4 quantization support
         "petit-kernel": ["petit-kernel"],
     },
diff --git a/vllm/v1/attention/backends/flashinfer.py b/vllm/v1/attention/backends/flashinfer.py
index 55186e2938c3..58c282e1fd5a 100755
--- a/vllm/v1/attention/backends/flashinfer.py
+++ b/vllm/v1/attention/backends/flashinfer.py
@@ -1191,7 +1191,7 @@ def fast_plan_decode(
     qo_indptr_host = _get_range_buf(batch_size + 1, "cpu")
 
     try:
-        # Make sure we pass exactly 15 arguments for tensor core version
+        # Make sure we pass exactly 18 arguments for tensor core version
         self._plan_info = self._cached_module.plan(
             self._float_workspace_buffer,
             self._int_workspace_buffer,
@@ -1208,6 +1208,9 @@ def fast_plan_decode(
             head_dim,
             head_dim,
             False,  # causal
+            window_left,
+            -1,
+            False,
         )
     except Exception as e:
         raise RuntimeError(f"Error in tensor core plan: {e}") from e

From d4d7b90c5d03574209f67922ce8a7870c4e1b9b0 Mon Sep 17 00:00:00 2001
From: elvischenv <219235043+elvischenv@users.noreply.github.com>
Date: Mon, 6 Oct 2025 22:51:21 -0700
Subject: [PATCH 2/5] fix flashinfer unit test

Signed-off-by: elvischenv <219235043+elvischenv@users.noreply.github.com>
---
 .../test_flashinfer_trtllm_attention.py   | 21 ++++++++-----------
 tests/kernels/quantization/nvfp4_utils.py |  8 ++++---
 2 files changed, 14 insertions(+), 15 deletions(-)

diff --git a/tests/kernels/attention/test_flashinfer_trtllm_attention.py b/tests/kernels/attention/test_flashinfer_trtllm_attention.py
index 62d94f0bb751..d7ccfcddc6d6 100644
--- a/tests/kernels/attention/test_flashinfer_trtllm_attention.py
+++ b/tests/kernels/attention/test_flashinfer_trtllm_attention.py
@@ -7,9 +7,8 @@
 import torch
 
 from tests.kernels.quantization.nvfp4_utils import (
-    FLOAT4_E2M1_MAX,
-    FLOAT8_E4M3_MAX,
     dequantize_nvfp4_to_dtype,
+    get_nvfp4_global_scale,
 )
 from vllm.platforms import current_platform
 from vllm.utils import round_up
@@ -171,13 +170,12 @@ def test_flashinfer_trtllm_decode_with_baseline(
     output = torch.empty(ref_query.shape, dtype=dtype)
     wrapper.run(ref_query, ref_kv_cache, out=output)
     o_scale = 1.0
-    o_sf_scale = None
+    o_sf_scale_float = None
     if o_quant_dtype == FP8_DTYPE:
         _, o_scale = to_float8(output)
     elif o_quant_dtype == FP4_DTYPE:
-        o_sf_scale = (
-            (FLOAT8_E4M3_MAX * FLOAT4_E2M1_MAX) / torch.amax(output.flatten(), dim=-1)
-        ).to(torch.float32)
+        o_sf_scale = get_nvfp4_global_scale(output)
+        o_sf_scale_float = o_sf_scale.item()
 
     # TRTLLM Decode
     if o_quant_dtype == FP4_DTYPE:
@@ -204,7 +202,7 @@ def test_flashinfer_trtllm_decode_with_baseline(
         bmm1_scale=q_scale * k_scale * sm_scale,
         bmm2_scale=v_scale / o_scale,
         window_left=window_left,
-        o_sf_scale=o_sf_scale,
+        o_sf_scale=o_sf_scale_float,
         out=output_trtllm,
     )
     if o_quant_dtype == FP8_DTYPE:
@@ -361,13 +359,12 @@ def test_flashinfer_trtllm_prefill_with_baseline(
     output = torch.empty(ref_query.shape, dtype=dtype)
     wrapper.run(ref_query, ref_kv_cache, out=output)
     o_scale = 1.0
-    o_sf_scale = None
+    o_sf_scale_float = None
     if o_quant_dtype == FP8_DTYPE:
         _, o_scale = to_float8(output)
     elif o_quant_dtype == FP4_DTYPE:
-        o_sf_scale = (
-            (FLOAT8_E4M3_MAX * FLOAT4_E2M1_MAX) / torch.amax(output.flatten(), dim=-1)
-        ).to(torch.float32)
+        o_sf_scale = get_nvfp4_global_scale(output)
+        o_sf_scale_float = o_sf_scale.item()
 
     # TRTLLM Prefill
     if o_quant_dtype == FP4_DTYPE:
@@ -398,7 +395,7 @@ def test_flashinfer_trtllm_prefill_with_baseline(
         cum_seq_lens_q=q_indptr,
         cum_seq_lens_kv=kv_indptr,
         window_left=window_left,
-        o_sf_scale=o_sf_scale,
+        o_sf_scale=o_sf_scale_float,
         out=output_trtllm,
     )
     if o_quant_dtype == FP8_DTYPE:
diff --git a/tests/kernels/quantization/nvfp4_utils.py b/tests/kernels/quantization/nvfp4_utils.py
index 50be6841560b..5e6d54c42e89 100644
--- a/tests/kernels/quantization/nvfp4_utils.py
+++ b/tests/kernels/quantization/nvfp4_utils.py
@@ -66,9 +66,11 @@ def break_fp4_bytes(a, dtype):
     return values.reshape(m, n * 2).to(dtype=dtype)
 
 
+def get_nvfp4_global_scale(a: torch.Tensor):
+    return (FLOAT8_E4M3_MAX * FLOAT4_E2M1_MAX) / torch.abs(a).max().to(torch.float32)
+
+
 def quant_nvfp4_tensor(a: torch.Tensor):
-    a_global_scale = (FLOAT8_E4M3_MAX * FLOAT4_E2M1_MAX) / torch.abs(a).max().to(
-        torch.float32
-    )
+    a_global_scale = get_nvfp4_global_scale(a)
     a_quant, a_block_scale = scaled_fp4_quant(a, a_global_scale)
     return a_quant, a_block_scale, a_global_scale

From b0fd67c20992036bff6a1363d4a28f89a40cbe89 Mon Sep 17 00:00:00 2001
From: elvischenv <219235043+elvischenv@users.noreply.github.com>
Date: Tue, 7 Oct 2025 00:34:31 -0700
Subject: [PATCH 3/5] extend timeout

Signed-off-by: elvischenv <219235043+elvischenv@users.noreply.github.com>
---
 tests/quantization/test_blackwell_moe.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/quantization/test_blackwell_moe.py b/tests/quantization/test_blackwell_moe.py
index 4a0f701ae3cb..3ad68172d771 100644
--- a/tests/quantization/test_blackwell_moe.py
+++ b/tests/quantization/test_blackwell_moe.py
@@ -50,7 +50,7 @@ def can_initialize(model: str, extra_args: Optional[list[str]] = None):
     with RemoteOpenAIServer(
         model,
         server_args,
-        max_wait_seconds=1000,  # Due to FlashInfer compile
+        max_wait_seconds=1500,  # Due to FlashInfer compile
        override_hf_configs=dummy_hf_overrides,
     ) as server:
         client = server.get_client()

From 629cada3253de084ee4b5a27e3f8649c5ca8915c Mon Sep 17 00:00:00 2001
From: elvischenv <219235043+elvischenv@users.noreply.github.com>
Date: Tue, 7 Oct 2025 22:28:09 +0800
Subject: [PATCH 4/5] Apply suggestion from @elvischenv

Signed-off-by: elvischenv <219235043+elvischenv@users.noreply.github.com>
---
 vllm/v1/attention/backends/flashinfer.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/vllm/v1/attention/backends/flashinfer.py b/vllm/v1/attention/backends/flashinfer.py
index 58c282e1fd5a..8f0cc046a055 100755
--- a/vllm/v1/attention/backends/flashinfer.py
+++ b/vllm/v1/attention/backends/flashinfer.py
@@ -1209,8 +1209,8 @@ def fast_plan_decode(
             head_dim,
             False,  # causal
             window_left,
-            -1,
-            False,
+            -1,  # fixed_split_size
+            False,  # disable_split_kv
         )
     except Exception as e:
         raise RuntimeError(f"Error in tensor core plan: {e}") from e

From 17e62c62928f3680641668ad5579e4dd8d8ff0c6 Mon Sep 17 00:00:00 2001
From: elvischenv <219235043+elvischenv@users.noreply.github.com>
Date: Wed, 8 Oct 2025 19:09:08 -0700
Subject: [PATCH 5/5] update to 0.4.0

Signed-off-by: elvischenv <219235043+elvischenv@users.noreply.github.com>
---
 docker/Dockerfile               | 2 +-
 docker/Dockerfile.nightly_torch | 4 ++--
 setup.py                        | 2 +-
 3 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/docker/Dockerfile b/docker/Dockerfile
index 973c570c2f6d..744e5c476f0b 100644
--- a/docker/Dockerfile
+++ b/docker/Dockerfile
@@ -371,7 +371,7 @@ RUN --mount=type=bind,from=build,src=/workspace/dist,target=/vllm-workspace/dist
 # Install FlashInfer from source
 ARG FLASHINFER_GIT_REPO="https://github.com/flashinfer-ai/flashinfer.git"
 # Keep this in sync with "flashinfer" extra in setup.py
-ARG FLASHINFER_GIT_REF="v0.4.0rc4"
+ARG FLASHINFER_GIT_REF="v0.4.0"
 # Flag to control whether to compile FlashInfer AOT kernels
 # Set to "true" to enable AOT compilation:
 # docker build --build-arg FLASHINFER_AOT_COMPILE=true ...
diff --git a/docker/Dockerfile.nightly_torch b/docker/Dockerfile.nightly_torch
index db63ad8ed90d..165256a9bd51 100644
--- a/docker/Dockerfile.nightly_torch
+++ b/docker/Dockerfile.nightly_torch
@@ -246,7 +246,7 @@ RUN pip install setuptools==75.6.0 packaging==23.2 ninja==1.11.1.3 build==1.2.2.
 
 
 # build flashinfer for torch nightly from source around 10 mins
-# release version: v0.4.0rc4
+# release version: v0.4.0
 # todo(elainewy): cache flashinfer build result for faster build
 ENV CCACHE_DIR=/root/.cache/ccache
 RUN --mount=type=cache,target=/root/.cache/ccache \
@@ -254,7 +254,7 @@ RUN --mount=type=cache,target=/root/.cache/ccache \
     echo "git clone flashinfer..." \
     && git clone --recursive https://github.com/flashinfer-ai/flashinfer.git \
     && cd flashinfer \
-    && git checkout v0.4.0rc4 \
+    && git checkout v0.4.0 \
     && git submodule update --init --recursive \
     && echo "finish git clone flashinfer..." \
     && rm -rf build \
diff --git a/setup.py b/setup.py
index 08d29f8412d2..0f409992c9c6 100644
--- a/setup.py
+++ b/setup.py
@@ -715,7 +715,7 @@ def _read_requirements(filename: str) -> list[str]:
         ],  # Required for audio processing
         "video": [],  # Kept for backwards compatibility
         # FlashInfer should be updated together with the Dockerfile
-        "flashinfer": ["flashinfer-python==0.4.0rc4"],
+        "flashinfer": ["flashinfer-python==0.4.0"],
         # Optional deps for AMD FP4 quantization support
         "petit-kernel": ["petit-kernel"],
     },