From 4ea258ad0bbe624954d06a7315849d877b2fdbd4 Mon Sep 17 00:00:00 2001
From: mgoin
Date: Tue, 13 May 2025 19:12:17 +0000
Subject: [PATCH 1/6] Update FlashInfer

Signed-off-by: mgoin
---
 docker/Dockerfile | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docker/Dockerfile b/docker/Dockerfile
index 17adb7a92dc1..671eda8515f8 100644
--- a/docker/Dockerfile
+++ b/docker/Dockerfile
@@ -257,7 +257,7 @@ RUN --mount=type=cache,target=/root/.cache/uv \
 if [ "$TARGETPLATFORM" != "linux/arm64" ]; then \
     # TESTING: install FlashInfer from source to test 2.7.0 final RC
     FLASHINFER_ENABLE_AOT=1 TORCH_CUDA_ARCH_LIST='7.5 8.0 8.6 8.9 9.0+PTX' \
-    uv pip install --system --no-build-isolation "git+https://github.com/flashinfer-ai/flashinfer@v0.2.2.post1" ; \
+    uv pip install --system --no-build-isolation "git+https://github.com/flashinfer-ai/flashinfer@948a14622bd624773918d738b0f66137a9ac4784" ; \
 fi
 COPY examples examples
 COPY benchmarks benchmarks

From 114a0f311c6b3ade2e0a0e3e20d451f6ed6ef5c2 Mon Sep 17 00:00:00 2001
From: mgoin
Date: Tue, 13 May 2025 19:59:46 +0000
Subject: [PATCH 2/6] Add blackwell cuda arch

Signed-off-by: mgoin
---
 docker/Dockerfile | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/docker/Dockerfile b/docker/Dockerfile
index 671eda8515f8..75ff5b11dbec 100644
--- a/docker/Dockerfile
+++ b/docker/Dockerfile
@@ -77,7 +77,7 @@ RUN --mount=type=cache,target=/root/.cache/uv \
 # can be useful for both `dev` and `test`
 # explicitly set the list to avoid issues with torch 2.2
 # see https://github.com/pytorch/pytorch/pull/123243
-ARG torch_cuda_arch_list='7.0 7.5 8.0 8.6 8.9 9.0+PTX'
+ARG torch_cuda_arch_list='7.0 7.5 8.0 8.6 8.9 9.0 10.0 12.0+PTX'
 ENV TORCH_CUDA_ARCH_LIST=${torch_cuda_arch_list}
 # Override the arch list for flash-attn to reduce the binary size
 ARG vllm_fa_cmake_gpu_arches='80-real;90-real'
@@ -256,7 +256,7 @@ RUN --mount=type=cache,target=/root/.cache/uv \
 . /etc/environment && \
 if [ "$TARGETPLATFORM" != "linux/arm64" ]; then \
     # TESTING: install FlashInfer from source to test 2.7.0 final RC
-    FLASHINFER_ENABLE_AOT=1 TORCH_CUDA_ARCH_LIST='7.5 8.0 8.6 8.9 9.0+PTX' \
+    FLASHINFER_ENABLE_AOT=1 TORCH_CUDA_ARCH_LIST='7.5 8.0 8.6 8.9 9.0 10.0 12.0+PTX' \
     uv pip install --system --no-build-isolation "git+https://github.com/flashinfer-ai/flashinfer@948a14622bd624773918d738b0f66137a9ac4784" ; \
 fi
 COPY examples examples

From c044a06372e277649c4dc8ae70941874009c66f8 Mon Sep 17 00:00:00 2001
From: mgoin
Date: Wed, 14 May 2025 01:40:27 +0000
Subject: [PATCH 3/6] Try dropping 8.6 and 12.0

Signed-off-by: mgoin
---
 docker/Dockerfile | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/docker/Dockerfile b/docker/Dockerfile
index 75ff5b11dbec..666d9d7d1903 100644
--- a/docker/Dockerfile
+++ b/docker/Dockerfile
@@ -77,7 +77,7 @@ RUN --mount=type=cache,target=/root/.cache/uv \
 # can be useful for both `dev` and `test`
 # explicitly set the list to avoid issues with torch 2.2
 # see https://github.com/pytorch/pytorch/pull/123243
-ARG torch_cuda_arch_list='7.0 7.5 8.0 8.6 8.9 9.0 10.0 12.0+PTX'
+ARG torch_cuda_arch_list='7.0 7.5 8.0 8.9 9.0 10.0+PTX'
 ENV TORCH_CUDA_ARCH_LIST=${torch_cuda_arch_list}
 # Override the arch list for flash-attn to reduce the binary size
 ARG vllm_fa_cmake_gpu_arches='80-real;90-real'
@@ -256,7 +256,7 @@ RUN --mount=type=cache,target=/root/.cache/uv \
 . /etc/environment && \
 if [ "$TARGETPLATFORM" != "linux/arm64" ]; then \
     # TESTING: install FlashInfer from source to test 2.7.0 final RC
-    FLASHINFER_ENABLE_AOT=1 TORCH_CUDA_ARCH_LIST='7.5 8.0 8.6 8.9 9.0 10.0 12.0+PTX' \
+    FLASHINFER_ENABLE_AOT=1 TORCH_CUDA_ARCH_LIST='7.5 8.0 8.9 9.0 10.0+PTX' \
     uv pip install --system --no-build-isolation "git+https://github.com/flashinfer-ai/flashinfer@948a14622bd624773918d738b0f66137a9ac4784" ; \
 fi
 COPY examples examples

From eabdfbb25fa17b9f044400f1922adc417ba5025c Mon Sep 17 00:00:00 2001
From: mgoin
Date: Thu, 15 May 2025 23:48:40 +0000
Subject: [PATCH 4/6] Update flashinfer to latest

Signed-off-by: mgoin
---
 docker/Dockerfile | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docker/Dockerfile b/docker/Dockerfile
index 666d9d7d1903..619eb613e5d1 100644
--- a/docker/Dockerfile
+++ b/docker/Dockerfile
@@ -257,7 +257,7 @@ RUN --mount=type=cache,target=/root/.cache/uv \
 if [ "$TARGETPLATFORM" != "linux/arm64" ]; then \
     # TESTING: install FlashInfer from source to test 2.7.0 final RC
     FLASHINFER_ENABLE_AOT=1 TORCH_CUDA_ARCH_LIST='7.5 8.0 8.9 9.0 10.0+PTX' \
-    uv pip install --system --no-build-isolation "git+https://github.com/flashinfer-ai/flashinfer@948a14622bd624773918d738b0f66137a9ac4784" ; \
+    uv pip install --system --no-build-isolation "git+https://github.com/flashinfer-ai/flashinfer@e00e8cedbfcb220f328fd36aa8f529f869b01e6b" ; \
 fi
 COPY examples examples
 COPY benchmarks benchmarks

From 6e7eca2c66b10c233c4e78b36cb317f5928ebe87 Mon Sep 17 00:00:00 2001
From: mgoin
Date: Fri, 16 May 2025 01:44:28 +0000
Subject: [PATCH 5/6] Add SM 12.0

Signed-off-by: mgoin
---
 docker/Dockerfile | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/docker/Dockerfile b/docker/Dockerfile
index 619eb613e5d1..b9ca57e05553 100644
--- a/docker/Dockerfile
+++ b/docker/Dockerfile
@@ -77,7 +77,7 @@ RUN --mount=type=cache,target=/root/.cache/uv \
 # can be useful for both `dev` and `test`
 # explicitly set the list to avoid issues with torch 2.2
 # see https://github.com/pytorch/pytorch/pull/123243
-ARG torch_cuda_arch_list='7.0 7.5 8.0 8.9 9.0 10.0+PTX'
+ARG torch_cuda_arch_list='7.0 7.5 8.0 8.9 9.0 10.0 12.0+PTX'
 ENV TORCH_CUDA_ARCH_LIST=${torch_cuda_arch_list}
 # Override the arch list for flash-attn to reduce the binary size
 ARG vllm_fa_cmake_gpu_arches='80-real;90-real'
@@ -256,7 +256,7 @@ RUN --mount=type=cache,target=/root/.cache/uv \
 . /etc/environment && \
 if [ "$TARGETPLATFORM" != "linux/arm64" ]; then \
     # TESTING: install FlashInfer from source to test 2.7.0 final RC
-    FLASHINFER_ENABLE_AOT=1 TORCH_CUDA_ARCH_LIST='7.5 8.0 8.9 9.0 10.0+PTX' \
+    FLASHINFER_ENABLE_AOT=1 TORCH_CUDA_ARCH_LIST='7.5 8.0 8.9 9.0 10.0 12.0+PTX' \
     uv pip install --system --no-build-isolation "git+https://github.com/flashinfer-ai/flashinfer@e00e8cedbfcb220f328fd36aa8f529f869b01e6b" ; \
 fi
 COPY examples examples

From 23d6c0276951672b23bc5d46b71936cc367651cc Mon Sep 17 00:00:00 2001
From: mgoin
Date: Fri, 16 May 2025 11:02:43 +0000
Subject: [PATCH 6/6] Revert SM 12.0

Signed-off-by: mgoin
---
 docker/Dockerfile | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/docker/Dockerfile b/docker/Dockerfile
index b9ca57e05553..619eb613e5d1 100644
--- a/docker/Dockerfile
+++ b/docker/Dockerfile
@@ -77,7 +77,7 @@ RUN --mount=type=cache,target=/root/.cache/uv \
 # can be useful for both `dev` and `test`
 # explicitly set the list to avoid issues with torch 2.2
 # see https://github.com/pytorch/pytorch/pull/123243
-ARG torch_cuda_arch_list='7.0 7.5 8.0 8.9 9.0 10.0 12.0+PTX'
+ARG torch_cuda_arch_list='7.0 7.5 8.0 8.9 9.0 10.0+PTX'
 ENV TORCH_CUDA_ARCH_LIST=${torch_cuda_arch_list}
 # Override the arch list for flash-attn to reduce the binary size
 ARG vllm_fa_cmake_gpu_arches='80-real;90-real'
@@ -256,7 +256,7 @@ RUN --mount=type=cache,target=/root/.cache/uv \
 . /etc/environment && \
 if [ "$TARGETPLATFORM" != "linux/arm64" ]; then \
     # TESTING: install FlashInfer from source to test 2.7.0 final RC
-    FLASHINFER_ENABLE_AOT=1 TORCH_CUDA_ARCH_LIST='7.5 8.0 8.9 9.0 10.0 12.0+PTX' \
+    FLASHINFER_ENABLE_AOT=1 TORCH_CUDA_ARCH_LIST='7.5 8.0 8.9 9.0 10.0+PTX' \
     uv pip install --system --no-build-isolation "git+https://github.com/flashinfer-ai/flashinfer@e00e8cedbfcb220f328fd36aa8f529f869b01e6b" ; \
 fi
 COPY examples examples
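
A minimal sanity check of the series' end state, as a sketch: build the image with the final arch list and confirm FlashInfer imports from inside it. The `vllm-test` tag is a hypothetical placeholder, and `flashinfer.__version__` is assumed to be the module's version attribute; the build-arg name matches the ARG declared in docker/Dockerfile above.

    # Build with the arch list from the final patch (tag name is hypothetical).
    docker build -f docker/Dockerfile \
        --build-arg torch_cuda_arch_list='7.0 7.5 8.0 8.9 9.0 10.0+PTX' \
        -t vllm-test .

    # Import FlashInfer inside the built image and print its version; the
    # entrypoint override avoids whatever default command the image defines.
    docker run --rm --gpus all --entrypoint python3 vllm-test \
        -c "import flashinfer; print(flashinfer.__version__)"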