From fd6441698ffa77ff1d07eac7b65a464ef1524953 Mon Sep 17 00:00:00 2001
From: DefTruth <31974251+DefTruth@users.noreply.github.com>
Date: Fri, 14 Mar 2025 18:57:24 +0800
Subject: [PATCH 1/4] Update topk_topp_sampler.py

---
 vllm/v1/sample/ops/topk_topp_sampler.py | 18 +++++++++++++++++-
 1 file changed, 17 insertions(+), 1 deletion(-)

diff --git a/vllm/v1/sample/ops/topk_topp_sampler.py b/vllm/v1/sample/ops/topk_topp_sampler.py
index 7d70e839b6f4..9e7c4712932f 100644
--- a/vllm/v1/sample/ops/topk_topp_sampler.py
+++ b/vllm/v1/sample/ops/topk_topp_sampler.py
@@ -24,7 +24,23 @@ def __init__(self):
         super().__init__()
         if current_platform.is_cuda():
             if is_flashinfer_available:
-                if envs.VLLM_USE_FLASHINFER_SAMPLER is not False:
+                flashinfer_version = flashinfer.__version__
+                if flashinfer_version >= "v0.2.3":
+                    # FIXME(DefTrue): Currently, we have errors when using
+                    # FlashInfer>=v0.2.3 for top-p & top-k sampling. As a
+                    # workaround, we disable FlashInfer for top-p & top-k
+                    # sampling by default while FlashInfer>=v0.2.3.
+                    # The sampling API removes the success return value
+                    # of all sampling API, which is not compatible with
+                    # earlier design.
+                    # https://github.com/flashinfer-ai/flashinfer/releases/tag/v0.2.3
+                    logger.info(
+                        "Currently, FlashInfer top-p & top-k sampling sampler is "
+                        f"disabled because {flashinfer_version} is not backward "
+                        "compatible. Falling back to the PyTorch-native "
+                        "implementation of top-p & top-k sampling.")
+                    self.forward = self.forward_native
+                elif envs.VLLM_USE_FLASHINFER_SAMPLER is not False:
                     # NOTE(woosuk): The V0 sampler doesn't use FlashInfer for
                     # sampling unless VLLM_USE_FLASHINFER_SAMPLER=1 (i.e., by
                     # default it is unused). For backward compatibility, we set

From 7df4aac288cf48ea5ad06cc893392b43646353ee Mon Sep 17 00:00:00 2001
From: DefTruth <31974251+DefTruth@users.noreply.github.com>
Date: Fri, 14 Mar 2025 18:58:19 +0800
Subject: [PATCH 2/4] Update topk_topp_sampler.py

---
 vllm/v1/sample/ops/topk_topp_sampler.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/vllm/v1/sample/ops/topk_topp_sampler.py b/vllm/v1/sample/ops/topk_topp_sampler.py
index 9e7c4712932f..6bd9cd4600e7 100644
--- a/vllm/v1/sample/ops/topk_topp_sampler.py
+++ b/vllm/v1/sample/ops/topk_topp_sampler.py
@@ -25,7 +25,7 @@ def __init__(self):
         if current_platform.is_cuda():
             if is_flashinfer_available:
                 flashinfer_version = flashinfer.__version__
-                if flashinfer_version >= "v0.2.3":
+                if flashinfer_version >= "0.2.3":
                     # FIXME(DefTrue): Currently, we have errors when using
                     # FlashInfer>=v0.2.3 for top-p & top-k sampling. As a
                     # workaround, we disable FlashInfer for top-p & top-k

From 5e5d4f4d8f7d08ea1abc0a51a2472d20871ba86f Mon Sep 17 00:00:00 2001
From: DefTruth <31974251+DefTruth@users.noreply.github.com>
Date: Fri, 14 Mar 2025 19:04:23 +0800
Subject: [PATCH 3/4] Update topk_topp_sampler.py

---
 vllm/v1/sample/ops/topk_topp_sampler.py | 11 ++++++-----
 1 file changed, 6 insertions(+), 5 deletions(-)

diff --git a/vllm/v1/sample/ops/topk_topp_sampler.py b/vllm/v1/sample/ops/topk_topp_sampler.py
index 6bd9cd4600e7..b505fa329172 100644
--- a/vllm/v1/sample/ops/topk_topp_sampler.py
+++ b/vllm/v1/sample/ops/topk_topp_sampler.py
@@ -33,12 +33,13 @@ def __init__(self):
                     # The sampling API removes the success return value
                     # of all sampling API, which is not compatible with
                     # earlier design.
-                    # https://github.com/flashinfer-ai/flashinfer/releases/tag/v0.2.3
+                    # https://github.com/flashinfer-ai/flashinfer/releases/
+                    # tag/v0.2.3
                     logger.info(
-                        "Currently, FlashInfer top-p & top-k sampling sampler is "
-                        f"disabled because {flashinfer_version} is not backward "
-                        "compatible. Falling back to the PyTorch-native "
-                        "implementation of top-p & top-k sampling.")
+                        "Currently, FlashInfer top-p & top-k sampling sampler "
+                        "is disabled because FlashInfer>=v0.2.3 is not "
+                        "backward compatible. Falling back to the PyTorch-"
+                        "native implementation of top-p & top-k sampling.")
                     self.forward = self.forward_native
                 elif envs.VLLM_USE_FLASHINFER_SAMPLER is not False:
                     # NOTE(woosuk): The V0 sampler doesn't use FlashInfer for

From d5e16913fa9f1424b67e65dd57184fd264161cb7 Mon Sep 17 00:00:00 2001
From: DefTruth <31974251+DefTruth@users.noreply.github.com>
Date: Fri, 14 Mar 2025 20:18:39 +0800
Subject: [PATCH 4/4] Update topk_topp_sampler.py

---
 vllm/v1/sample/ops/topk_topp_sampler.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/vllm/v1/sample/ops/topk_topp_sampler.py b/vllm/v1/sample/ops/topk_topp_sampler.py
index b505fa329172..d461a8098933 100644
--- a/vllm/v1/sample/ops/topk_topp_sampler.py
+++ b/vllm/v1/sample/ops/topk_topp_sampler.py
@@ -26,7 +26,7 @@ def __init__(self):
             if is_flashinfer_available:
                 flashinfer_version = flashinfer.__version__
                 if flashinfer_version >= "0.2.3":
-                    # FIXME(DefTrue): Currently, we have errors when using
+                    # FIXME(DefTruth): Currently, we have errors when using
                     # FlashInfer>=v0.2.3 for top-p & top-k sampling. As a
                     # workaround, we disable FlashInfer for top-p & top-k
                     # sampling by default while FlashInfer>=v0.2.3.
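A side note on the version gate itself, not part of the patch series above: the patches compare flashinfer.__version__ against the string "0.2.3" lexicographically, while the sketch below shows how the same gate could be written with packaging.version, which orders releases numerically (for instance, "0.10.0" sorts before "0.2.3" as a plain string but after it as a parsed version). The helper name is_flashinfer_sampling_compatible is hypothetical; only flashinfer.__version__ and the 0.2.3 threshold come from the patches, and the sketch assumes both flashinfer and packaging are importable.

# Illustrative sketch only (not part of the patches): a version gate based on
# packaging.version rather than lexicographic string comparison. The helper
# name is hypothetical; the 0.2.3 threshold and flashinfer.__version__ come
# from the patch series above.
from packaging.version import Version

import flashinfer


def is_flashinfer_sampling_compatible() -> bool:
    """Return True if the installed FlashInfer predates the v0.2.3
    sampling-API change (i.e. it still returns the success flag)."""
    # Strip a possible leading "v" before parsing; the patches switch the
    # threshold literal from "v0.2.3" to "0.2.3" for the same reason.
    raw = flashinfer.__version__.lstrip("v")
    return Version(raw) < Version("0.2.3")

Under such a check, a hypothetical future release like 0.10.0 would still compare as newer than 0.2.3 and trigger the fall back to the PyTorch-native top-p & top-k sampler.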